Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 1011
-rw-r--r--  mm/Kconfig.debug | 164
-rw-r--r--  mm/Makefile | 41
-rw-r--r--  mm/backing-dev.c | 453
-rw-r--r--  mm/balloon_compaction.c | 18
-rw-r--r--  mm/bootmem_info.c | 128
-rw-r--r--  mm/cleancache.c | 315
-rw-r--r--  mm/cma.c | 182
-rw-r--r--  mm/cma.h | 26
-rw-r--r--  mm/cma_debug.c | 13
-rw-r--r--  mm/cma_sysfs.c | 112
-rw-r--r--  mm/compaction.c | 1121
-rw-r--r--  mm/damon/Kconfig | 107
-rw-r--r--  mm/damon/Makefile | 9
-rw-r--r--  mm/damon/core-test.h | 367
-rw-r--r--  mm/damon/core.c | 1471
-rw-r--r--  mm/damon/dbgfs-test.h | 163
-rw-r--r--  mm/damon/dbgfs.c | 1133
-rw-r--r--  mm/damon/lru_sort.c | 323
-rw-r--r--  mm/damon/modules-common.c | 42
-rw-r--r--  mm/damon/modules-common.h | 49
-rw-r--r--  mm/damon/ops-common.c | 122
-rw-r--r--  mm/damon/ops-common.h | 18
-rw-r--r--  mm/damon/paddr.c | 351
-rw-r--r--  mm/damon/reclaim.c | 284
-rw-r--r--  mm/damon/sysfs-common.c | 107
-rw-r--r--  mm/damon/sysfs-common.h | 56
-rw-r--r--  mm/damon/sysfs-schemes.c | 1707
-rw-r--r--  mm/damon/sysfs.c | 1795
-rw-r--r--  mm/damon/vaddr-test.h | 322
-rw-r--r--  mm/damon/vaddr.c | 721
-rw-r--r--  mm/debug.c | 160
-rw-r--r--  mm/debug_page_alloc.c | 59
-rw-r--r--  mm/debug_vm_pgtable.c | 1344
-rw-r--r--  mm/dmapool.c | 421
-rw-r--r--  mm/dmapool_test.c | 147
-rw-r--r--  mm/early_ioremap.c | 26
-rw-r--r--  mm/fadvise.c | 40
-rw-r--r--  mm/fail_page_alloc.c | 66
-rw-r--r--  mm/failslab.c | 13
-rw-r--r--  mm/filemap.c | 3975
-rw-r--r--  mm/folio-compat.c | 134
-rw-r--r--  mm/frame_vector.c | 240
-rw-r--r--  mm/frontswap.c | 270
-rw-r--r--  mm/gup.c | 2551
-rw-r--r--  mm/gup_benchmark.c | 201
-rw-r--r--  mm/gup_test.c | 395
-rw-r--r--  mm/gup_test.h | 45
-rw-r--r--  mm/highmem.c | 392
-rw-r--r--  mm/hmm.c | 67
-rw-r--r--  mm/huge_memory.c | 2031
-rw-r--r--  mm/hugetlb.c | 4749
-rw-r--r--  mm/hugetlb_cgroup.c | 242
-rw-r--r--  mm/hugetlb_vmemmap.c | 599
-rw-r--r--  mm/hugetlb_vmemmap.h | 60
-rw-r--r--  mm/hwpoison-inject.c | 30
-rw-r--r--  mm/init-mm.c | 21
-rw-r--r--  mm/internal.h | 832
-rw-r--r--  mm/interval_tree.c | 2
-rw-r--r--  mm/io-mapping.c | 29
-rw-r--r--  mm/ioremap.c | 266
-rw-r--r--  mm/kasan/Makefile | 40
-rw-r--r--  mm/kasan/common.c | 925
-rw-r--r--  mm/kasan/generic.c | 342
-rw-r--r--  mm/kasan/generic_report.c | 165
-rw-r--r--  mm/kasan/hw_tags.c | 396
-rw-r--r--  mm/kasan/init.c | 60
-rw-r--r--  mm/kasan/kasan.h | 703
-rw-r--r--  mm/kasan/kasan_test.c | 1581
-rw-r--r--  mm/kasan/kasan_test_module.c | 81
-rw-r--r--  mm/kasan/quarantine.c | 148
-rw-r--r--  mm/kasan/report.c | 789
-rw-r--r--  mm/kasan/report_generic.c | 399
-rw-r--r--  mm/kasan/report_hw_tags.c | 71
-rw-r--r--  mm/kasan/report_sw_tags.c | 95
-rw-r--r--  mm/kasan/report_tags.c | 116
-rw-r--r--  mm/kasan/shadow.c | 650
-rw-r--r--  mm/kasan/sw_tags.c | 176
-rw-r--r--  mm/kasan/tags.c | 248
-rw-r--r--  mm/kasan/tags_report.c | 93
-rw-r--r--  mm/kfence/.kunitconfig | 6
-rw-r--r--  mm/kfence/Makefile | 6
-rw-r--r--  mm/kfence/core.c | 1182
-rw-r--r--  mm/kfence/kfence.h | 142
-rw-r--r--  mm/kfence/kfence_test.c | 851
-rw-r--r--  mm/kfence/report.c | 327
-rw-r--r--  mm/khugepaged.c | 2215
-rw-r--r--  mm/kmemleak.c | 415
-rw-r--r--  mm/kmsan/Makefile | 34
-rw-r--r--  mm/kmsan/core.c | 454
-rw-r--r--  mm/kmsan/hooks.c | 424
-rw-r--r--  mm/kmsan/init.c | 235
-rw-r--r--  mm/kmsan/instrumentation.c | 333
-rw-r--r--  mm/kmsan/kmsan.h | 211
-rw-r--r--  mm/kmsan/kmsan_test.c | 652
-rw-r--r--  mm/kmsan/report.c | 219
-rw-r--r--  mm/kmsan/shadow.c | 308
-rw-r--r--  mm/ksm.c | 981
-rw-r--r--  mm/list_lru.c | 480
-rw-r--r--  mm/maccess.c | 169
-rw-r--r--  mm/madvise.c | 1148
-rw-r--r--  mm/mapping_dirty_helpers.c | 51
-rw-r--r--  mm/memblock.c | 437
-rw-r--r--  mm/memcontrol.c | 3648
-rw-r--r--  mm/memfd.c | 113
-rw-r--r--  mm/memory-failure.c | 2144
-rw-r--r--  mm/memory-tiers.c | 731
-rw-r--r--  mm/memory.c | 3223
-rw-r--r--  mm/memory_hotplug.c | 1556
-rw-r--r--  mm/mempolicy.c | 1279
-rw-r--r--  mm/mempool.c | 31
-rw-r--r--  mm/memremap.c | 244
-rw-r--r--  mm/memtest.c | 6
-rw-r--r--  mm/migrate.c | 3255
-rw-r--r--  mm/migrate_device.c | 956
-rw-r--r--  mm/mincore.c | 37
-rw-r--r--  mm/mlock.c | 834
-rw-r--r--  mm/mm_init.c | 2626
-rw-r--r--  mm/mm_slot.h | 55
-rw-r--r--  mm/mmap.c | 3270
-rw-r--r--  mm/mmap_lock.c | 246
-rw-r--r--  mm/mmu_gather.c | 123
-rw-r--r--  mm/mmu_notifier.c | 56
-rw-r--r--  mm/mmzone.c | 31
-rw-r--r--  mm/mprotect.c | 509
-rw-r--r--  mm/mremap.c | 608
-rw-r--r--  mm/msync.c | 8
-rw-r--r--  mm/nommu.c | 406
-rw-r--r--  mm/oom_kill.c | 250
-rw-r--r--  mm/page-writeback.c | 1172
-rw-r--r--  mm/page_alloc.c | 6364
-rw-r--r--  mm/page_counter.c | 26
-rw-r--r--  mm/page_ext.c | 171
-rw-r--r--  mm/page_idle.c | 94
-rw-r--r--  mm/page_io.c | 538
-rw-r--r--  mm/page_isolation.c | 515
-rw-r--r--  mm/page_owner.c | 324
-rw-r--r--  mm/page_poison.c | 56
-rw-r--r--  mm/page_reporting.c | 71
-rw-r--r--  mm/page_reporting.h | 5
-rw-r--r--  mm/page_table_check.c | 258
-rw-r--r--  mm/page_vma_mapped.c | 325
-rw-r--r--  mm/pagewalk.c | 190
-rw-r--r--  mm/percpu-internal.h | 75
-rw-r--r--  mm/percpu-km.c | 16
-rw-r--r--  mm/percpu-stats.c | 41
-rw-r--r--  mm/percpu-vm.c | 47
-rw-r--r--  mm/percpu.c | 683
-rw-r--r--  mm/pgalloc-track.h | 6
-rw-r--r--  mm/pgtable-generic.c | 73
-rw-r--r--  mm/process_vm_access.c | 10
-rw-r--r--  mm/ptdump.c | 35
-rw-r--r--  mm/readahead.c | 636
-rw-r--r--  mm/rmap.c | 1997
-rw-r--r--  mm/rodata_test.c | 8
-rw-r--r--  mm/secretmem.c | 293
-rw-r--r--  mm/shmem.c | 2278
-rw-r--r--  mm/show_mem.c | 429
-rw-r--r--  mm/shrinker_debug.c | 294
-rw-r--r--  mm/shuffle.c | 27
-rw-r--r--  mm/shuffle.h | 6
-rw-r--r--  mm/slab.c | 1150
-rw-r--r--  mm/slab.h | 610
-rw-r--r--  mm/slab_common.c | 721
-rw-r--r--  mm/slob.c | 720
-rw-r--r--  mm/slub.c | 4100
-rw-r--r--  mm/sparse-vmemmap.c | 237
-rw-r--r--  mm/sparse.c | 136
-rw-r--r--  mm/swap.c | 1255
-rw-r--r--  mm/swap.h | 148
-rw-r--r--  mm/swap_cgroup.c | 10
-rw-r--r--  mm/swap_slots.c | 34
-rw-r--r--  mm/swap_state.c | 521
-rw-r--r--  mm/swapfile.c | 1023
-rw-r--r--  mm/truncate.c | 658
-rw-r--r--  mm/usercopy.c | 138
-rw-r--r--  mm/userfaultfd.c | 560
-rw-r--r--  mm/util.c | 422
-rw-r--r--  mm/vmacache.c | 117
-rw-r--r--  mm/vmalloc.c | 2513
-rw-r--r--  mm/vmpressure.c | 20
-rw-r--r--  mm/vmscan.c | 5989
-rw-r--r--  mm/vmstat.c | 516
-rw-r--r--  mm/workingset.c | 369
-rw-r--r--  mm/z3fold.c | 663
-rw-r--r--  mm/zbud.c | 323
-rw-r--r--  mm/zpool.c | 69
-rw-r--r--  mm/zsmalloc.c | 1282
-rw-r--r--  mm/zswap.c | 717
189 files changed, 79145 insertions, 39680 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e72e61c1d62e..09130434e30d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -2,6 +2,401 @@
menu "Memory Management options"
+#
+# For some reason microblaze and nios2 hard code SWAP=n. Hopefully we can
+# add proper SWAP support to them, in which case this can be removed.
+#
+config ARCH_NO_SWAP
+ bool
+
+config ZPOOL
+ bool
+
+menuconfig SWAP
+ bool "Support for paging of anonymous memory (swap)"
+ depends on MMU && BLOCK && !ARCH_NO_SWAP
+ default y
+ help
+ This option allows you to choose whether you want to have support
+ for so called swap devices or swap files in your kernel that are
+ used to provide more virtual memory than the actual RAM present
+ in your computer. If unsure say Y.
+
+config ZSWAP
+ bool "Compressed cache for swap pages"
+ depends on SWAP
+ select FRONTSWAP
+ select CRYPTO
+ select ZPOOL
+ help
+ A lightweight compressed cache for swap pages. It takes
+ pages that are in the process of being swapped out and attempts to
+ compress them into a dynamically allocated RAM-based memory pool.
+ This can result in a significant I/O reduction on swap device and,
+ in the case where decompressing from RAM is faster than swap device
+ reads, can also improve workload performance.
+
+config ZSWAP_DEFAULT_ON
+ bool "Enable the compressed cache for swap pages by default"
+ depends on ZSWAP
+ help
+ If selected, the compressed cache for swap pages will be enabled
+ at boot, otherwise it will be disabled.
+
+ The selection made here can be overridden by using the kernel
+ command line 'zswap.enabled=' option.
+
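+
For illustration only (not part of the patch): the same toggle is reachable at runtime through the zswap module parameter in sysfs. A minimal C sketch, assuming the usual /sys/module/zswap/parameters/enabled path and root privileges:

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
        /* Mirrors the zswap.enabled= command line option at runtime. */
        int fd = open("/sys/module/zswap/parameters/enabled", O_WRONLY);
        if (fd < 0)
            return 1;   /* kernel built without CONFIG_ZSWAP */
        if (write(fd, "Y", 1) != 1)
            return 1;
        close(fd);
        return 0;
    }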
+config ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON
+ bool "Invalidate zswap entries when pages are loaded"
+ depends on ZSWAP
+ help
+ If selected, exclusive loads for zswap will be enabled at boot,
+ otherwise they will be disabled.
+
+ If exclusive loads are enabled, when a page is loaded from zswap,
+ the zswap entry is invalidated at once, as opposed to leaving it
+ in zswap until the swap entry is freed.
+
+ This avoids having two copies of the same page in memory
+ (compressed and uncompressed) after faulting in a page from zswap.
+ The cost is that if the page was never dirtied and needs to be
+ swapped out again, it will be re-compressed.
+
+choice
+ prompt "Default compressor"
+ depends on ZSWAP
+ default ZSWAP_COMPRESSOR_DEFAULT_LZO
+ help
+ Selects the default compression algorithm for the compressed cache
+ for swap pages.
+
+ For an overview of what kind of performance can be expected from
+ a particular compression algorithm please refer to the benchmarks
+ available at the following LWN page:
+ https://lwn.net/Articles/751795/
+
+ If in doubt, select 'LZO'.
+
+ The selection made here can be overridden by using the kernel
+ command line 'zswap.compressor=' option.
+
+config ZSWAP_COMPRESSOR_DEFAULT_DEFLATE
+ bool "Deflate"
+ select CRYPTO_DEFLATE
+ help
+ Use the Deflate algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_LZO
+ bool "LZO"
+ select CRYPTO_LZO
+ help
+ Use the LZO algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_842
+ bool "842"
+ select CRYPTO_842
+ help
+ Use the 842 algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_LZ4
+ bool "LZ4"
+ select CRYPTO_LZ4
+ help
+ Use the LZ4 algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_LZ4HC
+ bool "LZ4HC"
+ select CRYPTO_LZ4HC
+ help
+ Use the LZ4HC algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_ZSTD
+ bool "zstd"
+ select CRYPTO_ZSTD
+ help
+ Use the zstd algorithm as the default compression algorithm.
+endchoice
+
+config ZSWAP_COMPRESSOR_DEFAULT
+ string
+ depends on ZSWAP
+ default "deflate" if ZSWAP_COMPRESSOR_DEFAULT_DEFLATE
+ default "lzo" if ZSWAP_COMPRESSOR_DEFAULT_LZO
+ default "842" if ZSWAP_COMPRESSOR_DEFAULT_842
+ default "lz4" if ZSWAP_COMPRESSOR_DEFAULT_LZ4
+ default "lz4hc" if ZSWAP_COMPRESSOR_DEFAULT_LZ4HC
+ default "zstd" if ZSWAP_COMPRESSOR_DEFAULT_ZSTD
+ default ""
+
+choice
+ prompt "Default allocator"
+ depends on ZSWAP
+ default ZSWAP_ZPOOL_DEFAULT_ZBUD
+ help
+ Selects the default allocator for the compressed cache for
+ swap pages.
+ The default is 'zbud' for compatibility, however please do
+ read the description of each of the allocators below before
+ making the right choice.
+
+ The selection made here can be overridden by using the kernel
+ command line 'zswap.zpool=' option.
+
+config ZSWAP_ZPOOL_DEFAULT_ZBUD
+ bool "zbud"
+ select ZBUD
+ help
+ Use the zbud allocator as the default allocator.
+
+config ZSWAP_ZPOOL_DEFAULT_Z3FOLD
+ bool "z3fold"
+ select Z3FOLD
+ help
+ Use the z3fold allocator as the default allocator.
+
+config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
+ bool "zsmalloc"
+ select ZSMALLOC
+ help
+ Use the zsmalloc allocator as the default allocator.
+endchoice
+
+config ZSWAP_ZPOOL_DEFAULT
+ string
+ depends on ZSWAP
+ default "zbud" if ZSWAP_ZPOOL_DEFAULT_ZBUD
+ default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD
+ default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
+ default ""
+
+config ZBUD
+ tristate "2:1 compression allocator (zbud)"
+ depends on ZSWAP
+ help
+ A special purpose allocator for storing compressed pages.
+ It is designed to store up to two compressed pages per physical
+ page. While this design limits storage density, it has simple and
+ deterministic reclaim properties that make it preferable to a higher
+ density approach when reclaim will be used.
+
+config Z3FOLD
+ tristate "3:1 compression allocator (z3fold)"
+ depends on ZSWAP
+ help
+ A special purpose allocator for storing compressed pages.
+ It is designed to store up to three compressed pages per physical
+ page. It is a ZBUD derivative so the simplicity and determinism are
+ still there.
+
+config ZSMALLOC
+ tristate
+ prompt "N:1 compression allocator (zsmalloc)" if ZSWAP
+ depends on MMU
+ help
+ zsmalloc is a slab-based memory allocator designed to store
+ pages of various compression levels efficiently. It achieves
+ the highest storage density with the least amount of fragmentation.
+
+config ZSMALLOC_STAT
+ bool "Export zsmalloc statistics"
+ depends on ZSMALLOC
+ select DEBUG_FS
+ help
+ This option enables code in zsmalloc to collect various
+ statistics about what's happening in zsmalloc and exports that
+ information to userspace via debugfs.
+ If unsure, say N.
+
+config ZSMALLOC_CHAIN_SIZE
+ int "Maximum number of physical pages per-zspage"
+ default 8
+ range 4 16
+ depends on ZSMALLOC
+ help
+ This option sets the upper limit on the number of physical pages
+ that a zsmalloc page (zspage) can consist of. The optimal zspage
+ chain size is calculated for each size class during the
+ initialization of the pool.
+
+ Changing this option can alter the characteristics of size classes,
+ such as the number of pages per zspage and the number of objects
+ per zspage. This can also result in different configurations of
+ the pool, as zsmalloc merges size classes with similar
+ characteristics.
+
+ For more information, see zsmalloc documentation.
+
+menu "SLAB allocator options"
+
+choice
+ prompt "Choose SLAB allocator"
+ default SLUB
+ help
+ This option allows you to select a slab allocator.
+
+config SLAB_DEPRECATED
+ bool "SLAB (DEPRECATED)"
+ depends on !PREEMPT_RT
+ help
+ Deprecated and scheduled for removal in a few cycles. Replaced by
+ SLUB.
+
+ If you cannot migrate to SLUB, please contact linux-mm@kvack.org
+ and the people listed in the SLAB ALLOCATOR section of MAINTAINERS
+ file, explaining why.
+
+ The regular slab allocator that is established and known to work
+ well in all environments. It organizes cache hot objects in
+ per cpu and per node queues.
+
+config SLUB
+ bool "SLUB (Unqueued Allocator)"
+ help
+ SLUB is a slab allocator that minimizes cache line usage
+ instead of managing queues of cached objects (SLAB approach).
+ Per cpu caching is realized using slabs of objects instead
+ of queues of objects. SLUB can use memory efficiently
+ and has enhanced diagnostics. SLUB is the default choice for
+ a slab allocator.
+
+endchoice
+
+config SLAB
+ bool
+ default y
+ depends on SLAB_DEPRECATED
+
+config SLUB_TINY
+ bool "Configure SLUB for minimal memory footprint"
+ depends on SLUB && EXPERT
+ select SLAB_MERGE_DEFAULT
+ help
+ Configures the SLUB allocator in a way to achieve minimal memory
+ footprint, sacrificing scalability, debugging and other features.
+ This is intended only for the smallest systems that previously used
+ the SLOB allocator and is not recommended for systems with more than
+ 16MB RAM.
+
+ If unsure, say N.
+
+config SLAB_MERGE_DEFAULT
+ bool "Allow slab caches to be merged"
+ default y
+ depends on SLAB || SLUB
+ help
+ For reduced kernel memory fragmentation, slab caches can be
+ merged when they share the same size and other characteristics.
+ This carries a risk of kernel heap overflows being able to
+ overwrite objects from merged caches (and more easily control
+ cache layout), which makes such heap attacks easier to exploit
+ by attackers. By keeping caches unmerged, these kinds of exploits
+ can usually only damage objects in the same cache. To disable
+ merging at runtime, "slab_nomerge" can be passed on the kernel
+ command line.
+
+config SLAB_FREELIST_RANDOM
+ bool "Randomize slab freelist"
+ depends on SLAB || (SLUB && !SLUB_TINY)
+ help
+ Randomizes the freelist order used on creating new pages. This
+ security feature reduces the predictability of the kernel slab
+ allocator against heap overflows.
+
+config SLAB_FREELIST_HARDENED
+ bool "Harden slab freelist metadata"
+ depends on SLAB || (SLUB && !SLUB_TINY)
+ help
+ Many kernel heap attacks try to target slab cache metadata and
+ other infrastructure. This option makes minor performance
+ sacrifices to harden the kernel slab allocator against common
+ freelist exploit methods. Some slab implementations have more
+ sanity-checking than others. This option is most effective with
+ CONFIG_SLUB.
+
+config SLUB_STATS
+ default n
+ bool "Enable SLUB performance statistics"
+ depends on SLUB && SYSFS && !SLUB_TINY
+ help
+ SLUB statistics are useful to debug SLUB's allocation behavior in
+ order to find ways to optimize the allocator. This should never be
+ enabled for production use since keeping statistics slows down
+ the allocator by a few percentage points. The slabinfo command
+ supports the determination of the most active slabs to figure
+ out which slabs are relevant to a particular load.
+ Try running: slabinfo -DA
+
+config SLUB_CPU_PARTIAL
+ default y
+ depends on SLUB && SMP && !SLUB_TINY
+ bool "SLUB per cpu partial cache"
+ help
+ Per cpu partial caches accelerate object allocation and freeing
+ that is local to a processor at the price of more indeterminism
+ in the latency of the free. On overflow these caches will be cleared
+ which requires the taking of locks that may cause latency spikes.
+ Typically one would choose no for a realtime system.
+
+endmenu # SLAB allocator options
+
+config SHUFFLE_PAGE_ALLOCATOR
+ bool "Page allocator randomization"
+ default SLAB_FREELIST_RANDOM && ACPI_NUMA
+ help
+ Randomization of the page allocator improves the average
+ utilization of a direct-mapped memory-side-cache. See section
+ 5.2.27 Heterogeneous Memory Attribute Table (HMAT) in the ACPI
+ 6.2a specification for an example of how a platform advertises
+ the presence of a memory-side-cache. There are also incidental
+ security benefits as it reduces the predictability of page
+ allocations to complement SLAB_FREELIST_RANDOM, but the
+ default granularity of shuffling on the MAX_ORDER, i.e. 10th
+ order of pages is selected based on cache utilization benefits
+ on x86.
+
+ While the randomization improves cache utilization it may
+ negatively impact workloads on platforms without a cache. For
+ this reason, by default, the randomization is enabled only
+ after runtime detection of a direct-mapped memory-side-cache.
+ Otherwise, the randomization may be force enabled with the
+ 'page_alloc.shuffle' kernel command line parameter.
+
+ Say Y if unsure.
+
+config COMPAT_BRK
+ bool "Disable heap randomization"
+ default y
+ help
+ Randomizing heap placement makes heap exploits harder, but it
+ also breaks ancient binaries (including anything libc5 based).
+ This option changes the bootup default to heap randomization
+ disabled, and can be overridden at runtime by setting
+ /proc/sys/kernel/randomize_va_space to 2.
+
+ On non-ancient distros (post-2000 ones) N is usually a safe choice.
+
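+
For illustration only: re-enabling full randomization at runtime boils down to writing the sysctl mentioned above. A minimal C sketch, equivalent to "sysctl kernel.randomize_va_space=2" and requiring root:

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/proc/sys/kernel/randomize_va_space", O_WRONLY);
        if (fd < 0)
            return 1;
        /* 2 = randomize stack, mmap base and brk; 1 = leave brk alone. */
        if (write(fd, "2", 1) != 1)
            return 1;
        close(fd);
        return 0;
    }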
+config MMAP_ALLOW_UNINITIALIZED
+ bool "Allow mmapped anonymous memory to be uninitialized"
+ depends on EXPERT && !MMU
+ default n
+ help
+ Normally, and according to the Linux spec, anonymous memory obtained
+ from mmap() has its contents cleared before it is passed to
+ userspace. Enabling this config option allows you to request that
+ mmap() skip that if it is given a MAP_UNINITIALIZED flag, thus
+ providing a huge performance boost. If this option is not enabled,
+ then the flag will be ignored.
+
+ This is taken advantage of by uClibc's malloc(), and also by
+ ELF-FDPIC binfmt's brk and stack allocator.
+
+ Because of the obvious security issues, this option should only be
+ enabled on embedded devices where you control what is run in
+ userspace. Since that isn't generally a problem on no-MMU systems,
+ it is normally safe to say Y here.
+
+ See Documentation/admin-guide/mm/nommu-mmap.rst for more information.
+
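+
For illustration only: a minimal C sketch of a no-MMU mmap() call requesting uninitialized anonymous memory. The MAP_UNINITIALIZED value below comes from the UAPI headers; when this option is off the kernel silently ignores the flag and still returns zeroed pages:

    #include <stdio.h>
    #include <sys/mman.h>

    #ifndef MAP_UNINITIALIZED
    #define MAP_UNINITIALIZED 0x4000000     /* from <linux/mman.h> */
    #endif

    int main(void)
    {
        size_t len = 1 << 20;
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED, -1, 0);
        if (p == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        /* Contents are undefined here unless the kernel cleared them anyway. */
        munmap(p, len);
        return 0;
    }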
config SELECT_MEMORY_MODEL
def_bool y
depends on ARCH_SELECT_MEMORY_MODEL
@@ -9,7 +404,6 @@ config SELECT_MEMORY_MODEL
choice
prompt "Memory model"
depends on SELECT_MEMORY_MODEL
- default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
default FLATMEM_MANUAL
help
@@ -20,7 +414,7 @@ choice
config FLATMEM_MANUAL
bool "Flat Memory"
- depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
+ depends on !ARCH_SPARSEMEM_ENABLE || ARCH_FLATMEM_ENABLE
help
This option is best suited for non-NUMA systems with
flat address space. The FLATMEM is the most efficient
@@ -33,21 +427,6 @@ config FLATMEM_MANUAL
If unsure, choose this option (Flat Memory) over any other.
-config DISCONTIGMEM_MANUAL
- bool "Discontiguous Memory"
- depends on ARCH_DISCONTIGMEM_ENABLE
- help
- This option provides enhanced support for discontiguous
- memory systems, over FLATMEM. These systems have holes
- in their physical address spaces, and this option provides
- more efficient handling of these holes.
-
- Although "Discontiguous Memory" is still used by several
- architectures, it is considered deprecated in favor of
- "Sparse Memory".
-
- If unsure, choose "Sparse Memory" over this option.
-
config SPARSEMEM_MANUAL
bool "Sparse Memory"
depends on ARCH_SPARSEMEM_ENABLE
@@ -63,30 +442,13 @@ config SPARSEMEM_MANUAL
endchoice
-config DISCONTIGMEM
- def_bool y
- depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL
-
config SPARSEMEM
def_bool y
depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL
config FLATMEM
def_bool y
- depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL
-
-config FLAT_NODE_MEM_MAP
- def_bool y
- depends on !SPARSEMEM
-
-#
-# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
-# to represent different areas of memory. This variable allows
-# those dependencies to exist individually.
-#
-config NEED_MULTIPLE_NODES
- def_bool y
- depends on DISCONTIGMEM || NUMA
+ depends on !SPARSEMEM || FLATMEM_MANUAL
#
# SPARSEMEM_EXTREME (which is the default) does some bootmem
@@ -121,6 +483,12 @@ config SPARSEMEM_VMEMMAP
SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
pfn_to_page and page_to_pfn operations. This is the most
efficient option when sufficient kernel resources are available.
+#
+# Select this config option from the architecture Kconfig if the architecture
+# wants to enable HugeTLB/dev_dax vmemmap optimization.
+#
+config ARCH_WANT_OPTIMIZE_VMEMMAP
+ bool
config HAVE_MEMBLOCK_PHYS_MAP
bool
@@ -142,6 +510,13 @@ config NUMA_KEEP_MEMINFO
config MEMORY_ISOLATION
bool
+# IORESOURCE_SYSTEM_RAM regions in the kernel resource tree that are marked
+# IORESOURCE_EXCLUSIVE cannot be mapped to user space, for example, via
+# /dev/mem.
+config EXCLUSIVE_SYSTEM_RAM
+ def_bool y
+ depends on !DEVMEM || STRICT_DEVMEM
+
#
# Only be set on architectures that have completely implemented memory hotplug
# feature. If you are not sure, don't touch it.
@@ -149,17 +524,22 @@ config MEMORY_ISOLATION
config HAVE_BOOTMEM_INFO_NODE
def_bool n
+config ARCH_ENABLE_MEMORY_HOTPLUG
+ bool
+
+config ARCH_ENABLE_MEMORY_HOTREMOVE
+ bool
+
# eventually, we can have this option just 'select SPARSEMEM'
-config MEMORY_HOTPLUG
- bool "Allow for memory hot-add"
- depends on SPARSEMEM || X86_64_ACPI_NUMA
+menuconfig MEMORY_HOTPLUG
+ bool "Memory hotplug"
+ select MEMORY_ISOLATION
+ depends on SPARSEMEM
depends on ARCH_ENABLE_MEMORY_HOTPLUG
- depends on 64BIT || BROKEN
+ depends on 64BIT
select NUMA_KEEP_MEMINFO if NUMA
-config MEMORY_HOTPLUG_SPARSE
- def_bool y
- depends on SPARSEMEM && MEMORY_HOTPLUG
+if MEMORY_HOTPLUG
config MEMORY_HOTPLUG_DEFAULT_ONLINE
bool "Online the newly added memory blocks by default"
@@ -178,11 +558,17 @@ config MEMORY_HOTPLUG_DEFAULT_ONLINE
config MEMORY_HOTREMOVE
bool "Allow for memory hot remove"
- select MEMORY_ISOLATION
select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64)
depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
depends on MIGRATION
+config MHP_MEMMAP_ON_MEMORY
+ def_bool y
+ depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP
+ depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
+
+endif # MEMORY_HOTPLUG
+
# Heavily threaded applications may benefit from splitting the mm-wide
# page_table_lock, so that faults on different parts of the user address
# space can be handled with less contention: split it at this NR_CPUS.
@@ -242,6 +628,12 @@ config COMPACTION
it and then we would be really interested to hear about that at
linux-mm@kvack.org.
+config COMPACT_UNEVICTABLE_DEFAULT
+ int
+ depends on COMPACTION
+ default 0 if PREEMPT_RT
+ default 1
+
#
# support for free page reporting
config PAGE_REPORTING
@@ -268,12 +660,25 @@ config MIGRATION
pages as migration can relocate pages to satisfy a huge page
allocation instead of reclaiming.
+config DEVICE_MIGRATION
+ def_bool MIGRATION && ZONE_DEVICE
+
config ARCH_ENABLE_HUGEPAGE_MIGRATION
bool
config ARCH_ENABLE_THP_MIGRATION
bool
+config HUGETLB_PAGE_SIZE_VARIABLE
+ def_bool n
+ help
+ Allows the pageblock_order value to be dynamic instead of just standard
+ HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available
+ on a platform.
+
+ Note that the pageblock_order cannot exceed MAX_ORDER and will be
+ clamped down to MAX_ORDER.
+
config CONTIG_ALLOC
def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
@@ -283,24 +688,14 @@ config PHYS_ADDR_T_64BIT
config BOUNCE
bool "Enable bounce buffers"
default y
- depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
- help
- Enable bounce buffers for devices that cannot access
- the full range of memory available to the CPU. Enabled
- by default when ZONE_DMA or HIGHMEM is selected, but you
- may say n to override this.
-
-config VIRT_TO_BUS
- bool
+ depends on BLOCK && MMU && HIGHMEM
help
- An architecture should select this if it implements the
- deprecated interface virt_to_bus(). All new architectures
- should probably not select this.
-
+ Enable bounce buffers for devices that cannot access the full range of
+ memory available to the CPU. Enabled by default when HIGHMEM is
+ selected, but you may say n to override this.
config MMU_NOTIFIER
bool
- select SRCU
select INTERVAL_TREE
config KSM
@@ -314,7 +709,7 @@ config KSM
the many instances by a single page with that content, so
saving memory until one or another app needs to modify the content.
Recommended for use with KVM, or with other duplicative applications.
- See Documentation/vm/ksm.rst for more information: KSM is inactive
+ See Documentation/mm/ksm.rst for more information: KSM is inactive
until a program has madvised that an area is MADV_MERGEABLE, and
root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
@@ -385,9 +780,15 @@ config NOMMU_INITIAL_TRIM_EXCESS
See Documentation/admin-guide/mm/nommu-mmap.rst for more information.
-config TRANSPARENT_HUGEPAGE
+config ARCH_WANT_GENERAL_HUGETLB
+ bool
+
+config ARCH_WANTS_THP_SWAP
+ def_bool n
+
+menuconfig TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
- depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT
select COMPACTION
select XARRAY_MULTI
help
@@ -400,6 +801,8 @@ config TRANSPARENT_HUGEPAGE
If memory constrained on embedded, you may want to say N.
+if TRANSPARENT_HUGEPAGE
+
choice
prompt "Transparent Hugepage Support sysfs defaults"
depends on TRANSPARENT_HUGEPAGE
@@ -424,12 +827,9 @@ choice
benefit.
endchoice
-config ARCH_WANTS_THP_SWAP
- def_bool n
-
config THP_SWAP
def_bool y
- depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP
+ depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP && 64BIT
help
Swap transparent huge pages in one piece, without splitting.
XXX: For now, swap cluster backing transparent huge page
@@ -437,51 +837,41 @@ config THP_SWAP
For selection by architectures with reasonable THP sizes.
+config READ_ONLY_THP_FOR_FS
+ bool "Read-only THP for filesystems (EXPERIMENTAL)"
+ depends on TRANSPARENT_HUGEPAGE && SHMEM
+
+ help
+ Allow khugepaged to put read-only file-backed pages in THP.
+
+ This is marked experimental because it is a new feature. Write
+ support of file THPs will be developed in the next few release
+ cycles.
+
+endif # TRANSPARENT_HUGEPAGE
+
#
# UP and nommu archs use km based percpu allocator
#
config NEED_PER_CPU_KM
- depends on !SMP
+ depends on !SMP || !MMU
bool
default y
-config CLEANCACHE
- bool "Enable cleancache driver to cache clean pages if tmem is present"
- help
- Cleancache can be thought of as a page-granularity victim cache
- for clean pages that the kernel's pageframe replacement algorithm
- (PFRA) would like to keep around, but can't since there isn't enough
- memory. So when the PFRA "evicts" a page, it first attempts to use
- cleancache code to put the data contained in that page into
- "transcendent memory", memory that is not directly accessible or
- addressable by the kernel and is of unknown and possibly
- time-varying size. And when a cleancache-enabled
- filesystem wishes to access a page in a file on disk, it first
- checks cleancache to see if it already contains it; if it does,
- the page is copied into the kernel and a disk access is avoided.
- When a transcendent memory driver is available (such as zcache or
- Xen transcendent memory), a significant I/O reduction
- may be achieved. When none is available, all cleancache calls
- are reduced to a single pointer-compare-against-NULL resulting
- in a negligible performance hit.
-
- If unsure, say Y to enable cleancache
+config NEED_PER_CPU_EMBED_FIRST_CHUNK
+ bool
-config FRONTSWAP
- bool "Enable frontswap to cache swap pages if tmem is present"
- depends on SWAP
- help
- Frontswap is so named because it can be thought of as the opposite
- of a "backing" store for a swap device. The data is stored into
- "transcendent memory", memory that is not directly accessible or
- addressable by the kernel and is of unknown and possibly
- time-varying size. When space in transcendent memory is available,
- a significant swap I/O reduction may be achieved. When none is
- available, all frontswap calls are reduced to a single pointer-
- compare-against-NULL resulting in a negligible performance hit
- and swap data is stored as normal on the matching swap device.
+config NEED_PER_CPU_PAGE_FIRST_CHUNK
+ bool
- If unsure, say Y to enable frontswap.
+config USE_PERCPU_NUMA_NODE_ID
+ bool
+
+config HAVE_SETUP_PER_CPU_AREA
+ bool
+
+config FRONTSWAP
+ bool
config CMA
bool "Contiguous Memory Allocator"
@@ -513,6 +903,13 @@ config CMA_DEBUGFS
help
Turns on the DebugFS interface for CMA.
+config CMA_SYSFS
+ bool "CMA information through sysfs interface"
+ depends on CMA && SYSFS
+ help
+ This option exposes some sysfs attributes to get information
+ from CMA.
+
config CMA_AREAS
int "Maximum count of the CMA areas"
depends on CMA
@@ -537,215 +934,20 @@ config MEM_SOFT_DIRTY
See Documentation/admin-guide/mm/soft-dirty.rst for more details.
-config ZSWAP
- bool "Compressed cache for swap pages (EXPERIMENTAL)"
- depends on FRONTSWAP && CRYPTO=y
- select ZPOOL
- help
- A lightweight compressed cache for swap pages. It takes
- pages that are in the process of being swapped out and attempts to
- compress them into a dynamically allocated RAM-based memory pool.
- This can result in a significant I/O reduction on swap device and,
- in the case where decompressing from RAM is faster that swap device
- reads, can also improve workload performance.
-
- This is marked experimental because it is a new feature (as of
- v3.11) that interacts heavily with memory reclaim. While these
- interactions don't cause any known issues on simple memory setups,
- they have not be fully explored on the large set of potential
- configurations and workloads that exist.
-
-choice
- prompt "Compressed cache for swap pages default compressor"
- depends on ZSWAP
- default ZSWAP_COMPRESSOR_DEFAULT_LZO
- help
- Selects the default compression algorithm for the compressed cache
- for swap pages.
-
- For an overview what kind of performance can be expected from
- a particular compression algorithm please refer to the benchmarks
- available at the following LWN page:
- https://lwn.net/Articles/751795/
-
- If in doubt, select 'LZO'.
-
- The selection made here can be overridden by using the kernel
- command line 'zswap.compressor=' option.
-
-config ZSWAP_COMPRESSOR_DEFAULT_DEFLATE
- bool "Deflate"
- select CRYPTO_DEFLATE
- help
- Use the Deflate algorithm as the default compression algorithm.
-
-config ZSWAP_COMPRESSOR_DEFAULT_LZO
- bool "LZO"
- select CRYPTO_LZO
- help
- Use the LZO algorithm as the default compression algorithm.
-
-config ZSWAP_COMPRESSOR_DEFAULT_842
- bool "842"
- select CRYPTO_842
- help
- Use the 842 algorithm as the default compression algorithm.
-
-config ZSWAP_COMPRESSOR_DEFAULT_LZ4
- bool "LZ4"
- select CRYPTO_LZ4
- help
- Use the LZ4 algorithm as the default compression algorithm.
-
-config ZSWAP_COMPRESSOR_DEFAULT_LZ4HC
- bool "LZ4HC"
- select CRYPTO_LZ4HC
- help
- Use the LZ4HC algorithm as the default compression algorithm.
-
-config ZSWAP_COMPRESSOR_DEFAULT_ZSTD
- bool "zstd"
- select CRYPTO_ZSTD
- help
- Use the zstd algorithm as the default compression algorithm.
-endchoice
-
-config ZSWAP_COMPRESSOR_DEFAULT
- string
- depends on ZSWAP
- default "deflate" if ZSWAP_COMPRESSOR_DEFAULT_DEFLATE
- default "lzo" if ZSWAP_COMPRESSOR_DEFAULT_LZO
- default "842" if ZSWAP_COMPRESSOR_DEFAULT_842
- default "lz4" if ZSWAP_COMPRESSOR_DEFAULT_LZ4
- default "lz4hc" if ZSWAP_COMPRESSOR_DEFAULT_LZ4HC
- default "zstd" if ZSWAP_COMPRESSOR_DEFAULT_ZSTD
- default ""
-
-choice
- prompt "Compressed cache for swap pages default allocator"
- depends on ZSWAP
- default ZSWAP_ZPOOL_DEFAULT_ZBUD
- help
- Selects the default allocator for the compressed cache for
- swap pages.
- The default is 'zbud' for compatibility, however please do
- read the description of each of the allocators below before
- making a right choice.
-
- The selection made here can be overridden by using the kernel
- command line 'zswap.zpool=' option.
-
-config ZSWAP_ZPOOL_DEFAULT_ZBUD
- bool "zbud"
- select ZBUD
- help
- Use the zbud allocator as the default allocator.
-
-config ZSWAP_ZPOOL_DEFAULT_Z3FOLD
- bool "z3fold"
- select Z3FOLD
- help
- Use the z3fold allocator as the default allocator.
-
-config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
- bool "zsmalloc"
- select ZSMALLOC
- help
- Use the zsmalloc allocator as the default allocator.
-endchoice
-
-config ZSWAP_ZPOOL_DEFAULT
- string
- depends on ZSWAP
- default "zbud" if ZSWAP_ZPOOL_DEFAULT_ZBUD
- default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD
- default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
- default ""
-
-config ZSWAP_DEFAULT_ON
- bool "Enable the compressed cache for swap pages by default"
- depends on ZSWAP
- help
- If selected, the compressed cache for swap pages will be enabled
- at boot, otherwise it will be disabled.
-
- The selection made here can be overridden by using the kernel
- command line 'zswap.enabled=' option.
-
-config ZPOOL
- tristate "Common API for compressed memory storage"
- help
- Compressed memory storage API. This allows using either zbud or
- zsmalloc.
-
-config ZBUD
- tristate "Low (Up to 2x) density storage for compressed pages"
- help
- A special purpose allocator for storing compressed pages.
- It is designed to store up to two compressed pages per physical
- page. While this design limits storage density, it has simple and
- deterministic reclaim properties that make it preferable to a higher
- density approach when reclaim will be used.
-
-config Z3FOLD
- tristate "Up to 3x density storage for compressed pages"
- depends on ZPOOL
- help
- A special purpose allocator for storing compressed pages.
- It is designed to store up to three compressed pages per physical
- page. It is a ZBUD derivative so the simplicity and determinism are
- still there.
-
-config ZSMALLOC
- tristate "Memory allocator for compressed pages"
- depends on MMU
- help
- zsmalloc is a slab-based memory allocator designed to store
- compressed RAM pages. zsmalloc uses virtual memory mapping
- in order to reduce fragmentation. However, this results in a
- non-standard allocator interface where a handle, not a pointer, is
- returned by an alloc(). This handle must be mapped in order to
- access the allocated space.
-
-config ZSMALLOC_PGTABLE_MAPPING
- bool "Use page table mapping to access object in zsmalloc"
- depends on ZSMALLOC=y
- help
- By default, zsmalloc uses a copy-based object mapping method to
- access allocations that span two pages. However, if a particular
- architecture (ex, ARM) performs VM mapping faster than copying,
- then you should select this. This causes zsmalloc to use page table
- mapping rather than copying for object mapping.
-
- You can check speed with zsmalloc benchmark:
- https://github.com/spartacus06/zsmapbench
-
-config ZSMALLOC_STAT
- bool "Export zsmalloc statistics"
- depends on ZSMALLOC
- select DEBUG_FS
- help
- This option enables code in the zsmalloc to collect various
- statistics about whats happening in zsmalloc and exports that
- information to userspace via debugfs.
- If unsure, say N.
-
config GENERIC_EARLY_IOREMAP
bool
-config MAX_STACK_SIZE_MB
- int "Maximum user stack size for 32-bit processes (MB)"
- default 80
+config STACK_MAX_DEFAULT_SIZE_MB
+ int "Default maximum user stack size for 32-bit processes (MB)"
+ default 100
range 8 2048
depends on STACK_GROWSUP && (!64BIT || COMPAT)
help
This is the maximum stack size in Megabytes in the VM layout of 32-bit
user processes when the stack grows upwards (currently only on parisc
- arch). The stack will be located at the highest memory address minus
- the given value, unless the RLIMIT_STACK hard limit is changed to a
- smaller value in which case that is used.
+ arch) when the RLIMIT_STACK hard limit is unlimited.
- A sane initial value is 80 MB.
+ A sane initial value is 100 MB.
config DEFERRED_STRUCT_PAGE_INIT
bool "Defer initialisation of struct pages to kthreads"
@@ -762,10 +964,18 @@ config DEFERRED_STRUCT_PAGE_INIT
lifetime of the system until these kthreads finish the
initialisation.
+config PAGE_IDLE_FLAG
+ bool
+ select PAGE_EXTENSION if !64BIT
+ help
+ This adds PG_idle and PG_young flags to 'struct page'. PTE Accessed
+ bit writers can set the state of the bit in the flags so that PTE
+ Accessed bit readers may avoid disturbance.
+
config IDLE_PAGE_TRACKING
bool "Enable idle page tracking"
depends on SYSFS && MMU
- select PAGE_EXTENSION if !64BIT
+ select PAGE_IDLE_FLAG
help
This feature allows to estimate the amount of user pages that have
not been touched during a given period of time. This information can
@@ -775,9 +985,33 @@ config IDLE_PAGE_TRACKING
See Documentation/admin-guide/mm/idle_page_tracking.rst for
more details.
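For illustration only: a minimal C sketch against the interface this option provides, /sys/kernel/mm/page_idle/bitmap (one bit per page frame, accessed in 8-byte words, root only). The choice of page frames 0..63 is arbitrary; real tools walk /proc/pid/pagemap to find the PFNs of interest:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
        if (fd < 0)
            return 1;   /* CONFIG_IDLE_PAGE_TRACKING disabled */

        uint64_t word = ~0ULL;              /* mark PFNs 0..63 idle */
        if (pwrite(fd, &word, sizeof(word), 0) != sizeof(word))
            perror("pwrite");

        /* ... let the workload run, then re-read the same word ... */
        if (pread(fd, &word, sizeof(word), 0) == sizeof(word))
            printf("still-idle mask: %#llx\n", (unsigned long long)word);
        close(fd);
        return 0;
    }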
+config ARCH_HAS_CACHE_LINE_SIZE
+ bool
+
+config ARCH_HAS_CURRENT_STACK_POINTER
+ bool
+ help
+ In support of HARDENED_USERCOPY performing stack variable lifetime
+ checking, an architecture-agnostic way to find the stack pointer
+ is needed. Once an architecture defines an unsigned long global
+ register alias named "current_stack_pointer", this config can be
+ selected.
+
config ARCH_HAS_PTE_DEVMAP
bool
+config ARCH_HAS_ZONE_DMA_SET
+ bool
+
+config ZONE_DMA
+ bool "Support DMA zone" if ARCH_HAS_ZONE_DMA_SET
+ default y if ARM64 || X86
+
+config ZONE_DMA32
+ bool "Support DMA32 zone" if ARCH_HAS_ZONE_DMA_SET
+ depends on !X86_32
+ default y if ARM64
+
config ZONE_DEVICE
bool "Device memory (pmem, HMM, etc...) hotplug support"
depends on MEMORY_HOTPLUG
@@ -795,9 +1029,6 @@ config ZONE_DEVICE
If FS_DAX is enabled, then say Y.
-config DEV_PAGEMAP_OPS
- bool
-
#
# Helpers to mirror range of the CPU page tables of a process into device page
# tables.
@@ -806,17 +1037,21 @@ config HMM_MIRROR
bool
depends on MMU
+config GET_FREE_REGION
+ depends on SPARSEMEM
+ bool
+
config DEVICE_PRIVATE
bool "Unaddressable device memory (GPU memory, ...)"
depends on ZONE_DEVICE
- select DEV_PAGEMAP_OPS
+ select GET_FREE_REGION
help
Allows creation of struct pages to represent unaddressable device
memory; i.e., memory that is only accessible from the device (or
group of devices). You likely also want to select HMM_MIRROR.
-config FRAME_VECTOR
+config VMAP_PFN
bool
config ARCH_USES_HIGH_VMA_FLAGS
@@ -824,6 +1059,23 @@ config ARCH_USES_HIGH_VMA_FLAGS
config ARCH_HAS_PKEYS
bool
+config ARCH_USES_PG_ARCH_X
+ bool
+ help
+ Enable the definition of PG_arch_x page flags with x > 1. Only
+ suitable for 64-bit architectures with CONFIG_FLATMEM or
+ CONFIG_SPARSEMEM_VMEMMAP enabled, otherwise there may not be
+ enough room for additional bits in page->flags.
+
+config VM_EVENT_COUNTERS
+ default y
+ bool "Enable VM event counters for /proc/vmstat" if EXPERT
+ help
+ VM event counters are needed for event counts to be shown.
+ This option allows the disabling of the VM event counters
+ on EXPERT systems. /proc/vmstat will only show page counts
+ if VM event counters are disabled.
+
config PERCPU_STATS
bool "Collect percpu memory statistics"
help
@@ -831,27 +1083,40 @@ config PERCPU_STATS
information includes global and per chunk statistics, which can
be used to help understand percpu memory usage.
-config GUP_BENCHMARK
- bool "Enable infrastructure for get_user_pages() and related calls benchmarking"
+config GUP_TEST
+ bool "Enable infrastructure for get_user_pages()-related unit tests"
+ depends on DEBUG_FS
help
- Provides /sys/kernel/debug/gup_benchmark that helps with testing
- performance of get_user_pages() and related calls.
+ Provides /sys/kernel/debug/gup_test, which in turn provides a way
+ to make ioctl calls that can launch kernel-based unit tests for
+ the get_user_pages*() and pin_user_pages*() family of API calls.
- See tools/testing/selftests/vm/gup_benchmark.c
+ These tests include benchmark testing of the _fast variants of
+ get_user_pages*() and pin_user_pages*(), as well as smoke tests of
+ the non-_fast variants.
-config GUP_GET_PTE_LOW_HIGH
- bool
+ There is also a sub-test that allows running dump_page() on any
+ of up to eight pages (selected by command line args) within the
+ range of user-space addresses. These pages are either pinned via
+ pin_user_pages*(), or pinned via get_user_pages*(), as specified
+ by other command line arguments.
-config READ_ONLY_THP_FOR_FS
- bool "Read-only THP for filesystems (EXPERIMENTAL)"
- depends on TRANSPARENT_HUGEPAGE && SHMEM
+ See tools/testing/selftests/mm/gup_test.c
- help
- Allow khugepaged to put read-only file-backed pages in THP.
+comment "GUP_TEST needs to have DEBUG_FS enabled"
+ depends on !GUP_TEST && !DEBUG_FS
- This is marked experimental because it is a new feature. Write
- support of file THPs will be developed in the next few release
- cycles.
+config GUP_GET_PXX_LOW_HIGH
+ bool
+
+config DMAPOOL_TEST
+ tristate "Enable a module to run time tests on dma_pool"
+ depends on HAS_DMA
+ help
+ Provides a test module that will allocate and free many blocks of
+ various sizes and report how long it takes. This is intended to
+ provide a consistent way to measure how changes to the
+ dma_pool_alloc/free routines affect performance.
config ARCH_HAS_PTE_SPECIAL
bool
@@ -869,4 +1134,108 @@ config ARCH_HAS_HUGEPD
config MAPPING_DIRTY_HELPERS
bool
+config KMAP_LOCAL
+ bool
+
+config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY
+ bool
+
+# struct io_mapping based helper. Selected by drivers that need them
+config IO_MAPPING
+ bool
+
+config SECRETMEM
+ default y
+ bool "Enable memfd_secret() system call" if EXPERT
+ depends on ARCH_HAS_SET_DIRECT_MAP
+ help
+ Enable the memfd_secret() system call with the ability to create
+ memory areas visible only in the context of the owning process and
+ not mapped to other processes and other kernel page tables.
+
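+
For illustration only: a minimal C sketch using memfd_secret(). It assumes __NR_memfd_secret is exposed by <sys/syscall.h>, which is the case on reasonably recent libc headers:

    #define _GNU_SOURCE
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = syscall(__NR_memfd_secret, 0);
        if (fd < 0)
            return 1;   /* CONFIG_SECRETMEM off or secretmem disabled at boot */
        if (ftruncate(fd, 4096) < 0)
            return 1;
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
            return 1;
        /* This page is not visible through the kernel direct map while mapped here. */
        strcpy(p, "secret");
        munmap(p, 4096);
        close(fd);
        return 0;
    }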
+config ANON_VMA_NAME
+ bool "Anonymous VMA name support"
+ depends on PROC_FS && ADVISE_SYSCALLS && MMU
+
+ help
+ Allow naming anonymous virtual memory areas.
+
+ This feature allows assigning names to virtual memory areas. Assigned
+ names can be later retrieved from /proc/pid/maps and /proc/pid/smaps
+ and help identify individual anonymous memory areas.
+ Assigning a name to an anonymous virtual memory area might prevent that
+ area from being merged with adjacent virtual memory areas due to the
+ difference in their name.
+
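+
For illustration only: a minimal C sketch that names an anonymous mapping through prctl(). The PR_SET_VMA constants match include/uapi/linux/prctl.h and are defined locally in case older userspace headers lack them:

    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/prctl.h>

    #ifndef PR_SET_VMA
    #define PR_SET_VMA              0x53564d41
    #define PR_SET_VMA_ANON_NAME    0
    #endif

    int main(void)
    {
        size_t len = 1 << 20;
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
            return 1;
        /* /proc/self/maps then shows this VMA as "[anon:my pool]". */
        if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
                  (unsigned long)p, len, (unsigned long)"my pool"))
            perror("prctl");    /* EINVAL when CONFIG_ANON_VMA_NAME is off */
        return 0;
    }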
+config USERFAULTFD
+ bool "Enable userfaultfd() system call"
+ depends on MMU
+ help
+ Enable the userfaultfd() system call that allows userspace to
+ intercept and handle page faults in userland.
+
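+
For illustration only: a minimal C sketch of the userfaultfd() API handshake. A real fault handler would continue with UFFDIO_REGISTER on a memory range and service events from a separate thread:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/userfaultfd.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
        int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
        if (uffd < 0)
            return 1;   /* CONFIG_USERFAULTFD off or not permitted */

        struct uffdio_api api = { .api = UFFD_API, .features = 0 };
        if (ioctl(uffd, UFFDIO_API, &api) < 0)
            return 1;
        /* api.ioctls now advertises the supported UFFDIO_* operations. */
        close(uffd);
        return 0;
    }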
+config HAVE_ARCH_USERFAULTFD_WP
+ bool
+ help
+ Arch has userfaultfd write protection support
+
+config HAVE_ARCH_USERFAULTFD_MINOR
+ bool
+ help
+ Arch has userfaultfd minor fault support
+
+config PTE_MARKER_UFFD_WP
+ bool "Userfaultfd write protection support for shmem/hugetlbfs"
+ default y
+ depends on HAVE_ARCH_USERFAULTFD_WP
+
+ help
+ Allows creation of marker PTEs for userfaultfd write protection
+ purposes. It is required to enable userfaultfd write protection on
+ file-backed memory types like shmem and hugetlbfs.
+
+# multi-gen LRU {
+config LRU_GEN
+ bool "Multi-Gen LRU"
+ depends on MMU
+ # make sure folio->flags has enough spare bits
+ depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
+ help
+ A high performance LRU implementation to overcommit memory. See
+ Documentation/admin-guide/mm/multigen_lru.rst for details.
+
+config LRU_GEN_ENABLED
+ bool "Enable by default"
+ depends on LRU_GEN
+ help
+ This option enables the multi-gen LRU by default.
+
+config LRU_GEN_STATS
+ bool "Full stats for debugging"
+ depends on LRU_GEN
+ help
+ Do not enable this option unless you plan to look at historical stats
+ from evicted generations for debugging purposes.
+
+ This option has a per-memcg and per-node memory overhead.
+# }
+
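+
For illustration only: on a kernel built with LRU_GEN but without LRU_GEN_ENABLED, the feature can be switched on at runtime through the documented sysfs knob. A minimal C sketch (root only):

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/sys/kernel/mm/lru_gen/enabled", O_WRONLY);
        if (fd < 0)
            return 1;   /* kernel built without CONFIG_LRU_GEN */
        /* "y" enables all components; "n" disables them again. */
        if (write(fd, "y", 1) != 1)
            return 1;
        close(fd);
        return 0;
    }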
+config ARCH_SUPPORTS_PER_VMA_LOCK
+ def_bool n
+
+config PER_VMA_LOCK
+ def_bool y
+ depends on ARCH_SUPPORTS_PER_VMA_LOCK && MMU && SMP
+ help
+ Allow per-vma locking during page fault handling.
+
+ This feature allows locking each virtual memory area separately when
+ handling page faults instead of taking mmap_lock.
+
+config LOCK_MM_AND_FIND_VMA
+ bool
+ depends on !STACK_GROWSUP
+
+source "mm/damon/Kconfig"
+
endmenu
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 864f129f1937..018a5bd2f576 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -45,6 +45,39 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT
Enable debug page memory allocations by default? This value
can be overridden by debug_pagealloc=off|on.
+config DEBUG_SLAB
+ bool "Debug slab memory allocations"
+ depends on DEBUG_KERNEL && SLAB
+ help
+ Say Y here to have the kernel do limited verification on memory
+ allocation as well as poisoning memory on free to catch use of freed
+ memory. This can make kmalloc/kfree-intensive workloads much slower.
+
+config SLUB_DEBUG
+ default y
+ bool "Enable SLUB debugging support" if EXPERT
+ depends on SLUB && SYSFS && !SLUB_TINY
+ select STACKDEPOT if STACKTRACE_SUPPORT
+ help
+ SLUB has extensive debug support features. Disabling these can
+ result in significant savings in code size. While /sys/kernel/slab
+ will still exist (with SYSFS enabled), it will not provide e.g. cache
+ validation.
+
+config SLUB_DEBUG_ON
+ bool "SLUB debugging on by default"
+ depends on SLUB && SLUB_DEBUG
+ select STACKDEPOT_ALWAYS_INIT if STACKTRACE_SUPPORT
+ default n
+ help
+ Boot with debugging on by default. SLUB boots by default with
+ the runtime debug capabilities switched off. Enabling this is
+ equivalent to specifying the "slub_debug" parameter on boot.
+ There is no support for more fine-grained debug control as is
+ possible with slub_debug=xxx. SLUB debugging may be switched
+ off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying
+ "slub_debug=-".
+
config PAGE_OWNER
bool "Track page owner"
depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
@@ -57,14 +90,38 @@ config PAGE_OWNER
help to find bare alloc_page(s) leaks. Even if you include this
feature on your build, it is disabled in default. You should pass
"page_owner=on" to boot parameter in order to enable it. Eats
- a fair amount of memory if enabled. See tools/vm/page_owner_sort.c
+ a fair amount of memory if enabled. See tools/mm/page_owner_sort.c
for user-space helper.
If unsure, say N.
+config PAGE_TABLE_CHECK
+ bool "Check for invalid mappings in user page tables"
+ depends on ARCH_SUPPORTS_PAGE_TABLE_CHECK
+ depends on EXCLUSIVE_SYSTEM_RAM
+ select PAGE_EXTENSION
+ help
+ Check that an anonymous page is not being mapped twice with read-write
+ permissions. Check that anonymous and file pages are not being
+ erroneously shared. Since the checking is performed at the time
+ entries are added to and removed from user page tables, leaking,
+ corruption and double mapping problems are detected synchronously.
+
+ If unsure say "n".
+
+config PAGE_TABLE_CHECK_ENFORCED
+ bool "Enforce the page table checking by default"
+ depends on PAGE_TABLE_CHECK
+ help
+ Always enable page table checking. By default the page table checking
+ is disabled, and can be optionally enabled via page_table_check=on
+ kernel parameter. This config enforces that page table check is always
+ enabled.
+
+ If unsure say "n".
+
config PAGE_POISONING
bool "Poison pages after freeing"
- select PAGE_POISONING_NO_SANITY if HIBERNATION
help
Fill the pages with poison patterns after free_pages() and verify
the patterns before alloc_pages. The filling of the memory helps
@@ -75,30 +132,11 @@ config PAGE_POISONING
Note that "poison" here is not the same thing as the "HWPoison"
for CONFIG_MEMORY_FAILURE. This is software poisoning only.
- If unsure, say N
-
-config PAGE_POISONING_NO_SANITY
- depends on PAGE_POISONING
- bool "Only poison, don't sanity check"
- help
- Skip the sanity checking on alloc, only fill the pages with
- poison on free. This reduces some of the overhead of the
- poisoning feature.
-
- If you are only interested in sanitization, say Y. Otherwise
- say N.
+ If you are only interested in sanitization of freed pages without
+ checking the poison pattern on alloc, you can boot the kernel with
+ "init_on_free=1" instead of enabling this.
-config PAGE_POISONING_ZERO
- bool "Use zero for poisoning instead of debugging value"
- depends on PAGE_POISONING
- help
- Instead of using the existing poison value, fill the pages with
- zeros. This makes it harder to detect when errors are occurring
- due to sanitization but the zeroing at free means that it is
- no longer necessary to write zeros when GFP_ZERO is used on
- allocation.
-
- If unsure, say N
+ If unsure, say N
config DEBUG_PAGE_REF
bool "Enable tracepoint to track down page reference manipulation"
@@ -170,3 +208,79 @@ config PTDUMP_DEBUGFS
kernel.
If in doubt, say N.
+
+config HAVE_DEBUG_KMEMLEAK
+ bool
+
+config DEBUG_KMEMLEAK
+ bool "Kernel memory leak detector"
+ depends on DEBUG_KERNEL && HAVE_DEBUG_KMEMLEAK
+ select DEBUG_FS
+ select STACKTRACE if STACKTRACE_SUPPORT
+ select KALLSYMS
+ select CRC32
+ select STACKDEPOT
+ select STACKDEPOT_ALWAYS_INIT if !DEBUG_KMEMLEAK_DEFAULT_OFF
+ help
+ Say Y here if you want to enable the memory leak
+ detector. The memory allocation/freeing is traced in a way
+ similar to Boehm's conservative garbage collector, the
+ difference being that the orphan objects are not freed but
+ only shown in /sys/kernel/debug/kmemleak. Enabling this
+ feature will introduce an overhead to memory
+ allocations. See Documentation/dev-tools/kmemleak.rst for more
+ details.
+
+ Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances
+ of finding leaks due to slab object poisoning.
+
+ In order to access the kmemleak file, debugfs needs to be
+ mounted (usually at /sys/kernel/debug).
+
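+
For illustration only: a minimal C sketch that kicks an on-demand kmemleak scan and dumps the report through the debugfs file mentioned above (debugfs mounted at /sys/kernel/debug, root only):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/sys/kernel/debug/kmemleak", O_RDWR);
        if (fd < 0)
            return 1;   /* CONFIG_DEBUG_KMEMLEAK off or kmemleak disabled */
        if (write(fd, "scan", 4) != 4)  /* trigger a scan now */
            perror("write");

        char buf[4096];
        ssize_t n;
        while ((n = read(fd, buf, sizeof(buf))) > 0)
            fwrite(buf, 1, n, stdout); /* unreferenced-object reports, if any */
        close(fd);
        return 0;
    }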
+config DEBUG_KMEMLEAK_MEM_POOL_SIZE
+ int "Kmemleak memory pool size"
+ depends on DEBUG_KMEMLEAK
+ range 200 1000000
+ default 16000
+ help
+ Kmemleak must track all the memory allocations to avoid
+ reporting false positives. Since memory may be allocated or
+ freed before kmemleak is fully initialised, use a static pool
+ of metadata objects to track such callbacks. After kmemleak is
+ fully initialised, this memory pool acts as an emergency one
+ if slab allocations fail.
+
+config DEBUG_KMEMLEAK_DEFAULT_OFF
+ bool "Default kmemleak to off"
+ depends on DEBUG_KMEMLEAK
+ help
+ Say Y here to disable kmemleak by default. It can then be enabled
+ on the command line via kmemleak=on.
+
+config DEBUG_KMEMLEAK_AUTO_SCAN
+ bool "Enable kmemleak auto scan thread on boot up"
+ default y
+ depends on DEBUG_KMEMLEAK
+ help
+ Depending on the cpu, kmemleak scan may be cpu intensive and can
+ stall user tasks at times. This option enables/disables automatic
+ kmemleak scan at boot up.
+
+ Say N here to disable kmemleak auto scan thread to stop automatic
+ scanning. Disabling this option disables automatic reporting of
+ memory leaks.
+
+ If unsure, say Y.
+
+config PER_VMA_LOCK_STATS
+ bool "Statistics for per-vma locks"
+ depends on PER_VMA_LOCK
+ help
+ Say Y here to enable success, retry and failure counters of page
+ faults handled under protection of per-vma locks. When enabled, the
+ counters are exposed in /proc/vmstat. This information is useful for
+ kernel developers to evaluate effectiveness of per-vma locks and to
+ identify pathological cases. Counting these events introduces a small
+ overhead in the page fault path.
+
+ If in doubt, say N.
diff --git a/mm/Makefile b/mm/Makefile
index d73aed0fc99c..678530a07326 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,12 +15,13 @@ KCSAN_SANITIZE_slab_common.o := n
KCSAN_SANITIZE_slab.o := n
KCSAN_SANITIZE_slub.o := n
KCSAN_SANITIZE_page_alloc.o := n
+# But enable explicit instrumentation for memory barriers.
+KCSAN_INSTRUMENT_BARRIERS := y
# These files are disabled because they produce non-interesting and/or
# flaky coverage that is not a function of syscall inputs. E.g. slab is out of
# free pages, or a task is migrated between nodes.
KCOV_INSTRUMENT_slab_common.o := n
-KCOV_INSTRUMENT_slob.o := n
KCOV_INSTRUMENT_slab.o := n
KCOV_INSTRUMENT_slub.o := n
KCOV_INSTRUMENT_page_alloc.o := n
@@ -38,7 +39,7 @@ mmu-y := nommu.o
mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
msync.o page_vma_mapped.o pagewalk.o \
- pgtable-generic.o rmap.o vmalloc.o ioremap.o
+ pgtable-generic.o rmap.o vmalloc.o
ifdef CONFIG_CROSS_MEMORY_ATTACH
@@ -46,21 +47,25 @@ mmu-$(CONFIG_MMU) += process_vm_access.o
endif
obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
- maccess.o page-writeback.o \
+ maccess.o page-writeback.o folio-compat.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o percpu.o slab_common.o \
- compaction.o vmacache.o \
+ compaction.o show_mem.o\
interval_tree.o list_lru.o workingset.o \
- debug.o gup.o $(mmu-y)
+ debug.o gup.o mmap_lock.o $(mmu-y)
# Give 'page_alloc' its own module-parameter namespace
page-alloc-y := page_alloc.o
page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o
+# Give 'memory_hotplug' its own module-parameter namespace
+memory-hotplug-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
+
obj-y += page-alloc.o
obj-y += init-mm.o
obj-y += memblock.o
+obj-y += $(memory-hotplug-y)
ifdef CONFIG_MMU
obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o
@@ -71,33 +76,39 @@ obj-$(CONFIG_FRONTSWAP) += frontswap.o
obj-$(CONFIG_ZSWAP) += zswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
+obj-$(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) += hugetlb_vmemmap.o
obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
-obj-$(CONFIG_SLOB) += slob.o
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
obj-$(CONFIG_KSM) += ksm.o
obj-$(CONFIG_PAGE_POISONING) += page_poison.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_KASAN) += kasan/
+obj-$(CONFIG_KFENCE) += kfence/
+obj-$(CONFIG_KMSAN) += kmsan/
obj-$(CONFIG_FAILSLAB) += failslab.o
-obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
+obj-$(CONFIG_FAIL_PAGE_ALLOC) += fail_page_alloc.o
obj-$(CONFIG_MEMTEST) += memtest.o
obj-$(CONFIG_MIGRATION) += migrate.o
+obj-$(CONFIG_NUMA) += memory-tiers.o
+obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
-obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
+ifdef CONFIG_SWAP
+obj-$(CONFIG_MEMCG) += swap_cgroup.o
+endif
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
-obj-$(CONFIG_GUP_BENCHMARK) += gup_benchmark.o
+obj-$(CONFIG_GUP_TEST) += gup_test.o
+obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
-obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZPOOL) += zpool.o
obj-$(CONFIG_ZBUD) += zbud.o
@@ -107,11 +118,15 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
+obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o
obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
+obj-$(CONFIG_SECRETMEM) += secretmem.o
+obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
-obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
+obj-$(CONFIG_DEBUG_PAGEALLOC) += debug_page_alloc.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
+obj-$(CONFIG_DAMON) += damon/
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
obj-$(CONFIG_ZONE_DEVICE) += memremap.o
@@ -120,3 +135,7 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
+obj-$(CONFIG_IO_MAPPING) += io-mapping.o
+obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
+obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
+obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
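The 'page_alloc' and 'memory_hotplug' composite objects above exist so that module_param() definitions in those files get their own prefix on the kernel command line, because KBUILD_MODNAME is derived from the composite object name. A minimal sketch, not taken from this patch, of what such a parameter looks like; memmap_on_memory is used here because mm/memory_hotplug.c defines a knob of that name, which therefore appears as memory_hotplug.memmap_on_memory at boot:

#include <linux/moduleparam.h>

/*
 * Sketch only: a boolean built into the memory-hotplug composite object.
 * Because KBUILD_MODNAME is "memory_hotplug" for these files, the boot
 * parameter is spelled memory_hotplug.memmap_on_memory.
 */
static bool memmap_on_memory __read_mostly;
module_param(memmap_on_memory, bool, 0444);
MODULE_PARM_DESC(memmap_on_memory,
		 "Enable memmap on memory for memory hotplug");
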
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 408d5051d05b..3ffc3cfa7a14 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,13 +1,16 @@
// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/blkdev.h>
#include <linux/wait.h>
#include <linux/rbtree.h>
-#include <linux/backing-dev.h>
#include <linux/kthread.h>
+#include <linux/backing-dev.h>
+#include <linux/blk-cgroup.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
+#include <linux/sched/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
@@ -17,7 +20,6 @@
struct backing_dev_info noop_backing_dev_info;
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
-static struct class *bdi_class;
static const char *bdi_unknown_name = "(unknown)";
/*
@@ -32,6 +34,8 @@ LIST_HEAD(bdi_list);
/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>
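The K() helper hoisted to file scope here converts a page count to kibibytes; with 4 KiB pages (PAGE_SHIFT == 12) it reduces to a left shift by two. A small worked example, assuming a 4 KiB page size; the function name is illustrative only:

/* Worked example, assuming 4 KiB pages (PAGE_SHIFT == 12). */
static unsigned long pages_to_kb(unsigned long nr_pages)
{
	return K(nr_pages);	/* K(25) == 100: 25 pages are 100 kB */
}
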
@@ -69,7 +73,6 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
global_dirty_limits(&background_thresh, &dirty_thresh);
wb_thresh = wb_calc_thresh(wb, dirty_thresh);
-#define K(x) ((x) << (PAGE_SHIFT - 10))
seq_printf(m,
"BdiWriteback: %10lu kB\n"
"BdiReclaimable: %10lu kB\n"
@@ -98,7 +101,6 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
nr_more_io,
nr_dirty_time,
!list_empty(&bdi->bdi_list), bdi->wb.state);
-#undef K
return 0;
}
@@ -146,15 +148,13 @@ static ssize_t read_ahead_kb_store(struct device *dev,
return count;
}
-#define K(pages) ((pages) << (PAGE_SHIFT - 10))
-
#define BDI_SHOW(name, expr) \
static ssize_t name##_show(struct device *dev, \
- struct device_attribute *attr, char *page) \
+ struct device_attribute *attr, char *buf) \
{ \
struct backing_dev_info *bdi = dev_get_drvdata(dev); \
\
- return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \
+ return sysfs_emit(buf, "%lld\n", (long long)expr); \
} \
static DEVICE_ATTR_RW(name);
@@ -177,7 +177,26 @@ static ssize_t min_ratio_store(struct device *dev,
return ret;
}
-BDI_SHOW(min_ratio, bdi->min_ratio)
+BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE)
+
+static ssize_t min_ratio_fine_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ unsigned int ratio;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &ratio);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_min_ratio_no_scale(bdi, ratio);
+ if (!ret)
+ ret = count;
+
+ return ret;
+}
+BDI_SHOW(min_ratio_fine, bdi->min_ratio)
static ssize_t max_ratio_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
@@ -196,54 +215,161 @@ static ssize_t max_ratio_store(struct device *dev,
return ret;
}
-BDI_SHOW(max_ratio, bdi->max_ratio)
+BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE)
+
+static ssize_t max_ratio_fine_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ unsigned int ratio;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &ratio);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_max_ratio_no_scale(bdi, ratio);
+ if (!ret)
+ ret = count;
+
+ return ret;
+}
+BDI_SHOW(max_ratio_fine, bdi->max_ratio)
+
+static ssize_t min_bytes_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+
+ return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi));
+}
+
+static ssize_t min_bytes_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ u64 bytes;
+ ssize_t ret;
+
+ ret = kstrtoull(buf, 10, &bytes);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_min_bytes(bdi, bytes);
+ if (!ret)
+ ret = count;
+
+ return ret;
+}
+static DEVICE_ATTR_RW(min_bytes);
+
+static ssize_t max_bytes_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+
+ return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi));
+}
+
+static ssize_t max_bytes_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ u64 bytes;
+ ssize_t ret;
+
+ ret = kstrtoull(buf, 10, &bytes);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_max_bytes(bdi, bytes);
+ if (!ret)
+ ret = count;
+
+ return ret;
+}
+static DEVICE_ATTR_RW(max_bytes);
static ssize_t stable_pages_required_show(struct device *dev,
struct device_attribute *attr,
- char *page)
+ char *buf)
{
dev_warn_once(dev,
"the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
- return snprintf(page, PAGE_SIZE-1, "%d\n", 0);
+ return sysfs_emit(buf, "%d\n", 0);
}
static DEVICE_ATTR_RO(stable_pages_required);
+static ssize_t strict_limit_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ unsigned int strict_limit;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &strict_limit);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_strict_limit(bdi, strict_limit);
+ if (!ret)
+ ret = count;
+
+ return ret;
+}
+
+static ssize_t strict_limit_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+
+ return sysfs_emit(buf, "%d\n",
+ !!(bdi->capabilities & BDI_CAP_STRICTLIMIT));
+}
+static DEVICE_ATTR_RW(strict_limit);
+
static struct attribute *bdi_dev_attrs[] = {
&dev_attr_read_ahead_kb.attr,
&dev_attr_min_ratio.attr,
+ &dev_attr_min_ratio_fine.attr,
&dev_attr_max_ratio.attr,
+ &dev_attr_max_ratio_fine.attr,
+ &dev_attr_min_bytes.attr,
+ &dev_attr_max_bytes.attr,
&dev_attr_stable_pages_required.attr,
+ &dev_attr_strict_limit.attr,
NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);
+static const struct class bdi_class = {
+ .name = "bdi",
+ .dev_groups = bdi_dev_groups,
+};
+
static __init int bdi_class_init(void)
{
- bdi_class = class_create(THIS_MODULE, "bdi");
- if (IS_ERR(bdi_class))
- return PTR_ERR(bdi_class);
+ int ret;
+
+ ret = class_register(&bdi_class);
+ if (ret)
+ return ret;
- bdi_class->dev_groups = bdi_dev_groups;
bdi_debug_init();
return 0;
}
postcore_initcall(bdi_class_init);
-static int bdi_init(struct backing_dev_info *bdi);
-
static int __init default_bdi_init(void)
{
- int err;
-
bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
WQ_SYSFS, 0);
if (!bdi_wq)
return -ENOMEM;
-
- err = bdi_init(&noop_backing_dev_info);
-
- return err;
+ return 0;
}
subsys_initcall(default_bdi_init);
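For reference, the show side generated by the BDI_SHOW() macro: BDI_SHOW(min_ratio_fine, bdi->min_ratio) expands roughly to the function below, and DEVICE_ATTR_RW() then pairs it with the min_ratio_fine_store() defined just above it, which is why each store helper is written out before its BDI_SHOW() line. A sketch of the expansion, not literal preprocessor output:

static ssize_t min_ratio_fine_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);

	return sysfs_emit(buf, "%lld\n", (long long)bdi->min_ratio);
}
static DEVICE_ATTR_RW(min_ratio_fine);
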
@@ -266,10 +392,18 @@ void wb_wakeup_delayed(struct bdi_writeback *wb)
unsigned long timeout;
timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
- spin_lock_bh(&wb->work_lock);
+ spin_lock_irq(&wb->work_lock);
if (test_bit(WB_registered, &wb->state))
queue_delayed_work(bdi_wq, &wb->dwork, timeout);
- spin_unlock_bh(&wb->work_lock);
+ spin_unlock_irq(&wb->work_lock);
+}
+
+static void wb_update_bandwidth_workfn(struct work_struct *work)
+{
+ struct bdi_writeback *wb = container_of(to_delayed_work(work),
+ struct bdi_writeback, bw_dwork);
+
+ wb_update_bandwidth(wb);
}
/*
@@ -284,8 +418,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
memset(wb, 0, sizeof(*wb));
- if (wb != &bdi->wb)
- bdi_get(bdi);
wb->bdi = bdi;
wb->last_old_flush = jiffies;
INIT_LIST_HEAD(&wb->b_dirty);
@@ -294,6 +426,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->b_dirty_time);
spin_lock_init(&wb->list_lock);
+ atomic_set(&wb->writeback_inodes, 0);
wb->bw_time_stamp = jiffies;
wb->balanced_dirty_ratelimit = INIT_BW;
wb->dirty_ratelimit = INIT_BW;
@@ -303,11 +436,12 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
spin_lock_init(&wb->work_lock);
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
+ INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
wb->dirty_sleep = jiffies;
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
- goto out_put_bdi;
+ return err;
for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
err = percpu_counter_init(&wb->stat[i], 0, gfp);
@@ -321,9 +455,6 @@ out_destroy_stat:
while (i--)
percpu_counter_destroy(&wb->stat[i]);
fprop_local_destroy_percpu(&wb->completions);
-out_put_bdi:
- if (wb != &bdi->wb)
- bdi_put(bdi);
return err;
}
@@ -335,12 +466,12 @@ static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);
static void wb_shutdown(struct bdi_writeback *wb)
{
/* Make sure nobody queues further work */
- spin_lock_bh(&wb->work_lock);
+ spin_lock_irq(&wb->work_lock);
if (!test_and_clear_bit(WB_registered, &wb->state)) {
- spin_unlock_bh(&wb->work_lock);
+ spin_unlock_irq(&wb->work_lock);
return;
}
- spin_unlock_bh(&wb->work_lock);
+ spin_unlock_irq(&wb->work_lock);
cgwb_remove_from_bdi_list(wb);
/*
@@ -351,6 +482,7 @@ static void wb_shutdown(struct bdi_writeback *wb)
mod_delayed_work(bdi_wq, &wb->dwork, 0);
flush_delayed_work(&wb->dwork);
WARN_ON(!list_empty(&wb->work_list));
+ flush_delayed_work(&wb->bw_dwork);
}
static void wb_exit(struct bdi_writeback *wb)
@@ -363,8 +495,6 @@ static void wb_exit(struct bdi_writeback *wb)
percpu_counter_destroy(&wb->stat[i]);
fprop_local_destroy_percpu(&wb->completions);
- if (wb != &wb->bdi->wb)
- bdi_put(wb->bdi);
}
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -372,17 +502,30 @@ static void wb_exit(struct bdi_writeback *wb)
#include <linux/memcontrol.h>
/*
- * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, and memcg->cgwb_list.
- * bdi->cgwb_tree is also RCU protected.
+ * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and
+ * memcg->cgwb_list. bdi->cgwb_tree is also RCU protected.
*/
static DEFINE_SPINLOCK(cgwb_lock);
static struct workqueue_struct *cgwb_release_wq;
+static LIST_HEAD(offline_cgwbs);
+static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
+static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);
+
+static void cgwb_free_rcu(struct rcu_head *rcu_head)
+{
+ struct bdi_writeback *wb = container_of(rcu_head,
+ struct bdi_writeback, rcu);
+
+ percpu_ref_exit(&wb->refcnt);
+ kfree(wb);
+}
+
static void cgwb_release_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
release_work);
- struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css);
+ struct backing_dev_info *bdi = wb->bdi;
mutex_lock(&wb->bdi->cgwb_release_mutex);
wb_shutdown(wb);
@@ -392,12 +535,18 @@ static void cgwb_release_workfn(struct work_struct *work)
mutex_unlock(&wb->bdi->cgwb_release_mutex);
/* triggers blkg destruction if no online users left */
- blkcg_unpin_online(blkcg);
+ blkcg_unpin_online(wb->blkcg_css);
fprop_local_destroy_percpu(&wb->memcg_completions);
- percpu_ref_exit(&wb->refcnt);
+
+ spin_lock_irq(&cgwb_lock);
+ list_del(&wb->offline_node);
+ spin_unlock_irq(&cgwb_lock);
+
wb_exit(wb);
- kfree_rcu(wb, rcu);
+ bdi_put(bdi);
+ WARN_ON_ONCE(!list_empty(&wb->b_attached));
+ call_rcu(&wb->rcu, cgwb_free_rcu);
}
static void cgwb_release(struct percpu_ref *refcnt)
@@ -414,6 +563,7 @@ static void cgwb_kill(struct bdi_writeback *wb)
WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
list_del(&wb->memcg_node);
list_del(&wb->blkcg_node);
+ list_add(&wb->offline_node, &offline_cgwbs);
percpu_ref_kill(&wb->refcnt);
}
@@ -429,7 +579,6 @@ static int cgwb_create(struct backing_dev_info *bdi,
{
struct mem_cgroup *memcg;
struct cgroup_subsys_state *blkcg_css;
- struct blkcg *blkcg;
struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
struct bdi_writeback *wb;
unsigned long flags;
@@ -437,9 +586,8 @@ static int cgwb_create(struct backing_dev_info *bdi,
memcg = mem_cgroup_from_css(memcg_css);
blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
- blkcg = css_to_blkcg(blkcg_css);
memcg_cgwb_list = &memcg->cgwb_list;
- blkcg_cgwb_list = &blkcg->cgwb_list;
+ blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css);
/* look up again under lock and discard on blkcg mismatch */
spin_lock_irqsave(&cgwb_lock, flags);
@@ -473,8 +621,10 @@ static int cgwb_create(struct backing_dev_info *bdi,
wb->memcg_css = memcg_css;
wb->blkcg_css = blkcg_css;
+ INIT_LIST_HEAD(&wb->b_attached);
INIT_WORK(&wb->release_work, cgwb_release_workfn);
set_bit(WB_registered, &wb->state);
+ bdi_get(bdi);
/*
* The root wb determines the registered state of the whole bdi and
@@ -492,7 +642,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
list_add(&wb->memcg_node, memcg_cgwb_list);
list_add(&wb->blkcg_node, blkcg_cgwb_list);
- blkcg_pin_online(blkcg);
+ blkcg_pin_online(blkcg_css);
css_get(memcg_css);
css_get(blkcg_css);
}
@@ -506,6 +656,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
goto out_put;
err_fprop_exit:
+ bdi_put(bdi);
fprop_local_destroy_percpu(&wb->memcg_completions);
err_ref_exit:
percpu_ref_exit(&wb->refcnt);
@@ -580,7 +731,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
{
struct bdi_writeback *wb;
- might_sleep_if(gfpflags_allow_blocking(gfp));
+ might_alloc(gfp);
if (!memcg_css->parent)
return &bdi->wb;
@@ -634,6 +785,54 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
mutex_unlock(&bdi->cgwb_release_mutex);
}
+/*
+ * cleanup_offline_cgwbs_workfn - try to release dying cgwbs
+ *
+ * Try to release dying cgwbs by switching attached inodes to the nearest
+ * living ancestor's writeback. Processed wbs are placed at the end
+ * of the list to guarantee the forward progress.
+ */
+static void cleanup_offline_cgwbs_workfn(struct work_struct *work)
+{
+ struct bdi_writeback *wb;
+ LIST_HEAD(processed);
+
+ spin_lock_irq(&cgwb_lock);
+
+ while (!list_empty(&offline_cgwbs)) {
+ wb = list_first_entry(&offline_cgwbs, struct bdi_writeback,
+ offline_node);
+ list_move(&wb->offline_node, &processed);
+
+ /*
+ * If wb is dirty, cleaning up the writeback by switching
+ * attached inodes will result in an effective removal of any
+ * bandwidth restrictions, which isn't the goal. Instead,
+ * it can be postponed until the next run, by which time the I/O
+ * will most likely have completed. If some inodes are re-dirtied
+ * in the meantime, they will eventually be switched to a new
+ * cgwb.
+ */
+ if (wb_has_dirty_io(wb))
+ continue;
+
+ if (!wb_tryget(wb))
+ continue;
+
+ spin_unlock_irq(&cgwb_lock);
+ while (cleanup_offline_cgwb(wb))
+ cond_resched();
+ spin_lock_irq(&cgwb_lock);
+
+ wb_put(wb);
+ }
+
+ if (!list_empty(&processed))
+ list_splice_tail(&processed, &offline_cgwbs);
+
+ spin_unlock_irq(&cgwb_lock);
+}
+
/**
* wb_memcg_offline - kill all wb's associated with a memcg being offlined
* @memcg: memcg being offlined
@@ -650,22 +849,25 @@ void wb_memcg_offline(struct mem_cgroup *memcg)
cgwb_kill(wb);
memcg_cgwb_list->next = NULL; /* prevent new wb's */
spin_unlock_irq(&cgwb_lock);
+
+ queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work);
}
/**
* wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
- * @blkcg: blkcg being offlined
+ * @css: blkcg being offlined
*
* Also prevents creation of any new wb's associated with @blkcg.
*/
-void wb_blkcg_offline(struct blkcg *blkcg)
+void wb_blkcg_offline(struct cgroup_subsys_state *css)
{
struct bdi_writeback *wb, *next;
+ struct list_head *list = blkcg_get_cgwb_list(css);
spin_lock_irq(&cgwb_lock);
- list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
+ list_for_each_entry_safe(wb, next, list, blkcg_node)
cgwb_kill(wb);
- blkcg->cgwb_list.next = NULL; /* prevent new wb's */
+ list->next = NULL; /* prevent new wb's */
spin_unlock_irq(&cgwb_lock);
}
@@ -712,23 +914,19 @@ static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
#endif /* CONFIG_CGROUP_WRITEBACK */
-static int bdi_init(struct backing_dev_info *bdi)
+int bdi_init(struct backing_dev_info *bdi)
{
- int ret;
-
bdi->dev = NULL;
kref_init(&bdi->refcnt);
bdi->min_ratio = 0;
- bdi->max_ratio = 100;
+ bdi->max_ratio = 100 * BDI_RATIO_SCALE;
bdi->max_prop_frac = FPROP_FRAC_BASE;
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
- ret = cgwb_bdi_init(bdi);
-
- return ret;
+ return cgwb_bdi_init(bdi);
}
struct backing_dev_info *bdi_alloc(int node_id)
@@ -746,6 +944,7 @@ struct backing_dev_info *bdi_alloc(int node_id)
bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
bdi->ra_pages = VM_READAHEAD_PAGES;
bdi->io_pages = VM_READAHEAD_PAGES;
+ timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0);
return bdi;
}
EXPORT_SYMBOL(bdi_alloc);
@@ -807,7 +1006,7 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
return 0;
vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
- dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
+ dev = device_create(&bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
if (IS_ERR(dev))
return PTR_ERR(dev);
@@ -867,11 +1066,20 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
+ del_timer_sync(&bdi->laptop_mode_wb_timer);
+
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
wb_shutdown(&bdi->wb);
cgwb_bdi_unregister(bdi);
+ /*
+ * If this BDI's min ratio has been set, use bdi_set_min_ratio() to
+ * update the global bdi_min_ratio.
+ */
+ if (bdi->min_ratio)
+ bdi_set_min_ratio(bdi, 0);
+
if (bdi->dev) {
bdi_debug_unregister(bdi);
device_unregister(bdi->dev);
@@ -883,14 +1091,14 @@ void bdi_unregister(struct backing_dev_info *bdi)
bdi->owner = NULL;
}
}
+EXPORT_SYMBOL(bdi_unregister);
static void release_bdi(struct kref *ref)
{
struct backing_dev_info *bdi =
container_of(ref, struct backing_dev_info, refcnt);
- if (test_bit(WB_registered, &bdi->wb.state))
- bdi_unregister(bdi);
+ WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state));
WARN_ON_ONCE(bdi->dev);
wb_exit(&bdi->wb);
kfree(bdi);
@@ -902,115 +1110,26 @@ void bdi_put(struct backing_dev_info *bdi)
}
EXPORT_SYMBOL(bdi_put);
-const char *bdi_dev_name(struct backing_dev_info *bdi)
-{
- if (!bdi || !bdi->dev)
- return bdi_unknown_name;
- return bdi->dev_name;
-}
-EXPORT_SYMBOL_GPL(bdi_dev_name);
-
-static wait_queue_head_t congestion_wqh[2] = {
- __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
- __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
- };
-static atomic_t nr_wb_congested[2];
-
-void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
-{
- wait_queue_head_t *wqh = &congestion_wqh[sync];
- enum wb_congested_state bit;
-
- bit = sync ? WB_sync_congested : WB_async_congested;
- if (test_and_clear_bit(bit, &bdi->wb.congested))
- atomic_dec(&nr_wb_congested[sync]);
- smp_mb__after_atomic();
- if (waitqueue_active(wqh))
- wake_up(wqh);
-}
-EXPORT_SYMBOL(clear_bdi_congested);
-
-void set_bdi_congested(struct backing_dev_info *bdi, int sync)
-{
- enum wb_congested_state bit;
-
- bit = sync ? WB_sync_congested : WB_async_congested;
- if (!test_and_set_bit(bit, &bdi->wb.congested))
- atomic_inc(&nr_wb_congested[sync]);
-}
-EXPORT_SYMBOL(set_bdi_congested);
-
-/**
- * congestion_wait - wait for a backing_dev to become uncongested
- * @sync: SYNC or ASYNC IO
- * @timeout: timeout in jiffies
- *
- * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
- * write congestion. If no backing_devs are congested then just wait for the
- * next write to be completed.
- */
-long congestion_wait(int sync, long timeout)
+struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
- long ret;
- unsigned long start = jiffies;
- DEFINE_WAIT(wait);
- wait_queue_head_t *wqh = &congestion_wqh[sync];
-
- prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
- ret = io_schedule_timeout(timeout);
- finish_wait(wqh, &wait);
+ struct super_block *sb;
- trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
- jiffies_to_usecs(jiffies - start));
+ if (!inode)
+ return &noop_backing_dev_info;
- return ret;
+ sb = inode->i_sb;
+#ifdef CONFIG_BLOCK
+ if (sb_is_blkdev_sb(sb))
+ return I_BDEV(inode)->bd_disk->bdi;
+#endif
+ return sb->s_bdi;
}
-EXPORT_SYMBOL(congestion_wait);
+EXPORT_SYMBOL(inode_to_bdi);
-/**
- * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes
- * @sync: SYNC or ASYNC IO
- * @timeout: timeout in jiffies
- *
- * In the event of a congested backing_dev (any backing_dev) this waits
- * for up to @timeout jiffies for either a BDI to exit congestion of the
- * given @sync queue or a write to complete.
- *
- * The return value is 0 if the sleep is for the full timeout. Otherwise,
- * it is the number of jiffies that were still remaining when the function
- * returned. return_value == timeout implies the function did not sleep.
- */
-long wait_iff_congested(int sync, long timeout)
+const char *bdi_dev_name(struct backing_dev_info *bdi)
{
- long ret;
- unsigned long start = jiffies;
- DEFINE_WAIT(wait);
- wait_queue_head_t *wqh = &congestion_wqh[sync];
-
- /*
- * If there is no congestion, yield if necessary instead
- * of sleeping on the congestion queue
- */
- if (atomic_read(&nr_wb_congested[sync]) == 0) {
- cond_resched();
-
- /* In case we scheduled, work out time remaining */
- ret = timeout - (jiffies - start);
- if (ret < 0)
- ret = 0;
-
- goto out;
- }
-
- /* Sleep until uncongested or a write happens */
- prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
- ret = io_schedule_timeout(timeout);
- finish_wait(wqh, &wait);
-
-out:
- trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
- jiffies_to_usecs(jiffies - start));
-
- return ret;
+ if (!bdi || !bdi->dev)
+ return bdi_unknown_name;
+ return bdi->dev_name;
}
-EXPORT_SYMBOL(wait_iff_congested);
+EXPORT_SYMBOL_GPL(bdi_dev_name);
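inode_to_bdi(), moved into backing-dev.c above, resolves the backing_dev_info behind an inode and falls back to noop_backing_dev_info, so callers never see NULL. A minimal sketch of the typical caller pattern, mirroring how the read-ahead code sizes its window; the function name is illustrative only:

static unsigned long example_ra_window(struct address_space *mapping)
{
	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);

	/* never NULL: block-device inodes use the disk's bdi, others s_bdi */
	return bdi->ra_pages;
}
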
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 26de020aae7b..22c96fed70b5 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -58,7 +58,7 @@ EXPORT_SYMBOL_GPL(balloon_page_list_enqueue);
/**
* balloon_page_list_dequeue() - removes pages from balloon's page list and
* returns a list of the pages.
- * @b_dev_info: balloon device decriptor where we will grab a page from.
+ * @b_dev_info: balloon device descriptor where we will grab a page from.
* @pages: pointer to the list of pages that would be returned to the caller.
* @n_req_pages: number of requested pages.
*
@@ -157,7 +157,7 @@ EXPORT_SYMBOL_GPL(balloon_page_enqueue);
/*
* balloon_page_dequeue - removes a page from balloon's page list and returns
* its address to allow the driver to release the page.
- * @b_dev_info: balloon device decriptor where we will grab a page from.
+ * @b_dev_info: balloon device descriptor where we will grab a page from.
*
* Driver must call this function to properly dequeue a previously enqueued page
* before definitively releasing it back to the guest system.
@@ -203,7 +203,7 @@ EXPORT_SYMBOL_GPL(balloon_page_dequeue);
#ifdef CONFIG_BALLOON_COMPACTION
-bool balloon_page_isolate(struct page *page, isolate_mode_t mode)
+static bool balloon_page_isolate(struct page *page, isolate_mode_t mode)
{
struct balloon_dev_info *b_dev_info = balloon_page_device(page);
@@ -217,7 +217,7 @@ bool balloon_page_isolate(struct page *page, isolate_mode_t mode)
return true;
}
-void balloon_page_putback(struct page *page)
+static void balloon_page_putback(struct page *page)
{
struct balloon_dev_info *b_dev_info = balloon_page_device(page);
unsigned long flags;
@@ -228,10 +228,8 @@ void balloon_page_putback(struct page *page)
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
}
-
/* move_to_new_page() counterpart for a ballooned page */
-int balloon_page_migrate(struct address_space *mapping,
- struct page *newpage, struct page *page,
+static int balloon_page_migrate(struct page *newpage, struct page *page,
enum migrate_mode mode)
{
struct balloon_dev_info *balloon = balloon_page_device(page);
@@ -250,11 +248,11 @@ int balloon_page_migrate(struct address_space *mapping,
return balloon->migratepage(balloon, newpage, page, mode);
}
-const struct address_space_operations balloon_aops = {
- .migratepage = balloon_page_migrate,
+const struct movable_operations balloon_mops = {
+ .migrate_page = balloon_page_migrate,
.isolate_page = balloon_page_isolate,
.putback_page = balloon_page_putback,
};
-EXPORT_SYMBOL_GPL(balloon_aops);
+EXPORT_SYMBOL_GPL(balloon_mops);
#endif /* CONFIG_BALLOON_COMPACTION */
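With this change the balloon driver publishes a struct movable_operations (balloon_mops) instead of address_space_operations, and individual pages opt in via __SetPageMovable(). A sketch modelled on balloon_page_insert() from include/linux/balloon_compaction.h, assuming the caller holds b_dev_info->pages_lock:

static void example_balloon_insert(struct balloon_dev_info *balloon,
				   struct page *page)
{
	__SetPageOffline(page);
	__SetPageMovable(page, &balloon_mops);	/* previously an address_space */
	set_page_private(page, (unsigned long)balloon);
	list_add(&page->lru, &balloon->pages);
}
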
diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c
new file mode 100644
index 000000000000..b1efebfcf94b
--- /dev/null
+++ b/mm/bootmem_info.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Bootmem core functions.
+ *
+ * Copyright (c) 2020, Bytedance.
+ *
+ * Author: Muchun Song <songmuchun@bytedance.com>
+ *
+ */
+#include <linux/mm.h>
+#include <linux/compiler.h>
+#include <linux/memblock.h>
+#include <linux/bootmem_info.h>
+#include <linux/memory_hotplug.h>
+#include <linux/kmemleak.h>
+
+void get_page_bootmem(unsigned long info, struct page *page, unsigned long type)
+{
+ page->index = type;
+ SetPagePrivate(page);
+ set_page_private(page, info);
+ page_ref_inc(page);
+}
+
+void put_page_bootmem(struct page *page)
+{
+ unsigned long type = page->index;
+
+ BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
+ type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
+
+ if (page_ref_dec_return(page) == 1) {
+ page->index = 0;
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ INIT_LIST_HEAD(&page->lru);
+ kmemleak_free_part(page_to_virt(page), PAGE_SIZE);
+ free_reserved_page(page);
+ }
+}
+
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
+static void __init register_page_bootmem_info_section(unsigned long start_pfn)
+{
+ unsigned long mapsize, section_nr, i;
+ struct mem_section *ms;
+ struct page *page, *memmap;
+ struct mem_section_usage *usage;
+
+ section_nr = pfn_to_section_nr(start_pfn);
+ ms = __nr_to_section(section_nr);
+
+ /* Get section's memmap address */
+ memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+
+ /*
+ * Get page for the memmap's phys address
+ * XXX: need more consideration for sparse_vmemmap...
+ */
+ page = virt_to_page(memmap);
+ mapsize = sizeof(struct page) * PAGES_PER_SECTION;
+ mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
+
+ /* remember memmap's page */
+ for (i = 0; i < mapsize; i++, page++)
+ get_page_bootmem(section_nr, page, SECTION_INFO);
+
+ usage = ms->usage;
+ page = virt_to_page(usage);
+
+ mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
+
+ for (i = 0; i < mapsize; i++, page++)
+ get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
+
+}
+#else /* CONFIG_SPARSEMEM_VMEMMAP */
+static void __init register_page_bootmem_info_section(unsigned long start_pfn)
+{
+ unsigned long mapsize, section_nr, i;
+ struct mem_section *ms;
+ struct page *page, *memmap;
+ struct mem_section_usage *usage;
+
+ section_nr = pfn_to_section_nr(start_pfn);
+ ms = __nr_to_section(section_nr);
+
+ memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+
+ register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
+
+ usage = ms->usage;
+ page = virt_to_page(usage);
+
+ mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
+
+ for (i = 0; i < mapsize; i++, page++)
+ get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
+}
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+
+void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
+{
+ unsigned long i, pfn, end_pfn, nr_pages;
+ int node = pgdat->node_id;
+ struct page *page;
+
+ nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
+ page = virt_to_page(pgdat);
+
+ for (i = 0; i < nr_pages; i++, page++)
+ get_page_bootmem(node, page, NODE_INFO);
+
+ pfn = pgdat->node_start_pfn;
+ end_pfn = pgdat_end_pfn(pgdat);
+
+ /* register section info */
+ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ /*
+ * Some platforms can assign the same pfn to multiple nodes - on
+ * node0 as well as nodeN. To avoid registering a pfn against
+ * multiple nodes, we check that this pfn does not already
+ * reside in some other node.
+ */
+ if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
+ register_page_bootmem_info_section(pfn);
+ }
+}
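get_page_bootmem()/put_page_bootmem() tag each page that backs boot-time memmap or pgdat data with a type in page->index and take one reference per registration; the final put returns the page to the buddy allocator. A simplified sketch of the consumer side (hot-remove code such as free_map_bootmem() in mm/sparse.c does this with additional type checks):

static void example_release_bootmem_range(struct page *start,
					  unsigned long nr_pages)
{
	unsigned long i;

	for (i = 0; i < nr_pages; i++)
		put_page_bootmem(start + i);	/* frees the page on the last put */
}
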
diff --git a/mm/cleancache.c b/mm/cleancache.c
deleted file mode 100644
index db7eee9c0886..000000000000
--- a/mm/cleancache.c
+++ /dev/null
@@ -1,315 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Cleancache frontend
- *
- * This code provides the generic "frontend" layer to call a matching
- * "backend" driver implementation of cleancache. See
- * Documentation/vm/cleancache.rst for more information.
- *
- * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
- * Author: Dan Magenheimer
- */
-
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/exportfs.h>
-#include <linux/mm.h>
-#include <linux/debugfs.h>
-#include <linux/cleancache.h>
-
-/*
- * cleancache_ops is set by cleancache_register_ops to contain the pointers
- * to the cleancache "backend" implementation functions.
- */
-static const struct cleancache_ops *cleancache_ops __read_mostly;
-
-/*
- * Counters available via /sys/kernel/debug/cleancache (if debugfs is
- * properly configured. These are for information only so are not protected
- * against increment races.
- */
-static u64 cleancache_succ_gets;
-static u64 cleancache_failed_gets;
-static u64 cleancache_puts;
-static u64 cleancache_invalidates;
-
-static void cleancache_register_ops_sb(struct super_block *sb, void *unused)
-{
- switch (sb->cleancache_poolid) {
- case CLEANCACHE_NO_BACKEND:
- __cleancache_init_fs(sb);
- break;
- case CLEANCACHE_NO_BACKEND_SHARED:
- __cleancache_init_shared_fs(sb);
- break;
- }
-}
-
-/*
- * Register operations for cleancache. Returns 0 on success.
- */
-int cleancache_register_ops(const struct cleancache_ops *ops)
-{
- if (cmpxchg(&cleancache_ops, NULL, ops))
- return -EBUSY;
-
- /*
- * A cleancache backend can be built as a module and hence loaded after
- * a cleancache enabled filesystem has called cleancache_init_fs. To
- * handle such a scenario, here we call ->init_fs or ->init_shared_fs
- * for each active super block. To differentiate between local and
- * shared filesystems, we temporarily initialize sb->cleancache_poolid
- * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED
- * respectively in case there is no backend registered at the time
- * cleancache_init_fs or cleancache_init_shared_fs is called.
- *
- * Since filesystems can be mounted concurrently with cleancache
- * backend registration, we have to be careful to guarantee that all
- * cleancache enabled filesystems that has been mounted by the time
- * cleancache_register_ops is called has got and all mounted later will
- * get cleancache_poolid. This is assured by the following statements
- * tied together:
- *
- * a) iterate_supers skips only those super blocks that has started
- * ->kill_sb
- *
- * b) if iterate_supers encounters a super block that has not finished
- * ->mount yet, it waits until it is finished
- *
- * c) cleancache_init_fs is called from ->mount and
- * cleancache_invalidate_fs is called from ->kill_sb
- *
- * d) we call iterate_supers after cleancache_ops has been set
- *
- * From a) it follows that if iterate_supers skips a super block, then
- * either the super block is already dead, in which case we do not need
- * to bother initializing cleancache for it, or it was mounted after we
- * initiated iterate_supers. In the latter case, it must have seen
- * cleancache_ops set according to d) and initialized cleancache from
- * ->mount by itself according to c). This proves that we call
- * ->init_fs at least once for each active super block.
- *
- * From b) and c) it follows that if iterate_supers encounters a super
- * block that has already started ->init_fs, it will wait until ->mount
- * and hence ->init_fs has finished, then check cleancache_poolid, see
- * that it has already been set and therefore do nothing. This proves
- * that we call ->init_fs no more than once for each super block.
- *
- * Combined together, the last two paragraphs prove the function
- * correctness.
- *
- * Note that various cleancache callbacks may proceed before this
- * function is called or even concurrently with it, but since
- * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop
- * until the corresponding ->init_fs has been actually called and
- * cleancache_ops has been set.
- */
- iterate_supers(cleancache_register_ops_sb, NULL);
- return 0;
-}
-EXPORT_SYMBOL(cleancache_register_ops);
-
-/* Called by a cleancache-enabled filesystem at time of mount */
-void __cleancache_init_fs(struct super_block *sb)
-{
- int pool_id = CLEANCACHE_NO_BACKEND;
-
- if (cleancache_ops) {
- pool_id = cleancache_ops->init_fs(PAGE_SIZE);
- if (pool_id < 0)
- pool_id = CLEANCACHE_NO_POOL;
- }
- sb->cleancache_poolid = pool_id;
-}
-EXPORT_SYMBOL(__cleancache_init_fs);
-
-/* Called by a cleancache-enabled clustered filesystem at time of mount */
-void __cleancache_init_shared_fs(struct super_block *sb)
-{
- int pool_id = CLEANCACHE_NO_BACKEND_SHARED;
-
- if (cleancache_ops) {
- pool_id = cleancache_ops->init_shared_fs(&sb->s_uuid, PAGE_SIZE);
- if (pool_id < 0)
- pool_id = CLEANCACHE_NO_POOL;
- }
- sb->cleancache_poolid = pool_id;
-}
-EXPORT_SYMBOL(__cleancache_init_shared_fs);
-
-/*
- * If the filesystem uses exportable filehandles, use the filehandle as
- * the key, else use the inode number.
- */
-static int cleancache_get_key(struct inode *inode,
- struct cleancache_filekey *key)
-{
- int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *);
- int len = 0, maxlen = CLEANCACHE_KEY_MAX;
- struct super_block *sb = inode->i_sb;
-
- key->u.ino = inode->i_ino;
- if (sb->s_export_op != NULL) {
- fhfn = sb->s_export_op->encode_fh;
- if (fhfn) {
- len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL);
- if (len <= FILEID_ROOT || len == FILEID_INVALID)
- return -1;
- if (maxlen > CLEANCACHE_KEY_MAX)
- return -1;
- }
- }
- return 0;
-}
-
-/*
- * "Get" data from cleancache associated with the poolid/inode/index
- * that were specified when the data was put to cleanache and, if
- * successful, use it to fill the specified page with data and return 0.
- * The pageframe is unchanged and returns -1 if the get fails.
- * Page must be locked by caller.
- *
- * The function has two checks before any action is taken - whether
- * a backend is registered and whether the sb->cleancache_poolid
- * is correct.
- */
-int __cleancache_get_page(struct page *page)
-{
- int ret = -1;
- int pool_id;
- struct cleancache_filekey key = { .u.key = { 0 } };
-
- if (!cleancache_ops) {
- cleancache_failed_gets++;
- goto out;
- }
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- pool_id = page->mapping->host->i_sb->cleancache_poolid;
- if (pool_id < 0)
- goto out;
-
- if (cleancache_get_key(page->mapping->host, &key) < 0)
- goto out;
-
- ret = cleancache_ops->get_page(pool_id, key, page->index, page);
- if (ret == 0)
- cleancache_succ_gets++;
- else
- cleancache_failed_gets++;
-out:
- return ret;
-}
-EXPORT_SYMBOL(__cleancache_get_page);
-
-/*
- * "Put" data from a page to cleancache and associate it with the
- * (previously-obtained per-filesystem) poolid and the page's,
- * inode and page index. Page must be locked. Note that a put_page
- * always "succeeds", though a subsequent get_page may succeed or fail.
- *
- * The function has two checks before any action is taken - whether
- * a backend is registered and whether the sb->cleancache_poolid
- * is correct.
- */
-void __cleancache_put_page(struct page *page)
-{
- int pool_id;
- struct cleancache_filekey key = { .u.key = { 0 } };
-
- if (!cleancache_ops) {
- cleancache_puts++;
- return;
- }
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- pool_id = page->mapping->host->i_sb->cleancache_poolid;
- if (pool_id >= 0 &&
- cleancache_get_key(page->mapping->host, &key) >= 0) {
- cleancache_ops->put_page(pool_id, key, page->index, page);
- cleancache_puts++;
- }
-}
-EXPORT_SYMBOL(__cleancache_put_page);
-
-/*
- * Invalidate any data from cleancache associated with the poolid and the
- * page's inode and page index so that a subsequent "get" will fail.
- *
- * The function has two checks before any action is taken - whether
- * a backend is registered and whether the sb->cleancache_poolid
- * is correct.
- */
-void __cleancache_invalidate_page(struct address_space *mapping,
- struct page *page)
-{
- /* careful... page->mapping is NULL sometimes when this is called */
- int pool_id = mapping->host->i_sb->cleancache_poolid;
- struct cleancache_filekey key = { .u.key = { 0 } };
-
- if (!cleancache_ops)
- return;
-
- if (pool_id >= 0) {
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (cleancache_get_key(mapping->host, &key) >= 0) {
- cleancache_ops->invalidate_page(pool_id,
- key, page->index);
- cleancache_invalidates++;
- }
- }
-}
-EXPORT_SYMBOL(__cleancache_invalidate_page);
-
-/*
- * Invalidate all data from cleancache associated with the poolid and the
- * mappings's inode so that all subsequent gets to this poolid/inode
- * will fail.
- *
- * The function has two checks before any action is taken - whether
- * a backend is registered and whether the sb->cleancache_poolid
- * is correct.
- */
-void __cleancache_invalidate_inode(struct address_space *mapping)
-{
- int pool_id = mapping->host->i_sb->cleancache_poolid;
- struct cleancache_filekey key = { .u.key = { 0 } };
-
- if (!cleancache_ops)
- return;
-
- if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
- cleancache_ops->invalidate_inode(pool_id, key);
-}
-EXPORT_SYMBOL(__cleancache_invalidate_inode);
-
-/*
- * Called by any cleancache-enabled filesystem at time of unmount;
- * note that pool_id is surrendered and may be returned by a subsequent
- * cleancache_init_fs or cleancache_init_shared_fs.
- */
-void __cleancache_invalidate_fs(struct super_block *sb)
-{
- int pool_id;
-
- pool_id = sb->cleancache_poolid;
- sb->cleancache_poolid = CLEANCACHE_NO_POOL;
-
- if (cleancache_ops && pool_id >= 0)
- cleancache_ops->invalidate_fs(pool_id);
-}
-EXPORT_SYMBOL(__cleancache_invalidate_fs);
-
-static int __init init_cleancache(void)
-{
-#ifdef CONFIG_DEBUG_FS
- struct dentry *root = debugfs_create_dir("cleancache", NULL);
-
- debugfs_create_u64("succ_gets", 0444, root, &cleancache_succ_gets);
- debugfs_create_u64("failed_gets", 0444, root, &cleancache_failed_gets);
- debugfs_create_u64("puts", 0444, root, &cleancache_puts);
- debugfs_create_u64("invalidates", 0444, root, &cleancache_invalidates);
-#endif
- return 0;
-}
-module_init(init_cleancache)
diff --git a/mm/cma.c b/mm/cma.c
index 7f415d7cda9f..a4cfe995e11e 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -24,7 +24,6 @@
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/mm.h>
-#include <linux/mutex.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/log2.h>
@@ -34,6 +33,7 @@
#include <linux/kmemleak.h>
#include <trace/events/cma.h>
+#include "internal.h"
#include "cma.h"
struct cma cma_areas[MAX_CMA_AREAS];
@@ -81,50 +81,46 @@ static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
}
static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
- unsigned int count)
+ unsigned long count)
{
unsigned long bitmap_no, bitmap_count;
+ unsigned long flags;
bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit;
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
- mutex_lock(&cma->lock);
+ spin_lock_irqsave(&cma->lock, flags);
bitmap_clear(cma->bitmap, bitmap_no, bitmap_count);
- mutex_unlock(&cma->lock);
+ spin_unlock_irqrestore(&cma->lock, flags);
}
static void __init cma_activate_area(struct cma *cma)
{
- unsigned long base_pfn = cma->base_pfn, pfn = base_pfn;
- unsigned i = cma->count >> pageblock_order;
+ unsigned long base_pfn = cma->base_pfn, pfn;
struct zone *zone;
cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL);
if (!cma->bitmap)
goto out_error;
- WARN_ON_ONCE(!pfn_valid(pfn));
- zone = page_zone(pfn_to_page(pfn));
-
- do {
- unsigned j;
-
- base_pfn = pfn;
- for (j = pageblock_nr_pages; j; --j, pfn++) {
- WARN_ON_ONCE(!pfn_valid(pfn));
- /*
- * alloc_contig_range requires the pfn range
- * specified to be in the same zone. Make this
- * simple by forcing the entire CMA resv range
- * to be in the same zone.
- */
- if (page_zone(pfn_to_page(pfn)) != zone)
- goto not_in_zone;
- }
- init_cma_reserved_pageblock(pfn_to_page(base_pfn));
- } while (--i);
+ /*
+ * alloc_contig_range() requires the pfn range specified to be in the
+ * same zone. Simplify by forcing the entire CMA resv range to be in the
+ * same zone.
+ */
+ WARN_ON_ONCE(!pfn_valid(base_pfn));
+ zone = page_zone(pfn_to_page(base_pfn));
+ for (pfn = base_pfn + 1; pfn < base_pfn + cma->count; pfn++) {
+ WARN_ON_ONCE(!pfn_valid(pfn));
+ if (page_zone(pfn_to_page(pfn)) != zone)
+ goto not_in_zone;
+ }
+
+ for (pfn = base_pfn; pfn < base_pfn + cma->count;
+ pfn += pageblock_nr_pages)
+ init_cma_reserved_pageblock(pfn_to_page(pfn));
- mutex_init(&cma->lock);
+ spin_lock_init(&cma->lock);
#ifdef CONFIG_CMA_DEBUGFS
INIT_HLIST_HEAD(&cma->mem_head);
@@ -136,6 +132,12 @@ static void __init cma_activate_area(struct cma *cma)
not_in_zone:
bitmap_free(cma->bitmap);
out_error:
+ /* Expose all pages to the buddy, they are useless for CMA. */
+ if (!cma->reserve_pages_on_error) {
+ for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++)
+ free_reserved_page(pfn_to_page(pfn));
+ }
+ totalcma_pages -= cma->count;
cma->count = 0;
pr_err("CMA area %s could not be activated\n", cma->name);
return;
@@ -152,6 +154,11 @@ static int __init cma_init_reserved_areas(void)
}
core_initcall(cma_init_reserved_areas);
+void __init cma_reserve_pages_on_error(struct cma *cma)
+{
+ cma->reserve_pages_on_error = true;
+}
+
/**
* cma_init_reserved_mem() - create custom contiguous area from reserved memory
* @base: Base address of the reserved area
@@ -170,7 +177,6 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
struct cma **res_cma)
{
struct cma *cma;
- phys_addr_t alignment;
/* Sanity checks */
if (cma_area_count == ARRAY_SIZE(cma_areas)) {
@@ -181,15 +187,12 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
if (!size || !memblock_is_region_reserved(base, size))
return -EINVAL;
- /* ensure minimal alignment required by mm core */
- alignment = PAGE_SIZE <<
- max_t(unsigned long, MAX_ORDER - 1, pageblock_order);
-
/* alignment should be aligned with order_per_bit */
- if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit))
+ if (!IS_ALIGNED(CMA_MIN_ALIGNMENT_PAGES, 1 << order_per_bit))
return -EINVAL;
- if (ALIGN(base, alignment) != base || ALIGN(size, alignment) != size)
+ /* ensure minimal alignment required by mm core */
+ if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES))
return -EINVAL;
/*
@@ -264,14 +267,8 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
if (alignment && !is_power_of_2(alignment))
return -EINVAL;
- /*
- * Sanitise input arguments.
- * Pages both ends in CMA area could be merged into adjacent unmovable
- * migratetype page by page allocator's buddy algorithm. In the case,
- * you couldn't get a contiguous memory, which is not what we want.
- */
- alignment = max(alignment, (phys_addr_t)PAGE_SIZE <<
- max_t(unsigned long, MAX_ORDER - 1, pageblock_order));
+ /* Sanitise input arguments. */
+ alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES);
if (fixed && base & (alignment - 1)) {
ret = -EINVAL;
pr_err("Region at %pa must be aligned to %pa bytes\n",
@@ -326,12 +323,29 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
phys_addr_t addr = 0;
/*
+ * If there is enough memory, try a bottom-up allocation first.
+ * It will place the new cma area close to the start of the node
+ * and guarantee that compaction moves pages out of the cma
+ * area and not into it.
+ * Avoid using the first 4GB so as not to interfere with
+ * constrained zones like DMA/DMA32.
+ */
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ if (!memblock_bottom_up() && memblock_end >= SZ_4G + size) {
+ memblock_set_bottom_up(true);
+ addr = memblock_alloc_range_nid(size, alignment, SZ_4G,
+ limit, nid, true);
+ memblock_set_bottom_up(false);
+ }
+#endif
+
+ /*
* All pages in the reserved area must come from the same zone.
* If the requested region crosses the low/high memory boundary,
* try allocating from high memory first and fall back to low
* memory in case of failure.
*/
- if (base < highmem_start && limit > highmem_start) {
+ if (!addr && base < highmem_start && limit > highmem_start) {
addr = memblock_alloc_range_nid(size, alignment,
highmem_start, limit, nid, true);
limit = highmem_start;
@@ -363,7 +377,7 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
return 0;
free_mem:
- memblock_free(base, size);
+ memblock_phys_free(base, size);
err:
pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
return ret;
@@ -377,7 +391,7 @@ static void cma_debug_show_areas(struct cma *cma)
unsigned long nr_part, nr_total = 0;
unsigned long nbits = cma_bitmap_maxno(cma);
- mutex_lock(&cma->lock);
+ spin_lock_irq(&cma->lock);
pr_info("number of available pages: ");
for (;;) {
next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start);
@@ -392,7 +406,7 @@ static void cma_debug_show_areas(struct cma *cma)
start = next_zero_bit + nr_zero;
}
pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count);
- mutex_unlock(&cma->lock);
+ spin_unlock_irq(&cma->lock);
}
#else
static inline void cma_debug_show_areas(struct cma *cma) { }
@@ -408,25 +422,27 @@ static inline void cma_debug_show_areas(struct cma *cma) { }
 * This function allocates contiguous pages from a specific
* contiguous memory area.
*/
-struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
- bool no_warn)
+struct page *cma_alloc(struct cma *cma, unsigned long count,
+ unsigned int align, bool no_warn)
{
unsigned long mask, offset;
unsigned long pfn = -1;
unsigned long start = 0;
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
- size_t i;
+ unsigned long i;
struct page *page = NULL;
int ret = -ENOMEM;
if (!cma || !cma->count || !cma->bitmap)
- return NULL;
+ goto out;
- pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma,
+ pr_debug("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma,
count, align);
if (!count)
- return NULL;
+ goto out;
+
+ trace_cma_alloc_start(cma->name, count, align);
mask = cma_bitmap_aligned_mask(cma, align);
offset = cma_bitmap_aligned_offset(cma, align);
@@ -434,15 +450,15 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
if (bitmap_count > bitmap_maxno)
- return NULL;
+ goto out;
for (;;) {
- mutex_lock(&cma->lock);
+ spin_lock_irq(&cma->lock);
bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
bitmap_maxno, start, bitmap_count, mask,
offset);
if (bitmap_no >= bitmap_maxno) {
- mutex_unlock(&cma->lock);
+ spin_unlock_irq(&cma->lock);
break;
}
bitmap_set(cma->bitmap, bitmap_no, bitmap_count);
@@ -451,7 +467,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
* our exclusive use. If the migration fails we will take the
* lock again and unmark it.
*/
- mutex_unlock(&cma->lock);
+ spin_unlock_irq(&cma->lock);
pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
mutex_lock(&cma_mutex);
@@ -467,13 +483,16 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
if (ret != -EBUSY)
break;
- pr_debug("%s(): memory range at %p is busy, retrying\n",
- __func__, pfn_to_page(pfn));
+ pr_debug("%s(): memory range at pfn 0x%lx %p is busy, retrying\n",
+ __func__, pfn, pfn_to_page(pfn));
+
+ trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn),
+ count, align);
/* try again with a bit different memory target */
start = bitmap_no + mask + 1;
}
- trace_cma_alloc(pfn, page, count, align);
+ trace_cma_alloc_finish(cma->name, pfn, page, count, align, ret);
/*
* CMA can allocate multiple page blocks, which results in different
@@ -486,15 +505,44 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
}
if (ret && !no_warn) {
- pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n",
- __func__, count, ret);
+ pr_err_ratelimited("%s: %s: alloc failed, req-size: %lu pages, ret: %d\n",
+ __func__, cma->name, count, ret);
cma_debug_show_areas(cma);
}
pr_debug("%s(): returned %p\n", __func__, page);
+out:
+ if (page) {
+ count_vm_event(CMA_ALLOC_SUCCESS);
+ cma_sysfs_account_success_pages(cma, count);
+ } else {
+ count_vm_event(CMA_ALLOC_FAIL);
+ if (cma)
+ cma_sysfs_account_fail_pages(cma, count);
+ }
+
return page;
}
+bool cma_pages_valid(struct cma *cma, const struct page *pages,
+ unsigned long count)
+{
+ unsigned long pfn;
+
+ if (!cma || !pages)
+ return false;
+
+ pfn = page_to_pfn(pages);
+
+ if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) {
+ pr_debug("%s(page %p, count %lu)\n", __func__,
+ (void *)pages, count);
+ return false;
+ }
+
+ return true;
+}
+
/**
* cma_release() - release allocated pages
* @cma: Contiguous memory region for which the allocation is performed.
@@ -505,25 +553,23 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
 * It returns false when the provided pages do not belong to the
 * contiguous area and true otherwise.
*/
-bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
+bool cma_release(struct cma *cma, const struct page *pages,
+ unsigned long count)
{
unsigned long pfn;
- if (!cma || !pages)
+ if (!cma_pages_valid(cma, pages, count))
return false;
- pr_debug("%s(page %p)\n", __func__, (void *)pages);
+ pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
pfn = page_to_pfn(pages);
- if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
- return false;
-
VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);
free_contig_range(pfn, count);
cma_clear_bitmap(cma, pfn, count);
- trace_cma_release(pfn, pages, count);
+ trace_cma_release(cma->name, pfn, pages, count);
return true;
}
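cma_alloc() and cma_release() now take an unsigned long page count, and releases are validated by the new cma_pages_valid() helper before the bitmap is touched. A minimal sketch of the driver-facing pattern (compare the DMA contiguous allocator); my_cma and the function names are placeholders:

static struct page *example_grab(struct cma *my_cma, unsigned long nr_pages)
{
	/* align is an order; no_warn avoids the ratelimited failure message */
	return cma_alloc(my_cma, nr_pages, 0, true);
}

static void example_drop(struct cma *my_cma, struct page *page,
			 unsigned long nr_pages)
{
	if (!cma_release(my_cma, page, nr_pages))
		pr_warn("pages were not allocated from this CMA area\n");
}
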
diff --git a/mm/cma.h b/mm/cma.h
index 42ae082cb067..88a0595670b7 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -3,19 +3,34 @@
#define __MM_CMA_H__
#include <linux/debugfs.h>
+#include <linux/kobject.h>
+
+struct cma_kobject {
+ struct kobject kobj;
+ struct cma *cma;
+};
struct cma {
unsigned long base_pfn;
unsigned long count;
unsigned long *bitmap;
unsigned int order_per_bit; /* Order of pages represented by one bit */
- struct mutex lock;
+ spinlock_t lock;
#ifdef CONFIG_CMA_DEBUGFS
struct hlist_head mem_head;
spinlock_t mem_head_lock;
struct debugfs_u32_array dfs_bitmap;
#endif
char name[CMA_MAX_NAME];
+#ifdef CONFIG_CMA_SYSFS
+ /* the number of CMA page successful allocations */
+ atomic64_t nr_pages_succeeded;
+ /* the number of CMA page allocation failures */
+ atomic64_t nr_pages_failed;
+ /* kobject requires dynamic object */
+ struct cma_kobject *cma_kobj;
+#endif
+ bool reserve_pages_on_error;
};
extern struct cma cma_areas[MAX_CMA_AREAS];
@@ -26,4 +41,13 @@ static inline unsigned long cma_bitmap_maxno(struct cma *cma)
return cma->count >> cma->order_per_bit;
}
+#ifdef CONFIG_CMA_SYSFS
+void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages);
+void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages);
+#else
+static inline void cma_sysfs_account_success_pages(struct cma *cma,
+ unsigned long nr_pages) {};
+static inline void cma_sysfs_account_fail_pages(struct cma *cma,
+ unsigned long nr_pages) {};
+#endif
#endif
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index d5bf8aa34fdc..602fff89b15f 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -36,10 +36,10 @@ static int cma_used_get(void *data, u64 *val)
struct cma *cma = data;
unsigned long used;
- mutex_lock(&cma->lock);
+ spin_lock_irq(&cma->lock);
/* pages counter is smaller than sizeof(int) */
used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma));
- mutex_unlock(&cma->lock);
+ spin_unlock_irq(&cma->lock);
*val = (u64)used << cma->order_per_bit;
return 0;
@@ -53,7 +53,7 @@ static int cma_maxchunk_get(void *data, u64 *val)
unsigned long start, end = 0;
unsigned long bitmap_maxno = cma_bitmap_maxno(cma);
- mutex_lock(&cma->lock);
+ spin_lock_irq(&cma->lock);
for (;;) {
start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end);
if (start >= bitmap_maxno)
@@ -61,7 +61,7 @@ static int cma_maxchunk_get(void *data, u64 *val)
end = find_next_bit(cma->bitmap, bitmap_maxno, start);
maxchunk = max(end - start, maxchunk);
}
- mutex_unlock(&cma->lock);
+ spin_unlock_irq(&cma->lock);
*val = (u64)maxchunk << cma->order_per_bit;
return 0;
@@ -163,11 +163,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry)
{
struct dentry *tmp;
- char name[16];
- scnprintf(name, sizeof(name), "cma-%s", cma->name);
-
- tmp = debugfs_create_dir(name, root_dentry);
+ tmp = debugfs_create_dir(cma->name, root_dentry);
debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops);
debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops);
diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c
new file mode 100644
index 000000000000..56347d15b7e8
--- /dev/null
+++ b/mm/cma_sysfs.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * CMA SysFS Interface
+ *
+ * Copyright (c) 2021 Minchan Kim <minchan@kernel.org>
+ */
+
+#include <linux/cma.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include "cma.h"
+
+#define CMA_ATTR_RO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages)
+{
+ atomic64_add(nr_pages, &cma->nr_pages_succeeded);
+}
+
+void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages)
+{
+ atomic64_add(nr_pages, &cma->nr_pages_failed);
+}
+
+static inline struct cma *cma_from_kobj(struct kobject *kobj)
+{
+ return container_of(kobj, struct cma_kobject, kobj)->cma;
+}
+
+static ssize_t alloc_pages_success_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct cma *cma = cma_from_kobj(kobj);
+
+ return sysfs_emit(buf, "%llu\n",
+ atomic64_read(&cma->nr_pages_succeeded));
+}
+CMA_ATTR_RO(alloc_pages_success);
+
+static ssize_t alloc_pages_fail_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct cma *cma = cma_from_kobj(kobj);
+
+ return sysfs_emit(buf, "%llu\n", atomic64_read(&cma->nr_pages_failed));
+}
+CMA_ATTR_RO(alloc_pages_fail);
+
+static void cma_kobj_release(struct kobject *kobj)
+{
+ struct cma *cma = cma_from_kobj(kobj);
+ struct cma_kobject *cma_kobj = cma->cma_kobj;
+
+ kfree(cma_kobj);
+ cma->cma_kobj = NULL;
+}
+
+static struct attribute *cma_attrs[] = {
+ &alloc_pages_success_attr.attr,
+ &alloc_pages_fail_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(cma);
+
+static const struct kobj_type cma_ktype = {
+ .release = cma_kobj_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = cma_groups,
+};
+
+static int __init cma_sysfs_init(void)
+{
+ struct kobject *cma_kobj_root;
+ struct cma_kobject *cma_kobj;
+ struct cma *cma;
+ int i, err;
+
+ cma_kobj_root = kobject_create_and_add("cma", mm_kobj);
+ if (!cma_kobj_root)
+ return -ENOMEM;
+
+ for (i = 0; i < cma_area_count; i++) {
+ cma_kobj = kzalloc(sizeof(*cma_kobj), GFP_KERNEL);
+ if (!cma_kobj) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ cma = &cma_areas[i];
+ cma->cma_kobj = cma_kobj;
+ cma_kobj->cma = cma;
+ err = kobject_init_and_add(&cma_kobj->kobj, &cma_ktype,
+ cma_kobj_root, "%s", cma->name);
+ if (err) {
+ kobject_put(&cma_kobj->kobj);
+ goto out;
+ }
+ }
+
+ return 0;
+out:
+ while (--i >= 0) {
+ cma = &cma_areas[i];
+ kobject_put(&cma->cma_kobj->kobj);
+ }
+ kobject_put(cma_kobj_root);
+
+ return err;
+}
+subsys_initcall(cma_sysfs_init);
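Each struct cma gets a separately allocated cma_kobject because a kobject needs dynamic storage and a release callback, while cma_areas[] is a static array. Adding a further read-only value would follow the same pattern as the two counters above; the total_pages attribute below is purely hypothetical and not part of this patch:

static ssize_t total_pages_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	struct cma *cma = cma_from_kobj(kobj);

	return sysfs_emit(buf, "%lu\n", cma->count);	/* hypothetical attribute */
}
CMA_ATTR_RO(total_pages);
/* ...and &total_pages_attr.attr would then be added to cma_attrs[]. */
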
diff --git a/mm/compaction.c b/mm/compaction.c
index 6c63844fc061..eacca2794e47 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -26,6 +26,11 @@
#include "internal.h"
#ifdef CONFIG_COMPACTION
+/*
+ * Fragmentation score check interval for proactive compaction purposes.
+ */
+#define HPAGE_FRAG_CHECK_INTERVAL_MSEC (500)
+
static inline void count_compact_event(enum vm_event_item item)
{
count_vm_event(item);
@@ -47,13 +52,6 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
#define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order))
#define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order))
-#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order)
-#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order)
-
-/*
- * Fragmentation score check interval for proactive compaction purposes.
- */
-static const unsigned int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500;
/*
* Page order with-respect-to which proactive compaction
@@ -110,42 +108,37 @@ static void split_map_pages(struct list_head *list)
}
#ifdef CONFIG_COMPACTION
-
-int PageMovable(struct page *page)
+bool PageMovable(struct page *page)
{
- struct address_space *mapping;
+ const struct movable_operations *mops;
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (!__PageMovable(page))
- return 0;
+ return false;
- mapping = page_mapping(page);
- if (mapping && mapping->a_ops && mapping->a_ops->isolate_page)
- return 1;
+ mops = page_movable_ops(page);
+ if (mops)
+ return true;
- return 0;
+ return false;
}
-EXPORT_SYMBOL(PageMovable);
-void __SetPageMovable(struct page *page, struct address_space *mapping)
+void __SetPageMovable(struct page *page, const struct movable_operations *mops)
{
VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page);
- page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE);
+ VM_BUG_ON_PAGE((unsigned long)mops & PAGE_MAPPING_MOVABLE, page);
+ page->mapping = (void *)((unsigned long)mops | PAGE_MAPPING_MOVABLE);
}
EXPORT_SYMBOL(__SetPageMovable);
void __ClearPageMovable(struct page *page)
{
- VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageMovable(page), page);
/*
- * Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE
- * flag so that VM can catch up released page by driver after isolation.
- * With it, VM migration doesn't try to put it back.
+ * This page still has the type of a movable page, but it's
+ * actually not movable any more.
*/
- page->mapping = (void *)((unsigned long)page->mapping &
- PAGE_MAPPING_MOVABLE);
+ page->mapping = (void *)PAGE_MAPPING_MOVABLE;
}
EXPORT_SYMBOL(__ClearPageMovable);
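To illustrate the new registration contract, a hedged driver-side sketch follows. It assumes the movable_operations layout declared in <linux/migrate.h> (isolate_page/migrate_page/putback_page), which is not part of this hunk, and all demo_* names are invented.

#include <linux/migrate.h>
#include <linux/pagemap.h>

static bool demo_isolate(struct page *page, isolate_mode_t mode)
{
        /* take a driver reference and detach the page from internal lists */
        return true;
}

static int demo_migrate(struct page *dst, struct page *src,
                        enum migrate_mode mode)
{
        /* copy contents and driver metadata from src to dst */
        return 0;
}

static void demo_putback(struct page *page)
{
        /* reinsert the page into the driver's structures */
}

static const struct movable_operations demo_mops = {
        .isolate_page   = demo_isolate,
        .migrate_page   = demo_migrate,
        .putback_page   = demo_putback,
};

static void demo_mark_movable(struct page *page)
{
        /* the page must be locked, matching the VM_BUG_ON_PAGE() checks above */
        lock_page(page);
        __SetPageMovable(page, &demo_mops);
        unlock_page(page);
}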
@@ -157,7 +150,7 @@ EXPORT_SYMBOL(__ClearPageMovable);
* allocation success. 1 << compact_defer_shift, compactions are skipped up
* to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
*/
-void defer_compaction(struct zone *zone, int order)
+static void defer_compaction(struct zone *zone, int order)
{
zone->compact_considered = 0;
zone->compact_defer_shift++;
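As a side note on the backoff summarised in the comment above, here is a standalone model (illustration only, not kernel code); the cap of 6 mirrors COMPACT_MAX_DEFER_SHIFT from the compaction headers and is an assumption here.

#define MODEL_MAX_DEFER_SHIFT 6

struct model_zone {
        unsigned int defer_shift;
        unsigned long considered;
};

/* mirrors compaction_deferred(): skip until 1 << defer_shift attempts pass */
static int model_deferred(struct model_zone *z)
{
        unsigned long limit = 1UL << z->defer_shift;

        if (++z->considered >= limit) {
                z->considered = limit;  /* avoid overflow */
                return 0;               /* run compaction now */
        }
        return 1;                       /* skip this attempt */
}

/* mirrors defer_compaction(): each failure doubles the skip window, capped */
static void model_defer(struct model_zone *z)
{
        z->considered = 0;
        if (++z->defer_shift > MODEL_MAX_DEFER_SHIFT)
                z->defer_shift = MODEL_MAX_DEFER_SHIFT;
}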
@@ -172,7 +165,7 @@ void defer_compaction(struct zone *zone, int order)
}
/* Returns true if compaction should be skipped this time */
-bool compaction_deferred(struct zone *zone, int order)
+static bool compaction_deferred(struct zone *zone, int order)
{
unsigned long defer_limit = 1UL << zone->compact_defer_shift;
@@ -209,7 +202,7 @@ void compaction_defer_reset(struct zone *zone, int order,
}
/* Returns true if restarting compaction after many failures */
-bool compaction_restarting(struct zone *zone, int order)
+static bool compaction_restarting(struct zone *zone, int order)
{
if (order < zone->compact_order_failed)
return false;
@@ -236,8 +229,35 @@ static void reset_cached_positions(struct zone *zone)
pageblock_start_pfn(zone_end_pfn(zone) - 1);
}
+#ifdef CONFIG_SPARSEMEM
+/*
+ * If the PFN falls into an offline section, return the start PFN of the
+ * next online section. If the PFN falls into an online section or if
+ * there is no next online section, return 0.
+ */
+static unsigned long skip_offline_sections(unsigned long start_pfn)
+{
+ unsigned long start_nr = pfn_to_section_nr(start_pfn);
+
+ if (online_section_nr(start_nr))
+ return 0;
+
+ while (++start_nr <= __highest_present_section_nr) {
+ if (online_section_nr(start_nr))
+ return section_nr_to_pfn(start_nr);
+ }
+
+ return 0;
+}
+#else
+static unsigned long skip_offline_sections(unsigned long start_pfn)
+{
+ return 0;
+}
+#endif
+
/*
- * Compound pages of >= pageblock_order should consistenly be skipped until
+ * Compound pages of >= pageblock_order should consistently be skipped until
* released. It is always pointless to compact pages of such order (if they are
* migratable), and the pageblocks they occupy cannot contain any free pages.
*/
@@ -307,20 +327,17 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
* is necessary for the block to be a migration source/target.
*/
do {
- if (pfn_valid_within(pfn)) {
- if (check_source && PageLRU(page)) {
- clear_pageblock_skip(page);
- return true;
- }
+ if (check_source && PageLRU(page)) {
+ clear_pageblock_skip(page);
+ return true;
+ }
- if (check_target && PageBuddy(page)) {
- clear_pageblock_skip(page);
- return true;
- }
+ if (check_target && PageBuddy(page)) {
+ clear_pageblock_skip(page);
+ return true;
}
page += (1 << PAGE_ALLOC_COSTLY_ORDER);
- pfn += (1 << PAGE_ALLOC_COSTLY_ORDER);
} while (page <= end_page);
return false;
@@ -402,18 +419,14 @@ void reset_isolation_suitable(pg_data_t *pgdat)
* Sets the pageblock skip bit if it was clear. Note that this is a hint as
* locks are not required for read/writers. Returns true if it was already set.
*/
-static bool test_and_set_skip(struct compact_control *cc, struct page *page,
- unsigned long pfn)
+static bool test_and_set_skip(struct compact_control *cc, struct page *page)
{
bool skip;
- /* Do no update if skip hint is being ignored */
+ /* Do not update if skip hint is being ignored */
if (cc->ignore_skip_hint)
return false;
- if (!IS_ALIGNED(pfn, pageblock_nr_pages))
- return false;
-
skip = get_pageblock_skip(page);
if (!skip && !cc->no_set_skip_hint)
set_pageblock_skip(page);
@@ -450,9 +463,6 @@ static void update_pageblock_skip(struct compact_control *cc,
if (cc->no_set_skip_hint)
return;
- if (!page)
- return;
-
set_pageblock_skip(page);
/* Update where async and sync compaction should restart */
@@ -480,8 +490,7 @@ static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
{
}
-static bool test_and_set_skip(struct compact_control *cc, struct page *page,
- unsigned long pfn)
+static bool test_and_set_skip(struct compact_control *cc, struct page *page)
{
return false;
}
@@ -517,15 +526,12 @@ static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
* very heavily contended. The lock should be periodically unlocked to avoid
* having disabled IRQs for a long time, even when there is nobody waiting on
* the lock. It might also be that allowing the IRQs will result in
- * need_resched() becoming true. If scheduling is needed, async compaction
- * aborts. Sync compaction schedules.
+ * need_resched() becoming true. If scheduling is needed, compaction schedules.
* Either compaction type will also abort if a fatal signal is pending.
* In either case if the lock was locked, it is dropped and not regained.
*
- * Returns true if compaction should abort due to fatal signal pending, or
- * async compaction due to need_resched()
- * Returns false when compaction can continue (sync compaction might have
- * scheduled)
+ * Returns true if compaction should abort due to fatal signal pending.
+ * Returns false when compaction can continue.
*/
static bool compact_unlock_should_abort(spinlock_t *lock,
unsigned long flags, bool *locked, struct compact_control *cc)
@@ -578,16 +584,14 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
/*
* Periodically drop the lock (if held) regardless of its
* contention, to give chance to IRQs. Abort if fatal signal
- * pending or async compaction detects need_resched()
+ * pending.
*/
- if (!(blockpfn % SWAP_CLUSTER_MAX)
+ if (!(blockpfn % COMPACT_CLUSTER_MAX)
&& compact_unlock_should_abort(&cc->zone->lock, flags,
&locked, cc))
break;
nr_scanned++;
- if (!pfn_valid_within(blockpfn))
- goto isolate_fail;
/*
* For compound pages such as THP and hugetlbfs, we can save
@@ -598,9 +602,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (PageCompound(page)) {
const unsigned int order = compound_order(page);
- if (likely(order < MAX_ORDER)) {
+ if (likely(order <= MAX_ORDER)) {
blockpfn += (1UL << order) - 1;
cursor += (1UL << order) - 1;
+ nr_scanned += (1UL << order) - 1;
}
goto isolate_fail;
}
@@ -608,13 +613,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (!PageBuddy(page))
goto isolate_fail;
- /*
- * If we already hold the lock, we can skip some rechecking.
- * Note that if we hold the lock now, checked_pageblock was
- * already set in some previous iteration (or strict is true),
- * so it is correct to skip the suitable migration target
- * recheck as well.
- */
+ /* If we already hold the lock, we can skip some rechecking. */
if (!locked) {
locked = compact_lock_irqsave(&cc->zone->lock,
&flags, cc);
@@ -625,12 +624,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
}
/* Found a free page, will break it into order-0 pages */
- order = page_order(page);
+ order = buddy_order(page);
isolated = __isolate_free_page(page, order);
if (!isolated)
break;
set_page_private(page, order);
+ nr_scanned += isolated - 1;
total_isolated += isolated;
cc->nr_freepages += isolated;
list_add_tail(&page->lru, freelist);
@@ -764,8 +764,11 @@ isolate_freepages_range(struct compact_control *cc,
}
/* Similar to reclaim, but different enough that they don't share logic */
-static bool too_many_isolated(pg_data_t *pgdat)
+static bool too_many_isolated(struct compact_control *cc)
{
+ pg_data_t *pgdat = cc->zone->zone_pgdat;
+ bool too_many;
+
unsigned long active, inactive, isolated;
inactive = node_page_state(pgdat, NR_INACTIVE_FILE) +
@@ -775,7 +778,22 @@ static bool too_many_isolated(pg_data_t *pgdat)
isolated = node_page_state(pgdat, NR_ISOLATED_FILE) +
node_page_state(pgdat, NR_ISOLATED_ANON);
- return isolated > (inactive + active) / 2;
+ /*
+ * Allow GFP_NOFS to isolate past the limit set for regular
+ * compaction runs. This prevents an ABBA deadlock when other
+ * compactors have already isolated to the limit, but are
+ * blocked on filesystem locks held by the GFP_NOFS thread.
+ */
+ if (cc->gfp_mask & __GFP_FS) {
+ inactive >>= 3;
+ active >>= 3;
+ }
+
+ too_many = isolated > (inactive + active) / 2;
+ if (!too_many)
+ wake_throttle_isolated(pgdat);
+
+ return too_many;
}
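The effect of the __GFP_FS shift above is easier to see in isolation. The following standalone model (illustrative, not kernel code) restates the arithmetic: regular __GFP_FS callers compare against one eighth of the LRU population, so a GFP_NOFS caller retains roughly 8x headroom.

static int model_too_many_isolated(unsigned long active, unsigned long inactive,
                                   unsigned long isolated, int gfp_fs)
{
        /* regular (__GFP_FS) compactors get the reduced limit */
        if (gfp_fs) {
                active >>= 3;
                inactive >>= 3;
        }
        return isolated > (inactive + active) / 2;
}

With 80000 file/anon LRU pages, for example, a __GFP_FS caller is throttled once about 5000 pages are isolated, while a GFP_NOFS caller can keep going up to about 40000.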
/**
@@ -784,47 +802,55 @@ static bool too_many_isolated(pg_data_t *pgdat)
* @cc: Compaction control structure.
* @low_pfn: The first PFN to isolate
* @end_pfn: The one-past-the-last PFN to isolate, within same pageblock
- * @isolate_mode: Isolation mode to be used.
+ * @mode: Isolation mode to be used.
*
* Isolate all pages that can be migrated from the range specified by
* [low_pfn, end_pfn). The range is expected to be within same pageblock.
- * Returns zero if there is a fatal signal pending, otherwise PFN of the
- * first page that was not scanned (which may be both less, equal to or more
- * than end_pfn).
+ * Returns errno, like -EAGAIN or -EINTR in case of e.g. a pending signal or congestion,
+ * -ENOMEM in case we could not allocate a page, or 0.
+ * cc->migrate_pfn will contain the next pfn to scan.
*
* The pages are isolated on cc->migratepages list (not required to be empty),
- * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
- * is neither read nor updated.
+ * and cc->nr_migratepages is updated accordingly.
*/
-static unsigned long
+static int
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
- unsigned long end_pfn, isolate_mode_t isolate_mode)
+ unsigned long end_pfn, isolate_mode_t mode)
{
pg_data_t *pgdat = cc->zone->zone_pgdat;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct lruvec *lruvec;
unsigned long flags = 0;
- bool locked = false;
+ struct lruvec *locked = NULL;
+ struct folio *folio = NULL;
struct page *page = NULL, *valid_page = NULL;
+ struct address_space *mapping;
unsigned long start_pfn = low_pfn;
bool skip_on_failure = false;
unsigned long next_skip_pfn = 0;
bool skip_updated = false;
+ int ret = 0;
+
+ cc->migrate_pfn = low_pfn;
/*
* Ensure that there are not too many pages isolated from the LRU
* list by either parallel reclaimers or compaction. If there are,
* delay for some time until fewer pages are isolated
*/
- while (unlikely(too_many_isolated(pgdat))) {
+ while (unlikely(too_many_isolated(cc))) {
+ /* stop isolation if there are still pages not migrated */
+ if (cc->nr_migratepages)
+ return -EAGAIN;
+
/* async migration should just abort */
if (cc->mode == MIGRATE_ASYNC)
- return 0;
+ return -EAGAIN;
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
if (fatal_signal_pending(current))
- return 0;
+ return -EINTR;
}
cond_resched();
@@ -864,33 +890,81 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* contention, to give chance to IRQs. Abort completely if
* a fatal signal is pending.
*/
- if (!(low_pfn % SWAP_CLUSTER_MAX)
- && compact_unlock_should_abort(&pgdat->lru_lock,
- flags, &locked, cc)) {
- low_pfn = 0;
- goto fatal_pending;
+ if (!(low_pfn % COMPACT_CLUSTER_MAX)) {
+ if (locked) {
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
+ }
+
+ if (fatal_signal_pending(current)) {
+ cc->contended = true;
+ ret = -EINTR;
+
+ goto fatal_pending;
+ }
+
+ cond_resched();
}
- if (!pfn_valid_within(low_pfn))
- goto isolate_fail;
nr_scanned++;
page = pfn_to_page(low_pfn);
/*
* Check if the pageblock has already been marked skipped.
- * Only the aligned PFN is checked as the caller isolates
+ * Only the first PFN is checked as the caller isolates
* COMPACT_CLUSTER_MAX at a time so the second call must
* not falsely conclude that the block should be skipped.
*/
- if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) {
- if (!cc->ignore_skip_hint && get_pageblock_skip(page)) {
+ if (!valid_page && (pageblock_aligned(low_pfn) ||
+ low_pfn == cc->zone->zone_start_pfn)) {
+ if (!isolation_suitable(cc, page)) {
low_pfn = end_pfn;
+ folio = NULL;
goto isolate_abort;
}
valid_page = page;
}
+ if (PageHuge(page) && cc->alloc_contig) {
+ if (locked) {
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
+ }
+
+ ret = isolate_or_dissolve_huge_page(page, &cc->migratepages);
+
+ /*
+ * Fail isolation in case isolate_or_dissolve_huge_page()
+ * reports an error. In case of -ENOMEM, abort right away.
+ */
+ if (ret < 0) {
+ /* Do not report -EBUSY down the chain */
+ if (ret == -EBUSY)
+ ret = 0;
+ low_pfn += compound_nr(page) - 1;
+ nr_scanned += compound_nr(page) - 1;
+ goto isolate_fail;
+ }
+
+ if (PageHuge(page)) {
+ /*
+ * Hugepage was successfully isolated and placed
+ * on the cc->migratepages list.
+ */
+ folio = page_folio(page);
+ low_pfn += folio_nr_pages(folio) - 1;
+ goto isolate_success_no_list;
+ }
+
+ /*
+ * Ok, the hugepage was dissolved. Now these pages are
+ * Buddy and cannot be re-allocated because they are
+ * isolated. Fall-through as the check below handles
+ * Buddy pages.
+ */
+ }
+
/*
* Skip if free. We read page order here without zone lock
* which is generally unsafe, but the race window is small and
@@ -898,15 +972,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* potential isolation targets.
*/
if (PageBuddy(page)) {
- unsigned long freepage_order = page_order_unsafe(page);
+ unsigned long freepage_order = buddy_order_unsafe(page);
/*
* Without lock, we cannot be sure that what we got is
* a valid page order. Consider only values in the
* valid order range to prevent low_pfn overflow.
*/
- if (freepage_order > 0 && freepage_order < MAX_ORDER)
+ if (freepage_order > 0 && freepage_order <= MAX_ORDER) {
low_pfn += (1UL << freepage_order) - 1;
+ nr_scanned += (1UL << freepage_order) - 1;
+ }
continue;
}
@@ -921,8 +997,10 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (PageCompound(page) && !cc->alloc_contig) {
const unsigned int order = compound_order(page);
- if (likely(order < MAX_ORDER))
+ if (likely(order <= MAX_ORDER)) {
low_pfn += (1UL << order) - 1;
+ nr_scanned += (1UL << order) - 1;
+ }
goto isolate_fail;
}
@@ -939,97 +1017,166 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (unlikely(__PageMovable(page)) &&
!PageIsolated(page)) {
if (locked) {
- spin_unlock_irqrestore(&pgdat->lru_lock,
- flags);
- locked = false;
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
}
- if (!isolate_movable_page(page, isolate_mode))
+ if (isolate_movable_page(page, mode)) {
+ folio = page_folio(page);
goto isolate_success;
+ }
}
goto isolate_fail;
}
/*
+ * Be careful not to clear PageLRU until after we're
+ * sure the page is not being freed elsewhere -- the
+ * page release code relies on it.
+ */
+ folio = folio_get_nontail_page(page);
+ if (unlikely(!folio))
+ goto isolate_fail;
+
+ /*
* Migration will fail if an anonymous page is pinned in memory,
* so avoid taking lru_lock and isolating it unnecessarily in an
* admittedly racy check.
*/
- if (!page_mapping(page) &&
- page_count(page) > page_mapcount(page))
- goto isolate_fail;
+ mapping = folio_mapping(folio);
+ if (!mapping && (folio_ref_count(folio) - 1) > folio_mapcount(folio))
+ goto isolate_fail_put;
/*
* Only allow to migrate anonymous pages in GFP_NOFS context
* because those do not depend on fs locks.
*/
- if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
- goto isolate_fail;
+ if (!(cc->gfp_mask & __GFP_FS) && mapping)
+ goto isolate_fail_put;
+
+ /* Only take pages on LRU: a check now makes later tests safe */
+ if (!folio_test_lru(folio))
+ goto isolate_fail_put;
+
+ /* Compaction might skip unevictable pages but CMA takes them */
+ if (!(mode & ISOLATE_UNEVICTABLE) && folio_test_unevictable(folio))
+ goto isolate_fail_put;
+
+ /*
+ * To minimise LRU disruption, the caller can indicate with
+ * ISOLATE_ASYNC_MIGRATE that it only wants to isolate pages
+ * it will be able to migrate without blocking - clean pages
+ * for the most part. PageWriteback would require blocking.
+ */
+ if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_writeback(folio))
+ goto isolate_fail_put;
+
+ if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_dirty(folio)) {
+ bool migrate_dirty;
+
+ /*
+ * Only pages without mappings or that have a
+ * ->migrate_folio callback are possible to migrate
+ * without blocking. However, we can be racing with
+ * truncation so it's necessary to lock the page
+ * to stabilise the mapping as truncation holds
+ * the page lock until after the page is removed
+ * from the page cache.
+ */
+ if (!folio_trylock(folio))
+ goto isolate_fail_put;
+
+ mapping = folio_mapping(folio);
+ migrate_dirty = !mapping ||
+ mapping->a_ops->migrate_folio;
+ folio_unlock(folio);
+ if (!migrate_dirty)
+ goto isolate_fail_put;
+ }
+
+ /* Try isolate the folio */
+ if (!folio_test_clear_lru(folio))
+ goto isolate_fail_put;
+
+ lruvec = folio_lruvec(folio);
/* If we already hold the lock, we can skip some rechecking */
- if (!locked) {
- locked = compact_lock_irqsave(&pgdat->lru_lock,
- &flags, cc);
+ if (lruvec != locked) {
+ if (locked)
+ unlock_page_lruvec_irqrestore(locked, flags);
+
+ compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
+ locked = lruvec;
- /* Try get exclusive access under lock */
- if (!skip_updated) {
+ lruvec_memcg_debug(lruvec, folio);
+
+ /*
+ * Try get exclusive access under lock. If marked for
+ * skip, the scan is aborted unless the current context
+ * is a rescan to reach the end of the pageblock.
+ */
+ if (!skip_updated && valid_page) {
skip_updated = true;
- if (test_and_set_skip(cc, page, low_pfn))
+ if (test_and_set_skip(cc, valid_page) &&
+ !cc->finish_pageblock) {
goto isolate_abort;
+ }
}
- /* Recheck PageLRU and PageCompound under lock */
- if (!PageLRU(page))
- goto isolate_fail;
-
/*
- * Page become compound since the non-locked check,
- * and it's on LRU. It can only be a THP so the order
- * is safe to read and it's 0 for tail pages.
+ * The folio became large since the non-locked check,
+ * and it's on LRU.
*/
- if (unlikely(PageCompound(page) && !cc->alloc_contig)) {
- low_pfn += compound_nr(page) - 1;
- goto isolate_fail;
+ if (unlikely(folio_test_large(folio) && !cc->alloc_contig)) {
+ low_pfn += folio_nr_pages(folio) - 1;
+ nr_scanned += folio_nr_pages(folio) - 1;
+ folio_set_lru(folio);
+ goto isolate_fail_put;
}
}
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
-
- /* Try isolate the page */
- if (__isolate_lru_page(page, isolate_mode) != 0)
- goto isolate_fail;
-
- /* The whole page is taken off the LRU; skip the tail pages. */
- if (PageCompound(page))
- low_pfn += compound_nr(page) - 1;
+ /* The folio is taken off the LRU */
+ if (folio_test_large(folio))
+ low_pfn += folio_nr_pages(folio) - 1;
/* Successfully isolated */
- del_page_from_lru_list(page, lruvec, page_lru(page));
- mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_is_file_lru(page),
- thp_nr_pages(page));
+ lruvec_del_folio(lruvec, folio);
+ node_stat_mod_folio(folio,
+ NR_ISOLATED_ANON + folio_is_file_lru(folio),
+ folio_nr_pages(folio));
isolate_success:
- list_add(&page->lru, &cc->migratepages);
- cc->nr_migratepages++;
- nr_isolated++;
+ list_add(&folio->lru, &cc->migratepages);
+isolate_success_no_list:
+ cc->nr_migratepages += folio_nr_pages(folio);
+ nr_isolated += folio_nr_pages(folio);
+ nr_scanned += folio_nr_pages(folio) - 1;
/*
* Avoid isolating too much unless this block is being
- * rescanned (e.g. dirty/writeback pages, parallel allocation)
+ * fully scanned (e.g. dirty/writeback pages, parallel allocation)
* or a lock is contended. For contention, isolate quickly to
* potentially remove one source of contention.
*/
- if (cc->nr_migratepages == COMPACT_CLUSTER_MAX &&
- !cc->rescan && !cc->contended) {
+ if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX &&
+ !cc->finish_pageblock && !cc->contended) {
++low_pfn;
break;
}
continue;
+
+isolate_fail_put:
+ /* Avoid potential deadlock in freeing page under lru_lock */
+ if (locked) {
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
+ }
+ folio_put(folio);
+
isolate_fail:
- if (!skip_on_failure)
+ if (!skip_on_failure && ret != -ENOMEM)
continue;
/*
@@ -1039,8 +1186,8 @@ isolate_fail:
*/
if (nr_isolated) {
if (locked) {
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- locked = false;
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
}
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
@@ -1055,6 +1202,9 @@ isolate_fail:
*/
next_skip_pfn += 1UL << cc->order;
}
+
+ if (ret == -ENOMEM)
+ break;
}
/*
@@ -1064,20 +1214,26 @@ isolate_fail:
if (unlikely(low_pfn > end_pfn))
low_pfn = end_pfn;
+ folio = NULL;
+
isolate_abort:
if (locked)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ unlock_page_lruvec_irqrestore(locked, flags);
+ if (folio) {
+ folio_set_lru(folio);
+ folio_put(folio);
+ }
/*
- * Updated the cached scanner pfn once the pageblock has been scanned
+ * Update the cached scanner pfn once the pageblock has been scanned.
* Pages will either be migrated in which case there is no point
* scanning in the near future or migration failed in which case the
* failure reason may persist. The block is marked for skipping if
* there were no pages isolated in the block or if the block is
* rescanned twice in a row.
*/
- if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) {
- if (valid_page && !skip_updated)
+ if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) {
+ if (!cc->no_set_skip_hint && valid_page && !skip_updated)
set_pageblock_skip(valid_page);
update_cached_migrate(cc, low_pfn);
}
@@ -1090,7 +1246,9 @@ fatal_pending:
if (nr_isolated)
count_compact_events(COMPACTISOLATED, nr_isolated);
- return low_pfn;
+ cc->migrate_pfn = low_pfn;
+
+ return ret;
}
/**
@@ -1099,15 +1257,15 @@ fatal_pending:
* @start_pfn: The first PFN to start isolating.
* @end_pfn: The one-past-last PFN.
*
- * Returns zero if isolation fails fatally due to e.g. pending signal.
- * Otherwise, function returns one-past-the-last PFN of isolated page
- * (which may be greater than end_pfn if end fell in a middle of a THP page).
+ * Returns -EAGAIN when contended, -EINTR in case of a signal pending, -ENOMEM
+ * in case we could not allocate a page, or 0.
*/
-unsigned long
+int
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
unsigned long end_pfn)
{
unsigned long pfn, block_start_pfn, block_end_pfn;
+ int ret = 0;
/* Scan block by block. First and last block may be incomplete */
pfn = start_pfn;
@@ -1126,17 +1284,17 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
block_end_pfn, cc->zone))
continue;
- pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
- ISOLATE_UNEVICTABLE);
+ ret = isolate_migratepages_block(cc, pfn, block_end_pfn,
+ ISOLATE_UNEVICTABLE);
- if (!pfn)
+ if (ret)
break;
- if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
+ if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX)
break;
}
- return pfn;
+ return ret;
}
#endif /* CONFIG_COMPACTION || CONFIG_CMA */
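Since the return convention of isolate_migratepages_range() changes from "next PFN or 0" to an errno, with the resume position published in cc->migrate_pfn, a hedged caller sketch may help. demo_cc stands for a hypothetical, already initialised compact_control (declared in mm/internal.h), and the error handling shown is only one plausible policy.

static int demo_isolate_range(struct compact_control *demo_cc,
                              unsigned long start_pfn, unsigned long end_pfn)
{
        int ret;

        ret = isolate_migratepages_range(demo_cc, start_pfn, end_pfn);
        switch (ret) {
        case -EINTR:    /* fatal signal pending: give up */
        case -ENOMEM:   /* allocation failed during isolation */
                return ret;
        case -EAGAIN:   /* contended or congested: caller may retry later */
                return ret;
        default:
                /*
                 * 0: isolated pages sit on demo_cc->migratepages and the
                 * scan can resume from demo_cc->migrate_pfn.
                 */
                return 0;
        }
}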
@@ -1172,7 +1330,7 @@ static bool suitable_migration_target(struct compact_control *cc,
* the only small danger is that we skip a potentially suitable
* pageblock, so it's not worth to check order for valid range.
*/
- if (page_order_unsafe(page) >= pageblock_order)
+ if (buddy_order_unsafe(page) >= pageblock_order)
return false;
}
@@ -1217,8 +1375,7 @@ move_freelist_head(struct list_head *freelist, struct page *freepage)
if (!list_is_last(freelist, &freepage->lru)) {
list_cut_before(&sublist, freelist, &freepage->lru);
- if (!list_empty(&sublist))
- list_splice_tail(&sublist, freelist);
+ list_splice_tail(&sublist, freelist);
}
}
@@ -1235,16 +1392,15 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage)
if (!list_is_first(freelist, &freepage->lru)) {
list_cut_position(&sublist, freelist, &freepage->lru);
- if (!list_empty(&sublist))
- list_splice_tail(&sublist, freelist);
+ list_splice_tail(&sublist, freelist);
}
}
static void
-fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated)
+fast_isolate_around(struct compact_control *cc, unsigned long pfn)
{
unsigned long start_pfn, end_pfn;
- struct page *page = pfn_to_page(pfn);
+ struct page *page;
/* Do not search around if there are enough pages already */
if (cc->nr_freepages >= cc->nr_migratepages)
@@ -1255,24 +1411,20 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long
return;
/* Pageblock boundaries */
- start_pfn = pageblock_start_pfn(pfn);
- end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone)) - 1;
+ start_pfn = max(pageblock_start_pfn(pfn), cc->zone->zone_start_pfn);
+ end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone));
- /* Scan before */
- if (start_pfn != pfn) {
- isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, 1, false);
- if (cc->nr_freepages >= cc->nr_migratepages)
- return;
- }
+ page = pageblock_pfn_to_page(start_pfn, end_pfn, cc->zone);
+ if (!page)
+ return;
- /* Scan after */
- start_pfn = pfn + nr_isolated;
- if (start_pfn < end_pfn)
- isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
+ isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
/* Skip this pageblock in the future as it's full or nearly full */
- if (cc->nr_freepages < cc->nr_migratepages)
+ if (start_pfn == end_pfn)
set_pageblock_skip(page);
+
+ return;
}
/* Search orders in round-robin fashion */
@@ -1293,12 +1445,11 @@ static int next_search_order(struct compact_control *cc, int order)
return order;
}
-static unsigned long
-fast_isolate_freepages(struct compact_control *cc)
+static void fast_isolate_freepages(struct compact_control *cc)
{
- unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1);
- unsigned int nr_scanned = 0;
- unsigned long low_pfn, min_pfn, high_pfn = 0, highest = 0;
+ unsigned int limit = max(1U, freelist_scan_limit(cc) >> 1);
+ unsigned int nr_scanned = 0, total_isolated = 0;
+ unsigned long low_pfn, min_pfn, highest = 0;
unsigned long nr_isolated = 0;
unsigned long distance;
struct page *page = NULL;
@@ -1307,7 +1458,7 @@ fast_isolate_freepages(struct compact_control *cc)
/* Full compaction passes in a negative order */
if (cc->order <= 0)
- return cc->free_pfn;
+ return;
/*
* If starting the scan, use a deeper search and use the highest
@@ -1343,6 +1494,7 @@ fast_isolate_freepages(struct compact_control *cc)
struct page *freepage;
unsigned long flags;
unsigned int order_scanned = 0;
+ unsigned long high_pfn = 0;
if (!area->nr_free)
continue;
@@ -1357,7 +1509,8 @@ fast_isolate_freepages(struct compact_control *cc)
pfn = page_to_pfn(freepage);
if (pfn >= highest)
- highest = pageblock_start_pfn(pfn);
+ highest = max(pageblock_start_pfn(pfn),
+ cc->zone->zone_start_pfn);
if (pfn >= low_pfn) {
cc->fast_search_fail = 0;
@@ -1393,6 +1546,8 @@ fast_isolate_freepages(struct compact_control *cc)
if (__isolate_free_page(page, order)) {
set_page_private(page, order);
nr_isolated = 1 << order;
+ nr_scanned += nr_isolated - 1;
+ total_isolated += nr_isolated;
cc->nr_freepages += nr_isolated;
list_add_tail(&page->lru, &cc->freepages);
count_compact_events(COMPACTISOLATED, nr_isolated);
@@ -1405,14 +1560,21 @@ fast_isolate_freepages(struct compact_control *cc)
spin_unlock_irqrestore(&cc->zone->lock, flags);
+ /* Skip fast search if enough freepages isolated */
+ if (cc->nr_freepages >= cc->nr_migratepages)
+ break;
+
/*
- * Smaller scan on next order so the total scan ig related
+ * Smaller scan on next order so the total scan is related
* to freelist_scan_limit.
*/
if (order_scanned >= limit)
- limit = min(1U, limit >> 1);
+ limit = max(1U, limit >> 1);
}
+ trace_mm_compaction_fast_isolate_freepages(min_pfn, cc->free_pfn,
+ nr_scanned, total_isolated);
+
if (!page) {
cc->fast_search_fail++;
if (scan_start) {
@@ -1421,13 +1583,14 @@ fast_isolate_freepages(struct compact_control *cc)
* not found, be pessimistic for direct compaction
* and use the min mark.
*/
- if (highest) {
+ if (highest >= min_pfn) {
page = pfn_to_page(highest);
cc->free_pfn = highest;
} else {
if (cc->direct_compaction && pfn_valid(min_pfn)) {
page = pageblock_pfn_to_page(min_pfn,
- pageblock_end_pfn(min_pfn),
+ min(pageblock_end_pfn(min_pfn),
+ zone_end_pfn(cc->zone)),
cc->zone);
cc->free_pfn = min_pfn;
}
@@ -1442,11 +1605,10 @@ fast_isolate_freepages(struct compact_control *cc)
cc->total_free_scanned += nr_scanned;
if (!page)
- return cc->free_pfn;
+ return;
low_pfn = page_to_pfn(page);
- fast_isolate_around(cc, low_pfn, nr_isolated);
- return low_pfn;
+ fast_isolate_around(cc, low_pfn);
}
/*
@@ -1465,7 +1627,7 @@ static void isolate_freepages(struct compact_control *cc)
unsigned int stride;
/* Try a small search of the free lists for a candidate */
- isolate_start_pfn = fast_isolate_freepages(cc);
+ fast_isolate_freepages(cc);
if (cc->nr_freepages)
goto splitmap;
@@ -1502,7 +1664,7 @@ static void isolate_freepages(struct compact_control *cc)
* This can iterate a massively long zone without finding any
* suitable migration targets, so periodically check resched.
*/
- if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
+ if (!(block_start_pfn % (COMPACT_CLUSTER_MAX * pageblock_nr_pages)))
cond_resched();
page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
@@ -1570,11 +1732,10 @@ splitmap:
* This is a migrate-callback that "allocates" freepages by taking pages
* from the isolated freelists in the block we are migrating to.
*/
-static struct page *compaction_alloc(struct page *migratepage,
- unsigned long data)
+static struct folio *compaction_alloc(struct folio *src, unsigned long data)
{
struct compact_control *cc = (struct compact_control *)data;
- struct page *freepage;
+ struct folio *dst;
if (list_empty(&cc->freepages)) {
isolate_freepages(cc);
@@ -1583,11 +1744,11 @@ static struct page *compaction_alloc(struct page *migratepage,
return NULL;
}
- freepage = list_entry(cc->freepages.next, struct page, lru);
- list_del(&freepage->lru);
+ dst = list_entry(cc->freepages.next, struct folio, lru);
+ list_del(&dst->lru);
cc->nr_freepages--;
- return freepage;
+ return dst;
}
/*
@@ -1595,11 +1756,11 @@ static struct page *compaction_alloc(struct page *migratepage,
* freelist. All pages on the freelist are from the same zone, so there is no
* special handling needed for NUMA.
*/
-static void compaction_free(struct page *page, unsigned long data)
+static void compaction_free(struct folio *dst, unsigned long data)
{
struct compact_control *cc = (struct compact_control *)data;
- list_add(&page->lru, &cc->freepages);
+ list_add(&dst->lru, &cc->freepages);
cc->nr_freepages++;
}
@@ -1614,11 +1775,15 @@ typedef enum {
* Allow userspace to control policy on scanning the unevictable LRU for
* compactable pages.
*/
-#ifdef CONFIG_PREEMPT_RT
-int sysctl_compact_unevictable_allowed __read_mostly = 0;
-#else
-int sysctl_compact_unevictable_allowed __read_mostly = 1;
-#endif
+static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNEVICTABLE_DEFAULT;
+/*
+ * Tunable for proactive compaction. It determines how
+ * aggressively the kernel should compact memory in the
+ * background. It takes values in the range [0, 100].
+ */
+static unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
+static int sysctl_extfrag_threshold = 500;
+static int __read_mostly sysctl_compact_memory;
static inline void
update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
@@ -1657,12 +1822,20 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
unsigned long pfn = cc->migrate_pfn;
unsigned long high_pfn;
int order;
+ bool found_block = false;
/* Skip hints are relied on to avoid repeats on the fast search */
if (cc->ignore_skip_hint)
return pfn;
/*
+ * If the pageblock should be finished then do not select a different
+ * pageblock.
+ */
+ if (cc->finish_pageblock)
+ return pfn;
+
+ /*
* If the migrate_pfn is not at the start of a zone or the start
* of a pageblock then assume this is a continuation of a previous
* scan restarted due to COMPACT_CLUSTER_MAX.
@@ -1699,7 +1872,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance);
for (order = cc->order - 1;
- order >= PAGE_ALLOC_COSTLY_ORDER && pfn == cc->migrate_pfn && nr_scanned < limit;
+ order >= PAGE_ALLOC_COSTLY_ORDER && !found_block && nr_scanned < limit;
order--) {
struct free_area *area = &cc->zone->free_area[order];
struct list_head *freelist;
@@ -1714,7 +1887,11 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
list_for_each_entry(freepage, freelist, lru) {
unsigned long free_pfn;
- nr_scanned++;
+ if (nr_scanned++ >= limit) {
+ move_freelist_tail(freelist, freepage);
+ break;
+ }
+
free_pfn = page_to_pfn(freepage);
if (free_pfn < high_pfn) {
/*
@@ -1723,26 +1900,18 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
* the list assumes an entry is deleted, not
* reordered.
*/
- if (get_pageblock_skip(freepage)) {
- if (list_is_last(freelist, &freepage->lru))
- break;
-
+ if (get_pageblock_skip(freepage))
continue;
- }
/* Reorder so a future search skips recent pages */
move_freelist_tail(freelist, freepage);
update_fast_start_pfn(cc, free_pfn);
pfn = pageblock_start_pfn(free_pfn);
+ if (pfn < cc->zone->zone_start_pfn)
+ pfn = cc->zone->zone_start_pfn;
cc->fast_search_fail = 0;
- set_pageblock_skip(freepage);
- break;
- }
-
- if (nr_scanned >= limit) {
- cc->fast_search_fail++;
- move_freelist_tail(freelist, freepage);
+ found_block = true;
break;
}
}
@@ -1755,9 +1924,10 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
* If fast scanning failed then use a cached entry for a page block
* that had free pages as the basis for starting a linear scan.
*/
- if (pfn == cc->migrate_pfn)
+ if (!found_block) {
+ cc->fast_search_fail++;
pfn = reinit_migrate_pfn(cc);
-
+ }
return pfn;
}
@@ -1803,7 +1973,7 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
*/
for (; block_end_pfn <= cc->free_pfn;
fast_find_block = false,
- low_pfn = block_end_pfn,
+ cc->migrate_pfn = low_pfn = block_end_pfn,
block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
@@ -1812,13 +1982,19 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
* many pageblocks unsuitable, so periodically check if we
* need to schedule.
*/
- if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
+ if (!(low_pfn % (COMPACT_CLUSTER_MAX * pageblock_nr_pages)))
cond_resched();
page = pageblock_pfn_to_page(block_start_pfn,
block_end_pfn, cc->zone);
- if (!page)
+ if (!page) {
+ unsigned long next_pfn;
+
+ next_pfn = skip_offline_sections(block_start_pfn);
+ if (next_pfn)
+ block_end_pfn = min(next_pfn, cc->free_pfn);
continue;
+ }
/*
* If isolation recently failed, do not retry. Only check the
@@ -1827,17 +2003,18 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
* before making it "skip" so other compaction instances do
* not scan the same block.
*/
- if (IS_ALIGNED(low_pfn, pageblock_nr_pages) &&
+ if ((pageblock_aligned(low_pfn) ||
+ low_pfn == cc->zone->zone_start_pfn) &&
!fast_find_block && !isolation_suitable(cc, page))
continue;
/*
- * For async compaction, also only scan in MOVABLE blocks
- * without huge pages. Async compaction is optimistic to see
- * if the minimum amount of work satisfies the allocation.
- * The cached PFN is updated as it's possible that all
- * remaining blocks between source and target are unsuitable
- * and the compaction scanners fail to meet.
+ * For async direct compaction, only scan the pageblocks of the
+ * same migratetype without huge pages. Async direct compaction
+ * is optimistic to see if the minimum amount of work satisfies
+ * the allocation. The cached PFN is updated as it's possible
+ * that all remaining blocks between source and target are
+ * unsuitable and the compaction scanners fail to meet.
*/
if (!suitable_migration_source(cc, page)) {
update_cached_migrate(cc, block_end_pfn);
@@ -1845,10 +2022,8 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
}
/* Perform the isolation */
- low_pfn = isolate_migratepages_block(cc, low_pfn,
- block_end_pfn, isolate_mode);
-
- if (!low_pfn)
+ if (isolate_migratepages_block(cc, low_pfn, block_end_pfn,
+ isolate_mode))
return ISOLATE_ABORT;
/*
@@ -1859,9 +2034,6 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
break;
}
- /* Record where migration scanner will be restarted. */
- cc->migrate_pfn = low_pfn;
-
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
@@ -1874,27 +2046,47 @@ static inline bool is_via_compact_memory(int order)
return order == -1;
}
+/*
+ * Determine whether kswapd is (or recently was!) running on this node.
+ *
+ * pgdat_kswapd_lock() pins pgdat->kswapd, so a concurrent kswapd_stop() can't
+ * zero it.
+ */
static bool kswapd_is_running(pg_data_t *pgdat)
{
- return pgdat->kswapd && (pgdat->kswapd->state == TASK_RUNNING);
+ bool running;
+
+ pgdat_kswapd_lock(pgdat);
+ running = pgdat->kswapd && task_is_running(pgdat->kswapd);
+ pgdat_kswapd_unlock(pgdat);
+
+ return running;
}
/*
* A zone's fragmentation score is the external fragmentation wrt to the
- * COMPACTION_HPAGE_ORDER scaled by the zone's size. It returns a value
- * in the range [0, 100].
+ * COMPACTION_HPAGE_ORDER. It returns a value in the range [0, 100].
+ */
+static unsigned int fragmentation_score_zone(struct zone *zone)
+{
+ return extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
+}
+
+/*
+ * A weighted zone's fragmentation score is the external fragmentation
+ * with respect to COMPACTION_HPAGE_ORDER, scaled by the zone's size. It
+ * returns a value in the range [0, 100].
*
* The scaling factor ensures that proactive compaction focuses on larger
* zones like ZONE_NORMAL, rather than smaller, specialized zones like
* ZONE_DMA32. For smaller zones, the score value remains close to zero,
* and thus never exceeds the high threshold for proactive compaction.
*/
-static unsigned int fragmentation_score_zone(struct zone *zone)
+static unsigned int fragmentation_score_zone_weighted(struct zone *zone)
{
unsigned long score;
- score = zone->present_pages *
- extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
+ score = zone->present_pages * fragmentation_score_zone(zone);
return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
}
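A quick numeric model of the weighting above (plain C standing in for div64_ul(); all figures are made up): a zone holding most of the node's pages keeps nearly its raw extfrag score, while a small zone is scaled toward zero.

static unsigned int model_score_zone_weighted(unsigned long zone_pages,
                                              unsigned long node_pages,
                                              unsigned int raw_extfrag)
{
        unsigned long long score = (unsigned long long)zone_pages * raw_extfrag;

        return (unsigned int)(score / (node_pages + 1));
}

/*
 * Example: a raw score of 40 on a zone with 6291456 of the node's 8388608
 * pages (24 of 32 GiB) weighs in at about 30, while the same raw score on a
 * 262144-page (1 GiB) zone contributes only about 1 to the node score.
 */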
@@ -1914,7 +2106,9 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat)
struct zone *zone;
zone = &pgdat->node_zones[zoneid];
- score += fragmentation_score_zone(zone);
+ if (!populated_zone(zone))
+ continue;
+ score += fragmentation_score_zone_weighted(zone);
}
return score;
@@ -1925,8 +2119,8 @@ static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low)
unsigned int wmark_low;
/*
- * Cap the low watermak to avoid excessive compaction
- * activity in case a user sets the proactivess tunable
+ * Cap the low watermark to avoid excessive compaction
+ * activity in case a user sets the proactiveness tunable
* close to 100 (maximum).
*/
wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
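Worked numbers for the cap above, as a small model (the tunable is assumed to stay within its documented [0, 100] range): the default proactiveness of 20 yields a low score watermark of 80, and pushing the tunable to 100 leaves the 5-point floor instead of 0.

static unsigned int model_score_wmark_low(unsigned int proactiveness)
{
        unsigned int raw = 100u - proactiveness;    /* proactiveness <= 100 */

        return raw > 5u ? raw : 5u;                 /* 20 -> 80, 100 -> 5 */
}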
@@ -1998,12 +2192,12 @@ static enum compact_result __compact_finished(struct compact_control *cc)
* migration source is unmovable/reclaimable but it's not worth
* special casing.
*/
- if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
+ if (!pageblock_aligned(cc->migrate_pfn))
return COMPACT_CONTINUE;
/* Direct compactor: Is a suitable page free? */
ret = COMPACT_NO_SUITABLE_PAGE;
- for (order = cc->order; order < MAX_ORDER; order++) {
+ for (order = cc->order; order <= MAX_ORDER; order++) {
struct free_area *area = &cc->zone->free_area[order];
bool can_steal;
@@ -2022,29 +2216,16 @@ static enum compact_result __compact_finished(struct compact_control *cc)
* other migratetype buddy lists.
*/
if (find_suitable_fallback(area, order, migratetype,
- true, &can_steal) != -1) {
-
- /* movable pages are OK in any pageblock */
- if (migratetype == MIGRATE_MOVABLE)
- return COMPACT_SUCCESS;
-
+ true, &can_steal) != -1)
/*
- * We are stealing for a non-movable allocation. Make
- * sure we finish compacting the current pageblock
- * first so it is as free as possible and we won't
- * have to steal another one soon. This only applies
- * to sync compaction, as async compaction operates
- * on pageblocks of the same migratetype.
+ * Movable pages are OK in any pageblock. If we are
+ * stealing for a non-movable allocation, make sure
+ * we finish compacting the current pageblock first
+ * (which is assured by the above migrate_pfn align
+ * check) so it is as free as possible and we won't
+ * have to steal another one soon.
*/
- if (cc->mode == MIGRATE_ASYNC ||
- IS_ALIGNED(cc->migrate_pfn,
- pageblock_nr_pages)) {
- return COMPACT_SUCCESS;
- }
-
- ret = COMPACT_CONTINUE;
- break;
- }
+ return COMPACT_SUCCESS;
}
out:
@@ -2066,32 +2247,11 @@ static enum compact_result compact_finished(struct compact_control *cc)
return ret;
}
-/*
- * compaction_suitable: Is this suitable to run compaction on this zone now?
- * Returns
- * COMPACT_SKIPPED - If there are too few free pages for compaction
- * COMPACT_SUCCESS - If the allocation would succeed without compaction
- * COMPACT_CONTINUE - If compaction should run now
- */
-static enum compact_result __compaction_suitable(struct zone *zone, int order,
- unsigned int alloc_flags,
- int highest_zoneidx,
- unsigned long wmark_target)
+static bool __compaction_suitable(struct zone *zone, int order,
+ int highest_zoneidx,
+ unsigned long wmark_target)
{
unsigned long watermark;
-
- if (is_via_compact_memory(order))
- return COMPACT_CONTINUE;
-
- watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
- /*
- * If watermarks for high-order allocation are already met, there
- * should be no need for compaction at all.
- */
- if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
- alloc_flags))
- return COMPACT_SUCCESS;
-
/*
* Watermarks for order-0 must be met for compaction to be able to
* isolate free pages for migration targets. This means that the
@@ -2109,22 +2269,20 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
low_wmark_pages(zone) : min_wmark_pages(zone);
watermark += compact_gap(order);
- if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
- ALLOC_CMA, wmark_target))
- return COMPACT_SKIPPED;
-
- return COMPACT_CONTINUE;
+ return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
+ ALLOC_CMA, wmark_target);
}
-enum compact_result compaction_suitable(struct zone *zone, int order,
- unsigned int alloc_flags,
- int highest_zoneidx)
+/*
+ * compaction_suitable: Is this suitable to run compaction on this zone now?
+ */
+bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx)
{
- enum compact_result ret;
- int fragindex;
+ enum compact_result compact_result;
+ bool suitable;
- ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx,
- zone_page_state(zone, NR_FREE_PAGES));
+ suitable = __compaction_suitable(zone, order, highest_zoneidx,
+ zone_page_state(zone, NR_FREE_PAGES));
/*
* fragmentation index determines if allocation failures are due to
* low memory or external fragmentation
@@ -2141,17 +2299,24 @@ enum compact_result compaction_suitable(struct zone *zone, int order,
* excessive compaction for costly orders, but it should not be at the
* expense of system stability.
*/
- if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) {
- fragindex = fragmentation_index(zone, order);
- if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
- ret = COMPACT_NOT_SUITABLE_ZONE;
+ if (suitable) {
+ compact_result = COMPACT_CONTINUE;
+ if (order > PAGE_ALLOC_COSTLY_ORDER) {
+ int fragindex = fragmentation_index(zone, order);
+
+ if (fragindex >= 0 &&
+ fragindex <= sysctl_extfrag_threshold) {
+ suitable = false;
+ compact_result = COMPACT_NOT_SUITABLE_ZONE;
+ }
+ }
+ } else {
+ compact_result = COMPACT_SKIPPED;
}
- trace_mm_compaction_suitable(zone, order, ret);
- if (ret == COMPACT_NOT_SUITABLE_ZONE)
- ret = COMPACT_SKIPPED;
+ trace_mm_compaction_suitable(zone, order, compact_result);
- return ret;
+ return suitable;
}
bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
@@ -2167,7 +2332,6 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
ac->highest_zoneidx, ac->nodemask) {
unsigned long available;
- enum compact_result compact_result;
/*
* Do not consider all the reclaimable memory because we do not
@@ -2177,9 +2341,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
*/
available = zone_reclaimable_pages(zone) / order;
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
- compact_result = __compaction_suitable(zone, order, alloc_flags,
- ac->highest_zoneidx, available);
- if (compact_result != COMPACT_SKIPPED)
+ if (__compaction_suitable(zone, order, ac->highest_zoneidx,
+ available))
return true;
}
@@ -2195,6 +2358,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
unsigned long last_migrated_pfn;
const bool sync = cc->mode != MIGRATE_ASYNC;
bool update_cached;
+ unsigned int nr_succeeded = 0;
/*
* These counters track activities during zone compaction. Initialize
@@ -2208,14 +2372,22 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
INIT_LIST_HEAD(&cc->migratepages);
cc->migratetype = gfp_migratetype(cc->gfp_mask);
- ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
- cc->highest_zoneidx);
- /* Compaction is likely to fail */
- if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
- return ret;
- /* huh, compaction_suitable is returning something unexpected */
- VM_BUG_ON(ret != COMPACT_CONTINUE);
+ if (!is_via_compact_memory(cc->order)) {
+ unsigned long watermark;
+
+ /* Allocation can already succeed, nothing to do */
+ watermark = wmark_pages(cc->zone,
+ cc->alloc_flags & ALLOC_WMARK_MASK);
+ if (zone_watermark_ok(cc->zone, cc->order, watermark,
+ cc->highest_zoneidx, cc->alloc_flags))
+ return COMPACT_SUCCESS;
+
+ /* Compaction is likely to fail */
+ if (!compaction_suitable(cc->zone, cc->order,
+ cc->highest_zoneidx))
+ return COMPACT_SKIPPED;
+ }
/*
* Clear pageblock skip if there were failures recently and compaction
@@ -2264,29 +2436,30 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
update_cached = !sync &&
cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1];
- trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
- cc->free_pfn, end_pfn, sync);
+ trace_mm_compaction_begin(cc, start_pfn, end_pfn, sync);
- migrate_prep_local();
+ /* lru_add_drain_all() could be expensive as it involves other CPUs */
+ lru_add_drain();
while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
int err;
- unsigned long start_pfn = cc->migrate_pfn;
+ unsigned long iteration_start_pfn = cc->migrate_pfn;
/*
- * Avoid multiple rescans which can happen if a page cannot be
- * isolated (dirty/writeback in async mode) or if the migrated
- * pages are being allocated before the pageblock is cleared.
- * The first rescan will capture the entire pageblock for
- * migration. If it fails, it'll be marked skip and scanning
- * will proceed as normal.
+ * Avoid multiple rescans of the same pageblock which can
+ * happen if a page cannot be isolated (dirty/writeback in
+ * async mode) or if the migrated pages are being allocated
+ * before the pageblock is cleared. The first rescan will
+ * capture the entire pageblock for migration. If it fails,
+ * it'll be marked skip and scanning will proceed as normal.
*/
- cc->rescan = false;
+ cc->finish_pageblock = false;
if (pageblock_start_pfn(last_migrated_pfn) ==
- pageblock_start_pfn(start_pfn)) {
- cc->rescan = true;
+ pageblock_start_pfn(iteration_start_pfn)) {
+ cc->finish_pageblock = true;
}
+rescan:
switch (isolate_migratepages(cc)) {
case ISOLATE_ABORT:
ret = COMPACT_CONTENDED;
@@ -2307,16 +2480,14 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
goto check_drain;
case ISOLATE_SUCCESS:
update_cached = false;
- last_migrated_pfn = start_pfn;
- ;
+ last_migrated_pfn = iteration_start_pfn;
}
err = migrate_pages(&cc->migratepages, compaction_alloc,
compaction_free, (unsigned long)cc, cc->mode,
- MR_COMPACTION);
+ MR_COMPACTION, &nr_succeeded);
- trace_mm_compaction_migratepages(cc->nr_migratepages, err,
- &cc->migratepages);
+ trace_mm_compaction_migratepages(cc, nr_succeeded);
/* All pages were either migrated or will be released */
cc->nr_migratepages = 0;
@@ -2331,18 +2502,39 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
goto out;
}
/*
- * We failed to migrate at least one page in the current
- * order-aligned block, so skip the rest of it.
+ * If an ASYNC or SYNC_LIGHT compaction fails to migrate a page
+ * within the current order-aligned block and
+ * fast_find_migrateblock may be used then scan the
+ * remainder of the pageblock. This will mark the
+ * pageblock "skip" to avoid rescanning in the near
+ * future. This will isolate more pages than necessary
+ * for the request but avoid loops due to
+ * fast_find_migrateblock revisiting blocks that were
+ * recently partially scanned.
*/
- if (cc->direct_compaction &&
- (cc->mode == MIGRATE_ASYNC)) {
- cc->migrate_pfn = block_end_pfn(
- cc->migrate_pfn - 1, cc->order);
- /* Draining pcplists is useless in this case */
- last_migrated_pfn = 0;
+ if (!pageblock_aligned(cc->migrate_pfn) &&
+ !cc->ignore_skip_hint && !cc->finish_pageblock &&
+ (cc->mode < MIGRATE_SYNC)) {
+ cc->finish_pageblock = true;
+
+ /*
+ * Draining pcplists does not help THP if
+ * any page failed to migrate. Even after
+ * drain, the pageblock will not be free.
+ */
+ if (cc->order == COMPACTION_HPAGE_ORDER)
+ last_migrated_pfn = 0;
+
+ goto rescan;
}
}
+ /* Stop if a page has been captured */
+ if (capc && capc->page) {
+ ret = COMPACT_SUCCESS;
+ break;
+ }
+
check_drain:
/*
* Has the migration scanner moved away from the previous
@@ -2361,12 +2553,6 @@ check_drain:
last_migrated_pfn = 0;
}
}
-
- /* Stop if a page has been captured */
- if (capc && capc->page) {
- ret = COMPACT_SUCCESS;
- break;
- }
}
out:
@@ -2392,8 +2578,10 @@ out:
count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned);
- trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
- cc->free_pfn, end_pfn, sync, ret);
+ trace_mm_compaction_end(cc, start_pfn, end_pfn, sync, ret);
+
+ VM_BUG_ON(!list_empty(&cc->freepages));
+ VM_BUG_ON(!list_empty(&cc->migratepages));
return ret;
}
@@ -2433,9 +2621,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
ret = compact_zone(&cc, &capc);
- VM_BUG_ON(!list_empty(&cc.freepages));
- VM_BUG_ON(!list_empty(&cc.migratepages));
-
/*
* Make sure we hide capture control first before we read the captured
* page pointer, otherwise an interrupt could free and capture a page
@@ -2443,12 +2628,18 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
*/
WRITE_ONCE(current->capture_control, NULL);
*capture = READ_ONCE(capc.page);
+ /*
+ * Technically, it is also possible that compaction is skipped but
+ * the page is still captured out of luck (an IRQ came and freed the page).
+ * Returning COMPACT_SUCCESS in such cases helps in properly accounting
+ * the COMPACT[STALL|FAIL] when compaction is skipped.
+ */
+ if (*capture)
+ ret = COMPACT_SUCCESS;
return ret;
}
-int sysctl_extfrag_threshold = 500;
-
/**
* try_to_compact_pages - Direct compact to satisfy a high-order allocation
* @gfp_mask: The GFP mask of the current allocation
@@ -2464,7 +2655,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio, struct page **capture)
{
- int may_perform_io = gfp_mask & __GFP_IO;
+ int may_perform_io = (__force int)(gfp_mask & __GFP_IO);
struct zoneref *z;
struct zone *zone;
enum compact_result rc = COMPACT_SKIPPED;
@@ -2559,8 +2750,10 @@ static void proactive_compact_node(pg_data_t *pgdat)
compact_zone(&cc, NULL);
- VM_BUG_ON(!list_empty(&cc.freepages));
- VM_BUG_ON(!list_empty(&cc.migratepages));
+ count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
+ cc.total_migrate_scanned);
+ count_compact_events(KCOMPACTD_FREE_SCANNED,
+ cc.total_free_scanned);
}
}
@@ -2588,9 +2781,6 @@ static void compact_node(int nid)
cc.zone = zone;
compact_zone(&cc, NULL);
-
- VM_BUG_ON(!list_empty(&cc.freepages));
- VM_BUG_ON(!list_empty(&cc.migratepages));
}
}
@@ -2606,23 +2796,48 @@ static void compact_nodes(void)
compact_node(nid);
}
-/* The written value is actually unused, all memory is compacted */
-int sysctl_compact_memory;
+static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+ int rc, nid;
-/*
- * Tunable for proactive compaction. It determines how
- * aggressively the kernel should compact memory in the
- * background. It takes values in the range [0, 100].
- */
-unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
+ if (write && sysctl_compaction_proactiveness) {
+ for_each_online_node(nid) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ if (pgdat->proactive_compact_trigger)
+ continue;
+
+ pgdat->proactive_compact_trigger = true;
+ trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, -1,
+ pgdat->nr_zones - 1);
+ wake_up_interruptible(&pgdat->kcompactd_wait);
+ }
+ }
+
+ return 0;
+}
/*
* This is the entry point for compacting all nodes via
* /proc/sys/vm/compact_memory
*/
-int sysctl_compaction_handler(struct ctl_table *table, int write,
+static int sysctl_compaction_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, length, ppos);
+ if (ret)
+ return ret;
+
+ if (sysctl_compact_memory != 1)
+ return -EINVAL;
+
if (write)
compact_nodes();
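A small userspace sketch of the stricter handler above: /proc/sys/vm/compact_memory now only accepts the value 1, and any other write fails with EINVAL.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/proc/sys/vm/compact_memory", O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, "1", 1) != 1)     /* writes other than "1" get EINVAL */
                perror("write");
        close(fd);
        return 0;
}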
@@ -2630,9 +2845,9 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
}
#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
-static ssize_t sysfs_compact_node(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t compact_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
int nid = dev->id;
@@ -2645,7 +2860,7 @@ static ssize_t sysfs_compact_node(struct device *dev,
return count;
}
-static DEVICE_ATTR(compact, 0200, NULL, sysfs_compact_node);
+static DEVICE_ATTR_WO(compact);
int compaction_register_node(struct node *node)
{
@@ -2660,7 +2875,8 @@ void compaction_unregister_node(struct node *node)
static inline bool kcompactd_work_requested(pg_data_t *pgdat)
{
- return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
+ return pgdat->kcompactd_max_order > 0 || kthread_should_stop() ||
+ pgdat->proactive_compact_trigger;
}
static bool kcompactd_node_suitable(pg_data_t *pgdat)
@@ -2675,8 +2891,14 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
if (!populated_zone(zone))
continue;
- if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
- highest_zoneidx) == COMPACT_CONTINUE)
+ /* Allocation can already succeed, check other zones */
+ if (zone_watermark_ok(zone, pgdat->kcompactd_max_order,
+ min_wmark_pages(zone),
+ highest_zoneidx, 0))
+ continue;
+
+ if (compaction_suitable(zone, pgdat->kcompactd_max_order,
+ highest_zoneidx))
return true;
}
@@ -2713,8 +2935,12 @@ static void kcompactd_do_work(pg_data_t *pgdat)
if (compaction_deferred(zone, cc.order))
continue;
- if (compaction_suitable(zone, cc.order, 0, zoneid) !=
- COMPACT_CONTINUE)
+ /* Allocation can already succeed, nothing to do */
+ if (zone_watermark_ok(zone, cc.order,
+ min_wmark_pages(zone), zoneid, 0))
+ continue;
+
+ if (!compaction_suitable(zone, cc.order, zoneid))
continue;
if (kthread_should_stop())
@@ -2745,9 +2971,6 @@ static void kcompactd_do_work(pg_data_t *pgdat)
cc.total_migrate_scanned);
count_compact_events(KCOMPACTD_FREE_SCANNED,
cc.total_free_scanned);
-
- VM_BUG_ON(!list_empty(&cc.freepages));
- VM_BUG_ON(!list_empty(&cc.migratepages));
}
/*
@@ -2793,9 +3016,10 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx)
*/
static int kcompactd(void *p)
{
- pg_data_t *pgdat = (pg_data_t*)p;
+ pg_data_t *pgdat = (pg_data_t *)p;
struct task_struct *tsk = current;
- unsigned int proactive_defer = 0;
+ long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC);
+ long timeout = default_timeout;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
@@ -2810,25 +3034,39 @@ static int kcompactd(void *p)
while (!kthread_should_stop()) {
unsigned long pflags;
+ /*
+ * Avoid the unnecessary wakeup for proactive compaction
+ * when it is disabled.
+ */
+ if (!sysctl_compaction_proactiveness)
+ timeout = MAX_SCHEDULE_TIMEOUT;
trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
- kcompactd_work_requested(pgdat),
- msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC))) {
+ kcompactd_work_requested(pgdat), timeout) &&
+ !pgdat->proactive_compact_trigger) {
psi_memstall_enter(&pflags);
kcompactd_do_work(pgdat);
psi_memstall_leave(&pflags);
+ /*
+ * Reset the timeout value. The defer timeout from
+ * proactive compaction is lost here but that is fine, as
+ * the condition of the zone may have changed substantially,
+ * so carrying on with the previous defer interval is not
+ * useful.
+ */
+ timeout = default_timeout;
continue;
}
- /* kcompactd wait timeout */
+ /*
+ * Start the proactive work with default timeout. Based
+ * on the fragmentation score, this timeout is updated.
+ */
+ timeout = default_timeout;
if (should_proactive_compact_node(pgdat)) {
unsigned int prev_score, score;
- if (proactive_defer) {
- proactive_defer--;
- continue;
- }
prev_score = fragmentation_score_node(pgdat);
proactive_compact_node(pgdat);
score = fragmentation_score_node(pgdat);
@@ -2836,9 +3074,12 @@ static int kcompactd(void *p)
* Defer proactive compaction if the fragmentation
* score did not go down i.e. no progress made.
*/
- proactive_defer = score < prev_score ?
- 0 : 1 << COMPACT_MAX_DEFER_SHIFT;
+ if (unlikely(score >= prev_score))
+ timeout =
+ default_timeout << COMPACT_MAX_DEFER_SHIFT;
}
+ if (unlikely(pgdat->proactive_compact_trigger))
+ pgdat->proactive_compact_trigger = false;
}
return 0;
@@ -2848,28 +3089,25 @@ static int kcompactd(void *p)
* This kcompactd start function will be called by init and node-hot-add.
 * On node-hot-add, kcompactd will be moved to the proper cpus if cpus are hot-added.
*/
-int kcompactd_run(int nid)
+void __meminit kcompactd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
- int ret = 0;
if (pgdat->kcompactd)
- return 0;
+ return;
pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
if (IS_ERR(pgdat->kcompactd)) {
pr_err("Failed to start kcompactd on node %d\n", nid);
- ret = PTR_ERR(pgdat->kcompactd);
pgdat->kcompactd = NULL;
}
- return ret;
}
/*
* Called by memory hotplug when all memory in a node is offlined. Caller must
- * hold mem_hotplug_begin/end().
+ * be holding mem_hotplug_begin/done().
*/
-void kcompactd_stop(int nid)
+void __meminit kcompactd_stop(int nid)
{
struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
@@ -2897,11 +3135,69 @@ static int kcompactd_cpu_online(unsigned int cpu)
if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
/* One of our CPUs online: restore mask */
- set_cpus_allowed_ptr(pgdat->kcompactd, mask);
+ if (pgdat->kcompactd)
+ set_cpus_allowed_ptr(pgdat->kcompactd, mask);
}
return 0;
}
+static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
+ int write, void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, old;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ old = *(int *)table->data;
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+ if (old != *(int *)table->data)
+ pr_warn_once("sysctl attribute %s changed by %s[%d]\n",
+ table->procname, current->comm,
+ task_pid_nr(current));
+ return ret;
+}
+
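+/*
+ * Sysctl knobs for compaction, registered under "vm" (i.e. as /proc/sys/vm/
+ * entries) by kcompactd_init() below.
+ */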
+static struct ctl_table vm_compaction[] = {
+ {
+ .procname = "compact_memory",
+ .data = &sysctl_compact_memory,
+ .maxlen = sizeof(int),
+ .mode = 0200,
+ .proc_handler = sysctl_compaction_handler,
+ },
+ {
+ .procname = "compaction_proactiveness",
+ .data = &sysctl_compaction_proactiveness,
+ .maxlen = sizeof(sysctl_compaction_proactiveness),
+ .mode = 0644,
+ .proc_handler = compaction_proactiveness_sysctl_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
+ {
+ .procname = "extfrag_threshold",
+ .data = &sysctl_extfrag_threshold,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_THOUSAND,
+ },
+ {
+ .procname = "compact_unevictable_allowed",
+ .data = &sysctl_compact_unevictable_allowed,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_warn_RT_change,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ { }
+};
+
static int __init kcompactd_init(void)
{
int nid;
@@ -2917,6 +3213,7 @@ static int __init kcompactd_init(void)
for_each_node_state(nid, N_MEMORY)
kcompactd_run(nid);
+ register_sysctl_init("vm", vm_compaction);
return 0;
}
subsys_initcall(kcompactd_init)
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
new file mode 100644
index 000000000000..436c6b4cb5ec
--- /dev/null
+++ b/mm/damon/Kconfig
@@ -0,0 +1,107 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menu "Data Access Monitoring"
+
+config DAMON
+ bool "DAMON: Data Access Monitoring Framework"
+ help
+ This builds a framework that allows kernel subsystems to monitor
+ access frequency of each memory region. The information can be useful
+ for performance-centric DRAM level memory management.
+
+ See https://damonitor.github.io/doc/html/latest-damon/index.html for
+ more information.
+
+config DAMON_KUNIT_TEST
+ bool "Test for damon" if !KUNIT_ALL_TESTS
+ depends on DAMON && KUNIT=y
+ default KUNIT_ALL_TESTS
+ help
+ This builds the DAMON Kunit test suite.
+
+ For more information on KUnit and unit tests in general, please refer
+ to the KUnit documentation.
+
+ If unsure, say N.
+
+config DAMON_VADDR
+ bool "Data access monitoring operations for virtual address spaces"
+ depends on DAMON && MMU
+ select PAGE_IDLE_FLAG
+ help
+ This builds the default data access monitoring operations for DAMON
+ that work for virtual address spaces.
+
+config DAMON_PADDR
+ bool "Data access monitoring operations for the physical address space"
+ depends on DAMON && MMU
+ select PAGE_IDLE_FLAG
+ help
+ This builds the default data access monitoring operations for DAMON
+ that work for the physical address space.
+
+config DAMON_VADDR_KUNIT_TEST
+ bool "Test for DAMON operations" if !KUNIT_ALL_TESTS
+ depends on DAMON_VADDR && KUNIT=y
+ default KUNIT_ALL_TESTS
+ help
+ This builds the DAMON virtual addresses operations Kunit test suite.
+
+ For more information on KUnit and unit tests in general, please refer
+ to the KUnit documentation.
+
+ If unsure, say N.
+
+config DAMON_SYSFS
+ bool "DAMON sysfs interface"
+ depends on DAMON && SYSFS
+ help
+ This builds the sysfs interface for DAMON. User space can use
+ the interface for arbitrary data access monitoring.
+
+config DAMON_DBGFS
+ bool "DAMON debugfs interface (DEPRECATED!)"
+ depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS
+ help
+ This builds the debugfs interface for DAMON. The user space admins
+ can use the interface for arbitrary data access monitoring.
+
+ If unsure, say N.
+
+ This is deprecated, so users should move to the sysfs interface
+ (DAMON_SYSFS). If you depend on this and cannot move, please report
+ your usecase to damon@lists.linux.dev and linux-mm@kvack.org.
+
+config DAMON_DBGFS_KUNIT_TEST
+ bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS
+ depends on DAMON_DBGFS && KUNIT=y
+ default KUNIT_ALL_TESTS
+ help
+ This builds the DAMON debugfs interface Kunit test suite.
+
+ For more information on KUnit and unit tests in general, please refer
+ to the KUnit documentation.
+
+ If unsure, say N.
+
+config DAMON_RECLAIM
+ bool "Build DAMON-based reclaim (DAMON_RECLAIM)"
+ depends on DAMON_PADDR
+ help
+ This builds the DAMON-based reclamation subsystem. It uses DAMON to
+ find pages that have not been accessed for a long time (cold) and
+ reclaims them.
+
+ This is suggested to be used as a proactive and lightweight
+ reclamation under light memory pressure, while the traditional page
+ scanning-based reclamation is used for heavy pressure.
+
+config DAMON_LRU_SORT
+ bool "Build DAMON-based LRU-lists sorting (DAMON_LRU_SORT)"
+ depends on DAMON_PADDR
+ help
+ This builds the DAMON-based LRU-lists sorting subsystem. It tries to
+ protect frequently accessed (hot) pages and to let rarely accessed
+ (cold) pages be reclaimed first under memory pressure.
+
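+# An illustrative .config fragment for virtual address space monitoring
+# through the sysfs interface (adjust to your needs):
+#   CONFIG_DAMON=y
+#   CONFIG_DAMON_VADDR=y
+#   CONFIG_DAMON_SYSFS=y
+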
+endmenu
diff --git a/mm/damon/Makefile b/mm/damon/Makefile
new file mode 100644
index 000000000000..f7add3f4aa79
--- /dev/null
+++ b/mm/damon/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y := core.o
+obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o
+obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o
+obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o
+obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o
+obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o
+obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o
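+
+# Note: ops-common.o and modules-common.o are shared helpers, so they are
+# listed under every option that needs them.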
diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h
new file mode 100644
index 000000000000..bb07721909e1
--- /dev/null
+++ b/mm/damon/core-test.h
@@ -0,0 +1,367 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Data Access Monitor Unit Tests
+ *
+ * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ *
+ * Author: SeongJae Park <sjpark@amazon.de>
+ */
+
+#ifdef CONFIG_DAMON_KUNIT_TEST
+
+#ifndef _DAMON_CORE_TEST_H
+#define _DAMON_CORE_TEST_H
+
+#include <kunit/test.h>
+
+static void damon_test_regions(struct kunit *test)
+{
+ struct damon_region *r;
+ struct damon_target *t;
+
+ r = damon_new_region(1, 2);
+ KUNIT_EXPECT_EQ(test, 1ul, r->ar.start);
+ KUNIT_EXPECT_EQ(test, 2ul, r->ar.end);
+ KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses);
+
+ t = damon_new_target();
+ KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t));
+
+ damon_add_region(r, t);
+ KUNIT_EXPECT_EQ(test, 1u, damon_nr_regions(t));
+
+ damon_del_region(r, t);
+ KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t));
+
+ damon_free_target(t);
+}
+
+static unsigned int nr_damon_targets(struct damon_ctx *ctx)
+{
+ struct damon_target *t;
+ unsigned int nr_targets = 0;
+
+ damon_for_each_target(t, ctx)
+ nr_targets++;
+
+ return nr_targets;
+}
+
+static void damon_test_target(struct kunit *test)
+{
+ struct damon_ctx *c = damon_new_ctx();
+ struct damon_target *t;
+
+ t = damon_new_target();
+ KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c));
+
+ damon_add_target(c, t);
+ KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(c));
+
+ damon_destroy_target(t);
+ KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c));
+
+ damon_destroy_ctx(c);
+}
+
+/*
+ * Test kdamond_reset_aggregated()
+ *
+ * DAMON checks access to each region and aggregates this information as the
+ * access frequency of each region. In detail, it increases '->nr_accesses' of
+ * regions for which an access has been confirmed. 'kdamond_reset_aggregated()'
+ * flushes the aggregated information ('->nr_accesses' of each region) to the
+ * result buffer. As a result of the flushing, the '->nr_accesses' of regions are
+ * initialized to zero.
+ */
+static void damon_test_aggregate(struct kunit *test)
+{
+ struct damon_ctx *ctx = damon_new_ctx();
+ unsigned long saddr[][3] = {{10, 20, 30}, {5, 42, 49}, {13, 33, 55} };
+ unsigned long eaddr[][3] = {{15, 27, 40}, {31, 45, 55}, {23, 44, 66} };
+ unsigned long accesses[][3] = {{42, 95, 84}, {10, 20, 30}, {0, 1, 2} };
+ struct damon_target *t;
+ struct damon_region *r;
+ int it, ir;
+
+ for (it = 0; it < 3; it++) {
+ t = damon_new_target();
+ damon_add_target(ctx, t);
+ }
+
+ it = 0;
+ damon_for_each_target(t, ctx) {
+ for (ir = 0; ir < 3; ir++) {
+ r = damon_new_region(saddr[it][ir], eaddr[it][ir]);
+ r->nr_accesses = accesses[it][ir];
+ damon_add_region(r, t);
+ }
+ it++;
+ }
+ kdamond_reset_aggregated(ctx);
+ it = 0;
+ damon_for_each_target(t, ctx) {
+ ir = 0;
+ /* '->nr_accesses' should be zeroed */
+ damon_for_each_region(r, t) {
+ KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses);
+ ir++;
+ }
+ /* regions should be preserved */
+ KUNIT_EXPECT_EQ(test, 3, ir);
+ it++;
+ }
+ /* targets also should be preserved */
+ KUNIT_EXPECT_EQ(test, 3, it);
+
+ damon_destroy_ctx(ctx);
+}
+
+static void damon_test_split_at(struct kunit *test)
+{
+ struct damon_ctx *c = damon_new_ctx();
+ struct damon_target *t;
+ struct damon_region *r;
+
+ t = damon_new_target();
+ r = damon_new_region(0, 100);
+ damon_add_region(r, t);
+ damon_split_region_at(t, r, 25);
+ KUNIT_EXPECT_EQ(test, r->ar.start, 0ul);
+ KUNIT_EXPECT_EQ(test, r->ar.end, 25ul);
+
+ r = damon_next_region(r);
+ KUNIT_EXPECT_EQ(test, r->ar.start, 25ul);
+ KUNIT_EXPECT_EQ(test, r->ar.end, 100ul);
+
+ damon_free_target(t);
+ damon_destroy_ctx(c);
+}
+
+static void damon_test_merge_two(struct kunit *test)
+{
+ struct damon_target *t;
+ struct damon_region *r, *r2, *r3;
+ int i;
+
+ t = damon_new_target();
+ r = damon_new_region(0, 100);
+ r->nr_accesses = 10;
+ damon_add_region(r, t);
+ r2 = damon_new_region(100, 300);
+ r2->nr_accesses = 20;
+ damon_add_region(r2, t);
+
+ damon_merge_two_regions(t, r, r2);
+ KUNIT_EXPECT_EQ(test, r->ar.start, 0ul);
+ KUNIT_EXPECT_EQ(test, r->ar.end, 300ul);
+ KUNIT_EXPECT_EQ(test, r->nr_accesses, 16u);
+
+ i = 0;
+ damon_for_each_region(r3, t) {
+ KUNIT_EXPECT_PTR_EQ(test, r, r3);
+ i++;
+ }
+ KUNIT_EXPECT_EQ(test, i, 1);
+
+ damon_free_target(t);
+}
+
+static struct damon_region *__nth_region_of(struct damon_target *t, int idx)
+{
+ struct damon_region *r;
+ unsigned int i = 0;
+
+ damon_for_each_region(r, t) {
+ if (i++ == idx)
+ return r;
+ }
+
+ return NULL;
+}
+
+static void damon_test_merge_regions_of(struct kunit *test)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ unsigned long sa[] = {0, 100, 114, 122, 130, 156, 170, 184};
+ unsigned long ea[] = {100, 112, 122, 130, 156, 170, 184, 230};
+ unsigned int nrs[] = {0, 0, 10, 10, 20, 30, 1, 2};
+
+ unsigned long saddrs[] = {0, 114, 130, 156, 170};
+ unsigned long eaddrs[] = {112, 130, 156, 170, 230};
+ int i;
+
+ t = damon_new_target();
+ for (i = 0; i < ARRAY_SIZE(sa); i++) {
+ r = damon_new_region(sa[i], ea[i]);
+ r->nr_accesses = nrs[i];
+ damon_add_region(r, t);
+ }
+
+ damon_merge_regions_of(t, 9, 9999);
+ /* 0-112, 114-130, 130-156, 156-170 */
+ KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 5u);
+ for (i = 0; i < 5; i++) {
+ r = __nth_region_of(t, i);
+ KUNIT_EXPECT_EQ(test, r->ar.start, saddrs[i]);
+ KUNIT_EXPECT_EQ(test, r->ar.end, eaddrs[i]);
+ }
+ damon_free_target(t);
+}
+
+static void damon_test_split_regions_of(struct kunit *test)
+{
+ struct damon_ctx *c = damon_new_ctx();
+ struct damon_target *t;
+ struct damon_region *r;
+
+ t = damon_new_target();
+ r = damon_new_region(0, 22);
+ damon_add_region(r, t);
+ damon_split_regions_of(t, 2);
+ KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u);
+ damon_free_target(t);
+
+ t = damon_new_target();
+ r = damon_new_region(0, 220);
+ damon_add_region(r, t);
+ damon_split_regions_of(t, 4);
+ KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u);
+ damon_free_target(t);
+ damon_destroy_ctx(c);
+}
+
+static void damon_test_ops_registration(struct kunit *test)
+{
+ struct damon_ctx *c = damon_new_ctx();
+ struct damon_operations ops, bak;
+
+ /* DAMON_OPS_{V,P}ADDR are registered on subsys_initcall */
+ KUNIT_EXPECT_EQ(test, damon_select_ops(c, DAMON_OPS_VADDR), 0);
+ KUNIT_EXPECT_EQ(test, damon_select_ops(c, DAMON_OPS_PADDR), 0);
+
+ /* Double-registration is prohibited */
+ ops.id = DAMON_OPS_VADDR;
+ KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL);
+ ops.id = DAMON_OPS_PADDR;
+ KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL);
+
+ /* Unknown ops id cannot be registered */
+ KUNIT_EXPECT_EQ(test, damon_select_ops(c, NR_DAMON_OPS), -EINVAL);
+
+ /* Registration should succeed after unregistration */
+ mutex_lock(&damon_ops_lock);
+ bak = damon_registered_ops[DAMON_OPS_VADDR];
+ damon_registered_ops[DAMON_OPS_VADDR] = (struct damon_operations){};
+ mutex_unlock(&damon_ops_lock);
+
+ ops.id = DAMON_OPS_VADDR;
+ KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), 0);
+
+ mutex_lock(&damon_ops_lock);
+ damon_registered_ops[DAMON_OPS_VADDR] = bak;
+ mutex_unlock(&damon_ops_lock);
+
+ /* Check double-registration failure again */
+ KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL);
+}
+
+static void damon_test_set_regions(struct kunit *test)
+{
+ struct damon_target *t = damon_new_target();
+ struct damon_region *r1 = damon_new_region(4, 16);
+ struct damon_region *r2 = damon_new_region(24, 32);
+ struct damon_addr_range range = {.start = 8, .end = 28};
+ unsigned long expects[] = {8, 16, 16, 24, 24, 28};
+ int expect_idx = 0;
+ struct damon_region *r;
+
+ damon_add_region(r1, t);
+ damon_add_region(r2, t);
+ damon_set_regions(t, &range, 1);
+
+ KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3);
+ damon_for_each_region(r, t) {
+ KUNIT_EXPECT_EQ(test, r->ar.start, expects[expect_idx++]);
+ KUNIT_EXPECT_EQ(test, r->ar.end, expects[expect_idx++]);
+ }
+ damon_destroy_target(t);
+}
+
+static void damon_test_update_monitoring_result(struct kunit *test)
+{
+ struct damon_attrs old_attrs = {
+ .sample_interval = 10, .aggr_interval = 1000,};
+ struct damon_attrs new_attrs;
+ struct damon_region *r = damon_new_region(3, 7);
+
+ r->nr_accesses = 15;
+ r->age = 20;
+
+ new_attrs = (struct damon_attrs){
+ .sample_interval = 100, .aggr_interval = 10000,};
+ damon_update_monitoring_result(r, &old_attrs, &new_attrs);
+ KUNIT_EXPECT_EQ(test, r->nr_accesses, 15);
+ KUNIT_EXPECT_EQ(test, r->age, 2);
+
+ new_attrs = (struct damon_attrs){
+ .sample_interval = 1, .aggr_interval = 1000};
+ damon_update_monitoring_result(r, &old_attrs, &new_attrs);
+ KUNIT_EXPECT_EQ(test, r->nr_accesses, 150);
+ KUNIT_EXPECT_EQ(test, r->age, 2);
+
+ new_attrs = (struct damon_attrs){
+ .sample_interval = 1, .aggr_interval = 100};
+ damon_update_monitoring_result(r, &old_attrs, &new_attrs);
+ KUNIT_EXPECT_EQ(test, r->nr_accesses, 150);
+ KUNIT_EXPECT_EQ(test, r->age, 20);
+}
+
+static void damon_test_set_attrs(struct kunit *test)
+{
+ struct damon_ctx *c = damon_new_ctx();
+ struct damon_attrs valid_attrs = {
+ .min_nr_regions = 10, .max_nr_regions = 1000,
+ .sample_interval = 5000, .aggr_interval = 100000,};
+ struct damon_attrs invalid_attrs;
+
+ KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &valid_attrs), 0);
+
+ invalid_attrs = valid_attrs;
+ invalid_attrs.min_nr_regions = 1;
+ KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL);
+
+ invalid_attrs = valid_attrs;
+ invalid_attrs.max_nr_regions = 9;
+ KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL);
+
+ invalid_attrs = valid_attrs;
+ invalid_attrs.aggr_interval = 4999;
+ KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL);
+}
+
+static struct kunit_case damon_test_cases[] = {
+ KUNIT_CASE(damon_test_target),
+ KUNIT_CASE(damon_test_regions),
+ KUNIT_CASE(damon_test_aggregate),
+ KUNIT_CASE(damon_test_split_at),
+ KUNIT_CASE(damon_test_merge_two),
+ KUNIT_CASE(damon_test_merge_regions_of),
+ KUNIT_CASE(damon_test_split_regions_of),
+ KUNIT_CASE(damon_test_ops_registration),
+ KUNIT_CASE(damon_test_set_regions),
+ KUNIT_CASE(damon_test_update_monitoring_result),
+ KUNIT_CASE(damon_test_set_attrs),
+ {},
+};
+
+static struct kunit_suite damon_test_suite = {
+ .name = "damon",
+ .test_cases = damon_test_cases,
+};
+kunit_test_suite(damon_test_suite);
+
+#endif /* _DAMON_CORE_TEST_H */
+
+#endif /* CONFIG_DAMON_KUNIT_TEST */
diff --git a/mm/damon/core.c b/mm/damon/core.c
new file mode 100644
index 000000000000..eb9580942a5c
--- /dev/null
+++ b/mm/damon/core.c
@@ -0,0 +1,1471 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Data Access Monitor
+ *
+ * Author: SeongJae Park <sjpark@amazon.de>
+ */
+
+#define pr_fmt(fmt) "damon: " fmt
+
+#include <linux/damon.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/damon.h>
+
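+/*
+ * The KUnit tests in core-test.h operate on tiny, hand-crafted address
+ * ranges, so the minimum region size is lowered to 1 for test builds.
+ */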
+#ifdef CONFIG_DAMON_KUNIT_TEST
+#undef DAMON_MIN_REGION
+#define DAMON_MIN_REGION 1
+#endif
+
+static DEFINE_MUTEX(damon_lock);
+static int nr_running_ctxs;
+static bool running_exclusive_ctxs;
+
+static DEFINE_MUTEX(damon_ops_lock);
+static struct damon_operations damon_registered_ops[NR_DAMON_OPS];
+
+static struct kmem_cache *damon_region_cache __ro_after_init;
+
+/* Should be called under damon_ops_lock with id smaller than NR_DAMON_OPS */
+static bool __damon_is_registered_ops(enum damon_ops_id id)
+{
+ struct damon_operations empty_ops = {};
+
+ if (!memcmp(&empty_ops, &damon_registered_ops[id], sizeof(empty_ops)))
+ return false;
+ return true;
+}
+
+/**
+ * damon_is_registered_ops() - Check if a given damon_operations is registered.
+ * @id: Id of the damon_operations to check if registered.
+ *
+ * Return: true if the ops is set, false otherwise.
+ */
+bool damon_is_registered_ops(enum damon_ops_id id)
+{
+ bool registered;
+
+ if (id >= NR_DAMON_OPS)
+ return false;
+ mutex_lock(&damon_ops_lock);
+ registered = __damon_is_registered_ops(id);
+ mutex_unlock(&damon_ops_lock);
+ return registered;
+}
+
+/**
+ * damon_register_ops() - Register a monitoring operations set to DAMON.
+ * @ops: monitoring operations set to register.
+ *
+ * This function registers a monitoring operations set of valid &struct
+ * damon_operations->id so that others can find and use them later.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int damon_register_ops(struct damon_operations *ops)
+{
+ int err = 0;
+
+ if (ops->id >= NR_DAMON_OPS)
+ return -EINVAL;
+ mutex_lock(&damon_ops_lock);
+ /* Fail for already registered ops */
+ if (__damon_is_registered_ops(ops->id)) {
+ err = -EINVAL;
+ goto out;
+ }
+ damon_registered_ops[ops->id] = *ops;
+out:
+ mutex_unlock(&damon_ops_lock);
+ return err;
+}
+
+/**
+ * damon_select_ops() - Select a monitoring operations to use with the context.
+ * @ctx: monitoring context to use the operations.
+ * @id: id of the registered monitoring operations to select.
+ *
+ * This function finds the registered monitoring operations set of @id and
+ * makes @ctx use it.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id)
+{
+ int err = 0;
+
+ if (id >= NR_DAMON_OPS)
+ return -EINVAL;
+
+ mutex_lock(&damon_ops_lock);
+ if (!__damon_is_registered_ops(id))
+ err = -EINVAL;
+ else
+ ctx->ops = damon_registered_ops[id];
+ mutex_unlock(&damon_ops_lock);
+ return err;
+}
+
+/*
+ * Construct a damon_region struct
+ *
+ * Returns the pointer to the new struct on success, or NULL otherwise
+ */
+struct damon_region *damon_new_region(unsigned long start, unsigned long end)
+{
+ struct damon_region *region;
+
+ region = kmem_cache_alloc(damon_region_cache, GFP_KERNEL);
+ if (!region)
+ return NULL;
+
+ region->ar.start = start;
+ region->ar.end = end;
+ region->nr_accesses = 0;
+ INIT_LIST_HEAD(&region->list);
+
+ region->age = 0;
+ region->last_nr_accesses = 0;
+
+ return region;
+}
+
+void damon_add_region(struct damon_region *r, struct damon_target *t)
+{
+ list_add_tail(&r->list, &t->regions_list);
+ t->nr_regions++;
+}
+
+static void damon_del_region(struct damon_region *r, struct damon_target *t)
+{
+ list_del(&r->list);
+ t->nr_regions--;
+}
+
+static void damon_free_region(struct damon_region *r)
+{
+ kmem_cache_free(damon_region_cache, r);
+}
+
+void damon_destroy_region(struct damon_region *r, struct damon_target *t)
+{
+ damon_del_region(r, t);
+ damon_free_region(r);
+}
+
+/*
+ * Check whether a region is intersecting an address range
+ *
+ * Returns true if it is.
+ */
+static bool damon_intersect(struct damon_region *r,
+ struct damon_addr_range *re)
+{
+ return !(r->ar.end <= re->start || re->end <= r->ar.start);
+}
+
+/*
+ * Fill holes in regions with new regions.
+ */
+static int damon_fill_regions_holes(struct damon_region *first,
+ struct damon_region *last, struct damon_target *t)
+{
+ struct damon_region *r = first;
+
+ damon_for_each_region_from(r, t) {
+ struct damon_region *next, *newr;
+
+ if (r == last)
+ break;
+ next = damon_next_region(r);
+ if (r->ar.end != next->ar.start) {
+ newr = damon_new_region(r->ar.end, next->ar.start);
+ if (!newr)
+ return -ENOMEM;
+ damon_insert_region(newr, r, next, t);
+ }
+ }
+ return 0;
+}
+
+/*
+ * damon_set_regions() - Set regions of a target for given address ranges.
+ * @t: the given target.
+ * @ranges: array of new monitoring target ranges.
+ * @nr_ranges: length of @ranges.
+ *
+ * This function adds new regions to, or modifies existing regions of, a
+ * monitoring target so that they fit in the specified ranges.
+ *
+ * Return: 0 on success, or negative error code otherwise.
+ */
+int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
+ unsigned int nr_ranges)
+{
+ struct damon_region *r, *next;
+ unsigned int i;
+ int err;
+
+ /* Remove regions which are not in the new ranges */
+ damon_for_each_region_safe(r, next, t) {
+ for (i = 0; i < nr_ranges; i++) {
+ if (damon_intersect(r, &ranges[i]))
+ break;
+ }
+ if (i == nr_ranges)
+ damon_destroy_region(r, t);
+ }
+
+ r = damon_first_region(t);
+ /* Add new regions or resize existing regions to fit in the ranges */
+ for (i = 0; i < nr_ranges; i++) {
+ struct damon_region *first = NULL, *last, *newr;
+ struct damon_addr_range *range;
+
+ range = &ranges[i];
+ /* Get the first/last regions intersecting with the range */
+ damon_for_each_region_from(r, t) {
+ if (damon_intersect(r, range)) {
+ if (!first)
+ first = r;
+ last = r;
+ }
+ if (r->ar.start >= range->end)
+ break;
+ }
+ if (!first) {
+ /* no region intersects with this range */
+ newr = damon_new_region(
+ ALIGN_DOWN(range->start,
+ DAMON_MIN_REGION),
+ ALIGN(range->end, DAMON_MIN_REGION));
+ if (!newr)
+ return -ENOMEM;
+ damon_insert_region(newr, damon_prev_region(r), r, t);
+ } else {
+ /* resize intersecting regions to fit in this range */
+ first->ar.start = ALIGN_DOWN(range->start,
+ DAMON_MIN_REGION);
+ last->ar.end = ALIGN(range->end, DAMON_MIN_REGION);
+
+ /* fill possible holes in the range */
+ err = damon_fill_regions_holes(first, last, t);
+ if (err)
+ return err;
+ }
+ }
+ return 0;
+}
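+
+/*
+ * For example (mirroring damon_test_set_regions() in core-test.h, where
+ * DAMON_MIN_REGION is 1): a target with regions [4, 16) and [24, 32) that is
+ * set to the single range [8, 28) ends up with the three regions [8, 16),
+ * [16, 24) and [24, 28): the old regions are trimmed to the range and the
+ * hole between them is filled with a new region.
+ */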
+
+struct damos_filter *damos_new_filter(enum damos_filter_type type,
+ bool matching)
+{
+ struct damos_filter *filter;
+
+ filter = kmalloc(sizeof(*filter), GFP_KERNEL);
+ if (!filter)
+ return NULL;
+ filter->type = type;
+ filter->matching = matching;
+ INIT_LIST_HEAD(&filter->list);
+ return filter;
+}
+
+void damos_add_filter(struct damos *s, struct damos_filter *f)
+{
+ list_add_tail(&f->list, &s->filters);
+}
+
+static void damos_del_filter(struct damos_filter *f)
+{
+ list_del(&f->list);
+}
+
+static void damos_free_filter(struct damos_filter *f)
+{
+ kfree(f);
+}
+
+void damos_destroy_filter(struct damos_filter *f)
+{
+ damos_del_filter(f);
+ damos_free_filter(f);
+}
+
+/* initialize private fields of damos_quota and return the pointer */
+static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota)
+{
+ quota->total_charged_sz = 0;
+ quota->total_charged_ns = 0;
+ quota->esz = 0;
+ quota->charged_sz = 0;
+ quota->charged_from = 0;
+ quota->charge_target_from = NULL;
+ quota->charge_addr_from = 0;
+ return quota;
+}
+
+struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
+ enum damos_action action, struct damos_quota *quota,
+ struct damos_watermarks *wmarks)
+{
+ struct damos *scheme;
+
+ scheme = kmalloc(sizeof(*scheme), GFP_KERNEL);
+ if (!scheme)
+ return NULL;
+ scheme->pattern = *pattern;
+ scheme->action = action;
+ INIT_LIST_HEAD(&scheme->filters);
+ scheme->stat = (struct damos_stat){};
+ INIT_LIST_HEAD(&scheme->list);
+
+ scheme->quota = *(damos_quota_init_priv(quota));
+
+ scheme->wmarks = *wmarks;
+ scheme->wmarks.activated = true;
+
+ return scheme;
+}
+
+void damon_add_scheme(struct damon_ctx *ctx, struct damos *s)
+{
+ list_add_tail(&s->list, &ctx->schemes);
+}
+
+static void damon_del_scheme(struct damos *s)
+{
+ list_del(&s->list);
+}
+
+static void damon_free_scheme(struct damos *s)
+{
+ kfree(s);
+}
+
+void damon_destroy_scheme(struct damos *s)
+{
+ struct damos_filter *f, *next;
+
+ damos_for_each_filter_safe(f, next, s)
+ damos_destroy_filter(f);
+ damon_del_scheme(s);
+ damon_free_scheme(s);
+}
+
+/*
+ * Construct a damon_target struct
+ *
+ * Returns the pointer to the new struct on success, or NULL otherwise
+ */
+struct damon_target *damon_new_target(void)
+{
+ struct damon_target *t;
+
+ t = kmalloc(sizeof(*t), GFP_KERNEL);
+ if (!t)
+ return NULL;
+
+ t->pid = NULL;
+ t->nr_regions = 0;
+ INIT_LIST_HEAD(&t->regions_list);
+ INIT_LIST_HEAD(&t->list);
+
+ return t;
+}
+
+void damon_add_target(struct damon_ctx *ctx, struct damon_target *t)
+{
+ list_add_tail(&t->list, &ctx->adaptive_targets);
+}
+
+bool damon_targets_empty(struct damon_ctx *ctx)
+{
+ return list_empty(&ctx->adaptive_targets);
+}
+
+static void damon_del_target(struct damon_target *t)
+{
+ list_del(&t->list);
+}
+
+void damon_free_target(struct damon_target *t)
+{
+ struct damon_region *r, *next;
+
+ damon_for_each_region_safe(r, next, t)
+ damon_free_region(r);
+ kfree(t);
+}
+
+void damon_destroy_target(struct damon_target *t)
+{
+ damon_del_target(t);
+ damon_free_target(t);
+}
+
+unsigned int damon_nr_regions(struct damon_target *t)
+{
+ return t->nr_regions;
+}
+
+struct damon_ctx *damon_new_ctx(void)
+{
+ struct damon_ctx *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ ctx->attrs.sample_interval = 5 * 1000;
+ ctx->attrs.aggr_interval = 100 * 1000;
+ ctx->attrs.ops_update_interval = 60 * 1000 * 1000;
+
+ ktime_get_coarse_ts64(&ctx->last_aggregation);
+ ctx->last_ops_update = ctx->last_aggregation;
+
+ mutex_init(&ctx->kdamond_lock);
+
+ ctx->attrs.min_nr_regions = 10;
+ ctx->attrs.max_nr_regions = 1000;
+
+ INIT_LIST_HEAD(&ctx->adaptive_targets);
+ INIT_LIST_HEAD(&ctx->schemes);
+
+ return ctx;
+}
+
+static void damon_destroy_targets(struct damon_ctx *ctx)
+{
+ struct damon_target *t, *next_t;
+
+ if (ctx->ops.cleanup) {
+ ctx->ops.cleanup(ctx);
+ return;
+ }
+
+ damon_for_each_target_safe(t, next_t, ctx)
+ damon_destroy_target(t);
+}
+
+void damon_destroy_ctx(struct damon_ctx *ctx)
+{
+ struct damos *s, *next_s;
+
+ damon_destroy_targets(ctx);
+
+ damon_for_each_scheme_safe(s, next_s, ctx)
+ damon_destroy_scheme(s);
+
+ kfree(ctx);
+}
+
+static unsigned int damon_age_for_new_attrs(unsigned int age,
+ struct damon_attrs *old_attrs, struct damon_attrs *new_attrs)
+{
+ return age * old_attrs->aggr_interval / new_attrs->aggr_interval;
+}
+
+/* convert access ratio in bp (per 10,000) to nr_accesses */
+static unsigned int damon_accesses_bp_to_nr_accesses(
+ unsigned int accesses_bp, struct damon_attrs *attrs)
+{
+ unsigned int max_nr_accesses =
+ attrs->aggr_interval / attrs->sample_interval;
+
+ return accesses_bp * max_nr_accesses / 10000;
+}
+
+/* convert nr_accesses to access ratio in bp (per 10,000) */
+static unsigned int damon_nr_accesses_to_accesses_bp(
+ unsigned int nr_accesses, struct damon_attrs *attrs)
+{
+ unsigned int max_nr_accesses =
+ attrs->aggr_interval / attrs->sample_interval;
+
+ return nr_accesses * 10000 / max_nr_accesses;
+}
+
+static unsigned int damon_nr_accesses_for_new_attrs(unsigned int nr_accesses,
+ struct damon_attrs *old_attrs, struct damon_attrs *new_attrs)
+{
+ return damon_accesses_bp_to_nr_accesses(
+ damon_nr_accesses_to_accesses_bp(
+ nr_accesses, old_attrs),
+ new_attrs);
+}
+
+static void damon_update_monitoring_result(struct damon_region *r,
+ struct damon_attrs *old_attrs, struct damon_attrs *new_attrs)
+{
+ r->nr_accesses = damon_nr_accesses_for_new_attrs(r->nr_accesses,
+ old_attrs, new_attrs);
+ r->age = damon_age_for_new_attrs(r->age, old_attrs, new_attrs);
+}
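+
+/*
+ * Worked example (matching damon_test_update_monitoring_result() in
+ * core-test.h): with old attrs {sample_interval=10, aggr_interval=1000},
+ * nr_accesses=15 means 15 of the 100 samples saw an access, i.e. 1500 bp.
+ * Converting to {sample_interval=100, aggr_interval=10000} (again 100
+ * samples per aggregation) keeps nr_accesses at 15, while age=20 becomes
+ * 20 * 1000 / 10000 = 2 aggregation intervals.
+ */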
+
+/*
+ * region->nr_accesses is the number of sampling intervals in the last
+ * aggregation interval in which access to the region was found, and
+ * region->age is the number of aggregation intervals for which its access
+ * pattern has been maintained. For this reason, the real meaning of the two
+ * fields depends on the current sampling and aggregation intervals. This
+ * function updates ->nr_accesses and ->age of the given damon_ctx's regions
+ * for the new damon_attrs.
+ */
+static void damon_update_monitoring_results(struct damon_ctx *ctx,
+ struct damon_attrs *new_attrs)
+{
+ struct damon_attrs *old_attrs = &ctx->attrs;
+ struct damon_target *t;
+ struct damon_region *r;
+
+ /* if any interval is zero, simply skip the conversion */
+ if (!old_attrs->sample_interval || !old_attrs->aggr_interval ||
+ !new_attrs->sample_interval ||
+ !new_attrs->aggr_interval)
+ return;
+
+ damon_for_each_target(t, ctx)
+ damon_for_each_region(r, t)
+ damon_update_monitoring_result(
+ r, old_attrs, new_attrs);
+}
+
+/**
+ * damon_set_attrs() - Set attributes for the monitoring.
+ * @ctx: monitoring context
+ * @attrs: monitoring attributes
+ *
+ * This function should not be called while the kdamond is running.
+ * Every time interval is in micro-seconds.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
+{
+ if (attrs->min_nr_regions < 3)
+ return -EINVAL;
+ if (attrs->min_nr_regions > attrs->max_nr_regions)
+ return -EINVAL;
+ if (attrs->sample_interval > attrs->aggr_interval)
+ return -EINVAL;
+
+ damon_update_monitoring_results(ctx, attrs);
+ ctx->attrs = *attrs;
+ return 0;
+}
+
+/**
+ * damon_set_schemes() - Set data access monitoring based operation schemes.
+ * @ctx: monitoring context
+ * @schemes: array of the schemes
+ * @nr_schemes: number of entries in @schemes
+ *
+ * This function should not be called while the kdamond of the context is
+ * running.
+ */
+void damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes,
+ ssize_t nr_schemes)
+{
+ struct damos *s, *next;
+ ssize_t i;
+
+ damon_for_each_scheme_safe(s, next, ctx)
+ damon_destroy_scheme(s);
+ for (i = 0; i < nr_schemes; i++)
+ damon_add_scheme(ctx, schemes[i]);
+}
+
+/**
+ * damon_nr_running_ctxs() - Return number of currently running contexts.
+ */
+int damon_nr_running_ctxs(void)
+{
+ int nr_ctxs;
+
+ mutex_lock(&damon_lock);
+ nr_ctxs = nr_running_ctxs;
+ mutex_unlock(&damon_lock);
+
+ return nr_ctxs;
+}
+
+/* Returns the size upper limit for each monitoring region */
+static unsigned long damon_region_sz_limit(struct damon_ctx *ctx)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ unsigned long sz = 0;
+
+ damon_for_each_target(t, ctx) {
+ damon_for_each_region(r, t)
+ sz += damon_sz_region(r);
+ }
+
+ if (ctx->attrs.min_nr_regions)
+ sz /= ctx->attrs.min_nr_regions;
+ if (sz < DAMON_MIN_REGION)
+ sz = DAMON_MIN_REGION;
+
+ return sz;
+}
+
+static int kdamond_fn(void *data);
+
+/*
+ * __damon_start() - Starts monitoring with given context.
+ * @ctx: monitoring context
+ *
+ * This function should be called while damon_lock is held.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+static int __damon_start(struct damon_ctx *ctx)
+{
+ int err = -EBUSY;
+
+ mutex_lock(&ctx->kdamond_lock);
+ if (!ctx->kdamond) {
+ err = 0;
+ ctx->kdamond = kthread_run(kdamond_fn, ctx, "kdamond.%d",
+ nr_running_ctxs);
+ if (IS_ERR(ctx->kdamond)) {
+ err = PTR_ERR(ctx->kdamond);
+ ctx->kdamond = NULL;
+ }
+ }
+ mutex_unlock(&ctx->kdamond_lock);
+
+ return err;
+}
+
+/**
+ * damon_start() - Starts the monitorings for a given group of contexts.
+ * @ctxs: an array of the pointers for contexts to start monitoring
+ * @nr_ctxs: size of @ctxs
+ * @exclusive: exclusiveness of this contexts group
+ *
+ * This function starts a group of monitoring threads for a group of monitoring
+ * contexts. One thread per context is created and run in parallel. The
+ * caller should handle synchronization between the threads by itself. If
+ * @exclusive is true and a group of threads created by another
+ * 'damon_start()' call is currently running, this function does nothing but
+ * returns -EBUSY.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive)
+{
+ int i;
+ int err = 0;
+
+ mutex_lock(&damon_lock);
+ if ((exclusive && nr_running_ctxs) ||
+ (!exclusive && running_exclusive_ctxs)) {
+ mutex_unlock(&damon_lock);
+ return -EBUSY;
+ }
+
+ for (i = 0; i < nr_ctxs; i++) {
+ err = __damon_start(ctxs[i]);
+ if (err)
+ break;
+ nr_running_ctxs++;
+ }
+ if (exclusive && nr_running_ctxs)
+ running_exclusive_ctxs = true;
+ mutex_unlock(&damon_lock);
+
+ return err;
+}
+
+/*
+ * __damon_stop() - Stops monitoring of a given context.
+ * @ctx: monitoring context
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+static int __damon_stop(struct damon_ctx *ctx)
+{
+ struct task_struct *tsk;
+
+ mutex_lock(&ctx->kdamond_lock);
+ tsk = ctx->kdamond;
+ if (tsk) {
+ get_task_struct(tsk);
+ mutex_unlock(&ctx->kdamond_lock);
+ kthread_stop(tsk);
+ put_task_struct(tsk);
+ return 0;
+ }
+ mutex_unlock(&ctx->kdamond_lock);
+
+ return -EPERM;
+}
+
+/**
+ * damon_stop() - Stops the monitorings for a given group of contexts.
+ * @ctxs: an array of the pointers for contexts to stop monitoring
+ * @nr_ctxs: size of @ctxs
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int damon_stop(struct damon_ctx **ctxs, int nr_ctxs)
+{
+ int i, err = 0;
+
+ for (i = 0; i < nr_ctxs; i++) {
+ /* nr_running_ctxs is decremented in kdamond_fn */
+ err = __damon_stop(ctxs[i]);
+ if (err)
+ break;
+ }
+ return err;
+}
+
+/*
+ * damon_check_reset_time_interval() - Check if a time interval is elapsed.
+ * @baseline: the time to check whether the interval has elapsed since
+ * @interval: the time interval (microseconds)
+ *
+ * See whether the given time interval has passed since the given baseline
+ * time. If so, it also updates the baseline to the current time for the
+ * next check.
+ *
+ * Return: true if the time interval has passed, or false otherwise.
+ */
+static bool damon_check_reset_time_interval(struct timespec64 *baseline,
+ unsigned long interval)
+{
+ struct timespec64 now;
+
+ ktime_get_coarse_ts64(&now);
+ if ((timespec64_to_ns(&now) - timespec64_to_ns(baseline)) <
+ interval * 1000)
+ return false;
+ *baseline = now;
+ return true;
+}
+
+/*
+ * Check whether it is time to flush the aggregated information
+ */
+static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx)
+{
+ return damon_check_reset_time_interval(&ctx->last_aggregation,
+ ctx->attrs.aggr_interval);
+}
+
+/*
+ * Reset the aggregated monitoring results ('nr_accesses' of each region).
+ */
+static void kdamond_reset_aggregated(struct damon_ctx *c)
+{
+ struct damon_target *t;
+ unsigned int ti = 0; /* target's index */
+
+ damon_for_each_target(t, c) {
+ struct damon_region *r;
+
+ damon_for_each_region(r, t) {
+ trace_damon_aggregated(t, ti, r, damon_nr_regions(t));
+ r->last_nr_accesses = r->nr_accesses;
+ r->nr_accesses = 0;
+ }
+ ti++;
+ }
+}
+
+static void damon_split_region_at(struct damon_target *t,
+ struct damon_region *r, unsigned long sz_r);
+
+static bool __damos_valid_target(struct damon_region *r, struct damos *s)
+{
+ unsigned long sz;
+
+ sz = damon_sz_region(r);
+ return s->pattern.min_sz_region <= sz &&
+ sz <= s->pattern.max_sz_region &&
+ s->pattern.min_nr_accesses <= r->nr_accesses &&
+ r->nr_accesses <= s->pattern.max_nr_accesses &&
+ s->pattern.min_age_region <= r->age &&
+ r->age <= s->pattern.max_age_region;
+}
+
+static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
+ struct damon_region *r, struct damos *s)
+{
+ bool ret = __damos_valid_target(r, s);
+
+ if (!ret || !s->quota.esz || !c->ops.get_scheme_score)
+ return ret;
+
+ return c->ops.get_scheme_score(c, t, r, s) >= s->quota.min_score;
+}
+
+/*
+ * damos_skip_charged_region() - Check if the given region or starting part of
+ * it is already charged for the DAMOS quota.
+ * @t: The target of the region.
+ * @rp: The pointer to the region.
+ * @s: The scheme to be applied.
+ *
+ * If the quota of a scheme was exceeded in a quota charge window, the scheme's
+ * action would have been applied to only a part of the regions that fulfill
+ * the target access pattern. To avoid applying the scheme action again to
+ * regions that were already handled, DAMON skips applying the action to the
+ * regions that were charged in the previous charge window.
+ *
+ * This function checks if a given region should be skipped for that reason.
+ * If only the starting part of the region was previously charged, this
+ * function splits the region into two so that the second one covers the area
+ * that was not charged in the previous charge window, saves the second region
+ * in *rp, and returns false, so that the caller can apply the DAMON action to
+ * the second one.
+ *
+ * Return: true if the region should be entirely skipped, false otherwise.
+ */
+static bool damos_skip_charged_region(struct damon_target *t,
+ struct damon_region **rp, struct damos *s)
+{
+ struct damon_region *r = *rp;
+ struct damos_quota *quota = &s->quota;
+ unsigned long sz_to_skip;
+
+ /* Skip previously charged regions */
+ if (quota->charge_target_from) {
+ if (t != quota->charge_target_from)
+ return true;
+ if (r == damon_last_region(t)) {
+ quota->charge_target_from = NULL;
+ quota->charge_addr_from = 0;
+ return true;
+ }
+ if (quota->charge_addr_from &&
+ r->ar.end <= quota->charge_addr_from)
+ return true;
+
+ if (quota->charge_addr_from && r->ar.start <
+ quota->charge_addr_from) {
+ sz_to_skip = ALIGN_DOWN(quota->charge_addr_from -
+ r->ar.start, DAMON_MIN_REGION);
+ if (!sz_to_skip) {
+ if (damon_sz_region(r) <= DAMON_MIN_REGION)
+ return true;
+ sz_to_skip = DAMON_MIN_REGION;
+ }
+ damon_split_region_at(t, r, sz_to_skip);
+ r = damon_next_region(r);
+ *rp = r;
+ }
+ quota->charge_target_from = NULL;
+ quota->charge_addr_from = 0;
+ }
+ return false;
+}
+
+static void damos_update_stat(struct damos *s,
+ unsigned long sz_tried, unsigned long sz_applied)
+{
+ s->stat.nr_tried++;
+ s->stat.sz_tried += sz_tried;
+ if (sz_applied)
+ s->stat.nr_applied++;
+ s->stat.sz_applied += sz_applied;
+}
+
+static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
+ struct damon_region *r, struct damos *s)
+{
+ struct damos_quota *quota = &s->quota;
+ unsigned long sz = damon_sz_region(r);
+ struct timespec64 begin, end;
+ unsigned long sz_applied = 0;
+ int err = 0;
+
+ if (c->ops.apply_scheme) {
+ if (quota->esz && quota->charged_sz + sz > quota->esz) {
+ sz = ALIGN_DOWN(quota->esz - quota->charged_sz,
+ DAMON_MIN_REGION);
+ if (!sz)
+ goto update_stat;
+ damon_split_region_at(t, r, sz);
+ }
+ ktime_get_coarse_ts64(&begin);
+ if (c->callback.before_damos_apply)
+ err = c->callback.before_damos_apply(c, t, r, s);
+ if (!err)
+ sz_applied = c->ops.apply_scheme(c, t, r, s);
+ ktime_get_coarse_ts64(&end);
+ quota->total_charged_ns += timespec64_to_ns(&end) -
+ timespec64_to_ns(&begin);
+ quota->charged_sz += sz;
+ if (quota->esz && quota->charged_sz >= quota->esz) {
+ quota->charge_target_from = t;
+ quota->charge_addr_from = r->ar.end + 1;
+ }
+ }
+ if (s->action != DAMOS_STAT)
+ r->age = 0;
+
+update_stat:
+ damos_update_stat(s, sz, sz_applied);
+}
+
+static void damon_do_apply_schemes(struct damon_ctx *c,
+ struct damon_target *t,
+ struct damon_region *r)
+{
+ struct damos *s;
+
+ damon_for_each_scheme(s, c) {
+ struct damos_quota *quota = &s->quota;
+
+ if (!s->wmarks.activated)
+ continue;
+
+ /* Check the quota */
+ if (quota->esz && quota->charged_sz >= quota->esz)
+ continue;
+
+ if (damos_skip_charged_region(t, &r, s))
+ continue;
+
+ if (!damos_valid_target(c, t, r, s))
+ continue;
+
+ damos_apply_scheme(c, t, r, s);
+ }
+}
+
+/* Shouldn't be called if quota->ms and quota->sz are zero */
+static void damos_set_effective_quota(struct damos_quota *quota)
+{
+ unsigned long throughput;
+ unsigned long esz;
+
+ if (!quota->ms) {
+ quota->esz = quota->sz;
+ return;
+ }
+
+ if (quota->total_charged_ns)
+ throughput = quota->total_charged_sz * 1000000 /
+ quota->total_charged_ns;
+ else
+ throughput = PAGE_SIZE * 1024;
+ esz = throughput * quota->ms;
+
+ if (quota->sz && quota->sz < esz)
+ esz = quota->sz;
+ quota->esz = esz;
+}
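+
+/*
+ * The throughput above is in bytes per millisecond: for example, 10 MiB
+ * charged over 5,000,000 ns gives ~2 MiB/ms, so a time quota of ms=100
+ * translates into an effective size quota of ~200 MiB (further capped by
+ * ->sz when that is smaller).
+ */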
+
+static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
+{
+ struct damos_quota *quota = &s->quota;
+ struct damon_target *t;
+ struct damon_region *r;
+ unsigned long cumulated_sz;
+ unsigned int score, max_score = 0;
+
+ if (!quota->ms && !quota->sz)
+ return;
+
+ /* New charge window starts */
+ if (time_after_eq(jiffies, quota->charged_from +
+ msecs_to_jiffies(quota->reset_interval))) {
+ if (quota->esz && quota->charged_sz >= quota->esz)
+ s->stat.qt_exceeds++;
+ quota->total_charged_sz += quota->charged_sz;
+ quota->charged_from = jiffies;
+ quota->charged_sz = 0;
+ damos_set_effective_quota(quota);
+ }
+
+ if (!c->ops.get_scheme_score)
+ return;
+
+ /* Fill up the score histogram */
+ memset(quota->histogram, 0, sizeof(quota->histogram));
+ damon_for_each_target(t, c) {
+ damon_for_each_region(r, t) {
+ if (!__damos_valid_target(r, s))
+ continue;
+ score = c->ops.get_scheme_score(c, t, r, s);
+ quota->histogram[score] += damon_sz_region(r);
+ if (score > max_score)
+ max_score = score;
+ }
+ }
+
+ /* Set the min score limit */
+ for (cumulated_sz = 0, score = max_score; ; score--) {
+ cumulated_sz += quota->histogram[score];
+ if (cumulated_sz >= quota->esz || !score)
+ break;
+ }
+ quota->min_score = score;
+}
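+
+/*
+ * In effect, regions are consumed from the highest score downwards:
+ * ->min_score ends up being the score at which the cumulative size of all
+ * regions scoring at or above it first reaches the effective quota, and
+ * damos_valid_target() then filters out lower-scoring regions.
+ */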
+
+static void kdamond_apply_schemes(struct damon_ctx *c)
+{
+ struct damon_target *t;
+ struct damon_region *r, *next_r;
+ struct damos *s;
+
+ damon_for_each_scheme(s, c) {
+ if (!s->wmarks.activated)
+ continue;
+
+ damos_adjust_quota(c, s);
+ }
+
+ damon_for_each_target(t, c) {
+ damon_for_each_region_safe(r, next_r, t)
+ damon_do_apply_schemes(c, t, r);
+ }
+}
+
+/*
+ * Merge two adjacent regions into one region
+ */
+static void damon_merge_two_regions(struct damon_target *t,
+ struct damon_region *l, struct damon_region *r)
+{
+ unsigned long sz_l = damon_sz_region(l), sz_r = damon_sz_region(r);
+
+ l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) /
+ (sz_l + sz_r);
+ l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r);
+ l->ar.end = r->ar.end;
+ damon_destroy_region(r, t);
+}
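+
+/*
+ * The merged ->nr_accesses and ->age are size-weighted averages. For example
+ * (see damon_test_merge_two() in core-test.h), merging [0, 100) with
+ * nr_accesses 10 and [100, 300) with nr_accesses 20 yields
+ * (10 * 100 + 20 * 200) / 300 = 16.
+ */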
+
+/*
+ * Merge adjacent regions having similar access frequencies
+ *
+ * t target affected by this merge operation
+ * thres '->nr_accesses' diff threshold for the merge
+ * sz_limit size upper limit of each region
+ */
+static void damon_merge_regions_of(struct damon_target *t, unsigned int thres,
+ unsigned long sz_limit)
+{
+ struct damon_region *r, *prev = NULL, *next;
+
+ damon_for_each_region_safe(r, next, t) {
+ if (abs(r->nr_accesses - r->last_nr_accesses) > thres)
+ r->age = 0;
+ else
+ r->age++;
+
+ if (prev && prev->ar.end == r->ar.start &&
+ abs(prev->nr_accesses - r->nr_accesses) <= thres &&
+ damon_sz_region(prev) + damon_sz_region(r) <= sz_limit)
+ damon_merge_two_regions(t, prev, r);
+ else
+ prev = r;
+ }
+}
+
+/*
+ * Merge adjacent regions having similar access frequencies
+ *
+ * threshold '->nr_accesses' diff threshold for the merge
+ * sz_limit size upper limit of each region
+ *
+ * This function merges monitoring target regions which are adjacent and whose
+ * access frequencies are similar. This is for minimizing the monitoring
+ * overhead under the dynamically changeable access pattern. If a merge was
+ * unnecessarily made, later 'kdamond_split_regions()' will revert it.
+ */
+static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold,
+ unsigned long sz_limit)
+{
+ struct damon_target *t;
+
+ damon_for_each_target(t, c)
+ damon_merge_regions_of(t, threshold, sz_limit);
+}
+
+/*
+ * Split a region in two
+ *
+ * r the region to be split
+ * sz_r size of the first sub-region that will be made
+ */
+static void damon_split_region_at(struct damon_target *t,
+ struct damon_region *r, unsigned long sz_r)
+{
+ struct damon_region *new;
+
+ new = damon_new_region(r->ar.start + sz_r, r->ar.end);
+ if (!new)
+ return;
+
+ r->ar.end = new->ar.start;
+
+ new->age = r->age;
+ new->last_nr_accesses = r->last_nr_accesses;
+
+ damon_insert_region(new, r, damon_next_region(r), t);
+}
+
+/* Split every region in the given target into 'nr_subs' regions */
+static void damon_split_regions_of(struct damon_target *t, int nr_subs)
+{
+ struct damon_region *r, *next;
+ unsigned long sz_region, sz_sub = 0;
+ int i;
+
+ damon_for_each_region_safe(r, next, t) {
+ sz_region = damon_sz_region(r);
+
+ for (i = 0; i < nr_subs - 1 &&
+ sz_region > 2 * DAMON_MIN_REGION; i++) {
+ /*
+ * Randomly select the size of the left sub-region to be
+ * at least 10% and at most 90% of the original region
+ */
+ sz_sub = ALIGN_DOWN(damon_rand(1, 10) *
+ sz_region / 10, DAMON_MIN_REGION);
+ /* Do not allow blank region */
+ if (sz_sub == 0 || sz_sub >= sz_region)
+ continue;
+
+ damon_split_region_at(t, r, sz_sub);
+ sz_region = sz_sub;
+ }
+ }
+}
+
+/*
+ * Split every target region into randomly-sized small regions
+ *
+ * This function splits every target region into random-sized small regions if
+ * the current total number of regions is equal to or smaller than half of the
+ * user-specified maximum number of regions. This is for maximizing the
+ * monitoring accuracy under the dynamically changeable access patterns. If a
+ * split was unnecessarily made, later 'kdamond_merge_regions()' will revert
+ * it.
+ */
+static void kdamond_split_regions(struct damon_ctx *ctx)
+{
+ struct damon_target *t;
+ unsigned int nr_regions = 0;
+ static unsigned int last_nr_regions;
+ int nr_subregions = 2;
+
+ damon_for_each_target(t, ctx)
+ nr_regions += damon_nr_regions(t);
+
+ if (nr_regions > ctx->attrs.max_nr_regions / 2)
+ return;
+
+ /* Maybe the middle of the region has different access frequency */
+ if (last_nr_regions == nr_regions &&
+ nr_regions < ctx->attrs.max_nr_regions / 3)
+ nr_subregions = 3;
+
+ damon_for_each_target(t, ctx)
+ damon_split_regions_of(t, nr_subregions);
+
+ last_nr_regions = nr_regions;
+}
+
+/*
+ * Check whether it is time to check and apply the operations-related data
+ * structures.
+ *
+ * Returns true if it is.
+ */
+static bool kdamond_need_update_operations(struct damon_ctx *ctx)
+{
+ return damon_check_reset_time_interval(&ctx->last_ops_update,
+ ctx->attrs.ops_update_interval);
+}
+
+/*
+ * Check whether current monitoring should be stopped
+ *
+ * The monitoring is stopped either when the user has requested it to stop or
+ * when all monitoring targets are invalid.
+ *
+ * Returns true if need to stop current monitoring.
+ */
+static bool kdamond_need_stop(struct damon_ctx *ctx)
+{
+ struct damon_target *t;
+
+ if (kthread_should_stop())
+ return true;
+
+ if (!ctx->ops.target_valid)
+ return false;
+
+ damon_for_each_target(t, ctx) {
+ if (ctx->ops.target_valid(t))
+ return false;
+ }
+
+ return true;
+}
+
+static unsigned long damos_wmark_metric_value(enum damos_wmark_metric metric)
+{
+ struct sysinfo i;
+
+ switch (metric) {
+ case DAMOS_WMARK_FREE_MEM_RATE:
+ si_meminfo(&i);
+ return i.freeram * 1000 / i.totalram;
+ default:
+ break;
+ }
+ return -EINVAL;
+}
+
+/*
+ * Returns zero if the scheme is active. Else, returns time to wait for next
+ * watermark check in micro-seconds.
+ */
+static unsigned long damos_wmark_wait_us(struct damos *scheme)
+{
+ unsigned long metric;
+
+ if (scheme->wmarks.metric == DAMOS_WMARK_NONE)
+ return 0;
+
+ metric = damos_wmark_metric_value(scheme->wmarks.metric);
+ /* higher than high watermark or lower than low watermark */
+ if (metric > scheme->wmarks.high || scheme->wmarks.low > metric) {
+ if (scheme->wmarks.activated)
+ pr_debug("deactivate a scheme (%d) for %s wmark\n",
+ scheme->action,
+ metric > scheme->wmarks.high ?
+ "high" : "low");
+ scheme->wmarks.activated = false;
+ return scheme->wmarks.interval;
+ }
+
+ /* inactive and higher than middle watermark */
+ if ((scheme->wmarks.high >= metric && metric >= scheme->wmarks.mid) &&
+ !scheme->wmarks.activated)
+ return scheme->wmarks.interval;
+
+ if (!scheme->wmarks.activated)
+ pr_debug("activate a scheme (%d)\n", scheme->action);
+ scheme->wmarks.activated = true;
+ return 0;
+}
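+
+/*
+ * Note that DAMOS_WMARK_FREE_MEM_RATE is in per-thousand of total memory
+ * (freeram * 1000 / totalram). For example, with high=500, mid=300 and
+ * low=200, a scheme is activated once free memory drops below 30% and stays
+ * active while free memory remains between 20% and 50%.
+ */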
+
+static void kdamond_usleep(unsigned long usecs)
+{
+ /* See Documentation/timers/timers-howto.rst for the thresholds */
+ if (usecs > 20 * USEC_PER_MSEC)
+ schedule_timeout_idle(usecs_to_jiffies(usecs));
+ else
+ usleep_idle_range(usecs, usecs + 1);
+}
+
+/* Returns 0 once a scheme gets activated, or negative error code if kdamond should stop */
+static int kdamond_wait_activation(struct damon_ctx *ctx)
+{
+ struct damos *s;
+ unsigned long wait_time;
+ unsigned long min_wait_time = 0;
+ bool init_wait_time = false;
+
+ while (!kdamond_need_stop(ctx)) {
+ damon_for_each_scheme(s, ctx) {
+ wait_time = damos_wmark_wait_us(s);
+ if (!init_wait_time || wait_time < min_wait_time) {
+ init_wait_time = true;
+ min_wait_time = wait_time;
+ }
+ }
+ if (!min_wait_time)
+ return 0;
+
+ kdamond_usleep(min_wait_time);
+
+ if (ctx->callback.after_wmarks_check &&
+ ctx->callback.after_wmarks_check(ctx))
+ break;
+ }
+ return -EBUSY;
+}
+
+/*
+ * The monitoring daemon that runs as a kernel thread
+ */
+static int kdamond_fn(void *data)
+{
+ struct damon_ctx *ctx = data;
+ struct damon_target *t;
+ struct damon_region *r, *next;
+ unsigned int max_nr_accesses = 0;
+ unsigned long sz_limit = 0;
+
+ pr_debug("kdamond (%d) starts\n", current->pid);
+
+ if (ctx->ops.init)
+ ctx->ops.init(ctx);
+ if (ctx->callback.before_start && ctx->callback.before_start(ctx))
+ goto done;
+
+ sz_limit = damon_region_sz_limit(ctx);
+
+ while (!kdamond_need_stop(ctx)) {
+ if (kdamond_wait_activation(ctx))
+ break;
+
+ if (ctx->ops.prepare_access_checks)
+ ctx->ops.prepare_access_checks(ctx);
+ if (ctx->callback.after_sampling &&
+ ctx->callback.after_sampling(ctx))
+ break;
+
+ kdamond_usleep(ctx->attrs.sample_interval);
+
+ if (ctx->ops.check_accesses)
+ max_nr_accesses = ctx->ops.check_accesses(ctx);
+
+ if (kdamond_aggregate_interval_passed(ctx)) {
+ kdamond_merge_regions(ctx,
+ max_nr_accesses / 10,
+ sz_limit);
+ if (ctx->callback.after_aggregation &&
+ ctx->callback.after_aggregation(ctx))
+ break;
+ if (!list_empty(&ctx->schemes))
+ kdamond_apply_schemes(ctx);
+ kdamond_reset_aggregated(ctx);
+ kdamond_split_regions(ctx);
+ if (ctx->ops.reset_aggregated)
+ ctx->ops.reset_aggregated(ctx);
+ }
+
+ if (kdamond_need_update_operations(ctx)) {
+ if (ctx->ops.update)
+ ctx->ops.update(ctx);
+ sz_limit = damon_region_sz_limit(ctx);
+ }
+ }
+done:
+ damon_for_each_target(t, ctx) {
+ damon_for_each_region_safe(r, next, t)
+ damon_destroy_region(r, t);
+ }
+
+ if (ctx->callback.before_terminate)
+ ctx->callback.before_terminate(ctx);
+ if (ctx->ops.cleanup)
+ ctx->ops.cleanup(ctx);
+
+ pr_debug("kdamond (%d) finishes\n", current->pid);
+ mutex_lock(&ctx->kdamond_lock);
+ ctx->kdamond = NULL;
+ mutex_unlock(&ctx->kdamond_lock);
+
+ mutex_lock(&damon_lock);
+ nr_running_ctxs--;
+ if (!nr_running_ctxs && running_exclusive_ctxs)
+ running_exclusive_ctxs = false;
+ mutex_unlock(&damon_lock);
+
+ return 0;
+}
+
+/*
+ * struct damon_system_ram_region - System RAM resource address region of
+ * [@start, @end).
+ * @start: Start address of the region (inclusive).
+ * @end: End address of the region (exclusive).
+ */
+struct damon_system_ram_region {
+ unsigned long start;
+ unsigned long end;
+};
+
+static int walk_system_ram(struct resource *res, void *arg)
+{
+ struct damon_system_ram_region *a = arg;
+
+ if (a->end - a->start < resource_size(res)) {
+ a->start = res->start;
+ a->end = res->end;
+ }
+ return 0;
+}
+
+/*
+ * Find biggest 'System RAM' resource and store its start and end address in
+ * @start and @end, respectively. If no System RAM is found, returns false.
+ */
+static bool damon_find_biggest_system_ram(unsigned long *start,
+					unsigned long *end)
+{
+ struct damon_system_ram_region arg = {};
+
+ walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram);
+ if (arg.end <= arg.start)
+ return false;
+
+ *start = arg.start;
+ *end = arg.end;
+ return true;
+}
+
+/**
+ * damon_set_region_biggest_system_ram_default() - Set the region of the given
+ * monitoring target as requested, or biggest 'System RAM'.
+ * @t: The monitoring target to set the region.
+ * @start: The pointer to the start address of the region.
+ * @end: The pointer to the end address of the region.
+ *
+ * This function sets the region of @t as requested by @start and @end. If the
+ * values of @start and @end are zero, however, this function finds the biggest
+ * 'System RAM' resource and sets the region to cover the resource. In the
+ * latter case, this function saves the start and end addresses of the resource
+ * in @start and @end, respectively.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int damon_set_region_biggest_system_ram_default(struct damon_target *t,
+ unsigned long *start, unsigned long *end)
+{
+ struct damon_addr_range addr_range;
+
+ if (*start > *end)
+ return -EINVAL;
+
+ if (!*start && !*end &&
+ !damon_find_biggest_system_ram(start, end))
+ return -EINVAL;
+
+ addr_range.start = *start;
+ addr_range.end = *end;
+ return damon_set_regions(t, &addr_range, 1);
+}
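As a usage sketch (mirroring how the DAMON_LRU_SORT and DAMON_RECLAIM modules later in this diff call this helper), a caller can keep the start and end addresses in zero-initialized variables so that the first call resolves them to the biggest 'System RAM' resource and reports the resolved bounds back. The identifiers below are hypothetical and not part of the patch:

	/* hypothetical caller; zero means "auto-detect the biggest System RAM" */
	static unsigned long example_region_start;
	static unsigned long example_region_end;

	static int example_apply_region(struct damon_target *t)
	{
		return damon_set_region_biggest_system_ram_default(t,
				&example_region_start, &example_region_end);
	}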
+
+static int __init damon_init(void)
+{
+ damon_region_cache = KMEM_CACHE(damon_region, 0);
+ if (unlikely(!damon_region_cache)) {
+		pr_err("failed to create damon_region_cache\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+subsys_initcall(damon_init);
+
+#include "core-test.h"
diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h
new file mode 100644
index 000000000000..0bb0d532b159
--- /dev/null
+++ b/mm/damon/dbgfs-test.h
@@ -0,0 +1,163 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * DAMON Debugfs Interface Unit Tests
+ *
+ * Author: SeongJae Park <sjpark@amazon.de>
+ */
+
+#ifdef CONFIG_DAMON_DBGFS_KUNIT_TEST
+
+#ifndef _DAMON_DBGFS_TEST_H
+#define _DAMON_DBGFS_TEST_H
+
+#include <kunit/test.h>
+
+static void damon_dbgfs_test_str_to_ints(struct kunit *test)
+{
+ char *question;
+ int *answers;
+ int expected[] = {12, 35, 46};
+ ssize_t nr_integers = 0, i;
+
+ question = "123";
+ answers = str_to_ints(question, strlen(question), &nr_integers);
+ KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers);
+ KUNIT_EXPECT_EQ(test, 123, answers[0]);
+ kfree(answers);
+
+ question = "123abc";
+ answers = str_to_ints(question, strlen(question), &nr_integers);
+ KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers);
+ KUNIT_EXPECT_EQ(test, 123, answers[0]);
+ kfree(answers);
+
+ question = "a123";
+ answers = str_to_ints(question, strlen(question), &nr_integers);
+ KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers);
+ kfree(answers);
+
+ question = "12 35";
+ answers = str_to_ints(question, strlen(question), &nr_integers);
+ KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers);
+ for (i = 0; i < nr_integers; i++)
+ KUNIT_EXPECT_EQ(test, expected[i], answers[i]);
+ kfree(answers);
+
+ question = "12 35 46";
+ answers = str_to_ints(question, strlen(question), &nr_integers);
+ KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers);
+ for (i = 0; i < nr_integers; i++)
+ KUNIT_EXPECT_EQ(test, expected[i], answers[i]);
+ kfree(answers);
+
+ question = "12 35 abc 46";
+ answers = str_to_ints(question, strlen(question), &nr_integers);
+ KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers);
+ for (i = 0; i < 2; i++)
+ KUNIT_EXPECT_EQ(test, expected[i], answers[i]);
+ kfree(answers);
+
+ question = "";
+ answers = str_to_ints(question, strlen(question), &nr_integers);
+ KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers);
+ kfree(answers);
+
+ question = "\n";
+ answers = str_to_ints(question, strlen(question), &nr_integers);
+ KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers);
+ kfree(answers);
+}
+
+static void damon_dbgfs_test_set_targets(struct kunit *test)
+{
+ struct damon_ctx *ctx = dbgfs_new_ctx();
+ char buf[64];
+
+	/* Make DAMON consider the targets as having no pid */
+ damon_select_ops(ctx, DAMON_OPS_PADDR);
+
+ dbgfs_set_targets(ctx, 0, NULL);
+ sprint_target_ids(ctx, buf, 64);
+ KUNIT_EXPECT_STREQ(test, (char *)buf, "\n");
+
+ dbgfs_set_targets(ctx, 1, NULL);
+ sprint_target_ids(ctx, buf, 64);
+ KUNIT_EXPECT_STREQ(test, (char *)buf, "42\n");
+
+ dbgfs_set_targets(ctx, 0, NULL);
+ sprint_target_ids(ctx, buf, 64);
+ KUNIT_EXPECT_STREQ(test, (char *)buf, "\n");
+
+ dbgfs_destroy_ctx(ctx);
+}
+
+static void damon_dbgfs_test_set_init_regions(struct kunit *test)
+{
+ struct damon_ctx *ctx = damon_new_ctx();
+ /* Each line represents one region in ``<target idx> <start> <end>`` */
+ char * const valid_inputs[] = {"1 10 20\n 1 20 30\n1 35 45",
+ "1 10 20\n",
+ "1 10 20\n0 39 59\n0 70 134\n 1 20 25\n",
+ ""};
+ /* Reading the file again will show sorted, clean output */
+ char * const valid_expects[] = {"1 10 20\n1 20 30\n1 35 45\n",
+ "1 10 20\n",
+ "0 39 59\n0 70 134\n1 10 20\n1 20 25\n",
+ ""};
+	char * const invalid_inputs[] = {"3 10 20\n", /* target does not exist */
+ "1 10 20\n 1 14 26\n", /* regions overlap */
+ "0 10 20\n1 30 40\n 0 5 8"}; /* not sorted by address */
+ char *input, *expect;
+ int i, rc;
+ char buf[256];
+
+ damon_select_ops(ctx, DAMON_OPS_PADDR);
+
+ dbgfs_set_targets(ctx, 3, NULL);
+
+ /* Put valid inputs and check the results */
+ for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) {
+ input = valid_inputs[i];
+ expect = valid_expects[i];
+
+ rc = set_init_regions(ctx, input, strnlen(input, 256));
+ KUNIT_EXPECT_EQ(test, rc, 0);
+
+ memset(buf, 0, 256);
+ sprint_init_regions(ctx, buf, 256);
+
+ KUNIT_EXPECT_STREQ(test, (char *)buf, expect);
+ }
+ /* Put invalid inputs and check the return error code */
+ for (i = 0; i < ARRAY_SIZE(invalid_inputs); i++) {
+ input = invalid_inputs[i];
+ pr_info("input: %s\n", input);
+ rc = set_init_regions(ctx, input, strnlen(input, 256));
+ KUNIT_EXPECT_EQ(test, rc, -EINVAL);
+
+ memset(buf, 0, 256);
+ sprint_init_regions(ctx, buf, 256);
+
+ KUNIT_EXPECT_STREQ(test, (char *)buf, "");
+ }
+
+ dbgfs_set_targets(ctx, 0, NULL);
+ damon_destroy_ctx(ctx);
+}
+
+static struct kunit_case damon_test_cases[] = {
+ KUNIT_CASE(damon_dbgfs_test_str_to_ints),
+ KUNIT_CASE(damon_dbgfs_test_set_targets),
+ KUNIT_CASE(damon_dbgfs_test_set_init_regions),
+ {},
+};
+
+static struct kunit_suite damon_test_suite = {
+ .name = "damon-dbgfs",
+ .test_cases = damon_test_cases,
+};
+kunit_test_suite(damon_test_suite);
+
+#endif /* _DAMON_DBGFS_TEST_H */
+
+#endif /* CONFIG_DAMON_DBGFS_KUNIT_TEST */
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
new file mode 100644
index 000000000000..124f0f8c97b7
--- /dev/null
+++ b/mm/damon/dbgfs.c
@@ -0,0 +1,1133 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DAMON Debugfs Interface
+ *
+ * Author: SeongJae Park <sjpark@amazon.de>
+ */
+
+#define pr_fmt(fmt) "damon-dbgfs: " fmt
+
+#include <linux/damon.h>
+#include <linux/debugfs.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/page_idle.h>
+#include <linux/slab.h>
+
+static struct damon_ctx **dbgfs_ctxs;
+static int dbgfs_nr_ctxs;
+static struct dentry **dbgfs_dirs;
+static DEFINE_MUTEX(damon_dbgfs_lock);
+
+static void damon_dbgfs_warn_deprecation(void)
+{
+ pr_warn_once("DAMON debugfs interface is deprecated, "
+ "so users should move to DAMON_SYSFS. If you cannot, "
+ "please report your usecase to damon@lists.linux.dev and "
+ "linux-mm@kvack.org.\n");
+}
+
+/*
+ * Returns the user input string on success, or an ERR_PTR() encoding a
+ * negative error code otherwise.
+ */
+static char *user_input_str(const char __user *buf, size_t count, loff_t *ppos)
+{
+ char *kbuf;
+ ssize_t ret;
+
+ /* We do not accept continuous write */
+ if (*ppos)
+ return ERR_PTR(-EINVAL);
+
+ kbuf = kmalloc(count + 1, GFP_KERNEL | __GFP_NOWARN);
+ if (!kbuf)
+ return ERR_PTR(-ENOMEM);
+
+ ret = simple_write_to_buffer(kbuf, count + 1, ppos, buf, count);
+ if (ret != count) {
+ kfree(kbuf);
+ return ERR_PTR(-EIO);
+ }
+ kbuf[ret] = '\0';
+
+ return kbuf;
+}
+
+static ssize_t dbgfs_attrs_read(struct file *file,
+ char __user *buf, size_t count, loff_t *ppos)
+{
+ struct damon_ctx *ctx = file->private_data;
+ char kbuf[128];
+ int ret;
+
+ mutex_lock(&ctx->kdamond_lock);
+ ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n",
+ ctx->attrs.sample_interval, ctx->attrs.aggr_interval,
+ ctx->attrs.ops_update_interval,
+ ctx->attrs.min_nr_regions, ctx->attrs.max_nr_regions);
+ mutex_unlock(&ctx->kdamond_lock);
+
+ return simple_read_from_buffer(buf, count, ppos, kbuf, ret);
+}
+
+static ssize_t dbgfs_attrs_write(struct file *file,
+ const char __user *buf, size_t count, loff_t *ppos)
+{
+ struct damon_ctx *ctx = file->private_data;
+ struct damon_attrs attrs;
+ char *kbuf;
+ ssize_t ret;
+
+ kbuf = user_input_str(buf, count, ppos);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
+
+ if (sscanf(kbuf, "%lu %lu %lu %lu %lu",
+ &attrs.sample_interval, &attrs.aggr_interval,
+ &attrs.ops_update_interval,
+ &attrs.min_nr_regions,
+ &attrs.max_nr_regions) != 5) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ mutex_lock(&ctx->kdamond_lock);
+ if (ctx->kdamond) {
+ ret = -EBUSY;
+ goto unlock_out;
+ }
+
+ ret = damon_set_attrs(ctx, &attrs);
+ if (!ret)
+ ret = count;
+unlock_out:
+ mutex_unlock(&ctx->kdamond_lock);
+out:
+ kfree(kbuf);
+ return ret;
+}
+
+/*
+ * Return the corresponding dbgfs scheme action value (int) for the given
+ * damos_action if it is valid and supported by dbgfs, or a negative error
+ * code otherwise.
+ */
+static int damos_action_to_dbgfs_scheme_action(enum damos_action action)
+{
+ switch (action) {
+ case DAMOS_WILLNEED:
+ return 0;
+ case DAMOS_COLD:
+ return 1;
+ case DAMOS_PAGEOUT:
+ return 2;
+ case DAMOS_HUGEPAGE:
+ return 3;
+ case DAMOS_NOHUGEPAGE:
+ return 4;
+ case DAMOS_STAT:
+ return 5;
+ default:
+ return -EINVAL;
+ }
+}
+
+static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len)
+{
+ struct damos *s;
+ int written = 0;
+ int rc;
+
+ damon_for_each_scheme(s, c) {
+ rc = scnprintf(&buf[written], len - written,
+ "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+ s->pattern.min_sz_region,
+ s->pattern.max_sz_region,
+ s->pattern.min_nr_accesses,
+ s->pattern.max_nr_accesses,
+ s->pattern.min_age_region,
+ s->pattern.max_age_region,
+ damos_action_to_dbgfs_scheme_action(s->action),
+ s->quota.ms, s->quota.sz,
+ s->quota.reset_interval,
+ s->quota.weight_sz,
+ s->quota.weight_nr_accesses,
+ s->quota.weight_age,
+ s->wmarks.metric, s->wmarks.interval,
+ s->wmarks.high, s->wmarks.mid, s->wmarks.low,
+ s->stat.nr_tried, s->stat.sz_tried,
+ s->stat.nr_applied, s->stat.sz_applied,
+ s->stat.qt_exceeds);
+ if (!rc)
+ return -ENOMEM;
+
+ written += rc;
+ }
+ return written;
+}
+
+static ssize_t dbgfs_schemes_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct damon_ctx *ctx = file->private_data;
+ char *kbuf;
+ ssize_t len;
+
+ kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN);
+ if (!kbuf)
+ return -ENOMEM;
+
+ mutex_lock(&ctx->kdamond_lock);
+ len = sprint_schemes(ctx, kbuf, count);
+ mutex_unlock(&ctx->kdamond_lock);
+ if (len < 0)
+ goto out;
+ len = simple_read_from_buffer(buf, count, ppos, kbuf, len);
+
+out:
+ kfree(kbuf);
+ return len;
+}
+
+static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes)
+{
+ ssize_t i;
+
+ for (i = 0; i < nr_schemes; i++)
+ kfree(schemes[i]);
+ kfree(schemes);
+}
+
+/*
+ * Return the corresponding damos_action for the given dbgfs scheme action
+ * input if the input is valid, or a negative error code otherwise.
+ */
+static enum damos_action dbgfs_scheme_action_to_damos_action(int dbgfs_action)
+{
+ switch (dbgfs_action) {
+ case 0:
+ return DAMOS_WILLNEED;
+ case 1:
+ return DAMOS_COLD;
+ case 2:
+ return DAMOS_PAGEOUT;
+ case 3:
+ return DAMOS_HUGEPAGE;
+ case 4:
+ return DAMOS_NOHUGEPAGE;
+ case 5:
+ return DAMOS_STAT;
+ default:
+ return -EINVAL;
+ }
+}
+
+/*
+ * Converts a string into an array of struct damos pointers.
+ *
+ * Returns the array of converted struct damos pointers if the conversion
+ * succeeds, or NULL otherwise.
+ */
+static struct damos **str_to_schemes(const char *str, ssize_t len,
+ ssize_t *nr_schemes)
+{
+ struct damos *scheme, **schemes;
+ const int max_nr_schemes = 256;
+ int pos = 0, parsed, ret;
+ unsigned int action_input;
+ enum damos_action action;
+
+ schemes = kmalloc_array(max_nr_schemes, sizeof(scheme),
+ GFP_KERNEL);
+ if (!schemes)
+ return NULL;
+
+ *nr_schemes = 0;
+ while (pos < len && *nr_schemes < max_nr_schemes) {
+ struct damos_access_pattern pattern = {};
+ struct damos_quota quota = {};
+ struct damos_watermarks wmarks;
+
+ ret = sscanf(&str[pos],
+ "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n",
+ &pattern.min_sz_region, &pattern.max_sz_region,
+ &pattern.min_nr_accesses,
+ &pattern.max_nr_accesses,
+ &pattern.min_age_region,
+ &pattern.max_age_region,
+ &action_input, &quota.ms,
+ &quota.sz, &quota.reset_interval,
+ &quota.weight_sz, &quota.weight_nr_accesses,
+ &quota.weight_age, &wmarks.metric,
+ &wmarks.interval, &wmarks.high, &wmarks.mid,
+ &wmarks.low, &parsed);
+ if (ret != 18)
+ break;
+ action = dbgfs_scheme_action_to_damos_action(action_input);
+ if ((int)action < 0)
+ goto fail;
+
+ if (pattern.min_sz_region > pattern.max_sz_region ||
+ pattern.min_nr_accesses > pattern.max_nr_accesses ||
+ pattern.min_age_region > pattern.max_age_region)
+ goto fail;
+
+ if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low ||
+ wmarks.mid < wmarks.low)
+ goto fail;
+
+ pos += parsed;
+ scheme = damon_new_scheme(&pattern, action, &quota, &wmarks);
+ if (!scheme)
+ goto fail;
+
+ schemes[*nr_schemes] = scheme;
+ *nr_schemes += 1;
+ }
+ return schemes;
+fail:
+ free_schemes_arr(schemes, *nr_schemes);
+ return NULL;
+}
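For illustration, one line that str_to_schemes() above would accept could look like the following (an editorial example, not taken from the patch; the action value follows the dbgfs encoding above, and metric=1 assumes DAMOS_WMARK_FREE_MEM_RATE is 1 in enum damos_wmark_metric):

    4096 18446744073709551615 0 0 10 4294967295 2 10 134217728 1000 0 0 1 1 5000000 500 400 200

    region size:      4096 bytes .. ULONG_MAX
    nr_accesses:      0 .. 0 (never observed as accessed)
    age:              10 .. UINT_MAX aggregation intervals
    action:           2 -> DAMOS_PAGEOUT
    quota:            10 ms / 128 MiB per 1000 ms reset interval
    quota weights:    sz=0, nr_accesses=0, age=1
    watermarks:       metric=1, checked every 5000000 us,
                      high=500, mid=400, low=200 (permil)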
+
+static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct damon_ctx *ctx = file->private_data;
+ char *kbuf;
+ struct damos **schemes;
+ ssize_t nr_schemes = 0, ret;
+
+ kbuf = user_input_str(buf, count, ppos);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
+
+ schemes = str_to_schemes(kbuf, count, &nr_schemes);
+ if (!schemes) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ mutex_lock(&ctx->kdamond_lock);
+ if (ctx->kdamond) {
+ ret = -EBUSY;
+ goto unlock_out;
+ }
+
+ damon_set_schemes(ctx, schemes, nr_schemes);
+ ret = count;
+ nr_schemes = 0;
+
+unlock_out:
+ mutex_unlock(&ctx->kdamond_lock);
+ free_schemes_arr(schemes, nr_schemes);
+out:
+ kfree(kbuf);
+ return ret;
+}
+
+static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len)
+{
+ struct damon_target *t;
+ int id;
+ int written = 0;
+ int rc;
+
+ damon_for_each_target(t, ctx) {
+ if (damon_target_has_pid(ctx))
+ /* Show pid numbers to debugfs users */
+ id = pid_vnr(t->pid);
+ else
+ /* Show 42 for physical address space, just for fun */
+ id = 42;
+
+ rc = scnprintf(&buf[written], len - written, "%d ", id);
+ if (!rc)
+ return -ENOMEM;
+ written += rc;
+ }
+ if (written)
+ written -= 1;
+ written += scnprintf(&buf[written], len - written, "\n");
+ return written;
+}
+
+static ssize_t dbgfs_target_ids_read(struct file *file,
+ char __user *buf, size_t count, loff_t *ppos)
+{
+ struct damon_ctx *ctx = file->private_data;
+ ssize_t len;
+ char ids_buf[320];
+
+ mutex_lock(&ctx->kdamond_lock);
+ len = sprint_target_ids(ctx, ids_buf, 320);
+ mutex_unlock(&ctx->kdamond_lock);
+ if (len < 0)
+ return len;
+
+ return simple_read_from_buffer(buf, count, ppos, ids_buf, len);
+}
+
+/*
+ * Converts a string into an array of integers.
+ *
+ * Returns the array of parsed integers if the conversion succeeds, or NULL
+ * otherwise.
+ */
+static int *str_to_ints(const char *str, ssize_t len, ssize_t *nr_ints)
+{
+ int *array;
+ const int max_nr_ints = 32;
+ int nr;
+ int pos = 0, parsed, ret;
+
+ *nr_ints = 0;
+ array = kmalloc_array(max_nr_ints, sizeof(*array), GFP_KERNEL);
+ if (!array)
+ return NULL;
+ while (*nr_ints < max_nr_ints && pos < len) {
+ ret = sscanf(&str[pos], "%d%n", &nr, &parsed);
+ pos += parsed;
+ if (ret != 1)
+ break;
+ array[*nr_ints] = nr;
+ *nr_ints += 1;
+ }
+
+ return array;
+}
+
+static void dbgfs_put_pids(struct pid **pids, int nr_pids)
+{
+ int i;
+
+ for (i = 0; i < nr_pids; i++)
+ put_pid(pids[i]);
+}
+
+/*
+ * Converts a string into an array of struct pid pointers.
+ *
+ * Returns the array of struct pid pointers if the conversion succeeds, or
+ * NULL otherwise.
+ */
+static struct pid **str_to_pids(const char *str, ssize_t len, ssize_t *nr_pids)
+{
+ int *ints;
+ ssize_t nr_ints;
+ struct pid **pids;
+
+ *nr_pids = 0;
+
+ ints = str_to_ints(str, len, &nr_ints);
+ if (!ints)
+ return NULL;
+
+ pids = kmalloc_array(nr_ints, sizeof(*pids), GFP_KERNEL);
+ if (!pids)
+ goto out;
+
+ for (; *nr_pids < nr_ints; (*nr_pids)++) {
+ pids[*nr_pids] = find_get_pid(ints[*nr_pids]);
+ if (!pids[*nr_pids]) {
+ dbgfs_put_pids(pids, *nr_pids);
+ kfree(ints);
+ kfree(pids);
+ return NULL;
+ }
+ }
+
+out:
+ kfree(ints);
+ return pids;
+}
+
+/*
+ * dbgfs_set_targets() - Set monitoring targets.
+ * @ctx: monitoring context
+ * @nr_targets: number of targets
+ * @pids:	array of target pids (its size must equal @nr_targets)
+ *
+ * This function should not be called while the kdamond is running. @pids is
+ * ignored if the context is not configured to have pid in each target. On
+ * failure, reference counts of all pids in @pids are decremented.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets,
+ struct pid **pids)
+{
+ ssize_t i;
+ struct damon_target *t, *next;
+
+ damon_for_each_target_safe(t, next, ctx) {
+ if (damon_target_has_pid(ctx))
+ put_pid(t->pid);
+ damon_destroy_target(t);
+ }
+
+ for (i = 0; i < nr_targets; i++) {
+ t = damon_new_target();
+ if (!t) {
+ damon_for_each_target_safe(t, next, ctx)
+ damon_destroy_target(t);
+ if (damon_target_has_pid(ctx))
+ dbgfs_put_pids(pids, nr_targets);
+ return -ENOMEM;
+ }
+ if (damon_target_has_pid(ctx))
+ t->pid = pids[i];
+ damon_add_target(ctx, t);
+ }
+
+ return 0;
+}
+
+static ssize_t dbgfs_target_ids_write(struct file *file,
+ const char __user *buf, size_t count, loff_t *ppos)
+{
+ struct damon_ctx *ctx = file->private_data;
+ bool id_is_pid = true;
+ char *kbuf;
+ struct pid **target_pids = NULL;
+ ssize_t nr_targets;
+ ssize_t ret;
+
+ kbuf = user_input_str(buf, count, ppos);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
+
+ if (!strncmp(kbuf, "paddr\n", count)) {
+ id_is_pid = false;
+ nr_targets = 1;
+ }
+
+ if (id_is_pid) {
+ target_pids = str_to_pids(kbuf, count, &nr_targets);
+ if (!target_pids) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ mutex_lock(&ctx->kdamond_lock);
+ if (ctx->kdamond) {
+ if (id_is_pid)
+ dbgfs_put_pids(target_pids, nr_targets);
+ ret = -EBUSY;
+ goto unlock_out;
+ }
+
+ /* remove previously set targets */
+ dbgfs_set_targets(ctx, 0, NULL);
+ if (!nr_targets) {
+ ret = count;
+ goto unlock_out;
+ }
+
+ /* Configure the context for the address space type */
+ if (id_is_pid)
+ ret = damon_select_ops(ctx, DAMON_OPS_VADDR);
+ else
+ ret = damon_select_ops(ctx, DAMON_OPS_PADDR);
+ if (ret)
+ goto unlock_out;
+
+ ret = dbgfs_set_targets(ctx, nr_targets, target_pids);
+ if (!ret)
+ ret = count;
+
+unlock_out:
+ mutex_unlock(&ctx->kdamond_lock);
+ kfree(target_pids);
+out:
+ kfree(kbuf);
+ return ret;
+}
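To illustrate the two input forms the write handler above distinguishes (the pid numbers are hypothetical):

    "1234 5678\n" -> parsed by str_to_pids(); selects DAMON_OPS_VADDR and
                     creates one monitoring target per pid
    "paddr\n"     -> selects DAMON_OPS_PADDR and creates a single, pid-less
                     target (reported back as id 42)
    "\n"          -> no pids parsed; only removes the previously set targets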
+
+static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ int target_idx = 0;
+ int written = 0;
+ int rc;
+
+ damon_for_each_target(t, c) {
+ damon_for_each_region(r, t) {
+ rc = scnprintf(&buf[written], len - written,
+ "%d %lu %lu\n",
+ target_idx, r->ar.start, r->ar.end);
+ if (!rc)
+ return -ENOMEM;
+ written += rc;
+ }
+ target_idx++;
+ }
+ return written;
+}
+
+static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct damon_ctx *ctx = file->private_data;
+ char *kbuf;
+ ssize_t len;
+
+ kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN);
+ if (!kbuf)
+ return -ENOMEM;
+
+ mutex_lock(&ctx->kdamond_lock);
+ if (ctx->kdamond) {
+ mutex_unlock(&ctx->kdamond_lock);
+ len = -EBUSY;
+ goto out;
+ }
+
+ len = sprint_init_regions(ctx, kbuf, count);
+ mutex_unlock(&ctx->kdamond_lock);
+ if (len < 0)
+ goto out;
+ len = simple_read_from_buffer(buf, count, ppos, kbuf, len);
+
+out:
+ kfree(kbuf);
+ return len;
+}
+
+static int add_init_region(struct damon_ctx *c, int target_idx,
+ struct damon_addr_range *ar)
+{
+ struct damon_target *t;
+ struct damon_region *r, *prev;
+ unsigned long idx = 0;
+ int rc = -EINVAL;
+
+ if (ar->start >= ar->end)
+ return -EINVAL;
+
+ damon_for_each_target(t, c) {
+ if (idx++ == target_idx) {
+ r = damon_new_region(ar->start, ar->end);
+ if (!r)
+ return -ENOMEM;
+ damon_add_region(r, t);
+ if (damon_nr_regions(t) > 1) {
+ prev = damon_prev_region(r);
+ if (prev->ar.end > r->ar.start) {
+ damon_destroy_region(r, t);
+ return -EINVAL;
+ }
+ }
+ rc = 0;
+ }
+ }
+ return rc;
+}
+
+static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len)
+{
+ struct damon_target *t;
+ struct damon_region *r, *next;
+ int pos = 0, parsed, ret;
+ int target_idx;
+ struct damon_addr_range ar;
+ int err;
+
+ damon_for_each_target(t, c) {
+ damon_for_each_region_safe(r, next, t)
+ damon_destroy_region(r, t);
+ }
+
+ while (pos < len) {
+ ret = sscanf(&str[pos], "%d %lu %lu%n",
+ &target_idx, &ar.start, &ar.end, &parsed);
+ if (ret != 3)
+ break;
+ err = add_init_region(c, target_idx, &ar);
+ if (err)
+ goto fail;
+ pos += parsed;
+ }
+
+ return 0;
+
+fail:
+ damon_for_each_target(t, c) {
+ damon_for_each_region_safe(r, next, t)
+ damon_destroy_region(r, t);
+ }
+ return err;
+}
+
+static ssize_t dbgfs_init_regions_write(struct file *file,
+ const char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ struct damon_ctx *ctx = file->private_data;
+ char *kbuf;
+ ssize_t ret = count;
+ int err;
+
+ kbuf = user_input_str(buf, count, ppos);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
+
+ mutex_lock(&ctx->kdamond_lock);
+ if (ctx->kdamond) {
+ ret = -EBUSY;
+ goto unlock_out;
+ }
+
+ err = set_init_regions(ctx, kbuf, ret);
+ if (err)
+ ret = err;
+
+unlock_out:
+ mutex_unlock(&ctx->kdamond_lock);
+ kfree(kbuf);
+ return ret;
+}
+
+static ssize_t dbgfs_kdamond_pid_read(struct file *file,
+ char __user *buf, size_t count, loff_t *ppos)
+{
+ struct damon_ctx *ctx = file->private_data;
+ char *kbuf;
+ ssize_t len;
+
+ kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN);
+ if (!kbuf)
+ return -ENOMEM;
+
+ mutex_lock(&ctx->kdamond_lock);
+ if (ctx->kdamond)
+ len = scnprintf(kbuf, count, "%d\n", ctx->kdamond->pid);
+ else
+ len = scnprintf(kbuf, count, "none\n");
+ mutex_unlock(&ctx->kdamond_lock);
+ if (!len)
+ goto out;
+ len = simple_read_from_buffer(buf, count, ppos, kbuf, len);
+
+out:
+ kfree(kbuf);
+ return len;
+}
+
+static int damon_dbgfs_open(struct inode *inode, struct file *file)
+{
+ damon_dbgfs_warn_deprecation();
+
+ file->private_data = inode->i_private;
+
+ return nonseekable_open(inode, file);
+}
+
+static const struct file_operations attrs_fops = {
+ .open = damon_dbgfs_open,
+ .read = dbgfs_attrs_read,
+ .write = dbgfs_attrs_write,
+};
+
+static const struct file_operations schemes_fops = {
+ .open = damon_dbgfs_open,
+ .read = dbgfs_schemes_read,
+ .write = dbgfs_schemes_write,
+};
+
+static const struct file_operations target_ids_fops = {
+ .open = damon_dbgfs_open,
+ .read = dbgfs_target_ids_read,
+ .write = dbgfs_target_ids_write,
+};
+
+static const struct file_operations init_regions_fops = {
+ .open = damon_dbgfs_open,
+ .read = dbgfs_init_regions_read,
+ .write = dbgfs_init_regions_write,
+};
+
+static const struct file_operations kdamond_pid_fops = {
+ .open = damon_dbgfs_open,
+ .read = dbgfs_kdamond_pid_read,
+};
+
+static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx)
+{
+ const char * const file_names[] = {"attrs", "schemes", "target_ids",
+ "init_regions", "kdamond_pid"};
+ const struct file_operations *fops[] = {&attrs_fops, &schemes_fops,
+ &target_ids_fops, &init_regions_fops, &kdamond_pid_fops};
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(file_names); i++)
+ debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]);
+}
+
+static void dbgfs_before_terminate(struct damon_ctx *ctx)
+{
+ struct damon_target *t, *next;
+
+ if (!damon_target_has_pid(ctx))
+ return;
+
+ mutex_lock(&ctx->kdamond_lock);
+ damon_for_each_target_safe(t, next, ctx) {
+ put_pid(t->pid);
+ damon_destroy_target(t);
+ }
+ mutex_unlock(&ctx->kdamond_lock);
+}
+
+static struct damon_ctx *dbgfs_new_ctx(void)
+{
+ struct damon_ctx *ctx;
+
+ ctx = damon_new_ctx();
+ if (!ctx)
+ return NULL;
+
+ if (damon_select_ops(ctx, DAMON_OPS_VADDR) &&
+ damon_select_ops(ctx, DAMON_OPS_PADDR)) {
+ damon_destroy_ctx(ctx);
+ return NULL;
+ }
+ ctx->callback.before_terminate = dbgfs_before_terminate;
+ return ctx;
+}
+
+static void dbgfs_destroy_ctx(struct damon_ctx *ctx)
+{
+ damon_destroy_ctx(ctx);
+}
+
+/*
+ * Make a context of @name and create a debugfs directory for it.
+ *
+ * This function should be called while holding damon_dbgfs_lock.
+ *
+ * Returns 0 on success, negative error code otherwise.
+ */
+static int dbgfs_mk_context(char *name)
+{
+ struct dentry *root, **new_dirs, *new_dir;
+ struct damon_ctx **new_ctxs, *new_ctx;
+
+ if (damon_nr_running_ctxs())
+ return -EBUSY;
+
+ new_ctxs = krealloc(dbgfs_ctxs, sizeof(*dbgfs_ctxs) *
+ (dbgfs_nr_ctxs + 1), GFP_KERNEL);
+ if (!new_ctxs)
+ return -ENOMEM;
+ dbgfs_ctxs = new_ctxs;
+
+ new_dirs = krealloc(dbgfs_dirs, sizeof(*dbgfs_dirs) *
+ (dbgfs_nr_ctxs + 1), GFP_KERNEL);
+ if (!new_dirs)
+ return -ENOMEM;
+ dbgfs_dirs = new_dirs;
+
+ root = dbgfs_dirs[0];
+ if (!root)
+ return -ENOENT;
+
+ new_dir = debugfs_create_dir(name, root);
+ /* Below check is required for a potential duplicated name case */
+ if (IS_ERR(new_dir))
+ return PTR_ERR(new_dir);
+ dbgfs_dirs[dbgfs_nr_ctxs] = new_dir;
+
+ new_ctx = dbgfs_new_ctx();
+ if (!new_ctx) {
+ debugfs_remove(new_dir);
+ dbgfs_dirs[dbgfs_nr_ctxs] = NULL;
+ return -ENOMEM;
+ }
+
+ dbgfs_ctxs[dbgfs_nr_ctxs] = new_ctx;
+ dbgfs_fill_ctx_dir(dbgfs_dirs[dbgfs_nr_ctxs],
+ dbgfs_ctxs[dbgfs_nr_ctxs]);
+ dbgfs_nr_ctxs++;
+
+ return 0;
+}
+
+static ssize_t dbgfs_mk_context_write(struct file *file,
+ const char __user *buf, size_t count, loff_t *ppos)
+{
+ char *kbuf;
+ char *ctx_name;
+ ssize_t ret;
+
+ kbuf = user_input_str(buf, count, ppos);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
+ ctx_name = kmalloc(count + 1, GFP_KERNEL);
+ if (!ctx_name) {
+ kfree(kbuf);
+ return -ENOMEM;
+ }
+
+ /* Trim white space */
+ if (sscanf(kbuf, "%s", ctx_name) != 1) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ mutex_lock(&damon_dbgfs_lock);
+ ret = dbgfs_mk_context(ctx_name);
+ if (!ret)
+ ret = count;
+ mutex_unlock(&damon_dbgfs_lock);
+
+out:
+ kfree(kbuf);
+ kfree(ctx_name);
+ return ret;
+}
+
+/*
+ * Remove a context of @name and its debugfs directory.
+ *
+ * This function should be called while holding damon_dbgfs_lock.
+ *
+ * Return 0 on success, negative error code otherwise.
+ */
+static int dbgfs_rm_context(char *name)
+{
+ struct dentry *root, *dir, **new_dirs;
+ struct inode *inode;
+ struct damon_ctx **new_ctxs;
+ int i, j;
+ int ret = 0;
+
+ if (damon_nr_running_ctxs())
+ return -EBUSY;
+
+ root = dbgfs_dirs[0];
+ if (!root)
+ return -ENOENT;
+
+ dir = debugfs_lookup(name, root);
+ if (!dir)
+ return -ENOENT;
+
+ inode = d_inode(dir);
+ if (!S_ISDIR(inode->i_mode)) {
+ ret = -EINVAL;
+ goto out_dput;
+ }
+
+ new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs),
+ GFP_KERNEL);
+ if (!new_dirs) {
+ ret = -ENOMEM;
+ goto out_dput;
+ }
+
+ new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs),
+ GFP_KERNEL);
+ if (!new_ctxs) {
+ ret = -ENOMEM;
+ goto out_new_dirs;
+ }
+
+ for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) {
+ if (dbgfs_dirs[i] == dir) {
+ debugfs_remove(dbgfs_dirs[i]);
+ dbgfs_destroy_ctx(dbgfs_ctxs[i]);
+ continue;
+ }
+ new_dirs[j] = dbgfs_dirs[i];
+ new_ctxs[j++] = dbgfs_ctxs[i];
+ }
+
+ kfree(dbgfs_dirs);
+ kfree(dbgfs_ctxs);
+
+ dbgfs_dirs = new_dirs;
+ dbgfs_ctxs = new_ctxs;
+ dbgfs_nr_ctxs--;
+
+ goto out_dput;
+
+out_new_dirs:
+ kfree(new_dirs);
+out_dput:
+ dput(dir);
+ return ret;
+}
+
+static ssize_t dbgfs_rm_context_write(struct file *file,
+ const char __user *buf, size_t count, loff_t *ppos)
+{
+ char *kbuf;
+ ssize_t ret;
+ char *ctx_name;
+
+ kbuf = user_input_str(buf, count, ppos);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
+ ctx_name = kmalloc(count + 1, GFP_KERNEL);
+ if (!ctx_name) {
+ kfree(kbuf);
+ return -ENOMEM;
+ }
+
+ /* Trim white space */
+ if (sscanf(kbuf, "%s", ctx_name) != 1) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ mutex_lock(&damon_dbgfs_lock);
+ ret = dbgfs_rm_context(ctx_name);
+ if (!ret)
+ ret = count;
+ mutex_unlock(&damon_dbgfs_lock);
+
+out:
+ kfree(kbuf);
+ kfree(ctx_name);
+ return ret;
+}
+
+static ssize_t dbgfs_monitor_on_read(struct file *file,
+ char __user *buf, size_t count, loff_t *ppos)
+{
+ char monitor_on_buf[5];
+ bool monitor_on = damon_nr_running_ctxs() != 0;
+ int len;
+
+ len = scnprintf(monitor_on_buf, 5, monitor_on ? "on\n" : "off\n");
+
+ return simple_read_from_buffer(buf, count, ppos, monitor_on_buf, len);
+}
+
+static ssize_t dbgfs_monitor_on_write(struct file *file,
+ const char __user *buf, size_t count, loff_t *ppos)
+{
+ ssize_t ret;
+ char *kbuf;
+
+ kbuf = user_input_str(buf, count, ppos);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
+
+ /* Remove white space */
+ if (sscanf(kbuf, "%s", kbuf) != 1) {
+ kfree(kbuf);
+ return -EINVAL;
+ }
+
+ mutex_lock(&damon_dbgfs_lock);
+ if (!strncmp(kbuf, "on", count)) {
+ int i;
+
+ for (i = 0; i < dbgfs_nr_ctxs; i++) {
+ if (damon_targets_empty(dbgfs_ctxs[i])) {
+ kfree(kbuf);
+ mutex_unlock(&damon_dbgfs_lock);
+ return -EINVAL;
+ }
+ }
+ ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs, true);
+ } else if (!strncmp(kbuf, "off", count)) {
+ ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs);
+ } else {
+ ret = -EINVAL;
+ }
+ mutex_unlock(&damon_dbgfs_lock);
+
+ if (!ret)
+ ret = count;
+ kfree(kbuf);
+ return ret;
+}
+
+static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file)
+{
+ damon_dbgfs_warn_deprecation();
+ return nonseekable_open(inode, file);
+}
+
+static const struct file_operations mk_contexts_fops = {
+ .open = damon_dbgfs_static_file_open,
+ .write = dbgfs_mk_context_write,
+};
+
+static const struct file_operations rm_contexts_fops = {
+ .open = damon_dbgfs_static_file_open,
+ .write = dbgfs_rm_context_write,
+};
+
+static const struct file_operations monitor_on_fops = {
+ .open = damon_dbgfs_static_file_open,
+ .read = dbgfs_monitor_on_read,
+ .write = dbgfs_monitor_on_write,
+};
+
+static int __init __damon_dbgfs_init(void)
+{
+ struct dentry *dbgfs_root;
+ const char * const file_names[] = {"mk_contexts", "rm_contexts",
+ "monitor_on"};
+ const struct file_operations *fops[] = {&mk_contexts_fops,
+ &rm_contexts_fops, &monitor_on_fops};
+ int i;
+
+ dbgfs_root = debugfs_create_dir("damon", NULL);
+
+ for (i = 0; i < ARRAY_SIZE(file_names); i++)
+ debugfs_create_file(file_names[i], 0600, dbgfs_root, NULL,
+ fops[i]);
+ dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]);
+
+ dbgfs_dirs = kmalloc(sizeof(dbgfs_root), GFP_KERNEL);
+ if (!dbgfs_dirs) {
+ debugfs_remove(dbgfs_root);
+ return -ENOMEM;
+ }
+ dbgfs_dirs[0] = dbgfs_root;
+
+ return 0;
+}
+
+/*
+ * Functions for the initialization
+ */
+
+static int __init damon_dbgfs_init(void)
+{
+ int rc = -ENOMEM;
+
+ mutex_lock(&damon_dbgfs_lock);
+ dbgfs_ctxs = kmalloc(sizeof(*dbgfs_ctxs), GFP_KERNEL);
+ if (!dbgfs_ctxs)
+ goto out;
+ dbgfs_ctxs[0] = dbgfs_new_ctx();
+ if (!dbgfs_ctxs[0]) {
+ kfree(dbgfs_ctxs);
+ goto out;
+ }
+ dbgfs_nr_ctxs = 1;
+
+ rc = __damon_dbgfs_init();
+ if (rc) {
+ kfree(dbgfs_ctxs[0]);
+ kfree(dbgfs_ctxs);
+ pr_err("%s: dbgfs init failed\n", __func__);
+ }
+
+out:
+ mutex_unlock(&damon_dbgfs_lock);
+ return rc;
+}
+
+module_init(damon_dbgfs_init);
+
+#include "dbgfs-test.h"
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
new file mode 100644
index 000000000000..7b8fce2f67a8
--- /dev/null
+++ b/mm/damon/lru_sort.c
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DAMON-based LRU-lists Sorting
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#define pr_fmt(fmt) "damon-lru-sort: " fmt
+
+#include <linux/damon.h>
+#include <linux/kstrtox.h>
+#include <linux/module.h>
+
+#include "modules-common.h"
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "damon_lru_sort."
+
+/*
+ * Enable or disable DAMON_LRU_SORT.
+ *
+ * You can enable DAMON_LRU_SORT by setting the value of this parameter as
+ * ``Y``.  Setting it as ``N`` disables DAMON_LRU_SORT.  Note that even when
+ * enabled, DAMON_LRU_SORT may do no real monitoring or LRU-lists sorting
+ * because of the watermarks-based activation condition.  Refer to the
+ * descriptions of the watermarks parameters below for details.
+ */
+static bool enabled __read_mostly;
+
+/*
+ * Make DAMON_LRU_SORT read the input parameters again, except ``enabled``.
+ *
+ * Input parameters that are updated while DAMON_LRU_SORT is running are not
+ * applied by default.  Once this parameter is set as ``Y``, DAMON_LRU_SORT
+ * reads the values of the parameters except ``enabled`` again.  Once the
+ * re-reading is done, this parameter is set back to ``N``.  If invalid
+ * parameters are found during the re-reading, DAMON_LRU_SORT will be
+ * disabled.
+ */
+static bool commit_inputs __read_mostly;
+module_param(commit_inputs, bool, 0600);
+
+/*
+ * Access frequency threshold for hot memory regions identification in permil.
+ *
+ * If a memory region is accessed at this frequency or higher, DAMON_LRU_SORT
+ * identifies the region as hot and marks it as accessed on the LRU list, so
+ * that it is less likely to be reclaimed under memory pressure.  50% by
+ * default.
+ */
+static unsigned long hot_thres_access_freq = 500;
+module_param(hot_thres_access_freq, ulong, 0600);
+
+/*
+ * Time threshold for cold memory regions identification in microseconds.
+ *
+ * If a memory region is not accessed for this or a longer time, DAMON_LRU_SORT
+ * identifies the region as cold and marks it as unaccessed on the LRU list,
+ * so that it can be reclaimed first under memory pressure.  120 seconds by
+ * default.
+ */
+static unsigned long cold_min_age __read_mostly = 120000000;
+module_param(cold_min_age, ulong, 0600);
+
+static struct damos_quota damon_lru_sort_quota = {
+ /* Use up to 10 ms per 1 sec, by default */
+ .ms = 10,
+ .sz = 0,
+ .reset_interval = 1000,
+ /* Within the quota, mark hotter regions accessed first. */
+ .weight_sz = 0,
+ .weight_nr_accesses = 1,
+ .weight_age = 0,
+};
+DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(damon_lru_sort_quota);
+
+static struct damos_watermarks damon_lru_sort_wmarks = {
+ .metric = DAMOS_WMARK_FREE_MEM_RATE,
+ .interval = 5000000, /* 5 seconds */
+ .high = 200, /* 20 percent */
+ .mid = 150, /* 15 percent */
+ .low = 50, /* 5 percent */
+};
+DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_lru_sort_wmarks);
+
+static struct damon_attrs damon_lru_sort_mon_attrs = {
+ .sample_interval = 5000, /* 5 ms */
+ .aggr_interval = 100000, /* 100 ms */
+ .ops_update_interval = 0,
+ .min_nr_regions = 10,
+ .max_nr_regions = 1000,
+};
+DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_lru_sort_mon_attrs);
+
+/*
+ * Start of the target memory region in physical address.
+ *
+ * The start physical address of the memory region that DAMON_LRU_SORT will
+ * work against.  By default, the biggest System RAM resource is used as the
+ * region.
+ */
+static unsigned long monitor_region_start __read_mostly;
+module_param(monitor_region_start, ulong, 0600);
+
+/*
+ * End of the target memory region in physical address.
+ *
+ * The end physical address of the memory region that DAMON_LRU_SORT will
+ * work against.  By default, the biggest System RAM resource is used as the
+ * region.
+ */
+static unsigned long monitor_region_end __read_mostly;
+module_param(monitor_region_end, ulong, 0600);
+
+/*
+ * PID of the DAMON thread
+ *
+ * If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread.
+ * Else, -1.
+ */
+static int kdamond_pid __read_mostly = -1;
+module_param(kdamond_pid, int, 0400);
+
+static struct damos_stat damon_lru_sort_hot_stat;
+DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_hot_stat,
+ lru_sort_tried_hot_regions, lru_sorted_hot_regions,
+ hot_quota_exceeds);
+
+static struct damos_stat damon_lru_sort_cold_stat;
+DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_cold_stat,
+ lru_sort_tried_cold_regions, lru_sorted_cold_regions,
+ cold_quota_exceeds);
+
+static struct damos_access_pattern damon_lru_sort_stub_pattern = {
+ /* Find regions having PAGE_SIZE or larger size */
+ .min_sz_region = PAGE_SIZE,
+ .max_sz_region = ULONG_MAX,
+ /* no matter its access frequency */
+ .min_nr_accesses = 0,
+ .max_nr_accesses = UINT_MAX,
+ /* no matter its age */
+ .min_age_region = 0,
+ .max_age_region = UINT_MAX,
+};
+
+static struct damon_ctx *ctx;
+static struct damon_target *target;
+
+static struct damos *damon_lru_sort_new_scheme(
+ struct damos_access_pattern *pattern, enum damos_action action)
+{
+ struct damos_quota quota = damon_lru_sort_quota;
+
+ /* Use half of total quota for hot/cold pages sorting */
+ quota.ms = quota.ms / 2;
+
+ return damon_new_scheme(
+ /* find the pattern, and */
+ pattern,
+ /* (de)prioritize on LRU-lists */
+ action,
+ /* under the quota. */
+ &quota,
+ /* (De)activate this according to the watermarks. */
+ &damon_lru_sort_wmarks);
+}
+
+/* Create a DAMON-based operation scheme for hot memory regions */
+static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres)
+{
+ struct damos_access_pattern pattern = damon_lru_sort_stub_pattern;
+
+ pattern.min_nr_accesses = hot_thres;
+ return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_PRIO);
+}
+
+/* Create a DAMON-based operation scheme for cold memory regions */
+static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres)
+{
+ struct damos_access_pattern pattern = damon_lru_sort_stub_pattern;
+
+ pattern.max_nr_accesses = 0;
+ pattern.min_age_region = cold_thres;
+ return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO);
+}
+
+static int damon_lru_sort_apply_parameters(void)
+{
+ struct damos *scheme;
+ unsigned int hot_thres, cold_thres;
+ int err = 0;
+
+ err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs);
+ if (err)
+ return err;
+
+ /* aggr_interval / sample_interval is the maximum nr_accesses */
+ hot_thres = damon_lru_sort_mon_attrs.aggr_interval /
+ damon_lru_sort_mon_attrs.sample_interval *
+ hot_thres_access_freq / 1000;
+ scheme = damon_lru_sort_new_hot_scheme(hot_thres);
+ if (!scheme)
+ return -ENOMEM;
+ damon_set_schemes(ctx, &scheme, 1);
+
+ cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval;
+ scheme = damon_lru_sort_new_cold_scheme(cold_thres);
+ if (!scheme)
+ return -ENOMEM;
+ damon_add_scheme(ctx, scheme);
+
+ return damon_set_region_biggest_system_ram_default(target,
+ &monitor_region_start,
+ &monitor_region_end);
+}
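With the defaults defined earlier in this file (sample_interval=5 ms, aggr_interval=100 ms, hot_thres_access_freq=500 permil, cold_min_age=120 s), the integer arithmetic above works out to:

    max nr_accesses per aggregation = 100000 / 5000   = 20 samples
    hot_thres  = 20 * 500 / 1000                      = 10 accesses
    cold_thres = 120000000 / 100000                   = 1200 aggregation intervals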
+
+static int damon_lru_sort_turn(bool on)
+{
+ int err;
+
+ if (!on) {
+ err = damon_stop(&ctx, 1);
+ if (!err)
+ kdamond_pid = -1;
+ return err;
+ }
+
+ err = damon_lru_sort_apply_parameters();
+ if (err)
+ return err;
+
+ err = damon_start(&ctx, 1, true);
+ if (err)
+ return err;
+ kdamond_pid = ctx->kdamond->pid;
+ return 0;
+}
+
+static int damon_lru_sort_enabled_store(const char *val,
+ const struct kernel_param *kp)
+{
+ bool is_enabled = enabled;
+ bool enable;
+ int err;
+
+ err = kstrtobool(val, &enable);
+ if (err)
+ return err;
+
+ if (is_enabled == enable)
+ return 0;
+
+ /* Called before init function. The function will handle this. */
+ if (!ctx)
+ goto set_param_out;
+
+ err = damon_lru_sort_turn(enable);
+ if (err)
+ return err;
+
+set_param_out:
+ enabled = enable;
+ return err;
+}
+
+static const struct kernel_param_ops enabled_param_ops = {
+ .set = damon_lru_sort_enabled_store,
+ .get = param_get_bool,
+};
+
+module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
+MODULE_PARM_DESC(enabled,
+ "Enable or disable DAMON_LRU_SORT (default: disabled)");
+
+static int damon_lru_sort_handle_commit_inputs(void)
+{
+ int err;
+
+ if (!commit_inputs)
+ return 0;
+
+ err = damon_lru_sort_apply_parameters();
+ commit_inputs = false;
+ return err;
+}
+
+static int damon_lru_sort_after_aggregation(struct damon_ctx *c)
+{
+ struct damos *s;
+
+ /* update the stats parameter */
+ damon_for_each_scheme(s, c) {
+ if (s->action == DAMOS_LRU_PRIO)
+ damon_lru_sort_hot_stat = s->stat;
+ else if (s->action == DAMOS_LRU_DEPRIO)
+ damon_lru_sort_cold_stat = s->stat;
+ }
+
+ return damon_lru_sort_handle_commit_inputs();
+}
+
+static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c)
+{
+ return damon_lru_sort_handle_commit_inputs();
+}
+
+static int __init damon_lru_sort_init(void)
+{
+ int err = damon_modules_new_paddr_ctx_target(&ctx, &target);
+
+ if (err)
+ return err;
+
+ ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check;
+ ctx->callback.after_aggregation = damon_lru_sort_after_aggregation;
+
+	/* 'enabled' may have been set before this function, probably via the command line */
+ if (enabled)
+ err = damon_lru_sort_turn(true);
+
+ return err;
+}
+
+module_init(damon_lru_sort_init);
diff --git a/mm/damon/modules-common.c b/mm/damon/modules-common.c
new file mode 100644
index 000000000000..b2381a8466ec
--- /dev/null
+++ b/mm/damon/modules-common.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common Primitives for DAMON Modules
+ *
+ * Author: SeongJae Park <sjpark@amazon.de>
+ */
+
+#include <linux/damon.h>
+
+#include "modules-common.h"
+
+/*
+ * Allocate, set, and return a DAMON context for the physical address space.
+ * @ctxp:	Pointer for saving the pointer to the newly created context
+ * @targetp:	Pointer for saving the pointer to the newly created target
+ */
+int damon_modules_new_paddr_ctx_target(struct damon_ctx **ctxp,
+ struct damon_target **targetp)
+{
+ struct damon_ctx *ctx;
+ struct damon_target *target;
+
+ ctx = damon_new_ctx();
+ if (!ctx)
+ return -ENOMEM;
+
+ if (damon_select_ops(ctx, DAMON_OPS_PADDR)) {
+ damon_destroy_ctx(ctx);
+ return -EINVAL;
+ }
+
+ target = damon_new_target();
+ if (!target) {
+ damon_destroy_ctx(ctx);
+ return -ENOMEM;
+ }
+ damon_add_target(ctx, target);
+
+ *ctxp = ctx;
+ *targetp = target;
+ return 0;
+}
diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h
new file mode 100644
index 000000000000..f49cdb417005
--- /dev/null
+++ b/mm/damon/modules-common.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common Primitives for DAMON Modules
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#include <linux/moduleparam.h>
+
+#define DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(attrs) \
+ module_param_named(sample_interval, attrs.sample_interval, \
+ ulong, 0600); \
+ module_param_named(aggr_interval, attrs.aggr_interval, ulong, \
+ 0600); \
+ module_param_named(min_nr_regions, attrs.min_nr_regions, ulong, \
+ 0600); \
+ module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \
+ 0600);
+
+#define DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(quota) \
+ module_param_named(quota_ms, quota.ms, ulong, 0600); \
+ module_param_named(quota_reset_interval_ms, \
+ quota.reset_interval, ulong, 0600);
+
+#define DEFINE_DAMON_MODULES_DAMOS_QUOTAS(quota) \
+ DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(quota) \
+ module_param_named(quota_sz, quota.sz, ulong, 0600);
+
+#define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \
+ module_param_named(wmarks_interval, wmarks.interval, ulong, \
+ 0600); \
+ module_param_named(wmarks_high, wmarks.high, ulong, 0600); \
+ module_param_named(wmarks_mid, wmarks.mid, ulong, 0600); \
+ module_param_named(wmarks_low, wmarks.low, ulong, 0600);
+
+#define DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(stat, try_name, \
+ succ_name, qt_exceed_name) \
+ module_param_named(nr_##try_name, stat.nr_tried, ulong, 0400); \
+ module_param_named(bytes_##try_name, stat.sz_tried, ulong, \
+ 0400); \
+ module_param_named(nr_##succ_name, stat.nr_applied, ulong, \
+ 0400); \
+ module_param_named(bytes_##succ_name, stat.sz_applied, ulong, \
+ 0400); \
+ module_param_named(nr_##qt_exceed_name, stat.qt_exceeds, ulong, \
+ 0400);
+
+int damon_modules_new_paddr_ctx_target(struct damon_ctx **ctxp,
+ struct damon_target **targetp);
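As an expansion example, the DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(damon_lru_sort_quota) invocation seen earlier in this diff simply becomes:

	module_param_named(quota_ms, damon_lru_sort_quota.ms, ulong, 0600);
	module_param_named(quota_reset_interval_ms,
			damon_lru_sort_quota.reset_interval, ulong, 0600);

so, combined with MODULE_PARAM_PREFIX "damon_lru_sort.", the knobs are exposed as damon_lru_sort.quota_ms and damon_lru_sort.quota_reset_interval_ms (under /sys/module/damon_lru_sort/parameters/ when built in).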
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
new file mode 100644
index 000000000000..e940802a15a4
--- /dev/null
+++ b/mm/damon/ops-common.c
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common Primitives for Data Access Monitoring
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+
+#include "ops-common.h"
+
+/*
+ * Get the folio of a pfn if the page is online and on an LRU list.
+ * Otherwise, returns NULL.
+ *
+ * The body of this function is copied from 'page_idle_get_folio()'.  We
+ * duplicate rather than reuse it because the code is quite simple.
+ */
+struct folio *damon_get_folio(unsigned long pfn)
+{
+ struct page *page = pfn_to_online_page(pfn);
+ struct folio *folio;
+
+ if (!page || PageTail(page))
+ return NULL;
+
+ folio = page_folio(page);
+ if (!folio_test_lru(folio) || !folio_try_get(folio))
+ return NULL;
+ if (unlikely(page_folio(page) != folio || !folio_test_lru(folio))) {
+ folio_put(folio);
+ folio = NULL;
+ }
+ return folio;
+}
+
+void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr)
+{
+ struct folio *folio = damon_get_folio(pte_pfn(ptep_get(pte)));
+
+ if (!folio)
+ return;
+
+ if (ptep_clear_young_notify(vma, addr, pte))
+ folio_set_young(folio);
+
+ folio_set_idle(folio);
+ folio_put(folio);
+}
+
+void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ struct folio *folio = damon_get_folio(pmd_pfn(*pmd));
+
+ if (!folio)
+ return;
+
+ if (pmdp_clear_young_notify(vma, addr, pmd))
+ folio_set_young(folio);
+
+ folio_set_idle(folio);
+ folio_put(folio);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+}
+
+#define DAMON_MAX_SUBSCORE (100)
+#define DAMON_MAX_AGE_IN_LOG (32)
+
+int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
+ struct damos *s)
+{
+ unsigned int max_nr_accesses;
+ int freq_subscore;
+ unsigned int age_in_sec;
+ int age_in_log, age_subscore;
+ unsigned int freq_weight = s->quota.weight_nr_accesses;
+ unsigned int age_weight = s->quota.weight_age;
+ int hotness;
+
+ max_nr_accesses = c->attrs.aggr_interval / c->attrs.sample_interval;
+ freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses;
+
+ age_in_sec = (unsigned long)r->age * c->attrs.aggr_interval / 1000000;
+ for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec;
+ age_in_log++, age_in_sec >>= 1)
+ ;
+
+ /* If frequency is 0, higher age means it's colder */
+ if (freq_subscore == 0)
+ age_in_log *= -1;
+
+ /*
+ * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG].
+ * Scale it to be in [0, 100] and set it as age subscore.
+ */
+ age_in_log += DAMON_MAX_AGE_IN_LOG;
+ age_subscore = age_in_log * DAMON_MAX_SUBSCORE /
+ DAMON_MAX_AGE_IN_LOG / 2;
+
+ hotness = (freq_weight * freq_subscore + age_weight * age_subscore);
+ if (freq_weight + age_weight)
+ hotness /= freq_weight + age_weight;
+ /*
+ * Transform it to fit in [0, DAMOS_MAX_SCORE]
+ */
+ hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE;
+
+ return hotness;
+}
+
+int damon_cold_score(struct damon_ctx *c, struct damon_region *r,
+ struct damos *s)
+{
+ int hotness = damon_hot_score(c, r, s);
+
+ /* Return coldness of the region */
+ return DAMOS_MAX_SCORE - hotness;
+}
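A quick numeric walkthrough of the scoring above, using the DAMON_LRU_SORT defaults from this diff (aggr_interval=100 ms, sample_interval=5 ms, quota weights nr_accesses=1, age=0) for a region with nr_accesses=10 and age=30 aggregation intervals, and assuming DAMOS_MAX_SCORE is 99:

    max_nr_accesses = 100000 / 5000               = 20
    freq_subscore   = 10 * 100 / 20               = 50
    age_in_sec      = 30 * 100000 / 1000000       = 3
    age_in_log      = 2 (two right-shifts of 3), then += 32  -> 34
    age_subscore    = 34 * 100 / 32 / 2           = 53
    hotness         = (1 * 50 + 0 * 53) / (1 + 0) = 50
    scaled hotness  = 50 * 99 / 100               = 49
    coldness        = 99 - 49                     = 50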
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
new file mode 100644
index 000000000000..18d837d11bce
--- /dev/null
+++ b/mm/damon/ops-common.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common Primitives for Data Access Monitoring
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#include <linux/damon.h>
+
+struct folio *damon_get_folio(unsigned long pfn);
+
+void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr);
+void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr);
+
+int damon_cold_score(struct damon_ctx *c, struct damon_region *r,
+ struct damos *s);
+int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
+ struct damos *s);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
new file mode 100644
index 000000000000..40801e38fcf0
--- /dev/null
+++ b/mm/damon/paddr.c
@@ -0,0 +1,351 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DAMON Primitives for The Physical Address Space
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#define pr_fmt(fmt) "damon-pa: " fmt
+
+#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+
+#include "../internal.h"
+#include "ops-common.h"
+
+static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma,
+ unsigned long addr, void *arg)
+{
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
+
+ while (page_vma_mapped_walk(&pvmw)) {
+ addr = pvmw.address;
+ if (pvmw.pte)
+ damon_ptep_mkold(pvmw.pte, vma, addr);
+ else
+ damon_pmdp_mkold(pvmw.pmd, vma, addr);
+ }
+ return true;
+}
+
+static void damon_pa_mkold(unsigned long paddr)
+{
+ struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
+ struct rmap_walk_control rwc = {
+ .rmap_one = __damon_pa_mkold,
+ .anon_lock = folio_lock_anon_vma_read,
+ };
+ bool need_lock;
+
+ if (!folio)
+ return;
+
+ if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
+ folio_set_idle(folio);
+ goto out;
+ }
+
+ need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
+ if (need_lock && !folio_trylock(folio))
+ goto out;
+
+ rmap_walk(folio, &rwc);
+
+ if (need_lock)
+ folio_unlock(folio);
+
+out:
+ folio_put(folio);
+}
+
+static void __damon_pa_prepare_access_check(struct damon_region *r)
+{
+ r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
+
+ damon_pa_mkold(r->sampling_addr);
+}
+
+static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+
+ damon_for_each_target(t, ctx) {
+ damon_for_each_region(r, t)
+ __damon_pa_prepare_access_check(r);
+ }
+}
+
+static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma,
+ unsigned long addr, void *arg)
+{
+ bool *accessed = arg;
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
+
+ *accessed = false;
+ while (page_vma_mapped_walk(&pvmw)) {
+ addr = pvmw.address;
+ if (pvmw.pte) {
+ *accessed = pte_young(ptep_get(pvmw.pte)) ||
+ !folio_test_idle(folio) ||
+ mmu_notifier_test_young(vma->vm_mm, addr);
+ } else {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ *accessed = pmd_young(*pvmw.pmd) ||
+ !folio_test_idle(folio) ||
+ mmu_notifier_test_young(vma->vm_mm, addr);
+#else
+ WARN_ON_ONCE(1);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ }
+ if (*accessed) {
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ }
+
+ /* If accessed, stop walking */
+ return *accessed == false;
+}
+
+static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
+{
+ struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
+ bool accessed = false;
+ struct rmap_walk_control rwc = {
+ .arg = &accessed,
+ .rmap_one = __damon_pa_young,
+ .anon_lock = folio_lock_anon_vma_read,
+ };
+ bool need_lock;
+
+ if (!folio)
+ return false;
+
+ if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
+ if (folio_test_idle(folio))
+ accessed = false;
+ else
+ accessed = true;
+ goto out;
+ }
+
+ need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
+ if (need_lock && !folio_trylock(folio))
+ goto out;
+
+ rmap_walk(folio, &rwc);
+
+ if (need_lock)
+ folio_unlock(folio);
+
+out:
+ *folio_sz = folio_size(folio);
+ folio_put(folio);
+ return accessed;
+}
+
+static void __damon_pa_check_access(struct damon_region *r)
+{
+ static unsigned long last_addr;
+ static unsigned long last_folio_sz = PAGE_SIZE;
+ static bool last_accessed;
+
+ /* If the region is in the last checked page, reuse the result */
+ if (ALIGN_DOWN(last_addr, last_folio_sz) ==
+ ALIGN_DOWN(r->sampling_addr, last_folio_sz)) {
+ if (last_accessed)
+ r->nr_accesses++;
+ return;
+ }
+
+ last_accessed = damon_pa_young(r->sampling_addr, &last_folio_sz);
+ if (last_accessed)
+ r->nr_accesses++;
+
+ last_addr = r->sampling_addr;
+}
+
+static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ unsigned int max_nr_accesses = 0;
+
+ damon_for_each_target(t, ctx) {
+ damon_for_each_region(r, t) {
+ __damon_pa_check_access(r);
+ max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
+ }
+ }
+
+ return max_nr_accesses;
+}
+
+static bool __damos_pa_filter_out(struct damos_filter *filter,
+ struct folio *folio)
+{
+ bool matched = false;
+ struct mem_cgroup *memcg;
+
+ switch (filter->type) {
+ case DAMOS_FILTER_TYPE_ANON:
+ matched = folio_test_anon(folio);
+ break;
+ case DAMOS_FILTER_TYPE_MEMCG:
+ rcu_read_lock();
+ memcg = folio_memcg_check(folio);
+ if (!memcg)
+ matched = false;
+ else
+ matched = filter->memcg_id == mem_cgroup_id(memcg);
+ rcu_read_unlock();
+ break;
+ default:
+ break;
+ }
+
+ return matched == filter->matching;
+}
+
+/*
+ * damos_pa_filter_out - Return true if the folio should be filtered out.
+ */
+static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio)
+{
+ struct damos_filter *filter;
+
+ damos_for_each_filter(filter, scheme) {
+ if (__damos_pa_filter_out(filter, folio))
+ return true;
+ }
+ return false;
+}
+
+static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
+{
+ unsigned long addr, applied;
+ LIST_HEAD(folio_list);
+
+ for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
+ struct folio *folio = damon_get_folio(PHYS_PFN(addr));
+
+ if (!folio)
+ continue;
+
+ if (damos_pa_filter_out(s, folio))
+ goto put_folio;
+
+ folio_clear_referenced(folio);
+ folio_test_clear_young(folio);
+ if (!folio_isolate_lru(folio))
+ goto put_folio;
+ if (folio_test_unevictable(folio))
+ folio_putback_lru(folio);
+ else
+ list_add(&folio->lru, &folio_list);
+put_folio:
+ folio_put(folio);
+ }
+ applied = reclaim_pages(&folio_list);
+ cond_resched();
+ return applied * PAGE_SIZE;
+}
+
+static inline unsigned long damon_pa_mark_accessed_or_deactivate(
+ struct damon_region *r, struct damos *s, bool mark_accessed)
+{
+ unsigned long addr, applied = 0;
+
+ for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
+ struct folio *folio = damon_get_folio(PHYS_PFN(addr));
+
+ if (!folio)
+ continue;
+
+ if (damos_pa_filter_out(s, folio))
+ goto put_folio;
+
+ if (mark_accessed)
+ folio_mark_accessed(folio);
+ else
+ folio_deactivate(folio);
+ applied += folio_nr_pages(folio);
+put_folio:
+ folio_put(folio);
+ }
+ return applied * PAGE_SIZE;
+}
+
+static unsigned long damon_pa_mark_accessed(struct damon_region *r,
+ struct damos *s)
+{
+ return damon_pa_mark_accessed_or_deactivate(r, s, true);
+}
+
+static unsigned long damon_pa_deactivate_pages(struct damon_region *r,
+ struct damos *s)
+{
+ return damon_pa_mark_accessed_or_deactivate(r, s, false);
+}
+
+static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
+ struct damon_target *t, struct damon_region *r,
+ struct damos *scheme)
+{
+ switch (scheme->action) {
+ case DAMOS_PAGEOUT:
+ return damon_pa_pageout(r, scheme);
+ case DAMOS_LRU_PRIO:
+ return damon_pa_mark_accessed(r, scheme);
+ case DAMOS_LRU_DEPRIO:
+ return damon_pa_deactivate_pages(r, scheme);
+ case DAMOS_STAT:
+ break;
+ default:
+		/* DAMOS actions that are not yet supported by 'paddr'. */
+ break;
+ }
+ return 0;
+}
+
+static int damon_pa_scheme_score(struct damon_ctx *context,
+ struct damon_target *t, struct damon_region *r,
+ struct damos *scheme)
+{
+ switch (scheme->action) {
+ case DAMOS_PAGEOUT:
+ return damon_cold_score(context, r, scheme);
+ case DAMOS_LRU_PRIO:
+ return damon_hot_score(context, r, scheme);
+ case DAMOS_LRU_DEPRIO:
+ return damon_cold_score(context, r, scheme);
+ default:
+ break;
+ }
+
+ return DAMOS_MAX_SCORE;
+}
+
+static int __init damon_pa_initcall(void)
+{
+ struct damon_operations ops = {
+ .id = DAMON_OPS_PADDR,
+ .init = NULL,
+ .update = NULL,
+ .prepare_access_checks = damon_pa_prepare_access_checks,
+ .check_accesses = damon_pa_check_accesses,
+ .reset_aggregated = NULL,
+ .target_valid = NULL,
+ .cleanup = NULL,
+ .apply_scheme = damon_pa_apply_scheme,
+ .get_scheme_score = damon_pa_scheme_score,
+ };
+
+ return damon_register_ops(&ops);
+};
+
+subsys_initcall(damon_pa_initcall);
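The per-filter decision above reduces to a small truth table: a folio is filtered out when its match against the filter's type equals the filter's 'matching' flag. A minimal user-space sketch of the same logic, using hypothetical stand-in types instead of the kernel's struct damos_filter and struct folio, is:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for the relevant damos_filter fields. */
struct toy_filter {
	bool matching;				/* exclude matching (true) or non-matching (false) folios */
	bool (*match)(bool folio_is_anon);	/* the filter type's predicate */
};

static bool match_anon(bool folio_is_anon)
{
	return folio_is_anon;
}

/* Mirrors __damos_pa_filter_out(): filter out when matched == matching. */
static bool filter_out(const struct toy_filter *f, bool folio_is_anon)
{
	return f->match(folio_is_anon) == f->matching;
}

int main(void)
{
	struct toy_filter skip_anon = { .matching = true, .match = match_anon };
	struct toy_filter keep_only_anon = { .matching = false, .match = match_anon };

	printf("skip_anon: anon=%d file=%d\n",
	       filter_out(&skip_anon, true), filter_out(&skip_anon, false));		/* 1 0 */
	printf("keep_only_anon: anon=%d file=%d\n",
	       filter_out(&keep_only_anon, true), filter_out(&keep_only_anon, false));	/* 0 1 */
	return 0;
}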
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
new file mode 100644
index 000000000000..648d2a85523a
--- /dev/null
+++ b/mm/damon/reclaim.c
@@ -0,0 +1,284 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DAMON-based page reclamation
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#define pr_fmt(fmt) "damon-reclaim: " fmt
+
+#include <linux/damon.h>
+#include <linux/kstrtox.h>
+#include <linux/module.h>
+
+#include "modules-common.h"
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "damon_reclaim."
+
+/*
+ * Enable or disable DAMON_RECLAIM.
+ *
+ * You can enable DAMON_RECLAIM by setting the value of this parameter as
+ * ``Y``.  Setting it as ``N`` disables DAMON_RECLAIM.  Note that DAMON_RECLAIM
+ * could do no real monitoring and reclamation due to the watermarks-based
+ * activation condition.  Refer to the descriptions of the watermarks
+ * parameters below for more details.
+ */
+static bool enabled __read_mostly;
+
+/*
+ * Make DAMON_RECLAIM read the input parameters again, except ``enabled``.
+ *
+ * Input parameters that are updated while DAMON_RECLAIM is running are not
+ * applied by default.  Once this parameter is set as ``Y``, DAMON_RECLAIM
+ * reads the values of parameters except ``enabled`` again.  Once the
+ * re-reading is done, this parameter is set as ``N``.  If invalid parameters
+ * are found during the re-reading, DAMON_RECLAIM will be disabled.
+ */
+static bool commit_inputs __read_mostly;
+module_param(commit_inputs, bool, 0600);
+
+/*
+ * Time threshold for cold memory region identification, in microseconds.
+ *
+ * If a memory region is not accessed for this or a longer time, DAMON_RECLAIM
+ * identifies the region as cold and reclaims it.  120 seconds by default.
+ */
+static unsigned long min_age __read_mostly = 120000000;
+module_param(min_age, ulong, 0600);
+
+static struct damos_quota damon_reclaim_quota = {
+ /* use up to 10 ms time, reclaim up to 128 MiB per 1 sec by default */
+ .ms = 10,
+ .sz = 128 * 1024 * 1024,
+ .reset_interval = 1000,
+ /* Within the quota, page out older regions first. */
+ .weight_sz = 0,
+ .weight_nr_accesses = 0,
+ .weight_age = 1
+};
+DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_reclaim_quota);
+
+static struct damos_watermarks damon_reclaim_wmarks = {
+ .metric = DAMOS_WMARK_FREE_MEM_RATE,
+ .interval = 5000000, /* 5 seconds */
+ .high = 500, /* 50 percent */
+ .mid = 400, /* 40 percent */
+ .low = 200, /* 20 percent */
+};
+DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_reclaim_wmarks);
+
+static struct damon_attrs damon_reclaim_mon_attrs = {
+ .sample_interval = 5000, /* 5 ms */
+ .aggr_interval = 100000, /* 100 ms */
+ .ops_update_interval = 0,
+ .min_nr_regions = 10,
+ .max_nr_regions = 1000,
+};
+DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_reclaim_mon_attrs);
+
+/*
+ * Start of the target memory region in physical address.
+ *
+ * The start physical address of the memory region that DAMON_RECLAIM will
+ * work against.  By default, the biggest System RAM is used as the region.
+ */
+static unsigned long monitor_region_start __read_mostly;
+module_param(monitor_region_start, ulong, 0600);
+
+/*
+ * End of the target memory region in physical address.
+ *
+ * The end physical address of the memory region that DAMON_RECLAIM will
+ * work against.  By default, the biggest System RAM is used as the region.
+ */
+static unsigned long monitor_region_end __read_mostly;
+module_param(monitor_region_end, ulong, 0600);
+
+/*
+ * Skip reclamation of anonymous pages.
+ *
+ * If this parameter is set as ``Y``, DAMON_RECLAIM does not reclaim anonymous
+ * pages. By default, ``N``.
+ */
+static bool skip_anon __read_mostly;
+module_param(skip_anon, bool, 0600);
+
+/*
+ * PID of the DAMON thread
+ *
+ * If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread.
+ * Else, -1.
+ */
+static int kdamond_pid __read_mostly = -1;
+module_param(kdamond_pid, int, 0400);
+
+static struct damos_stat damon_reclaim_stat;
+DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_reclaim_stat,
+ reclaim_tried_regions, reclaimed_regions, quota_exceeds);
+
+static struct damon_ctx *ctx;
+static struct damon_target *target;
+
+static struct damos *damon_reclaim_new_scheme(void)
+{
+ struct damos_access_pattern pattern = {
+ /* Find regions having PAGE_SIZE or larger size */
+ .min_sz_region = PAGE_SIZE,
+ .max_sz_region = ULONG_MAX,
+ /* and not accessed at all */
+ .min_nr_accesses = 0,
+ .max_nr_accesses = 0,
+		/* for min_age or more microseconds */
+ .min_age_region = min_age /
+ damon_reclaim_mon_attrs.aggr_interval,
+ .max_age_region = UINT_MAX,
+ };
+
+ return damon_new_scheme(
+ &pattern,
+ /* page out those, as soon as found */
+ DAMOS_PAGEOUT,
+ /* under the quota. */
+ &damon_reclaim_quota,
+ /* (De)activate this according to the watermarks. */
+ &damon_reclaim_wmarks);
+}
+
+static int damon_reclaim_apply_parameters(void)
+{
+ struct damos *scheme;
+ struct damos_filter *filter;
+ int err = 0;
+
+ err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs);
+ if (err)
+ return err;
+
+ /* Will be freed by next 'damon_set_schemes()' below */
+ scheme = damon_reclaim_new_scheme();
+ if (!scheme)
+ return -ENOMEM;
+ if (skip_anon) {
+ filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
+ if (!filter) {
+			/* Not yet given to the context; destroy it directly */
+ damon_destroy_scheme(scheme);
+ return -ENOMEM;
+ }
+ damos_add_filter(scheme, filter);
+ }
+ damon_set_schemes(ctx, &scheme, 1);
+
+ return damon_set_region_biggest_system_ram_default(target,
+ &monitor_region_start,
+ &monitor_region_end);
+}
+
+static int damon_reclaim_turn(bool on)
+{
+ int err;
+
+ if (!on) {
+ err = damon_stop(&ctx, 1);
+ if (!err)
+ kdamond_pid = -1;
+ return err;
+ }
+
+ err = damon_reclaim_apply_parameters();
+ if (err)
+ return err;
+
+ err = damon_start(&ctx, 1, true);
+ if (err)
+ return err;
+ kdamond_pid = ctx->kdamond->pid;
+ return 0;
+}
+
+static int damon_reclaim_enabled_store(const char *val,
+ const struct kernel_param *kp)
+{
+ bool is_enabled = enabled;
+ bool enable;
+ int err;
+
+ err = kstrtobool(val, &enable);
+ if (err)
+ return err;
+
+ if (is_enabled == enable)
+ return 0;
+
+ /* Called before init function. The function will handle this. */
+ if (!ctx)
+ goto set_param_out;
+
+ err = damon_reclaim_turn(enable);
+ if (err)
+ return err;
+
+set_param_out:
+ enabled = enable;
+ return err;
+}
+
+static const struct kernel_param_ops enabled_param_ops = {
+ .set = damon_reclaim_enabled_store,
+ .get = param_get_bool,
+};
+
+module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
+MODULE_PARM_DESC(enabled,
+ "Enable or disable DAMON_RECLAIM (default: disabled)");
+
+static int damon_reclaim_handle_commit_inputs(void)
+{
+ int err;
+
+ if (!commit_inputs)
+ return 0;
+
+ err = damon_reclaim_apply_parameters();
+ commit_inputs = false;
+ return err;
+}
+
+static int damon_reclaim_after_aggregation(struct damon_ctx *c)
+{
+ struct damos *s;
+
+ /* update the stats parameter */
+ damon_for_each_scheme(s, c)
+ damon_reclaim_stat = s->stat;
+
+ return damon_reclaim_handle_commit_inputs();
+}
+
+static int damon_reclaim_after_wmarks_check(struct damon_ctx *c)
+{
+ return damon_reclaim_handle_commit_inputs();
+}
+
+static int __init damon_reclaim_init(void)
+{
+ int err = damon_modules_new_paddr_ctx_target(&ctx, &target);
+
+ if (err)
+ return err;
+
+ ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check;
+ ctx->callback.after_aggregation = damon_reclaim_after_aggregation;
+
+	/* 'enabled' may already have been set, e.g. via the command line */
+ if (enabled)
+ err = damon_reclaim_turn(true);
+
+ return err;
+}
+
+module_init(damon_reclaim_init);
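damon_reclaim_new_scheme() above converts the microsecond 'min_age' threshold into a minimum region age counted in aggregation intervals. With the defaults in this file (min_age = 120000000 us and aggr_interval = 100000 us), the scheme therefore targets regions that stayed unaccessed for at least 1200 aggregation intervals. A small stand-alone sketch of the same arithmetic, assuming those default values:

#include <stdio.h>

int main(void)
{
	/* Defaults from damon_reclaim_mon_attrs and the min_age parameter above. */
	unsigned long min_age_us = 120000000;		/* 120 seconds */
	unsigned long aggr_interval_us = 100000;	/* 100 ms */

	/* Same conversion as in damon_reclaim_new_scheme(). */
	unsigned long min_age_region = min_age_us / aggr_interval_us;

	printf("min_age_region = %lu aggregation intervals\n", min_age_region);	/* 1200 */
	return 0;
}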
diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c
new file mode 100644
index 000000000000..70edf45c2174
--- /dev/null
+++ b/mm/damon/sysfs-common.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common Primitives for DAMON Sysfs Interface
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#include <linux/slab.h>
+
+#include "sysfs-common.h"
+
+DEFINE_MUTEX(damon_sysfs_lock);
+
+/*
+ * unsigned long range directory
+ */
+
+struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc(
+ unsigned long min,
+ unsigned long max)
+{
+ struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range),
+ GFP_KERNEL);
+
+ if (!range)
+ return NULL;
+ range->kobj = (struct kobject){};
+ range->min = min;
+ range->max = max;
+
+ return range;
+}
+
+static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_ul_range *range = container_of(kobj,
+ struct damon_sysfs_ul_range, kobj);
+
+ return sysfs_emit(buf, "%lu\n", range->min);
+}
+
+static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_ul_range *range = container_of(kobj,
+ struct damon_sysfs_ul_range, kobj);
+ unsigned long min;
+ int err;
+
+ err = kstrtoul(buf, 0, &min);
+ if (err)
+ return err;
+
+ range->min = min;
+ return count;
+}
+
+static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_ul_range *range = container_of(kobj,
+ struct damon_sysfs_ul_range, kobj);
+
+ return sysfs_emit(buf, "%lu\n", range->max);
+}
+
+static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_ul_range *range = container_of(kobj,
+ struct damon_sysfs_ul_range, kobj);
+ unsigned long max;
+ int err;
+
+ err = kstrtoul(buf, 0, &max);
+ if (err)
+ return err;
+
+ range->max = max;
+ return count;
+}
+
+void damon_sysfs_ul_range_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_ul_range_min_attr =
+ __ATTR_RW_MODE(min, 0600);
+
+static struct kobj_attribute damon_sysfs_ul_range_max_attr =
+ __ATTR_RW_MODE(max, 0600);
+
+static struct attribute *damon_sysfs_ul_range_attrs[] = {
+ &damon_sysfs_ul_range_min_attr.attr,
+ &damon_sysfs_ul_range_max_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_ul_range);
+
+const struct kobj_type damon_sysfs_ul_range_ktype = {
+ .release = damon_sysfs_ul_range_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_ul_range_groups,
+};
+
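Every show/store handler in these sysfs files recovers its enclosing structure from the embedded kobject via container_of(). A minimal user-space illustration of that idiom, with a simplified macro and a toy structure in place of the kernel types:

#include <stddef.h>
#include <stdio.h>

/* Simplified version of the kernel's container_of() macro. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct toy_kobj { const char *name; };

/* Shaped like damon_sysfs_ul_range: a kobject embedded in a larger struct. */
struct toy_range {
	struct toy_kobj kobj;
	unsigned long min, max;
};

static unsigned long show_min(struct toy_kobj *kobj)
{
	struct toy_range *range = container_of(kobj, struct toy_range, kobj);

	return range->min;
}

int main(void)
{
	struct toy_range r = { .kobj = { "0" }, .min = 4096, .max = 1UL << 30 };

	printf("%lu\n", show_min(&r.kobj));	/* prints 4096 */
	return 0;
}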
diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h
new file mode 100644
index 000000000000..db677eba78fd
--- /dev/null
+++ b/mm/damon/sysfs-common.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common Primitives for DAMON Sysfs Interface
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#include <linux/damon.h>
+#include <linux/kobject.h>
+
+extern struct mutex damon_sysfs_lock;
+
+struct damon_sysfs_ul_range {
+ struct kobject kobj;
+ unsigned long min;
+ unsigned long max;
+};
+
+struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc(
+ unsigned long min,
+ unsigned long max);
+void damon_sysfs_ul_range_release(struct kobject *kobj);
+
+extern const struct kobj_type damon_sysfs_ul_range_ktype;
+
+/*
+ * schemes directory
+ */
+
+struct damon_sysfs_schemes {
+ struct kobject kobj;
+ struct damon_sysfs_scheme **schemes_arr;
+ int nr;
+};
+
+struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void);
+void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes);
+
+extern const struct kobj_type damon_sysfs_schemes_ktype;
+
+int damon_sysfs_set_schemes(struct damon_ctx *ctx,
+ struct damon_sysfs_schemes *sysfs_schemes);
+
+void damon_sysfs_schemes_update_stats(
+ struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx);
+
+int damon_sysfs_schemes_update_regions_start(
+ struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx);
+
+int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx);
+
+int damon_sysfs_schemes_clear_regions(
+ struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx);
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
new file mode 100644
index 000000000000..50cf89dcd898
--- /dev/null
+++ b/mm/damon/sysfs-schemes.c
@@ -0,0 +1,1707 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DAMON sysfs Interface
+ *
+ * Copyright (c) 2022 SeongJae Park <sj@kernel.org>
+ */
+
+#include <linux/slab.h>
+
+#include "sysfs-common.h"
+
+/*
+ * scheme region directory
+ */
+
+struct damon_sysfs_scheme_region {
+ struct kobject kobj;
+ struct damon_addr_range ar;
+ unsigned int nr_accesses;
+ unsigned int age;
+ struct list_head list;
+};
+
+static struct damon_sysfs_scheme_region *damon_sysfs_scheme_region_alloc(
+ struct damon_region *region)
+{
+ struct damon_sysfs_scheme_region *sysfs_region = kmalloc(
+ sizeof(*sysfs_region), GFP_KERNEL);
+
+ if (!sysfs_region)
+ return NULL;
+ sysfs_region->kobj = (struct kobject){};
+ sysfs_region->ar = region->ar;
+ sysfs_region->nr_accesses = region->nr_accesses;
+ sysfs_region->age = region->age;
+ INIT_LIST_HEAD(&sysfs_region->list);
+ return sysfs_region;
+}
+
+static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_scheme_region *region = container_of(kobj,
+ struct damon_sysfs_scheme_region, kobj);
+
+ return sysfs_emit(buf, "%lu\n", region->ar.start);
+}
+
+static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_scheme_region *region = container_of(kobj,
+ struct damon_sysfs_scheme_region, kobj);
+
+ return sysfs_emit(buf, "%lu\n", region->ar.end);
+}
+
+static ssize_t nr_accesses_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_region *region = container_of(kobj,
+ struct damon_sysfs_scheme_region, kobj);
+
+ return sysfs_emit(buf, "%u\n", region->nr_accesses);
+}
+
+static ssize_t age_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_scheme_region *region = container_of(kobj,
+ struct damon_sysfs_scheme_region, kobj);
+
+ return sysfs_emit(buf, "%u\n", region->age);
+}
+
+static void damon_sysfs_scheme_region_release(struct kobject *kobj)
+{
+ struct damon_sysfs_scheme_region *region = container_of(kobj,
+ struct damon_sysfs_scheme_region, kobj);
+
+ list_del(&region->list);
+ kfree(region);
+}
+
+static struct kobj_attribute damon_sysfs_scheme_region_start_attr =
+ __ATTR_RO_MODE(start, 0400);
+
+static struct kobj_attribute damon_sysfs_scheme_region_end_attr =
+ __ATTR_RO_MODE(end, 0400);
+
+static struct kobj_attribute damon_sysfs_scheme_region_nr_accesses_attr =
+ __ATTR_RO_MODE(nr_accesses, 0400);
+
+static struct kobj_attribute damon_sysfs_scheme_region_age_attr =
+ __ATTR_RO_MODE(age, 0400);
+
+static struct attribute *damon_sysfs_scheme_region_attrs[] = {
+ &damon_sysfs_scheme_region_start_attr.attr,
+ &damon_sysfs_scheme_region_end_attr.attr,
+ &damon_sysfs_scheme_region_nr_accesses_attr.attr,
+ &damon_sysfs_scheme_region_age_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_scheme_region);
+
+static const struct kobj_type damon_sysfs_scheme_region_ktype = {
+ .release = damon_sysfs_scheme_region_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_scheme_region_groups,
+};
+
+/*
+ * scheme regions directory
+ */
+
+struct damon_sysfs_scheme_regions {
+ struct kobject kobj;
+ struct list_head regions_list;
+ int nr_regions;
+};
+
+static struct damon_sysfs_scheme_regions *
+damon_sysfs_scheme_regions_alloc(void)
+{
+ struct damon_sysfs_scheme_regions *regions = kmalloc(sizeof(*regions),
+ GFP_KERNEL);
+
+	if (!regions)
+		return NULL;
+	regions->kobj = (struct kobject){};
+ INIT_LIST_HEAD(&regions->regions_list);
+ regions->nr_regions = 0;
+ return regions;
+}
+
+static void damon_sysfs_scheme_regions_rm_dirs(
+ struct damon_sysfs_scheme_regions *regions)
+{
+ struct damon_sysfs_scheme_region *r, *next;
+
+ list_for_each_entry_safe(r, next, &regions->regions_list, list) {
+ /* release function deletes it from the list */
+ kobject_put(&r->kobj);
+ regions->nr_regions--;
+ }
+}
+
+static void damon_sysfs_scheme_regions_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_scheme_regions, kobj));
+}
+
+static struct attribute *damon_sysfs_scheme_regions_attrs[] = {
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_scheme_regions);
+
+static const struct kobj_type damon_sysfs_scheme_regions_ktype = {
+ .release = damon_sysfs_scheme_regions_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_scheme_regions_groups,
+};
+
+/*
+ * schemes/stats directory
+ */
+
+struct damon_sysfs_stats {
+ struct kobject kobj;
+ unsigned long nr_tried;
+ unsigned long sz_tried;
+ unsigned long nr_applied;
+ unsigned long sz_applied;
+ unsigned long qt_exceeds;
+};
+
+static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_stats), GFP_KERNEL);
+}
+
+static ssize_t nr_tried_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+
+ return sysfs_emit(buf, "%lu\n", stats->nr_tried);
+}
+
+static ssize_t sz_tried_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+
+ return sysfs_emit(buf, "%lu\n", stats->sz_tried);
+}
+
+static ssize_t nr_applied_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+
+ return sysfs_emit(buf, "%lu\n", stats->nr_applied);
+}
+
+static ssize_t sz_applied_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+
+ return sysfs_emit(buf, "%lu\n", stats->sz_applied);
+}
+
+static ssize_t qt_exceeds_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+
+ return sysfs_emit(buf, "%lu\n", stats->qt_exceeds);
+}
+
+static void damon_sysfs_stats_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_stats, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_stats_nr_tried_attr =
+ __ATTR_RO_MODE(nr_tried, 0400);
+
+static struct kobj_attribute damon_sysfs_stats_sz_tried_attr =
+ __ATTR_RO_MODE(sz_tried, 0400);
+
+static struct kobj_attribute damon_sysfs_stats_nr_applied_attr =
+ __ATTR_RO_MODE(nr_applied, 0400);
+
+static struct kobj_attribute damon_sysfs_stats_sz_applied_attr =
+ __ATTR_RO_MODE(sz_applied, 0400);
+
+static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr =
+ __ATTR_RO_MODE(qt_exceeds, 0400);
+
+static struct attribute *damon_sysfs_stats_attrs[] = {
+ &damon_sysfs_stats_nr_tried_attr.attr,
+ &damon_sysfs_stats_sz_tried_attr.attr,
+ &damon_sysfs_stats_nr_applied_attr.attr,
+ &damon_sysfs_stats_sz_applied_attr.attr,
+ &damon_sysfs_stats_qt_exceeds_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_stats);
+
+static const struct kobj_type damon_sysfs_stats_ktype = {
+ .release = damon_sysfs_stats_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_stats_groups,
+};
+
+/*
+ * filter directory
+ */
+
+struct damon_sysfs_scheme_filter {
+ struct kobject kobj;
+ enum damos_filter_type type;
+ bool matching;
+ char *memcg_path;
+};
+
+static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_scheme_filter), GFP_KERNEL);
+}
+
+/* Should match with enum damos_filter_type */
+static const char * const damon_sysfs_scheme_filter_type_strs[] = {
+ "anon",
+ "memcg",
+};
+
+static ssize_t type_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%s\n",
+ damon_sysfs_scheme_filter_type_strs[filter->type]);
+}
+
+static ssize_t type_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ enum damos_filter_type type;
+ ssize_t ret = -EINVAL;
+
+ for (type = 0; type < NR_DAMOS_FILTER_TYPES; type++) {
+ if (sysfs_streq(buf, damon_sysfs_scheme_filter_type_strs[
+ type])) {
+ filter->type = type;
+ ret = count;
+ break;
+ }
+ }
+ return ret;
+}
+
+static ssize_t matching_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%c\n", filter->matching ? 'Y' : 'N');
+}
+
+static ssize_t matching_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ bool matching;
+ int err = kstrtobool(buf, &matching);
+
+ if (err)
+ return err;
+
+ filter->matching = matching;
+ return count;
+}
+
+static ssize_t memcg_path_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%s\n",
+ filter->memcg_path ? filter->memcg_path : "");
+}
+
+static ssize_t memcg_path_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ char *path = kmalloc(sizeof(*path) * (count + 1), GFP_KERNEL);
+
+ if (!path)
+ return -ENOMEM;
+
+ strscpy(path, buf, count + 1);
+ filter->memcg_path = path;
+ return count;
+}
+
+static void damon_sysfs_scheme_filter_release(struct kobject *kobj)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ kfree(filter->memcg_path);
+ kfree(filter);
+}
+
+static struct kobj_attribute damon_sysfs_scheme_filter_type_attr =
+ __ATTR_RW_MODE(type, 0600);
+
+static struct kobj_attribute damon_sysfs_scheme_filter_matching_attr =
+ __ATTR_RW_MODE(matching, 0600);
+
+static struct kobj_attribute damon_sysfs_scheme_filter_memcg_path_attr =
+ __ATTR_RW_MODE(memcg_path, 0600);
+
+static struct attribute *damon_sysfs_scheme_filter_attrs[] = {
+ &damon_sysfs_scheme_filter_type_attr.attr,
+ &damon_sysfs_scheme_filter_matching_attr.attr,
+ &damon_sysfs_scheme_filter_memcg_path_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_scheme_filter);
+
+static const struct kobj_type damon_sysfs_scheme_filter_ktype = {
+ .release = damon_sysfs_scheme_filter_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_scheme_filter_groups,
+};
+
+/*
+ * filters directory
+ */
+
+struct damon_sysfs_scheme_filters {
+ struct kobject kobj;
+ struct damon_sysfs_scheme_filter **filters_arr;
+ int nr;
+};
+
+static struct damon_sysfs_scheme_filters *
+damon_sysfs_scheme_filters_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL);
+}
+
+static void damon_sysfs_scheme_filters_rm_dirs(
+ struct damon_sysfs_scheme_filters *filters)
+{
+ struct damon_sysfs_scheme_filter **filters_arr = filters->filters_arr;
+ int i;
+
+ for (i = 0; i < filters->nr; i++)
+ kobject_put(&filters_arr[i]->kobj);
+ filters->nr = 0;
+ kfree(filters_arr);
+ filters->filters_arr = NULL;
+}
+
+static int damon_sysfs_scheme_filters_add_dirs(
+ struct damon_sysfs_scheme_filters *filters, int nr_filters)
+{
+ struct damon_sysfs_scheme_filter **filters_arr, *filter;
+ int err, i;
+
+ damon_sysfs_scheme_filters_rm_dirs(filters);
+ if (!nr_filters)
+ return 0;
+
+ filters_arr = kmalloc_array(nr_filters, sizeof(*filters_arr),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!filters_arr)
+ return -ENOMEM;
+ filters->filters_arr = filters_arr;
+
+ for (i = 0; i < nr_filters; i++) {
+ filter = damon_sysfs_scheme_filter_alloc();
+ if (!filter) {
+ damon_sysfs_scheme_filters_rm_dirs(filters);
+ return -ENOMEM;
+ }
+
+ err = kobject_init_and_add(&filter->kobj,
+ &damon_sysfs_scheme_filter_ktype,
+ &filters->kobj, "%d", i);
+ if (err) {
+ kobject_put(&filter->kobj);
+ damon_sysfs_scheme_filters_rm_dirs(filters);
+ return err;
+ }
+
+ filters_arr[i] = filter;
+ filters->nr++;
+ }
+ return 0;
+}
+
+static ssize_t nr_filters_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filters *filters = container_of(kobj,
+ struct damon_sysfs_scheme_filters, kobj);
+
+ return sysfs_emit(buf, "%d\n", filters->nr);
+}
+
+static ssize_t nr_filters_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filters *filters;
+ int nr, err = kstrtoint(buf, 0, &nr);
+
+ if (err)
+ return err;
+ if (nr < 0)
+ return -EINVAL;
+
+ filters = container_of(kobj, struct damon_sysfs_scheme_filters, kobj);
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ err = damon_sysfs_scheme_filters_add_dirs(filters, nr);
+ mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+
+ return count;
+}
+
+static void damon_sysfs_scheme_filters_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_scheme_filters, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_scheme_filters_nr_attr =
+ __ATTR_RW_MODE(nr_filters, 0600);
+
+static struct attribute *damon_sysfs_scheme_filters_attrs[] = {
+ &damon_sysfs_scheme_filters_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_scheme_filters);
+
+static const struct kobj_type damon_sysfs_scheme_filters_ktype = {
+ .release = damon_sysfs_scheme_filters_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_scheme_filters_groups,
+};
+
+/*
+ * watermarks directory
+ */
+
+struct damon_sysfs_watermarks {
+ struct kobject kobj;
+ enum damos_wmark_metric metric;
+ unsigned long interval_us;
+ unsigned long high;
+ unsigned long mid;
+ unsigned long low;
+};
+
+static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc(
+ enum damos_wmark_metric metric, unsigned long interval_us,
+ unsigned long high, unsigned long mid, unsigned long low)
+{
+ struct damon_sysfs_watermarks *watermarks = kmalloc(
+ sizeof(*watermarks), GFP_KERNEL);
+
+ if (!watermarks)
+ return NULL;
+ watermarks->kobj = (struct kobject){};
+ watermarks->metric = metric;
+ watermarks->interval_us = interval_us;
+ watermarks->high = high;
+ watermarks->mid = mid;
+ watermarks->low = low;
+ return watermarks;
+}
+
+/* Should match with enum damos_wmark_metric */
+static const char * const damon_sysfs_wmark_metric_strs[] = {
+ "none",
+ "free_mem_rate",
+};
+
+static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+
+ return sysfs_emit(buf, "%s\n",
+ damon_sysfs_wmark_metric_strs[watermarks->metric]);
+}
+
+static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+ enum damos_wmark_metric metric;
+
+ for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) {
+ if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) {
+ watermarks->metric = metric;
+ return count;
+ }
+ }
+ return -EINVAL;
+}
+
+static ssize_t interval_us_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+
+ return sysfs_emit(buf, "%lu\n", watermarks->interval_us);
+}
+
+static ssize_t interval_us_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+ int err = kstrtoul(buf, 0, &watermarks->interval_us);
+
+ return err ? err : count;
+}
+
+static ssize_t high_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+
+ return sysfs_emit(buf, "%lu\n", watermarks->high);
+}
+
+static ssize_t high_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+ int err = kstrtoul(buf, 0, &watermarks->high);
+
+ return err ? err : count;
+}
+
+static ssize_t mid_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+
+ return sysfs_emit(buf, "%lu\n", watermarks->mid);
+}
+
+static ssize_t mid_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+ int err = kstrtoul(buf, 0, &watermarks->mid);
+
+ return err ? err : count;
+}
+
+static ssize_t low_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+
+ return sysfs_emit(buf, "%lu\n", watermarks->low);
+}
+
+static ssize_t low_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+ int err = kstrtoul(buf, 0, &watermarks->low);
+
+ return err ? err : count;
+}
+
+static void damon_sysfs_watermarks_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_watermarks, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_watermarks_metric_attr =
+ __ATTR_RW_MODE(metric, 0600);
+
+static struct kobj_attribute damon_sysfs_watermarks_interval_us_attr =
+ __ATTR_RW_MODE(interval_us, 0600);
+
+static struct kobj_attribute damon_sysfs_watermarks_high_attr =
+ __ATTR_RW_MODE(high, 0600);
+
+static struct kobj_attribute damon_sysfs_watermarks_mid_attr =
+ __ATTR_RW_MODE(mid, 0600);
+
+static struct kobj_attribute damon_sysfs_watermarks_low_attr =
+ __ATTR_RW_MODE(low, 0600);
+
+static struct attribute *damon_sysfs_watermarks_attrs[] = {
+ &damon_sysfs_watermarks_metric_attr.attr,
+ &damon_sysfs_watermarks_interval_us_attr.attr,
+ &damon_sysfs_watermarks_high_attr.attr,
+ &damon_sysfs_watermarks_mid_attr.attr,
+ &damon_sysfs_watermarks_low_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_watermarks);
+
+static const struct kobj_type damon_sysfs_watermarks_ktype = {
+ .release = damon_sysfs_watermarks_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_watermarks_groups,
+};
+
+/*
+ * scheme/weights directory
+ */
+
+struct damon_sysfs_weights {
+ struct kobject kobj;
+ unsigned int sz;
+ unsigned int nr_accesses;
+ unsigned int age;
+};
+
+static struct damon_sysfs_weights *damon_sysfs_weights_alloc(unsigned int sz,
+ unsigned int nr_accesses, unsigned int age)
+{
+ struct damon_sysfs_weights *weights = kmalloc(sizeof(*weights),
+ GFP_KERNEL);
+
+ if (!weights)
+ return NULL;
+ weights->kobj = (struct kobject){};
+ weights->sz = sz;
+ weights->nr_accesses = nr_accesses;
+ weights->age = age;
+ return weights;
+}
+
+static ssize_t sz_permil_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_weights *weights = container_of(kobj,
+ struct damon_sysfs_weights, kobj);
+
+ return sysfs_emit(buf, "%u\n", weights->sz);
+}
+
+static ssize_t sz_permil_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_weights *weights = container_of(kobj,
+ struct damon_sysfs_weights, kobj);
+ int err = kstrtouint(buf, 0, &weights->sz);
+
+ return err ? err : count;
+}
+
+static ssize_t nr_accesses_permil_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_weights *weights = container_of(kobj,
+ struct damon_sysfs_weights, kobj);
+
+ return sysfs_emit(buf, "%u\n", weights->nr_accesses);
+}
+
+static ssize_t nr_accesses_permil_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_weights *weights = container_of(kobj,
+ struct damon_sysfs_weights, kobj);
+ int err = kstrtouint(buf, 0, &weights->nr_accesses);
+
+ return err ? err : count;
+}
+
+static ssize_t age_permil_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_weights *weights = container_of(kobj,
+ struct damon_sysfs_weights, kobj);
+
+ return sysfs_emit(buf, "%u\n", weights->age);
+}
+
+static ssize_t age_permil_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_weights *weights = container_of(kobj,
+ struct damon_sysfs_weights, kobj);
+ int err = kstrtouint(buf, 0, &weights->age);
+
+ return err ? err : count;
+}
+
+static void damon_sysfs_weights_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_weights, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_weights_sz_attr =
+ __ATTR_RW_MODE(sz_permil, 0600);
+
+static struct kobj_attribute damon_sysfs_weights_nr_accesses_attr =
+ __ATTR_RW_MODE(nr_accesses_permil, 0600);
+
+static struct kobj_attribute damon_sysfs_weights_age_attr =
+ __ATTR_RW_MODE(age_permil, 0600);
+
+static struct attribute *damon_sysfs_weights_attrs[] = {
+ &damon_sysfs_weights_sz_attr.attr,
+ &damon_sysfs_weights_nr_accesses_attr.attr,
+ &damon_sysfs_weights_age_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_weights);
+
+static const struct kobj_type damon_sysfs_weights_ktype = {
+ .release = damon_sysfs_weights_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_weights_groups,
+};
+
+/*
+ * quotas directory
+ */
+
+struct damon_sysfs_quotas {
+ struct kobject kobj;
+ struct damon_sysfs_weights *weights;
+ unsigned long ms;
+ unsigned long sz;
+ unsigned long reset_interval_ms;
+};
+
+static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL);
+}
+
+static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas)
+{
+ struct damon_sysfs_weights *weights;
+ int err;
+
+ weights = damon_sysfs_weights_alloc(0, 0, 0);
+ if (!weights)
+ return -ENOMEM;
+
+ err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype,
+ &quotas->kobj, "weights");
+ if (err)
+ kobject_put(&weights->kobj);
+ else
+ quotas->weights = weights;
+ return err;
+}
+
+static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas)
+{
+ kobject_put(&quotas->weights->kobj);
+}
+
+static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+
+ return sysfs_emit(buf, "%lu\n", quotas->ms);
+}
+
+static ssize_t ms_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+ int err = kstrtoul(buf, 0, &quotas->ms);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t bytes_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+
+ return sysfs_emit(buf, "%lu\n", quotas->sz);
+}
+
+static ssize_t bytes_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+ int err = kstrtoul(buf, 0, &quotas->sz);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t reset_interval_ms_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+
+ return sysfs_emit(buf, "%lu\n", quotas->reset_interval_ms);
+}
+
+static ssize_t reset_interval_ms_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+ int err = kstrtoul(buf, 0, &quotas->reset_interval_ms);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static void damon_sysfs_quotas_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_quotas, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_quotas_ms_attr =
+ __ATTR_RW_MODE(ms, 0600);
+
+static struct kobj_attribute damon_sysfs_quotas_sz_attr =
+ __ATTR_RW_MODE(bytes, 0600);
+
+static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr =
+ __ATTR_RW_MODE(reset_interval_ms, 0600);
+
+static struct attribute *damon_sysfs_quotas_attrs[] = {
+ &damon_sysfs_quotas_ms_attr.attr,
+ &damon_sysfs_quotas_sz_attr.attr,
+ &damon_sysfs_quotas_reset_interval_ms_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_quotas);
+
+static const struct kobj_type damon_sysfs_quotas_ktype = {
+ .release = damon_sysfs_quotas_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_quotas_groups,
+};
+
+/*
+ * access_pattern directory
+ */
+
+struct damon_sysfs_access_pattern {
+ struct kobject kobj;
+ struct damon_sysfs_ul_range *sz;
+ struct damon_sysfs_ul_range *nr_accesses;
+ struct damon_sysfs_ul_range *age;
+};
+
+static
+struct damon_sysfs_access_pattern *damon_sysfs_access_pattern_alloc(void)
+{
+ struct damon_sysfs_access_pattern *access_pattern =
+ kmalloc(sizeof(*access_pattern), GFP_KERNEL);
+
+ if (!access_pattern)
+ return NULL;
+ access_pattern->kobj = (struct kobject){};
+ return access_pattern;
+}
+
+static int damon_sysfs_access_pattern_add_range_dir(
+ struct damon_sysfs_access_pattern *access_pattern,
+ struct damon_sysfs_ul_range **range_dir_ptr,
+ char *name)
+{
+ struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0);
+ int err;
+
+ if (!range)
+ return -ENOMEM;
+ err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype,
+ &access_pattern->kobj, name);
+ if (err)
+ kobject_put(&range->kobj);
+ else
+ *range_dir_ptr = range;
+ return err;
+}
+
+static int damon_sysfs_access_pattern_add_dirs(
+ struct damon_sysfs_access_pattern *access_pattern)
+{
+ int err;
+
+ err = damon_sysfs_access_pattern_add_range_dir(access_pattern,
+ &access_pattern->sz, "sz");
+ if (err)
+ goto put_sz_out;
+
+ err = damon_sysfs_access_pattern_add_range_dir(access_pattern,
+ &access_pattern->nr_accesses, "nr_accesses");
+ if (err)
+ goto put_nr_accesses_sz_out;
+
+ err = damon_sysfs_access_pattern_add_range_dir(access_pattern,
+ &access_pattern->age, "age");
+ if (err)
+ goto put_age_nr_accesses_sz_out;
+ return 0;
+
+put_age_nr_accesses_sz_out:
+ kobject_put(&access_pattern->age->kobj);
+ access_pattern->age = NULL;
+put_nr_accesses_sz_out:
+ kobject_put(&access_pattern->nr_accesses->kobj);
+ access_pattern->nr_accesses = NULL;
+put_sz_out:
+ kobject_put(&access_pattern->sz->kobj);
+ access_pattern->sz = NULL;
+ return err;
+}
+
+static void damon_sysfs_access_pattern_rm_dirs(
+ struct damon_sysfs_access_pattern *access_pattern)
+{
+ kobject_put(&access_pattern->sz->kobj);
+ kobject_put(&access_pattern->nr_accesses->kobj);
+ kobject_put(&access_pattern->age->kobj);
+}
+
+static void damon_sysfs_access_pattern_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_access_pattern, kobj));
+}
+
+static struct attribute *damon_sysfs_access_pattern_attrs[] = {
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_access_pattern);
+
+static const struct kobj_type damon_sysfs_access_pattern_ktype = {
+ .release = damon_sysfs_access_pattern_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_access_pattern_groups,
+};
+
+/*
+ * scheme directory
+ */
+
+struct damon_sysfs_scheme {
+ struct kobject kobj;
+ enum damos_action action;
+ struct damon_sysfs_access_pattern *access_pattern;
+ struct damon_sysfs_quotas *quotas;
+ struct damon_sysfs_watermarks *watermarks;
+ struct damon_sysfs_scheme_filters *filters;
+ struct damon_sysfs_stats *stats;
+ struct damon_sysfs_scheme_regions *tried_regions;
+};
+
+/* This should match with enum damos_action */
+static const char * const damon_sysfs_damos_action_strs[] = {
+ "willneed",
+ "cold",
+ "pageout",
+ "hugepage",
+ "nohugepage",
+ "lru_prio",
+ "lru_deprio",
+ "stat",
+};
+
+static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc(
+ enum damos_action action)
+{
+ struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme),
+ GFP_KERNEL);
+
+ if (!scheme)
+ return NULL;
+ scheme->kobj = (struct kobject){};
+ scheme->action = action;
+ return scheme;
+}
+
+static int damon_sysfs_scheme_set_access_pattern(
+ struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_access_pattern *access_pattern;
+ int err;
+
+ access_pattern = damon_sysfs_access_pattern_alloc();
+ if (!access_pattern)
+ return -ENOMEM;
+ err = kobject_init_and_add(&access_pattern->kobj,
+ &damon_sysfs_access_pattern_ktype, &scheme->kobj,
+ "access_pattern");
+ if (err)
+ goto out;
+ err = damon_sysfs_access_pattern_add_dirs(access_pattern);
+ if (err)
+ goto out;
+ scheme->access_pattern = access_pattern;
+ return 0;
+
+out:
+ kobject_put(&access_pattern->kobj);
+ return err;
+}
+
+static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc();
+ int err;
+
+ if (!quotas)
+ return -ENOMEM;
+ err = kobject_init_and_add(&quotas->kobj, &damon_sysfs_quotas_ktype,
+ &scheme->kobj, "quotas");
+ if (err)
+ goto out;
+ err = damon_sysfs_quotas_add_dirs(quotas);
+ if (err)
+ goto out;
+ scheme->quotas = quotas;
+ return 0;
+
+out:
+ kobject_put(&quotas->kobj);
+ return err;
+}
+
+static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_watermarks *watermarks =
+ damon_sysfs_watermarks_alloc(DAMOS_WMARK_NONE, 0, 0, 0, 0);
+ int err;
+
+ if (!watermarks)
+ return -ENOMEM;
+ err = kobject_init_and_add(&watermarks->kobj,
+ &damon_sysfs_watermarks_ktype, &scheme->kobj,
+ "watermarks");
+ if (err)
+ kobject_put(&watermarks->kobj);
+ else
+ scheme->watermarks = watermarks;
+ return err;
+}
+
+static int damon_sysfs_scheme_set_filters(struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_scheme_filters *filters =
+ damon_sysfs_scheme_filters_alloc();
+ int err;
+
+ if (!filters)
+ return -ENOMEM;
+ err = kobject_init_and_add(&filters->kobj,
+ &damon_sysfs_scheme_filters_ktype, &scheme->kobj,
+ "filters");
+ if (err)
+ kobject_put(&filters->kobj);
+ else
+ scheme->filters = filters;
+ return err;
+}
+
+static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc();
+ int err;
+
+ if (!stats)
+ return -ENOMEM;
+ err = kobject_init_and_add(&stats->kobj, &damon_sysfs_stats_ktype,
+ &scheme->kobj, "stats");
+ if (err)
+ kobject_put(&stats->kobj);
+ else
+ scheme->stats = stats;
+ return err;
+}
+
+static int damon_sysfs_scheme_set_tried_regions(
+ struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_scheme_regions *tried_regions =
+ damon_sysfs_scheme_regions_alloc();
+ int err;
+
+ if (!tried_regions)
+ return -ENOMEM;
+ err = kobject_init_and_add(&tried_regions->kobj,
+ &damon_sysfs_scheme_regions_ktype, &scheme->kobj,
+ "tried_regions");
+ if (err)
+ kobject_put(&tried_regions->kobj);
+ else
+ scheme->tried_regions = tried_regions;
+ return err;
+}
+
+static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme)
+{
+ int err;
+
+ err = damon_sysfs_scheme_set_access_pattern(scheme);
+ if (err)
+ return err;
+ err = damon_sysfs_scheme_set_quotas(scheme);
+ if (err)
+ goto put_access_pattern_out;
+ err = damon_sysfs_scheme_set_watermarks(scheme);
+ if (err)
+ goto put_quotas_access_pattern_out;
+ err = damon_sysfs_scheme_set_filters(scheme);
+ if (err)
+ goto put_watermarks_quotas_access_pattern_out;
+ err = damon_sysfs_scheme_set_stats(scheme);
+ if (err)
+ goto put_filters_watermarks_quotas_access_pattern_out;
+ err = damon_sysfs_scheme_set_tried_regions(scheme);
+ if (err)
+ goto put_tried_regions_out;
+ return 0;
+
+put_tried_regions_out:
+ kobject_put(&scheme->tried_regions->kobj);
+ scheme->tried_regions = NULL;
+put_filters_watermarks_quotas_access_pattern_out:
+ kobject_put(&scheme->filters->kobj);
+ scheme->filters = NULL;
+put_watermarks_quotas_access_pattern_out:
+ kobject_put(&scheme->watermarks->kobj);
+ scheme->watermarks = NULL;
+put_quotas_access_pattern_out:
+ kobject_put(&scheme->quotas->kobj);
+ scheme->quotas = NULL;
+put_access_pattern_out:
+ kobject_put(&scheme->access_pattern->kobj);
+ scheme->access_pattern = NULL;
+ return err;
+}
+
+static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme)
+{
+ damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern);
+ kobject_put(&scheme->access_pattern->kobj);
+ damon_sysfs_quotas_rm_dirs(scheme->quotas);
+ kobject_put(&scheme->quotas->kobj);
+ kobject_put(&scheme->watermarks->kobj);
+ damon_sysfs_scheme_filters_rm_dirs(scheme->filters);
+ kobject_put(&scheme->filters->kobj);
+ kobject_put(&scheme->stats->kobj);
+ damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions);
+ kobject_put(&scheme->tried_regions->kobj);
+}
+
+static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_scheme *scheme = container_of(kobj,
+ struct damon_sysfs_scheme, kobj);
+
+ return sysfs_emit(buf, "%s\n",
+ damon_sysfs_damos_action_strs[scheme->action]);
+}
+
+static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme *scheme = container_of(kobj,
+ struct damon_sysfs_scheme, kobj);
+ enum damos_action action;
+
+ for (action = 0; action < NR_DAMOS_ACTIONS; action++) {
+ if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) {
+ scheme->action = action;
+ return count;
+ }
+ }
+ return -EINVAL;
+}
+
+static void damon_sysfs_scheme_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_scheme, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_scheme_action_attr =
+ __ATTR_RW_MODE(action, 0600);
+
+static struct attribute *damon_sysfs_scheme_attrs[] = {
+ &damon_sysfs_scheme_action_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_scheme);
+
+static const struct kobj_type damon_sysfs_scheme_ktype = {
+ .release = damon_sysfs_scheme_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_scheme_groups,
+};
+
+/*
+ * schemes directory
+ */
+
+struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_schemes), GFP_KERNEL);
+}
+
+void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes)
+{
+ struct damon_sysfs_scheme **schemes_arr = schemes->schemes_arr;
+ int i;
+
+ for (i = 0; i < schemes->nr; i++) {
+ damon_sysfs_scheme_rm_dirs(schemes_arr[i]);
+ kobject_put(&schemes_arr[i]->kobj);
+ }
+ schemes->nr = 0;
+ kfree(schemes_arr);
+ schemes->schemes_arr = NULL;
+}
+
+static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes,
+ int nr_schemes)
+{
+ struct damon_sysfs_scheme **schemes_arr, *scheme;
+ int err, i;
+
+ damon_sysfs_schemes_rm_dirs(schemes);
+ if (!nr_schemes)
+ return 0;
+
+ schemes_arr = kmalloc_array(nr_schemes, sizeof(*schemes_arr),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!schemes_arr)
+ return -ENOMEM;
+ schemes->schemes_arr = schemes_arr;
+
+ for (i = 0; i < nr_schemes; i++) {
+ scheme = damon_sysfs_scheme_alloc(DAMOS_STAT);
+ if (!scheme) {
+ damon_sysfs_schemes_rm_dirs(schemes);
+ return -ENOMEM;
+ }
+
+ err = kobject_init_and_add(&scheme->kobj,
+ &damon_sysfs_scheme_ktype, &schemes->kobj,
+ "%d", i);
+ if (err)
+ goto out;
+ err = damon_sysfs_scheme_add_dirs(scheme);
+ if (err)
+ goto out;
+
+ schemes_arr[i] = scheme;
+ schemes->nr++;
+ }
+ return 0;
+
+out:
+ damon_sysfs_schemes_rm_dirs(schemes);
+ kobject_put(&scheme->kobj);
+ return err;
+}
+
+static ssize_t nr_schemes_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_schemes *schemes = container_of(kobj,
+ struct damon_sysfs_schemes, kobj);
+
+ return sysfs_emit(buf, "%d\n", schemes->nr);
+}
+
+static ssize_t nr_schemes_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_schemes *schemes;
+ int nr, err = kstrtoint(buf, 0, &nr);
+
+ if (err)
+ return err;
+ if (nr < 0)
+ return -EINVAL;
+
+ schemes = container_of(kobj, struct damon_sysfs_schemes, kobj);
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ err = damon_sysfs_schemes_add_dirs(schemes, nr);
+ mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+ return count;
+}
+
+static void damon_sysfs_schemes_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_schemes, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_schemes_nr_attr =
+ __ATTR_RW_MODE(nr_schemes, 0600);
+
+static struct attribute *damon_sysfs_schemes_attrs[] = {
+ &damon_sysfs_schemes_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_schemes);
+
+const struct kobj_type damon_sysfs_schemes_ktype = {
+ .release = damon_sysfs_schemes_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_schemes_groups,
+};
+
+static bool damon_sysfs_memcg_path_eq(struct mem_cgroup *memcg,
+ char *memcg_path_buf, char *path)
+{
+#ifdef CONFIG_MEMCG
+ cgroup_path(memcg->css.cgroup, memcg_path_buf, PATH_MAX);
+ if (sysfs_streq(memcg_path_buf, path))
+ return true;
+#endif /* CONFIG_MEMCG */
+ return false;
+}
+
+static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id)
+{
+ struct mem_cgroup *memcg;
+ char *path;
+ bool found = false;
+
+ if (!memcg_path)
+ return -EINVAL;
+
+ path = kmalloc(sizeof(*path) * PATH_MAX, GFP_KERNEL);
+ if (!path)
+ return -ENOMEM;
+
+ for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg;
+ memcg = mem_cgroup_iter(NULL, memcg, NULL)) {
+ /* skip removed memcg */
+ if (!mem_cgroup_id(memcg))
+ continue;
+ if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) {
+ *id = mem_cgroup_id(memcg);
+ found = true;
+ break;
+ }
+ }
+
+ kfree(path);
+ return found ? 0 : -EINVAL;
+}
+
+static int damon_sysfs_set_scheme_filters(struct damos *scheme,
+ struct damon_sysfs_scheme_filters *sysfs_filters)
+{
+ int i;
+ struct damos_filter *filter, *next;
+
+ damos_for_each_filter_safe(filter, next, scheme)
+ damos_destroy_filter(filter);
+
+ for (i = 0; i < sysfs_filters->nr; i++) {
+ struct damon_sysfs_scheme_filter *sysfs_filter =
+ sysfs_filters->filters_arr[i];
+ struct damos_filter *filter =
+ damos_new_filter(sysfs_filter->type,
+ sysfs_filter->matching);
+ int err;
+
+ if (!filter)
+ return -ENOMEM;
+ if (filter->type == DAMOS_FILTER_TYPE_MEMCG) {
+ err = damon_sysfs_memcg_path_to_id(
+ sysfs_filter->memcg_path,
+ &filter->memcg_id);
+ if (err) {
+ damos_destroy_filter(filter);
+ return err;
+ }
+ }
+ damos_add_filter(scheme, filter);
+ }
+ return 0;
+}
+
+static struct damos *damon_sysfs_mk_scheme(
+ struct damon_sysfs_scheme *sysfs_scheme)
+{
+ struct damon_sysfs_access_pattern *access_pattern =
+ sysfs_scheme->access_pattern;
+ struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
+ struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
+ struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
+ struct damon_sysfs_scheme_filters *sysfs_filters =
+ sysfs_scheme->filters;
+ struct damos *scheme;
+ int err;
+
+ struct damos_access_pattern pattern = {
+ .min_sz_region = access_pattern->sz->min,
+ .max_sz_region = access_pattern->sz->max,
+ .min_nr_accesses = access_pattern->nr_accesses->min,
+ .max_nr_accesses = access_pattern->nr_accesses->max,
+ .min_age_region = access_pattern->age->min,
+ .max_age_region = access_pattern->age->max,
+ };
+ struct damos_quota quota = {
+ .ms = sysfs_quotas->ms,
+ .sz = sysfs_quotas->sz,
+ .reset_interval = sysfs_quotas->reset_interval_ms,
+ .weight_sz = sysfs_weights->sz,
+ .weight_nr_accesses = sysfs_weights->nr_accesses,
+ .weight_age = sysfs_weights->age,
+ };
+ struct damos_watermarks wmarks = {
+ .metric = sysfs_wmarks->metric,
+ .interval = sysfs_wmarks->interval_us,
+ .high = sysfs_wmarks->high,
+ .mid = sysfs_wmarks->mid,
+ .low = sysfs_wmarks->low,
+ };
+
+ scheme = damon_new_scheme(&pattern, sysfs_scheme->action, &quota,
+ &wmarks);
+ if (!scheme)
+ return NULL;
+
+ err = damon_sysfs_set_scheme_filters(scheme, sysfs_filters);
+ if (err) {
+ damon_destroy_scheme(scheme);
+ return NULL;
+ }
+ return scheme;
+}
+
+static void damon_sysfs_update_scheme(struct damos *scheme,
+ struct damon_sysfs_scheme *sysfs_scheme)
+{
+ struct damon_sysfs_access_pattern *access_pattern =
+ sysfs_scheme->access_pattern;
+ struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
+ struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
+ struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
+ int err;
+
+ scheme->pattern.min_sz_region = access_pattern->sz->min;
+ scheme->pattern.max_sz_region = access_pattern->sz->max;
+ scheme->pattern.min_nr_accesses = access_pattern->nr_accesses->min;
+ scheme->pattern.max_nr_accesses = access_pattern->nr_accesses->max;
+ scheme->pattern.min_age_region = access_pattern->age->min;
+ scheme->pattern.max_age_region = access_pattern->age->max;
+
+ scheme->action = sysfs_scheme->action;
+
+ scheme->quota.ms = sysfs_quotas->ms;
+ scheme->quota.sz = sysfs_quotas->sz;
+ scheme->quota.reset_interval = sysfs_quotas->reset_interval_ms;
+ scheme->quota.weight_sz = sysfs_weights->sz;
+ scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses;
+ scheme->quota.weight_age = sysfs_weights->age;
+
+ scheme->wmarks.metric = sysfs_wmarks->metric;
+ scheme->wmarks.interval = sysfs_wmarks->interval_us;
+ scheme->wmarks.high = sysfs_wmarks->high;
+ scheme->wmarks.mid = sysfs_wmarks->mid;
+ scheme->wmarks.low = sysfs_wmarks->low;
+
+ err = damon_sysfs_set_scheme_filters(scheme, sysfs_scheme->filters);
+ if (err)
+ damon_destroy_scheme(scheme);
+}
+
+int damon_sysfs_set_schemes(struct damon_ctx *ctx,
+ struct damon_sysfs_schemes *sysfs_schemes)
+{
+ struct damos *scheme, *next;
+ int i = 0;
+
+ damon_for_each_scheme_safe(scheme, next, ctx) {
+ if (i < sysfs_schemes->nr)
+ damon_sysfs_update_scheme(scheme,
+ sysfs_schemes->schemes_arr[i]);
+ else
+ damon_destroy_scheme(scheme);
+ i++;
+ }
+
+ for (; i < sysfs_schemes->nr; i++) {
+ struct damos *scheme, *next;
+
+ scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]);
+ if (!scheme) {
+ damon_for_each_scheme_safe(scheme, next, ctx)
+ damon_destroy_scheme(scheme);
+ return -ENOMEM;
+ }
+ damon_add_scheme(ctx, scheme);
+ }
+ return 0;
+}
+
+void damon_sysfs_schemes_update_stats(
+ struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx)
+{
+ struct damos *scheme;
+ int schemes_idx = 0;
+
+ damon_for_each_scheme(scheme, ctx) {
+ struct damon_sysfs_stats *sysfs_stats;
+
+ /* user could have removed the scheme sysfs dir */
+ if (schemes_idx >= sysfs_schemes->nr)
+ break;
+
+ sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats;
+ sysfs_stats->nr_tried = scheme->stat.nr_tried;
+ sysfs_stats->sz_tried = scheme->stat.sz_tried;
+ sysfs_stats->nr_applied = scheme->stat.nr_applied;
+ sysfs_stats->sz_applied = scheme->stat.sz_applied;
+ sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds;
+ }
+}
+
+/*
+ * The damon_sysfs_schemes object whose schemes' tried_regions directories
+ * should be updated.  Protected by damon_sysfs_lock.
+ */
+static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback;
+static int damon_sysfs_schemes_region_idx;
+
+/*
+ * DAMON callback that is called before a DAMOS action is applied.  While this
+ * callback is registered, damon_sysfs_lock should be held to ensure that the
+ * regions directories exist.
+ */
+static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx,
+ struct damon_target *t, struct damon_region *r,
+ struct damos *s)
+{
+ struct damos *scheme;
+ struct damon_sysfs_scheme_regions *sysfs_regions;
+ struct damon_sysfs_scheme_region *region;
+ struct damon_sysfs_schemes *sysfs_schemes =
+ damon_sysfs_schemes_for_damos_callback;
+ int schemes_idx = 0;
+
+ damon_for_each_scheme(scheme, ctx) {
+ if (scheme == s)
+ break;
+ schemes_idx++;
+ }
+
+ /* user could have removed the scheme sysfs dir */
+ if (schemes_idx >= sysfs_schemes->nr)
+ return 0;
+
+ sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions;
+ region = damon_sysfs_scheme_region_alloc(r);
+ list_add_tail(&region->list, &sysfs_regions->regions_list);
+ sysfs_regions->nr_regions++;
+ if (kobject_init_and_add(&region->kobj,
+ &damon_sysfs_scheme_region_ktype,
+ &sysfs_regions->kobj, "%d",
+ damon_sysfs_schemes_region_idx++)) {
+ kobject_put(&region->kobj);
+ }
+ return 0;
+}
+
+/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */
+int damon_sysfs_schemes_clear_regions(
+ struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx)
+{
+ struct damos *scheme;
+ int schemes_idx = 0;
+
+ damon_for_each_scheme(scheme, ctx) {
+ struct damon_sysfs_scheme *sysfs_scheme;
+
+ /* user could have removed the scheme sysfs dir */
+ if (schemes_idx >= sysfs_schemes->nr)
+ break;
+
+ sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++];
+ damon_sysfs_scheme_regions_rm_dirs(
+ sysfs_scheme->tried_regions);
+ }
+ return 0;
+}
+
+/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */
+int damon_sysfs_schemes_update_regions_start(
+ struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx)
+{
+ damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx);
+ damon_sysfs_schemes_for_damos_callback = sysfs_schemes;
+ ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply;
+ return 0;
+}
+
+/*
+ * Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock.  The
+ * caller should unlock damon_sysfs_lock, which was taken before
+ * damon_sysfs_schemes_update_regions_start().
+ */
+int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx)
+{
+ damon_sysfs_schemes_for_damos_callback = NULL;
+ ctx->callback.before_damos_apply = NULL;
+ damon_sysfs_schemes_region_idx = 0;
+ return 0;
+}
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
new file mode 100644
index 000000000000..33e1d5c9cb54
--- /dev/null
+++ b/mm/damon/sysfs.c
@@ -0,0 +1,1795 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DAMON sysfs Interface
+ *
+ * Copyright (c) 2022 SeongJae Park <sj@kernel.org>
+ */
+
+#include <linux/pid.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "sysfs-common.h"
+
+/*
+ * init region directory
+ */
+
+struct damon_sysfs_region {
+ struct kobject kobj;
+ struct damon_addr_range ar;
+};
+
+static struct damon_sysfs_region *damon_sysfs_region_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_region), GFP_KERNEL);
+}
+
+static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_region *region = container_of(kobj,
+ struct damon_sysfs_region, kobj);
+
+ return sysfs_emit(buf, "%lu\n", region->ar.start);
+}
+
+static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_region *region = container_of(kobj,
+ struct damon_sysfs_region, kobj);
+ int err = kstrtoul(buf, 0, &region->ar.start);
+
+ return err ? err : count;
+}
+
+static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_region *region = container_of(kobj,
+ struct damon_sysfs_region, kobj);
+
+ return sysfs_emit(buf, "%lu\n", region->ar.end);
+}
+
+static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_region *region = container_of(kobj,
+ struct damon_sysfs_region, kobj);
+ int err = kstrtoul(buf, 0, &region->ar.end);
+
+ return err ? err : count;
+}
+
+static void damon_sysfs_region_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_region, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_region_start_attr =
+ __ATTR_RW_MODE(start, 0600);
+
+static struct kobj_attribute damon_sysfs_region_end_attr =
+ __ATTR_RW_MODE(end, 0600);
+
+static struct attribute *damon_sysfs_region_attrs[] = {
+ &damon_sysfs_region_start_attr.attr,
+ &damon_sysfs_region_end_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_region);
+
+static const struct kobj_type damon_sysfs_region_ktype = {
+ .release = damon_sysfs_region_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_region_groups,
+};
+
+/*
+ * init_regions directory
+ */
+
+struct damon_sysfs_regions {
+ struct kobject kobj;
+ struct damon_sysfs_region **regions_arr;
+ int nr;
+};
+
+static struct damon_sysfs_regions *damon_sysfs_regions_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_regions), GFP_KERNEL);
+}
+
+static void damon_sysfs_regions_rm_dirs(struct damon_sysfs_regions *regions)
+{
+ struct damon_sysfs_region **regions_arr = regions->regions_arr;
+ int i;
+
+ for (i = 0; i < regions->nr; i++)
+ kobject_put(&regions_arr[i]->kobj);
+ regions->nr = 0;
+ kfree(regions_arr);
+ regions->regions_arr = NULL;
+}
+
+static int damon_sysfs_regions_add_dirs(struct damon_sysfs_regions *regions,
+ int nr_regions)
+{
+ struct damon_sysfs_region **regions_arr, *region;
+ int err, i;
+
+ damon_sysfs_regions_rm_dirs(regions);
+ if (!nr_regions)
+ return 0;
+
+ regions_arr = kmalloc_array(nr_regions, sizeof(*regions_arr),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!regions_arr)
+ return -ENOMEM;
+ regions->regions_arr = regions_arr;
+
+ for (i = 0; i < nr_regions; i++) {
+ region = damon_sysfs_region_alloc();
+ if (!region) {
+ damon_sysfs_regions_rm_dirs(regions);
+ return -ENOMEM;
+ }
+
+ err = kobject_init_and_add(&region->kobj,
+ &damon_sysfs_region_ktype, &regions->kobj,
+ "%d", i);
+ if (err) {
+ kobject_put(&region->kobj);
+ damon_sysfs_regions_rm_dirs(regions);
+ return err;
+ }
+
+ regions_arr[i] = region;
+ regions->nr++;
+ }
+ return 0;
+}
+
+static ssize_t nr_regions_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_regions *regions = container_of(kobj,
+ struct damon_sysfs_regions, kobj);
+
+ return sysfs_emit(buf, "%d\n", regions->nr);
+}
+
+static ssize_t nr_regions_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_regions *regions;
+ int nr, err = kstrtoint(buf, 0, &nr);
+
+ if (err)
+ return err;
+ if (nr < 0)
+ return -EINVAL;
+
+ regions = container_of(kobj, struct damon_sysfs_regions, kobj);
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ err = damon_sysfs_regions_add_dirs(regions, nr);
+ mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+
+ return count;
+}
+
+static void damon_sysfs_regions_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_regions, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_regions_nr_attr =
+ __ATTR_RW_MODE(nr_regions, 0600);
+
+static struct attribute *damon_sysfs_regions_attrs[] = {
+ &damon_sysfs_regions_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_regions);
+
+static const struct kobj_type damon_sysfs_regions_ktype = {
+ .release = damon_sysfs_regions_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_regions_groups,
+};
+
+/*
+ * target directory
+ */
+
+struct damon_sysfs_target {
+ struct kobject kobj;
+ struct damon_sysfs_regions *regions;
+ int pid;
+};
+
+static struct damon_sysfs_target *damon_sysfs_target_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_target), GFP_KERNEL);
+}
+
+static int damon_sysfs_target_add_dirs(struct damon_sysfs_target *target)
+{
+ struct damon_sysfs_regions *regions = damon_sysfs_regions_alloc();
+ int err;
+
+ if (!regions)
+ return -ENOMEM;
+
+ err = kobject_init_and_add(&regions->kobj, &damon_sysfs_regions_ktype,
+ &target->kobj, "regions");
+ if (err)
+ kobject_put(&regions->kobj);
+ else
+ target->regions = regions;
+ return err;
+}
+
+static void damon_sysfs_target_rm_dirs(struct damon_sysfs_target *target)
+{
+ damon_sysfs_regions_rm_dirs(target->regions);
+ kobject_put(&target->regions->kobj);
+}
+
+static ssize_t pid_target_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_target *target = container_of(kobj,
+ struct damon_sysfs_target, kobj);
+
+ return sysfs_emit(buf, "%d\n", target->pid);
+}
+
+static ssize_t pid_target_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_target *target = container_of(kobj,
+ struct damon_sysfs_target, kobj);
+ int err = kstrtoint(buf, 0, &target->pid);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static void damon_sysfs_target_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_target, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_target_pid_attr =
+ __ATTR_RW_MODE(pid_target, 0600);
+
+static struct attribute *damon_sysfs_target_attrs[] = {
+ &damon_sysfs_target_pid_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_target);
+
+static const struct kobj_type damon_sysfs_target_ktype = {
+ .release = damon_sysfs_target_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_target_groups,
+};
+
+/*
+ * targets directory
+ */
+
+struct damon_sysfs_targets {
+ struct kobject kobj;
+ struct damon_sysfs_target **targets_arr;
+ int nr;
+};
+
+static struct damon_sysfs_targets *damon_sysfs_targets_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_targets), GFP_KERNEL);
+}
+
+static void damon_sysfs_targets_rm_dirs(struct damon_sysfs_targets *targets)
+{
+ struct damon_sysfs_target **targets_arr = targets->targets_arr;
+ int i;
+
+ for (i = 0; i < targets->nr; i++) {
+ damon_sysfs_target_rm_dirs(targets_arr[i]);
+ kobject_put(&targets_arr[i]->kobj);
+ }
+ targets->nr = 0;
+ kfree(targets_arr);
+ targets->targets_arr = NULL;
+}
+
+static int damon_sysfs_targets_add_dirs(struct damon_sysfs_targets *targets,
+ int nr_targets)
+{
+ struct damon_sysfs_target **targets_arr, *target;
+ int err, i;
+
+ damon_sysfs_targets_rm_dirs(targets);
+ if (!nr_targets)
+ return 0;
+
+ targets_arr = kmalloc_array(nr_targets, sizeof(*targets_arr),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!targets_arr)
+ return -ENOMEM;
+ targets->targets_arr = targets_arr;
+
+ for (i = 0; i < nr_targets; i++) {
+ target = damon_sysfs_target_alloc();
+ if (!target) {
+ damon_sysfs_targets_rm_dirs(targets);
+ return -ENOMEM;
+ }
+
+ err = kobject_init_and_add(&target->kobj,
+ &damon_sysfs_target_ktype, &targets->kobj,
+ "%d", i);
+ if (err)
+ goto out;
+
+ err = damon_sysfs_target_add_dirs(target);
+ if (err)
+ goto out;
+
+ targets_arr[i] = target;
+ targets->nr++;
+ }
+ return 0;
+
+out:
+ damon_sysfs_targets_rm_dirs(targets);
+ kobject_put(&target->kobj);
+ return err;
+}
+
+static ssize_t nr_targets_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_targets *targets = container_of(kobj,
+ struct damon_sysfs_targets, kobj);
+
+ return sysfs_emit(buf, "%d\n", targets->nr);
+}
+
+static ssize_t nr_targets_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_targets *targets;
+ int nr, err = kstrtoint(buf, 0, &nr);
+
+ if (err)
+ return err;
+ if (nr < 0)
+ return -EINVAL;
+
+ targets = container_of(kobj, struct damon_sysfs_targets, kobj);
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ err = damon_sysfs_targets_add_dirs(targets, nr);
+ mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+
+ return count;
+}
+
+static void damon_sysfs_targets_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_targets, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_targets_nr_attr =
+ __ATTR_RW_MODE(nr_targets, 0600);
+
+static struct attribute *damon_sysfs_targets_attrs[] = {
+ &damon_sysfs_targets_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_targets);
+
+static const struct kobj_type damon_sysfs_targets_ktype = {
+ .release = damon_sysfs_targets_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_targets_groups,
+};
+
+/*
+ * intervals directory
+ */
+
+struct damon_sysfs_intervals {
+ struct kobject kobj;
+ unsigned long sample_us;
+ unsigned long aggr_us;
+ unsigned long update_us;
+};
+
+static struct damon_sysfs_intervals *damon_sysfs_intervals_alloc(
+ unsigned long sample_us, unsigned long aggr_us,
+ unsigned long update_us)
+{
+ struct damon_sysfs_intervals *intervals = kmalloc(sizeof(*intervals),
+ GFP_KERNEL);
+
+ if (!intervals)
+ return NULL;
+
+ intervals->kobj = (struct kobject){};
+ intervals->sample_us = sample_us;
+ intervals->aggr_us = aggr_us;
+ intervals->update_us = update_us;
+ return intervals;
+}
+
+static ssize_t sample_us_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_intervals *intervals = container_of(kobj,
+ struct damon_sysfs_intervals, kobj);
+
+ return sysfs_emit(buf, "%lu\n", intervals->sample_us);
+}
+
+static ssize_t sample_us_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_intervals *intervals = container_of(kobj,
+ struct damon_sysfs_intervals, kobj);
+ unsigned long us;
+ int err = kstrtoul(buf, 0, &us);
+
+ if (err)
+ return err;
+
+ intervals->sample_us = us;
+ return count;
+}
+
+static ssize_t aggr_us_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_intervals *intervals = container_of(kobj,
+ struct damon_sysfs_intervals, kobj);
+
+ return sysfs_emit(buf, "%lu\n", intervals->aggr_us);
+}
+
+static ssize_t aggr_us_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_intervals *intervals = container_of(kobj,
+ struct damon_sysfs_intervals, kobj);
+ unsigned long us;
+ int err = kstrtoul(buf, 0, &us);
+
+ if (err)
+ return err;
+
+ intervals->aggr_us = us;
+ return count;
+}
+
+static ssize_t update_us_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_intervals *intervals = container_of(kobj,
+ struct damon_sysfs_intervals, kobj);
+
+ return sysfs_emit(buf, "%lu\n", intervals->update_us);
+}
+
+static ssize_t update_us_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_intervals *intervals = container_of(kobj,
+ struct damon_sysfs_intervals, kobj);
+ unsigned long us;
+ int err = kstrtoul(buf, 0, &us);
+
+ if (err)
+ return err;
+
+ intervals->update_us = us;
+ return count;
+}
+
+static void damon_sysfs_intervals_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_intervals, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_intervals_sample_us_attr =
+ __ATTR_RW_MODE(sample_us, 0600);
+
+static struct kobj_attribute damon_sysfs_intervals_aggr_us_attr =
+ __ATTR_RW_MODE(aggr_us, 0600);
+
+static struct kobj_attribute damon_sysfs_intervals_update_us_attr =
+ __ATTR_RW_MODE(update_us, 0600);
+
+static struct attribute *damon_sysfs_intervals_attrs[] = {
+ &damon_sysfs_intervals_sample_us_attr.attr,
+ &damon_sysfs_intervals_aggr_us_attr.attr,
+ &damon_sysfs_intervals_update_us_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_intervals);
+
+static const struct kobj_type damon_sysfs_intervals_ktype = {
+ .release = damon_sysfs_intervals_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_intervals_groups,
+};
+
+/*
+ * monitoring_attrs directory
+ */
+
+struct damon_sysfs_attrs {
+ struct kobject kobj;
+ struct damon_sysfs_intervals *intervals;
+ struct damon_sysfs_ul_range *nr_regions_range;
+};
+
+static struct damon_sysfs_attrs *damon_sysfs_attrs_alloc(void)
+{
+ struct damon_sysfs_attrs *attrs = kmalloc(sizeof(*attrs), GFP_KERNEL);
+
+ if (!attrs)
+ return NULL;
+ attrs->kobj = (struct kobject){};
+ return attrs;
+}
+
+static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs)
+{
+ struct damon_sysfs_intervals *intervals;
+ struct damon_sysfs_ul_range *nr_regions_range;
+ int err;
+
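+	/*
+	 * The initial defaults passed below are in microseconds: a 5 ms
+	 * sampling interval, a 100 ms aggregation interval, and a 60 s
+	 * operations update interval.
+	 */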
+ intervals = damon_sysfs_intervals_alloc(5000, 100000, 60000000);
+ if (!intervals)
+ return -ENOMEM;
+
+ err = kobject_init_and_add(&intervals->kobj,
+ &damon_sysfs_intervals_ktype, &attrs->kobj,
+ "intervals");
+ if (err)
+ goto put_intervals_out;
+ attrs->intervals = intervals;
+
+ nr_regions_range = damon_sysfs_ul_range_alloc(10, 1000);
+ if (!nr_regions_range) {
+ err = -ENOMEM;
+ goto put_intervals_out;
+ }
+
+ err = kobject_init_and_add(&nr_regions_range->kobj,
+ &damon_sysfs_ul_range_ktype, &attrs->kobj,
+ "nr_regions");
+ if (err)
+ goto put_nr_regions_intervals_out;
+ attrs->nr_regions_range = nr_regions_range;
+ return 0;
+
+put_nr_regions_intervals_out:
+ kobject_put(&nr_regions_range->kobj);
+ attrs->nr_regions_range = NULL;
+put_intervals_out:
+ kobject_put(&intervals->kobj);
+ attrs->intervals = NULL;
+ return err;
+}
+
+static void damon_sysfs_attrs_rm_dirs(struct damon_sysfs_attrs *attrs)
+{
+ kobject_put(&attrs->nr_regions_range->kobj);
+ kobject_put(&attrs->intervals->kobj);
+}
+
+static void damon_sysfs_attrs_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_attrs, kobj));
+}
+
+static struct attribute *damon_sysfs_attrs_attrs[] = {
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_attrs);
+
+static const struct kobj_type damon_sysfs_attrs_ktype = {
+ .release = damon_sysfs_attrs_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_attrs_groups,
+};
+
+/*
+ * context directory
+ */
+
+/* This should match enum damon_ops_id */
+static const char * const damon_sysfs_ops_strs[] = {
+ "vaddr",
+ "fvaddr",
+ "paddr",
+};
+
+struct damon_sysfs_context {
+ struct kobject kobj;
+ enum damon_ops_id ops_id;
+ struct damon_sysfs_attrs *attrs;
+ struct damon_sysfs_targets *targets;
+ struct damon_sysfs_schemes *schemes;
+};
+
+static struct damon_sysfs_context *damon_sysfs_context_alloc(
+ enum damon_ops_id ops_id)
+{
+ struct damon_sysfs_context *context = kmalloc(sizeof(*context),
+ GFP_KERNEL);
+
+ if (!context)
+ return NULL;
+ context->kobj = (struct kobject){};
+ context->ops_id = ops_id;
+ return context;
+}
+
+static int damon_sysfs_context_set_attrs(struct damon_sysfs_context *context)
+{
+ struct damon_sysfs_attrs *attrs = damon_sysfs_attrs_alloc();
+ int err;
+
+ if (!attrs)
+ return -ENOMEM;
+ err = kobject_init_and_add(&attrs->kobj, &damon_sysfs_attrs_ktype,
+ &context->kobj, "monitoring_attrs");
+ if (err)
+ goto out;
+ err = damon_sysfs_attrs_add_dirs(attrs);
+ if (err)
+ goto out;
+ context->attrs = attrs;
+ return 0;
+
+out:
+ kobject_put(&attrs->kobj);
+ return err;
+}
+
+static int damon_sysfs_context_set_targets(struct damon_sysfs_context *context)
+{
+ struct damon_sysfs_targets *targets = damon_sysfs_targets_alloc();
+ int err;
+
+ if (!targets)
+ return -ENOMEM;
+ err = kobject_init_and_add(&targets->kobj, &damon_sysfs_targets_ktype,
+ &context->kobj, "targets");
+ if (err) {
+ kobject_put(&targets->kobj);
+ return err;
+ }
+ context->targets = targets;
+ return 0;
+}
+
+static int damon_sysfs_context_set_schemes(struct damon_sysfs_context *context)
+{
+ struct damon_sysfs_schemes *schemes = damon_sysfs_schemes_alloc();
+ int err;
+
+ if (!schemes)
+ return -ENOMEM;
+ err = kobject_init_and_add(&schemes->kobj, &damon_sysfs_schemes_ktype,
+ &context->kobj, "schemes");
+ if (err) {
+ kobject_put(&schemes->kobj);
+ return err;
+ }
+ context->schemes = schemes;
+ return 0;
+}
+
+static int damon_sysfs_context_add_dirs(struct damon_sysfs_context *context)
+{
+ int err;
+
+ err = damon_sysfs_context_set_attrs(context);
+ if (err)
+ return err;
+
+ err = damon_sysfs_context_set_targets(context);
+ if (err)
+ goto put_attrs_out;
+
+ err = damon_sysfs_context_set_schemes(context);
+ if (err)
+ goto put_targets_attrs_out;
+ return 0;
+
+put_targets_attrs_out:
+ kobject_put(&context->targets->kobj);
+ context->targets = NULL;
+put_attrs_out:
+ kobject_put(&context->attrs->kobj);
+ context->attrs = NULL;
+ return err;
+}
+
+static void damon_sysfs_context_rm_dirs(struct damon_sysfs_context *context)
+{
+ damon_sysfs_attrs_rm_dirs(context->attrs);
+ kobject_put(&context->attrs->kobj);
+ damon_sysfs_targets_rm_dirs(context->targets);
+ kobject_put(&context->targets->kobj);
+ damon_sysfs_schemes_rm_dirs(context->schemes);
+ kobject_put(&context->schemes->kobj);
+}
+
+static ssize_t avail_operations_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ enum damon_ops_id id;
+ int len = 0;
+
+ for (id = 0; id < NR_DAMON_OPS; id++) {
+ if (!damon_is_registered_ops(id))
+ continue;
+ len += sysfs_emit_at(buf, len, "%s\n",
+ damon_sysfs_ops_strs[id]);
+ }
+ return len;
+}
+
+static ssize_t operations_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_context *context = container_of(kobj,
+ struct damon_sysfs_context, kobj);
+
+ return sysfs_emit(buf, "%s\n", damon_sysfs_ops_strs[context->ops_id]);
+}
+
+static ssize_t operations_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_context *context = container_of(kobj,
+ struct damon_sysfs_context, kobj);
+ enum damon_ops_id id;
+
+ for (id = 0; id < NR_DAMON_OPS; id++) {
+ if (sysfs_streq(buf, damon_sysfs_ops_strs[id])) {
+ context->ops_id = id;
+ return count;
+ }
+ }
+ return -EINVAL;
+}
+
+static void damon_sysfs_context_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_context, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_context_avail_operations_attr =
+ __ATTR_RO_MODE(avail_operations, 0400);
+
+static struct kobj_attribute damon_sysfs_context_operations_attr =
+ __ATTR_RW_MODE(operations, 0600);
+
+static struct attribute *damon_sysfs_context_attrs[] = {
+ &damon_sysfs_context_avail_operations_attr.attr,
+ &damon_sysfs_context_operations_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_context);
+
+static const struct kobj_type damon_sysfs_context_ktype = {
+ .release = damon_sysfs_context_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_context_groups,
+};
+
+/*
+ * contexts directory
+ */
+
+struct damon_sysfs_contexts {
+ struct kobject kobj;
+ struct damon_sysfs_context **contexts_arr;
+ int nr;
+};
+
+static struct damon_sysfs_contexts *damon_sysfs_contexts_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_contexts), GFP_KERNEL);
+}
+
+static void damon_sysfs_contexts_rm_dirs(struct damon_sysfs_contexts *contexts)
+{
+ struct damon_sysfs_context **contexts_arr = contexts->contexts_arr;
+ int i;
+
+ for (i = 0; i < contexts->nr; i++) {
+ damon_sysfs_context_rm_dirs(contexts_arr[i]);
+ kobject_put(&contexts_arr[i]->kobj);
+ }
+ contexts->nr = 0;
+ kfree(contexts_arr);
+ contexts->contexts_arr = NULL;
+}
+
+static int damon_sysfs_contexts_add_dirs(struct damon_sysfs_contexts *contexts,
+ int nr_contexts)
+{
+ struct damon_sysfs_context **contexts_arr, *context;
+ int err, i;
+
+ damon_sysfs_contexts_rm_dirs(contexts);
+ if (!nr_contexts)
+ return 0;
+
+ contexts_arr = kmalloc_array(nr_contexts, sizeof(*contexts_arr),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!contexts_arr)
+ return -ENOMEM;
+ contexts->contexts_arr = contexts_arr;
+
+ for (i = 0; i < nr_contexts; i++) {
+ context = damon_sysfs_context_alloc(DAMON_OPS_VADDR);
+ if (!context) {
+ damon_sysfs_contexts_rm_dirs(contexts);
+ return -ENOMEM;
+ }
+
+ err = kobject_init_and_add(&context->kobj,
+ &damon_sysfs_context_ktype, &contexts->kobj,
+ "%d", i);
+ if (err)
+ goto out;
+
+ err = damon_sysfs_context_add_dirs(context);
+ if (err)
+ goto out;
+
+ contexts_arr[i] = context;
+ contexts->nr++;
+ }
+ return 0;
+
+out:
+ damon_sysfs_contexts_rm_dirs(contexts);
+ kobject_put(&context->kobj);
+ return err;
+}
+
+static ssize_t nr_contexts_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_contexts *contexts = container_of(kobj,
+ struct damon_sysfs_contexts, kobj);
+
+ return sysfs_emit(buf, "%d\n", contexts->nr);
+}
+
+static ssize_t nr_contexts_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_contexts *contexts;
+ int nr, err;
+
+ err = kstrtoint(buf, 0, &nr);
+ if (err)
+ return err;
+ /* TODO: support multiple contexts per kdamond */
+ if (nr < 0 || 1 < nr)
+ return -EINVAL;
+
+ contexts = container_of(kobj, struct damon_sysfs_contexts, kobj);
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ err = damon_sysfs_contexts_add_dirs(contexts, nr);
+ mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+
+ return count;
+}
+
+static void damon_sysfs_contexts_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_contexts, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_contexts_nr_attr
+ = __ATTR_RW_MODE(nr_contexts, 0600);
+
+static struct attribute *damon_sysfs_contexts_attrs[] = {
+ &damon_sysfs_contexts_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_contexts);
+
+static const struct kobj_type damon_sysfs_contexts_ktype = {
+ .release = damon_sysfs_contexts_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_contexts_groups,
+};
+
+/*
+ * kdamond directory
+ */
+
+struct damon_sysfs_kdamond {
+ struct kobject kobj;
+ struct damon_sysfs_contexts *contexts;
+ struct damon_ctx *damon_ctx;
+};
+
+static struct damon_sysfs_kdamond *damon_sysfs_kdamond_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_kdamond), GFP_KERNEL);
+}
+
+static int damon_sysfs_kdamond_add_dirs(struct damon_sysfs_kdamond *kdamond)
+{
+ struct damon_sysfs_contexts *contexts;
+ int err;
+
+ contexts = damon_sysfs_contexts_alloc();
+ if (!contexts)
+ return -ENOMEM;
+
+ err = kobject_init_and_add(&contexts->kobj,
+ &damon_sysfs_contexts_ktype, &kdamond->kobj,
+ "contexts");
+ if (err) {
+ kobject_put(&contexts->kobj);
+ return err;
+ }
+ kdamond->contexts = contexts;
+
+ return err;
+}
+
+static void damon_sysfs_kdamond_rm_dirs(struct damon_sysfs_kdamond *kdamond)
+{
+ damon_sysfs_contexts_rm_dirs(kdamond->contexts);
+ kobject_put(&kdamond->contexts->kobj);
+}
+
+static bool damon_sysfs_ctx_running(struct damon_ctx *ctx)
+{
+ bool running;
+
+ mutex_lock(&ctx->kdamond_lock);
+ running = ctx->kdamond != NULL;
+ mutex_unlock(&ctx->kdamond_lock);
+ return running;
+}
+
+/*
+ * enum damon_sysfs_cmd - Commands for a specific kdamond.
+ */
+enum damon_sysfs_cmd {
+ /* @DAMON_SYSFS_CMD_ON: Turn the kdamond on. */
+ DAMON_SYSFS_CMD_ON,
+ /* @DAMON_SYSFS_CMD_OFF: Turn the kdamond off. */
+ DAMON_SYSFS_CMD_OFF,
+ /* @DAMON_SYSFS_CMD_COMMIT: Update kdamond inputs. */
+ DAMON_SYSFS_CMD_COMMIT,
+ /*
+ * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: Update scheme stats sysfs
+ * files.
+ */
+ DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS,
+ /*
+ * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: Update schemes tried
+ * regions
+ */
+ DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS,
+ /*
+ * @DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: Clear schemes tried
+ * regions
+ */
+ DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS,
+ /*
+ * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands.
+ */
+ NR_DAMON_SYSFS_CMDS,
+};
+
+/* Should match enum damon_sysfs_cmd */
+static const char * const damon_sysfs_cmd_strs[] = {
+ "on",
+ "off",
+ "commit",
+ "update_schemes_stats",
+ "update_schemes_tried_regions",
+ "clear_schemes_tried_regions",
+};
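+
+/*
+ * Illustrative user-space sketch (not part of the kernel interface
+ * definition): these strings are what gets written to a kdamond's 'state'
+ * file to issue the corresponding command.  Assuming sysfs is mounted at /sys
+ * and a kdamond directory '0' was created via 'nr_kdamonds':
+ *
+ *	int fd = open("/sys/kernel/mm/damon/admin/kdamonds/0/state", O_WRONLY);
+ *
+ *	write(fd, "on", 2);                     // DAMON_SYSFS_CMD_ON
+ *	write(fd, "update_schemes_stats", 20);  // refresh the stats files
+ *	write(fd, "off", 3);                    // DAMON_SYSFS_CMD_OFF
+ *	close(fd);
+ */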
+
+/*
+ * struct damon_sysfs_cmd_request - A request to the DAMON callback.
+ * @cmd: The command that needs to be handled by the callback.
+ * @kdamond: The kobject wrapper that is associated with the kdamond thread.
+ *
+ * This structure represents a sysfs command request that needs to access some
+ * DAMON context-internal data.  Because DAMON context-internal data can be
+ * safely accessed from DAMON callbacks without additional synchronization, the
+ * request will be handled by the DAMON callback.  A non-``NULL`` @kdamond
+ * means the request is valid.
+ */
+struct damon_sysfs_cmd_request {
+ enum damon_sysfs_cmd cmd;
+ struct damon_sysfs_kdamond *kdamond;
+};
+
+/* Current DAMON callback request. Protected by damon_sysfs_lock. */
+static struct damon_sysfs_cmd_request damon_sysfs_cmd_request;
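+
+/*
+ * Request lifecycle: damon_sysfs_handle_cmd() fills the request under
+ * damon_sysfs_lock and then polls it, while damon_sysfs_cmd_request_callback(),
+ * running in the kdamond thread, handles the command and sets @kdamond to
+ * NULL to mark the request as handled.
+ */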
+
+static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_kdamond *kdamond = container_of(kobj,
+ struct damon_sysfs_kdamond, kobj);
+ struct damon_ctx *ctx = kdamond->damon_ctx;
+ bool running;
+
+ if (!ctx)
+ running = false;
+ else
+ running = damon_sysfs_ctx_running(ctx);
+
+ return sysfs_emit(buf, "%s\n", running ?
+ damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_ON] :
+ damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_OFF]);
+}
+
+static int damon_sysfs_set_attrs(struct damon_ctx *ctx,
+ struct damon_sysfs_attrs *sys_attrs)
+{
+ struct damon_sysfs_intervals *sys_intervals = sys_attrs->intervals;
+ struct damon_sysfs_ul_range *sys_nr_regions =
+ sys_attrs->nr_regions_range;
+ struct damon_attrs attrs = {
+ .sample_interval = sys_intervals->sample_us,
+ .aggr_interval = sys_intervals->aggr_us,
+ .ops_update_interval = sys_intervals->update_us,
+ .min_nr_regions = sys_nr_regions->min,
+ .max_nr_regions = sys_nr_regions->max,
+ };
+ return damon_set_attrs(ctx, &attrs);
+}
+
+static void damon_sysfs_destroy_targets(struct damon_ctx *ctx)
+{
+ struct damon_target *t, *next;
+ bool has_pid = damon_target_has_pid(ctx);
+
+ damon_for_each_target_safe(t, next, ctx) {
+ if (has_pid)
+ put_pid(t->pid);
+ damon_destroy_target(t);
+ }
+}
+
+static int damon_sysfs_set_regions(struct damon_target *t,
+ struct damon_sysfs_regions *sysfs_regions)
+{
+ struct damon_addr_range *ranges = kmalloc_array(sysfs_regions->nr,
+ sizeof(*ranges), GFP_KERNEL | __GFP_NOWARN);
+ int i, err = -EINVAL;
+
+ if (!ranges)
+ return -ENOMEM;
+ for (i = 0; i < sysfs_regions->nr; i++) {
+ struct damon_sysfs_region *sys_region =
+ sysfs_regions->regions_arr[i];
+
+ if (sys_region->ar.start > sys_region->ar.end)
+ goto out;
+
+ ranges[i].start = sys_region->ar.start;
+ ranges[i].end = sys_region->ar.end;
+ if (i == 0)
+ continue;
+ if (ranges[i - 1].end > ranges[i].start)
+ goto out;
+ }
+ err = damon_set_regions(t, ranges, sysfs_regions->nr);
+out:
+ kfree(ranges);
+ return err;
+}
+
+static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
+ struct damon_ctx *ctx)
+{
+ struct damon_target *t = damon_new_target();
+ int err = -EINVAL;
+
+ if (!t)
+ return -ENOMEM;
+ damon_add_target(ctx, t);
+ if (damon_target_has_pid(ctx)) {
+ t->pid = find_get_pid(sys_target->pid);
+ if (!t->pid)
+ goto destroy_targets_out;
+ }
+ err = damon_sysfs_set_regions(t, sys_target->regions);
+ if (err)
+ goto destroy_targets_out;
+ return 0;
+
+destroy_targets_out:
+ damon_sysfs_destroy_targets(ctx);
+ return err;
+}
+
+/*
+ * Search for the target in a context that corresponds to the sysfs target
+ * input.
+ *
+ * Return: pointer to the target if found, NULL if not found, or an ERR_PTR()
+ * encoding a negative error code if the search failed.
+ */
+static struct damon_target *damon_sysfs_existing_target(
+ struct damon_sysfs_target *sys_target, struct damon_ctx *ctx)
+{
+ struct pid *pid;
+ struct damon_target *t;
+
+ if (!damon_target_has_pid(ctx)) {
+ /* At most one target could exist for paddr */
+ damon_for_each_target(t, ctx)
+ return t;
+ return NULL;
+ }
+
+ /* ops.id should be DAMON_OPS_VADDR or DAMON_OPS_FVADDR */
+ pid = find_get_pid(sys_target->pid);
+ if (!pid)
+ return ERR_PTR(-EINVAL);
+ damon_for_each_target(t, ctx) {
+ if (t->pid == pid) {
+ put_pid(pid);
+ return t;
+ }
+ }
+ put_pid(pid);
+ return NULL;
+}
+
+static int damon_sysfs_set_targets(struct damon_ctx *ctx,
+ struct damon_sysfs_targets *sysfs_targets)
+{
+ int i, err;
+
+ /* Multiple physical address space monitoring targets make no sense */
+ if (ctx->ops.id == DAMON_OPS_PADDR && sysfs_targets->nr > 1)
+ return -EINVAL;
+
+ for (i = 0; i < sysfs_targets->nr; i++) {
+ struct damon_sysfs_target *st = sysfs_targets->targets_arr[i];
+ struct damon_target *t = damon_sysfs_existing_target(st, ctx);
+
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+ if (!t)
+ err = damon_sysfs_add_target(st, ctx);
+ else
+ err = damon_sysfs_set_regions(t, st->regions);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
+{
+ struct damon_target *t, *next;
+ struct damon_sysfs_kdamond *kdamond;
+
+ /* damon_sysfs_schemes_update_regions_stop() might not have been called */
+ kdamond = damon_sysfs_cmd_request.kdamond;
+ if (kdamond && damon_sysfs_cmd_request.cmd ==
+ DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS &&
+ ctx == kdamond->damon_ctx) {
+ damon_sysfs_schemes_update_regions_stop(ctx);
+ mutex_unlock(&damon_sysfs_lock);
+ }
+
+ if (!damon_target_has_pid(ctx))
+ return;
+
+ mutex_lock(&ctx->kdamond_lock);
+ damon_for_each_target_safe(t, next, ctx) {
+ put_pid(t->pid);
+ damon_destroy_target(t);
+ }
+ mutex_unlock(&ctx->kdamond_lock);
+}
+
+/*
+ * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files.
+ * @kdamond: The kobject wrapper that is associated with the kdamond thread.
+ *
+ * This function reads the schemes stats of a specific kdamond and updates the
+ * related values for the sysfs files.  This function should be called from
+ * DAMON callbacks while holding ``damon_sysfs_lock``, to safely access the
+ * DAMON context-internal data and DAMON sysfs variables.
+ */
+static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond)
+{
+ struct damon_ctx *ctx = kdamond->damon_ctx;
+
+ if (!ctx)
+ return -EINVAL;
+ damon_sysfs_schemes_update_stats(
+ kdamond->contexts->contexts_arr[0]->schemes, ctx);
+ return 0;
+}
+
+static int damon_sysfs_upd_schemes_regions_start(
+ struct damon_sysfs_kdamond *kdamond)
+{
+ struct damon_ctx *ctx = kdamond->damon_ctx;
+
+ if (!ctx)
+ return -EINVAL;
+ return damon_sysfs_schemes_update_regions_start(
+ kdamond->contexts->contexts_arr[0]->schemes, ctx);
+}
+
+static int damon_sysfs_upd_schemes_regions_stop(
+ struct damon_sysfs_kdamond *kdamond)
+{
+ struct damon_ctx *ctx = kdamond->damon_ctx;
+
+ if (!ctx)
+ return -EINVAL;
+ return damon_sysfs_schemes_update_regions_stop(ctx);
+}
+
+static int damon_sysfs_clear_schemes_regions(
+ struct damon_sysfs_kdamond *kdamond)
+{
+ struct damon_ctx *ctx = kdamond->damon_ctx;
+
+ if (!ctx)
+ return -EINVAL;
+ return damon_sysfs_schemes_clear_regions(
+ kdamond->contexts->contexts_arr[0]->schemes, ctx);
+}
+
+static inline bool damon_sysfs_kdamond_running(
+ struct damon_sysfs_kdamond *kdamond)
+{
+ return kdamond->damon_ctx &&
+ damon_sysfs_ctx_running(kdamond->damon_ctx);
+}
+
+static int damon_sysfs_apply_inputs(struct damon_ctx *ctx,
+ struct damon_sysfs_context *sys_ctx)
+{
+ int err;
+
+ err = damon_select_ops(ctx, sys_ctx->ops_id);
+ if (err)
+ return err;
+ err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs);
+ if (err)
+ return err;
+ err = damon_sysfs_set_targets(ctx, sys_ctx->targets);
+ if (err)
+ return err;
+ return damon_sysfs_set_schemes(ctx, sys_ctx->schemes);
+}
+
+/*
+ * damon_sysfs_commit_input() - Commit user inputs to a running kdamond.
+ * @kdamond: The kobject wrapper for the associated kdamond.
+ *
+ * If the sysfs input is wrong, the kdamond will be terminated.
+ */
+static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond)
+{
+ if (!damon_sysfs_kdamond_running(kdamond))
+ return -EINVAL;
+ /* TODO: Support multiple contexts per kdamond */
+ if (kdamond->contexts->nr != 1)
+ return -EINVAL;
+
+ return damon_sysfs_apply_inputs(kdamond->damon_ctx,
+ kdamond->contexts->contexts_arr[0]);
+}
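+
+/*
+ * Illustrative usage note (assuming the sysfs files described in this file):
+ * to re-tune a running kdamond, user space updates the files under
+ * kdamonds/<N>/contexts/<N>/, e.g. monitoring_attrs/intervals/sample_us, and
+ * then writes "commit" to kdamonds/<N>/state, which reaches this function via
+ * damon_sysfs_cmd_request_callback() below.
+ */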
+
+/*
+ * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests.
+ * @c: The DAMON context of the callback.
+ *
+ * This function is periodically called back from the kdamond thread for @c.
+ * Then, it checks if there is a waiting DAMON sysfs request and handles it.
+ */
+static int damon_sysfs_cmd_request_callback(struct damon_ctx *c)
+{
+ struct damon_sysfs_kdamond *kdamond;
+ static bool damon_sysfs_schemes_regions_updating;
+ int err = 0;
+
+ /* avoid deadlock due to concurrent state_store('off') */
+ if (!damon_sysfs_schemes_regions_updating &&
+ !mutex_trylock(&damon_sysfs_lock))
+ return 0;
+ kdamond = damon_sysfs_cmd_request.kdamond;
+ if (!kdamond || kdamond->damon_ctx != c)
+ goto out;
+ switch (damon_sysfs_cmd_request.cmd) {
+ case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS:
+ err = damon_sysfs_upd_schemes_stats(kdamond);
+ break;
+ case DAMON_SYSFS_CMD_COMMIT:
+ err = damon_sysfs_commit_input(kdamond);
+ break;
+ case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS:
+ if (!damon_sysfs_schemes_regions_updating) {
+ err = damon_sysfs_upd_schemes_regions_start(kdamond);
+ if (!err) {
+ damon_sysfs_schemes_regions_updating = true;
+ goto keep_lock_out;
+ }
+ } else {
+ err = damon_sysfs_upd_schemes_regions_stop(kdamond);
+ damon_sysfs_schemes_regions_updating = false;
+ }
+ break;
+ case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS:
+ err = damon_sysfs_clear_schemes_regions(kdamond);
+ break;
+ default:
+ break;
+ }
+ /* Mark the request as invalid now. */
+ damon_sysfs_cmd_request.kdamond = NULL;
+out:
+ if (!damon_sysfs_schemes_regions_updating)
+ mutex_unlock(&damon_sysfs_lock);
+keep_lock_out:
+ return err;
+}
+
+static struct damon_ctx *damon_sysfs_build_ctx(
+ struct damon_sysfs_context *sys_ctx)
+{
+ struct damon_ctx *ctx = damon_new_ctx();
+ int err;
+
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+
+ err = damon_sysfs_apply_inputs(ctx, sys_ctx);
+ if (err) {
+ damon_destroy_ctx(ctx);
+ return ERR_PTR(err);
+ }
+
+ ctx->callback.after_wmarks_check = damon_sysfs_cmd_request_callback;
+ ctx->callback.after_aggregation = damon_sysfs_cmd_request_callback;
+ ctx->callback.before_terminate = damon_sysfs_before_terminate;
+ return ctx;
+}
+
+static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
+{
+ struct damon_ctx *ctx;
+ int err;
+
+ if (damon_sysfs_kdamond_running(kdamond))
+ return -EBUSY;
+ if (damon_sysfs_cmd_request.kdamond == kdamond)
+ return -EBUSY;
+ /* TODO: support multiple contexts per kdamond */
+ if (kdamond->contexts->nr != 1)
+ return -EINVAL;
+
+ if (kdamond->damon_ctx)
+ damon_destroy_ctx(kdamond->damon_ctx);
+ kdamond->damon_ctx = NULL;
+
+ ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ err = damon_start(&ctx, 1, false);
+ if (err) {
+ damon_destroy_ctx(ctx);
+ return err;
+ }
+ kdamond->damon_ctx = ctx;
+ return err;
+}
+
+static int damon_sysfs_turn_damon_off(struct damon_sysfs_kdamond *kdamond)
+{
+ if (!kdamond->damon_ctx)
+ return -EINVAL;
+ return damon_stop(&kdamond->damon_ctx, 1);
+ /*
+ * To let users read the final monitoring results of an already
+ * turned-off DAMON, we free kdamond->damon_ctx in the next
+ * damon_sysfs_turn_damon_on() or nr_kdamonds_store().
+ */
+}
+
+/*
+ * damon_sysfs_handle_cmd() - Handle a command for a specific kdamond.
+ * @cmd: The command to handle.
+ * @kdamond: The kobject wrapper for the associated kdamond.
+ *
+ * This function handles a DAMON sysfs command for a kdamond.  For commands
+ * that need to access running DAMON context-internal data, it asks the DAMON
+ * callback (@damon_sysfs_cmd_request_callback()) to handle the command, and
+ * waits until the command is properly handled or the kdamond terminates.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd,
+ struct damon_sysfs_kdamond *kdamond)
+{
+ bool need_wait = true;
+
+ /* Handle commands that don't access DAMON context-internal data */
+ switch (cmd) {
+ case DAMON_SYSFS_CMD_ON:
+ return damon_sysfs_turn_damon_on(kdamond);
+ case DAMON_SYSFS_CMD_OFF:
+ return damon_sysfs_turn_damon_off(kdamond);
+ default:
+ break;
+ }
+
+ /* Pass the command to DAMON callback for safe DAMON context access */
+ if (damon_sysfs_cmd_request.kdamond)
+ return -EBUSY;
+ if (!damon_sysfs_kdamond_running(kdamond))
+ return -EINVAL;
+ damon_sysfs_cmd_request.cmd = cmd;
+ damon_sysfs_cmd_request.kdamond = kdamond;
+
+ /*
+ * wait until damon_sysfs_cmd_request_callback() handles the request
+ * from the kdamond context
+ */
+ mutex_unlock(&damon_sysfs_lock);
+ while (need_wait) {
+ schedule_timeout_idle(msecs_to_jiffies(100));
+ if (!mutex_trylock(&damon_sysfs_lock))
+ continue;
+ if (!damon_sysfs_cmd_request.kdamond) {
+ /* damon_sysfs_cmd_request_callback() handled */
+ need_wait = false;
+ } else if (!damon_sysfs_kdamond_running(kdamond)) {
+ /* kdamond has already finished */
+ need_wait = false;
+ damon_sysfs_cmd_request.kdamond = NULL;
+ }
+ mutex_unlock(&damon_sysfs_lock);
+ }
+ mutex_lock(&damon_sysfs_lock);
+ return 0;
+}
+
+static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_kdamond *kdamond = container_of(kobj,
+ struct damon_sysfs_kdamond, kobj);
+ enum damon_sysfs_cmd cmd;
+ ssize_t ret = -EINVAL;
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ for (cmd = 0; cmd < NR_DAMON_SYSFS_CMDS; cmd++) {
+ if (sysfs_streq(buf, damon_sysfs_cmd_strs[cmd])) {
+ ret = damon_sysfs_handle_cmd(cmd, kdamond);
+ break;
+ }
+ }
+ mutex_unlock(&damon_sysfs_lock);
+ if (!ret)
+ ret = count;
+ return ret;
+}
+
+static ssize_t pid_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_kdamond *kdamond = container_of(kobj,
+ struct damon_sysfs_kdamond, kobj);
+ struct damon_ctx *ctx;
+ int pid = -1;
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ ctx = kdamond->damon_ctx;
+ if (!ctx)
+ goto out;
+
+ mutex_lock(&ctx->kdamond_lock);
+ if (ctx->kdamond)
+ pid = ctx->kdamond->pid;
+ mutex_unlock(&ctx->kdamond_lock);
+out:
+ mutex_unlock(&damon_sysfs_lock);
+ return sysfs_emit(buf, "%d\n", pid);
+}
+
+static void damon_sysfs_kdamond_release(struct kobject *kobj)
+{
+ struct damon_sysfs_kdamond *kdamond = container_of(kobj,
+ struct damon_sysfs_kdamond, kobj);
+
+ if (kdamond->damon_ctx)
+ damon_destroy_ctx(kdamond->damon_ctx);
+ kfree(kdamond);
+}
+
+static struct kobj_attribute damon_sysfs_kdamond_state_attr =
+ __ATTR_RW_MODE(state, 0600);
+
+static struct kobj_attribute damon_sysfs_kdamond_pid_attr =
+ __ATTR_RO_MODE(pid, 0400);
+
+static struct attribute *damon_sysfs_kdamond_attrs[] = {
+ &damon_sysfs_kdamond_state_attr.attr,
+ &damon_sysfs_kdamond_pid_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_kdamond);
+
+static const struct kobj_type damon_sysfs_kdamond_ktype = {
+ .release = damon_sysfs_kdamond_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_kdamond_groups,
+};
+
+/*
+ * kdamonds directory
+ */
+
+struct damon_sysfs_kdamonds {
+ struct kobject kobj;
+ struct damon_sysfs_kdamond **kdamonds_arr;
+ int nr;
+};
+
+static struct damon_sysfs_kdamonds *damon_sysfs_kdamonds_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_kdamonds), GFP_KERNEL);
+}
+
+static void damon_sysfs_kdamonds_rm_dirs(struct damon_sysfs_kdamonds *kdamonds)
+{
+ struct damon_sysfs_kdamond **kdamonds_arr = kdamonds->kdamonds_arr;
+ int i;
+
+ for (i = 0; i < kdamonds->nr; i++) {
+ damon_sysfs_kdamond_rm_dirs(kdamonds_arr[i]);
+ kobject_put(&kdamonds_arr[i]->kobj);
+ }
+ kdamonds->nr = 0;
+ kfree(kdamonds_arr);
+ kdamonds->kdamonds_arr = NULL;
+}
+
+static bool damon_sysfs_kdamonds_busy(struct damon_sysfs_kdamond **kdamonds,
+ int nr_kdamonds)
+{
+ int i;
+
+ for (i = 0; i < nr_kdamonds; i++) {
+ if (damon_sysfs_kdamond_running(kdamonds[i]) ||
+ damon_sysfs_cmd_request.kdamond == kdamonds[i])
+ return true;
+ }
+
+ return false;
+}
+
+static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds,
+ int nr_kdamonds)
+{
+ struct damon_sysfs_kdamond **kdamonds_arr, *kdamond;
+ int err, i;
+
+ if (damon_sysfs_kdamonds_busy(kdamonds->kdamonds_arr, kdamonds->nr))
+ return -EBUSY;
+
+ damon_sysfs_kdamonds_rm_dirs(kdamonds);
+ if (!nr_kdamonds)
+ return 0;
+
+ kdamonds_arr = kmalloc_array(nr_kdamonds, sizeof(*kdamonds_arr),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!kdamonds_arr)
+ return -ENOMEM;
+ kdamonds->kdamonds_arr = kdamonds_arr;
+
+ for (i = 0; i < nr_kdamonds; i++) {
+ kdamond = damon_sysfs_kdamond_alloc();
+ if (!kdamond) {
+ damon_sysfs_kdamonds_rm_dirs(kdamonds);
+ return -ENOMEM;
+ }
+
+ err = kobject_init_and_add(&kdamond->kobj,
+ &damon_sysfs_kdamond_ktype, &kdamonds->kobj,
+ "%d", i);
+ if (err)
+ goto out;
+
+ err = damon_sysfs_kdamond_add_dirs(kdamond);
+ if (err)
+ goto out;
+
+ kdamonds_arr[i] = kdamond;
+ kdamonds->nr++;
+ }
+ return 0;
+
+out:
+ damon_sysfs_kdamonds_rm_dirs(kdamonds);
+ kobject_put(&kdamond->kobj);
+ return err;
+}
+
+static ssize_t nr_kdamonds_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_kdamonds *kdamonds = container_of(kobj,
+ struct damon_sysfs_kdamonds, kobj);
+
+ return sysfs_emit(buf, "%d\n", kdamonds->nr);
+}
+
+static ssize_t nr_kdamonds_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_kdamonds *kdamonds;
+ int nr, err;
+
+ err = kstrtoint(buf, 0, &nr);
+ if (err)
+ return err;
+ if (nr < 0)
+ return -EINVAL;
+
+ kdamonds = container_of(kobj, struct damon_sysfs_kdamonds, kobj);
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ err = damon_sysfs_kdamonds_add_dirs(kdamonds, nr);
+ mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+
+ return count;
+}
+
+static void damon_sysfs_kdamonds_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_kdamonds, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_kdamonds_nr_attr =
+ __ATTR_RW_MODE(nr_kdamonds, 0600);
+
+static struct attribute *damon_sysfs_kdamonds_attrs[] = {
+ &damon_sysfs_kdamonds_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_kdamonds);
+
+static const struct kobj_type damon_sysfs_kdamonds_ktype = {
+ .release = damon_sysfs_kdamonds_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_kdamonds_groups,
+};
+
+/*
+ * damon user interface directory
+ */
+
+struct damon_sysfs_ui_dir {
+ struct kobject kobj;
+ struct damon_sysfs_kdamonds *kdamonds;
+};
+
+static struct damon_sysfs_ui_dir *damon_sysfs_ui_dir_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_ui_dir), GFP_KERNEL);
+}
+
+static int damon_sysfs_ui_dir_add_dirs(struct damon_sysfs_ui_dir *ui_dir)
+{
+ struct damon_sysfs_kdamonds *kdamonds;
+ int err;
+
+ kdamonds = damon_sysfs_kdamonds_alloc();
+ if (!kdamonds)
+ return -ENOMEM;
+
+ err = kobject_init_and_add(&kdamonds->kobj,
+ &damon_sysfs_kdamonds_ktype, &ui_dir->kobj,
+ "kdamonds");
+ if (err) {
+ kobject_put(&kdamonds->kobj);
+ return err;
+ }
+ ui_dir->kdamonds = kdamonds;
+ return err;
+}
+
+static void damon_sysfs_ui_dir_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_ui_dir, kobj));
+}
+
+static struct attribute *damon_sysfs_ui_dir_attrs[] = {
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_ui_dir);
+
+static const struct kobj_type damon_sysfs_ui_dir_ktype = {
+ .release = damon_sysfs_ui_dir_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_ui_dir_groups,
+};
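+
+/*
+ * Illustrative sketch of the hierarchy this file builds under
+ * /sys/kernel/mm/damon/ (assuming sysfs is mounted at /sys; the scheme and
+ * ul_range file names live in sysfs-schemes.c and sysfs-common.c):
+ *
+ *	admin/kdamonds/nr_kdamonds
+ *	admin/kdamonds/<N>/state
+ *	admin/kdamonds/<N>/pid
+ *	admin/kdamonds/<N>/contexts/nr_contexts
+ *	admin/kdamonds/<N>/contexts/<N>/avail_operations
+ *	admin/kdamonds/<N>/contexts/<N>/operations
+ *	admin/kdamonds/<N>/contexts/<N>/monitoring_attrs/intervals/{sample_us,aggr_us,update_us}
+ *	admin/kdamonds/<N>/contexts/<N>/monitoring_attrs/nr_regions/...
+ *	admin/kdamonds/<N>/contexts/<N>/targets/nr_targets
+ *	admin/kdamonds/<N>/contexts/<N>/targets/<N>/pid_target
+ *	admin/kdamonds/<N>/contexts/<N>/targets/<N>/regions/nr_regions
+ *	admin/kdamonds/<N>/contexts/<N>/targets/<N>/regions/<N>/{start,end}
+ *	admin/kdamonds/<N>/contexts/<N>/schemes/...
+ */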
+
+static int __init damon_sysfs_init(void)
+{
+ struct kobject *damon_sysfs_root;
+ struct damon_sysfs_ui_dir *admin;
+ int err;
+
+ damon_sysfs_root = kobject_create_and_add("damon", mm_kobj);
+ if (!damon_sysfs_root)
+ return -ENOMEM;
+
+ admin = damon_sysfs_ui_dir_alloc();
+ if (!admin) {
+ kobject_put(damon_sysfs_root);
+ return -ENOMEM;
+ }
+ err = kobject_init_and_add(&admin->kobj, &damon_sysfs_ui_dir_ktype,
+ damon_sysfs_root, "admin");
+ if (err)
+ goto out;
+ err = damon_sysfs_ui_dir_add_dirs(admin);
+ if (err)
+ goto out;
+ return 0;
+
+out:
+ kobject_put(&admin->kobj);
+ kobject_put(damon_sysfs_root);
+ return err;
+}
+subsys_initcall(damon_sysfs_init);
diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h
new file mode 100644
index 000000000000..c4b455b5ee30
--- /dev/null
+++ b/mm/damon/vaddr-test.h
@@ -0,0 +1,322 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Data Access Monitor Unit Tests
+ *
+ * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ *
+ * Author: SeongJae Park <sjpark@amazon.de>
+ */
+
+#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST
+
+#ifndef _DAMON_VADDR_TEST_H
+#define _DAMON_VADDR_TEST_H
+
+#include <kunit/test.h>
+
+static int __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas,
+ ssize_t nr_vmas)
+{
+ int i, ret = -ENOMEM;
+ MA_STATE(mas, mt, 0, 0);
+
+ if (!nr_vmas)
+ return 0;
+
+ mas_lock(&mas);
+ for (i = 0; i < nr_vmas; i++) {
+ mas_set_range(&mas, vmas[i].vm_start, vmas[i].vm_end - 1);
+ if (mas_store_gfp(&mas, &vmas[i], GFP_KERNEL))
+ goto failed;
+ }
+
+ ret = 0;
+failed:
+ mas_unlock(&mas);
+ return ret;
+}
+
+/*
+ * Test __damon_va_three_regions() function
+ *
+ * For virtual address space monitoring, DAMON converts the complex and
+ * dynamic memory mappings of each target task to three discontiguous regions
+ * which together cover every mapped area.  However, the three regions should
+ * not include the two biggest unmapped areas in the original mapping, because
+ * those two areas are normally the gaps between 1) the heap and the mmap()-ed
+ * regions, and 2) the mmap()-ed regions and the stack.  These two unmapped
+ * areas are huge but obviously never accessed, so covering them would just be
+ * a waste.
+ *
+ * __damon_va_three_regions() receives the address space of a process.  It
+ * first identifies the start of the mappings, the end of the mappings, and
+ * the two biggest unmapped areas.  Based on that information, it then
+ * constructs and returns the three regions.  For more detail, refer to the
+ * comment of the 'damon_init_regions_of()' function definition in the
+ * 'mm/damon.c' file.
+ *
+ * For example, suppose the virtual address ranges 10-20, 20-25, 200-210,
+ * 210-220, 300-305, and 307-330 (other comments represent these mappings in
+ * the shorter form: 10-20-25, 200-210-220, 300-305, 307-330) of a process are
+ * mapped.  To cover every mapping, the three regions should start at 10 and
+ * end at 330.  The process also has three unmapped areas: 25-200, 220-300,
+ * and 305-307.  Among those, 25-200 and 220-300 are the two biggest unmapped
+ * areas, so the mapping should be converted to the three regions 10-25,
+ * 200-220, and 300-330.
+ */
+static void damon_test_three_regions_in_vmas(struct kunit *test)
+{
+ static struct mm_struct mm;
+ struct damon_addr_range regions[3] = {0,};
+ /* 10-20-25, 200-210-220, 300-305, 307-330 */
+ struct vm_area_struct vmas[] = {
+ (struct vm_area_struct) {.vm_start = 10, .vm_end = 20},
+ (struct vm_area_struct) {.vm_start = 20, .vm_end = 25},
+ (struct vm_area_struct) {.vm_start = 200, .vm_end = 210},
+ (struct vm_area_struct) {.vm_start = 210, .vm_end = 220},
+ (struct vm_area_struct) {.vm_start = 300, .vm_end = 305},
+ (struct vm_area_struct) {.vm_start = 307, .vm_end = 330},
+ };
+
+ mt_init_flags(&mm.mm_mt, MM_MT_FLAGS);
+ if (__link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas)))
+ kunit_skip(test, "Failed to create VMA tree");
+
+ __damon_va_three_regions(&mm, regions);
+
+ KUNIT_EXPECT_EQ(test, 10ul, regions[0].start);
+ KUNIT_EXPECT_EQ(test, 25ul, regions[0].end);
+ KUNIT_EXPECT_EQ(test, 200ul, regions[1].start);
+ KUNIT_EXPECT_EQ(test, 220ul, regions[1].end);
+ KUNIT_EXPECT_EQ(test, 300ul, regions[2].start);
+ KUNIT_EXPECT_EQ(test, 330ul, regions[2].end);
+}
+
+static struct damon_region *__nth_region_of(struct damon_target *t, int idx)
+{
+ struct damon_region *r;
+ unsigned int i = 0;
+
+ damon_for_each_region(r, t) {
+ if (i++ == idx)
+ return r;
+ }
+
+ return NULL;
+}
+
+/*
+ * Test 'damon_set_regions()'
+ *
+ * test kunit object
+ * regions an array containing start/end addresses of current
+ * monitoring target regions
+ * nr_regions the number of the addresses in 'regions'
+ * three_regions The three regions that need to be applied now
+ * expected start/end addresses of the monitoring target regions after
+ * 'three_regions' is applied
+ * nr_expected the number of addresses in 'expected'
+ *
+ * The memory mappings of the target processes change dynamically.  To follow
+ * the changes, DAMON periodically reads the mappings, simplifies them to the
+ * three regions, and updates the monitoring target regions to fit in the
+ * three regions.  Updating the current target regions is the role of
+ * 'damon_set_regions()'.
+ *
+ * This test passes the given target regions and the new three regions to the
+ * function, and checks whether it updates the target regions as expected.
+ */
+static void damon_do_test_apply_three_regions(struct kunit *test,
+ unsigned long *regions, int nr_regions,
+ struct damon_addr_range *three_regions,
+ unsigned long *expected, int nr_expected)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ int i;
+
+ t = damon_new_target();
+ for (i = 0; i < nr_regions / 2; i++) {
+ r = damon_new_region(regions[i * 2], regions[i * 2 + 1]);
+ damon_add_region(r, t);
+ }
+
+ damon_set_regions(t, three_regions, 3);
+
+ for (i = 0; i < nr_expected / 2; i++) {
+ r = __nth_region_of(t, i);
+ KUNIT_EXPECT_EQ(test, r->ar.start, expected[i * 2]);
+ KUNIT_EXPECT_EQ(test, r->ar.end, expected[i * 2 + 1]);
+ }
+}
+
+/*
+ * This function tests the most common case, where the three big regions are
+ * only slightly changed.  Target regions should adjust their boundaries
+ * (10-20-30, 50-55, 70-80, 90-100) to fit the new big regions, or be removed
+ * (55-57 and 57-59) as they are now out of the three regions.
+ */
+static void damon_test_apply_three_regions1(struct kunit *test)
+{
+ /* 10-20-30, 50-55-57-59, 70-80-90-100 */
+ unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59,
+ 70, 80, 80, 90, 90, 100};
+ /* 5-27, 45-55, 73-104 */
+ struct damon_addr_range new_three_regions[3] = {
+ (struct damon_addr_range){.start = 5, .end = 27},
+ (struct damon_addr_range){.start = 45, .end = 55},
+ (struct damon_addr_range){.start = 73, .end = 104} };
+ /* 5-20-27, 45-55, 73-80-90-104 */
+ unsigned long expected[] = {5, 20, 20, 27, 45, 55,
+ 73, 80, 80, 90, 90, 104};
+
+ damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions),
+ new_three_regions, expected, ARRAY_SIZE(expected));
+}
+
+/*
+ * Test a slightly bigger change.  Similar to the above, but the second big
+ * region now requires two target regions (50-55, 57-59) to be removed.
+ */
+static void damon_test_apply_three_regions2(struct kunit *test)
+{
+ /* 10-20-30, 50-55-57-59, 70-80-90-100 */
+ unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59,
+ 70, 80, 80, 90, 90, 100};
+ /* 5-27, 56-57, 65-104 */
+ struct damon_addr_range new_three_regions[3] = {
+ (struct damon_addr_range){.start = 5, .end = 27},
+ (struct damon_addr_range){.start = 56, .end = 57},
+ (struct damon_addr_range){.start = 65, .end = 104} };
+ /* 5-20-27, 56-57, 65-80-90-104 */
+ unsigned long expected[] = {5, 20, 20, 27, 56, 57,
+ 65, 80, 80, 90, 90, 104};
+
+ damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions),
+ new_three_regions, expected, ARRAY_SIZE(expected));
+}
+
+/*
+ * Test a big change.  The second big region has been totally freed and mapped
+ * to a different area (50-59 -> 61-63).  The target regions which were in the
+ * old second big region (50-55-57-59) should be removed, and a new target
+ * region covering the new second big region (61-63) should be created.
+ */
+static void damon_test_apply_three_regions3(struct kunit *test)
+{
+ /* 10-20-30, 50-55-57-59, 70-80-90-100 */
+ unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59,
+ 70, 80, 80, 90, 90, 100};
+ /* 5-27, 61-63, 65-104 */
+ struct damon_addr_range new_three_regions[3] = {
+ (struct damon_addr_range){.start = 5, .end = 27},
+ (struct damon_addr_range){.start = 61, .end = 63},
+ (struct damon_addr_range){.start = 65, .end = 104} };
+ /* 5-20-27, 61-63, 65-80-90-104 */
+ unsigned long expected[] = {5, 20, 20, 27, 61, 63,
+ 65, 80, 80, 90, 90, 104};
+
+ damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions),
+ new_three_regions, expected, ARRAY_SIZE(expected));
+}
+
+/*
+ * Test another big change.  Both the second and third big regions (50-59 and
+ * 70-100) have been totally freed and mapped to different areas (30-32 and
+ * 65-68).  The target regions which were in the old second and third big
+ * regions should now be removed, and new target regions covering the new
+ * second and third big regions should be created.
+ */
+static void damon_test_apply_three_regions4(struct kunit *test)
+{
+ /* 10-20-30, 50-55-57-59, 70-80-90-100 */
+ unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59,
+ 70, 80, 80, 90, 90, 100};
+ /* 5-7, 30-32, 65-68 */
+ struct damon_addr_range new_three_regions[3] = {
+ (struct damon_addr_range){.start = 5, .end = 7},
+ (struct damon_addr_range){.start = 30, .end = 32},
+ (struct damon_addr_range){.start = 65, .end = 68} };
+ /* expect 5-7, 30-32, 65-68 */
+ unsigned long expected[] = {5, 7, 30, 32, 65, 68};
+
+ damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions),
+ new_three_regions, expected, ARRAY_SIZE(expected));
+}
+
+static void damon_test_split_evenly_fail(struct kunit *test,
+ unsigned long start, unsigned long end, unsigned int nr_pieces)
+{
+ struct damon_target *t = damon_new_target();
+ struct damon_region *r = damon_new_region(start, end);
+
+ damon_add_region(r, t);
+ KUNIT_EXPECT_EQ(test,
+ damon_va_evenly_split_region(t, r, nr_pieces), -EINVAL);
+ KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1u);
+
+ damon_for_each_region(r, t) {
+ KUNIT_EXPECT_EQ(test, r->ar.start, start);
+ KUNIT_EXPECT_EQ(test, r->ar.end, end);
+ }
+
+ damon_free_target(t);
+}
+
+static void damon_test_split_evenly_succ(struct kunit *test,
+ unsigned long start, unsigned long end, unsigned int nr_pieces)
+{
+ struct damon_target *t = damon_new_target();
+ struct damon_region *r = damon_new_region(start, end);
+ unsigned long expected_width = (end - start) / nr_pieces;
+ unsigned long i = 0;
+
+ damon_add_region(r, t);
+ KUNIT_EXPECT_EQ(test,
+ damon_va_evenly_split_region(t, r, nr_pieces), 0);
+ KUNIT_EXPECT_EQ(test, damon_nr_regions(t), nr_pieces);
+
+ damon_for_each_region(r, t) {
+ if (i == nr_pieces - 1) {
+ KUNIT_EXPECT_EQ(test,
+ r->ar.start, start + i * expected_width);
+ KUNIT_EXPECT_EQ(test, r->ar.end, end);
+ break;
+ }
+ KUNIT_EXPECT_EQ(test,
+ r->ar.start, start + i++ * expected_width);
+ KUNIT_EXPECT_EQ(test, r->ar.end, start + i * expected_width);
+ }
+ damon_free_target(t);
+}
+
+static void damon_test_split_evenly(struct kunit *test)
+{
+ KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(NULL, NULL, 5),
+ -EINVAL);
+
+ damon_test_split_evenly_fail(test, 0, 100, 0);
+ damon_test_split_evenly_succ(test, 0, 100, 10);
+ damon_test_split_evenly_succ(test, 5, 59, 5);
+ damon_test_split_evenly_fail(test, 5, 6, 2);
+}
+
+static struct kunit_case damon_test_cases[] = {
+ KUNIT_CASE(damon_test_three_regions_in_vmas),
+ KUNIT_CASE(damon_test_apply_three_regions1),
+ KUNIT_CASE(damon_test_apply_three_regions2),
+ KUNIT_CASE(damon_test_apply_three_regions3),
+ KUNIT_CASE(damon_test_apply_three_regions4),
+ KUNIT_CASE(damon_test_split_evenly),
+ {},
+};
+
+static struct kunit_suite damon_test_suite = {
+ .name = "damon-operations",
+ .test_cases = damon_test_cases,
+};
+kunit_test_suite(damon_test_suite);
+
+#endif /* _DAMON_VADDR_TEST_H */
+
+#endif /* CONFIG_DAMON_VADDR_KUNIT_TEST */
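The suite above can be exercised with the KUnit tooling. A minimal kunitconfig sketch follows; CONFIG_DAMON_VADDR_KUNIT_TEST is taken from this file, while the CONFIG_DAMON and CONFIG_DAMON_VADDR dependency names are assumptions based on the usual mm/damon Kconfig layout.

# sketch of a .kunitconfig fragment for the damon-operations suite
# (dependency option names are assumed, not taken from this patch)
CONFIG_KUNIT=y
CONFIG_DAMON=y
CONFIG_DAMON_VADDR=y
CONFIG_DAMON_VADDR_KUNIT_TEST=y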
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
new file mode 100644
index 000000000000..e0e59d420fca
--- /dev/null
+++ b/mm/damon/vaddr.c
@@ -0,0 +1,721 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DAMON Primitives for Virtual Address Spaces
+ *
+ * Author: SeongJae Park <sjpark@amazon.de>
+ */
+
+#define pr_fmt(fmt) "damon-va: " fmt
+
+#include <asm-generic/mman-common.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
+#include <linux/pagewalk.h>
+#include <linux/sched/mm.h>
+
+#include "ops-common.h"
+
+#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST
+#undef DAMON_MIN_REGION
+#define DAMON_MIN_REGION 1
+#endif
+
+/*
+ * 't->pid' should be the pointer to the relevant 'struct pid' on which a
+ * reference is held. The caller must put the returned task, unless it is NULL.
+ */
+static inline struct task_struct *damon_get_task_struct(struct damon_target *t)
+{
+ return get_pid_task(t->pid, PIDTYPE_PID);
+}
+
+/*
+ * Get the mm_struct of the given target
+ *
+ * Caller _must_ put the mm_struct after use, unless it is NULL.
+ *
+ * Returns the mm_struct of the target on success, NULL on failure
+ */
+static struct mm_struct *damon_get_mm(struct damon_target *t)
+{
+ struct task_struct *task;
+ struct mm_struct *mm;
+
+ task = damon_get_task_struct(t);
+ if (!task)
+ return NULL;
+
+ mm = get_task_mm(task);
+ put_task_struct(task);
+ return mm;
+}
+
+/*
+ * Functions for the initial monitoring target regions construction
+ */
+
+/*
+ * Size-evenly split a region into 'nr_pieces' small regions
+ *
+ * Returns 0 on success, or negative error code otherwise.
+ */
+static int damon_va_evenly_split_region(struct damon_target *t,
+ struct damon_region *r, unsigned int nr_pieces)
+{
+ unsigned long sz_orig, sz_piece, orig_end;
+ struct damon_region *n = NULL, *next;
+ unsigned long start;
+
+ if (!r || !nr_pieces)
+ return -EINVAL;
+
+ orig_end = r->ar.end;
+ sz_orig = damon_sz_region(r);
+ sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION);
+
+ if (!sz_piece)
+ return -EINVAL;
+
+ r->ar.end = r->ar.start + sz_piece;
+ next = damon_next_region(r);
+ for (start = r->ar.end; start + sz_piece <= orig_end;
+ start += sz_piece) {
+ n = damon_new_region(start, start + sz_piece);
+ if (!n)
+ return -ENOMEM;
+ damon_insert_region(n, r, next, t);
+ r = n;
+ }
+ /* extend the last region to cover any rounding remainder */
+ if (n)
+ n->ar.end = orig_end;
+
+ return 0;
+}
+
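A standalone sketch of the split arithmetic implemented by damon_va_evenly_split_region() above, with the kernel list handling stripped away (plain userspace C, illustration only):

/*
 * Each piece gets ALIGN_DOWN((end - start) / nr_pieces, min_region) bytes;
 * the final piece absorbs any rounding remainder, as in the kernel code.
 */
#include <stdio.h>

#define ALIGN_DOWN(x, a)	((x) / (a) * (a))

static void split_evenly(unsigned long start, unsigned long end,
			 unsigned int nr_pieces, unsigned long min_region)
{
	unsigned long sz_piece, s;

	if (!nr_pieces)
		return;				/* mirrors the -EINVAL case */
	sz_piece = ALIGN_DOWN((end - start) / nr_pieces, min_region);
	if (!sz_piece)
		return;				/* mirrors the -EINVAL case */
	for (s = start; s + 2 * sz_piece <= end; s += sz_piece)
		printf("[%lu, %lu)\n", s, s + sz_piece);
	printf("[%lu, %lu)\n", s, end);		/* last piece takes the remainder */
}

int main(void)
{
	/* matches damon_test_split_evenly_succ(test, 5, 59, 5) above */
	split_evenly(5, 59, 5, 1);
	return 0;
}

With these inputs the pieces are [5, 15), [15, 25), [25, 35), [35, 45) and [45, 59), which is what the KUnit test expects.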
+static unsigned long sz_range(struct damon_addr_range *r)
+{
+ return r->end - r->start;
+}
+
+/*
+ * Find three regions separated by two biggest unmapped regions
+ *
+ * mm		the mm_struct of the target address space
+ * regions	an array of three address ranges in which the results are saved
+ *
+ * This function receives an address space and finds three regions in it that
+ * are separated by the two biggest unmapped regions in the space. Please
+ * refer to the comments of '__damon_va_init_regions()' below for why this is
+ * necessary.
+ *
+ * Returns 0 on success, or a negative error code otherwise.
+ */
+static int __damon_va_three_regions(struct mm_struct *mm,
+ struct damon_addr_range regions[3])
+{
+ struct damon_addr_range first_gap = {0}, second_gap = {0};
+ VMA_ITERATOR(vmi, mm, 0);
+ struct vm_area_struct *vma, *prev = NULL;
+ unsigned long start;
+
+ /*
+ * Find the two biggest gaps so that first_gap > second_gap > others.
+ * If this is too slow, it can be optimised to examine the maple
+ * tree gaps.
+ */
+ for_each_vma(vmi, vma) {
+ unsigned long gap;
+
+ if (!prev) {
+ start = vma->vm_start;
+ goto next;
+ }
+ gap = vma->vm_start - prev->vm_end;
+
+ if (gap > sz_range(&first_gap)) {
+ second_gap = first_gap;
+ first_gap.start = prev->vm_end;
+ first_gap.end = vma->vm_start;
+ } else if (gap > sz_range(&second_gap)) {
+ second_gap.start = prev->vm_end;
+ second_gap.end = vma->vm_start;
+ }
+next:
+ prev = vma;
+ }
+
+ if (!sz_range(&second_gap) || !sz_range(&first_gap))
+ return -EINVAL;
+
+ /* Sort the two biggest gaps by address */
+ if (first_gap.start > second_gap.start)
+ swap(first_gap, second_gap);
+
+ /* Store the result */
+ regions[0].start = ALIGN(start, DAMON_MIN_REGION);
+ regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION);
+ regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION);
+ regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION);
+ regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION);
+ regions[2].end = ALIGN(prev->vm_end, DAMON_MIN_REGION);
+
+ return 0;
+}
+
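Here is a compact userspace illustration of the gap selection above (the data is made up and the maple-tree walk is replaced by a plain array): keep the two largest gaps between consecutive mappings, sort them by address, and report the three covering regions.

#include <stdio.h>

struct range { unsigned long start, end; };

int main(void)
{
	/* an address-sorted toy mapping: heap, two mmap()-ed areas, stack */
	struct range vmas[] = { {10, 20}, {50, 55}, {57, 59}, {70, 100} };
	unsigned long nr = sizeof(vmas) / sizeof(vmas[0]);
	struct range gap1 = {0, 0}, gap2 = {0, 0};
	unsigned long i;

	for (i = 1; i < nr; i++) {
		unsigned long gap = vmas[i].start - vmas[i - 1].end;

		if (gap > gap1.end - gap1.start) {
			gap2 = gap1;
			gap1.start = vmas[i - 1].end;
			gap1.end = vmas[i].start;
		} else if (gap > gap2.end - gap2.start) {
			gap2.start = vmas[i - 1].end;
			gap2.end = vmas[i].start;
		}
	}
	if (gap1.start > gap2.start) {		/* sort the two gaps by address */
		struct range tmp = gap1;

		gap1 = gap2;
		gap2 = tmp;
	}
	printf("[%lu, %lu) [%lu, %lu) [%lu, %lu)\n",
	       vmas[0].start, gap1.start, gap1.end, gap2.start,
	       gap2.end, vmas[nr - 1].end);	/* [10, 20) [50, 59) [70, 100) */
	return 0;
}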
+/*
+ * Get the three regions in the given target (task)
+ *
+ * Returns 0 on success, negative error code otherwise.
+ */
+static int damon_va_three_regions(struct damon_target *t,
+ struct damon_addr_range regions[3])
+{
+ struct mm_struct *mm;
+ int rc;
+
+ mm = damon_get_mm(t);
+ if (!mm)
+ return -EINVAL;
+
+ mmap_read_lock(mm);
+ rc = __damon_va_three_regions(mm, regions);
+ mmap_read_unlock(mm);
+
+ mmput(mm);
+ return rc;
+}
+
+/*
+ * Initialize the monitoring target regions for the given target (task)
+ *
+ * t the given target
+ *
+ * Because only a small number of portions of the entire address space are
+ * actually mapped to memory and accessed, monitoring the unmapped regions is
+ * wasteful. That said, because we can tolerate a small amount of noise,
+ * tracking every mapping is not strictly required; it could even incur a high
+ * overhead if the mappings change frequently or the number of mappings is
+ * high. The adaptive regions adjustment mechanism will further help to deal
+ * with the noise by simply identifying unmapped areas as regions that receive
+ * no access. Moreover, applying the real mappings, which would have many
+ * unmapped areas inside, would make the adaptive mechanism quite complex.
+ * Hence, excessively large unmapped areas inside the monitoring target should
+ * be removed so they do not consume the adaptive mechanism's time.
+ *
+ * For this reason, we convert the complex mappings into three distinct regions
+ * that cover every mapped area of the address space, while the two gaps
+ * between the three regions are the two biggest unmapped areas in the given
+ * address space. In detail, this function first identifies the start and the
+ * end of the mappings and the two biggest unmapped areas of the address space.
+ * Then, it constructs the three regions as below:
+ *
+ * [mappings[0]->start, big_two_unmapped_areas[0]->start)
+ * [big_two_unmapped_areas[0]->end, big_two_unmapped_areas[1]->start)
+ * [big_two_unmapped_areas[1]->end, mappings[nr_mappings - 1]->end)
+ *
+ * As the usual memory map of processes looks like below, the gap between the
+ * heap and the uppermost mmap()-ed region, and the gap between the lowermost
+ * mmap()-ed region and the stack, will be the two biggest unmapped regions.
+ * Because these gaps are exceptionally huge in a usual address space,
+ * excluding them is sufficient to make this trade-off.
+ *
+ * <heap>
+ * <BIG UNMAPPED REGION 1>
+ * <uppermost mmap()-ed region>
+ * (other mmap()-ed regions and small unmapped regions)
+ * <lowermost mmap()-ed region>
+ * <BIG UNMAPPED REGION 2>
+ * <stack>
+ */
+static void __damon_va_init_regions(struct damon_ctx *ctx,
+ struct damon_target *t)
+{
+ struct damon_target *ti;
+ struct damon_region *r;
+ struct damon_addr_range regions[3];
+ unsigned long sz = 0, nr_pieces;
+ int i, tidx = 0;
+
+ if (damon_va_three_regions(t, regions)) {
+ damon_for_each_target(ti, ctx) {
+ if (ti == t)
+ break;
+ tidx++;
+ }
+ pr_debug("Failed to get three regions of %dth target\n", tidx);
+ return;
+ }
+
+ for (i = 0; i < 3; i++)
+ sz += regions[i].end - regions[i].start;
+ if (ctx->attrs.min_nr_regions)
+ sz /= ctx->attrs.min_nr_regions;
+ if (sz < DAMON_MIN_REGION)
+ sz = DAMON_MIN_REGION;
+
+ /* Set the initial three regions of the target */
+ for (i = 0; i < 3; i++) {
+ r = damon_new_region(regions[i].start, regions[i].end);
+ if (!r) {
+ pr_err("%d'th init region creation failed\n", i);
+ return;
+ }
+ damon_add_region(r, t);
+
+ nr_pieces = (regions[i].end - regions[i].start) / sz;
+ damon_va_evenly_split_region(t, r, nr_pieces);
+ }
+}
+
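As a purely illustrative example of the sizing above (numbers invented): if the three regions span 10, 9, and 30 address units (49 in total) and 'min_nr_regions' is 10, 'sz' becomes 49 / 10 = 4 and stays there as long as DAMON_MIN_REGION is not larger. The regions are then split into 10 / 4 = 2, 9 / 4 = 2 and 30 / 4 = 7 pieces, giving 11 initial regions, roughly the requested minimum.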
+/* Initialize '->regions_list' of every target (task) */
+static void damon_va_init(struct damon_ctx *ctx)
+{
+ struct damon_target *t;
+
+ damon_for_each_target(t, ctx) {
+ /* the user may set the target regions as they want */
+ if (!damon_nr_regions(t))
+ __damon_va_init_regions(ctx, t);
+ }
+}
+
+/*
+ * Update regions for current memory mappings
+ */
+static void damon_va_update(struct damon_ctx *ctx)
+{
+ struct damon_addr_range three_regions[3];
+ struct damon_target *t;
+
+ damon_for_each_target(t, ctx) {
+ if (damon_va_three_regions(t, three_regions))
+ continue;
+ damon_set_regions(t, three_regions, 3);
+ }
+}
+
+static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ if (pmd_trans_huge(*pmd)) {
+ ptl = pmd_lock(walk->mm, pmd);
+ if (!pmd_present(*pmd)) {
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (pmd_trans_huge(*pmd)) {
+ damon_pmdp_mkold(pmd, walk->vma, addr);
+ spin_unlock(ptl);
+ return 0;
+ }
+ spin_unlock(ptl);
+ }
+
+ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
+ if (!pte_present(ptep_get(pte)))
+ goto out;
+ damon_ptep_mkold(pte, walk->vma, addr);
+out:
+ pte_unmap_unlock(pte, ptl);
+ return 0;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ bool referenced = false;
+ pte_t entry = huge_ptep_get(pte);
+ struct folio *folio = pfn_folio(pte_pfn(entry));
+
+ folio_get(folio);
+
+ if (pte_young(entry)) {
+ referenced = true;
+ entry = pte_mkold(entry);
+ set_huge_pte_at(mm, addr, pte, entry);
+ }
+
+#ifdef CONFIG_MMU_NOTIFIER
+ if (mmu_notifier_clear_young(mm, addr,
+ addr + huge_page_size(hstate_vma(vma))))
+ referenced = true;
+#endif /* CONFIG_MMU_NOTIFIER */
+
+ if (referenced)
+ folio_set_young(folio);
+
+ folio_set_idle(folio);
+ folio_put(folio);
+}
+
+static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hstate *h = hstate_vma(walk->vma);
+ spinlock_t *ptl;
+ pte_t entry;
+
+ ptl = huge_pte_lock(h, walk->mm, pte);
+ entry = huge_ptep_get(pte);
+ if (!pte_present(entry))
+ goto out;
+
+ damon_hugetlb_mkold(pte, walk->mm, walk->vma, addr);
+
+out:
+ spin_unlock(ptl);
+ return 0;
+}
+#else
+#define damon_mkold_hugetlb_entry NULL
+#endif /* CONFIG_HUGETLB_PAGE */
+
+static const struct mm_walk_ops damon_mkold_ops = {
+ .pmd_entry = damon_mkold_pmd_entry,
+ .hugetlb_entry = damon_mkold_hugetlb_entry,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
+{
+ mmap_read_lock(mm);
+ walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL);
+ mmap_read_unlock(mm);
+}
+
+/*
+ * Functions for the access checking of the regions
+ */
+
+static void __damon_va_prepare_access_check(struct mm_struct *mm,
+ struct damon_region *r)
+{
+ r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
+
+ damon_va_mkold(mm, r->sampling_addr);
+}
+
+static void damon_va_prepare_access_checks(struct damon_ctx *ctx)
+{
+ struct damon_target *t;
+ struct mm_struct *mm;
+ struct damon_region *r;
+
+ damon_for_each_target(t, ctx) {
+ mm = damon_get_mm(t);
+ if (!mm)
+ continue;
+ damon_for_each_region(r, t)
+ __damon_va_prepare_access_check(mm, r);
+ mmput(mm);
+ }
+}
+
+struct damon_young_walk_private {
+ /* size of the folio for the access checked virtual memory address */
+ unsigned long *folio_sz;
+ bool young;
+};
+
+static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pte_t *pte;
+ pte_t ptent;
+ spinlock_t *ptl;
+ struct folio *folio;
+ struct damon_young_walk_private *priv = walk->private;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge(*pmd)) {
+ ptl = pmd_lock(walk->mm, pmd);
+ if (!pmd_present(*pmd)) {
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (!pmd_trans_huge(*pmd)) {
+ spin_unlock(ptl);
+ goto regular_page;
+ }
+ folio = damon_get_folio(pmd_pfn(*pmd));
+ if (!folio)
+ goto huge_out;
+ if (pmd_young(*pmd) || !folio_test_idle(folio) ||
+ mmu_notifier_test_young(walk->mm,
+ addr))
+ priv->young = true;
+ *priv->folio_sz = HPAGE_PMD_SIZE;
+ folio_put(folio);
+huge_out:
+ spin_unlock(ptl);
+ return 0;
+ }
+
+regular_page:
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
+ ptent = ptep_get(pte);
+ if (!pte_present(ptent))
+ goto out;
+ folio = damon_get_folio(pte_pfn(ptent));
+ if (!folio)
+ goto out;
+ if (pte_young(ptent) || !folio_test_idle(folio) ||
+ mmu_notifier_test_young(walk->mm, addr))
+ priv->young = true;
+ *priv->folio_sz = folio_size(folio);
+ folio_put(folio);
+out:
+ pte_unmap_unlock(pte, ptl);
+ return 0;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct damon_young_walk_private *priv = walk->private;
+ struct hstate *h = hstate_vma(walk->vma);
+ struct folio *folio;
+ spinlock_t *ptl;
+ pte_t entry;
+
+ ptl = huge_pte_lock(h, walk->mm, pte);
+ entry = huge_ptep_get(pte);
+ if (!pte_present(entry))
+ goto out;
+
+ folio = pfn_folio(pte_pfn(entry));
+ folio_get(folio);
+
+ if (pte_young(entry) || !folio_test_idle(folio) ||
+ mmu_notifier_test_young(walk->mm, addr))
+ priv->young = true;
+ *priv->folio_sz = huge_page_size(h);
+
+ folio_put(folio);
+
+out:
+ spin_unlock(ptl);
+ return 0;
+}
+#else
+#define damon_young_hugetlb_entry NULL
+#endif /* CONFIG_HUGETLB_PAGE */
+
+static const struct mm_walk_ops damon_young_ops = {
+ .pmd_entry = damon_young_pmd_entry,
+ .hugetlb_entry = damon_young_hugetlb_entry,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
+ unsigned long *folio_sz)
+{
+ struct damon_young_walk_private arg = {
+ .folio_sz = folio_sz,
+ .young = false,
+ };
+
+ mmap_read_lock(mm);
+ walk_page_range(mm, addr, addr + 1, &damon_young_ops, &arg);
+ mmap_read_unlock(mm);
+ return arg.young;
+}
+
+/*
+ * Check whether the region was accessed after the last preparation
+ *
+ * mm 'mm_struct' for the given virtual address space
+ * r the region to be checked
+ */
+static void __damon_va_check_access(struct mm_struct *mm,
+ struct damon_region *r, bool same_target)
+{
+ static unsigned long last_addr;
+ static unsigned long last_folio_sz = PAGE_SIZE;
+ static bool last_accessed;
+
+ /* If the region is in the last checked page, reuse the result */
+ if (same_target && (ALIGN_DOWN(last_addr, last_folio_sz) ==
+ ALIGN_DOWN(r->sampling_addr, last_folio_sz))) {
+ if (last_accessed)
+ r->nr_accesses++;
+ return;
+ }
+
+ last_accessed = damon_va_young(mm, r->sampling_addr, &last_folio_sz);
+ if (last_accessed)
+ r->nr_accesses++;
+
+ last_addr = r->sampling_addr;
+}
+
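A tiny userspace sketch of the same-folio check used above (the addresses and the 2 MiB folio size are invented for illustration; the kernel reuses the folio size reported by the previous damon_va_young() call):

#include <stdio.h>

#define ALIGN_DOWN(x, a)	((x) / (a) * (a))	/* simplified version */

int main(void)
{
	unsigned long folio_sz = 2UL << 20;		/* pretend a 2 MiB THP */
	unsigned long prev = 0x201000UL, cur = 0x3ff000UL;

	/* both sampling addresses fall in [0x200000, 0x400000), so the
	 * second region could reuse the first access-check result */
	printf("%d\n", ALIGN_DOWN(prev, folio_sz) == ALIGN_DOWN(cur, folio_sz));
	return 0;
}

This prints 1; a region whose sampling address lands in a different folio would take the full page table walk instead.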
+static unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
+{
+ struct damon_target *t;
+ struct mm_struct *mm;
+ struct damon_region *r;
+ unsigned int max_nr_accesses = 0;
+ bool same_target;
+
+ damon_for_each_target(t, ctx) {
+ mm = damon_get_mm(t);
+ if (!mm)
+ continue;
+ same_target = false;
+ damon_for_each_region(r, t) {
+ __damon_va_check_access(mm, r, same_target);
+ max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
+ same_target = true;
+ }
+ mmput(mm);
+ }
+
+ return max_nr_accesses;
+}
+
+/*
+ * Functions for the target validity check and cleanup
+ */
+
+static bool damon_va_target_valid(struct damon_target *t)
+{
+ struct task_struct *task;
+
+ task = damon_get_task_struct(t);
+ if (task) {
+ put_task_struct(task);
+ return true;
+ }
+
+ return false;
+}
+
+#ifndef CONFIG_ADVISE_SYSCALLS
+static unsigned long damos_madvise(struct damon_target *target,
+ struct damon_region *r, int behavior)
+{
+ return 0;
+}
+#else
+static unsigned long damos_madvise(struct damon_target *target,
+ struct damon_region *r, int behavior)
+{
+ struct mm_struct *mm;
+ unsigned long start = PAGE_ALIGN(r->ar.start);
+ unsigned long len = PAGE_ALIGN(damon_sz_region(r));
+ unsigned long applied;
+
+ mm = damon_get_mm(target);
+ if (!mm)
+ return 0;
+
+ applied = do_madvise(mm, start, len, behavior) ? 0 : len;
+ mmput(mm);
+
+ return applied;
+}
+#endif /* CONFIG_ADVISE_SYSCALLS */
+
+static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
+ struct damon_target *t, struct damon_region *r,
+ struct damos *scheme)
+{
+ int madv_action;
+
+ switch (scheme->action) {
+ case DAMOS_WILLNEED:
+ madv_action = MADV_WILLNEED;
+ break;
+ case DAMOS_COLD:
+ madv_action = MADV_COLD;
+ break;
+ case DAMOS_PAGEOUT:
+ madv_action = MADV_PAGEOUT;
+ break;
+ case DAMOS_HUGEPAGE:
+ madv_action = MADV_HUGEPAGE;
+ break;
+ case DAMOS_NOHUGEPAGE:
+ madv_action = MADV_NOHUGEPAGE;
+ break;
+ case DAMOS_STAT:
+ return 0;
+ default:
+ /*
+ * DAMOS actions that are not yet supported by 'vaddr'.
+ */
+ return 0;
+ }
+
+ return damos_madvise(t, r, madv_action);
+}
+
+static int damon_va_scheme_score(struct damon_ctx *context,
+ struct damon_target *t, struct damon_region *r,
+ struct damos *scheme)
+{
+
+ switch (scheme->action) {
+ case DAMOS_PAGEOUT:
+ return damon_cold_score(context, r, scheme);
+ default:
+ break;
+ }
+
+ return DAMOS_MAX_SCORE;
+}
+
+static int __init damon_va_initcall(void)
+{
+ struct damon_operations ops = {
+ .id = DAMON_OPS_VADDR,
+ .init = damon_va_init,
+ .update = damon_va_update,
+ .prepare_access_checks = damon_va_prepare_access_checks,
+ .check_accesses = damon_va_check_accesses,
+ .reset_aggregated = NULL,
+ .target_valid = damon_va_target_valid,
+ .cleanup = NULL,
+ .apply_scheme = damon_va_apply_scheme,
+ .get_scheme_score = damon_va_scheme_score,
+ };
+ /* ops for fixed virtual address ranges */
+ struct damon_operations ops_fvaddr = ops;
+ int err;
+
+ /* Don't set the monitoring target regions for the entire mapping */
+ ops_fvaddr.id = DAMON_OPS_FVADDR;
+ ops_fvaddr.init = NULL;
+ ops_fvaddr.update = NULL;
+
+ err = damon_register_ops(&ops);
+ if (err)
+ return err;
+ return damon_register_ops(&ops_fvaddr);
+};
+
+subsys_initcall(damon_va_initcall);
+
+#include "vaddr-test.h"
diff --git a/mm/debug.c b/mm/debug.c
index ccca576b2899..ee533a5ceb79 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -16,15 +16,19 @@
#include <linux/ctype.h>
#include "internal.h"
+#include <trace/events/migrate.h>
+
+/*
+ * Define EM() and EMe() so that MIGRATE_REASON from trace/events/migrate.h can
+ * be used to populate migrate_reason_names[].
+ */
+#undef EM
+#undef EMe
+#define EM(a, b) b,
+#define EMe(a, b) b
const char *migrate_reason_names[MR_TYPES] = {
- "compaction",
- "memory_failure",
- "memory_hotplug",
- "syscall_or_cpuset",
- "mempolicy_mbind",
- "numa_misplaced",
- "cma",
+ MIGRATE_REASON
};
const struct trace_print_flags pageflag_names[] = {
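The EM()/EMe() pair is a classic X-macro expansion; a standalone sketch of the technique follows (MIGRATE_REASON_DEMO and the trimmed entry list are for illustration only, the real list lives in <trace/events/migrate.h>):

#include <stdio.h>

#define MIGRATE_REASON_DEMO			\
	EM(MR_COMPACTION,	"compaction")	\
	EMe(MR_MEMORY_FAILURE,	"memory_failure")

/* first expansion: enumerators */
#define EM(a, b)	a,
#define EMe(a, b)	a
enum migrate_reason_demo { MIGRATE_REASON_DEMO };

/* second expansion: the matching string table, as in the hunk above */
#undef EM
#undef EMe
#define EM(a, b)	b,
#define EMe(a, b)	b
static const char * const demo_names[] = { MIGRATE_REASON_DEMO };

int main(void)
{
	printf("%s\n", demo_names[MR_MEMORY_FAILURE]);	/* "memory_failure" */
	return 0;
}

Keeping the list in one place guarantees the MR_* enumerators and the string table can no longer drift apart.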
@@ -32,6 +36,11 @@ const struct trace_print_flags pageflag_names[] = {
{0, NULL}
};
+const struct trace_print_flags pagetype_names[] = {
+ __def_pagetype_names,
+ {0, NULL}
+};
+
const struct trace_print_flags gfpflag_names[] = {
__def_gfpflag_names,
{0, NULL}
@@ -42,11 +51,11 @@ const struct trace_print_flags vmaflag_names[] = {
{0, NULL}
};
-void __dump_page(struct page *page, const char *reason)
+static void __dump_page(struct page *page)
{
- struct page *head = compound_head(page);
+ struct folio *folio = page_folio(page);
+ struct page *head = &folio->page;
struct address_space *mapping;
- bool page_poisoned = PagePoisoned(page);
bool compound = PageCompound(page);
/*
* Accessing the pageblock without the zone lock. It could change to
@@ -58,16 +67,6 @@ void __dump_page(struct page *page, const char *reason)
int mapcount;
char *type = "";
- /*
- * If struct page is poisoned don't access Page*() functions as that
- * leads to recursive loop. Page*() check for poisoned pages, and calls
- * dump_page() when detected.
- */
- if (page_poisoned) {
- pr_warn("page:%px is uninitialized and poisoned", page);
- goto hex_only;
- }
-
if (page < head || (page >= head + MAX_ORDER_NR_PAGES)) {
/*
* Corrupt page, so we cannot call page_mapping. Instead, do a
@@ -83,6 +82,7 @@ void __dump_page(struct page *page, const char *reason)
else
mapping = (void *)(tmp & ~PAGE_MAPPING_FLAGS);
head = page;
+ folio = (struct folio *)page;
compound = false;
} else {
mapping = page_mapping(page);
@@ -99,77 +99,29 @@ void __dump_page(struct page *page, const char *reason)
page, page_ref_count(head), mapcount, mapping,
page_to_pgoff(page), page_to_pfn(page));
if (compound) {
- if (hpage_pincount_available(page)) {
- pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n",
- head, compound_order(head),
- head_compound_mapcount(head),
- head_compound_pincount(head));
- } else {
- pr_warn("head:%p order:%u compound_mapcount:%d\n",
- head, compound_order(head),
- head_compound_mapcount(head));
- }
+ pr_warn("head:%p order:%u entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n",
+ head, compound_order(head),
+ folio_entire_mapcount(folio),
+ folio_nr_pages_mapped(folio),
+ atomic_read(&folio->_pincount));
}
+
+#ifdef CONFIG_MEMCG
+ if (head->memcg_data)
+ pr_warn("memcg:%lx\n", head->memcg_data);
+#endif
if (PageKsm(page))
type = "ksm ";
else if (PageAnon(page))
type = "anon ";
- else if (mapping) {
- struct inode *host;
- const struct address_space_operations *a_ops;
- struct hlist_node *dentry_first;
- struct dentry *dentry_ptr;
- struct dentry dentry;
- unsigned long ino;
-
- /*
- * mapping can be invalid pointer and we don't want to crash
- * accessing it, so probe everything depending on it carefully
- */
- if (get_kernel_nofault(host, &mapping->host) ||
- get_kernel_nofault(a_ops, &mapping->a_ops)) {
- pr_warn("failed to read mapping contents, not a valid kernel address?\n");
- goto out_mapping;
- }
-
- if (!host) {
- pr_warn("aops:%ps\n", a_ops);
- goto out_mapping;
- }
-
- if (get_kernel_nofault(dentry_first, &host->i_dentry.first) ||
- get_kernel_nofault(ino, &host->i_ino)) {
- pr_warn("aops:%ps with invalid host inode %px\n",
- a_ops, host);
- goto out_mapping;
- }
-
- if (!dentry_first) {
- pr_warn("aops:%ps ino:%lx\n", a_ops, ino);
- goto out_mapping;
- }
-
- dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
- if (get_kernel_nofault(dentry, dentry_ptr)) {
- pr_warn("aops:%ps ino:%lx with invalid dentry %px\n",
- a_ops, ino, dentry_ptr);
- } else {
- /*
- * if dentry is corrupted, the %pd handler may still
- * crash, but it's unlikely that we reach here with a
- * corrupted struct page
- */
- pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n",
- a_ops, ino, &dentry);
- }
- }
-out_mapping:
+ else if (mapping)
+ dump_mapping(mapping);
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
- pr_warn("%sflags: %#lx(%pGp)%s\n", type, head->flags, &head->flags,
+ pr_warn("%sflags: %pGp%s\n", type, &head->flags,
page_cma ? " CMA" : "");
+ pr_warn("page_type: %pGt\n", &head->page_type);
-hex_only:
print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
sizeof(struct page), false);
@@ -177,19 +129,16 @@ hex_only:
print_hex_dump(KERN_WARNING, "head: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), head,
sizeof(struct page), false);
-
- if (reason)
- pr_warn("page dumped because: %s\n", reason);
-
-#ifdef CONFIG_MEMCG
- if (!page_poisoned && page->mem_cgroup)
- pr_warn("page->mem_cgroup:%px\n", page->mem_cgroup);
-#endif
}
void dump_page(struct page *page, const char *reason)
{
- __dump_page(page, reason);
+ if (PagePoisoned(page))
+ pr_warn("page:%p is uninitialized and poisoned", page);
+ else
+ __dump_page(page);
+ if (reason)
+ pr_warn("page dumped because: %s\n", reason);
dump_page_owner(page);
}
EXPORT_SYMBOL(dump_page);
@@ -198,13 +147,11 @@ EXPORT_SYMBOL(dump_page);
void dump_vma(const struct vm_area_struct *vma)
{
- pr_emerg("vma %px start %px end %px\n"
- "next %px prev %px mm %px\n"
+ pr_emerg("vma %px start %px end %px mm %px\n"
"prot %lx anon_vma %px vm_ops %px\n"
"pgoff %lx file %px private_data %px\n"
"flags: %#lx(%pGv)\n",
- vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
- vma->vm_prev, vma->vm_mm,
+ vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_mm,
(unsigned long)pgprot_val(vma->vm_page_prot),
vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
vma->vm_file, vma->vm_private_data,
@@ -214,18 +161,18 @@ EXPORT_SYMBOL(dump_vma);
void dump_mm(const struct mm_struct *mm)
{
- pr_emerg("mm %px mmap %px seqnum %llu task_size %lu\n"
+ pr_emerg("mm %px task_size %lu\n"
#ifdef CONFIG_MMU
"get_unmapped_area %px\n"
#endif
- "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
+ "mmap_base %lu mmap_legacy_base %lu\n"
"pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
"pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n"
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
"start_brk %lx brk %lx start_stack %lx\n"
"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
- "binfmt %px flags %lx core_state %px\n"
+ "binfmt %px flags %lx\n"
#ifdef CONFIG_AIO
"ioctx_table %px\n"
#endif
@@ -242,11 +189,11 @@ void dump_mm(const struct mm_struct *mm)
"tlb_flush_pending %d\n"
"def_flags: %#lx(%pGv)\n",
- mm, mm->mmap, (long long) mm->vmacache_seqnum, mm->task_size,
+ mm, mm->task_size,
#ifdef CONFIG_MMU
mm->get_unmapped_area,
#endif
- mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
+ mm->mmap_base, mm->mmap_legacy_base,
mm->pgd, atomic_read(&mm->mm_users),
atomic_read(&mm->mm_count),
mm_pgtables_bytes(mm),
@@ -257,7 +204,7 @@ void dump_mm(const struct mm_struct *mm)
mm->start_code, mm->end_code, mm->start_data, mm->end_data,
mm->start_brk, mm->brk, mm->start_stack,
mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
- mm->binfmt, mm->flags, mm->core_state,
+ mm->binfmt, mm->flags,
#ifdef CONFIG_AIO
mm->ioctx_table,
#endif
@@ -275,6 +222,7 @@ void dump_mm(const struct mm_struct *mm)
mm->def_flags, &mm->def_flags
);
}
+EXPORT_SYMBOL(dump_mm);
static bool page_init_poisoning __read_mostly = true;
@@ -320,5 +268,13 @@ void page_init_poison(struct page *page, size_t size)
if (page_init_poisoning)
memset(page, PAGE_POISON_PATTERN, size);
}
-EXPORT_SYMBOL_GPL(page_init_poison);
+
+void vma_iter_dump_tree(const struct vma_iterator *vmi)
+{
+#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
+ mas_dump(&vmi->mas);
+ mt_dump(vmi->mas.tree, mt_dump_hex);
+#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
+}
+
#endif /* CONFIG_DEBUG_VM */
diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c
new file mode 100644
index 000000000000..f9d145730fd1
--- /dev/null
+++ b/mm/debug_page_alloc.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/page-isolation.h>
+
+unsigned int _debug_guardpage_minorder;
+
+bool _debug_pagealloc_enabled_early __read_mostly
+ = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
+EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
+DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
+EXPORT_SYMBOL(_debug_pagealloc_enabled);
+
+DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
+
+static int __init early_debug_pagealloc(char *buf)
+{
+ return kstrtobool(buf, &_debug_pagealloc_enabled_early);
+}
+early_param("debug_pagealloc", early_debug_pagealloc);
+
+static int __init debug_guardpage_minorder_setup(char *buf)
+{
+ unsigned long res;
+
+ if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
+ pr_err("Bad debug_guardpage_minorder value\n");
+ return 0;
+ }
+ _debug_guardpage_minorder = res;
+ pr_info("Setting debug_guardpage_minorder to %lu\n", res);
+ return 0;
+}
+early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
+
+bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order,
+ int migratetype)
+{
+ if (order >= debug_guardpage_minorder())
+ return false;
+
+ __SetPageGuard(page);
+ INIT_LIST_HEAD(&page->buddy_list);
+ set_page_private(page, order);
+ /* Guard pages are not available for any usage */
+ if (!is_migrate_isolate(migratetype))
+ __mod_zone_freepage_state(zone, -(1 << order), migratetype);
+
+ return true;
+}
+
+void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order,
+ int migratetype)
+{
+ __ClearPageGuard(page);
+
+ set_page_private(page, 0);
+ if (!is_migrate_isolate(migratetype))
+ __mod_zone_freepage_state(zone, (1 << order), migratetype);
+}
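For instance, based on the code above, booting with "debug_guardpage_minorder=1" makes only order-0 allocations eligible for guard pages, since __set_page_guard() bails out as soon as 'order >= debug_guardpage_minorder()', and any value above MAX_ORDER / 2 is rejected by the early_param handler.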
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 086309fb9b6f..ee119e33fef1 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -15,6 +15,7 @@
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/kconfig.h>
+#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/mm_types.h>
@@ -28,33 +29,83 @@
#include <linux/swapops.h>
#include <linux/start_kernel.h>
#include <linux/sched/mm.h>
+#include <linux/io.h>
+
+#include <asm/cacheflush.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
/*
- * Please refer Documentation/vm/arch_pgtable_helpers.rst for the semantics
+ * Please refer to Documentation/mm/arch_pgtable_helpers.rst for the semantics
* expectations that are being validated here. All future changes in here
* or the documentation need to be in sync.
- */
-
-#define VMFLAGS (VM_READ|VM_WRITE|VM_EXEC)
-
-/*
+ *
* On s390 platform, the lower 4 bits are used to identify given page table
* entry type. But these bits might affect the ability to clear entries with
* pxx_clear() because of how dynamic page table folding works on s390. So
* while loading up the entries do not change the lower 4 bits. It does not
- * have affect any other platform.
+ * affect any other platform. Also avoid the 62nd bit on ppc64 that is
+ * used to mark a pte entry.
*/
-#define S390_MASK_BITS 4
-#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS)
+#define S390_SKIP_MASK GENMASK(3, 0)
+#if __BITS_PER_LONG == 64
+#define PPC64_SKIP_MASK GENMASK(62, 62)
+#else
+#define PPC64_SKIP_MASK 0x0
+#endif
+#define ARCH_SKIP_MASK (S390_SKIP_MASK | PPC64_SKIP_MASK)
+#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK)
#define RANDOM_NZVALUE GENMASK(7, 0)
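A quick userspace check of what the combined mask evaluates to on a 64-bit build (GENMASK_ULL below is a simplified equivalent of the kernel macro; the printed value is just a sanity check of the arithmetic):

#include <stdio.h>

#define GENMASK_ULL(h, l) \
	((~0ULL << (l)) & (~0ULL >> (63 - (h))))

int main(void)
{
	unsigned long long s390_skip = GENMASK_ULL(3, 0);	/* low 4 bits */
	unsigned long long ppc64_skip = GENMASK_ULL(62, 62);	/* bit 62 */
	unsigned long long orval =
		GENMASK_ULL(63, 0) & ~(s390_skip | ppc64_skip);

	printf("%#llx\n", orval);	/* 0xbffffffffffffff0 */
	return 0;
}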
-static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
+struct pgtable_debug_args {
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ p4d_t *start_p4dp;
+ pud_t *start_pudp;
+ pmd_t *start_pmdp;
+ pgtable_t start_ptep;
+
+ unsigned long vaddr;
+ pgprot_t page_prot;
+ pgprot_t page_prot_none;
+
+ bool is_contiguous_page;
+ unsigned long pud_pfn;
+ unsigned long pmd_pfn;
+ unsigned long pte_pfn;
+
+ unsigned long fixed_alignment;
+ unsigned long fixed_pgd_pfn;
+ unsigned long fixed_p4d_pfn;
+ unsigned long fixed_pud_pfn;
+ unsigned long fixed_pmd_pfn;
+ unsigned long fixed_pte_pfn;
+};
+
+static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pgprot_t prot = vm_get_page_prot(idx);
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, prot);
+ unsigned long val = idx, *ptr = &val;
+
+ pr_debug("Validating PTE basic (%pGv)\n", ptr);
+
+ /*
+ * This test needs to be executed after the given page table entry
+ * is created with pfn_pte() to make sure that vm_get_page_prot(idx)
+ * does not have the dirty bit enabled from the beginning. This is
+ * important for platforms like arm64 where (!PTE_RDONLY) indicates the
+ * dirty bit being set.
+ */
+ WARN_ON(pte_dirty(pte_wrprotect(pte)));
- pr_debug("Validating PTE basic\n");
WARN_ON(!pte_same(pte, pte));
WARN_ON(!pte_young(pte_mkyoung(pte_mkold(pte))));
WARN_ON(!pte_dirty(pte_mkdirty(pte_mkclean(pte))));
@@ -62,68 +113,92 @@ static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(pte_young(pte_mkold(pte_mkyoung(pte))));
WARN_ON(pte_dirty(pte_mkclean(pte_mkdirty(pte))));
WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte))));
+ WARN_ON(pte_dirty(pte_wrprotect(pte_mkclean(pte))));
+ WARN_ON(!pte_dirty(pte_wrprotect(pte_mkdirty(pte))));
}
-static void __init pte_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma, pte_t *ptep,
- unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
+static void __init pte_advanced_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(pfn, prot);
+ struct page *page;
+ pte_t pte;
+
+ /*
+ * Architectures optimize set_pte_at by avoiding TLB flush.
+ * This requires set_pte_at to be not used to update an
+ * existing pte entry. Clear pte before we do set_pte_at
+ *
+ * flush_dcache_page() is called after set_pte_at() to clear
+ * PG_arch_1 for the page on ARM64. The page flag isn't cleared
+ * when it's released and page allocation check will fail when
+ * the page is allocated again. For architectures other than ARM64,
+ * the unexpected overhead of cache flushing is acceptable.
+ */
+ page = (args->pte_pfn != ULONG_MAX) ? pfn_to_page(args->pte_pfn) : NULL;
+ if (!page)
+ return;
pr_debug("Validating PTE advanced\n");
- pte = pfn_pte(pfn, prot);
- set_pte_at(mm, vaddr, ptep, pte);
- ptep_set_wrprotect(mm, vaddr, ptep);
- pte = ptep_get(ptep);
- WARN_ON(pte_write(pte));
+ if (WARN_ON(!args->ptep))
+ return;
- pte = pfn_pte(pfn, prot);
- set_pte_at(mm, vaddr, ptep, pte);
- ptep_get_and_clear(mm, vaddr, ptep);
- pte = ptep_get(ptep);
+ pte = pfn_pte(args->pte_pfn, args->page_prot);
+ set_pte_at(args->mm, args->vaddr, args->ptep, pte);
+ flush_dcache_page(page);
+ ptep_set_wrprotect(args->mm, args->vaddr, args->ptep);
+ pte = ptep_get(args->ptep);
+ WARN_ON(pte_write(pte));
+ ptep_get_and_clear(args->mm, args->vaddr, args->ptep);
+ pte = ptep_get(args->ptep);
WARN_ON(!pte_none(pte));
- pte = pfn_pte(pfn, prot);
+ pte = pfn_pte(args->pte_pfn, args->page_prot);
pte = pte_wrprotect(pte);
pte = pte_mkclean(pte);
- set_pte_at(mm, vaddr, ptep, pte);
+ set_pte_at(args->mm, args->vaddr, args->ptep, pte);
+ flush_dcache_page(page);
pte = pte_mkwrite(pte);
pte = pte_mkdirty(pte);
- ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
- pte = ptep_get(ptep);
+ ptep_set_access_flags(args->vma, args->vaddr, args->ptep, pte, 1);
+ pte = ptep_get(args->ptep);
WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
-
- pte = pfn_pte(pfn, prot);
- set_pte_at(mm, vaddr, ptep, pte);
- ptep_get_and_clear_full(mm, vaddr, ptep, 1);
- pte = ptep_get(ptep);
+ ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1);
+ pte = ptep_get(args->ptep);
WARN_ON(!pte_none(pte));
+ pte = pfn_pte(args->pte_pfn, args->page_prot);
pte = pte_mkyoung(pte);
- set_pte_at(mm, vaddr, ptep, pte);
- ptep_test_and_clear_young(vma, vaddr, ptep);
- pte = ptep_get(ptep);
+ set_pte_at(args->mm, args->vaddr, args->ptep, pte);
+ flush_dcache_page(page);
+ ptep_test_and_clear_young(args->vma, args->vaddr, args->ptep);
+ pte = ptep_get(args->ptep);
WARN_ON(pte_young(pte));
-}
-
-static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
-{
- pte_t pte = pfn_pte(pfn, prot);
- pr_debug("Validating PTE saved write\n");
- WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));
- WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte))));
+ ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1);
}
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx)
{
- pmd_t pmd = pfn_pmd(pfn, prot);
+ pgprot_t prot = vm_get_page_prot(idx);
+ unsigned long val = idx, *ptr = &val;
+ pmd_t pmd;
if (!has_transparent_hugepage())
return;
- pr_debug("Validating PMD basic\n");
+ pr_debug("Validating PMD basic (%pGv)\n", ptr);
+ pmd = pfn_pmd(args->fixed_pmd_pfn, prot);
+
+ /*
+ * This test needs to be executed after the given page table entry
+ * is created with pfn_pmd() to make sure that vm_get_page_prot(idx)
+ * does not have the dirty bit enabled from the beginning. This is
+ * important for platforms like arm64 where (!PTE_RDONLY) indicates the
+ * dirty bit being set.
+ */
+ WARN_ON(pmd_dirty(pmd_wrprotect(pmd)));
+
+
WARN_ON(!pmd_same(pmd, pmd));
WARN_ON(!pmd_young(pmd_mkyoung(pmd_mkold(pmd))));
WARN_ON(!pmd_dirty(pmd_mkdirty(pmd_mkclean(pmd))));
@@ -131,6 +206,8 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(pmd_young(pmd_mkold(pmd_mkyoung(pmd))));
WARN_ON(pmd_dirty(pmd_mkclean(pmd_mkdirty(pmd))));
WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd))));
+ WARN_ON(pmd_dirty(pmd_wrprotect(pmd_mkclean(pmd))));
+ WARN_ON(!pmd_dirty(pmd_wrprotect(pmd_mkdirty(pmd))));
/*
* A huge page does not point to next level page table
* entry. Hence this must qualify as pmd_bad().
@@ -138,60 +215,79 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pmd_bad(pmd_mkhuge(pmd)));
}
-static void __init pmd_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma, pmd_t *pmdp,
- unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
+static void __init pmd_advanced_tests(struct pgtable_debug_args *args)
{
- pmd_t pmd = pfn_pmd(pfn, prot);
+ struct page *page;
+ pmd_t pmd;
+ unsigned long vaddr = args->vaddr;
if (!has_transparent_hugepage())
return;
+ page = (args->pmd_pfn != ULONG_MAX) ? pfn_to_page(args->pmd_pfn) : NULL;
+ if (!page)
+ return;
+
+ /*
+ * flush_dcache_page() is called after set_pmd_at() to clear
+ * PG_arch_1 for the page on ARM64. The page flag isn't cleared
+ * when it's released and page allocation check will fail when
+ * the page is allocated again. For architectures other than ARM64,
+ * the unexpected overhead of cache flushing is acceptable.
+ */
pr_debug("Validating PMD advanced\n");
/* Align the address wrt HPAGE_PMD_SIZE */
- vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
+ vaddr &= HPAGE_PMD_MASK;
- pmd = pfn_pmd(pfn, prot);
- set_pmd_at(mm, vaddr, pmdp, pmd);
- pmdp_set_wrprotect(mm, vaddr, pmdp);
- pmd = READ_ONCE(*pmdp);
- WARN_ON(pmd_write(pmd));
+ pgtable_trans_huge_deposit(args->mm, args->pmdp, args->start_ptep);
- pmd = pfn_pmd(pfn, prot);
- set_pmd_at(mm, vaddr, pmdp, pmd);
- pmdp_huge_get_and_clear(mm, vaddr, pmdp);
- pmd = READ_ONCE(*pmdp);
+ pmd = pfn_pmd(args->pmd_pfn, args->page_prot);
+ set_pmd_at(args->mm, vaddr, args->pmdp, pmd);
+ flush_dcache_page(page);
+ pmdp_set_wrprotect(args->mm, vaddr, args->pmdp);
+ pmd = READ_ONCE(*args->pmdp);
+ WARN_ON(pmd_write(pmd));
+ pmdp_huge_get_and_clear(args->mm, vaddr, args->pmdp);
+ pmd = READ_ONCE(*args->pmdp);
WARN_ON(!pmd_none(pmd));
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->pmd_pfn, args->page_prot);
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
- set_pmd_at(mm, vaddr, pmdp, pmd);
+ set_pmd_at(args->mm, vaddr, args->pmdp, pmd);
+ flush_dcache_page(page);
pmd = pmd_mkwrite(pmd);
pmd = pmd_mkdirty(pmd);
- pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1);
- pmd = READ_ONCE(*pmdp);
+ pmdp_set_access_flags(args->vma, vaddr, args->pmdp, pmd, 1);
+ pmd = READ_ONCE(*args->pmdp);
WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
-
- pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
- set_pmd_at(mm, vaddr, pmdp, pmd);
- pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
- pmd = READ_ONCE(*pmdp);
+ pmdp_huge_get_and_clear_full(args->vma, vaddr, args->pmdp, 1);
+ pmd = READ_ONCE(*args->pmdp);
WARN_ON(!pmd_none(pmd));
+ pmd = pmd_mkhuge(pfn_pmd(args->pmd_pfn, args->page_prot));
pmd = pmd_mkyoung(pmd);
- set_pmd_at(mm, vaddr, pmdp, pmd);
- pmdp_test_and_clear_young(vma, vaddr, pmdp);
- pmd = READ_ONCE(*pmdp);
+ set_pmd_at(args->mm, vaddr, args->pmdp, pmd);
+ flush_dcache_page(page);
+ pmdp_test_and_clear_young(args->vma, vaddr, args->pmdp);
+ pmd = READ_ONCE(*args->pmdp);
WARN_ON(pmd_young(pmd));
+
+ /* Clear the pte entries */
+ pmdp_huge_get_and_clear(args->mm, vaddr, args->pmdp);
+ pgtable_trans_huge_withdraw(args->mm, args->pmdp);
}
-static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_leaf_tests(struct pgtable_debug_args *args)
{
- pmd_t pmd = pfn_pmd(pfn, prot);
+ pmd_t pmd;
+
+ if (!has_transparent_hugepage())
+ return;
pr_debug("Validating PMD leaf\n");
+ pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
+
/*
* PMD based THP is a leaf entry.
*/
@@ -199,50 +295,39 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pmd_leaf(pmd));
}
-static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx)
{
- pmd_t pmd;
+ pgprot_t prot = vm_get_page_prot(idx);
+ unsigned long val = idx, *ptr = &val;
+ pud_t pud;
- if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ if (!has_transparent_hugepage())
return;
- pr_debug("Validating PMD huge\n");
+ pr_debug("Validating PUD basic (%pGv)\n", ptr);
+ pud = pfn_pud(args->fixed_pud_pfn, prot);
+
/*
- * X86 defined pmd_set_huge() verifies that the given
- * PMD is not a populated non-leaf entry.
+ * This test needs to be executed after the given page table entry
+ * is created with pfn_pud() to make sure that vm_get_page_prot(idx)
+ * does not have the dirty bit enabled from the beginning. This is
+ * important for platforms like arm64 where (!PTE_RDONLY) indicates the
+ * dirty bit being set.
*/
- WRITE_ONCE(*pmdp, __pmd(0));
- WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot));
- WARN_ON(!pmd_clear_huge(pmdp));
- pmd = READ_ONCE(*pmdp);
- WARN_ON(!pmd_none(pmd));
-}
-
-static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
-{
- pmd_t pmd = pfn_pmd(pfn, prot);
-
- pr_debug("Validating PMD saved write\n");
- WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
- WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd))));
-}
-
-#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
-{
- pud_t pud = pfn_pud(pfn, prot);
-
- if (!has_transparent_hugepage())
- return;
+ WARN_ON(pud_dirty(pud_wrprotect(pud)));
- pr_debug("Validating PUD basic\n");
WARN_ON(!pud_same(pud, pud));
WARN_ON(!pud_young(pud_mkyoung(pud_mkold(pud))));
+ WARN_ON(!pud_dirty(pud_mkdirty(pud_mkclean(pud))));
+ WARN_ON(pud_dirty(pud_mkclean(pud_mkdirty(pud))));
WARN_ON(!pud_write(pud_mkwrite(pud_wrprotect(pud))));
WARN_ON(pud_write(pud_wrprotect(pud_mkwrite(pud))));
WARN_ON(pud_young(pud_mkold(pud_mkyoung(pud))));
+ WARN_ON(pud_dirty(pud_wrprotect(pud_mkclean(pud))));
+ WARN_ON(!pud_dirty(pud_wrprotect(pud_mkdirty(pud))));
- if (mm_pmd_folded(mm))
+ if (mm_pmd_folded(args->mm))
return;
/*
@@ -252,72 +337,126 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pud_bad(pud_mkhuge(pud)));
}
-static void __init pud_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma, pud_t *pudp,
- unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
+static void __init pud_advanced_tests(struct pgtable_debug_args *args)
{
- pud_t pud = pfn_pud(pfn, prot);
+ struct page *page;
+ unsigned long vaddr = args->vaddr;
+ pud_t pud;
if (!has_transparent_hugepage())
return;
+ page = (args->pud_pfn != ULONG_MAX) ? pfn_to_page(args->pud_pfn) : NULL;
+ if (!page)
+ return;
+
+ /*
+ * flush_dcache_page() is called after set_pud_at() to clear
+ * PG_arch_1 for the page on ARM64. The page flag isn't cleared
+ * when it's released and page allocation check will fail when
+ * the page is allocated again. For architectures other than ARM64,
+ * the unexpected overhead of cache flushing is acceptable.
+ */
pr_debug("Validating PUD advanced\n");
/* Align the address wrt HPAGE_PUD_SIZE */
- vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE;
+ vaddr &= HPAGE_PUD_MASK;
- set_pud_at(mm, vaddr, pudp, pud);
- pudp_set_wrprotect(mm, vaddr, pudp);
- pud = READ_ONCE(*pudp);
+ pud = pfn_pud(args->pud_pfn, args->page_prot);
+ set_pud_at(args->mm, vaddr, args->pudp, pud);
+ flush_dcache_page(page);
+ pudp_set_wrprotect(args->mm, vaddr, args->pudp);
+ pud = READ_ONCE(*args->pudp);
WARN_ON(pud_write(pud));
#ifndef __PAGETABLE_PMD_FOLDED
- pud = pfn_pud(pfn, prot);
- set_pud_at(mm, vaddr, pudp, pud);
- pudp_huge_get_and_clear(mm, vaddr, pudp);
- pud = READ_ONCE(*pudp);
- WARN_ON(!pud_none(pud));
-
- pud = pfn_pud(pfn, prot);
- set_pud_at(mm, vaddr, pudp, pud);
- pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
- pud = READ_ONCE(*pudp);
+ pudp_huge_get_and_clear(args->mm, vaddr, args->pudp);
+ pud = READ_ONCE(*args->pudp);
WARN_ON(!pud_none(pud));
#endif /* __PAGETABLE_PMD_FOLDED */
- pud = pfn_pud(pfn, prot);
+ pud = pfn_pud(args->pud_pfn, args->page_prot);
pud = pud_wrprotect(pud);
pud = pud_mkclean(pud);
- set_pud_at(mm, vaddr, pudp, pud);
+ set_pud_at(args->mm, vaddr, args->pudp, pud);
+ flush_dcache_page(page);
pud = pud_mkwrite(pud);
pud = pud_mkdirty(pud);
- pudp_set_access_flags(vma, vaddr, pudp, pud, 1);
- pud = READ_ONCE(*pudp);
+ pudp_set_access_flags(args->vma, vaddr, args->pudp, pud, 1);
+ pud = READ_ONCE(*args->pudp);
WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
+#ifndef __PAGETABLE_PMD_FOLDED
+ pudp_huge_get_and_clear_full(args->mm, vaddr, args->pudp, 1);
+ pud = READ_ONCE(*args->pudp);
+ WARN_ON(!pud_none(pud));
+#endif /* __PAGETABLE_PMD_FOLDED */
+
+ pud = pfn_pud(args->pud_pfn, args->page_prot);
pud = pud_mkyoung(pud);
- set_pud_at(mm, vaddr, pudp, pud);
- pudp_test_and_clear_young(vma, vaddr, pudp);
- pud = READ_ONCE(*pudp);
+ set_pud_at(args->mm, vaddr, args->pudp, pud);
+ flush_dcache_page(page);
+ pudp_test_and_clear_young(args->vma, vaddr, args->pudp);
+ pud = READ_ONCE(*args->pudp);
WARN_ON(pud_young(pud));
+
+ pudp_huge_get_and_clear(args->mm, vaddr, args->pudp);
}
-static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
+static void __init pud_leaf_tests(struct pgtable_debug_args *args)
{
- pud_t pud = pfn_pud(pfn, prot);
+ pud_t pud;
+
+ if (!has_transparent_hugepage())
+ return;
pr_debug("Validating PUD leaf\n");
+ pud = pfn_pud(args->fixed_pud_pfn, args->page_prot);
/*
* PUD based THP is a leaf entry.
*/
pud = pud_mkhuge(pud);
WARN_ON(!pud_leaf(pud));
}
+#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { }
+static void __init pud_advanced_tests(struct pgtable_debug_args *args) { }
+static void __init pud_leaf_tests(struct pgtable_debug_args *args) { }
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { }
+static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { }
+static void __init pmd_advanced_tests(struct pgtable_debug_args *args) { }
+static void __init pud_advanced_tests(struct pgtable_debug_args *args) { }
+static void __init pmd_leaf_tests(struct pgtable_debug_args *args) { }
+static void __init pud_leaf_tests(struct pgtable_debug_args *args) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+static void __init pmd_huge_tests(struct pgtable_debug_args *args)
+{
+ pmd_t pmd;
+
+ if (!arch_vmap_pmd_supported(args->page_prot) ||
+ args->fixed_alignment < PMD_SIZE)
+ return;
+
+ pr_debug("Validating PMD huge\n");
+ /*
+ * X86 defined pmd_set_huge() verifies that the given
+ * PMD is not a populated non-leaf entry.
+ */
+ WRITE_ONCE(*args->pmdp, __pmd(0));
+ WARN_ON(!pmd_set_huge(args->pmdp, __pfn_to_phys(args->fixed_pmd_pfn), args->page_prot));
+ WARN_ON(!pmd_clear_huge(args->pmdp));
+ pmd = READ_ONCE(*args->pmdp);
+ WARN_ON(!pmd_none(pmd));
+}
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+static void __init pud_huge_tests(struct pgtable_debug_args *args)
{
pud_t pud;
- if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ if (!arch_vmap_pud_supported(args->page_prot) ||
+ args->fixed_alignment < PUD_SIZE)
return;
pr_debug("Validating PUD huge\n");
@@ -325,52 +464,18 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
* X86 defined pud_set_huge() verifies that the given
* PUD is not a populated non-leaf entry.
*/
- WRITE_ONCE(*pudp, __pud(0));
- WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot));
- WARN_ON(!pud_clear_huge(pudp));
- pud = READ_ONCE(*pudp);
+ WRITE_ONCE(*args->pudp, __pud(0));
+ WARN_ON(!pud_set_huge(args->pudp, __pfn_to_phys(args->fixed_pud_pfn), args->page_prot));
+ WARN_ON(!pud_clear_huge(args->pudp));
+ pud = READ_ONCE(*args->pudp);
WARN_ON(!pud_none(pud));
}
-#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma, pud_t *pudp,
- unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
-{
-}
-static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
-{
-}
-#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pmd_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma, pmd_t *pmdp,
- unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
-{
-}
-static void __init pud_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma, pud_t *pudp,
- unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
-{
-}
-static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
-{
-}
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
-{
-}
-static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { }
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+static void __init pmd_huge_tests(struct pgtable_debug_args *args) { }
+static void __init pud_huge_tests(struct pgtable_debug_args *args) { }
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
-static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
+static void __init p4d_basic_tests(struct pgtable_debug_args *args)
{
p4d_t p4d;
@@ -379,7 +484,7 @@ static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!p4d_same(p4d, p4d));
}
-static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot)
+static void __init pgd_basic_tests(struct pgtable_debug_args *args)
{
pgd_t pgd;
@@ -389,27 +494,26 @@ static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot)
}
#ifndef __PAGETABLE_PUD_FOLDED
-static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp)
+static void __init pud_clear_tests(struct pgtable_debug_args *args)
{
- pud_t pud = READ_ONCE(*pudp);
+ pud_t pud = READ_ONCE(*args->pudp);
- if (mm_pmd_folded(mm))
+ if (mm_pmd_folded(args->mm))
return;
pr_debug("Validating PUD clear\n");
pud = __pud(pud_val(pud) | RANDOM_ORVALUE);
- WRITE_ONCE(*pudp, pud);
- pud_clear(pudp);
- pud = READ_ONCE(*pudp);
+ WRITE_ONCE(*args->pudp, pud);
+ pud_clear(args->pudp);
+ pud = READ_ONCE(*args->pudp);
WARN_ON(!pud_none(pud));
}
-static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
- pmd_t *pmdp)
+static void __init pud_populate_tests(struct pgtable_debug_args *args)
{
pud_t pud;
- if (mm_pmd_folded(mm))
+ if (mm_pmd_folded(args->mm))
return;
pr_debug("Validating PUD populate\n");
@@ -417,42 +521,36 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
* This entry points to next level page table page.
* Hence this must not qualify as pud_bad().
*/
- pmd_clear(pmdp);
- pud_clear(pudp);
- pud_populate(mm, pudp, pmdp);
- pud = READ_ONCE(*pudp);
+ pud_populate(args->mm, args->pudp, args->start_pmdp);
+ pud = READ_ONCE(*args->pudp);
WARN_ON(pud_bad(pud));
}
#else /* !__PAGETABLE_PUD_FOLDED */
-static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) { }
-static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
- pmd_t *pmdp)
-{
-}
+static void __init pud_clear_tests(struct pgtable_debug_args *args) { }
+static void __init pud_populate_tests(struct pgtable_debug_args *args) { }
#endif /* PAGETABLE_PUD_FOLDED */
#ifndef __PAGETABLE_P4D_FOLDED
-static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp)
+static void __init p4d_clear_tests(struct pgtable_debug_args *args)
{
- p4d_t p4d = READ_ONCE(*p4dp);
+ p4d_t p4d = READ_ONCE(*args->p4dp);
- if (mm_pud_folded(mm))
+ if (mm_pud_folded(args->mm))
return;
pr_debug("Validating P4D clear\n");
p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE);
- WRITE_ONCE(*p4dp, p4d);
- p4d_clear(p4dp);
- p4d = READ_ONCE(*p4dp);
+ WRITE_ONCE(*args->p4dp, p4d);
+ p4d_clear(args->p4dp);
+ p4d = READ_ONCE(*args->p4dp);
WARN_ON(!p4d_none(p4d));
}
-static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
- pud_t *pudp)
+static void __init p4d_populate_tests(struct pgtable_debug_args *args)
{
p4d_t p4d;
- if (mm_pud_folded(mm))
+ if (mm_pud_folded(args->mm))
return;
pr_debug("Validating P4D populate\n");
@@ -460,34 +558,33 @@ static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
* This entry points to next level page table page.
* Hence this must not qualify as p4d_bad().
*/
- pud_clear(pudp);
- p4d_clear(p4dp);
- p4d_populate(mm, p4dp, pudp);
- p4d = READ_ONCE(*p4dp);
+ pud_clear(args->pudp);
+ p4d_clear(args->p4dp);
+ p4d_populate(args->mm, args->p4dp, args->start_pudp);
+ p4d = READ_ONCE(*args->p4dp);
WARN_ON(p4d_bad(p4d));
}
-static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp)
+static void __init pgd_clear_tests(struct pgtable_debug_args *args)
{
- pgd_t pgd = READ_ONCE(*pgdp);
+ pgd_t pgd = READ_ONCE(*(args->pgdp));
- if (mm_p4d_folded(mm))
+ if (mm_p4d_folded(args->mm))
return;
pr_debug("Validating PGD clear\n");
pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE);
- WRITE_ONCE(*pgdp, pgd);
- pgd_clear(pgdp);
- pgd = READ_ONCE(*pgdp);
+ WRITE_ONCE(*args->pgdp, pgd);
+ pgd_clear(args->pgdp);
+ pgd = READ_ONCE(*args->pgdp);
WARN_ON(!pgd_none(pgd));
}
-static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
- p4d_t *p4dp)
+static void __init pgd_populate_tests(struct pgtable_debug_args *args)
{
pgd_t pgd;
- if (mm_p4d_folded(mm))
+ if (mm_p4d_folded(args->mm))
return;
pr_debug("Validating PGD populate\n");
@@ -495,53 +592,63 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
* This entry points to next level page table page.
* Hence this must not qualify as pgd_bad().
*/
- p4d_clear(p4dp);
- pgd_clear(pgdp);
- pgd_populate(mm, pgdp, p4dp);
- pgd = READ_ONCE(*pgdp);
+ p4d_clear(args->p4dp);
+ pgd_clear(args->pgdp);
+ pgd_populate(args->mm, args->pgdp, args->start_p4dp);
+ pgd = READ_ONCE(*args->pgdp);
WARN_ON(pgd_bad(pgd));
}
#else /* !__PAGETABLE_P4D_FOLDED */
-static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) { }
-static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) { }
-static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
- pud_t *pudp)
-{
-}
-static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
- p4d_t *p4dp)
-{
-}
+static void __init p4d_clear_tests(struct pgtable_debug_args *args) { }
+static void __init pgd_clear_tests(struct pgtable_debug_args *args) { }
+static void __init p4d_populate_tests(struct pgtable_debug_args *args) { }
+static void __init pgd_populate_tests(struct pgtable_debug_args *args) { }
#endif /* PAGETABLE_P4D_FOLDED */
-static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
- unsigned long vaddr)
+static void __init pte_clear_tests(struct pgtable_debug_args *args)
{
- pte_t pte = ptep_get(ptep);
+ struct page *page;
+ pte_t pte = pfn_pte(args->pte_pfn, args->page_prot);
+
+ page = (args->pte_pfn != ULONG_MAX) ? pfn_to_page(args->pte_pfn) : NULL;
+ if (!page)
+ return;
+ /*
+ * flush_dcache_page() is called after set_pte_at() to clear
+ * PG_arch_1 for the page on ARM64. The page flag isn't cleared
+	 * when it's released, and the page allocation check will fail
+	 * when the page is allocated again. For architectures other than
+	 * ARM64, the unexpected overhead of cache flushing is acceptable.
+ */
pr_debug("Validating PTE clear\n");
+ if (WARN_ON(!args->ptep))
+ return;
+
+#ifndef CONFIG_RISCV
pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
- set_pte_at(mm, vaddr, ptep, pte);
+#endif
+ set_pte_at(args->mm, args->vaddr, args->ptep, pte);
+ flush_dcache_page(page);
barrier();
- pte_clear(mm, vaddr, ptep);
- pte = ptep_get(ptep);
+ ptep_clear(args->mm, args->vaddr, args->ptep);
+ pte = ptep_get(args->ptep);
WARN_ON(!pte_none(pte));
}
-static void __init pmd_clear_tests(struct mm_struct *mm, pmd_t *pmdp)
+static void __init pmd_clear_tests(struct pgtable_debug_args *args)
{
- pmd_t pmd = READ_ONCE(*pmdp);
+ pmd_t pmd = READ_ONCE(*args->pmdp);
pr_debug("Validating PMD clear\n");
pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE);
- WRITE_ONCE(*pmdp, pmd);
- pmd_clear(pmdp);
- pmd = READ_ONCE(*pmdp);
+ WRITE_ONCE(*args->pmdp, pmd);
+ pmd_clear(args->pmdp);
+ pmd = READ_ONCE(*args->pmdp);
WARN_ON(!pmd_none(pmd));
}
-static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
- pgtable_t pgtable)
+static void __init pmd_populate_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
@@ -550,15 +657,14 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
* This entry points to next level page table page.
* Hence this must not qualify as pmd_bad().
*/
- pmd_clear(pmdp);
- pmd_populate(mm, pmdp, pgtable);
- pmd = READ_ONCE(*pmdp);
+ pmd_populate(args->mm, args->pmdp, args->start_ptep);
+ pmd = READ_ONCE(*args->pmdp);
WARN_ON(pmd_bad(pmd));
}
-static void __init pte_special_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_special_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL))
return;
@@ -567,9 +673,9 @@ static void __init pte_special_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pte_special(pte_mkspecial(pte)));
}
-static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_protnone_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot_none);
if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
return;
@@ -580,63 +686,75 @@ static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_protnone_tests(struct pgtable_debug_args *args)
{
- pmd_t pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+ pmd_t pmd;
if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
return;
+ if (!has_transparent_hugepage())
+ return;
+
pr_debug("Validating PMD protnone\n");
+ pmd = pmd_mkhuge(pfn_pmd(args->fixed_pmd_pfn, args->page_prot_none));
WARN_ON(!pmd_protnone(pmd));
WARN_ON(!pmd_present(pmd));
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_protnone_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
-static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_devmap_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
pr_debug("Validating PTE devmap\n");
WARN_ON(!pte_devmap(pte_mkdevmap(pte)));
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_devmap_tests(struct pgtable_debug_args *args)
{
- pmd_t pmd = pfn_pmd(pfn, prot);
+ pmd_t pmd;
+
+ if (!has_transparent_hugepage())
+ return;
pr_debug("Validating PMD devmap\n");
+ pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd)));
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot)
+static void __init pud_devmap_tests(struct pgtable_debug_args *args)
{
- pud_t pud = pfn_pud(pfn, prot);
+ pud_t pud;
+
+ if (!has_transparent_hugepage())
+ return;
pr_debug("Validating PUD devmap\n");
+ pud = pfn_pud(args->fixed_pud_pfn, args->page_prot);
WARN_ON(!pud_devmap(pud_mkdevmap(pud)));
}
#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { }
+static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#else
-static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pte_devmap_tests(struct pgtable_debug_args *args) { }
+static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { }
+static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
-static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
return;
@@ -646,9 +764,9 @@ static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(pte_soft_dirty(pte_clear_soft_dirty(pte)));
}
-static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
return;
@@ -659,66 +777,109 @@ static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args)
{
- pmd_t pmd = pfn_pmd(pfn, prot);
+ pmd_t pmd;
if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
return;
+ if (!has_transparent_hugepage())
+ return;
+
pr_debug("Validating PMD soft dirty\n");
+ pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
WARN_ON(!pmd_soft_dirty(pmd_mksoft_dirty(pmd)));
WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd)));
}
-static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args)
{
- pmd_t pmd = pfn_pmd(pfn, prot);
+ pmd_t pmd;
if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) ||
!IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION))
return;
+ if (!has_transparent_hugepage())
+ return;
+
pr_debug("Validating PMD swap soft dirty\n");
+ pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd)));
WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd)));
}
-#else /* !CONFIG_ARCH_HAS_PTE_DEVMAP */
-static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { }
+static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args)
{
+ unsigned long max_swap_offset;
+ swp_entry_t entry, entry2;
+ pte_t pte;
+
+ pr_debug("Validating PTE swap exclusive\n");
+
+ /* See generic_max_swapfile_size(): probe the maximum offset */
+ max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL))));
+
+ /* Create a swp entry with all possible bits set */
+ entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset);
+
+ pte = swp_entry_to_pte(entry);
+ WARN_ON(pte_swp_exclusive(pte));
+ WARN_ON(!is_swap_pte(pte));
+ entry2 = pte_to_swp_entry(pte);
+ WARN_ON(memcmp(&entry, &entry2, sizeof(entry)));
+
+ pte = pte_swp_mkexclusive(pte);
+ WARN_ON(!pte_swp_exclusive(pte));
+ WARN_ON(!is_swap_pte(pte));
+ WARN_ON(pte_swp_soft_dirty(pte));
+ entry2 = pte_to_swp_entry(pte);
+ WARN_ON(memcmp(&entry, &entry2, sizeof(entry)));
+
+ pte = pte_swp_clear_exclusive(pte);
+ WARN_ON(pte_swp_exclusive(pte));
+ WARN_ON(!is_swap_pte(pte));
+ entry2 = pte_to_swp_entry(pte);
+ WARN_ON(memcmp(&entry, &entry2, sizeof(entry)));
}
-#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
-static void __init pte_swap_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_swap_tests(struct pgtable_debug_args *args)
{
swp_entry_t swp;
pte_t pte;
pr_debug("Validating PTE swap\n");
- pte = pfn_pte(pfn, prot);
+ pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
swp = __pte_to_swp_entry(pte);
pte = __swp_entry_to_pte(swp);
- WARN_ON(pfn != pte_pfn(pte));
+ WARN_ON(args->fixed_pte_pfn != pte_pfn(pte));
}
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
-static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_swap_tests(struct pgtable_debug_args *args)
{
swp_entry_t swp;
pmd_t pmd;
+ if (!has_transparent_hugepage())
+ return;
+
pr_debug("Validating PMD swap\n");
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
swp = __pmd_to_swp_entry(pmd);
pmd = __swp_entry_to_pmd(swp);
- WARN_ON(pfn != pmd_pfn(pmd));
+ WARN_ON(args->fixed_pmd_pfn != pmd_pfn(pmd));
}
#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
-static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_swap_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
-static void __init swap_migration_tests(void)
+static void __init swap_migration_tests(struct pgtable_debug_args *args)
{
struct page *page;
swp_entry_t swp;
@@ -726,42 +887,40 @@ static void __init swap_migration_tests(void)
if (!IS_ENABLED(CONFIG_MIGRATION))
return;
- pr_debug("Validating swap migration\n");
/*
* swap_migration_tests() requires a dedicated page as it needs to
* be locked before creating a migration entry from it. Locking the
* page that actually maps kernel text ('start_kernel') can be real
- * problematic. Lets allocate a dedicated page explicitly for this
- * purpose that will be freed subsequently.
+	 * problematic. Let's use the allocated page explicitly for this
+ * purpose.
*/
- page = alloc_page(GFP_KERNEL);
- if (!page) {
- pr_err("page allocation failed\n");
+ page = (args->pte_pfn != ULONG_MAX) ? pfn_to_page(args->pte_pfn) : NULL;
+ if (!page)
return;
- }
+
+ pr_debug("Validating swap migration\n");
/*
- * make_migration_entry() expects given page to be
- * locked, otherwise it stumbles upon a BUG_ON().
+	 * make_[readable|writable]_migration_entry() expects the given
+	 * page to be locked, otherwise it stumbles upon a BUG_ON().
*/
__SetPageLocked(page);
- swp = make_migration_entry(page, 1);
+ swp = make_writable_migration_entry(page_to_pfn(page));
WARN_ON(!is_migration_entry(swp));
- WARN_ON(!is_write_migration_entry(swp));
+ WARN_ON(!is_writable_migration_entry(swp));
- make_migration_entry_read(&swp);
+ swp = make_readable_migration_entry(swp_offset(swp));
WARN_ON(!is_migration_entry(swp));
- WARN_ON(is_write_migration_entry(swp));
+ WARN_ON(is_writable_migration_entry(swp));
- swp = make_migration_entry(page, 0);
+ swp = make_readable_migration_entry(page_to_pfn(page));
WARN_ON(!is_migration_entry(swp));
- WARN_ON(is_write_migration_entry(swp));
+ WARN_ON(is_writable_migration_entry(swp));
__ClearPageLocked(page);
- __free_page(page);
}
#ifdef CONFIG_HUGETLB_PAGE
-static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
+static void __init hugetlb_basic_tests(struct pgtable_debug_args *args)
{
struct page *page;
pte_t pte;
@@ -771,74 +930,25 @@ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
* Accessing the page associated with the pfn is safe here,
* as it was previously derived from a real kernel symbol.
*/
- page = pfn_to_page(pfn);
- pte = mk_huge_pte(page, prot);
+ page = pfn_to_page(args->fixed_pmd_pfn);
+ pte = mk_huge_pte(page, args->page_prot);
WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte)));
WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte))));
WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte))));
#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
- pte = pfn_pte(pfn, prot);
+ pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot);
- WARN_ON(!pte_huge(pte_mkhuge(pte)));
+ WARN_ON(!pte_huge(arch_make_huge_pte(pte, PMD_SHIFT, VM_ACCESS_FLAGS)));
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
}
-
-static void __init hugetlb_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma,
- pte_t *ptep, unsigned long pfn,
- unsigned long vaddr, pgprot_t prot)
-{
- struct page *page = pfn_to_page(pfn);
- pte_t pte = ptep_get(ptep);
- unsigned long paddr = __pfn_to_phys(pfn) & PMD_MASK;
-
- pr_debug("Validating HugeTLB advanced\n");
- pte = pte_mkhuge(mk_pte(pfn_to_page(PHYS_PFN(paddr)), prot));
- set_huge_pte_at(mm, vaddr, ptep, pte);
- barrier();
- WARN_ON(!pte_same(pte, huge_ptep_get(ptep)));
- huge_pte_clear(mm, vaddr, ptep, PMD_SIZE);
- pte = huge_ptep_get(ptep);
- WARN_ON(!huge_pte_none(pte));
-
- pte = mk_huge_pte(page, prot);
- set_huge_pte_at(mm, vaddr, ptep, pte);
- barrier();
- huge_ptep_set_wrprotect(mm, vaddr, ptep);
- pte = huge_ptep_get(ptep);
- WARN_ON(huge_pte_write(pte));
-
- pte = mk_huge_pte(page, prot);
- set_huge_pte_at(mm, vaddr, ptep, pte);
- barrier();
- huge_ptep_get_and_clear(mm, vaddr, ptep);
- pte = huge_ptep_get(ptep);
- WARN_ON(!huge_pte_none(pte));
-
- pte = mk_huge_pte(page, prot);
- pte = huge_pte_wrprotect(pte);
- set_huge_pte_at(mm, vaddr, ptep, pte);
- barrier();
- pte = huge_pte_mkwrite(pte);
- pte = huge_pte_mkdirty(pte);
- huge_ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
- pte = huge_ptep_get(ptep);
- WARN_ON(!(huge_pte_write(pte) && huge_pte_dirty(pte)));
-}
#else /* !CONFIG_HUGETLB_PAGE */
-static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init hugetlb_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma,
- pte_t *ptep, unsigned long pfn,
- unsigned long vaddr, pgprot_t prot)
-{
-}
+static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_thp_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
@@ -857,7 +967,7 @@ static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot)
* needs to return true. pmd_present() should be true whenever
* pmd_trans_huge() returns true.
*/
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
WARN_ON(!pmd_trans_huge(pmd_mkhuge(pmd)));
#ifndef __HAVE_ARCH_PMDP_INVALIDATE
@@ -867,7 +977,7 @@ static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot)
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot)
+static void __init pud_thp_tests(struct pgtable_debug_args *args)
{
pud_t pud;
@@ -875,7 +985,7 @@ static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PUD based THP\n");
- pud = pfn_pud(pfn, prot);
+ pud = pfn_pud(args->fixed_pud_pfn, args->page_prot);
WARN_ON(!pud_trans_huge(pud_mkhuge(pud)));
/*
@@ -887,11 +997,11 @@ static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot)
*/
}
#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_thp_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_thp_tests(struct pgtable_debug_args *args) { }
+static void __init pud_thp_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static unsigned long __init get_random_vaddr(void)
@@ -906,144 +1016,398 @@ static unsigned long __init get_random_vaddr(void)
return random_vaddr;
}
+static void __init destroy_args(struct pgtable_debug_args *args)
+{
+ struct page *page = NULL;
+
+ /* Free (huge) page */
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
+ has_transparent_hugepage() &&
+ args->pud_pfn != ULONG_MAX) {
+ if (args->is_contiguous_page) {
+ free_contig_range(args->pud_pfn,
+ (1 << (HPAGE_PUD_SHIFT - PAGE_SHIFT)));
+ } else {
+ page = pfn_to_page(args->pud_pfn);
+ __free_pages(page, HPAGE_PUD_SHIFT - PAGE_SHIFT);
+ }
+
+ args->pud_pfn = ULONG_MAX;
+ args->pmd_pfn = ULONG_MAX;
+ args->pte_pfn = ULONG_MAX;
+ }
+
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ has_transparent_hugepage() &&
+ args->pmd_pfn != ULONG_MAX) {
+ if (args->is_contiguous_page) {
+ free_contig_range(args->pmd_pfn, (1 << HPAGE_PMD_ORDER));
+ } else {
+ page = pfn_to_page(args->pmd_pfn);
+ __free_pages(page, HPAGE_PMD_ORDER);
+ }
+
+ args->pmd_pfn = ULONG_MAX;
+ args->pte_pfn = ULONG_MAX;
+ }
+
+ if (args->pte_pfn != ULONG_MAX) {
+ page = pfn_to_page(args->pte_pfn);
+ __free_page(page);
+
+ args->pte_pfn = ULONG_MAX;
+ }
+
+ /* Free page table entries */
+ if (args->start_ptep) {
+ pte_free(args->mm, args->start_ptep);
+ mm_dec_nr_ptes(args->mm);
+ }
+
+ if (args->start_pmdp) {
+ pmd_free(args->mm, args->start_pmdp);
+ mm_dec_nr_pmds(args->mm);
+ }
+
+ if (args->start_pudp) {
+ pud_free(args->mm, args->start_pudp);
+ mm_dec_nr_puds(args->mm);
+ }
+
+ if (args->start_p4dp)
+ p4d_free(args->mm, args->start_p4dp);
+
+ /* Free vma and mm struct */
+ if (args->vma)
+ vm_area_free(args->vma);
+
+ if (args->mm)
+ mmdrop(args->mm);
+}
+
+static struct page * __init
+debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order)
+{
+ struct page *page = NULL;
+
+#ifdef CONFIG_CONTIG_ALLOC
+ if (order > MAX_ORDER) {
+ page = alloc_contig_pages((1 << order), GFP_KERNEL,
+ first_online_node, NULL);
+ if (page) {
+ args->is_contiguous_page = true;
+ return page;
+ }
+ }
+#endif
+
+ if (order <= MAX_ORDER)
+ page = alloc_pages(GFP_KERNEL, order);
+
+ return page;
+}
+
+/*
+ * Check if a physical memory range described by <pstart, pend> contains
+ * an area that is of size psize, and aligned to psize.
+ *
+ * Don't use address 0: an all-zeroes physical address might mask bugs, and
+ * it's not used on x86.
+ */
+static void __init phys_align_check(phys_addr_t pstart,
+ phys_addr_t pend, unsigned long psize,
+ phys_addr_t *physp, unsigned long *alignp)
+{
+ phys_addr_t aligned_start, aligned_end;
+
+ if (pstart == 0)
+ pstart = PAGE_SIZE;
+
+ aligned_start = ALIGN(pstart, psize);
+ aligned_end = aligned_start + psize;
+
+ if (aligned_end > aligned_start && aligned_end <= pend) {
+ *alignp = psize;
+ *physp = aligned_start;
+ }
+}
+
+static void __init init_fixed_pfns(struct pgtable_debug_args *args)
+{
+ u64 idx;
+ phys_addr_t phys, pstart, pend;
+
+ /*
+ * Initialize the fixed pfns. To do this, try to find a
+ * valid physical range, preferably aligned to PUD_SIZE,
+ * but settling for aligned to PMD_SIZE as a fallback. If
+ * neither of those is found, use the physical address of
+ * the start_kernel symbol.
+ *
+ * The memory doesn't need to be allocated, it just needs to exist
+ * as usable memory. It won't be touched.
+ *
+ * The alignment is recorded, and can be checked to see if we
+ * can run the tests that require an actual valid physical
+ * address range on some architectures ({pmd,pud}_huge_test
+ * on x86).
+ */
+
+ phys = __pa_symbol(&start_kernel);
+ args->fixed_alignment = PAGE_SIZE;
+
+ for_each_mem_range(idx, &pstart, &pend) {
+ /* First check for a PUD-aligned area */
+ phys_align_check(pstart, pend, PUD_SIZE, &phys,
+ &args->fixed_alignment);
+
+ /* If a PUD-aligned area is found, we're done */
+ if (args->fixed_alignment == PUD_SIZE)
+ break;
+
+ /*
+ * If no PMD-aligned area found yet, check for one,
+ * but continue the loop to look for a PUD-aligned area.
+ */
+ if (args->fixed_alignment < PMD_SIZE)
+ phys_align_check(pstart, pend, PMD_SIZE, &phys,
+ &args->fixed_alignment);
+ }
+
+ args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK);
+ args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK);
+ args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK);
+ args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK);
+ args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK);
+ WARN_ON(!pfn_valid(args->fixed_pte_pfn));
+}
+
+
+static int __init init_args(struct pgtable_debug_args *args)
+{
+ struct page *page = NULL;
+ int ret = 0;
+
+ /*
+ * Initialize the debugging data.
+ *
+ * vm_get_page_prot(VM_NONE) or vm_get_page_prot(VM_SHARED|VM_NONE)
+ * will help create page table entries with PROT_NONE permission as
+ * required for pxx_protnone_tests().
+ */
+ memset(args, 0, sizeof(*args));
+ args->vaddr = get_random_vaddr();
+ args->page_prot = vm_get_page_prot(VM_ACCESS_FLAGS);
+ args->page_prot_none = vm_get_page_prot(VM_NONE);
+ args->is_contiguous_page = false;
+ args->pud_pfn = ULONG_MAX;
+ args->pmd_pfn = ULONG_MAX;
+ args->pte_pfn = ULONG_MAX;
+ args->fixed_pgd_pfn = ULONG_MAX;
+ args->fixed_p4d_pfn = ULONG_MAX;
+ args->fixed_pud_pfn = ULONG_MAX;
+ args->fixed_pmd_pfn = ULONG_MAX;
+ args->fixed_pte_pfn = ULONG_MAX;
+
+ /* Allocate mm and vma */
+ args->mm = mm_alloc();
+ if (!args->mm) {
+ pr_err("Failed to allocate mm struct\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ args->vma = vm_area_alloc(args->mm);
+ if (!args->vma) {
+ pr_err("Failed to allocate vma\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ /*
+ * Allocate page table entries. They will be modified in the tests.
+	 * Let's save the page table entries so that they can be released
+ * when the tests are completed.
+ */
+ args->pgdp = pgd_offset(args->mm, args->vaddr);
+ args->p4dp = p4d_alloc(args->mm, args->pgdp, args->vaddr);
+ if (!args->p4dp) {
+ pr_err("Failed to allocate p4d entries\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+ args->start_p4dp = p4d_offset(args->pgdp, 0UL);
+ WARN_ON(!args->start_p4dp);
+
+ args->pudp = pud_alloc(args->mm, args->p4dp, args->vaddr);
+ if (!args->pudp) {
+ pr_err("Failed to allocate pud entries\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+ args->start_pudp = pud_offset(args->p4dp, 0UL);
+ WARN_ON(!args->start_pudp);
+
+ args->pmdp = pmd_alloc(args->mm, args->pudp, args->vaddr);
+ if (!args->pmdp) {
+ pr_err("Failed to allocate pmd entries\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+ args->start_pmdp = pmd_offset(args->pudp, 0UL);
+ WARN_ON(!args->start_pmdp);
+
+ if (pte_alloc(args->mm, args->pmdp)) {
+ pr_err("Failed to allocate pte entries\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+ args->start_ptep = pmd_pgtable(READ_ONCE(*args->pmdp));
+ WARN_ON(!args->start_ptep);
+
+ init_fixed_pfns(args);
+
+ /*
+ * Allocate (huge) pages because some of the tests need to access
+ * the data in the pages. The corresponding tests will be skipped
+ * if we fail to allocate (huge) pages.
+ */
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
+ has_transparent_hugepage()) {
+ page = debug_vm_pgtable_alloc_huge_page(args,
+ HPAGE_PUD_SHIFT - PAGE_SHIFT);
+ if (page) {
+ args->pud_pfn = page_to_pfn(page);
+ args->pmd_pfn = args->pud_pfn;
+ args->pte_pfn = args->pud_pfn;
+ return 0;
+ }
+ }
+
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ has_transparent_hugepage()) {
+ page = debug_vm_pgtable_alloc_huge_page(args, HPAGE_PMD_ORDER);
+ if (page) {
+ args->pmd_pfn = page_to_pfn(page);
+ args->pte_pfn = args->pmd_pfn;
+ return 0;
+ }
+ }
+
+ page = alloc_page(GFP_KERNEL);
+ if (page)
+ args->pte_pfn = page_to_pfn(page);
+
+ return 0;
+
+error:
+ destroy_args(args);
+ return ret;
+}
+
static int __init debug_vm_pgtable(void)
{
- struct vm_area_struct *vma;
- struct mm_struct *mm;
- pgd_t *pgdp;
- p4d_t *p4dp, *saved_p4dp;
- pud_t *pudp, *saved_pudp;
- pmd_t *pmdp, *saved_pmdp, pmd;
- pte_t *ptep;
- pgtable_t saved_ptep;
- pgprot_t prot, protnone;
- phys_addr_t paddr;
- unsigned long vaddr, pte_aligned, pmd_aligned;
- unsigned long pud_aligned, p4d_aligned, pgd_aligned;
+ struct pgtable_debug_args args;
spinlock_t *ptl = NULL;
+ int idx, ret;
pr_info("Validating architecture page table helpers\n");
- prot = vm_get_page_prot(VMFLAGS);
- vaddr = get_random_vaddr();
- mm = mm_alloc();
- if (!mm) {
- pr_err("mm_struct allocation failed\n");
- return 1;
- }
+ ret = init_args(&args);
+ if (ret)
+ return ret;
/*
- * __P000 (or even __S000) will help create page table entries with
- * PROT_NONE permission as required for pxx_protnone_tests().
+ * Iterate over each possible vm_flags to make sure that all
+ * the basic page table transformation validations just hold
+ * true irrespective of the starting protection value for a
+ * given page table entry.
+ *
+	 * Protection-based vm_flags combinations are always linear
+	 * and increasing, i.e. starting from VM_NONE and going up to
+ * (VM_SHARED | READ | WRITE | EXEC).
*/
- protnone = __P000;
+#define VM_FLAGS_START (VM_NONE)
+#define VM_FLAGS_END (VM_SHARED | VM_EXEC | VM_WRITE | VM_READ)
- vma = vm_area_alloc(mm);
- if (!vma) {
- pr_err("vma allocation failed\n");
- return 1;
+ for (idx = VM_FLAGS_START; idx <= VM_FLAGS_END; idx++) {
+ pte_basic_tests(&args, idx);
+ pmd_basic_tests(&args, idx);
+ pud_basic_tests(&args, idx);
}
/*
- * PFN for mapping at PTE level is determined from a standard kernel
- * text symbol. But pfns for higher page table levels are derived by
- * masking lower bits of this real pfn. These derived pfns might not
- * exist on the platform but that does not really matter as pfn_pxx()
- * helpers will still create appropriate entries for the test. This
- * helps avoid large memory block allocations to be used for mapping
- * at higher page table levels.
+	 * Both P4D and PGD level tests are very basic and do not
+	 * involve creating page table entries from the protection
+	 * value and the given pfn. Hence just keep them out of
+ * the above iteration for now to save some test execution
+ * time.
*/
- paddr = __pa_symbol(&start_kernel);
+ p4d_basic_tests(&args);
+ pgd_basic_tests(&args);
- pte_aligned = (paddr & PAGE_MASK) >> PAGE_SHIFT;
- pmd_aligned = (paddr & PMD_MASK) >> PAGE_SHIFT;
- pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT;
- p4d_aligned = (paddr & P4D_MASK) >> PAGE_SHIFT;
- pgd_aligned = (paddr & PGDIR_MASK) >> PAGE_SHIFT;
- WARN_ON(!pfn_valid(pte_aligned));
+ pmd_leaf_tests(&args);
+ pud_leaf_tests(&args);
- pgdp = pgd_offset(mm, vaddr);
- p4dp = p4d_alloc(mm, pgdp, vaddr);
- pudp = pud_alloc(mm, p4dp, vaddr);
- pmdp = pmd_alloc(mm, pudp, vaddr);
- ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl);
+ pte_special_tests(&args);
+ pte_protnone_tests(&args);
+ pmd_protnone_tests(&args);
+
+ pte_devmap_tests(&args);
+ pmd_devmap_tests(&args);
+ pud_devmap_tests(&args);
+
+ pte_soft_dirty_tests(&args);
+ pmd_soft_dirty_tests(&args);
+ pte_swap_soft_dirty_tests(&args);
+ pmd_swap_soft_dirty_tests(&args);
+
+ pte_swap_exclusive_tests(&args);
+
+ pte_swap_tests(&args);
+ pmd_swap_tests(&args);
+
+ swap_migration_tests(&args);
+
+ pmd_thp_tests(&args);
+ pud_thp_tests(&args);
+
+ hugetlb_basic_tests(&args);
/*
- * Save all the page table page addresses as the page table
- * entries will be used for testing with random or garbage
- * values. These saved addresses will be used for freeing
- * page table pages.
+	 * Page table modifying tests. They need to hold
+	 * the proper page table lock.
*/
- pmd = READ_ONCE(*pmdp);
- saved_p4dp = p4d_offset(pgdp, 0UL);
- saved_pudp = pud_offset(p4dp, 0UL);
- saved_pmdp = pmd_offset(pudp, 0UL);
- saved_ptep = pmd_pgtable(pmd);
-
- pte_basic_tests(pte_aligned, prot);
- pmd_basic_tests(pmd_aligned, prot);
- pud_basic_tests(pud_aligned, prot);
- p4d_basic_tests(p4d_aligned, prot);
- pgd_basic_tests(pgd_aligned, prot);
-
- pte_clear_tests(mm, ptep, vaddr);
- pmd_clear_tests(mm, pmdp);
- pud_clear_tests(mm, pudp);
- p4d_clear_tests(mm, p4dp);
- pgd_clear_tests(mm, pgdp);
-
- pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
- pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot);
- pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
- hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
-
- pmd_leaf_tests(pmd_aligned, prot);
- pud_leaf_tests(pud_aligned, prot);
-
- pmd_huge_tests(pmdp, pmd_aligned, prot);
- pud_huge_tests(pudp, pud_aligned, prot);
-
- pte_savedwrite_tests(pte_aligned, prot);
- pmd_savedwrite_tests(pmd_aligned, prot);
-
- pte_unmap_unlock(ptep, ptl);
-
- pmd_populate_tests(mm, pmdp, saved_ptep);
- pud_populate_tests(mm, pudp, saved_pmdp);
- p4d_populate_tests(mm, p4dp, saved_pudp);
- pgd_populate_tests(mm, pgdp, saved_p4dp);
-
- pte_special_tests(pte_aligned, prot);
- pte_protnone_tests(pte_aligned, protnone);
- pmd_protnone_tests(pmd_aligned, protnone);
-
- pte_devmap_tests(pte_aligned, prot);
- pmd_devmap_tests(pmd_aligned, prot);
- pud_devmap_tests(pud_aligned, prot);
-
- pte_soft_dirty_tests(pte_aligned, prot);
- pmd_soft_dirty_tests(pmd_aligned, prot);
- pte_swap_soft_dirty_tests(pte_aligned, prot);
- pmd_swap_soft_dirty_tests(pmd_aligned, prot);
-
- pte_swap_tests(pte_aligned, prot);
- pmd_swap_tests(pmd_aligned, prot);
-
- swap_migration_tests();
- hugetlb_basic_tests(pte_aligned, prot);
-
- pmd_thp_tests(pmd_aligned, prot);
- pud_thp_tests(pud_aligned, prot);
-
- p4d_free(mm, saved_p4dp);
- pud_free(mm, saved_pudp);
- pmd_free(mm, saved_pmdp);
- pte_free(mm, saved_ptep);
-
- vm_area_free(vma);
- mm_dec_nr_puds(mm);
- mm_dec_nr_pmds(mm);
- mm_dec_nr_ptes(mm);
- mmdrop(mm);
+
+ args.ptep = pte_offset_map_lock(args.mm, args.pmdp, args.vaddr, &ptl);
+ pte_clear_tests(&args);
+ pte_advanced_tests(&args);
+ if (args.ptep)
+ pte_unmap_unlock(args.ptep, ptl);
+
+ ptl = pmd_lock(args.mm, args.pmdp);
+ pmd_clear_tests(&args);
+ pmd_advanced_tests(&args);
+ pmd_huge_tests(&args);
+ pmd_populate_tests(&args);
+ spin_unlock(ptl);
+
+ ptl = pud_lock(args.mm, args.pudp);
+ pud_clear_tests(&args);
+ pud_advanced_tests(&args);
+ pud_huge_tests(&args);
+ pud_populate_tests(&args);
+ spin_unlock(ptl);
+
+ spin_lock(&(args.mm->page_table_lock));
+ p4d_clear_tests(&args);
+ pgd_clear_tests(&args);
+ p4d_populate_tests(&args);
+ pgd_populate_tests(&args);
+ spin_unlock(&(args.mm->page_table_lock));
+
+ destroy_args(&args);
return 0;
}
late_initcall(debug_vm_pgtable);
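
The new debug_vm_pgtable() entry point above runs the pte/pmd/pud basic tests once per protection combination by treating vm_flags values as consecutive integers. A minimal userspace sketch of why that works, using the generic flag values from include/linux/mm.h; the program itself is only an illustration, not kernel code.

#include <stdio.h>

#define VM_NONE   0x0UL
#define VM_READ   0x1UL
#define VM_WRITE  0x2UL
#define VM_EXEC   0x4UL
#define VM_SHARED 0x8UL

int main(void)
{
	const unsigned long end = VM_SHARED | VM_EXEC | VM_WRITE | VM_READ;
	unsigned long idx;

	/* The protection flags occupy the low, contiguous bits, so counting
	 * from 0x0 to 0xf visits every combination exactly once. */
	for (idx = VM_NONE; idx <= end; idx++)
		printf("0x%lx: %c%c%c%c\n", idx,
		       (idx & VM_READ)   ? 'r' : '-',
		       (idx & VM_WRITE)  ? 'w' : '-',
		       (idx & VM_EXEC)   ? 'x' : '-',
		       (idx & VM_SHARED) ? 's' : '-');
	return 0;
}
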
diff --git a/mm/dmapool.c b/mm/dmapool.c
index a97c97232337..a151a21e571b 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -15,7 +15,7 @@
* represented by the 'struct dma_pool' which keeps a doubly-linked list of
* allocated pages. Each page in the page_list is split into blocks of at
* least 'size' bytes. Free blocks are tracked in an unsorted singly-linked
- * list of free blocks within the page. Used blocks aren't tracked, but we
+ * list of free blocks across all pages. Used blocks aren't tracked, but we
* keep a count of how many are currently allocated from each page.
*/
@@ -28,6 +28,7 @@
#include <linux/mutex.h>
#include <linux/poison.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/spinlock.h>
@@ -39,13 +40,22 @@
#define DMAPOOL_DEBUG 1
#endif
+struct dma_block {
+ struct dma_block *next_block;
+ dma_addr_t dma;
+};
+
struct dma_pool { /* the pool */
struct list_head page_list;
spinlock_t lock;
- size_t size;
+ struct dma_block *next_block;
+ size_t nr_blocks;
+ size_t nr_active;
+ size_t nr_pages;
struct device *dev;
- size_t allocation;
- size_t boundary;
+ unsigned int size;
+ unsigned int allocation;
+ unsigned int boundary;
char name[32];
struct list_head pools;
};
@@ -54,55 +64,139 @@ struct dma_page { /* cacheable header for 'allocation' bytes */
struct list_head page_list;
void *vaddr;
dma_addr_t dma;
- unsigned int in_use;
- unsigned int offset;
};
static DEFINE_MUTEX(pools_lock);
static DEFINE_MUTEX(pools_reg_lock);
-static ssize_t
-show_pools(struct device *dev, struct device_attribute *attr, char *buf)
+static ssize_t pools_show(struct device *dev, struct device_attribute *attr, char *buf)
{
- unsigned temp;
- unsigned size;
- char *next;
- struct dma_page *page;
struct dma_pool *pool;
+ unsigned size;
- next = buf;
- size = PAGE_SIZE;
-
- temp = scnprintf(next, size, "poolinfo - 0.1\n");
- size -= temp;
- next += temp;
+ size = sysfs_emit(buf, "poolinfo - 0.1\n");
mutex_lock(&pools_lock);
list_for_each_entry(pool, &dev->dma_pools, pools) {
- unsigned pages = 0;
- unsigned blocks = 0;
+ /* per-pool info, no real statistics yet */
+ size += sysfs_emit_at(buf, size, "%-16s %4zu %4zu %4u %2zu\n",
+ pool->name, pool->nr_active,
+ pool->nr_blocks, pool->size,
+ pool->nr_pages);
+ }
+ mutex_unlock(&pools_lock);
+
+ return size;
+}
+
+static DEVICE_ATTR_RO(pools);
+
+#ifdef DMAPOOL_DEBUG
+static void pool_check_block(struct dma_pool *pool, struct dma_block *block,
+ gfp_t mem_flags)
+{
+ u8 *data = (void *)block;
+ int i;
+
+ for (i = sizeof(struct dma_block); i < pool->size; i++) {
+ if (data[i] == POOL_POISON_FREED)
+ continue;
+ dev_err(pool->dev, "%s %s, %p (corrupted)\n", __func__,
+ pool->name, block);
+
+ /*
+ * Dump the first 4 bytes even if they are not
+ * POOL_POISON_FREED
+ */
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1,
+ data, pool->size, 1);
+ break;
+ }
+
+ if (!want_init_on_alloc(mem_flags))
+ memset(block, POOL_POISON_ALLOCATED, pool->size);
+}
+
+static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
+{
+ struct dma_page *page;
+
+ list_for_each_entry(page, &pool->page_list, page_list) {
+ if (dma < page->dma)
+ continue;
+ if ((dma - page->dma) < pool->allocation)
+ return page;
+ }
+ return NULL;
+}
+
+static bool pool_block_err(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
+{
+ struct dma_block *block = pool->next_block;
+ struct dma_page *page;
- spin_lock_irq(&pool->lock);
- list_for_each_entry(page, &pool->page_list, page_list) {
- pages++;
- blocks += page->in_use;
+ page = pool_find_page(pool, dma);
+ if (!page) {
+ dev_err(pool->dev, "%s %s, %p/%pad (bad dma)\n",
+ __func__, pool->name, vaddr, &dma);
+ return true;
+ }
+
+ while (block) {
+ if (block != vaddr) {
+ block = block->next_block;
+ continue;
}
- spin_unlock_irq(&pool->lock);
+ dev_err(pool->dev, "%s %s, dma %pad already free\n",
+ __func__, pool->name, &dma);
+ return true;
+ }
- /* per-pool info, no real statistics yet */
- temp = scnprintf(next, size, "%-16s %4u %4zu %4zu %2u\n",
- pool->name, blocks,
- pages * (pool->allocation / pool->size),
- pool->size, pages);
- size -= temp;
- next += temp;
+ memset(vaddr, POOL_POISON_FREED, pool->size);
+ return false;
+}
+
+static void pool_init_page(struct dma_pool *pool, struct dma_page *page)
+{
+ memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
+}
+#else
+static void pool_check_block(struct dma_pool *pool, struct dma_block *block,
+ gfp_t mem_flags)
+{
+}
+
+static bool pool_block_err(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
+{
+ if (want_init_on_free())
+ memset(vaddr, 0, pool->size);
+ return false;
+}
+
+static void pool_init_page(struct dma_pool *pool, struct dma_page *page)
+{
+}
+#endif
+
+static struct dma_block *pool_block_pop(struct dma_pool *pool)
+{
+ struct dma_block *block = pool->next_block;
+
+ if (block) {
+ pool->next_block = block->next_block;
+ pool->nr_active++;
}
- mutex_unlock(&pools_lock);
+ return block;
+}
- return PAGE_SIZE - size;
+static void pool_block_push(struct dma_pool *pool, struct dma_block *block,
+ dma_addr_t dma)
+{
+ block->dma = dma;
+ block->next_block = pool->next_block;
+ pool->next_block = block;
}
-static DEVICE_ATTR(pools, 0444, show_pools, NULL);
/**
* dma_pool_create - Creates a pool of consistent memory blocks, for dma.
@@ -132,17 +226,20 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
{
struct dma_pool *retval;
size_t allocation;
- bool empty = false;
+ bool empty;
+
+ if (!dev)
+ return NULL;
if (align == 0)
align = 1;
else if (align & (align - 1))
return NULL;
- if (size == 0)
+ if (size == 0 || size > INT_MAX)
return NULL;
- else if (size < 4)
- size = 4;
+ if (size < sizeof(struct dma_block))
+ size = sizeof(struct dma_block);
size = ALIGN(size, align);
allocation = max_t(size_t, size, PAGE_SIZE);
@@ -152,11 +249,13 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
else if ((boundary < size) || (boundary & (boundary - 1)))
return NULL;
- retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
+ boundary = min(boundary, allocation);
+
+ retval = kzalloc(sizeof(*retval), GFP_KERNEL);
if (!retval)
return retval;
- strlcpy(retval->name, name, sizeof(retval->name));
+ strscpy(retval->name, name, sizeof(retval->name));
retval->dev = dev;
@@ -165,7 +264,6 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
retval->size = size;
retval->boundary = boundary;
retval->allocation = allocation;
-
INIT_LIST_HEAD(&retval->pools);
/*
@@ -178,8 +276,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
*/
mutex_lock(&pools_reg_lock);
mutex_lock(&pools_lock);
- if (list_empty(&dev->dma_pools))
- empty = true;
+ empty = list_empty(&dev->dma_pools);
list_add(&retval->pools, &dev->dma_pools);
mutex_unlock(&pools_lock);
if (empty) {
@@ -202,18 +299,36 @@ EXPORT_SYMBOL(dma_pool_create);
static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
{
- unsigned int offset = 0;
- unsigned int next_boundary = pool->boundary;
+ unsigned int next_boundary = pool->boundary, offset = 0;
+ struct dma_block *block, *first = NULL, *last = NULL;
- do {
- unsigned int next = offset + pool->size;
- if (unlikely((next + pool->size) >= next_boundary)) {
- next = next_boundary;
+ pool_init_page(pool, page);
+ while (offset + pool->size <= pool->allocation) {
+ if (offset + pool->size > next_boundary) {
+ offset = next_boundary;
next_boundary += pool->boundary;
+ continue;
}
- *(int *)(page->vaddr + offset) = next;
- offset = next;
- } while (offset < pool->allocation);
+
+ block = page->vaddr + offset;
+ block->dma = page->dma + offset;
+ block->next_block = NULL;
+
+ if (last)
+ last->next_block = block;
+ else
+ first = block;
+ last = block;
+
+ offset += pool->size;
+ pool->nr_blocks++;
+ }
+
+ last->next_block = pool->next_block;
+ pool->next_block = first;
+
+ list_add(&page->page_list, &pool->page_list);
+ pool->nr_pages++;
}
static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
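
The rewritten pool_initialise_page() above threads a struct dma_block through every slot of the page while skipping any position where a block would cross pool->boundary. The standalone sketch below reproduces just that offset arithmetic, with made-up sizes (a 4096-byte allocation, 96-byte blocks, 1024-byte boundary); it is an illustration of the carving logic, not the kernel function.

#include <stdio.h>

#define ALLOCATION 4096u
#define BLK_SIZE     96u
#define BOUNDARY   1024u

int main(void)
{
	unsigned int offset = 0, next_boundary = BOUNDARY, nr_blocks = 0;

	while (offset + BLK_SIZE <= ALLOCATION) {
		if (offset + BLK_SIZE > next_boundary) {
			/* Skip the tail that would straddle the boundary. */
			offset = next_boundary;
			next_boundary += BOUNDARY;
			continue;
		}
		printf("block %2u at offset %4u\n", nr_blocks, offset);
		nr_blocks++;
		offset += BLK_SIZE;
	}
	printf("%u blocks carved out of %u bytes\n", nr_blocks, ALLOCATION);
	return 0;
}
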
@@ -223,37 +338,15 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
page = kmalloc(sizeof(*page), mem_flags);
if (!page)
return NULL;
+
page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation,
&page->dma, mem_flags);
- if (page->vaddr) {
-#ifdef DMAPOOL_DEBUG
- memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
-#endif
- pool_initialise_page(pool, page);
- page->in_use = 0;
- page->offset = 0;
- } else {
+ if (!page->vaddr) {
kfree(page);
- page = NULL;
+ return NULL;
}
- return page;
-}
-
-static inline bool is_page_busy(struct dma_page *page)
-{
- return page->in_use != 0;
-}
-static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
-{
- dma_addr_t dma = page->dma;
-
-#ifdef DMAPOOL_DEBUG
- memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
-#endif
- dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma);
- list_del(&page->page_list);
- kfree(page);
+ return page;
}
/**
@@ -267,7 +360,7 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
void dma_pool_destroy(struct dma_pool *pool)
{
struct dma_page *page, *tmp;
- bool empty = false;
+ bool empty, busy = false;
if (unlikely(!pool))
return;
@@ -275,26 +368,23 @@ void dma_pool_destroy(struct dma_pool *pool)
mutex_lock(&pools_reg_lock);
mutex_lock(&pools_lock);
list_del(&pool->pools);
- if (pool->dev && list_empty(&pool->dev->dma_pools))
- empty = true;
+ empty = list_empty(&pool->dev->dma_pools);
mutex_unlock(&pools_lock);
if (empty)
device_remove_file(pool->dev, &dev_attr_pools);
mutex_unlock(&pools_reg_lock);
+ if (pool->nr_active) {
+ dev_err(pool->dev, "%s %s busy\n", __func__, pool->name);
+ busy = true;
+ }
+
list_for_each_entry_safe(page, tmp, &pool->page_list, page_list) {
- if (is_page_busy(page)) {
- if (pool->dev)
- dev_err(pool->dev, "%s %s, %p busy\n", __func__,
- pool->name, page->vaddr);
- else
- pr_err("%s %s, %p busy\n", __func__,
- pool->name, page->vaddr);
- /* leak the still-in-use consistent memory */
- list_del(&page->page_list);
- kfree(page);
- } else
- pool_free_page(pool, page);
+ if (!busy)
+ dma_free_coherent(pool->dev, pool->allocation,
+ page->vaddr, page->dma);
+ list_del(&page->page_list);
+ kfree(page);
}
kfree(pool);
@@ -314,84 +404,40 @@ EXPORT_SYMBOL(dma_pool_destroy);
void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
dma_addr_t *handle)
{
- unsigned long flags;
+ struct dma_block *block;
struct dma_page *page;
- size_t offset;
- void *retval;
+ unsigned long flags;
- might_sleep_if(gfpflags_allow_blocking(mem_flags));
+ might_alloc(mem_flags);
spin_lock_irqsave(&pool->lock, flags);
- list_for_each_entry(page, &pool->page_list, page_list) {
- if (page->offset < pool->allocation)
- goto ready;
- }
-
- /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
- spin_unlock_irqrestore(&pool->lock, flags);
-
- page = pool_alloc_page(pool, mem_flags & (~__GFP_ZERO));
- if (!page)
- return NULL;
+ block = pool_block_pop(pool);
+ if (!block) {
+ /*
+ * pool_alloc_page() might sleep, so temporarily drop
+ * &pool->lock
+ */
+ spin_unlock_irqrestore(&pool->lock, flags);
- spin_lock_irqsave(&pool->lock, flags);
+ page = pool_alloc_page(pool, mem_flags & (~__GFP_ZERO));
+ if (!page)
+ return NULL;
- list_add(&page->page_list, &pool->page_list);
- ready:
- page->in_use++;
- offset = page->offset;
- page->offset = *(int *)(page->vaddr + offset);
- retval = offset + page->vaddr;
- *handle = offset + page->dma;
-#ifdef DMAPOOL_DEBUG
- {
- int i;
- u8 *data = retval;
- /* page->offset is stored in first 4 bytes */
- for (i = sizeof(page->offset); i < pool->size; i++) {
- if (data[i] == POOL_POISON_FREED)
- continue;
- if (pool->dev)
- dev_err(pool->dev, "%s %s, %p (corrupted)\n",
- __func__, pool->name, retval);
- else
- pr_err("%s %s, %p (corrupted)\n",
- __func__, pool->name, retval);
-
- /*
- * Dump the first 4 bytes even if they are not
- * POOL_POISON_FREED
- */
- print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1,
- data, pool->size, 1);
- break;
- }
+ spin_lock_irqsave(&pool->lock, flags);
+ pool_initialise_page(pool, page);
+ block = pool_block_pop(pool);
}
- if (!(mem_flags & __GFP_ZERO))
- memset(retval, POOL_POISON_ALLOCATED, pool->size);
-#endif
spin_unlock_irqrestore(&pool->lock, flags);
+ *handle = block->dma;
+ pool_check_block(pool, block, mem_flags);
if (want_init_on_alloc(mem_flags))
- memset(retval, 0, pool->size);
+ memset(block, 0, pool->size);
- return retval;
+ return block;
}
EXPORT_SYMBOL(dma_pool_alloc);
-static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
-{
- struct dma_page *page;
-
- list_for_each_entry(page, &pool->page_list, page_list) {
- if (dma < page->dma)
- continue;
- if ((dma - page->dma) < pool->allocation)
- return page;
- }
- return NULL;
-}
-
/**
* dma_pool_free - put block back into dma pool
* @pool: the dma pool holding the block
@@ -403,65 +449,14 @@ static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
*/
void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
{
- struct dma_page *page;
+ struct dma_block *block = vaddr;
unsigned long flags;
- unsigned int offset;
spin_lock_irqsave(&pool->lock, flags);
- page = pool_find_page(pool, dma);
- if (!page) {
- spin_unlock_irqrestore(&pool->lock, flags);
- if (pool->dev)
- dev_err(pool->dev, "%s %s, %p/%pad (bad dma)\n",
- __func__, pool->name, vaddr, &dma);
- else
- pr_err("%s %s, %p/%pad (bad dma)\n",
- __func__, pool->name, vaddr, &dma);
- return;
- }
-
- offset = vaddr - page->vaddr;
- if (want_init_on_free())
- memset(vaddr, 0, pool->size);
-#ifdef DMAPOOL_DEBUG
- if ((dma - page->dma) != offset) {
- spin_unlock_irqrestore(&pool->lock, flags);
- if (pool->dev)
- dev_err(pool->dev, "%s %s, %p (bad vaddr)/%pad\n",
- __func__, pool->name, vaddr, &dma);
- else
- pr_err("%s %s, %p (bad vaddr)/%pad\n",
- __func__, pool->name, vaddr, &dma);
- return;
+ if (!pool_block_err(pool, vaddr, dma)) {
+ pool_block_push(pool, block, dma);
+ pool->nr_active--;
}
- {
- unsigned int chain = page->offset;
- while (chain < pool->allocation) {
- if (chain != offset) {
- chain = *(int *)(page->vaddr + chain);
- continue;
- }
- spin_unlock_irqrestore(&pool->lock, flags);
- if (pool->dev)
- dev_err(pool->dev, "%s %s, dma %pad already free\n",
- __func__, pool->name, &dma);
- else
- pr_err("%s %s, dma %pad already free\n",
- __func__, pool->name, &dma);
- return;
- }
- }
- memset(vaddr, POOL_POISON_FREED, pool->size);
-#endif
-
- page->in_use--;
- *(int *)vaddr = page->offset;
- page->offset = offset;
- /*
- * Resist a temptation to do
- * if (!is_page_busy(page)) pool_free_page(pool, page);
- * Better have a few empty pages hang around.
- */
spin_unlock_irqrestore(&pool->lock, flags);
}
EXPORT_SYMBOL(dma_pool_free);
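
With this rework the free blocks themselves carry the bookkeeping: a struct dma_block header is written into each free block and all of them hang off a single pool-wide LIFO, which is also why dma_pool_create() now rounds the block size up to sizeof(struct dma_block) (typically 16 bytes when dma_addr_t is 64-bit). A self-contained userspace sketch of that embedded free list, using malloc'd memory and fake bus addresses in place of coherent DMA memory:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

typedef uint64_t dma_addr_t;		/* assumption: 64-bit bus addresses */

struct dma_block {
	struct dma_block *next_block;
	dma_addr_t dma;
};

struct pool {
	struct dma_block *next_block;	/* pool-wide LIFO of free blocks */
	size_t nr_active;
};

/* Mirrors pool_block_push(): the header lives inside the free block. */
static void block_push(struct pool *pool, void *vaddr, dma_addr_t dma)
{
	struct dma_block *block = vaddr;

	block->dma = dma;
	block->next_block = pool->next_block;
	pool->next_block = block;
}

/* Mirrors pool_block_pop(): O(1) allocation from the head of the list. */
static struct dma_block *block_pop(struct pool *pool)
{
	struct dma_block *block = pool->next_block;

	if (block) {
		pool->next_block = block->next_block;
		pool->nr_active++;
	}
	return block;
}

int main(void)
{
	struct pool pool = { 0 };
	enum { BLK = 64, NR = 4 };
	char *slab = malloc(NR * BLK);

	assert(slab);
	for (int i = 0; i < NR; i++)
		block_push(&pool, slab + i * BLK, 0x1000 + i * BLK);

	struct dma_block *b = block_pop(&pool);
	assert(b && b->dma == 0x1000 + (NR - 1) * BLK);	/* LIFO order */
	block_push(&pool, b, b->dma);			/* "free" it again */
	pool.nr_active--;
	free(slab);
	return 0;
}
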
diff --git a/mm/dmapool_test.c b/mm/dmapool_test.c
new file mode 100644
index 000000000000..370fb9e209ef
--- /dev/null
+++ b/mm/dmapool_test.c
@@ -0,0 +1,147 @@
+#include <linux/device.h>
+#include <linux/dma-map-ops.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmapool.h>
+#include <linux/kernel.h>
+#include <linux/ktime.h>
+#include <linux/module.h>
+
+#define NR_TESTS (100)
+
+struct dma_pool_pair {
+ dma_addr_t dma;
+ void *v;
+};
+
+struct dmapool_parms {
+ size_t size;
+ size_t align;
+ size_t boundary;
+};
+
+static const struct dmapool_parms pool_parms[] = {
+ { .size = 16, .align = 16, .boundary = 0 },
+ { .size = 64, .align = 64, .boundary = 0 },
+ { .size = 256, .align = 256, .boundary = 0 },
+ { .size = 1024, .align = 1024, .boundary = 0 },
+ { .size = 4096, .align = 4096, .boundary = 0 },
+ { .size = 68, .align = 32, .boundary = 4096 },
+};
+
+static struct dma_pool *pool;
+static struct device test_dev;
+static u64 dma_mask;
+
+static inline int nr_blocks(int size)
+{
+ return clamp_t(int, (PAGE_SIZE / size) * 512, 1024, 8192);
+}
+
+static int dmapool_test_alloc(struct dma_pool_pair *p, int blocks)
+{
+ int i;
+
+ for (i = 0; i < blocks; i++) {
+ p[i].v = dma_pool_alloc(pool, GFP_KERNEL,
+ &p[i].dma);
+ if (!p[i].v)
+ goto pool_fail;
+ }
+
+ for (i = 0; i < blocks; i++)
+ dma_pool_free(pool, p[i].v, p[i].dma);
+
+ return 0;
+
+pool_fail:
+ for (--i; i >= 0; i--)
+ dma_pool_free(pool, p[i].v, p[i].dma);
+ return -ENOMEM;
+}
+
+static int dmapool_test_block(const struct dmapool_parms *parms)
+{
+ int blocks = nr_blocks(parms->size);
+ ktime_t start_time, end_time;
+ struct dma_pool_pair *p;
+ int i, ret;
+
+ p = kcalloc(blocks, sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ pool = dma_pool_create("test pool", &test_dev, parms->size,
+ parms->align, parms->boundary);
+ if (!pool) {
+ ret = -ENOMEM;
+ goto free_pairs;
+ }
+
+ start_time = ktime_get();
+ for (i = 0; i < NR_TESTS; i++) {
+ ret = dmapool_test_alloc(p, blocks);
+ if (ret)
+ goto free_pool;
+ if (need_resched())
+ cond_resched();
+ }
+ end_time = ktime_get();
+
+ printk("dmapool test: size:%-4zu align:%-4zu blocks:%-4d time:%llu\n",
+ parms->size, parms->align, blocks,
+ ktime_us_delta(end_time, start_time));
+
+free_pool:
+ dma_pool_destroy(pool);
+free_pairs:
+ kfree(p);
+ return ret;
+}
+
+static void dmapool_test_release(struct device *dev)
+{
+}
+
+static int dmapool_checks(void)
+{
+ int i, ret;
+
+ ret = dev_set_name(&test_dev, "dmapool-test");
+ if (ret)
+ return ret;
+
+ ret = device_register(&test_dev);
+ if (ret) {
+ printk("%s: register failed:%d\n", __func__, ret);
+ goto put_device;
+ }
+
+ test_dev.release = dmapool_test_release;
+ set_dma_ops(&test_dev, NULL);
+ test_dev.dma_mask = &dma_mask;
+ ret = dma_set_mask_and_coherent(&test_dev, DMA_BIT_MASK(64));
+ if (ret) {
+ printk("%s: mask failed:%d\n", __func__, ret);
+ goto del_device;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(pool_parms); i++) {
+ ret = dmapool_test_block(&pool_parms[i]);
+ if (ret)
+ break;
+ }
+
+del_device:
+ device_del(&test_dev);
+put_device:
+ put_device(&test_dev);
+ return ret;
+}
+
+static void dmapool_exit(void)
+{
+}
+
+module_init(dmapool_checks);
+module_exit(dmapool_exit);
+MODULE_LICENSE("GPL");
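
The new dmapool_test module times NR_TESTS rounds of allocating and then freeing every block with ktime_get(), printing one line per pool geometry to the kernel log. For prototyping the same measurement pattern outside the kernel, here is a rough userspace analogue using clock_gettime() and plain malloc/free; the sizes and counts are arbitrary.

#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NR_TESTS 100
#define BLOCKS   1024
#define BLK_SIZE 256

static long long elapsed_us(struct timespec a, struct timespec b)
{
	return (b.tv_sec - a.tv_sec) * 1000000LL +
	       (b.tv_nsec - a.tv_nsec) / 1000;
}

int main(void)
{
	static void *p[BLOCKS];
	struct timespec start, end;

	clock_gettime(CLOCK_MONOTONIC, &start);
	for (int t = 0; t < NR_TESTS; t++) {
		for (int i = 0; i < BLOCKS; i++)
			p[i] = malloc(BLK_SIZE);
		for (int i = 0; i < BLOCKS; i++)
			free(p[i]);
	}
	clock_gettime(CLOCK_MONOTONIC, &end);

	printf("size:%-4d blocks:%-4d time:%lld us\n",
	       BLK_SIZE, BLOCKS, elapsed_us(start, end));
	return 0;
}
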
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index a0018ad1a1f6..ce06b2884789 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -17,6 +17,7 @@
#include <linux/vmalloc.h>
#include <asm/fixmap.h>
#include <asm/early_ioremap.h>
+#include "internal.h"
#ifdef CONFIG_MMU
static int early_ioremap_debug __initdata;
@@ -38,13 +39,8 @@ pgprot_t __init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr,
return prot;
}
-void __init __weak early_ioremap_shutdown(void)
-{
-}
-
void __init early_ioremap_reset(void)
{
- early_ioremap_shutdown();
after_paging_init = 1;
}
@@ -76,12 +72,10 @@ void __init early_ioremap_setup(void)
{
int i;
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
- if (WARN_ON(prev_map[i]))
- break;
-
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+ WARN_ON_ONCE(prev_map[i]);
slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+ }
}
static int __init check_early_ioremap_leak(void)
@@ -181,17 +175,17 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
}
}
- if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n",
- addr, size))
+ if (WARN(slot < 0, "%s(%p, %08lx) not found slot\n",
+ __func__, addr, size))
return;
if (WARN(prev_size[slot] != size,
- "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
- addr, size, slot, prev_size[slot]))
+ "%s(%p, %08lx) [%d] size not consistent %08lx\n",
+ __func__, addr, size, slot, prev_size[slot]))
return;
- WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n",
- addr, size, slot);
+ WARN(early_ioremap_debug, "%s(%p, %08lx) [%d]\n",
+ __func__, addr, size, slot);
virt_addr = (unsigned long)addr;
if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)))
diff --git a/mm/fadvise.c b/mm/fadvise.c
index d6baa4f451c5..6c39d42f16dc 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -14,7 +14,6 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
-#include <linux/pagevec.h>
#include <linux/fadvise.h>
#include <linux/writeback.h>
#include <linux/syscalls.h>
@@ -72,7 +71,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
*/
endbyte = (u64)offset + (u64)len;
if (!len || endbyte < len)
- endbyte = -1;
+ endbyte = LLONG_MAX;
else
endbyte--; /* inclusive */
@@ -80,7 +79,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
case POSIX_FADV_NORMAL:
file->f_ra.ra_pages = bdi->ra_pages;
spin_lock(&file->f_lock);
- file->f_mode &= ~FMODE_RANDOM;
+ file->f_mode &= ~(FMODE_RANDOM | FMODE_NOREUSE);
spin_unlock(&file->f_lock);
break;
case POSIX_FADV_RANDOM:
@@ -107,11 +106,13 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
force_page_cache_readahead(mapping, file, start_index, nrpages);
break;
case POSIX_FADV_NOREUSE:
+ spin_lock(&file->f_lock);
+ file->f_mode |= FMODE_NOREUSE;
+ spin_unlock(&file->f_lock);
break;
case POSIX_FADV_DONTNEED:
- if (!inode_write_congested(mapping->host))
- __filemap_fdatawrite_range(mapping, offset, endbyte,
- WB_SYNC_NONE);
+ __filemap_fdatawrite_range(mapping, offset, endbyte,
+ WB_SYNC_NONE);
/*
* First and last FULL page! Partial pages are deliberately
@@ -141,7 +142,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
}
if (end_index >= start_index) {
- unsigned long nr_pagevec = 0;
+ unsigned long nr_failed = 0;
/*
* It's common to FADV_DONTNEED right after
@@ -154,17 +155,15 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
*/
lru_add_drain();
- invalidate_mapping_pagevec(mapping,
- start_index, end_index,
- &nr_pagevec);
+ mapping_try_invalidate(mapping, start_index, end_index,
+ &nr_failed);
/*
- * If fewer pages were invalidated than expected then
- * it is possible that some of the pages were on
- * a per-cpu pagevec for a remote CPU. Drain all
- * pagevecs and try again.
+ * The failures may be due to the folio being
+ * in the LRU cache of a remote CPU. Drain all
+ * caches and try again.
*/
- if (nr_pagevec) {
+ if (nr_failed) {
lru_add_drain_all();
invalidate_mapping_pages(mapping, start_index,
end_index);
@@ -216,4 +215,15 @@ SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice)
}
#endif
+
+#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FADVISE64_64)
+
+COMPAT_SYSCALL_DEFINE6(fadvise64_64, int, fd, compat_arg_u64_dual(offset),
+ compat_arg_u64_dual(len), int, advice)
+{
+ return ksys_fadvise64_64(fd, compat_arg_u64_glue(offset),
+ compat_arg_u64_glue(len), advice);
+}
+
+#endif
#endif
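
The fadvise hunks above give POSIX_FADV_NOREUSE a real effect (it now sets FMODE_NOREUSE) and make POSIX_FADV_DONTNEED write back without the congestion check. Both paths are reached from userspace through posix_fadvise(); a minimal caller might look like the sketch below, where the file path is purely hypothetical.

#define _POSIX_C_SOURCE 200112L
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/testfile", O_RDONLY);	/* hypothetical path */
	int ret;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* len == 0 covers the rest of the file (endbyte becomes LLONG_MAX). */
	ret = posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE);
	if (ret)
		fprintf(stderr, "NOREUSE: %s\n", strerror(ret));

	ret = posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
	if (ret)
		fprintf(stderr, "DONTNEED: %s\n", strerror(ret));

	close(fd);
	return 0;
}
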
diff --git a/mm/fail_page_alloc.c b/mm/fail_page_alloc.c
new file mode 100644
index 000000000000..b1b09cce9394
--- /dev/null
+++ b/mm/fail_page_alloc.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fault-inject.h>
+#include <linux/mm.h>
+
+static struct {
+ struct fault_attr attr;
+
+ bool ignore_gfp_highmem;
+ bool ignore_gfp_reclaim;
+ u32 min_order;
+} fail_page_alloc = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_gfp_reclaim = true,
+ .ignore_gfp_highmem = true,
+ .min_order = 1,
+};
+
+static int __init setup_fail_page_alloc(char *str)
+{
+ return setup_fault_attr(&fail_page_alloc.attr, str);
+}
+__setup("fail_page_alloc=", setup_fail_page_alloc);
+
+bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+ int flags = 0;
+
+ if (order < fail_page_alloc.min_order)
+ return false;
+ if (gfp_mask & __GFP_NOFAIL)
+ return false;
+ if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
+ return false;
+ if (fail_page_alloc.ignore_gfp_reclaim &&
+ (gfp_mask & __GFP_DIRECT_RECLAIM))
+ return false;
+
+ /* See comment in __should_failslab() */
+ if (gfp_mask & __GFP_NOWARN)
+ flags |= FAULT_NOWARN;
+
+ return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_page_alloc_debugfs(void)
+{
+ umode_t mode = S_IFREG | 0600;
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
+ &fail_page_alloc.attr);
+
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &fail_page_alloc.ignore_gfp_reclaim);
+ debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+ &fail_page_alloc.ignore_gfp_highmem);
+ debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
+
+ return 0;
+}
+
+late_initcall(fail_page_alloc_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
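
mm/fail_page_alloc.c above only carries the injection policy: __should_fail_alloc_page() plus the fail_page_alloc= boot parameter and the debugfs knobs. The page allocator still has to consult it; the sketch below shows the conventional caller-side wrapper shape. This is an assumption for illustration, since the real wiring sits in mm/page_alloc.c and is not part of this hunk.

/* Assumed caller-side shape (illustrative, not quoted from this diff):
 * the allocator asks the policy whether to fail before committing to an
 * allocation, and compiles the check away when fault injection is off. */
#ifdef CONFIG_FAIL_PAGE_ALLOC
static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
	return __should_fail_alloc_page(gfp_mask, order);
}
#else
static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
	return false;
}
#endif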
diff --git a/mm/failslab.c b/mm/failslab.c
index f92fed91ac23..ffc420c0e767 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -16,6 +16,8 @@ static struct {
bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags)
{
+ int flags = 0;
+
/* No fault-injection for bootstrap cache */
if (unlikely(s == kmem_cache))
return false;
@@ -30,7 +32,16 @@ bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags)
if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB))
return false;
- return should_fail(&failslab.attr, s->object_size);
+ /*
+	 * Some callers pass __GFP_NOWARN to suppress any output
+	 * (not just the allocation-failure warning) and thereby
+	 * avoid deadlocks. See commit 6b9dbedbe349 for details.
+ */
+ if (gfpflags & __GFP_NOWARN)
+ flags |= FAULT_NOWARN;
+
+ return should_fail_ex(&failslab.attr, s->object_size, flags);
}
static int __init setup_failslab(char *str)
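
With the failslab hunk above, __GFP_NOWARN now also sets FAULT_NOWARN, so an injected slab failure prints nothing at all instead of only skipping the allocation-failure warning. Below is a hedged, invented example of the kind of caller this serves: an allocation from a printk-sensitive context. The structure and function names are placeholders, not kernel APIs.

#include <linux/slab.h>

/* Hypothetical caller (names invented): allocate from a context where any
 * console output could deadlock, so an injected failure must stay silent. */
struct foo_ctx {
	int id;
};

static struct foo_ctx *foo_ctx_alloc_atomic(int id)
{
	struct foo_ctx *ctx;

	/* GFP_ATOMIC: no sleeping; __GFP_NOWARN: no warning, and now no
	 * fault-injection report either (FAULT_NOWARN). */
	ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC | __GFP_NOWARN);
	if (ctx)
		ctx->id = id;
	return ctx;
}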
diff --git a/mm/filemap.c b/mm/filemap.c
index e3b8987153e6..9e44a49bbd74 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -21,6 +21,8 @@
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/syscalls.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
@@ -30,18 +32,21 @@
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
-#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
-#include <linux/cleancache.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include <linux/delayacct.h>
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
+#include <linux/migrate.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/splice.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
@@ -54,6 +59,8 @@
#include <asm/mman.h>
+#include "swap.h"
+
/*
* Shared mappings implemented 30.11.1994. It's not fully working yet,
* though.
@@ -70,12 +77,13 @@
* Lock ordering:
*
* ->i_mmap_rwsem (truncate_pagecache)
- * ->private_lock (__free_pte->__set_page_dirty_buffers)
+ * ->private_lock (__free_pte->block_dirty_folio)
* ->swap_lock (exclusive_swap_page, others)
* ->i_pages lock
*
- * ->i_mutex
- * ->i_mmap_rwsem (truncate->unmap_mapping_range)
+ * ->i_rwsem
+ * ->invalidate_lock (acquired by fs in truncate path)
+ * ->i_mmap_rwsem (truncate->unmap_mapping_range)
*
* ->mmap_lock
* ->i_mmap_rwsem
@@ -83,17 +91,18 @@
* ->i_pages lock (arch-dependent flush_dcache_mmap_lock)
*
* ->mmap_lock
- * ->lock_page (access_process_vm)
+ * ->invalidate_lock (filemap_fault)
+ * ->lock_page (filemap_fault, access_process_vm)
*
- * ->i_mutex (generic_perform_write)
- * ->mmap_lock (fault_in_pages_readable->do_page_fault)
+ * ->i_rwsem (generic_perform_write)
+ * ->mmap_lock (fault_in_readable->do_page_fault)
*
* bdi->wb.list_lock
* sb_lock (fs/fs-writeback.c)
* ->i_pages lock (__sync_single_inode)
*
* ->i_mmap_rwsem
- * ->anon_vma.lock (vma_adjust)
+ * ->anon_vma.lock (vma_merge)
*
* ->anon_vma.lock
* ->page_table_lock or pte_lock (anon_vma_prepare and various)
@@ -102,126 +111,107 @@
* ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
* ->i_pages lock (try_to_unmap_one)
- * ->pgdat->lru_lock (follow_page->mark_page_accessed)
- * ->pgdat->lru_lock (check_pte_range->isolate_lru_page)
+ * ->lruvec->lru_lock (follow_page->mark_page_accessed)
+ * ->lruvec->lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->i_pages lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
- * ->memcg->move_lock (page_remove_rmap->lock_page_memcg)
+ * ->memcg->move_lock (page_remove_rmap->folio_memcg_lock)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
- * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
+ * ->private_lock (zap_pte_range->block_dirty_folio)
*
* ->i_mmap_rwsem
* ->tasklist_lock (memory_failure, collect_procs_ao)
*/
static void page_cache_delete(struct address_space *mapping,
- struct page *page, void *shadow)
+ struct folio *folio, void *shadow)
{
- XA_STATE(xas, &mapping->i_pages, page->index);
- unsigned int nr = 1;
+ XA_STATE(xas, &mapping->i_pages, folio->index);
+ long nr = 1;
mapping_set_update(&xas, mapping);
/* hugetlb pages are represented by a single entry in the xarray */
- if (!PageHuge(page)) {
- xas_set_order(&xas, page->index, compound_order(page));
- nr = compound_nr(page);
+ if (!folio_test_hugetlb(folio)) {
+ xas_set_order(&xas, folio->index, folio_order(folio));
+ nr = folio_nr_pages(folio);
}
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(nr != 1 && shadow, page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
xas_store(&xas, shadow);
xas_init_marks(&xas);
- page->mapping = NULL;
+ folio->mapping = NULL;
/* Leave page->index set: truncation lookup relies upon it */
-
- if (shadow) {
- mapping->nrexceptional += nr;
- /*
- * Make sure the nrexceptional update is committed before
- * the nrpages update so that final truncate racing
- * with reclaim does not see both counters 0 at the
- * same time and miss a shadow entry.
- */
- smp_wmb();
- }
mapping->nrpages -= nr;
}
-static void unaccount_page_cache_page(struct address_space *mapping,
- struct page *page)
+static void filemap_unaccount_folio(struct address_space *mapping,
+ struct folio *folio)
{
- int nr;
-
- /*
- * if we're uptodate, flush out into the cleancache, otherwise
- * invalidate any existing cleancache entries. We can't leave
- * stale data around in the cleancache once our page is gone
- */
- if (PageUptodate(page) && PageMappedToDisk(page))
- cleancache_put_page(page);
- else
- cleancache_invalidate_page(mapping, page);
-
- VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(page_mapped(page), page);
- if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
- int mapcount;
+ long nr;
+ VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
+ if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n",
- current->comm, page_to_pfn(page));
- dump_page(page, "still mapped when deleted");
+ current->comm, folio_pfn(folio));
+ dump_page(&folio->page, "still mapped when deleted");
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
- mapcount = page_mapcount(page);
- if (mapping_exiting(mapping) &&
- page_count(page) >= mapcount + 2) {
- /*
- * All vmas have already been torn down, so it's
- * a good bet that actually the page is unmapped,
- * and we'd prefer not to leak it: if we're wrong,
- * some other bad page check should catch it later.
- */
- page_mapcount_reset(page);
- page_ref_sub(page, mapcount);
+ if (mapping_exiting(mapping) && !folio_test_large(folio)) {
+ int mapcount = page_mapcount(&folio->page);
+
+ if (folio_ref_count(folio) >= mapcount + 2) {
+ /*
+ * All vmas have already been torn down, so it's
+ * a good bet that actually the page is unmapped
+ * and we'd rather not leak it: if we're wrong,
+ * another bad page check should catch it later.
+ */
+ page_mapcount_reset(&folio->page);
+ folio_ref_sub(folio, mapcount);
+ }
}
}
- /* hugetlb pages do not participate in page cache accounting. */
- if (PageHuge(page))
+ /* hugetlb folios do not participate in page cache accounting. */
+ if (folio_test_hugetlb(folio))
return;
- nr = thp_nr_pages(page);
+ nr = folio_nr_pages(folio);
- __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr);
- if (PageSwapBacked(page)) {
- __mod_lruvec_page_state(page, NR_SHMEM, -nr);
- if (PageTransHuge(page))
- __dec_node_page_state(page, NR_SHMEM_THPS);
- } else if (PageTransHuge(page)) {
- __dec_node_page_state(page, NR_FILE_THPS);
+ __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
+ if (folio_test_swapbacked(folio)) {
+ __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
+ if (folio_test_pmd_mappable(folio))
+ __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
+ } else if (folio_test_pmd_mappable(folio)) {
+ __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
filemap_nr_thps_dec(mapping);
}
/*
- * At this point page must be either written or cleaned by
- * truncate. Dirty page here signals a bug and loss of
- * unwritten data.
+ * At this point folio must be either written or cleaned by
+ * truncate. Dirty folio here signals a bug and loss of
+ * unwritten data - on ordinary filesystems.
+ *
+ * But it's harmless on in-memory filesystems like tmpfs; and can
+ * occur when a driver which did get_user_pages() sets page dirty
+ * before putting it, while the inode is being finally evicted.
*
- * This fixes dirty accounting after removing the page entirely
- * but leaves PageDirty set: it has no effect for truncated
- * page and anyway will be cleared before returning page into
+ * Below fixes dirty accounting after removing the folio entirely
+ * but leaves the dirty flag set: it has no effect for truncated
+ * folio and anyway will be cleared before returning folio to
* buddy allocator.
*/
- if (WARN_ON_ONCE(PageDirty(page)))
- account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
+ if (WARN_ON_ONCE(folio_test_dirty(folio) &&
+ mapping_can_writeback(mapping)))
+ folio_account_cleaned(folio, inode_to_wb(mapping->host));
}
/*
@@ -229,84 +219,81 @@ static void unaccount_page_cache_page(struct address_space *mapping,
* sure the page is locked and that nobody else uses it - or that usage
* is safe. The caller must hold the i_pages lock.
*/
-void __delete_from_page_cache(struct page *page, void *shadow)
+void __filemap_remove_folio(struct folio *folio, void *shadow)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
- trace_mm_filemap_delete_from_page_cache(page);
-
- unaccount_page_cache_page(mapping, page);
- page_cache_delete(mapping, page, shadow);
+ trace_mm_filemap_delete_from_page_cache(folio);
+ filemap_unaccount_folio(mapping, folio);
+ page_cache_delete(mapping, folio, shadow);
}
-static void page_cache_free_page(struct address_space *mapping,
- struct page *page)
+void filemap_free_folio(struct address_space *mapping, struct folio *folio)
{
- void (*freepage)(struct page *);
+ void (*free_folio)(struct folio *);
+ int refs = 1;
- freepage = mapping->a_ops->freepage;
- if (freepage)
- freepage(page);
+ free_folio = mapping->a_ops->free_folio;
+ if (free_folio)
+ free_folio(folio);
- if (PageTransHuge(page) && !PageHuge(page)) {
- page_ref_sub(page, HPAGE_PMD_NR);
- VM_BUG_ON_PAGE(page_count(page) <= 0, page);
- } else {
- put_page(page);
- }
+ if (folio_test_large(folio) && !folio_test_hugetlb(folio))
+ refs = folio_nr_pages(folio);
+ folio_put_refs(folio, refs);
}
/**
- * delete_from_page_cache - delete page from page cache
- * @page: the page which the kernel is trying to remove from page cache
+ * filemap_remove_folio - Remove folio from page cache.
+ * @folio: The folio.
*
- * This must be called only on pages that have been verified to be in the page
- * cache and locked. It will never put the page into the free list, the caller
- * has a reference on the page.
+ * This must be called only on folios that are locked and have been
+ * verified to be in the page cache. It will never put the folio into
+ * the free list because the caller has a reference on the page.
*/
-void delete_from_page_cache(struct page *page)
+void filemap_remove_folio(struct folio *folio)
{
- struct address_space *mapping = page_mapping(page);
- unsigned long flags;
-
- BUG_ON(!PageLocked(page));
- xa_lock_irqsave(&mapping->i_pages, flags);
- __delete_from_page_cache(page, NULL);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
-
- page_cache_free_page(mapping, page);
+ struct address_space *mapping = folio->mapping;
+
+ BUG_ON(!folio_test_locked(folio));
+ spin_lock(&mapping->host->i_lock);
+ xa_lock_irq(&mapping->i_pages);
+ __filemap_remove_folio(folio, NULL);
+ xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
+
+ filemap_free_folio(mapping, folio);
}
-EXPORT_SYMBOL(delete_from_page_cache);
/*
- * page_cache_delete_batch - delete several pages from page cache
- * @mapping: the mapping to which pages belong
- * @pvec: pagevec with pages to delete
+ * page_cache_delete_batch - delete several folios from page cache
+ * @mapping: the mapping to which folios belong
+ * @fbatch: batch of folios to delete
*
- * The function walks over mapping->i_pages and removes pages passed in @pvec
- * from the mapping. The function expects @pvec to be sorted by page index
- * and is optimised for it to be dense.
- * It tolerates holes in @pvec (mapping entries at those indices are not
- * modified). The function expects only THP head pages to be present in the
- * @pvec.
+ * The function walks over mapping->i_pages and removes folios passed in
+ * @fbatch from the mapping. The function expects @fbatch to be sorted
+ * by page index and is optimised for it to be dense.
+ * It tolerates holes in @fbatch (mapping entries at those indices are not
+ * modified).
*
* The function expects the i_pages lock to be held.
*/
static void page_cache_delete_batch(struct address_space *mapping,
- struct pagevec *pvec)
+ struct folio_batch *fbatch)
{
- XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
- int total_pages = 0;
+ XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
+ long total_pages = 0;
int i = 0;
- struct page *page;
+ struct folio *folio;
mapping_set_update(&xas, mapping);
- xas_for_each(&xas, page, ULONG_MAX) {
- if (i >= pagevec_count(pvec))
+ xas_for_each(&xas, folio, ULONG_MAX) {
+ if (i >= folio_batch_count(fbatch))
break;
/* A swap/dax/shadow entry got inserted? Skip it. */
- if (xa_is_value(page))
+ if (xa_is_value(folio))
continue;
/*
* A page got inserted in our range? Skip it. We have our
@@ -315,51 +302,48 @@ static void page_cache_delete_batch(struct address_space *mapping,
* means our page has been removed, which shouldn't be
* possible because we're holding the PageLock.
*/
- if (page != pvec->pages[i]) {
- VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
- page);
+ if (folio != fbatch->folios[i]) {
+ VM_BUG_ON_FOLIO(folio->index >
+ fbatch->folios[i]->index, folio);
continue;
}
- WARN_ON_ONCE(!PageLocked(page));
+ WARN_ON_ONCE(!folio_test_locked(folio));
- if (page->index == xas.xa_index)
- page->mapping = NULL;
- /* Leave page->index set: truncation lookup relies on it */
+ folio->mapping = NULL;
+ /* Leave folio->index set: truncation lookup relies on it */
- /*
- * Move to the next page in the vector if this is a regular
- * page or the index is of the last sub-page of this compound
- * page.
- */
- if (page->index + compound_nr(page) - 1 == xas.xa_index)
- i++;
+ i++;
xas_store(&xas, NULL);
- total_pages++;
+ total_pages += folio_nr_pages(folio);
}
mapping->nrpages -= total_pages;
}
void delete_from_page_cache_batch(struct address_space *mapping,
- struct pagevec *pvec)
+ struct folio_batch *fbatch)
{
int i;
- unsigned long flags;
- if (!pagevec_count(pvec))
+ if (!folio_batch_count(fbatch))
return;
- xa_lock_irqsave(&mapping->i_pages, flags);
- for (i = 0; i < pagevec_count(pvec); i++) {
- trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
+ spin_lock(&mapping->host->i_lock);
+ xa_lock_irq(&mapping->i_pages);
+ for (i = 0; i < folio_batch_count(fbatch); i++) {
+ struct folio *folio = fbatch->folios[i];
- unaccount_page_cache_page(mapping, pvec->pages[i]);
+ trace_mm_filemap_delete_from_page_cache(folio);
+ filemap_unaccount_folio(mapping, folio);
}
- page_cache_delete_batch(mapping, pvec);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
-
- for (i = 0; i < pagevec_count(pvec); i++)
- page_cache_free_page(mapping, pvec->pages[i]);
+ page_cache_delete_batch(mapping, fbatch);
+ xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
+
+ for (i = 0; i < folio_batch_count(fbatch); i++)
+ filemap_free_folio(mapping, fbatch->folios[i]);
}
int filemap_check_errors(struct address_space *mapping)
@@ -387,6 +371,32 @@ static int filemap_check_and_keep_errors(struct address_space *mapping)
}
/**
+ * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
+ * @mapping: address space structure to write
+ * @wbc: the writeback_control controlling the writeout
+ *
+ * Call writepages on the mapping using the provided wbc to control the
+ * writeout.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int filemap_fdatawrite_wbc(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ int ret;
+
+ if (!mapping_can_writeback(mapping) ||
+ !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+ return 0;
+
+ wbc_attach_fdatawrite_inode(wbc, mapping->host);
+ ret = do_writepages(mapping, wbc);
+ wbc_detach_inode(wbc);
+ return ret;
+}
+EXPORT_SYMBOL(filemap_fdatawrite_wbc);
+
+/**
* __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
* @start: offset in bytes where the range starts
@@ -406,7 +416,6 @@ static int filemap_check_and_keep_errors(struct address_space *mapping)
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
loff_t end, int sync_mode)
{
- int ret;
struct writeback_control wbc = {
.sync_mode = sync_mode,
.nr_to_write = LONG_MAX,
@@ -414,14 +423,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
.range_end = end,
};
- if (!mapping_can_writeback(mapping) ||
- !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
- return 0;
-
- wbc_attach_fdatawrite_inode(&wbc, mapping->host);
- ret = do_writepages(mapping, &wbc);
- wbc_detach_inode(&wbc);
- return ret;
+ return filemap_fdatawrite_wbc(mapping, &wbc);
}
static inline int __filemap_fdatawrite(struct address_space *mapping,
@@ -473,7 +475,7 @@ EXPORT_SYMBOL(filemap_flush);
bool filemap_range_has_page(struct address_space *mapping,
loff_t start_byte, loff_t end_byte)
{
- struct page *page;
+ struct folio *folio;
XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
pgoff_t max = end_byte >> PAGE_SHIFT;
@@ -482,11 +484,11 @@ bool filemap_range_has_page(struct address_space *mapping,
rcu_read_lock();
for (;;) {
- page = xas_find(&xas, max);
- if (xas_retry(&xas, page))
+ folio = xas_find(&xas, max);
+ if (xas_retry(&xas, folio))
continue;
/* Shadow entries don't count */
- if (xa_is_value(page))
+ if (xa_is_value(folio))
continue;
/*
* We don't need to try to pin this page; we're about to
@@ -497,7 +499,7 @@ bool filemap_range_has_page(struct address_space *mapping,
}
rcu_read_unlock();
- return page != NULL;
+ return folio != NULL;
}
EXPORT_SYMBOL(filemap_range_has_page);
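
filemap_range_has_page() is the building block for callers that must not block, for instance IOCB_NOWAIT read or direct-I/O paths that prefer returning -EAGAIN over waiting on cached data. The sketch below illustrates that pattern; the helper name is invented for illustration, and the real callers live in the read/write iterator paths rather than in this hunk.

#include <linux/fs.h>
#include <linux/pagemap.h>

/* Illustrative non-blocking check (invented helper): if any cache folio
 * overlaps the requested byte range, bail out instead of waiting. */
static int example_nowait_check(struct kiocb *iocb, size_t count)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;

	if (!(iocb->ki_flags & IOCB_NOWAIT))
		return 0;
	if (filemap_range_has_page(mapping, iocb->ki_pos,
				   iocb->ki_pos + count - 1))
		return -EAGAIN;
	return 0;
}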
@@ -506,28 +508,27 @@ static void __filemap_fdatawait_range(struct address_space *mapping,
{
pgoff_t index = start_byte >> PAGE_SHIFT;
pgoff_t end = end_byte >> PAGE_SHIFT;
- struct pagevec pvec;
- int nr_pages;
+ struct folio_batch fbatch;
+ unsigned nr_folios;
- if (end_byte < start_byte)
- return;
+ folio_batch_init(&fbatch);
- pagevec_init(&pvec);
while (index <= end) {
unsigned i;
- nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
- end, PAGECACHE_TAG_WRITEBACK);
- if (!nr_pages)
+ nr_folios = filemap_get_folios_tag(mapping, &index, end,
+ PAGECACHE_TAG_WRITEBACK, &fbatch);
+
+ if (!nr_folios)
break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < nr_folios; i++) {
+ struct folio *folio = fbatch.folios[i];
- wait_on_page_writeback(page);
- ClearPageError(page);
+ folio_wait_writeback(folio);
+ folio_clear_error(folio);
}
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
cond_resched();
}
}
@@ -627,12 +628,34 @@ EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
/* Returns true if writeback might be needed or already in progress. */
static bool mapping_needs_writeback(struct address_space *mapping)
{
- if (dax_mapping(mapping))
- return mapping->nrexceptional;
-
return mapping->nrpages;
}
+bool filemap_range_has_writeback(struct address_space *mapping,
+ loff_t start_byte, loff_t end_byte)
+{
+ XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
+ pgoff_t max = end_byte >> PAGE_SHIFT;
+ struct folio *folio;
+
+ if (end_byte < start_byte)
+ return false;
+
+ rcu_read_lock();
+ xas_for_each(&xas, folio, max) {
+ if (xas_retry(&xas, folio))
+ continue;
+ if (xa_is_value(folio))
+ continue;
+ if (folio_test_dirty(folio) || folio_test_locked(folio) ||
+ folio_test_writeback(folio))
+ break;
+ }
+ rcu_read_unlock();
+ return folio != NULL;
+}
+EXPORT_SYMBOL_GPL(filemap_range_has_writeback);
+
/**
* filemap_write_and_wait_range - write out & wait on a file range
* @mapping: the address_space for the pages
@@ -649,7 +672,10 @@ static bool mapping_needs_writeback(struct address_space *mapping)
int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
{
- int err = 0;
+ int err = 0, err2;
+
+ if (lend < lstart)
+ return 0;
if (mapping_needs_writeback(mapping)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
@@ -660,18 +686,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
* But the -EIO is special case, it may indicate the worst
* thing (e.g. bug) happened, so we avoid waiting for it.
*/
- if (err != -EIO) {
- int err2 = filemap_fdatawait_range(mapping,
- lstart, lend);
- if (!err)
- err = err2;
- } else {
- /* Clear any previously stored errors */
- filemap_check_errors(mapping);
- }
- } else {
- err = filemap_check_errors(mapping);
+ if (err != -EIO)
+ __filemap_fdatawait_range(mapping, lstart, lend);
}
+ err2 = filemap_check_errors(mapping);
+ if (!err)
+ err = err2;
return err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);
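
Around this point the write-and-wait helpers gain an early return for an empty range (lend < lstart) and unconditionally fold any outstanding AS_EIO/AS_ENOSPC errors in via filemap_check_errors(). The hedged sketch below shows the classic fsync-style consumer of file_write_and_wait_range(), which is reworked in the next hunk; the function name and the metadata step are placeholders.

#include <linux/fs.h>

/* Hedged fsync-style sketch (invented name): flush and wait on the byte
 * range, then do filesystem-specific metadata work. An empty range now
 * returns 0 without touching the mapping. */
static int example_fsync(struct file *file, loff_t start, loff_t end,
			 int datasync)
{
	int err;

	err = file_write_and_wait_range(file, start, end);
	if (err)
		return err;

	/* ... write back fs-private metadata for file_inode(file) here ... */
	return 0;
}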
@@ -757,6 +777,9 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
int err = 0, err2;
struct address_space *mapping = file->f_mapping;
+ if (lend < lstart)
+ return 0;
+
if (mapping_needs_writeback(mapping)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
@@ -772,194 +795,231 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
EXPORT_SYMBOL(file_write_and_wait_range);
/**
- * replace_page_cache_page - replace a pagecache page with a new one
- * @old: page to be replaced
- * @new: page to replace with
- * @gfp_mask: allocation mode
- *
- * This function replaces a page in the pagecache with a new one. On
- * success it acquires the pagecache reference for the new page and
- * drops it for the old page. Both the old and new pages must be
- * locked. This function does not add the new page to the LRU, the
+ * replace_page_cache_folio - replace a pagecache folio with a new one
+ * @old: folio to be replaced
+ * @new: folio to replace with
+ *
+ * This function replaces a folio in the pagecache with a new one. On
+ * success it acquires the pagecache reference for the new folio and
+ * drops it for the old folio. Both the old and new folios must be
+ * locked. This function does not add the new folio to the LRU, the
* caller must do that.
*
* The remove + add is atomic. This function cannot fail.
- *
- * Return: %0
*/
-int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+void replace_page_cache_folio(struct folio *old, struct folio *new)
{
struct address_space *mapping = old->mapping;
- void (*freepage)(struct page *) = mapping->a_ops->freepage;
+ void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
pgoff_t offset = old->index;
XA_STATE(xas, &mapping->i_pages, offset);
- unsigned long flags;
- VM_BUG_ON_PAGE(!PageLocked(old), old);
- VM_BUG_ON_PAGE(!PageLocked(new), new);
- VM_BUG_ON_PAGE(new->mapping, new);
+ VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
+ VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
+ VM_BUG_ON_FOLIO(new->mapping, new);
- get_page(new);
+ folio_get(new);
new->mapping = mapping;
new->index = offset;
mem_cgroup_migrate(old, new);
- xas_lock_irqsave(&xas, flags);
+ xas_lock_irq(&xas);
xas_store(&xas, new);
old->mapping = NULL;
/* hugetlb pages do not participate in page cache accounting. */
- if (!PageHuge(old))
- __dec_lruvec_page_state(old, NR_FILE_PAGES);
- if (!PageHuge(new))
- __inc_lruvec_page_state(new, NR_FILE_PAGES);
- if (PageSwapBacked(old))
- __dec_lruvec_page_state(old, NR_SHMEM);
- if (PageSwapBacked(new))
- __inc_lruvec_page_state(new, NR_SHMEM);
- xas_unlock_irqrestore(&xas, flags);
- if (freepage)
- freepage(old);
- put_page(old);
-
- return 0;
+ if (!folio_test_hugetlb(old))
+ __lruvec_stat_sub_folio(old, NR_FILE_PAGES);
+ if (!folio_test_hugetlb(new))
+ __lruvec_stat_add_folio(new, NR_FILE_PAGES);
+ if (folio_test_swapbacked(old))
+ __lruvec_stat_sub_folio(old, NR_SHMEM);
+ if (folio_test_swapbacked(new))
+ __lruvec_stat_add_folio(new, NR_SHMEM);
+ xas_unlock_irq(&xas);
+ if (free_folio)
+ free_folio(old);
+ folio_put(old);
}
-EXPORT_SYMBOL_GPL(replace_page_cache_page);
+EXPORT_SYMBOL_GPL(replace_page_cache_folio);
-noinline int __add_to_page_cache_locked(struct page *page,
- struct address_space *mapping,
- pgoff_t offset, gfp_t gfp_mask,
- void **shadowp)
+noinline int __filemap_add_folio(struct address_space *mapping,
+ struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
- XA_STATE(xas, &mapping->i_pages, offset);
- int huge = PageHuge(page);
- int error;
- void *old;
+ XA_STATE(xas, &mapping->i_pages, index);
+ int huge = folio_test_hugetlb(folio);
+ bool charged = false;
+ long nr = 1;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(PageSwapBacked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
mapping_set_update(&xas, mapping);
- get_page(page);
- page->mapping = mapping;
- page->index = offset;
-
if (!huge) {
- error = mem_cgroup_charge(page, current->mm, gfp_mask);
+ int error = mem_cgroup_charge(folio, NULL, gfp);
+ VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
if (error)
- goto error;
+ return error;
+ charged = true;
+ xas_set_order(&xas, index, folio_order(folio));
+ nr = folio_nr_pages(folio);
}
+ gfp &= GFP_RECLAIM_MASK;
+ folio_ref_add(folio, nr);
+ folio->mapping = mapping;
+ folio->index = xas.xa_index;
+
do {
+ unsigned int order = xa_get_order(xas.xa, xas.xa_index);
+ void *entry, *old = NULL;
+
+ if (order > folio_order(folio))
+ xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
+ order, gfp);
xas_lock_irq(&xas);
- old = xas_load(&xas);
- if (old && !xa_is_value(old))
- xas_set_err(&xas, -EEXIST);
- xas_store(&xas, page);
- if (xas_error(&xas))
- goto unlock;
+ xas_for_each_conflict(&xas, entry) {
+ old = entry;
+ if (!xa_is_value(entry)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ }
- if (xa_is_value(old)) {
- mapping->nrexceptional--;
+ if (old) {
if (shadowp)
*shadowp = old;
+ /* entry may have been split before we acquired lock */
+ order = xa_get_order(xas.xa, xas.xa_index);
+ if (order > folio_order(folio)) {
+ /* How to handle large swap entries? */
+ BUG_ON(shmem_mapping(mapping));
+ xas_split(&xas, old, order);
+ xas_reset(&xas);
+ }
}
- mapping->nrpages++;
+
+ xas_store(&xas, folio);
+ if (xas_error(&xas))
+ goto unlock;
+
+ mapping->nrpages += nr;
/* hugetlb pages do not participate in page cache accounting */
- if (!huge)
- __inc_lruvec_page_state(page, NR_FILE_PAGES);
+ if (!huge) {
+ __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
+ if (folio_test_pmd_mappable(folio))
+ __lruvec_stat_mod_folio(folio,
+ NR_FILE_THPS, nr);
+ }
unlock:
xas_unlock_irq(&xas);
- } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
+ } while (xas_nomem(&xas, gfp));
- if (xas_error(&xas)) {
- error = xas_error(&xas);
+ if (xas_error(&xas))
goto error;
- }
- trace_mm_filemap_add_to_page_cache(page);
+ trace_mm_filemap_add_to_page_cache(folio);
return 0;
error:
- page->mapping = NULL;
+ if (charged)
+ mem_cgroup_uncharge(folio);
+ folio->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
- put_page(page);
- return error;
+ folio_put_refs(folio, nr);
+ return xas_error(&xas);
}
-ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
+ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
-/**
- * add_to_page_cache_locked - add a locked page to the pagecache
- * @page: page to add
- * @mapping: the page's address_space
- * @offset: page index
- * @gfp_mask: page allocation mode
- *
- * This function is used to add a page to the pagecache. It must be locked.
- * This function does not add the page to the LRU. The caller must do that.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
- pgoff_t offset, gfp_t gfp_mask)
-{
- return __add_to_page_cache_locked(page, mapping, offset,
- gfp_mask, NULL);
-}
-EXPORT_SYMBOL(add_to_page_cache_locked);
-
-int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
- pgoff_t offset, gfp_t gfp_mask)
+int filemap_add_folio(struct address_space *mapping, struct folio *folio,
+ pgoff_t index, gfp_t gfp)
{
void *shadow = NULL;
int ret;
- __SetPageLocked(page);
- ret = __add_to_page_cache_locked(page, mapping, offset,
- gfp_mask, &shadow);
+ __folio_set_locked(folio);
+ ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
if (unlikely(ret))
- __ClearPageLocked(page);
+ __folio_clear_locked(folio);
else {
/*
- * The page might have been evicted from cache only
+ * The folio might have been evicted from cache only
* recently, in which case it should be activated like
- * any other repeatedly accessed page.
- * The exception is pages getting rewritten; evicting other
+ * any other repeatedly accessed folio.
+ * The exception is folios getting rewritten; evicting other
* data from the working set, only to cache data that will
* get overwritten with something else, is a waste of memory.
*/
- WARN_ON_ONCE(PageActive(page));
- if (!(gfp_mask & __GFP_WRITE) && shadow)
- workingset_refault(page, shadow);
- lru_cache_add(page);
+ WARN_ON_ONCE(folio_test_active(folio));
+ if (!(gfp & __GFP_WRITE) && shadow)
+ workingset_refault(folio, shadow);
+ folio_add_lru(folio);
}
return ret;
}
-EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
+EXPORT_SYMBOL_GPL(filemap_add_folio);
#ifdef CONFIG_NUMA
-struct page *__page_cache_alloc(gfp_t gfp)
+struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
{
int n;
- struct page *page;
+ struct folio *folio;
if (cpuset_do_page_mem_spread()) {
unsigned int cpuset_mems_cookie;
do {
cpuset_mems_cookie = read_mems_allowed_begin();
n = cpuset_mem_spread_node();
- page = __alloc_pages_node(n, gfp, 0);
- } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
+ folio = __folio_alloc_node(gfp, order, n);
+ } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie));
- return page;
+ return folio;
}
- return alloc_pages(gfp, 0);
+ return folio_alloc(gfp, order);
}
-EXPORT_SYMBOL(__page_cache_alloc);
+EXPORT_SYMBOL(filemap_alloc_folio);
#endif
/*
+ * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
+ *
+ * Take the invalidate_lock exclusively on each passed mapping that is not NULL.
+ *
+ * @mapping1: the first mapping to lock
+ * @mapping2: the second mapping to lock
+ */
+void filemap_invalidate_lock_two(struct address_space *mapping1,
+ struct address_space *mapping2)
+{
+ if (mapping1 > mapping2)
+ swap(mapping1, mapping2);
+ if (mapping1)
+ down_write(&mapping1->invalidate_lock);
+ if (mapping2 && mapping1 != mapping2)
+ down_write_nested(&mapping2->invalidate_lock, 1);
+}
+EXPORT_SYMBOL(filemap_invalidate_lock_two);
+
+/*
+ * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
+ *
+ * Release the exclusive invalidate_lock of each passed mapping that is not NULL.
+ *
+ * @mapping1: the first mapping to unlock
+ * @mapping2: the second mapping to unlock
+ */
+void filemap_invalidate_unlock_two(struct address_space *mapping1,
+ struct address_space *mapping2)
+{
+ if (mapping1)
+ up_write(&mapping1->invalidate_lock);
+ if (mapping2 && mapping1 != mapping2)
+ up_write(&mapping2->invalidate_lock);
+}
+EXPORT_SYMBOL(filemap_invalidate_unlock_two);
+
+/*
* In order to wait for pages to become available there must be
* waitqueues associated with pages. By using a hash table of
* waitqueues where the bucket discipline is to maintain all
@@ -971,11 +1031,11 @@ EXPORT_SYMBOL(__page_cache_alloc);
*/
#define PAGE_WAIT_TABLE_BITS 8
#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
-static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
+static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
-static wait_queue_head_t *page_waitqueue(struct page *page)
+static wait_queue_head_t *folio_waitqueue(struct folio *folio)
{
- return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
+ return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
}
void __init pagecache_init(void)
@@ -983,7 +1043,7 @@ void __init pagecache_init(void)
int i;
for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
- init_waitqueue_head(&page_wait_table[i]);
+ init_waitqueue_head(&folio_wait_table[i]);
page_writeback_init();
}
@@ -1038,10 +1098,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
*/
flags = wait->flags;
if (flags & WQ_FLAG_EXCLUSIVE) {
- if (test_bit(key->bit_nr, &key->page->flags))
+ if (test_bit(key->bit_nr, &key->folio->flags))
return -1;
if (flags & WQ_FLAG_CUSTOM) {
- if (test_and_set_bit(key->bit_nr, &key->page->flags))
+ if (test_and_set_bit(key->bit_nr, &key->folio->flags))
return -1;
flags |= WQ_FLAG_DONE;
}
@@ -1054,7 +1114,7 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
*
* So update the flags atomically, and wake up the waiter
* afterwards to avoid any races. This store-release pairs
- * with the load-acquire in wait_on_page_bit_common().
+ * with the load-acquire in folio_wait_bit_common().
*/
smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
wake_up_state(wait->private, mode);
@@ -1073,14 +1133,14 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}
-static void wake_up_page_bit(struct page *page, int bit_nr)
+static void folio_wake_bit(struct folio *folio, int bit_nr)
{
- wait_queue_head_t *q = page_waitqueue(page);
+ wait_queue_head_t *q = folio_waitqueue(folio);
struct wait_page_key key;
unsigned long flags;
wait_queue_entry_t bookmark;
- key.page = page;
+ key.folio = folio;
key.bit_nr = bit_nr;
key.page_match = 0;
@@ -1106,60 +1166,53 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
}
/*
- * It is possible for other pages to have collided on the waitqueue
- * hash, so in that case check for a page match. That prevents a long-
- * term waiter
+ * It's possible to miss clearing waiters here, when we woke our page
+ * waiters, but the hashed waitqueue has waiters for other pages on it.
+ * That's okay, it's a rare case. The next waker will clear it.
*
- * It is still possible to miss a case here, when we woke page waiters
- * and removed them from the waitqueue, but there are still other
- * page waiters.
+ * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
+ * other), the flag may be cleared in the course of freeing the page;
+ * but that is not required for correctness.
*/
- if (!waitqueue_active(q) || !key.page_match) {
- ClearPageWaiters(page);
- /*
- * It's possible to miss clearing Waiters here, when we woke
- * our page waiters, but the hashed waitqueue has waiters for
- * other pages on it.
- *
- * That's okay, it's a rare case. The next waker will clear it.
- */
- }
+ if (!waitqueue_active(q) || !key.page_match)
+ folio_clear_waiters(folio);
+
spin_unlock_irqrestore(&q->lock, flags);
}
-static void wake_up_page(struct page *page, int bit)
+static void folio_wake(struct folio *folio, int bit)
{
- if (!PageWaiters(page))
+ if (!folio_test_waiters(folio))
return;
- wake_up_page_bit(page, bit);
+ folio_wake_bit(folio, bit);
}
/*
- * A choice of three behaviors for wait_on_page_bit_common():
+ * A choice of three behaviors for folio_wait_bit_common():
*/
enum behavior {
EXCLUSIVE, /* Hold ref to page and take the bit when woken, like
- * __lock_page() waiting on then setting PG_locked.
+ * __folio_lock() waiting on then setting PG_locked.
*/
SHARED, /* Hold ref to page and check the bit when woken, like
- * wait_on_page_writeback() waiting on PG_writeback.
+ * folio_wait_writeback() waiting on PG_writeback.
*/
DROP, /* Drop ref to page before wait, no check when woken,
- * like put_and_wait_on_page_locked() on PG_locked.
+ * like folio_put_wait_locked() on PG_locked.
*/
};
/*
- * Attempt to check (or get) the page bit, and mark us done
+ * Attempt to check (or get) the folio flag, and mark us done
* if successful.
*/
-static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
+static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
struct wait_queue_entry *wait)
{
if (wait->flags & WQ_FLAG_EXCLUSIVE) {
- if (test_and_set_bit(bit_nr, &page->flags))
+ if (test_and_set_bit(bit_nr, &folio->flags))
return false;
- } else if (test_bit(bit_nr, &page->flags))
+ } else if (test_bit(bit_nr, &folio->flags))
return false;
wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
@@ -1169,29 +1222,27 @@ static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
/* How many times do we accept lock stealing from under a waiter? */
int sysctl_page_lock_unfairness = 5;
-static inline int wait_on_page_bit_common(wait_queue_head_t *q,
- struct page *page, int bit_nr, int state, enum behavior behavior)
+static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
+ int state, enum behavior behavior)
{
+ wait_queue_head_t *q = folio_waitqueue(folio);
int unfairness = sysctl_page_lock_unfairness;
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
bool thrashing = false;
- bool delayacct = false;
unsigned long pflags;
+ bool in_thrashing;
if (bit_nr == PG_locked &&
- !PageUptodate(page) && PageWorkingset(page)) {
- if (!PageSwapBacked(page)) {
- delayacct_thrashing_start();
- delayacct = true;
- }
+ !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
+ delayacct_thrashing_start(&in_thrashing);
psi_memstall_enter(&pflags);
thrashing = true;
}
init_wait(wait);
wait->func = wake_page_function;
- wait_page.page = page;
+ wait_page.folio = folio;
wait_page.bit_nr = bit_nr;
repeat:
@@ -1206,7 +1257,7 @@ repeat:
* Do one last check whether we can get the
* page bit synchronously.
*
- * Do the SetPageWaiters() marking before that
+ * Do the folio_set_waiters() marking before that
* to let any waker we _just_ missed know they
* need to wake us up (otherwise they'll never
* even go to the slow case that looks at the
@@ -1217,8 +1268,8 @@ repeat:
* lock to avoid races.
*/
spin_lock_irq(&q->lock);
- SetPageWaiters(page);
- if (!trylock_page_bit_common(page, bit_nr, wait))
+ folio_set_waiters(folio);
+ if (!folio_trylock_flag(folio, bit_nr, wait))
__add_wait_queue_entry_tail(q, wait);
spin_unlock_irq(&q->lock);
@@ -1228,10 +1279,10 @@ repeat:
* see whether the page bit testing has already
* been done by the wake function.
*
- * We can drop our reference to the page.
+ * We can drop our reference to the folio.
*/
if (behavior == DROP)
- put_page(page);
+ folio_put(folio);
/*
* Note that until the "finish_wait()", or until
@@ -1268,7 +1319,7 @@ repeat:
*
* And if that fails, we'll have to retry this all.
*/
- if (unlikely(test_and_set_bit(bit_nr, &page->flags)))
+ if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
goto repeat;
wait->flags |= WQ_FLAG_DONE;
@@ -1277,15 +1328,14 @@ repeat:
/*
* If a signal happened, this 'finish_wait()' may remove the last
- * waiter from the wait-queues, but the PageWaiters bit will remain
+ * waiter from the wait-queues, but the folio waiters bit will remain
* set. That's ok. The next wakeup will take care of it, and trying
* to do it here would be difficult and prone to races.
*/
finish_wait(q, wait);
if (thrashing) {
- if (delayacct)
- delayacct_thrashing_end();
+ delayacct_thrashing_end(&in_thrashing);
psi_memstall_leave(&pflags);
}
@@ -1308,95 +1358,134 @@ repeat:
return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}
-void wait_on_page_bit(struct page *page, int bit_nr)
-{
- wait_queue_head_t *q = page_waitqueue(page);
- wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
-}
-EXPORT_SYMBOL(wait_on_page_bit);
-
-int wait_on_page_bit_killable(struct page *page, int bit_nr)
+#ifdef CONFIG_MIGRATION
+/**
+ * migration_entry_wait_on_locked - Wait for a migration entry to be removed
+ * @entry: migration swap entry.
+ * @ptl: already locked ptl. This function will drop the lock.
+ *
+ * Wait for a migration entry referencing the given page to be removed. This is
+ * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except
+ * this can be called without taking a reference on the page. Instead this
+ * should be called while holding the ptl for the migration entry referencing
+ * the page.
+ *
+ * Returns after unlocking the ptl.
+ *
+ * This follows the same logic as folio_wait_bit_common() so see the comments
+ * there.
+ */
+void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
+ __releases(ptl)
{
- wait_queue_head_t *q = page_waitqueue(page);
- return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED);
-}
-EXPORT_SYMBOL(wait_on_page_bit_killable);
+ struct wait_page_queue wait_page;
+ wait_queue_entry_t *wait = &wait_page.wait;
+ bool thrashing = false;
+ unsigned long pflags;
+ bool in_thrashing;
+ wait_queue_head_t *q;
+ struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
-static int __wait_on_page_locked_async(struct page *page,
- struct wait_page_queue *wait, bool set)
-{
- struct wait_queue_head *q = page_waitqueue(page);
- int ret = 0;
+ q = folio_waitqueue(folio);
+ if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
+ delayacct_thrashing_start(&in_thrashing);
+ psi_memstall_enter(&pflags);
+ thrashing = true;
+ }
- wait->page = page;
- wait->bit_nr = PG_locked;
+ init_wait(wait);
+ wait->func = wake_page_function;
+ wait_page.folio = folio;
+ wait_page.bit_nr = PG_locked;
+ wait->flags = 0;
spin_lock_irq(&q->lock);
- __add_wait_queue_entry_tail(q, &wait->wait);
- SetPageWaiters(page);
- if (set)
- ret = !trylock_page(page);
- else
- ret = PageLocked(page);
+ folio_set_waiters(folio);
+ if (!folio_trylock_flag(folio, PG_locked, wait))
+ __add_wait_queue_entry_tail(q, wait);
+ spin_unlock_irq(&q->lock);
+
/*
- * If we were succesful now, we know we're still on the
- * waitqueue as we're still under the lock. This means it's
- * safe to remove and return success, we know the callback
- * isn't going to trigger.
+	 * If a migration entry exists for the page, the migration path must hold
+	 * a valid reference to the page, and it must take the ptl to remove the
+	 * migration entry. So the page remains valid until the ptl is dropped.
*/
- if (!ret)
- __remove_wait_queue(q, &wait->wait);
- else
- ret = -EIOCBQUEUED;
- spin_unlock_irq(&q->lock);
- return ret;
+ spin_unlock(ptl);
+
+ for (;;) {
+ unsigned int flags;
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+
+ /* Loop until we've been woken or interrupted */
+ flags = smp_load_acquire(&wait->flags);
+ if (!(flags & WQ_FLAG_WOKEN)) {
+ if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
+ break;
+
+ io_schedule();
+ continue;
+ }
+ break;
+ }
+
+ finish_wait(q, wait);
+
+ if (thrashing) {
+ delayacct_thrashing_end(&in_thrashing);
+ psi_memstall_leave(&pflags);
+ }
}
+#endif
-static int wait_on_page_locked_async(struct page *page,
- struct wait_page_queue *wait)
+void folio_wait_bit(struct folio *folio, int bit_nr)
{
- if (!PageLocked(page))
- return 0;
- return __wait_on_page_locked_async(compound_head(page), wait, false);
+ folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
}
+EXPORT_SYMBOL(folio_wait_bit);
+
+int folio_wait_bit_killable(struct folio *folio, int bit_nr)
+{
+ return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
+}
+EXPORT_SYMBOL(folio_wait_bit_killable);
/**
- * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
- * @page: The page to wait for.
+ * folio_put_wait_locked - Drop a reference and wait for it to be unlocked
+ * @folio: The folio to wait for.
+ * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
*
- * The caller should hold a reference on @page. They expect the page to
+ * The caller should hold a reference on @folio. They expect the page to
* become unlocked relatively soon, but do not wish to hold up migration
- * (for example) by holding the reference while waiting for the page to
+ * (for example) by holding the reference while waiting for the folio to
* come unlocked. After this function returns, the caller should not
- * dereference @page.
+ * dereference @folio.
+ *
+ * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
*/
-void put_and_wait_on_page_locked(struct page *page)
+static int folio_put_wait_locked(struct folio *folio, int state)
{
- wait_queue_head_t *q;
-
- page = compound_head(page);
- q = page_waitqueue(page);
- wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP);
+ return folio_wait_bit_common(folio, PG_locked, state, DROP);
}
/**
- * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
- * @page: Page defining the wait queue of interest
+ * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue
+ * @folio: Folio defining the wait queue of interest
* @waiter: Waiter to add to the queue
*
- * Add an arbitrary @waiter to the wait queue for the nominated @page.
+ * Add an arbitrary @waiter to the wait queue for the nominated @folio.
*/
-void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
+void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
{
- wait_queue_head_t *q = page_waitqueue(page);
+ wait_queue_head_t *q = folio_waitqueue(folio);
unsigned long flags;
spin_lock_irqsave(&q->lock, flags);
__add_wait_queue_entry_tail(q, waiter);
- SetPageWaiters(page);
+ folio_set_waiters(folio);
spin_unlock_irqrestore(&q->lock, flags);
}
-EXPORT_SYMBOL_GPL(add_page_wait_queue);
+EXPORT_SYMBOL_GPL(folio_add_wait_queue);
#ifndef clear_bit_unlock_is_negative_byte
@@ -1422,123 +1511,174 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem
#endif
/**
- * unlock_page - unlock a locked page
- * @page: the page
+ * folio_unlock - Unlock a locked folio.
+ * @folio: The folio.
*
- * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
- * Also wakes sleepers in wait_on_page_writeback() because the wakeup
- * mechanism between PageLocked pages and PageWriteback pages is shared.
- * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
+ * Unlocks the folio and wakes up any thread sleeping on the page lock.
*
- * Note that this depends on PG_waiters being the sign bit in the byte
- * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
- * clear the PG_locked bit and test PG_waiters at the same time fairly
- * portably (architectures that do LL/SC can test any bit, while x86 can
- * test the sign bit).
+ * Context: May be called from interrupt or process context. May not be
+ * called from NMI context.
*/
-void unlock_page(struct page *page)
+void folio_unlock(struct folio *folio)
{
+ /* Bit 7 allows x86 to check the byte's sign bit */
BUILD_BUG_ON(PG_waiters != 7);
- page = compound_head(page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
- wake_up_page_bit(page, PG_locked);
+ BUILD_BUG_ON(PG_locked > 7);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ if (clear_bit_unlock_is_negative_byte(PG_locked, folio_flags(folio, 0)))
+ folio_wake_bit(folio, PG_locked);
+}
+EXPORT_SYMBOL(folio_unlock);
+
+/**
+ * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
+ * @folio: The folio.
+ *
+ * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
+ * it. The folio reference held for PG_private_2 being set is released.
+ *
+ * This is, for example, used when a netfs folio is being written to a local
+ * disk cache, thereby allowing writes to the cache for the same folio to be
+ * serialised.
+ */
+void folio_end_private_2(struct folio *folio)
+{
+ VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
+ clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
+ folio_wake_bit(folio, PG_private_2);
+ folio_put(folio);
}
-EXPORT_SYMBOL(unlock_page);
+EXPORT_SYMBOL(folio_end_private_2);
/**
- * end_page_writeback - end writeback against a page
- * @page: the page
+ * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
+ * @folio: The folio to wait on.
+ *
+ * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio.
*/
-void end_page_writeback(struct page *page)
+void folio_wait_private_2(struct folio *folio)
{
- /*
- * TestClearPageReclaim could be used here but it is an atomic
- * operation and overkill in this particular case. Failing to
- * shuffle a page marked for immediate reclaim is too mild to
- * justify taking an atomic operation penalty at the end of
- * ever page writeback.
- */
- if (PageReclaim(page)) {
- ClearPageReclaim(page);
- rotate_reclaimable_page(page);
- }
+ while (folio_test_private_2(folio))
+ folio_wait_bit(folio, PG_private_2);
+}
+EXPORT_SYMBOL(folio_wait_private_2);
- if (!test_clear_page_writeback(page))
- BUG();
+/**
+ * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
+ * @folio: The folio to wait on.
+ *
+ * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a
+ * fatal signal is received by the calling task.
+ *
+ * Return:
+ * - 0 if successful.
+ * - -EINTR if a fatal signal was encountered.
+ */
+int folio_wait_private_2_killable(struct folio *folio)
+{
+ int ret = 0;
- smp_mb__after_atomic();
- wake_up_page(page, PG_writeback);
+ while (folio_test_private_2(folio)) {
+ ret = folio_wait_bit_killable(folio, PG_private_2);
+ if (ret < 0)
+ break;
+ }
+
+ return ret;
}
-EXPORT_SYMBOL(end_page_writeback);
+EXPORT_SYMBOL(folio_wait_private_2_killable);
-/*
- * After completing I/O on a page, call this routine to update the page
- * flags appropriately
+/**
+ * folio_end_writeback - End writeback against a folio.
+ * @folio: The folio.
*/
-void page_endio(struct page *page, bool is_write, int err)
+void folio_end_writeback(struct folio *folio)
{
- if (!is_write) {
- if (!err) {
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
- unlock_page(page);
- } else {
- if (err) {
- struct address_space *mapping;
-
- SetPageError(page);
- mapping = page_mapping(page);
- if (mapping)
- mapping_set_error(mapping, err);
- }
- end_page_writeback(page);
+ /*
+ * folio_test_clear_reclaim() could be used here but it is an
+ * atomic operation and overkill in this particular case. Failing
+	 * to shuffle a folio marked for immediate reclaim is too mild a
+	 * problem to justify taking an atomic operation penalty at the
+ * end of every folio writeback.
+ */
+ if (folio_test_reclaim(folio)) {
+ folio_clear_reclaim(folio);
+ folio_rotate_reclaimable(folio);
}
+
+ /*
+ * Writeback does not hold a folio reference of its own, relying
+ * on truncation to wait for the clearing of PG_writeback.
+ * But here we must make sure that the folio is not freed and
+ * reused before the folio_wake().
+ */
+ folio_get(folio);
+ if (!__folio_end_writeback(folio))
+ BUG();
+
+ smp_mb__after_atomic();
+ folio_wake(folio, PG_writeback);
+ acct_reclaim_writeback(folio);
+ folio_put(folio);
}
-EXPORT_SYMBOL_GPL(page_endio);
+EXPORT_SYMBOL(folio_end_writeback);
/**
- * __lock_page - get a lock on the page, assuming we need to sleep to get it
- * @__page: the page to lock
+ * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
+ * @folio: The folio to lock
*/
-void __lock_page(struct page *__page)
+void __folio_lock(struct folio *folio)
{
- struct page *page = compound_head(__page);
- wait_queue_head_t *q = page_waitqueue(page);
- wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE,
+ folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
EXCLUSIVE);
}
-EXPORT_SYMBOL(__lock_page);
+EXPORT_SYMBOL(__folio_lock);
-int __lock_page_killable(struct page *__page)
+int __folio_lock_killable(struct folio *folio)
{
- struct page *page = compound_head(__page);
- wait_queue_head_t *q = page_waitqueue(page);
- return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE,
+ return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
EXCLUSIVE);
}
-EXPORT_SYMBOL_GPL(__lock_page_killable);
+EXPORT_SYMBOL_GPL(__folio_lock_killable);
-int __lock_page_async(struct page *page, struct wait_page_queue *wait)
+static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
{
- return __wait_on_page_locked_async(page, wait, true);
+ struct wait_queue_head *q = folio_waitqueue(folio);
+ int ret = 0;
+
+ wait->folio = folio;
+ wait->bit_nr = PG_locked;
+
+ spin_lock_irq(&q->lock);
+ __add_wait_queue_entry_tail(q, &wait->wait);
+ folio_set_waiters(folio);
+ ret = !folio_trylock(folio);
+ /*
+ * If we were successful now, we know we're still on the
+ * waitqueue as we're still under the lock. This means it's
+ * safe to remove and return success, we know the callback
+ * isn't going to trigger.
+ */
+ if (!ret)
+ __remove_wait_queue(q, &wait->wait);
+ else
+ ret = -EIOCBQUEUED;
+ spin_unlock_irq(&q->lock);
+ return ret;
}
/*
* Return values:
- * 1 - page is locked; mmap_lock is still held.
- * 0 - page is not locked.
+ * true - folio is locked; mmap_lock is still held.
+ * false - folio is not locked.
* mmap_lock has been released (mmap_read_unlock(), unless flags had both
* FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
* which case mmap_lock is still held.
*
- * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
- * with the page locked and the mmap_lock unperturbed.
+ * If neither ALLOW_RETRY nor KILLABLE are set, will always return true
+ * with the folio locked and the mmap_lock unperturbed.
*/
-int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm,
unsigned int flags)
{
if (fault_flag_allow_retry_first(flags)) {
@@ -1547,27 +1687,28 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
* even though return 0.
*/
if (flags & FAULT_FLAG_RETRY_NOWAIT)
- return 0;
+ return false;
mmap_read_unlock(mm);
if (flags & FAULT_FLAG_KILLABLE)
- wait_on_page_locked_killable(page);
+ folio_wait_locked_killable(folio);
else
- wait_on_page_locked(page);
- return 0;
- } else {
- if (flags & FAULT_FLAG_KILLABLE) {
- int ret;
+ folio_wait_locked(folio);
+ return false;
+ }
+ if (flags & FAULT_FLAG_KILLABLE) {
+ bool ret;
- ret = __lock_page_killable(page);
- if (ret) {
- mmap_read_unlock(mm);
- return 0;
- }
- } else
- __lock_page(page);
- return 1;
+ ret = __folio_lock_killable(folio);
+ if (ret) {
+ mmap_read_unlock(mm);
+ return false;
+ }
+ } else {
+ __folio_lock(folio);
}
+
+ return true;
}
/**
@@ -1642,337 +1783,371 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
}
EXPORT_SYMBOL(page_cache_prev_miss);
-/**
- * find_get_entry - find and get a page cache entry
+/*
+ * Lockless page cache protocol:
+ * On the lookup side:
+ * 1. Load the folio from i_pages
+ * 2. Increment the refcount if it's not zero
+ * 3. If the folio is not found by xas_reload(), put the refcount and retry
+ *
+ * On the removal side:
+ * A. Freeze the page (by zeroing the refcount if nobody else has a reference)
+ * B. Remove the page from i_pages
+ * C. Return the page to the page allocator
+ *
+ * This means that any page may have its reference count temporarily
+ * increased by a speculative page cache (or fast GUP) lookup as it can
+ * be allocated by another user before the RCU grace period expires.
+ * Because the refcount temporarily acquired here may end up being the
+ * last refcount on the page, any page allocation must be freeable by
+ * folio_put().
+ */
+
+/*
+ * filemap_get_entry - Get a page cache entry.
* @mapping: the address_space to search
* @index: The page cache index.
*
- * Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, the head page is returned with an increased refcount.
- *
- * If the slot holds a shadow entry of a previously evicted page, or a
- * swap entry from shmem/tmpfs, it is returned.
+ * Looks up the page cache entry at @mapping & @index. If it is a folio,
+ * it is returned with an increased refcount. If it is a shadow entry
+ * of a previously evicted folio, or a swap entry from shmem/tmpfs,
+ * it is returned without further action.
*
- * Return: The head page or shadow entry, %NULL if nothing is found.
+ * Return: The folio, swap or shadow entry, %NULL if nothing is found.
*/
-struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
+void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
{
XA_STATE(xas, &mapping->i_pages, index);
- struct page *page;
+ struct folio *folio;
rcu_read_lock();
repeat:
xas_reset(&xas);
- page = xas_load(&xas);
- if (xas_retry(&xas, page))
+ folio = xas_load(&xas);
+ if (xas_retry(&xas, folio))
goto repeat;
/*
* A shadow entry of a recently evicted page, or a swap entry from
* shmem/tmpfs. Return it without attempting to raise page count.
*/
- if (!page || xa_is_value(page))
+ if (!folio || xa_is_value(folio))
goto out;
- if (!page_cache_get_speculative(page))
+ if (!folio_try_get_rcu(folio))
goto repeat;
- /*
- * Has the page moved or been split?
- * This is part of the lockless pagecache protocol. See
- * include/linux/pagemap.h for details.
- */
- if (unlikely(page != xas_reload(&xas))) {
- put_page(page);
+ if (unlikely(folio != xas_reload(&xas))) {
+ folio_put(folio);
goto repeat;
}
out:
rcu_read_unlock();
- return page;
-}
-
-/**
- * find_lock_entry - Locate and lock a page cache entry.
- * @mapping: The address_space to search.
- * @index: The page cache index.
- *
- * Looks up the page at @mapping & @index. If there is a page in the
- * cache, the head page is returned locked and with an increased refcount.
- *
- * If the slot holds a shadow entry of a previously evicted page, or a
- * swap entry from shmem/tmpfs, it is returned.
- *
- * Context: May sleep.
- * Return: The head page or shadow entry, %NULL if nothing is found.
- */
-struct page *find_lock_entry(struct address_space *mapping, pgoff_t index)
-{
- struct page *page;
-
-repeat:
- page = find_get_entry(mapping, index);
- if (page && !xa_is_value(page)) {
- lock_page(page);
- /* Has the page been truncated? */
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- put_page(page);
- goto repeat;
- }
- VM_BUG_ON_PAGE(!thp_contains(page, index), page);
- }
- return page;
+ return folio;
}
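As a rough illustration of the new calling convention (not part of this patch), a caller has to filter out value entries itself before treating the result as a folio. The helper name my_lookup_uptodate() and its policy are purely illustrative; only the interface above plus standard folio helpers are assumed.

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/xarray.h>

/* Illustrative only: return an uptodate folio or NULL, dropping the
 * reference filemap_get_entry() took when the entry is not usable.
 */
static struct folio *my_lookup_uptodate(struct address_space *mapping,
					pgoff_t index)
{
	void *entry = filemap_get_entry(mapping, index);
	struct folio *folio;

	/* Shadow, swap and DAX entries are value entries, not folios. */
	if (!entry || xa_is_value(entry))
		return NULL;

	folio = entry;
	if (!folio_test_uptodate(folio)) {
		folio_put(folio);
		return NULL;
	}
	return folio;
}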
/**
- * pagecache_get_page - Find and get a reference to a page.
+ * __filemap_get_folio - Find and get a reference to a folio.
* @mapping: The address_space to search.
* @index: The page index.
- * @fgp_flags: %FGP flags modify how the page is returned.
- * @gfp_mask: Memory allocation flags to use if %FGP_CREAT is specified.
+ * @fgp_flags: %FGP flags modify how the folio is returned.
+ * @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
*
* Looks up the page cache entry at @mapping & @index.
*
* @fgp_flags can be zero or more of these flags:
*
- * * %FGP_ACCESSED - The page will be marked accessed.
- * * %FGP_LOCK - The page is returned locked.
- * * %FGP_HEAD - If the page is present and a THP, return the head page
- * rather than the exact page specified by the index.
+ * * %FGP_ACCESSED - The folio will be marked accessed.
+ * * %FGP_LOCK - The folio is returned locked.
* * %FGP_CREAT - If no page is present then a new page is allocated using
- * @gfp_mask and added to the page cache and the VM's LRU list.
+ * @gfp and added to the page cache and the VM's LRU list.
* The page is returned locked and with an increased refcount.
* * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
* page is already in cache. If the page was allocated, unlock it before
* returning so the caller can do the same dance.
- * * %FGP_WRITE - The page will be written
- * * %FGP_NOFS - __GFP_FS will get cleared in gfp mask
- * * %FGP_NOWAIT - Don't get blocked by page lock
+ * * %FGP_WRITE - The page will be written to by the caller.
+ * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
+ * * %FGP_NOWAIT - Don't get blocked by page lock.
+ * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
*
* If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
* if the %GFP flags specified for %FGP_CREAT are atomic.
*
* If there is a page cache page, it is returned with an increased refcount.
*
- * Return: The found page or %NULL otherwise.
+ * Return: The found folio or an ERR_PTR() otherwise.
*/
-struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
- int fgp_flags, gfp_t gfp_mask)
+struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
+ int fgp_flags, gfp_t gfp)
{
- struct page *page;
+ struct folio *folio;
repeat:
- page = find_get_entry(mapping, index);
- if (xa_is_value(page))
- page = NULL;
- if (!page)
+ folio = filemap_get_entry(mapping, index);
+ if (xa_is_value(folio))
+ folio = NULL;
+ if (!folio)
goto no_page;
if (fgp_flags & FGP_LOCK) {
if (fgp_flags & FGP_NOWAIT) {
- if (!trylock_page(page)) {
- put_page(page);
- return NULL;
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
+ return ERR_PTR(-EAGAIN);
}
} else {
- lock_page(page);
+ folio_lock(folio);
}
/* Has the page been truncated? */
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- put_page(page);
+ if (unlikely(folio->mapping != mapping)) {
+ folio_unlock(folio);
+ folio_put(folio);
goto repeat;
}
- VM_BUG_ON_PAGE(!thp_contains(page, index), page);
+ VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
}
if (fgp_flags & FGP_ACCESSED)
- mark_page_accessed(page);
+ folio_mark_accessed(folio);
else if (fgp_flags & FGP_WRITE) {
/* Clear idle flag for buffer write */
- if (page_is_idle(page))
- clear_page_idle(page);
+ if (folio_test_idle(folio))
+ folio_clear_idle(folio);
}
- if (!(fgp_flags & FGP_HEAD))
- page = find_subpage(page, index);
+ if (fgp_flags & FGP_STABLE)
+ folio_wait_stable(folio);
no_page:
- if (!page && (fgp_flags & FGP_CREAT)) {
+ if (!folio && (fgp_flags & FGP_CREAT)) {
int err;
if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
- gfp_mask |= __GFP_WRITE;
+ gfp |= __GFP_WRITE;
if (fgp_flags & FGP_NOFS)
- gfp_mask &= ~__GFP_FS;
+ gfp &= ~__GFP_FS;
+ if (fgp_flags & FGP_NOWAIT) {
+ gfp &= ~GFP_KERNEL;
+ gfp |= GFP_NOWAIT | __GFP_NOWARN;
+ }
- page = __page_cache_alloc(gfp_mask);
- if (!page)
- return NULL;
+ folio = filemap_alloc_folio(gfp, 0);
+ if (!folio)
+ return ERR_PTR(-ENOMEM);
if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
fgp_flags |= FGP_LOCK;
/* Init accessed so avoid atomic mark_page_accessed later */
if (fgp_flags & FGP_ACCESSED)
- __SetPageReferenced(page);
+ __folio_set_referenced(folio);
- err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
+ err = filemap_add_folio(mapping, folio, index, gfp);
if (unlikely(err)) {
- put_page(page);
- page = NULL;
+ folio_put(folio);
+ folio = NULL;
if (err == -EEXIST)
goto repeat;
}
/*
- * add_to_page_cache_lru locks the page, and for mmap we expect
- * an unlocked page.
+ * filemap_add_folio locks the folio, and for mmap
+ * we expect an unlocked folio.
*/
- if (page && (fgp_flags & FGP_FOR_MMAP))
- unlock_page(page);
+ if (folio && (fgp_flags & FGP_FOR_MMAP))
+ folio_unlock(folio);
}
- return page;
+ if (!folio)
+ return ERR_PTR(-ENOENT);
+ return folio;
+}
+EXPORT_SYMBOL(__filemap_get_folio);
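A minimal sketch of the new error convention (not taken from this patch): __filemap_get_folio() now reports failure with an ERR_PTR() rather than NULL, so callers test IS_ERR(). The helper name my_prepare_write() and its use are hypothetical.

#include <linux/err.h>
#include <linux/pagemap.h>

static int my_prepare_write(struct address_space *mapping, pgoff_t index)
{
	struct folio *folio;

	folio = __filemap_get_folio(mapping, index,
				    FGP_LOCK | FGP_WRITE | FGP_CREAT,
				    mapping_gfp_mask(mapping));
	if (IS_ERR(folio))
		return PTR_ERR(folio);		/* e.g. -ENOMEM */

	/* ... modify the locked folio here ... */

	folio_unlock(folio);
	folio_put(folio);
	return 0;
}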
+
+static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
+ xa_mark_t mark)
+{
+ struct folio *folio;
+
+retry:
+ if (mark == XA_PRESENT)
+ folio = xas_find(xas, max);
+ else
+ folio = xas_find_marked(xas, max, mark);
+
+ if (xas_retry(xas, folio))
+ goto retry;
+ /*
+ * A shadow entry of a recently evicted page, a swap
+ * entry from shmem/tmpfs or a DAX entry. Return it
+ * without attempting to raise page count.
+ */
+ if (!folio || xa_is_value(folio))
+ return folio;
+
+ if (!folio_try_get_rcu(folio))
+ goto reset;
+
+ if (unlikely(folio != xas_reload(xas))) {
+ folio_put(folio);
+ goto reset;
+ }
+
+ return folio;
+reset:
+ xas_reset(xas);
+ goto retry;
}
-EXPORT_SYMBOL(pagecache_get_page);
/**
* find_get_entries - gang pagecache lookup
* @mapping: The address_space to search
* @start: The starting page cache index
- * @nr_entries: The maximum number of entries
- * @entries: Where the resulting entries are placed
+ * @end: The final page index (inclusive).
+ * @fbatch: Where the resulting entries are placed.
* @indices: The cache indices corresponding to the entries in @entries
*
- * find_get_entries() will search for and return a group of up to
- * @nr_entries entries in the mapping. The entries are placed at
- * @entries. find_get_entries() takes a reference against any actual
- * pages it returns.
+ * find_get_entries() will search for and return a batch of entries in
+ * the mapping. The entries are placed in @fbatch. find_get_entries()
+ * takes a reference on any actual folios it returns.
*
- * The search returns a group of mapping-contiguous page cache entries
- * with ascending indexes. There may be holes in the indices due to
- * not-present pages.
+ * The entries have ascending indexes. The indices may not be consecutive
+ * due to not-present entries or large folios.
*
- * Any shadow entries of evicted pages, or swap entries from
+ * Any shadow entries of evicted folios, or swap entries from
* shmem/tmpfs, are included in the returned array.
*
- * If it finds a Transparent Huge Page, head or tail, find_get_entries()
- * stops at that page: the caller is likely to have a better way to handle
- * the compound page as a whole, and then skip its extent, than repeatedly
- * calling find_get_entries() to return all its tails.
- *
- * Return: the number of pages and shadow entries which were found.
+ * Return: The number of entries which were found.
*/
-unsigned find_get_entries(struct address_space *mapping,
- pgoff_t start, unsigned int nr_entries,
- struct page **entries, pgoff_t *indices)
+unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
+ pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
- XA_STATE(xas, &mapping->i_pages, start);
- struct page *page;
- unsigned int ret = 0;
-
- if (!nr_entries)
- return 0;
+ XA_STATE(xas, &mapping->i_pages, *start);
+ struct folio *folio;
rcu_read_lock();
- xas_for_each(&xas, page, ULONG_MAX) {
- if (xas_retry(&xas, page))
- continue;
- /*
- * A shadow entry of a recently evicted page, a swap
- * entry from shmem/tmpfs or a DAX entry. Return it
- * without attempting to raise page count.
- */
- if (xa_is_value(page))
- goto export;
+ while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
+ indices[fbatch->nr] = xas.xa_index;
+ if (!folio_batch_add(fbatch, folio))
+ break;
+ }
+ rcu_read_unlock();
- if (!page_cache_get_speculative(page))
- goto retry;
+ if (folio_batch_count(fbatch)) {
+ unsigned long nr = 1;
+ int idx = folio_batch_count(fbatch) - 1;
- /* Has the page moved or been split? */
- if (unlikely(page != xas_reload(&xas)))
- goto put_page;
+ folio = fbatch->folios[idx];
+ if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
+ nr = folio_nr_pages(folio);
+ *start = indices[idx] + nr;
+ }
+ return folio_batch_count(fbatch);
+}
- /*
- * Terminate early on finding a THP, to allow the caller to
- * handle it all at once; but continue if this is hugetlbfs.
- */
- if (PageTransHuge(page) && !PageHuge(page)) {
- page = find_subpage(page, xas.xa_index);
- nr_entries = ret + 1;
+/**
+ * find_lock_entries - Find a batch of pagecache entries.
+ * @mapping: The address_space to search.
+ * @start: The starting page cache index.
+ * @end: The final page index (inclusive).
+ * @fbatch: Where the resulting entries are placed.
+ * @indices: The cache indices of the entries in @fbatch.
+ *
+ * find_lock_entries() will return a batch of entries from @mapping.
+ * Swap, shadow and DAX entries are included. Folios are returned
+ * locked and with an incremented refcount. Folios which are locked
+ * by somebody else or under writeback are skipped. Folios which are
+ * partially outside the range are not returned.
+ *
+ * The entries have ascending indexes. The indices may not be consecutive
+ * due to not-present entries, large folios, folios which could not be
+ * locked or folios under writeback.
+ *
+ * Return: The number of entries which were found.
+ */
+unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
+ pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
+{
+ XA_STATE(xas, &mapping->i_pages, *start);
+ struct folio *folio;
+
+ rcu_read_lock();
+ while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
+ if (!xa_is_value(folio)) {
+ if (folio->index < *start)
+ goto put;
+ if (folio->index + folio_nr_pages(folio) - 1 > end)
+ goto put;
+ if (!folio_trylock(folio))
+ goto put;
+ if (folio->mapping != mapping ||
+ folio_test_writeback(folio))
+ goto unlock;
+ VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),
+ folio);
}
-export:
- indices[ret] = xas.xa_index;
- entries[ret] = page;
- if (++ret == nr_entries)
+ indices[fbatch->nr] = xas.xa_index;
+ if (!folio_batch_add(fbatch, folio))
break;
continue;
-put_page:
- put_page(page);
-retry:
- xas_reset(&xas);
+unlock:
+ folio_unlock(folio);
+put:
+ folio_put(folio);
}
rcu_read_unlock();
- return ret;
+
+ if (folio_batch_count(fbatch)) {
+ unsigned long nr = 1;
+ int idx = folio_batch_count(fbatch) - 1;
+
+ folio = fbatch->folios[idx];
+ if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
+ nr = folio_nr_pages(folio);
+ *start = indices[idx] + nr;
+ }
+ return folio_batch_count(fbatch);
}
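As a hedged sketch of the pattern find_lock_entries() serves (the real callers are truncation and shmem), a caller consumes locked folios and value entries from the batch and releases everything afterwards; my_drop_range() and the per-folio action are illustrative placeholders.

#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/xarray.h>

static void my_drop_range(struct address_space *mapping,
			  pgoff_t index, pgoff_t end)
{
	pgoff_t indices[PAGEVEC_SIZE];
	struct folio_batch fbatch;
	unsigned int i;

	folio_batch_init(&fbatch);
	while (find_lock_entries(mapping, &index, end, &fbatch, indices)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];

			if (xa_is_value(folio))
				continue;	/* shadow or swap entry */
			/* ... act on the locked folio here ... */
			folio_unlock(folio);
		}
		folio_batch_remove_exceptionals(&fbatch);
		folio_batch_release(&fbatch);
		cond_resched();
	}
}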
/**
- * find_get_pages_range - gang pagecache lookup
+ * filemap_get_folios - Get a batch of folios
* @mapping: The address_space to search
* @start: The starting page index
* @end: The final page index (inclusive)
- * @nr_pages: The maximum number of pages
- * @pages: Where the resulting pages are placed
+ * @fbatch: The batch to fill.
*
- * find_get_pages_range() will search for and return a group of up to @nr_pages
- * pages in the mapping starting at index @start and up to index @end
- * (inclusive). The pages are placed at @pages. find_get_pages_range() takes
- * a reference against the returned pages.
+ * Search for and return a batch of folios in the mapping starting at
+ * index @start and up to index @end (inclusive). The folios are returned
+ * in @fbatch with an elevated reference count.
*
- * The search returns a group of mapping-contiguous pages with ascending
- * indexes. There may be holes in the indices due to not-present pages.
- * We also update @start to index the next page for the traversal.
+ * The first folio may start before @start; if it does, it will contain
+ * @start. The final folio may extend beyond @end; if it does, it will
+ * contain @end. The folios have ascending indices. There may be gaps
+ * between the folios if there are indices which have no folio in the
+ * page cache. If folios are added to or removed from the page cache
+ * while this is running, they may or may not be found by this call.
*
- * Return: the number of pages which were found. If this number is
- * smaller than @nr_pages, the end of specified range has been
- * reached.
+ * Return: The number of folios which were found.
+ * We also update @start to index the next folio for the traversal.
*/
-unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
- pgoff_t end, unsigned int nr_pages,
- struct page **pages)
+unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
+ pgoff_t end, struct folio_batch *fbatch)
{
XA_STATE(xas, &mapping->i_pages, *start);
- struct page *page;
- unsigned ret = 0;
-
- if (unlikely(!nr_pages))
- return 0;
+ struct folio *folio;
rcu_read_lock();
- xas_for_each(&xas, page, end) {
- if (xas_retry(&xas, page))
- continue;
+ while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
/* Skip over shadow, swap and DAX entries */
- if (xa_is_value(page))
+ if (xa_is_value(folio))
continue;
+ if (!folio_batch_add(fbatch, folio)) {
+ unsigned long nr = folio_nr_pages(folio);
- if (!page_cache_get_speculative(page))
- goto retry;
-
- /* Has the page moved or been split? */
- if (unlikely(page != xas_reload(&xas)))
- goto put_page;
-
- pages[ret] = find_subpage(page, xas.xa_index);
- if (++ret == nr_pages) {
- *start = xas.xa_index + 1;
+ if (folio_test_hugetlb(folio))
+ nr = 1;
+ *start = folio->index + nr;
goto out;
}
- continue;
-put_page:
- put_page(page);
-retry:
- xas_reset(&xas);
}
/*
@@ -1988,135 +2163,146 @@ retry:
out:
rcu_read_unlock();
- return ret;
+ return folio_batch_count(fbatch);
+}
+EXPORT_SYMBOL(filemap_get_folios);
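A minimal usage sketch, assuming only the interface above: iterate every cached folio in a range and let filemap_get_folios() advance the cursor. my_walk_range() and the per-folio work are placeholders.

#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>

static void my_walk_range(struct address_space *mapping,
			  pgoff_t first, pgoff_t last)
{
	struct folio_batch fbatch;
	unsigned int i;

	folio_batch_init(&fbatch);
	while (filemap_get_folios(mapping, &first, last, &fbatch)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];

			/* A reference is held on each returned folio. */
			folio_mark_accessed(folio);	/* placeholder action */
		}
		folio_batch_release(&fbatch);	/* drops those references */
		cond_resched();
	}
}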
+
+static inline
+bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max)
+{
+ if (!folio_test_large(folio) || folio_test_hugetlb(folio))
+ return false;
+ if (index >= max)
+ return false;
+ return index < folio->index + folio_nr_pages(folio) - 1;
}
/**
- * find_get_pages_contig - gang contiguous pagecache lookup
+ * filemap_get_folios_contig - Get a batch of contiguous folios
* @mapping: The address_space to search
- * @index: The starting page index
- * @nr_pages: The maximum number of pages
- * @pages: Where the resulting pages are placed
+ * @start: The starting page index
+ * @end: The final page index (inclusive)
+ * @fbatch: The batch to fill
*
- * find_get_pages_contig() works exactly like find_get_pages(), except
- * that the returned number of pages are guaranteed to be contiguous.
+ * filemap_get_folios_contig() works exactly like filemap_get_folios(),
+ * except the returned folios are guaranteed to be contiguous. This may
+ * not return all contiguous folios if the batch gets filled up.
*
- * Return: the number of pages which were found.
+ * Return: The number of folios found.
+ * Also update @start to be positioned for traversal of the next folio.
*/
-unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
- unsigned int nr_pages, struct page **pages)
-{
- XA_STATE(xas, &mapping->i_pages, index);
- struct page *page;
- unsigned int ret = 0;
- if (unlikely(!nr_pages))
- return 0;
+unsigned filemap_get_folios_contig(struct address_space *mapping,
+ pgoff_t *start, pgoff_t end, struct folio_batch *fbatch)
+{
+ XA_STATE(xas, &mapping->i_pages, *start);
+ unsigned long nr;
+ struct folio *folio;
rcu_read_lock();
- for (page = xas_load(&xas); page; page = xas_next(&xas)) {
- if (xas_retry(&xas, page))
+
+ for (folio = xas_load(&xas); folio && xas.xa_index <= end;
+ folio = xas_next(&xas)) {
+ if (xas_retry(&xas, folio))
continue;
/*
* If the entry has been swapped out, we can stop looking.
* No current caller is looking for DAX entries.
*/
- if (xa_is_value(page))
- break;
+ if (xa_is_value(folio))
+ goto update_start;
- if (!page_cache_get_speculative(page))
+ if (!folio_try_get_rcu(folio))
goto retry;
- /* Has the page moved or been split? */
- if (unlikely(page != xas_reload(&xas)))
- goto put_page;
+ if (unlikely(folio != xas_reload(&xas)))
+ goto put_folio;
- pages[ret] = find_subpage(page, xas.xa_index);
- if (++ret == nr_pages)
- break;
+ if (!folio_batch_add(fbatch, folio)) {
+ nr = folio_nr_pages(folio);
+
+ if (folio_test_hugetlb(folio))
+ nr = 1;
+ *start = folio->index + nr;
+ goto out;
+ }
continue;
-put_page:
- put_page(page);
+put_folio:
+ folio_put(folio);
+
retry:
xas_reset(&xas);
}
+
+update_start:
+ nr = folio_batch_count(fbatch);
+
+ if (nr) {
+ folio = fbatch->folios[nr - 1];
+ if (folio_test_hugetlb(folio))
+ *start = folio->index + 1;
+ else
+ *start = folio->index + folio_nr_pages(folio);
+ }
+out:
rcu_read_unlock();
- return ret;
+ return folio_batch_count(fbatch);
}
-EXPORT_SYMBOL(find_get_pages_contig);
+EXPORT_SYMBOL(filemap_get_folios_contig);
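One possible use, sketched under the interface above with an invented helper name: measure how far the pagecache is contiguously populated starting at an index by looping while the batch comes back full.

#include <linux/pagemap.h>
#include <linux/pagevec.h>

static pgoff_t my_contig_end(struct address_space *mapping,
			     pgoff_t index, pgoff_t max)
{
	struct folio_batch fbatch;
	pgoff_t start = index;
	unsigned int nr;

	folio_batch_init(&fbatch);
	do {
		nr = filemap_get_folios_contig(mapping, &start, max, &fbatch);
		folio_batch_release(&fbatch);
		/* A full batch may mean more contiguous folios follow. */
	} while (nr == PAGEVEC_SIZE && start <= max);

	return start;	/* first index past the contiguous cached run */
}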
/**
- * find_get_pages_range_tag - find and return pages in given range matching @tag
- * @mapping: the address_space to search
- * @index: the starting page index
- * @end: The final page index (inclusive)
- * @tag: the tag index
- * @nr_pages: the maximum number of pages
- * @pages: where the resulting pages are placed
+ * filemap_get_folios_tag - Get a batch of folios matching @tag
+ * @mapping: The address_space to search
+ * @start: The starting page index
+ * @end: The final page index (inclusive)
+ * @tag: The tag index
+ * @fbatch: The batch to fill
*
- * Like find_get_pages, except we only return pages which are tagged with
- * @tag. We update @index to index the next page for the traversal.
+ * Same as filemap_get_folios(), but only returning folios tagged with @tag.
*
- * Return: the number of pages which were found.
+ * Return: The number of folios found.
+ * Also update @start to index the next folio for traversal.
*/
-unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
- pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
- struct page **pages)
+unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
+ pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch)
{
- XA_STATE(xas, &mapping->i_pages, *index);
- struct page *page;
- unsigned ret = 0;
-
- if (unlikely(!nr_pages))
- return 0;
+ XA_STATE(xas, &mapping->i_pages, *start);
+ struct folio *folio;
rcu_read_lock();
- xas_for_each_marked(&xas, page, end, tag) {
- if (xas_retry(&xas, page))
- continue;
+ while ((folio = find_get_entry(&xas, end, tag)) != NULL) {
/*
* Shadow entries should never be tagged, but this iteration
* is lockless so there is a window for page reclaim to evict
- * a page we saw tagged. Skip over it.
+ * a page we saw tagged. Skip over it.
*/
- if (xa_is_value(page))
+ if (xa_is_value(folio))
continue;
+ if (!folio_batch_add(fbatch, folio)) {
+ unsigned long nr = folio_nr_pages(folio);
- if (!page_cache_get_speculative(page))
- goto retry;
-
- /* Has the page moved or been split? */
- if (unlikely(page != xas_reload(&xas)))
- goto put_page;
-
- pages[ret] = find_subpage(page, xas.xa_index);
- if (++ret == nr_pages) {
- *index = xas.xa_index + 1;
+ if (folio_test_hugetlb(folio))
+ nr = 1;
+ *start = folio->index + nr;
goto out;
}
- continue;
-put_page:
- put_page(page);
-retry:
- xas_reset(&xas);
}
-
/*
- * We come here when we got to @end. We take care to not overflow the
- * index @index as it confuses some of the callers. This breaks the
- * iteration when there is a page at index -1 but that is already
- * broken anyway.
+ * We come here when there is no page beyond @end. We take care to not
+ * overflow the index @start as it confuses some of the callers. This
+ * breaks the iteration when there is a page at index -1 but that is
+ * already broken anyway.
*/
if (end == (pgoff_t)-1)
- *index = (pgoff_t)-1;
+ *start = (pgoff_t)-1;
else
- *index = end + 1;
+ *start = end + 1;
out:
rcu_read_unlock();
- return ret;
+ return folio_batch_count(fbatch);
}
-EXPORT_SYMBOL(find_get_pages_range_tag);
+EXPORT_SYMBOL(filemap_get_folios_tag);
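A hedged sketch of the intended writeback-style use: walk dirty folios in a range with PAGECACHE_TAG_DIRTY and recheck state under the folio lock. my_write_dirty_range() and the actual writeback hook are illustrative.

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>

static void my_write_dirty_range(struct address_space *mapping,
				 pgoff_t index, pgoff_t end)
{
	struct folio_batch fbatch;
	unsigned int i, nr;

	folio_batch_init(&fbatch);
	while ((nr = filemap_get_folios_tag(mapping, &index, end,
					    PAGECACHE_TAG_DIRTY, &fbatch))) {
		for (i = 0; i < nr; i++) {
			struct folio *folio = fbatch.folios[i];

			folio_lock(folio);
			/* Recheck under the lock; the folio may have been
			 * cleaned or truncated since the lockless lookup.
			 */
			if (folio->mapping == mapping &&
			    folio_test_dirty(folio)) {
				/* ... start writeback on this folio ... */
			}
			folio_unlock(folio);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
}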
/*
* CD/DVDs are error prone. When a medium error occurs, the driver may fail
@@ -2138,299 +2324,452 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra)
ra->ra_pages /= 4;
}
-/**
- * generic_file_buffered_read - generic file read routine
- * @iocb: the iocb to read
- * @iter: data destination
- * @written: already copied
+/*
+ * filemap_get_read_batch - Get a batch of folios for read
*
- * This is a generic file read routine, and uses the
- * mapping->a_ops->readpage() function for the actual low-level stuff.
+ * Get a batch of folios which represent a contiguous range of bytes in
+ * the file. No exceptional entries will be returned. If @index is in
+ * the middle of a folio, the entire folio will be returned. The last
+ * folio in the batch may have the readahead flag set or the uptodate flag
+ * clear so that the caller can take the appropriate action.
+ */
+static void filemap_get_read_batch(struct address_space *mapping,
+ pgoff_t index, pgoff_t max, struct folio_batch *fbatch)
+{
+ XA_STATE(xas, &mapping->i_pages, index);
+ struct folio *folio;
+
+ rcu_read_lock();
+ for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
+ if (xas_retry(&xas, folio))
+ continue;
+ if (xas.xa_index > max || xa_is_value(folio))
+ break;
+ if (xa_is_sibling(folio))
+ break;
+ if (!folio_try_get_rcu(folio))
+ goto retry;
+
+ if (unlikely(folio != xas_reload(&xas)))
+ goto put_folio;
+
+ if (!folio_batch_add(fbatch, folio))
+ break;
+ if (!folio_test_uptodate(folio))
+ break;
+ if (folio_test_readahead(folio))
+ break;
+ xas_advance(&xas, folio->index + folio_nr_pages(folio) - 1);
+ continue;
+put_folio:
+ folio_put(folio);
+retry:
+ xas_reset(&xas);
+ }
+ rcu_read_unlock();
+}
+
+static int filemap_read_folio(struct file *file, filler_t filler,
+ struct folio *folio)
+{
+ bool workingset = folio_test_workingset(folio);
+ unsigned long pflags;
+ int error;
+
+ /*
+ * A previous I/O error may have been due to temporary failures,
+ * eg. multipath errors. PG_error will be set again if read_folio
+ * fails.
+ */
+ folio_clear_error(folio);
+
+ /* Start the actual read. The read will unlock the page. */
+ if (unlikely(workingset))
+ psi_memstall_enter(&pflags);
+ error = filler(file, folio);
+ if (unlikely(workingset))
+ psi_memstall_leave(&pflags);
+ if (error)
+ return error;
+
+ error = folio_wait_locked_killable(folio);
+ if (error)
+ return error;
+ if (folio_test_uptodate(folio))
+ return 0;
+ if (file)
+ shrink_readahead_size_eio(&file->f_ra);
+ return -EIO;
+}
+
+static bool filemap_range_uptodate(struct address_space *mapping,
+ loff_t pos, size_t count, struct folio *folio,
+ bool need_uptodate)
+{
+ if (folio_test_uptodate(folio))
+ return true;
+ /* pipes can't handle partially uptodate pages */
+ if (need_uptodate)
+ return false;
+ if (!mapping->a_ops->is_partially_uptodate)
+ return false;
+ if (mapping->host->i_blkbits >= folio_shift(folio))
+ return false;
+
+ if (folio_pos(folio) > pos) {
+ count -= folio_pos(folio) - pos;
+ pos = 0;
+ } else {
+ pos -= folio_pos(folio);
+ }
+
+ return mapping->a_ops->is_partially_uptodate(folio, pos, count);
+}
+
+static int filemap_update_page(struct kiocb *iocb,
+ struct address_space *mapping, size_t count,
+ struct folio *folio, bool need_uptodate)
+{
+ int error;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!filemap_invalidate_trylock_shared(mapping))
+ return -EAGAIN;
+ } else {
+ filemap_invalidate_lock_shared(mapping);
+ }
+
+ if (!folio_trylock(folio)) {
+ error = -EAGAIN;
+ if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
+ goto unlock_mapping;
+ if (!(iocb->ki_flags & IOCB_WAITQ)) {
+ filemap_invalidate_unlock_shared(mapping);
+ /*
+ * This is where we usually end up waiting for a
+ * previously submitted readahead to finish.
+ */
+ folio_put_wait_locked(folio, TASK_KILLABLE);
+ return AOP_TRUNCATED_PAGE;
+ }
+ error = __folio_lock_async(folio, iocb->ki_waitq);
+ if (error)
+ goto unlock_mapping;
+ }
+
+ error = AOP_TRUNCATED_PAGE;
+ if (!folio->mapping)
+ goto unlock;
+
+ error = 0;
+ if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio,
+ need_uptodate))
+ goto unlock;
+
+ error = -EAGAIN;
+ if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
+ goto unlock;
+
+ error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,
+ folio);
+ goto unlock_mapping;
+unlock:
+ folio_unlock(folio);
+unlock_mapping:
+ filemap_invalidate_unlock_shared(mapping);
+ if (error == AOP_TRUNCATED_PAGE)
+ folio_put(folio);
+ return error;
+}
+
+static int filemap_create_folio(struct file *file,
+ struct address_space *mapping, pgoff_t index,
+ struct folio_batch *fbatch)
+{
+ struct folio *folio;
+ int error;
+
+ folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0);
+ if (!folio)
+ return -ENOMEM;
+
+ /*
+ * Protect against truncate / hole punch. Grabbing invalidate_lock
+ * here assures we cannot instantiate and bring uptodate new
+ * pagecache folios after evicting page cache during truncate
+ * and before actually freeing blocks. Note that we could
+ * release invalidate_lock after inserting the folio into
+ * the page cache as the locked folio would then be enough to
+ * synchronize with hole punching. But there are code paths
+ * such as filemap_update_page() filling in partially uptodate
+ * pages or ->readahead() that need to hold invalidate_lock
+ * while mapping blocks for IO so let's hold the lock here as
+ * well to keep locking rules simple.
+ */
+ filemap_invalidate_lock_shared(mapping);
+ error = filemap_add_folio(mapping, folio, index,
+ mapping_gfp_constraint(mapping, GFP_KERNEL));
+ if (error == -EEXIST)
+ error = AOP_TRUNCATED_PAGE;
+ if (error)
+ goto error;
+
+ error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
+ if (error)
+ goto error;
+
+ filemap_invalidate_unlock_shared(mapping);
+ folio_batch_add(fbatch, folio);
+ return 0;
+error:
+ filemap_invalidate_unlock_shared(mapping);
+ folio_put(folio);
+ return error;
+}
+
+static int filemap_readahead(struct kiocb *iocb, struct file *file,
+ struct address_space *mapping, struct folio *folio,
+ pgoff_t last_index)
+{
+ DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index);
+
+ if (iocb->ki_flags & IOCB_NOIO)
+ return -EAGAIN;
+ page_cache_async_ra(&ractl, folio, last_index - folio->index);
+ return 0;
+}
+
+static int filemap_get_pages(struct kiocb *iocb, size_t count,
+ struct folio_batch *fbatch, bool need_uptodate)
+{
+ struct file *filp = iocb->ki_filp;
+ struct address_space *mapping = filp->f_mapping;
+ struct file_ra_state *ra = &filp->f_ra;
+ pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
+ pgoff_t last_index;
+ struct folio *folio;
+ int err = 0;
+
+ /* "last_index" is the index of the page beyond the end of the read */
+ last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
+retry:
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
+ filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
+ if (!folio_batch_count(fbatch)) {
+ if (iocb->ki_flags & IOCB_NOIO)
+ return -EAGAIN;
+ page_cache_sync_readahead(mapping, ra, filp, index,
+ last_index - index);
+ filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
+ }
+ if (!folio_batch_count(fbatch)) {
+ if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
+ return -EAGAIN;
+ err = filemap_create_folio(filp, mapping,
+ iocb->ki_pos >> PAGE_SHIFT, fbatch);
+ if (err == AOP_TRUNCATED_PAGE)
+ goto retry;
+ return err;
+ }
+
+ folio = fbatch->folios[folio_batch_count(fbatch) - 1];
+ if (folio_test_readahead(folio)) {
+ err = filemap_readahead(iocb, filp, mapping, folio, last_index);
+ if (err)
+ goto err;
+ }
+ if (!folio_test_uptodate(folio)) {
+ if ((iocb->ki_flags & IOCB_WAITQ) &&
+ folio_batch_count(fbatch) > 1)
+ iocb->ki_flags |= IOCB_NOWAIT;
+ err = filemap_update_page(iocb, mapping, count, folio,
+ need_uptodate);
+ if (err)
+ goto err;
+ }
+
+ return 0;
+err:
+ if (err < 0)
+ folio_put(folio);
+ if (likely(--fbatch->nr))
+ return 0;
+ if (err == AOP_TRUNCATED_PAGE)
+ goto retry;
+ return err;
+}
+
+static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
+{
+ unsigned int shift = folio_shift(folio);
+
+ return (pos1 >> shift == pos2 >> shift);
+}
+
+/**
+ * filemap_read - Read data from the page cache.
+ * @iocb: The iocb to read.
+ * @iter: Destination for the data.
+ * @already_read: Number of bytes already read by the caller.
*
- * This is really ugly. But the goto's actually try to clarify some
- * of the logic when it comes to error handling etc.
+ * Copies data from the page cache. If the data is not currently present,
+ * uses the readahead and read_folio address_space operations to fetch it.
*
- * Return:
- * * total number of bytes copied, including those the were already @written
- * * negative error code if nothing was copied
+ * Return: Total number of bytes copied, including those already read by
+ * the caller. If an error happens before any bytes are copied, returns
+ * a negative error number.
*/
-ssize_t generic_file_buffered_read(struct kiocb *iocb,
- struct iov_iter *iter, ssize_t written)
+ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
+ ssize_t already_read)
{
struct file *filp = iocb->ki_filp;
+ struct file_ra_state *ra = &filp->f_ra;
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
- struct file_ra_state *ra = &filp->f_ra;
- loff_t *ppos = &iocb->ki_pos;
- pgoff_t index;
- pgoff_t last_index;
- pgoff_t prev_index;
- unsigned long offset; /* offset into pagecache page */
- unsigned int prev_offset;
- int error = 0;
+ struct folio_batch fbatch;
+ int i, error = 0;
+ bool writably_mapped;
+ loff_t isize, end_offset;
- if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
+ if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
+ return 0;
+ if (unlikely(!iov_iter_count(iter)))
return 0;
+
iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
+ folio_batch_init(&fbatch);
- index = *ppos >> PAGE_SHIFT;
- prev_index = ra->prev_pos >> PAGE_SHIFT;
- prev_offset = ra->prev_pos & (PAGE_SIZE-1);
- last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
- offset = *ppos & ~PAGE_MASK;
+ do {
+ cond_resched();
- for (;;) {
- struct page *page;
- pgoff_t end_index;
- loff_t isize;
- unsigned long nr, ret;
+ /*
+ * If we've already successfully copied some data, then we
+ * can no longer safely return -EIOCBQUEUED. Hence mark
+ * an async read NOWAIT at that point.
+ */
+ if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
+ iocb->ki_flags |= IOCB_NOWAIT;
- cond_resched();
-find_page:
- if (fatal_signal_pending(current)) {
- error = -EINTR;
- goto out;
- }
+ if (unlikely(iocb->ki_pos >= i_size_read(inode)))
+ break;
+
+ error = filemap_get_pages(iocb, iter->count, &fbatch, false);
+ if (error < 0)
+ break;
- page = find_get_page(mapping, index);
- if (!page) {
- if (iocb->ki_flags & IOCB_NOIO)
- goto would_block;
- page_cache_sync_readahead(mapping,
- ra, filp,
- index, last_index - index);
- page = find_get_page(mapping, index);
- if (unlikely(page == NULL))
- goto no_cached_page;
- }
- if (PageReadahead(page)) {
- if (iocb->ki_flags & IOCB_NOIO) {
- put_page(page);
- goto out;
- }
- page_cache_async_readahead(mapping,
- ra, filp, page,
- index, last_index - index);
- }
- if (!PageUptodate(page)) {
- /*
- * See comment in do_read_cache_page on why
- * wait_on_page_locked is used to avoid unnecessarily
- * serialisations and why it's safe.
- */
- if (iocb->ki_flags & IOCB_WAITQ) {
- if (written) {
- put_page(page);
- goto out;
- }
- error = wait_on_page_locked_async(page,
- iocb->ki_waitq);
- } else {
- if (iocb->ki_flags & IOCB_NOWAIT) {
- put_page(page);
- goto would_block;
- }
- error = wait_on_page_locked_killable(page);
- }
- if (unlikely(error))
- goto readpage_error;
- if (PageUptodate(page))
- goto page_ok;
-
- if (inode->i_blkbits == PAGE_SHIFT ||
- !mapping->a_ops->is_partially_uptodate)
- goto page_not_up_to_date;
- /* pipes can't handle partially uptodate pages */
- if (unlikely(iov_iter_is_pipe(iter)))
- goto page_not_up_to_date;
- if (!trylock_page(page))
- goto page_not_up_to_date;
- /* Did it get truncated before we got the lock? */
- if (!page->mapping)
- goto page_not_up_to_date_locked;
- if (!mapping->a_ops->is_partially_uptodate(page,
- offset, iter->count))
- goto page_not_up_to_date_locked;
- unlock_page(page);
- }
-page_ok:
/*
- * i_size must be checked after we know the page is Uptodate.
+ * i_size must be checked after we know the pages are Uptodate.
*
* Checking i_size after the check allows us to calculate
* the correct value for "nr", which means the zero-filled
* part of the page is not copied back to userspace (unless
* another truncate extends the file - this is desired though).
*/
-
isize = i_size_read(inode);
- end_index = (isize - 1) >> PAGE_SHIFT;
- if (unlikely(!isize || index > end_index)) {
- put_page(page);
- goto out;
- }
-
- /* nr is the maximum number of bytes to copy from this page */
- nr = PAGE_SIZE;
- if (index == end_index) {
- nr = ((isize - 1) & ~PAGE_MASK) + 1;
- if (nr <= offset) {
- put_page(page);
- goto out;
- }
- }
- nr = nr - offset;
-
- /* If users can be writing to this page using arbitrary
- * virtual addresses, take care about potential aliasing
- * before reading the page on the kernel side.
- */
- if (mapping_writably_mapped(mapping))
- flush_dcache_page(page);
+ if (unlikely(iocb->ki_pos >= isize))
+ goto put_folios;
+ end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
/*
- * When a sequential read accesses a page several times,
- * only mark it as accessed the first time.
+ * Once we start copying data, we don't want to be touching any
+ * cachelines that might be contended:
*/
- if (prev_index != index || offset != prev_offset)
- mark_page_accessed(page);
- prev_index = index;
+ writably_mapped = mapping_writably_mapped(mapping);
/*
- * Ok, we have the page, and it's up-to-date, so
- * now we can copy it to user space...
+ * When a read accesses the same folio several times, only
+ * mark it as accessed the first time.
*/
+ if (!pos_same_folio(iocb->ki_pos, ra->prev_pos - 1,
+ fbatch.folios[0]))
+ folio_mark_accessed(fbatch.folios[0]);
+
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+ size_t fsize = folio_size(folio);
+ size_t offset = iocb->ki_pos & (fsize - 1);
+ size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
+ fsize - offset);
+ size_t copied;
+
+ if (end_offset < folio_pos(folio))
+ break;
+ if (i > 0)
+ folio_mark_accessed(folio);
+ /*
+ * If users can be writing to this folio using arbitrary
+ * virtual addresses, take care of potential aliasing
+ * before reading the folio on the kernel side.
+ */
+ if (writably_mapped)
+ flush_dcache_folio(folio);
- ret = copy_page_to_iter(page, offset, nr, iter);
- offset += ret;
- index += offset >> PAGE_SHIFT;
- offset &= ~PAGE_MASK;
- prev_offset = offset;
-
- put_page(page);
- written += ret;
- if (!iov_iter_count(iter))
- goto out;
- if (ret < nr) {
- error = -EFAULT;
- goto out;
- }
- continue;
-
-page_not_up_to_date:
- /* Get exclusive access to the page ... */
- if (iocb->ki_flags & IOCB_WAITQ)
- error = lock_page_async(page, iocb->ki_waitq);
- else
- error = lock_page_killable(page);
- if (unlikely(error))
- goto readpage_error;
-
-page_not_up_to_date_locked:
- /* Did it get truncated before we got the lock? */
- if (!page->mapping) {
- unlock_page(page);
- put_page(page);
- continue;
- }
+ copied = copy_folio_to_iter(folio, offset, bytes, iter);
- /* Did somebody else fill it already? */
- if (PageUptodate(page)) {
- unlock_page(page);
- goto page_ok;
- }
+ already_read += copied;
+ iocb->ki_pos += copied;
+ ra->prev_pos = iocb->ki_pos;
-readpage:
- if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
- unlock_page(page);
- put_page(page);
- goto would_block;
- }
- /*
- * A previous I/O error may have been due to temporary
- * failures, eg. multipath errors.
- * PG_error will be set again if readpage fails.
- */
- ClearPageError(page);
- /* Start the actual read. The read will unlock the page. */
- error = mapping->a_ops->readpage(filp, page);
-
- if (unlikely(error)) {
- if (error == AOP_TRUNCATED_PAGE) {
- put_page(page);
- error = 0;
- goto find_page;
+ if (copied < bytes) {
+ error = -EFAULT;
+ break;
}
- goto readpage_error;
}
+put_folios:
+ for (i = 0; i < folio_batch_count(&fbatch); i++)
+ folio_put(fbatch.folios[i]);
+ folio_batch_init(&fbatch);
+ } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
- if (!PageUptodate(page)) {
- if (iocb->ki_flags & IOCB_WAITQ)
- error = lock_page_async(page, iocb->ki_waitq);
- else
- error = lock_page_killable(page);
-
- if (unlikely(error))
- goto readpage_error;
- if (!PageUptodate(page)) {
- if (page->mapping == NULL) {
- /*
- * invalidate_mapping_pages got it
- */
- unlock_page(page);
- put_page(page);
- goto find_page;
- }
- unlock_page(page);
- shrink_readahead_size_eio(ra);
- error = -EIO;
- goto readpage_error;
- }
- unlock_page(page);
- }
+ file_accessed(filp);
- goto page_ok;
+ return already_read ? already_read : error;
+}
+EXPORT_SYMBOL_GPL(filemap_read);
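As a rough illustration (not from this patch): a filesystem whose data always lives in the page cache could route ->read_iter straight to filemap_read(); most callers instead go through generic_file_read_iter() below, which adds the O_DIRECT path. The names my_file_read_iter and my_fops are hypothetical.

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/uio.h>

static ssize_t my_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	/* Nothing copied yet, so the already_read argument is 0. */
	return filemap_read(iocb, to, 0);
}

static const struct file_operations my_fops = {
	.read_iter	= my_file_read_iter,
	.llseek		= generic_file_llseek,
};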
-readpage_error:
- /* UHHUH! A synchronous read error occurred. Report it */
- put_page(page);
- goto out;
+int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ loff_t pos = iocb->ki_pos;
+ loff_t end = pos + count - 1;
-no_cached_page:
- /*
- * Ok, it wasn't cached, so we need to create a new
- * page..
- */
- page = page_cache_alloc(mapping);
- if (!page) {
- error = -ENOMEM;
- goto out;
- }
- error = add_to_page_cache_lru(page, mapping, index,
- mapping_gfp_constraint(mapping, GFP_KERNEL));
- if (error) {
- put_page(page);
- if (error == -EEXIST) {
- error = 0;
- goto find_page;
- }
- goto out;
- }
- goto readpage;
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (filemap_range_needs_writeback(mapping, pos, end))
+ return -EAGAIN;
+ return 0;
}
-would_block:
- error = -EAGAIN;
-out:
- ra->prev_pos = prev_index;
- ra->prev_pos <<= PAGE_SHIFT;
- ra->prev_pos |= prev_offset;
+ return filemap_write_and_wait_range(mapping, pos, end);
+}
- *ppos = ((loff_t)index << PAGE_SHIFT) + offset;
- file_accessed(filp);
- return written ? written : error;
+int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ loff_t pos = iocb->ki_pos;
+ loff_t end = pos + count - 1;
+ int ret;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ /* we could block if there are any pages in the range */
+ if (filemap_range_has_page(mapping, pos, end))
+ return -EAGAIN;
+ } else {
+ ret = filemap_write_and_wait_range(mapping, pos, end);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * After a write we want buffered reads to be sure to go to disk to get
+ * the new data. We invalidate clean cached pages from the region we're
+ * about to write. We do this *before* the write so that we can return
+ * without clobbering -EIOCBQUEUED from ->direct_IO().
+ */
+ return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
}
-EXPORT_SYMBOL_GPL(generic_file_buffered_read);
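A hedged sketch of where kiocb_invalidate_pages() fits in a direct write path, mirroring the generic sequence; my_direct_write() is invented and assumes the filesystem still implements ->direct_IO.

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/uio.h>

static ssize_t my_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	size_t count = iov_iter_count(from);
	ssize_t ret;

	/* Flush and invalidate cached pages covering the write range. */
	ret = kiocb_invalidate_pages(iocb, count);
	if (ret)
		return ret;	/* -EAGAIN for IOCB_NOWAIT with cached pages */

	ret = mapping->a_ops->direct_IO(iocb, from);
	if (ret > 0)
		iocb->ki_pos += ret;
	return ret;
}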
/**
* generic_file_read_iter - generic filesystem read routine
@@ -2460,27 +2799,16 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
ssize_t retval = 0;
if (!count)
- goto out; /* skip atime */
+ return 0; /* skip atime */
if (iocb->ki_flags & IOCB_DIRECT) {
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- loff_t size;
-
- size = i_size_read(inode);
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (filemap_range_has_page(mapping, iocb->ki_pos,
- iocb->ki_pos + count - 1))
- return -EAGAIN;
- } else {
- retval = filemap_write_and_wait_range(mapping,
- iocb->ki_pos,
- iocb->ki_pos + count - 1);
- if (retval < 0)
- goto out;
- }
+ retval = kiocb_write_and_wait(iocb, count);
+ if (retval < 0)
+ return retval;
file_accessed(file);
retval = mapping->a_ops->direct_IO(iocb, iter);
@@ -2488,7 +2816,8 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
iocb->ki_pos += retval;
count -= retval;
}
- iov_iter_revert(iter, count - iov_iter_count(iter));
+ if (retval != -EIOCBQUEUED)
+ iov_iter_revert(iter, count - iov_iter_count(iter));
/*
* Btrfs can have a short DIO read if we encounter
@@ -2499,34 +2828,284 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
* the rest of the read. Buffered reads will not work for
* DAX files, so don't bother trying.
*/
- if (retval < 0 || !count || iocb->ki_pos >= size ||
- IS_DAX(inode))
- goto out;
+ if (retval < 0 || !count || IS_DAX(inode))
+ return retval;
+ if (iocb->ki_pos >= i_size_read(inode))
+ return retval;
}
- retval = generic_file_buffered_read(iocb, iter, retval);
-out:
- return retval;
+ return filemap_read(iocb, iter, retval);
}
EXPORT_SYMBOL(generic_file_read_iter);
+/*
+ * Splice subpages from a folio into a pipe.
+ */
+size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
+ struct folio *folio, loff_t fpos, size_t size)
+{
+ struct page *page;
+ size_t spliced = 0, offset = offset_in_folio(folio, fpos);
+
+ page = folio_page(folio, offset / PAGE_SIZE);
+ size = min(size, folio_size(folio) - offset);
+ offset %= PAGE_SIZE;
+
+ while (spliced < size &&
+ !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ struct pipe_buffer *buf = pipe_head_buf(pipe);
+ size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);
+
+ *buf = (struct pipe_buffer) {
+ .ops = &page_cache_pipe_buf_ops,
+ .page = page,
+ .offset = offset,
+ .len = part,
+ };
+ folio_get(folio);
+ pipe->head++;
+ page++;
+ spliced += part;
+ offset = 0;
+ }
+
+ return spliced;
+}
+
+/**
+ * filemap_splice_read - Splice data from a file's pagecache into a pipe
+ * @in: The file to read from
+ * @ppos: Pointer to the file position to read from
+ * @pipe: The pipe to splice into
+ * @len: The amount to splice
+ * @flags: The SPLICE_F_* flags
+ *
+ * This function gets folios from a file's pagecache and splices them into the
+ * pipe. Readahead will be called as necessary to fill more folios. This may
+ * be used for blockdevs also.
+ *
+ * Return: On success, the number of bytes read will be returned and *@ppos
+ * will be updated if appropriate; 0 will be returned if there is no more data
+ * to be read; -EAGAIN will be returned if the pipe had no space, and some
+ * other negative error code will be returned on error. A short read may occur
+ * if the pipe has insufficient space, we reach the end of the data or we hit a
+ * hole.
+ */
+ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ struct folio_batch fbatch;
+ struct kiocb iocb;
+ size_t total_spliced = 0, used, npages;
+ loff_t isize, end_offset;
+ bool writably_mapped;
+ int i, error = 0;
+
+ if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes))
+ return 0;
+
+ init_sync_kiocb(&iocb, in);
+ iocb.ki_pos = *ppos;
+
+ /* Work out how much data we can actually add into the pipe */
+ used = pipe_occupancy(pipe->head, pipe->tail);
+ npages = max_t(ssize_t, pipe->max_usage - used, 0);
+ len = min_t(size_t, len, npages * PAGE_SIZE);
+
+ folio_batch_init(&fbatch);
+
+ do {
+ cond_resched();
+
+ if (*ppos >= i_size_read(in->f_mapping->host))
+ break;
+
+ iocb.ki_pos = *ppos;
+ error = filemap_get_pages(&iocb, len, &fbatch, true);
+ if (error < 0)
+ break;
+
+ /*
+ * i_size must be checked after we know the pages are Uptodate.
+ *
+ * Checking i_size after the check allows us to calculate
+ * the correct value for "nr", which means the zero-filled
+ * part of the page is not copied back to userspace (unless
+ * another truncate extends the file - this is desired though).
+ */
+ isize = i_size_read(in->f_mapping->host);
+ if (unlikely(*ppos >= isize))
+ break;
+ end_offset = min_t(loff_t, isize, *ppos + len);
+
+ /*
+ * Once we start copying data, we don't want to be touching any
+ * cachelines that might be contended:
+ */
+ writably_mapped = mapping_writably_mapped(in->f_mapping);
+
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+ size_t n;
+
+ if (folio_pos(folio) >= end_offset)
+ goto out;
+ folio_mark_accessed(folio);
+
+ /*
+ * If users can be writing to this folio using arbitrary
+ * virtual addresses, take care of potential aliasing
+ * before reading the folio on the kernel side.
+ */
+ if (writably_mapped)
+ flush_dcache_folio(folio);
+
+ n = min_t(loff_t, len, isize - *ppos);
+ n = splice_folio_into_pipe(pipe, folio, *ppos, n);
+ if (!n)
+ goto out;
+ len -= n;
+ total_spliced += n;
+ *ppos += n;
+ in->f_ra.prev_pos = *ppos;
+ if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ goto out;
+ }
+
+ folio_batch_release(&fbatch);
+ } while (len);
+
+out:
+ folio_batch_release(&fbatch);
+ file_accessed(in);
+
+ return total_spliced ? total_spliced : error;
+}
+EXPORT_SYMBOL(filemap_splice_read);
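In practice, wiring this up is usually just a file_operations entry; a minimal sketch follows (names other than the exported helpers are illustrative).

#include <linux/fs.h>

static const struct file_operations my_cached_fops = {
	.read_iter	= generic_file_read_iter,
	.splice_read	= filemap_splice_read,
	.llseek		= generic_file_llseek,
	.mmap		= generic_file_mmap,
};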
+
+static inline loff_t folio_seek_hole_data(struct xa_state *xas,
+ struct address_space *mapping, struct folio *folio,
+ loff_t start, loff_t end, bool seek_data)
+{
+ const struct address_space_operations *ops = mapping->a_ops;
+ size_t offset, bsz = i_blocksize(mapping->host);
+
+ if (xa_is_value(folio) || folio_test_uptodate(folio))
+ return seek_data ? start : end;
+ if (!ops->is_partially_uptodate)
+ return seek_data ? end : start;
+
+ xas_pause(xas);
+ rcu_read_unlock();
+ folio_lock(folio);
+ if (unlikely(folio->mapping != mapping))
+ goto unlock;
+
+ offset = offset_in_folio(folio, start) & ~(bsz - 1);
+
+ do {
+ if (ops->is_partially_uptodate(folio, offset, bsz) ==
+ seek_data)
+ break;
+ start = (start + bsz) & ~(bsz - 1);
+ offset += bsz;
+ } while (offset < folio_size(folio));
+unlock:
+ folio_unlock(folio);
+ rcu_read_lock();
+ return start;
+}
+
+static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)
+{
+ if (xa_is_value(folio))
+ return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index);
+ return folio_size(folio);
+}
+
+/**
+ * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.
+ * @mapping: Address space to search.
+ * @start: First byte to consider.
+ * @end: Limit of search (exclusive).
+ * @whence: Either SEEK_HOLE or SEEK_DATA.
+ *
+ * If the page cache knows which blocks contain holes and which blocks
+ * contain data, your filesystem can use this function to implement
+ * SEEK_HOLE and SEEK_DATA. This is useful for filesystems which are
+ * entirely memory-based such as tmpfs, and filesystems which support
+ * unwritten extents.
+ *
+ * Return: The requested offset on success, or -ENXIO if @whence specifies
+ * SEEK_DATA and there is no data after @start. There is an implicit hole
+ * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
+ * and @end contain data.
+ */
+loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
+ loff_t end, int whence)
+{
+ XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
+ pgoff_t max = (end - 1) >> PAGE_SHIFT;
+ bool seek_data = (whence == SEEK_DATA);
+ struct folio *folio;
+
+ if (end <= start)
+ return -ENXIO;
+
+ rcu_read_lock();
+ while ((folio = find_get_entry(&xas, max, XA_PRESENT))) {
+ loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;
+ size_t seek_size;
+
+ if (start < pos) {
+ if (!seek_data)
+ goto unlock;
+ start = pos;
+ }
+
+ seek_size = seek_folio_size(&xas, folio);
+ pos = round_up((u64)pos + 1, seek_size);
+ start = folio_seek_hole_data(&xas, mapping, folio, start, pos,
+ seek_data);
+ if (start < pos)
+ goto unlock;
+ if (start >= end)
+ break;
+ if (seek_size > PAGE_SIZE)
+ xas_set(&xas, pos >> PAGE_SHIFT);
+ if (!xa_is_value(folio))
+ folio_put(folio);
+ }
+ if (seek_data)
+ start = -ENXIO;
+unlock:
+ rcu_read_unlock();
+ if (folio && !xa_is_value(folio))
+ folio_put(folio);
+ if (start > end)
+ return end;
+ return start;
+}
+
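A hedged sketch of how an in-memory filesystem might build its llseek on top of mapping_seek_hole_data(); the locking and the helper name are assumptions, loosely modelled on the shmem pattern.

#include <linux/fs.h>
#include <linux/pagemap.h>

static loff_t my_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);

	if (whence != SEEK_DATA && whence != SEEK_HOLE)
		return generic_file_llseek(file, offset, whence);
	if (offset < 0)
		return -ENXIO;

	inode_lock(inode);
	offset = mapping_seek_hole_data(inode->i_mapping, offset,
					i_size_read(inode), whence);
	inode_unlock(inode);

	if (offset >= 0)
		offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
	return offset;
}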
#ifdef CONFIG_MMU
#define MMAP_LOTSAMISS (100)
/*
- * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock
+ * lock_folio_maybe_drop_mmap - lock the folio, possibly dropping the mmap_lock
* @vmf - the vm_fault for this fault.
- * @page - the page to lock.
+ * @folio - the folio to lock.
* @fpin - the pointer to the file we may pin (or is already pinned).
*
- * This works similar to lock_page_or_retry in that it can drop the mmap_lock.
- * It differs in that it actually returns the page locked if it returns 1 and 0
- * if it couldn't lock the page. If we did have to drop the mmap_lock then fpin
- * will point to the pinned file and needs to be fput()'ed at a later point.
+ * This works similarly to __folio_lock_or_retry() in that it can drop the
+ * mmap_lock. It differs in that it actually returns the folio locked
+ * if it returns 1 and 0 if it couldn't lock the folio. If we did have
+ * to drop the mmap_lock then fpin will point to the pinned file and
+ * needs to be fput()'ed at a later point.
*/
-static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
+static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
struct file **fpin)
{
- if (trylock_page(page))
+ if (folio_trylock(folio))
return 1;
/*
@@ -2539,7 +3118,7 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
if (vmf->flags & FAULT_FLAG_KILLABLE) {
- if (__lock_page_killable(page)) {
+ if (__folio_lock_killable(folio)) {
/*
* We didn't have the right flags to drop the mmap_lock,
* but all fault_handlers only check for fatal signals
@@ -2551,11 +3130,11 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
return 0;
}
} else
- __lock_page(page);
+ __folio_lock(folio);
+
return 1;
}
-
/*
* Synchronous readahead happens when we don't even find a page in the page
* cache at all. We don't want to perform IO under the mmap sem, so if we have
@@ -2568,20 +3147,38 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
struct file *file = vmf->vma->vm_file;
struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
+ DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
struct file *fpin = NULL;
- pgoff_t offset = vmf->pgoff;
+ unsigned long vm_flags = vmf->vma->vm_flags;
unsigned int mmap_miss;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /* Use the readahead code, even if readahead is disabled */
+ if (vm_flags & VM_HUGEPAGE) {
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+ ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
+ ra->size = HPAGE_PMD_NR;
+ /*
+ * Fetch two PMD folios, so we get the chance to actually
+ * readahead, unless we've been told not to.
+ */
+ if (!(vm_flags & VM_RAND_READ))
+ ra->size *= 2;
+ ra->async_size = HPAGE_PMD_NR;
+ page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
+ return fpin;
+ }
+#endif
+
/* If we don't want any read-ahead, don't bother */
- if (vmf->vma->vm_flags & VM_RAND_READ)
+ if (vm_flags & VM_RAND_READ)
return fpin;
if (!ra->ra_pages)
return fpin;
- if (vmf->vma->vm_flags & VM_SEQ_READ) {
+ if (vm_flags & VM_SEQ_READ) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- page_cache_sync_readahead(mapping, ra, file, offset,
- ra->ra_pages);
+ page_cache_sync_ra(&ractl, ra->ra_pages);
return fpin;
}
@@ -2601,10 +3198,11 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
* mmap read-around
*/
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
+ ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
ra->size = ra->ra_pages;
ra->async_size = ra->ra_pages / 4;
- ra_submit(ra, mapping, file);
+ ractl._index = ra->start;
+ page_cache_ra_order(&ractl, ra, 0);
return fpin;
}
@@ -2614,25 +3212,25 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
* was pinned if we have to drop the mmap_lock in order to do IO.
*/
static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
- struct page *page)
+ struct folio *folio)
{
struct file *file = vmf->vma->vm_file;
struct file_ra_state *ra = &file->f_ra;
- struct address_space *mapping = file->f_mapping;
+ DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
struct file *fpin = NULL;
unsigned int mmap_miss;
- pgoff_t offset = vmf->pgoff;
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
return fpin;
+
mmap_miss = READ_ONCE(ra->mmap_miss);
if (mmap_miss)
WRITE_ONCE(ra->mmap_miss, --mmap_miss);
- if (PageReadahead(page)) {
+
+ if (folio_test_readahead(folio)) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- page_cache_async_readahead(mapping, ra, file,
- page, offset, ra->ra_pages);
+ page_cache_async_ra(&ractl, folio, ra->ra_pages);
}
return fpin;
}
@@ -2651,7 +3249,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
* vma->vm_mm->mmap_lock must be held on entry.
*
* If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
- * may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
+ * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap().
*
* If our return value does not have VM_FAULT_RETRY set, the mmap_lock
* has not been released.
@@ -2666,61 +3264,86 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
struct file *file = vmf->vma->vm_file;
struct file *fpin = NULL;
struct address_space *mapping = file->f_mapping;
- struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
- pgoff_t offset = vmf->pgoff;
- pgoff_t max_off;
- struct page *page;
+ pgoff_t max_idx, index = vmf->pgoff;
+ struct folio *folio;
vm_fault_t ret = 0;
+ bool mapping_locked = false;
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= max_off))
+ max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(index >= max_idx))
return VM_FAULT_SIGBUS;
/*
* Do we have something in the page cache already?
*/
- page = find_get_page(mapping, offset);
- if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
+ folio = filemap_get_folio(mapping, index);
+ if (likely(!IS_ERR(folio))) {
/*
- * We found the page, so try async readahead before
- * waiting for the lock.
+ * We found the page, so try async readahead before waiting for
+ * the lock.
*/
- fpin = do_async_mmap_readahead(vmf, page);
- } else if (!page) {
+ if (!(vmf->flags & FAULT_FLAG_TRIED))
+ fpin = do_async_mmap_readahead(vmf, folio);
+ if (unlikely(!folio_test_uptodate(folio))) {
+ filemap_invalidate_lock_shared(mapping);
+ mapping_locked = true;
+ }
+ } else {
/* No page in the page cache at all */
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
fpin = do_sync_mmap_readahead(vmf);
retry_find:
- page = pagecache_get_page(mapping, offset,
+ /*
+		 * See the comment in filemap_create_folio() for why we need
+		 * the invalidate_lock.
+ */
+ if (!mapping_locked) {
+ filemap_invalidate_lock_shared(mapping);
+ mapping_locked = true;
+ }
+ folio = __filemap_get_folio(mapping, index,
FGP_CREAT|FGP_FOR_MMAP,
vmf->gfp_mask);
- if (!page) {
+ if (IS_ERR(folio)) {
if (fpin)
goto out_retry;
+ filemap_invalidate_unlock_shared(mapping);
return VM_FAULT_OOM;
}
}
- if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
+ if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin))
goto out_retry;
/* Did it get truncated? */
- if (unlikely(compound_head(page)->mapping != mapping)) {
- unlock_page(page);
- put_page(page);
+ if (unlikely(folio->mapping != mapping)) {
+ folio_unlock(folio);
+ folio_put(folio);
goto retry_find;
}
- VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
+ VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
/*
	 * We have a locked page in the page cache; now we need to check
* that it's up-to-date. If not, it is going to be due to an error.
*/
- if (unlikely(!PageUptodate(page)))
+ if (unlikely(!folio_test_uptodate(folio))) {
+ /*
+ * The page was in cache and uptodate and now it is not.
+ * Strange but possible since we didn't hold the page lock all
+		 * the time. Let's drop everything, get the invalidate lock and
+ * try again.
+ */
+ if (!mapping_locked) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto retry_find;
+ }
goto page_not_uptodate;
+ }
/*
* We've made it this far and we had to drop our mmap_lock, now is the
@@ -2728,22 +3351,24 @@ retry_find:
* redo the fault.
*/
if (fpin) {
- unlock_page(page);
+ folio_unlock(folio);
goto out_retry;
}
+ if (mapping_locked)
+ filemap_invalidate_unlock_shared(mapping);
/*
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= max_off)) {
- unlock_page(page);
- put_page(page);
+ max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(index >= max_idx)) {
+ folio_unlock(folio);
+ folio_put(folio);
return VM_FAULT_SIGBUS;
}
- vmf->page = page;
+ vmf->page = folio_file_page(folio, index);
return ret | VM_FAULT_LOCKED;
page_not_uptodate:
@@ -2753,22 +3378,16 @@ page_not_uptodate:
* because there really aren't any performance issues here
* and we need to check for errors.
*/
- ClearPageError(page);
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- error = mapping->a_ops->readpage(file, page);
- if (!error) {
- wait_on_page_locked(page);
- if (!PageUptodate(page))
- error = -EIO;
- }
+ error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
if (fpin)
goto out_retry;
- put_page(page);
+ folio_put(folio);
if (!error || error == AOP_TRUNCATED_PAGE)
goto retry_find;
+ filemap_invalidate_unlock_shared(mapping);
- shrink_readahead_size_eio(ra);
return VM_FAULT_SIGBUS;
out_retry:
@@ -2777,108 +3396,206 @@ out_retry:
* re-find the vma and come back and find our hopefully still populated
* page.
*/
- if (page)
- put_page(page);
+ if (!IS_ERR(folio))
+ folio_put(folio);
+ if (mapping_locked)
+ filemap_invalidate_unlock_shared(mapping);
if (fpin)
fput(fpin);
return ret | VM_FAULT_RETRY;
}
EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct vm_fault *vmf,
- pgoff_t start_pgoff, pgoff_t end_pgoff)
+static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
+ pgoff_t start)
{
- struct file *file = vmf->vma->vm_file;
- struct address_space *mapping = file->f_mapping;
- pgoff_t last_pgoff = start_pgoff;
- unsigned long max_idx;
- XA_STATE(xas, &mapping->i_pages, start_pgoff);
- struct page *head, *page;
- unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
+ struct mm_struct *mm = vmf->vma->vm_mm;
- rcu_read_lock();
- xas_for_each(&xas, head, end_pgoff) {
- if (xas_retry(&xas, head))
- continue;
- if (xa_is_value(head))
- goto next;
+ /* Huge page is mapped? No need to proceed. */
+ if (pmd_trans_huge(*vmf->pmd)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return true;
+ }
- /*
- * Check for a locked page first, as a speculative
- * reference may adversely influence page migration.
- */
- if (PageLocked(head))
- goto next;
- if (!page_cache_get_speculative(head))
- goto next;
+ if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {
+ struct page *page = folio_file_page(folio, start);
+ vm_fault_t ret = do_set_pmd(vmf, page);
+ if (!ret) {
+ /* The page is mapped successfully, reference consumed. */
+ folio_unlock(folio);
+ return true;
+ }
+ }
+
+ if (pmd_none(*vmf->pmd))
+ pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
+
+ return false;
+}
+
+static struct folio *next_uptodate_page(struct folio *folio,
+ struct address_space *mapping,
+ struct xa_state *xas, pgoff_t end_pgoff)
+{
+ unsigned long max_idx;
+ do {
+ if (!folio)
+ return NULL;
+ if (xas_retry(xas, folio))
+ continue;
+ if (xa_is_value(folio))
+ continue;
+ if (folio_test_locked(folio))
+ continue;
+ if (!folio_try_get_rcu(folio))
+ continue;
/* Has the page moved or been split? */
- if (unlikely(head != xas_reload(&xas)))
+ if (unlikely(folio != xas_reload(xas)))
goto skip;
- page = find_subpage(head, xas.xa_index);
-
- if (!PageUptodate(head) ||
- PageReadahead(page) ||
- PageHWPoison(page))
+ if (!folio_test_uptodate(folio) || folio_test_readahead(folio))
goto skip;
- if (!trylock_page(head))
+ if (!folio_trylock(folio))
goto skip;
-
- if (head->mapping != mapping || !PageUptodate(head))
+ if (folio->mapping != mapping)
+ goto unlock;
+ if (!folio_test_uptodate(folio))
goto unlock;
-
max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
- if (xas.xa_index >= max_idx)
+ if (xas->xa_index >= max_idx)
+ goto unlock;
+ return folio;
+unlock:
+ folio_unlock(folio);
+skip:
+ folio_put(folio);
+ } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL);
+
+ return NULL;
+}
+
+static inline struct folio *first_map_page(struct address_space *mapping,
+ struct xa_state *xas,
+ pgoff_t end_pgoff)
+{
+ return next_uptodate_page(xas_find(xas, end_pgoff),
+ mapping, xas, end_pgoff);
+}
+
+static inline struct folio *next_map_page(struct address_space *mapping,
+ struct xa_state *xas,
+ pgoff_t end_pgoff)
+{
+ return next_uptodate_page(xas_next_entry(xas, end_pgoff),
+ mapping, xas, end_pgoff);
+}
+
+vm_fault_t filemap_map_pages(struct vm_fault *vmf,
+ pgoff_t start_pgoff, pgoff_t end_pgoff)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ pgoff_t last_pgoff = start_pgoff;
+ unsigned long addr;
+ XA_STATE(xas, &mapping->i_pages, start_pgoff);
+ struct folio *folio;
+ struct page *page;
+ unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
+ vm_fault_t ret = 0;
+
+ rcu_read_lock();
+ folio = first_map_page(mapping, &xas, end_pgoff);
+ if (!folio)
+ goto out;
+
+ if (filemap_map_pmd(vmf, folio, start_pgoff)) {
+ ret = VM_FAULT_NOPAGE;
+ goto out;
+ }
+
+ addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
+ if (!vmf->pte) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto out;
+ }
+ do {
+again:
+ page = folio_file_page(folio, xas.xa_index);
+ if (PageHWPoison(page))
goto unlock;
if (mmap_miss > 0)
mmap_miss--;
- vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
- if (vmf->pte)
- vmf->pte += xas.xa_index - last_pgoff;
+ addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
+ vmf->pte += xas.xa_index - last_pgoff;
last_pgoff = xas.xa_index;
- if (alloc_set_pte(vmf, page))
+
+ /*
+		 * NOTE: If there are PTE markers, we'll leave them to be
+		 * handled in the specific fault path; their presence prohibits
+		 * the fault-around logic.
+ */
+ if (!pte_none(ptep_get(vmf->pte)))
goto unlock;
- unlock_page(head);
- goto next;
+
+ /* We're about to handle the fault */
+ if (vmf->address == addr)
+ ret = VM_FAULT_NOPAGE;
+
+ do_set_pte(vmf, page, addr);
+ /* no need to invalidate: a not-present page won't be cached */
+ update_mmu_cache(vma, addr, vmf->pte);
+ if (folio_more_pages(folio, xas.xa_index, end_pgoff)) {
+ xas.xa_index++;
+ folio_ref_inc(folio);
+ goto again;
+ }
+ folio_unlock(folio);
+ continue;
unlock:
- unlock_page(head);
-skip:
- put_page(head);
-next:
- /* Huge page is mapped? No need to proceed. */
- if (pmd_trans_huge(*vmf->pmd))
- break;
- }
+ if (folio_more_pages(folio, xas.xa_index, end_pgoff)) {
+ xas.xa_index++;
+ goto again;
+ }
+ folio_unlock(folio);
+ folio_put(folio);
+ } while ((folio = next_map_page(mapping, &xas, end_pgoff)) != NULL);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+out:
rcu_read_unlock();
WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
+ return ret;
}
EXPORT_SYMBOL(filemap_map_pages);
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
- struct page *page = vmf->page;
- struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ struct folio *folio = page_folio(vmf->page);
vm_fault_t ret = VM_FAULT_LOCKED;
- sb_start_pagefault(inode->i_sb);
+ sb_start_pagefault(mapping->host->i_sb);
file_update_time(vmf->vma->vm_file);
- lock_page(page);
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
+ folio_lock(folio);
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
ret = VM_FAULT_NOPAGE;
goto out;
}
/*
- * We mark the page dirty already here so that when freeze is in
+ * We mark the folio dirty already here so that when freeze is in
* progress, we are guaranteed that writeback during freezing will
- * see the dirty page and writeprotect it again.
+ * see the dirty folio and writeprotect it again.
*/
- set_page_dirty(page);
- wait_for_stable_page(page);
+ folio_mark_dirty(folio);
+ folio_wait_stable(folio);
out:
- sb_end_pagefault(inode->i_sb);
+ sb_end_pagefault(mapping->host->i_sb);
return ret;
}
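
For context, a minimal sketch (not part of this diff) of how a filesystem wires these generic handlers into its mmap path; generic_file_vm_ops below is the in-tree instance of this wiring, and the my_fs_* names here are hypothetical.

#include <linux/fs.h>
#include <linux/mm.h>

/* Hypothetical example: reuse the generic fault handlers for file mmaps. */
static const struct vm_operations_struct my_fs_file_vm_ops = {
	.fault		= filemap_fault,	/* major-fault path above */
	.map_pages	= filemap_map_pages,	/* fault-around; now returns vm_fault_t */
	.page_mkwrite	= filemap_page_mkwrite,	/* or a filesystem-specific hook */
};

static int my_fs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	vma->vm_ops = &my_fs_file_vm_ops;
	return 0;
}

A real ->mmap would also check that the address_space implements ->read_folio, as generic_file_mmap() does below.
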
@@ -2890,11 +3607,11 @@ const struct vm_operations_struct generic_file_vm_ops = {
/* This is used for a general mmap of a disk file */
-int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct address_space *mapping = file->f_mapping;
- if (!mapping->a_ops->readpage)
+ if (!mapping->a_ops->read_folio)
return -ENOEXEC;
file_accessed(file);
vma->vm_ops = &generic_file_vm_ops;
@@ -2915,11 +3632,11 @@ vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
return VM_FAULT_SIGBUS;
}
-int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
return -ENOSYS;
}
-int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
+int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
return -ENOSYS;
}
@@ -2929,145 +3646,129 @@ EXPORT_SYMBOL(filemap_page_mkwrite);
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);
-static struct page *wait_on_page_read(struct page *page)
-{
- if (!IS_ERR(page)) {
- wait_on_page_locked(page);
- if (!PageUptodate(page)) {
- put_page(page);
- page = ERR_PTR(-EIO);
- }
- }
- return page;
-}
-
-static struct page *do_read_cache_page(struct address_space *mapping,
- pgoff_t index,
- int (*filler)(void *, struct page *),
- void *data,
- gfp_t gfp)
+static struct folio *do_read_cache_folio(struct address_space *mapping,
+ pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)
{
- struct page *page;
+ struct folio *folio;
int err;
+
+ if (!filler)
+ filler = mapping->a_ops->read_folio;
repeat:
- page = find_get_page(mapping, index);
- if (!page) {
- page = __page_cache_alloc(gfp);
- if (!page)
+ folio = filemap_get_folio(mapping, index);
+ if (IS_ERR(folio)) {
+ folio = filemap_alloc_folio(gfp, 0);
+ if (!folio)
return ERR_PTR(-ENOMEM);
- err = add_to_page_cache_lru(page, mapping, index, gfp);
+ err = filemap_add_folio(mapping, folio, index, gfp);
if (unlikely(err)) {
- put_page(page);
+ folio_put(folio);
if (err == -EEXIST)
goto repeat;
/* Presumably ENOMEM for xarray node */
return ERR_PTR(err);
}
-filler:
- if (filler)
- err = filler(data, page);
- else
- err = mapping->a_ops->readpage(data, page);
-
- if (err < 0) {
- put_page(page);
- return ERR_PTR(err);
- }
-
- page = wait_on_page_read(page);
- if (IS_ERR(page))
- return page;
- goto out;
+ goto filler;
}
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
goto out;
- /*
- * Page is not up to date and may be locked due one of the following
- * case a: Page is being filled and the page lock is held
- * case b: Read/write error clearing the page uptodate status
- * case c: Truncation in progress (page locked)
- * case d: Reclaim in progress
- *
- * Case a, the page will be up to date when the page is unlocked.
- * There is no need to serialise on the page lock here as the page
- * is pinned so the lock gives no additional protection. Even if the
- * page is truncated, the data is still valid if PageUptodate as
- * it's a race vs truncate race.
- * Case b, the page will not be up to date
- * Case c, the page may be truncated but in itself, the data may still
- * be valid after IO completes as it's a read vs truncate race. The
- * operation must restart if the page is not uptodate on unlock but
- * otherwise serialising on page lock to stabilise the mapping gives
- * no additional guarantees to the caller as the page lock is
- * released before return.
- * Case d, similar to truncation. If reclaim holds the page lock, it
- * will be a race with remove_mapping that determines if the mapping
- * is valid on unlock but otherwise the data is valid and there is
- * no need to serialise with page lock.
- *
- * As the page lock gives no additional guarantee, we optimistically
- * wait on the page to be unlocked and check if it's up to date and
- * use the page if it is. Otherwise, the page lock is required to
- * distinguish between the different cases. The motivation is that we
- * avoid spurious serialisations and wakeups when multiple processes
- * wait on the same page for IO to complete.
- */
- wait_on_page_locked(page);
- if (PageUptodate(page))
- goto out;
-
- /* Distinguish between all the cases under the safety of the lock */
- lock_page(page);
+ if (!folio_trylock(folio)) {
+ folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
+ goto repeat;
+ }
- /* Case c or d, restart the operation */
- if (!page->mapping) {
- unlock_page(page);
- put_page(page);
+ /* Folio was truncated from mapping */
+ if (!folio->mapping) {
+ folio_unlock(folio);
+ folio_put(folio);
goto repeat;
}
/* Someone else locked and filled the page in a very small window */
- if (PageUptodate(page)) {
- unlock_page(page);
+ if (folio_test_uptodate(folio)) {
+ folio_unlock(folio);
goto out;
}
- /*
- * A previous I/O error may have been due to temporary
- * failures.
- * Clear page error before actual read, PG_error will be
- * set again if read page fails.
- */
- ClearPageError(page);
- goto filler;
+filler:
+ err = filemap_read_folio(file, filler, folio);
+ if (err) {
+ folio_put(folio);
+ if (err == AOP_TRUNCATED_PAGE)
+ goto repeat;
+ return ERR_PTR(err);
+ }
out:
- mark_page_accessed(page);
- return page;
+ folio_mark_accessed(folio);
+ return folio;
}
/**
- * read_cache_page - read into page cache, fill it if needed
- * @mapping: the page's address_space
- * @index: the page index
- * @filler: function to perform the read
- * @data: first arg to filler(data, page) function, often left as NULL
+ * read_cache_folio - Read into page cache, fill it if needed.
+ * @mapping: The address_space to read from.
+ * @index: The index to read.
+ * @filler: Function to perform the read, or NULL to use aops->read_folio().
+ * @file: Passed to filler function, may be NULL if not required.
*
- * Read into the page cache. If a page already exists, and PageUptodate() is
- * not set, try to fill the page and wait for it to become unlocked.
+ * Read one page into the page cache. If it succeeds, the folio returned
+ * will contain @index, but it may not be the first page of the folio.
*
- * If the page does not get brought uptodate, return -EIO.
+ * If the filler function returns an error, it will be returned to the
+ * caller.
*
- * Return: up to date page on success, ERR_PTR() on failure.
+ * Context: May sleep. Expects mapping->invalidate_lock to be held.
+ * Return: An uptodate folio on success, ERR_PTR() on failure.
*/
+struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index,
+ filler_t filler, struct file *file)
+{
+ return do_read_cache_folio(mapping, index, filler, file,
+ mapping_gfp_mask(mapping));
+}
+EXPORT_SYMBOL(read_cache_folio);
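
As a rough usage sketch (not part of this diff), a caller that wants a single uptodate folio via the default ->read_folio might look like this; my_fs_get_folio() is a hypothetical helper, and the shared invalidate_lock is taken only because the kernel-doc above expects it to be held.

#include <linux/fs.h>
#include <linux/pagemap.h>

static struct folio *my_fs_get_folio(struct address_space *mapping,
				     pgoff_t index)
{
	struct folio *folio;

	filemap_invalidate_lock_shared(mapping);
	/* A NULL filler means "use mapping->a_ops->read_folio". */
	folio = read_cache_folio(mapping, index, NULL, NULL);
	filemap_invalidate_unlock_shared(mapping);

	/* Uptodate folio or ERR_PTR(); the caller must folio_put() it. */
	return folio;
}

mapping_read_folio_gfp() below is the same operation with caller-supplied allocation flags, e.g. GFP_NOFS when filesystem locks are already held.
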
+
+/**
+ * mapping_read_folio_gfp - Read into page cache, using specified allocation flags.
+ * @mapping: The address_space for the folio.
+ * @index: The index that the allocated folio will contain.
+ * @gfp: The page allocator flags to use if allocating.
+ *
+ * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with
+ * any new memory allocations done using the specified allocation flags.
+ *
+ * The most likely error from this function is EIO, but ENOMEM is
+ * possible and so is EINTR. If ->read_folio returns another error,
+ * that will be returned to the caller.
+ *
+ * The function expects mapping->invalidate_lock to be already held.
+ *
+ * Return: Uptodate folio on success, ERR_PTR() on failure.
+ */
+struct folio *mapping_read_folio_gfp(struct address_space *mapping,
+ pgoff_t index, gfp_t gfp)
+{
+ return do_read_cache_folio(mapping, index, NULL, NULL, gfp);
+}
+EXPORT_SYMBOL(mapping_read_folio_gfp);
+
+static struct page *do_read_cache_page(struct address_space *mapping,
+ pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp)
+{
+ struct folio *folio;
+
+ folio = do_read_cache_folio(mapping, index, filler, file, gfp);
+ if (IS_ERR(folio))
+ return &folio->page;
+ return folio_file_page(folio, index);
+}
+
struct page *read_cache_page(struct address_space *mapping,
- pgoff_t index,
- int (*filler)(void *, struct page *),
- void *data)
+ pgoff_t index, filler_t *filler, struct file *file)
{
- return do_read_cache_page(mapping, index, filler, data,
+ return do_read_cache_page(mapping, index, filler, file,
mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_page);
@@ -3083,6 +3784,8 @@ EXPORT_SYMBOL(read_cache_page);
*
* If the page does not get brought uptodate, return -EIO.
*
+ * The function expects mapping->invalidate_lock to be already held.
+ *
* Return: up to date page on success, ERR_PTR() on failure.
*/
struct page *read_cache_page_gfp(struct address_space *mapping,
@@ -3094,259 +3797,15 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
EXPORT_SYMBOL(read_cache_page_gfp);
/*
- * Don't operate on ranges the page cache doesn't support, and don't exceed the
- * LFS limits. If pos is under the limit it becomes a short access. If it
- * exceeds the limit we return -EFBIG.
- */
-static int generic_write_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- struct inode *inode = file->f_mapping->host;
- loff_t max_size = inode->i_sb->s_maxbytes;
- loff_t limit = rlimit(RLIMIT_FSIZE);
-
- if (limit != RLIM_INFINITY) {
- if (pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- *count = min(*count, limit - pos);
- }
-
- if (!(file->f_flags & O_LARGEFILE))
- max_size = MAX_NON_LFS;
-
- if (unlikely(pos >= max_size))
- return -EFBIG;
-
- *count = min(*count, max_size - pos);
-
- return 0;
-}
-
-/*
- * Performs necessary checks before doing a write
- *
- * Can adjust writing position or amount of bytes to write.
- * Returns appropriate error code that caller should return or
- * zero in case that write should be allowed.
- */
-inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- loff_t count;
- int ret;
-
- if (IS_SWAPFILE(inode))
- return -ETXTBSY;
-
- if (!iov_iter_count(from))
- return 0;
-
- /* FIXME: this is for backwards compatibility with 2.4 */
- if (iocb->ki_flags & IOCB_APPEND)
- iocb->ki_pos = i_size_read(inode);
-
- if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
- return -EINVAL;
-
- count = iov_iter_count(from);
- ret = generic_write_check_limits(file, iocb->ki_pos, &count);
- if (ret)
- return ret;
-
- iov_iter_truncate(from, count);
- return iov_iter_count(from);
-}
-EXPORT_SYMBOL(generic_write_checks);
-
-/*
- * Performs necessary checks before doing a clone.
- *
- * Can adjust amount of bytes to clone via @req_count argument.
- * Returns appropriate error code that caller should return or
- * zero in case the clone should be allowed.
- */
-int generic_remap_checks(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *req_count, unsigned int remap_flags)
-{
- struct inode *inode_in = file_in->f_mapping->host;
- struct inode *inode_out = file_out->f_mapping->host;
- uint64_t count = *req_count;
- uint64_t bcount;
- loff_t size_in, size_out;
- loff_t bs = inode_out->i_sb->s_blocksize;
- int ret;
-
- /* The start of both ranges must be aligned to an fs block. */
- if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
- return -EINVAL;
-
- /* Ensure offsets don't wrap. */
- if (pos_in + count < pos_in || pos_out + count < pos_out)
- return -EINVAL;
-
- size_in = i_size_read(inode_in);
- size_out = i_size_read(inode_out);
-
- /* Dedupe requires both ranges to be within EOF. */
- if ((remap_flags & REMAP_FILE_DEDUP) &&
- (pos_in >= size_in || pos_in + count > size_in ||
- pos_out >= size_out || pos_out + count > size_out))
- return -EINVAL;
-
- /* Ensure the infile range is within the infile. */
- if (pos_in >= size_in)
- return -EINVAL;
- count = min(count, size_in - (uint64_t)pos_in);
-
- ret = generic_write_check_limits(file_out, pos_out, &count);
- if (ret)
- return ret;
-
- /*
- * If the user wanted us to link to the infile's EOF, round up to the
- * next block boundary for this check.
- *
- * Otherwise, make sure the count is also block-aligned, having
- * already confirmed the starting offsets' block alignment.
- */
- if (pos_in + count == size_in) {
- bcount = ALIGN(size_in, bs) - pos_in;
- } else {
- if (!IS_ALIGNED(count, bs))
- count = ALIGN_DOWN(count, bs);
- bcount = count;
- }
-
- /* Don't allow overlapped cloning within the same file. */
- if (inode_in == inode_out &&
- pos_out + bcount > pos_in &&
- pos_out < pos_in + bcount)
- return -EINVAL;
-
- /*
- * We shortened the request but the caller can't deal with that, so
- * bounce the request back to userspace.
- */
- if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
- return -EINVAL;
-
- *req_count = count;
- return 0;
-}
-
-
-/*
- * Performs common checks before doing a file copy/clone
- * from @file_in to @file_out.
- */
-int generic_file_rw_checks(struct file *file_in, struct file *file_out)
-{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
-
- /* Don't copy dirs, pipes, sockets... */
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- return -EISDIR;
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- return -EINVAL;
-
- if (!(file_in->f_mode & FMODE_READ) ||
- !(file_out->f_mode & FMODE_WRITE) ||
- (file_out->f_flags & O_APPEND))
- return -EBADF;
-
- return 0;
-}
-
-/*
- * Performs necessary checks before doing a file copy
- *
- * Can adjust amount of bytes to copy via @req_count argument.
- * Returns appropriate error code that caller should return or
- * zero in case the copy should be allowed.
- */
-int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- size_t *req_count, unsigned int flags)
-{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- uint64_t count = *req_count;
- loff_t size_in;
- int ret;
-
- ret = generic_file_rw_checks(file_in, file_out);
- if (ret)
- return ret;
-
- /* Don't touch certain kinds of inodes */
- if (IS_IMMUTABLE(inode_out))
- return -EPERM;
-
- if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
- return -ETXTBSY;
-
- /* Ensure offsets don't wrap. */
- if (pos_in + count < pos_in || pos_out + count < pos_out)
- return -EOVERFLOW;
-
- /* Shorten the copy to EOF */
- size_in = i_size_read(inode_in);
- if (pos_in >= size_in)
- count = 0;
- else
- count = min(count, size_in - (uint64_t)pos_in);
-
- ret = generic_write_check_limits(file_out, pos_out, &count);
- if (ret)
- return ret;
-
- /* Don't allow overlapped copying within the same file. */
- if (inode_in == inode_out &&
- pos_out + count > pos_in &&
- pos_out < pos_in + count)
- return -EINVAL;
-
- *req_count = count;
- return 0;
-}
-
-int pagecache_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
-{
- const struct address_space_operations *aops = mapping->a_ops;
-
- return aops->write_begin(file, mapping, pos, len, flags,
- pagep, fsdata);
-}
-EXPORT_SYMBOL(pagecache_write_begin);
-
-int pagecache_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- const struct address_space_operations *aops = mapping->a_ops;
-
- return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
-}
-EXPORT_SYMBOL(pagecache_write_end);
-
-/*
* Warn about a page cache invalidation failure during a direct I/O write.
*/
-void dio_warn_stale_pagecache(struct file *filp)
+static void dio_warn_stale_pagecache(struct file *filp)
{
static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
char pathname[128];
- struct inode *inode = file_inode(filp);
char *path;
- errseq_set(&inode->i_mapping->wb_err, -EIO);
+ errseq_set(&filp->f_mapping->wb_err, -EIO);
if (__ratelimit(&_rs)) {
path = file_path(filp, pathname, sizeof(pathname));
if (IS_ERR(path))
@@ -3357,48 +3816,33 @@ void dio_warn_stale_pagecache(struct file *filp)
}
}
-ssize_t
-generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
+void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- loff_t pos = iocb->ki_pos;
- ssize_t written;
- size_t write_len;
- pgoff_t end;
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
- write_len = iov_iter_count(from);
- end = (pos + write_len - 1) >> PAGE_SHIFT;
+ if (mapping->nrpages &&
+ invalidate_inode_pages2_range(mapping,
+ iocb->ki_pos >> PAGE_SHIFT,
+ (iocb->ki_pos + count - 1) >> PAGE_SHIFT))
+ dio_warn_stale_pagecache(iocb->ki_filp);
+}
- if (iocb->ki_flags & IOCB_NOWAIT) {
- /* If there are pages to writeback, return */
- if (filemap_range_has_page(inode->i_mapping, pos,
- pos + write_len - 1))
- return -EAGAIN;
- } else {
- written = filemap_write_and_wait_range(mapping, pos,
- pos + write_len - 1);
- if (written)
- goto out;
- }
+ssize_t
+generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ size_t write_len = iov_iter_count(from);
+ ssize_t written;
/*
- * After a write we want buffered reads to be sure to go to disk to get
- * the new data. We invalidate clean cached page from the region we're
- * about to write. We do this *before* the write so that we can return
- * without clobbering -EIOCBQUEUED from ->direct_IO().
- */
- written = invalidate_inode_pages2_range(mapping,
- pos >> PAGE_SHIFT, end);
- /*
	 * If a page cannot be invalidated, return 0 to fall back
* to buffered write.
*/
+ written = kiocb_invalidate_pages(iocb, write_len);
if (written) {
if (written == -EBUSY)
return 0;
- goto out;
+ return written;
}
written = mapping->a_ops->direct_IO(iocb, from);
@@ -3420,11 +3864,11 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
*
* Skip invalidation for async writes or if mapping has no pages.
*/
- if (written > 0 && mapping->nrpages &&
- invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end))
- dio_warn_stale_pagecache(file);
-
if (written > 0) {
+ struct inode *inode = mapping->host;
+ loff_t pos = iocb->ki_pos;
+
+ kiocb_invalidate_post_direct_write(iocb, written);
pos += written;
write_len -= written;
if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
@@ -3433,49 +3877,27 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
}
iocb->ki_pos = pos;
}
- iov_iter_revert(from, write_len - iov_iter_count(from));
-out:
+ if (written != -EIOCBQUEUED)
+ iov_iter_revert(from, write_len - iov_iter_count(from));
return written;
}
EXPORT_SYMBOL(generic_file_direct_write);
-/*
- * Find or create a page at the given pagecache position. Return the locked
- * page. This function is specifically for buffered writes.
- */
-struct page *grab_cache_page_write_begin(struct address_space *mapping,
- pgoff_t index, unsigned flags)
-{
- struct page *page;
- int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
-
- if (flags & AOP_FLAG_NOFS)
- fgp_flags |= FGP_NOFS;
-
- page = pagecache_get_page(mapping, index, fgp_flags,
- mapping_gfp_mask(mapping));
- if (page)
- wait_for_stable_page(page);
-
- return page;
-}
-EXPORT_SYMBOL(grab_cache_page_write_begin);
-
-ssize_t generic_perform_write(struct file *file,
- struct iov_iter *i, loff_t pos)
+ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
{
+ struct file *file = iocb->ki_filp;
+ loff_t pos = iocb->ki_pos;
struct address_space *mapping = file->f_mapping;
const struct address_space_operations *a_ops = mapping->a_ops;
long status = 0;
ssize_t written = 0;
- unsigned int flags = 0;
do {
struct page *page;
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
size_t copied; /* Bytes copied from user */
- void *fsdata;
+ void *fsdata = NULL;
offset = (pos & (PAGE_SIZE - 1));
bytes = min_t(unsigned long, PAGE_SIZE - offset,
@@ -3487,12 +3909,8 @@ again:
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
- *
- * Not only is this an optimisation, but it is also required
- * to check that the address is actually valid, when atomic
- * usercopies are used, below.
*/
- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
status = -EFAULT;
break;
}
@@ -3502,7 +3920,7 @@ again:
break;
}
- status = a_ops->write_begin(file, mapping, pos, bytes, flags,
+ status = a_ops->write_begin(file, mapping, pos, bytes,
&page, &fsdata);
if (unlikely(status < 0))
break;
@@ -3510,38 +3928,39 @@ again:
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
- copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+ copied = copy_page_from_iter_atomic(page, offset, bytes, i);
flush_dcache_page(page);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
page, fsdata);
- if (unlikely(status < 0))
- break;
- copied = status;
-
+ if (unlikely(status != copied)) {
+ iov_iter_revert(i, copied - max(status, 0L));
+ if (unlikely(status < 0))
+ break;
+ }
cond_resched();
- iov_iter_advance(i, copied);
- if (unlikely(copied == 0)) {
+ if (unlikely(status == 0)) {
/*
- * If we were unable to copy any data at all, we must
- * fall back to a single segment length write.
- *
- * If we didn't fallback here, we could livelock
- * because not all segments in the iov can be copied at
- * once without a pagefault.
+ * A short copy made ->write_end() reject the
+ * thing entirely. Might be memory poisoning
+ * halfway through, might be a race with munmap,
+ * might be severe memory pressure.
*/
- bytes = min_t(unsigned long, PAGE_SIZE - offset,
- iov_iter_single_seg_count(i));
+ if (copied)
+ bytes = copied;
goto again;
}
- pos += copied;
- written += copied;
+ pos += status;
+ written += status;
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(i));
- return written ? written : status;
+ if (!written)
+ return status;
+ iocb->ki_pos += written;
+ return written;
}
EXPORT_SYMBOL(generic_perform_write);
@@ -3555,12 +3974,12 @@ EXPORT_SYMBOL(generic_perform_write);
* modification times and calls proper subroutines depending on whether we
* do direct IO or a standard buffered write.
*
- * It expects i_mutex to be grabbed unless we work on a block device or similar
+ * It expects i_rwsem to be grabbed unless we work on a block device or similar
* object which does not need locking at all.
*
* This function does *not* take care of syncing data in case of O_SYNC write.
* A caller has to handle it. This is mainly due to the fact that we want to
- * avoid syncing under i_mutex.
+ * avoid syncing under i_rwsem.
*
* Return:
* * number of bytes written, even for truncated writes
@@ -3569,26 +3988,20 @@ EXPORT_SYMBOL(generic_perform_write);
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct address_space * mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- ssize_t written = 0;
- ssize_t err;
- ssize_t status;
-
- /* We can write back this queue in page reclaim */
- current->backing_dev_info = inode_to_bdi(inode);
- err = file_remove_privs(file);
- if (err)
- goto out;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ ssize_t ret;
- err = file_update_time(file);
- if (err)
- goto out;
+ ret = file_remove_privs(file);
+ if (ret)
+ return ret;
- if (iocb->ki_flags & IOCB_DIRECT) {
- loff_t pos, endbyte;
+ ret = file_update_time(file);
+ if (ret)
+ return ret;
- written = generic_file_direct_write(iocb, from);
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = generic_file_direct_write(iocb, from);
/*
* If the write stopped short of completing, fall back to
* buffered writes. Some filesystems do this for writes to
@@ -3596,48 +4009,13 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
* not succeed (even if it did, DAX does not handle dirty
* page-cache pages correctly).
*/
- if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
- goto out;
-
- status = generic_perform_write(file, from, pos = iocb->ki_pos);
- /*
- * If generic_perform_write() returned a synchronous error
- * then we want to return the number of bytes which were
- * direct-written, or the error code if that was zero. Note
- * that this differs from normal direct-io semantics, which
- * will return -EFOO even if some bytes were written.
- */
- if (unlikely(status < 0)) {
- err = status;
- goto out;
- }
- /*
- * We need to ensure that the page cache pages are written to
- * disk and invalidated to preserve the expected O_DIRECT
- * semantics.
- */
- endbyte = pos + status - 1;
- err = filemap_write_and_wait_range(mapping, pos, endbyte);
- if (err == 0) {
- iocb->ki_pos = endbyte + 1;
- written += status;
- invalidate_mapping_pages(mapping,
- pos >> PAGE_SHIFT,
- endbyte >> PAGE_SHIFT);
- } else {
- /*
- * We don't know how much we wrote, so just return
- * the number of bytes which were direct-written
- */
- }
- } else {
- written = generic_perform_write(file, from, iocb->ki_pos);
- if (likely(written > 0))
- iocb->ki_pos += written;
+ if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode))
+ return ret;
+ return direct_write_fallback(iocb, from, ret,
+ generic_perform_write(iocb, from));
}
-out:
- current->backing_dev_info = NULL;
- return written ? written : err;
+
+ return generic_perform_write(iocb, from);
}
EXPORT_SYMBOL(__generic_file_write_iter);
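
As an illustrative sketch (not part of this diff), a filesystem ->write_iter built on the helper above typically looks like the following; it mirrors generic_file_write_iter() documented below, with the usual generic_write_checks() call omitted for brevity, and my_fs_file_write_iter() is a hypothetical name.

#include <linux/fs.h>
#include <linux/uio.h>

static ssize_t my_fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock(inode);		/* i_rwsem, as the helper expects */
	ret = __generic_file_write_iter(iocb, from);
	inode_unlock(inode);

	if (ret > 0)			/* O_SYNC handling is the caller's job */
		ret = generic_write_sync(iocb, ret);
	return ret;
}
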
@@ -3648,7 +4026,7 @@ EXPORT_SYMBOL(__generic_file_write_iter);
*
* This is a wrapper around __generic_file_write_iter() to be used by most
* filesystems. It takes care of syncing the file in case of O_SYNC file
- * and acquires i_mutex as needed.
+ * and acquires i_rwsem as needed.
* Return:
* * negative error code if no data has been written at all of
* vfs_fsync_range() failed for a synchronous write
@@ -3673,33 +4051,200 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
EXPORT_SYMBOL(generic_file_write_iter);
/**
- * try_to_release_page() - release old fs-specific metadata on a page
- *
- * @page: the page which the kernel is trying to free
- * @gfp_mask: memory allocation flags (and I/O mode)
+ * filemap_release_folio() - Release fs-specific metadata on a folio.
+ * @folio: The folio which the kernel is trying to free.
+ * @gfp: Memory allocation flags (and I/O mode).
*
- * The address_space is to try to release any data against the page
- * (presumably at page->private).
+ * The address_space is trying to release any data attached to a folio
+ * (presumably at folio->private).
*
- * This may also be called if PG_fscache is set on a page, indicating that the
- * page is known to the local caching routines.
+ * This will also be called if the private_2 flag is set on the folio,
+ * indicating that it has other metadata associated with it.
*
- * The @gfp_mask argument specifies whether I/O may be performed to release
- * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
+ * The @gfp argument specifies whether I/O may be performed to release
+ * this folio (__GFP_IO), and whether the call may block
+ * (__GFP_RECLAIM & __GFP_FS).
*
- * Return: %1 if the release was successful, otherwise return zero.
+ * Return: %true if the release was successful, otherwise %false.
*/
-int try_to_release_page(struct page *page, gfp_t gfp_mask)
+bool filemap_release_folio(struct folio *folio, gfp_t gfp)
{
- struct address_space * const mapping = page->mapping;
+ struct address_space * const mapping = folio->mapping;
- BUG_ON(!PageLocked(page));
- if (PageWriteback(page))
- return 0;
+ BUG_ON(!folio_test_locked(folio));
+ if (folio_test_writeback(folio))
+ return false;
+
+ if (mapping && mapping->a_ops->release_folio)
+ return mapping->a_ops->release_folio(folio, gfp);
+ return try_to_free_buffers(folio);
+}
+EXPORT_SYMBOL(filemap_release_folio);
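
For illustration (not part of this diff), the simplest ->release_folio an address_space can provide merely reports whether private data is still attached; my_fs_release_folio() is a hypothetical name.

#include <linux/pagemap.h>

static bool my_fs_release_folio(struct folio *folio, gfp_t gfp)
{
	/* Nothing else hangs off folio->private, so it is freeable unless set. */
	return !folio_test_private(folio);
}

Filesystems that attach buffer heads instead rely on the try_to_free_buffers() fallback above.
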
+
+#ifdef CONFIG_CACHESTAT_SYSCALL
+/**
+ * filemap_cachestat() - compute the page cache statistics of a mapping
+ * @mapping: The mapping to compute the statistics for.
+ * @first_index: The starting page cache index.
+ * @last_index: The final page index (inclusive).
+ * @cs: the cachestat struct to write the result to.
+ *
+ * This will query the page cache statistics of a mapping in the
+ * page range of [first_index, last_index] (inclusive). The statistics
+ * queried include: number of dirty pages, number of pages marked for
+ * writeback, and the number of (recently) evicted pages.
+ */
+static void filemap_cachestat(struct address_space *mapping,
+ pgoff_t first_index, pgoff_t last_index, struct cachestat *cs)
+{
+ XA_STATE(xas, &mapping->i_pages, first_index);
+ struct folio *folio;
+
+ rcu_read_lock();
+ xas_for_each(&xas, folio, last_index) {
+ unsigned long nr_pages;
+ pgoff_t folio_first_index, folio_last_index;
+
+ if (xas_retry(&xas, folio))
+ continue;
+
+ if (xa_is_value(folio)) {
+ /* page is evicted */
+ void *shadow = (void *)folio;
+ bool workingset; /* not used */
+ int order = xa_get_order(xas.xa, xas.xa_index);
+
+ nr_pages = 1 << order;
+ folio_first_index = round_down(xas.xa_index, 1 << order);
+ folio_last_index = folio_first_index + nr_pages - 1;
+
+ /* Folios might straddle the range boundaries, only count covered pages */
+ if (folio_first_index < first_index)
+ nr_pages -= first_index - folio_first_index;
+
+ if (folio_last_index > last_index)
+ nr_pages -= folio_last_index - last_index;
- if (mapping && mapping->a_ops->releasepage)
- return mapping->a_ops->releasepage(page, gfp_mask);
- return try_to_free_buffers(page);
+ cs->nr_evicted += nr_pages;
+
+#ifdef CONFIG_SWAP /* implies CONFIG_MMU */
+ if (shmem_mapping(mapping)) {
+ /* shmem file - in swap cache */
+ swp_entry_t swp = radix_to_swp_entry(folio);
+
+ shadow = get_shadow_from_swap_cache(swp);
+ }
+#endif
+ if (workingset_test_recent(shadow, true, &workingset))
+ cs->nr_recently_evicted += nr_pages;
+
+ goto resched;
+ }
+
+ nr_pages = folio_nr_pages(folio);
+ folio_first_index = folio_pgoff(folio);
+ folio_last_index = folio_first_index + nr_pages - 1;
+
+ /* Folios might straddle the range boundaries, only count covered pages */
+ if (folio_first_index < first_index)
+ nr_pages -= first_index - folio_first_index;
+
+ if (folio_last_index > last_index)
+ nr_pages -= folio_last_index - last_index;
+
+ /* page is in cache */
+ cs->nr_cache += nr_pages;
+
+ if (folio_test_dirty(folio))
+ cs->nr_dirty += nr_pages;
+
+ if (folio_test_writeback(folio))
+ cs->nr_writeback += nr_pages;
+
+resched:
+ if (need_resched()) {
+ xas_pause(&xas);
+ cond_resched_rcu();
+ }
+ }
+ rcu_read_unlock();
}
-EXPORT_SYMBOL(try_to_release_page);
+/*
+ * The cachestat(2) system call.
+ *
+ * cachestat() returns the page cache statistics of a file in the
+ * bytes range specified by `off` and `len`: number of cached pages,
+ * number of dirty pages, number of pages marked for writeback,
+ * number of evicted pages, and number of recently evicted pages.
+ *
+ * An evicted page is a page that was previously in the page cache
+ * but has been evicted since. A page is recently evicted if its last
+ * eviction was recent enough that its reentry to the cache would
+ * indicate that it is actively being used by the system, and that
+ * there is memory pressure on the system.
+ *
+ * `off` and `len` must be non-negative integers. If `len` > 0,
+ * the queried range is [`off`, `off` + `len`]. If `len` == 0,
+ * we will query in the range from `off` to the end of the file.
+ *
+ * The `flags` argument is unused for now, but is included for future
+ * extensibility. Users should pass 0 (i.e. no flags specified).
+ *
+ * Currently, hugetlbfs is not supported.
+ *
+ * Because the status of a page can change after cachestat() checks it
+ * but before it returns to the application, the returned values may
+ * contain stale information.
+ *
+ * return values:
+ * zero - success
+ * -EFAULT - cstat or cstat_range points to an illegal address
+ * -EINVAL - invalid flags
+ * -EBADF - invalid file descriptor
+ * -EOPNOTSUPP - file descriptor is of a hugetlbfs file
+ */
+SYSCALL_DEFINE4(cachestat, unsigned int, fd,
+ struct cachestat_range __user *, cstat_range,
+ struct cachestat __user *, cstat, unsigned int, flags)
+{
+ struct fd f = fdget(fd);
+ struct address_space *mapping;
+ struct cachestat_range csr;
+ struct cachestat cs;
+ pgoff_t first_index, last_index;
+
+ if (!f.file)
+ return -EBADF;
+
+ if (copy_from_user(&csr, cstat_range,
+ sizeof(struct cachestat_range))) {
+ fdput(f);
+ return -EFAULT;
+ }
+
+ /* hugetlbfs is not supported */
+ if (is_file_hugepages(f.file)) {
+ fdput(f);
+ return -EOPNOTSUPP;
+ }
+
+ if (flags != 0) {
+ fdput(f);
+ return -EINVAL;
+ }
+
+ first_index = csr.off >> PAGE_SHIFT;
+ last_index =
+ csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT;
+ memset(&cs, 0, sizeof(struct cachestat));
+ mapping = f.file->f_mapping;
+ filemap_cachestat(mapping, first_index, last_index, &cs);
+ fdput(f);
+
+ if (copy_to_user(cstat, &cs, sizeof(struct cachestat)))
+ return -EFAULT;
+
+ return 0;
+}
+#endif /* CONFIG_CACHESTAT_SYSCALL */
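
To make the syscall semantics above concrete, here is a hedged userspace sketch (not part of this diff); it assumes kernel headers new enough to define __NR_cachestat and to export struct cachestat_range / struct cachestat via <linux/mman.h>. The field names match the cs->nr_* counters filled in by filemap_cachestat() above.

#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/mman.h>

int main(int argc, char **argv)
{
	struct cachestat_range range = { .off = 0, .len = 0 };	/* len == 0: to EOF */
	struct cachestat cs;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;
	if (syscall(__NR_cachestat, fd, &range, &cs, 0)) {
		close(fd);
		return 1;
	}
	printf("cached %llu dirty %llu writeback %llu evicted %llu recently evicted %llu\n",
	       (unsigned long long)cs.nr_cache,
	       (unsigned long long)cs.nr_dirty,
	       (unsigned long long)cs.nr_writeback,
	       (unsigned long long)cs.nr_evicted,
	       (unsigned long long)cs.nr_recently_evicted);
	close(fd);
	return 0;
}
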
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
new file mode 100644
index 000000000000..c6f056c20503
--- /dev/null
+++ b/mm/folio-compat.c
@@ -0,0 +1,134 @@
+/*
+ * Compatibility functions which would bloat the callers too much if made inline.
+ * All of the callers of these functions should be converted to use folios
+ * eventually.
+ */
+
+#include <linux/migrate.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include "internal.h"
+
+struct address_space *page_mapping(struct page *page)
+{
+ return folio_mapping(page_folio(page));
+}
+EXPORT_SYMBOL(page_mapping);
+
+void unlock_page(struct page *page)
+{
+ return folio_unlock(page_folio(page));
+}
+EXPORT_SYMBOL(unlock_page);
+
+void end_page_writeback(struct page *page)
+{
+ return folio_end_writeback(page_folio(page));
+}
+EXPORT_SYMBOL(end_page_writeback);
+
+void wait_on_page_writeback(struct page *page)
+{
+ return folio_wait_writeback(page_folio(page));
+}
+EXPORT_SYMBOL_GPL(wait_on_page_writeback);
+
+void wait_for_stable_page(struct page *page)
+{
+ return folio_wait_stable(page_folio(page));
+}
+EXPORT_SYMBOL_GPL(wait_for_stable_page);
+
+void mark_page_accessed(struct page *page)
+{
+ folio_mark_accessed(page_folio(page));
+}
+EXPORT_SYMBOL(mark_page_accessed);
+
+bool set_page_writeback(struct page *page)
+{
+ return folio_start_writeback(page_folio(page));
+}
+EXPORT_SYMBOL(set_page_writeback);
+
+bool set_page_dirty(struct page *page)
+{
+ return folio_mark_dirty(page_folio(page));
+}
+EXPORT_SYMBOL(set_page_dirty);
+
+int __set_page_dirty_nobuffers(struct page *page)
+{
+ return filemap_dirty_folio(page_mapping(page), page_folio(page));
+}
+EXPORT_SYMBOL(__set_page_dirty_nobuffers);
+
+bool clear_page_dirty_for_io(struct page *page)
+{
+ return folio_clear_dirty_for_io(page_folio(page));
+}
+EXPORT_SYMBOL(clear_page_dirty_for_io);
+
+bool redirty_page_for_writepage(struct writeback_control *wbc,
+ struct page *page)
+{
+ return folio_redirty_for_writepage(wbc, page_folio(page));
+}
+EXPORT_SYMBOL(redirty_page_for_writepage);
+
+void lru_cache_add_inactive_or_unevictable(struct page *page,
+ struct vm_area_struct *vma)
+{
+ folio_add_lru_vma(page_folio(page), vma);
+}
+
+int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
+ pgoff_t index, gfp_t gfp)
+{
+ return filemap_add_folio(mapping, page_folio(page), index, gfp);
+}
+EXPORT_SYMBOL(add_to_page_cache_lru);
+
+noinline
+struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
+ int fgp_flags, gfp_t gfp)
+{
+ struct folio *folio;
+
+ folio = __filemap_get_folio(mapping, index, fgp_flags, gfp);
+ if (IS_ERR(folio))
+ return NULL;
+ return folio_file_page(folio, index);
+}
+EXPORT_SYMBOL(pagecache_get_page);
+
+struct page *grab_cache_page_write_begin(struct address_space *mapping,
+ pgoff_t index)
+{
+ return pagecache_get_page(mapping, index, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+}
+EXPORT_SYMBOL(grab_cache_page_write_begin);
+
+bool isolate_lru_page(struct page *page)
+{
+ if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"))
+ return false;
+ return folio_isolate_lru((struct folio *)page);
+}
+
+void putback_lru_page(struct page *page)
+{
+ folio_putback_lru(page_folio(page));
+}
+
+#ifdef CONFIG_MMU
+void page_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma,
+ unsigned long address)
+{
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ return folio_add_new_anon_rmap((struct folio *)page, vma, address);
+}
+#endif
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
deleted file mode 100644
index 10f82d5643b6..000000000000
--- a/mm/frame_vector.c
+++ /dev/null
@@ -1,240 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/pagemap.h>
-#include <linux/sched.h>
-
-/**
- * get_vaddr_frames() - map virtual addresses to pfns
- * @start: starting user address
- * @nr_frames: number of pages / pfns from start to map
- * @gup_flags: flags modifying lookup behaviour
- * @vec: structure which receives pages / pfns of the addresses mapped.
- * It should have space for at least nr_frames entries.
- *
- * This function maps virtual addresses from @start and fills @vec structure
- * with page frame numbers or page pointers to corresponding pages (choice
- * depends on the type of the vma underlying the virtual address). If @start
- * belongs to a normal vma, the function grabs reference to each of the pages
- * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't
- * touch page structures and the caller must make sure pfns aren't reused for
- * anything else while he is using them.
- *
- * The function returns number of pages mapped which may be less than
- * @nr_frames. In particular we stop mapping if there are more vmas of
- * different type underlying the specified range of virtual addresses.
- * When the function isn't able to map a single page, it returns error.
- *
- * This function takes care of grabbing mmap_lock as necessary.
- */
-int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
- unsigned int gup_flags, struct frame_vector *vec)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- int ret = 0;
- int err;
- int locked;
-
- if (nr_frames == 0)
- return 0;
-
- if (WARN_ON_ONCE(nr_frames > vec->nr_allocated))
- nr_frames = vec->nr_allocated;
-
- start = untagged_addr(start);
-
- mmap_read_lock(mm);
- locked = 1;
- vma = find_vma_intersection(mm, start, start + 1);
- if (!vma) {
- ret = -EFAULT;
- goto out;
- }
-
- /*
- * While get_vaddr_frames() could be used for transient (kernel
- * controlled lifetime) pinning of memory pages all current
- * users establish long term (userspace controlled lifetime)
- * page pinning. Treat get_vaddr_frames() like
- * get_user_pages_longterm() and disallow it for filesystem-dax
- * mappings.
- */
- if (vma_is_fsdax(vma)) {
- ret = -EOPNOTSUPP;
- goto out;
- }
-
- if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
- vec->got_ref = true;
- vec->is_pfns = false;
- ret = pin_user_pages_locked(start, nr_frames,
- gup_flags, (struct page **)(vec->ptrs), &locked);
- goto out;
- }
-
- vec->got_ref = false;
- vec->is_pfns = true;
- do {
- unsigned long *nums = frame_vector_pfns(vec);
-
- while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) {
- err = follow_pfn(vma, start, &nums[ret]);
- if (err) {
- if (ret == 0)
- ret = err;
- goto out;
- }
- start += PAGE_SIZE;
- ret++;
- }
- /*
- * We stop if we have enough pages or if VMA doesn't completely
- * cover the tail page.
- */
- if (ret >= nr_frames || start < vma->vm_end)
- break;
- vma = find_vma_intersection(mm, start, start + 1);
- } while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP));
-out:
- if (locked)
- mmap_read_unlock(mm);
- if (!ret)
- ret = -EFAULT;
- if (ret > 0)
- vec->nr_frames = ret;
- return ret;
-}
-EXPORT_SYMBOL(get_vaddr_frames);
-
-/**
- * put_vaddr_frames() - drop references to pages if get_vaddr_frames() acquired
- * them
- * @vec: frame vector to put
- *
- * Drop references to pages if get_vaddr_frames() acquired them. We also
- * invalidate the frame vector so that it is prepared for the next call into
- * get_vaddr_frames().
- */
-void put_vaddr_frames(struct frame_vector *vec)
-{
- struct page **pages;
-
- if (!vec->got_ref)
- goto out;
- pages = frame_vector_pages(vec);
- /*
- * frame_vector_pages() might needed to do a conversion when
- * get_vaddr_frames() got pages but vec was later converted to pfns.
- * But it shouldn't really fail to convert pfns back...
- */
- if (WARN_ON(IS_ERR(pages)))
- goto out;
-
- unpin_user_pages(pages, vec->nr_frames);
- vec->got_ref = false;
-out:
- vec->nr_frames = 0;
-}
-EXPORT_SYMBOL(put_vaddr_frames);
-
-/**
- * frame_vector_to_pages - convert frame vector to contain page pointers
- * @vec: frame vector to convert
- *
- * Convert @vec to contain array of page pointers. If the conversion is
- * successful, return 0. Otherwise return an error. Note that we do not grab
- * page references for the page structures.
- */
-int frame_vector_to_pages(struct frame_vector *vec)
-{
- int i;
- unsigned long *nums;
- struct page **pages;
-
- if (!vec->is_pfns)
- return 0;
- nums = frame_vector_pfns(vec);
- for (i = 0; i < vec->nr_frames; i++)
- if (!pfn_valid(nums[i]))
- return -EINVAL;
- pages = (struct page **)nums;
- for (i = 0; i < vec->nr_frames; i++)
- pages[i] = pfn_to_page(nums[i]);
- vec->is_pfns = false;
- return 0;
-}
-EXPORT_SYMBOL(frame_vector_to_pages);
-
-/**
- * frame_vector_to_pfns - convert frame vector to contain pfns
- * @vec: frame vector to convert
- *
- * Convert @vec to contain array of pfns.
- */
-void frame_vector_to_pfns(struct frame_vector *vec)
-{
- int i;
- unsigned long *nums;
- struct page **pages;
-
- if (vec->is_pfns)
- return;
- pages = (struct page **)(vec->ptrs);
- nums = (unsigned long *)pages;
- for (i = 0; i < vec->nr_frames; i++)
- nums[i] = page_to_pfn(pages[i]);
- vec->is_pfns = true;
-}
-EXPORT_SYMBOL(frame_vector_to_pfns);
-
-/**
- * frame_vector_create() - allocate & initialize structure for pinned pfns
- * @nr_frames: number of pfns slots we should reserve
- *
- * Allocate and initialize struct pinned_pfns to be able to hold @nr_pfns
- * pfns.
- */
-struct frame_vector *frame_vector_create(unsigned int nr_frames)
-{
- struct frame_vector *vec;
- int size = sizeof(struct frame_vector) + sizeof(void *) * nr_frames;
-
- if (WARN_ON_ONCE(nr_frames == 0))
- return NULL;
- /*
- * This is absurdly high. It's here just to avoid strange effects when
- * arithmetics overflows.
- */
- if (WARN_ON_ONCE(nr_frames > INT_MAX / sizeof(void *) / 2))
- return NULL;
- /*
- * Avoid higher order allocations, use vmalloc instead. It should
- * be rare anyway.
- */
- vec = kvmalloc(size, GFP_KERNEL);
- if (!vec)
- return NULL;
- vec->nr_allocated = nr_frames;
- vec->nr_frames = 0;
- return vec;
-}
-EXPORT_SYMBOL(frame_vector_create);
-
-/**
- * frame_vector_destroy() - free memory allocated to carry frame vector
- * @vec: Frame vector to free
- *
- * Free structure allocated by frame_vector_create() to carry frames.
- */
-void frame_vector_destroy(struct frame_vector *vec)
-{
- /* Make sure put_vaddr_frames() got called properly... */
- VM_BUG_ON(vec->nr_frames > 0);
- kvfree(vec);
-}
-EXPORT_SYMBOL(frame_vector_destroy);
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 2183a56c7874..2fb5df3384b8 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -4,7 +4,7 @@
*
* This code provides the generic "frontend" layer to call a matching
* "backend" driver implementation of frontswap. See
- * Documentation/vm/frontswap.rst for more information.
+ * Documentation/mm/frontswap.rst for more information.
*
* Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
* Author: Dan Magenheimer
@@ -27,27 +27,7 @@ DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key);
* may be registered, but implementations can never deregister. This
* is a simple singly-linked list of all registered implementations.
*/
-static struct frontswap_ops *frontswap_ops __read_mostly;
-
-#define for_each_frontswap_ops(ops) \
- for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next)
-
-/*
- * If enabled, frontswap_store will return failure even on success. As
- * a result, the swap subsystem will always write the page to swap, in
- * effect converting frontswap into a writethrough cache. In this mode,
- * there is no direct reduction in swap writes, but a frontswap backend
- * can unilaterally "reclaim" any pages in use with no data loss, thus
- * providing increases control over maximum memory usage due to frontswap.
- */
-static bool frontswap_writethrough_enabled __read_mostly;
-
-/*
- * If enabled, the underlying tmem implementation is capable of doing
- * exclusive gets, so frontswap_load, on a successful tmem_get must
- * mark the page as no longer in frontswap AND mark it dirty.
- */
-static bool frontswap_tmem_exclusive_gets_enabled __read_mostly;
+static const struct frontswap_ops *frontswap_ops __read_mostly;
#ifdef CONFIG_DEBUG_FS
/*
@@ -60,16 +40,20 @@ static u64 frontswap_succ_stores;
static u64 frontswap_failed_stores;
static u64 frontswap_invalidates;
-static inline void inc_frontswap_loads(void) {
+static inline void inc_frontswap_loads(void)
+{
data_race(frontswap_loads++);
}
-static inline void inc_frontswap_succ_stores(void) {
+static inline void inc_frontswap_succ_stores(void)
+{
data_race(frontswap_succ_stores++);
}
-static inline void inc_frontswap_failed_stores(void) {
+static inline void inc_frontswap_failed_stores(void)
+{
data_race(frontswap_failed_stores++);
}
-static inline void inc_frontswap_invalidates(void) {
+static inline void inc_frontswap_invalidates(void)
+{
data_race(frontswap_invalidates++);
}
#else
@@ -110,87 +94,22 @@ static inline void inc_frontswap_invalidates(void) { }
/*
* Register operations for frontswap
*/
-void frontswap_register_ops(struct frontswap_ops *ops)
+int frontswap_register_ops(const struct frontswap_ops *ops)
{
- DECLARE_BITMAP(a, MAX_SWAPFILES);
- DECLARE_BITMAP(b, MAX_SWAPFILES);
- struct swap_info_struct *si;
- unsigned int i;
-
- bitmap_zero(a, MAX_SWAPFILES);
- bitmap_zero(b, MAX_SWAPFILES);
-
- spin_lock(&swap_lock);
- plist_for_each_entry(si, &swap_active_head, list) {
- if (!WARN_ON(!si->frontswap_map))
- set_bit(si->type, a);
- }
- spin_unlock(&swap_lock);
-
- /* the new ops needs to know the currently active swap devices */
- for_each_set_bit(i, a, MAX_SWAPFILES)
- ops->init(i);
-
- /*
- * Setting frontswap_ops must happen after the ops->init() calls
- * above; cmpxchg implies smp_mb() which will ensure the init is
- * complete at this point.
- */
- do {
- ops->next = frontswap_ops;
- } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
+ if (frontswap_ops)
+ return -EINVAL;
+ frontswap_ops = ops;
static_branch_inc(&frontswap_enabled_key);
-
- spin_lock(&swap_lock);
- plist_for_each_entry(si, &swap_active_head, list) {
- if (si->frontswap_map)
- set_bit(si->type, b);
- }
- spin_unlock(&swap_lock);
-
- /*
- * On the very unlikely chance that a swap device was added or
- * removed between setting the "a" list bits and the ops init
- * calls, we re-check and do init or invalidate for any changed
- * bits.
- */
- if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
- for (i = 0; i < MAX_SWAPFILES; i++) {
- if (!test_bit(i, a) && test_bit(i, b))
- ops->init(i);
- else if (test_bit(i, a) && !test_bit(i, b))
- ops->invalidate_area(i);
- }
- }
-}
-EXPORT_SYMBOL(frontswap_register_ops);
-
-/*
- * Enable/disable frontswap writethrough (see above).
- */
-void frontswap_writethrough(bool enable)
-{
- frontswap_writethrough_enabled = enable;
-}
-EXPORT_SYMBOL(frontswap_writethrough);
-
-/*
- * Enable/disable frontswap exclusive gets (see above).
- */
-void frontswap_tmem_exclusive_gets(bool enable)
-{
- frontswap_tmem_exclusive_gets_enabled = enable;
+ return 0;
}
-EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
/*
* Called when a swap device is swapon'd.
*/
-void __frontswap_init(unsigned type, unsigned long *map)
+void frontswap_init(unsigned type, unsigned long *map)
{
struct swap_info_struct *sis = swap_info[type];
- struct frontswap_ops *ops;
VM_BUG_ON(sis == NULL);
@@ -207,19 +126,18 @@ void __frontswap_init(unsigned type, unsigned long *map)
*/
frontswap_map_set(sis, map);
- for_each_frontswap_ops(ops)
- ops->init(type);
+ if (!frontswap_enabled())
+ return;
+ frontswap_ops->init(type);
}
-EXPORT_SYMBOL(__frontswap_init);
-bool __frontswap_test(struct swap_info_struct *sis,
+static bool __frontswap_test(struct swap_info_struct *sis,
pgoff_t offset)
{
if (sis->frontswap_map)
return test_bit(offset, sis->frontswap_map);
return false;
}
-EXPORT_SYMBOL(__frontswap_test);
static inline void __frontswap_set(struct swap_info_struct *sis,
pgoff_t offset)
@@ -249,7 +167,6 @@ int __frontswap_store(struct page *page)
int type = swp_type(entry);
struct swap_info_struct *sis = swap_info[type];
pgoff_t offset = swp_offset(entry);
- struct frontswap_ops *ops;
VM_BUG_ON(!frontswap_ops);
VM_BUG_ON(!PageLocked(page));
@@ -263,28 +180,19 @@ int __frontswap_store(struct page *page)
*/
if (__frontswap_test(sis, offset)) {
__frontswap_clear(sis, offset);
- for_each_frontswap_ops(ops)
- ops->invalidate_page(type, offset);
+ frontswap_ops->invalidate_page(type, offset);
}
- /* Try to store in each implementation, until one succeeds. */
- for_each_frontswap_ops(ops) {
- ret = ops->store(type, offset, page);
- if (!ret) /* successful store */
- break;
- }
+ ret = frontswap_ops->store(type, offset, page);
if (ret == 0) {
__frontswap_set(sis, offset);
inc_frontswap_succ_stores();
} else {
inc_frontswap_failed_stores();
}
- if (frontswap_writethrough_enabled)
- /* report failure so swap also writes to swap device */
- ret = -1;
+
return ret;
}
-EXPORT_SYMBOL(__frontswap_store);
/*
* "Get" data from frontswap associated with swaptype and offset that were
@@ -298,7 +206,7 @@ int __frontswap_load(struct page *page)
int type = swp_type(entry);
struct swap_info_struct *sis = swap_info[type];
pgoff_t offset = swp_offset(entry);
- struct frontswap_ops *ops;
+ bool exclusive = false;
VM_BUG_ON(!frontswap_ops);
VM_BUG_ON(!PageLocked(page));
@@ -308,21 +216,16 @@ int __frontswap_load(struct page *page)
return -1;
/* Try loading from each implementation, until one succeeds. */
- for_each_frontswap_ops(ops) {
- ret = ops->load(type, offset, page);
- if (!ret) /* successful load */
- break;
- }
+ ret = frontswap_ops->load(type, offset, page, &exclusive);
if (ret == 0) {
inc_frontswap_loads();
- if (frontswap_tmem_exclusive_gets_enabled) {
+ if (exclusive) {
SetPageDirty(page);
__frontswap_clear(sis, offset);
}
}
return ret;
}
-EXPORT_SYMBOL(__frontswap_load);
/*
* Invalidate any data from frontswap associated with the specified swaptype
@@ -331,7 +234,6 @@ EXPORT_SYMBOL(__frontswap_load);
void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
struct swap_info_struct *sis = swap_info[type];
- struct frontswap_ops *ops;
VM_BUG_ON(!frontswap_ops);
VM_BUG_ON(sis == NULL);
@@ -339,12 +241,10 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
if (!__frontswap_test(sis, offset))
return;
- for_each_frontswap_ops(ops)
- ops->invalidate_page(type, offset);
+ frontswap_ops->invalidate_page(type, offset);
__frontswap_clear(sis, offset);
inc_frontswap_invalidates();
}
-EXPORT_SYMBOL(__frontswap_invalidate_page);
/*
* Invalidate all data from frontswap associated with all offsets for the
@@ -353,7 +253,6 @@ EXPORT_SYMBOL(__frontswap_invalidate_page);
void __frontswap_invalidate_area(unsigned type)
{
struct swap_info_struct *sis = swap_info[type];
- struct frontswap_ops *ops;
VM_BUG_ON(!frontswap_ops);
VM_BUG_ON(sis == NULL);
@@ -361,123 +260,10 @@ void __frontswap_invalidate_area(unsigned type)
if (sis->frontswap_map == NULL)
return;
- for_each_frontswap_ops(ops)
- ops->invalidate_area(type);
+ frontswap_ops->invalidate_area(type);
atomic_set(&sis->frontswap_pages, 0);
bitmap_zero(sis->frontswap_map, sis->max);
}
-EXPORT_SYMBOL(__frontswap_invalidate_area);
-
-static unsigned long __frontswap_curr_pages(void)
-{
- unsigned long totalpages = 0;
- struct swap_info_struct *si = NULL;
-
- assert_spin_locked(&swap_lock);
- plist_for_each_entry(si, &swap_active_head, list)
- totalpages += atomic_read(&si->frontswap_pages);
- return totalpages;
-}
-
-static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
- int *swapid)
-{
- int ret = -EINVAL;
- struct swap_info_struct *si = NULL;
- int si_frontswap_pages;
- unsigned long total_pages_to_unuse = total;
- unsigned long pages = 0, pages_to_unuse = 0;
-
- assert_spin_locked(&swap_lock);
- plist_for_each_entry(si, &swap_active_head, list) {
- si_frontswap_pages = atomic_read(&si->frontswap_pages);
- if (total_pages_to_unuse < si_frontswap_pages) {
- pages = pages_to_unuse = total_pages_to_unuse;
- } else {
- pages = si_frontswap_pages;
- pages_to_unuse = 0; /* unuse all */
- }
- /* ensure there is enough RAM to fetch pages from frontswap */
- if (security_vm_enough_memory_mm(current->mm, pages)) {
- ret = -ENOMEM;
- continue;
- }
- vm_unacct_memory(pages);
- *unused = pages_to_unuse;
- *swapid = si->type;
- ret = 0;
- break;
- }
-
- return ret;
-}
-
-/*
- * Used to check if it's necessary and feasible to unuse pages.
- * Return 1 when nothing to do, 0 when need to shrink pages,
- * error code when there is an error.
- */
-static int __frontswap_shrink(unsigned long target_pages,
- unsigned long *pages_to_unuse,
- int *type)
-{
- unsigned long total_pages = 0, total_pages_to_unuse;
-
- assert_spin_locked(&swap_lock);
-
- total_pages = __frontswap_curr_pages();
- if (total_pages <= target_pages) {
- /* Nothing to do */
- *pages_to_unuse = 0;
- return 1;
- }
- total_pages_to_unuse = total_pages - target_pages;
- return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
-}
-
-/*
- * Frontswap, like a true swap device, may unnecessarily retain pages
- * under certain circumstances; "shrink" frontswap is essentially a
- * "partial swapoff" and works by calling try_to_unuse to attempt to
- * unuse enough frontswap pages to attempt to -- subject to memory
- * constraints -- reduce the number of pages in frontswap to the
- * number given in the parameter target_pages.
- */
-void frontswap_shrink(unsigned long target_pages)
-{
- unsigned long pages_to_unuse = 0;
- int type, ret;
-
- /*
- * we don't want to hold swap_lock while doing a very
- * lengthy try_to_unuse, but swap_list may change
- * so restart scan from swap_active_head each time
- */
- spin_lock(&swap_lock);
- ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
- spin_unlock(&swap_lock);
- if (ret == 0)
- try_to_unuse(type, true, pages_to_unuse);
- return;
-}
-EXPORT_SYMBOL(frontswap_shrink);
-
-/*
- * Count and return the number of frontswap pages across all
- * swap devices. This is exported so that backend drivers can
- * determine current usage without reading debugfs.
- */
-unsigned long frontswap_curr_pages(void)
-{
- unsigned long totalpages = 0;
-
- spin_lock(&swap_lock);
- totalpages = __frontswap_curr_pages();
- spin_unlock(&swap_lock);
-
- return totalpages;
-}
-EXPORT_SYMBOL(frontswap_curr_pages);
static int __init init_frontswap(void)
{
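The frontswap.c hunks above replace the old multi-backend chain with a single registered backend: frontswap_register_ops() now returns -EINVAL once an ops table is installed, every hook calls through frontswap_ops directly, and the old writethrough/exclusive-gets toggles become a per-load "exclusive" out-parameter. Since the EXPORT_SYMBOL is dropped, only built-in code can register. The sketch below is purely illustrative; the callback signatures are assumptions inferred from the call sites in this diff (init(type), store(type, offset, page), load(type, offset, page, &exclusive), invalidate_page(type, offset), invalidate_area(type)), and all demo_* names are made up.

#include <linux/frontswap.h>
#include <linux/init.h>

static void demo_init(unsigned type) { }

static int demo_store(unsigned type, pgoff_t offset, struct page *page)
{
	return -1;	/* reject: the page falls through to the swap device */
}

static int demo_load(unsigned type, pgoff_t offset, struct page *page,
		     bool *exclusive)
{
	return -1;	/* nothing cached for this entry */
}

static void demo_invalidate_page(unsigned type, pgoff_t offset) { }
static void demo_invalidate_area(unsigned type) { }

static const struct frontswap_ops demo_ops = {
	.init		 = demo_init,
	.store		 = demo_store,
	.load		 = demo_load,
	.invalidate_page = demo_invalidate_page,
	.invalidate_area = demo_invalidate_area,
};

static int __init demo_backend_init(void)
{
	/* Fails with -EINVAL if another backend (e.g. zswap) won the race. */
	return frontswap_register_ops(&demo_ops);
}
late_initcall(demo_backend_init);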
diff --git a/mm/gup.c b/mm/gup.c
index ad617e7f22f5..6e2f9e9d6537 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -10,6 +10,7 @@
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/secretmem.h>
#include <linux/sched/signal.h>
#include <linux/rwsem.h>
@@ -17,6 +18,7 @@
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
+#include <linux/shmem_fs.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
@@ -28,189 +30,235 @@ struct follow_page_context {
unsigned int page_mask;
};
-static void hpage_pincount_add(struct page *page, int refs)
+static inline void sanity_check_pinned_pages(struct page **pages,
+ unsigned long npages)
{
- VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
- VM_BUG_ON_PAGE(page != compound_head(page), page);
-
- atomic_add(refs, compound_pincount_ptr(page));
-}
+ if (!IS_ENABLED(CONFIG_DEBUG_VM))
+ return;
-static void hpage_pincount_sub(struct page *page, int refs)
-{
- VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
- VM_BUG_ON_PAGE(page != compound_head(page), page);
+ /*
+ * We only pin anonymous pages if they are exclusive. Once pinned, we
+ * can no longer turn them possibly shared and PageAnonExclusive() will
+ * stick around until the page is freed.
+ *
+ * We'd like to verify that our pinned anonymous pages are still mapped
+ * exclusively. The issue with anon THP is that we don't know how
+ * they are/were mapped when pinning them. However, for anon
+ * THP we can assume that either the given page (PTE-mapped THP) or
+ * the head page (PMD-mapped THP) should be PageAnonExclusive(). If
+ * neither is the case, there is certainly something wrong.
+ */
+ for (; npages; npages--, pages++) {
+ struct page *page = *pages;
+ struct folio *folio = page_folio(page);
- atomic_sub(refs, compound_pincount_ptr(page));
+ if (is_zero_page(page) ||
+ !folio_test_anon(folio))
+ continue;
+ if (!folio_test_large(folio) || folio_test_hugetlb(folio))
+ VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
+ else
+ /* Either a PTE-mapped or a PMD-mapped THP. */
+ VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) &&
+ !PageAnonExclusive(page), page);
+ }
}
/*
- * Return the compound head page with ref appropriately incremented,
+ * Return the folio with ref appropriately incremented,
* or NULL if that failed.
*/
-static inline struct page *try_get_compound_head(struct page *page, int refs)
+static inline struct folio *try_get_folio(struct page *page, int refs)
{
- struct page *head = compound_head(page);
+ struct folio *folio;
- if (WARN_ON_ONCE(page_ref_count(head) < 0))
+retry:
+ folio = page_folio(page);
+ if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
return NULL;
- if (unlikely(!page_cache_add_speculative(head, refs)))
+ if (unlikely(!folio_ref_try_add_rcu(folio, refs)))
return NULL;
- return head;
+
+ /*
+ * At this point we have a stable reference to the folio; but it
+ * could be that between calling page_folio() and the refcount
+ * increment, the folio was split, in which case we'd end up
+ * holding a reference on a folio that has nothing to do with the page
+ * we were given anymore.
+ * So now that the folio is stable, recheck that the page still
+ * belongs to this folio.
+ */
+ if (unlikely(page_folio(page) != folio)) {
+ if (!put_devmap_managed_page_refs(&folio->page, refs))
+ folio_put_refs(folio, refs);
+ goto retry;
+ }
+
+ return folio;
}
-/*
- * try_grab_compound_head() - attempt to elevate a page's refcount, by a
- * flags-dependent amount.
+/**
+ * try_grab_folio() - Attempt to get or pin a folio.
+ * @page: pointer to page to be grabbed
+ * @refs: the value to (effectively) add to the folio's refcount
+ * @flags: gup flags: these are the FOLL_* flag values.
*
* "grab" names in this file mean, "look at flags to decide whether to use
- * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
+ * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
*
* Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
* same time. (That's true throughout the get_user_pages*() and
* pin_user_pages*() APIs.) Cases:
*
- * FOLL_GET: page's refcount will be incremented by 1.
- * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
+ * FOLL_GET: folio's refcount will be incremented by @refs.
+ *
+ * FOLL_PIN on large folios: folio's refcount will be incremented by
+ * @refs, and its pincount will be incremented by @refs.
*
- * Return: head page (with refcount appropriately incremented) for success, or
- * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's
- * considered failure, and furthermore, a likely bug in the caller, so a warning
- * is also emitted.
+ * FOLL_PIN on single-page folios: folio's refcount will be incremented by
+ * @refs * GUP_PIN_COUNTING_BIAS.
+ *
+ * Return: The folio containing @page (with refcount appropriately
+ * incremented) for success, or NULL upon failure. If neither FOLL_GET
+ * nor FOLL_PIN was set, that's considered failure, and furthermore,
+ * a likely bug in the caller, so a warning is also emitted.
*/
-static __maybe_unused struct page *try_grab_compound_head(struct page *page,
- int refs,
- unsigned int flags)
+struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
{
+ struct folio *folio;
+
+ if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0))
+ return NULL;
+
+ if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
+ return NULL;
+
if (flags & FOLL_GET)
- return try_get_compound_head(page, refs);
- else if (flags & FOLL_PIN) {
- int orig_refs = refs;
+ return try_get_folio(page, refs);
- /*
- * Can't do FOLL_LONGTERM + FOLL_PIN with CMA in the gup fast
- * path, so fail and let the caller fall back to the slow path.
- */
- if (unlikely(flags & FOLL_LONGTERM) &&
- is_migrate_cma_page(page))
- return NULL;
+ /* FOLL_PIN is set */
- /*
- * When pinning a compound page of order > 1 (which is what
- * hpage_pincount_available() checks for), use an exact count to
- * track it, via hpage_pincount_add/_sub().
- *
- * However, be sure to *also* increment the normal page refcount
- * field at least once, so that the page really is pinned.
- */
- if (!hpage_pincount_available(page))
- refs *= GUP_PIN_COUNTING_BIAS;
+ /*
+ * Don't take a pin on the zero page - it's not going anywhere
+ * and it is used in a *lot* of places.
+ */
+ if (is_zero_page(page))
+ return page_folio(page);
- page = try_get_compound_head(page, refs);
- if (!page)
- return NULL;
+ folio = try_get_folio(page, refs);
+ if (!folio)
+ return NULL;
- if (hpage_pincount_available(page))
- hpage_pincount_add(page, refs);
+ /*
+ * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
+ * right zone, so fail and let the caller fall back to the slow
+ * path.
+ */
+ if (unlikely((flags & FOLL_LONGTERM) &&
+ !folio_is_longterm_pinnable(folio))) {
+ if (!put_devmap_managed_page_refs(&folio->page, refs))
+ folio_put_refs(folio, refs);
+ return NULL;
+ }
- mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED,
- orig_refs);
+ /*
+ * When pinning a large folio, use an exact count to track it.
+ *
+ * However, be sure to *also* increment the normal folio
+ * refcount field at least once, so that the folio really
+ * is pinned. That's why the refcount from the earlier
+ * try_get_folio() is left intact.
+ */
+ if (folio_test_large(folio))
+ atomic_add(refs, &folio->_pincount);
+ else
+ folio_ref_add(folio,
+ refs * (GUP_PIN_COUNTING_BIAS - 1));
+ /*
+ * Adjust the pincount before re-checking the PTE for changes.
+ * This is essentially a smp_mb() and is paired with a memory
+ * barrier in page_try_share_anon_rmap().
+ */
+ smp_mb__after_atomic();
- return page;
+ node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
+
+ return folio;
+}
+
+static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
+{
+ if (flags & FOLL_PIN) {
+ if (is_zero_folio(folio))
+ return;
+ node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
+ if (folio_test_large(folio))
+ atomic_sub(refs, &folio->_pincount);
+ else
+ refs *= GUP_PIN_COUNTING_BIAS;
}
- WARN_ON_ONCE(1);
- return NULL;
+ if (!put_devmap_managed_page_refs(&folio->page, refs))
+ folio_put_refs(folio, refs);
}
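To make the pin accounting in try_grab_folio()/gup_put_folio() concrete: FOLL_GET moves the folio refcount by @refs; FOLL_PIN on a large folio moves the refcount by @refs and the separate _pincount by @refs; FOLL_PIN on a single-page folio moves the refcount by @refs * GUP_PIN_COUNTING_BIAS. The helper below is illustrative only (it is not part of the patch) and simply restates those documented rules:

/* Expected folio refcount delta for a grab of @refs references. */
static inline long expected_grab_refcount_delta(bool pin, bool large_folio,
						int refs)
{
	if (!pin)			/* FOLL_GET */
		return refs;
	if (large_folio)		/* FOLL_PIN: _pincount also gains refs */
		return refs;
	return (long)refs * GUP_PIN_COUNTING_BIAS;	/* FOLL_PIN, order-0 */
}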
/**
* try_grab_page() - elevate a page's refcount by a flag-dependent amount
+ * @page: pointer to page to be grabbed
+ * @flags: gup flags: these are the FOLL_* flag values.
*
* This might not do anything at all, depending on the flags argument.
*
* "grab" names in this file mean, "look at flags to decide whether to use
* FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
*
- * @page: pointer to page to be grabbed
- * @flags: gup flags: these are the FOLL_* flag values.
- *
* Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
- * time. Cases:
+ * time. Cases: please see the try_grab_folio() documentation, with
+ * "refs=1".
*
- * FOLL_GET: page's refcount will be incremented by 1.
- * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
+ * Return: 0 for success, or if no action was required (if neither FOLL_PIN
+ * nor FOLL_GET was set, nothing is done). A negative error code for failure:
*
- * Return: true for success, or if no action was required (if neither FOLL_PIN
- * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
- * FOLL_PIN was set, but the page could not be grabbed.
+ * -ENOMEM FOLL_GET or FOLL_PIN was set, but the page could not
+ * be grabbed.
*/
-bool __must_check try_grab_page(struct page *page, unsigned int flags)
+int __must_check try_grab_page(struct page *page, unsigned int flags)
{
- WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
+ struct folio *folio = page_folio(page);
- if (flags & FOLL_GET)
- return try_get_page(page);
- else if (flags & FOLL_PIN) {
- int refs = 1;
+ if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
+ return -ENOMEM;
- page = compound_head(page);
+ if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
+ return -EREMOTEIO;
- if (WARN_ON_ONCE(page_ref_count(page) <= 0))
- return false;
-
- if (hpage_pincount_available(page))
- hpage_pincount_add(page, 1);
- else
- refs = GUP_PIN_COUNTING_BIAS;
+ if (flags & FOLL_GET)
+ folio_ref_inc(folio);
+ else if (flags & FOLL_PIN) {
+ /*
+ * Don't take a pin on the zero page - it's not going anywhere
+ * and it is used in a *lot* of places.
+ */
+ if (is_zero_page(page))
+ return 0;
/*
- * Similar to try_grab_compound_head(): even if using the
- * hpage_pincount_add/_sub() routines, be sure to
- * *also* increment the normal page refcount field at least
- * once, so that the page really is pinned.
+ * Similar to try_grab_folio(): be sure to *also*
+ * increment the normal page refcount field at least once,
+ * so that the page really is pinned.
*/
- page_ref_add(page, refs);
+ if (folio_test_large(folio)) {
+ folio_ref_add(folio, 1);
+ atomic_add(1, &folio->_pincount);
+ } else {
+ folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
+ }
- mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1);
+ node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1);
}
- return true;
-}
-
-#ifdef CONFIG_DEV_PAGEMAP_OPS
-static bool __unpin_devmap_managed_user_page(struct page *page)
-{
- int count, refs = 1;
-
- if (!page_is_devmap_managed(page))
- return false;
-
- if (hpage_pincount_available(page))
- hpage_pincount_sub(page, 1);
- else
- refs = GUP_PIN_COUNTING_BIAS;
-
- count = page_ref_sub_return(page, refs);
-
- mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, 1);
- /*
- * devmap page refcounts are 1-based, rather than 0-based: if
- * refcount is 1, then the page is free and the refcount is
- * stable because nobody holds a reference on the page.
- */
- if (count == 1)
- free_devmap_managed_page(page);
- else if (!count)
- __put_page(page);
-
- return true;
-}
-#else
-static bool __unpin_devmap_managed_user_page(struct page *page)
-{
- return false;
+ return 0;
}
-#endif /* CONFIG_DEV_PAGEMAP_OPS */
/**
* unpin_user_page() - release a dma-pinned page
@@ -223,30 +271,67 @@ static bool __unpin_devmap_managed_user_page(struct page *page)
*/
void unpin_user_page(struct page *page)
{
- int refs = 1;
+ sanity_check_pinned_pages(&page, 1);
+ gup_put_folio(page_folio(page), 1, FOLL_PIN);
+}
+EXPORT_SYMBOL(unpin_user_page);
- page = compound_head(page);
+/**
+ * folio_add_pin - Try to get an additional pin on a pinned folio
+ * @folio: The folio to be pinned
+ *
+ * Get an additional pin on a folio we already have a pin on. Makes no change
+ * if the folio is a zero_page.
+ */
+void folio_add_pin(struct folio *folio)
+{
+ if (is_zero_folio(folio))
+ return;
/*
- * For devmap managed pages we need to catch refcount transition from
- * GUP_PIN_COUNTING_BIAS to 1, when refcount reach one it means the
- * page is free and we need to inform the device driver through
- * callback. See include/linux/memremap.h and HMM for details.
+ * Similar to try_grab_folio(): be sure to *also* increment the normal
+ * page refcount field at least once, so that the page really is
+ * pinned.
*/
- if (__unpin_devmap_managed_user_page(page))
- return;
+ if (folio_test_large(folio)) {
+ WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1);
+ folio_ref_inc(folio);
+ atomic_inc(&folio->_pincount);
+ } else {
+ WARN_ON_ONCE(folio_ref_count(folio) < GUP_PIN_COUNTING_BIAS);
+ folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
+ }
+}
- if (hpage_pincount_available(page))
- hpage_pincount_sub(page, 1);
- else
- refs = GUP_PIN_COUNTING_BIAS;
+static inline struct folio *gup_folio_range_next(struct page *start,
+ unsigned long npages, unsigned long i, unsigned int *ntails)
+{
+ struct page *next = nth_page(start, i);
+ struct folio *folio = page_folio(next);
+ unsigned int nr = 1;
- if (page_ref_sub_and_test(page, refs))
- __put_page(page);
+ if (folio_test_large(folio))
+ nr = min_t(unsigned int, npages - i,
+ folio_nr_pages(folio) - folio_page_idx(folio, next));
- mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, 1);
+ *ntails = nr;
+ return folio;
+}
+
+static inline struct folio *gup_folio_next(struct page **list,
+ unsigned long npages, unsigned long i, unsigned int *ntails)
+{
+ struct folio *folio = page_folio(list[i]);
+ unsigned int nr;
+
+ for (nr = i + 1; nr < npages; nr++) {
+ if (page_folio(list[nr]) != folio)
+ break;
+ }
+
+ *ntails = nr - i;
+ return folio;
}
-EXPORT_SYMBOL(unpin_user_page);
/**
* unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
@@ -273,21 +358,18 @@ EXPORT_SYMBOL(unpin_user_page);
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
bool make_dirty)
{
- unsigned long index;
-
- /*
- * TODO: this can be optimized for huge pages: if a series of pages is
- * physically contiguous and part of the same compound page, then a
- * single operation to the head page should suffice.
- */
+ unsigned long i;
+ struct folio *folio;
+ unsigned int nr;
if (!make_dirty) {
unpin_user_pages(pages, npages);
return;
}
- for (index = 0; index < npages; index++) {
- struct page *page = compound_head(pages[index]);
+ sanity_check_pinned_pages(pages, npages);
+ for (i = 0; i < npages; i += nr) {
+ folio = gup_folio_next(pages, npages, i, &nr);
/*
* Checking PageDirty at this point may race with
* clear_page_dirty_for_io(), but that's OK. Two key
@@ -308,14 +390,74 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
* written back, so it gets written back again in the
* next writeback cycle. This is harmless.
*/
- if (!PageDirty(page))
- set_page_dirty_lock(page);
- unpin_user_page(page);
+ if (!folio_test_dirty(folio)) {
+ folio_lock(folio);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ }
+ gup_put_folio(folio, nr, FOLL_PIN);
}
}
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
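A typical user of the batched unpin path above pairs it with one of the pin_user_pages*() calls; the folio-aware loop now drops all pins that land in the same folio with a single gup_put_folio() instead of one unpin_user_page() per page. A minimal caller sketch follows (illustrative names, error handling trimmed), assuming a driver doing write DMA into a user buffer:

static int demo_pin_for_dma(unsigned long uaddr, int nr_pages,
			    struct page **pages)
{
	int pinned;

	pinned = pin_user_pages_fast(uaddr, nr_pages,
				     FOLL_WRITE | FOLL_LONGTERM, pages);
	if (pinned <= 0)
		return pinned ? pinned : -EFAULT;

	/* ... program the device and wait for it to write the buffer ... */

	/* Drop every pin and mark the backing folios dirty. */
	unpin_user_pages_dirty_lock(pages, pinned, true);
	return 0;
}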
/**
+ * unpin_user_page_range_dirty_lock() - release and optionally dirty
+ * gup-pinned page range
+ *
+ * @page: the starting page of a range that may be marked dirty, and is definitely released.
+ * @npages: number of consecutive pages to release.
+ * @make_dirty: whether to mark the pages dirty
+ *
+ * "gup-pinned page range" refers to a range of pages that has had one of the
+ * pin_user_pages() variants called on that page.
+ *
+ * For the page ranges defined by [page .. page+npages], make that range (or
+ * its head pages, if a compound page) dirty, if @make_dirty is true, and if the
+ * page range was previously listed as clean.
+ *
+ * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
+ * required, then the caller should a) verify that this is really correct,
+ * because _lock() is usually required, and b) hand code it:
+ * set_page_dirty_lock(), unpin_user_page().
+ *
+ */
+void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
+ bool make_dirty)
+{
+ unsigned long i;
+ struct folio *folio;
+ unsigned int nr;
+
+ for (i = 0; i < npages; i += nr) {
+ folio = gup_folio_range_next(page, npages, i, &nr);
+ if (make_dirty && !folio_test_dirty(folio)) {
+ folio_lock(folio);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ }
+ gup_put_folio(folio, nr, FOLL_PIN);
+ }
+}
+EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
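The range variant is for callers that pinned physically consecutive pages (for instance a single pinned compound page) and only kept the starting page around. A trivial illustrative wrapper, not part of the patch:

/* Release @nr_pages consecutive pinned pages starting at @first, dirtying them. */
static void demo_unpin_contig_range(struct page *first, unsigned long nr_pages)
{
	unpin_user_page_range_dirty_lock(first, nr_pages, true);
}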
+
+static void unpin_user_pages_lockless(struct page **pages, unsigned long npages)
+{
+ unsigned long i;
+ struct folio *folio;
+ unsigned int nr;
+
+ /*
+ * Don't perform any sanity checks because we might have raced with
+ * fork() and some anonymous pages might now actually be shared --
+ * which is why we're unpinning after all.
+ */
+ for (i = 0; i < npages; i += nr) {
+ folio = gup_folio_next(pages, npages, i, &nr);
+ gup_put_folio(folio, nr, FOLL_PIN);
+ }
+}
+
+/**
* unpin_user_pages() - release an array of gup-pinned pages.
* @pages: array of pages to be marked dirty and released.
* @npages: number of pages in the @pages array.
@@ -326,7 +468,9 @@ EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
*/
void unpin_user_pages(struct page **pages, unsigned long npages)
{
- unsigned long index;
+ unsigned long i;
+ struct folio *folio;
+ unsigned int nr;
/*
* If this WARN_ON() fires, then the system *might* be leaking pages (by
@@ -335,16 +479,26 @@ void unpin_user_pages(struct page **pages, unsigned long npages)
*/
if (WARN_ON(IS_ERR_VALUE(npages)))
return;
- /*
- * TODO: this can be optimized for huge pages: if a series of pages is
- * physically contiguous and part of the same compound page, then a
- * single operation to the head page should suffice.
- */
- for (index = 0; index < npages; index++)
- unpin_user_page(pages[index]);
+
+ sanity_check_pinned_pages(pages, npages);
+ for (i = 0; i < npages; i += nr) {
+ folio = gup_folio_next(pages, npages, i, &nr);
+ gup_put_folio(folio, nr, FOLL_PIN);
+ }
}
EXPORT_SYMBOL(unpin_user_pages);
+/*
+ * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's
+ * lifecycle. Avoid setting the bit unless necessary, or it might cause write
+ * cache bouncing on large SMP machines for concurrent pinned gups.
+ */
+static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
+{
+ if (!test_bit(MMF_HAS_PINNED, mm_flags))
+ set_bit(MMF_HAS_PINNED, mm_flags);
+}
+
#ifdef CONFIG_MMU
static struct page *no_page_table(struct vm_area_struct *vma,
unsigned int flags)
@@ -366,18 +520,15 @@ static struct page *no_page_table(struct vm_area_struct *vma,
static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
pte_t *pte, unsigned int flags)
{
- /* No page to get reference */
- if (flags & FOLL_GET)
- return -EFAULT;
-
if (flags & FOLL_TOUCH) {
- pte_t entry = *pte;
+ pte_t orig_entry = ptep_get(pte);
+ pte_t entry = orig_entry;
if (flags & FOLL_WRITE)
entry = pte_mkdirty(entry);
entry = pte_mkyoung(entry);
- if (!pte_same(*pte, entry)) {
+ if (!pte_same(orig_entry, entry)) {
set_pte_at(vma->vm_mm, address, pte, entry);
update_mmu_cache(vma, address, pte);
}
@@ -387,14 +538,42 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
return -EEXIST;
}
-/*
- * FOLL_FORCE can write to even unwritable pte's, but only
- * after we've gone through a COW cycle and they are dirty.
- */
-static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
+/* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */
+static inline bool can_follow_write_pte(pte_t pte, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned int flags)
{
- return pte_write(pte) ||
- ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
+ /* If the pte is writable, we can write to the page. */
+ if (pte_write(pte))
+ return true;
+
+ /* Maybe FOLL_FORCE is set to override it? */
+ if (!(flags & FOLL_FORCE))
+ return false;
+
+ /* But FOLL_FORCE has no effect on shared mappings */
+ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
+ return false;
+
+ /* ... or read-only private ones */
+ if (!(vma->vm_flags & VM_MAYWRITE))
+ return false;
+
+ /* ... or already writable ones that just need to take a write fault */
+ if (vma->vm_flags & VM_WRITE)
+ return false;
+
+ /*
+ * See can_change_pte_writable(): we broke COW and could map the page
+ * writable if we have an exclusive anonymous page ...
+ */
+ if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+ return false;
+
+ /* ... and a write-fault isn't required for other reasons. */
+ if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte))
+ return false;
+ return !userfaultfd_pte_wp(vma, pte);
}
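The rewritten can_follow_write_pte() means a FOLL_FORCE write only succeeds on a private, COW-broken, exclusive anonymous page; anything else makes GUP fall back to faultin_page() first. A ptrace-style caller therefore still looks like the sketch below. This is illustrative only and assumes the post-series get_user_pages_remote() prototype without a vmas argument; demo_poke() is not part of this patch.

static int demo_poke(struct mm_struct *mm, unsigned long addr, u8 val)
{
	struct page *page;
	int locked = 1;
	void *kaddr;
	long got;

	mmap_read_lock(mm);
	got = get_user_pages_remote(mm, addr & PAGE_MASK, 1,
				    FOLL_FORCE | FOLL_WRITE, &page, &locked);
	if (locked)
		mmap_read_unlock(mm);
	if (got != 1)
		return got < 0 ? (int)got : -EFAULT;

	kaddr = kmap_local_page(page);
	*(u8 *)(kaddr + offset_in_page(addr)) = val;
	kunmap_local(kaddr);

	set_page_dirty_lock(page);
	put_page(page);
	return 0;
}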
static struct page *follow_page_pte(struct vm_area_struct *vma,
@@ -411,38 +590,28 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
(FOLL_PIN | FOLL_GET)))
return ERR_PTR(-EINVAL);
-retry:
- if (unlikely(pmd_bad(*pmd)))
- return no_page_table(vma, flags);
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
- pte = *ptep;
- if (!pte_present(pte)) {
- swp_entry_t entry;
- /*
- * KSM's break_ksm() relies upon recognizing a ksm page
- * even while it is being migrated, so for that case we
- * need migration_entry_wait().
- */
- if (likely(!(flags & FOLL_MIGRATION)))
- goto no_page;
- if (pte_none(pte))
- goto no_page;
- entry = pte_to_swp_entry(pte);
- if (!is_migration_entry(entry))
- goto no_page;
- pte_unmap_unlock(ptep, ptl);
- migration_entry_wait(mm, pmd, address);
- goto retry;
- }
- if ((flags & FOLL_NUMA) && pte_protnone(pte))
+ if (!ptep)
+ return no_page_table(vma, flags);
+ pte = ptep_get(ptep);
+ if (!pte_present(pte))
+ goto no_page;
+ if (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags))
goto no_page;
- if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
- pte_unmap_unlock(ptep, ptl);
- return NULL;
- }
page = vm_normal_page(vma, address, pte);
+
+ /*
+ * We only care about anon pages in can_follow_write_pte() and don't
+ * have to worry about pte_devmap() because they are never anon.
+ */
+ if ((flags & FOLL_WRITE) &&
+ !can_follow_write_pte(pte, page, vma, flags)) {
+ page = NULL;
+ goto out;
+ }
+
if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
/*
* Only return device mapping pages in the FOLL_GET or FOLL_PIN
@@ -470,23 +639,21 @@ retry:
}
}
- if (flags & FOLL_SPLIT && PageTransCompound(page)) {
- get_page(page);
- pte_unmap_unlock(ptep, ptl);
- lock_page(page);
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
- if (ret)
- return ERR_PTR(ret);
- goto retry;
+ if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) {
+ page = ERR_PTR(-EMLINK);
+ goto out;
}
+ VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+ !PageAnonExclusive(page), page);
+
/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
- if (unlikely(!try_grab_page(page, flags))) {
- page = ERR_PTR(-ENOMEM);
+ ret = try_grab_page(page, flags);
+ if (unlikely(ret)) {
+ page = ERR_PTR(ret);
goto out;
}
+
/*
* We need to make the page accessible if and only if we are going
* to access its content (the FOLL_PIN case). Please see
@@ -511,32 +678,6 @@ retry:
*/
mark_page_accessed(page);
}
- if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
- /* Do not mlock pte-mapped THP */
- if (PageTransCompound(page))
- goto out;
-
- /*
- * The preliminary mapping check is mainly to avoid the
- * pointless overhead of lock_page on the ZERO_PAGE
- * which might bounce very badly if there is contention.
- *
- * If the page is already locked, we don't need to
- * handle it now - vmscan will handle it later if and
- * when it attempts to reclaim the page.
- */
- if (page->mapping && trylock_page(page)) {
- lru_add_drain(); /* push cached pages to LRU */
- /*
- * Because we lock page here, and migration is
- * blocked by the pte's page reference, and we
- * know the page is still mapped, we don't even
- * need to check for file-cache page truncation.
- */
- mlock_vma_page(page);
- unlock_page(page);
- }
- }
out:
pte_unmap_unlock(ptep, ptl);
return page;
@@ -558,44 +699,11 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
pmd = pmd_offset(pudp, address);
- /*
- * The READ_ONCE() will stabilize the pmdval in a register or
- * on the stack so that it will stop changing under the code.
- */
- pmdval = READ_ONCE(*pmd);
+ pmdval = pmdp_get_lockless(pmd);
if (pmd_none(pmdval))
return no_page_table(vma, flags);
- if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
- page = follow_huge_pmd(mm, address, pmd, flags);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
- if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
- page = follow_huge_pd(vma, address,
- __hugepd(pmd_val(pmdval)), flags,
- PMD_SHIFT);
- if (page)
- return page;
+ if (!pmd_present(pmdval))
return no_page_table(vma, flags);
- }
-retry:
- if (!pmd_present(pmdval)) {
- if (likely(!(flags & FOLL_MIGRATION)))
- return no_page_table(vma, flags);
- VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(pmdval));
- if (is_pmd_migration_entry(pmdval))
- pmd_migration_entry_wait(mm, pmd);
- pmdval = READ_ONCE(*pmd);
- /*
- * MADV_DONTNEED may convert the pmd to null because
- * mmap_lock is held in read mode
- */
- if (pmd_none(pmdval))
- return no_page_table(vma, flags);
- goto retry;
- }
if (pmd_devmap(pmdval)) {
ptl = pmd_lock(mm, pmd);
page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
@@ -606,54 +714,23 @@ retry:
if (likely(!pmd_trans_huge(pmdval)))
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
- if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
+ if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags))
return no_page_table(vma, flags);
-retry_locked:
ptl = pmd_lock(mm, pmd);
- if (unlikely(pmd_none(*pmd))) {
- spin_unlock(ptl);
- return no_page_table(vma, flags);
- }
if (unlikely(!pmd_present(*pmd))) {
spin_unlock(ptl);
- if (likely(!(flags & FOLL_MIGRATION)))
- return no_page_table(vma, flags);
- pmd_migration_entry_wait(mm, pmd);
- goto retry_locked;
+ return no_page_table(vma, flags);
}
if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(ptl);
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
- if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
- int ret;
- page = pmd_page(*pmd);
- if (is_huge_zero_page(page)) {
- spin_unlock(ptl);
- ret = 0;
- split_huge_pmd(vma, pmd, address);
- if (pmd_trans_unstable(pmd))
- ret = -EBUSY;
- } else if (flags & FOLL_SPLIT) {
- if (unlikely(!try_get_page(page))) {
- spin_unlock(ptl);
- return ERR_PTR(-ENOMEM);
- }
- spin_unlock(ptl);
- lock_page(page);
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
- if (pmd_none(*pmd))
- return no_page_table(vma, flags);
- } else { /* flags & FOLL_SPLIT_PMD */
- spin_unlock(ptl);
- split_huge_pmd(vma, pmd, address);
- ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
- }
-
- return ret ? ERR_PTR(ret) :
+ if (flags & FOLL_SPLIT_PMD) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, address);
+ /* If pmd was left empty, stuff a page table in there quickly */
+ return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) :
follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
page = follow_trans_huge_pmd(vma, address, pmd, flags);
@@ -675,20 +752,6 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
pud = pud_offset(p4dp, address);
if (pud_none(*pud))
return no_page_table(vma, flags);
- if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
- page = follow_huge_pud(mm, address, pud, flags);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
- if (is_hugepd(__hugepd(pud_val(*pud)))) {
- page = follow_huge_pd(vma, address,
- __hugepd(pud_val(*pud)), flags,
- PUD_SHIFT);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
if (pud_devmap(*pud)) {
ptl = pud_lock(mm, pud);
page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
@@ -708,7 +771,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
struct follow_page_context *ctx)
{
p4d_t *p4d;
- struct page *page;
p4d = p4d_offset(pgdp, address);
if (p4d_none(*p4d))
@@ -717,14 +779,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
if (unlikely(p4d_bad(*p4d)))
return no_page_table(vma, flags);
- if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
- page = follow_huge_pd(vma, address,
- __hugepd(p4d_val(*p4d)), flags,
- P4D_SHIFT);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
return follow_pud_mask(vma, address, p4d, flags, ctx);
}
@@ -741,6 +795,11 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
* When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
* the device's dev_pagemap metadata to avoid repeating expensive lookups.
*
+ * When getting an anonymous page and the caller has to trigger unsharing
+ * of a shared anonymous page first, -EMLINK is returned. The caller should
+ * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only
+ * relevant with FOLL_PIN and !FOLL_WRITE.
+ *
* On output, the @ctx->page_mask is set according to the size of the page.
*
* Return: the mapped (struct page *), %NULL if no mapping exists, or
@@ -757,10 +816,18 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
ctx->page_mask = 0;
- /* make this handle hugepd */
- page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
- if (!IS_ERR(page)) {
- WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
+ /*
+ * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
+ * special hugetlb page table walking code. This eliminates the
+ * need to check for hugetlb entries in the general walking code.
+ *
+ * hugetlb_follow_page_mask is only for follow_page() handling here.
+ * Ordinary GUP uses follow_hugetlb_page for hugetlb processing.
+ */
+ if (is_vm_hugetlb_page(vma)) {
+ page = hugetlb_follow_page_mask(vma, address, flags);
+ if (!page)
+ page = no_page_table(vma, flags);
return page;
}
@@ -769,21 +836,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
return no_page_table(vma, flags);
- if (pgd_huge(*pgd)) {
- page = follow_huge_pgd(mm, address, pgd, flags);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
- if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
- page = follow_huge_pd(vma, address,
- __hugepd(pgd_val(*pgd)), flags,
- PGDIR_SHIFT);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
-
return follow_p4d_mask(vma, address, pgd, flags, ctx);
}
@@ -793,6 +845,16 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
struct follow_page_context ctx = { NULL };
struct page *page;
+ if (vma_is_secretmem(vma))
+ return NULL;
+
+ if (WARN_ON_ONCE(foll_flags & FOLL_PIN))
+ return NULL;
+
+ /*
+ * We never set FOLL_HONOR_NUMA_FAULT because callers don't expect
+ * to fail on PROT_NONE-mapped pages.
+ */
page = follow_page_mask(vma, address, foll_flags, &ctx);
if (ctx.pgmap)
put_dev_pagemap(ctx.pgmap);
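With the hunk above, follow_page() refuses FOLL_PIN outright and never honours NUMA-hinting protnone, keeping it a lightweight "peek at the mapping" helper for callers such as KSM and mempolicy. A hedged usage sketch (demo_peek_page() is illustrative, not from the patch):

static struct page *demo_peek_page(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;
	struct page *page = NULL;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	if (vma)
		/* FOLL_GET takes a reference; FOLL_PIN would be rejected. */
		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
	mmap_read_unlock(mm);

	if (IS_ERR_OR_NULL(page))
		return NULL;
	return page;		/* caller must put_page() when done */
}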
@@ -808,6 +870,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ pte_t entry;
int ret = -EFAULT;
/* user gate pages are read-only */
@@ -828,23 +891,24 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return -EFAULT;
- VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map(pmd, address);
- if (pte_none(*pte))
+ if (!pte)
+ return -EFAULT;
+ entry = ptep_get(pte);
+ if (pte_none(entry))
goto unmap;
*vma = get_gate_vma(mm);
if (!page)
goto out;
- *page = vm_normal_page(*vma, address, *pte);
+ *page = vm_normal_page(*vma, address, entry);
if (!*page) {
- if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
+ if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry)))
goto unmap;
- *page = pte_page(*pte);
+ *page = pte_page(entry);
}
- if (unlikely(!try_grab_page(*page, gup_flags))) {
- ret = -ENOMEM;
+ ret = try_grab_page(*page, gup_flags);
+ if (unlikely(ret))
goto unmap;
- }
out:
ret = 0;
unmap:
@@ -853,25 +917,34 @@ unmap:
}
/*
- * mmap_lock must be held on entry. If @locked != NULL and *@flags
- * does not include FOLL_NOWAIT, the mmap_lock may be released. If it
- * is, *@locked will be set to 0 and -EBUSY returned.
+ * mmap_lock must be held on entry. If @flags has FOLL_UNLOCKABLE but not
+ * FOLL_NOWAIT, the mmap_lock may be released. If it is, *@locked will be set
+ * to 0 and -EBUSY returned.
*/
static int faultin_page(struct vm_area_struct *vma,
- unsigned long address, unsigned int *flags, int *locked)
+ unsigned long address, unsigned int *flags, bool unshare,
+ int *locked)
{
unsigned int fault_flags = 0;
vm_fault_t ret;
- /* mlock all present pages, but do not fault in new pages */
- if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
- return -ENOENT;
+ if (*flags & FOLL_NOFAULT)
+ return -EFAULT;
if (*flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
if (*flags & FOLL_REMOTE)
fault_flags |= FAULT_FLAG_REMOTE;
- if (locked)
+ if (*flags & FOLL_UNLOCKABLE) {
fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ /*
+ * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set
+ * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE.
+ * That's because some callers may not be prepared to
+ * handle early exits caused by non-fatal signals.
+ */
+ if (*flags & FOLL_INTERRUPTIBLE)
+ fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
+ }
if (*flags & FOLL_NOWAIT)
fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
if (*flags & FOLL_TRIED) {
@@ -881,8 +954,32 @@ static int faultin_page(struct vm_area_struct *vma,
*/
fault_flags |= FAULT_FLAG_TRIED;
}
+ if (unshare) {
+ fault_flags |= FAULT_FLAG_UNSHARE;
+ /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
+ VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE);
+ }
ret = handle_mm_fault(vma, address, fault_flags, NULL);
+
+ if (ret & VM_FAULT_COMPLETED) {
+ /*
+ * With FAULT_FLAG_RETRY_NOWAIT we'll never release the
+ * mmap lock in the page fault handler. Sanity check this.
+ */
+ WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
+ *locked = 0;
+
+ /*
+ * We should do the same as VM_FAULT_RETRY, but let's not
+ * return -EBUSY since that's not reflecting the reality of
+ * what has happened - we've just fully completed a page
+ * fault, with the mmap lock released. Use -EAGAIN to show
+ * that we want to take the mmap lock _again_.
+ */
+ return -EAGAIN;
+ }
+
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, *flags);
@@ -892,23 +989,49 @@ static int faultin_page(struct vm_area_struct *vma,
}
if (ret & VM_FAULT_RETRY) {
- if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
+ if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
*locked = 0;
return -EBUSY;
}
+ return 0;
+}
+
+/*
+ * Writing to file-backed mappings which require folio dirty tracking using GUP
+ * is a fundamentally broken operation, as kernel write access to GUP mappings
+ * do not adhere to the semantics expected by a file system.
+ *
+ * Consider the following scenario:-
+ *
+ * 1. A folio is written to via GUP which write-faults the memory, notifying
+ * the file system and dirtying the folio.
+ * 2. Later, writeback is triggered, resulting in the folio being cleaned and
+ * the PTE being marked read-only.
+ * 3. The GUP caller writes to the folio, as it is mapped read/write via the
+ * direct mapping.
+ * 4. The GUP caller, now done with the page, unpins it and sets it dirty
+ * (though it does not have to).
+ *
+ * This results in both data being written to a folio without writenotify, and
+ * the folio being dirtied unexpectedly (if the caller decides to do so).
+ */
+static bool writable_file_mapping_allowed(struct vm_area_struct *vma,
+ unsigned long gup_flags)
+{
/*
- * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
- * necessary, even if maybe_mkwrite decided not to set pte_write. We
- * can thus safely do subsequent page lookups as if they were reads.
- * But only do so when looping for pte_write is futile: in some cases
- * userspace may also be wanting to write to the gotten user page,
- * which a read fault here might prevent (a readonly page might get
- * reCOWed by userspace write).
+ * If we aren't pinning then no problematic write can occur. A long term
+ * pin is the most egregious case so this is the case we disallow.
*/
- if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
- *flags |= FOLL_COW;
- return 0;
+ if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) !=
+ (FOLL_PIN | FOLL_LONGTERM))
+ return true;
+
+ /*
+ * If the VMA does not require dirty tracking then no problematic write
+ * can occur either.
+ */
+ return !vma_needs_dirty_tracking(vma);
}
static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
@@ -916,17 +1039,31 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
vm_flags_t vm_flags = vma->vm_flags;
int write = (gup_flags & FOLL_WRITE);
int foreign = (gup_flags & FOLL_REMOTE);
+ bool vma_anon = vma_is_anonymous(vma);
if (vm_flags & (VM_IO | VM_PFNMAP))
return -EFAULT;
- if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
+ if ((gup_flags & FOLL_ANON) && !vma_anon)
+ return -EFAULT;
+
+ if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
+ return -EOPNOTSUPP;
+
+ if (vma_is_secretmem(vma))
return -EFAULT;
if (write) {
+ if (!vma_anon &&
+ !writable_file_mapping_allowed(vma, gup_flags))
+ return -EFAULT;
+
if (!(vm_flags & VM_WRITE)) {
if (!(gup_flags & FOLL_FORCE))
return -EFAULT;
+ /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */
+ if (is_vm_hugetlb_page(vma))
+ return -EFAULT;
/*
* We used to let the write,force case do COW in a
* VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
@@ -958,6 +1095,45 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
return 0;
}
+/*
+ * This is "vma_lookup()", but with a warning if we would have
+ * historically expanded the stack in the GUP code.
+ */
+static struct vm_area_struct *gup_vma_lookup(struct mm_struct *mm,
+ unsigned long addr)
+{
+#ifdef CONFIG_STACK_GROWSUP
+ return vma_lookup(mm, addr);
+#else
+ static volatile unsigned long next_warn;
+ struct vm_area_struct *vma;
+ unsigned long now, next;
+
+ vma = find_vma(mm, addr);
+ if (!vma || (addr >= vma->vm_start))
+ return vma;
+
+ /* Only warn for half-way relevant accesses */
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ return NULL;
+ if (vma->vm_start - addr > 65536)
+ return NULL;
+
+ /* Let's not warn more than once an hour.. */
+ now = jiffies; next = next_warn;
+ if (next && time_before(now, next))
+ return NULL;
+ next_warn = now + 60*60*HZ;
+
+ /* Let people know things may have changed. */
+ pr_warn("GUP no longer grows the stack in %s (%d): %lx-%lx (%lx)\n",
+ current->comm, task_pid_nr(current),
+ vma->vm_start, vma->vm_end, addr);
+ dump_stack();
+ return NULL;
+#endif
+}
+
/**
* __get_user_pages() - pin user pages in memory
* @mm: mm_struct of target mm
@@ -967,8 +1143,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
* @locked: whether we're still with the mmap_lock held
*
* Returns either number of pages pinned (which may be less than the
@@ -982,8 +1156,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
*
* The caller is responsible for releasing returned @pages, via put_page().
*
- * @vmas are valid only as long as mmap_lock is held.
- *
* Must be called with mmap_lock held. It may be released. See below.
*
* __get_user_pages walks a process's page tables and takes a reference to
@@ -994,7 +1166,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* This does not guarantee that the page exists in the user mappings when
* __get_user_pages returns, and there may even be a completely different
* page there in some cases (eg. if mmapped pagecache has been invalidated
- * and subsequently re faulted). However it does guarantee that the page
+ * and subsequently re-faulted). However it does guarantee that the page
* won't be freed completely. And mostly callers simply care that the page
* contains data that was valid *at some point in time*. Typically, an IO
* or similar operation cannot guarantee anything stronger anyway because
@@ -1005,14 +1177,12 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* appropriate) must be called after the page is finished with, and
* before put_page is called.
*
- * If @locked != NULL, *@locked will be set to 0 when mmap_lock is
- * released by an up_read(). That can happen if @gup_flags does not
- * have FOLL_NOWAIT.
+ * If FOLL_UNLOCKABLE is set without FOLL_NOWAIT then the mmap_lock may
+ * be released. If this happens *@locked will be set to 0 on return.
*
- * A caller using such a combination of @locked and @gup_flags
- * must therefore hold the mmap_lock for reading only, and recognize
- * when it's been released. Otherwise, it must be held for either
- * reading or writing and will not be released.
+ * A caller using such a combination of @gup_flags must therefore hold the
+ * mmap_lock for reading only, and recognize when it's been released. Otherwise,
+ * it must be held for either reading or writing and will not be released.
*
* In most cases, get_user_pages or get_user_pages_fast should be used
* instead of __get_user_pages. __get_user_pages should be used only if
@@ -1021,7 +1191,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
static long __get_user_pages(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
+ int *locked)
{
long ret = 0, i = 0;
struct vm_area_struct *vma = NULL;
@@ -1030,18 +1200,10 @@ static long __get_user_pages(struct mm_struct *mm,
if (!nr_pages)
return 0;
- start = untagged_addr(start);
+ start = untagged_addr_remote(mm, start);
VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
- /*
- * If FOLL_FORCE is set then do not force a full fault as the hinting
- * fault information is unrelated to the reference behaviour of a task
- * using the address space
- */
- if (!(gup_flags & FOLL_FORCE))
- gup_flags |= FOLL_NUMA;
-
do {
struct page *page;
unsigned int foll_flags = gup_flags;
@@ -1049,7 +1211,7 @@ static long __get_user_pages(struct mm_struct *mm,
/* first iteration or cross vma bound */
if (!vma || start >= vma->vm_end) {
- vma = find_extend_vma(mm, start);
+ vma = gup_vma_lookup(mm, start);
if (!vma && in_gate_area(mm, start)) {
ret = get_gate_page(mm, start & PAGE_MASK,
gup_flags, &vma,
@@ -1060,22 +1222,25 @@ static long __get_user_pages(struct mm_struct *mm,
goto next_page;
}
- if (!vma || check_vma_flags(vma, gup_flags)) {
+ if (!vma) {
ret = -EFAULT;
goto out;
}
+ ret = check_vma_flags(vma, gup_flags);
+ if (ret)
+ goto out;
+
if (is_vm_hugetlb_page(vma)) {
- i = follow_hugetlb_page(mm, vma, pages, vmas,
- &start, &nr_pages, i,
- gup_flags, locked);
- if (locked && *locked == 0) {
+ i = follow_hugetlb_page(mm, vma, pages,
+ &start, &nr_pages, i,
+ gup_flags, locked);
+ if (!*locked) {
/*
* We've got a VM_FAULT_RETRY
* and we've lost mmap_lock.
* We must stop here.
*/
BUG_ON(gup_flags & FOLL_NOWAIT);
- BUG_ON(ret != 0);
goto out;
}
continue;
@@ -1093,27 +1258,34 @@ retry:
cond_resched();
page = follow_page_mask(vma, start, foll_flags, &ctx);
- if (!page) {
- ret = faultin_page(vma, start, &foll_flags, locked);
+ if (!page || PTR_ERR(page) == -EMLINK) {
+ ret = faultin_page(vma, start, &foll_flags,
+ PTR_ERR(page) == -EMLINK, locked);
switch (ret) {
case 0:
goto retry;
case -EBUSY:
+ case -EAGAIN:
ret = 0;
fallthrough;
case -EFAULT:
case -ENOMEM:
case -EHWPOISON:
goto out;
- case -ENOENT:
- goto next_page;
}
BUG();
} else if (PTR_ERR(page) == -EEXIST) {
/*
* Proper page table entry exists, but no corresponding
- * struct page.
+ * struct page. If the caller expects **pages to be
+ * filled in, bail out now, because that can't be done
+ * for this page.
*/
+ if (pages) {
+ ret = PTR_ERR(page);
+ goto out;
+ }
+
goto next_page;
} else if (IS_ERR(page)) {
ret = PTR_ERR(page);
@@ -1126,10 +1298,6 @@ retry:
ctx.page_mask = 0;
}
next_page:
- if (vmas) {
- vmas[i] = vma;
- ctx.page_mask = 0;
- }
page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
if (page_increm > nr_pages)
page_increm = nr_pages;
@@ -1200,16 +1368,16 @@ int fixup_user_fault(struct mm_struct *mm,
bool *unlocked)
{
struct vm_area_struct *vma;
- vm_fault_t ret, major = 0;
+ vm_fault_t ret;
- address = untagged_addr(address);
+ address = untagged_addr_remote(mm, address);
if (unlocked)
fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
retry:
- vma = find_extend_vma(mm, address);
- if (!vma || address < vma->vm_start)
+ vma = gup_vma_lookup(mm, address);
+ if (!vma)
return -EFAULT;
if (!vma_permits_fault(vma, fault_flags))
@@ -1220,7 +1388,18 @@ retry:
return -EINTR;
ret = handle_mm_fault(vma, address, fault_flags, NULL);
- major |= ret & VM_FAULT_MAJOR;
+
+ if (ret & VM_FAULT_COMPLETED) {
+ /*
+ * NOTE: it's a pity that we need to retake the lock here
+ * to pair with the unlock() in the callers. Ideally we
+ * could tell the callers so they do not need to unlock.
+ */
+ mmap_read_lock(mm);
+ *unlocked = true;
+ return 0;
+ }
+
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, 0);
@@ -1241,29 +1420,59 @@ retry:
EXPORT_SYMBOL_GPL(fixup_user_fault);
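fixup_user_fault() now treats VM_FAULT_COMPLETED like a successful retry: it retakes the mmap_lock, reports *unlocked = true and returns 0, so existing callers keep working unchanged. The canonical calling pattern (cf. futex) is roughly the sketch below; demo_fault_in() is illustrative only:

static int demo_fault_in(struct mm_struct *mm, unsigned long address,
			 bool write)
{
	bool unlocked = false;
	int ret;

	mmap_read_lock(mm);
	ret = fixup_user_fault(mm, address,
			       write ? FAULT_FLAG_WRITE : 0, &unlocked);
	/*
	 * On return the lock is held again even if it was dropped;
	 * "unlocked" only records that a retry happened in between.
	 */
	mmap_read_unlock(mm);

	return ret;	/* 0 on success, -EFAULT/-ENOMEM/... on error */
}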
/*
- * Please note that this function, unlike __get_user_pages will not
- * return 0 for nr_pages > 0 without FOLL_NOWAIT
+ * GUP always responds to fatal signals. When FOLL_INTERRUPTIBLE is
+ * specified, it'll also respond to generic signals. The caller of GUP
+ * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption.
+ */
+static bool gup_signal_pending(unsigned int flags)
+{
+ if (fatal_signal_pending(current))
+ return true;
+
+ if (!(flags & FOLL_INTERRUPTIBLE))
+ return false;
+
+ return signal_pending(current);
+}
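gup_signal_pending() is what lets a GUP caller opt in to being interrupted by ordinary signals: fatal signals always stop the retry loop, and FOLL_INTERRUPTIBLE additionally honours non-fatal ones. KVM is the motivating user; a hedged sketch of that style of caller (demo_* names are illustrative):

static long demo_gup_interruptible(unsigned long uaddr, struct page **page)
{
	/*
	 * May return -EINTR if a non-fatal signal arrives mid-fault;
	 * the caller bails out to deliver the signal and retries later.
	 */
	return get_user_pages_unlocked(uaddr, 1, page,
				       FOLL_WRITE | FOLL_INTERRUPTIBLE);
}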
+
+/*
+ * Locking: (*locked == 1) means that the mmap_lock has already been acquired by
+ * the caller. This function may drop the mmap_lock. If it does so, then it will
+ * set (*locked = 0).
+ *
+ * (*locked == 0) means that the caller expects this function to acquire and
+ * drop the mmap_lock. Therefore, the value of *locked will still be zero when
+ * the function returns, even though it may have changed temporarily during
+ * function execution.
+ *
+ * Please note that this function, unlike __get_user_pages(), will not return 0
+ * for nr_pages > 0, unless FOLL_NOWAIT is used.
*/
static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
- struct vm_area_struct **vmas,
int *locked,
unsigned int flags)
{
long ret, pages_done;
- bool lock_dropped;
+ bool must_unlock = false;
- if (locked) {
- /* if VM_FAULT_RETRY can be returned, vmas become invalid */
- BUG_ON(vmas);
- /* check caller initialized locked */
- BUG_ON(*locked != 1);
+ /*
+ * The internal caller expects GUP to manage the lock internally and the
+ * lock must be released when this returns.
+ */
+ if (!*locked) {
+ if (mmap_read_lock_killable(mm))
+ return -EAGAIN;
+ must_unlock = true;
+ *locked = 1;
}
+ else
+ mmap_assert_locked(mm);
if (flags & FOLL_PIN)
- atomic_set(&mm->has_pinned, 1);
+ mm_set_has_pinned_flag(&mm->flags);
/*
* FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
@@ -1278,15 +1487,16 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
flags |= FOLL_GET;
pages_done = 0;
- lock_dropped = false;
for (;;) {
ret = __get_user_pages(mm, start, nr_pages, flags, pages,
- vmas, locked);
- if (!locked)
+ locked);
+ if (!(flags & FOLL_UNLOCKABLE)) {
/* VM_FAULT_RETRY couldn't trigger, bypass */
- return ret;
+ pages_done = ret;
+ break;
+ }
- /* VM_FAULT_RETRY cannot return errors */
+ /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
if (!*locked) {
BUG_ON(ret < 0);
BUG_ON(ret >= nr_pages);
@@ -1314,18 +1524,20 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
if (likely(pages))
pages += ret;
start += ret << PAGE_SHIFT;
- lock_dropped = true;
+
+ /* The lock was temporarily dropped, so we must unlock later */
+ must_unlock = true;
retry:
/*
* Repeat on the address that fired VM_FAULT_RETRY
* with both FAULT_FLAG_ALLOW_RETRY and
* FAULT_FLAG_TRIED. Note that GUP can be interrupted
- * by fatal signals, so we need to check it before we
+		 * by fatal signals or even common signals, depending on
+ * the caller's request. So we need to check it before we
* start trying again otherwise it can loop forever.
*/
-
- if (fatal_signal_pending(current)) {
+ if (gup_signal_pending(flags)) {
if (!pages_done)
pages_done = -EINTR;
break;
@@ -1341,7 +1553,7 @@ retry:
*locked = 1;
ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
- pages, NULL, locked);
+ pages, locked);
if (!*locked) {
/* Continue to retry until we succeeded */
BUG_ON(ret != 0);
@@ -1361,10 +1573,11 @@ retry:
pages++;
start += PAGE_SIZE;
}
- if (lock_dropped && *locked) {
+ if (must_unlock && *locked) {
/*
- * We must let the caller know we temporarily dropped the lock
- * and so the critical section protected by it was lost.
+ * We either temporarily dropped the lock, or the caller
+ * requested that we both acquire and drop the lock. Either way,
+ * we must now unlock, and notify the caller of that state.
*/
mmap_read_unlock(mm);
*locked = 0;
@@ -1397,17 +1610,24 @@ long populate_vma_page_range(struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
unsigned long nr_pages = (end - start) / PAGE_SIZE;
+ int local_locked = 1;
int gup_flags;
+ long ret;
- VM_BUG_ON(start & ~PAGE_MASK);
- VM_BUG_ON(end & ~PAGE_MASK);
+ VM_BUG_ON(!PAGE_ALIGNED(start));
+ VM_BUG_ON(!PAGE_ALIGNED(end));
VM_BUG_ON_VMA(start < vma->vm_start, vma);
VM_BUG_ON_VMA(end > vma->vm_end, vma);
mmap_assert_locked(mm);
- gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
+ /*
+ * Rightly or wrongly, the VM_LOCKONFAULT case has never used
+ * faultin_page() to break COW, so it has no work to do here.
+ */
if (vma->vm_flags & VM_LOCKONFAULT)
- gup_flags &= ~FOLL_POPULATE;
+ return nr_pages;
+
+ gup_flags = FOLL_TOUCH;
/*
* We want to touch writable mappings with a write fault in order
* to break COW, except for shared mappings because these don't COW
@@ -1423,12 +1643,75 @@ long populate_vma_page_range(struct vm_area_struct *vma,
if (vma_is_accessible(vma))
gup_flags |= FOLL_FORCE;
+ if (locked)
+ gup_flags |= FOLL_UNLOCKABLE;
+
/*
* We made sure addr is within a VMA, so the following will
* not result in a stack expansion that recurses back here.
*/
- return __get_user_pages(mm, start, nr_pages, gup_flags,
- NULL, NULL, locked);
+ ret = __get_user_pages(mm, start, nr_pages, gup_flags,
+ NULL, locked ? locked : &local_locked);
+ lru_add_drain();
+ return ret;
+}
+
+/*
+ * faultin_vma_page_range() - populate (prefault) page tables inside the
+ * given VMA range readable/writable
+ *
+ * This takes care of mlocking the pages, too, if VM_LOCKED is set.
+ *
+ * @vma: target vma
+ * @start: start address
+ * @end: end address
+ * @write: whether to prefault readable or writable
+ * @locked: whether the mmap_lock is still held
+ *
+ * Returns either number of processed pages in the vma, or a negative error
+ * code on error (see __get_user_pages()).
+ *
+ * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and
+ * covered by the VMA. If it's released, *@locked will be set to 0.
+ */
+long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, bool write, int *locked)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long nr_pages = (end - start) / PAGE_SIZE;
+ int gup_flags;
+ long ret;
+
+ VM_BUG_ON(!PAGE_ALIGNED(start));
+ VM_BUG_ON(!PAGE_ALIGNED(end));
+ VM_BUG_ON_VMA(start < vma->vm_start, vma);
+ VM_BUG_ON_VMA(end > vma->vm_end, vma);
+ mmap_assert_locked(mm);
+
+ /*
+ * FOLL_TOUCH: Mark page accessed and thereby young; will also mark
+ * the page dirty with FOLL_WRITE -- which doesn't make a
+ * difference with !FOLL_FORCE, because the page is writable
+ * in the page table.
+ * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
+ * a poisoned page.
+ * !FOLL_FORCE: Require proper access permissions.
+ */
+ gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE;
+ if (write)
+ gup_flags |= FOLL_WRITE;
+
+ /*
+ * We want to report -EINVAL instead of -EFAULT for any permission
+ * problems or incompatible mappings.
+ */
+ if (check_vma_flags(vma, gup_flags))
+ return -EINVAL;
+
+ ret = __get_user_pages(mm, start, nr_pages, gup_flags,
+ NULL, locked);
+ lru_add_drain();
+ return ret;
}
/*
@@ -1456,10 +1739,11 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
if (!locked) {
locked = 1;
mmap_read_lock(mm);
- vma = find_vma(mm, nstart);
+ vma = find_vma_intersection(mm, nstart, end);
} else if (nstart >= vma->vm_end)
- vma = vma->vm_next;
- if (!vma || vma->vm_start >= end)
+ vma = find_vma_intersection(mm, vma->vm_end, end);
+
+ if (!vma)
break;
/*
* Set [nstart; nend) to intersection of desired address
@@ -1490,44 +1774,29 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
mmap_read_unlock(mm);
return ret; /* 0 or negative error code */
}
-
-/**
- * get_dump_page() - pin user page in memory while writing it to core dump
- * @addr: user address
- *
- * Returns struct page pointer of user page pinned for dump,
- * to be freed afterwards by put_page().
- *
- * Returns NULL on any kind of failure - a hole must then be inserted into
- * the corefile, to preserve alignment with its headers; and also returns
- * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
- * allowing a hole to be left in the corefile to save diskspace.
- *
- * Called without mmap_lock, but after all other threads have been killed.
- */
-#ifdef CONFIG_ELF_CORE
-struct page *get_dump_page(unsigned long addr)
-{
- struct vm_area_struct *vma;
- struct page *page;
-
- if (__get_user_pages(current->mm, addr, 1,
- FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
- NULL) < 1)
- return NULL;
- flush_cache_page(vma, addr, page_to_pfn(page));
- return page;
-}
-#endif /* CONFIG_ELF_CORE */
#else /* CONFIG_MMU */
static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
unsigned long nr_pages, struct page **pages,
- struct vm_area_struct **vmas, int *locked,
- unsigned int foll_flags)
+ int *locked, unsigned int foll_flags)
{
struct vm_area_struct *vma;
+ bool must_unlock = false;
unsigned long vm_flags;
- int i;
+ long i;
+
+ if (!nr_pages)
+ return 0;
+
+ /*
+ * The internal caller expects GUP to manage the lock internally and the
+ * lock must be released when this returns.
+ */
+ if (!*locked) {
+ if (mmap_read_lock_killable(mm))
+ return -EAGAIN;
+ must_unlock = true;
+ *locked = 1;
+ }
/* calculate required read or write permissions.
* If FOLL_FORCE is set, we only require the "MAY" flags.
@@ -1540,153 +1809,363 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
for (i = 0; i < nr_pages; i++) {
vma = find_vma(mm, start);
if (!vma)
- goto finish_or_fault;
+ break;
/* protect what we can, including chardevs */
if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
!(vm_flags & vma->vm_flags))
- goto finish_or_fault;
+ break;
if (pages) {
- pages[i] = virt_to_page(start);
+ pages[i] = virt_to_page((void *)start);
if (pages[i])
get_page(pages[i]);
}
- if (vmas)
- vmas[i] = vma;
+
start = (start + PAGE_SIZE) & PAGE_MASK;
}
- return i;
+ if (must_unlock && *locked) {
+ mmap_read_unlock(mm);
+ *locked = 0;
+ }
-finish_or_fault:
return i ? : -EFAULT;
}
#endif /* !CONFIG_MMU */
-#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
-static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
+/**
+ * fault_in_writeable - fault in userspace address range for writing
+ * @uaddr: start of address range
+ * @size: size of address range
+ *
+ * Returns the number of bytes not faulted in (like copy_to_user() and
+ * copy_from_user()).
+ */
+size_t fault_in_writeable(char __user *uaddr, size_t size)
+{
+ char __user *start = uaddr, *end;
+
+ if (unlikely(size == 0))
+ return 0;
+ if (!user_write_access_begin(uaddr, size))
+ return size;
+ if (!PAGE_ALIGNED(uaddr)) {
+ unsafe_put_user(0, uaddr, out);
+ uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
+ }
+ end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
+ if (unlikely(end < start))
+ end = NULL;
+ while (uaddr != end) {
+ unsafe_put_user(0, uaddr, out);
+ uaddr += PAGE_SIZE;
+ }
+
+out:
+ user_write_access_end();
+ if (size > uaddr - start)
+ return size - (uaddr - start);
+ return 0;
+}
+EXPORT_SYMBOL(fault_in_writeable);
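/*
 * A minimal usage sketch (illustration only; assumes <linux/uaccess.h> and a
 * caller-provided lock): pre-fault the destination, then copy with page
 * faults disabled while the lock is held. copy_to_user_nofault() returns 0
 * on success and -EFAULT if any byte could not be copied, in which case the
 * buffer is faulted in again and the copy retried. fault_in_writeable() may
 * write zero bytes into the buffer, which is acceptable here because the
 * buffer is about to be overwritten anyway.
 */
static int copy_out_under_lock(char __user *ubuf, const void *kbuf,
			       size_t len, spinlock_t *lock)
{
	long err;

	do {
		/* Nothing could be faulted in at all? Give up. */
		if (fault_in_writeable(ubuf, len) == len)
			return -EFAULT;

		spin_lock(lock);
		err = copy_to_user_nofault(ubuf, kbuf, len);
		spin_unlock(lock);
		/* -EFAULT: the pages went away again (e.g. reclaim); retry. */
	} while (err == -EFAULT);

	return err;
}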
+
+/**
+ * fault_in_subpage_writeable - fault in an address range for writing
+ * @uaddr: start of address range
+ * @size: size of address range
+ *
+ * Fault in a user address range for writing while checking for permissions at
+ * sub-page granularity (e.g. arm64 MTE). This function should be used when
+ * the caller cannot guarantee forward progress of a copy_to_user() loop.
+ *
+ * Returns the number of bytes not faulted in (like copy_to_user() and
+ * copy_from_user()).
+ */
+size_t fault_in_subpage_writeable(char __user *uaddr, size_t size)
{
- long i;
- struct vm_area_struct *vma_prev = NULL;
+ size_t faulted_in;
- for (i = 0; i < nr_pages; i++) {
- struct vm_area_struct *vma = vmas[i];
+ /*
+ * Attempt faulting in at page granularity first for page table
+ * permission checking. The arch-specific probe_subpage_writeable()
+ * functions may not check for this.
+ */
+ faulted_in = size - fault_in_writeable(uaddr, size);
+ if (faulted_in)
+ faulted_in -= probe_subpage_writeable(uaddr, faulted_in);
- if (vma == vma_prev)
- continue;
+ return size - faulted_in;
+}
+EXPORT_SYMBOL(fault_in_subpage_writeable);
+
+/*
+ * fault_in_safe_writeable - fault in an address range for writing
+ * @uaddr: start of address range
+ * @size: length of address range
+ *
+ * Faults in an address range for writing. This is primarily useful when we
+ * already know that some or all of the pages in the address range aren't in
+ * memory.
+ *
+ * Unlike fault_in_writeable(), this function is non-destructive.
+ *
+ * Note that we don't pin or otherwise hold the pages referenced that we fault
+ * in. There's no guarantee that they'll stay in memory for any duration of
+ * time.
+ *
+ * Returns the number of bytes not faulted in, like copy_to_user() and
+ * copy_from_user().
+ */
+size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
+{
+ unsigned long start = (unsigned long)uaddr, end;
+ struct mm_struct *mm = current->mm;
+ bool unlocked = false;
+
+ if (unlikely(size == 0))
+ return 0;
+ end = PAGE_ALIGN(start + size);
+ if (end < start)
+ end = 0;
+
+ mmap_read_lock(mm);
+ do {
+ if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
+ break;
+ start = (start + PAGE_SIZE) & PAGE_MASK;
+ } while (start != end);
+ mmap_read_unlock(mm);
- vma_prev = vma;
+ if (size > (unsigned long)uaddr - start)
+ return size - ((unsigned long)uaddr - start);
+ return 0;
+}
+EXPORT_SYMBOL(fault_in_safe_writeable);
- if (vma_is_fsdax(vma))
- return true;
+/**
+ * fault_in_readable - fault in userspace address range for reading
+ * @uaddr: start of user address range
+ * @size: size of user address range
+ *
+ * Returns the number of bytes not faulted in (like copy_to_user() and
+ * copy_from_user()).
+ */
+size_t fault_in_readable(const char __user *uaddr, size_t size)
+{
+ const char __user *start = uaddr, *end;
+ volatile char c;
+
+ if (unlikely(size == 0))
+ return 0;
+ if (!user_read_access_begin(uaddr, size))
+ return size;
+ if (!PAGE_ALIGNED(uaddr)) {
+ unsafe_get_user(c, uaddr, out);
+ uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
+ }
+ end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
+ if (unlikely(end < start))
+ end = NULL;
+ while (uaddr != end) {
+ unsafe_get_user(c, uaddr, out);
+ uaddr += PAGE_SIZE;
}
- return false;
+
+out:
+ user_read_access_end();
+ (void)c;
+ if (size > uaddr - start)
+ return size - (uaddr - start);
+ return 0;
+}
+EXPORT_SYMBOL(fault_in_readable);
+
+/**
+ * get_dump_page() - pin user page in memory while writing it to core dump
+ * @addr: user address
+ *
+ * Returns struct page pointer of user page pinned for dump,
+ * to be freed afterwards by put_page().
+ *
+ * Returns NULL on any kind of failure - a hole must then be inserted into
+ * the corefile, to preserve alignment with its headers; and also returns
+ * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
+ * allowing a hole to be left in the corefile to save disk space.
+ *
+ * Called without mmap_lock (takes and releases the mmap_lock by itself).
+ */
+#ifdef CONFIG_ELF_CORE
+struct page *get_dump_page(unsigned long addr)
+{
+ struct page *page;
+ int locked = 0;
+ int ret;
+
+ ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked,
+ FOLL_FORCE | FOLL_DUMP | FOLL_GET);
+ return (ret == 1) ? page : NULL;
}
+#endif /* CONFIG_ELF_CORE */
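/*
 * A condensed caller sketch (illustration only): emit_bytes() and emit_hole()
 * are made-up stand-ins for the real coredump output helpers. Pages that
 * cannot be pinned become holes, which keeps the core file offsets aligned
 * with the headers as described above.
 */
static int dump_user_range_sketch(unsigned long start, unsigned long len)
{
	unsigned long addr;

	for (addr = start; addr < start + len; addr += PAGE_SIZE) {
		struct page *page = get_dump_page(addr);
		int ok;

		if (!page) {
			ok = emit_hole(PAGE_SIZE);
		} else {
			void *kaddr = kmap_local_page(page);

			ok = emit_bytes(kaddr, PAGE_SIZE);
			kunmap_local(kaddr);
			put_page(page);
		}
		if (!ok)
			return -EIO;
	}
	return 0;
}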
-#ifdef CONFIG_CMA
-static long check_and_migrate_cma_pages(struct mm_struct *mm,
- unsigned long start,
+#ifdef CONFIG_MIGRATION
+/*
+ * Returns the number of collected pages. Return value is always >= 0.
+ */
+static unsigned long collect_longterm_unpinnable_pages(
+ struct list_head *movable_page_list,
unsigned long nr_pages,
- struct page **pages,
- struct vm_area_struct **vmas,
- unsigned int gup_flags)
+ struct page **pages)
{
- unsigned long i;
- unsigned long step;
+ unsigned long i, collected = 0;
+ struct folio *prev_folio = NULL;
bool drain_allow = true;
- bool migrate_allow = true;
- LIST_HEAD(cma_page_list);
- long ret = nr_pages;
- struct migration_target_control mtc = {
- .nid = NUMA_NO_NODE,
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN,
- };
-check_again:
- for (i = 0; i < nr_pages;) {
+ for (i = 0; i < nr_pages; i++) {
+ struct folio *folio = page_folio(pages[i]);
- struct page *head = compound_head(pages[i]);
+ if (folio == prev_folio)
+ continue;
+ prev_folio = folio;
- /*
- * gup may start from a tail page. Advance step by the left
- * part.
- */
- step = compound_nr(head) - (pages[i] - head);
- /*
- * If we get a page from the CMA zone, since we are going to
- * be pinning these entries, we might as well move them out
- * of the CMA zone if possible.
- */
- if (is_migrate_cma_page(head)) {
- if (PageHuge(head))
- isolate_huge_page(head, &cma_page_list);
- else {
- if (!PageLRU(head) && drain_allow) {
- lru_add_drain_all();
- drain_allow = false;
- }
+ if (folio_is_longterm_pinnable(folio))
+ continue;
- if (!isolate_lru_page(head)) {
- list_add_tail(&head->lru, &cma_page_list);
- mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON +
- page_is_file_lru(head),
- thp_nr_pages(head));
- }
- }
+ collected++;
+
+ if (folio_is_device_coherent(folio))
+ continue;
+
+ if (folio_test_hugetlb(folio)) {
+ isolate_hugetlb(folio, movable_page_list);
+ continue;
+ }
+
+ if (!folio_test_lru(folio) && drain_allow) {
+ lru_add_drain_all();
+ drain_allow = false;
}
- i += step;
+ if (!folio_isolate_lru(folio))
+ continue;
+
+ list_add_tail(&folio->lru, movable_page_list);
+ node_stat_mod_folio(folio,
+ NR_ISOLATED_ANON + folio_is_file_lru(folio),
+ folio_nr_pages(folio));
}
- if (!list_empty(&cma_page_list)) {
- /*
- * drop the above get_user_pages reference.
- */
- for (i = 0; i < nr_pages; i++)
- put_page(pages[i]);
+ return collected;
+}
- if (migrate_pages(&cma_page_list, alloc_migration_target, NULL,
- (unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
+/*
+ * Unpins all pages and migrates device coherent pages and movable_page_list.
+ * Returns -EAGAIN if all pages were successfully migrated or -errno for failure
+ * (or partial success).
+ */
+static int migrate_longterm_unpinnable_pages(
+ struct list_head *movable_page_list,
+ unsigned long nr_pages,
+ struct page **pages)
+{
+ int ret;
+ unsigned long i;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct folio *folio = page_folio(pages[i]);
+
+ if (folio_is_device_coherent(folio)) {
/*
- * some of the pages failed migration. Do get_user_pages
- * without migration.
+ * Migration will fail if the page is pinned, so convert
+ * the pin on the source page to a normal reference.
*/
- migrate_allow = false;
+ pages[i] = NULL;
+ folio_get(folio);
+ gup_put_folio(folio, 1, FOLL_PIN);
- if (!list_empty(&cma_page_list))
- putback_movable_pages(&cma_page_list);
+ if (migrate_device_coherent_page(&folio->page)) {
+ ret = -EBUSY;
+ goto err;
+ }
+
+ continue;
}
+
/*
- * We did migrate all the pages, Try to get the page references
- * again migrating any new CMA pages which we failed to isolate
- * earlier.
+ * We can't migrate pages with unexpected references, so drop
+ * the reference obtained by __get_user_pages_locked().
+ * Migrating pages have been added to movable_page_list after
+ * calling folio_isolate_lru() which takes a reference so the
+ * page won't be freed if it's migrating.
*/
- ret = __get_user_pages_locked(mm, start, nr_pages,
- pages, vmas, NULL,
- gup_flags);
-
- if ((ret > 0) && migrate_allow) {
- nr_pages = ret;
- drain_allow = true;
- goto check_again;
+ unpin_user_page(pages[i]);
+ pages[i] = NULL;
+ }
+
+ if (!list_empty(movable_page_list)) {
+ struct migration_target_control mtc = {
+ .nid = NUMA_NO_NODE,
+ .gfp_mask = GFP_USER | __GFP_NOWARN,
+ };
+
+ if (migrate_pages(movable_page_list, alloc_migration_target,
+ NULL, (unsigned long)&mtc, MIGRATE_SYNC,
+ MR_LONGTERM_PIN, NULL)) {
+ ret = -ENOMEM;
+ goto err;
}
}
+ putback_movable_pages(movable_page_list);
+
+ return -EAGAIN;
+
+err:
+ for (i = 0; i < nr_pages; i++)
+ if (pages[i])
+ unpin_user_page(pages[i]);
+ putback_movable_pages(movable_page_list);
+
return ret;
}
+
+/*
+ * Check whether all pages are *allowed* to be pinned. Rather confusingly, all
+ * pages in the range are required to be pinned via FOLL_PIN, before calling
+ * this routine.
+ *
+ * If any pages in the range are not allowed to be pinned, then this routine
+ * will migrate those pages away, unpin all the pages in the range and return
+ * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then
+ * call this routine again.
+ *
+ * If an error other than -EAGAIN occurs, this indicates a migration failure.
+ * The caller should give up, and propagate the error back up the call stack.
+ *
+ * If everything is OK and all pages in the range are allowed to be pinned, then
+ * this routine leaves all pages pinned and returns zero for success.
+ */
+static long check_and_migrate_movable_pages(unsigned long nr_pages,
+ struct page **pages)
+{
+ unsigned long collected;
+ LIST_HEAD(movable_page_list);
+
+ collected = collect_longterm_unpinnable_pages(&movable_page_list,
+ nr_pages, pages);
+ if (!collected)
+ return 0;
+
+ return migrate_longterm_unpinnable_pages(&movable_page_list, nr_pages,
+ pages);
+}
#else
-static long check_and_migrate_cma_pages(struct mm_struct *mm,
- unsigned long start,
- unsigned long nr_pages,
- struct page **pages,
- struct vm_area_struct **vmas,
- unsigned int gup_flags)
+static long check_and_migrate_movable_pages(unsigned long nr_pages,
+ struct page **pages)
{
- return nr_pages;
+ return 0;
}
-#endif /* CONFIG_CMA */
+#endif /* CONFIG_MIGRATION */
/*
* __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
@@ -1696,113 +2175,92 @@ static long __gup_longterm_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
- struct vm_area_struct **vmas,
+ int *locked,
unsigned int gup_flags)
{
- struct vm_area_struct **vmas_tmp = vmas;
- unsigned long flags = 0;
- long rc, i;
-
- if (gup_flags & FOLL_LONGTERM) {
- if (!pages)
- return -EINVAL;
-
- if (!vmas_tmp) {
- vmas_tmp = kcalloc(nr_pages,
- sizeof(struct vm_area_struct *),
- GFP_KERNEL);
- if (!vmas_tmp)
- return -ENOMEM;
- }
- flags = memalloc_nocma_save();
- }
-
- rc = __get_user_pages_locked(mm, start, nr_pages, pages,
- vmas_tmp, NULL, gup_flags);
+ unsigned int flags;
+ long rc, nr_pinned_pages;
- if (gup_flags & FOLL_LONGTERM) {
- if (rc < 0)
- goto out;
+ if (!(gup_flags & FOLL_LONGTERM))
+ return __get_user_pages_locked(mm, start, nr_pages, pages,
+ locked, gup_flags);
- if (check_dax_vmas(vmas_tmp, rc)) {
- for (i = 0; i < rc; i++)
- put_page(pages[i]);
- rc = -EOPNOTSUPP;
- goto out;
+ flags = memalloc_pin_save();
+ do {
+ nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages,
+ pages, locked,
+ gup_flags);
+ if (nr_pinned_pages <= 0) {
+ rc = nr_pinned_pages;
+ break;
}
- rc = check_and_migrate_cma_pages(mm, start, rc, pages,
- vmas_tmp, gup_flags);
-out:
- memalloc_nocma_restore(flags);
- }
-
- if (vmas_tmp != vmas)
- kfree(vmas_tmp);
- return rc;
-}
-#else /* !CONFIG_FS_DAX && !CONFIG_CMA */
-static __always_inline long __gup_longterm_locked(struct mm_struct *mm,
- unsigned long start,
- unsigned long nr_pages,
- struct page **pages,
- struct vm_area_struct **vmas,
- unsigned int flags)
-{
- return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
- NULL, flags);
+ /* FOLL_LONGTERM implies FOLL_PIN */
+ rc = check_and_migrate_movable_pages(nr_pinned_pages, pages);
+ } while (rc == -EAGAIN);
+ memalloc_pin_restore(flags);
+ return rc ? rc : nr_pinned_pages;
}
-#endif /* CONFIG_FS_DAX || CONFIG_CMA */
-static bool is_valid_gup_flags(unsigned int gup_flags)
+/*
+ * Check that the given flags are valid for the exported gup/pup interface, and
+ * update them with the required flags that the caller must have set.
+ */
+static bool is_valid_gup_args(struct page **pages, int *locked,
+ unsigned int *gup_flags_p, unsigned int to_set)
{
+ unsigned int gup_flags = *gup_flags_p;
+
/*
- * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
- * never directly by the caller, so enforce that with an assertion:
+	 * These flags are not allowed to be specified externally to the gup
+ * interfaces:
+ * - FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only
+ * - FOLL_REMOTE is internal only and used on follow_page()
+ * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL
*/
- if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ if (WARN_ON_ONCE(gup_flags & (FOLL_PIN | FOLL_TRIED | FOLL_UNLOCKABLE |
+ FOLL_REMOTE | FOLL_FAST_ONLY)))
return false;
+
+ gup_flags |= to_set;
+ if (locked) {
+ /* At the external interface locked must be set */
+ if (WARN_ON_ONCE(*locked != 1))
+ return false;
+
+ gup_flags |= FOLL_UNLOCKABLE;
+ }
+
/*
- * FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying
- * that is, FOLL_LONGTERM is a specific case, more restrictive case of
- * FOLL_PIN.
+	 * For now, always trigger NUMA hinting faults. Some GUP users like
+	 * KVM require the hint to be honored, as the calling context of GUP
+	 * is functionally similar to a memory reference from task context.
*/
- if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ gup_flags |= FOLL_HONOR_NUMA_FAULT;
+
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET)))
return false;
- return true;
-}
+ /* LONGTERM can only be specified when pinning */
+ if (WARN_ON_ONCE(!(gup_flags & FOLL_PIN) && (gup_flags & FOLL_LONGTERM)))
+ return false;
-#ifdef CONFIG_MMU
-static long __get_user_pages_remote(struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
-{
- /*
- * Parts of FOLL_LONGTERM behavior are incompatible with
- * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
- * vmas. However, this only comes up if locked is set, and there are
- * callers that do request FOLL_LONGTERM, but do not set locked. So,
- * allow what we can.
- */
- if (gup_flags & FOLL_LONGTERM) {
- if (WARN_ON_ONCE(locked))
- return -EINVAL;
- /*
- * This will check the vmas (even if our vmas arg is NULL)
- * and return -ENOTSUPP if DAX isn't allowed in this case:
- */
- return __gup_longterm_locked(mm, start, nr_pages, pages,
- vmas, gup_flags | FOLL_TOUCH |
- FOLL_REMOTE);
- }
+ /* Pages input must be given if using GET/PIN */
+ if (WARN_ON_ONCE((gup_flags & (FOLL_GET | FOLL_PIN)) && !pages))
+ return false;
- return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
- locked,
- gup_flags | FOLL_TOUCH | FOLL_REMOTE);
+ /* We want to allow the pgmap to be hot-unplugged at all times */
+ if (WARN_ON_ONCE((gup_flags & FOLL_LONGTERM) &&
+ (gup_flags & FOLL_PCI_P2PDMA)))
+ return false;
+
+ *gup_flags_p = gup_flags;
+ return true;
}
+#ifdef CONFIG_MMU
/**
* get_user_pages_remote() - pin user pages in memory
* @mm: mm_struct of target mm
@@ -1812,8 +2270,6 @@ static long __get_user_pages_remote(struct mm_struct *mm,
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
* @locked: pointer to lock flag indicating whether lock is held and
* subsequently whether VM_FAULT_RETRY functionality can be
* utilised. Lock must initially be held.
@@ -1828,8 +2284,6 @@ static long __get_user_pages_remote(struct mm_struct *mm,
*
* The caller is responsible for releasing returned @pages, via put_page().
*
- * @vmas are valid only as long as mmap_lock is held.
- *
* Must be called with mmap_lock held for read or write.
*
* get_user_pages_remote walks a process's page tables and takes a reference
@@ -1840,7 +2294,7 @@ static long __get_user_pages_remote(struct mm_struct *mm,
* This does not guarantee that the page exists in the user mappings when
* get_user_pages_remote returns, and there may even be a completely different
* page there in some cases (eg. if mmapped pagecache has been invalidated
- * and subsequently re faulted). However it does guarantee that the page
+ * and subsequently re-faulted). However it does guarantee that the page
* won't be freed completely. And mostly callers simply care that the page
* contains data that was valid *at some point in time*. Typically, an IO
* or similar operation cannot guarantee anything stronger anyway because
@@ -1866,13 +2320,17 @@ static long __get_user_pages_remote(struct mm_struct *mm,
long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
+ int *locked)
{
- if (!is_valid_gup_flags(gup_flags))
+ int local_locked = 1;
+
+ if (!is_valid_gup_args(pages, locked, &gup_flags,
+ FOLL_TOUCH | FOLL_REMOTE))
return -EINVAL;
- return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
- pages, vmas, locked);
+ return __get_user_pages_locked(mm, start, nr_pages, pages,
+ locked ? locked : &local_locked,
+ gup_flags);
}
EXPORT_SYMBOL(get_user_pages_remote);
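/*
 * A minimal caller sketch for the @locked convention (illustration only):
 * the lock must be held and *locked must be 1 on entry; on return, *locked
 * says whether the lock is still held and therefore whether the caller must
 * drop it.
 */
static long get_remote_range_sketch(struct mm_struct *mm, unsigned long addr,
				    unsigned long nr, struct page **pages)
{
	int locked = 1;
	long ret;

	mmap_read_lock(mm);
	ret = get_user_pages_remote(mm, addr, nr, FOLL_WRITE, pages, &locked);
	if (locked)
		mmap_read_unlock(mm);
	return ret;
}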
@@ -1880,15 +2338,7 @@ EXPORT_SYMBOL(get_user_pages_remote);
long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
-{
- return 0;
-}
-
-static long __get_user_pages_remote(struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
+ int *locked)
{
return 0;
}
@@ -1902,8 +2352,6 @@ static long __get_user_pages_remote(struct mm_struct *mm,
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
*
* This is the same as get_user_pages_remote(), just with a less-flexible
* calling convention where we assume that the mm being operated on belongs to
@@ -1911,73 +2359,17 @@ static long __get_user_pages_remote(struct mm_struct *mm,
* obviously don't pass FOLL_REMOTE in here.
*/
long get_user_pages(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas)
+ unsigned int gup_flags, struct page **pages)
{
- if (!is_valid_gup_flags(gup_flags))
- return -EINVAL;
-
- return __gup_longterm_locked(current->mm, start, nr_pages,
- pages, vmas, gup_flags | FOLL_TOUCH);
-}
-EXPORT_SYMBOL(get_user_pages);
+ int locked = 1;
-/**
- * get_user_pages_locked() is suitable to replace the form:
- *
- * mmap_read_lock(mm);
- * do_something()
- * get_user_pages(mm, ..., pages, NULL);
- * mmap_read_unlock(mm);
- *
- * to:
- *
- * int locked = 1;
- * mmap_read_lock(mm);
- * do_something()
- * get_user_pages_locked(mm, ..., pages, &locked);
- * if (locked)
- * mmap_read_unlock(mm);
- *
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @gup_flags: flags modifying lookup behaviour
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
- * @locked: pointer to lock flag indicating whether lock is held and
- * subsequently whether VM_FAULT_RETRY functionality can be
- * utilised. Lock must initially be held.
- *
- * We can leverage the VM_FAULT_RETRY functionality in the page fault
- * paths better by using either get_user_pages_locked() or
- * get_user_pages_unlocked().
- *
- */
-long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- int *locked)
-{
- /*
- * FIXME: Current FOLL_LONGTERM behavior is incompatible with
- * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
- * vmas. As there are no users of this flag in this call we simply
- * disallow this option for now.
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
- return -EINVAL;
- /*
- * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
- * never directly by the caller, so enforce that:
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH))
return -EINVAL;
- return __get_user_pages_locked(current->mm, start, nr_pages,
- pages, NULL, locked,
- gup_flags | FOLL_TOUCH);
+ return __get_user_pages_locked(current->mm, start, nr_pages, pages,
+ &locked, gup_flags);
}
-EXPORT_SYMBOL(get_user_pages_locked);
+EXPORT_SYMBOL(get_user_pages);
/*
* get_user_pages_unlocked() is suitable to replace the form:
@@ -1997,25 +2389,14 @@ EXPORT_SYMBOL(get_user_pages_locked);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags)
{
- struct mm_struct *mm = current->mm;
- int locked = 1;
- long ret;
+ int locked = 0;
- /*
- * FIXME: Current FOLL_LONGTERM behavior is incompatible with
- * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
- * vmas. As there are no users of this flag in this call we simply
- * disallow this option for now.
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ if (!is_valid_gup_args(pages, NULL, &gup_flags,
+ FOLL_TOUCH | FOLL_UNLOCKABLE))
return -EINVAL;
- mmap_read_lock(mm);
- ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
- &locked, gup_flags | FOLL_TOUCH);
- if (locked)
- mmap_read_unlock(mm);
- return ret;
+ return __get_user_pages_locked(current->mm, start, nr_pages, pages,
+ &locked, gup_flags);
}
EXPORT_SYMBOL(get_user_pages_unlocked);
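/*
 * For contrast with the @locked convention above, a one-line sketch
 * (illustration only): the unlocked variant takes and drops mmap_lock
 * internally, so the caller never touches it.
 */
static long get_current_page_sketch(unsigned long addr, struct page **pages)
{
	return get_user_pages_unlocked(addr, 1, pages, FOLL_WRITE);
}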
@@ -2054,83 +2435,81 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
*/
#ifdef CONFIG_HAVE_FAST_GUP
-static void put_compound_head(struct page *page, int refs, unsigned int flags)
+/*
+ * Used in the GUP-fast path to determine whether a pin is permitted for a
+ * specific folio.
+ *
+ * This call assumes the caller has pinned the folio, that the lowest page table
+ * level still points to this folio, and that interrupts have been disabled.
+ *
+ * Writing to pinned file-backed dirty tracked folios is inherently problematic
+ * (see comment describing the writable_file_mapping_allowed() function). We
+ * therefore try to avoid the most egregious case of a long-term mapping doing
+ * so.
+ *
+ * This function cannot be as thorough as that one as the VMA is not available
+ * in the fast path, so instead we whitelist known good cases and if in doubt,
+ * fall back to the slow path.
+ */
+static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags)
{
- if (flags & FOLL_PIN) {
- mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED,
- refs);
+ struct address_space *mapping;
+ unsigned long mapping_flags;
- if (hpage_pincount_available(page))
- hpage_pincount_sub(page, refs);
- else
- refs *= GUP_PIN_COUNTING_BIAS;
- }
+ /*
+ * If we aren't pinning then no problematic write can occur. A long term
+ * pin is the most egregious case so this is the one we disallow.
+ */
+ if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) !=
+ (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE))
+ return true;
+
+ /* The folio is pinned, so we can safely access folio fields. */
+
+ if (WARN_ON_ONCE(folio_test_slab(folio)))
+ return false;
+
+ /* hugetlb mappings do not require dirty-tracking. */
+ if (folio_test_hugetlb(folio))
+ return true;
- VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
/*
- * Calling put_page() for each ref is unnecessarily slow. Only the last
- * ref needs a put_page().
+	 * GUP-fast disables IRQs. When IRQs are disabled, RCU grace periods
+ * cannot proceed, which means no actions performed under RCU can
+ * proceed either.
+ *
+ * inodes and thus their mappings are freed under RCU, which means the
+ * mapping cannot be freed beneath us and thus we can safely dereference
+ * it.
*/
- if (refs > 1)
- page_ref_sub(page, refs - 1);
- put_page(page);
-}
+ lockdep_assert_irqs_disabled();
-#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
+ /*
+ * However, there may be operations which _alter_ the mapping, so ensure
+ * we read it once and only once.
+ */
+ mapping = READ_ONCE(folio->mapping);
-/*
- * WARNING: only to be used in the get_user_pages_fast() implementation.
- *
- * With get_user_pages_fast(), we walk down the pagetables without taking any
- * locks. For this we would like to load the pointers atomically, but sometimes
- * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What
- * we do have is the guarantee that a PTE will only either go from not present
- * to present, or present to not present or both -- it will not switch to a
- * completely different present page without a TLB flush in between; something
- * that we are blocking by holding interrupts off.
- *
- * Setting ptes from not present to present goes:
- *
- * ptep->pte_high = h;
- * smp_wmb();
- * ptep->pte_low = l;
- *
- * And present to not present goes:
- *
- * ptep->pte_low = 0;
- * smp_wmb();
- * ptep->pte_high = 0;
- *
- * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
- * We load pte_high *after* loading pte_low, which ensures we don't see an older
- * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't
- * picked up a changed pte high. We might have gotten rubbish values from
- * pte_low and pte_high, but we are guaranteed that pte_low will not have the
- * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
- * operates on present ptes we're safe.
- */
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
- pte_t pte;
+ /*
+	 * The mapping may have been truncated. In any case, we cannot
+	 * determine if this mapping is safe - fall back to the slow path to
+	 * determine how to proceed.
+ */
+ if (!mapping)
+ return false;
- do {
- pte.pte_low = ptep->pte_low;
- smp_rmb();
- pte.pte_high = ptep->pte_high;
- smp_rmb();
- } while (unlikely(pte.pte_low != ptep->pte_low));
+ /* Anonymous folios pose no problem. */
+ mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS;
+ if (mapping_flags)
+ return mapping_flags & PAGE_MAPPING_ANON;
- return pte;
-}
-#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
-/*
- * We require that the PTE can be read atomically.
- */
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
- return ptep_get(ptep);
+ /*
+ * At this point, we know the mapping is non-null and points to an
+ * address_space object. The only remaining whitelisted file system is
+ * shmem.
+ */
+ return shmem_mapping(mapping);
}
-#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
unsigned int flags,
@@ -2148,21 +2527,47 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
}
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+/*
+ * Fast-gup relies on pte change detection to avoid concurrent pgtable
+ * operations.
+ *
+ * To pin the page, fast-gup needs to do the following, in order:
+ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
+ *
+ * For the rest of pgtable operations where pgtable updates can be racy
+ * with fast-gup, we need to do (1) clear pte, then (2) check whether page
+ * is pinned.
+ *
+ * Above will work for all pte-level operations, including THP split.
+ *
+ * For THP collapse, it's a bit more complicated because fast-gup may be
+ * walking a pgtable page that is being freed (pte is still valid but pmd
+ * can be cleared already). To avoid racing in that situation, we also
+ * check the pmd here to make sure it hasn't changed (corresponds to
+ * pmdp_collapse_flush() in the THP collapse code path).
+ */
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
pte_t *ptep, *ptem;
ptem = ptep = pte_offset_map(&pmd, addr);
+ if (!ptep)
+ return 0;
do {
- pte_t pte = gup_get_pte(ptep);
- struct page *head, *page;
+ pte_t pte = ptep_get_lockless(ptep);
+ struct page *page;
+ struct folio *folio;
/*
- * Similar to the PMD case below, NUMA hinting must take slow
- * path using the pte_protnone check.
+		 * Always fall back to ordinary GUP on PROT_NONE-mapped pages:
+		 * pte_access_permitted() should reject these pages
+ * either way: otherwise, GUP-fast might succeed in
+ * cases where ordinary GUP would fail due to VMA access
+ * permissions.
*/
if (pte_protnone(pte))
goto pte_unmap;
@@ -2185,16 +2590,30 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
- head = try_grab_compound_head(page, 1, flags);
- if (!head)
+ folio = try_grab_folio(page, 1, flags);
+ if (!folio)
goto pte_unmap;
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- put_compound_head(head, 1, flags);
+ if (unlikely(page_is_secretmem(page))) {
+ gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
- VM_BUG_ON_PAGE(compound_head(page) != head, page);
+ if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
+ unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
+ gup_put_folio(folio, 1, flags);
+ goto pte_unmap;
+ }
+
+ if (!folio_fast_pin_allowed(folio, flags)) {
+ gup_put_folio(folio, 1, flags);
+ goto pte_unmap;
+ }
+
+ if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) {
+ gup_put_folio(folio, 1, flags);
+ goto pte_unmap;
+ }
/*
* We need to make the page accessible if and only if we are
@@ -2205,14 +2624,13 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
if (flags & FOLL_PIN) {
ret = arch_make_page_accessible(page);
if (ret) {
- unpin_user_page(page);
+ gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
}
- SetPageReferenced(page);
+ folio_set_referenced(folio);
pages[*nr] = page;
(*nr)++;
-
} while (ptep++, addr += PAGE_SIZE, addr != end);
ret = 1;
@@ -2234,8 +2652,9 @@ pte_unmap:
* get_user_pages_fast_only implementation that can pin pages. Thus it's still
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
return 0;
}
@@ -2255,21 +2674,26 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
pgmap = get_dev_pagemap(pfn, pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, flags, pages);
- return 0;
+ break;
+ }
+
+ if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) {
+ undo_dev_pagemap(nr, nr_start, flags, pages);
+ break;
}
+
SetPageReferenced(page);
pages[*nr] = page;
- if (unlikely(!try_grab_page(page, flags))) {
+ if (unlikely(try_grab_page(page, flags))) {
undo_dev_pagemap(nr, nr_start, flags, pages);
- return 0;
+ break;
}
(*nr)++;
pfn++;
} while (addr += PAGE_SIZE, addr != end);
- if (pgmap)
- put_dev_pagemap(pgmap);
- return 1;
+ put_dev_pagemap(pgmap);
+ return addr == end;
}
static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
@@ -2330,8 +2754,8 @@ static int record_subpages(struct page *page, unsigned long addr,
{
int nr;
- for (nr = 0; addr != end; addr += PAGE_SIZE)
- pages[nr++] = page++;
+ for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
+ pages[nr] = nth_page(page, nr);
return nr;
}
@@ -2349,7 +2773,8 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
struct page **pages, int *nr)
{
unsigned long pte_end;
- struct page *head, *page;
+ struct page *page;
+ struct folio *folio;
pte_t pte;
int refs;
@@ -2365,21 +2790,30 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
/* hugepages are never "special" */
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
- head = pte_page(pte);
- page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
+ page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT);
refs = record_subpages(page, addr, end, pages + *nr);
- head = try_grab_compound_head(head, refs, flags);
- if (!head)
+ folio = try_grab_folio(page, refs, flags);
+ if (!folio)
return 0;
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- put_compound_head(head, refs, flags);
+ if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
+ if (!folio_fast_pin_allowed(folio, flags)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
+ if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) {
+ gup_put_folio(folio, refs, flags);
return 0;
}
*nr += refs;
- SetPageReferenced(head);
+ folio_set_referenced(folio);
return 1;
}
@@ -2413,7 +2847,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
- struct page *head, *page;
+ struct page *page;
+ struct folio *folio;
int refs;
if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
@@ -2426,20 +2861,29 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
pages, nr);
}
- page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);
refs = record_subpages(page, addr, end, pages + *nr);
- head = try_grab_compound_head(pmd_page(orig), refs, flags);
- if (!head)
+ folio = try_grab_folio(page, refs, flags);
+ if (!folio)
return 0;
if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
- put_compound_head(head, refs, flags);
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
+ if (!folio_fast_pin_allowed(folio, flags)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+ if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
+ gup_put_folio(folio, refs, flags);
return 0;
}
*nr += refs;
- SetPageReferenced(head);
+ folio_set_referenced(folio);
return 1;
}
@@ -2447,7 +2891,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
- struct page *head, *page;
+ struct page *page;
+ struct folio *folio;
int refs;
if (!pud_access_permitted(orig, flags & FOLL_WRITE))
@@ -2460,20 +2905,30 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
pages, nr);
}
- page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT);
refs = record_subpages(page, addr, end, pages + *nr);
- head = try_grab_compound_head(pud_page(orig), refs, flags);
- if (!head)
+ folio = try_grab_folio(page, refs, flags);
+ if (!folio)
return 0;
if (unlikely(pud_val(orig) != pud_val(*pudp))) {
- put_compound_head(head, refs, flags);
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
+ if (!folio_fast_pin_allowed(folio, flags)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
+ if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
+ gup_put_folio(folio, refs, flags);
return 0;
}
*nr += refs;
- SetPageReferenced(head);
+ folio_set_referenced(folio);
return 1;
}
@@ -2482,27 +2937,38 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
struct page **pages, int *nr)
{
int refs;
- struct page *head, *page;
+ struct page *page;
+ struct folio *folio;
if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
return 0;
BUILD_BUG_ON(pgd_devmap(orig));
- page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
+ page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT);
refs = record_subpages(page, addr, end, pages + *nr);
- head = try_grab_compound_head(pgd_page(orig), refs, flags);
- if (!head)
+ folio = try_grab_folio(page, refs, flags);
+ if (!folio)
return 0;
if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
- put_compound_head(head, refs, flags);
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
+ if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
+ if (!folio_fast_pin_allowed(folio, flags)) {
+ gup_put_folio(folio, refs, flags);
return 0;
}
*nr += refs;
- SetPageReferenced(head);
+ folio_set_referenced(folio);
return 1;
}
@@ -2514,7 +2980,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
pmdp = pmd_offset_lockless(pudp, pud, addr);
do {
- pmd_t pmd = READ_ONCE(*pmdp);
+ pmd_t pmd = pmdp_get_lockless(pmdp);
next = pmd_addr_end(addr, end);
if (!pmd_present(pmd))
@@ -2522,11 +2988,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
pmd_devmap(pmd))) {
- /*
- * NUMA hinting faults need to be handled in the GUP
- * slowpath for accounting purposes and so that they
- * can be serialised against THP migration.
- */
+ /* See gup_pte_range() */
if (pmd_protnone(pmd))
return 0;
@@ -2542,7 +3004,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
PMD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
+ } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
@@ -2562,7 +3024,7 @@ static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned lo
next = pud_addr_end(addr, end);
if (unlikely(!pud_present(pud)))
return 0;
- if (unlikely(pud_huge(pud))) {
+ if (unlikely(pud_huge(pud) || pud_devmap(pud))) {
if (!gup_huge_pud(pud, pudp, addr, next, flags,
pages, nr))
return 0;
@@ -2645,97 +3107,108 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end)
}
#endif
-static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
- unsigned int gup_flags, struct page **pages)
+static unsigned long lockless_pages_from_mm(unsigned long start,
+ unsigned long end,
+ unsigned int gup_flags,
+ struct page **pages)
{
- int ret;
+ unsigned long flags;
+ int nr_pinned = 0;
+ unsigned seq;
+
+ if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) ||
+ !gup_fast_permitted(start, end))
+ return 0;
+
+ if (gup_flags & FOLL_PIN) {
+ seq = raw_read_seqcount(&current->mm->write_protect_seq);
+ if (seq & 1)
+ return 0;
+ }
/*
- * FIXME: FOLL_LONGTERM does not work with
- * get_user_pages_unlocked() (see comments in that function)
+ * Disable interrupts. The nested form is used, in order to allow full,
+ * general purpose use of this routine.
+ *
+ * With interrupts disabled, we block page table pages from being freed
+ * from under us. See struct mmu_table_batch comments in
+ * include/asm-generic/tlb.h for more details.
+ *
+ * We do not adopt an rcu_read_lock() here as we also want to block IPIs
+ * that come from THPs splitting.
*/
- if (gup_flags & FOLL_LONGTERM) {
- mmap_read_lock(current->mm);
- ret = __gup_longterm_locked(current->mm,
- start, nr_pages,
- pages, NULL, gup_flags);
- mmap_read_unlock(current->mm);
- } else {
- ret = get_user_pages_unlocked(start, nr_pages,
- pages, gup_flags);
- }
+ local_irq_save(flags);
+ gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
+ local_irq_restore(flags);
- return ret;
+ /*
+ * When pinning pages for DMA there could be a concurrent write protect
+ * from fork() via copy_page_range(), in this case always fail fast GUP.
+ */
+ if (gup_flags & FOLL_PIN) {
+ if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
+ unpin_user_pages_lockless(pages, nr_pinned);
+ return 0;
+ } else {
+ sanity_check_pinned_pages(pages, nr_pinned);
+ }
+ }
+ return nr_pinned;
}
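/*
 * For context, a sketch of the writer side this seqcount pairs with
 * (illustration only, modelled loosely on fork()'s copy_page_range()): the
 * parent bumps mm->write_protect_seq around write-protecting shared PTEs,
 * so a FOLL_PIN fast-gup racing with it sees the retry above fail and falls
 * back to the slow path. The raw_ variants are used because the read side
 * never spins on this seqcount; it simply bails out.
 */
static void write_protect_for_fork_sketch(struct mm_struct *src_mm)
{
	mmap_assert_write_locked(src_mm);

	raw_write_seqcount_begin(&src_mm->write_protect_seq);
	/* ... write-protect the COW-shared page table entries here ... */
	raw_write_seqcount_end(&src_mm->write_protect_seq);
}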
-static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
+static int internal_get_user_pages_fast(unsigned long start,
+ unsigned long nr_pages,
unsigned int gup_flags,
struct page **pages)
{
- unsigned long addr, len, end;
- unsigned long flags;
- int nr_pinned = 0, ret = 0;
+ unsigned long len, end;
+ unsigned long nr_pinned;
+ int locked = 0;
+ int ret;
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
FOLL_FORCE | FOLL_PIN | FOLL_GET |
- FOLL_FAST_ONLY)))
+ FOLL_FAST_ONLY | FOLL_NOFAULT |
+ FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT)))
return -EINVAL;
if (gup_flags & FOLL_PIN)
- atomic_set(&current->mm->has_pinned, 1);
+ mm_set_has_pinned_flag(&current->mm->flags);
if (!(gup_flags & FOLL_FAST_ONLY))
might_lock_read(&current->mm->mmap_lock);
start = untagged_addr(start) & PAGE_MASK;
- addr = start;
- len = (unsigned long) nr_pages << PAGE_SHIFT;
- end = start + len;
-
- if (end <= start)
- return 0;
+ len = nr_pages << PAGE_SHIFT;
+ if (check_add_overflow(start, len, &end))
+ return -EOVERFLOW;
+ if (end > TASK_SIZE_MAX)
+ return -EFAULT;
if (unlikely(!access_ok((void __user *)start, len)))
return -EFAULT;
- /*
- * Disable interrupts. The nested form is used, in order to allow
- * full, general purpose use of this routine.
- *
- * With interrupts disabled, we block page table pages from being
- * freed from under us. See struct mmu_table_batch comments in
- * include/asm-generic/tlb.h for more details.
- *
- * We do not adopt an rcu_read_lock(.) here as we also want to
- * block IPIs that come from THPs splitting.
- */
- if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) {
- unsigned long fast_flags = gup_flags;
-
- local_irq_save(flags);
- gup_pgd_range(addr, end, fast_flags, pages, &nr_pinned);
- local_irq_restore(flags);
- ret = nr_pinned;
- }
-
- if (nr_pinned < nr_pages && !(gup_flags & FOLL_FAST_ONLY)) {
- /* Try to get the remaining pages with get_user_pages */
- start += nr_pinned << PAGE_SHIFT;
- pages += nr_pinned;
-
- ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned,
- gup_flags, pages);
-
- /* Have to be a bit careful with return values */
- if (nr_pinned > 0) {
- if (ret < 0)
- ret = nr_pinned;
- else
- ret += nr_pinned;
- }
+ nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages);
+ if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
+ return nr_pinned;
+
+ /* Slow path: try to get the remaining pages with get_user_pages */
+ start += nr_pinned << PAGE_SHIFT;
+ pages += nr_pinned;
+ ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned,
+ pages, &locked,
+ gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE);
+ if (ret < 0) {
+ /*
+ * The caller has to unpin the pages we already pinned so
+ * returning -errno is not an option
+ */
+ if (nr_pinned)
+ return nr_pinned;
+ return ret;
}
-
- return ret;
+ return ret + nr_pinned;
}
+
/**
* get_user_pages_fast_only() - pin user pages in memory
* @start: starting user address
@@ -2746,8 +3219,6 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
*
* Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
* the regular GUP.
- * Note a difference with get_user_pages_fast: this always returns the
- * number of pages pinned, 0 if no pages were pinned.
*
* If the architecture does not support this function, simply return with no
* pages pinned.
@@ -2759,7 +3230,6 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
int get_user_pages_fast_only(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
- int nr_pinned;
/*
* Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
* because gup fast is always a "pin with a +1 page refcount" request.
@@ -2767,21 +3237,11 @@ int get_user_pages_fast_only(unsigned long start, int nr_pages,
* FOLL_FAST_ONLY is required in order to match the API description of
* this routine: no fall back to regular ("slow") GUP.
*/
- gup_flags |= FOLL_GET | FOLL_FAST_ONLY;
-
- nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
- pages);
-
- /*
- * As specified in the API description above, this routine is not
- * allowed to return negative values. However, the common core
- * routine internal_get_user_pages_fast() *can* return -errno.
- * Therefore, correct for that here:
- */
- if (nr_pinned < 0)
- nr_pinned = 0;
+ if (!is_valid_gup_args(pages, NULL, &gup_flags,
+ FOLL_GET | FOLL_FAST_ONLY))
+ return -EINVAL;
- return nr_pinned;
+ return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
@@ -2804,16 +3264,14 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
int get_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
- if (!is_valid_gup_flags(gup_flags))
- return -EINVAL;
-
/*
* The caller may or may not have explicitly set FOLL_GET; either way is
* OK. However, internally (within mm/gup.c), gup fast variants must set
* FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
* request.
*/
- gup_flags |= FOLL_GET;
+ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET))
+ return -EINVAL;
return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
@@ -2833,55 +3291,19 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast);
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
* see Documentation/core-api/pin_user_pages.rst for further details.
+ *
+ * Note that if a zero_page is amongst the returned pages, it will not have
+ * pins in it and unpin_user_page() will not remove pins from it.
*/
int pin_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
return -EINVAL;
-
- gup_flags |= FOLL_PIN;
return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast);
-/*
- * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior
- * is the same, except that this one sets FOLL_PIN instead of FOLL_GET.
- *
- * The API rules are the same, too: no negative values may be returned.
- */
-int pin_user_pages_fast_only(unsigned long start, int nr_pages,
- unsigned int gup_flags, struct page **pages)
-{
- int nr_pinned;
-
- /*
- * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API
- * rules require returning 0, rather than -errno:
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_GET))
- return 0;
- /*
- * FOLL_FAST_ONLY is required in order to match the API description of
- * this routine: no fall back to regular ("slow") GUP.
- */
- gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY);
- nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
- pages);
- /*
- * This routine is not allowed to return negative values. However,
- * internal_get_user_pages_fast() *can* return -errno. Therefore,
- * correct for that here:
- */
- if (nr_pinned < 0)
- nr_pinned = 0;
-
- return nr_pinned;
-}
-EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
-
/**
* pin_user_pages_remote() - pin pages of a remote process
*
@@ -2890,10 +3312,7 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
+ * Should be at least nr_pages long.
* @locked: pointer to lock flag indicating whether lock is held and
* subsequently whether VM_FAULT_RETRY functionality can be
* utilised. Lock must initially be held.
@@ -2904,19 +3323,23 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
* see Documentation/core-api/pin_user_pages.rst for details.
+ *
+ * Note that if a zero_page is amongst the returned pages, it will not have
+ * pins in it and unpin_user_page*() will not remove pins from it.
*/
long pin_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
+ int *locked)
{
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE(gup_flags & FOLL_GET))
- return -EINVAL;
+ int local_locked = 1;
- gup_flags |= FOLL_PIN;
- return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
- pages, vmas, locked);
+ if (!is_valid_gup_args(pages, locked, &gup_flags,
+ FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE))
+ return 0;
+ return __gup_longterm_locked(mm, start, nr_pages, pages,
+ locked ? locked : &local_locked,
+ gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_remote);
@@ -2927,28 +3350,26 @@ EXPORT_SYMBOL(pin_user_pages_remote);
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
+ * Should be at least nr_pages long.
*
* Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
* FOLL_PIN is set.
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
* see Documentation/core-api/pin_user_pages.rst for details.
+ *
+ * Note that if a zero_page is amongst the returned pages, it will not have
+ * pins in it and unpin_user_page*() will not remove pins from it.
*/
long pin_user_pages(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas)
+ unsigned int gup_flags, struct page **pages)
{
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE(gup_flags & FOLL_GET))
- return -EINVAL;
+ int locked = 1;
- gup_flags |= FOLL_PIN;
+ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
+ return 0;
return __gup_longterm_locked(current->mm, start, nr_pages,
- pages, vmas, gup_flags);
+ pages, &locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
@@ -2956,44 +3377,20 @@ EXPORT_SYMBOL(pin_user_pages);
* pin_user_pages_unlocked() is the FOLL_PIN variant of
* get_user_pages_unlocked(). Behavior is the same, except that this one sets
* FOLL_PIN and rejects FOLL_GET.
+ *
+ * Note that if a zero_page is amongst the returned pages, it will not have
+ * pins in it and unpin_user_page*() will not remove pins from it.
*/
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags)
{
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE(gup_flags & FOLL_GET))
- return -EINVAL;
-
- gup_flags |= FOLL_PIN;
- return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
-}
-EXPORT_SYMBOL(pin_user_pages_unlocked);
-
-/*
- * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked().
- * Behavior is the same, except that this one sets FOLL_PIN and rejects
- * FOLL_GET.
- */
-long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- int *locked)
-{
- /*
- * FIXME: Current FOLL_LONGTERM behavior is incompatible with
- * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
- * vmas. As there are no users of this flag in this call we simply
- * disallow this option for now.
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
- return -EINVAL;
+ int locked = 0;
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE(gup_flags & FOLL_GET))
- return -EINVAL;
+ if (!is_valid_gup_args(pages, NULL, &gup_flags,
+ FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE))
+ return 0;
- gup_flags |= FOLL_PIN;
- return __get_user_pages_locked(current->mm, start, nr_pages,
- pages, NULL, locked,
- gup_flags | FOLL_TOUCH);
+ return __gup_longterm_locked(current->mm, start, nr_pages, pages,
+ &locked, gup_flags);
}
-EXPORT_SYMBOL(pin_user_pages_locked);
+EXPORT_SYMBOL(pin_user_pages_unlocked);
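
The kerneldoc above keeps restating one pairing rule: anything obtained with FOLL_PIN must be released through unpin_user_page*(), never put_page(). A minimal driver-style sketch of that lifecycle follows; only the pin_user_pages_fast()/unpin_user_pages*() calls are real API, while the my_*() registration helpers are made up for illustration (roughly <linux/mm.h> and <linux/slab.h> territory).

/*
 * Illustrative only: pin a user buffer for DMA and release it later with the
 * matching unpin_user_pages_dirty_lock() call. The my_*() helpers are
 * hypothetical and not part of mm/gup.c.
 */
static int my_register_buf(unsigned long uaddr, size_t len, bool write,
			   struct page ***pagesp, long *npagesp)
{
	long nr_pages = DIV_ROUND_UP(len + offset_in_page(uaddr), PAGE_SIZE);
	unsigned int gup_flags = FOLL_LONGTERM | (write ? FOLL_WRITE : 0);
	struct page **pages;
	long pinned;

	pages = kvcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	pinned = pin_user_pages_fast(uaddr, nr_pages, gup_flags, pages);
	if (pinned < 0) {			/* -errno: nothing was pinned */
		kvfree(pages);
		return pinned;
	}
	if (pinned != nr_pages) {		/* partial pin: undo and bail out */
		unpin_user_pages(pages, pinned);
		kvfree(pages);
		return -EFAULT;
	}

	*pagesp = pages;
	*npagesp = pinned;
	return 0;
}

static void my_release_buf(struct page **pages, long npages, bool dirty)
{
	/* FOLL_PIN pages go back via unpin_user_page*(), never put_page() */
	unpin_user_pages_dirty_lock(pages, npages, dirty);
	kvfree(pages);
}
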
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
deleted file mode 100644
index 464cae1fa3ea..000000000000
--- a/mm/gup_benchmark.c
+++ /dev/null
@@ -1,201 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-#include <linux/ktime.h>
-#include <linux/debugfs.h>
-
-#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)
-#define GUP_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
-#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
-#define PIN_BENCHMARK _IOWR('g', 4, struct gup_benchmark)
-#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_benchmark)
-
-struct gup_benchmark {
- __u64 get_delta_usec;
- __u64 put_delta_usec;
- __u64 addr;
- __u64 size;
- __u32 nr_pages_per_call;
- __u32 flags;
- __u64 expansion[10]; /* For future use */
-};
-
-static void put_back_pages(unsigned int cmd, struct page **pages,
- unsigned long nr_pages)
-{
- unsigned long i;
-
- switch (cmd) {
- case GUP_FAST_BENCHMARK:
- case GUP_BENCHMARK:
- for (i = 0; i < nr_pages; i++)
- put_page(pages[i]);
- break;
-
- case PIN_FAST_BENCHMARK:
- case PIN_BENCHMARK:
- case PIN_LONGTERM_BENCHMARK:
- unpin_user_pages(pages, nr_pages);
- break;
- }
-}
-
-static void verify_dma_pinned(unsigned int cmd, struct page **pages,
- unsigned long nr_pages)
-{
- unsigned long i;
- struct page *page;
-
- switch (cmd) {
- case PIN_FAST_BENCHMARK:
- case PIN_BENCHMARK:
- case PIN_LONGTERM_BENCHMARK:
- for (i = 0; i < nr_pages; i++) {
- page = pages[i];
- if (WARN(!page_maybe_dma_pinned(page),
- "pages[%lu] is NOT dma-pinned\n", i)) {
-
- dump_page(page, "gup_benchmark failure");
- break;
- }
- }
- break;
- }
-}
-
-static int __gup_benchmark_ioctl(unsigned int cmd,
- struct gup_benchmark *gup)
-{
- ktime_t start_time, end_time;
- unsigned long i, nr_pages, addr, next;
- int nr;
- struct page **pages;
- int ret = 0;
-
- if (gup->size > ULONG_MAX)
- return -EINVAL;
-
- nr_pages = gup->size / PAGE_SIZE;
- pages = kvcalloc(nr_pages, sizeof(void *), GFP_KERNEL);
- if (!pages)
- return -ENOMEM;
-
- i = 0;
- nr = gup->nr_pages_per_call;
- start_time = ktime_get();
- for (addr = gup->addr; addr < gup->addr + gup->size; addr = next) {
- if (nr != gup->nr_pages_per_call)
- break;
-
- next = addr + nr * PAGE_SIZE;
- if (next > gup->addr + gup->size) {
- next = gup->addr + gup->size;
- nr = (next - addr) / PAGE_SIZE;
- }
-
- /* Filter out most gup flags: only allow a tiny subset here: */
- gup->flags &= FOLL_WRITE;
-
- switch (cmd) {
- case GUP_FAST_BENCHMARK:
- nr = get_user_pages_fast(addr, nr, gup->flags,
- pages + i);
- break;
- case GUP_BENCHMARK:
- nr = get_user_pages(addr, nr, gup->flags, pages + i,
- NULL);
- break;
- case PIN_FAST_BENCHMARK:
- nr = pin_user_pages_fast(addr, nr, gup->flags,
- pages + i);
- break;
- case PIN_BENCHMARK:
- nr = pin_user_pages(addr, nr, gup->flags, pages + i,
- NULL);
- break;
- case PIN_LONGTERM_BENCHMARK:
- nr = pin_user_pages(addr, nr,
- gup->flags | FOLL_LONGTERM,
- pages + i, NULL);
- break;
- default:
- kvfree(pages);
- ret = -EINVAL;
- goto out;
- }
-
- if (nr <= 0)
- break;
- i += nr;
- }
- end_time = ktime_get();
-
- /* Shifting the meaning of nr_pages: now it is actual number pinned: */
- nr_pages = i;
-
- gup->get_delta_usec = ktime_us_delta(end_time, start_time);
- gup->size = addr - gup->addr;
-
- /*
- * Take an un-benchmark-timed moment to verify DMA pinned
- * state: print a warning if any non-dma-pinned pages are found:
- */
- verify_dma_pinned(cmd, pages, nr_pages);
-
- start_time = ktime_get();
-
- put_back_pages(cmd, pages, nr_pages);
-
- end_time = ktime_get();
- gup->put_delta_usec = ktime_us_delta(end_time, start_time);
-
- kvfree(pages);
-out:
- return ret;
-}
-
-static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd,
- unsigned long arg)
-{
- struct gup_benchmark gup;
- int ret;
-
- switch (cmd) {
- case GUP_FAST_BENCHMARK:
- case GUP_BENCHMARK:
- case PIN_FAST_BENCHMARK:
- case PIN_BENCHMARK:
- case PIN_LONGTERM_BENCHMARK:
- break;
- default:
- return -EINVAL;
- }
-
- if (copy_from_user(&gup, (void __user *)arg, sizeof(gup)))
- return -EFAULT;
-
- ret = __gup_benchmark_ioctl(cmd, &gup);
- if (ret)
- return ret;
-
- if (copy_to_user((void __user *)arg, &gup, sizeof(gup)))
- return -EFAULT;
-
- return 0;
-}
-
-static const struct file_operations gup_benchmark_fops = {
- .open = nonseekable_open,
- .unlocked_ioctl = gup_benchmark_ioctl,
-};
-
-static int gup_benchmark_init(void)
-{
- debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL,
- &gup_benchmark_fops);
-
- return 0;
-}
-
-late_initcall(gup_benchmark_init);
diff --git a/mm/gup_test.c b/mm/gup_test.c
new file mode 100644
index 000000000000..eeb3f4d87c51
--- /dev/null
+++ b/mm/gup_test.c
@@ -0,0 +1,395 @@
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/ktime.h>
+#include <linux/debugfs.h>
+#include <linux/highmem.h>
+#include "gup_test.h"
+
+static void put_back_pages(unsigned int cmd, struct page **pages,
+ unsigned long nr_pages, unsigned int gup_test_flags)
+{
+ unsigned long i;
+
+ switch (cmd) {
+ case GUP_FAST_BENCHMARK:
+ case GUP_BASIC_TEST:
+ for (i = 0; i < nr_pages; i++)
+ put_page(pages[i]);
+ break;
+
+ case PIN_FAST_BENCHMARK:
+ case PIN_BASIC_TEST:
+ case PIN_LONGTERM_BENCHMARK:
+ unpin_user_pages(pages, nr_pages);
+ break;
+ case DUMP_USER_PAGES_TEST:
+ if (gup_test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) {
+ unpin_user_pages(pages, nr_pages);
+ } else {
+ for (i = 0; i < nr_pages; i++)
+ put_page(pages[i]);
+
+ }
+ break;
+ }
+}
+
+static void verify_dma_pinned(unsigned int cmd, struct page **pages,
+ unsigned long nr_pages)
+{
+ unsigned long i;
+ struct folio *folio;
+
+ switch (cmd) {
+ case PIN_FAST_BENCHMARK:
+ case PIN_BASIC_TEST:
+ case PIN_LONGTERM_BENCHMARK:
+ for (i = 0; i < nr_pages; i++) {
+ folio = page_folio(pages[i]);
+
+ if (WARN(!folio_maybe_dma_pinned(folio),
+ "pages[%lu] is NOT dma-pinned\n", i)) {
+
+ dump_page(&folio->page, "gup_test failure");
+ break;
+ } else if (cmd == PIN_LONGTERM_BENCHMARK &&
+ WARN(!folio_is_longterm_pinnable(folio),
+ "pages[%lu] is NOT pinnable but pinned\n",
+ i)) {
+ dump_page(&folio->page, "gup_test failure");
+ break;
+ }
+ }
+ break;
+ }
+}
+
+static void dump_pages_test(struct gup_test *gup, struct page **pages,
+ unsigned long nr_pages)
+{
+ unsigned int index_to_dump;
+ unsigned int i;
+
+ /*
+ * Zero out any user-supplied page index that is out of range. Remember:
+ * .which_pages[] contains a 1-based set of page indices.
+ */
+ for (i = 0; i < GUP_TEST_MAX_PAGES_TO_DUMP; i++) {
+ if (gup->which_pages[i] > nr_pages) {
+ pr_warn("ZEROING due to out of range: .which_pages[%u]: %u\n",
+ i, gup->which_pages[i]);
+ gup->which_pages[i] = 0;
+ }
+ }
+
+ for (i = 0; i < GUP_TEST_MAX_PAGES_TO_DUMP; i++) {
+ index_to_dump = gup->which_pages[i];
+
+ if (index_to_dump) {
+ index_to_dump--; // Decode from 1-based, to 0-based
+ pr_info("---- page #%u, starting from user virt addr: 0x%llx\n",
+ index_to_dump, gup->addr);
+ dump_page(pages[index_to_dump],
+ "gup_test: dump_pages() test");
+ }
+ }
+}
+
+static int __gup_test_ioctl(unsigned int cmd,
+ struct gup_test *gup)
+{
+ ktime_t start_time, end_time;
+ unsigned long i, nr_pages, addr, next;
+ long nr;
+ struct page **pages;
+ int ret = 0;
+ bool needs_mmap_lock =
+ cmd != GUP_FAST_BENCHMARK && cmd != PIN_FAST_BENCHMARK;
+
+ if (gup->size > ULONG_MAX)
+ return -EINVAL;
+
+ nr_pages = gup->size / PAGE_SIZE;
+ pages = kvcalloc(nr_pages, sizeof(void *), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ if (needs_mmap_lock && mmap_read_lock_killable(current->mm)) {
+ ret = -EINTR;
+ goto free_pages;
+ }
+
+ i = 0;
+ nr = gup->nr_pages_per_call;
+ start_time = ktime_get();
+ for (addr = gup->addr; addr < gup->addr + gup->size; addr = next) {
+ if (nr != gup->nr_pages_per_call)
+ break;
+
+ next = addr + nr * PAGE_SIZE;
+ if (next > gup->addr + gup->size) {
+ next = gup->addr + gup->size;
+ nr = (next - addr) / PAGE_SIZE;
+ }
+
+ switch (cmd) {
+ case GUP_FAST_BENCHMARK:
+ nr = get_user_pages_fast(addr, nr, gup->gup_flags,
+ pages + i);
+ break;
+ case GUP_BASIC_TEST:
+ nr = get_user_pages(addr, nr, gup->gup_flags, pages + i);
+ break;
+ case PIN_FAST_BENCHMARK:
+ nr = pin_user_pages_fast(addr, nr, gup->gup_flags,
+ pages + i);
+ break;
+ case PIN_BASIC_TEST:
+ nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i);
+ break;
+ case PIN_LONGTERM_BENCHMARK:
+ nr = pin_user_pages(addr, nr,
+ gup->gup_flags | FOLL_LONGTERM,
+ pages + i);
+ break;
+ case DUMP_USER_PAGES_TEST:
+ if (gup->test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN)
+ nr = pin_user_pages(addr, nr, gup->gup_flags,
+ pages + i);
+ else
+ nr = get_user_pages(addr, nr, gup->gup_flags,
+ pages + i);
+ break;
+ default:
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (nr <= 0)
+ break;
+ i += nr;
+ }
+ end_time = ktime_get();
+
+ /* Shifting the meaning of nr_pages: now it is actual number pinned: */
+ nr_pages = i;
+
+ gup->get_delta_usec = ktime_us_delta(end_time, start_time);
+ gup->size = addr - gup->addr;
+
+ /*
+ * Take an un-benchmark-timed moment to verify DMA pinned
+ * state: print a warning if any non-dma-pinned pages are found:
+ */
+ verify_dma_pinned(cmd, pages, nr_pages);
+
+ if (cmd == DUMP_USER_PAGES_TEST)
+ dump_pages_test(gup, pages, nr_pages);
+
+ start_time = ktime_get();
+
+ put_back_pages(cmd, pages, nr_pages, gup->test_flags);
+
+ end_time = ktime_get();
+ gup->put_delta_usec = ktime_us_delta(end_time, start_time);
+
+unlock:
+ if (needs_mmap_lock)
+ mmap_read_unlock(current->mm);
+free_pages:
+ kvfree(pages);
+ return ret;
+}
+
+static DEFINE_MUTEX(pin_longterm_test_mutex);
+static struct page **pin_longterm_test_pages;
+static unsigned long pin_longterm_test_nr_pages;
+
+static inline void pin_longterm_test_stop(void)
+{
+ if (pin_longterm_test_pages) {
+ if (pin_longterm_test_nr_pages)
+ unpin_user_pages(pin_longterm_test_pages,
+ pin_longterm_test_nr_pages);
+ kvfree(pin_longterm_test_pages);
+ pin_longterm_test_pages = NULL;
+ pin_longterm_test_nr_pages = 0;
+ }
+}
+
+static inline int pin_longterm_test_start(unsigned long arg)
+{
+ long nr_pages, cur_pages, addr, remaining_pages;
+ int gup_flags = FOLL_LONGTERM;
+ struct pin_longterm_test args;
+ struct page **pages;
+ int ret = 0;
+ bool fast;
+
+ if (pin_longterm_test_pages)
+ return -EINVAL;
+
+ if (copy_from_user(&args, (void __user *)arg, sizeof(args)))
+ return -EFAULT;
+
+ if (args.flags &
+ ~(PIN_LONGTERM_TEST_FLAG_USE_WRITE|PIN_LONGTERM_TEST_FLAG_USE_FAST))
+ return -EINVAL;
+ if (!IS_ALIGNED(args.addr | args.size, PAGE_SIZE))
+ return -EINVAL;
+ if (args.size > LONG_MAX)
+ return -EINVAL;
+ nr_pages = args.size / PAGE_SIZE;
+ if (!nr_pages)
+ return -EINVAL;
+
+ pages = kvcalloc(nr_pages, sizeof(void *), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ if (args.flags & PIN_LONGTERM_TEST_FLAG_USE_WRITE)
+ gup_flags |= FOLL_WRITE;
+ fast = !!(args.flags & PIN_LONGTERM_TEST_FLAG_USE_FAST);
+
+ if (!fast && mmap_read_lock_killable(current->mm)) {
+ kvfree(pages);
+ return -EINTR;
+ }
+
+ pin_longterm_test_pages = pages;
+ pin_longterm_test_nr_pages = 0;
+
+ while (nr_pages - pin_longterm_test_nr_pages) {
+ remaining_pages = nr_pages - pin_longterm_test_nr_pages;
+ addr = args.addr + pin_longterm_test_nr_pages * PAGE_SIZE;
+
+ if (fast)
+ cur_pages = pin_user_pages_fast(addr, remaining_pages,
+ gup_flags, pages);
+ else
+ cur_pages = pin_user_pages(addr, remaining_pages,
+ gup_flags, pages);
+ if (cur_pages < 0) {
+ pin_longterm_test_stop();
+ ret = cur_pages;
+ break;
+ }
+ pin_longterm_test_nr_pages += cur_pages;
+ pages += cur_pages;
+ }
+
+ if (!fast)
+ mmap_read_unlock(current->mm);
+ return ret;
+}
+
+static inline int pin_longterm_test_read(unsigned long arg)
+{
+ __u64 user_addr;
+ unsigned long i;
+
+ if (!pin_longterm_test_pages)
+ return -EINVAL;
+
+ if (copy_from_user(&user_addr, (void __user *)arg, sizeof(user_addr)))
+ return -EFAULT;
+
+ for (i = 0; i < pin_longterm_test_nr_pages; i++) {
+ void *addr = kmap_local_page(pin_longterm_test_pages[i]);
+ unsigned long ret;
+
+ ret = copy_to_user((void __user *)(unsigned long)user_addr, addr,
+ PAGE_SIZE);
+ kunmap_local(addr);
+ if (ret)
+ return -EFAULT;
+ user_addr += PAGE_SIZE;
+ }
+ return 0;
+}
+
+static long pin_longterm_test_ioctl(struct file *filep, unsigned int cmd,
+ unsigned long arg)
+{
+ int ret = -EINVAL;
+
+ if (mutex_lock_killable(&pin_longterm_test_mutex))
+ return -EINTR;
+
+ switch (cmd) {
+ case PIN_LONGTERM_TEST_START:
+ ret = pin_longterm_test_start(arg);
+ break;
+ case PIN_LONGTERM_TEST_STOP:
+ pin_longterm_test_stop();
+ ret = 0;
+ break;
+ case PIN_LONGTERM_TEST_READ:
+ ret = pin_longterm_test_read(arg);
+ break;
+ }
+
+ mutex_unlock(&pin_longterm_test_mutex);
+ return ret;
+}
+
+static long gup_test_ioctl(struct file *filep, unsigned int cmd,
+ unsigned long arg)
+{
+ struct gup_test gup;
+ int ret;
+
+ switch (cmd) {
+ case GUP_FAST_BENCHMARK:
+ case PIN_FAST_BENCHMARK:
+ case PIN_LONGTERM_BENCHMARK:
+ case GUP_BASIC_TEST:
+ case PIN_BASIC_TEST:
+ case DUMP_USER_PAGES_TEST:
+ break;
+ case PIN_LONGTERM_TEST_START:
+ case PIN_LONGTERM_TEST_STOP:
+ case PIN_LONGTERM_TEST_READ:
+ return pin_longterm_test_ioctl(filep, cmd, arg);
+ default:
+ return -EINVAL;
+ }
+
+ if (copy_from_user(&gup, (void __user *)arg, sizeof(gup)))
+ return -EFAULT;
+
+ ret = __gup_test_ioctl(cmd, &gup);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)arg, &gup, sizeof(gup)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int gup_test_release(struct inode *inode, struct file *file)
+{
+ pin_longterm_test_stop();
+
+ return 0;
+}
+
+static const struct file_operations gup_test_fops = {
+ .open = nonseekable_open,
+ .unlocked_ioctl = gup_test_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+ .release = gup_test_release,
+};
+
+static int __init gup_test_init(void)
+{
+ debugfs_create_file_unsafe("gup_test", 0600, NULL, NULL,
+ &gup_test_fops);
+
+ return 0;
+}
+
+late_initcall(gup_test_init);
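
For completeness, here is a hedged userspace sketch (not part of this patch) that drives the PIN_LONGTERM_TEST_* state machine just added: START pins the whole range, READ copies the pinned contents back out through the kernel mapping, and STOP (or simply closing the file) unpins everything. It assumes debugfs is mounted at /sys/kernel/debug and that gup_test.h (added below) is on the include path.

#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include "gup_test.h"

int main(void)
{
	size_t size = 16 * 4096;
	int fd = open("/sys/kernel/debug/gup_test", O_RDWR);
	char *buf = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	__u64 dst = (__u64)(unsigned long)malloc(size);
	struct pin_longterm_test args = {
		.addr = (__u64)(unsigned long)buf,
		.size = size,
		.flags = PIN_LONGTERM_TEST_FLAG_USE_FAST,
	};

	if (fd < 0 || buf == MAP_FAILED || !dst)
		return 1;
	memset(buf, 0xab, size);			/* fault the pages in first */

	if (ioctl(fd, PIN_LONGTERM_TEST_START, &args))	/* pin every page */
		return 1;
	if (ioctl(fd, PIN_LONGTERM_TEST_READ, &dst))	/* copy pinned contents out */
		return 1;
	ioctl(fd, PIN_LONGTERM_TEST_STOP);		/* release; close() also does this */
	close(fd);
	return 0;
}
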
diff --git a/mm/gup_test.h b/mm/gup_test.h
new file mode 100644
index 000000000000..5b37b54e8bea
--- /dev/null
+++ b/mm/gup_test.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __GUP_TEST_H
+#define __GUP_TEST_H
+
+#include <linux/types.h>
+
+#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_test)
+#define PIN_FAST_BENCHMARK _IOWR('g', 2, struct gup_test)
+#define PIN_LONGTERM_BENCHMARK _IOWR('g', 3, struct gup_test)
+#define GUP_BASIC_TEST _IOWR('g', 4, struct gup_test)
+#define PIN_BASIC_TEST _IOWR('g', 5, struct gup_test)
+#define DUMP_USER_PAGES_TEST _IOWR('g', 6, struct gup_test)
+#define PIN_LONGTERM_TEST_START _IOW('g', 7, struct pin_longterm_test)
+#define PIN_LONGTERM_TEST_STOP _IO('g', 8)
+#define PIN_LONGTERM_TEST_READ _IOW('g', 9, __u64)
+
+#define GUP_TEST_MAX_PAGES_TO_DUMP 8
+
+#define GUP_TEST_FLAG_DUMP_PAGES_USE_PIN 0x1
+
+struct gup_test {
+ __u64 get_delta_usec;
+ __u64 put_delta_usec;
+ __u64 addr;
+ __u64 size;
+ __u32 nr_pages_per_call;
+ __u32 gup_flags;
+ __u32 test_flags;
+ /*
+ * Each non-zero entry is the number of the page (1-based: first page is
+ * page 1, so that zero entries mean "do nothing") from the .addr base.
+ */
+ __u32 which_pages[GUP_TEST_MAX_PAGES_TO_DUMP];
+};
+
+#define PIN_LONGTERM_TEST_FLAG_USE_WRITE 1
+#define PIN_LONGTERM_TEST_FLAG_USE_FAST 2
+
+struct pin_longterm_test {
+ __u64 addr;
+ __u64 size;
+ __u32 flags;
+};
+
+#endif /* __GUP_TEST_H */
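
The structure above is the whole ABI for the benchmark and dump commands. A hedged userspace sketch of a DUMP_USER_PAGES_TEST call follows; the helper name and the 4 KiB page-size assumption are illustrative, and the key detail is that .which_pages[] entries are 1-based, with zero meaning "skip this slot".

#include <sys/ioctl.h>
#include "gup_test.h"

/* Illustrative only: dump the first and last page of an already-faulted-in
 * buffer via the gup_test debugfs fd (assumes 4 KiB pages). */
static int dump_first_and_last(int gup_test_fd, void *buf, __u64 size)
{
	struct gup_test gup = {
		.addr = (__u64)(unsigned long)buf,
		.size = size,
		.nr_pages_per_call = 64,
		.gup_flags = 0,				/* read-only lookup */
		.test_flags = GUP_TEST_FLAG_DUMP_PAGES_USE_PIN,
		.which_pages = { 1, size / 4096 },	/* 1-based page indices */
	};

	return ioctl(gup_test_fd, DUMP_USER_PAGES_TEST, &gup);
}
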
diff --git a/mm/highmem.c b/mm/highmem.c
index 64d8dea47dd1..e19269093a93 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -23,7 +23,6 @@
#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/mempool.h>
-#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
@@ -31,9 +30,16 @@
#include <asm/tlbflush.h>
#include <linux/vmalloc.h>
-#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
-DEFINE_PER_CPU(int, __kmap_atomic_idx);
+#ifdef CONFIG_KMAP_LOCAL
+static inline int kmap_local_calc_idx(int idx)
+{
+ return idx + KM_MAX_IDX * smp_processor_id();
+}
+
+#ifndef arch_kmap_local_map_idx
+#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx)
#endif
+#endif /* CONFIG_KMAP_LOCAL */
/*
* Virtual_count is not a pure "count".
@@ -108,9 +114,7 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
atomic_long_t _totalhigh_pages __read_mostly;
EXPORT_SYMBOL(_totalhigh_pages);
-EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
-
-unsigned int nr_free_highpages (void)
+unsigned int __nr_free_highpages(void)
{
struct zone *zone;
unsigned int pages = 0;
@@ -126,7 +130,7 @@ unsigned int nr_free_highpages (void)
static int pkmap_count[LAST_PKMAP];
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
-pte_t * pkmap_page_table;
+pte_t *pkmap_page_table;
/*
* Most architectures have no use for kmap_high_get(), so let's abstract
@@ -147,18 +151,36 @@ pte_t * pkmap_page_table;
do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
#endif
-struct page *kmap_to_page(void *vaddr)
+struct page *__kmap_to_page(void *vaddr)
{
+ unsigned long base = (unsigned long) vaddr & PAGE_MASK;
+ struct kmap_ctrl *kctrl = &current->kmap_ctrl;
unsigned long addr = (unsigned long)vaddr;
+ int i;
- if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
- int i = PKMAP_NR(addr);
- return pte_page(pkmap_page_table[i]);
+ /* kmap() mappings */
+ if (WARN_ON_ONCE(addr >= PKMAP_ADDR(0) &&
+ addr < PKMAP_ADDR(LAST_PKMAP)))
+ return pte_page(ptep_get(&pkmap_page_table[PKMAP_NR(addr)]));
+
+ /* kmap_local_page() mappings */
+ if (WARN_ON_ONCE(base >= __fix_to_virt(FIX_KMAP_END) &&
+ base < __fix_to_virt(FIX_KMAP_BEGIN))) {
+ for (i = 0; i < kctrl->idx; i++) {
+ unsigned long base_addr;
+ int idx;
+
+ idx = arch_kmap_local_map_idx(i, pte_pfn(kctrl->pteval[i]));
+ base_addr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+
+ if (base_addr == base)
+ return pte_page(kctrl->pteval[i]);
+ }
}
- return virt_to_page(addr);
+ return virt_to_page(vaddr);
}
-EXPORT_SYMBOL(kmap_to_page);
+EXPORT_SYMBOL(__kmap_to_page);
static void flush_all_zero_pkmaps(void)
{
@@ -169,6 +191,7 @@ static void flush_all_zero_pkmaps(void)
for (i = 0; i < LAST_PKMAP; i++) {
struct page *page;
+ pte_t ptent;
/*
* zero means we don't have anything to do,
@@ -181,7 +204,8 @@ static void flush_all_zero_pkmaps(void)
pkmap_count[i] = 0;
/* sanity check */
- BUG_ON(pte_none(pkmap_page_table[i]));
+ ptent = ptep_get(&pkmap_page_table[i]);
+ BUG_ON(pte_none(ptent));
/*
* Don't need an atomic fetch-and-clear op here;
@@ -190,7 +214,7 @@ static void flush_all_zero_pkmaps(void)
* getting the kmap_lock (which is held here).
* So no dangers, even with speculative execution.
*/
- page = pte_page(pkmap_page_table[i]);
+ page = pte_page(ptent);
pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]);
set_page_address(page, NULL);
@@ -200,10 +224,7 @@ static void flush_all_zero_pkmaps(void)
flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
}
-/**
- * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings
- */
-void kmap_flush_unused(void)
+void __kmap_flush_unused(void)
{
lock_kmap();
flush_all_zero_pkmaps();
@@ -287,9 +308,8 @@ void *kmap_high(struct page *page)
pkmap_count[PKMAP_NR(vaddr)]++;
BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
unlock_kmap();
- return (void*) vaddr;
+ return (void *) vaddr;
}
-
EXPORT_SYMBOL(kmap_high);
#ifdef ARCH_NEEDS_KMAP_HIGH_GET
@@ -314,7 +334,7 @@ void *kmap_high_get(struct page *page)
pkmap_count[PKMAP_NR(vaddr)]++;
}
unlock_kmap_any(flags);
- return (void*) vaddr;
+ return (void *) vaddr;
}
#endif
@@ -367,8 +387,322 @@ void kunmap_high(struct page *page)
if (need_wakeup)
wake_up(pkmap_map_wait);
}
-
EXPORT_SYMBOL(kunmap_high);
+
+void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
+ unsigned start2, unsigned end2)
+{
+ unsigned int i;
+
+ BUG_ON(end1 > page_size(page) || end2 > page_size(page));
+
+ if (start1 >= end1)
+ start1 = end1 = 0;
+ if (start2 >= end2)
+ start2 = end2 = 0;
+
+ for (i = 0; i < compound_nr(page); i++) {
+ void *kaddr = NULL;
+
+ if (start1 >= PAGE_SIZE) {
+ start1 -= PAGE_SIZE;
+ end1 -= PAGE_SIZE;
+ } else {
+ unsigned this_end = min_t(unsigned, end1, PAGE_SIZE);
+
+ if (end1 > start1) {
+ kaddr = kmap_local_page(page + i);
+ memset(kaddr + start1, 0, this_end - start1);
+ }
+ end1 -= this_end;
+ start1 = 0;
+ }
+
+ if (start2 >= PAGE_SIZE) {
+ start2 -= PAGE_SIZE;
+ end2 -= PAGE_SIZE;
+ } else {
+ unsigned this_end = min_t(unsigned, end2, PAGE_SIZE);
+
+ if (end2 > start2) {
+ if (!kaddr)
+ kaddr = kmap_local_page(page + i);
+ memset(kaddr + start2, 0, this_end - start2);
+ }
+ end2 -= this_end;
+ start2 = 0;
+ }
+
+ if (kaddr) {
+ kunmap_local(kaddr);
+ flush_dcache_page(page + i);
+ }
+
+ if (!end1 && !end2)
+ break;
+ }
+
+ BUG_ON((start1 | start2 | end1 | end2) != 0);
+}
+EXPORT_SYMBOL(zero_user_segments);
+#endif /* CONFIG_HIGHMEM */
+
+#ifdef CONFIG_KMAP_LOCAL
+
+#include <asm/kmap_size.h>
+
+/*
+ * With DEBUG_KMAP_LOCAL the stack depth is doubled and every second
+ * slot is unused which acts as a guard page
+ */
+#ifdef CONFIG_DEBUG_KMAP_LOCAL
+# define KM_INCR 2
+#else
+# define KM_INCR 1
+#endif
+
+static inline int kmap_local_idx_push(void)
+{
+ WARN_ON_ONCE(in_hardirq() && !irqs_disabled());
+ current->kmap_ctrl.idx += KM_INCR;
+ BUG_ON(current->kmap_ctrl.idx >= KM_MAX_IDX);
+ return current->kmap_ctrl.idx - 1;
+}
+
+static inline int kmap_local_idx(void)
+{
+ return current->kmap_ctrl.idx - 1;
+}
+
+static inline void kmap_local_idx_pop(void)
+{
+ current->kmap_ctrl.idx -= KM_INCR;
+ BUG_ON(current->kmap_ctrl.idx < 0);
+}
+
+#ifndef arch_kmap_local_post_map
+# define arch_kmap_local_post_map(vaddr, pteval) do { } while (0)
+#endif
+
+#ifndef arch_kmap_local_pre_unmap
+# define arch_kmap_local_pre_unmap(vaddr) do { } while (0)
+#endif
+
+#ifndef arch_kmap_local_post_unmap
+# define arch_kmap_local_post_unmap(vaddr) do { } while (0)
+#endif
+
+#ifndef arch_kmap_local_unmap_idx
+#define arch_kmap_local_unmap_idx(idx, vaddr) kmap_local_calc_idx(idx)
+#endif
+
+#ifndef arch_kmap_local_high_get
+static inline void *arch_kmap_local_high_get(struct page *page)
+{
+ return NULL;
+}
+#endif
+
+#ifndef arch_kmap_local_set_pte
+#define arch_kmap_local_set_pte(mm, vaddr, ptep, ptev) \
+ set_pte_at(mm, vaddr, ptep, ptev)
+#endif
+
+/* Unmap a local mapping which was obtained by kmap_high_get() */
+static inline bool kmap_high_unmap_local(unsigned long vaddr)
+{
+#ifdef ARCH_NEEDS_KMAP_HIGH_GET
+ if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
+ kunmap_high(pte_page(ptep_get(&pkmap_page_table[PKMAP_NR(vaddr)])));
+ return true;
+ }
+#endif
+ return false;
+}
+
+static pte_t *__kmap_pte;
+
+static pte_t *kmap_get_pte(unsigned long vaddr, int idx)
+{
+ if (IS_ENABLED(CONFIG_KMAP_LOCAL_NON_LINEAR_PTE_ARRAY))
+ /*
+ * Set by the arch if __kmap_pte[-idx] does not produce
+ * the correct entry.
+ */
+ return virt_to_kpte(vaddr);
+ if (!__kmap_pte)
+ __kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN));
+ return &__kmap_pte[-idx];
+}
+
+void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pteval, *kmap_pte;
+ unsigned long vaddr;
+ int idx;
+
+ /*
+ * Disable migration so resulting virtual address is stable
+ * across preemption.
+ */
+ migrate_disable();
+ preempt_disable();
+ idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn);
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ kmap_pte = kmap_get_pte(vaddr, idx);
+ BUG_ON(!pte_none(ptep_get(kmap_pte)));
+ pteval = pfn_pte(pfn, prot);
+ arch_kmap_local_set_pte(&init_mm, vaddr, kmap_pte, pteval);
+ arch_kmap_local_post_map(vaddr, pteval);
+ current->kmap_ctrl.pteval[kmap_local_idx()] = pteval;
+ preempt_enable();
+
+ return (void *)vaddr;
+}
+EXPORT_SYMBOL_GPL(__kmap_local_pfn_prot);
+
+void *__kmap_local_page_prot(struct page *page, pgprot_t prot)
+{
+ void *kmap;
+
+ /*
+ * To broaden the usage of the actual kmap_local() machinery always map
+ * pages when debugging is enabled and the architecture has no problems
+ * with alias mappings.
+ */
+ if (!IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) && !PageHighMem(page))
+ return page_address(page);
+
+ /* Try kmap_high_get() if architecture has it enabled */
+ kmap = arch_kmap_local_high_get(page);
+ if (kmap)
+ return kmap;
+
+ return __kmap_local_pfn_prot(page_to_pfn(page), prot);
+}
+EXPORT_SYMBOL(__kmap_local_page_prot);
+
+void kunmap_local_indexed(const void *vaddr)
+{
+ unsigned long addr = (unsigned long) vaddr & PAGE_MASK;
+ pte_t *kmap_pte;
+ int idx;
+
+ if (addr < __fix_to_virt(FIX_KMAP_END) ||
+ addr > __fix_to_virt(FIX_KMAP_BEGIN)) {
+ if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP)) {
+ /* This _should_ never happen! See above. */
+ WARN_ON_ONCE(1);
+ return;
+ }
+ /*
+ * Handle mappings which were obtained by kmap_high_get()
+ * first as the virtual address of such mappings is below
+ * PAGE_OFFSET. Warn for all other addresses which are in
+ * the user space part of the virtual address space.
+ */
+ if (!kmap_high_unmap_local(addr))
+ WARN_ON_ONCE(addr < PAGE_OFFSET);
+ return;
+ }
+
+ preempt_disable();
+ idx = arch_kmap_local_unmap_idx(kmap_local_idx(), addr);
+ WARN_ON_ONCE(addr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
+
+ kmap_pte = kmap_get_pte(addr, idx);
+ arch_kmap_local_pre_unmap(addr);
+ pte_clear(&init_mm, addr, kmap_pte);
+ arch_kmap_local_post_unmap(addr);
+ current->kmap_ctrl.pteval[kmap_local_idx()] = __pte(0);
+ kmap_local_idx_pop();
+ preempt_enable();
+ migrate_enable();
+}
+EXPORT_SYMBOL(kunmap_local_indexed);
+
+/*
+ * Invoked before switch_to(). This is safe even when during or after
+ * clearing the maps an interrupt which needs a kmap_local happens because
+ * the task::kmap_ctrl.idx is not modified by the unmapping code so a
+ * nested kmap_local will use the next unused index and restore the index
+ * on unmap. The already cleared kmaps of the outgoing task are irrelevant
+ * because the interrupt context does not know about them. The same applies
+ * when scheduling back in for an interrupt which happens before the
+ * restore is complete.
+ */
+void __kmap_local_sched_out(void)
+{
+ struct task_struct *tsk = current;
+ pte_t *kmap_pte;
+ int i;
+
+ /* Clear kmaps */
+ for (i = 0; i < tsk->kmap_ctrl.idx; i++) {
+ pte_t pteval = tsk->kmap_ctrl.pteval[i];
+ unsigned long addr;
+ int idx;
+
+ /* With debug all even slots are unmapped and act as guard */
+ if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL) && !(i & 0x01)) {
+ WARN_ON_ONCE(pte_val(pteval) != 0);
+ continue;
+ }
+ if (WARN_ON_ONCE(pte_none(pteval)))
+ continue;
+
+ /*
+ * This is a horrible hack for XTENSA to calculate the
+ * coloured PTE index. Uses the PFN encoded into the pteval
+ * and the map index calculation because the actual mapped
+ * virtual address is not stored in task::kmap_ctrl.
+ * For any sane architecture this is optimized out.
+ */
+ idx = arch_kmap_local_map_idx(i, pte_pfn(pteval));
+
+ addr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ kmap_pte = kmap_get_pte(addr, idx);
+ arch_kmap_local_pre_unmap(addr);
+ pte_clear(&init_mm, addr, kmap_pte);
+ arch_kmap_local_post_unmap(addr);
+ }
+}
+
+void __kmap_local_sched_in(void)
+{
+ struct task_struct *tsk = current;
+ pte_t *kmap_pte;
+ int i;
+
+ /* Restore kmaps */
+ for (i = 0; i < tsk->kmap_ctrl.idx; i++) {
+ pte_t pteval = tsk->kmap_ctrl.pteval[i];
+ unsigned long addr;
+ int idx;
+
+ /* With debug all even slots are unmapped and act as guard */
+ if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL) && !(i & 0x01)) {
+ WARN_ON_ONCE(pte_val(pteval) != 0);
+ continue;
+ }
+ if (WARN_ON_ONCE(pte_none(pteval)))
+ continue;
+
+ /* See comment in __kmap_local_sched_out() */
+ idx = arch_kmap_local_map_idx(i, pte_pfn(pteval));
+ addr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ kmap_pte = kmap_get_pte(addr, idx);
+ set_pte_at(&init_mm, addr, kmap_pte, pteval);
+ arch_kmap_local_post_map(addr, pteval);
+ }
+}
+
+void kmap_local_fork(struct task_struct *tsk)
+{
+ if (WARN_ON_ONCE(tsk->kmap_ctrl.idx))
+ memset(&tsk->kmap_ctrl, 0, sizeof(tsk->kmap_ctrl));
+}
+
#endif
#if defined(HASHED_PAGE_VIRTUAL)
@@ -423,15 +757,14 @@ void *page_address(const struct page *page)
list_for_each_entry(pam, &pas->lh, list) {
if (pam->page == page) {
ret = pam->virtual;
- goto done;
+ break;
}
}
}
-done:
+
spin_unlock_irqrestore(&pas->lock, flags);
return ret;
}
-
EXPORT_SYMBOL(page_address);
/**
@@ -461,13 +794,12 @@ void set_page_address(struct page *page, void *virtual)
list_for_each_entry(pam, &pas->lh, list) {
if (pam->page == page) {
list_del(&pam->list);
- spin_unlock_irqrestore(&pas->lock, flags);
- goto done;
+ break;
}
}
spin_unlock_irqrestore(&pas->lock, flags);
}
-done:
+
return;
}
@@ -481,4 +813,4 @@ void __init page_address_init(void)
}
}
-#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
+#endif /* defined(HASHED_PAGE_VIRTUAL) */
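
The kmap_local machinery added to this file relies on mappings being released in strict LIFO order on the current task, which is what lets __kmap_local_sched_out()/__kmap_local_sched_in() save and restore them by stack index at context switch. A hedged sketch of the expected calling pattern is below (the helper itself is made up; copy_highpage() would normally be used for exactly this job, and the calls come from <linux/highmem.h>).

/*
 * Illustrative only: kmap_local_page() mappings are task-local and must be
 * unmapped in the reverse (LIFO) order of mapping.
 */
static void my_copy_page(struct page *dst, struct page *src)
{
	void *d = kmap_local_page(dst);	/* pushes kmap_ctrl.idx */
	void *s = kmap_local_page(src);	/* nests: takes the next index */

	memcpy(d, s, PAGE_SIZE);

	kunmap_local(s);		/* pop in reverse order */
	kunmap_local(d);
	flush_dcache_page(dst);
}
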
diff --git a/mm/hmm.c b/mm/hmm.c
index 943cb2ba4442..277ddcab4947 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -26,6 +26,8 @@
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
+#include "internal.h"
+
struct hmm_vma_walk {
struct hmm_range *range;
unsigned long last;
@@ -210,14 +212,6 @@ int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
unsigned long end, unsigned long hmm_pfns[], pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static inline bool hmm_is_device_private_entry(struct hmm_range *range,
- swp_entry_t entry)
-{
- return is_device_private_entry(entry) &&
- device_private_entry_to_page(entry)->pgmap->owner ==
- range->dev_private_owner;
-}
-
static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range,
pte_t pte)
{
@@ -234,10 +228,10 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
struct hmm_range *range = hmm_vma_walk->range;
unsigned int required_fault;
unsigned long cpu_flags;
- pte_t pte = *ptep;
+ pte_t pte = ptep_get(ptep);
uint64_t pfn_req_flags = *hmm_pfn;
- if (pte_none(pte)) {
+ if (pte_none_mostly(pte)) {
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (required_fault)
@@ -250,15 +244,16 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
swp_entry_t entry = pte_to_swp_entry(pte);
/*
- * Never fault in device private pages, but just report
- * the PFN even if not present.
+ * Don't fault in device private pages owned by the caller,
+ * just report the PFN.
*/
- if (hmm_is_device_private_entry(range, entry)) {
+ if (is_device_private_entry(entry) &&
+ pfn_swap_entry_to_page(entry)->pgmap->owner ==
+ range->dev_private_owner) {
cpu_flags = HMM_PFN_VALID;
- if (is_write_device_private_entry(entry))
+ if (is_writable_device_private_entry(entry))
cpu_flags |= HMM_PFN_WRITE;
- *hmm_pfn = device_private_entry_to_pfn(entry) |
- cpu_flags;
+ *hmm_pfn = swp_offset_pfn(entry) | cpu_flags;
return 0;
}
@@ -272,6 +267,12 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
if (!non_swap_entry(entry))
goto fault;
+ if (is_device_private_entry(entry))
+ goto fault;
+
+ if (is_device_exclusive_entry(entry))
+ goto fault;
+
if (is_migration_entry(entry)) {
pte_unmap(ptep);
hmm_vma_walk->last = addr;
@@ -291,10 +292,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
goto fault;
/*
+ * Bypass devmap pte such as DAX page when all pfn requested
+ * flags(pfn_req_flags) are fulfilled.
* Since each architecture defines a struct page for the zero page, just
* fall through and treat it like a normal page.
*/
- if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) {
+ if (!vm_normal_page(walk->vma, addr, pte) &&
+ !pte_devmap(pte) &&
+ !is_zero_pfn(pte_pfn(pte))) {
if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
pte_unmap(ptep);
return -EFAULT;
@@ -327,7 +332,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
pmd_t pmd;
again:
- pmd = READ_ONCE(*pmdp);
+ pmd = pmdp_get_lockless(pmdp);
if (pmd_none(pmd))
return hmm_vma_walk_hole(start, end, -1, walk);
@@ -356,8 +361,7 @@ again:
* huge or device mapping one and compute corresponding pfn
* values.
*/
- pmd = pmd_read_atomic(pmdp);
- barrier();
+ pmd = pmdp_get_lockless(pmdp);
if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
goto again;
@@ -377,6 +381,8 @@ again:
}
ptep = pte_offset_map(pmdp, addr);
+ if (!ptep)
+ goto again;
for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
int r;
@@ -409,7 +415,6 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
struct hmm_range *range = hmm_vma_walk->range;
unsigned long addr = start;
pud_t pud;
- int ret = 0;
spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);
if (!ptl)
@@ -458,7 +463,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
out_unlock:
spin_unlock(ptl);
- return ret;
+ return 0;
}
#else
#define hmm_vma_walk_pud NULL
@@ -489,8 +494,21 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
if (required_fault) {
+ int ret;
+
spin_unlock(ptl);
- return hmm_vma_fault(addr, end, required_fault, walk);
+ hugetlb_vma_unlock_read(vma);
+ /*
+ * Avoid deadlock: drop the vma lock before calling
+ * hmm_vma_fault(), which will itself potentially take and
+ * drop the vma lock. This is also correct from a
+ * protection point of view, because there is no further
+ * use here of either pte or ptl after dropping the vma
+ * lock.
+ */
+ ret = hmm_vma_fault(addr, end, required_fault, walk);
+ hugetlb_vma_lock_read(vma);
+ return ret;
}
pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
@@ -511,7 +529,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma;
- if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) &&
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
vma->vm_flags & VM_READ)
return 0;
@@ -544,6 +562,7 @@ static const struct mm_walk_ops hmm_walk_ops = {
.pte_hole = hmm_vma_walk_hole,
.hugetlb_entry = hmm_vma_walk_hugetlb_entry,
.test_walk = hmm_vma_walk_test,
+ .walk_lock = PGWALK_RDLOCK,
};
/**
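
The device-private handling above means hmm_range_fault() reports PFNs for entries whose pgmap owner matches range->dev_private_owner and faults on everyone else's. A hedged sketch of how a driver might fill in the range follows (the helper is hypothetical, it needs <linux/hmm.h> and <linux/mmu_notifier.h>, and the mmu_interval_read_retry() check the caller still has to do before trusting the result is omitted).

/* Illustrative only: snapshot one VA range, claiming our own device-private
 * pages via dev_private_owner so they are reported rather than faulted. */
static int my_snapshot_range(struct mmu_interval_notifier *notifier,
			     unsigned long start, unsigned long end,
			     unsigned long *pfns, void *owner)
{
	struct hmm_range range = {
		.notifier		= notifier,
		.start			= start,
		.end			= end,
		.hmm_pfns		= pfns,
		.default_flags		= HMM_PFN_REQ_FAULT,
		.dev_private_owner	= owner,
	};
	int ret;

	do {
		range.notifier_seq = mmu_interval_read_begin(notifier);
		mmap_read_lock(notifier->mm);
		ret = hmm_range_fault(&range);
		mmap_read_unlock(notifier->mm);
	} while (ret == -EBUSY);

	return ret;
}
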
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 65c289c13b58..164d22365bde 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -7,6 +7,7 @@
#include <linux/mm.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
@@ -17,6 +18,7 @@
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
+#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
@@ -33,10 +35,16 @@
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
+#include <linux/sched/sysctl.h>
+#include <linux/memory-tiers.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
+#include "swap.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/thp.h>
/*
* By default, transparent hugepage support is disabled in order to avoid
@@ -61,47 +69,114 @@ static struct shrinker deferred_split_shrinker;
static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
-bool transparent_hugepage_enabled(struct vm_area_struct *vma)
+bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
+ bool smaps, bool in_pf, bool enforce_sysfs)
{
- /* The addr is used to check if the vma size fits */
- unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
+ if (!vma->vm_mm) /* vdso */
+ return false;
- if (!transhuge_vma_suitable(vma, addr))
+ /*
+ * Explicitly disabled through madvise or prctl, or some
+ * architectures may disable THP for some mappings, for
+ * example, s390 kvm.
+ */
+ if ((vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+ return false;
+ /*
+ * If the hardware/firmware marked hugepage support disabled.
+ */
+ if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
return false;
- if (vma_is_anonymous(vma))
- return __transparent_hugepage_enabled(vma);
- if (vma_is_shmem(vma))
- return shmem_huge_enabled(vma);
- return false;
+ /* khugepaged doesn't collapse DAX vma, but page fault is fine. */
+ if (vma_is_dax(vma))
+ return in_pf;
+
+ /*
+ * Special VMA and hugetlb VMA.
+ * Must be checked after dax since some dax mappings may have
+ * VM_MIXEDMAP set.
+ */
+ if (vm_flags & VM_NO_KHUGEPAGED)
+ return false;
+
+ /*
+ * Check alignment for file vma and size for both file and anon vma.
+ *
+ * Skip the check for page fault. Huge fault does the check in fault
+ * handlers. And this check is not suitable for huge PUD fault.
+ */
+ if (!in_pf &&
+ !transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE)))
+ return false;
+
+ /*
+ * Enabled via shmem mount options or sysfs settings.
+ * Must be done before hugepage flags check since shmem has its
+ * own flags.
+ */
+ if (!in_pf && shmem_file(vma->vm_file))
+ return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
+ !enforce_sysfs, vma->vm_mm, vm_flags);
+
+ /* Enforce sysfs THP requirements as necessary */
+ if (enforce_sysfs &&
+ (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
+ !hugepage_flags_always())))
+ return false;
+
+ /* Only regular file is valid */
+ if (!in_pf && file_thp_enabled(vma))
+ return true;
+
+ if (!vma_is_anonymous(vma))
+ return false;
+
+ if (vma_is_temporary_stack(vma))
+ return false;
+
+ /*
+ * THPeligible bit of smaps should show 1 for proper VMAs even
+ * though anon_vma is not initialized yet.
+ *
+ * Allow page fault since anon_vma may be not initialized until
+ * the first page fault.
+ */
+ if (!vma->anon_vma)
+ return (smaps || in_pf);
+
+ return true;
}
-static struct page *get_huge_zero_page(void)
+static bool get_huge_zero_page(void)
{
struct page *zero_page;
retry:
if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
- return READ_ONCE(huge_zero_page);
+ return true;
zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
HPAGE_PMD_ORDER);
if (!zero_page) {
count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
- return NULL;
+ return false;
}
- count_vm_event(THP_ZERO_PAGE_ALLOC);
preempt_disable();
if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
preempt_enable();
__free_pages(zero_page, compound_order(zero_page));
goto retry;
}
+ WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));
/* We take additional reference here. It will be put back by shrinker */
atomic_set(&huge_zero_refcount, 2);
preempt_enable();
- return READ_ONCE(huge_zero_page);
+ count_vm_event(THP_ZERO_PAGE_ALLOC);
+ return true;
}
static void put_huge_zero_page(void)
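
get_huge_zero_page() above is a small lock-free singleton: the first caller allocates the page, publishes it with cmpxchg(), and sets the refcount to 2 so that one reference stays parked for the shrinker; later callers only need atomic_inc_not_zero(), retrying if they race with the shrinker dropping the last reference. The same pattern in miniature, with every name below made up rather than kernel API:

/* Illustrative only: lazily-allocated singleton with one extra reference
 * parked for a lazy reclaimer, mirroring the scheme used above. */
static atomic_t my_obj_refcount;
static struct my_obj *my_obj;

static bool my_obj_get(void)
{
	struct my_obj *obj;

retry:
	if (likely(atomic_inc_not_zero(&my_obj_refcount)))
		return true;			/* already live: reference taken */

	obj = my_obj_alloc();			/* hypothetical allocator */
	if (!obj)
		return false;
	if (cmpxchg(&my_obj, NULL, obj) != NULL) {
		my_obj_free(obj);		/* lost the publish race */
		goto retry;
	}
	/* one reference for this caller, one parked for the reclaimer */
	atomic_set(&my_obj_refcount, 2);
	return true;
}
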
@@ -146,6 +221,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
struct page *zero_page = xchg(&huge_zero_page, NULL);
BUG_ON(zero_page == NULL);
+ WRITE_ONCE(huge_zero_pfn, ~0UL);
__free_pages(zero_page, compound_order(zero_page));
return HPAGE_PMD_NR;
}
@@ -163,12 +239,17 @@ static struct shrinker huge_zero_page_shrinker = {
static ssize_t enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
+ const char *output;
+
if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
- return sprintf(buf, "[always] madvise never\n");
- else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
- return sprintf(buf, "always [madvise] never\n");
+ output = "[always] madvise never";
+ else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags))
+ output = "always [madvise] never";
else
- return sprintf(buf, "always madvise [never]\n");
+ output = "always madvise [never]";
+
+ return sysfs_emit(buf, "%s\n", output);
}
static ssize_t enabled_store(struct kobject *kobj,
@@ -196,15 +277,15 @@ static ssize_t enabled_store(struct kobject *kobj,
}
return ret;
}
-static struct kobj_attribute enabled_attr =
- __ATTR(enabled, 0644, enabled_show, enabled_store);
+
+static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);
ssize_t single_hugepage_flag_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf,
- enum transparent_hugepage_flag flag)
+ struct kobj_attribute *attr, char *buf,
+ enum transparent_hugepage_flag flag)
{
- return sprintf(buf, "%d\n",
- !!test_bit(flag, &transparent_hugepage_flags));
+ return sysfs_emit(buf, "%d\n",
+ !!test_bit(flag, &transparent_hugepage_flags));
}
ssize_t single_hugepage_flag_store(struct kobject *kobj,
@@ -232,15 +313,24 @@ ssize_t single_hugepage_flag_store(struct kobject *kobj,
static ssize_t defrag_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
- return sprintf(buf, "[always] defer defer+madvise madvise never\n");
- if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
- return sprintf(buf, "always [defer] defer+madvise madvise never\n");
- if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
- return sprintf(buf, "always defer [defer+madvise] madvise never\n");
- if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
- return sprintf(buf, "always defer defer+madvise [madvise] never\n");
- return sprintf(buf, "always defer defer+madvise madvise [never]\n");
+ const char *output;
+
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
+ &transparent_hugepage_flags))
+ output = "[always] defer defer+madvise madvise never";
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
+ &transparent_hugepage_flags))
+ output = "always [defer] defer+madvise madvise never";
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
+ &transparent_hugepage_flags))
+ output = "always defer [defer+madvise] madvise never";
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
+ &transparent_hugepage_flags))
+ output = "always defer defer+madvise [madvise] never";
+ else
+ output = "always defer defer+madvise madvise [never]";
+
+ return sysfs_emit(buf, "%s\n", output);
}
static ssize_t defrag_store(struct kobject *kobj,
@@ -277,14 +367,13 @@ static ssize_t defrag_store(struct kobject *kobj,
return count;
}
-static struct kobj_attribute defrag_attr =
- __ATTR(defrag, 0644, defrag_show, defrag_store);
+static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
static ssize_t use_zero_page_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+ struct kobj_attribute *attr, char *buf)
{
return single_hugepage_flag_show(kobj, attr, buf,
- TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+ TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
@@ -292,13 +381,12 @@ static ssize_t use_zero_page_store(struct kobject *kobj,
return single_hugepage_flag_store(kobj, attr, buf, count,
TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
-static struct kobj_attribute use_zero_page_attr =
- __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
+static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);
static ssize_t hpage_pmd_size_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+ struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE);
+ return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
}
static struct kobj_attribute hpage_pmd_size_attr =
__ATTR_RO(hpage_pmd_size);
@@ -372,14 +460,14 @@ static int __init hugepage_init(void)
struct kobject *hugepage_kobj;
if (!has_transparent_hugepage()) {
- transparent_hugepage_flags = 0;
+ transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
return -EINVAL;
}
/*
* hugepages can't be allocated by the buddy allocator
*/
- MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
+ MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_ORDER);
/*
* we use page->mapping and page->index in second tail page
* as list_head: assuming THP order >= 2
@@ -394,10 +482,10 @@ static int __init hugepage_init(void)
if (err)
goto err_slab;
- err = register_shrinker(&huge_zero_page_shrinker);
+ err = register_shrinker(&huge_zero_page_shrinker, "thp-zero");
if (err)
goto err_hzp_shrinker;
- err = register_shrinker(&deferred_split_shrinker);
+ err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split");
if (err)
goto err_split_shrinker;
@@ -468,10 +556,11 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
}
#ifdef CONFIG_MEMCG
-static inline struct deferred_split *get_deferred_split_queue(struct page *page)
+static inline
+struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
- struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
- struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+ struct mem_cgroup *memcg = folio_memcg(folio);
+ struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
if (memcg)
return &memcg->deferred_split_queue;
@@ -479,9 +568,10 @@ static inline struct deferred_split *get_deferred_split_queue(struct page *page)
return &pgdat->deferred_split_queue;
}
#else
-static inline struct deferred_split *get_deferred_split_queue(struct page *page)
+static inline
+struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
- struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+ struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
return &pgdat->deferred_split_queue;
}
@@ -489,25 +579,24 @@ static inline struct deferred_split *get_deferred_split_queue(struct page *page)
void prep_transhuge_page(struct page *page)
{
- /*
- * we use page->mapping and page->indexlru in second tail page
- * as list_head: assuming THP order >= 2
- */
+ struct folio *folio = (struct folio *)page;
- INIT_LIST_HEAD(page_deferred_list(page));
- set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+ VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);
+ INIT_LIST_HEAD(&folio->_deferred_list);
+ folio_set_compound_dtor(folio, TRANSHUGE_PAGE_DTOR);
}
-bool is_transparent_hugepage(struct page *page)
+static inline bool is_transparent_hugepage(struct page *page)
{
+ struct folio *folio;
+
if (!PageCompound(page))
return false;
- page = compound_head(page);
- return is_huge_zero_page(page) ||
- page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
+ folio = page_folio(page);
+ return is_huge_zero_page(&folio->page) ||
+ folio->_folio_dtor == TRANSHUGE_PAGE_DTOR;
}
-EXPORT_SYMBOL_GPL(is_transparent_hugepage);
static unsigned long __thp_get_unmapped_area(struct file *filp,
unsigned long addr, unsigned long len,
@@ -551,13 +640,10 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long ret;
loff_t off = (loff_t)pgoff << PAGE_SHIFT;
- if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
- goto out;
-
ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
if (ret)
return ret;
-out:
+
return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
@@ -566,19 +652,20 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
struct page *page, gfp_t gfp)
{
struct vm_area_struct *vma = vmf->vma;
+ struct folio *folio = page_folio(page);
pgtable_t pgtable;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
vm_fault_t ret = 0;
- VM_BUG_ON_PAGE(!PageCompound(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
- if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
- put_page(page);
+ if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+ folio_put(folio);
count_vm_event(THP_FAULT_FALLBACK);
count_vm_event(THP_FAULT_FALLBACK_CHARGE);
return VM_FAULT_FALLBACK;
}
- cgroup_throttle_swaprate(page, gfp);
+ folio_throttle_swaprate(folio, gfp);
pgtable = pte_alloc_one(vma->vm_mm);
if (unlikely(!pgtable)) {
@@ -588,11 +675,11 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
/*
- * The memory barrier inside __SetPageUptodate makes sure that
+ * The memory barrier inside __folio_mark_uptodate makes sure that
* clear_huge_page writes become visible before the set_pmd_at()
* write.
*/
- __SetPageUptodate(page);
+ __folio_mark_uptodate(folio);
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_none(*vmf->pmd))) {
@@ -606,22 +693,21 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
/* Deliver the page fault to userland */
if (userfaultfd_missing(vma)) {
- vm_fault_t ret2;
-
spin_unlock(vmf->ptl);
- put_page(page);
+ folio_put(folio);
pte_free(vma->vm_mm, pgtable);
- ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
- VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
- return ret2;
+ ret = handle_userfault(vmf, VM_UFFD_MISSING);
+ VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+ return ret;
}
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- page_add_new_anon_rmap(page, vma, haddr, true);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ folio_add_new_anon_rmap(folio, vma, haddr);
+ folio_add_lru_vma(folio, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(vma->vm_mm);
spin_unlock(vmf->ptl);
@@ -635,7 +721,7 @@ unlock_release:
release:
if (pgtable)
pte_free(vma->vm_mm, pgtable);
- put_page(page);
+ folio_put(folio);
return ret;
}
@@ -649,9 +735,9 @@ release:
* available
* never: never stall for any thp allocation
*/
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
{
- const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+ const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
/* Always do synchronous compaction */
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
@@ -676,41 +762,38 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
}
/* Caller must hold page table lock. */
-static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
struct page *zero_page)
{
pmd_t entry;
if (!pmd_none(*pmd))
- return false;
+ return;
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
- if (pgtable)
- pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
mm_inc_nr_ptes(mm);
- return true;
}
vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
gfp_t gfp;
- struct page *page;
+ struct folio *folio;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
if (!transhuge_vma_suitable(vma, haddr))
return VM_FAULT_FALLBACK;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
- if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
- return VM_FAULT_OOM;
+ khugepaged_enter_vma(vma, vma->vm_flags);
+
if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm) &&
transparent_hugepage_use_zero_page()) {
pgtable_t pgtable;
struct page *zero_page;
- bool set;
vm_fault_t ret;
pgtable = pte_alloc_one(vma->vm_mm);
if (unlikely(!pgtable))
@@ -723,35 +806,35 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
}
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
ret = 0;
- set = false;
if (pmd_none(*vmf->pmd)) {
ret = check_stable_address_space(vma->vm_mm);
if (ret) {
spin_unlock(vmf->ptl);
+ pte_free(vma->vm_mm, pgtable);
} else if (userfaultfd_missing(vma)) {
spin_unlock(vmf->ptl);
+ pte_free(vma->vm_mm, pgtable);
ret = handle_userfault(vmf, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
} else {
set_huge_zero_page(pgtable, vma->vm_mm, vma,
haddr, vmf->pmd, zero_page);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
spin_unlock(vmf->ptl);
- set = true;
}
- } else
+ } else {
spin_unlock(vmf->ptl);
- if (!set)
pte_free(vma->vm_mm, pgtable);
+ }
return ret;
}
- gfp = alloc_hugepage_direct_gfpmask(vma);
- page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
- if (unlikely(!page)) {
+ gfp = vma_thp_gfp_mask(vma);
+ folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
+ if (unlikely(!folio)) {
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- prep_transhuge_page(page);
- return __do_huge_pmd_anonymous_page(vmf, page, gfp);
+ return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
}
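For readers following the folio conversion, here is a minimal sketch (not part of the patch) that condenses the allocation-and-charge sequence the converted anonymous-THP fault path now performs; the helper name thp_alloc_charged() is hypothetical, but every call it makes appears in the hunks above.

static struct folio *thp_alloc_charged(struct vm_area_struct *vma,
				       unsigned long haddr)
{
	gfp_t gfp = vma_thp_gfp_mask(vma);
	struct folio *folio;

	/* Allocate a PMD-sized folio at the huge-page-aligned fault address. */
	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
	if (unlikely(!folio)) {
		count_vm_event(THP_FAULT_FALLBACK);
		return NULL;
	}
	/* Charge the whole folio to the memcg before it becomes visible. */
	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
		folio_put(folio);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		return NULL;
	}
	folio_throttle_swaprate(folio, gfp);
	return folio;
}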
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -802,23 +885,20 @@ out_unlock:
}
/**
- * vmf_insert_pfn_pmd_prot - insert a pmd size pfn
+ * vmf_insert_pfn_pmd - insert a pmd size pfn
* @vmf: Structure describing the fault
* @pfn: pfn to insert
- * @pgprot: page protection to use
* @write: whether it's a write fault
*
- * Insert a pmd size pfn. See vmf_insert_pfn() for additional info and
- * also consult the vmf_insert_mixed_prot() documentation when
- * @pgprot != @vmf->vma->vm_page_prot.
+ * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
*
* Return: vm_fault_t value.
*/
-vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
- pgprot_t pgprot, bool write)
+vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
{
unsigned long addr = vmf->address & PMD_MASK;
struct vm_area_struct *vma = vmf->vma;
+ pgprot_t pgprot = vma->vm_page_prot;
pgtable_t pgtable = NULL;
/*
@@ -846,7 +926,7 @@ vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
return VM_FAULT_NOPAGE;
}
-EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd_prot);
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
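With the _prot variant gone, the page protection now always comes from vma->vm_page_prot. A hedged sketch of how a driver's huge-fault path would call the new export follows; my_dev_phys is an assumed device physical address, not something defined in this patch.

	pfn_t pfn = phys_to_pfn_t(my_dev_phys, PFN_DEV | PFN_MAP);
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	/* vma->vm_page_prot is picked up inside vmf_insert_pfn_pmd(). */
	return vmf_insert_pfn_pmd(vmf, pfn, write);

The PUD-sized variant below, vmf_insert_pfn_pud(), follows the same pattern.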
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
@@ -857,9 +937,10 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
}
static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
+ pud_t *pud, pfn_t pfn, bool write)
{
struct mm_struct *mm = vma->vm_mm;
+ pgprot_t prot = vma->vm_page_prot;
pud_t entry;
spinlock_t *ptl;
@@ -893,23 +974,20 @@ out_unlock:
}
/**
- * vmf_insert_pfn_pud_prot - insert a pud size pfn
+ * vmf_insert_pfn_pud - insert a pud size pfn
* @vmf: Structure describing the fault
* @pfn: pfn to insert
- * @pgprot: page protection to use
* @write: whether it's a write fault
*
- * Insert a pud size pfn. See vmf_insert_pfn() for additional info and
- * also consult the vmf_insert_mixed_prot() documentation when
- * @pgprot != @vmf->vma->vm_page_prot.
+ * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
*
* Return: vm_fault_t value.
*/
-vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
- pgprot_t pgprot, bool write)
+vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
{
unsigned long addr = vmf->address & PUD_MASK;
struct vm_area_struct *vma = vmf->vma;
+ pgprot_t pgprot = vma->vm_page_prot;
/*
* If we had pud_special, we could avoid all these restrictions,
@@ -927,22 +1005,22 @@ vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
track_pfn_insert(vma, &pgprot, pfn);
- insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
+ insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
return VM_FAULT_NOPAGE;
}
-EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot);
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, int flags)
+ pmd_t *pmd, bool write)
{
pmd_t _pmd;
_pmd = pmd_mkyoung(*pmd);
- if (flags & FOLL_WRITE)
+ if (write)
_pmd = pmd_mkdirty(_pmd);
if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
- pmd, _pmd, flags & FOLL_WRITE))
+ pmd, _pmd, write))
update_mmu_cache_pmd(vma, addr, pmd);
}
@@ -952,20 +1030,10 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn = pmd_pfn(*pmd);
struct mm_struct *mm = vma->vm_mm;
struct page *page;
+ int ret;
assert_spin_locked(pmd_lockptr(mm, pmd));
- /*
- * When we COW a devmap PMD entry, we split it into PTEs, so we should
- * not be in this function with `flags & FOLL_COW` set.
- */
- WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
-
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
- (FOLL_PIN | FOLL_GET)))
- return NULL;
-
if (flags & FOLL_WRITE && !pmd_write(*pmd))
return NULL;
@@ -975,7 +1043,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
return NULL;
if (flags & FOLL_TOUCH)
- touch_pmd(vma, addr, pmd, flags);
+ touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
/*
* device mapped pages can only be returned if the
@@ -989,15 +1057,16 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
- if (!try_grab_page(page, flags))
- page = ERR_PTR(-ENOMEM);
+ ret = try_grab_page(page, flags);
+ if (ret)
+ page = ERR_PTR(ret);
return page;
}
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
- struct vm_area_struct *vma)
+ struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
spinlock_t *dst_ptl, *src_ptl;
struct page *src_page;
@@ -1006,7 +1075,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
int ret = -ENOMEM;
/* Skip if it can be re-filled on fault */
- if (!vma_is_anonymous(vma))
+ if (!vma_is_anonymous(dst_vma))
return 0;
pgtable = pte_alloc_one(dst_mm);
@@ -1020,29 +1089,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = -EAGAIN;
pmd = *src_pmd;
- /*
- * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
- * does not have the VM_UFFD_WP, which means that the uffd
- * fork event is not enabled.
- */
- if (!(vma->vm_flags & VM_UFFD_WP))
- pmd = pmd_clear_uffd_wp(pmd);
-
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (unlikely(is_swap_pmd(pmd))) {
swp_entry_t entry = pmd_to_swp_entry(pmd);
VM_BUG_ON(!is_pmd_migration_entry(pmd));
- if (is_write_migration_entry(entry)) {
- make_migration_entry_read(&entry);
+ if (!is_readable_migration_entry(entry)) {
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
pmd = swp_entry_to_pmd(entry);
if (pmd_swp_soft_dirty(*src_pmd))
pmd = pmd_swp_mksoft_dirty(pmd);
+ if (pmd_swp_uffd_wp(*src_pmd))
+ pmd = pmd_swp_mkuffd_wp(pmd);
set_pmd_at(src_mm, addr, src_pmd, pmd);
}
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+ if (!userfaultfd_wp(dst_vma))
+ pmd = pmd_swp_clear_uffd_wp(pmd);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
ret = 0;
goto out_unlock;
@@ -1059,46 +1125,35 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* a page table.
*/
if (is_huge_zero_pmd(pmd)) {
- struct page *zero_page;
/*
* get_huge_zero_page() will never allocate a new page here,
* since we already have a zero page to copy. It just takes a
* reference.
*/
- zero_page = mm_get_huge_zero_page(dst_mm);
- set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
- zero_page);
- ret = 0;
- goto out_unlock;
+ mm_get_huge_zero_page(dst_mm);
+ goto out_zero_page;
}
src_page = pmd_page(pmd);
VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
- /*
- * If this page is a potentially pinned page, split and retry the fault
- * with smaller page size. Normally this should not happen because the
- * userspace should use MADV_DONTFORK upon pinned regions. This is a
- * best effort that the pinned pages won't be replaced by another
- * random page during the coming copy-on-write.
- */
- if (unlikely(is_cow_mapping(vma->vm_flags) &&
- atomic_read(&src_mm->has_pinned) &&
- page_maybe_dma_pinned(src_page))) {
+ get_page(src_page);
+ if (unlikely(page_try_dup_anon_rmap(src_page, true, src_vma))) {
+ /* Page maybe pinned: split and retry the fault on PTEs. */
+ put_page(src_page);
pte_free(dst_mm, pgtable);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
- __split_huge_pmd(vma, src_pmd, addr, false, NULL);
+ __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
return -EAGAIN;
}
-
- get_page(src_page);
- page_dup_rmap(src_page, true);
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+out_zero_page:
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
-
pmdp_set_wrprotect(src_mm, addr, src_pmd);
+ if (!userfaultfd_wp(dst_vma))
+ pmd = pmd_clear_uffd_wp(pmd);
pmd = pmd_mkold(pmd_wrprotect(pmd));
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
@@ -1112,15 +1167,15 @@ out:
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, int flags)
+ pud_t *pud, bool write)
{
pud_t _pud;
_pud = pud_mkyoung(*pud);
- if (flags & FOLL_WRITE)
+ if (write)
_pud = pud_mkdirty(_pud);
if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
- pud, _pud, flags & FOLL_WRITE))
+ pud, _pud, write))
update_mmu_cache_pud(vma, addr, pud);
}
@@ -1130,24 +1185,20 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn = pud_pfn(*pud);
struct mm_struct *mm = vma->vm_mm;
struct page *page;
+ int ret;
assert_spin_locked(pud_lockptr(mm, pud));
if (flags & FOLL_WRITE && !pud_write(*pud))
return NULL;
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
- (FOLL_PIN | FOLL_GET)))
- return NULL;
-
if (pud_present(*pud) && pud_devmap(*pud))
/* pass */;
else
return NULL;
if (flags & FOLL_TOUCH)
- touch_pud(vma, addr, pud, flags);
+ touch_pud(vma, addr, pud, flags & FOLL_WRITE);
/*
* device mapped pages can only be returned if the
@@ -1163,8 +1214,10 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
- if (!try_grab_page(page, flags))
- page = ERR_PTR(-ENOMEM);
+
+ ret = try_grab_page(page, flags);
+ if (ret)
+ page = ERR_PTR(ret);
return page;
}
@@ -1195,16 +1248,10 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
/* No huge zero pud yet */
}
- /* Please refer to comments in copy_huge_pmd() */
- if (unlikely(is_cow_mapping(vma->vm_flags) &&
- atomic_read(&src_mm->has_pinned) &&
- page_maybe_dma_pinned(pud_page(pud)))) {
- spin_unlock(src_ptl);
- spin_unlock(dst_ptl);
- __split_huge_pud(vma, src_pud, addr);
- return -EAGAIN;
- }
-
+ /*
+ * TODO: once we support anonymous pages, use page_try_dup_anon_rmap()
+ * and split if duplicating fails.
+ */
pudp_set_wrprotect(src_mm, addr, src_pud);
pud = pud_mkold(pud_wrprotect(pud));
set_pud_at(dst_mm, addr, dst_pud, pud);
@@ -1218,52 +1265,40 @@ out_unlock:
void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
- pud_t entry;
- unsigned long haddr;
bool write = vmf->flags & FAULT_FLAG_WRITE;
vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
if (unlikely(!pud_same(*vmf->pud, orig_pud)))
goto unlock;
- entry = pud_mkyoung(orig_pud);
- if (write)
- entry = pud_mkdirty(entry);
- haddr = vmf->address & HPAGE_PUD_MASK;
- if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
- update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
-
+ touch_pud(vmf->vma, vmf->address, vmf->pud, write);
unlock:
spin_unlock(vmf->ptl);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
+void huge_pmd_set_accessed(struct vm_fault *vmf)
{
- pmd_t entry;
- unsigned long haddr;
bool write = vmf->flags & FAULT_FLAG_WRITE;
vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
- if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
+ if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
goto unlock;
- entry = pmd_mkyoung(orig_pmd);
- if (write)
- entry = pmd_mkdirty(entry);
- haddr = vmf->address & HPAGE_PMD_MASK;
- if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
- update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
+ touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
unlock:
spin_unlock(vmf->ptl);
}
-vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
+vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
+ const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
struct vm_area_struct *vma = vmf->vma;
+ struct folio *folio;
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+ pmd_t orig_pmd = vmf->orig_pmd;
vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1279,53 +1314,135 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
}
page = pmd_page(orig_pmd);
- VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
+ folio = page_folio(page);
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+
+ /* Early check when only holding the PT lock. */
+ if (PageAnonExclusive(page))
+ goto reuse;
- /* Lock page for reuse_swap_page() */
- if (!trylock_page(page)) {
- get_page(page);
+ if (!folio_trylock(folio)) {
+ folio_get(folio);
spin_unlock(vmf->ptl);
- lock_page(page);
+ folio_lock(folio);
spin_lock(vmf->ptl);
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
spin_unlock(vmf->ptl);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return 0;
}
- put_page(page);
+ folio_put(folio);
+ }
+
+ /* Recheck after temporarily dropping the PT lock. */
+ if (PageAnonExclusive(page)) {
+ folio_unlock(folio);
+ goto reuse;
}
/*
- * We can only reuse the page if nobody else maps the huge page or it's
- * part.
+ * See do_wp_page(): we can only reuse the folio exclusively if
+ * there are no additional references. Note that we always drain
+ * the LRU cache immediately after adding a THP.
*/
- if (reuse_swap_page(page, NULL)) {
+ if (folio_ref_count(folio) >
+ 1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
+ goto unlock_fallback;
+ if (folio_test_swapcache(folio))
+ folio_free_swap(folio);
+ if (folio_ref_count(folio) == 1) {
pmd_t entry;
+
+ page_move_anon_rmap(page, vma);
+ folio_unlock(folio);
+reuse:
+ if (unlikely(unshare)) {
+ spin_unlock(vmf->ptl);
+ return 0;
+ }
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- unlock_page(page);
spin_unlock(vmf->ptl);
- return VM_FAULT_WRITE;
+ return 0;
}
- unlock_page(page);
+unlock_fallback:
+ folio_unlock(folio);
spin_unlock(vmf->ptl);
fallback:
__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
return VM_FAULT_FALLBACK;
}
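The reuse criterion above can be read as a small predicate; the following sketch is a hypothetical helper (not in the patch) that restates it: the folio may be remapped writable only when the mapping is the sole reference holder, where swap-cache references are tolerated because folio_free_swap() can drop them.

static bool thp_wp_can_reuse(struct folio *folio)
{
	/* Any reference beyond the mapping and the swap cache means "copy". */
	if (folio_ref_count(folio) >
	    1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
		return false;
	if (folio_test_swapcache(folio))
		folio_free_swap(folio);
	/* Exactly one reference left: ours, via the mapping. */
	return folio_ref_count(folio) == 1;
}

For example, assuming the usual accounting where the swap cache holds one reference per base page, a PMD-sized THP (512 base pages on x86-64 with 4K pages) mapped once and present in the swap cache has a reference count of 1 + 512: the first check passes, and dropping it from the swap cache leaves only the single mapping reference needed for reuse.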
-/*
- * FOLL_FORCE can write to even unwritable pmd's, but only
- * after we've gone through a COW cycle and they are dirty.
- */
-static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
+static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd)
+{
+ struct page *page;
+
+ if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
+ return false;
+
+ /* Don't touch entries that are not even readable (NUMA hinting). */
+ if (pmd_protnone(pmd))
+ return false;
+
+ /* Do we need write faults for softdirty tracking? */
+ if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
+ return false;
+
+ /* Do we need write faults for uffd-wp tracking? */
+ if (userfaultfd_huge_pmd_wp(vma, pmd))
+ return false;
+
+ if (!(vma->vm_flags & VM_SHARED)) {
+ /* See can_change_pte_writable(). */
+ page = vm_normal_page_pmd(vma, addr, pmd);
+ return page && PageAnon(page) && PageAnonExclusive(page);
+ }
+
+ /* See can_change_pte_writable(). */
+ return pmd_dirty(pmd);
+}
+
+/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
+static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned int flags)
{
- return pmd_write(pmd) ||
- ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
+ /* If the pmd is writable, we can write to the page. */
+ if (pmd_write(pmd))
+ return true;
+
+ /* Maybe FOLL_FORCE is set to override it? */
+ if (!(flags & FOLL_FORCE))
+ return false;
+
+ /* But FOLL_FORCE has no effect on shared mappings */
+ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
+ return false;
+
+ /* ... or read-only private ones */
+ if (!(vma->vm_flags & VM_MAYWRITE))
+ return false;
+
+ /* ... or already writable ones that just need to take a write fault */
+ if (vma->vm_flags & VM_WRITE)
+ return false;
+
+ /*
+ * See can_change_pte_writable(): we broke COW and could map the page
+ * writable if we have an exclusive anonymous page ...
+ */
+ if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+ return false;
+
+ /* ... and a write-fault isn't required for other reasons. */
+ if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
+ return false;
+ return !userfaultfd_huge_pmd_wp(vma, pmd);
}
struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
@@ -1334,225 +1451,131 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned int flags)
{
struct mm_struct *mm = vma->vm_mm;
- struct page *page = NULL;
+ struct page *page;
+ int ret;
assert_spin_locked(pmd_lockptr(mm, pmd));
- if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
- goto out;
+ page = pmd_page(*pmd);
+ VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+
+ if ((flags & FOLL_WRITE) &&
+ !can_follow_write_pmd(*pmd, page, vma, flags))
+ return NULL;
/* Avoid dumping huge zero page */
if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
return ERR_PTR(-EFAULT);
- /* Full NUMA hinting faults to serialise migration in fault paths */
- if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
- goto out;
+ if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags))
+ return NULL;
- page = pmd_page(*pmd);
- VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+ if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page))
+ return ERR_PTR(-EMLINK);
- if (!try_grab_page(page, flags))
- return ERR_PTR(-ENOMEM);
+ VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+ !PageAnonExclusive(page), page);
- if (flags & FOLL_TOUCH)
- touch_pmd(vma, addr, pmd, flags);
+ ret = try_grab_page(page, flags);
+ if (ret)
+ return ERR_PTR(ret);
- if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
- /*
- * We don't mlock() pte-mapped THPs. This way we can avoid
- * leaking mlocked pages into non-VM_LOCKED VMAs.
- *
- * For anon THP:
- *
- * In most cases the pmd is the only mapping of the page as we
- * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
- * writable private mappings in populate_vma_page_range().
- *
- * The only scenario when we have the page shared here is if we
- * mlocking read-only mapping shared over fork(). We skip
- * mlocking such pages.
- *
- * For file THP:
- *
- * We can expect PageDoubleMap() to be stable under page lock:
- * for file pages we set it in page_add_file_rmap(), which
- * requires page to be locked.
- */
+ if (flags & FOLL_TOUCH)
+ touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
- if (PageAnon(page) && compound_mapcount(page) != 1)
- goto skip_mlock;
- if (PageDoubleMap(page) || !page->mapping)
- goto skip_mlock;
- if (!trylock_page(page))
- goto skip_mlock;
- if (page->mapping && !PageDoubleMap(page))
- mlock_vma_page(page);
- unlock_page(page);
- }
-skip_mlock:
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
-out:
return page;
}
/* NUMA hinting page fault entry point for trans huge pmds */
-vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
+vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct anon_vma *anon_vma = NULL;
+ pmd_t oldpmd = vmf->orig_pmd;
+ pmd_t pmd;
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
- int target_nid, last_cpupid = -1;
- bool page_locked;
- bool migrated = false;
- bool was_writable;
+ int page_nid = NUMA_NO_NODE;
+ int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
+ bool migrated = false, writable = false;
int flags = 0;
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- if (unlikely(!pmd_same(pmd, *vmf->pmd)))
- goto out_unlock;
-
- /*
- * If there are potential migrations, wait for completion and retry
- * without disrupting NUMA hinting information. Do not relock and
- * check_same as the page may no longer be mapped.
- */
- if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
- page = pmd_page(*vmf->pmd);
- if (!get_page_unless_zero(page))
- goto out_unlock;
+ if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
spin_unlock(vmf->ptl);
- put_and_wait_on_page_locked(page);
goto out;
}
- page = pmd_page(pmd);
- BUG_ON(is_huge_zero_page(page));
- page_nid = page_to_nid(page);
- last_cpupid = page_cpupid_last(page);
- count_vm_numa_event(NUMA_HINT_FAULTS);
- if (page_nid == this_nid) {
- count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
- flags |= TNF_FAULT_LOCAL;
- }
-
- /* See similar comment in do_numa_page for explanation */
- if (!pmd_savedwrite(pmd))
- flags |= TNF_NO_GROUP;
+ pmd = pmd_modify(oldpmd, vma->vm_page_prot);
/*
- * Acquire the page lock to serialise THP migrations but avoid dropping
- * page_table_lock if at all possible
+ * Detect now whether the PMD could be writable; this information
+ * is only valid while holding the PT lock.
*/
- page_locked = trylock_page(page);
- target_nid = mpol_misplaced(page, vma, haddr);
- if (target_nid == NUMA_NO_NODE) {
- /* If the page was locked, there are no parallel migrations */
- if (page_locked)
- goto clear_pmdnuma;
- }
+ writable = pmd_write(pmd);
+ if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
+ can_change_pmd_writable(vma, vmf->address, pmd))
+ writable = true;
- /* Migration could have started since the pmd_trans_migrating check */
- if (!page_locked) {
- page_nid = NUMA_NO_NODE;
- if (!get_page_unless_zero(page))
- goto out_unlock;
- spin_unlock(vmf->ptl);
- put_and_wait_on_page_locked(page);
- goto out;
- }
+ page = vm_normal_page_pmd(vma, haddr, pmd);
+ if (!page)
+ goto out_map;
+ /* See similar comment in do_numa_page for explanation */
+ if (!writable)
+ flags |= TNF_NO_GROUP;
+
+ page_nid = page_to_nid(page);
/*
- * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
- * to serialises splits
+ * In memory tiering mode, the cpupid of a slow-memory page is used
+ * to record the page's access time, so use the default value here.
*/
- get_page(page);
- spin_unlock(vmf->ptl);
- anon_vma = page_lock_anon_vma_read(page);
+ if (node_is_toptier(page_nid))
+ last_cpupid = page_cpupid_last(page);
+ target_nid = numa_migrate_prep(page, vma, haddr, page_nid,
+ &flags);
- /* Confirm the PMD did not change while page_table_lock was released */
- spin_lock(vmf->ptl);
- if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
- unlock_page(page);
- put_page(page);
- page_nid = NUMA_NO_NODE;
- goto out_unlock;
- }
-
- /* Bail if we fail to protect against THP splits for any reason */
- if (unlikely(!anon_vma)) {
+ if (target_nid == NUMA_NO_NODE) {
put_page(page);
- page_nid = NUMA_NO_NODE;
- goto clear_pmdnuma;
- }
-
- /*
- * Since we took the NUMA fault, we must have observed the !accessible
- * bit. Make sure all other CPUs agree with that, to avoid them
- * modifying the page we're about to migrate.
- *
- * Must be done under PTL such that we'll observe the relevant
- * inc_tlb_flush_pending().
- *
- * We are not sure a pending tlb flush here is for a huge page
- * mapping or not. Hence use the tlb range variant
- */
- if (mm_tlb_flush_pending(vma->vm_mm)) {
- flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
- /*
- * change_huge_pmd() released the pmd lock before
- * invalidating the secondary MMUs sharing the primary
- * MMU pagetables (with ->invalidate_range()). The
- * mmu_notifier_invalidate_range_end() (which
- * internally calls ->invalidate_range()) in
- * change_pmd_range() will run after us, so we can't
- * rely on it here and we need an explicit invalidate.
- */
- mmu_notifier_invalidate_range(vma->vm_mm, haddr,
- haddr + HPAGE_PMD_SIZE);
+ goto out_map;
}
- /*
- * Migrate the THP to the requested node, returns with page unlocked
- * and access rights restored.
- */
spin_unlock(vmf->ptl);
+ writable = false;
- migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
- vmf->pmd, pmd, vmf->address, page, target_nid);
+ migrated = migrate_misplaced_page(page, vma, target_nid);
if (migrated) {
flags |= TNF_MIGRATED;
page_nid = target_nid;
- } else
+ } else {
flags |= TNF_MIGRATE_FAIL;
-
- goto out;
-clear_pmdnuma:
- BUG_ON(!PageLocked(page));
- was_writable = pmd_savedwrite(pmd);
- pmd = pmd_modify(pmd, vma->vm_page_prot);
- pmd = pmd_mkyoung(pmd);
- if (was_writable)
- pmd = pmd_mkwrite(pmd);
- set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
- update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- unlock_page(page);
-out_unlock:
- spin_unlock(vmf->ptl);
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
+ spin_unlock(vmf->ptl);
+ goto out;
+ }
+ goto out_map;
+ }
out:
- if (anon_vma)
- page_unlock_anon_vma_read(anon_vma);
-
if (page_nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
flags);
return 0;
+
+out_map:
+ /* Restore the PMD */
+ pmd = pmd_modify(oldpmd, vma->vm_page_prot);
+ pmd = pmd_mkyoung(pmd);
+ if (writable)
+ pmd = pmd_mkwrite(pmd);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+ spin_unlock(vmf->ptl);
+ goto out;
}
/*
@@ -1564,7 +1587,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
{
spinlock_t *ptl;
pmd_t orig_pmd;
- struct page *page;
+ struct folio *folio;
struct mm_struct *mm = tlb->mm;
bool ret = false;
@@ -1584,15 +1607,15 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
goto out;
}
- page = pmd_page(orig_pmd);
+ folio = pfn_folio(pmd_pfn(orig_pmd));
/*
- * If other processes are mapping this page, we couldn't discard
- * the page unless they all do MADV_FREE so let's skip the page.
+ * If other processes are mapping this folio, we can't discard
+ * the folio unless they all do MADV_FREE, so let's skip the folio.
*/
- if (page_mapcount(page) != 1)
+ if (folio_estimated_sharers(folio) != 1)
goto out;
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto out;
/*
@@ -1600,17 +1623,17 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
* will deactivate only them.
*/
if (next - addr != HPAGE_PMD_SIZE) {
- get_page(page);
+ folio_get(folio);
spin_unlock(ptl);
- split_huge_page(page);
- unlock_page(page);
- put_page(page);
+ split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
goto out_unlocked;
}
- if (PageDirty(page))
- ClearPageDirty(page);
- unlock_page(page);
+ if (folio_test_dirty(folio))
+ folio_clear_dirty(folio);
+ folio_unlock(folio);
if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
pmdp_invalidate(vma, addr, pmd);
@@ -1621,7 +1644,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
}
- mark_page_lazyfree(page);
+ folio_mark_lazyfree(folio);
ret = true;
out:
spin_unlock(ptl);
@@ -1662,19 +1685,16 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
- if (is_huge_zero_pmd(orig_pmd))
- tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
} else if (is_huge_zero_pmd(orig_pmd)) {
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
- tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
} else {
struct page *page = NULL;
int flush_needed = 1;
if (pmd_present(orig_pmd)) {
page = pmd_page(orig_pmd);
- page_remove_rmap(page, true);
+ page_remove_rmap(page, vma, true);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
VM_BUG_ON_PAGE(!PageHead(page), page);
} else if (thp_migration_supported()) {
@@ -1682,7 +1702,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
entry = pmd_to_swp_entry(orig_pmd);
- page = pfn_to_page(swp_offset(entry));
+ page = pfn_swap_entry_to_page(entry);
flush_needed = 0;
} else
WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
@@ -1739,9 +1759,10 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
/*
* The destination pmd shouldn't be established, free_pgtables()
- * should have release it.
+ * should have released it; but move_page_tables() might have already
+ * inserted a page table, if racing against shmem/file collapse.
*/
- if (WARN_ON(!pmd_none(*new_pmd))) {
+ if (!pmd_none(*new_pmd)) {
VM_BUG_ON(pmd_trans_huge(*new_pmd));
return false;
}
@@ -1768,7 +1789,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
pmd = move_soft_dirty_pmd(pmd);
set_pmd_at(mm, new_addr, new_pmd, pmd);
if (force_flush)
- flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
+ flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
@@ -1780,60 +1801,92 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
/*
* Returns
* - 0 if PMD could not be locked
- * - 1 if PMD was locked but protections unchange and TLB flush unnecessary
- * - HPAGE_PMD_NR is protections changed and TLB flush necessary
+ * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
+ * or if prot_numa but THP migration is not supported
+ * - HPAGE_PMD_NR if protections changed and TLB flush necessary
*/
-int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
+int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr, pgprot_t newprot,
+ unsigned long cp_flags)
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
- pmd_t entry;
- bool preserve_write;
- int ret;
+ pmd_t oldpmd, entry;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+ int ret = 1;
+
+ tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+
+ if (prot_numa && !thp_migration_supported())
+ return 1;
ptl = __pmd_trans_huge_lock(pmd, vma);
if (!ptl)
return 0;
- preserve_write = prot_numa && pmd_write(*pmd);
- ret = 1;
-
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (is_swap_pmd(*pmd)) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
+ struct page *page = pfn_swap_entry_to_page(entry);
+ pmd_t newpmd;
VM_BUG_ON(!is_pmd_migration_entry(*pmd));
- if (is_write_migration_entry(entry)) {
- pmd_t newpmd;
+ if (is_writable_migration_entry(entry)) {
/*
* A protection check is difficult so
* just be safe and disable write
*/
- make_migration_entry_read(&entry);
+ if (PageAnon(page))
+ entry = make_readable_exclusive_migration_entry(swp_offset(entry));
+ else
+ entry = make_readable_migration_entry(swp_offset(entry));
newpmd = swp_entry_to_pmd(entry);
if (pmd_swp_soft_dirty(*pmd))
newpmd = pmd_swp_mksoft_dirty(newpmd);
- set_pmd_at(mm, addr, pmd, newpmd);
+ } else {
+ newpmd = *pmd;
}
+
+ if (uffd_wp)
+ newpmd = pmd_swp_mkuffd_wp(newpmd);
+ else if (uffd_wp_resolve)
+ newpmd = pmd_swp_clear_uffd_wp(newpmd);
+ if (!pmd_same(*pmd, newpmd))
+ set_pmd_at(mm, addr, pmd, newpmd);
goto unlock;
}
#endif
- /*
- * Avoid trapping faults against the zero page. The read-only
- * data is likely to be read-cached on the local CPU and
- * local/remote hits to the zero page are not interesting.
- */
- if (prot_numa && is_huge_zero_pmd(*pmd))
- goto unlock;
+ if (prot_numa) {
+ struct page *page;
+ bool toptier;
+ /*
+ * Avoid trapping faults against the zero page. The read-only
+ * data is likely to be read-cached on the local CPU and
+ * local/remote hits to the zero page are not interesting.
+ */
+ if (is_huge_zero_pmd(*pmd))
+ goto unlock;
- if (prot_numa && pmd_protnone(*pmd))
- goto unlock;
+ if (pmd_protnone(*pmd))
+ goto unlock;
+
+ page = pmd_page(*pmd);
+ toptier = node_is_toptier(page_to_nid(page));
+ /*
+ * Skip scanning the top-tier node if normal NUMA
+ * balancing is disabled
+ */
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
+ toptier)
+ goto unlock;
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+ !toptier)
+ xchg_page_access_time(page, jiffies_to_msecs(jiffies));
+ }
/*
* In case prot_numa, we are under mmap_read_lock(mm). It's critical
* to not clear pmd intermittently to avoid race with MADV_DONTNEED
@@ -1852,28 +1905,32 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* The race makes MADV_DONTNEED miss the huge pmd and don't clear it
* which may break userspace.
*
- * pmdp_invalidate() is required to make sure we don't miss
+ * pmdp_invalidate_ad() is required to make sure we don't miss
* dirty/young flags set by hardware.
*/
- entry = pmdp_invalidate(vma, addr, pmd);
+ oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
- entry = pmd_modify(entry, newprot);
- if (preserve_write)
- entry = pmd_mk_savedwrite(entry);
- if (uffd_wp) {
- entry = pmd_wrprotect(entry);
+ entry = pmd_modify(oldpmd, newprot);
+ if (uffd_wp)
entry = pmd_mkuffd_wp(entry);
- } else if (uffd_wp_resolve) {
+ else if (uffd_wp_resolve)
/*
* Leave the write bit to be handled by PF interrupt
* handler, then things like COW could be properly
* handled.
*/
entry = pmd_clear_uffd_wp(entry);
- }
+
+ /* See change_pte_range(). */
+ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
+ can_change_pmd_writable(vma, addr, entry))
+ entry = pmd_mkwrite(entry);
+
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
- BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
+
+ if (huge_pmd_needs_flush(oldpmd, entry))
+ tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
unlock:
spin_unlock(ptl);
return ret;
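Because change_huge_pmd() now takes an mmu_gather and defers its flush through tlb_flush_pmd_range(), callers are expected to bracket it with the usual gather setup. A hedged caller sketch under that assumption (variable names assumed):

	struct mmu_gather tlb;
	int ret;

	tlb_gather_mmu(&tlb, vma->vm_mm);
	/* TLB invalidation is batched into the gather and flushed below. */
	ret = change_huge_pmd(&tlb, vma, pmd, addr, newprot, cp_flags);
	tlb_finish_mmu(&tlb);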
@@ -1897,10 +1954,10 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
}
/*
- * Returns true if a given pud maps a thp, false otherwise.
+ * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
*
- * Note that if it returns true, this routine returns without unlocking page
- * table lock. So callers must unlock it.
+ * Note that if it returns a page table lock pointer, this routine returns
+ * without unlocking the page table lock. So callers must unlock it.
*/
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
{
@@ -1922,12 +1979,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
ptl = __pud_trans_huge_lock(pud, vma);
if (!ptl)
return 0;
- /*
- * For architectures like ppc64 we look at deposited pgtable
- * when calling pudp_huge_get_and_clear. So do the
- * pgtable_trans_huge_withdraw after finishing pudp related
- * operations.
- */
+
pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
tlb_remove_pud_tlb_entry(tlb, pud, addr);
if (vma_is_special_huge(vma)) {
@@ -1959,7 +2011,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
spinlock_t *ptl;
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address & HPAGE_PUD_MASK,
(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
mmu_notifier_invalidate_range_start(&range);
@@ -1983,7 +2035,9 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
pgtable_t pgtable;
- pmd_t _pmd;
+ pmd_t _pmd, old_pmd;
+ unsigned long addr;
+ pte_t *pte;
int i;
/*
@@ -1992,22 +2046,27 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
* replacing a zero pmd write protected page with a zero pte write
* protected page.
*
- * See Documentation/vm/mmu_notifier.rst
+ * See Documentation/mm/mmu_notifier.rst
*/
- pmdp_huge_clear_flush(vma, haddr, pmd);
+ old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
- pte_t *pte, entry;
- entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte);
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
+ pte_t entry;
+
+ entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
entry = pte_mkspecial(entry);
- pte = pte_offset_map(&_pmd, haddr);
- VM_BUG_ON(!pte_none(*pte));
- set_pte_at(mm, haddr, pte, entry);
- pte_unmap(pte);
+ if (pmd_uffd_wp(old_pmd))
+ entry = pte_mkuffd_wp(entry);
+ VM_BUG_ON(!pte_none(ptep_get(pte)));
+ set_pte_at(mm, addr, pte, entry);
+ pte++;
}
+ pte_unmap(pte - 1);
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
}
@@ -2020,7 +2079,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pgtable_t pgtable;
pmd_t old_pmd, _pmd;
bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
+ bool anon_exclusive = false, dirty = false;
unsigned long addr;
+ pte_t *pte;
int i;
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
@@ -2032,7 +2093,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
count_vm_event(THP_SPLIT_PMD);
if (!vma_is_anonymous(vma)) {
- _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/*
* We are going to unmap this huge page. So
* just go ahead and zap it
@@ -2041,16 +2102,25 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
zap_deposited_table(mm, pmd);
if (vma_is_special_huge(vma))
return;
- page = pmd_page(_pmd);
- if (!PageDirty(page) && pmd_dirty(_pmd))
- set_page_dirty(page);
- if (!PageReferenced(page) && pmd_young(_pmd))
- SetPageReferenced(page);
- page_remove_rmap(page, true);
- put_page(page);
+ if (unlikely(is_pmd_migration_entry(old_pmd))) {
+ swp_entry_t entry;
+
+ entry = pmd_to_swp_entry(old_pmd);
+ page = pfn_swap_entry_to_page(entry);
+ } else {
+ page = pmd_page(old_pmd);
+ if (!PageDirty(page) && pmd_dirty(old_pmd))
+ set_page_dirty(page);
+ if (!PageReferenced(page) && pmd_young(old_pmd))
+ SetPageReferenced(page);
+ page_remove_rmap(page, vma, true);
+ put_page(page);
+ }
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
return;
- } else if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) {
+ }
+
+ if (is_huge_zero_pmd(*pmd)) {
/*
* FIXME: Do we want to invalidate secondary mmu by calling
* mmu_notifier_invalidate_range() see comments below inside
@@ -2090,22 +2160,48 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
swp_entry_t entry;
entry = pmd_to_swp_entry(old_pmd);
- page = pfn_to_page(swp_offset(entry));
- write = is_write_migration_entry(entry);
- young = false;
+ page = pfn_swap_entry_to_page(entry);
+ write = is_writable_migration_entry(entry);
+ if (PageAnon(page))
+ anon_exclusive = is_readable_exclusive_migration_entry(entry);
+ young = is_migration_entry_young(entry);
+ dirty = is_migration_entry_dirty(entry);
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
} else {
page = pmd_page(old_pmd);
- if (pmd_dirty(old_pmd))
+ if (pmd_dirty(old_pmd)) {
+ dirty = true;
SetPageDirty(page);
+ }
write = pmd_write(old_pmd);
young = pmd_young(old_pmd);
soft_dirty = pmd_soft_dirty(old_pmd);
uffd_wp = pmd_uffd_wp(old_pmd);
+
+ VM_BUG_ON_PAGE(!page_count(page), page);
+
+ /*
+ * Without "freeze", we'll simply split the PMD, propagating the
+ * PageAnonExclusive() flag for each PTE by setting it for
+ * each subpage -- no need to (temporarily) clear.
+ *
+ * With "freeze" we want to replace mapped pages by
+ * migration entries right away. This is only possible if we
+ * managed to clear PageAnonExclusive() -- see
+ * set_pmd_migration_entry().
+ *
+ * In case we cannot clear PageAnonExclusive(), split the PMD
+ * only and let try_to_migrate_one() fail later.
+ *
+ * See page_try_share_anon_rmap(): invalidate PMD first.
+ */
+ anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
+ if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
+ freeze = false;
+ if (!freeze)
+ page_ref_add(page, HPAGE_PMD_NR - 1);
}
- VM_BUG_ON_PAGE(!page_count(page), page);
- page_ref_add(page, HPAGE_PMD_NR - 1);
/*
* Withdraw the table only after we mark the pmd entry invalid.
@@ -2114,8 +2210,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte);
for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
- pte_t entry, *pte;
+ pte_t entry;
/*
* Note that NUMA hinting access restrictions are not
* transferred to avoid any possibility of altering
@@ -2123,7 +2221,19 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
*/
if (freeze || pmd_migration) {
swp_entry_t swp_entry;
- swp_entry = make_migration_entry(page + i, write);
+ if (write)
+ swp_entry = make_writable_migration_entry(
+ page_to_pfn(page + i));
+ else if (anon_exclusive)
+ swp_entry = make_readable_exclusive_migration_entry(
+ page_to_pfn(page + i));
+ else
+ swp_entry = make_readable_migration_entry(
+ page_to_pfn(page + i));
+ if (young)
+ swp_entry = make_migration_entry_young(swp_entry);
+ if (dirty)
+ swp_entry = make_migration_entry_dirty(swp_entry);
entry = swp_entry_to_pte(swp_entry);
if (soft_dirty)
entry = pte_swp_mksoft_dirty(entry);
@@ -2131,113 +2241,68 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pte_swp_mkuffd_wp(entry);
} else {
entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
- entry = maybe_mkwrite(entry, vma);
- if (!write)
- entry = pte_wrprotect(entry);
+ if (write)
+ entry = pte_mkwrite(entry);
+ if (anon_exclusive)
+ SetPageAnonExclusive(page + i);
if (!young)
entry = pte_mkold(entry);
+ /* NOTE: this may set soft-dirty too on some archs */
+ if (dirty)
+ entry = pte_mkdirty(entry);
if (soft_dirty)
entry = pte_mksoft_dirty(entry);
if (uffd_wp)
entry = pte_mkuffd_wp(entry);
+ page_add_anon_rmap(page + i, vma, addr, false);
}
- pte = pte_offset_map(&_pmd, addr);
- BUG_ON(!pte_none(*pte));
+ VM_BUG_ON(!pte_none(ptep_get(pte)));
set_pte_at(mm, addr, pte, entry);
- if (!pmd_migration)
- atomic_inc(&page[i]._mapcount);
- pte_unmap(pte);
+ pte++;
}
+ pte_unmap(pte - 1);
- if (!pmd_migration) {
- /*
- * Set PG_double_map before dropping compound_mapcount to avoid
- * false-negative page_mapped().
- */
- if (compound_mapcount(page) > 1 &&
- !TestSetPageDoubleMap(page)) {
- for (i = 0; i < HPAGE_PMD_NR; i++)
- atomic_inc(&page[i]._mapcount);
- }
-
- lock_page_memcg(page);
- if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
- /* Last compound_mapcount is gone. */
- __dec_lruvec_page_state(page, NR_ANON_THPS);
- if (TestClearPageDoubleMap(page)) {
- /* No need in mapcount reference anymore */
- for (i = 0; i < HPAGE_PMD_NR; i++)
- atomic_dec(&page[i]._mapcount);
- }
- }
- unlock_page_memcg(page);
- }
+ if (!pmd_migration)
+ page_remove_rmap(page, vma, true);
+ if (freeze)
+ put_page(page);
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
-
- if (freeze) {
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- page_remove_rmap(page + i, false);
- put_page(page + i);
- }
- }
}
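The loop above now encodes writability, anon-exclusivity, young and dirty state into typed migration entries; condensed into a hypothetical helper (name assumed, calls taken from the hunk) it looks like this:

static pte_t thp_split_migration_pte(struct page *subpage, bool write,
				     bool anon_exclusive, bool young, bool dirty)
{
	swp_entry_t swp_entry;

	if (write)
		swp_entry = make_writable_migration_entry(page_to_pfn(subpage));
	else if (anon_exclusive)
		swp_entry = make_readable_exclusive_migration_entry(page_to_pfn(subpage));
	else
		swp_entry = make_readable_migration_entry(page_to_pfn(subpage));
	/* A/D bits survive the split by being folded into the swap entry. */
	if (young)
		swp_entry = make_migration_entry_young(swp_entry);
	if (dirty)
		swp_entry = make_migration_entry_dirty(swp_entry);

	return swp_entry_to_pte(swp_entry);
}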
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long address, bool freeze, struct page *page)
+ unsigned long address, bool freeze, struct folio *folio)
{
spinlock_t *ptl;
struct mmu_notifier_range range;
- bool was_locked = false;
- pmd_t _pmd;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address & HPAGE_PMD_MASK,
(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptl = pmd_lock(vma->vm_mm, pmd);
/*
- * If caller asks to setup a migration entries, we need a page to check
- * pmd against. Otherwise we can end up replacing wrong page.
+ * If the caller asks to set up a migration entry, we need a folio to check
+ * the pmd against. Otherwise we can end up replacing the wrong folio.
*/
- VM_BUG_ON(freeze && !page);
- if (page) {
- VM_WARN_ON_ONCE(!PageLocked(page));
- was_locked = true;
- if (page != pmd_page(*pmd))
+ VM_BUG_ON(freeze && !folio);
+ VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
+
+ if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
+ is_pmd_migration_entry(*pmd)) {
+ /*
+ * It's safe to call pmd_page() when a folio is passed in, because the
+ * pmd is then guaranteed to be present.
+ */
+ if (folio && folio != page_folio(pmd_page(*pmd)))
goto out;
+ __split_huge_pmd_locked(vma, pmd, range.start, freeze);
}
-repeat:
- if (pmd_trans_huge(*pmd)) {
- if (!page) {
- page = pmd_page(*pmd);
- if (unlikely(!trylock_page(page))) {
- get_page(page);
- _pmd = *pmd;
- spin_unlock(ptl);
- lock_page(page);
- spin_lock(ptl);
- if (unlikely(!pmd_same(*pmd, _pmd))) {
- unlock_page(page);
- put_page(page);
- page = NULL;
- goto repeat;
- }
- put_page(page);
- }
- }
- if (PageMlocked(page))
- clear_page_mlock(page);
- } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
- goto out;
- __split_huge_pmd_locked(vma, pmd, range.start, freeze);
out:
spin_unlock(ptl);
- if (!was_locked && page)
- unlock_page(page);
/*
* No need to double call mmu_notifier->invalidate_range() callback.
* There are 3 cases to consider inside __split_huge_pmd_locked():
@@ -2255,28 +2320,26 @@ out:
}
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
- bool freeze, struct page *page)
+ bool freeze, struct folio *folio)
{
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
-
- pgd = pgd_offset(vma->vm_mm, address);
- if (!pgd_present(*pgd))
- return;
-
- p4d = p4d_offset(pgd, address);
- if (!p4d_present(*p4d))
- return;
+ pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
- pud = pud_offset(p4d, address);
- if (!pud_present(*pud))
+ if (!pmd)
return;
- pmd = pmd_offset(pud, address);
+ __split_huge_pmd(vma, pmd, address, freeze, folio);
+}
- __split_huge_pmd(vma, pmd, address, freeze, page);
+static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
+{
+ /*
+ * If the new address isn't hpage aligned and it could previously
+ * contain a hugepage: check if we need to split a huge pmd.
+ */
+ if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
+ range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
+ ALIGN(address, HPAGE_PMD_SIZE)))
+ split_huge_pmd_address(vma, address, false, NULL);
}
void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -2284,65 +2347,79 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long end,
long adjust_next)
{
- /*
- * If the new start address isn't hpage aligned and it could
- * previously contain an hugepage: check if we need to split
- * an huge pmd.
- */
- if (start & ~HPAGE_PMD_MASK &&
- (start & HPAGE_PMD_MASK) >= vma->vm_start &&
- (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_pmd_address(vma, start, false, NULL);
+ /* Check if we need to split start first. */
+ split_huge_pmd_if_needed(vma, start);
- /*
- * If the new end address isn't hpage aligned and it could
- * previously contain an hugepage: check if we need to split
- * an huge pmd.
- */
- if (end & ~HPAGE_PMD_MASK &&
- (end & HPAGE_PMD_MASK) >= vma->vm_start &&
- (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_pmd_address(vma, end, false, NULL);
+ /* Check if we need to split end next. */
+ split_huge_pmd_if_needed(vma, end);
/*
- * If we're also updating the vma->vm_next->vm_start, if the new
- * vm_next->vm_start isn't hpage aligned and it could previously
- * contain an hugepage: check if we need to split an huge pmd.
+ * If we're also updating the next vma vm_start,
+ * check if we need to split it.
*/
if (adjust_next > 0) {
- struct vm_area_struct *next = vma->vm_next;
+ struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
unsigned long nstart = next->vm_start;
nstart += adjust_next;
- if (nstart & ~HPAGE_PMD_MASK &&
- (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
- (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
- split_huge_pmd_address(next, nstart, false, NULL);
+ split_huge_pmd_if_needed(next, nstart);
}
}
-static void unmap_page(struct page *page)
+static void unmap_folio(struct folio *folio)
{
- enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
- TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
- bool unmap_success;
+ enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
+ TTU_SYNC;
- VM_BUG_ON_PAGE(!PageHead(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
- if (PageAnon(page))
- ttu_flags |= TTU_SPLIT_FREEZE;
+ /*
+ * Anon pages need migration entries to preserve them, but file
+ * pages can simply be left unmapped, then faulted back on demand.
+ * If that is ever changed (perhaps for mlock), update remap_page().
+ */
+ if (folio_test_anon(folio))
+ try_to_migrate(folio, ttu_flags);
+ else
+ try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
+}
+
+static void remap_page(struct folio *folio, unsigned long nr)
+{
+ int i = 0;
- unmap_success = try_to_unmap(page, ttu_flags);
- VM_BUG_ON_PAGE(!unmap_success, page);
+ /* If unmap_folio() uses try_to_migrate() on file, remove this check */
+ if (!folio_test_anon(folio))
+ return;
+ for (;;) {
+ remove_migration_ptes(folio, folio, true);
+ i += folio_nr_pages(folio);
+ if (i >= nr)
+ break;
+ folio = folio_next(folio);
+ }
}
-static void remap_page(struct page *page)
+static void lru_add_page_tail(struct page *head, struct page *tail,
+ struct lruvec *lruvec, struct list_head *list)
{
- int i;
- if (PageTransHuge(page)) {
- remove_migration_ptes(page, page, true);
+ VM_BUG_ON_PAGE(!PageHead(head), head);
+ VM_BUG_ON_PAGE(PageCompound(tail), head);
+ VM_BUG_ON_PAGE(PageLRU(tail), head);
+ lockdep_assert_held(&lruvec->lru_lock);
+
+ if (list) {
+ /* page reclaim is reclaiming a huge page */
+ VM_WARN_ON(PageLRU(head));
+ get_page(tail);
+ list_add_tail(&tail->lru, list);
} else {
- for (i = 0; i < HPAGE_PMD_NR; i++)
- remove_migration_ptes(page + i, page + i, true);
+ /* head is still on lru (and we have it frozen) */
+ VM_WARN_ON(!PageLRU(head));
+ if (PageUnevictable(tail))
+ tail->mlock_count = 0;
+ else
+ list_add_tail(&tail->lru, &head->lru);
+ SetPageLRU(tail);
}
}
@@ -2357,7 +2434,14 @@ static void __split_huge_page_tail(struct page *head, int tail,
* Clone page flags before unfreezing refcount.
*
* After successful get_page_unless_zero() might follow flags change,
- * for exmaple lock_page() which set PG_waiters.
+ * for example lock_page(), which sets PG_waiters.
+ *
+ * Note that for mapped sub-pages of an anonymous THP,
+ * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
+ * the migration entry instead from where remap_page() will restore it.
+ * We can still have PG_anon_exclusive set on effectively unmapped and
+ * unreferenced sub-pages of an anonymous THP: we can simply drop
+ * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
*/
page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
page_tail->flags |= (head->flags &
@@ -2370,17 +2454,33 @@ static void __split_huge_page_tail(struct page *head, int tail,
(1L << PG_workingset) |
(1L << PG_locked) |
(1L << PG_unevictable) |
-#ifdef CONFIG_64BIT
+#ifdef CONFIG_ARCH_USES_PG_ARCH_X
(1L << PG_arch_2) |
+ (1L << PG_arch_3) |
#endif
- (1L << PG_dirty)));
+ (1L << PG_dirty) |
+ LRU_GEN_MASK | LRU_REFS_MASK));
- /* ->mapping in first tail page is compound_mapcount */
+ /* ->mapping in first and second tail page is replaced by other uses */
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
page_tail);
page_tail->mapping = head->mapping;
page_tail->index = head->index + tail;
+ /*
+ * page->private should not be set in tail pages with the exception
+ * of swap cache pages that store the swp_entry_t in tail pages.
+ * Fix up and warn once if private is unexpectedly set.
+ *
+ * What of 32-bit systems, on which folio->_pincount overlays
+ * head[1].private? No problem: THP_SWAP is not enabled on 32-bit, and
+ * pincount must be 0 for folio_ref_freeze() to have succeeded.
+ */
+ if (!folio_test_swapcache(page_folio(head))) {
+ VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail);
+ page_tail->private = 0;
+ }
+
/* Page flags must be visible before we make the page non-compound. */
smp_wmb();
@@ -2412,19 +2512,18 @@ static void __split_huge_page_tail(struct page *head, int tail,
}
static void __split_huge_page(struct page *page, struct list_head *list,
- pgoff_t end, unsigned long flags)
+ pgoff_t end)
{
- struct page *head = compound_head(page);
- pg_data_t *pgdat = page_pgdat(head);
+ struct folio *folio = page_folio(page);
+ struct page *head = &folio->page;
struct lruvec *lruvec;
struct address_space *swap_cache = NULL;
unsigned long offset = 0;
+ unsigned int nr = thp_nr_pages(head);
int i;
- lruvec = mem_cgroup_page_lruvec(head, pgdat);
-
/* complete memcg works before add pages to LRU */
- mem_cgroup_split_huge_fixup(head);
+ split_page_memcg(head, nr);
if (PageAnon(head) && PageSwapCache(head)) {
swp_entry_t entry = { .val = page_private(head) };
@@ -2434,15 +2533,24 @@ static void __split_huge_page(struct page *page, struct list_head *list,
xa_lock(&swap_cache->i_pages);
}
- for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
+ /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
+ lruvec = folio_lruvec_lock(folio);
+
+ ClearPageHasHWPoisoned(head);
+
+ for (i = nr - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
- /* Some pages can be beyond i_size: drop them from page cache */
+ /* Some pages can be beyond EOF: drop them from page cache */
if (head[i].index >= end) {
- ClearPageDirty(head + i);
- __delete_from_page_cache(head + i, NULL);
- if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
+ struct folio *tail = page_folio(head + i);
+
+ if (shmem_mapping(head->mapping))
shmem_uncharge(head->mapping->host, 1);
- put_page(head + i);
+ else if (folio_test_clear_dirty(tail))
+ folio_account_cleaned(tail,
+ inode_to_wb(folio->mapping->host));
+ __filemap_remove_folio(tail, NULL);
+ folio_put(tail);
} else if (!PageAnon(page)) {
__xa_store(&head->mapping->i_pages, head[i].index,
head + i, 0);
@@ -2453,8 +2561,10 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}
ClearPageCompound(head);
+ unlock_page_lruvec(lruvec);
+ /* Caller disabled irqs, so they are still disabled here */
- split_page_owner(head, HPAGE_PMD_ORDER);
+ split_page_owner(head, nr);
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
@@ -2470,12 +2580,17 @@ static void __split_huge_page(struct page *page, struct list_head *list,
page_ref_add(head, 2);
xa_unlock(&head->mapping->i_pages);
}
+ local_irq_enable();
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ remap_page(folio, nr);
- remap_page(head);
+ if (PageSwapCache(head)) {
+ swp_entry_t entry = { .val = page_private(head) };
- for (i = 0; i < HPAGE_PMD_NR; i++) {
+ split_swap_cluster(entry);
+ }
+
+ for (i = 0; i < nr; i++) {
struct page *subpage = head + i;
if (subpage == page)
continue;
@@ -2488,104 +2603,24 @@ static void __split_huge_page(struct page *page, struct list_head *list,
* requires taking the lru_lock so we do the put_page
* of the tail pages after the split is complete.
*/
- put_page(subpage);
+ free_page_and_swap_cache(subpage);
}
}
-int total_mapcount(struct page *page)
-{
- int i, compound, ret;
-
- VM_BUG_ON_PAGE(PageTail(page), page);
-
- if (likely(!PageCompound(page)))
- return atomic_read(&page->_mapcount) + 1;
-
- compound = compound_mapcount(page);
- if (PageHuge(page))
- return compound;
- ret = compound;
- for (i = 0; i < HPAGE_PMD_NR; i++)
- ret += atomic_read(&page[i]._mapcount) + 1;
- /* File pages has compound_mapcount included in _mapcount */
- if (!PageAnon(page))
- return ret - compound * HPAGE_PMD_NR;
- if (PageDoubleMap(page))
- ret -= HPAGE_PMD_NR;
- return ret;
-}
-
-/*
- * This calculates accurately how many mappings a transparent hugepage
- * has (unlike page_mapcount() which isn't fully accurate). This full
- * accuracy is primarily needed to know if copy-on-write faults can
- * reuse the page and change the mapping to read-write instead of
- * copying them. At the same time this returns the total_mapcount too.
- *
- * The function returns the highest mapcount any one of the subpages
- * has. If the return value is one, even if different processes are
- * mapping different subpages of the transparent hugepage, they can
- * all reuse it, because each process is reusing a different subpage.
- *
- * The total_mapcount is instead counting all virtual mappings of the
- * subpages. If the total_mapcount is equal to "one", it tells the
- * caller all mappings belong to the same "mm" and in turn the
- * anon_vma of the transparent hugepage can become the vma->anon_vma
- * local one as no other process may be mapping any of the subpages.
- *
- * It would be more accurate to replace page_mapcount() with
- * page_trans_huge_mapcount(), however we only use
- * page_trans_huge_mapcount() in the copy-on-write faults where we
- * need full accuracy to avoid breaking page pinning, because
- * page_trans_huge_mapcount() is slower than page_mapcount().
- */
-int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
-{
- int i, ret, _total_mapcount, mapcount;
-
- /* hugetlbfs shouldn't call it */
- VM_BUG_ON_PAGE(PageHuge(page), page);
-
- if (likely(!PageTransCompound(page))) {
- mapcount = atomic_read(&page->_mapcount) + 1;
- if (total_mapcount)
- *total_mapcount = mapcount;
- return mapcount;
- }
-
- page = compound_head(page);
-
- _total_mapcount = ret = 0;
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- mapcount = atomic_read(&page[i]._mapcount) + 1;
- ret = max(ret, mapcount);
- _total_mapcount += mapcount;
- }
- if (PageDoubleMap(page)) {
- ret -= 1;
- _total_mapcount -= HPAGE_PMD_NR;
- }
- mapcount = compound_mapcount(page);
- ret += mapcount;
- _total_mapcount += mapcount;
- if (total_mapcount)
- *total_mapcount = _total_mapcount;
- return ret;
-}
-
/* Racy check whether the huge page can be split */
-bool can_split_huge_page(struct page *page, int *pextra_pins)
+bool can_split_folio(struct folio *folio, int *pextra_pins)
{
int extra_pins;
/* Additional pins from page cache */
- if (PageAnon(page))
- extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
+ if (folio_test_anon(folio))
+ extra_pins = folio_test_swapcache(folio) ?
+ folio_nr_pages(folio) : 0;
else
- extra_pins = HPAGE_PMD_NR;
+ extra_pins = folio_nr_pages(folio);
if (pextra_pins)
*pextra_pins = extra_pins;
- return total_mapcount(page) == page_count(page) - extra_pins - 1;
+ return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
}
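/*
 * A minimal user-space sketch (not kernel code) of the pin accounting used
 * by can_split_folio() above, with a hypothetical demo_folio standing in
 * for the real folio: the page cache (or the swap cache, for anonymous
 * folios) is expected to hold one reference per subpage, each mapping
 * contributes one reference, and the caller holds one more.  Any reference
 * beyond that means the folio is pinned elsewhere and must not be split.
 */
#include <stdbool.h>
#include <stdio.h>

struct demo_folio {
	bool anon;
	bool swapcache;
	long nr_pages;
	long mapcount;		/* page table mappings */
	long refcount;		/* total references currently held */
};

static bool demo_can_split(const struct demo_folio *f)
{
	long extra_pins;

	if (f->anon)
		extra_pins = f->swapcache ? f->nr_pages : 0;
	else
		extra_pins = f->nr_pages;	/* page cache references */

	/* the trailing "- 1" accounts for the caller's own reference */
	return f->mapcount == f->refcount - extra_pins - 1;
}

int main(void)
{
	struct demo_folio f = {
		.anon = false, .swapcache = false, .nr_pages = 512,
		.mapcount = 3, .refcount = 3 + 512 + 1,
	};

	printf("splittable: %d\n", demo_can_split(&f));	/* 1 */
	f.refcount++;		/* e.g. an extra pin taken by GUP */
	printf("splittable: %d\n", demo_can_split(&f));	/* 0 */
	return 0;
}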
/*
@@ -2609,32 +2644,37 @@ bool can_split_huge_page(struct page *page, int *pextra_pins)
*/
int split_huge_page_to_list(struct page *page, struct list_head *list)
{
- struct page *head = compound_head(page);
- struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
- struct deferred_split *ds_queue = get_deferred_split_queue(head);
+ struct folio *folio = page_folio(page);
+ struct deferred_split *ds_queue = get_deferred_split_queue(folio);
+ XA_STATE(xas, &folio->mapping->i_pages, folio->index);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
- int count, mapcount, extra_pins, ret;
- unsigned long flags;
+ int extra_pins, ret;
pgoff_t end;
+ bool is_hzp;
- VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
- VM_BUG_ON_PAGE(!PageLocked(head), head);
- VM_BUG_ON_PAGE(!PageCompound(head), head);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
- if (PageWriteback(head))
+ is_hzp = is_huge_zero_page(&folio->page);
+ if (is_hzp) {
+ pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
return -EBUSY;
+ }
- if (PageAnon(head)) {
+ if (folio_test_writeback(folio))
+ return -EBUSY;
+
+ if (folio_test_anon(folio)) {
/*
* The caller does not necessarily hold an mmap_lock that would
		 * prevent the anon_vma disappearing so we first take a
* reference to it and then lock the anon_vma for write. This
- * is similar to page_lock_anon_vma_read except the write lock
+ * is similar to folio_lock_anon_vma_read except the write lock
* is taken to serialise against parallel split or collapse
* operations.
*/
- anon_vma = page_get_anon_vma(head);
+ anon_vma = folio_get_anon_vma(folio);
if (!anon_vma) {
ret = -EBUSY;
goto out;
@@ -2643,7 +2683,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
mapping = NULL;
anon_vma_lock_write(anon_vma);
} else {
- mapping = head->mapping;
+ gfp_t gfp;
+
+ mapping = folio->mapping;
/* Truncated ? */
if (!mapping) {
@@ -2651,6 +2693,21 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
goto out;
}
+ gfp = current_gfp_context(mapping_gfp_mask(mapping) &
+ GFP_RECLAIM_MASK);
+
+ if (folio_test_private(folio) &&
+ !filemap_release_folio(folio, gfp)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ xas_split_alloc(&xas, folio, folio_order(folio), gfp);
+ if (xas_error(&xas)) {
+ ret = xas_error(&xas);
+ goto out;
+ }
+
anon_vma = NULL;
i_mmap_lock_read(mapping);
@@ -2659,77 +2716,69 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
* but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
* which cannot be nested inside the page tree lock. So note
* end now: i_size itself may be changed at any moment, but
- * head page lock is good enough to serialize the trimming.
+ * folio lock is good enough to serialize the trimming.
*/
end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+ if (shmem_mapping(mapping))
+ end = shmem_fallocend(mapping->host, end);
}
/*
- * Racy check if we can split the page, before unmap_page() will
+ * Racy check if we can split the page, before unmap_folio() will
* split PMDs
*/
- if (!can_split_huge_page(head, &extra_pins)) {
- ret = -EBUSY;
+ if (!can_split_folio(folio, &extra_pins)) {
+ ret = -EAGAIN;
goto out_unlock;
}
- unmap_page(head);
- VM_BUG_ON_PAGE(compound_mapcount(head), head);
-
- /* prevent PageLRU to go away from under us, and freeze lru stats */
- spin_lock_irqsave(&pgdata->lru_lock, flags);
+ unmap_folio(folio);
+ /* block interrupt reentry in xa_lock and spinlock */
+ local_irq_disable();
if (mapping) {
- XA_STATE(xas, &mapping->i_pages, page_index(head));
-
/*
- * Check if the head page is present in page cache.
- * We assume all tail are present too, if head is there.
+ * Check if the folio is present in page cache.
+	 * We assume all tail pages are present too, if the folio is there.
*/
- xa_lock(&mapping->i_pages);
- if (xas_load(&xas) != head)
+ xas_lock(&xas);
+ xas_reset(&xas);
+ if (xas_load(&xas) != folio)
goto fail;
}
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
- count = page_count(head);
- mapcount = total_mapcount(head);
- if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
- if (!list_empty(page_deferred_list(head))) {
+ if (folio_ref_freeze(folio, 1 + extra_pins)) {
+ if (!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
- list_del(page_deferred_list(head));
+ list_del(&folio->_deferred_list);
}
spin_unlock(&ds_queue->split_queue_lock);
if (mapping) {
- if (PageSwapBacked(head))
- __dec_node_page_state(head, NR_SHMEM_THPS);
- else
- __dec_node_page_state(head, NR_FILE_THPS);
- }
+ int nr = folio_nr_pages(folio);
- __split_huge_page(page, list, end, flags);
- if (PageSwapCache(head)) {
- swp_entry_t entry = { .val = page_private(head) };
+ xas_split(&xas, folio, folio_order(folio));
+ if (folio_test_swapbacked(folio)) {
+ __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS,
+ -nr);
+ } else {
+ __lruvec_stat_mod_folio(folio, NR_FILE_THPS,
+ -nr);
+ filemap_nr_thps_dec(mapping);
+ }
+ }
- ret = split_swap_cluster(entry);
- } else
- ret = 0;
+ __split_huge_page(page, list, end);
+ ret = 0;
} else {
- if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
- pr_alert("total_mapcount: %u, page_count(): %u\n",
- mapcount, count);
- if (PageTail(page))
- dump_page(head, NULL);
- dump_page(page, "total_mapcount(head) > 0");
- BUG();
- }
spin_unlock(&ds_queue->split_queue_lock);
-fail: if (mapping)
- xa_unlock(&mapping->i_pages);
- spin_unlock_irqrestore(&pgdata->lru_lock, flags);
- remap_page(head);
- ret = -EBUSY;
+fail:
+ if (mapping)
+ xas_unlock(&xas);
+ local_irq_enable();
+ remap_page(folio, folio_nr_pages(folio));
+ ret = -EAGAIN;
}
out_unlock:
@@ -2740,56 +2789,68 @@ out_unlock:
if (mapping)
i_mmap_unlock_read(mapping);
out:
+ xas_destroy(&xas);
count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
return ret;
}
void free_transhuge_page(struct page *page)
{
- struct deferred_split *ds_queue = get_deferred_split_queue(page);
+ struct folio *folio = (struct folio *)page;
+ struct deferred_split *ds_queue = get_deferred_split_queue(folio);
unsigned long flags;
- spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
- if (!list_empty(page_deferred_list(page))) {
- ds_queue->split_queue_len--;
- list_del(page_deferred_list(page));
+ /*
+ * At this point, there is no one trying to add the folio to
+ * deferred_list. If folio is not in deferred_list, it's safe
+ * to check without acquiring the split_queue_lock.
+ */
+ if (data_race(!list_empty(&folio->_deferred_list))) {
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ if (!list_empty(&folio->_deferred_list)) {
+ ds_queue->split_queue_len--;
+ list_del(&folio->_deferred_list);
+ }
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}
- spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
free_compound_page(page);
}
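/*
 * The peek, lock, re-check sequence above is a common pattern: the lockless
 * read (annotated with data_race()) skips the lock in the usual "not on the
 * list" case, and only the re-check taken under the lock is authoritative.
 * A rough user-space analogue, assuming a hypothetical flag guarded by a
 * pthread mutex:
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_queue {
	pthread_mutex_t lock;
	long len;
	bool queued;	/* stands in for !list_empty(&folio->_deferred_list) */
};

static void demo_remove_if_queued(struct demo_queue *q)
{
	/* racy peek: may be stale, a false positive only costs taking the lock */
	if (q->queued) {
		pthread_mutex_lock(&q->lock);
		if (q->queued) {	/* authoritative re-check under the lock */
			q->queued = false;
			q->len--;
		}
		pthread_mutex_unlock(&q->lock);
	}
}

int main(void)
{
	struct demo_queue q = {
		.lock = PTHREAD_MUTEX_INITIALIZER, .len = 1, .queued = true,
	};

	demo_remove_if_queued(&q);
	printf("len=%ld queued=%d\n", q.len, (int)q.queued);	/* len=0 queued=0 */
	return 0;
}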
-void deferred_split_huge_page(struct page *page)
+void deferred_split_folio(struct folio *folio)
{
- struct deferred_split *ds_queue = get_deferred_split_queue(page);
+ struct deferred_split *ds_queue = get_deferred_split_queue(folio);
#ifdef CONFIG_MEMCG
- struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+ struct mem_cgroup *memcg = folio_memcg(folio);
#endif
unsigned long flags;
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);
/*
* The try_to_unmap() in page reclaim path might reach here too,
* this may cause a race condition to corrupt deferred split queue.
- * And, if page reclaim is already handling the same page, it is
+ * And, if page reclaim is already handling the same folio, it is
* unnecessary to handle it again in shrinker.
*
- * Check PageSwapCache to determine if the page is being
- * handled by page reclaim since THP swap would add the page into
+ * Check the swapcache flag to determine if the folio is being
+ * handled by page reclaim since THP swap would add the folio into
* swap cache before calling try_to_unmap().
*/
- if (PageSwapCache(page))
+ if (folio_test_swapcache(folio))
+ return;
+
+ if (!list_empty(&folio->_deferred_list))
return;
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
- if (list_empty(page_deferred_list(page))) {
+ if (list_empty(&folio->_deferred_list)) {
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
- list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
+ list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
ds_queue->split_queue_len++;
#ifdef CONFIG_MEMCG
if (memcg)
- memcg_set_shrinker_bit(memcg, page_to_nid(page),
- deferred_split_shrinker.id);
+ set_shrinker_bit(memcg, folio_nid(folio),
+ deferred_split_shrinker.id);
#endif
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
@@ -2814,8 +2875,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
struct pglist_data *pgdata = NODE_DATA(sc->nid);
struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
unsigned long flags;
- LIST_HEAD(list), *pos, *next;
- struct page *page;
+ LIST_HEAD(list);
+ struct folio *folio, *next;
int split = 0;
#ifdef CONFIG_MEMCG
@@ -2825,14 +2886,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
/* Take pin on all head pages to avoid freeing them under us */
- list_for_each_safe(pos, next, &ds_queue->split_queue) {
- page = list_entry((void *)pos, struct page, mapping);
- page = compound_head(page);
- if (get_page_unless_zero(page)) {
- list_move(page_deferred_list(page), &list);
+ list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
+ _deferred_list) {
+ if (folio_try_get(folio)) {
+ list_move(&folio->_deferred_list, &list);
} else {
- /* We lost race with put_compound_page() */
- list_del_init(page_deferred_list(page));
+ /* We lost race with folio_put() */
+ list_del_init(&folio->_deferred_list);
ds_queue->split_queue_len--;
}
if (!--sc->nr_to_scan)
@@ -2840,16 +2900,15 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
- list_for_each_safe(pos, next, &list) {
- page = list_entry((void *)pos, struct page, mapping);
- if (!trylock_page(page))
+ list_for_each_entry_safe(folio, next, &list, _deferred_list) {
+ if (!folio_trylock(folio))
goto next;
/* split_huge_page() removes page from list on success */
- if (!split_huge_page(page))
+ if (!split_folio(folio))
split++;
- unlock_page(page);
+ folio_unlock(folio);
next:
- put_page(page);
+ folio_put(folio);
}
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
@@ -2874,48 +2933,281 @@ static struct shrinker deferred_split_shrinker = {
};
#ifdef CONFIG_DEBUG_FS
-static int split_huge_pages_set(void *data, u64 val)
+static void split_huge_pages_all(void)
{
struct zone *zone;
struct page *page;
+ struct folio *folio;
unsigned long pfn, max_zone_pfn;
unsigned long total = 0, split = 0;
- if (val != 1)
- return -EINVAL;
-
- for_each_populated_zone(zone) {
+ pr_debug("Split all THPs\n");
+ for_each_zone(zone) {
+ if (!managed_zone(zone))
+ continue;
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
- if (!pfn_valid(pfn))
- continue;
+ int nr_pages;
- page = pfn_to_page(pfn);
- if (!get_page_unless_zero(page))
+ page = pfn_to_online_page(pfn);
+ if (!page || PageTail(page))
+ continue;
+ folio = page_folio(page);
+ if (!folio_try_get(folio))
continue;
- if (zone != page_zone(page))
+ if (unlikely(page_folio(page) != folio))
+ goto next;
+
+ if (zone != folio_zone(folio))
goto next;
- if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
+ if (!folio_test_large(folio)
+ || folio_test_hugetlb(folio)
+ || !folio_test_lru(folio))
goto next;
total++;
- lock_page(page);
- if (!split_huge_page(page))
+ folio_lock(folio);
+ nr_pages = folio_nr_pages(folio);
+ if (!split_folio(folio))
split++;
- unlock_page(page);
+ pfn += nr_pages - 1;
+ folio_unlock(folio);
next:
- put_page(page);
+ folio_put(folio);
+ cond_resched();
}
}
- pr_info("%lu of %lu THP split\n", split, total);
+ pr_debug("%lu of %lu THP split\n", split, total);
+}
- return 0;
+static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
+{
+ return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
+ is_vm_hugetlb_page(vma);
+}
+
+static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
+ unsigned long vaddr_end)
+{
+ int ret = 0;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ unsigned long total = 0, split = 0;
+ unsigned long addr;
+
+ vaddr_start &= PAGE_MASK;
+ vaddr_end &= PAGE_MASK;
+
+ /* Find the task_struct from pid */
+ rcu_read_lock();
+ task = find_task_by_vpid(pid);
+ if (!task) {
+ rcu_read_unlock();
+ ret = -ESRCH;
+ goto out;
+ }
+ get_task_struct(task);
+ rcu_read_unlock();
+
+ /* Find the mm_struct */
+ mm = get_task_mm(task);
+ put_task_struct(task);
+
+ if (!mm) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
+ pid, vaddr_start, vaddr_end);
+
+ mmap_read_lock(mm);
+ /*
+ * always increase addr by PAGE_SIZE, since we could have a PTE page
+ * table filled with PTE-mapped THPs, each of which is distinct.
+ */
+ for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
+ struct vm_area_struct *vma = vma_lookup(mm, addr);
+ struct page *page;
+
+ if (!vma)
+ break;
+
+ /* skip special VMA and hugetlb VMA */
+ if (vma_not_suitable_for_thp_split(vma)) {
+ addr = vma->vm_end;
+ continue;
+ }
+
+ /* FOLL_DUMP to ignore special (like zero) pages */
+ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
+
+ if (IS_ERR_OR_NULL(page))
+ continue;
+
+ if (!is_transparent_hugepage(page))
+ goto next;
+
+ total++;
+ if (!can_split_folio(page_folio(page), NULL))
+ goto next;
+
+ if (!trylock_page(page))
+ goto next;
+
+ if (!split_huge_page(page))
+ split++;
+
+ unlock_page(page);
+next:
+ put_page(page);
+ cond_resched();
+ }
+ mmap_read_unlock(mm);
+ mmput(mm);
+
+ pr_debug("%lu of %lu THP split\n", split, total);
+
+out:
+ return ret;
}
-DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
- "%llu\n");
+
+static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
+ pgoff_t off_end)
+{
+ struct filename *file;
+ struct file *candidate;
+ struct address_space *mapping;
+ int ret = -EINVAL;
+ pgoff_t index;
+ int nr_pages = 1;
+ unsigned long total = 0, split = 0;
+
+ file = getname_kernel(file_path);
+ if (IS_ERR(file))
+ return ret;
+
+ candidate = file_open_name(file, O_RDONLY, 0);
+ if (IS_ERR(candidate))
+ goto out;
+
+ pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
+ file_path, off_start, off_end);
+
+ mapping = candidate->f_mapping;
+
+ for (index = off_start; index < off_end; index += nr_pages) {
+ struct folio *folio = filemap_get_folio(mapping, index);
+
+ nr_pages = 1;
+ if (IS_ERR(folio))
+ continue;
+
+ if (!folio_test_large(folio))
+ goto next;
+
+ total++;
+ nr_pages = folio_nr_pages(folio);
+
+ if (!folio_trylock(folio))
+ goto next;
+
+ if (!split_folio(folio))
+ split++;
+
+ folio_unlock(folio);
+next:
+ folio_put(folio);
+ cond_resched();
+ }
+
+ filp_close(candidate, NULL);
+ ret = 0;
+
+ pr_debug("%lu of %lu file-backed THP split\n", split, total);
+out:
+ putname(file);
+ return ret;
+}
+
+#define MAX_INPUT_BUF_SZ 255
+
+static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppops)
+{
+ static DEFINE_MUTEX(split_debug_mutex);
+ ssize_t ret;
+ /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */
+ char input_buf[MAX_INPUT_BUF_SZ];
+ int pid;
+ unsigned long vaddr_start, vaddr_end;
+
+ ret = mutex_lock_interruptible(&split_debug_mutex);
+ if (ret)
+ return ret;
+
+ ret = -EFAULT;
+
+ memset(input_buf, 0, MAX_INPUT_BUF_SZ);
+ if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
+ goto out;
+
+ input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';
+
+ if (input_buf[0] == '/') {
+ char *tok;
+ char *buf = input_buf;
+ char file_path[MAX_INPUT_BUF_SZ];
+ pgoff_t off_start = 0, off_end = 0;
+ size_t input_len = strlen(input_buf);
+
+ tok = strsep(&buf, ",");
+ if (tok) {
+ strcpy(file_path, tok);
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end);
+ if (ret != 2) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = split_huge_pages_in_file(file_path, off_start, off_end);
+ if (!ret)
+ ret = input_len;
+
+ goto out;
+ }
+
+ ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
+ if (ret == 1 && pid == 1) {
+ split_huge_pages_all();
+ ret = strlen(input_buf);
+ goto out;
+ } else if (ret != 3) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end);
+ if (!ret)
+ ret = strlen(input_buf);
+out:
+ mutex_unlock(&split_debug_mutex);
+ return ret;
+
+}
+
+static const struct file_operations split_huge_pages_fops = {
+ .owner = THIS_MODULE,
+ .write = split_huge_pages_write,
+ .llseek = no_llseek,
+};
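/*
 * Example user of the debugfs write interface implemented above.  The three
 * accepted input forms follow the sscanf() formats in
 * split_huge_pages_write(): writing "1" splits all THPs,
 * "<pid>,0x<start>,0x<end>" splits THPs mapped by a process in that virtual
 * address range, and "<path>,0x<off_start>,0x<off_end>" splits file-backed
 * THPs by page offset.  The path below assumes the file is created as
 * "split_huge_pages" under the default debugfs mount point.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int demo_split_range(int pid, unsigned long start, unsigned long end)
{
	char cmd[128];
	int fd, ret = 0;

	fd = open("/sys/kernel/debug/split_huge_pages", O_WRONLY);
	if (fd < 0)
		return -1;

	snprintf(cmd, sizeof(cmd), "%d,0x%lx,0x%lx", pid, start, end);
	if (write(fd, cmd, strlen(cmd)) < 0)
		ret = -1;

	close(fd);
	return ret;
}

int main(void)
{
	/* try to split any THPs mapped in the low 16MB of our address space */
	return demo_split_range(getpid(), 0x0, 0x1000000) ? 1 : 0;
}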
static int __init split_huge_pages_debugfs(void)
{
@@ -2927,30 +3219,53 @@ late_initcall(split_huge_pages_debugfs);
#endif
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
-void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
+int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
struct page *page)
{
struct vm_area_struct *vma = pvmw->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long address = pvmw->address;
+ bool anon_exclusive;
pmd_t pmdval;
swp_entry_t entry;
pmd_t pmdswp;
if (!(pvmw->pmd && !pvmw->pte))
- return;
+ return 0;
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+
+ /* See page_try_share_anon_rmap(): invalidate PMD first. */
+ anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
+ if (anon_exclusive && page_try_share_anon_rmap(page)) {
+ set_pmd_at(mm, address, pvmw->pmd, pmdval);
+ return -EBUSY;
+ }
+
if (pmd_dirty(pmdval))
set_page_dirty(page);
- entry = make_migration_entry(page, pmd_write(pmdval));
+ if (pmd_write(pmdval))
+ entry = make_writable_migration_entry(page_to_pfn(page));
+ else if (anon_exclusive)
+ entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
+ else
+ entry = make_readable_migration_entry(page_to_pfn(page));
+ if (pmd_young(pmdval))
+ entry = make_migration_entry_young(entry);
+ if (pmd_dirty(pmdval))
+ entry = make_migration_entry_dirty(entry);
pmdswp = swp_entry_to_pmd(entry);
if (pmd_soft_dirty(pmdval))
pmdswp = pmd_swp_mksoft_dirty(pmdswp);
+ if (pmd_uffd_wp(pmdval))
+ pmdswp = pmd_swp_mkuffd_wp(pmdswp);
set_pmd_at(mm, address, pvmw->pmd, pmdswp);
- page_remove_rmap(page, true);
+ page_remove_rmap(page, vma, true);
put_page(page);
+ trace_set_migration_pmd(address, pmd_val(pmdswp));
+
+ return 0;
}
void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
@@ -2958,7 +3273,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
struct vm_area_struct *vma = pvmw->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long address = pvmw->address;
- unsigned long mmun_start = address & HPAGE_PMD_MASK;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
pmd_t pmde;
swp_entry_t entry;
@@ -2967,20 +3282,34 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
entry = pmd_to_swp_entry(*pvmw->pmd);
get_page(new);
- pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
+ pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
- if (is_write_migration_entry(entry))
- pmde = maybe_pmd_mkwrite(pmde, vma);
+ if (is_writable_migration_entry(entry))
+ pmde = pmd_mkwrite(pmde);
+ if (pmd_swp_uffd_wp(*pvmw->pmd))
+ pmde = pmd_mkuffd_wp(pmde);
+ if (!is_migration_entry_young(entry))
+ pmde = pmd_mkold(pmde);
+	/* NOTE: pmd_mkdirty() may also set soft-dirty on some archs */
+ if (PageDirty(new) && is_migration_entry_dirty(entry))
+ pmde = pmd_mkdirty(pmde);
+
+ if (PageAnon(new)) {
+ rmap_t rmap_flags = RMAP_COMPOUND;
+
+ if (!is_readable_migration_entry(entry))
+ rmap_flags |= RMAP_EXCLUSIVE;
+
+ page_add_anon_rmap(new, vma, haddr, rmap_flags);
+ } else {
+ page_add_file_rmap(new, vma, true);
+ }
+ VM_BUG_ON(pmd_write(pmde) && PageAnon(new) && !PageAnonExclusive(new));
+ set_pmd_at(mm, haddr, pvmw->pmd, pmde);
- flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
- if (PageAnon(new))
- page_add_anon_rmap(new, vma, mmun_start, true);
- else
- page_add_file_rmap(new, true);
- set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
- if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
- mlock_vma_page(new);
+ /* No need to invalidate - it was non-present before */
update_mmu_cache_pmd(vma, address, pvmw->pmd);
+ trace_remove_migration_pmd(address, pmd_val(pmde));
}
#endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fe76f8fd5a73..6da626bfb52e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -30,6 +30,10 @@
#include <linux/numa.h>
#include <linux/llist.h>
#include <linux/cma.h>
+#include <linux/migrate.h>
+#include <linux/nospec.h>
+#include <linux/delayacct.h>
+#include <linux/memory.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
@@ -39,9 +43,9 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
-#include <linux/userfaultfd_k.h>
#include <linux/page_owner.h>
#include "internal.h"
+#include "hugetlb_vmemmap.h"
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
@@ -49,15 +53,20 @@ struct hstate hstates[HUGE_MAX_HSTATE];
#ifdef CONFIG_CMA
static struct cma *hugetlb_cma[MAX_NUMNODES];
+static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
+static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
+{
+ return cma_pages_valid(hugetlb_cma[folio_nid(folio)], &folio->page,
+ 1 << order);
+}
+#else
+static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
+{
+ return false;
+}
#endif
static unsigned long hugetlb_cma_size __initdata;
-/*
- * Minimum page order among possible hugepage sizes, set to a proper value
- * at boot time.
- */
-static unsigned int minimum_order __read_mostly = UINT_MAX;
-
__initdata LIST_HEAD(huge_boot_pages);
/* for command line parsing */
@@ -65,6 +74,7 @@ static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;
+static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
/*
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -81,17 +91,33 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
+static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
+static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end);
-static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
- bool free = (spool->count == 0) && (spool->used_hpages == 0);
+ if (spool->count)
+ return false;
+ if (spool->max_hpages != -1)
+ return spool->used_hpages == 0;
+ if (spool->min_hpages != -1)
+ return spool->rsv_hpages == spool->min_hpages;
+
+ return true;
+}
- spin_unlock(&spool->lock);
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
+ unsigned long irq_flags)
+{
+ spin_unlock_irqrestore(&spool->lock, irq_flags);
/* If no pages are used, and no other handles to the subpool
* remain, give up any reservations based on minimum size and
* free the subpool */
- if (free) {
+ if (subpool_is_free(spool)) {
if (spool->min_hpages != -1)
hugetlb_acct_memory(spool->hstate,
-spool->min_hpages);
@@ -125,10 +151,12 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
void hugepage_put_subpool(struct hugepage_subpool *spool)
{
- spin_lock(&spool->lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&spool->lock, flags);
BUG_ON(!spool->count);
spool->count--;
- unlock_or_release_subpool(spool);
+ unlock_or_release_subpool(spool, flags);
}
/*
@@ -147,7 +175,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
if (!spool)
return ret;
- spin_lock(&spool->lock);
+ spin_lock_irq(&spool->lock);
if (spool->max_hpages != -1) { /* maximum size accounting */
if ((spool->used_hpages + delta) <= spool->max_hpages)
@@ -174,7 +202,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
}
unlock_ret:
- spin_unlock(&spool->lock);
+ spin_unlock_irq(&spool->lock);
return ret;
}
@@ -188,11 +216,12 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
long delta)
{
long ret = delta;
+ unsigned long flags;
if (!spool)
return delta;
- spin_lock(&spool->lock);
+ spin_lock_irqsave(&spool->lock, flags);
if (spool->max_hpages != -1) /* maximum size accounting */
spool->used_hpages -= delta;
@@ -213,7 +242,7 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
* If hugetlbfs_put_super couldn't free spool due to an outstanding
* quota reference, free it now.
*/
- unlock_or_release_subpool(spool);
+ unlock_or_release_subpool(spool, flags);
return ret;
}
@@ -228,13 +257,153 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
return subpool_inode(file_inode(vma->vm_file));
}
+/*
+ * hugetlb vma_lock helper routines
+ */
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_lock(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ down_read(&vma_lock->rw_sema);
+ }
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_lock(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ up_read(&vma_lock->rw_sema);
+ }
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_lock(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ down_write(&vma_lock->rw_sema);
+ }
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_lock(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ up_write(&vma_lock->rw_sema);
+ }
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ if (!__vma_shareable_lock(vma))
+ return 1;
+
+ return down_write_trylock(&vma_lock->rw_sema);
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_lock(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ lockdep_assert_held(&vma_lock->rw_sema);
+ }
+}
+
+void hugetlb_vma_lock_release(struct kref *kref)
+{
+ struct hugetlb_vma_lock *vma_lock = container_of(kref,
+ struct hugetlb_vma_lock, refs);
+
+ kfree(vma_lock);
+}
+
+static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
+{
+ struct vm_area_struct *vma = vma_lock->vma;
+
+ /*
+	 * vma_lock structure may or may not be released as a result of put,
+	 * it will certainly no longer be attached to the vma, so clear the pointer.
+ * Semaphore synchronizes access to vma_lock->vma field.
+ */
+ vma_lock->vma = NULL;
+ vma->vm_private_data = NULL;
+ up_write(&vma_lock->rw_sema);
+ kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+}
+
+static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_lock(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ __hugetlb_vma_unlock_write_put(vma_lock);
+ }
+}
+
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
+{
+ /*
+ * Only present in sharable vmas.
+ */
+ if (!vma || !__vma_shareable_lock(vma))
+ return;
+
+ if (vma->vm_private_data) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ down_write(&vma_lock->rw_sema);
+ __hugetlb_vma_unlock_write_put(vma_lock);
+ }
+}
+
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+ struct hugetlb_vma_lock *vma_lock;
+
+ /* Only establish in (flags) sharable vmas */
+ if (!vma || !(vma->vm_flags & VM_MAYSHARE))
+ return;
+
+ /* Should never get here with non-NULL vm_private_data */
+ if (vma->vm_private_data)
+ return;
+
+ vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
+ if (!vma_lock) {
+ /*
+	 * If we cannot allocate the structure, then the vma cannot
+ * participate in pmd sharing. This is only a possible
+ * performance enhancement and memory saving issue.
+ * However, the lock is also used to synchronize page
+ * faults with truncation. If the lock is not present,
+ * unlikely races could leave pages in a file past i_size
+ * until the file is removed. Warn in the unlikely case of
+ * allocation failure.
+ */
+ pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
+ return;
+ }
+
+ kref_init(&vma_lock->refs);
+ init_rwsem(&vma_lock->rw_sema);
+ vma_lock->vma = vma;
+ vma->vm_private_data = vma_lock;
+}
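/*
 * A rough user-space analogue (assumptions: demo_vma_lock, pthread locking)
 * of the vma_lock lifetime handling above.  The structure pairs a
 * reader/writer lock with a reference count so that it can be detached from
 * its vma while another path still holds a reference; only the final put
 * frees it, mirroring the kref_put() in __hugetlb_vma_unlock_write_put().
 */
#include <pthread.h>
#include <stdlib.h>

struct demo_vma_lock {
	pthread_rwlock_t rw_sema;
	pthread_mutex_t ref_lock;
	int refs;		/* stands in for the kref */
};

static struct demo_vma_lock *demo_lock_alloc(void)
{
	struct demo_vma_lock *l = malloc(sizeof(*l));

	if (!l)
		return NULL;	/* callers must tolerate allocation failure */
	pthread_rwlock_init(&l->rw_sema, NULL);
	pthread_mutex_init(&l->ref_lock, NULL);
	l->refs = 1;
	return l;
}

static void demo_lock_put(struct demo_vma_lock *l)
{
	int free_it;

	pthread_mutex_lock(&l->ref_lock);
	free_it = (--l->refs == 0);
	pthread_mutex_unlock(&l->ref_lock);

	if (free_it) {
		pthread_rwlock_destroy(&l->rw_sema);
		pthread_mutex_destroy(&l->ref_lock);
		free(l);
	}
}

int main(void)
{
	struct demo_vma_lock *l = demo_lock_alloc();

	if (!l)
		return 1;
	pthread_mutex_lock(&l->ref_lock);
	l->refs++;		/* a waiter keeps its own reference */
	pthread_mutex_unlock(&l->ref_lock);

	demo_lock_put(l);	/* detach-time put: structure survives */
	demo_lock_put(l);	/* last put frees it */
	return 0;
}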
+
/* Helper that removes a struct file_region from the resv_map cache and returns
* it for use.
*/
static struct file_region *
get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
{
- struct file_region *nrg = NULL;
+ struct file_region *nrg;
VM_BUG_ON(resv->region_cache_count <= 0);
@@ -270,6 +439,17 @@ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
nrg->reservation_counter =
&h_cg->rsvd_hugepage[hstate_index(h)];
nrg->css = &h_cg->css;
+ /*
+ * The caller will hold exactly one h_cg->css reference for the
+ * whole contiguous reservation region. But this area might be
+	 * scattered when some file_regions already reside in
+ * it. As a result, many file_regions may share only one css
+ * reference. In order to ensure that one file_region must hold
+ * exactly one h_cg->css reference, we should do css_get for
+	 * each file_region and leave the reference held by the caller
+ * untouched.
+ */
+ css_get(&h_cg->css);
if (!resv->pages_per_hpage)
resv->pages_per_hpage = pages_per_huge_page(h);
/* pages_per_hpage should be the same for all entries in
@@ -283,12 +463,19 @@ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
#endif
}
+static void put_uncharge_info(struct file_region *rg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+ if (rg->css)
+ css_put(rg->css);
+#endif
+}
+
static bool has_same_uncharge_info(struct file_region *rg,
struct file_region *org)
{
#ifdef CONFIG_CGROUP_HUGETLB
- return rg && org &&
- rg->reservation_counter == org->reservation_counter &&
+ return rg->reservation_counter == org->reservation_counter &&
rg->css == org->css;
#else
@@ -298,7 +485,7 @@ static bool has_same_uncharge_info(struct file_region *rg,
static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
{
- struct file_region *nrg = NULL, *prg = NULL;
+ struct file_region *nrg, *prg;
prg = list_prev_entry(rg, link);
if (&prg->link != &resv->regions && prg->to == rg->from &&
@@ -306,6 +493,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
prg->to = rg->to;
list_del(&rg->link);
+ put_uncharge_info(rg);
kfree(rg);
rg = prg;
@@ -317,10 +505,29 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
nrg->from = rg->from;
list_del(&rg->link);
+ put_uncharge_info(rg);
kfree(rg);
}
}
+static inline long
+hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from,
+ long to, struct hstate *h, struct hugetlb_cgroup *cg,
+ long *regions_needed)
+{
+ struct file_region *nrg;
+
+ if (!regions_needed) {
+ nrg = get_file_region_entry_from_cache(map, from, to);
+ record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
+ list_add(&nrg->link, rg);
+ coalesce_file_region(map, nrg);
+ } else
+ *regions_needed += 1;
+
+ return to - from;
+}
+
/*
* Must be called with resv->lock held.
*
@@ -336,67 +543,56 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
long add = 0;
struct list_head *head = &resv->regions;
long last_accounted_offset = f;
- struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;
+ struct file_region *iter, *trg = NULL;
+ struct list_head *rg = NULL;
if (regions_needed)
*regions_needed = 0;
/* In this loop, we essentially handle an entry for the range
- * [last_accounted_offset, rg->from), at every iteration, with some
+ * [last_accounted_offset, iter->from), at every iteration, with some
* bounds checking.
*/
- list_for_each_entry_safe(rg, trg, head, link) {
+ list_for_each_entry_safe(iter, trg, head, link) {
/* Skip irrelevant regions that start before our range. */
- if (rg->from < f) {
+ if (iter->from < f) {
/* If this region ends after the last accounted offset,
* then we need to update last_accounted_offset.
*/
- if (rg->to > last_accounted_offset)
- last_accounted_offset = rg->to;
+ if (iter->to > last_accounted_offset)
+ last_accounted_offset = iter->to;
continue;
}
/* When we find a region that starts beyond our range, we've
* finished.
*/
- if (rg->from > t)
+ if (iter->from >= t) {
+ rg = iter->link.prev;
break;
+ }
- /* Add an entry for last_accounted_offset -> rg->from, and
+ /* Add an entry for last_accounted_offset -> iter->from, and
* update last_accounted_offset.
*/
- if (rg->from > last_accounted_offset) {
- add += rg->from - last_accounted_offset;
- if (!regions_needed) {
- nrg = get_file_region_entry_from_cache(
- resv, last_accounted_offset, rg->from);
- record_hugetlb_cgroup_uncharge_info(h_cg, h,
- resv, nrg);
- list_add(&nrg->link, rg->link.prev);
- coalesce_file_region(resv, nrg);
- } else
- *regions_needed += 1;
- }
+ if (iter->from > last_accounted_offset)
+ add += hugetlb_resv_map_add(resv, iter->link.prev,
+ last_accounted_offset,
+ iter->from, h, h_cg,
+ regions_needed);
- last_accounted_offset = rg->to;
+ last_accounted_offset = iter->to;
}
/* Handle the case where our range extends beyond
* last_accounted_offset.
*/
- if (last_accounted_offset < t) {
- add += t - last_accounted_offset;
- if (!regions_needed) {
- nrg = get_file_region_entry_from_cache(
- resv, last_accounted_offset, t);
- record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
- list_add(&nrg->link, rg->link.prev);
- coalesce_file_region(resv, nrg);
- } else
- *regions_needed += 1;
- }
+ if (!rg)
+ rg = head->prev;
+ if (last_accounted_offset < t)
+ add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
+ t, h, h_cg, regions_needed);
- VM_BUG_ON(add < 0);
return add;
}
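/*
 * Worked example (user-space, hypothetical demo_region list) of the gap
 * accounting performed by add_reservation_in_range() above: for a request
 * [f, t), the pages added are exactly those in [f, t) that are not already
 * covered by an existing file_region.
 */
#include <stdio.h>

struct demo_region {
	long from, to;
};

static long demo_add_in_range(const struct demo_region *regions, int nr,
			      long f, long t)
{
	long add = 0, last = f;	/* last accounted offset */
	int i;

	for (i = 0; i < nr; i++) {
		if (regions[i].from < f) {	/* region starts before range */
			if (regions[i].to > last)
				last = regions[i].to;
			continue;
		}
		if (regions[i].from >= t)	/* region starts beyond range */
			break;
		if (regions[i].from > last)	/* gap before this region */
			add += regions[i].from - last;
		last = regions[i].to;
	}
	if (last < t)				/* trailing gap */
		add += t - last;
	return add;
}

int main(void)
{
	/* existing reservations cover [2,4) and [6,8); request is [0,10) */
	struct demo_region regions[] = { { 2, 4 }, { 6, 8 } };

	/* gaps are [0,2), [4,6) and [8,10): 6 pages to add */
	printf("%ld\n", demo_add_in_range(regions, 2, 0, 10));
	return 0;
}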
@@ -406,14 +602,12 @@ static int allocate_file_region_entries(struct resv_map *resv,
int regions_needed)
__must_hold(&resv->lock)
{
- struct list_head allocated_regions;
+ LIST_HEAD(allocated_regions);
int to_allocate = 0, i = 0;
struct file_region *trg = NULL, *rg = NULL;
VM_BUG_ON(regions_needed < 0);
- INIT_LIST_HEAD(&allocated_regions);
-
/*
* Check for sufficient descriptors in the cache to accommodate
* the number of in progress add operations plus regions_needed.
@@ -429,7 +623,7 @@ static int allocate_file_region_entries(struct resv_map *resv,
resv->region_cache_count;
/* At this point, we should have enough entries in the cache
- * for all the existings adds_in_progress. We should only be
+ * for all the existing adds_in_progress. We should only be
* needing to allocate for regions_needed.
*/
VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
@@ -519,7 +713,6 @@ retry:
resv->adds_in_progress -= in_regions_needed;
spin_unlock(&resv->lock);
- VM_BUG_ON(add < 0);
return add;
}
@@ -648,6 +841,8 @@ retry:
}
del += t - f;
+ hugetlb_cgroup_uncharge_file_region(
+ resv, rg, t - f, false);
/* New entry for end of split region */
nrg->from = t;
@@ -660,9 +855,6 @@ retry:
/* Original entry is trimmed */
rg->to = f;
- hugetlb_cgroup_uncharge_file_region(
- resv, rg, nrg->to - nrg->from);
-
list_add(&nrg->link, &rg->link);
nrg = NULL;
break;
@@ -671,24 +863,24 @@ retry:
if (f <= rg->from && t >= rg->to) { /* Remove entire region */
del += rg->to - rg->from;
hugetlb_cgroup_uncharge_file_region(resv, rg,
- rg->to - rg->from);
+ rg->to - rg->from, true);
list_del(&rg->link);
kfree(rg);
continue;
}
if (f <= rg->from) { /* Trim beginning of region */
+ hugetlb_cgroup_uncharge_file_region(resv, rg,
+ t - rg->from, false);
+
del += t - rg->from;
rg->from = t;
-
- hugetlb_cgroup_uncharge_file_region(resv, rg,
- t - rg->from);
} else { /* Trim end of region */
+ hugetlb_cgroup_uncharge_file_region(resv, rg,
+ rg->to - f, false);
+
del += rg->to - f;
rg->to = f;
-
- hugetlb_cgroup_uncharge_file_region(resv, rg,
- rg->to - f);
}
}
@@ -710,13 +902,20 @@ void hugetlb_fix_reserve_counts(struct inode *inode)
{
struct hugepage_subpool *spool = subpool_inode(inode);
long rsv_adjust;
+ bool reserved = false;
rsv_adjust = hugepage_subpool_get_pages(spool, 1);
- if (rsv_adjust) {
+ if (rsv_adjust > 0) {
struct hstate *h = hstate_inode(inode);
- hugetlb_acct_memory(h, 1);
+ if (!hugetlb_acct_memory(h, 1))
+ reserved = true;
+ } else if (!rsv_adjust) {
+ reserved = true;
}
+
+ if (!reserved)
+ pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
}
/*
@@ -805,7 +1004,7 @@ __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
* faults in a MAP_PRIVATE mapping. Only the process that called mmap()
* is guaranteed to have their future faults succeed.
*
- * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
+ * With the exception of hugetlb_dup_vma_private() which is called at fork(),
* the reserve counters are updated with the hugetlb_lock held. It is safe
* to reset the VMA at fork() time as it is not in use yet and there is no
* chance of the global counters getting corrupted as a result of the values.
@@ -952,12 +1151,59 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
return (get_vma_private_data(vma) & flag) != 0;
}
-/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
-void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+void hugetlb_dup_vma_private(struct vm_area_struct *vma)
{
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
- if (!(vma->vm_flags & VM_MAYSHARE))
- vma->vm_private_data = (void *)0;
+ /*
+ * Clear vm_private_data
+ * - For shared mappings this is a per-vma semaphore that may be
+ * allocated in a subsequent call to hugetlb_vm_op_open.
+	 *   Before clearing, make sure the pointer is not associated with this vma,
+	 *   as clearing it then would leak the structure.  This is the case when called
+ * via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already
+ * been called to allocate a new structure.
+ * - For MAP_PRIVATE mappings, this is the reserve map which does
+ * not apply to children. Faults generated by the children are
+ * not guaranteed to succeed, even if read-only.
+ */
+ if (vma->vm_flags & VM_MAYSHARE) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ if (vma_lock && vma_lock->vma != vma)
+ vma->vm_private_data = NULL;
+ } else
+ vma->vm_private_data = NULL;
+}
+
+/*
+ * Reset and decrement one ref on hugepage private reservation.
+ * Called with mm->mmap_lock writer semaphore held.
+ * This function should be only used by move_vma() and operate on
+ * same sized vma. It should never come here with last ref on the
+ * reservation.
+ */
+void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+ /*
+ * Clear the old hugetlb private page reservation.
+ * It has already been transferred to new_vma.
+ *
+ * During a mremap() operation of a hugetlb vma we call move_vma()
+ * which copies vma into new_vma and unmaps vma. After the copy
+ * operation both new_vma and vma share a reference to the resv_map
+ * struct, and at that point vma is about to be unmapped. We don't
+ * want to return the reservation to the pool at unmap of vma because
+ * the reservation still lives on in new_vma, so simply decrement the
+ * ref here and remove the resv_map reference from this vma.
+ */
+ struct resv_map *reservations = vma_resv_map(vma);
+
+ if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ resv_map_put_hugetlb_cgroup_uncharge_info(reservations);
+ kref_put(&reservations->refs, resv_map_release);
+ }
+
+ hugetlb_dup_vma_private(vma);
}
/* Returns true if the VMA has associated reserve pages */
@@ -1023,38 +1269,46 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
return false;
}
-static void enqueue_huge_page(struct hstate *h, struct page *page)
+static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
{
- int nid = page_to_nid(page);
- list_move(&page->lru, &h->hugepage_freelists[nid]);
+ int nid = folio_nid(folio);
+
+ lockdep_assert_held(&hugetlb_lock);
+ VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
+
+ list_move(&folio->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
+ folio_set_hugetlb_freed(folio);
}
-static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
+static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
+ int nid)
{
- struct page *page;
- bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA);
+ struct folio *folio;
+ bool pin = !!(current->flags & PF_MEMALLOC_PIN);
- list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
- if (nocma && is_migrate_cma_page(page))
+ lockdep_assert_held(&hugetlb_lock);
+ list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) {
+ if (pin && !folio_is_longterm_pinnable(folio))
continue;
- if (PageHWPoison(page))
+ if (folio_test_hwpoison(folio))
continue;
- list_move(&page->lru, &h->hugepage_activelist);
- set_page_refcounted(page);
+ list_move(&folio->lru, &h->hugepage_activelist);
+ folio_ref_unfreeze(folio, 1);
+ folio_clear_hugetlb_freed(folio);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
- return page;
+ return folio;
}
return NULL;
}
-static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
- nodemask_t *nmask)
+static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nmask)
{
unsigned int cpuset_mems_cookie;
struct zonelist *zonelist;
@@ -1067,7 +1321,7 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask,
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
- struct page *page;
+ struct folio *folio;
if (!cpuset_zone_allowed(zone, gfp_mask))
continue;
@@ -1079,9 +1333,9 @@ retry_cpuset:
continue;
node = zone_to_nid(zone);
- page = dequeue_huge_page_node_exact(h, node);
- if (page)
- return page;
+ folio = dequeue_hugetlb_folio_node_exact(h, node);
+ if (folio)
+ return folio;
}
if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
@@ -1089,12 +1343,17 @@ retry_cpuset:
return NULL;
}
-static struct page *dequeue_huge_page_vma(struct hstate *h,
+static unsigned long available_huge_pages(struct hstate *h)
+{
+ return h->free_huge_pages - h->resv_huge_pages;
+}
+
+static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve,
long chg)
{
- struct page *page;
+ struct folio *folio = NULL;
struct mempolicy *mpol;
gfp_t gfp_mask;
nodemask_t *nodemask;
@@ -1105,24 +1364,35 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
* have no page reserves. This check ensures that reservations are
* not "stolen". The child may still get SIGKILLed
*/
- if (!vma_has_reserves(vma, chg) &&
- h->free_huge_pages - h->resv_huge_pages == 0)
+ if (!vma_has_reserves(vma, chg) && !available_huge_pages(h))
goto err;
/* If reserves cannot be used, ensure enough pages are in the pool */
- if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
+ if (avoid_reserve && !available_huge_pages(h))
goto err;
gfp_mask = htlb_alloc_mask(h);
nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
- page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
- if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
- SetPagePrivate(page);
+
+ if (mpol_is_preferred_many(mpol)) {
+ folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
+ nid, nodemask);
+
+ /* Fallback to all nodes if page==NULL */
+ nodemask = NULL;
+ }
+
+ if (!folio)
+ folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
+ nid, nodemask);
+
+ if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) {
+ folio_set_hugetlb_restore_reserve(folio);
h->resv_huge_pages--;
}
mpol_cond_put(mpol);
- return page;
+ return folio;
err:
return NULL;
@@ -1170,7 +1440,7 @@ static int hstate_next_node_to_alloc(struct hstate *h,
}
/*
- * helper for free_pool_huge_page() - return the previously saved
+ * helper for remove_pool_huge_page() - return the previously saved
* node ["this node"] from which to free a huge page. Advance the
* next node id whether or not we find a free huge page to free so
* that the next attempt to free addresses the next node.
@@ -1199,59 +1469,76 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static void destroy_compound_gigantic_page(struct page *page,
- unsigned int order)
+/* used to demote non-gigantic_huge pages as well */
+static void __destroy_compound_gigantic_folio(struct folio *folio,
+ unsigned int order, bool demote)
{
int i;
int nr_pages = 1 << order;
- struct page *p = page + 1;
+ struct page *p;
- atomic_set(compound_mapcount_ptr(page), 0);
- if (hpage_pincount_available(page))
- atomic_set(compound_pincount_ptr(page), 0);
+ atomic_set(&folio->_entire_mapcount, 0);
+ atomic_set(&folio->_nr_pages_mapped, 0);
+ atomic_set(&folio->_pincount, 0);
- for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+ for (i = 1; i < nr_pages; i++) {
+ p = folio_page(folio, i);
+ p->mapping = NULL;
clear_compound_head(p);
- set_page_refcounted(p);
+ if (!demote)
+ set_page_refcounted(p);
}
- set_compound_order(page, 0);
- __ClearPageHead(page);
+ __folio_clear_head(folio);
}
-static void free_gigantic_page(struct page *page, unsigned int order)
+static void destroy_compound_hugetlb_folio_for_demote(struct folio *folio,
+ unsigned int order)
+{
+ __destroy_compound_gigantic_folio(folio, order, true);
+}
+
+#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
+static void destroy_compound_gigantic_folio(struct folio *folio,
+ unsigned int order)
+{
+ __destroy_compound_gigantic_folio(folio, order, false);
+}
+
+static void free_gigantic_folio(struct folio *folio, unsigned int order)
{
/*
* If the page isn't allocated using the cma allocator,
* cma_release() returns false.
*/
#ifdef CONFIG_CMA
- if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
+ int nid = folio_nid(folio);
+
+ if (cma_release(hugetlb_cma[nid], &folio->page, 1 << order))
return;
#endif
- free_contig_range(page_to_pfn(page), 1 << order);
+ free_contig_range(folio_pfn(folio), 1 << order);
}
#ifdef CONFIG_CONTIG_ALLOC
-static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
- unsigned long nr_pages = 1UL << huge_page_order(h);
+ struct page *page;
+ unsigned long nr_pages = pages_per_huge_page(h);
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
#ifdef CONFIG_CMA
{
- struct page *page;
int node;
if (hugetlb_cma[nid]) {
page = cma_alloc(hugetlb_cma[nid], nr_pages,
huge_page_order(h), true);
if (page)
- return page;
+ return page_folio(page);
}
if (!(gfp_mask & __GFP_THISNODE)) {
@@ -1262,19 +1549,18 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
page = cma_alloc(hugetlb_cma[node], nr_pages,
huge_page_order(h), true);
if (page)
- return page;
+ return page_folio(page);
}
}
}
#endif
- return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
+ page = alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
+ return page ? page_folio(page) : NULL;
}
-static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
-static void prep_compound_gigantic_page(struct page *page, unsigned int order);
#else /* !CONFIG_CONTIG_ALLOC */
-static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
return NULL;
@@ -1282,133 +1568,341 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
#endif /* CONFIG_CONTIG_ALLOC */
#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
-static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
return NULL;
}
-static inline void free_gigantic_page(struct page *page, unsigned int order) { }
-static inline void destroy_compound_gigantic_page(struct page *page,
+static inline void free_gigantic_folio(struct folio *folio,
+ unsigned int order) { }
+static inline void destroy_compound_gigantic_folio(struct folio *folio,
unsigned int order) { }
#endif
-static void update_and_free_page(struct hstate *h, struct page *page)
+static inline void __clear_hugetlb_destructor(struct hstate *h,
+ struct folio *folio)
{
- int i;
+ lockdep_assert_held(&hugetlb_lock);
+
+ /*
+ * Very subtle
+ *
+ * For non-gigantic pages set the destructor to the normal compound
+ * page dtor. This is needed in case someone takes an additional
+ * temporary ref to the page, and freeing is delayed until they drop
+ * their reference.
+ *
+ * For gigantic pages set the destructor to the null dtor. This
+ * destructor will never be called. Before freeing the gigantic
+ * page destroy_compound_gigantic_folio will turn the folio into a
+ * simple group of pages. After this the destructor does not
+ * apply.
+ *
+ */
+ if (hstate_is_gigantic(h))
+ folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
+ else
+ folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
+}
+/*
+ * Remove hugetlb folio from lists.
+ * If vmemmap exists for the folio, update dtor so that the folio appears
+ * as just a compound page. Otherwise, wait until after allocating vmemmap
+ * to update dtor.
+ *
+ * A reference is held on the folio, except in the case of demote.
+ *
+ * Must be called with hugetlb lock held.
+ */
+static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
+ bool adjust_surplus,
+ bool demote)
+{
+ int nid = folio_nid(folio);
+
+ VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio);
+ VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio);
+
+ lockdep_assert_held(&hugetlb_lock);
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
+ list_del(&folio->lru);
+
+ if (folio_test_hugetlb_freed(folio)) {
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ }
+ if (adjust_surplus) {
+ h->surplus_huge_pages--;
+ h->surplus_huge_pages_node[nid]--;
+ }
+
+ /*
+ * We can only clear the hugetlb destructor after allocating vmemmap
+ * pages. Otherwise, someone (memory error handling) may try to write
+ * to tail struct pages.
+ */
+ if (!folio_test_hugetlb_vmemmap_optimized(folio))
+ __clear_hugetlb_destructor(h, folio);
+
+ /*
+ * In the case of demote we do not ref count the page as it will soon
+ * be turned into a page of smaller size.
+ */
+ if (!demote)
+ folio_ref_unfreeze(folio, 1);
+
h->nr_huge_pages--;
- h->nr_huge_pages_node[page_to_nid(page)]--;
- for (i = 0; i < pages_per_huge_page(h); i++) {
- page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
- 1 << PG_referenced | 1 << PG_dirty |
- 1 << PG_active | 1 << PG_private |
- 1 << PG_writeback);
+ h->nr_huge_pages_node[nid]--;
+}
+
+static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
+ bool adjust_surplus)
+{
+ __remove_hugetlb_folio(h, folio, adjust_surplus, false);
+}
+
+static void remove_hugetlb_folio_for_demote(struct hstate *h, struct folio *folio,
+ bool adjust_surplus)
+{
+ __remove_hugetlb_folio(h, folio, adjust_surplus, true);
+}
+
+static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
+ bool adjust_surplus)
+{
+ int zeroed;
+ int nid = folio_nid(folio);
+
+ VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio);
+
+ lockdep_assert_held(&hugetlb_lock);
+
+ INIT_LIST_HEAD(&folio->lru);
+ h->nr_huge_pages++;
+ h->nr_huge_pages_node[nid]++;
+
+ if (adjust_surplus) {
+ h->surplus_huge_pages++;
+ h->surplus_huge_pages_node[nid]++;
}
- VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
- VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
- set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
- set_page_refcounted(page);
- if (hstate_is_gigantic(h)) {
+
+ folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR);
+ folio_change_private(folio, NULL);
+ /*
+ * We have to set hugetlb_vmemmap_optimized again as above
+ * folio_change_private(folio, NULL) cleared it.
+ */
+ folio_set_hugetlb_vmemmap_optimized(folio);
+
+ /*
+ * This folio is about to be managed by the hugetlb allocator and
+ * should have no users. Drop our reference, and check for others
+ * just in case.
+ */
+ zeroed = folio_put_testzero(folio);
+ if (unlikely(!zeroed))
/*
- * Temporarily drop the hugetlb_lock, because
- * we might block in free_gigantic_page().
+		 * It is VERY unlikely someone else has taken a ref on
+ * the page. In this case, we simply return as the
+ * hugetlb destructor (free_huge_page) will be called
+ * when this other ref is dropped.
*/
- spin_unlock(&hugetlb_lock);
- destroy_compound_gigantic_page(page, huge_page_order(h));
- free_gigantic_page(page, huge_page_order(h));
- spin_lock(&hugetlb_lock);
- } else {
- __free_pages(page, huge_page_order(h));
- }
+ return;
+
+ arch_clear_hugepage_flags(&folio->page);
+ enqueue_hugetlb_folio(h, folio);
}
-struct hstate *size_to_hstate(unsigned long size)
+static void __update_and_free_hugetlb_folio(struct hstate *h,
+ struct folio *folio)
{
- struct hstate *h;
+ int i;
+ struct page *subpage;
+ bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio);
- for_each_hstate(h) {
- if (huge_page_size(h) == size)
- return h;
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+ return;
+
+ /*
+ * If we don't know which subpages are hwpoisoned, we can't free
+ * the hugepage, so it's leaked intentionally.
+ */
+ if (folio_test_hugetlb_raw_hwp_unreliable(folio))
+ return;
+
+ if (hugetlb_vmemmap_restore(h, &folio->page)) {
+ spin_lock_irq(&hugetlb_lock);
+ /*
+ * If we cannot allocate vmemmap pages, just refuse to free the
+ * page and put the page back on the hugetlb free list and treat
+ * as a surplus page.
+ */
+ add_hugetlb_folio(h, folio, true);
+ spin_unlock_irq(&hugetlb_lock);
+ return;
+ }
+
+ /*
+ * Move PageHWPoison flag from head page to the raw error pages,
+ * which makes any healthy subpages reusable.
+ */
+ if (unlikely(folio_test_hwpoison(folio)))
+ folio_clear_hugetlb_hwpoison(folio);
+
+ /*
+ * If vmemmap pages were allocated above, then we need to clear the
+ * hugetlb destructor under the hugetlb lock.
+ */
+ if (clear_dtor) {
+ spin_lock_irq(&hugetlb_lock);
+ __clear_hugetlb_destructor(h, folio);
+ spin_unlock_irq(&hugetlb_lock);
+ }
+
+ for (i = 0; i < pages_per_huge_page(h); i++) {
+ subpage = folio_page(folio, i);
+ subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
+ 1 << PG_referenced | 1 << PG_dirty |
+ 1 << PG_active | 1 << PG_private |
+ 1 << PG_writeback);
+ }
+
+ /*
+ * Non-gigantic pages demoted from CMA allocated gigantic pages
+ * need to be given back to CMA in free_gigantic_folio.
+ */
+ if (hstate_is_gigantic(h) ||
+ hugetlb_cma_folio(folio, huge_page_order(h))) {
+ destroy_compound_gigantic_folio(folio, huge_page_order(h));
+ free_gigantic_folio(folio, huge_page_order(h));
+ } else {
+ __free_pages(&folio->page, huge_page_order(h));
}
- return NULL;
}
/*
- * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
- * to hstate->hugepage_activelist.)
+ * As update_and_free_hugetlb_folio() can be called under any context, we cannot
+ * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
+ * actual freeing in a workqueue so that GFP_ATOMIC is not needed to allocate
+ * the vmemmap pages.
*
- * This function can be called for tail pages, but never returns true for them.
+ * free_hpage_workfn() locklessly retrieves the linked list of pages to be
+ * freed and frees them one-by-one. As the page->mapping pointer is going
+ * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
+ * structure of a lockless linked list of huge pages to be freed.
*/
-bool page_huge_active(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageHuge(page), page);
- return PageHead(page) && PagePrivate(&page[1]);
-}
+static LLIST_HEAD(hpage_freelist);
-/* never called for tail page */
-static void set_page_huge_active(struct page *page)
+static void free_hpage_workfn(struct work_struct *work)
{
- VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
- SetPagePrivate(&page[1]);
+ struct llist_node *node;
+
+ node = llist_del_all(&hpage_freelist);
+
+ while (node) {
+ struct page *page;
+ struct hstate *h;
+
+ page = container_of((struct address_space **)node,
+ struct page, mapping);
+ node = node->next;
+ page->mapping = NULL;
+ /*
+ * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate()
+ * is going to trigger because a previous call to
+ * remove_hugetlb_folio() will call folio_set_compound_dtor
+ * (folio, NULL_COMPOUND_DTOR), so do not use page_hstate()
+ * directly.
+ */
+ h = size_to_hstate(page_size(page));
+
+ __update_and_free_hugetlb_folio(h, page_folio(page));
+
+ cond_resched();
+ }
}
+static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
-static void clear_page_huge_active(struct page *page)
+static inline void flush_free_hpage_work(struct hstate *h)
{
- VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
- ClearPagePrivate(&page[1]);
+ if (hugetlb_vmemmap_optimizable(h))
+ flush_work(&free_hpage_work);
}
-/*
- * Internal hugetlb specific page flag. Do not use outside of the hugetlb
- * code
- */
-static inline bool PageHugeTemporary(struct page *page)
+static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
+ bool atomic)
{
- if (!PageHuge(page))
- return false;
+ if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) {
+ __update_and_free_hugetlb_folio(h, folio);
+ return;
+ }
- return (unsigned long)page[2].mapping == -1U;
+ /*
+ * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
+ *
+ * Only call schedule_work() if hpage_freelist is previously
+ * empty. Otherwise, schedule_work() had been called but the workfn
+ * hasn't retrieved the list yet.
+ */
+ if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist))
+ schedule_work(&free_hpage_work);
}
-static inline void SetPageHugeTemporary(struct page *page)
+static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
{
- page[2].mapping = (void *)-1U;
+ struct page *page, *t_page;
+ struct folio *folio;
+
+ list_for_each_entry_safe(page, t_page, list, lru) {
+ folio = page_folio(page);
+ update_and_free_hugetlb_folio(h, folio, false);
+ cond_resched();
+ }
}
-static inline void ClearPageHugeTemporary(struct page *page)
+struct hstate *size_to_hstate(unsigned long size)
{
- page[2].mapping = NULL;
+ struct hstate *h;
+
+ for_each_hstate(h) {
+ if (huge_page_size(h) == size)
+ return h;
+ }
+ return NULL;
}
-static void __free_huge_page(struct page *page)
+void free_huge_page(struct page *page)
{
/*
* Can't pass hstate in here because it is called from the
* compound page destructor.
*/
- struct hstate *h = page_hstate(page);
- int nid = page_to_nid(page);
- struct hugepage_subpool *spool =
- (struct hugepage_subpool *)page_private(page);
+ struct folio *folio = page_folio(page);
+ struct hstate *h = folio_hstate(folio);
+ int nid = folio_nid(folio);
+ struct hugepage_subpool *spool = hugetlb_folio_subpool(folio);
bool restore_reserve;
+ unsigned long flags;
- VM_BUG_ON_PAGE(page_count(page), page);
- VM_BUG_ON_PAGE(page_mapcount(page), page);
+ VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
+ VM_BUG_ON_FOLIO(folio_mapcount(folio), folio);
- set_page_private(page, 0);
- page->mapping = NULL;
- restore_reserve = PagePrivate(page);
- ClearPagePrivate(page);
+ hugetlb_set_folio_subpool(folio, NULL);
+ if (folio_test_anon(folio))
+ __ClearPageAnonExclusive(&folio->page);
+ folio->mapping = NULL;
+ restore_reserve = folio_test_hugetlb_restore_reserve(folio);
+ folio_clear_hugetlb_restore_reserve(folio);
/*
- * If PagePrivate() was set on page, page allocation consumed a
+ * If HPageRestoreReserve was set on page, page allocation consumed a
* reservation. If the page was associated with a subpool, there
* would have been a page reserved in the subpool before allocation
* via hugepage_subpool_get_pages(). Since we are 'restoring' the
- * reservtion, do not call hugepage_subpool_put_pages() as this will
+ * reservation, do not call hugepage_subpool_put_pages() as this will
* remove the reserved page from the subpool.
*/
if (!restore_reserve) {
@@ -1422,103 +1916,70 @@ static void __free_huge_page(struct page *page)
restore_reserve = true;
}
- spin_lock(&hugetlb_lock);
- clear_page_huge_active(page);
- hugetlb_cgroup_uncharge_page(hstate_index(h),
- pages_per_huge_page(h), page);
- hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
- pages_per_huge_page(h), page);
+ spin_lock_irqsave(&hugetlb_lock, flags);
+ folio_clear_hugetlb_migratable(folio);
+ hugetlb_cgroup_uncharge_folio(hstate_index(h),
+ pages_per_huge_page(h), folio);
+ hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
+ pages_per_huge_page(h), folio);
if (restore_reserve)
h->resv_huge_pages++;
- if (PageHugeTemporary(page)) {
- list_del(&page->lru);
- ClearPageHugeTemporary(page);
- update_and_free_page(h, page);
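+ /*
+ * Temporary folios and unneeded surplus folios are handed back to
+ * the low level allocators; anything else is returned to the
+ * hugetlb free lists.
+ */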
+ if (folio_test_hugetlb_temporary(folio)) {
+ remove_hugetlb_folio(h, folio, false);
+ spin_unlock_irqrestore(&hugetlb_lock, flags);
+ update_and_free_hugetlb_folio(h, folio, true);
} else if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
- list_del(&page->lru);
- update_and_free_page(h, page);
- h->surplus_huge_pages--;
- h->surplus_huge_pages_node[nid]--;
+ remove_hugetlb_folio(h, folio, true);
+ spin_unlock_irqrestore(&hugetlb_lock, flags);
+ update_and_free_hugetlb_folio(h, folio, true);
} else {
arch_clear_hugepage_flags(page);
- enqueue_huge_page(h, page);
+ enqueue_hugetlb_folio(h, folio);
+ spin_unlock_irqrestore(&hugetlb_lock, flags);
}
- spin_unlock(&hugetlb_lock);
}
/*
- * As free_huge_page() can be called from a non-task context, we have
- * to defer the actual freeing in a workqueue to prevent potential
- * hugetlb_lock deadlock.
- *
- * free_hpage_workfn() locklessly retrieves the linked list of pages to
- * be freed and frees them one-by-one. As the page->mapping pointer is
- * going to be cleared in __free_huge_page() anyway, it is reused as the
- * llist_node structure of a lockless linked list of huge pages to be freed.
+ * Must be called with the hugetlb lock held
*/
-static LLIST_HEAD(hpage_freelist);
-
-static void free_hpage_workfn(struct work_struct *work)
+static void __prep_account_new_huge_page(struct hstate *h, int nid)
{
- struct llist_node *node;
- struct page *page;
-
- node = llist_del_all(&hpage_freelist);
-
- while (node) {
- page = container_of((struct address_space **)node,
- struct page, mapping);
- node = node->next;
- __free_huge_page(page);
- }
+ lockdep_assert_held(&hugetlb_lock);
+ h->nr_huge_pages++;
+ h->nr_huge_pages_node[nid]++;
}
-static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
-void free_huge_page(struct page *page)
+static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
{
- /*
- * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
- */
- if (!in_task()) {
- /*
- * Only call schedule_work() if hpage_freelist is previously
- * empty. Otherwise, schedule_work() had been called but the
- * workfn hasn't retrieved the list yet.
- */
- if (llist_add((struct llist_node *)&page->mapping,
- &hpage_freelist))
- schedule_work(&free_hpage_work);
- return;
- }
-
- __free_huge_page(page);
+ hugetlb_vmemmap_optimize(h, &folio->page);
+ INIT_LIST_HEAD(&folio->lru);
+ folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR);
+ hugetlb_set_folio_subpool(folio, NULL);
+ set_hugetlb_cgroup(folio, NULL);
+ set_hugetlb_cgroup_rsvd(folio, NULL);
}
-static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid)
{
- INIT_LIST_HEAD(&page->lru);
- set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
- set_hugetlb_cgroup(page, NULL);
- set_hugetlb_cgroup_rsvd(page, NULL);
- spin_lock(&hugetlb_lock);
- h->nr_huge_pages++;
- h->nr_huge_pages_node[nid]++;
- spin_unlock(&hugetlb_lock);
+ __prep_new_hugetlb_folio(h, folio);
+ spin_lock_irq(&hugetlb_lock);
+ __prep_account_new_huge_page(h, nid);
+ spin_unlock_irq(&hugetlb_lock);
}
-static void prep_compound_gigantic_page(struct page *page, unsigned int order)
+static bool __prep_compound_gigantic_folio(struct folio *folio,
+ unsigned int order, bool demote)
{
- int i;
+ int i, j;
int nr_pages = 1 << order;
- struct page *p = page + 1;
+ struct page *p;
+
+ __folio_clear_reserved(folio);
+ for (i = 0; i < nr_pages; i++) {
+ p = folio_page(folio, i);
- /* we rely on prep_new_huge_page to set the destructor */
- set_compound_order(page, order);
- __ClearPageReserved(page);
- __SetPageHead(page);
- for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
/*
* For gigantic hugepages allocated through bootmem at
* boot, it's safer to be consistent with the not-gigantic
@@ -1531,14 +1992,70 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
 * on the head page when they need to know if put_page() is needed
* after get_user_pages().
*/
+ if (i != 0) /* head page cleared above */
+ __ClearPageReserved(p);
+ /*
+ * Subtle and very unlikely
+ *
+ * Gigantic 'page allocators' such as memblock or cma will
+ * return a set of pages with each page ref counted. We need
+ * to turn this set of pages into a compound page with tail
+ * page ref counts set to zero. Code such as speculative page
+ * cache adding could take a ref on a 'to be' tail page.
+ * We need to respect any increased ref count, and only set
+ * the ref count to zero if count is currently 1. If count
+ * is not 1, we return an error. An error return indicates
+ * the set of pages can not be converted to a gigantic page.
+ * The caller who allocated the pages should then discard the
+ * pages using the appropriate free interface.
+ *
+ * In the case of demote, the ref count will be zero.
+ */
+ if (!demote) {
+ if (!page_ref_freeze(p, 1)) {
+ pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
+ goto out_error;
+ }
+ } else {
+ VM_BUG_ON_PAGE(page_count(p), p);
+ }
+ if (i != 0)
+ set_compound_head(p, &folio->page);
+ }
+ __folio_set_head(folio);
+ /* we rely on prep_new_hugetlb_folio to set the destructor */
+ folio_set_order(folio, order);
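+ /*
+ * Initialize the compound mapcount and pin count fields to their
+ * "no mappings, no pins" starting values.
+ */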
+ atomic_set(&folio->_entire_mapcount, -1);
+ atomic_set(&folio->_nr_pages_mapped, 0);
+ atomic_set(&folio->_pincount, 0);
+ return true;
+
+out_error:
+ /* undo page modifications made above */
+ for (j = 0; j < i; j++) {
+ p = folio_page(folio, j);
+ if (j != 0)
+ clear_compound_head(p);
+ set_page_refcounted(p);
+ }
+ /* need to clear PG_reserved on remaining tail pages */
+ for (; j < nr_pages; j++) {
+ p = folio_page(folio, j);
__ClearPageReserved(p);
- set_page_count(p, 0);
- set_compound_head(p, page);
}
- atomic_set(compound_mapcount_ptr(page), -1);
+ return false;
+}
+
+static bool prep_compound_gigantic_folio(struct folio *folio,
+ unsigned int order)
+{
+ return __prep_compound_gigantic_folio(folio, order, false);
+}
- if (hpage_pincount_available(page))
- atomic_set(compound_pincount_ptr(page), 0);
+static bool prep_compound_gigantic_folio_for_demote(struct folio *folio,
+ unsigned int order)
+{
+ return __prep_compound_gigantic_folio(folio, order, true);
}
/*
@@ -1548,136 +2065,60 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
*/
int PageHuge(struct page *page)
{
+ struct folio *folio;
+
if (!PageCompound(page))
return 0;
-
- page = compound_head(page);
- return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
+ folio = page_folio(page);
+ return folio->_folio_dtor == HUGETLB_PAGE_DTOR;
}
EXPORT_SYMBOL_GPL(PageHuge);
-/*
- * PageHeadHuge() only returns true for hugetlbfs head page, but not for
- * normal or transparent huge pages.
- */
-int PageHeadHuge(struct page *page_head)
-{
- if (!PageHead(page_head))
- return 0;
-
- return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
-}
-
-/*
- * Find address_space associated with hugetlbfs page.
- * Upon entry page is locked and page 'was' mapped although mapped state
- * could change. If necessary, use anon_vma to find vma and associated
- * address space. The returned mapping may be stale, but it can not be
- * invalid as page lock (which is held) is required to destroy mapping.
+/**
+ * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs
+ * @folio: The folio to test.
+ *
+ * Context: Any context. Caller should have a reference on the folio to
+ * prevent it from being turned into a tail page.
+ * Return: True for hugetlbfs folios, false for anon folios or folios
+ * belonging to other filesystems.
*/
-static struct address_space *_get_hugetlb_page_mapping(struct page *hpage)
+bool folio_test_hugetlb(struct folio *folio)
{
- struct anon_vma *anon_vma;
- pgoff_t pgoff_start, pgoff_end;
- struct anon_vma_chain *avc;
- struct address_space *mapping = page_mapping(hpage);
-
- /* Simple file based mapping */
- if (mapping)
- return mapping;
-
- /*
- * Even anonymous hugetlbfs mappings are associated with an
- * underlying hugetlbfs file (see hugetlb_file_setup in mmap
- * code). Find a vma associated with the anonymous vma, and
- * use the file pointer to get address_space.
- */
- anon_vma = page_lock_anon_vma_read(hpage);
- if (!anon_vma)
- return mapping; /* NULL */
-
- /* Use first found vma */
- pgoff_start = page_to_pgoff(hpage);
- pgoff_end = pgoff_start + pages_per_huge_page(page_hstate(hpage)) - 1;
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
- pgoff_start, pgoff_end) {
- struct vm_area_struct *vma = avc->vma;
-
- mapping = vma->vm_file->f_mapping;
- break;
- }
+ if (!folio_test_large(folio))
+ return false;
- anon_vma_unlock_read(anon_vma);
- return mapping;
+ return folio->_folio_dtor == HUGETLB_PAGE_DTOR;
}
+EXPORT_SYMBOL_GPL(folio_test_hugetlb);
/*
* Find and lock address space (mapping) in write mode.
*
- * Upon entry, the page is locked which allows us to find the mapping
- * even in the case of an anon page. However, locking order dictates
- * the i_mmap_rwsem be acquired BEFORE the page lock. This is hugetlbfs
- * specific. So, we first try to lock the sema while still holding the
- * page lock. If this works, great! If not, then we need to drop the
- * page lock and then acquire i_mmap_rwsem and reacquire page lock. Of
- * course, need to revalidate state along the way.
+ * Upon entry, the page is locked which means that page_mapping() is
+ * stable. Due to locking order, we can only trylock_write. If we
+ * cannot get the lock, simply return NULL to the caller.
*/
struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
{
- struct address_space *mapping, *mapping2;
+ struct address_space *mapping = page_mapping(hpage);
- mapping = _get_hugetlb_page_mapping(hpage);
-retry:
if (!mapping)
return mapping;
- /*
- * If no contention, take lock and return
- */
if (i_mmap_trylock_write(mapping))
return mapping;
- /*
- * Must drop page lock and wait on mapping sema.
- * Note: Once page lock is dropped, mapping could become invalid.
- * As a hack, increase map count until we lock page again.
- */
- atomic_inc(&hpage->_mapcount);
- unlock_page(hpage);
- i_mmap_lock_write(mapping);
- lock_page(hpage);
- atomic_add_negative(-1, &hpage->_mapcount);
-
- /* verify page is still mapped */
- if (!page_mapped(hpage)) {
- i_mmap_unlock_write(mapping);
- return NULL;
- }
-
- /*
- * Get address space again and verify it is the same one
- * we locked. If not, drop lock and retry.
- */
- mapping2 = _get_hugetlb_page_mapping(hpage);
- if (mapping2 != mapping) {
- i_mmap_unlock_write(mapping);
- mapping = mapping2;
- goto retry;
- }
-
- return mapping;
+ return NULL;
}
-pgoff_t __basepage_index(struct page *page)
+pgoff_t hugetlb_basepage_index(struct page *page)
{
struct page *page_head = compound_head(page);
pgoff_t index = page_index(page_head);
unsigned long compound_idx;
- if (!PageHuge(page_head))
- return page_index(page);
-
- if (compound_order(page_head) >= MAX_ORDER)
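+ /*
+ * Tail struct pages of a gigantic page may not be contiguous in the
+ * mem_map (e.g. with classic sparsemem), so derive the index from
+ * pfns rather than struct page pointer arithmetic.
+ */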
+ if (compound_order(page_head) > MAX_ORDER)
compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
else
compound_idx = page - page_head;
@@ -1685,13 +2126,14 @@ pgoff_t __basepage_index(struct page *page)
return (index << compound_order(page_head)) + compound_idx;
}
-static struct page *alloc_buddy_huge_page(struct hstate *h,
+static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask,
nodemask_t *node_alloc_noretry)
{
int order = huge_page_order(h);
struct page *page;
bool alloc_try_hard = true;
+ bool retry = true;
/*
* By default we always try hard to allocate the page with
@@ -1707,11 +2149,20 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
gfp_mask |= __GFP_RETRY_MAYFAIL;
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
- page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
- if (page)
- __count_vm_event(HTLB_BUDDY_PGALLOC);
- else
- __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
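+ /*
+ * A transient speculative reference (e.g. from a pfn walker) can
+ * inflate the ref count of the freshly allocated page, in which
+ * case the freeze below fails and is retried once.
+ */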
+retry:
+ page = __alloc_pages(gfp_mask, order, nid, nmask);
+
+ /* Freeze head page */
+ if (page && !page_ref_freeze(page, 1)) {
+ __free_pages(page, order);
+ if (retry) { /* retry once */
+ retry = false;
+ goto retry;
+ }
+ /* WOW! twice in a row. */
+ pr_warn("HugeTLB head page unexpected inflated ref count\n");
+ page = NULL;
+ }
/*
* If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
@@ -1729,32 +2180,54 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
if (node_alloc_noretry && !page && alloc_try_hard)
node_set(nid, *node_alloc_noretry);
- return page;
+ if (!page) {
+ __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+ return NULL;
+ }
+
+ __count_vm_event(HTLB_BUDDY_PGALLOC);
+ return page_folio(page);
}
/*
* Common helper to allocate a fresh hugetlb page. All specific allocators
* should use this function to get new hugetlb pages
+ *
+ * Note that returned page is 'frozen': ref count of head page and all tail
+ * pages is zero.
*/
-static struct page *alloc_fresh_huge_page(struct hstate *h,
+static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask,
nodemask_t *node_alloc_noretry)
{
- struct page *page;
+ struct folio *folio;
+ bool retry = false;
+retry:
if (hstate_is_gigantic(h))
- page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
+ folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask);
else
- page = alloc_buddy_huge_page(h, gfp_mask,
+ folio = alloc_buddy_hugetlb_folio(h, gfp_mask,
nid, nmask, node_alloc_noretry);
- if (!page)
+ if (!folio)
return NULL;
+ if (hstate_is_gigantic(h)) {
+ if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) {
+ /*
+ * Rare failure to convert pages to compound page.
+ * Free pages and try again - ONCE!
+ */
+ free_gigantic_folio(folio, huge_page_order(h));
+ if (!retry) {
+ retry = true;
+ goto retry;
+ }
+ return NULL;
+ }
+ }
+ prep_new_hugetlb_folio(h, folio, folio_nid(folio));
- if (hstate_is_gigantic(h))
- prep_compound_gigantic_page(page, huge_page_order(h));
- prep_new_huge_page(h, page, page_to_nid(page));
-
- return page;
+ return folio;
}
/*
@@ -1764,37 +2237,38 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
nodemask_t *node_alloc_noretry)
{
- struct page *page;
+ struct folio *folio;
int nr_nodes, node;
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
- node_alloc_noretry);
- if (page)
- break;
+ folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node,
+ nodes_allowed, node_alloc_noretry);
+ if (folio) {
+ free_huge_page(&folio->page); /* free it into the hugepage allocator */
+ return 1;
+ }
}
- if (!page)
- return 0;
-
- put_page(page); /* free it into the hugepage allocator */
-
- return 1;
+ return 0;
}
/*
- * Free huge page from pool from next node to free.
- * Attempt to keep persistent huge pages more or less
- * balanced over allowed nodes.
+ * Remove huge page from pool from next node to free. Attempt to keep
+ * persistent huge pages more or less balanced over allowed nodes.
+ * This routine only 'removes' the hugetlb page. The caller must make
+ * an additional call to free the page to low level allocators.
* Called with hugetlb_lock locked.
*/
-static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
- bool acct_surplus)
+static struct page *remove_pool_huge_page(struct hstate *h,
+ nodemask_t *nodes_allowed,
+ bool acct_surplus)
{
int nr_nodes, node;
- int ret = 0;
+ struct page *page = NULL;
+ struct folio *folio;
+ lockdep_assert_held(&hugetlb_lock);
for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
/*
* If we're returning unused surplus pages, only examine
@@ -1802,23 +2276,15 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
*/
if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
!list_empty(&h->hugepage_freelists[node])) {
- struct page *page =
- list_entry(h->hugepage_freelists[node].next,
+ page = list_entry(h->hugepage_freelists[node].next,
struct page, lru);
- list_del(&page->lru);
- h->free_huge_pages--;
- h->free_huge_pages_node[node]--;
- if (acct_surplus) {
- h->surplus_huge_pages--;
- h->surplus_huge_pages_node[node]--;
- }
- update_and_free_page(h, page);
- ret = 1;
+ folio = page_folio(page);
+ remove_hugetlb_folio(h, folio, acct_surplus);
break;
}
}
- return ret;
+ return page;
}
/*
@@ -1826,48 +2292,81 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
* nothing for in-use hugepages and non-hugepages.
* This function returns values like below:
*
- * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use
- * (allocated or reserved.)
- * 0: successfully dissolved free hugepages or the page is not a
- * hugepage (considered as already dissolved)
+ * -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
+ * when the system is under memory pressure and the feature of
+ * freeing unused vmemmap pages associated with each hugetlb page
+ * is enabled.
+ * -EBUSY: failed to dissolve free hugepages or the hugepage is in-use
+ * (allocated or reserved.)
+ * 0: successfully dissolved free hugepages or the page is not a
+ * hugepage (considered as already dissolved)
*/
int dissolve_free_huge_page(struct page *page)
{
int rc = -EBUSY;
+ struct folio *folio = page_folio(page);
+retry:
/* Not to disrupt normal path by vainly holding hugetlb_lock */
- if (!PageHuge(page))
+ if (!folio_test_hugetlb(folio))
return 0;
- spin_lock(&hugetlb_lock);
- if (!PageHuge(page)) {
+ spin_lock_irq(&hugetlb_lock);
+ if (!folio_test_hugetlb(folio)) {
rc = 0;
goto out;
}
- if (!page_count(page)) {
- struct page *head = compound_head(page);
- struct hstate *h = page_hstate(head);
- int nid = page_to_nid(head);
- if (h->free_huge_pages - h->resv_huge_pages == 0)
+ if (!folio_ref_count(folio)) {
+ struct hstate *h = folio_hstate(folio);
+ if (!available_huge_pages(h))
goto out;
+
/*
- * Move PageHWPoison flag from head page to the raw error page,
- * which makes any subpages rather than the error page reusable.
+ * We should make sure that the page is already on the free list
+ * when it is dissolved.
*/
- if (PageHWPoison(head) && page != head) {
- SetPageHWPoison(page);
- ClearPageHWPoison(head);
+ if (unlikely(!folio_test_hugetlb_freed(folio))) {
+ spin_unlock_irq(&hugetlb_lock);
+ cond_resched();
+
+ /*
+ * Theoretically, we should return -EBUSY when we
+ * encounter this race. In fact, we have a chance
+ * to successfully dissolve the page if we do a
+ * retry, because the race window is quite small.
+ * Seizing this opportunity is an optimization that
+ * increases the success rate of dissolving the page.
+ */
+ goto retry;
}
- list_del(&head->lru);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
+
+ remove_hugetlb_folio(h, folio, false);
h->max_huge_pages--;
- update_and_free_page(h, head);
- rc = 0;
+ spin_unlock_irq(&hugetlb_lock);
+
+ /*
+ * Normally update_and_free_hugetlb_folio will allocate required vmemmap
+ * before freeing the page. update_and_free_hugetlb_folio will fail to
+ * free the page if it can not allocate required vmemmap. We
+ * need to adjust max_huge_pages if the page is not freed.
+ * Attempt to allocate vmemmap here so that we can take
+ * appropriate action on failure.
+ */
+ rc = hugetlb_vmemmap_restore(h, &folio->page);
+ if (!rc) {
+ update_and_free_hugetlb_folio(h, folio, false);
+ } else {
+ spin_lock_irq(&hugetlb_lock);
+ add_hugetlb_folio(h, folio, false);
+ h->max_huge_pages++;
+ spin_unlock_irq(&hugetlb_lock);
+ }
+
+ return rc;
}
out:
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
return rc;
}
@@ -1884,11 +2383,17 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
unsigned long pfn;
struct page *page;
int rc = 0;
+ unsigned int order;
+ struct hstate *h;
if (!hugepages_supported())
return rc;
- for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
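+ /*
+ * Walk the range in steps of the smallest huge page order so that
+ * free hugetlb pages of every size are examined.
+ */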
+ order = huge_page_order(&default_hstate);
+ for_each_hstate(h)
+ order = min(order, huge_page_order(h));
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) {
page = pfn_to_page(pfn);
rc = dissolve_free_huge_page(page);
if (rc)
@@ -1901,24 +2406,24 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
/*
* Allocates a fresh surplus page from the page allocator.
*/
-static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nmask)
+static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
+ gfp_t gfp_mask, int nid, nodemask_t *nmask)
{
- struct page *page = NULL;
+ struct folio *folio = NULL;
if (hstate_is_gigantic(h))
return NULL;
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
goto out_unlock;
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
- if (!page)
+ folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
+ if (!folio)
return NULL;
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
/*
* We could have raced with the pool size change.
* Double check that and simply deallocate the new page
@@ -1927,112 +2432,128 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
* codeflow
*/
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
- SetPageHugeTemporary(page);
- spin_unlock(&hugetlb_lock);
- put_page(page);
+ folio_set_hugetlb_temporary(folio);
+ spin_unlock_irq(&hugetlb_lock);
+ free_huge_page(&folio->page);
return NULL;
- } else {
- h->surplus_huge_pages++;
- h->surplus_huge_pages_node[page_to_nid(page)]++;
}
+ h->surplus_huge_pages++;
+ h->surplus_huge_pages_node[folio_nid(folio)]++;
+
out_unlock:
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
- return page;
+ return folio;
}
-static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
- struct page *page;
+ struct folio *folio;
if (hstate_is_gigantic(h))
return NULL;
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
- if (!page)
+ folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
+ if (!folio)
return NULL;
+ /* fresh huge pages are frozen */
+ folio_ref_unfreeze(folio, 1);
/*
* We do not account these pages as surplus because they are only
* temporary and will be released properly on the last reference
*/
- SetPageHugeTemporary(page);
+ folio_set_hugetlb_temporary(folio);
- return page;
+ return folio;
}
/*
* Use the VMA's mpolicy to allocate a huge page from the buddy.
*/
static
-struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
+struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- struct page *page;
+ struct folio *folio = NULL;
struct mempolicy *mpol;
gfp_t gfp_mask = htlb_alloc_mask(h);
int nid;
nodemask_t *nodemask;
nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
- page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
- mpol_cond_put(mpol);
+ if (mpol_is_preferred_many(mpol)) {
+ gfp_t gfp = gfp_mask | __GFP_NOWARN;
- return page;
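+
+ /*
+ * For the preferred-many policy, first try only the preferred
+ * nodes without entering direct reclaim; the allocation is
+ * retried below against all nodes if this lightweight attempt
+ * fails.
+ */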
+ gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
+ folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask);
+
+ /* Fallback to all nodes if page==NULL */
+ nodemask = NULL;
+ }
+
+ if (!folio)
+ folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask);
+ mpol_cond_put(mpol);
+ return folio;
}
-/* page migration callback function */
-struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
+/* folio migration callback function */
+struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask, gfp_t gfp_mask)
{
- spin_lock(&hugetlb_lock);
- if (h->free_huge_pages - h->resv_huge_pages > 0) {
- struct page *page;
+ spin_lock_irq(&hugetlb_lock);
+ if (available_huge_pages(h)) {
+ struct folio *folio;
- page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
- if (page) {
- spin_unlock(&hugetlb_lock);
- return page;
+ folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
+ preferred_nid, nmask);
+ if (folio) {
+ spin_unlock_irq(&hugetlb_lock);
+ return folio;
}
}
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
- return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
+ return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
}
/* mempolicy aware migration callback */
-struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
+struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
unsigned long address)
{
struct mempolicy *mpol;
nodemask_t *nodemask;
- struct page *page;
+ struct folio *folio;
gfp_t gfp_mask;
int node;
gfp_mask = htlb_alloc_mask(h);
node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
- page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
+ folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
mpol_cond_put(mpol);
- return page;
+ return folio;
}
/*
* Increase the hugetlb pool such that it can accommodate a reservation
* of size 'delta'.
*/
-static int gather_surplus_pages(struct hstate *h, int delta)
+static int gather_surplus_pages(struct hstate *h, long delta)
__must_hold(&hugetlb_lock)
{
- struct list_head surplus_list;
+ LIST_HEAD(surplus_list);
+ struct folio *folio;
struct page *page, *tmp;
- int ret, i;
- int needed, allocated;
+ int ret;
+ long i;
+ long needed, allocated;
bool alloc_ok = true;
+ lockdep_assert_held(&hugetlb_lock);
needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
if (needed <= 0) {
h->resv_huge_pages += delta;
@@ -2040,19 +2561,18 @@ static int gather_surplus_pages(struct hstate *h, int delta)
}
allocated = 0;
- INIT_LIST_HEAD(&surplus_list);
ret = -ENOMEM;
retry:
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
+ folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
NUMA_NO_NODE, NULL);
- if (!page) {
+ if (!folio) {
alloc_ok = false;
break;
}
- list_add(&page->lru, &surplus_list);
+ list_add(&folio->lru, &surplus_list);
cond_resched();
}
allocated += i;
@@ -2061,7 +2581,7 @@ retry:
* After retaking hugetlb_lock, we need to recalculate 'needed'
* because either resv_huge_pages or free_huge_pages may have changed.
*/
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
needed = (h->resv_huge_pages + delta) -
(h->free_huge_pages + allocated);
if (needed > 0) {
@@ -2090,21 +2610,19 @@ retry:
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
if ((--needed) < 0)
break;
- /*
- * This page is now managed by the hugetlb allocator and has
- * no users -- drop the buddy allocator's reference.
- */
- put_page_testzero(page);
- VM_BUG_ON_PAGE(page_count(page), page);
- enqueue_huge_page(h, page);
+ /* Add the page to the hugetlb allocator */
+ enqueue_hugetlb_folio(h, page_folio(page));
}
free:
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
- /* Free unnecessary surplus pages to the buddy allocator */
+ /*
+ * Free unnecessary surplus pages to the buddy allocator.
+ * Pages have no ref count, call free_huge_page directly.
+ */
list_for_each_entry_safe(page, tmp, &surplus_list, lru)
- put_page(page);
- spin_lock(&hugetlb_lock);
+ free_huge_page(page);
+ spin_lock_irq(&hugetlb_lock);
return ret;
}
@@ -2116,20 +2634,19 @@ free:
* to the associated reservation map.
* 2) Free any unused surplus pages that may have been allocated to satisfy
* the reservation. As many as unused_resv_pages may be freed.
- *
- * Called with hugetlb_lock held. However, the lock could be dropped (and
- * reacquired) during calls to cond_resched_lock. Whenever dropping the lock,
- * we must make sure nobody else can claim pages we are in the process of
- * freeing. Do this by ensuring resv_huge_page always is greater than the
- * number of huge pages we plan to free when dropping the lock.
*/
static void return_unused_surplus_pages(struct hstate *h,
unsigned long unused_resv_pages)
{
unsigned long nr_pages;
+ struct page *page;
+ LIST_HEAD(page_list);
- /* Cannot return gigantic pages currently */
- if (hstate_is_gigantic(h))
+ lockdep_assert_held(&hugetlb_lock);
+ /* Uncommit the reservation */
+ h->resv_huge_pages -= unused_resv_pages;
+
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
goto out;
/*
@@ -2143,24 +2660,21 @@ static void return_unused_surplus_pages(struct hstate *h,
* evenly across all nodes with memory. Iterate across these nodes
* until we can no longer free unreserved surplus pages. This occurs
* when the nodes with surplus pages have no free pages.
- * free_pool_huge_page() will balance the freed pages across the
+ * remove_pool_huge_page() will balance the freed pages across the
* on-line nodes with memory and will handle the hstate accounting.
- *
- * Note that we decrement resv_huge_pages as we free the pages. If
- * we drop the lock, resv_huge_pages will still be sufficiently large
- * to cover subsequent pages we may free.
*/
while (nr_pages--) {
- h->resv_huge_pages--;
- unused_resv_pages--;
- if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
+ page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1);
+ if (!page)
goto out;
- cond_resched_lock(&hugetlb_lock);
+
+ list_add(&page->lru, &page_list);
}
out:
- /* Fully uncommit the reservation */
- h->resv_huge_pages -= unused_resv_pages;
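+ /*
+ * Free the collected pages outside of hugetlb_lock, as freeing may
+ * block (e.g. to allocate vmemmap pages).
+ */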
+ spin_unlock_irq(&hugetlb_lock);
+ update_and_free_pages_bulk(h, &page_list);
+ spin_lock_irq(&hugetlb_lock);
}
@@ -2187,12 +2701,18 @@ out:
* be restored when a newly allocated huge page must be freed. It is
* to be called after calling vma_needs_reservation to determine if a
* reservation exists.
+ *
+ * vma_del_reservation is used in error paths where an entry in the reserve
+ * map was created during huge page allocation and must be removed. It is to
+ * be called after calling vma_needs_reservation to determine if a reservation
+ * exists.
*/
enum vma_resv_mode {
VMA_NEEDS_RESV,
VMA_COMMIT_RESV,
VMA_END_RESV,
VMA_ADD_RESV,
+ VMA_DEL_RESV,
};
static long __vma_reservation_common(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr,
@@ -2236,33 +2756,42 @@ static long __vma_reservation_common(struct hstate *h,
ret = region_del(resv, idx, idx + 1);
}
break;
+ case VMA_DEL_RESV:
+ if (vma->vm_flags & VM_MAYSHARE) {
+ region_abort(resv, idx, idx + 1, 1);
+ ret = region_del(resv, idx, idx + 1);
+ } else {
+ ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
+ /* region_add calls of range 1 should never fail. */
+ VM_BUG_ON(ret < 0);
+ }
+ break;
default:
BUG();
}
- if (vma->vm_flags & VM_MAYSHARE)
+ if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
return ret;
- else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
- /*
- * In most cases, reserves always exist for private mappings.
- * However, a file associated with mapping could have been
- * hole punched or truncated after reserves were consumed.
- * As subsequent fault on such a range will not use reserves.
- * Subtle - The reserve map for private mappings has the
- * opposite meaning than that of shared mappings. If NO
- * entry is in the reserve map, it means a reservation exists.
- * If an entry exists in the reserve map, it means the
- * reservation has already been consumed. As a result, the
- * return value of this routine is the opposite of the
- * value returned from reserve map manipulation routines above.
- */
- if (ret)
- return 0;
- else
- return 1;
- }
- else
- return ret < 0 ? ret : 0;
+ /*
+ * We know private mapping must have HPAGE_RESV_OWNER set.
+ *
+ * In most cases, reserves always exist for private mappings.
+ * However, a file associated with mapping could have been
+ * hole punched or truncated after reserves were consumed.
+ * A subsequent fault on such a range will not use reserves.
+ * Subtle - The reserve map for private mappings has the
+ * opposite meaning than that of shared mappings. If NO
+ * entry is in the reserve map, it means a reservation exists.
+ * If an entry exists in the reserve map, it means the
+ * reservation has already been consumed. As a result, the
+ * return value of this routine is the opposite of the
+ * value returned from reserve map manipulation routines above.
+ */
+ if (ret > 0)
+ return 0;
+ if (ret == 0)
+ return 1;
+ return ret;
}
static long vma_needs_reservation(struct hstate *h,
@@ -2289,60 +2818,242 @@ static long vma_add_reservation(struct hstate *h,
return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
}
+static long vma_del_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
+}
+
/*
- * This routine is called to restore a reservation on error paths. In the
- * specific error paths, a huge page was allocated (via alloc_huge_page)
- * and is about to be freed. If a reservation for the page existed,
- * alloc_huge_page would have consumed the reservation and set PagePrivate
- * in the newly allocated page. When the page is freed via free_huge_page,
- * the global reservation count will be incremented if PagePrivate is set.
- * However, free_huge_page can not adjust the reserve map. Adjust the
- * reserve map here to be consistent with global reserve count adjustments
- * to be made by free_huge_page.
+ * This routine is called to restore reservation information on error paths.
+ * It should ONLY be called for folios allocated via alloc_hugetlb_folio(),
+ * and the hugetlb mutex should remain held when calling this routine.
+ *
+ * It handles two specific cases:
+ * 1) A reservation was in place and the folio consumed the reservation.
+ * hugetlb_restore_reserve is set in the folio.
+ * 2) No reservation was in place for the page, so hugetlb_restore_reserve is
+ * not set. However, alloc_hugetlb_folio always updates the reserve map.
+ *
+ * In case 1, free_huge_page later in the error path will increment the
+ * global reserve count. But, free_huge_page does not have enough context
+ * to adjust the reservation map. This case deals primarily with private
+ * mappings. Adjust the reserve map here to be consistent with global
+ * reserve count adjustments to be made by free_huge_page. Make sure the
+ * reserve map indicates there is a reservation present.
+ *
+ * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio.
*/
-static void restore_reserve_on_error(struct hstate *h,
- struct vm_area_struct *vma, unsigned long address,
- struct page *page)
+void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
+ unsigned long address, struct folio *folio)
{
- if (unlikely(PagePrivate(page))) {
- long rc = vma_needs_reservation(h, vma, address);
+ long rc = vma_needs_reservation(h, vma, address);
- if (unlikely(rc < 0)) {
+ if (folio_test_hugetlb_restore_reserve(folio)) {
+ if (unlikely(rc < 0))
/*
* Rare out of memory condition in reserve map
- * manipulation. Clear PagePrivate so that
- * global reserve count will not be incremented
+ * manipulation. Clear hugetlb_restore_reserve so
+ * that global reserve count will not be incremented
* by free_huge_page. This will make it appear
- * as though the reservation for this page was
+ * as though the reservation for this folio was
* consumed. This may prevent the task from
- * faulting in the page at a later time. This
+ * faulting in the folio at a later time. This
* is better than inconsistent global huge page
* accounting of reserve counts.
*/
- ClearPagePrivate(page);
- } else if (rc) {
- rc = vma_add_reservation(h, vma, address);
- if (unlikely(rc < 0))
+ folio_clear_hugetlb_restore_reserve(folio);
+ else if (rc)
+ (void)vma_add_reservation(h, vma, address);
+ else
+ vma_end_reservation(h, vma, address);
+ } else {
+ if (!rc) {
+ /*
+ * This indicates there is an entry in the reserve map
+ * not added by alloc_hugetlb_folio. We know it was added
+ * before the alloc_hugetlb_folio call, otherwise
+ * hugetlb_restore_reserve would be set on the folio.
+ * Remove the entry so that a subsequent allocation
+ * does not consume a reservation.
+ */
+ rc = vma_del_reservation(h, vma, address);
+ if (rc < 0)
/*
- * See above comment about rare out of
- * memory condition.
+ * VERY rare out of memory condition. Since
+ * we can not delete the entry, set
+ * hugetlb_restore_reserve so that the reserve
+ * count will be incremented when the folio
+ * is freed. This reserve will be consumed
+ * on a subsequent allocation.
*/
- ClearPagePrivate(page);
+ folio_set_hugetlb_restore_reserve(folio);
+ } else if (rc < 0) {
+ /*
+ * Rare out of memory condition from
+ * vma_needs_reservation call. Memory allocation is
+ * only attempted if a new entry is needed. Therefore,
+ * this implies there is not an entry in the
+ * reserve map.
+ *
+ * For shared mappings, no entry in the map indicates
+ * no reservation. We are done.
+ */
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ /*
+ * For private mappings, no entry indicates
+ * a reservation is present. Since we can
+ * not add an entry, set hugetlb_restore_reserve
+ * on the folio so reserve count will be
+ * incremented when freed. This reserve will
+ * be consumed on a subsequent allocation.
+ */
+ folio_set_hugetlb_restore_reserve(folio);
} else
- vma_end_reservation(h, vma, address);
+ /*
+ * No reservation present, do nothing
+ */
+ vma_end_reservation(h, vma, address);
+ }
+}
+
+/*
+ * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve
+ * the old one
+ * @h: struct hstate old page belongs to
+ * @old_folio: Old folio to dissolve
+ * @list: List to isolate the page in case we need to
+ * Returns 0 on success, otherwise negated error.
+ */
+static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
+ struct folio *old_folio, struct list_head *list)
+{
+ gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+ int nid = folio_nid(old_folio);
+ struct folio *new_folio;
+ int ret = 0;
+
+ /*
+ * Before dissolving the folio, we need to allocate a new one for the
+ * pool to remain stable. Here, we allocate the folio and 'prep' it
+ * by doing everything but actually updating counters and adding to
+ * the pool. This simplifies things and lets us do most of the processing
+ * under the lock.
+ */
+ new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL);
+ if (!new_folio)
+ return -ENOMEM;
+ __prep_new_hugetlb_folio(h, new_folio);
+
+retry:
+ spin_lock_irq(&hugetlb_lock);
+ if (!folio_test_hugetlb(old_folio)) {
+ /*
+ * Freed from under us. Drop new_folio too.
+ */
+ goto free_new;
+ } else if (folio_ref_count(old_folio)) {
+ bool isolated;
+
+ /*
+ * Someone has grabbed the folio, try to isolate it here.
+ * Fail with -EBUSY if not possible.
+ */
+ spin_unlock_irq(&hugetlb_lock);
+ isolated = isolate_hugetlb(old_folio, list);
+ ret = isolated ? 0 : -EBUSY;
+ spin_lock_irq(&hugetlb_lock);
+ goto free_new;
+ } else if (!folio_test_hugetlb_freed(old_folio)) {
+ /*
+ * Folio's refcount is 0 but it has not been enqueued in the
+ * freelist yet. Race window is small, so we can succeed here if
+ * we retry.
+ */
+ spin_unlock_irq(&hugetlb_lock);
+ cond_resched();
+ goto retry;
+ } else {
+ /*
+ * Ok, old_folio is still a genuine free hugepage. Remove it from
+ * the freelist and decrease the counters. These will be
+ * incremented again when calling __prep_account_new_huge_page()
+ * and enqueue_hugetlb_folio() for new_folio. The counters will
+ * remain stable since this happens under the lock.
+ */
+ remove_hugetlb_folio(h, old_folio, false);
+
+ /*
+ * Ref count on new_folio is already zero as it was dropped
+ * earlier. It can be directly added to the pool free list.
+ */
+ __prep_account_new_huge_page(h, nid);
+ enqueue_hugetlb_folio(h, new_folio);
+
+ /*
+ * Folio has been replaced, we can safely free the old one.
+ */
+ spin_unlock_irq(&hugetlb_lock);
+ update_and_free_hugetlb_folio(h, old_folio, false);
+ }
+
+ return ret;
+
+free_new:
+ spin_unlock_irq(&hugetlb_lock);
+ /* Folio has a zero ref count, but needs a ref to be freed */
+ folio_ref_unfreeze(new_folio, 1);
+ update_and_free_hugetlb_folio(h, new_folio, false);
+
+ return ret;
+}
+
+int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
+{
+ struct hstate *h;
+ struct folio *folio = page_folio(page);
+ int ret = -EBUSY;
+
+ /*
+ * The page might have been dissolved from under our feet, so make sure
+ * to carefully check the state under the lock.
+ * Return success when racing as if we dissolved the page ourselves.
+ */
+ spin_lock_irq(&hugetlb_lock);
+ if (folio_test_hugetlb(folio)) {
+ h = folio_hstate(folio);
+ } else {
+ spin_unlock_irq(&hugetlb_lock);
+ return 0;
}
+ spin_unlock_irq(&hugetlb_lock);
+
+ /*
+ * Fence off gigantic pages as there is a cyclic dependency between
+ * alloc_contig_range and them. Return -ENOMEM as this has the effect
+ * of bailing out right away without further retrying.
+ */
+ if (hstate_is_gigantic(h))
+ return -ENOMEM;
+
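+ /*
+ * An in-use folio is isolated for migration; a free folio is
+ * replaced by a freshly allocated one and then dissolved.
+ */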
+ if (folio_ref_count(folio) && isolate_hugetlb(folio, list))
+ ret = 0;
+ else if (!folio_ref_count(folio))
+ ret = alloc_and_dissolve_hugetlb_folio(h, folio, list);
+
+ return ret;
}
-struct page *alloc_huge_page(struct vm_area_struct *vma,
+struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
- struct page *page;
+ struct folio *folio;
long map_chg, map_commit;
long gbl_chg;
int ret, idx;
- struct hugetlb_cgroup *h_cg;
+ struct hugetlb_cgroup *h_cg = NULL;
bool deferred_reserve;
idx = hstate_index(h);
@@ -2383,7 +3094,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
/* If this allocation is not consuming a reservation, charge it now.
*/
- deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma);
+ deferred_reserve = map_chg || avoid_reserve;
if (deferred_reserve) {
ret = hugetlb_cgroup_charge_cgroup_rsvd(
idx, pages_per_huge_page(h), &h_cg);
@@ -2395,38 +3106,40 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
if (ret)
goto out_uncharge_cgroup_reservation;
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
/*
* glb_chg is passed to indicate whether or not a page must be taken
* from the global free pool (global change). gbl_chg == 0 indicates
* a reservation exists for the allocation.
*/
- page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
- if (!page) {
- spin_unlock(&hugetlb_lock);
- page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
- if (!page)
+ folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg);
+ if (!folio) {
+ spin_unlock_irq(&hugetlb_lock);
+ folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr);
+ if (!folio)
goto out_uncharge_cgroup;
+ spin_lock_irq(&hugetlb_lock);
if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
- SetPagePrivate(page);
+ folio_set_hugetlb_restore_reserve(folio);
h->resv_huge_pages--;
}
- spin_lock(&hugetlb_lock);
- list_add(&page->lru, &h->hugepage_activelist);
+ list_add(&folio->lru, &h->hugepage_activelist);
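+ /* Fresh folios are frozen; take the first reference here. */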
+ folio_ref_unfreeze(folio, 1);
/* Fall through */
}
- hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
+
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio);
/* If allocation is not consuming a reservation, also store the
* hugetlb_cgroup pointer on the page.
*/
if (deferred_reserve) {
hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
- h_cg, page);
+ h_cg, folio);
}
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
- set_page_private(page, (unsigned long)spool);
+ hugetlb_set_folio_subpool(folio, spool);
map_commit = vma_commit_reservation(h, vma, addr);
if (unlikely(map_chg > map_commit)) {
@@ -2443,8 +3156,11 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
rsv_adjust = hugepage_subpool_put_pages(spool, 1);
hugetlb_acct_memory(h, -rsv_adjust);
+ if (deferred_reserve)
+ hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
+ pages_per_huge_page(h), folio);
}
- return page;
+ return folio;
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
@@ -2459,33 +3175,37 @@ out_subpool_put:
return ERR_PTR(-ENOSPC);
}
-int alloc_bootmem_huge_page(struct hstate *h)
+int alloc_bootmem_huge_page(struct hstate *h, int nid)
__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
-int __alloc_bootmem_huge_page(struct hstate *h)
+int __alloc_bootmem_huge_page(struct hstate *h, int nid)
{
- struct huge_bootmem_page *m;
+ struct huge_bootmem_page *m = NULL; /* initialize for clang */
int nr_nodes, node;
+ /* do node specific alloc */
+ if (nid != NUMA_NO_NODE) {
+ m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
+ 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ if (!m)
+ return 0;
+ goto found;
+ }
+ /* allocate from next node when distributing huge pages */
for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
- void *addr;
-
- addr = memblock_alloc_try_nid_raw(
+ m = memblock_alloc_try_nid_raw(
huge_page_size(h), huge_page_size(h),
0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
- if (addr) {
- /*
- * Use the beginning of the huge page to store the
- * huge_bootmem_page struct (until gather_bootmem
- * puts them into the mem_map).
- */
- m = addr;
- goto found;
- }
+ /*
+ * Use the beginning of the huge page to store the
+ * huge_bootmem_page struct (until gather_bootmem
+ * puts them into the mem_map).
+ */
+ if (!m)
+ return 0;
+ goto found;
}
- return 0;
found:
- BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
/* Put them into a private list first because mem_map is not up yet */
INIT_LIST_HEAD(&m->list);
list_add(&m->list, &huge_boot_pages);
@@ -2493,47 +3213,94 @@ found:
return 1;
}
-static void __init prep_compound_huge_page(struct page *page,
- unsigned int order)
-{
- if (unlikely(order > (MAX_ORDER - 1)))
- prep_compound_gigantic_page(page, order);
- else
- prep_compound_page(page, order);
-}
-
-/* Put bootmem huge pages into the standard lists after mem_map is up */
+/*
+ * Put bootmem huge pages into the standard lists after mem_map is up.
+ * Note: This only applies to gigantic (order > MAX_ORDER) pages.
+ */
static void __init gather_bootmem_prealloc(void)
{
struct huge_bootmem_page *m;
list_for_each_entry(m, &huge_boot_pages, list) {
struct page *page = virt_to_page(m);
+ struct folio *folio = page_folio(page);
struct hstate *h = m->hstate;
- WARN_ON(page_count(page) != 1);
- prep_compound_huge_page(page, h->order);
- WARN_ON(PageReserved(page));
- prep_new_huge_page(h, page, page_to_nid(page));
- put_page(page); /* free it into the hugepage allocator */
+ VM_BUG_ON(!hstate_is_gigantic(h));
+ WARN_ON(folio_ref_count(folio) != 1);
+ if (prep_compound_gigantic_folio(folio, huge_page_order(h))) {
+ WARN_ON(folio_test_reserved(folio));
+ prep_new_hugetlb_folio(h, folio, folio_nid(folio));
+ free_huge_page(page); /* add to the hugepage allocator */
+ } else {
+ /* VERY unlikely inflated ref count on a tail page */
+ free_gigantic_folio(folio, huge_page_order(h));
+ }
/*
- * If we had gigantic hugepages allocated at boot time, we need
- * to restore the 'stolen' pages to totalram_pages in order to
- * fix confusing memory reports from free(1) and another
- * side-effects, like CommitLimit going negative.
+ * We need to restore the 'stolen' pages to totalram_pages
+ * in order to fix confusing memory reports from free(1) and
+ * other side-effects, like CommitLimit going negative.
*/
- if (hstate_is_gigantic(h))
- adjust_managed_page_count(page, 1 << h->order);
+ adjust_managed_page_count(page, pages_per_huge_page(h));
cond_resched();
}
}
+static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
+{
+ unsigned long i;
+ char buf[32];
+
+ for (i = 0; i < h->max_huge_pages_node[nid]; ++i) {
+ if (hstate_is_gigantic(h)) {
+ if (!alloc_bootmem_huge_page(h, nid))
+ break;
+ } else {
+ struct folio *folio;
+ gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+
+ folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
+ &node_states[N_MEMORY], NULL);
+ if (!folio)
+ break;
+ free_huge_page(&folio->page); /* free it into the hugepage allocator */
+ }
+ cond_resched();
+ }
+ if (i == h->max_huge_pages_node[nid])
+ return;
+
+ string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
+ pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n",
+ h->max_huge_pages_node[nid], buf, nid, i);
+ h->max_huge_pages -= (h->max_huge_pages_node[nid] - i);
+ h->max_huge_pages_node[nid] = i;
+}
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
unsigned long i;
nodemask_t *node_alloc_noretry;
+ bool node_specific_alloc = false;
+
+ /* skip gigantic hugepages allocation if hugetlb_cma enabled */
+ if (hstate_is_gigantic(h) && hugetlb_cma_size) {
+ pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
+ return;
+ }
+
+ /* do node specific alloc */
+ for_each_online_node(i) {
+ if (h->max_huge_pages_node[i] > 0) {
+ hugetlb_hstate_alloc_pages_onenode(h, i);
+ node_specific_alloc = true;
+ }
+ }
+
+ if (node_specific_alloc)
+ return;
+ /* below will do all node balanced alloc */
if (!hstate_is_gigantic(h)) {
/*
* Bit mask controlling how hard we retry per-node allocations.
@@ -2554,11 +3321,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
for (i = 0; i < h->max_huge_pages; ++i) {
if (hstate_is_gigantic(h)) {
- if (hugetlb_cma_size) {
- pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
- break;
- }
- if (!alloc_bootmem_huge_page(h))
+ if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
break;
} else if (!alloc_pool_huge_page(h,
&node_states[N_MEMORY],
@@ -2574,23 +3337,38 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
h->max_huge_pages, buf, i);
h->max_huge_pages = i;
}
-
kfree(node_alloc_noretry);
}
static void __init hugetlb_init_hstates(void)
{
- struct hstate *h;
+ struct hstate *h, *h2;
for_each_hstate(h) {
- if (minimum_order > huge_page_order(h))
- minimum_order = huge_page_order(h);
-
/* oversize hugepages were init'ed in early boot */
if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
+
+ /*
+ * Set demote order for each hstate. Note that
+ * h->demote_order is initially 0.
+ * - We can not demote gigantic pages if runtime freeing
+ * is not supported, so skip this.
+ * - If CMA allocation is possible, we can not demote
+ * HUGETLB_PAGE_ORDER or smaller size pages.
+ */
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+ continue;
+ if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER)
+ continue;
+ for_each_hstate(h2) {
+ if (h2 == h)
+ continue;
+ if (h2->order < h->order &&
+ h2->order > h->demote_order)
+ h->demote_order = h2->order;
+ }
}
- VM_BUG_ON(minimum_order == UINT_MAX);
}
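The nested for_each_hstate() loop picks, for every hstate that is allowed to demote, the largest order that is still strictly smaller than its own, leaving demote_order at 0 when no smaller pool exists. A standalone sketch of just that selection (the order values are illustrative, e.g. 2MB/32MB/1GB pools on a 4KB-page system):

#include <stdio.h>

int main(void)
{
    /* Hypothetical pool orders: 2MB (9), 32MB (13), 1GB (18). */
    unsigned int order[] = { 9, 13, 18 };
    unsigned int n = sizeof(order) / sizeof(order[0]);

    for (unsigned int i = 0; i < n; i++) {
        unsigned int demote = 0;        /* h->demote_order starts at 0 */

        for (unsigned int j = 0; j < n; j++) {
            if (j == i)
                continue;
            /* largest order that is still smaller than order[i] */
            if (order[j] < order[i] && order[j] > demote)
                demote = order[j];
        }
        printf("order %u demotes to order %u\n", order[i], demote);
    }
    return 0;
}

For a typical two-pool x86_64 setup this points the 1GB hstate at the 2MB order and leaves the 2MB hstate with no demotion target.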
static void __init report_hugepages(void)
@@ -2601,8 +3379,10 @@ static void __init report_hugepages(void)
char buf[32];
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
- pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
+ pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
buf, h->free_huge_pages);
+ pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
+ hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
}
}
@@ -2611,24 +3391,32 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
nodemask_t *nodes_allowed)
{
int i;
+ LIST_HEAD(page_list);
+ lockdep_assert_held(&hugetlb_lock);
if (hstate_is_gigantic(h))
return;
+ /*
+ * Collect pages to be freed on a list, and free after dropping lock
+ */
for_each_node_mask(i, *nodes_allowed) {
struct page *page, *next;
struct list_head *freel = &h->hugepage_freelists[i];
list_for_each_entry_safe(page, next, freel, lru) {
if (count >= h->nr_huge_pages)
- return;
+ goto out;
if (PageHighMem(page))
continue;
- list_del(&page->lru);
- update_and_free_page(h, page);
- h->free_huge_pages--;
- h->free_huge_pages_node[page_to_nid(page)]--;
+ remove_hugetlb_folio(h, page_folio(page), false);
+ list_add(&page->lru, &page_list);
}
}
+
+out:
+ spin_unlock_irq(&hugetlb_lock);
+ update_and_free_pages_bulk(h, &page_list);
+ spin_lock_irq(&hugetlb_lock);
}
#else
static inline void try_to_free_low(struct hstate *h, unsigned long count,
@@ -2647,6 +3435,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
{
int nr_nodes, node;
+ lockdep_assert_held(&hugetlb_lock);
VM_BUG_ON(delta != -1 && delta != 1);
if (delta < 0) {
@@ -2674,6 +3463,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
nodemask_t *nodes_allowed)
{
unsigned long min_count, ret;
+ struct page *page;
+ LIST_HEAD(page_list);
NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
/*
@@ -2686,7 +3477,13 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
else
return -ENOMEM;
- spin_lock(&hugetlb_lock);
+ /*
+ * resize_lock mutex prevents concurrent adjustments to number of
+ * pages in hstate via the proc/sysfs interfaces.
+ */
+ mutex_lock(&h->resize_lock);
+ flush_free_hpage_work(h);
+ spin_lock_irq(&hugetlb_lock);
/*
* Check for a node specific request.
@@ -2717,7 +3514,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
*/
if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
if (count > persistent_huge_pages(h)) {
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
+ mutex_unlock(&h->resize_lock);
NODEMASK_FREE(node_alloc_noretry);
return -EINVAL;
}
@@ -2729,7 +3527,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* First take pages out of surplus state. Then make up the
* remaining difference by allocating fresh huge pages.
*
- * We might race with alloc_surplus_huge_page() here and be unable
+ * We might race with alloc_surplus_hugetlb_folio() here and be unable
* to convert a surplus huge page to a normal huge page. That is
* not critical, though, it just means the overall size of the
* pool might be one hugepage larger than it needs to be, but
@@ -2746,14 +3544,14 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* page, free_huge_page will handle it by freeing the page
* and reducing the surplus.
*/
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
/* yield cpu to avoid soft lockup */
cond_resched();
ret = alloc_pool_huge_page(h, nodes_allowed,
node_alloc_noretry);
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
if (!ret)
goto out;
@@ -2772,7 +3570,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* By placing pages into the surplus state independent of the
* overcommit value, we are allowing the surplus pool size to
* exceed overcommit. There are few sane options here. Since
- * alloc_surplus_huge_page() is checking the global counter,
+ * alloc_surplus_hugetlb_folio() is checking the global counter,
* though, we'll note that we're not allowed to exceed surplus
* and won't grow the pool anywhere else. Not until one of the
* sysctls are changed, or the surplus pages go out of use.
@@ -2780,30 +3578,139 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
min_count = max(count, min_count);
try_to_free_low(h, min_count, nodes_allowed);
+
+ /*
+ * Collect pages to be removed on list without dropping lock
+ */
while (min_count < persistent_huge_pages(h)) {
- if (!free_pool_huge_page(h, nodes_allowed, 0))
+ page = remove_pool_huge_page(h, nodes_allowed, 0);
+ if (!page)
break;
- cond_resched_lock(&hugetlb_lock);
+
+ list_add(&page->lru, &page_list);
}
+ /* free the pages after dropping lock */
+ spin_unlock_irq(&hugetlb_lock);
+ update_and_free_pages_bulk(h, &page_list);
+ flush_free_hpage_work(h);
+ spin_lock_irq(&hugetlb_lock);
+
while (count < persistent_huge_pages(h)) {
if (!adjust_pool_surplus(h, nodes_allowed, 1))
break;
}
out:
h->max_huge_pages = persistent_huge_pages(h);
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
+ mutex_unlock(&h->resize_lock);
NODEMASK_FREE(node_alloc_noretry);
return 0;
}
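try_to_free_low() and set_max_huge_pages() now share one shape: detach pages onto a local list while hugetlb_lock is held, drop the lock, then do the expensive freeing in bulk via update_and_free_pages_bulk(). Here is a generic userspace sketch of that detach-then-process pattern, assuming a pthread mutex and hypothetical item/detach_items names.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item { struct item *next; int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *pool;               /* protected by lock */

/* Detach up to max items while holding the lock; do no heavy work here. */
static struct item *detach_items(int max)
{
    struct item *batch = NULL;

    pthread_mutex_lock(&lock);
    while (pool && max--) {
        struct item *it = pool;

        pool = it->next;
        it->next = batch;
        batch = it;
    }
    pthread_mutex_unlock(&lock);
    return batch;
}

int main(void)
{
    for (int i = 0; i < 5; i++) {       /* single-threaded setup, no lock needed yet */
        struct item *it = malloc(sizeof(*it));

        it->id = i;
        it->next = pool;
        pool = it;
    }

    /* The expensive part (here: free) runs without the lock held. */
    struct item *batch = detach_items(3);

    while (batch) {
        struct item *next = batch->next;

        printf("freeing item %d outside the lock\n", batch->id);
        free(batch);
        batch = next;
    }
    return 0;
}

The point of the split is that nothing sleeps or does heavy work while the spinlock (here: the mutex) is held.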
+static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio)
+{
+ int i, nid = folio_nid(folio);
+ struct hstate *target_hstate;
+ struct page *subpage;
+ struct folio *inner_folio;
+ int rc = 0;
+
+ target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
+
+ remove_hugetlb_folio_for_demote(h, folio, false);
+ spin_unlock_irq(&hugetlb_lock);
+
+ rc = hugetlb_vmemmap_restore(h, &folio->page);
+ if (rc) {
+ /* Allocation of vmemmap failed, we cannot demote the folio */
+ spin_lock_irq(&hugetlb_lock);
+ folio_ref_unfreeze(folio, 1);
+ add_hugetlb_folio(h, folio, false);
+ return rc;
+ }
+
+ /*
+ * Use destroy_compound_hugetlb_folio_for_demote for all huge page
+ * sizes as it will not ref count folios.
+ */
+ destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h));
+
+ /*
+ * Taking target hstate mutex synchronizes with set_max_huge_pages.
+ * Without the mutex, pages added to target hstate could be marked
+ * as surplus.
+ *
+ * Note that we already hold h->resize_lock. To prevent deadlock,
+ * use the convention of always taking larger size hstate mutex first.
+ */
+ mutex_lock(&target_hstate->resize_lock);
+ for (i = 0; i < pages_per_huge_page(h);
+ i += pages_per_huge_page(target_hstate)) {
+ subpage = folio_page(folio, i);
+ inner_folio = page_folio(subpage);
+ if (hstate_is_gigantic(target_hstate))
+ prep_compound_gigantic_folio_for_demote(inner_folio,
+ target_hstate->order);
+ else
+ prep_compound_page(subpage, target_hstate->order);
+ folio_change_private(inner_folio, NULL);
+ prep_new_hugetlb_folio(target_hstate, inner_folio, nid);
+ free_huge_page(subpage);
+ }
+ mutex_unlock(&target_hstate->resize_lock);
+
+ spin_lock_irq(&hugetlb_lock);
+
+ /*
+ * Not absolutely necessary, but for consistency update max_huge_pages
+ * based on pool changes for the demoted page.
+ */
+ h->max_huge_pages--;
+ target_hstate->max_huge_pages +=
+ pages_per_huge_page(h) / pages_per_huge_page(target_hstate);
+
+ return rc;
+}
+
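demote_free_hugetlb_folio() walks the big folio in strides of the target size: subpage index i advances by pages_per_huge_page(target_hstate), and each stride becomes one new folio, which also drives the max_huge_pages bookkeeping at the end. The arithmetic in isolation (a 1GB source and 2MB targets are assumed):

#include <stdio.h>

int main(void)
{
    unsigned long src_pages = 1UL << 18;   /* a 1GB folio in 4KB base pages */
    unsigned long dst_pages = 1UL << 9;    /* 2MB folios in 4KB base pages */
    unsigned long count = 0;

    for (unsigned long i = 0; i < src_pages; i += dst_pages) {
        /* subpage i becomes the head of one demoted folio */
        count++;
    }
    printf("one source folio becomes %lu target folios\n", count);
    printf("bookkeeping: max_huge_pages -= 1, target max_huge_pages += %lu\n",
           src_pages / dst_pages);
    return 0;
}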
+static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+ __must_hold(&hugetlb_lock)
+{
+ int nr_nodes, node;
+ struct folio *folio;
+
+ lockdep_assert_held(&hugetlb_lock);
+
+ /* We should never get here if no demote order */
+ if (!h->demote_order) {
+ pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n");
+ return -EINVAL; /* internal error */
+ }
+
+ for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
+ list_for_each_entry(folio, &h->hugepage_freelists[node], lru) {
+ if (folio_test_hwpoison(folio))
+ continue;
+ return demote_free_hugetlb_folio(h, folio);
+ }
+ }
+
+ /*
+ * Only way to get here is if all pages on free lists are poisoned.
+ * Return -EBUSY so that caller will not retry.
+ */
+ return -EBUSY;
+}
+
#define HSTATE_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+#define HSTATE_ATTR_WO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_WO(_name)
+
#define HSTATE_ATTR(_name) \
- static struct kobj_attribute _name##_attr = \
- __ATTR(_name, 0644, _name##_show, _name##_store)
+ static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
static struct kobject *hugepages_kobj;
static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
@@ -2837,7 +3744,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj,
else
nr_huge_pages = h->nr_huge_pages_node[nid];
- return sprintf(buf, "%lu\n", nr_huge_pages);
+ return sysfs_emit(buf, "%lu\n", nr_huge_pages);
}
static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
@@ -2910,7 +3817,8 @@ HSTATE_ATTR(nr_hugepages);
* huge page alloc/free.
*/
static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+ struct kobj_attribute *attr,
+ char *buf)
{
return nr_hugepages_show_common(kobj, attr, buf);
}
@@ -2928,7 +3836,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct hstate *h = kobj_to_hstate(kobj, NULL);
- return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
+ return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages);
}
static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
@@ -2945,9 +3853,9 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
if (err)
return err;
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
h->nr_overcommit_huge_pages = input;
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
return count;
}
@@ -2966,7 +3874,7 @@ static ssize_t free_hugepages_show(struct kobject *kobj,
else
free_huge_pages = h->free_huge_pages_node[nid];
- return sprintf(buf, "%lu\n", free_huge_pages);
+ return sysfs_emit(buf, "%lu\n", free_huge_pages);
}
HSTATE_ATTR_RO(free_hugepages);
@@ -2974,7 +3882,7 @@ static ssize_t resv_hugepages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct hstate *h = kobj_to_hstate(kobj, NULL);
- return sprintf(buf, "%lu\n", h->resv_huge_pages);
+ return sysfs_emit(buf, "%lu\n", h->resv_huge_pages);
}
HSTATE_ATTR_RO(resv_hugepages);
@@ -2991,10 +3899,105 @@ static ssize_t surplus_hugepages_show(struct kobject *kobj,
else
surplus_huge_pages = h->surplus_huge_pages_node[nid];
- return sprintf(buf, "%lu\n", surplus_huge_pages);
+ return sysfs_emit(buf, "%lu\n", surplus_huge_pages);
}
HSTATE_ATTR_RO(surplus_hugepages);
+static ssize_t demote_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t len)
+{
+ unsigned long nr_demote;
+ unsigned long nr_available;
+ nodemask_t nodes_allowed, *n_mask;
+ struct hstate *h;
+ int err;
+ int nid;
+
+ err = kstrtoul(buf, 10, &nr_demote);
+ if (err)
+ return err;
+ h = kobj_to_hstate(kobj, &nid);
+
+ if (nid != NUMA_NO_NODE) {
+ init_nodemask_of_node(&nodes_allowed, nid);
+ n_mask = &nodes_allowed;
+ } else {
+ n_mask = &node_states[N_MEMORY];
+ }
+
+ /* Synchronize with other sysfs operations modifying huge pages */
+ mutex_lock(&h->resize_lock);
+ spin_lock_irq(&hugetlb_lock);
+
+ while (nr_demote) {
+ /*
+ * Check for available pages to demote each time through the
+ * loop as demote_pool_huge_page will drop hugetlb_lock.
+ */
+ if (nid != NUMA_NO_NODE)
+ nr_available = h->free_huge_pages_node[nid];
+ else
+ nr_available = h->free_huge_pages;
+ nr_available -= h->resv_huge_pages;
+ if (!nr_available)
+ break;
+
+ err = demote_pool_huge_page(h, n_mask);
+ if (err)
+ break;
+
+ nr_demote--;
+ }
+
+ spin_unlock_irq(&hugetlb_lock);
+ mutex_unlock(&h->resize_lock);
+
+ if (err)
+ return err;
+ return len;
+}
+HSTATE_ATTR_WO(demote);
+
+static ssize_t demote_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
+ unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
+
+ return sysfs_emit(buf, "%lukB\n", demote_size);
+}
+
+static ssize_t demote_size_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct hstate *h, *demote_hstate;
+ unsigned long demote_size;
+ unsigned int demote_order;
+
+ demote_size = (unsigned long)memparse(buf, NULL);
+
+ demote_hstate = size_to_hstate(demote_size);
+ if (!demote_hstate)
+ return -EINVAL;
+ demote_order = demote_hstate->order;
+ if (demote_order < HUGETLB_PAGE_ORDER)
+ return -EINVAL;
+
+ /* demote order must be smaller than hstate order */
+ h = kobj_to_hstate(kobj, NULL);
+ if (demote_order >= h->order)
+ return -EINVAL;
+
+ /* resize_lock synchronizes access to demote size and writes */
+ mutex_lock(&h->resize_lock);
+ h->demote_order = demote_order;
+ mutex_unlock(&h->resize_lock);
+
+ return count;
+}
+HSTATE_ATTR(demote_size);
+
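Together these attributes form the demote interface under /sys/kernel/mm/hugepages/hugepages-<size>kB/: demote_size selects the target pool and a count written to demote splits that many free pages. A small userspace sketch of driving it; the 1048576kB directory and the 2048kB target are examples, it needs root, and it assumes free 1GB pages exist.

#include <stdio.h>

static int write_attr(const char *path, const char *val)
{
    FILE *f = fopen(path, "w");

    if (!f) {
        perror(path);
        return -1;
    }
    fputs(val, f);
    fclose(f);
    return 0;
}

int main(void)
{
    const char *dir = "/sys/kernel/mm/hugepages/hugepages-1048576kB";
    char path[256], buf[64];
    FILE *f;

    /* Pick the demotion target (must be a smaller supported size). */
    snprintf(path, sizeof(path), "%s/demote_size", dir);
    write_attr(path, "2048kB");

    /* Read it back. */
    f = fopen(path, "r");
    if (f && fgets(buf, sizeof(buf), f))
        printf("demote_size: %s", buf);
    if (f)
        fclose(f);

    /* Demote one free 1GB page into 2MB pages. */
    snprintf(path, sizeof(path), "%s/demote", dir);
    write_attr(path, "1");
    return 0;
}

Errors from demote_pool_huge_page() propagate back as the return value of the write to demote.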
static struct attribute *hstate_attrs[] = {
&nr_hugepages_attr.attr,
&nr_overcommit_hugepages_attr.attr,
@@ -3011,6 +4014,16 @@ static const struct attribute_group hstate_attr_group = {
.attrs = hstate_attrs,
};
+static struct attribute *hstate_demote_attrs[] = {
+ &demote_size_attr.attr,
+ &demote_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group hstate_demote_attr_group = {
+ .attrs = hstate_demote_attrs,
+};
+
static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
struct kobject **hstate_kobjs,
const struct attribute_group *hstate_attr_group)
@@ -3023,30 +4036,29 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
return -ENOMEM;
retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
- if (retval)
+ if (retval) {
kobject_put(hstate_kobjs[hi]);
-
- return retval;
-}
-
-static void __init hugetlb_sysfs_init(void)
-{
- struct hstate *h;
- int err;
-
- hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
- if (!hugepages_kobj)
- return;
-
- for_each_hstate(h) {
- err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
- hstate_kobjs, &hstate_attr_group);
- if (err)
- pr_err("HugeTLB: Unable to add hstate %s", h->name);
+ hstate_kobjs[hi] = NULL;
+ return retval;
+ }
+
+ if (h->demote_order) {
+ retval = sysfs_create_group(hstate_kobjs[hi],
+ &hstate_demote_attr_group);
+ if (retval) {
+ pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
+ sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group);
+ kobject_put(hstate_kobjs[hi]);
+ hstate_kobjs[hi] = NULL;
+ return retval;
+ }
}
+
+ return 0;
}
#ifdef CONFIG_NUMA
+static bool hugetlb_sysfs_initialized __ro_after_init;
/*
* node_hstate/s - associate per node hstate attributes, via their kobjects,
@@ -3102,7 +4114,7 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
* Unregister hstate attributes from a single node device.
* No-op if no hstate attributes attached.
*/
-static void hugetlb_unregister_node(struct node *node)
+void hugetlb_unregister_node(struct node *node)
{
struct hstate *h;
struct node_hstate *nhs = &node_hstates[node->dev.id];
@@ -3112,10 +4124,15 @@ static void hugetlb_unregister_node(struct node *node)
for_each_hstate(h) {
int idx = hstate_index(h);
- if (nhs->hstate_kobjs[idx]) {
- kobject_put(nhs->hstate_kobjs[idx]);
- nhs->hstate_kobjs[idx] = NULL;
- }
+ struct kobject *hstate_kobj = nhs->hstate_kobjs[idx];
+
+ if (!hstate_kobj)
+ continue;
+ if (h->demote_order)
+ sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group);
+ sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group);
+ kobject_put(hstate_kobj);
+ nhs->hstate_kobjs[idx] = NULL;
}
kobject_put(nhs->hugepages_kobj);
@@ -3127,12 +4144,15 @@ static void hugetlb_unregister_node(struct node *node)
* Register hstate attributes for a single node device.
* No-op if attributes already registered.
*/
-static void hugetlb_register_node(struct node *node)
+void hugetlb_register_node(struct node *node)
{
struct hstate *h;
struct node_hstate *nhs = &node_hstates[node->dev.id];
int err;
+ if (!hugetlb_sysfs_initialized)
+ return;
+
if (nhs->hugepages_kobj)
return; /* already allocated */
@@ -3163,18 +4183,8 @@ static void __init hugetlb_register_all_nodes(void)
{
int nid;
- for_each_node_state(nid, N_MEMORY) {
- struct node *node = node_devices[nid];
- if (node->dev.id == nid)
- hugetlb_register_node(node);
- }
-
- /*
- * Let the node device driver know we're here so it can
- * [un]register hstate attributes on node hotplug.
- */
- register_hugetlbfs_with_node(hugetlb_register_node,
- hugetlb_unregister_node);
+ for_each_online_node(nid)
+ hugetlb_register_node(node_devices[nid]);
}
#else /* !CONFIG_NUMA */
@@ -3190,10 +4200,49 @@ static void hugetlb_register_all_nodes(void) { }
#endif
+#ifdef CONFIG_CMA
+static void __init hugetlb_cma_check(void);
+#else
+static inline __init void hugetlb_cma_check(void)
+{
+}
+#endif
+
+static void __init hugetlb_sysfs_init(void)
+{
+ struct hstate *h;
+ int err;
+
+ hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
+ if (!hugepages_kobj)
+ return;
+
+ for_each_hstate(h) {
+ err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
+ hstate_kobjs, &hstate_attr_group);
+ if (err)
+ pr_err("HugeTLB: Unable to add hstate %s", h->name);
+ }
+
+#ifdef CONFIG_NUMA
+ hugetlb_sysfs_initialized = true;
+#endif
+ hugetlb_register_all_nodes();
+}
+
+#ifdef CONFIG_SYSCTL
+static void hugetlb_sysctl_init(void);
+#else
+static inline void hugetlb_sysctl_init(void) { }
+#endif
+
static int __init hugetlb_init(void)
{
int i;
+ BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
+ __NR_HPAGEFLAGS);
+
if (!hugepages_supported()) {
if (hugetlb_max_hstate || default_hstate_max_huge_pages)
pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
@@ -3228,6 +4277,10 @@ static int __init hugetlb_init(void)
}
default_hstate.max_huge_pages =
default_hstate_max_huge_pages;
+
+ for_each_online_node(i)
+ default_hstate.max_huge_pages_node[i] =
+ default_hugepages_in_node[i];
}
}
@@ -3237,8 +4290,8 @@ static int __init hugetlb_init(void)
report_hugepages();
hugetlb_sysfs_init();
- hugetlb_register_all_nodes();
hugetlb_cgroup_file_init();
+ hugetlb_sysctl_init();
#ifdef CONFIG_SMP
num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
@@ -3273,21 +4326,38 @@ void __init hugetlb_add_hstate(unsigned int order)
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
BUG_ON(order == 0);
h = &hstates[hugetlb_max_hstate++];
+ mutex_init(&h->resize_lock);
h->order = order;
- h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
- h->nr_huge_pages = 0;
- h->free_huge_pages = 0;
+ h->mask = ~(huge_page_size(h) - 1);
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
INIT_LIST_HEAD(&h->hugepage_activelist);
h->next_nid_to_alloc = first_memory_node;
h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
- huge_page_size(h)/1024);
+ huge_page_size(h)/SZ_1K);
parsed_hstate = h;
}
+bool __init __weak hugetlb_node_alloc_supported(void)
+{
+ return true;
+}
+
+static void __init hugepages_clear_pages_in_node(void)
+{
+ if (!hugetlb_max_hstate) {
+ default_hstate_max_huge_pages = 0;
+ memset(default_hugepages_in_node, 0,
+ sizeof(default_hugepages_in_node));
+ } else {
+ parsed_hstate->max_huge_pages = 0;
+ memset(parsed_hstate->max_huge_pages_node, 0,
+ sizeof(parsed_hstate->max_huge_pages_node));
+ }
+}
+
/*
* hugepages command line processing
* hugepages normally follows a valid hugepagesz or default_hugepagesz
@@ -3299,11 +4369,15 @@ static int __init hugepages_setup(char *s)
{
unsigned long *mhp;
static unsigned long *last_mhp;
+ int node = NUMA_NO_NODE;
+ int count;
+ unsigned long tmp;
+ char *p = s;
if (!parsed_valid_hugepagesz) {
pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
parsed_valid_hugepagesz = true;
- return 0;
+ return 1;
}
/*
@@ -3319,23 +4393,60 @@ static int __init hugepages_setup(char *s)
if (mhp == last_mhp) {
pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
- return 0;
+ return 1;
}
- if (sscanf(s, "%lu", mhp) <= 0)
- *mhp = 0;
+ while (*p) {
+ count = 0;
+ if (sscanf(p, "%lu%n", &tmp, &count) != 1)
+ goto invalid;
+ /* Parameter is node format */
+ if (p[count] == ':') {
+ if (!hugetlb_node_alloc_supported()) {
+ pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
+ return 1;
+ }
+ if (tmp >= MAX_NUMNODES || !node_online(tmp))
+ goto invalid;
+ node = array_index_nospec(tmp, MAX_NUMNODES);
+ p += count + 1;
+ /* Parse hugepages */
+ if (sscanf(p, "%lu%n", &tmp, &count) != 1)
+ goto invalid;
+ if (!hugetlb_max_hstate)
+ default_hugepages_in_node[node] = tmp;
+ else
+ parsed_hstate->max_huge_pages_node[node] = tmp;
+ *mhp += tmp;
+ /* Go to parse next node */
+ if (p[count] == ',')
+ p += count + 1;
+ else
+ break;
+ } else {
+ if (p != s)
+ goto invalid;
+ *mhp = tmp;
+ break;
+ }
+ }
/*
* Global state is always initialized later in hugetlb_init.
- * But we need to allocate >= MAX_ORDER hstates here early to still
+ * But we need to allocate gigantic hstates here early to still
* use the bootmem allocator.
*/
- if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
+ if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
hugetlb_hstate_alloc_pages(parsed_hstate);
last_mhp = mhp;
return 1;
+
+invalid:
+ pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
+ hugepages_clear_pages_in_node();
+ return 1;
}
__setup("hugepages=", hugepages_setup);
@@ -3356,7 +4467,7 @@ static int __init hugepagesz_setup(char *s)
if (!arch_hugetlb_valid_size(size)) {
pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
- return 0;
+ return 1;
}
h = size_to_hstate(size);
@@ -3371,7 +4482,7 @@ static int __init hugepagesz_setup(char *s)
if (!parsed_default_hugepagesz || h != &default_hstate ||
default_hstate.max_huge_pages) {
pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
- return 0;
+ return 1;
}
/*
@@ -3397,18 +4508,19 @@ __setup("hugepagesz=", hugepagesz_setup);
static int __init default_hugepagesz_setup(char *s)
{
unsigned long size;
+ int i;
parsed_valid_hugepagesz = false;
if (parsed_default_hugepagesz) {
pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
- return 0;
+ return 1;
}
size = (unsigned long)memparse(s, NULL);
if (!arch_hugetlb_valid_size(size)) {
pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
- return 0;
+ return 1;
}
hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
@@ -3420,11 +4532,14 @@ static int __init default_hugepagesz_setup(char *s)
* The number of default huge pages (for this size) could have been
* specified as the first hugetlb parameter: hugepages=X. If so,
* then default_hstate_max_huge_pages is set. If the default huge
- * page size is gigantic (>= MAX_ORDER), then the pages must be
+ * page size is gigantic (> MAX_ORDER), then the pages must be
* allocated here from bootmem allocator.
*/
if (default_hstate_max_huge_pages) {
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+ for_each_online_node(i)
+ default_hstate.max_huge_pages_node[i] =
+ default_hugepages_in_node[i];
if (hstate_is_gigantic(&default_hstate))
hugetlb_hstate_alloc_pages(&default_hstate);
default_hstate_max_huge_pages = 0;
@@ -3434,19 +4549,34 @@ static int __init default_hugepagesz_setup(char *s)
}
__setup("default_hugepagesz=", default_hugepagesz_setup);
+static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
+{
+#ifdef CONFIG_NUMA
+ struct mempolicy *mpol = get_task_policy(current);
+
+ /*
+ * Only enforce MPOL_BIND policy which overlaps with cpuset policy
+ * (from policy_nodemask) specifically for hugetlb case
+ */
+ if (mpol->mode == MPOL_BIND &&
+ (apply_policy_zone(mpol, gfp_zone(gfp)) &&
+ cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
+ return &mpol->nodes;
+#endif
+ return NULL;
+}
+
static unsigned int allowed_mems_nr(struct hstate *h)
{
int node;
unsigned int nr = 0;
- nodemask_t *mpol_allowed;
+ nodemask_t *mbind_nodemask;
unsigned int *array = h->free_huge_pages_node;
gfp_t gfp_mask = htlb_alloc_mask(h);
- mpol_allowed = policy_nodemask_current(gfp_mask);
-
+ mbind_nodemask = policy_mbind_nodemask(gfp_mask);
for_each_node_mask(node, cpuset_current_mems_allowed) {
- if (!mpol_allowed ||
- (mpol_allowed && node_isset(node, *mpol_allowed)))
+ if (!mbind_nodemask || node_isset(node, *mbind_nodemask))
nr += array[node];
}
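allowed_mems_nr() now sums free_huge_pages_node[] only over nodes that are both in cpuset_current_mems_allowed and, if an MPOL_BIND policy applies, in the mbind nodemask. Reduced to plain bitmasks with made-up counts, the computation is just a masked sum:

#include <stdio.h>

int main(void)
{
    unsigned int free_per_node[4] = { 10, 4, 0, 7 };
    unsigned long cpuset_allowed = 0xb;   /* nodes 0, 1, 3 */
    unsigned long mbind_mask = 0x9;       /* nodes 0, 3; zero would mean "no policy" */
    unsigned int nr = 0;

    for (int node = 0; node < 4; node++) {
        if (!(cpuset_allowed & (1UL << node)))
            continue;
        if (mbind_mask && !(mbind_mask & (1UL << node)))
            continue;
        nr += free_per_node[node];
    }
    printf("huge pages usable under the current policy: %u\n", nr);
    return 0;
}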
@@ -3493,7 +4623,7 @@ out:
return ret;
}
-int hugetlb_sysctl_handler(struct ctl_table *table, int write,
+static int hugetlb_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
@@ -3502,7 +4632,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
}
#ifdef CONFIG_NUMA
-int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
+static int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
return hugetlb_sysctl_handler_common(true, table, write,
@@ -3510,7 +4640,7 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
}
#endif /* CONFIG_NUMA */
-int hugetlb_overcommit_handler(struct ctl_table *table, int write,
+static int hugetlb_overcommit_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
struct hstate *h = &default_hstate;
@@ -3531,14 +4661,52 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
goto out;
if (write) {
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
h->nr_overcommit_huge_pages = tmp;
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
}
out:
return ret;
}
+static struct ctl_table hugetlb_table[] = {
+ {
+ .procname = "nr_hugepages",
+ .data = NULL,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = hugetlb_sysctl_handler,
+ },
+#ifdef CONFIG_NUMA
+ {
+ .procname = "nr_hugepages_mempolicy",
+ .data = NULL,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &hugetlb_mempolicy_sysctl_handler,
+ },
+#endif
+ {
+ .procname = "hugetlb_shm_group",
+ .data = &sysctl_hugetlb_shm_group,
+ .maxlen = sizeof(gid_t),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "nr_overcommit_hugepages",
+ .data = NULL,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = hugetlb_overcommit_handler,
+ },
+ { }
+};
+
+static void hugetlb_sysctl_init(void)
+{
+ register_sysctl_init("vm", hugetlb_table);
+}
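Registering hugetlb_table keeps the long-standing entries visible under /proc/sys/vm; only their registration moves into hugetlb.c. A trivial reader as a sanity check (these are the standard procfs paths for the sysctls listed in the table):

#include <stdio.h>

static void show(const char *path)
{
    char buf[64];
    FILE *f = fopen(path, "r");

    if (!f) {
        perror(path);
        return;
    }
    if (fgets(buf, sizeof(buf), f))
        printf("%-40s %s", path, buf);
    fclose(f);
}

int main(void)
{
    show("/proc/sys/vm/nr_hugepages");
    show("/proc/sys/vm/nr_overcommit_hugepages");
    show("/proc/sys/vm/hugetlb_shm_group");
    return 0;
}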
#endif /* CONFIG_SYSCTL */
void hugetlb_report_meminfo(struct seq_file *m)
@@ -3552,7 +4720,7 @@ void hugetlb_report_meminfo(struct seq_file *m)
for_each_hstate(h) {
unsigned long count = h->nr_huge_pages;
- total += (PAGE_SIZE << huge_page_order(h)) * count;
+ total += huge_page_size(h) * count;
if (h == &default_hstate)
seq_printf(m,
@@ -3565,10 +4733,10 @@ void hugetlb_report_meminfo(struct seq_file *m)
h->free_huge_pages,
h->resv_huge_pages,
h->surplus_huge_pages,
- (PAGE_SIZE << huge_page_order(h)) / 1024);
+ huge_page_size(h) / SZ_1K);
}
- seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024);
+ seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K);
}
int hugetlb_report_node_meminfo(char *buf, int len, int nid)
@@ -3587,22 +4755,20 @@ int hugetlb_report_node_meminfo(char *buf, int len, int nid)
nid, h->surplus_huge_pages_node[nid]);
}
-void hugetlb_show_meminfo(void)
+void hugetlb_show_meminfo_node(int nid)
{
struct hstate *h;
- int nid;
if (!hugepages_supported())
return;
- for_each_node_state(nid, N_MEMORY)
- for_each_hstate(h)
- pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
- nid,
- h->nr_huge_pages_node[nid],
- h->free_huge_pages_node[nid],
- h->surplus_huge_pages_node[nid],
- 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
+ for_each_hstate(h)
+ printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
+ nid,
+ h->nr_huge_pages_node[nid],
+ h->free_huge_pages_node[nid],
+ h->surplus_huge_pages_node[nid],
+ huge_page_size(h) / SZ_1K);
}
void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
@@ -3626,7 +4792,10 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
{
int ret = -ENOMEM;
- spin_lock(&hugetlb_lock);
+ if (!delta)
+ return 0;
+
+ spin_lock_irq(&hugetlb_lock);
/*
* When cpuset is configured, it breaks the strict hugetlb page
* reservation as the accounting is done on a global variable. Such
@@ -3665,7 +4834,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
return_unused_surplus_pages(h, (unsigned long) -delta);
out:
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
return ret;
}
@@ -3674,6 +4843,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
struct resv_map *resv = vma_resv_map(vma);
/*
+ * HPAGE_RESV_OWNER indicates a private mapping.
* This new VMA should share its siblings reservation map if present.
* The VMA will only ever have a valid reservation map pointer where
* it is being copied for another still existing VMA. As that VMA
@@ -3681,18 +4851,42 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
* after this open call completes. It is therefore safe to take a
* new reference here without additional locking.
*/
- if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
kref_get(&resv->refs);
+ }
+
+ /*
+ * vma_lock structure for sharable mappings is vma specific.
+ * Clear old pointer (if copied via vm_area_dup) and allocate
+ * new structure. Before clearing, make sure vma_lock is not
+ * for this vma.
+ */
+ if (vma->vm_flags & VM_MAYSHARE) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ if (vma_lock) {
+ if (vma_lock->vma != vma) {
+ vma->vm_private_data = NULL;
+ hugetlb_vma_lock_alloc(vma);
+ } else
+ pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__);
+ } else
+ hugetlb_vma_lock_alloc(vma);
+ }
}
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
- struct resv_map *resv = vma_resv_map(vma);
+ struct resv_map *resv;
struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve, start, end;
long gbl_reserve;
+ hugetlb_vma_lock_free(vma);
+
+ resv = vma_resv_map(vma);
if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return;
@@ -3717,20 +4911,37 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
{
if (addr & ~(huge_page_mask(hstate_vma(vma))))
return -EINVAL;
+
+ /*
+ * PMD sharing is only possible for PUD_SIZE-aligned address ranges
+ * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
+ * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+ */
+ if (addr & ~PUD_MASK) {
+ /*
+ * hugetlb_vm_op_split is called right before we attempt to
+ * split the VMA. We will need to unshare PMDs in the old and
+ * new VMAs, so let's unshare before we split.
+ */
+ unsigned long floor = addr & PUD_MASK;
+ unsigned long ceil = floor + PUD_SIZE;
+
+ if (floor >= vma->vm_start && ceil <= vma->vm_end)
+ hugetlb_unshare_pmds(vma, floor, ceil);
+ }
+
return 0;
}
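The new branch only matters when the split address is not PUD aligned: round it down to a PUD boundary and, if the whole PUD_SIZE window sits inside the VMA, unshare PMDs there before the split. The containment test in isolation (a 1GB PUD is assumed, as on x86_64 with 4KB pages; the VMA bounds and split address are made up):

#include <stdio.h>

#define PUD_SIZE (1UL << 30)            /* assumption: 1GB PUD on x86_64/4K */
#define PUD_MASK (~(PUD_SIZE - 1))

int main(void)
{
    unsigned long vm_start = 0x40000000UL;      /* 1GB */
    unsigned long vm_end   = 0x140000000UL;     /* 5GB */
    unsigned long addr     = 0xa0000000UL;      /* split point, 2.5GB */

    if (addr & ~PUD_MASK) {                     /* split not on a PUD boundary */
        unsigned long floor = addr & PUD_MASK;
        unsigned long ceil = floor + PUD_SIZE;

        if (floor >= vm_start && ceil <= vm_end)
            printf("unshare PMDs in [%#lx, %#lx) before splitting\n",
                   floor, ceil);
        else
            printf("PUD window not fully inside the VMA, nothing to do\n");
    } else {
        printf("split is PUD aligned, sharing unaffected\n");
    }
    return 0;
}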
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
{
- struct hstate *hstate = hstate_vma(vma);
-
- return 1UL << huge_page_shift(hstate);
+ return huge_page_size(hstate_vma(vma));
}
/*
* We cannot handle pagefaults against hugetlb pages at all. They cause
* handle_mm_fault() to try to instantiate regular-sized pages in the
- * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get
+ * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get
* this far.
*/
static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
@@ -3750,7 +4961,7 @@ const struct vm_operations_struct hugetlb_vm_ops = {
.fault = hugetlb_vm_op_fault,
.open = hugetlb_vm_op_open,
.close = hugetlb_vm_op_close,
- .split = hugetlb_vm_op_split,
+ .may_split = hugetlb_vm_op_split,
.pagesize = hugetlb_vm_op_pagesize,
};
@@ -3758,6 +4969,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
int writable)
{
pte_t entry;
+ unsigned int shift = huge_page_shift(hstate_vma(vma));
if (writable) {
entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
@@ -3767,8 +4979,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
vma->vm_page_prot));
}
entry = pte_mkyoung(entry);
- entry = pte_mkhuge(entry);
- entry = arch_make_huge_pte(entry, vma, page, writable);
+ entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
return entry;
}
@@ -3809,42 +5020,62 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
return false;
}
+static void
+hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
+ struct folio *new_folio, pte_t old)
+{
+ pte_t newpte = make_huge_pte(vma, &new_folio->page, 1);
+
+ __folio_mark_uptodate(new_folio);
+ hugepage_add_new_anon_rmap(new_folio, vma, addr);
+ if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old))
+ newpte = huge_pte_mkuffd_wp(newpte);
+ set_huge_pte_at(vma->vm_mm, addr, ptep, newpte);
+ hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
+ folio_set_hugetlb_migratable(new_folio);
+}
+
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
- struct vm_area_struct *vma)
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma)
{
- pte_t *src_pte, *dst_pte, entry, dst_entry;
- struct page *ptepage;
+ pte_t *src_pte, *dst_pte, entry;
+ struct folio *pte_folio;
unsigned long addr;
- int cow;
- struct hstate *h = hstate_vma(vma);
+ bool cow = is_cow_mapping(src_vma->vm_flags);
+ struct hstate *h = hstate_vma(src_vma);
unsigned long sz = huge_page_size(h);
- struct address_space *mapping = vma->vm_file->f_mapping;
+ unsigned long npages = pages_per_huge_page(h);
struct mmu_notifier_range range;
+ unsigned long last_addr_mask;
int ret = 0;
- cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
-
if (cow) {
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
- vma->vm_start,
- vma->vm_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src,
+ src_vma->vm_start,
+ src_vma->vm_end);
mmu_notifier_invalidate_range_start(&range);
+ mmap_assert_write_locked(src);
+ raw_write_seqcount_begin(&src->write_protect_seq);
} else {
/*
- * For shared mappings i_mmap_rwsem must be held to call
- * huge_pte_alloc, otherwise the returned ptep could go
- * away if part of a shared pmd and another thread calls
- * huge_pmd_unshare.
+ * For shared mappings the vma lock must be held before
+ * calling hugetlb_walk() in the src vma. Otherwise, the
+ * returned ptep could go away if part of a shared pmd and
+ * another thread calls huge_pmd_unshare.
*/
- i_mmap_lock_read(mapping);
+ hugetlb_vma_lock_read(src_vma);
}
- for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
+ last_addr_mask = hugetlb_mask_last_page(h);
+ for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
- src_pte = huge_pte_offset(src, addr, sz);
- if (!src_pte)
+ src_pte = hugetlb_walk(src_vma, addr, sz);
+ if (!src_pte) {
+ addr |= last_addr_mask;
continue;
- dst_pte = huge_pte_alloc(dst, addr, sz);
+ }
+ dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
if (!dst_pte) {
ret = -ENOMEM;
break;
@@ -3852,77 +5083,244 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
/*
* If the pagetables are shared don't copy or take references.
- * dst_pte == src_pte is the common case of src/dest sharing.
*
+ * dst_pte == src_pte is the common case of src/dest sharing.
* However, src could have 'unshared' and dst shares with
- * another vma. If dst_pte !none, this implies sharing.
- * Check here before taking page table lock, and once again
- * after taking the lock below.
+ * another vma. So page_count of ptep page is checked instead
+ * to reliably determine whether pte is shared.
*/
- dst_entry = huge_ptep_get(dst_pte);
- if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
+ if (page_count(virt_to_page(dst_pte)) > 1) {
+ addr |= last_addr_mask;
continue;
+ }
dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
- dst_entry = huge_ptep_get(dst_pte);
- if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
+again:
+ if (huge_pte_none(entry)) {
/*
- * Skip if src entry none. Also, skip in the
- * unlikely case dst entry !none as this implies
- * sharing with another vma.
+ * Skip if src entry none.
*/
;
- } else if (unlikely(is_hugetlb_entry_migration(entry) ||
- is_hugetlb_entry_hwpoisoned(entry))) {
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
+ if (!userfaultfd_wp(dst_vma))
+ entry = huge_pte_clear_uffd_wp(entry);
+ set_huge_pte_at(dst, addr, dst_pte, entry);
+ } else if (unlikely(is_hugetlb_entry_migration(entry))) {
swp_entry_t swp_entry = pte_to_swp_entry(entry);
+ bool uffd_wp = pte_swp_uffd_wp(entry);
- if (is_write_migration_entry(swp_entry) && cow) {
+ if (!is_readable_migration_entry(swp_entry) && cow) {
/*
* COW mappings require pages in both
* parent and child to be set to read.
*/
- make_migration_entry_read(&swp_entry);
+ swp_entry = make_readable_migration_entry(
+ swp_offset(swp_entry));
entry = swp_entry_to_pte(swp_entry);
- set_huge_swap_pte_at(src, addr, src_pte,
- entry, sz);
+ if (userfaultfd_wp(src_vma) && uffd_wp)
+ entry = pte_swp_mkuffd_wp(entry);
+ set_huge_pte_at(src, addr, src_pte, entry);
}
- set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
+ if (!userfaultfd_wp(dst_vma))
+ entry = huge_pte_clear_uffd_wp(entry);
+ set_huge_pte_at(dst, addr, dst_pte, entry);
+ } else if (unlikely(is_pte_marker(entry))) {
+ /* No swap on hugetlb */
+ WARN_ON_ONCE(
+ is_swapin_error_entry(pte_to_swp_entry(entry)));
+ /*
+ * We copy the pte marker only if the dst vma has
+ * uffd-wp enabled.
+ */
+ if (userfaultfd_wp(dst_vma))
+ set_huge_pte_at(dst, addr, dst_pte, entry);
} else {
+ entry = huge_ptep_get(src_pte);
+ pte_folio = page_folio(pte_page(entry));
+ folio_get(pte_folio);
+
+ /*
+ * Failing to duplicate the anon rmap is a rare case
+ * where we see pinned hugetlb pages while they're
+ * prone to COW. We need to do the COW earlier during
+ * fork.
+ *
+ * When pre-allocating the page or copying data, we
+ * need to be without the pgtable locks since we could
+ * sleep during the process.
+ */
+ if (!folio_test_anon(pte_folio)) {
+ page_dup_file_rmap(&pte_folio->page, true);
+ } else if (page_try_dup_anon_rmap(&pte_folio->page,
+ true, src_vma)) {
+ pte_t src_pte_old = entry;
+ struct folio *new_folio;
+
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ /* Do not use reserve as it's private owned */
+ new_folio = alloc_hugetlb_folio(dst_vma, addr, 1);
+ if (IS_ERR(new_folio)) {
+ folio_put(pte_folio);
+ ret = PTR_ERR(new_folio);
+ break;
+ }
+ ret = copy_user_large_folio(new_folio,
+ pte_folio,
+ addr, dst_vma);
+ folio_put(pte_folio);
+ if (ret) {
+ folio_put(new_folio);
+ break;
+ }
+
+ /* Install the new hugetlb folio if src pte stable */
+ dst_ptl = huge_pte_lock(h, dst, dst_pte);
+ src_ptl = huge_pte_lockptr(h, src, src_pte);
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+ entry = huge_ptep_get(src_pte);
+ if (!pte_same(src_pte_old, entry)) {
+ restore_reserve_on_error(h, dst_vma, addr,
+ new_folio);
+ folio_put(new_folio);
+ /* huge_ptep of dst_pte won't change as in child */
+ goto again;
+ }
+ hugetlb_install_folio(dst_vma, dst_pte, addr,
+ new_folio, src_pte_old);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ continue;
+ }
+
if (cow) {
/*
* No need to notify as we are downgrading page
* table protection not changing it to point
* to a new page.
*
- * See Documentation/vm/mmu_notifier.rst
+ * See Documentation/mm/mmu_notifier.rst
*/
huge_ptep_set_wrprotect(src, addr, src_pte);
+ entry = huge_pte_wrprotect(entry);
}
- entry = huge_ptep_get(src_pte);
- ptepage = pte_page(entry);
- get_page(ptepage);
- page_dup_rmap(ptepage, true);
+
+ if (!userfaultfd_wp(dst_vma))
+ entry = huge_pte_clear_uffd_wp(entry);
+
set_huge_pte_at(dst, addr, dst_pte, entry);
- hugetlb_count_add(pages_per_huge_page(h), dst);
+ hugetlb_count_add(npages, dst);
}
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
}
- if (cow)
+ if (cow) {
+ raw_write_seqcount_end(&src->write_protect_seq);
mmu_notifier_invalidate_range_end(&range);
- else
- i_mmap_unlock_read(mapping);
+ } else {
+ hugetlb_vma_unlock_read(src_vma);
+ }
return ret;
}
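The anon-copy path in copy_hugetlb_page_range() is an optimistic scheme: record the source PTE, drop both page-table locks to allocate and copy the new folio, re-take the locks, and only install the copy if the PTE is still the recorded value, otherwise discard it and re-evaluate (the goto again). The same shape reduced to a userspace sketch with a mutex-protected integer (shared_val and the loop are purely illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int shared_val = 42;             /* stands in for the source PTE */

int main(void)
{
    int copy;

    for (;;) {
        pthread_mutex_lock(&lock);
        int snapshot = shared_val;      /* value observed under the lock */
        pthread_mutex_unlock(&lock);

        /* Expensive work done without the lock (allocation + copy). */
        copy = snapshot * 2;

        pthread_mutex_lock(&lock);
        if (shared_val == snapshot) {   /* still the same: install the result */
            printf("installed copy %d\n", copy);
            pthread_mutex_unlock(&lock);
            break;
        }
        /* A concurrent writer changed it: discard and retry, like "goto again". */
        pthread_mutex_unlock(&lock);
    }
    return 0;
}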
-void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct page *ref_page)
+static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
+ unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *src_ptl, *dst_ptl;
+ pte_t pte;
+
+ dst_ptl = huge_pte_lock(h, mm, dst_pte);
+ src_ptl = huge_pte_lockptr(h, mm, src_pte);
+
+ /*
+ * We don't have to worry about the ordering of src and dst ptlocks
+ * because exclusive mmap_lock (or the i_mmap_lock) prevents deadlock.
+ */
+ if (src_ptl != dst_ptl)
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+
+ pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
+ set_huge_pte_at(mm, new_addr, dst_pte, pte);
+
+ if (src_ptl != dst_ptl)
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+}
+
+int move_hugetlb_page_tables(struct vm_area_struct *vma,
+ struct vm_area_struct *new_vma,
+ unsigned long old_addr, unsigned long new_addr,
+ unsigned long len)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ unsigned long sz = huge_page_size(h);
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long old_end = old_addr + len;
+ unsigned long last_addr_mask;
+ pte_t *src_pte, *dst_pte;
+ struct mmu_notifier_range range;
+ bool shared_pmd = false;
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr,
+ old_end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+ /*
+ * In case of shared PMDs, we should cover the maximum possible
+ * range.
+ */
+ flush_cache_range(vma, range.start, range.end);
+
+ mmu_notifier_invalidate_range_start(&range);
+ last_addr_mask = hugetlb_mask_last_page(h);
+ /* Prevent race with file truncation */
+ hugetlb_vma_lock_write(vma);
+ i_mmap_lock_write(mapping);
+ for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
+ src_pte = hugetlb_walk(vma, old_addr, sz);
+ if (!src_pte) {
+ old_addr |= last_addr_mask;
+ new_addr |= last_addr_mask;
+ continue;
+ }
+ if (huge_pte_none(huge_ptep_get(src_pte)))
+ continue;
+
+ if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
+ shared_pmd = true;
+ old_addr |= last_addr_mask;
+ new_addr |= last_addr_mask;
+ continue;
+ }
+
+ dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
+ if (!dst_pte)
+ break;
+
+ move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte);
+ }
+
+ if (shared_pmd)
+ flush_tlb_range(vma, range.start, range.end);
+ else
+ flush_tlb_range(vma, old_end - len, old_end);
+ mmu_notifier_invalidate_range_end(&range);
+ i_mmap_unlock_write(mapping);
+ hugetlb_vma_unlock_write(vma);
+
+ return len + old_addr - old_end;
+}
+
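hugetlb_mask_last_page() lets these walkers skip a whole unpopulated region in one step: when no page table exists at addr, addr |= last_addr_mask moves addr to the region's last huge-page slot, so the loop's addr += sz lands at the start of the next region. A standalone illustration with 2MB pages under 1GB PUD regions (sizes assumed; only the middle gigabyte is treated as populated):

#include <stdio.h>

int main(void)
{
    unsigned long sz = 2UL << 20;                 /* 2MB huge page */
    unsigned long region = 1UL << 30;             /* 1GB covered by one PUD */
    unsigned long last_addr_mask = region - sz;   /* like hugetlb_mask_last_page() */
    unsigned long start = 0, end = 3UL << 30;
    unsigned long steps = 0;

    for (unsigned long addr = start; addr < end; addr += sz) {
        int have_pud = (addr >= (1UL << 30) && addr < (2UL << 30));

        steps++;
        if (!have_pud) {
            /* nothing mapped here: jump to the region's last slot */
            addr |= last_addr_mask;
            continue;
        }
        /* ... per-page work would happen here ... */
    }
    printf("visited %lu loop iterations instead of %lu\n",
           steps, (end - start) / sz);
    return 0;
}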
+static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct page *ref_page, zap_flags_t zap_flags)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
@@ -3932,7 +5330,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- struct mmu_notifier_range range;
+ unsigned long last_addr_mask;
+ bool force_flush = false;
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
@@ -3945,26 +5344,21 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb_change_page_size(tlb, sz);
tlb_start_vma(tlb, vma);
- /*
- * If sharing possible, alert mmu notifiers of worst case.
- */
- mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
- end);
- adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
- mmu_notifier_invalidate_range_start(&range);
+ last_addr_mask = hugetlb_mask_last_page(h);
address = start;
for (; address < end; address += sz) {
- ptep = huge_pte_offset(mm, address, sz);
- if (!ptep)
+ ptep = hugetlb_walk(vma, address, sz);
+ if (!ptep) {
+ address |= last_addr_mask;
continue;
+ }
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+ if (huge_pmd_unshare(mm, vma, address, ptep)) {
spin_unlock(ptl);
- /*
- * We just unmapped a page of PMDs by clearing a PUD.
- * The caller's TLB flush range should cover this area.
- */
+ tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
+ force_flush = true;
+ address |= last_addr_mask;
continue;
}
@@ -3979,7 +5373,18 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* unmapped and its refcount is dropped, so just clear pte here.
*/
if (unlikely(!pte_present(pte))) {
- huge_pte_clear(mm, address, ptep, sz);
+ /*
+ * If the pte was wr-protected by uffd-wp in any of its
+ * swap forms and the caller does not want to drop the
+ * uffd-wp bit in this zap, then replace the pte with a
+ * marker.
+ */
+ if (pte_swp_uffd_wp_any(pte) &&
+ !(zap_flags & ZAP_FLAG_DROP_MARKER))
+ set_huge_pte_at(mm, address, ptep,
+ make_pte_marker(PTE_MARKER_UFFD_WP));
+ else
+ huge_pte_clear(mm, address, ptep, sz);
spin_unlock(ptl);
continue;
}
@@ -4007,9 +5412,13 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
-
+ /* Leave a uffd-wp pte marker if needed */
+ if (huge_pte_uffd_wp(pte) &&
+ !(zap_flags & ZAP_FLAG_DROP_MARKER))
+ set_huge_pte_at(mm, address, ptep,
+ make_pte_marker(PTE_MARKER_UFFD_WP));
hugetlb_count_sub(pages_per_huge_page(h), mm);
- page_remove_rmap(page, true);
+ page_remove_rmap(page, vma, true);
spin_unlock(ptl);
tlb_remove_page_size(tlb, page, huge_page_size(h));
@@ -4019,56 +5428,76 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (ref_page)
break;
}
- mmu_notifier_invalidate_range_end(&range);
tlb_end_vma(tlb, vma);
+
+ /*
+ * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
+ * could defer the flush until now, since by holding i_mmap_rwsem we
+ * guaranteed that the last refernece would not be dropped. But we must
+ * do the flushing before we return, as otherwise i_mmap_rwsem will be
+ * dropped and the last reference to the shared PMDs page might be
+ * dropped as well.
+ *
+ * In theory we could defer the freeing of the PMD pages as well, but
+ * huge_pmd_unshare() relies on the exact page_count for the PMD page to
+ * detect sharing, so we cannot defer the release of the page either.
+ * Instead, do flush now.
+ */
+ if (force_flush)
+ tlb_flush_mmu_tlbonly(tlb);
}
void __unmap_hugepage_range_final(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct page *ref_page)
+ unsigned long end, struct page *ref_page,
+ zap_flags_t zap_flags)
{
- __unmap_hugepage_range(tlb, vma, start, end, ref_page);
+ hugetlb_vma_lock_write(vma);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
- /*
- * Clear this flag so that x86's huge_pmd_share page_table_shareable
- * test will fail on a vma being torn down, and not grab a page table
- * on its way out. We're lucky that the flag has such an appropriate
- * name, and can in fact be safely cleared here. We could clear it
- * before the __unmap_hugepage_range above, but all that's necessary
- * is to clear it before releasing the i_mmap_rwsem. This works
- * because in the context this is called, the VMA is about to be
- * destroyed and the i_mmap_rwsem is held.
- */
- vma->vm_flags &= ~VM_MAYSHARE;
+ /* mmu notification performed in caller */
+ __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
+
+ if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */
+ /*
+ * Unlock and free the vma lock before releasing i_mmap_rwsem.
+ * When the vma_lock is freed, this makes the vma ineligible
+ * for pmd sharing. And, i_mmap_rwsem is required to set up
+ * pmd sharing. This is important as page tables for this
+ * unmapped range will be asynchronously deleted. If the page
+ * tables are shared, there will be issues when accessed by
+ * someone else.
+ */
+ __hugetlb_vma_unlock_write_free(vma);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ } else {
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
+ }
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct page *ref_page)
+ unsigned long end, struct page *ref_page,
+ zap_flags_t zap_flags)
{
- struct mm_struct *mm;
+ struct mmu_notifier_range range;
struct mmu_gather tlb;
- unsigned long tlb_start = start;
- unsigned long tlb_end = end;
- /*
- * If shared PMDs were possibly used within this vma range, adjust
- * start/end for worst case tlb flushing.
- * Note that we can not be sure if PMDs are shared until we try to
- * unmap pages. However, we want to make sure TLB flushing covers
- * the largest possible range.
- */
- adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
+ start, end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+ mmu_notifier_invalidate_range_start(&range);
+ tlb_gather_mmu(&tlb, vma->vm_mm);
- mm = vma->vm_mm;
+ __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
- tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end);
- __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
- tlb_finish_mmu(&tlb, tlb_start, tlb_end);
+ mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb);
}
/*
* This is called when the original mapper is failing to COW a MAP_PRIVATE
- * mappping it owns the reserve page for. The intention is to unmap the page
+ * mapping it owns the reserve page for. The intention is to unmap the page
* from other VMAs and let the children be SIGKILLed if they are faulting the
* same region.
*/
@@ -4117,41 +5546,76 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
unmap_hugepage_range(iter_vma, address,
- address + huge_page_size(h), page);
+ address + huge_page_size(h), page, 0);
}
i_mmap_unlock_write(mapping);
}
/*
- * Hugetlb_cow() should be called with page lock of the original hugepage held.
- * Called with hugetlb_instantiation_mutex held and pte_page locked so we
+ * hugetlb_wp() should be called with page lock of the original hugepage held.
+ * Called with hugetlb_fault_mutex_table held and pte_page locked so we
* cannot race with other handlers or page migration.
* Keep the pte_same checks anyway to make transition from the mutex easier.
*/
-static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep,
- struct page *pagecache_page, spinlock_t *ptl)
+static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep, unsigned int flags,
+ struct folio *pagecache_folio, spinlock_t *ptl)
{
- pte_t pte;
+ const bool unshare = flags & FAULT_FLAG_UNSHARE;
+ pte_t pte = huge_ptep_get(ptep);
struct hstate *h = hstate_vma(vma);
- struct page *old_page, *new_page;
+ struct folio *old_folio;
+ struct folio *new_folio;
int outside_reserve = 0;
vm_fault_t ret = 0;
unsigned long haddr = address & huge_page_mask(h);
struct mmu_notifier_range range;
- pte = huge_ptep_get(ptep);
- old_page = pte_page(pte);
+ /*
+ * Never handle CoW for uffd-wp protected pages. It should be only
+ * handled when the uffd-wp protection is removed.
+ *
+ * Note that only the CoW optimization path (in hugetlb_no_page())
+ * can trigger this, because hugetlb_fault() will always resolve
+ * uffd-wp bit first.
+ */
+ if (!unshare && huge_pte_uffd_wp(pte))
+ return 0;
-retry_avoidcopy:
- /* If no-one else is actually using this page, avoid the copy
- * and just make the page writable */
- if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
- page_move_anon_rmap(old_page, vma);
+ /*
+ * hugetlb does not support FOLL_FORCE-style write faults that keep the
+ * PTE mapped R/O such as maybe_mkwrite() would do.
+ */
+ if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
+ return VM_FAULT_SIGSEGV;
+
+ /* Let's take out MAP_SHARED mappings first. */
+ if (vma->vm_flags & VM_MAYSHARE) {
set_huge_ptep_writable(vma, haddr, ptep);
return 0;
}
+ old_folio = page_folio(pte_page(pte));
+
+ delayacct_wpcopy_start();
+
+retry_avoidcopy:
+ /*
+ * If no-one else is actually using this page, we're the exclusive
+ * owner and can reuse this page.
+ */
+ if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) {
+ if (!PageAnonExclusive(&old_folio->page))
+ page_move_anon_rmap(&old_folio->page, vma);
+ if (likely(!unshare))
+ set_huge_ptep_writable(vma, haddr, ptep);
+
+ delayacct_wpcopy_end();
+ return 0;
+ }
+ VM_BUG_ON_PAGE(folio_test_anon(old_folio) &&
+ PageAnonExclusive(&old_folio->page), &old_folio->page);
+
/*
* If the process that created a MAP_PRIVATE mapping is about to
* perform a COW due to a shared page count, attempt to satisfy
@@ -4162,19 +5626,19 @@ retry_avoidcopy:
* of the full address range.
*/
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
- old_page != pagecache_page)
+ old_folio != pagecache_folio)
outside_reserve = 1;
- get_page(old_page);
+ folio_get(old_folio);
/*
* Drop page table lock as buddy allocator may be called. It will
* be acquired again before returning to the caller, as expected.
*/
spin_unlock(ptl);
- new_page = alloc_huge_page(vma, haddr, outside_reserve);
+ new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve);
- if (IS_ERR(new_page)) {
+ if (IS_ERR(new_folio)) {
/*
* If a process owning a MAP_PRIVATE mapping fails to COW,
* it is due to references held by a child and an insufficient
@@ -4183,12 +5647,31 @@ retry_avoidcopy:
* may get SIGKILLed if it later faults.
*/
if (outside_reserve) {
- put_page(old_page);
- BUG_ON(huge_pte_none(pte));
- unmap_ref_private(mm, vma, old_page, haddr);
- BUG_ON(huge_pte_none(pte));
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ pgoff_t idx;
+ u32 hash;
+
+ folio_put(old_folio);
+ /*
+ * Drop hugetlb_fault_mutex and vma_lock before
+ * unmapping. unmapping needs to hold vma_lock
+ * in write mode. Dropping vma_lock in read mode
+ * here is OK as COW mappings do not interact with
+ * PMD sharing.
+ *
+ * Reacquire both after unmap operation.
+ */
+ idx = vma_hugecache_offset(h, vma, haddr);
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
+ hugetlb_vma_unlock_read(vma);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+ unmap_ref_private(mm, vma, &old_folio->page, haddr);
+
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_vma_lock_read(vma);
spin_lock(ptl);
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
+ ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
if (likely(ptep &&
pte_same(huge_ptep_get(ptep), pte)))
goto retry_avoidcopy;
@@ -4196,10 +5679,11 @@ retry_avoidcopy:
* race occurs while re-acquiring page table
* lock, and our job is done.
*/
+ delayacct_wpcopy_end();
return 0;
}
- ret = vmf_error(PTR_ERR(new_page));
+ ret = vmf_error(PTR_ERR(new_folio));
goto out_release_old;
}
@@ -4212,11 +5696,13 @@ retry_avoidcopy:
goto out_release_all;
}
- copy_user_huge_page(new_page, old_page, address, vma,
- pages_per_huge_page(h));
- __SetPageUptodate(new_page);
+ if (copy_user_large_folio(new_folio, old_folio, address, vma)) {
+ ret = VM_FAULT_HWPOISON_LARGE;
+ goto out_release_all;
+ }
+ __folio_mark_uptodate(new_folio);
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr,
haddr + huge_page_size(h));
mmu_notifier_invalidate_range_start(&range);
@@ -4225,44 +5711,39 @@ retry_avoidcopy:
* before the page tables are altered
*/
spin_lock(ptl);
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
+ ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
- ClearPagePrivate(new_page);
+ pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);
- /* Break COW */
+ /* Break COW or unshare */
huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, range.start, range.end);
- set_huge_pte_at(mm, haddr, ptep,
- make_huge_pte(vma, new_page, 1));
- page_remove_rmap(old_page, true);
- hugepage_add_new_anon_rmap(new_page, vma, haddr);
- set_page_huge_active(new_page);
+ page_remove_rmap(&old_folio->page, vma, true);
+ hugepage_add_new_anon_rmap(new_folio, vma, haddr);
+ if (huge_pte_uffd_wp(pte))
+ newpte = huge_pte_mkuffd_wp(newpte);
+ set_huge_pte_at(mm, haddr, ptep, newpte);
+ folio_set_hugetlb_migratable(new_folio);
/* Make the old page be freed below */
- new_page = old_page;
+ new_folio = old_folio;
}
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
out_release_all:
- restore_reserve_on_error(h, vma, haddr, new_page);
- put_page(new_page);
+ /*
+ * No restore in case of successful pagetable update (Break COW or
+ * unshare)
+ */
+ if (new_folio != old_folio)
+ restore_reserve_on_error(h, vma, haddr, new_folio);
+ folio_put(new_folio);
out_release_old:
- put_page(old_page);
+ folio_put(old_folio);
spin_lock(ptl); /* Caller expects lock to be held */
- return ret;
-}
-/* Return the pagecache page at a given address within a VMA */
-static struct page *hugetlbfs_pagecache_page(struct hstate *h,
- struct vm_area_struct *vma, unsigned long address)
-{
- struct address_space *mapping;
- pgoff_t idx;
-
- mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, address);
-
- return find_lock_page(mapping, idx);
+ delayacct_wpcopy_end();
+ return ret;
}
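
The writability of the replacement pte installed above depends only on whether this was a true write fault or an unshare request. The standalone toy program below (not kernel code; the FAULT_FLAG_* values are copied by hand for illustration) models the decision that make_huge_pte(vma, &new_folio->page, !unshare) encodes.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: flag values hard-coded for this sketch. */
#define FAULT_FLAG_WRITE	0x01u
#define FAULT_FLAG_UNSHARE	0x400u

/* Mirrors make_huge_pte(vma, &new_folio->page, !unshare) in hugetlb_wp(). */
static bool new_pte_writable(unsigned int flags)
{
	bool unshare = flags & FAULT_FLAG_UNSHARE;

	return !unshare;
}

int main(void)
{
	printf("write fault   -> writable pte: %d\n",
	       new_pte_writable(FAULT_FLAG_WRITE));
	printf("unshare fault -> writable pte: %d\n",
	       new_pte_writable(FAULT_FLAG_UNSHARE));
	return 0;
}
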
/*
@@ -4272,35 +5753,38 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h,
static bool hugetlbfs_pagecache_present(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
{
- struct address_space *mapping;
- pgoff_t idx;
- struct page *page;
-
- mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, address);
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ pgoff_t idx = vma_hugecache_offset(h, vma, address);
+ struct folio *folio;
- page = find_get_page(mapping, idx);
- if (page)
- put_page(page);
- return page != NULL;
+ folio = filemap_get_folio(mapping, idx);
+ if (IS_ERR(folio))
+ return false;
+ folio_put(folio);
+ return true;
}
-int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
pgoff_t idx)
{
struct inode *inode = mapping->host;
struct hstate *h = hstate_inode(inode);
- int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+ int err;
- if (err)
+ __folio_set_locked(folio);
+ err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL);
+
+ if (unlikely(err)) {
+ __folio_clear_locked(folio);
return err;
- ClearPagePrivate(page);
+ }
+ folio_clear_hugetlb_restore_reserve(folio);
/*
- * set page dirty so that it will not be removed from cache/file
+ * mark folio dirty so that it will not be removed from cache/file
* by non-hugetlbfs specific code paths.
*/
- set_page_dirty(page);
+ folio_mark_dirty(folio);
spin_lock(&inode->i_lock);
inode->i_blocks += blocks_per_huge_page(h);
@@ -4308,78 +5792,128 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
return 0;
}
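
A hedged caller-side sketch of the contract above: the folio is inserted locked and stays locked on success, while a failure hands back an unlocked folio whose reservation the caller must unwind itself. The helper below is hypothetical and only strings together functions that appear in this diff.

#include <linux/hugetlb.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* Hypothetical helper, not part of this patch. */
static int example_insert_new_folio(struct hstate *h, struct vm_area_struct *vma,
				    struct address_space *mapping, pgoff_t idx,
				    unsigned long haddr, struct folio *folio)
{
	int err;

	__folio_mark_uptodate(folio);
	err = hugetlb_add_to_page_cache(folio, mapping, idx);
	if (err) {
		/* Folio comes back unlocked; undo the reservation and drop it. */
		restore_reserve_on_error(h, vma, haddr, folio);
		folio_put(folio);
		return err;
	}
	/* Success: the folio stays locked until the pte has been installed. */
	return 0;
}
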
+static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
+ struct address_space *mapping,
+ pgoff_t idx,
+ unsigned int flags,
+ unsigned long haddr,
+ unsigned long addr,
+ unsigned long reason)
+{
+ u32 hash;
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = haddr,
+ .real_address = addr,
+ .flags = flags,
+
+ /*
+ * Hard to debug if it ends up being
+ * used by a callee that assumes
+ * something about the other
+ * uninitialized fields... same as in
+ * memory.c
+ */
+ };
+
+ /*
+ * vma_lock and hugetlb_fault_mutex must be dropped before handling
+ * userfault. Also mmap_lock could be dropped due to handling
+	 * userfault, so any vma operation from here on must be careful.
+ */
+ hugetlb_vma_unlock_read(vma);
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ return handle_userfault(&vmf, reason);
+}
+
+/*
+ * Recheck pte with pgtable lock. Returns true if pte didn't change, or
+ * false if pte changed or is changing.
+ */
+static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm,
+ pte_t *ptep, pte_t old_pte)
+{
+ spinlock_t *ptl;
+ bool same;
+
+ ptl = huge_pte_lock(h, mm, ptep);
+ same = pte_same(huge_ptep_get(ptep), old_pte);
+ spin_unlock(ptl);
+
+ return same;
+}
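
A minimal usage sketch, assuming it sits next to the helper above (it is file-local): any decision derived from a lockless huge_ptep_get() snapshot is re-validated under the page-table lock before acting on it, and the fault simply returns 0 so it can be retried when the snapshot went stale. The caller below is hypothetical, not from this patch.

#include <linux/hugetlb.h>
#include <linux/mm.h>

/* Hypothetical caller, not part of this patch. */
static vm_fault_t example_act_on_snapshot(struct hstate *h, struct mm_struct *mm,
					  pte_t *ptep, pte_t old_pte)
{
	/* A decision was made from old_pte without holding the ptl... */
	if (!hugetlb_pte_stable(h, mm, ptep, old_pte))
		return 0;	/* pte changed under us: back out, fault retries */

	/* ...safe to act on that decision here. */
	return 0;
}
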
+
static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping, pgoff_t idx,
- unsigned long address, pte_t *ptep, unsigned int flags)
+ unsigned long address, pte_t *ptep,
+ pte_t old_pte, unsigned int flags)
{
struct hstate *h = hstate_vma(vma);
vm_fault_t ret = VM_FAULT_SIGBUS;
int anon_rmap = 0;
unsigned long size;
- struct page *page;
+ struct folio *folio;
pte_t new_pte;
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
- bool new_page = false;
+ bool new_folio, new_pagecache_folio = false;
+ u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
/*
* Currently, we are forced to kill the process in the event the
* original mapper has unmapped pages from the child due to a failed
- * COW. Warn that such a situation has occurred as it may not be obvious
+ * COW/unsharing. Warn that such a situation has occurred as it may not
+ * be obvious.
*/
if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
current->pid);
- return ret;
+ goto out;
}
/*
- * We can not race with truncation due to holding i_mmap_rwsem.
- * i_size is modified when holding i_mmap_rwsem, so check here
- * once for faults beyond end of file.
+ * Use page lock to guard against racing truncation
+ * before we get page_table_lock.
*/
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
- goto out;
-
-retry:
- page = find_lock_page(mapping, idx);
- if (!page) {
- /*
- * Check for page in userfault range
- */
+ new_folio = false;
+ folio = filemap_lock_folio(mapping, idx);
+ if (IS_ERR(folio)) {
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto out;
+ /* Check for page in userfault range */
if (userfaultfd_missing(vma)) {
- u32 hash;
- struct vm_fault vmf = {
- .vma = vma,
- .address = haddr,
- .flags = flags,
- /*
- * Hard to debug if it ends up being
- * used by a callee that assumes
- * something about the other
- * uninitialized fields... same as in
- * memory.c
- */
- };
-
/*
- * hugetlb_fault_mutex and i_mmap_rwsem must be
- * dropped before handling userfault. Reacquire
- * after handling fault to make calling code simpler.
+ * Since hugetlb_no_page() was examining pte
+ * without pgtable lock, we need to re-test under
+ * lock because the pte may not be stable and could
+ * have changed from under us. Try to detect
+			 * either changed or still-changing ptes and retry
+ * properly when needed.
+ *
+ * Note that userfaultfd is actually fine with
+			 * false positives (e.g. caused by a changed pte),
+			 * but not wrong logical events (e.g. caused by
+			 * reading a pte while it is changing). The latter
+			 * can confuse userspace, so the strictness is very
+			 * much preferred. E.g., a MISSING event should
+ * never happen on the page after UFFDIO_COPY has
+ * correctly installed the page and returned.
*/
- hash = hugetlb_fault_mutex_hash(mapping, idx);
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
- ret = handle_userfault(&vmf, VM_UFFD_MISSING);
- i_mmap_lock_read(mapping);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
- goto out;
+ if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+ ret = 0;
+ goto out;
+ }
+
+ return hugetlb_handle_userfault(vma, mapping, idx, flags,
+ haddr, address,
+ VM_UFFD_MISSING);
}
- page = alloc_huge_page(vma, haddr, 0);
- if (IS_ERR(page)) {
+ folio = alloc_hugetlb_folio(vma, haddr, 0);
+ if (IS_ERR(folio)) {
/*
* Returning error will result in faulting task being
* sent SIGBUS. The hugetlb fault mutex prevents two
@@ -4392,30 +5926,33 @@ retry:
* here. Before returning error, get ptl and make
* sure there really is no pte entry.
*/
- ptl = huge_pte_lock(h, mm, ptep);
- if (!huge_pte_none(huge_ptep_get(ptep))) {
+ if (hugetlb_pte_stable(h, mm, ptep, old_pte))
+ ret = vmf_error(PTR_ERR(folio));
+ else
ret = 0;
- spin_unlock(ptl);
- goto out;
- }
- spin_unlock(ptl);
- ret = vmf_error(PTR_ERR(page));
goto out;
}
- clear_huge_page(page, address, pages_per_huge_page(h));
- __SetPageUptodate(page);
- new_page = true;
+ clear_huge_page(&folio->page, address, pages_per_huge_page(h));
+ __folio_mark_uptodate(folio);
+ new_folio = true;
if (vma->vm_flags & VM_MAYSHARE) {
- int err = huge_add_to_page_cache(page, mapping, idx);
+ int err = hugetlb_add_to_page_cache(folio, mapping, idx);
if (err) {
- put_page(page);
- if (err == -EEXIST)
- goto retry;
+ /*
+ * err can't be -EEXIST which implies someone
+ * else consumed the reservation since hugetlb
+			 * fault mutex is held when adding a hugetlb page
+ * to the page cache. So it's safe to call
+ * restore_reserve_on_error() here.
+ */
+ restore_reserve_on_error(h, vma, haddr, folio);
+ folio_put(folio);
goto out;
}
+ new_pagecache_folio = true;
} else {
- lock_page(page);
+ folio_lock(folio);
if (unlikely(anon_vma_prepare(vma))) {
ret = VM_FAULT_OOM;
goto backout_unlocked;
@@ -4428,11 +5965,25 @@ retry:
* don't have hwpoisoned swap entry for errored virtual address.
* So we need to block hugepage fault by PG_hwpoison bit check.
*/
- if (unlikely(PageHWPoison(page))) {
- ret = VM_FAULT_HWPOISON |
+ if (unlikely(folio_test_hwpoison(folio))) {
+ ret = VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
goto backout_unlocked;
}
+
+ /* Check for page in userfault range. */
+ if (userfaultfd_minor(vma)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ /* See comment in userfaultfd_missing() block above */
+ if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+ ret = 0;
+ goto out;
+ }
+ return hugetlb_handle_userfault(vma, mapping, idx, flags,
+ haddr, address,
+ VM_UFFD_MINOR);
+ }
}
/*
@@ -4452,44 +6003,54 @@ retry:
ptl = huge_pte_lock(h, mm, ptep);
ret = 0;
- if (!huge_pte_none(huge_ptep_get(ptep)))
+ /* If pte changed from under us, retry */
+ if (!pte_same(huge_ptep_get(ptep), old_pte))
goto backout;
- if (anon_rmap) {
- ClearPagePrivate(page);
- hugepage_add_new_anon_rmap(page, vma, haddr);
- } else
- page_dup_rmap(page, true);
- new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+ if (anon_rmap)
+ hugepage_add_new_anon_rmap(folio, vma, haddr);
+ else
+ page_dup_file_rmap(&folio->page, true);
+ new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
+ /*
+ * If this pte was previously wr-protected, keep it wr-protected even
+ * if populated.
+ */
+ if (unlikely(pte_marker_uffd_wp(old_pte)))
+ new_pte = huge_pte_mkuffd_wp(new_pte);
set_huge_pte_at(mm, haddr, ptep, new_pte);
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
+ ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl);
}
spin_unlock(ptl);
/*
- * Only make newly allocated pages active. Existing pages found
- * in the pagecache could be !page_huge_active() if they have been
- * isolated for migration.
+ * Only set hugetlb_migratable in newly allocated pages. Existing pages
+ * found in the pagecache may not have hugetlb_migratable if they have
+ * been isolated for migration.
*/
- if (new_page)
- set_page_huge_active(page);
+ if (new_folio)
+ folio_set_hugetlb_migratable(folio);
- unlock_page(page);
+ folio_unlock(folio);
out:
+ hugetlb_vma_unlock_read(vma);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
return ret;
backout:
spin_unlock(ptl);
backout_unlocked:
- unlock_page(page);
- restore_reserve_on_error(h, vma, haddr, page);
- put_page(page);
+ if (new_folio && !new_pagecache_folio)
+ restore_reserve_on_error(h, vma, haddr, folio);
+
+ folio_unlock(folio);
+ folio_put(folio);
goto out;
}
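
The per-(mapping, index) fault mutex used throughout this path is just a hash-indexed lock table. A rough userspace analogue (standalone C with pthreads; the hash is simplified and the table size is an arbitrary power of two rather than the kernel's CPU-scaled value) shows the idea: two faults on the same index serialize, faults on different indexes usually do not.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define TABLE_SIZE 64U	/* arbitrary power of two for the demo */

static pthread_mutex_t fault_mutex_table[TABLE_SIZE];

/* Simplified stand-in for hugetlb_fault_mutex_hash(mapping, idx). */
static uint32_t fault_mutex_hash(const void *mapping, uint64_t idx)
{
	uint64_t key = (uint64_t)(uintptr_t)mapping ^ (idx * 0x9e3779b97f4a7c15ULL);

	return (uint32_t)(key >> 32) & (TABLE_SIZE - 1);
}

int main(void)
{
	int mapping_a, mapping_b;	/* their addresses stand in for mappings */
	uint32_t slot;
	unsigned int i;

	for (i = 0; i < TABLE_SIZE; i++)
		pthread_mutex_init(&fault_mutex_table[i], NULL);

	printf("(a, idx 0) -> slot %u\n", fault_mutex_hash(&mapping_a, 0));
	printf("(a, idx 1) -> slot %u\n", fault_mutex_hash(&mapping_a, 1));
	printf("(b, idx 0) -> slot %u\n", fault_mutex_hash(&mapping_b, 0));

	/* A "fault" on (mapping, idx) brackets its work with the hashed mutex. */
	slot = fault_mutex_hash(&mapping_a, 0);
	pthread_mutex_lock(&fault_mutex_table[slot]);
	/* ... allocate/instantiate the page for (mapping_a, idx 0) ... */
	pthread_mutex_unlock(&fault_mutex_table[slot]);
	return 0;
}
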
@@ -4508,7 +6069,7 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
}
#else
/*
- * For uniprocesor systems we always use a single mutex, so just
+ * For uniprocessor systems we always use a single mutex, so just
* return 0 and avoid the hashing overhead.
*/
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
@@ -4525,63 +6086,46 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vm_fault_t ret;
u32 hash;
pgoff_t idx;
- struct page *page = NULL;
- struct page *pagecache_page = NULL;
+ struct folio *folio = NULL;
+ struct folio *pagecache_folio = NULL;
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
int need_wait_lock = 0;
unsigned long haddr = address & huge_page_mask(h);
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
- if (ptep) {
- /*
- * Since we hold no locks, ptep could be stale. That is
- * OK as we are only making decisions based on content and
- * not actually modifying content here.
- */
- entry = huge_ptep_get(ptep);
- if (unlikely(is_hugetlb_entry_migration(entry))) {
- migration_entry_wait_huge(vma, mm, ptep);
- return 0;
- } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
- return VM_FAULT_HWPOISON_LARGE |
- VM_FAULT_SET_HINDEX(hstate_index(h));
- }
-
- /*
- * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
- * until finished with ptep. This serves two purposes:
- * 1) It prevents huge_pmd_unshare from being called elsewhere
- * and making the ptep no longer valid.
- * 2) It synchronizes us with i_size modifications during truncation.
- *
- * ptep could have already be assigned via huge_pte_offset. That
- * is OK, as huge_pte_alloc will return the same value unless
- * something has changed.
- */
- mapping = vma->vm_file->f_mapping;
- i_mmap_lock_read(mapping);
- ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
- if (!ptep) {
- i_mmap_unlock_read(mapping);
- return VM_FAULT_OOM;
- }
-
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
+ mapping = vma->vm_file->f_mapping;
idx = vma_hugecache_offset(h, vma, haddr);
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
- entry = huge_ptep_get(ptep);
- if (huge_pte_none(entry)) {
- ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
- goto out_mutex;
+ /*
+ * Acquire vma lock before calling huge_pte_alloc and hold
+ * until finished with ptep. This prevents huge_pmd_unshare from
+ * being called elsewhere and making the ptep no longer valid.
+ */
+ hugetlb_vma_lock_read(vma);
+ ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
+ if (!ptep) {
+ hugetlb_vma_unlock_read(vma);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ return VM_FAULT_OOM;
}
+ entry = huge_ptep_get(ptep);
+ /* PTE markers should be handled the same way as none pte */
+ if (huge_pte_none_mostly(entry))
+ /*
+ * hugetlb_no_page will drop vma lock and hugetlb fault
+		 * mutex internally, which makes us return immediately.
+ */
+ return hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
+ entry, flags);
+
ret = 0;
/*
@@ -4591,18 +6135,33 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
* properly handle it.
*/
- if (!pte_present(entry))
+ if (!pte_present(entry)) {
+ if (unlikely(is_hugetlb_entry_migration(entry))) {
+ /*
+ * Release the hugetlb fault lock now, but retain
+ * the vma lock, because it is needed to guard the
+ * huge_pte_lockptr() later in
+ * migration_entry_wait_huge(). The vma lock will
+ * be released there.
+ */
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ migration_entry_wait_huge(vma, ptep);
+ return 0;
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ ret = VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
goto out_mutex;
+ }
/*
- * If we are going to COW the mapping later, we examine the pending
- * reservations for this page now. This will ensure that any
+ * If we are going to COW/unshare the mapping later, we examine the
+ * pending reservations for this page now. This will ensure that any
* allocations necessary to record that reservation occur outside the
- * spinlock. For private mappings, we also lookup the pagecache
- * page now as it is used to determine if a reservation has been
- * consumed.
+ * spinlock. Also lookup the pagecache page now as it is used to
+ * determine if a reservation has been consumed.
*/
- if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
+ !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) {
if (vma_needs_reservation(h, vma, haddr) < 0) {
ret = VM_FAULT_OOM;
goto out_mutex;
@@ -4610,57 +6169,78 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* Just decrements count, does not deallocate */
vma_end_reservation(h, vma, haddr);
- if (!(vma->vm_flags & VM_MAYSHARE))
- pagecache_page = hugetlbfs_pagecache_page(h,
- vma, haddr);
+ pagecache_folio = filemap_lock_folio(mapping, idx);
+ if (IS_ERR(pagecache_folio))
+ pagecache_folio = NULL;
}
ptl = huge_pte_lock(h, mm, ptep);
- /* Check for a racing update before calling hugetlb_cow */
+ /* Check for a racing update before calling hugetlb_wp() */
if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
goto out_ptl;
+ /* Handle userfault-wp first, before trying to lock more pages */
+ if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
+ (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = haddr,
+ .real_address = address,
+ .flags = flags,
+ };
+
+ spin_unlock(ptl);
+ if (pagecache_folio) {
+ folio_unlock(pagecache_folio);
+ folio_put(pagecache_folio);
+ }
+ hugetlb_vma_unlock_read(vma);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ return handle_userfault(&vmf, VM_UFFD_WP);
+ }
+
/*
- * hugetlb_cow() requires page locks of pte_page(entry) and
- * pagecache_page, so here we need take the former one
- * when page != pagecache_page or !pagecache_page.
+ * hugetlb_wp() requires page locks of pte_page(entry) and
+	 * pagecache_folio, so here we need to take the former one
+ * when folio != pagecache_folio or !pagecache_folio.
*/
- page = pte_page(entry);
- if (page != pagecache_page)
- if (!trylock_page(page)) {
+ folio = page_folio(pte_page(entry));
+ if (folio != pagecache_folio)
+ if (!folio_trylock(folio)) {
need_wait_lock = 1;
goto out_ptl;
}
- get_page(page);
+ folio_get(folio);
- if (flags & FAULT_FLAG_WRITE) {
+ if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
if (!huge_pte_write(entry)) {
- ret = hugetlb_cow(mm, vma, address, ptep,
- pagecache_page, ptl);
+ ret = hugetlb_wp(mm, vma, address, ptep, flags,
+ pagecache_folio, ptl);
goto out_put_page;
+ } else if (likely(flags & FAULT_FLAG_WRITE)) {
+ entry = huge_pte_mkdirty(entry);
}
- entry = huge_pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
flags & FAULT_FLAG_WRITE))
update_mmu_cache(vma, haddr, ptep);
out_put_page:
- if (page != pagecache_page)
- unlock_page(page);
- put_page(page);
+ if (folio != pagecache_folio)
+ folio_unlock(folio);
+ folio_put(folio);
out_ptl:
spin_unlock(ptl);
- if (pagecache_page) {
- unlock_page(pagecache_page);
- put_page(pagecache_page);
+ if (pagecache_folio) {
+ folio_unlock(pagecache_folio);
+ folio_put(pagecache_folio);
}
out_mutex:
+ hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
/*
* Generally it's safe to hold refcount during waiting page lock. But
* here we just wait to defer the next page fault to avoid busy loop and
@@ -4669,67 +6249,120 @@ out_mutex:
* here without taking refcount.
*/
if (need_wait_lock)
- wait_on_page_locked(page);
+ folio_wait_locked(folio);
return ret;
}
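
For reference, a hedged skeleton of the lock ordering hugetlb_fault() follows after dropping i_mmap_rwsem in favour of the per-VMA lock: the fault mutex comes first, the vma lock second, and both are released in reverse order. The body is elided and the helper is hypothetical, not part of this patch.

#include <linux/hugetlb.h>
#include <linux/mm.h>
#include <linux/mutex.h>

/* Hypothetical skeleton showing lock order only. */
static vm_fault_t example_fault_lock_order(struct vm_area_struct *vma,
					   struct address_space *mapping,
					   pgoff_t idx)
{
	u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
	vm_fault_t ret = 0;

	mutex_lock(&hugetlb_fault_mutex_table[hash]);	/* 1: serialize faults */
	hugetlb_vma_lock_read(vma);			/* 2: keep ptes from being unshared */

	/* ... huge_pte_alloc(), huge_pte_lock(), actual handling ... */

	hugetlb_vma_unlock_read(vma);			/* release in reverse order */
	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
	return ret;
}
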
+#ifdef CONFIG_USERFAULTFD
/*
- * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with
- * modifications for huge pages.
+ * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte
+ * with modifications for hugetlb pages.
*/
-int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
- pte_t *dst_pte,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- struct page **pagep)
-{
- struct address_space *mapping;
- pgoff_t idx;
+int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ uffd_flags_t flags,
+ struct folio **foliop)
+{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ bool is_continue = uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE);
+ bool wp_enabled = (flags & MFILL_ATOMIC_WP);
+ struct hstate *h = hstate_vma(dst_vma);
+ struct address_space *mapping = dst_vma->vm_file->f_mapping;
+ pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
unsigned long size;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
- struct hstate *h = hstate_vma(dst_vma);
pte_t _dst_pte;
spinlock_t *ptl;
- int ret;
- struct page *page;
+ int ret = -ENOMEM;
+ struct folio *folio;
+ int writable;
+ bool folio_in_pagecache = false;
- if (!*pagep) {
- ret = -ENOMEM;
- page = alloc_huge_page(dst_vma, dst_addr, 0);
- if (IS_ERR(page))
+ if (is_continue) {
+ ret = -EFAULT;
+ folio = filemap_lock_folio(mapping, idx);
+ if (IS_ERR(folio))
+ goto out;
+ folio_in_pagecache = true;
+ } else if (!*foliop) {
+ /* If a folio already exists, then it's UFFDIO_COPY for
+ * a non-missing case. Return -EEXIST.
+ */
+ if (vm_shared &&
+ hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+ ret = -EEXIST;
goto out;
+ }
- ret = copy_huge_page_from_user(page,
- (const void __user *) src_addr,
- pages_per_huge_page(h), false);
+ folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
+ if (IS_ERR(folio)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = copy_folio_from_user(folio, (const void __user *) src_addr,
+ false);
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
- *pagep = page;
- /* don't free the page */
+ /* Free the allocated folio which may have
+ * consumed a reservation.
+ */
+ restore_reserve_on_error(h, dst_vma, dst_addr, folio);
+ folio_put(folio);
+
+ /* Allocate a temporary folio to hold the copied
+ * contents.
+ */
+ folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr);
+ if (!folio) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ *foliop = folio;
+ /* Set the outparam foliop and return to the caller to
+ * copy the contents outside the lock. Don't free the
+ * folio.
+ */
goto out;
}
} else {
- page = *pagep;
- *pagep = NULL;
+ if (vm_shared &&
+ hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+ folio_put(*foliop);
+ ret = -EEXIST;
+ *foliop = NULL;
+ goto out;
+ }
+
+ folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
+ if (IS_ERR(folio)) {
+ folio_put(*foliop);
+ ret = -ENOMEM;
+ *foliop = NULL;
+ goto out;
+ }
+ ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma);
+ folio_put(*foliop);
+ *foliop = NULL;
+ if (ret) {
+ folio_put(folio);
+ goto out;
+ }
}
/*
- * The memory barrier inside __SetPageUptodate makes sure that
+ * The memory barrier inside __folio_mark_uptodate makes sure that
* preceding stores to the page contents become visible before
* the set_pte_at() write.
*/
- __SetPageUptodate(page);
+ __folio_mark_uptodate(folio);
- mapping = dst_vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, dst_vma, dst_addr);
-
- /*
- * If shared, add to page cache
- */
- if (vm_shared) {
+ /* Add shared, newly allocated pages to the page cache. */
+ if (vm_shared && !is_continue) {
size = i_size_read(mapping->host) >> huge_page_shift(h);
ret = -EFAULT;
if (idx >= size)
@@ -4737,87 +6370,179 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
/*
* Serialization between remove_inode_hugepages() and
- * huge_add_to_page_cache() below happens through the
+ * hugetlb_add_to_page_cache() below happens through the
	 * hugetlb_fault_mutex_table that here must be held by
* the caller.
*/
- ret = huge_add_to_page_cache(page, mapping, idx);
+ ret = hugetlb_add_to_page_cache(folio, mapping, idx);
if (ret)
goto out_release_nounlock;
+ folio_in_pagecache = true;
}
- ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
- spin_lock(ptl);
+ ptl = huge_pte_lock(h, dst_mm, dst_pte);
- /*
- * Recheck the i_size after holding PT lock to make sure not
- * to leave any page mapped (as page_mapped()) beyond the end
- * of the i_size (remove_inode_hugepages() is strict about
- * enforcing that). If we bail out here, we'll also leave a
- * page in the radix tree in the vm_shared case beyond the end
- * of the i_size, but remove_inode_hugepages() will take care
- * of it as soon as we drop the hugetlb_fault_mutex_table.
- */
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- ret = -EFAULT;
- if (idx >= size)
+ ret = -EIO;
+ if (folio_test_hwpoison(folio))
goto out_release_unlock;
+ /*
+ * We allow to overwrite a pte marker: consider when both MISSING|WP
+ * registered, we firstly wr-protect a none pte which has no page cache
+ * page backing it, then access the page.
+ */
ret = -EEXIST;
- if (!huge_pte_none(huge_ptep_get(dst_pte)))
+ if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
goto out_release_unlock;
- if (vm_shared) {
- page_dup_rmap(page, true);
- } else {
- ClearPagePrivate(page);
- hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
- }
+ if (folio_in_pagecache)
+ page_dup_file_rmap(&folio->page, true);
+ else
+ hugepage_add_new_anon_rmap(folio, dst_vma, dst_addr);
- _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
- if (dst_vma->vm_flags & VM_WRITE)
- _dst_pte = huge_pte_mkdirty(_dst_pte);
+ /*
+ * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
+ * with wp flag set, don't set pte write bit.
+ */
+ if (wp_enabled || (is_continue && !vm_shared))
+ writable = 0;
+ else
+ writable = dst_vma->vm_flags & VM_WRITE;
+
+ _dst_pte = make_huge_pte(dst_vma, &folio->page, writable);
+ /*
+ * Always mark UFFDIO_COPY page dirty; note that this may not be
+ * extremely important for hugetlbfs for now since swapping is not
+	 * supported, but we should still be clear that this page cannot be
+	 * thrown away at will, even if the write bit is not set.
+ */
+ _dst_pte = huge_pte_mkdirty(_dst_pte);
_dst_pte = pte_mkyoung(_dst_pte);
+ if (wp_enabled)
+ _dst_pte = huge_pte_mkuffd_wp(_dst_pte);
+
set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
- (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
- dst_vma->vm_flags & VM_WRITE);
hugetlb_count_add(pages_per_huge_page(h), dst_mm);
/* No need to invalidate - it was non-present before */
update_mmu_cache(dst_vma, dst_addr, dst_pte);
spin_unlock(ptl);
- set_page_huge_active(page);
- if (vm_shared)
- unlock_page(page);
+ if (!is_continue)
+ folio_set_hugetlb_migratable(folio);
+ if (vm_shared || is_continue)
+ folio_unlock(folio);
ret = 0;
out:
return ret;
out_release_unlock:
spin_unlock(ptl);
- if (vm_shared)
- unlock_page(page);
+ if (vm_shared || is_continue)
+ folio_unlock(folio);
out_release_nounlock:
- put_page(page);
+ if (!folio_in_pagecache)
+ restore_reserve_on_error(h, dst_vma, dst_addr, folio);
+ folio_put(folio);
goto out;
}
+#endif /* CONFIG_USERFAULTFD */
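
The -ENOENT branch above defines a retry contract with the UFFDIO_COPY caller: the temporary folio is handed back in *foliop, the caller finishes the copy with page faults enabled outside mmap_lock, and then calls back in with the filled folio. A simplified, hypothetical caller-side sketch follows; the real loop lives in mm/userfaultfd.c and revalidates the vma and handles more cases than shown here.

#include <linux/hugetlb.h>
#include <linux/mm.h>
#include <linux/userfaultfd_k.h>

/* Hypothetical, simplified retry loop; error handling trimmed. */
static int example_uffd_copy_one(struct mm_struct *dst_mm, pte_t *dst_pte,
				 struct vm_area_struct *dst_vma,
				 unsigned long dst_addr, unsigned long src_addr,
				 uffd_flags_t flags)
{
	struct folio *folio = NULL;
	int err;

retry:
	err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr, src_addr,
				       flags, &folio);
	if (err == -ENOENT) {
		/* Copy from userspace with faults allowed, outside mmap_lock. */
		mmap_read_unlock(dst_mm);
		if (copy_folio_from_user(folio,
					 (const void __user *)src_addr, true)) {
			folio_put(folio);
			return -EFAULT;
		}
		mmap_read_lock(dst_mm);
		/* The real code revalidates the vma/pte before retrying. */
		goto retry;
	}
	return err;
}
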
+
+static void record_subpages(struct page *page, struct vm_area_struct *vma,
+ int refs, struct page **pages)
+{
+ int nr;
+
+ for (nr = 0; nr < refs; nr++) {
+ if (likely(pages))
+ pages[nr] = nth_page(page, nr);
+ }
+}
+
+static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma,
+ unsigned int flags, pte_t *pte,
+ bool *unshare)
+{
+ pte_t pteval = huge_ptep_get(pte);
+
+ *unshare = false;
+ if (is_swap_pte(pteval))
+ return true;
+ if (huge_pte_write(pteval))
+ return false;
+ if (flags & FOLL_WRITE)
+ return true;
+ if (gup_must_unshare(vma, flags, pte_page(pteval))) {
+ *unshare = true;
+ return true;
+ }
+ return false;
+}
+
+struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long haddr = address & huge_page_mask(h);
+ struct page *page = NULL;
+ spinlock_t *ptl;
+ pte_t *pte, entry;
+
+ /*
+ * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
+ * follow_hugetlb_page().
+ */
+ if (WARN_ON_ONCE(flags & FOLL_PIN))
+ return NULL;
+
+ hugetlb_vma_lock_read(vma);
+ pte = hugetlb_walk(vma, haddr, huge_page_size(h));
+ if (!pte)
+ goto out_unlock;
+
+ ptl = huge_pte_lock(h, mm, pte);
+ entry = huge_ptep_get(pte);
+ if (pte_present(entry)) {
+ page = pte_page(entry) +
+ ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
+ /*
+ * Note that page may be a sub-page, and with vmemmap
+ * optimizations the page struct may be read only.
+ * try_grab_page() will increase the ref count on the
+ * head page, so this will be OK.
+ *
+ * try_grab_page() should always be able to get the page here,
+ * because we hold the ptl lock and have verified pte_present().
+ */
+ if (try_grab_page(page, flags)) {
+ page = NULL;
+ goto out;
+ }
+ }
+out:
+ spin_unlock(ptl);
+out_unlock:
+ hugetlb_vma_unlock_read(vma);
+ return page;
+}
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, struct vm_area_struct **vmas,
- unsigned long *position, unsigned long *nr_pages,
- long i, unsigned int flags, int *locked)
+ struct page **pages, unsigned long *position,
+ unsigned long *nr_pages, long i, unsigned int flags,
+ int *locked)
{
unsigned long pfn_offset;
unsigned long vaddr = *position;
unsigned long remainder = *nr_pages;
struct hstate *h = hstate_vma(vma);
- int err = -EFAULT;
+ int err = -EFAULT, refs;
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
spinlock_t *ptl = NULL;
+ bool unshare = false;
int absent;
struct page *page;
@@ -4830,6 +6555,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
break;
}
+ hugetlb_vma_lock_read(vma);
/*
* Some archs (sparc64, sh*) have multiple pte_ts to
* each hugepage. We have to make sure we get the
@@ -4837,8 +6563,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
*
* Note that page table lock is not held when pte is null.
*/
- pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
- huge_page_size(h));
+ pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
+ huge_page_size(h));
if (pte)
ptl = huge_pte_lock(h, mm, pte);
absent = !pte || huge_pte_none(huge_ptep_get(pte));
@@ -4854,6 +6580,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
!hugetlbfs_pagecache_present(h, vma, vaddr)) {
if (pte)
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
remainder = 0;
break;
}
@@ -4868,19 +6595,25 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
* both cases, and because we can't follow correct pages
* directly from any kind of swap entries.
*/
- if (absent || is_swap_pte(huge_ptep_get(pte)) ||
- ((flags & FOLL_WRITE) &&
- !huge_pte_write(huge_ptep_get(pte)))) {
+ if (absent ||
+ __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) {
vm_fault_t ret;
unsigned int fault_flags = 0;
if (pte)
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
+
if (flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
- if (locked)
+ else if (unshare)
+ fault_flags |= FAULT_FLAG_UNSHARE;
+ if (locked) {
fault_flags |= FAULT_FLAG_ALLOW_RETRY |
FAULT_FLAG_KILLABLE;
+ if (flags & FOLL_INTERRUPTIBLE)
+ fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
+ }
if (flags & FOLL_NOWAIT)
fault_flags |= FAULT_FLAG_ALLOW_RETRY |
FAULT_FLAG_RETRY_NOWAIT;
@@ -4919,57 +6652,62 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
page = pte_page(huge_ptep_get(pte));
+ VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+ !PageAnonExclusive(page), page);
+
/*
* If subpage information not requested, update counters
* and skip the same_page loop below.
*/
- if (!pages && !vmas && !pfn_offset &&
+ if (!pages && !pfn_offset &&
(vaddr + huge_page_size(h) < vma->vm_end) &&
(remainder >= pages_per_huge_page(h))) {
vaddr += huge_page_size(h);
remainder -= pages_per_huge_page(h);
i += pages_per_huge_page(h);
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
continue;
}
-same_page:
+ /* vaddr may not be aligned to PAGE_SIZE */
+ refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
+ (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
+
+ if (pages)
+ record_subpages(nth_page(page, pfn_offset),
+ vma, refs,
+ likely(pages) ? pages + i : NULL);
+
if (pages) {
- pages[i] = mem_map_offset(page, pfn_offset);
/*
- * try_grab_page() should always succeed here, because:
- * a) we hold the ptl lock, and b) we've just checked
- * that the huge page is present in the page tables. If
- * the huge page is present, then the tail pages must
- * also be present. The ptl prevents the head page and
- * tail pages from being rearranged in any way. So this
- * page must be available at this point, unless the page
+ * try_grab_folio() should always succeed here,
+ * because: a) we hold the ptl lock, and b) we've just
+ * checked that the huge page is present in the page
+ * tables. If the huge page is present, then the tail
+ * pages must also be present. The ptl prevents the
+ * head page and tail pages from being rearranged in
+			 * any way. As this is hugetlb, the pages are never
+			 * P2PDMA pages and are always long-term pinnable. So this page
+ * must be available at this point, unless the page
* refcount overflowed:
*/
- if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
+ if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
+ flags))) {
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
remainder = 0;
err = -ENOMEM;
break;
}
}
- if (vmas)
- vmas[i] = vma;
+ vaddr += (refs << PAGE_SHIFT);
+ remainder -= refs;
+ i += refs;
- vaddr += PAGE_SIZE;
- ++pfn_offset;
- --remainder;
- ++i;
- if (vaddr < vma->vm_end && remainder &&
- pfn_offset < pages_per_huge_page(h)) {
- /*
- * We use pfn_offset to avoid touching the pageframes
- * of this compound page.
- */
- goto same_page;
- }
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
}
*nr_pages = remainder;
/*
@@ -4982,25 +6720,21 @@ same_page:
return i ? i : err;
}
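
The batching above replaces the old one-page-at-a-time same_page loop: refs is clamped by the subpages left in the huge page, the pages the caller still wants, and the distance to the end of the VMA. A standalone demo of that arithmetic, assuming 4 KiB base pages and a 2 MiB huge page (512 subpages); these constants are illustrative, not taken from this diff.

#include <stdio.h>

#define PAGE_SHIFT		12UL
#define PAGE_SIZE		(1UL << PAGE_SHIFT)
#define PAGES_PER_HUGE_PAGE	512UL			/* 2 MiB / 4 KiB */
#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))

static unsigned long min3ul(unsigned long a, unsigned long b, unsigned long c)
{
	unsigned long m = a < b ? a : b;

	return m < c ? m : c;
}

int main(void)
{
	unsigned long vaddr = 0x200000UL + 5 * PAGE_SIZE;	/* 5 subpages in */
	unsigned long vm_end = 0x200000UL + 100 * PAGE_SIZE;	/* vma ends early */
	unsigned long pfn_offset = (vaddr / PAGE_SIZE) % PAGES_PER_HUGE_PAGE;
	unsigned long remainder = 1000;				/* pages still wanted */
	unsigned long refs;

	refs = min3ul(PAGES_PER_HUGE_PAGE - pfn_offset, remainder,
		      (vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
	printf("record %lu subpages in this iteration\n", refs);	/* prints 95 */
	return 0;
}
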
-#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
-/*
- * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
- * implement this.
- */
-#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
-#endif
-
-unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
- unsigned long address, unsigned long end, pgprot_t newprot)
+long hugetlb_change_protection(struct vm_area_struct *vma,
+ unsigned long address, unsigned long end,
+ pgprot_t newprot, unsigned long cp_flags)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long start = address;
pte_t *ptep;
pte_t pte;
struct hstate *h = hstate_vma(vma);
- unsigned long pages = 0;
+ long pages = 0, psize = huge_page_size(h);
bool shared_pmd = false;
struct mmu_notifier_range range;
+ unsigned long last_addr_mask;
+ bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+ bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
/*
* In the case of shared PMDs, the area to flush could be beyond
@@ -5008,54 +6742,98 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* range if PMD sharing is possible.
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
- 0, vma, mm, start, end);
+ 0, mm, start, end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
BUG_ON(address >= end);
flush_cache_range(vma, range.start, range.end);
mmu_notifier_invalidate_range_start(&range);
+ hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
- for (; address < end; address += huge_page_size(h)) {
+ last_addr_mask = hugetlb_mask_last_page(h);
+ for (; address < end; address += psize) {
spinlock_t *ptl;
- ptep = huge_pte_offset(mm, address, huge_page_size(h));
- if (!ptep)
- continue;
+ ptep = hugetlb_walk(vma, address, psize);
+ if (!ptep) {
+ if (!uffd_wp) {
+ address |= last_addr_mask;
+ continue;
+ }
+ /*
+ * Userfaultfd wr-protect requires pgtable
+ * pre-allocations to install pte markers.
+ */
+ ptep = huge_pte_alloc(mm, vma, address, psize);
+ if (!ptep) {
+ pages = -ENOMEM;
+ break;
+ }
+ }
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+ if (huge_pmd_unshare(mm, vma, address, ptep)) {
+ /*
+ * When uffd-wp is enabled on the vma, unshare
+ * shouldn't happen at all. Warn about it if it
+ * happened due to some reason.
+ */
+ WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
pages++;
spin_unlock(ptl);
shared_pmd = true;
+ address |= last_addr_mask;
continue;
}
pte = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
- spin_unlock(ptl);
- continue;
- }
- if (unlikely(is_hugetlb_entry_migration(pte))) {
+ /* Nothing to do. */
+ } else if (unlikely(is_hugetlb_entry_migration(pte))) {
swp_entry_t entry = pte_to_swp_entry(pte);
-
- if (is_write_migration_entry(entry)) {
- pte_t newpte;
-
- make_migration_entry_read(&entry);
+ struct page *page = pfn_swap_entry_to_page(entry);
+ pte_t newpte = pte;
+
+ if (is_writable_migration_entry(entry)) {
+ if (PageAnon(page))
+ entry = make_readable_exclusive_migration_entry(
+ swp_offset(entry));
+ else
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
newpte = swp_entry_to_pte(entry);
- set_huge_swap_pte_at(mm, address, ptep,
- newpte, huge_page_size(h));
pages++;
}
- spin_unlock(ptl);
- continue;
- }
- if (!huge_pte_none(pte)) {
+
+ if (uffd_wp)
+ newpte = pte_swp_mkuffd_wp(newpte);
+ else if (uffd_wp_resolve)
+ newpte = pte_swp_clear_uffd_wp(newpte);
+ if (!pte_same(pte, newpte))
+ set_huge_pte_at(mm, address, ptep, newpte);
+ } else if (unlikely(is_pte_marker(pte))) {
+ /* No other markers apply for now. */
+ WARN_ON_ONCE(!pte_marker_uffd_wp(pte));
+ if (uffd_wp_resolve)
+ /* Safe to modify directly (non-present->none). */
+ huge_pte_clear(mm, address, ptep, psize);
+ } else if (!huge_pte_none(pte)) {
pte_t old_pte;
+ unsigned int shift = huge_page_shift(hstate_vma(vma));
old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
- pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
- pte = arch_make_huge_pte(pte, vma, NULL, 0);
+ pte = huge_pte_modify(old_pte, newprot);
+ pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
+ if (uffd_wp)
+ pte = huge_pte_mkuffd_wp(pte);
+ else if (uffd_wp_resolve)
+ pte = huge_pte_clear_uffd_wp(pte);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
+ } else {
+ /* None pte */
+ if (unlikely(uffd_wp))
+ /* Safe to modify directly (none->non-present). */
+ set_huge_pte_at(mm, address, ptep,
+ make_pte_marker(PTE_MARKER_UFFD_WP));
}
spin_unlock(ptl);
}
@@ -5074,20 +6852,22 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* No need to call mmu_notifier_invalidate_range() we are downgrading
* page table protection not changing it to point to a new page.
*
- * See Documentation/vm/mmu_notifier.rst
+ * See Documentation/mm/mmu_notifier.rst
*/
i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
mmu_notifier_invalidate_range_end(&range);
- return pages << h->order;
+ return pages > 0 ? (pages << h->order) : pages;
}
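
With the return type widened to long, callers must treat negative values as errors (the uffd-wp marker path can fail a page table allocation) rather than as a huge page count. A hedged caller sketch: the helper is hypothetical, and MM_CP_UFFD_WP is assumed to be visible from the caller (its home header differs across kernel versions).

#include <linux/hugetlb.h>
#include <linux/mm.h>

/* Hypothetical caller, not part of this patch. */
static long example_uffd_wrprotect(struct vm_area_struct *vma,
				   unsigned long start, unsigned long end)
{
	long pages;

	pages = hugetlb_change_protection(vma, start, end,
					  vm_get_page_prot(vma->vm_flags),
					  MM_CP_UFFD_WP);
	if (pages < 0)
		return pages;	/* e.g. -ENOMEM from huge_pte_alloc() */

	return pages;		/* number of base pages whose protection changed */
}
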
-int hugetlb_reserve_pages(struct inode *inode,
+/* Return true if reservation was successful, false otherwise. */
+bool hugetlb_reserve_pages(struct inode *inode,
long from, long to,
struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- long ret, chg, add = -1;
+ long chg = -1, add = -1;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
@@ -5097,16 +6877,22 @@ int hugetlb_reserve_pages(struct inode *inode,
/* This should never happen */
if (from > to) {
VM_WARN(1, "%s called with a negative range\n", __func__);
- return -EINVAL;
+ return false;
}
/*
+ * vma specific semaphore used for pmd sharing and fault/truncation
+ * synchronization
+ */
+ hugetlb_vma_lock_alloc(vma);
+
+ /*
* Only apply hugepage reservation if asked. At fault time, an
* attempt will be made for VM_NORESERVE to allocate a page
* without using reserves
*/
if (vm_flags & VM_NORESERVE)
- return 0;
+ return true;
/*
* Shared mappings base their reservation on the number of pages that
@@ -5123,12 +6909,11 @@ int hugetlb_reserve_pages(struct inode *inode,
resv_map = inode_resv_map(inode);
chg = region_chg(resv_map, from, to, &regions_needed);
-
} else {
/* Private mapping. */
resv_map = resv_map_alloc();
if (!resv_map)
- return -ENOMEM;
+ goto out_err;
chg = to - from;
@@ -5136,18 +6921,12 @@ int hugetlb_reserve_pages(struct inode *inode,
set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
}
- if (chg < 0) {
- ret = chg;
+ if (chg < 0)
goto out_err;
- }
- ret = hugetlb_cgroup_charge_cgroup_rsvd(
- hstate_index(h), chg * pages_per_huge_page(h), &h_cg);
-
- if (ret < 0) {
- ret = -ENOMEM;
+ if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h),
+ chg * pages_per_huge_page(h), &h_cg) < 0)
goto out_err;
- }
if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
/* For private mappings, the hugetlb_cgroup uncharge info hangs
@@ -5162,19 +6941,15 @@ int hugetlb_reserve_pages(struct inode *inode,
* reservations already in place (gbl_reserve).
*/
gbl_reserve = hugepage_subpool_get_pages(spool, chg);
- if (gbl_reserve < 0) {
- ret = -ENOSPC;
+ if (gbl_reserve < 0)
goto out_uncharge_cgroup;
- }
/*
* Check enough hugepages are available for the reservation.
* Hand the pages back to the subpool if there are not
*/
- ret = hugetlb_acct_memory(h, gbl_reserve);
- if (ret < 0) {
+ if (hugetlb_acct_memory(h, gbl_reserve) < 0)
goto out_put_pages;
- }
/*
* Account for the reservations made. Shared mappings record regions
@@ -5197,12 +6972,16 @@ int hugetlb_reserve_pages(struct inode *inode,
/*
* pages in this range were added to the reserve
* map between region_chg and region_add. This
- * indicates a race with alloc_huge_page. Adjust
+ * indicates a race with alloc_hugetlb_folio. Adjust
* the subpool and reserve counts modified above
* based on the difference.
*/
long rsv_adjust;
+ /*
+ * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
+ * reference to h_cg->css. See comment below for detail.
+ */
hugetlb_cgroup_uncharge_cgroup_rsvd(
hstate_index(h),
(chg - add) * pages_per_huge_page(h), h_cg);
@@ -5210,9 +6989,18 @@ int hugetlb_reserve_pages(struct inode *inode,
rsv_adjust = hugepage_subpool_put_pages(spool,
chg - add);
hugetlb_acct_memory(h, -rsv_adjust);
+ } else if (h_cg) {
+ /*
+ * The file_regions will hold their own reference to
+ * h_cg->css. So we should release the reference held
+ * via hugetlb_cgroup_charge_cgroup_rsvd() when we are
+ * done.
+ */
+ hugetlb_cgroup_put_rsvd_cgroup(h_cg);
}
}
- return 0;
+ return true;
+
out_put_pages:
/* put back original number of pages, chg */
(void)hugepage_subpool_put_pages(spool, chg);
@@ -5220,6 +7008,7 @@ out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
chg * pages_per_huge_page(h), h_cg);
out_err:
+ hugetlb_vma_lock_free(vma);
if (!vma || vma->vm_flags & VM_MAYSHARE)
/* Only call region_abort if the region_chg succeeded but the
* region_add failed or didn't run.
@@ -5228,7 +7017,7 @@ out_err:
region_abort(resv_map, from, to, regions_needed);
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
kref_put(&resv_map->refs, resv_map_release);
- return ret;
+ return false;
}
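
Since the function now reports plain success or failure, callers pick their own errno instead of propagating one. A minimal sketch of a hugetlbfs-style call site; the wrapper is hypothetical and the real callers live in fs/hugetlbfs/inode.c.

#include <linux/fs.h>
#include <linux/hugetlb.h>

/* Hypothetical wrapper, not part of this patch; from/to are in huge pages. */
static int example_reserve(struct inode *inode, long from, long to,
			   struct vm_area_struct *vma, vm_flags_t vm_flags)
{
	if (!hugetlb_reserve_pages(inode, from, to, vma, vm_flags))
		return -ENOMEM;	/* callers choose the errno now */
	return 0;
}
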
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
@@ -5262,6 +7051,9 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
/*
* If the subpool has a minimum size, the number of global
* reservations to be released may be adjusted.
+ *
+ * Note that !resv_map implies freed == 0. So (chg - freed)
+ * won't go negative.
*/
gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
hugetlb_acct_memory(h, -gbl_reserve);
@@ -5280,32 +7072,43 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
unsigned long s_end = sbase + PUD_SIZE;
/* Allow segments to share if only one is marked locked */
- unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
- unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
+ unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
+ unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;
/*
* match the virtual addresses, permission and the alignment of the
* page table page.
+ *
+ * Also, vma_lock (vm_private_data) is required for sharing.
*/
if (pmd_index(addr) != pmd_index(saddr) ||
vm_flags != svm_flags ||
- sbase < svma->vm_start || svma->vm_end < s_end)
+ !range_in_vma(svma, sbase, s_end) ||
+ !svma->vm_private_data)
return 0;
return saddr;
}
-static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
{
- unsigned long base = addr & PUD_MASK;
- unsigned long end = base + PUD_SIZE;
+ unsigned long start = addr & PUD_MASK;
+ unsigned long end = start + PUD_SIZE;
+#ifdef CONFIG_USERFAULTFD
+ if (uffd_disable_huge_pmd_share(vma))
+ return false;
+#endif
/*
* check on proper vm_flags and page table alignment
*/
- if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
- return true;
- return false;
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ return false;
+ if (!vma->vm_private_data) /* vma lock required for sharing */
+ return false;
+ if (!range_in_vma(vma, start, end))
+ return false;
+ return true;
}
/*
@@ -5316,43 +7119,37 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
- unsigned long a_start, a_end;
+ unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
+ v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
- if (!(vma->vm_flags & VM_MAYSHARE))
+ /*
+ * vma needs to span at least one aligned PUD size, and the range
+	 * must be at least partially within it.
+ */
+ if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
+ (*end <= v_start) || (*start >= v_end))
return;
/* Extend the range to be PUD aligned for a worst case scenario */
- a_start = ALIGN_DOWN(*start, PUD_SIZE);
- a_end = ALIGN(*end, PUD_SIZE);
+ if (*start > v_start)
+ *start = ALIGN_DOWN(*start, PUD_SIZE);
- /*
- * Intersect the range with the vma range, since pmd sharing won't be
- * across vma after all
- */
- *start = max(vma->vm_start, a_start);
- *end = min(vma->vm_end, a_end);
+ if (*end < v_end)
+ *end = ALIGN(*end, PUD_SIZE);
}
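
A standalone demonstration of the rounding performed above, assuming 4 KiB pages and a 1 GiB PUD size as on x86-64 (ALIGN/ALIGN_DOWN are redefined locally for the demo): the flush range is only widened to PUD boundaries where the VMA itself covers a whole PUD, since only there can page tables be shared.

#include <stdio.h>

#define PUD_SIZE	(1UL << 30)		/* 1 GiB, x86-64 assumption */
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))
#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))

/* Userspace model of adjust_range_if_pmd_sharing_possible() for a
 * MAP_SHARED vma spanning [vm_start, vm_end). */
static void adjust_range(unsigned long vm_start, unsigned long vm_end,
			 unsigned long *start, unsigned long *end)
{
	unsigned long v_start = ALIGN(vm_start, PUD_SIZE);
	unsigned long v_end = ALIGN_DOWN(vm_end, PUD_SIZE);

	if (v_end <= v_start || *end <= v_start || *start >= v_end)
		return;		/* no PUD-aligned region to share */

	if (*start > v_start)
		*start = ALIGN_DOWN(*start, PUD_SIZE);
	if (*end < v_end)
		*end = ALIGN(*end, PUD_SIZE);
}

int main(void)
{
	/* vma covers 4 GiB starting at 1 GiB; unmap 2 MiB in the middle */
	unsigned long start = 0x80000000UL + 0x200000UL;
	unsigned long end = start + 0x200000UL;

	adjust_range(0x40000000UL, 0x140000000UL, &start, &end);
	printf("flush range widened to [%#lx, %#lx)\n", start, end);
	return 0;
}
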
/*
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
* !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner.
- *
- * This routine must be called with i_mmap_rwsem held in at least read mode if
- * sharing is possible. For hugetlbfs, this prevents removal of any page
- * table entries associated with the address space. This is important as we
- * are setting up sharing based on existing page table entries (mappings).
- *
- * NOTE: This routine is only called from huge_pte_alloc. Some callers of
- * huge_pte_alloc know that sharing is not possible and do not take
- * i_mmap_rwsem as a performance optimization. This is handled by the
- * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
- * only required for subsequent processing.
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
*/
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pud)
{
- struct vm_area_struct *vma = find_vma(mm, addr);
struct address_space *mapping = vma->vm_file->f_mapping;
pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
vma->vm_pgoff;
@@ -5360,20 +7157,16 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
unsigned long saddr;
pte_t *spte = NULL;
pte_t *pte;
- spinlock_t *ptl;
- if (!vma_shareable(vma, addr))
- return (pte_t *)pmd_alloc(mm, pud, addr);
-
- i_mmap_assert_locked(mapping);
+ i_mmap_lock_read(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
saddr = page_table_shareable(svma, vma, addr, idx);
if (saddr) {
- spte = huge_pte_offset(svma->vm_mm, saddr,
- vma_mmu_pagesize(svma));
+ spte = hugetlb_walk(svma, saddr,
+ vma_mmu_pagesize(svma));
if (spte) {
get_page(virt_to_page(spte));
break;
@@ -5384,7 +7177,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!spte)
goto out;
- ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
+ spin_lock(&mm->page_table_lock);
if (pud_none(*pud)) {
pud_populate(mm, pud,
(pmd_t *)((unsigned long)spte & PAGE_MASK));
@@ -5392,9 +7185,10 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
} else {
put_page(virt_to_page(spte));
}
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ i_mmap_unlock_read(mapping);
return pte;
}
@@ -5405,19 +7199,20 @@ out:
* indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
- * Called with page table lock held and i_mmap_rwsem held in write mode.
+ * Called with page table lock held.
*
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
*/
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long *addr, pte_t *ptep)
+ unsigned long addr, pte_t *ptep)
{
- pgd_t *pgd = pgd_offset(mm, *addr);
- p4d_t *p4d = p4d_offset(pgd, *addr);
- pud_t *pud = pud_offset(p4d, *addr);
+ pgd_t *pgd = pgd_offset(mm, addr);
+ p4d_t *p4d = p4d_offset(pgd, addr);
+ pud_t *pud = pud_offset(p4d, addr);
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+ hugetlb_vma_assert_locked(vma);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
if (page_count(virt_to_page(ptep)) == 1)
return 0;
@@ -5425,18 +7220,19 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
pud_clear(pud);
put_page(virt_to_page(ptep));
mm_dec_nr_pmds(mm);
- *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
return 1;
}
-#define want_pmd_share() (1)
+
#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pud)
{
return NULL;
}
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long *addr, pte_t *ptep)
+ unsigned long addr, pte_t *ptep)
{
return 0;
}
@@ -5445,11 +7241,15 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
}
-#define want_pmd_share() (0)
+
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
+{
+ return false;
+}
#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz)
{
pgd_t *pgd;
@@ -5467,13 +7267,18 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
pte = (pte_t *)pud;
} else {
BUG_ON(sz != PMD_SIZE);
- if (want_pmd_share() && pud_none(*pud))
- pte = huge_pmd_share(mm, addr, pud);
+ if (want_pmd_share(vma, addr) && pud_none(*pud))
+ pte = huge_pmd_share(mm, vma, addr, pud);
else
pte = (pte_t *)pmd_alloc(mm, pud, addr);
}
}
- BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
+
+ if (pte) {
+ pte_t pteval = ptep_get_lockless(pte);
+
+ BUG_ON(pte_present(pteval) && !pte_huge(pteval));
+ }
return pte;
}
@@ -5515,135 +7320,109 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
return (pte_t *)pmd;
}
-#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
-
/*
- * These functions are overwritable if your architecture needs its own
- * behavior.
+ * Return a mask that can be used to advance an address to the last huge
+ * page mapped by a single page table page. Used to skip non-present
+ * page table entries when linearly scanning address ranges. Architectures
+ * with unique huge page to page table relationships can define their own
+ * version of this routine.
*/
-struct page * __weak
-follow_huge_addr(struct mm_struct *mm, unsigned long address,
- int write)
+unsigned long hugetlb_mask_last_page(struct hstate *h)
{
- return ERR_PTR(-EINVAL);
-}
+ unsigned long hp_size = huge_page_size(h);
-struct page * __weak
-follow_huge_pd(struct vm_area_struct *vma,
- unsigned long address, hugepd_t hpd, int flags, int pdshift)
-{
- WARN(1, "hugepd follow called with no support for hugepage directory format\n");
- return NULL;
+ if (hp_size == PUD_SIZE)
+ return P4D_SIZE - PUD_SIZE;
+ else if (hp_size == PMD_SIZE)
+ return PUD_SIZE - PMD_SIZE;
+ else
+ return 0UL;
}
-struct page * __weak
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd, int flags)
-{
- struct page *page = NULL;
- spinlock_t *ptl;
- pte_t pte;
-
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
- (FOLL_PIN | FOLL_GET)))
- return NULL;
+#else
-retry:
- ptl = pmd_lockptr(mm, pmd);
- spin_lock(ptl);
- /*
- * make sure that the address range covered by this pmd is not
- * unmapped from other threads.
- */
- if (!pmd_huge(*pmd))
- goto out;
- pte = huge_ptep_get((pte_t *)pmd);
- if (pte_present(pte)) {
- page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
- /*
- * try_grab_page() should always succeed here, because: a) we
- * hold the pmd (ptl) lock, and b) we've just checked that the
- * huge pmd (head) page is present in the page tables. The ptl
- * prevents the head page and tail pages from being rearranged
- * in any way. So this page must be available at this point,
- * unless the page refcount overflowed:
- */
- if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
- page = NULL;
- goto out;
- }
- } else {
- if (is_hugetlb_entry_migration(pte)) {
- spin_unlock(ptl);
- __migration_entry_wait(mm, (pte_t *)pmd, ptl);
- goto retry;
- }
- /*
- * hwpoisoned entry is treated as no_page_table in
- * follow_page_mask().
- */
- }
-out:
- spin_unlock(ptl);
- return page;
+/* See description above. Architectures can provide their own version. */
+__weak unsigned long hugetlb_mask_last_page(struct hstate *h)
+{
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+ if (huge_page_size(h) == PMD_SIZE)
+ return PUD_SIZE - PMD_SIZE;
+#endif
+ return 0UL;
}
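
The mask returned here feeds the "address |= last_addr_mask" pattern used by the walkers earlier in this diff: when a PMD/PUD slot is empty, the walker jumps straight to the last huge page that slot could have mapped, and the loop increment then lands on the next slot. A standalone demo with illustrative x86-64 constants (2 MiB PMD huge pages under 1 GiB PUD entries):

#include <stdio.h>

#define PMD_SIZE	(1UL << 21)	/* 2 MiB, illustrative x86-64 value */
#define PUD_SIZE	(1UL << 30)	/* 1 GiB */

int main(void)
{
	unsigned long last_addr_mask = PUD_SIZE - PMD_SIZE;	/* hugetlb_mask_last_page() */
	unsigned long address = 0x40000000UL;			/* start of an empty PMD table */

	/* Walker finds no page table at this address: skip the whole table. */
	address |= last_addr_mask;	/* now at the last 2 MiB slot of this PUD */
	address += PMD_SIZE;		/* loop increment lands on the next PUD  */

	printf("next address to walk: %#lx\n", address);	/* 0x80000000 */
	return 0;
}
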
-struct page * __weak
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int flags)
+#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+
+/*
+ * These functions are overwritable if your architecture needs its own
+ * behavior.
+ */
+bool isolate_hugetlb(struct folio *folio, struct list_head *list)
{
- if (flags & (FOLL_GET | FOLL_PIN))
- return NULL;
+ bool ret = true;
- return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
+ spin_lock_irq(&hugetlb_lock);
+ if (!folio_test_hugetlb(folio) ||
+ !folio_test_hugetlb_migratable(folio) ||
+ !folio_try_get(folio)) {
+ ret = false;
+ goto unlock;
+ }
+ folio_clear_hugetlb_migratable(folio);
+ list_move_tail(&folio->lru, list);
+unlock:
+ spin_unlock_irq(&hugetlb_lock);
+ return ret;
}
-struct page * __weak
-follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
+int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison)
{
- if (flags & (FOLL_GET | FOLL_PIN))
- return NULL;
+ int ret = 0;
- return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
+ *hugetlb = false;
+ spin_lock_irq(&hugetlb_lock);
+ if (folio_test_hugetlb(folio)) {
+ *hugetlb = true;
+ if (folio_test_hugetlb_freed(folio))
+ ret = 0;
+ else if (folio_test_hugetlb_migratable(folio) || unpoison)
+ ret = folio_try_get(folio);
+ else
+ ret = -EBUSY;
+ }
+ spin_unlock_irq(&hugetlb_lock);
+ return ret;
}
-bool isolate_huge_page(struct page *page, struct list_head *list)
+int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+ bool *migratable_cleared)
{
- bool ret = true;
+ int ret;
- VM_BUG_ON_PAGE(!PageHead(page), page);
- spin_lock(&hugetlb_lock);
- if (!page_huge_active(page) || !get_page_unless_zero(page)) {
- ret = false;
- goto unlock;
- }
- clear_page_huge_active(page);
- list_move_tail(&page->lru, list);
-unlock:
- spin_unlock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
+ ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
+ spin_unlock_irq(&hugetlb_lock);
return ret;
}
-void putback_active_hugepage(struct page *page)
+void folio_putback_active_hugetlb(struct folio *folio)
{
- VM_BUG_ON_PAGE(!PageHead(page), page);
- spin_lock(&hugetlb_lock);
- set_page_huge_active(page);
- list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
- spin_unlock(&hugetlb_lock);
- put_page(page);
+ spin_lock_irq(&hugetlb_lock);
+ folio_set_hugetlb_migratable(folio);
+ list_move_tail(&folio->lru, &(folio_hstate(folio))->hugepage_activelist);
+ spin_unlock_irq(&hugetlb_lock);
+ folio_put(folio);
}
-void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
+void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason)
{
- struct hstate *h = page_hstate(oldpage);
+ struct hstate *h = folio_hstate(old_folio);
- hugetlb_cgroup_migrate(oldpage, newpage);
- set_page_owner_migrate_reason(newpage, reason);
+ hugetlb_cgroup_migrate(old_folio, new_folio);
+ set_page_owner_migrate_reason(&new_folio->page, reason);
/*
- * transfer temporary state of the new huge page. This is
+ * transfer temporary state of the new hugetlb folio. This is
* reverse to other transitions because the newpage is going to
* be final while the old one will be freed so it takes over
* the temporary status.
@@ -5652,28 +7431,122 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
* here as well otherwise the global surplus count will not match
* the per-node's.
*/
- if (PageHugeTemporary(newpage)) {
- int old_nid = page_to_nid(oldpage);
- int new_nid = page_to_nid(newpage);
+ if (folio_test_hugetlb_temporary(new_folio)) {
+ int old_nid = folio_nid(old_folio);
+ int new_nid = folio_nid(new_folio);
+
+ folio_set_hugetlb_temporary(old_folio);
+ folio_clear_hugetlb_temporary(new_folio);
- SetPageHugeTemporary(oldpage);
- ClearPageHugeTemporary(newpage);
- spin_lock(&hugetlb_lock);
+ /*
+ * There is no need to transfer the per-node surplus state
+ * when we do not cross the node.
+ */
+ if (new_nid == old_nid)
+ return;
+ spin_lock_irq(&hugetlb_lock);
if (h->surplus_huge_pages_node[old_nid]) {
h->surplus_huge_pages_node[old_nid]--;
h->surplus_huge_pages_node[new_nid]++;
}
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
}
}
+static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end)
+{
+ struct hstate *h = hstate_vma(vma);
+ unsigned long sz = huge_page_size(h);
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_notifier_range range;
+ unsigned long address;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ return;
+
+ if (start >= end)
+ return;
+
+ flush_cache_range(vma, start, end);
+ /*
+ * No need to call adjust_range_if_pmd_sharing_possible(), because
+ * we have already done the PUD_SIZE alignment.
+ */
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+ start, end);
+ mmu_notifier_invalidate_range_start(&range);
+ hugetlb_vma_lock_write(vma);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ for (address = start; address < end; address += PUD_SIZE) {
+ ptep = hugetlb_walk(vma, address, sz);
+ if (!ptep)
+ continue;
+ ptl = huge_pte_lock(h, mm, ptep);
+ huge_pmd_unshare(mm, vma, address, ptep);
+ spin_unlock(ptl);
+ }
+ flush_hugetlb_tlb_range(vma, start, end);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
+ /*
+ * No need to call mmu_notifier_invalidate_range(), see
+ * Documentation/mm/mmu_notifier.rst.
+ */
+ mmu_notifier_invalidate_range_end(&range);
+}
+
+/*
+ * This function will unconditionally remove all the shared pmd pgtable entries
+ * within the specific vma for a hugetlbfs memory range.
+ */
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+{
+ hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
+ ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+}
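A small standalone sketch of the PUD_SIZE clamping that hugetlb_unshare_all_pmds() applies to the VMA bounds; ALIGN()/ALIGN_DOWN() are re-implemented here for the demo, and the 1 GiB PUD_SIZE and the addresses are assumptions:

#include <stdio.h>

#define PUD_SIZE		(1UL << 30)	/* assumed 1 GiB */
#define ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))
#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))

int main(void)
{
	unsigned long vm_start = 0x7f0012345000UL;	/* made-up VMA bounds */
	unsigned long vm_end   = 0x7f8087654000UL;

	/* Only spans covering whole PUD regions can have shared PMD tables. */
	unsigned long start = ALIGN(vm_start, PUD_SIZE);
	unsigned long end   = ALIGN_DOWN(vm_end, PUD_SIZE);

	if (start >= end)
		printf("nothing to unshare\n");
	else
		printf("unshare range [%#lx, %#lx)\n", start, end);
	return 0;
}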
+
#ifdef CONFIG_CMA
static bool cma_reserve_called __initdata;
static int __init cmdline_parse_hugetlb_cma(char *p)
{
- hugetlb_cma_size = memparse(p, &p);
+ int nid, count = 0;
+ unsigned long tmp;
+ char *s = p;
+
+ while (*s) {
+ if (sscanf(s, "%lu%n", &tmp, &count) != 1)
+ break;
+
+ if (s[count] == ':') {
+ if (tmp >= MAX_NUMNODES)
+ break;
+ nid = array_index_nospec(tmp, MAX_NUMNODES);
+
+ s += count + 1;
+ tmp = memparse(s, &s);
+ hugetlb_cma_size_in_node[nid] = tmp;
+ hugetlb_cma_size += tmp;
+
+ /*
+ * Skip the separator if there is one; otherwise
+ * stop parsing.
+ */
+ if (*s == ',')
+ s++;
+ else
+ break;
+ } else {
+ hugetlb_cma_size = memparse(p, &p);
+ break;
+ }
+ }
+
return 0;
}
@@ -5682,6 +7555,7 @@ early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
void __init hugetlb_cma_reserve(int order)
{
unsigned long size, reserved, per_node;
+ bool node_specific_cma_alloc = false;
int nid;
cma_reserve_called = true;
@@ -5689,30 +7563,72 @@ void __init hugetlb_cma_reserve(int order)
if (!hugetlb_cma_size)
return;
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
+ if (hugetlb_cma_size_in_node[nid] == 0)
+ continue;
+
+ if (!node_online(nid)) {
+ pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
+ hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
+ hugetlb_cma_size_in_node[nid] = 0;
+ continue;
+ }
+
+ if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
+ pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
+ nid, (PAGE_SIZE << order) / SZ_1M);
+ hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
+ hugetlb_cma_size_in_node[nid] = 0;
+ } else {
+ node_specific_cma_alloc = true;
+ }
+ }
+
+ /* Validate the CMA size again in case some invalid nodes were specified. */
+ if (!hugetlb_cma_size)
+ return;
+
if (hugetlb_cma_size < (PAGE_SIZE << order)) {
pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
(PAGE_SIZE << order) / SZ_1M);
+ hugetlb_cma_size = 0;
return;
}
- /*
- * If 3 GB area is requested on a machine with 4 numa nodes,
- * let's allocate 1 GB on first three nodes and ignore the last one.
- */
- per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
- pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
- hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
+ if (!node_specific_cma_alloc) {
+ /*
+ * If 3 GB area is requested on a machine with 4 numa nodes,
+ * let's allocate 1 GB on first three nodes and ignore the last one.
+ */
+ per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
+ pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
+ hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
+ }
reserved = 0;
- for_each_node_state(nid, N_ONLINE) {
+ for_each_online_node(nid) {
int res;
char name[CMA_MAX_NAME];
- size = min(per_node, hugetlb_cma_size - reserved);
+ if (node_specific_cma_alloc) {
+ if (hugetlb_cma_size_in_node[nid] == 0)
+ continue;
+
+ size = hugetlb_cma_size_in_node[nid];
+ } else {
+ size = min(per_node, hugetlb_cma_size - reserved);
+ }
+
size = round_up(size, PAGE_SIZE << order);
snprintf(name, sizeof(name), "hugetlb%d", nid);
- res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
+ /*
+ * Note that 'order per bit' is based on the smallest size that
+ * may be returned to the CMA allocator in the case of
+ * huge page demotion.
+ */
+ res = cma_declare_contiguous_nid(0, size, 0,
+ PAGE_SIZE << HUGETLB_PAGE_ORDER,
0, false, name,
&hugetlb_cma[nid], nid);
if (res) {
@@ -5728,9 +7644,16 @@ void __init hugetlb_cma_reserve(int order)
if (reserved >= hugetlb_cma_size)
break;
}
+
+ if (!reserved)
+ /*
+ * hugetlb_cma_size is used to determine if allocations from
+ * cma are possible. Set to zero if no cma regions are set up.
+ */
+ hugetlb_cma_size = 0;
}
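A worked sketch of the even split taken when no per-node sizes were given: DIV_ROUND_UP spreads the request across online nodes, each share is rounded up to a whole gigantic page, and the loop stops once the request is covered, which is how a 3 GiB request on 4 nodes becomes 1 GiB on the first three nodes. The 1 GiB gigantic page size is an assumption for the demo:

#include <stdio.h>

#define SZ_1G			(1UL << 30)
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define round_up(x, a)		(DIV_ROUND_UP(x, a) * (a))

int main(void)
{
	unsigned long cma_size = 3 * SZ_1G;	/* requested total */
	unsigned long huge_size = SZ_1G;	/* assumed gigantic page size */
	int nr_online_nodes = 4;
	unsigned long per_node = DIV_ROUND_UP(cma_size, nr_online_nodes);
	unsigned long reserved = 0, size;
	int nid;

	for (nid = 0; nid < nr_online_nodes; nid++) {
		size = per_node < cma_size - reserved ?
		       per_node : cma_size - reserved;
		size = round_up(size, huge_size);	/* whole huge pages only */
		printf("node %d: reserve %lu MiB\n", nid, size >> 20);
		reserved += size;
		if (reserved >= cma_size)
			break;
	}
	return 0;
}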
-void __init hugetlb_cma_check(void)
+static void __init hugetlb_cma_check(void)
{
if (!hugetlb_cma_size || cma_reserve_called)
return;
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 1f87aec9ab5c..dedd2edb076e 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -27,9 +27,6 @@
#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
-#define hugetlb_cgroup_from_counter(counter, idx) \
- container_of(counter, struct hugetlb_cgroup, hugepage[idx])
-
static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
static inline struct page_counter *
@@ -78,15 +75,12 @@ parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
{
- int idx;
+ struct hstate *h;
- for (idx = 0; idx < hugetlb_max_hstate; idx++) {
+ for_each_hstate(h) {
if (page_counter_read(
- hugetlb_cgroup_counter_from_cgroup(h_cg, idx)) ||
- page_counter_read(hugetlb_cgroup_counter_from_cgroup_rsvd(
- h_cg, idx))) {
+ hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h))))
return true;
- }
}
return false;
}
@@ -116,7 +110,7 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
rsvd_parent);
limit = round_down(PAGE_COUNTER_MAX,
- 1 << huge_page_order(&hstates[idx]));
+ pages_per_huge_page(&hstates[idx]));
ret = page_counter_set_max(
hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx),
@@ -129,29 +123,58 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
}
}
+static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup)
+{
+ int node;
+
+ for_each_node(node)
+ kfree(h_cgroup->nodeinfo[node]);
+ kfree(h_cgroup);
+}
+
static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
struct hugetlb_cgroup *h_cgroup;
+ int node;
+
+ h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids),
+ GFP_KERNEL);
- h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
if (!h_cgroup)
return ERR_PTR(-ENOMEM);
if (!parent_h_cgroup)
root_h_cgroup = h_cgroup;
+ /*
+ * TODO: this routine can waste much memory for nodes which will
+ * never be onlined. It's better to use memory hotplug callback
+ * function.
+ */
+ for_each_node(node) {
+ /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */
+ int node_to_alloc =
+ node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE;
+ h_cgroup->nodeinfo[node] =
+ kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
+ GFP_KERNEL, node_to_alloc);
+ if (!h_cgroup->nodeinfo[node])
+ goto fail_alloc_nodeinfo;
+ }
+
hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
return &h_cgroup->css;
+
+fail_alloc_nodeinfo:
+ hugetlb_cgroup_free(h_cgroup);
+ return ERR_PTR(-ENOMEM);
}
static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
- struct hugetlb_cgroup *h_cgroup;
-
- h_cgroup = hugetlb_cgroup_from_css(css);
- kfree(h_cgroup);
+ hugetlb_cgroup_free(hugetlb_cgroup_from_css(css));
}
/*
@@ -168,8 +191,9 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
struct page_counter *counter;
struct hugetlb_cgroup *page_hcg;
struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
+ struct folio *folio = page_folio(page);
- page_hcg = hugetlb_cgroup_from_page(page);
+ page_hcg = hugetlb_cgroup_from_folio(folio);
/*
* We can have pages in active list without any cgroup
* ie, hugepage with less than 3 pages. We can safely
@@ -188,7 +212,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
/* Take the pages off the local counter */
page_counter_cancel(counter, nr_pages);
- set_hugetlb_cgroup(page, parent);
+ set_hugetlb_cgroup(folio, parent);
out:
return;
}
@@ -202,16 +226,14 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
struct hstate *h;
struct page *page;
- int idx = 0;
do {
for_each_hstate(h) {
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
list_for_each_entry(page, &h->hugepage_activelist, lru)
- hugetlb_cgroup_move_parent(idx, h_cg, page);
+ hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page);
- spin_unlock(&hugetlb_lock);
- idx++;
+ spin_unlock_irq(&hugetlb_lock);
}
cond_resched();
} while (hugetlb_cgroup_have_usage(h_cg));
@@ -288,44 +310,54 @@ int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
/* Should be called with hugetlb_lock held */
static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg,
- struct page *page, bool rsvd)
+ struct folio *folio, bool rsvd)
{
if (hugetlb_cgroup_disabled() || !h_cg)
return;
- __set_hugetlb_cgroup(page, h_cg, rsvd);
- return;
+ __set_hugetlb_cgroup(folio, h_cg, rsvd);
+ if (!rsvd) {
+ unsigned long usage =
+ h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
+ /*
+ * This write is not atomic due to fetching usage and writing
+ * to it, but that's fine because we call this with
+ * hugetlb_lock held anyway.
+ */
+ WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
+ usage + nr_pages);
+ }
}
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg,
- struct page *page)
+ struct folio *folio)
{
- __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, false);
+ __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false);
}
void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg,
- struct page *page)
+ struct folio *folio)
{
- __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, true);
+ __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true);
}
/*
* Should be called with hugetlb_lock held
*/
-static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
- struct page *page, bool rsvd)
+static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
+ struct folio *folio, bool rsvd)
{
struct hugetlb_cgroup *h_cg;
if (hugetlb_cgroup_disabled())
return;
lockdep_assert_held(&hugetlb_lock);
- h_cg = __hugetlb_cgroup_from_page(page, rsvd);
+ h_cg = __hugetlb_cgroup_from_folio(folio, rsvd);
if (unlikely(!h_cg))
return;
- __set_hugetlb_cgroup(page, NULL, rsvd);
+ __set_hugetlb_cgroup(folio, NULL, rsvd);
page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
rsvd),
@@ -333,20 +365,29 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
if (rsvd)
css_put(&h_cg->css);
-
- return;
+ else {
+ unsigned long usage =
+ h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
+ /*
+ * This write is not atomic due to fetching usage and writing
+ * to it, but that's fine because we call this with
+ * hugetlb_lock held anyway.
+ */
+ WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
+ usage - nr_pages);
+ }
}
-void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
- struct page *page)
+void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
+ struct folio *folio)
{
- __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, false);
+ __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false);
}
-void hugetlb_cgroup_uncharge_page_rsvd(int idx, unsigned long nr_pages,
- struct page *page)
+void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages,
+ struct folio *folio)
{
- __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, true);
+ __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true);
}
static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
@@ -393,16 +434,22 @@ void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start,
void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
struct file_region *rg,
- unsigned long nr_pages)
+ unsigned long nr_pages,
+ bool region_del)
{
if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
return;
- if (rg->reservation_counter && resv->pages_per_hpage && nr_pages > 0 &&
+ if (rg->reservation_counter && resv->pages_per_hpage &&
!resv->reservation_counter) {
page_counter_uncharge(rg->reservation_counter,
nr_pages * resv->pages_per_hpage);
- css_put(rg->css);
+ /*
+ * Only do css_put(rg->css) when we delete the entire region
+ * because one file_region must hold exactly one css reference.
+ */
+ if (region_del)
+ css_put(rg->css);
}
}
@@ -417,6 +464,59 @@ enum {
RES_RSVD_FAILCNT,
};
+static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
+{
+ int nid;
+ struct cftype *cft = seq_cft(seq);
+ int idx = MEMFILE_IDX(cft->private);
+ bool legacy = MEMFILE_ATTR(cft->private);
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
+ struct cgroup_subsys_state *css;
+ unsigned long usage;
+
+ if (legacy) {
+ /* Add up usage across all nodes for the non-hierarchical total. */
+ usage = 0;
+ for_each_node_state(nid, N_MEMORY)
+ usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);
+ seq_printf(seq, "total=%lu", usage * PAGE_SIZE);
+
+ /* Simply print the per-node usage for the non-hierarchical total. */
+ for_each_node_state(nid, N_MEMORY)
+ seq_printf(seq, " N%d=%lu", nid,
+ READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) *
+ PAGE_SIZE);
+ seq_putc(seq, '\n');
+ }
+
+ /*
+ * The hierarchical total is pretty much the value recorded by the
+ * counter, so use that.
+ */
+ seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "",
+ page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE);
+
+ /*
+ * For each node, traverse the css tree to obtain the hierarchical
+ * node usage.
+ */
+ for_each_node_state(nid, N_MEMORY) {
+ usage = 0;
+ rcu_read_lock();
+ css_for_each_descendant_pre(css, &h_cg->css) {
+ usage += READ_ONCE(hugetlb_cgroup_from_css(css)
+ ->nodeinfo[nid]
+ ->usage[idx]);
+ }
+ rcu_read_unlock();
+ seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE);
+ }
+
+ seq_putc(seq, '\n');
+
+ return 0;
+}
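Based on the seq_printf() calls above, reading the legacy (cgroup v1) flavour of this file would produce output shaped like the following, with all numbers invented for illustration; the cgroup v2 flavour prints only the second line and without the hierarchical_ prefix:

total=2147483648 N0=2147483648 N1=0
hierarchical_total=4294967296 N0=2147483648 N1=2147483648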
+
static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -462,7 +562,7 @@ static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
counter = &h_cg->hugepage[idx];
limit = round_down(PAGE_COUNTER_MAX,
- 1 << huge_page_order(&hstates[idx]));
+ pages_per_huge_page(&hstates[idx]));
switch (MEMFILE_ATTR(cft->private)) {
case RES_RSVD_USAGE:
@@ -509,7 +609,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
return ret;
idx = MEMFILE_IDX(of_cft(of)->private);
- nr_pages = round_down(nr_pages, 1 << huge_page_order(&hstates[idx]));
+ nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx]));
switch (MEMFILE_ATTR(of_cft(of)->private)) {
case RES_RSVD_LIMIT:
@@ -573,12 +673,12 @@ static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
- if (hsize >= (1UL << 30))
- snprintf(buf, size, "%luGB", hsize >> 30);
- else if (hsize >= (1UL << 20))
- snprintf(buf, size, "%luMB", hsize >> 20);
+ if (hsize >= SZ_1G)
+ snprintf(buf, size, "%luGB", hsize / SZ_1G);
+ else if (hsize >= SZ_1M)
+ snprintf(buf, size, "%luMB", hsize / SZ_1M);
else
- snprintf(buf, size, "%luKB", hsize >> 10);
+ snprintf(buf, size, "%luKB", hsize / SZ_1K);
return buf;
}
@@ -667,8 +767,15 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx)
events_local_file[idx]);
cft->flags = CFTYPE_NOT_ON_ROOT;
- /* NULL terminate the last cft */
+ /* Add the numa stat file */
cft = &h->cgroup_files_dfl[6];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
+ cft->private = MEMFILE_PRIVATE(idx, 0);
+ cft->seq_show = hugetlb_cgroup_read_numa_stat;
+ cft->flags = CFTYPE_NOT_ON_ROOT;
+
+ /* NULL terminate the last cft */
+ cft = &h->cgroup_files_dfl[7];
memset(cft, 0, sizeof(*cft));
WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
@@ -738,8 +845,14 @@ static void __init __hugetlb_cgroup_file_legacy_init(int idx)
cft->write = hugetlb_cgroup_reset;
cft->read_u64 = hugetlb_cgroup_read_u64;
- /* NULL terminate the last cft */
+ /* Add the numa stat file */
cft = &h->cgroup_files_legacy[8];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
+ cft->private = MEMFILE_PRIVATE(idx, 1);
+ cft->seq_show = hugetlb_cgroup_read_numa_stat;
+
+ /* NULL terminate the last cft */
+ cft = &h->cgroup_files_legacy[9];
memset(cft, 0, sizeof(*cft));
WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
@@ -771,27 +884,26 @@ void __init hugetlb_cgroup_file_init(void)
* hugetlb_lock will make sure a parallel cgroup rmdir won't happen
* when we migrate hugepages
*/
-void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
+void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio)
{
struct hugetlb_cgroup *h_cg;
struct hugetlb_cgroup *h_cg_rsvd;
- struct hstate *h = page_hstate(oldhpage);
+ struct hstate *h = folio_hstate(old_folio);
if (hugetlb_cgroup_disabled())
return;
- VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
- spin_lock(&hugetlb_lock);
- h_cg = hugetlb_cgroup_from_page(oldhpage);
- h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage);
- set_hugetlb_cgroup(oldhpage, NULL);
- set_hugetlb_cgroup_rsvd(oldhpage, NULL);
+ spin_lock_irq(&hugetlb_lock);
+ h_cg = hugetlb_cgroup_from_folio(old_folio);
+ h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio);
+ set_hugetlb_cgroup(old_folio, NULL);
+ set_hugetlb_cgroup_rsvd(old_folio, NULL);
/* move the h_cg details to new cgroup */
- set_hugetlb_cgroup(newhpage, h_cg);
- set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd);
- list_move(&newhpage->lru, &h->hugepage_activelist);
- spin_unlock(&hugetlb_lock);
+ set_hugetlb_cgroup(new_folio, h_cg);
+ set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd);
+ list_move(&new_folio->lru, &h->hugepage_activelist);
+ spin_unlock_irq(&hugetlb_lock);
return;
}
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
new file mode 100644
index 000000000000..4b9734777f69
--- /dev/null
+++ b/mm/hugetlb_vmemmap.c
@@ -0,0 +1,599 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * HugeTLB Vmemmap Optimization (HVO)
+ *
+ * Copyright (c) 2020, ByteDance. All rights reserved.
+ *
+ * Author: Muchun Song <songmuchun@bytedance.com>
+ *
+ * See Documentation/mm/vmemmap_dedup.rst
+ */
+#define pr_fmt(fmt) "HugeTLB: " fmt
+
+#include <linux/pgtable.h>
+#include <linux/moduleparam.h>
+#include <linux/bootmem_info.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include "hugetlb_vmemmap.h"
+
+/**
+ * struct vmemmap_remap_walk - walk vmemmap page table
+ *
+ * @remap_pte: called for each lowest-level entry (PTE).
+ * @nr_walked: the number of walked pte.
+ * @reuse_page: the page which is reused for the tail vmemmap pages.
+ * @reuse_addr: the virtual address of the @reuse_page page.
+ * @vmemmap_pages: the list head of the vmemmap pages that can be freed
+ * or is mapped from.
+ */
+struct vmemmap_remap_walk {
+ void (*remap_pte)(pte_t *pte, unsigned long addr,
+ struct vmemmap_remap_walk *walk);
+ unsigned long nr_walked;
+ struct page *reuse_page;
+ unsigned long reuse_addr;
+ struct list_head *vmemmap_pages;
+};
+
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
+{
+ pmd_t __pmd;
+ int i;
+ unsigned long addr = start;
+ struct page *head;
+ pte_t *pgtable;
+
+ spin_lock(&init_mm.page_table_lock);
+ head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
+ spin_unlock(&init_mm.page_table_lock);
+
+ if (!head)
+ return 0;
+
+ pgtable = pte_alloc_one_kernel(&init_mm);
+ if (!pgtable)
+ return -ENOMEM;
+
+ pmd_populate_kernel(&init_mm, &__pmd, pgtable);
+
+ for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
+ pte_t entry, *pte;
+ pgprot_t pgprot = PAGE_KERNEL;
+
+ entry = mk_pte(head + i, pgprot);
+ pte = pte_offset_kernel(&__pmd, addr);
+ set_pte_at(&init_mm, addr, pte, entry);
+ }
+
+ spin_lock(&init_mm.page_table_lock);
+ if (likely(pmd_leaf(*pmd))) {
+ /*
+ * Higher order allocations from the buddy allocator must be able to
+ * be treated as independent small pages (as they can be freed
+ * individually).
+ */
+ if (!PageReserved(head))
+ split_page(head, get_order(PMD_SIZE));
+
+ /* Make pte visible before pmd. See comment in pmd_install(). */
+ smp_wmb();
+ pmd_populate_kernel(&init_mm, pmd, pgtable);
+ flush_tlb_kernel_range(start, start + PMD_SIZE);
+ } else {
+ pte_free_kernel(&init_mm, pgtable);
+ }
+ spin_unlock(&init_mm.page_table_lock);
+
+ return 0;
+}
+
+static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end,
+ struct vmemmap_remap_walk *walk)
+{
+ pte_t *pte = pte_offset_kernel(pmd, addr);
+
+ /*
+ * The reuse_page is found 'first' in the table walk, before we start
+ * remapping (i.e. before calling @walk->remap_pte).
+ */
+ if (!walk->reuse_page) {
+ walk->reuse_page = pte_page(ptep_get(pte));
+ /*
+ * Because the reuse address is part of the range that we are
+ * walking, skip the reuse address range.
+ */
+ addr += PAGE_SIZE;
+ pte++;
+ walk->nr_walked++;
+ }
+
+ for (; addr != end; addr += PAGE_SIZE, pte++) {
+ walk->remap_pte(pte, addr, walk);
+ walk->nr_walked++;
+ }
+}
+
+static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
+ unsigned long end,
+ struct vmemmap_remap_walk *walk)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ int ret;
+
+ ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
+ if (ret)
+ return ret;
+
+ next = pmd_addr_end(addr, end);
+ vmemmap_pte_range(pmd, addr, next, walk);
+ } while (pmd++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
+ unsigned long end,
+ struct vmemmap_remap_walk *walk)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_offset(p4d, addr);
+ do {
+ int ret;
+
+ next = pud_addr_end(addr, end);
+ ret = vmemmap_pmd_range(pud, addr, next, walk);
+ if (ret)
+ return ret;
+ } while (pud++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
+ unsigned long end,
+ struct vmemmap_remap_walk *walk)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ int ret;
+
+ next = p4d_addr_end(addr, end);
+ ret = vmemmap_pud_range(p4d, addr, next, walk);
+ if (ret)
+ return ret;
+ } while (p4d++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int vmemmap_remap_range(unsigned long start, unsigned long end,
+ struct vmemmap_remap_walk *walk)
+{
+ unsigned long addr = start;
+ unsigned long next;
+ pgd_t *pgd;
+
+ VM_BUG_ON(!PAGE_ALIGNED(start));
+ VM_BUG_ON(!PAGE_ALIGNED(end));
+
+ pgd = pgd_offset_k(addr);
+ do {
+ int ret;
+
+ next = pgd_addr_end(addr, end);
+ ret = vmemmap_p4d_range(pgd, addr, next, walk);
+ if (ret)
+ return ret;
+ } while (pgd++, addr = next, addr != end);
+
+ flush_tlb_kernel_range(start, end);
+
+ return 0;
+}
+
+/*
+ * Free a vmemmap page. A vmemmap page can be allocated from the memblock
+ * allocator or the buddy allocator. If the PG_reserved flag is set, the page
+ * was allocated from the memblock allocator, so free it via
+ * free_bootmem_page(). Otherwise, use __free_page().
+ */
+static inline void free_vmemmap_page(struct page *page)
+{
+ if (PageReserved(page))
+ free_bootmem_page(page);
+ else
+ __free_page(page);
+}
+
+/* Free a list of the vmemmap pages */
+static void free_vmemmap_page_list(struct list_head *list)
+{
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next, list, lru)
+ free_vmemmap_page(page);
+}
+
+static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
+ struct vmemmap_remap_walk *walk)
+{
+ /*
+ * Remap the tail pages as read-only to catch illegal write operation
+ * to the tail pages.
+ */
+ pgprot_t pgprot = PAGE_KERNEL_RO;
+ struct page *page = pte_page(ptep_get(pte));
+ pte_t entry;
+
+ /* Remapping the head page requires r/w */
+ if (unlikely(addr == walk->reuse_addr)) {
+ pgprot = PAGE_KERNEL;
+ list_del(&walk->reuse_page->lru);
+
+ /*
+ * Makes sure that preceding stores to the page contents from
+ * vmemmap_remap_free() become visible before the set_pte_at()
+ * write.
+ */
+ smp_wmb();
+ }
+
+ entry = mk_pte(walk->reuse_page, pgprot);
+ list_add_tail(&page->lru, walk->vmemmap_pages);
+ set_pte_at(&init_mm, addr, pte, entry);
+}
+
+/*
+ * How many struct page structs need to be reset. When we reuse the head
+ * struct page, the special metadata (e.g. page->flags or page->mapping)
+ * cannot be copied to the tail struct page structs. The invalid values
+ * will be caught by free_tail_page_prepare(). To avoid its "corrupted
+ * mapping in tail page" report, we need to reset at least 3 struct page
+ * structs (one head struct page and two tail struct pages).
+ */
+#define NR_RESET_STRUCT_PAGE 3
+
+static inline void reset_struct_pages(struct page *start)
+{
+ struct page *from = start + NR_RESET_STRUCT_PAGE;
+
+ BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
+ memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
+}
+
+static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
+ struct vmemmap_remap_walk *walk)
+{
+ pgprot_t pgprot = PAGE_KERNEL;
+ struct page *page;
+ void *to;
+
+ BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);
+
+ page = list_first_entry(walk->vmemmap_pages, struct page, lru);
+ list_del(&page->lru);
+ to = page_to_virt(page);
+ copy_page(to, (void *)walk->reuse_addr);
+ reset_struct_pages(to);
+
+ /*
+ * Makes sure that preceding stores to the page contents become visible
+ * before the set_pte_at() write.
+ */
+ smp_wmb();
+ set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+}
+
+/**
+ * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
+ * to the page which @reuse is mapped to, then free vmemmap
+ * which the range are mapped to.
+ * @start: start address of the vmemmap virtual address range that we want
+ * to remap.
+ * @end: end address of the vmemmap virtual address range that we want to
+ * remap.
+ * @reuse: reuse address.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+static int vmemmap_remap_free(unsigned long start, unsigned long end,
+ unsigned long reuse)
+{
+ int ret;
+ LIST_HEAD(vmemmap_pages);
+ struct vmemmap_remap_walk walk = {
+ .remap_pte = vmemmap_remap_pte,
+ .reuse_addr = reuse,
+ .vmemmap_pages = &vmemmap_pages,
+ };
+ int nid = page_to_nid((struct page *)start);
+ gfp_t gfp_mask = GFP_KERNEL | __GFP_THISNODE | __GFP_NORETRY |
+ __GFP_NOWARN;
+
+ /*
+ * Allocate a new head vmemmap page to avoid breaking a contiguous
+ * block of struct page memory when freeing it back to the page
+ * allocator in free_vmemmap_page_list(). This keeps the likely
+ * contiguous struct page backing memory contiguous and allows for
+ * more hugepage allocations. Fall back to the currently mapped
+ * head page if the allocation fails.
+ */
+ walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
+ if (walk.reuse_page) {
+ copy_page(page_to_virt(walk.reuse_page),
+ (void *)walk.reuse_addr);
+ list_add(&walk.reuse_page->lru, &vmemmap_pages);
+ }
+
+ /*
+ * In order to make the remapping routine most efficient for huge pages,
+ * the vmemmap page table walking routine has the following rules
+ * (see vmemmap_pte_range() for more details):
+ *
+ * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
+ * should be continuous.
+ * - The @reuse address is part of the range [@reuse, @end) that we are
+ * walking which is passed to vmemmap_remap_range().
+ * - The @reuse address is the first in the complete range.
+ *
+ * So we need to make sure that @start and @reuse meet the above rules.
+ */
+ BUG_ON(start - reuse != PAGE_SIZE);
+
+ mmap_read_lock(&init_mm);
+ ret = vmemmap_remap_range(reuse, end, &walk);
+ if (ret && walk.nr_walked) {
+ end = reuse + walk.nr_walked * PAGE_SIZE;
+ /*
+ * vmemmap_pages contains pages from the previous
+ * vmemmap_remap_range call which failed. These
+ * are pages which were removed from the vmemmap.
+ * They will be restored in the following call.
+ */
+ walk = (struct vmemmap_remap_walk) {
+ .remap_pte = vmemmap_restore_pte,
+ .reuse_addr = reuse,
+ .vmemmap_pages = &vmemmap_pages,
+ };
+
+ vmemmap_remap_range(reuse, end, &walk);
+ }
+ mmap_read_unlock(&init_mm);
+
+ free_vmemmap_page_list(&vmemmap_pages);
+
+ return ret;
+}
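To make the @reuse/@start relationship concrete, here is a hedged userspace calculation assuming 4 KiB base pages and a 64-byte struct page (both architecture dependent); the vmemmap address is made up:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;			/* assumed PAGE_SIZE */
	unsigned long struct_page_sz = 64;		/* assumed sizeof(struct page) */
	unsigned long pages_per_hpage = (2UL << 20) / page_size; /* 2 MiB huge page */

	unsigned long vmemmap_bytes = pages_per_hpage * struct_page_sz;
	unsigned long vmemmap_pages = vmemmap_bytes / page_size;

	unsigned long reuse = 0xffffea0000000000UL;	/* made-up vmemmap address */
	unsigned long start = reuse + page_size;	/* start - reuse == PAGE_SIZE */
	unsigned long end   = reuse + vmemmap_bytes;

	printf("vmemmap pages per 2 MiB huge page: %lu\n", vmemmap_pages);
	printf("walk [%#lx, %#lx): keep the page at %#lx, free %lu tail pages\n",
	       reuse, end, reuse, vmemmap_pages - 1);
	printf("start = %#lx (reuse + one page)\n", start);
	return 0;
}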
+
+static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
+ struct list_head *list)
+{
+ gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_THISNODE;
+ unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
+ int nid = page_to_nid((struct page *)start);
+ struct page *page, *next;
+
+ while (nr_pages--) {
+ page = alloc_pages_node(nid, gfp_mask, 0);
+ if (!page)
+ goto out;
+ list_add_tail(&page->lru, list);
+ }
+
+ return 0;
+out:
+ list_for_each_entry_safe(page, next, list, lru)
+ __free_page(page);
+ return -ENOMEM;
+}
+
+/**
+ * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
+ * to pages taken from the freshly allocated
+ * vmemmap_pages list.
+ * @start: start address of the vmemmap virtual address range that we want
+ * to remap.
+ * @end: end address of the vmemmap virtual address range that we want to
+ * remap.
+ * @reuse: reuse address.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
+ unsigned long reuse)
+{
+ LIST_HEAD(vmemmap_pages);
+ struct vmemmap_remap_walk walk = {
+ .remap_pte = vmemmap_restore_pte,
+ .reuse_addr = reuse,
+ .vmemmap_pages = &vmemmap_pages,
+ };
+
+ /* See the comment in the vmemmap_remap_free(). */
+ BUG_ON(start - reuse != PAGE_SIZE);
+
+ if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
+ return -ENOMEM;
+
+ mmap_read_lock(&init_mm);
+ vmemmap_remap_range(reuse, end, &walk);
+ mmap_read_unlock(&init_mm);
+
+ return 0;
+}
+
+DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
+EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
+
+static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
+core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
+
+/**
+ * hugetlb_vmemmap_restore - restore previously optimized (by
+ * hugetlb_vmemmap_optimize()) vmemmap pages which
+ * will be reallocated and remapped.
+ * @h: struct hstate.
+ * @head: the head page whose vmemmap pages will be restored.
+ *
+ * Return: %0 if @head's vmemmap pages have been reallocated and remapped,
+ * negative error code otherwise.
+ */
+int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
+{
+ int ret;
+ unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
+ unsigned long vmemmap_reuse;
+
+ if (!HPageVmemmapOptimized(head))
+ return 0;
+
+ vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
+ vmemmap_reuse = vmemmap_start;
+ vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;
+
+ /*
+ * The pages which the vmemmap virtual address range [@vmemmap_start,
+ * @vmemmap_end) are mapped to are freed to the buddy allocator, and
+ * the range is mapped to the page which @vmemmap_reuse is mapped to.
+ * When a HugeTLB page is freed to the buddy allocator, the previously
+ * discarded vmemmap pages must be allocated and remapped.
+ */
+ ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse);
+ if (!ret) {
+ ClearHPageVmemmapOptimized(head);
+ static_branch_dec(&hugetlb_optimize_vmemmap_key);
+ }
+
+ return ret;
+}
+
+/* Return true iff the HugeTLB page's vmemmap should and can be optimized. */
+static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
+{
+ if (!READ_ONCE(vmemmap_optimize_enabled))
+ return false;
+
+ if (!hugetlb_vmemmap_optimizable(h))
+ return false;
+
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
+ pmd_t *pmdp, pmd;
+ struct page *vmemmap_page;
+ unsigned long vaddr = (unsigned long)head;
+
+ /*
+ * Only the vmemmap page's vmemmap page can be self-hosted.
+ * Walk the page tables to find the backing page of the
+ * vmemmap page.
+ */
+ pmdp = pmd_off_k(vaddr);
+ /*
+ * The READ_ONCE() is used to stabilize *pmdp in a register or
+ * on the stack so that it will stop changing under the code.
+ * The only concurrent operation where it can be changed is
+ * split_vmemmap_huge_pmd() (*pmdp will be stable after this
+ * operation).
+ */
+ pmd = READ_ONCE(*pmdp);
+ if (pmd_leaf(pmd))
+ vmemmap_page = pmd_page(pmd) + pte_index(vaddr);
+ else
+ vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr));
+ /*
+ * Due to HugeTLB alignment requirements, and because the vmemmap
+ * pages sit at the start of the hotplugged memory region in the
+ * memory_hotplug.memmap_on_memory case, checking whether any one
+ * vmemmap page's vmemmap page is marked VmemmapSelfHosted is
+ * sufficient.
+ *
+ * [ hotplugged memory ]
+ * [ section ][...][ section ]
+ * [ vmemmap ][ usable memory ]
+ * ^ | | |
+ * +---+ | |
+ * ^ | |
+ * +-------+ |
+ * ^ |
+ * +-------------------------------------------+
+ */
+ if (PageVmemmapSelfHosted(vmemmap_page))
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages.
+ * @h: struct hstate.
+ * @head: the head page whose vmemmap pages will be optimized.
+ *
+ * This function only tries to optimize @head's vmemmap pages and does not
+ * guarantee that the optimization will succeed after it returns. The caller
+ * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages
+ * have been optimized.
+ */
+void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
+{
+ unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
+ unsigned long vmemmap_reuse;
+
+ if (!vmemmap_should_optimize(h, head))
+ return;
+
+ static_branch_inc(&hugetlb_optimize_vmemmap_key);
+
+ vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
+ vmemmap_reuse = vmemmap_start;
+ vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;
+
+ /*
+ * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
+ * to the page which @vmemmap_reuse is mapped to, then free the pages
+ * which the range [@vmemmap_start, @vmemmap_end] is mapped to.
+ */
+ if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse))
+ static_branch_dec(&hugetlb_optimize_vmemmap_key);
+ else
+ SetHPageVmemmapOptimized(head);
+}
+
+static struct ctl_table hugetlb_vmemmap_sysctls[] = {
+ {
+ .procname = "hugetlb_optimize_vmemmap",
+ .data = &vmemmap_optimize_enabled,
+ .maxlen = sizeof(vmemmap_optimize_enabled),
+ .mode = 0644,
+ .proc_handler = proc_dobool,
+ },
+ { }
+};
+
+static int __init hugetlb_vmemmap_init(void)
+{
+ const struct hstate *h;
+
+ /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
+ BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE);
+
+ for_each_hstate(h) {
+ if (hugetlb_vmemmap_optimizable(h)) {
+ register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
+ break;
+ }
+ }
+ return 0;
+}
+late_initcall(hugetlb_vmemmap_init);
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
new file mode 100644
index 000000000000..25bd0e002431
--- /dev/null
+++ b/mm/hugetlb_vmemmap.h
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * HugeTLB Vmemmap Optimization (HVO)
+ *
+ * Copyright (c) 2020, ByteDance. All rights reserved.
+ *
+ * Author: Muchun Song <songmuchun@bytedance.com>
+ */
+#ifndef _LINUX_HUGETLB_VMEMMAP_H
+#define _LINUX_HUGETLB_VMEMMAP_H
+#include <linux/hugetlb.h>
+
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
+void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head);
+
+/*
+ * Reserve one vmemmap page; all vmemmap addresses are mapped to it. See
+ * Documentation/vm/vmemmap_dedup.rst.
+ */
+#define HUGETLB_VMEMMAP_RESERVE_SIZE PAGE_SIZE
+
+static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h)
+{
+ return pages_per_huge_page(h) * sizeof(struct page);
+}
+
+/*
+ * Return how much of the vmemmap associated with a HugeTLB page can be
+ * optimized away and freed to the buddy allocator.
+ */
+static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
+{
+ int size = hugetlb_vmemmap_size(h) - HUGETLB_VMEMMAP_RESERVE_SIZE;
+
+ if (!is_power_of_2(sizeof(struct page)))
+ return 0;
+ return size > 0 ? size : 0;
+}
+#else
+static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
+{
+ return 0;
+}
+
+static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
+{
+}
+
+static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
+{
+ return 0;
+}
+#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
+
+static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h)
+{
+ return hugetlb_vmemmap_optimizable_size(h) != 0;
+}
+#endif /* _LINUX_HUGETLB_VMEMMAP_H */
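For a rough sense of the savings these helpers report, here is a hedged userspace calculation assuming 4 KiB pages and a 64-byte struct page (both architecture dependent), for the common 2 MiB and 1 GiB hstates:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;		/* assumed PAGE_SIZE */
	unsigned long struct_page_sz = 64;	/* assumed sizeof(struct page) */
	unsigned long reserve = page_size;	/* HUGETLB_VMEMMAP_RESERVE_SIZE */
	unsigned long sizes[] = { 2UL << 20, 1UL << 30 };
	int i;

	for (i = 0; i < 2; i++) {
		unsigned long vmemmap = sizes[i] / page_size * struct_page_sz;
		unsigned long optimizable = vmemmap > reserve ? vmemmap - reserve : 0;

		printf("%4lu MiB huge page: vmemmap %lu KiB, optimizable %lu KiB\n",
		       sizes[i] >> 20, vmemmap >> 10, optimizable >> 10);
	}
	return 0;
}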
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index e488876b168a..d0548e382b6b 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -26,37 +26,30 @@ static int hwpoison_inject(void *data, u64 val)
p = pfn_to_page(pfn);
hpage = compound_head(p);
- /*
- * This implies unable to support free buddy pages.
- */
- if (!get_hwpoison_page(p))
- return 0;
if (!hwpoison_filter_enable)
goto inject;
- shake_page(hpage, 0);
+ shake_page(hpage);
/*
- * This implies unable to support non-LRU pages.
+ * This implies unable to support non-LRU pages except free page.
*/
- if (!PageLRU(hpage) && !PageHuge(p))
- goto put_out;
+ if (!PageLRU(hpage) && !PageHuge(p) && !is_free_buddy_page(p))
+ return 0;
/*
- * do a racy check with elevated page count, to make sure PG_hwpoison
- * will only be set for the targeted owner (or on a free page).
+ * do a racy check to make sure PG_hwpoison will only be set for
+ * the targeted owner (or on a free page).
* memory_failure() will redo the check reliably inside page lock.
*/
err = hwpoison_filter(hpage);
if (err)
- goto put_out;
+ return 0;
inject:
pr_info("Injecting memory failure at pfn %#lx\n", pfn);
- return memory_failure(pfn, MF_COUNT_INCREASED);
-put_out:
- put_hwpoison_page(p);
- return 0;
+ err = memory_failure(pfn, MF_SW_SIMULATED);
+ return (err == -EOPNOTSUPP) ? 0 : err;
}
static int hwpoison_unpoison(void *data, u64 val)
@@ -70,12 +63,13 @@ static int hwpoison_unpoison(void *data, u64 val)
DEFINE_DEBUGFS_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
DEFINE_DEBUGFS_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
-static void pfn_inject_exit(void)
+static void __exit pfn_inject_exit(void)
{
+ hwpoison_filter_enable = 0;
debugfs_remove_recursive(hwpoison_dir);
}
-static int pfn_inject_init(void)
+static int __init pfn_inject_init(void)
{
hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 3a613c85f9ed..efa97b57acfd 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm_types.h>
-#include <linux/rbtree.h>
+#include <linux/maple_tree.h>
#include <linux/rwsem.h>
#include <linux/spinlock.h>
#include <linux/list.h>
@@ -10,6 +10,7 @@
#include <linux/atomic.h>
#include <linux/user_namespace.h>
+#include <linux/iommu.h>
#include <asm/mmu.h>
#ifndef INIT_MM_CONTEXT
@@ -27,15 +28,31 @@
* and size this cpu_bitmask to NR_CPUS.
*/
struct mm_struct init_mm = {
- .mm_rb = RB_ROOT,
+ .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, init_mm.mmap_lock),
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
+ .write_protect_seq = SEQCNT_ZERO(init_mm.write_protect_seq),
MMAP_LOCK_INITIALIZER(init_mm)
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
+#ifdef CONFIG_PER_VMA_LOCK
+ .mm_lock_seq = 0,
+#endif
.user_ns = &init_user_ns,
.cpu_bitmap = CPU_BITS_NONE,
+#ifdef CONFIG_IOMMU_SVA
+ .pasid = IOMMU_PASID_INVALID,
+#endif
INIT_MM_CONTEXT(init_mm)
};
+
+void setup_initial_init_mm(void *start_code, void *end_code,
+ void *end_data, void *brk)
+{
+ init_mm.start_code = (unsigned long)start_code;
+ init_mm.end_code = (unsigned long)end_code;
+ init_mm.end_data = (unsigned long)end_data;
+ init_mm.brk = (unsigned long)brk;
+}
diff --git a/mm/internal.h b/mm/internal.h
index a801a4d51f26..8ed127c1c808 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -10,8 +10,11 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
+#include <linux/rmap.h>
#include <linux/tracepoint-defs.h>
+struct folio_batch;
+
/*
* The set of flags that only affect watermark checking and reclaim
* behaviour. This is used by the MM to obey the caller constraints
@@ -21,7 +24,7 @@
#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
__GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
- __GFP_ATOMIC)
+ __GFP_NOLOCKDEP)
/* The GFP flags allowed during early boot */
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
@@ -32,61 +35,126 @@
/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
+/*
+ * Different from WARN_ON_ONCE(), no warning will be issued
+ * when we specify __GFP_NOWARN.
+ */
+#define WARN_ON_ONCE_GFP(cond, gfp) ({ \
+ static bool __section(".data.once") __warned; \
+ int __ret_warn_once = !!(cond); \
+ \
+ if (unlikely(!(gfp & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \
+ __warned = true; \
+ WARN_ON(1); \
+ } \
+ unlikely(__ret_warn_once); \
+})
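A hedged usage sketch of the macro above, rebuilt as a userspace stand-in so it can run outside the kernel: the point is that callers pass the allocation's gfp mask, so requests made with __GFP_NOWARN never trigger the one-shot warning. The try_alloc() function and the __GFP_NOWARN bit value are invented for the demo:

#include <stdbool.h>
#include <stdio.h>

#define __GFP_NOWARN	0x2000u		/* assumed bit value for the demo */

/* Userspace stand-in: warn at most once, unless the caller opted out. */
#define WARN_ON_ONCE_GFP(cond, gfp) ({					\
	static bool __warned;						\
	int __ret_warn_once = !!(cond);					\
									\
	if (!((gfp) & __GFP_NOWARN) && __ret_warn_once && !__warned) {	\
		__warned = true;					\
		fprintf(stderr, "WARNING: %s:%d\n", __FILE__, __LINE__);\
	}								\
	__ret_warn_once;						\
})

static void *try_alloc(unsigned int order, unsigned int gfp)
{
	/* Reject absurd orders, but stay silent for __GFP_NOWARN callers. */
	if (WARN_ON_ONCE_GFP(order > 10, gfp))
		return NULL;
	return (void *)1;	/* pretend the allocation succeeded */
}

int main(void)
{
	try_alloc(12, 0);		/* warns once */
	try_alloc(12, 0);		/* already warned: silent */
	try_alloc(12, __GFP_NOWARN);	/* caller opted out: silent */
	return 0;
}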
+
void page_writeback_init(void);
-vm_fault_t do_swap_page(struct vm_fault *vmf);
+/*
+ * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages,
+ * its nr_pages_mapped would be 0x400000: choose the COMPOUND_MAPPED bit
+ * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently
+ * leaves nr_pages_mapped at 0, but avoid surprise if it participates later.
+ */
+#define COMPOUND_MAPPED 0x800000
+#define FOLIO_PAGES_MAPPED (COMPOUND_MAPPED - 1)
-void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
- unsigned long floor, unsigned long ceiling);
+/*
+ * How many individual pages have an elevated _mapcount. Excludes
+ * the folio's entire_mapcount.
+ */
+static inline int folio_nr_pages_mapped(struct folio *folio)
+{
+ return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED;
+}
+
+static inline void *folio_raw_mapping(struct folio *folio)
+{
+ unsigned long mapping = (unsigned long)folio->mapping;
-static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
+ return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
+}
+
+void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
+ int nr_throttled);
+static inline void acct_reclaim_writeback(struct folio *folio)
{
- return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
+ pg_data_t *pgdat = folio_pgdat(folio);
+ int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled);
+
+ if (nr_throttled)
+ __acct_reclaim_writeback(pgdat, folio, nr_throttled);
}
+static inline void wake_throttle_isolated(pg_data_t *pgdat)
+{
+ wait_queue_head_t *wqh;
+
+ wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED];
+ if (waitqueue_active(wqh))
+ wake_up(wqh);
+}
+
+vm_fault_t do_swap_page(struct vm_fault *vmf);
+void folio_rotate_reclaimable(struct folio *folio);
+bool __folio_end_writeback(struct folio *folio);
+void deactivate_file_folio(struct folio *folio);
+void folio_activate(struct folio *folio);
+
+void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
+ struct vm_area_struct *start_vma, unsigned long floor,
+ unsigned long ceiling, bool mm_wr_locked);
+void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
+
+struct zap_details;
void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
struct zap_details *details);
-void force_page_cache_readahead(struct address_space *, struct file *,
- pgoff_t index, unsigned long nr_to_read);
-void __do_page_cache_readahead(struct address_space *, struct file *,
- pgoff_t index, unsigned long nr_to_read,
- unsigned long lookahead_size);
-
-/*
- * Submit IO for the read-ahead request in file_ra_state.
- */
-static inline void ra_submit(struct file_ra_state *ra,
- struct address_space *mapping, struct file *filp)
+void page_cache_ra_order(struct readahead_control *, struct file_ra_state *,
+ unsigned int order);
+void force_page_cache_ra(struct readahead_control *, unsigned long nr);
+static inline void force_page_cache_readahead(struct address_space *mapping,
+ struct file *file, pgoff_t index, unsigned long nr_to_read)
{
- __do_page_cache_readahead(mapping, filp,
- ra->start, ra->size, ra->async_size);
+ DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index);
+ force_page_cache_ra(&ractl, nr_to_read);
}
-struct page *find_get_entry(struct address_space *mapping, pgoff_t index);
-struct page *find_lock_entry(struct address_space *mapping, pgoff_t index);
+unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
+ pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
+unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
+ pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
+void filemap_free_folio(struct address_space *mapping, struct folio *folio);
+int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
+bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
+ loff_t end);
+long invalidate_inode_page(struct page *page);
+unsigned long mapping_try_invalidate(struct address_space *mapping,
+ pgoff_t start, pgoff_t end, unsigned long *nr_failed);
/**
- * page_evictable - test whether a page is evictable
- * @page: the page to test
- *
- * Test whether page is evictable--i.e., should be placed on active/inactive
- * lists vs unevictable list.
+ * folio_evictable - Test whether a folio is evictable.
+ * @folio: The folio to test.
*
- * Reasons page might not be evictable:
- * (1) page's mapping marked unevictable
- * (2) page is part of an mlocked VMA
+ * Test whether @folio is evictable -- i.e., should be placed on
+ * active/inactive lists vs unevictable list.
*
+ * Reasons folio might not be evictable:
+ * 1. folio's mapping marked unevictable
+ * 2. One of the pages in the folio is part of an mlocked VMA
*/
-static inline bool page_evictable(struct page *page)
+static inline bool folio_evictable(struct folio *folio)
{
bool ret;
/* Prevent address_space of inode and swap cache from being freed */
rcu_read_lock();
- ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
+ ret = !mapping_unevictable(folio_mapping(folio)) &&
+ !folio_test_mlocked(folio);
rcu_read_unlock();
return ret;
}
@@ -113,17 +181,33 @@ extern unsigned long highest_memmap_pfn;
/*
* in mm/vmscan.c:
*/
-extern int isolate_lru_page(struct page *page);
-extern void putback_lru_page(struct page *page);
+bool isolate_lru_page(struct page *page);
+bool folio_isolate_lru(struct folio *folio);
+void putback_lru_page(struct page *page);
+void folio_putback_lru(struct folio *folio);
+extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
/*
* in mm/rmap.c:
*/
-extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
+pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
/*
* in mm/page_alloc.c
*/
+#define K(x) ((x) << (PAGE_SHIFT-10))
+
+extern char * const zone_names[MAX_NR_ZONES];
+
+/* perform sanity checks on struct pages being allocated or freed */
+DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
+
+extern int min_free_kbytes;
+
+void setup_per_zone_wmarks(void);
+void calculate_min_free_kbytes(void);
+int __meminit init_per_zone_wmark_min(void);
+void page_alloc_sysctl_init(void);
/*
* Structure for holding the mostly immutable allocation parameters passed
@@ -131,10 +215,10 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
* family of functions.
*
* nodemask, migratetype and highest_zoneidx are initialized only once in
- * __alloc_pages_nodemask() and then never change.
+ * __alloc_pages() and then never change.
*
* zonelist, preferred_zone and highest_zoneidx are set first in
- * __alloc_pages_nodemask() for the fast path, and might be later changed
+ * __alloc_pages() for the fast path, and might be later changed
* in __alloc_pages_slowpath(). All other functions pass the whole structure
* by a const pointer.
*/
@@ -159,6 +243,67 @@ struct alloc_context {
};
/*
+ * This function returns the order of a free page in the buddy system. In
+ * general, page_zone(page)->lock must be held by the caller to prevent the
+ * page from being allocated in parallel and returning garbage as the order.
+ * If a caller does not hold page_zone(page)->lock, it must guarantee that the
+ * page cannot be allocated or merged in parallel. Alternatively, it must
+ * handle invalid values gracefully, and use buddy_order_unsafe() below.
+ */
+static inline unsigned int buddy_order(struct page *page)
+{
+ /* PageBuddy() must be checked by the caller */
+ return page_private(page);
+}
+
+/*
+ * Like buddy_order(), but for callers who cannot afford to hold the zone lock.
+ * PageBuddy() should be checked first by the caller to minimize race window,
+ * and invalid values must be handled gracefully.
+ *
+ * READ_ONCE is used so that if the caller assigns the result into a local
+ * variable and e.g. tests it for valid range before using, the compiler cannot
+ * decide to remove the variable and inline the page_private(page) multiple
+ * times, potentially observing different values in the tests and the actual
+ * use of the result.
+ */
+#define buddy_order_unsafe(page) READ_ONCE(page_private(page))
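The safe-use pattern described in the comment above, sketched in userspace: read the possibly changing value exactly once into a local, validate the local, then use only the local. Here page_private_demo stands in for page_private(page) and MAX_ORDER is assumed:

#include <stdio.h>

#define MAX_ORDER	10		/* assumed maximum buddy order */
#define READ_ONCE(x)	(*(const volatile typeof(x) *)&(x))

static unsigned long page_private_demo = 3;	/* stand-in for page_private(page) */

int main(void)
{
	unsigned long order = READ_ONCE(page_private_demo);

	/* Never re-read: a concurrent update could pass one check and fail another. */
	if (order <= MAX_ORDER)
		printf("treating page as a free buddy of order %lu\n", order);
	else
		printf("order %lu looks bogus, ignoring\n", order);
	return 0;
}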
+
+/*
+ * This function checks whether a page is free && is the buddy
+ * we can coalesce a page and its buddy if
+ * (a) the buddy is not in a hole (check before calling!) &&
+ * (b) the buddy is in the buddy system &&
+ * (c) a page and its buddy have the same order &&
+ * (d) a page and its buddy are in the same zone.
+ *
+ * For recording whether a page is in the buddy system, we set PageBuddy.
+ * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
+ *
+ * For recording page's order, we use page_private(page).
+ */
+static inline bool page_is_buddy(struct page *page, struct page *buddy,
+ unsigned int order)
+{
+ if (!page_is_guard(buddy) && !PageBuddy(buddy))
+ return false;
+
+ if (buddy_order(buddy) != order)
+ return false;
+
+ /*
+ * zone check is done late to avoid uselessly calculating
+ * zone/node ids for pages that could never merge.
+ */
+ if (page_zone_id(page) != page_zone_id(buddy))
+ return false;
+
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
+
+ return true;
+}
+
+/*
* Locate the struct page for both the matching buddy in our
* pair (buddy1) and the combined O(n+1) page they form (page).
*
@@ -181,6 +326,35 @@ __find_buddy_pfn(unsigned long page_pfn, unsigned int order)
return page_pfn ^ (1 << order);
}
+/*
+ * Find the buddy of @page and validate it.
+ * @page: The input page
+ * @pfn: The pfn of the page, it saves a call to page_to_pfn() when the
+ * function is used in the performance-critical __free_one_page().
+ * @order: The order of the page
+ * @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to
+ * page_to_pfn().
+ *
+ * The found buddy may not be PageBuddy, may lie outside @page's zone, or may
+ * have a different order than @page. Validate it before use.
+ *
+ * Return: the found buddy page or NULL if not found.
+ */
+static inline struct page *find_buddy_page_pfn(struct page *page,
+ unsigned long pfn, unsigned int order, unsigned long *buddy_pfn)
+{
+ unsigned long __buddy_pfn = __find_buddy_pfn(pfn, order);
+ struct page *buddy;
+
+ buddy = page + (__buddy_pfn - pfn);
+ if (buddy_pfn)
+ *buddy_pfn = __buddy_pfn;
+
+ if (page_is_buddy(page, buddy, order))
+ return buddy;
+ return NULL;
+}
+
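As a concrete illustration of the XOR rule used by __find_buddy_pfn() above: flipping
bit @order of the pfn yields the buddy, and ANDing the two pfns gives the start of the
merged higher-order block. The function below is editorial, not part of the patch.

/* Illustrative only: the buddy XOR rule with concrete numbers. */
static void example_buddy_pfn(void)
{
	unsigned long pfn = 8, order = 3;
	unsigned long buddy_pfn = pfn ^ (1UL << order);	/* 8 ^ 8 == 0 */
	unsigned long combined_pfn = buddy_pfn & pfn;	/* merged order-4 block starts at pfn 0 */

	(void)buddy_pfn;
	(void)combined_pfn;
}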
extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
unsigned long end_pfn, struct zone *zone);
@@ -193,19 +367,79 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
}
+void set_zone_contiguous(struct zone *zone);
+
+static inline void clear_zone_contiguous(struct zone *zone)
+{
+ zone->contiguous = false;
+}
+
extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __putback_isolated_page(struct page *page, unsigned int order,
int mt);
extern void memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order);
extern void __free_pages_core(struct page *page, unsigned int order);
+
+/*
+ * This will have no effect, other than possibly generating a warning, if the
+ * caller passes in a non-large folio.
+ */
+static inline void folio_set_order(struct folio *folio, unsigned int order)
+{
+ if (WARN_ON_ONCE(!order || !folio_test_large(folio)))
+ return;
+
+ folio->_folio_order = order;
+#ifdef CONFIG_64BIT
+ folio->_folio_nr_pages = 1U << order;
+#endif
+}
+
+static inline void prep_compound_head(struct page *page, unsigned int order)
+{
+ struct folio *folio = (struct folio *)page;
+
+ folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
+ folio_set_order(folio, order);
+ atomic_set(&folio->_entire_mapcount, -1);
+ atomic_set(&folio->_nr_pages_mapped, 0);
+ atomic_set(&folio->_pincount, 0);
+}
+
+static inline void prep_compound_tail(struct page *head, int tail_idx)
+{
+ struct page *p = head + tail_idx;
+
+ p->mapping = TAIL_MAPPING;
+ set_compound_head(p, head);
+ set_page_private(p, 0);
+}
+
extern void prep_compound_page(struct page *page, unsigned int order);
+
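For orientation, a hedged sketch of how prep_compound_page() could be composed from the
two helpers above; the actual body in mm/page_alloc.c may differ in detail.

/* Sketch only, assuming the helpers above; may not match page_alloc.c exactly. */
static void example_prep_compound_page(struct page *page, unsigned int order)
{
	int i;
	int nr_pages = 1 << order;

	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++)
		prep_compound_tail(page, i);	/* link each tail to the head */

	prep_compound_head(page, order);	/* set dtor, order and mapcounts */
}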
extern void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags);
extern int user_min_free_kbytes;
-extern void zone_pcp_update(struct zone *zone);
+extern void free_unref_page(struct page *page, unsigned int order);
+extern void free_unref_page_list(struct list_head *list);
+
extern void zone_pcp_reset(struct zone *zone);
+extern void zone_pcp_disable(struct zone *zone);
+extern void zone_pcp_enable(struct zone *zone);
+extern void zone_pcp_init(struct zone *zone);
+
+extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr,
+ int nid, bool exact_nid);
+
+void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
+ unsigned long, enum meminit_context, struct vmem_altmap *, int);
+
+
+int split_free_page(struct page *free_page,
+ unsigned int order, unsigned long split_pfn_offset);
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -225,7 +459,13 @@ struct compact_control {
unsigned int nr_freepages; /* Number of isolated free pages */
unsigned int nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
- unsigned long migrate_pfn; /* isolate_migratepages search base */
+ /*
+ * Acts as an in/out parameter to page isolation for migration.
+ * isolate_migratepages uses it as a search base.
+ * isolate_migratepages_block will update the value to the next pfn
+ * after the last isolated one.
+ */
+ unsigned long migrate_pfn;
unsigned long fast_start_pfn; /* a pfn to start linear scan from */
struct zone *zone;
unsigned long total_migrate_scanned;
@@ -244,8 +484,12 @@ struct compact_control {
bool direct_compaction; /* False from kcompactd or /proc/... */
bool proactive_compaction; /* kcompactd proactive compaction */
bool whole_zone; /* Whole zone should/has been scanned */
- bool contended; /* Signal lock or sched contention */
- bool rescan; /* Rescanning the same pageblock */
+ bool contended; /* Signal lock contention */
+ bool finish_pageblock; /* Scan the remainder of a pageblock. Used
+ * when there are potentially transient
+ * isolation or migration failures to
+ * ensure forward progress.
+ */
bool alloc_contig; /* alloc_contig_range allocation */
};
@@ -261,44 +505,24 @@ struct capture_control {
unsigned long
isolate_freepages_range(struct compact_control *cc,
unsigned long start_pfn, unsigned long end_pfn);
-unsigned long
+int
isolate_migratepages_range(struct compact_control *cc,
unsigned long low_pfn, unsigned long end_pfn);
-int find_suitable_fallback(struct free_area *area, unsigned int order,
- int migratetype, bool only_stealable, bool *can_steal);
-#endif
+int __alloc_contig_migrate_range(struct compact_control *cc,
+ unsigned long start, unsigned long end);
-/*
- * This function returns the order of a free page in the buddy system. In
- * general, page_zone(page)->lock must be held by the caller to prevent the
- * page from being allocated in parallel and returning garbage as the order.
- * If a caller does not hold page_zone(page)->lock, it must guarantee that the
- * page cannot be allocated or merged in parallel. Alternatively, it must
- * handle invalid values gracefully, and use page_order_unsafe() below.
- */
-static inline unsigned int page_order(struct page *page)
-{
- /* PageBuddy() must be checked by the caller */
- return page_private(page);
-}
+/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
+void init_cma_reserved_pageblock(struct page *page);
-/*
- * Like page_order(), but for callers who cannot afford to hold the zone lock.
- * PageBuddy() should be checked first by the caller to minimize race window,
- * and invalid values must be handled gracefully.
- *
- * READ_ONCE is used so that if the caller assigns the result into a local
- * variable and e.g. tests it for valid range before using, the compiler cannot
- * decide to remove the variable and inline the page_private(page) multiple
- * times, potentially observing different values in the tests and the actual
- * use of the result.
- */
-#define page_order_unsafe(page) READ_ONCE(page_private(page))
+#endif /* CONFIG_COMPACTION || CONFIG_CMA */
+
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+ int migratetype, bool only_stealable, bool *can_steal);
-static inline bool is_cow_mapping(vm_flags_t flags)
+static inline bool free_area_empty(struct free_area *area, int migratetype)
{
- return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+ return list_empty(&area->free_list[migratetype]);
}
/*
@@ -314,7 +538,7 @@ static inline bool is_exec_mapping(vm_flags_t flags)
}
/*
- * Stack area - atomatically grows in one direction
+ * Stack area - automatically grows in one direction
*
* VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
* do_mmap() forbids all other combinations.
@@ -333,78 +557,120 @@ static inline bool is_data_mapping(vm_flags_t flags)
}
/* mm/util.c */
-void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev);
-void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);
+struct anon_vma *folio_anon_vma(struct folio *folio);
#ifdef CONFIG_MMU
+void unmap_mapping_folio(struct folio *folio);
extern long populate_vma_page_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end, int *nonblocking);
-extern void munlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end);
-static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
-{
- munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
-}
-
+ unsigned long start, unsigned long end, int *locked);
+extern long faultin_vma_page_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ bool write, int *locked);
+extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
+ unsigned long bytes);
/*
- * must be called with vma's mmap_lock held for read or write, and page locked.
- */
-extern void mlock_vma_page(struct page *page);
-extern unsigned int munlock_vma_page(struct page *page);
-
-/*
- * Clear the page's PageMlocked(). This can be useful in a situation where
- * we want to unconditionally remove a page from the pagecache -- e.g.,
- * on truncation or freeing.
+ * mlock_vma_folio() and munlock_vma_folio():
+ * should be called with vma's mmap_lock held for read or write,
+ * under page table lock for the pte/pmd being added or removed.
*
- * It is legal to call this function for any page, mlocked or not.
- * If called for a page that is still mapped by mlocked vmas, all we do
- * is revert to lazy LRU behaviour -- semantics are not broken.
- */
-extern void clear_page_mlock(struct page *page);
-
-/*
- * mlock_migrate_page - called only from migrate_misplaced_transhuge_page()
- * (because that does not go through the full procedure of migration ptes):
- * to migrate the Mlocked page flag; update statistics.
+ * mlock is usually called at the end of page_add_*_rmap(), munlock at
+ * the end of page_remove_rmap(); but new anon folios are managed by
+ * folio_add_lru_vma() calling mlock_new_folio().
+ *
+ * @compound is used to include pmd mappings of THPs, but filter out
+ * pte mappings of THPs, which cannot be consistently counted: a pte
+ * mapping of the THP head cannot be distinguished by the page alone.
*/
-static inline void mlock_migrate_page(struct page *newpage, struct page *page)
+void mlock_folio(struct folio *folio);
+static inline void mlock_vma_folio(struct folio *folio,
+ struct vm_area_struct *vma, bool compound)
{
- if (TestClearPageMlocked(page)) {
- int nr_pages = thp_nr_pages(page);
+ /*
+ * The VM_SPECIAL check here serves two purposes.
+ * 1) VM_IO check prevents migration from double-counting during mlock.
+ * 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED
+ * is never left set on a VM_SPECIAL vma, there is an interval while
+ * file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may
+ * still be set while VM_SPECIAL bits are added: so ignore it then.
+ */
+ if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
+ (compound || !folio_test_large(folio)))
+ mlock_folio(folio);
+}
- /* Holding pmd lock, no change in irq context: __mod is safe */
- __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
- SetPageMlocked(newpage);
- __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
- }
+void munlock_folio(struct folio *folio);
+static inline void munlock_vma_folio(struct folio *folio,
+ struct vm_area_struct *vma, bool compound)
+{
+ if (unlikely(vma->vm_flags & VM_LOCKED) &&
+ (compound || !folio_test_large(folio)))
+ munlock_folio(folio);
}
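The guard in mlock_vma_folio() above can be restated as a small standalone predicate,
shown below purely for illustration (hypothetical helper name, not part of the patch):
mlock_folio() is reached only when VM_LOCKED is set without any VM_SPECIAL bit, and
pte-mapped large folios are skipped.

/* Illustrative only: which combinations reach mlock_folio(). */
static bool example_would_mlock(vm_flags_t vm_flags, bool compound, bool large)
{
	if ((vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)
		return false;		/* not mlocked, or a VM_SPECIAL mapping */
	return compound || !large;	/* skip pte mappings of large folios */
}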
+void mlock_new_folio(struct folio *folio);
+bool need_mlock_drain(int cpu);
+void mlock_drain_local(void);
+void mlock_drain_remote(int cpu);
+
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
/*
- * At what user virtual address is page expected in @vma?
+ * Return the starting user virtual address of the range that begins at the
+ * given page offset within a vma.
*/
static inline unsigned long
-__vma_address(struct page *page, struct vm_area_struct *vma)
+vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages,
+ struct vm_area_struct *vma)
{
- pgoff_t pgoff = page_to_pgoff(page);
- return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ unsigned long address;
+
+ if (pgoff >= vma->vm_pgoff) {
+ address = vma->vm_start +
+ ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ /* Check for address beyond vma (or wrapped through 0?) */
+ if (address < vma->vm_start || address >= vma->vm_end)
+ address = -EFAULT;
+ } else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) {
+ /* Test above avoids possibility of wrap to 0 on 32-bit */
+ address = vma->vm_start;
+ } else {
+ address = -EFAULT;
+ }
+ return address;
}
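
A worked example of the arithmetic above, with made-up values (editorial, not part of
the patch): a vma starting at vm_start with vm_pgoff 0x10, asked for pgoff 0x12.

/* Hypothetical numbers, 4 KiB pages assumed. */
static unsigned long example_pgoff_to_addr(void)
{
	unsigned long vm_start = 0x7f0000000000UL;
	pgoff_t vm_pgoff = 0x10, pgoff = 0x12;

	/* 0x7f0000000000 + (0x12 - 0x10) * 4096 = 0x7f0000002000 */
	return vm_start + ((pgoff - vm_pgoff) << PAGE_SHIFT);
}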
+/*
+ * Return the start of user virtual address of a page within a vma.
+ * Returns -EFAULT if all of the page is outside the range of vma.
+ * If page is a compound head, the entire compound page is considered.
+ */
static inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
- unsigned long start, end;
-
- start = __vma_address(page, vma);
- end = start + thp_size(page) - PAGE_SIZE;
-
- /* page should be within @vma mapping range */
- VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma);
+ VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
+ return vma_pgoff_address(page_to_pgoff(page), compound_nr(page), vma);
+}
- return max(start, vma->vm_start);
+/*
+ * Then at what user virtual address will none of the range be found in vma?
+ * Assumes that vma_address() already returned a good starting address.
+ */
+static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw)
+{
+ struct vm_area_struct *vma = pvmw->vma;
+ pgoff_t pgoff;
+ unsigned long address;
+
+ /* Common case, plus ->pgoff is invalid for KSM */
+ if (pvmw->nr_pages == 1)
+ return pvmw->address + PAGE_SIZE;
+
+ pgoff = pvmw->pgoff + pvmw->nr_pages;
+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ /* Check for address beyond vma (or wrapped through 0?) */
+ if (address < vma->vm_start || address > vma->vm_end)
+ address = vma->vm_end;
+ return address;
}
static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
@@ -427,43 +693,24 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
}
return fpin;
}
-
#else /* !CONFIG_MMU */
-static inline void clear_page_mlock(struct page *page) { }
-static inline void mlock_vma_page(struct page *page) { }
-static inline void mlock_migrate_page(struct page *new, struct page *old) { }
-
-#endif /* !CONFIG_MMU */
-
-/*
- * Return the mem_map entry representing the 'offset' subpage within
- * the maximally aligned gigantic page 'base'. Handle any discontiguity
- * in the mem_map at MAX_ORDER_NR_PAGES boundaries.
- */
-static inline struct page *mem_map_offset(struct page *base, int offset)
-{
- if (unlikely(offset >= MAX_ORDER_NR_PAGES))
- return nth_page(base, offset);
- return base + offset;
-}
-
-/*
- * Iterator over all subpages within the maximally aligned gigantic
- * page 'base'. Handle any discontiguity in the mem_map.
- */
-static inline struct page *mem_map_next(struct page *iter,
- struct page *base, int offset)
+static inline void unmap_mapping_folio(struct folio *folio) { }
+static inline void mlock_new_folio(struct folio *folio) { }
+static inline bool need_mlock_drain(int cpu) { return false; }
+static inline void mlock_drain_local(void) { }
+static inline void mlock_drain_remote(int cpu) { }
+static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
{
- if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) {
- unsigned long pfn = page_to_pfn(base) + offset;
- if (!pfn_valid(pfn))
- return NULL;
- return pfn_to_page(pfn);
- }
- return iter + 1;
}
+#endif /* !CONFIG_MMU */
/* Memory initialisation debug and verification */
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+DECLARE_STATIC_KEY_TRUE(deferred_pages);
+
+bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
enum mminit_level {
MMINIT_WARNING,
MMINIT_VERIFY,
@@ -502,17 +749,6 @@ static inline void mminit_verify_zonelist(void)
}
#endif /* CONFIG_DEBUG_MEMORY_INIT */
-/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
-#if defined(CONFIG_SPARSEMEM)
-extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
- unsigned long *end_pfn);
-#else
-static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
- unsigned long *end_pfn)
-{
-}
-#endif /* CONFIG_SPARSEMEM */
-
#define NODE_RECLAIM_NOSCAN -2
#define NODE_RECLAIM_FULL -1
#define NODE_RECLAIM_SOME 0
@@ -520,14 +756,22 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
#ifdef CONFIG_NUMA
extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
+extern int find_next_best_node(int node, nodemask_t *used_node_mask);
#else
static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
unsigned int order)
{
return NODE_RECLAIM_NOSCAN;
}
+static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
+{
+ return NUMA_NO_NODE;
+}
#endif
+/*
+ * mm/memory-failure.c
+ */
extern int hwpoison_filter(struct page *p);
extern u32 hwpoison_filter_dev_major;
@@ -542,8 +786,9 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
unsigned long, unsigned long);
extern void set_pageblock_order(void);
+unsigned long reclaim_pages(struct list_head *folio_list);
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
- struct list_head *page_list);
+ struct list_head *folio_list);
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
#define ALLOC_WMARK_MIN WMARK_MIN
#define ALLOC_WMARK_LOW WMARK_LOW
@@ -564,8 +809,13 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_OOM ALLOC_NO_WATERMARKS
#endif
-#define ALLOC_HARDER 0x10 /* try to alloc harder */
-#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
+#define ALLOC_NON_BLOCK 0x10 /* Caller cannot block. Allow access
+ * to 25% of the min watermark or
+ * 62.5% if __GFP_HIGH is set.
+ */
+#define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50%
+ * of the min watermark.
+ */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
#ifdef CONFIG_ZONE_DMA32
@@ -573,8 +823,12 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
#else
#define ALLOC_NOFRAGMENT 0x0
#endif
+#define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */
#define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
+/* Flags that allow allocations below the min watermark. */
+#define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM)
+
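A hedged sketch of how the percentages quoted in the flag comments above could translate
into a reduced min watermark; the real logic lives in __zone_watermark_ok() and may
differ in structure and detail.

/* Sketch only, not the real watermark code. */
static unsigned long example_adjusted_min(unsigned long min, unsigned int alloc_flags)
{
	if (alloc_flags & ALLOC_MIN_RESERVE) {
		min -= min / 2;			/* may dip to 50% of the min watermark */
		if (alloc_flags & ALLOC_NON_BLOCK)
			min -= min / 4;		/* 37.5% left, i.e. 62.5% accessible */
	}
	return min;
}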
enum ttu_flags;
struct tlbflush_unmap_batch;
@@ -602,6 +856,7 @@ static inline void flush_tlb_batched_pending(struct mm_struct *mm)
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
extern const struct trace_print_flags pageflag_names[];
+extern const struct trace_print_flags pagetype_names[];
extern const struct trace_print_flags vmaflag_names[];
extern const struct trace_print_flags gfpflag_names[];
@@ -623,4 +878,245 @@ struct migration_target_control {
gfp_t gfp_mask;
};
+/*
+ * mm/filemap.c
+ */
+size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
+ struct folio *folio, loff_t fpos, size_t size);
+
+/*
+ * mm/vmalloc.c
+ */
+#ifdef CONFIG_MMU
+void __init vmalloc_init(void);
+int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift);
+#else
+static inline void vmalloc_init(void)
+{
+}
+
+static inline
+int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+ return -EINVAL;
+}
+#endif
+
+int __must_check __vmap_pages_range_noflush(unsigned long addr,
+ unsigned long end, pgprot_t prot,
+ struct page **pages, unsigned int page_shift);
+
+void vunmap_range_noflush(unsigned long start, unsigned long end);
+
+void __vunmap_range_noflush(unsigned long start, unsigned long end);
+
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, int page_nid, int *flags);
+
+void free_zone_device_page(struct page *page);
+int migrate_device_coherent_page(struct page *page);
+
+/*
+ * mm/gup.c
+ */
+struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
+int __must_check try_grab_page(struct page *page, unsigned int flags);
+
+/*
+ * mm/huge_memory.c
+ */
+struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmd,
+ unsigned int flags);
+
+enum {
+ /* mark page accessed */
+ FOLL_TOUCH = 1 << 16,
+ /* a retry, previous pass started an IO */
+ FOLL_TRIED = 1 << 17,
+ /* we are working on non-current tsk/mm */
+ FOLL_REMOTE = 1 << 18,
+ /* pages must be released via unpin_user_page */
+ FOLL_PIN = 1 << 19,
+ /* gup_fast: prevent fall-back to slow gup */
+ FOLL_FAST_ONLY = 1 << 20,
+ /* allow unlocking the mmap lock */
+ FOLL_UNLOCKABLE = 1 << 21,
+};
+
+/*
+ * Indicates for which pages that are write-protected in the page table,
+ * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the
+ * GUP pin will remain consistent with the pages mapped into the page tables
+ * of the MM.
+ *
+ * Temporary unmapping of PageAnonExclusive() pages or clearing of
+ * PageAnonExclusive() has to protect against concurrent GUP:
+ * * Ordinary GUP: Using the PT lock
+ * * GUP-fast and fork(): mm->write_protect_seq
+ * * GUP-fast and KSM or temporary unmapping (swap, migration): see
+ * page_try_share_anon_rmap()
+ *
+ * Must be called with the (sub)page that's actually referenced via the
+ * page table entry, which might not necessarily be the head page for a
+ * PTE-mapped THP.
+ *
+ * If the vma is NULL, we're coming from the GUP-fast path and might have
+ * to fall back to the slow path just to look up the vma.
+ */
+static inline bool gup_must_unshare(struct vm_area_struct *vma,
+ unsigned int flags, struct page *page)
+{
+ /*
+ * FOLL_WRITE is implicitly handled correctly as the page table entry
+ * has to be writable -- and if it references (part of) an anonymous
+ * folio, that part is required to be marked exclusive.
+ */
+ if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN)
+ return false;
+ /*
+ * Note: PageAnon(page) is stable until the page is actually getting
+ * freed.
+ */
+ if (!PageAnon(page)) {
+ /*
+ * We only care about R/O long-term pinning: R/O short-term
+ * pinning does not have the semantics to observe successive
+ * changes through the process page tables.
+ */
+ if (!(flags & FOLL_LONGTERM))
+ return false;
+
+ /* We really need the vma ... */
+ if (!vma)
+ return true;
+
+ /*
+ * ... because we only care about writable private ("COW")
+ * mappings where we have to break COW early.
+ */
+ return is_cow_mapping(vma->vm_flags);
+ }
+
+ /* Paired with a memory barrier in page_try_share_anon_rmap(). */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_rmb();
+
+ /*
+ * During GUP-fast we might not get called on the head page for a
+ * hugetlb page that is mapped using cont-PTE, because GUP-fast does
+ * not work with the abstracted hugetlb PTEs that always point at the
+ * head page. For hugetlb, PageAnonExclusive only applies on the head
+ * page (as it cannot be partially COW-shared), so lookup the head page.
+ */
+ if (unlikely(!PageHead(page) && PageHuge(page)))
+ page = compound_head(page);
+
+ /*
+ * Note that PageKsm() pages cannot be exclusive, and consequently,
+ * cannot get pinned.
+ */
+ return !PageAnonExclusive(page);
+}
+
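A hedged caller sketch of gup_must_unshare(): roughly how a GUP path might consult it for
a write-protected PTE and ask the caller to trigger unsharing. The function name is
hypothetical; the real callers in mm/gup.c and mm/huge_memory.c differ in detail.

/* Illustrative only. */
static struct page *example_follow_pte_page(struct vm_area_struct *vma,
		pte_t pte, unsigned int flags)
{
	struct page *page = pte_page(pte);

	if (!pte_write(pte) && gup_must_unshare(vma, flags, page))
		return ERR_PTR(-EMLINK);	/* caller retries with FAULT_FLAG_UNSHARE */

	return page;
}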
+extern bool mirrored_kernelcore;
+
+static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
+{
+ /*
+ * NOTE: we must check this before VM_SOFTDIRTY on soft-dirty
+ * enablements, because when without soft-dirty being compiled in,
+ * VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY)
+ * will be constantly true.
+ */
+ if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+ return false;
+
+ /*
+ * Soft-dirty is kind of special: its tracking is enabled when the
+ * VM_SOFTDIRTY vma flag is not set.
+ */
+ return !(vma->vm_flags & VM_SOFTDIRTY);
+}
+
+/*
+ * VMA Iterator functions shared between nommu and mmap
+ */
+static inline int vma_iter_prealloc(struct vma_iterator *vmi)
+{
+ return mas_preallocate(&vmi->mas, GFP_KERNEL);
+}
+
+static inline void vma_iter_clear(struct vma_iterator *vmi,
+ unsigned long start, unsigned long end)
+{
+ mas_set_range(&vmi->mas, start, end - 1);
+ mas_store_prealloc(&vmi->mas, NULL);
+}
+
+static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
+{
+ return mas_walk(&vmi->mas);
+}
+
+/* Store a VMA with preallocated memory */
+static inline void vma_iter_store(struct vma_iterator *vmi,
+ struct vm_area_struct *vma)
+{
+
+#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
+ if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START &&
+ vmi->mas.index > vma->vm_start)) {
+ pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n",
+ vmi->mas.index, vma->vm_start, vma->vm_start,
+ vma->vm_end, vmi->mas.index, vmi->mas.last);
+ }
+ if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START &&
+ vmi->mas.last < vma->vm_start)) {
+ pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n",
+ vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end,
+ vmi->mas.index, vmi->mas.last);
+ }
+#endif
+
+ if (vmi->mas.node != MAS_START &&
+ ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
+ vma_iter_invalidate(vmi);
+
+ vmi->mas.index = vma->vm_start;
+ vmi->mas.last = vma->vm_end - 1;
+ mas_store_prealloc(&vmi->mas, vma);
+}
+
+static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
+ struct vm_area_struct *vma, gfp_t gfp)
+{
+ if (vmi->mas.node != MAS_START &&
+ ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
+ vma_iter_invalidate(vmi);
+
+ vmi->mas.index = vma->vm_start;
+ vmi->mas.last = vma->vm_end - 1;
+ mas_store_gfp(&vmi->mas, vma, gfp);
+ if (unlikely(mas_is_err(&vmi->mas)))
+ return -ENOMEM;
+
+ return 0;
+}
+
+/*
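A hedged usage sketch of the VMA iterator helpers above: preallocate maple-tree nodes,
then store the VMA over its address range. Hypothetical caller, error handling trimmed.

/* Illustrative only. */
static int example_insert_vma(struct vma_iterator *vmi, struct vm_area_struct *vma)
{
	if (vma_iter_prealloc(vmi))
		return -ENOMEM;

	vma_iter_store(vmi, vma);	/* consumes the preallocated nodes */
	return 0;
}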
+ * VMA lock generalization
+ */
+struct vma_prepare {
+ struct vm_area_struct *vma;
+ struct vm_area_struct *adj_next;
+ struct file *file;
+ struct address_space *mapping;
+ struct anon_vma *anon_vma;
+ struct vm_area_struct *insert;
+ struct vm_area_struct *remove;
+ struct vm_area_struct *remove2;
+};
#endif /* __MM_INTERNAL_H */
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index 11c75fb07584..32e390c42c53 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -22,7 +22,7 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
unsigned long, shared.rb_subtree_last,
- vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
+ vma_start_pgoff, vma_last_pgoff, /* empty */, vma_interval_tree)
/* Insert node immediately after prev in the interval tree */
void vma_interval_tree_insert_after(struct vm_area_struct *node,
diff --git a/mm/io-mapping.c b/mm/io-mapping.c
new file mode 100644
index 000000000000..01b362799930
--- /dev/null
+++ b/mm/io-mapping.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/mm.h>
+#include <linux/io-mapping.h>
+
+/**
+ * io_mapping_map_user - remap an I/O mapping to userspace
+ * @iomap: the source io_mapping
+ * @vma: user vma to map to
+ * @addr: target user address to start at
+ * @pfn: physical address of kernel memory
+ * @size: size of map area
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ */
+int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long pfn, unsigned long size)
+{
+ vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
+
+ if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags))
+ return -EINVAL;
+
+ /* We rely on prevalidation of the io-mapping to skip track_pfn(). */
+ return remap_pfn_range_notrack(vma, addr, pfn, size,
+ __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) |
+ (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)));
+}
+EXPORT_SYMBOL_GPL(io_mapping_map_user);
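
A hedged driver-side sketch of io_mapping_map_user(): remap one object of an io_mapping
into a user vma whose flags were already set up with VM_PFNMAP | VM_DONTEXPAND |
VM_DONTDUMP by the mmap/fault path. Names and setup are hypothetical.

/* Illustrative only. */
static int example_mmap_object(struct io_mapping *iomap,
		struct vm_area_struct *vma, unsigned long obj_pfn)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	return io_mapping_map_user(iomap, vma, vma->vm_start, obj_pfn, size);
}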
diff --git a/mm/ioremap.c b/mm/ioremap.c
index 5fa1ab41d152..8652426282cc 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -8,271 +8,38 @@
*/
#include <linux/vmalloc.h>
#include <linux/mm.h>
-#include <linux/sched.h>
#include <linux/io.h>
#include <linux/export.h>
-#include <asm/cacheflush.h>
-#include "pgalloc-track.h"
-
-#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static int __read_mostly ioremap_p4d_capable;
-static int __read_mostly ioremap_pud_capable;
-static int __read_mostly ioremap_pmd_capable;
-static int __read_mostly ioremap_huge_disabled;
-
-static int __init set_nohugeiomap(char *str)
-{
- ioremap_huge_disabled = 1;
- return 0;
-}
-early_param("nohugeiomap", set_nohugeiomap);
-
-void __init ioremap_huge_init(void)
-{
- if (!ioremap_huge_disabled) {
- if (arch_ioremap_p4d_supported())
- ioremap_p4d_capable = 1;
- if (arch_ioremap_pud_supported())
- ioremap_pud_capable = 1;
- if (arch_ioremap_pmd_supported())
- ioremap_pmd_capable = 1;
- }
-}
-
-static inline int ioremap_p4d_enabled(void)
-{
- return ioremap_p4d_capable;
-}
-
-static inline int ioremap_pud_enabled(void)
-{
- return ioremap_pud_capable;
-}
-
-static inline int ioremap_pmd_enabled(void)
-{
- return ioremap_pmd_capable;
-}
-
-#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
-static inline int ioremap_p4d_enabled(void) { return 0; }
-static inline int ioremap_pud_enabled(void) { return 0; }
-static inline int ioremap_pmd_enabled(void) { return 0; }
-#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
-
-static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
-{
- pte_t *pte;
- u64 pfn;
-
- pfn = phys_addr >> PAGE_SHIFT;
- pte = pte_alloc_kernel_track(pmd, addr, mask);
- if (!pte)
- return -ENOMEM;
- do {
- BUG_ON(!pte_none(*pte));
- set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
- pfn++;
- } while (pte++, addr += PAGE_SIZE, addr != end);
- *mask |= PGTBL_PTE_MODIFIED;
- return 0;
-}
-
-static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr,
- pgprot_t prot)
-{
- if (!ioremap_pmd_enabled())
- return 0;
-
- if ((end - addr) != PMD_SIZE)
- return 0;
-
- if (!IS_ALIGNED(addr, PMD_SIZE))
- return 0;
-
- if (!IS_ALIGNED(phys_addr, PMD_SIZE))
- return 0;
-
- if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
- return 0;
-
- return pmd_set_huge(pmd, phys_addr, prot);
-}
-
-static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
-{
- pmd_t *pmd;
- unsigned long next;
-
- pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
- if (!pmd)
- return -ENOMEM;
- do {
- next = pmd_addr_end(addr, end);
-
- if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) {
- *mask |= PGTBL_PMD_MODIFIED;
- continue;
- }
-
- if (ioremap_pte_range(pmd, addr, next, phys_addr, prot, mask))
- return -ENOMEM;
- } while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
-}
-
-static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr,
- pgprot_t prot)
-{
- if (!ioremap_pud_enabled())
- return 0;
-
- if ((end - addr) != PUD_SIZE)
- return 0;
-
- if (!IS_ALIGNED(addr, PUD_SIZE))
- return 0;
-
- if (!IS_ALIGNED(phys_addr, PUD_SIZE))
- return 0;
-
- if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
- return 0;
-
- return pud_set_huge(pud, phys_addr, prot);
-}
-
-static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
-{
- pud_t *pud;
- unsigned long next;
-
- pud = pud_alloc_track(&init_mm, p4d, addr, mask);
- if (!pud)
- return -ENOMEM;
- do {
- next = pud_addr_end(addr, end);
-
- if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) {
- *mask |= PGTBL_PUD_MODIFIED;
- continue;
- }
-
- if (ioremap_pmd_range(pud, addr, next, phys_addr, prot, mask))
- return -ENOMEM;
- } while (pud++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
-}
-
-static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr,
- pgprot_t prot)
-{
- if (!ioremap_p4d_enabled())
- return 0;
-
- if ((end - addr) != P4D_SIZE)
- return 0;
-
- if (!IS_ALIGNED(addr, P4D_SIZE))
- return 0;
-
- if (!IS_ALIGNED(phys_addr, P4D_SIZE))
- return 0;
-
- if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
- return 0;
-
- return p4d_set_huge(p4d, phys_addr, prot);
-}
-
-static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
-{
- p4d_t *p4d;
- unsigned long next;
-
- p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
- if (!p4d)
- return -ENOMEM;
- do {
- next = p4d_addr_end(addr, end);
-
- if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) {
- *mask |= PGTBL_P4D_MODIFIED;
- continue;
- }
-
- if (ioremap_pud_range(p4d, addr, next, phys_addr, prot, mask))
- return -ENOMEM;
- } while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
-}
-
-int ioremap_page_range(unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
-{
- pgd_t *pgd;
- unsigned long start;
- unsigned long next;
- int err;
- pgtbl_mod_mask mask = 0;
-
- might_sleep();
- BUG_ON(addr >= end);
-
- start = addr;
- pgd = pgd_offset_k(addr);
- do {
- next = pgd_addr_end(addr, end);
- err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot,
- &mask);
- if (err)
- break;
- } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
-
- flush_cache_vmap(start, end);
-
- if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
- arch_sync_kernel_mappings(start, end);
-
- return err;
-}
-
-#ifdef CONFIG_GENERIC_IOREMAP
-void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot)
+void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
+ unsigned long prot)
{
unsigned long offset, vaddr;
phys_addr_t last_addr;
struct vm_struct *area;
/* Disallow wrap-around or zero size */
- last_addr = addr + size - 1;
- if (!size || last_addr < addr)
+ last_addr = phys_addr + size - 1;
+ if (!size || last_addr < phys_addr)
return NULL;
/* Page-align mappings */
- offset = addr & (~PAGE_MASK);
- addr -= offset;
+ offset = phys_addr & (~PAGE_MASK);
+ phys_addr -= offset;
size = PAGE_ALIGN(size + offset);
+ if (!ioremap_allowed(phys_addr, size, prot))
+ return NULL;
+
area = get_vm_area_caller(size, VM_IOREMAP,
__builtin_return_address(0));
if (!area)
return NULL;
vaddr = (unsigned long)area->addr;
+ area->phys_addr = phys_addr;
- if (ioremap_page_range(vaddr, vaddr + size, addr, __pgprot(prot))) {
+ if (ioremap_page_range(vaddr, vaddr + size, phys_addr,
+ __pgprot(prot))) {
free_vm_area(area);
return NULL;
}
@@ -283,7 +50,12 @@ EXPORT_SYMBOL(ioremap_prot);
void iounmap(volatile void __iomem *addr)
{
- vunmap((void *)((unsigned long)addr & PAGE_MASK));
+ void *vaddr = (void *)((unsigned long)addr & PAGE_MASK);
+
+ if (!iounmap_allowed(vaddr))
+ return;
+
+ if (is_vmalloc_addr(vaddr))
+ vunmap(vaddr);
}
EXPORT_SYMBOL(iounmap);
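
A hedged usage sketch of the generic ioremap_prot()/iounmap() pair above: map a
hypothetical device BAR uncached, write one register, then unmap. The protection value
is arch-dependent; pgprot_noncached(PAGE_KERNEL) is assumed to be appropriate here.

/* Illustrative only. */
static int example_poke_device(phys_addr_t bar, size_t len)
{
	void __iomem *regs;

	regs = ioremap_prot(bar, len, pgprot_val(pgprot_noncached(PAGE_KERNEL)));
	if (!regs)
		return -ENOMEM;

	writel(0x1, regs);	/* hypothetical "enable" register at offset 0 */
	iounmap(regs);
	return 0;
}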
-#endif /* CONFIG_GENERIC_IOREMAP */
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 370d970e5ab5..7634dd2a6128 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -6,12 +6,15 @@ KCOV_INSTRUMENT := n
# Disable ftrace to avoid recursion.
CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_generic.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_generic_report.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_init.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_quarantine.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_tags.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_tags_report.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_report_generic.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_report_hw_tags.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_report_sw_tags.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_shadow.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_hw_tags.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_sw_tags.o = $(CC_FLAGS_FTRACE)
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
@@ -22,13 +25,32 @@ CC_FLAGS_KASAN_RUNTIME += -DDISABLE_BRANCH_PROFILING
CFLAGS_common.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_generic.o := $(CC_FLAGS_KASAN_RUNTIME)
-CFLAGS_generic_report.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_init.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_quarantine.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_report.o := $(CC_FLAGS_KASAN_RUNTIME)
-CFLAGS_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
-CFLAGS_tags_report.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_report_generic.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_report_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_report_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
-obj-$(CONFIG_KASAN) := common.o init.o report.o
-obj-$(CONFIG_KASAN_GENERIC) += generic.o generic_report.o quarantine.o
-obj-$(CONFIG_KASAN_SW_TAGS) += tags.o tags_report.o
+CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) $(call cc-disable-warning, vla)
+ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX
+# If compiler instruments memintrinsics by prefixing them with __asan/__hwasan,
+# we need to treat them normally (as builtins), otherwise the compiler won't
+# recognize them as instrumentable. If it doesn't instrument them, we need to
+# pass -fno-builtin, so the compiler doesn't inline them.
+CFLAGS_KASAN_TEST += -fno-builtin
+endif
+
+CFLAGS_kasan_test.o := $(CFLAGS_KASAN_TEST)
+CFLAGS_kasan_test_module.o := $(CFLAGS_KASAN_TEST)
+
+obj-y := common.o report.o
+obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o
+obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o tags.o report_tags.o
+obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o tags.o report_tags.o
+
+obj-$(CONFIG_KASAN_KUNIT_TEST) += kasan_test.o
+obj-$(CONFIG_KASAN_MODULE_TEST) += kasan_test_module.o
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 950fd372a07e..256930da578a 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -1,24 +1,18 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * This file contains common generic and tag-based KASAN code.
+ * This file contains common KASAN code.
*
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
*
* Some code borrowed from https://github.com/xairy/kasan-prototype by
* Andrey Konovalov <andreyknvl@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kernel.h>
-#include <linux/kmemleak.h>
#include <linux/linkage.h>
#include <linux/memblock.h>
#include <linux/memory.h>
@@ -31,140 +25,60 @@
#include <linux/stacktrace.h>
#include <linux/string.h>
#include <linux/types.h>
-#include <linux/vmalloc.h>
#include <linux/bug.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-
#include "kasan.h"
#include "../slab.h"
-depot_stack_handle_t kasan_save_stack(gfp_t flags)
+struct slab *kasan_addr_to_slab(const void *addr)
+{
+ if (virt_addr_valid(addr))
+ return virt_to_slab(addr);
+ return NULL;
+}
+
+depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc)
{
unsigned long entries[KASAN_STACK_DEPTH];
unsigned int nr_entries;
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
- nr_entries = filter_irq_stacks(entries, nr_entries);
- return stack_depot_save(entries, nr_entries, flags);
+ return __stack_depot_save(entries, nr_entries, flags, can_alloc);
}
void kasan_set_track(struct kasan_track *track, gfp_t flags)
{
track->pid = current->pid;
- track->stack = kasan_save_stack(flags);
+ track->stack = kasan_save_stack(flags, true);
}
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
void kasan_enable_current(void)
{
current->kasan_depth++;
}
+EXPORT_SYMBOL(kasan_enable_current);
void kasan_disable_current(void)
{
current->kasan_depth--;
}
+EXPORT_SYMBOL(kasan_disable_current);
-bool __kasan_check_read(const volatile void *p, unsigned int size)
-{
- return check_memory_region((unsigned long)p, size, false, _RET_IP_);
-}
-EXPORT_SYMBOL(__kasan_check_read);
-
-bool __kasan_check_write(const volatile void *p, unsigned int size)
-{
- return check_memory_region((unsigned long)p, size, true, _RET_IP_);
-}
-EXPORT_SYMBOL(__kasan_check_write);
+#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
-#undef memset
-void *memset(void *addr, int c, size_t len)
+void __kasan_unpoison_range(const void *address, size_t size)
{
- if (!check_memory_region((unsigned long)addr, len, true, _RET_IP_))
- return NULL;
-
- return __memset(addr, c, len);
-}
-
-#ifdef __HAVE_ARCH_MEMMOVE
-#undef memmove
-void *memmove(void *dest, const void *src, size_t len)
-{
- if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) ||
- !check_memory_region((unsigned long)dest, len, true, _RET_IP_))
- return NULL;
-
- return __memmove(dest, src, len);
-}
-#endif
-
-#undef memcpy
-void *memcpy(void *dest, const void *src, size_t len)
-{
- if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) ||
- !check_memory_region((unsigned long)dest, len, true, _RET_IP_))
- return NULL;
-
- return __memcpy(dest, src, len);
-}
-
-/*
- * Poisons the shadow memory for 'size' bytes starting from 'addr'.
- * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE.
- */
-void kasan_poison_shadow(const void *address, size_t size, u8 value)
-{
- void *shadow_start, *shadow_end;
-
- /*
- * Perform shadow offset calculation based on untagged address, as
- * some of the callers (e.g. kasan_poison_object_data) pass tagged
- * addresses to this function.
- */
- address = reset_tag(address);
-
- shadow_start = kasan_mem_to_shadow(address);
- shadow_end = kasan_mem_to_shadow(address + size);
-
- __memset(shadow_start, value, shadow_end - shadow_start);
-}
-
-void kasan_unpoison_shadow(const void *address, size_t size)
-{
- u8 tag = get_tag(address);
-
- /*
- * Perform shadow offset calculation based on untagged address, as
- * some of the callers (e.g. kasan_unpoison_object_data) pass tagged
- * addresses to this function.
- */
- address = reset_tag(address);
-
- kasan_poison_shadow(address, size, tag);
-
- if (size & KASAN_SHADOW_MASK) {
- u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
-
- if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- *shadow = tag;
- else
- *shadow = size & KASAN_SHADOW_MASK;
- }
-}
-
-static void __kasan_unpoison_stack(struct task_struct *task, const void *sp)
-{
- void *base = task_stack_page(task);
- size_t size = sp - base;
-
- kasan_unpoison_shadow(base, size);
+ kasan_unpoison(address, size, false);
}
+#ifdef CONFIG_KASAN_STACK
/* Unpoison the entire stack for a task. */
void kasan_unpoison_task_stack(struct task_struct *task)
{
- __kasan_unpoison_stack(task, task_stack_page(task) + THREAD_SIZE);
+ void *base = task_stack_page(task);
+
+ kasan_unpoison(base, THREAD_SIZE, false);
}
/* Unpoison the stack for the current task beyond a watermark sp value. */
@@ -177,132 +91,57 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
*/
void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1));
- kasan_unpoison_shadow(base, watermark - base);
+ kasan_unpoison(base, watermark - base, false);
}
+#endif /* CONFIG_KASAN_STACK */
-void kasan_alloc_pages(struct page *page, unsigned int order)
+bool __kasan_unpoison_pages(struct page *page, unsigned int order, bool init)
{
u8 tag;
unsigned long i;
if (unlikely(PageHighMem(page)))
- return;
+ return false;
+
+ if (!kasan_sample_page_alloc(order))
+ return false;
- tag = random_tag();
+ tag = kasan_random_tag();
+ kasan_unpoison(set_tag(page_address(page), tag),
+ PAGE_SIZE << order, init);
for (i = 0; i < (1 << order); i++)
page_kasan_tag_set(page + i, tag);
- kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order);
-}
-void kasan_free_pages(struct page *page, unsigned int order)
-{
- if (likely(!PageHighMem(page)))
- kasan_poison_shadow(page_address(page),
- PAGE_SIZE << order,
- KASAN_FREE_PAGE);
+ return true;
}
-/*
- * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
- * For larger allocations larger redzones are used.
- */
-static inline unsigned int optimal_redzone(unsigned int object_size)
+void __kasan_poison_pages(struct page *page, unsigned int order, bool init)
{
- if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- return 0;
-
- return
- object_size <= 64 - 16 ? 16 :
- object_size <= 128 - 32 ? 32 :
- object_size <= 512 - 64 ? 64 :
- object_size <= 4096 - 128 ? 128 :
- object_size <= (1 << 14) - 256 ? 256 :
- object_size <= (1 << 15) - 512 ? 512 :
- object_size <= (1 << 16) - 1024 ? 1024 : 2048;
-}
-
-void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
- slab_flags_t *flags)
-{
- unsigned int orig_size = *size;
- unsigned int redzone_size;
- int redzone_adjust;
-
- /* Add alloc meta. */
- cache->kasan_info.alloc_meta_offset = *size;
- *size += sizeof(struct kasan_alloc_meta);
-
- /* Add free meta. */
- if (IS_ENABLED(CONFIG_KASAN_GENERIC) &&
- (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor ||
- cache->object_size < sizeof(struct kasan_free_meta))) {
- cache->kasan_info.free_meta_offset = *size;
- *size += sizeof(struct kasan_free_meta);
- }
-
- redzone_size = optimal_redzone(cache->object_size);
- redzone_adjust = redzone_size - (*size - cache->object_size);
- if (redzone_adjust > 0)
- *size += redzone_adjust;
-
- *size = min_t(unsigned int, KMALLOC_MAX_SIZE,
- max(*size, cache->object_size + redzone_size));
-
- /*
- * If the metadata doesn't fit, don't enable KASAN at all.
- */
- if (*size <= cache->kasan_info.alloc_meta_offset ||
- *size <= cache->kasan_info.free_meta_offset) {
- cache->kasan_info.alloc_meta_offset = 0;
- cache->kasan_info.free_meta_offset = 0;
- *size = orig_size;
- return;
- }
-
- *flags |= SLAB_KASAN;
-}
-
-size_t kasan_metadata_size(struct kmem_cache *cache)
-{
- return (cache->kasan_info.alloc_meta_offset ?
- sizeof(struct kasan_alloc_meta) : 0) +
- (cache->kasan_info.free_meta_offset ?
- sizeof(struct kasan_free_meta) : 0);
-}
-
-struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
- const void *object)
-{
- return (void *)object + cache->kasan_info.alloc_meta_offset;
-}
-
-struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
- const void *object)
-{
- BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
- return (void *)object + cache->kasan_info.free_meta_offset;
+ if (likely(!PageHighMem(page)))
+ kasan_poison(page_address(page), PAGE_SIZE << order,
+ KASAN_PAGE_FREE, init);
}
-void kasan_poison_slab(struct page *page)
+void __kasan_poison_slab(struct slab *slab)
{
+ struct page *page = slab_page(slab);
unsigned long i;
for (i = 0; i < compound_nr(page); i++)
page_kasan_tag_reset(page + i);
- kasan_poison_shadow(page_address(page), page_size(page),
- KASAN_KMALLOC_REDZONE);
+ kasan_poison(page_address(page), page_size(page),
+ KASAN_SLAB_REDZONE, false);
}
-void kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
+void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
{
- kasan_unpoison_shadow(object, cache->object_size);
+ kasan_unpoison(object, cache->object_size, false);
}
-void kasan_poison_object_data(struct kmem_cache *cache, void *object)
+void __kasan_poison_object_data(struct kmem_cache *cache, void *object)
{
- kasan_poison_shadow(object,
- round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
- KASAN_KMALLOC_REDZONE);
+ kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
+ KASAN_SLAB_REDZONE, false);
}
/*
@@ -319,86 +158,62 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
* based on objects indexes, so that objects that are next to each other
* get different tags.
*/
-static u8 assign_tag(struct kmem_cache *cache, const void *object,
- bool init, bool keep_tag)
+static inline u8 assign_tag(struct kmem_cache *cache,
+ const void *object, bool init)
{
- /*
- * 1. When an object is kmalloc()'ed, two hooks are called:
- * kasan_slab_alloc() and kasan_kmalloc(). We assign the
- * tag only in the first one.
- * 2. We reuse the same tag for krealloc'ed objects.
- */
- if (keep_tag)
- return get_tag(object);
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ return 0xff;
/*
* If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU
* set, assign a tag when the object is being allocated (init == false).
*/
if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU))
- return init ? KASAN_TAG_KERNEL : random_tag();
+ return init ? KASAN_TAG_KERNEL : kasan_random_tag();
/* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */
#ifdef CONFIG_SLAB
/* For SLAB assign tags based on the object index in the freelist. */
- return (u8)obj_to_index(cache, virt_to_page(object), (void *)object);
+ return (u8)obj_to_index(cache, virt_to_slab(object), (void *)object);
#else
/*
* For SLUB assign a random tag during slab creation, otherwise reuse
* the already assigned tag.
*/
- return init ? random_tag() : get_tag(object);
+ return init ? kasan_random_tag() : get_tag(object);
#endif
}
-void * __must_check kasan_init_slab_obj(struct kmem_cache *cache,
+void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
const void *object)
{
- struct kasan_alloc_meta *alloc_info;
+ /* Initialize per-object metadata if it is present. */
+ if (kasan_requires_meta())
+ kasan_init_object_meta(cache, object);
- if (!(cache->flags & SLAB_KASAN))
- return (void *)object;
-
- alloc_info = get_alloc_info(cache, object);
- __memset(alloc_info, 0, sizeof(*alloc_info));
-
- if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- object = set_tag(object,
- assign_tag(cache, object, true, false));
+ /* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */
+ object = set_tag(object, assign_tag(cache, object, true));
return (void *)object;
}
-static inline bool shadow_invalid(u8 tag, s8 shadow_byte)
-{
- if (IS_ENABLED(CONFIG_KASAN_GENERIC))
- return shadow_byte < 0 ||
- shadow_byte >= KASAN_SHADOW_SCALE_SIZE;
-
- /* else CONFIG_KASAN_SW_TAGS: */
- if ((u8)shadow_byte == KASAN_TAG_INVALID)
- return true;
- if ((tag != KASAN_TAG_KERNEL) && (tag != (u8)shadow_byte))
- return true;
-
- return false;
-}
-
-static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
- unsigned long ip, bool quarantine)
+static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
+ unsigned long ip, bool quarantine, bool init)
{
- s8 shadow_byte;
- u8 tag;
void *tagged_object;
- unsigned long rounded_up_size;
- tag = get_tag(object);
+ if (!kasan_arch_is_ready())
+ return false;
+
tagged_object = object;
- object = reset_tag(object);
+ object = kasan_reset_tag(object);
- if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) !=
+ if (is_kfence_address(object))
+ return false;
+
+ if (unlikely(nearest_obj(cache, virt_to_slab(object), object) !=
object)) {
- kasan_report_invalid_free(tagged_object, ip);
+ kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_INVALID_FREE);
return true;
}
@@ -406,526 +221,232 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
return false;
- shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object));
- if (shadow_invalid(tag, shadow_byte)) {
- kasan_report_invalid_free(tagged_object, ip);
+ if (!kasan_byte_accessible(tagged_object)) {
+ kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_DOUBLE_FREE);
return true;
}
- rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE);
- kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
+ kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
+ KASAN_SLAB_FREE, init);
- if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine) ||
- unlikely(!(cache->flags & SLAB_KASAN)))
+ if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine))
return false;
- kasan_set_free_info(cache, object, tag);
-
- quarantine_put(get_free_info(cache, object), cache);
+ if (kasan_stack_collection_enabled())
+ kasan_save_free_info(cache, tagged_object);
- return IS_ENABLED(CONFIG_KASAN_GENERIC);
+ return kasan_quarantine_put(cache, object);
}
-bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
+bool __kasan_slab_free(struct kmem_cache *cache, void *object,
+ unsigned long ip, bool init)
{
- return __kasan_slab_free(cache, object, ip, true);
+ return ____kasan_slab_free(cache, object, ip, true, init);
}
-static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
- size_t size, gfp_t flags, bool keep_tag)
+static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip)
{
- unsigned long redzone_start;
- unsigned long redzone_end;
- u8 tag = 0xff;
-
- if (gfpflags_allow_blocking(flags))
- quarantine_reduce();
-
- if (unlikely(object == NULL))
- return NULL;
-
- redzone_start = round_up((unsigned long)(object + size),
- KASAN_SHADOW_SCALE_SIZE);
- redzone_end = round_up((unsigned long)object + cache->object_size,
- KASAN_SHADOW_SCALE_SIZE);
+ if (!kasan_arch_is_ready())
+ return false;
- if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- tag = assign_tag(cache, object, false, keep_tag);
+ if (ptr != page_address(virt_to_head_page(ptr))) {
+ kasan_report_invalid_free(ptr, ip, KASAN_REPORT_INVALID_FREE);
+ return true;
+ }
- /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */
- kasan_unpoison_shadow(set_tag(object, tag), size);
- kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
- KASAN_KMALLOC_REDZONE);
+ if (!kasan_byte_accessible(ptr)) {
+ kasan_report_invalid_free(ptr, ip, KASAN_REPORT_DOUBLE_FREE);
+ return true;
+ }
- if (cache->flags & SLAB_KASAN)
- kasan_set_track(&get_alloc_info(cache, object)->alloc_track, flags);
+ /*
+ * The object will be poisoned by kasan_poison_pages() or
+ * kasan_slab_free_mempool().
+ */
- return set_tag(object, tag);
+ return false;
}
-void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object,
- gfp_t flags)
+void __kasan_kfree_large(void *ptr, unsigned long ip)
{
- return __kasan_kmalloc(cache, object, cache->object_size, flags, false);
+ ____kasan_kfree_large(ptr, ip);
}
-void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
- size_t size, gfp_t flags)
+void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
{
- return __kasan_kmalloc(cache, object, size, flags, true);
-}
-EXPORT_SYMBOL(kasan_kmalloc);
+ struct folio *folio;
-void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
- gfp_t flags)
-{
- struct page *page;
- unsigned long redzone_start;
- unsigned long redzone_end;
-
- if (gfpflags_allow_blocking(flags))
- quarantine_reduce();
+ folio = virt_to_folio(ptr);
- if (unlikely(ptr == NULL))
- return NULL;
-
- page = virt_to_page(ptr);
- redzone_start = round_up((unsigned long)(ptr + size),
- KASAN_SHADOW_SCALE_SIZE);
- redzone_end = (unsigned long)ptr + page_size(page);
-
- kasan_unpoison_shadow(ptr, size);
- kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
- KASAN_PAGE_REDZONE);
+ /*
+ * Even though this function is only called for kmem_cache_alloc and
+ * kmalloc backed mempool allocations, those allocations can still be
+ * !PageSlab() when the size provided to kmalloc is larger than
+ * KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc.
+ */
+ if (unlikely(!folio_test_slab(folio))) {
+ if (____kasan_kfree_large(ptr, ip))
+ return;
+ kasan_poison(ptr, folio_size(folio), KASAN_PAGE_FREE, false);
+ } else {
+ struct slab *slab = folio_slab(folio);
- return (void *)ptr;
+ ____kasan_slab_free(slab->slab_cache, ptr, ip, false, false);
+ }
}
-void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags)
+void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
+ void *object, gfp_t flags, bool init)
{
- struct page *page;
+ u8 tag;
+ void *tagged_object;
- if (unlikely(object == ZERO_SIZE_PTR))
- return (void *)object;
+ if (gfpflags_allow_blocking(flags))
+ kasan_quarantine_reduce();
- page = virt_to_head_page(object);
+ if (unlikely(object == NULL))
+ return NULL;
- if (unlikely(!PageSlab(page)))
- return kasan_kmalloc_large(object, size, flags);
- else
- return __kasan_kmalloc(page->slab_cache, object, size,
- flags, true);
-}
+ if (is_kfence_address(object))
+ return (void *)object;
-void kasan_poison_kfree(void *ptr, unsigned long ip)
-{
- struct page *page;
+ /*
+ * Generate and assign random tag for tag-based modes.
+ * Tag is ignored in set_tag() for the generic mode.
+ */
+ tag = assign_tag(cache, object, false);
+ tagged_object = set_tag(object, tag);
- page = virt_to_head_page(ptr);
+ /*
+ * Unpoison the whole object.
+ * For kmalloc() allocations, kasan_kmalloc() will do precise poisoning.
+ */
+ kasan_unpoison(tagged_object, cache->object_size, init);
- if (unlikely(!PageSlab(page))) {
- if (ptr != page_address(page)) {
- kasan_report_invalid_free(ptr, ip);
- return;
- }
- kasan_poison_shadow(ptr, page_size(page), KASAN_FREE_PAGE);
- } else {
- __kasan_slab_free(page->slab_cache, ptr, ip, false);
- }
-}
+ /* Save alloc info (if possible) for non-kmalloc() allocations. */
+ if (kasan_stack_collection_enabled() && !is_kmalloc_cache(cache))
+ kasan_save_alloc_info(cache, tagged_object, flags);
-void kasan_kfree_large(void *ptr, unsigned long ip)
-{
- if (ptr != page_address(virt_to_head_page(ptr)))
- kasan_report_invalid_free(ptr, ip);
- /* The object will be poisoned by page_alloc. */
+ return tagged_object;
}
-#ifndef CONFIG_KASAN_VMALLOC
-int kasan_module_alloc(void *addr, size_t size)
+static inline void *____kasan_kmalloc(struct kmem_cache *cache,
+ const void *object, size_t size, gfp_t flags)
{
- void *ret;
- size_t scaled_size;
- size_t shadow_size;
- unsigned long shadow_start;
-
- shadow_start = (unsigned long)kasan_mem_to_shadow(addr);
- scaled_size = (size + KASAN_SHADOW_MASK) >> KASAN_SHADOW_SCALE_SHIFT;
- shadow_size = round_up(scaled_size, PAGE_SIZE);
-
- if (WARN_ON(!PAGE_ALIGNED(shadow_start)))
- return -EINVAL;
-
- ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
- shadow_start + shadow_size,
- GFP_KERNEL,
- PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
- __builtin_return_address(0));
-
- if (ret) {
- __memset(ret, KASAN_SHADOW_INIT, shadow_size);
- find_vm_area(addr)->flags |= VM_KASAN;
- kmemleak_ignore(ret);
- return 0;
- }
-
- return -ENOMEM;
-}
+ unsigned long redzone_start;
+ unsigned long redzone_end;
-void kasan_free_shadow(const struct vm_struct *vm)
-{
- if (vm->flags & VM_KASAN)
- vfree(kasan_mem_to_shadow(vm->addr));
-}
-#endif
+ if (gfpflags_allow_blocking(flags))
+ kasan_quarantine_reduce();
-#ifdef CONFIG_MEMORY_HOTPLUG
-static bool shadow_mapped(unsigned long addr)
-{
- pgd_t *pgd = pgd_offset_k(addr);
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
+ if (unlikely(object == NULL))
+ return NULL;
- if (pgd_none(*pgd))
- return false;
- p4d = p4d_offset(pgd, addr);
- if (p4d_none(*p4d))
- return false;
- pud = pud_offset(p4d, addr);
- if (pud_none(*pud))
- return false;
+ if (is_kfence_address(kasan_reset_tag(object)))
+ return (void *)object;
/*
- * We can't use pud_large() or pud_huge(), the first one is
- * arch-specific, the last one depends on HUGETLB_PAGE. So let's abuse
- * pud_bad(), if pud is bad then it's bad because it's huge.
+ * The object has already been unpoisoned by kasan_slab_alloc() for
+ * kmalloc() or by kasan_krealloc() for krealloc().
*/
- if (pud_bad(*pud))
- return true;
- pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd))
- return false;
- if (pmd_bad(*pmd))
- return true;
- pte = pte_offset_kernel(pmd, addr);
- return !pte_none(*pte);
-}
-
-static int __meminit kasan_mem_notifier(struct notifier_block *nb,
- unsigned long action, void *data)
-{
- struct memory_notify *mem_data = data;
- unsigned long nr_shadow_pages, start_kaddr, shadow_start;
- unsigned long shadow_end, shadow_size;
-
- nr_shadow_pages = mem_data->nr_pages >> KASAN_SHADOW_SCALE_SHIFT;
- start_kaddr = (unsigned long)pfn_to_kaddr(mem_data->start_pfn);
- shadow_start = (unsigned long)kasan_mem_to_shadow((void *)start_kaddr);
- shadow_size = nr_shadow_pages << PAGE_SHIFT;
- shadow_end = shadow_start + shadow_size;
-
- if (WARN_ON(mem_data->nr_pages % KASAN_SHADOW_SCALE_SIZE) ||
- WARN_ON(start_kaddr % (KASAN_SHADOW_SCALE_SIZE << PAGE_SHIFT)))
- return NOTIFY_BAD;
-
- switch (action) {
- case MEM_GOING_ONLINE: {
- void *ret;
-
- /*
- * If shadow is mapped already than it must have been mapped
- * during the boot. This could happen if we onlining previously
- * offlined memory.
- */
- if (shadow_mapped(shadow_start))
- return NOTIFY_OK;
-
- ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start,
- shadow_end, GFP_KERNEL,
- PAGE_KERNEL, VM_NO_GUARD,
- pfn_to_nid(mem_data->start_pfn),
- __builtin_return_address(0));
- if (!ret)
- return NOTIFY_BAD;
-
- kmemleak_ignore(ret);
- return NOTIFY_OK;
- }
- case MEM_CANCEL_ONLINE:
- case MEM_OFFLINE: {
- struct vm_struct *vm;
-
- /*
- * shadow_start was either mapped during boot by kasan_init()
- * or during memory online by __vmalloc_node_range().
- * In the latter case we can use vfree() to free shadow.
- * Non-NULL result of the find_vm_area() will tell us if
- * that was the second case.
- *
- * Currently it's not possible to free shadow mapped
- * during boot by kasan_init(). It's because the code
- * to do that hasn't been written yet. So we'll just
- * leak the memory.
- */
- vm = find_vm_area((void *)shadow_start);
- if (vm)
- vfree((void *)shadow_start);
- }
- }
+ /*
+ * The redzone has byte-level precision for the generic mode.
+ * Partially poison the last object granule to cover the unaligned
+ * part of the redzone.
+ */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ kasan_poison_last_granule((void *)object, size);
- return NOTIFY_OK;
-}
+ /* Poison the aligned part of the redzone. */
+ redzone_start = round_up((unsigned long)(object + size),
+ KASAN_GRANULE_SIZE);
+ redzone_end = round_up((unsigned long)(object + cache->object_size),
+ KASAN_GRANULE_SIZE);
+ kasan_poison((void *)redzone_start, redzone_end - redzone_start,
+ KASAN_SLAB_REDZONE, false);
-static int __init kasan_memhotplug_init(void)
-{
- hotplug_memory_notifier(kasan_mem_notifier, 0);
+ /*
+ * Save alloc info (if possible) for kmalloc() allocations.
+ * This also rewrites the alloc info when called from kasan_krealloc().
+ */
+ if (kasan_stack_collection_enabled() && is_kmalloc_cache(cache))
+ kasan_save_alloc_info(cache, (void *)object, flags);
- return 0;
+ /* Keep the tag that was set by kasan_slab_alloc(). */
+ return (void *)object;
}
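
As an aside, a small self-contained sketch (plain userspace C, not kernel code) of the redzone arithmetic in ____kasan_kmalloc() above, assuming the generic mode's 8-byte granule, an invented 128-byte object and a 100-byte kmalloc() request:

#include <stdio.h>

#define GRANULE 8UL	/* KASAN_GRANULE_SIZE in the generic mode */

static unsigned long round_up_to(unsigned long x, unsigned long a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	unsigned long object = 0x1000;		/* invented object address */
	unsigned long size = 100;		/* size requested from kmalloc() */
	unsigned long object_size = 128;	/* cache->object_size */

	unsigned long redzone_start = round_up_to(object + size, GRANULE);
	unsigned long redzone_end = round_up_to(object + object_size, GRANULE);

	/*
	 * kasan_poison_last_granule() covers the unaligned tail of the last
	 * accessible granule; kasan_poison() covers the aligned remainder.
	 */
	printf("partial granule: 0x%lx..0x%lx\n", object + size, redzone_start - 1);
	printf("aligned redzone: 0x%lx..0x%lx\n", redzone_start, redzone_end - 1);
	return 0;
}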
-core_initcall(kasan_memhotplug_init);
-#endif
-
-#ifdef CONFIG_KASAN_VMALLOC
-static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
- void *unused)
+void * __must_check __kasan_kmalloc(struct kmem_cache *cache, const void *object,
+ size_t size, gfp_t flags)
{
- unsigned long page;
- pte_t pte;
-
- if (likely(!pte_none(*ptep)))
- return 0;
-
- page = __get_free_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
-
- memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
- pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
-
- spin_lock(&init_mm.page_table_lock);
- if (likely(pte_none(*ptep))) {
- set_pte_at(&init_mm, addr, ptep, pte);
- page = 0;
- }
- spin_unlock(&init_mm.page_table_lock);
- if (page)
- free_page(page);
- return 0;
+ return ____kasan_kmalloc(cache, object, size, flags);
}
+EXPORT_SYMBOL(__kasan_kmalloc);
-int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
+void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
+ gfp_t flags)
{
- unsigned long shadow_start, shadow_end;
- int ret;
-
- if (!is_vmalloc_or_module_addr((void *)addr))
- return 0;
-
- shadow_start = (unsigned long)kasan_mem_to_shadow((void *)addr);
- shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE);
- shadow_end = (unsigned long)kasan_mem_to_shadow((void *)addr + size);
- shadow_end = ALIGN(shadow_end, PAGE_SIZE);
+ unsigned long redzone_start;
+ unsigned long redzone_end;
- ret = apply_to_page_range(&init_mm, shadow_start,
- shadow_end - shadow_start,
- kasan_populate_vmalloc_pte, NULL);
- if (ret)
- return ret;
+ if (gfpflags_allow_blocking(flags))
+ kasan_quarantine_reduce();
- flush_cache_vmap(shadow_start, shadow_end);
+ if (unlikely(ptr == NULL))
+ return NULL;
/*
- * We need to be careful about inter-cpu effects here. Consider:
- *
- * CPU#0 CPU#1
- * WRITE_ONCE(p, vmalloc(100)); while (x = READ_ONCE(p)) ;
- * p[99] = 1;
- *
- * With compiler instrumentation, that ends up looking like this:
- *
- * CPU#0 CPU#1
- * // vmalloc() allocates memory
- * // let a = area->addr
- * // we reach kasan_populate_vmalloc
- * // and call kasan_unpoison_shadow:
- * STORE shadow(a), unpoison_val
- * ...
- * STORE shadow(a+99), unpoison_val x = LOAD p
- * // rest of vmalloc process <data dependency>
- * STORE p, a LOAD shadow(x+99)
- *
- * If there is no barrier between the end of unpoisioning the shadow
- * and the store of the result to p, the stores could be committed
- * in a different order by CPU#0, and CPU#1 could erroneously observe
- * poison in the shadow.
- *
- * We need some sort of barrier between the stores.
- *
- * In the vmalloc() case, this is provided by a smp_wmb() in
- * clear_vm_uninitialized_flag(). In the per-cpu allocator and in
- * get_vm_area() and friends, the caller gets shadow allocated but
- * doesn't have any pages mapped into the virtual address space that
- * has been reserved. Mapping those pages in will involve taking and
- * releasing a page-table lock, which will provide the barrier.
+ * The object has already been unpoisoned by kasan_unpoison_pages() for
+ * alloc_pages() or by kasan_krealloc() for krealloc().
*/
- return 0;
-}
-
-/*
- * Poison the shadow for a vmalloc region. Called as part of the
- * freeing process at the time the region is freed.
- */
-void kasan_poison_vmalloc(const void *start, unsigned long size)
-{
- if (!is_vmalloc_or_module_addr(start))
- return;
-
- size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
- kasan_poison_shadow(start, size, KASAN_VMALLOC_INVALID);
-}
+ /*
+ * The redzone has byte-level precision for the generic mode.
+ * Partially poison the last object granule to cover the unaligned
+ * part of the redzone.
+ */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ kasan_poison_last_granule(ptr, size);
-void kasan_unpoison_vmalloc(const void *start, unsigned long size)
-{
- if (!is_vmalloc_or_module_addr(start))
- return;
+ /* Poison the aligned part of the redzone. */
+ redzone_start = round_up((unsigned long)(ptr + size),
+ KASAN_GRANULE_SIZE);
+ redzone_end = (unsigned long)ptr + page_size(virt_to_page(ptr));
+ kasan_poison((void *)redzone_start, redzone_end - redzone_start,
+ KASAN_PAGE_REDZONE, false);
- kasan_unpoison_shadow(start, size);
+ return (void *)ptr;
}
-static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
- void *unused)
+void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flags)
{
- unsigned long page;
+ struct slab *slab;
- page = (unsigned long)__va(pte_pfn(*ptep) << PAGE_SHIFT);
+ if (unlikely(object == ZERO_SIZE_PTR))
+ return (void *)object;
- spin_lock(&init_mm.page_table_lock);
+ /*
+ * Unpoison the object's data.
+ * Part of it might already have been unpoisoned, but it's unknown
+ * how big that part is.
+ */
+ kasan_unpoison(object, size, false);
- if (likely(!pte_none(*ptep))) {
- pte_clear(&init_mm, addr, ptep);
- free_page(page);
- }
- spin_unlock(&init_mm.page_table_lock);
+ slab = virt_to_slab(object);
- return 0;
+ /* Piggy-back on kmalloc() instrumentation to poison the redzone. */
+ if (unlikely(!slab))
+ return __kasan_kmalloc_large(object, size, flags);
+ else
+ return ____kasan_kmalloc(slab->slab_cache, object, size, flags);
}
-/*
- * Release the backing for the vmalloc region [start, end), which
- * lies within the free region [free_region_start, free_region_end).
- *
- * This can be run lazily, long after the region was freed. It runs
- * under vmap_area_lock, so it's not safe to interact with the vmalloc/vmap
- * infrastructure.
- *
- * How does this work?
- * -------------------
- *
- * We have a region that is page aligned, labelled as A.
- * That might not map onto the shadow in a way that is page-aligned:
- *
- * start end
- * v v
- * |????????|????????|AAAAAAAA|AA....AA|AAAAAAAA|????????| < vmalloc
- * -------- -------- -------- -------- --------
- * | | | | |
- * | | | /-------/ |
- * \-------\|/------/ |/---------------/
- * ||| ||
- * |??AAAAAA|AAAAAAAA|AA??????| < shadow
- * (1) (2) (3)
- *
- * First we align the start upwards and the end downwards, so that the
- * shadow of the region aligns with shadow page boundaries. In the
- * example, this gives us the shadow page (2). This is the shadow entirely
- * covered by this allocation.
- *
- * Then we have the tricky bits. We want to know if we can free the
- * partially covered shadow pages - (1) and (3) in the example. For this,
- * we are given the start and end of the free region that contains this
- * allocation. Extending our previous example, we could have:
- *
- * free_region_start free_region_end
- * | start end |
- * v v v v
- * |FFFFFFFF|FFFFFFFF|AAAAAAAA|AA....AA|AAAAAAAA|FFFFFFFF| < vmalloc
- * -------- -------- -------- -------- --------
- * | | | | |
- * | | | /-------/ |
- * \-------\|/------/ |/---------------/
- * ||| ||
- * |FFAAAAAA|AAAAAAAA|AAF?????| < shadow
- * (1) (2) (3)
- *
- * Once again, we align the start of the free region up, and the end of
- * the free region down so that the shadow is page aligned. So we can free
- * page (1) - we know no allocation currently uses anything in that page,
- * because all of it is in the vmalloc free region. But we cannot free
- * page (3), because we can't be sure that the rest of it is unused.
- *
- * We only consider pages that contain part of the original region for
- * freeing: we don't try to free other pages from the free region or we'd
- * end up trying to free huge chunks of virtual address space.
- *
- * Concurrency
- * -----------
- *
- * How do we know that we're not freeing a page that is simultaneously
- * being used for a fresh allocation in kasan_populate_vmalloc(_pte)?
- *
- * We _can_ have kasan_release_vmalloc and kasan_populate_vmalloc running
- * at the same time. While we run under free_vmap_area_lock, the population
- * code does not.
- *
- * free_vmap_area_lock instead operates to ensure that the larger range
- * [free_region_start, free_region_end) is safe: because __alloc_vmap_area and
- * the per-cpu region-finding algorithm both run under free_vmap_area_lock,
- * no space identified as free will become used while we are running. This
- * means that so long as we are careful with alignment and only free shadow
- * pages entirely covered by the free region, we will not run in to any
- * trouble - any simultaneous allocations will be for disjoint regions.
- */
-void kasan_release_vmalloc(unsigned long start, unsigned long end,
- unsigned long free_region_start,
- unsigned long free_region_end)
+bool __kasan_check_byte(const void *address, unsigned long ip)
{
- void *shadow_start, *shadow_end;
- unsigned long region_start, region_end;
- unsigned long size;
-
- region_start = ALIGN(start, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
- region_end = ALIGN_DOWN(end, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
-
- free_region_start = ALIGN(free_region_start,
- PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
-
- if (start != region_start &&
- free_region_start < region_start)
- region_start -= PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE;
-
- free_region_end = ALIGN_DOWN(free_region_end,
- PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
-
- if (end != region_end &&
- free_region_end > region_end)
- region_end += PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE;
-
- shadow_start = kasan_mem_to_shadow((void *)region_start);
- shadow_end = kasan_mem_to_shadow((void *)region_end);
-
- if (shadow_end > shadow_start) {
- size = shadow_end - shadow_start;
- apply_to_existing_page_range(&init_mm,
- (unsigned long)shadow_start,
- size, kasan_depopulate_vmalloc_pte,
- NULL);
- flush_tlb_kernel_range((unsigned long)shadow_start,
- (unsigned long)shadow_end);
+ if (!kasan_byte_accessible(address)) {
+ kasan_report(address, 1, false, ip);
+ return false;
}
+ return true;
}
-#endif
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 248264b9cb76..4d837ab83f08 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -7,20 +7,14 @@
*
* Some code borrowed from https://github.com/xairy/kasan-prototype by
* Andrey Konovalov <andreyknvl@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kernel.h>
+#include <linux/kfence.h>
#include <linux/kmemleak.h>
#include <linux/linkage.h>
#include <linux/memblock.h>
@@ -46,39 +40,39 @@
* depending on memory access size X.
*/
-static __always_inline bool memory_is_poisoned_1(unsigned long addr)
+static __always_inline bool memory_is_poisoned_1(const void *addr)
{
- s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
+ s8 shadow_value = *(s8 *)kasan_mem_to_shadow(addr);
if (unlikely(shadow_value)) {
- s8 last_accessible_byte = addr & KASAN_SHADOW_MASK;
+ s8 last_accessible_byte = (unsigned long)addr & KASAN_GRANULE_MASK;
return unlikely(last_accessible_byte >= shadow_value);
}
return false;
}
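
As an aside, a stand-alone sketch (plain userspace C, not kernel code) of the check in memory_is_poisoned_1() above: a positive shadow byte N means only the first N bytes of the 8-byte granule are accessible; the addresses and shadow value are invented:

#include <stdbool.h>
#include <stdio.h>

#define GRANULE_MASK 0x7UL	/* KASAN_GRANULE_MASK in the generic mode */

/* Mirrors memory_is_poisoned_1(), with the shadow byte passed in directly. */
static bool byte_is_poisoned(unsigned long addr, signed char shadow_value)
{
	if (shadow_value) {
		signed char last_accessible_byte = addr & GRANULE_MASK;

		return last_accessible_byte >= shadow_value;
	}
	return false;
}

int main(void)
{
	/* A granule whose first 6 bytes are accessible has shadow byte 6. */
	printf("%d\n", byte_is_poisoned(0x1005, 6));	/* offset 5: accessible -> 0 */
	printf("%d\n", byte_is_poisoned(0x1006, 6));	/* offset 6: poisoned   -> 1 */
	return 0;
}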
-static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr,
+static __always_inline bool memory_is_poisoned_2_4_8(const void *addr,
unsigned long size)
{
- u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr);
+ u8 *shadow_addr = (u8 *)kasan_mem_to_shadow(addr);
/*
* The access crosses an 8-byte (shadow granule) boundary and therefore maps
* into two shadow bytes, so we need to check them both.
*/
- if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1))
+ if (unlikely((((unsigned long)addr + size - 1) & KASAN_GRANULE_MASK) < size - 1))
return *shadow_addr || memory_is_poisoned_1(addr + size - 1);
return memory_is_poisoned_1(addr + size - 1);
}
-static __always_inline bool memory_is_poisoned_16(unsigned long addr)
+static __always_inline bool memory_is_poisoned_16(const void *addr)
{
- u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+ u16 *shadow_addr = (u16 *)kasan_mem_to_shadow(addr);
/* An unaligned 16-byte access maps into 3 shadow bytes. */
- if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
+ if (unlikely(!IS_ALIGNED((unsigned long)addr, KASAN_GRANULE_SIZE)))
return *shadow_addr || memory_is_poisoned_1(addr + 15);
return *shadow_addr;
@@ -126,26 +120,26 @@ static __always_inline unsigned long memory_is_nonzero(const void *start,
return bytes_is_nonzero(start, (end - start) % 8);
}
-static __always_inline bool memory_is_poisoned_n(unsigned long addr,
- size_t size)
+static __always_inline bool memory_is_poisoned_n(const void *addr, size_t size)
{
unsigned long ret;
- ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr),
- kasan_mem_to_shadow((void *)addr + size - 1) + 1);
+ ret = memory_is_nonzero(kasan_mem_to_shadow(addr),
+ kasan_mem_to_shadow(addr + size - 1) + 1);
if (unlikely(ret)) {
- unsigned long last_byte = addr + size - 1;
- s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
+ const void *last_byte = addr + size - 1;
+ s8 *last_shadow = (s8 *)kasan_mem_to_shadow(last_byte);
+ s8 last_accessible_byte = (unsigned long)last_byte & KASAN_GRANULE_MASK;
if (unlikely(ret != (unsigned long)last_shadow ||
- ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
+ last_accessible_byte >= *last_shadow))
return true;
}
return false;
}
-static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
+static __always_inline bool memory_is_poisoned(const void *addr, size_t size)
{
if (__builtin_constant_p(size)) {
switch (size) {
@@ -165,20 +159,21 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
return memory_is_poisoned_n(addr, size);
}
-static __always_inline bool check_memory_region_inline(unsigned long addr,
+static __always_inline bool check_region_inline(const void *addr,
size_t size, bool write,
unsigned long ret_ip)
{
+ if (!kasan_arch_is_ready())
+ return true;
+
if (unlikely(size == 0))
return true;
if (unlikely(addr + size < addr))
return !kasan_report(addr, size, write, ret_ip);
- if (unlikely((void *)addr <
- kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
+ if (unlikely(!addr_has_metadata(addr)))
return !kasan_report(addr, size, write, ret_ip);
- }
if (likely(!memory_is_poisoned(addr, size)))
return true;
@@ -186,64 +181,77 @@ static __always_inline bool check_memory_region_inline(unsigned long addr,
return !kasan_report(addr, size, write, ret_ip);
}
-bool check_memory_region(unsigned long addr, size_t size, bool write,
- unsigned long ret_ip)
+bool kasan_check_range(const void *addr, size_t size, bool write,
+ unsigned long ret_ip)
{
- return check_memory_region_inline(addr, size, write, ret_ip);
+ return check_region_inline(addr, size, write, ret_ip);
+}
+
+bool kasan_byte_accessible(const void *addr)
+{
+ s8 shadow_byte;
+
+ if (!kasan_arch_is_ready())
+ return true;
+
+ shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr));
+
+ return shadow_byte >= 0 && shadow_byte < KASAN_GRANULE_SIZE;
}
void kasan_cache_shrink(struct kmem_cache *cache)
{
- quarantine_remove_cache(cache);
+ kasan_quarantine_remove_cache(cache);
}
void kasan_cache_shutdown(struct kmem_cache *cache)
{
if (!__kmem_cache_empty(cache))
- quarantine_remove_cache(cache);
+ kasan_quarantine_remove_cache(cache);
}
static void register_global(struct kasan_global *global)
{
- size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE);
+ size_t aligned_size = round_up(global->size, KASAN_GRANULE_SIZE);
- kasan_unpoison_shadow(global->beg, global->size);
+ kasan_unpoison(global->beg, global->size, false);
- kasan_poison_shadow(global->beg + aligned_size,
- global->size_with_redzone - aligned_size,
- KASAN_GLOBAL_REDZONE);
+ kasan_poison(global->beg + aligned_size,
+ global->size_with_redzone - aligned_size,
+ KASAN_GLOBAL_REDZONE, false);
}
-void __asan_register_globals(struct kasan_global *globals, size_t size)
+void __asan_register_globals(void *ptr, ssize_t size)
{
int i;
+ struct kasan_global *globals = ptr;
for (i = 0; i < size; i++)
register_global(&globals[i]);
}
EXPORT_SYMBOL(__asan_register_globals);
-void __asan_unregister_globals(struct kasan_global *globals, size_t size)
+void __asan_unregister_globals(void *ptr, ssize_t size)
{
}
EXPORT_SYMBOL(__asan_unregister_globals);
#define DEFINE_ASAN_LOAD_STORE(size) \
- void __asan_load##size(unsigned long addr) \
+ void __asan_load##size(void *addr) \
{ \
- check_memory_region_inline(addr, size, false, _RET_IP_);\
+ check_region_inline(addr, size, false, _RET_IP_); \
} \
EXPORT_SYMBOL(__asan_load##size); \
__alias(__asan_load##size) \
- void __asan_load##size##_noabort(unsigned long); \
+ void __asan_load##size##_noabort(void *); \
EXPORT_SYMBOL(__asan_load##size##_noabort); \
- void __asan_store##size(unsigned long addr) \
+ void __asan_store##size(void *addr) \
{ \
- check_memory_region_inline(addr, size, true, _RET_IP_); \
+ check_region_inline(addr, size, true, _RET_IP_); \
} \
EXPORT_SYMBOL(__asan_store##size); \
__alias(__asan_store##size) \
- void __asan_store##size##_noabort(unsigned long); \
+ void __asan_store##size##_noabort(void *); \
EXPORT_SYMBOL(__asan_store##size##_noabort)
DEFINE_ASAN_LOAD_STORE(1);
@@ -252,24 +260,24 @@ DEFINE_ASAN_LOAD_STORE(4);
DEFINE_ASAN_LOAD_STORE(8);
DEFINE_ASAN_LOAD_STORE(16);
-void __asan_loadN(unsigned long addr, size_t size)
+void __asan_loadN(void *addr, ssize_t size)
{
- check_memory_region(addr, size, false, _RET_IP_);
+ kasan_check_range(addr, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__asan_loadN);
__alias(__asan_loadN)
-void __asan_loadN_noabort(unsigned long, size_t);
+void __asan_loadN_noabort(void *, ssize_t);
EXPORT_SYMBOL(__asan_loadN_noabort);
-void __asan_storeN(unsigned long addr, size_t size)
+void __asan_storeN(void *addr, ssize_t size)
{
- check_memory_region(addr, size, true, _RET_IP_);
+ kasan_check_range(addr, size, true, _RET_IP_);
}
EXPORT_SYMBOL(__asan_storeN);
__alias(__asan_storeN)
-void __asan_storeN_noabort(unsigned long, size_t);
+void __asan_storeN_noabort(void *, ssize_t);
EXPORT_SYMBOL(__asan_storeN_noabort);
/* to shut up compiler complaints */
@@ -277,42 +285,41 @@ void __asan_handle_no_return(void) {}
EXPORT_SYMBOL(__asan_handle_no_return);
/* Emitted by compiler to poison alloca()ed objects. */
-void __asan_alloca_poison(unsigned long addr, size_t size)
+void __asan_alloca_poison(void *addr, ssize_t size)
{
- size_t rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
+ size_t rounded_up_size = round_up(size, KASAN_GRANULE_SIZE);
size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) -
rounded_up_size;
- size_t rounded_down_size = round_down(size, KASAN_SHADOW_SCALE_SIZE);
+ size_t rounded_down_size = round_down(size, KASAN_GRANULE_SIZE);
const void *left_redzone = (const void *)(addr -
KASAN_ALLOCA_REDZONE_SIZE);
const void *right_redzone = (const void *)(addr + rounded_up_size);
- WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
+ WARN_ON(!IS_ALIGNED((unsigned long)addr, KASAN_ALLOCA_REDZONE_SIZE));
- kasan_unpoison_shadow((const void *)(addr + rounded_down_size),
- size - rounded_down_size);
- kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
- KASAN_ALLOCA_LEFT);
- kasan_poison_shadow(right_redzone,
- padding_size + KASAN_ALLOCA_REDZONE_SIZE,
- KASAN_ALLOCA_RIGHT);
+ kasan_unpoison((const void *)(addr + rounded_down_size),
+ size - rounded_down_size, false);
+ kasan_poison(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
+ KASAN_ALLOCA_LEFT, false);
+ kasan_poison(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE,
+ KASAN_ALLOCA_RIGHT, false);
}
EXPORT_SYMBOL(__asan_alloca_poison);
/* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */
-void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
+void __asan_allocas_unpoison(void *stack_top, ssize_t stack_bottom)
{
- if (unlikely(!stack_top || stack_top > stack_bottom))
+ if (unlikely(!stack_top || stack_top > (void *)stack_bottom))
return;
- kasan_unpoison_shadow(stack_top, stack_bottom - stack_top);
+ kasan_unpoison(stack_top, (void *)stack_bottom - stack_top, false);
}
EXPORT_SYMBOL(__asan_allocas_unpoison);
/* Emitted by the compiler to [un]poison local variables. */
#define DEFINE_ASAN_SET_SHADOW(byte) \
- void __asan_set_shadow_##byte(const void *addr, size_t size) \
+ void __asan_set_shadow_##byte(const void *addr, ssize_t size) \
{ \
__memset((void *)addr, 0x##byte, size); \
} \
@@ -325,45 +332,194 @@ DEFINE_ASAN_SET_SHADOW(f3);
DEFINE_ASAN_SET_SHADOW(f5);
DEFINE_ASAN_SET_SHADOW(f8);
-void kasan_record_aux_stack(void *addr)
+/* Only allow cache merging when no per-object metadata is present. */
+slab_flags_t kasan_never_merge(void)
{
- struct page *page = kasan_addr_to_page(addr);
- struct kmem_cache *cache;
- struct kasan_alloc_meta *alloc_info;
- void *object;
+ if (!kasan_requires_meta())
+ return 0;
+ return SLAB_KASAN;
+}
- if (!(page && PageSlab(page)))
- return;
+/*
+ * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
+ * For larger allocations, larger redzones are used.
+ */
+static inline unsigned int optimal_redzone(unsigned int object_size)
+{
+ return
+ object_size <= 64 - 16 ? 16 :
+ object_size <= 128 - 32 ? 32 :
+ object_size <= 512 - 64 ? 64 :
+ object_size <= 4096 - 128 ? 128 :
+ object_size <= (1 << 14) - 256 ? 256 :
+ object_size <= (1 << 15) - 512 ? 512 :
+ object_size <= (1 << 16) - 1024 ? 1024 : 2048;
+}
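
A few worked values of the policy above, included for illustration only:

/*
 * Examples: object_size   32 ->  16-byte redzone
 *           object_size  100 ->  64
 *           object_size  500 -> 128
 *           object_size 4000 -> 256
 */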
+
+void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
+ slab_flags_t *flags)
+{
+ unsigned int ok_size;
+ unsigned int optimal_size;
- cache = page->slab_cache;
- object = nearest_obj(cache, page, addr);
- alloc_info = get_alloc_info(cache, object);
+ if (!kasan_requires_meta())
+ return;
/*
- * record the last two call_rcu() call stacks.
+ * SLAB_KASAN is used to mark caches that are sanitized by KASAN
+ * and that thus have per-object metadata.
+ * Currently this flag is used in two places:
+ * 1. In slab_ksize() to account for per-object metadata when
+ * calculating the size of the accessible memory within the object.
+ * 2. In slab_common.c via kasan_never_merge() to prevent merging of
+ * caches with per-object metadata.
*/
- alloc_info->aux_stack[1] = alloc_info->aux_stack[0];
- alloc_info->aux_stack[0] = kasan_save_stack(GFP_NOWAIT);
-}
+ *flags |= SLAB_KASAN;
-void kasan_set_free_info(struct kmem_cache *cache,
- void *object, u8 tag)
-{
- struct kasan_free_meta *free_meta;
+ ok_size = *size;
- free_meta = get_free_info(cache, object);
- kasan_set_track(&free_meta->free_track, GFP_NOWAIT);
+ /* Add alloc meta into redzone. */
+ cache->kasan_info.alloc_meta_offset = *size;
+ *size += sizeof(struct kasan_alloc_meta);
/*
- * the object was freed and has free track set
+ * If alloc meta doesn't fit, don't add it.
+ * This can only happen with SLAB, as it has KMALLOC_MAX_SIZE equal
+ * to KMALLOC_MAX_CACHE_SIZE and doesn't fall back to page_alloc for
+ * larger sizes.
*/
- *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREETRACK;
+ if (*size > KMALLOC_MAX_SIZE) {
+ cache->kasan_info.alloc_meta_offset = 0;
+ *size = ok_size;
+ /* Continue, since free meta might still fit. */
+ }
+
+ /*
+ * Add free meta into redzone when it's not possible to store
+ * it in the object. This is the case when:
+ * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can
+ * be touched after it was freed, or
+ * 2. Object has a constructor, which means it's expected to
+ * retain its content until the next allocation, or
+ * 3. Object is too small.
+ * Otherwise cache->kasan_info.free_meta_offset = 0 is implied.
+ */
+ if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor ||
+ cache->object_size < sizeof(struct kasan_free_meta)) {
+ ok_size = *size;
+
+ cache->kasan_info.free_meta_offset = *size;
+ *size += sizeof(struct kasan_free_meta);
+
+ /* If free meta doesn't fit, don't add it. */
+ if (*size > KMALLOC_MAX_SIZE) {
+ cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META;
+ *size = ok_size;
+ }
+ }
+
+ /* Calculate size with optimal redzone. */
+ optimal_size = cache->object_size + optimal_redzone(cache->object_size);
+ /* Limit it with KMALLOC_MAX_SIZE (relevant for SLAB only). */
+ if (optimal_size > KMALLOC_MAX_SIZE)
+ optimal_size = KMALLOC_MAX_SIZE;
+ /* Use optimal size if the size with added metas is not large enough. */
+ if (*size < optimal_size)
+ *size = optimal_size;
+}
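
For illustration (metadata sizes assumed, and *size assumed to still equal the object size on entry), how the layout logic above plays out for a simple cache:

/*
 * Worked example (illustration only): for a cache with object_size == 64,
 * no constructor and no SLAB_TYPESAFE_BY_RCU:
 *   - alloc meta is placed in the redzone: alloc_meta_offset = 64 and
 *     *size grows by sizeof(struct kasan_alloc_meta);
 *   - free meta fits inside the freed object, so free_meta_offset stays 0;
 *   - optimal_size = 64 + optimal_redzone(64) = 96, and *size is raised to
 *     96 if the added metadata did not already push it that far.
 */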
+
+struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache,
+ const void *object)
+{
+ if (!cache->kasan_info.alloc_meta_offset)
+ return NULL;
+ return (void *)object + cache->kasan_info.alloc_meta_offset;
}
-struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
- void *object, u8 tag)
+struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
+ const void *object)
{
- if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_KMALLOC_FREETRACK)
+ BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
+ if (cache->kasan_info.free_meta_offset == KASAN_NO_FREE_META)
return NULL;
- return &get_free_info(cache, object)->free_track;
+ return (void *)object + cache->kasan_info.free_meta_offset;
+}
+
+void kasan_init_object_meta(struct kmem_cache *cache, const void *object)
+{
+ struct kasan_alloc_meta *alloc_meta;
+
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (alloc_meta)
+ __memset(alloc_meta, 0, sizeof(*alloc_meta));
+}
+
+size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object)
+{
+ struct kasan_cache *info = &cache->kasan_info;
+
+ if (!kasan_requires_meta())
+ return 0;
+
+ if (in_object)
+ return (info->free_meta_offset ?
+ 0 : sizeof(struct kasan_free_meta));
+ else
+ return (info->alloc_meta_offset ?
+ sizeof(struct kasan_alloc_meta) : 0) +
+ ((info->free_meta_offset &&
+ info->free_meta_offset != KASAN_NO_FREE_META) ?
+ sizeof(struct kasan_free_meta) : 0);
+}
+
+static void __kasan_record_aux_stack(void *addr, bool can_alloc)
+{
+ struct slab *slab = kasan_addr_to_slab(addr);
+ struct kmem_cache *cache;
+ struct kasan_alloc_meta *alloc_meta;
+ void *object;
+
+ if (is_kfence_address(addr) || !slab)
+ return;
+
+ cache = slab->slab_cache;
+ object = nearest_obj(cache, slab, addr);
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (!alloc_meta)
+ return;
+
+ alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0];
+ alloc_meta->aux_stack[0] = kasan_save_stack(0, can_alloc);
+}
+
+void kasan_record_aux_stack(void *addr)
+{
+ return __kasan_record_aux_stack(addr, true);
+}
+
+void kasan_record_aux_stack_noalloc(void *addr)
+{
+ return __kasan_record_aux_stack(addr, false);
+}
+
+void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
+{
+ struct kasan_alloc_meta *alloc_meta;
+
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (alloc_meta)
+ kasan_set_track(&alloc_meta->alloc_track, flags);
+}
+
+void kasan_save_free_info(struct kmem_cache *cache, void *object)
+{
+ struct kasan_free_meta *free_meta;
+
+ free_meta = kasan_get_free_meta(cache, object);
+ if (!free_meta)
+ return;
+
+ kasan_set_track(&free_meta->free_track, 0);
+ /* The object was freed and has free track set. */
+ *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREETRACK;
}
diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c
deleted file mode 100644
index a38c7a9e192a..000000000000
--- a/mm/kasan/generic_report.c
+++ /dev/null
@@ -1,165 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * This file contains generic KASAN specific error reporting code.
- *
- * Copyright (c) 2014 Samsung Electronics Co., Ltd.
- * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
- *
- * Some code borrowed from https://github.com/xairy/kasan-prototype by
- * Andrey Konovalov <andreyknvl@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/bitops.h>
-#include <linux/ftrace.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/printk.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/stackdepot.h>
-#include <linux/stacktrace.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/kasan.h>
-#include <linux/module.h>
-
-#include <asm/sections.h>
-
-#include "kasan.h"
-#include "../slab.h"
-
-void *find_first_bad_addr(void *addr, size_t size)
-{
- void *p = addr;
-
- while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p)))
- p += KASAN_SHADOW_SCALE_SIZE;
- return p;
-}
-
-static const char *get_shadow_bug_type(struct kasan_access_info *info)
-{
- const char *bug_type = "unknown-crash";
- u8 *shadow_addr;
-
- shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr);
-
- /*
- * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look
- * at the next shadow byte to determine the type of the bad access.
- */
- if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1)
- shadow_addr++;
-
- switch (*shadow_addr) {
- case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
- /*
- * In theory it's still possible to see these shadow values
- * due to a data race in the kernel code.
- */
- bug_type = "out-of-bounds";
- break;
- case KASAN_PAGE_REDZONE:
- case KASAN_KMALLOC_REDZONE:
- bug_type = "slab-out-of-bounds";
- break;
- case KASAN_GLOBAL_REDZONE:
- bug_type = "global-out-of-bounds";
- break;
- case KASAN_STACK_LEFT:
- case KASAN_STACK_MID:
- case KASAN_STACK_RIGHT:
- case KASAN_STACK_PARTIAL:
- bug_type = "stack-out-of-bounds";
- break;
- case KASAN_FREE_PAGE:
- case KASAN_KMALLOC_FREE:
- case KASAN_KMALLOC_FREETRACK:
- bug_type = "use-after-free";
- break;
- case KASAN_ALLOCA_LEFT:
- case KASAN_ALLOCA_RIGHT:
- bug_type = "alloca-out-of-bounds";
- break;
- case KASAN_VMALLOC_INVALID:
- bug_type = "vmalloc-out-of-bounds";
- break;
- }
-
- return bug_type;
-}
-
-static const char *get_wild_bug_type(struct kasan_access_info *info)
-{
- const char *bug_type = "unknown-crash";
-
- if ((unsigned long)info->access_addr < PAGE_SIZE)
- bug_type = "null-ptr-deref";
- else if ((unsigned long)info->access_addr < TASK_SIZE)
- bug_type = "user-memory-access";
- else
- bug_type = "wild-memory-access";
-
- return bug_type;
-}
-
-const char *get_bug_type(struct kasan_access_info *info)
-{
- /*
- * If access_size is a negative number, then it has reason to be
- * defined as out-of-bounds bug type.
- *
- * Casting negative numbers to size_t would indeed turn up as
- * a large size_t and its value will be larger than ULONG_MAX/2,
- * so that this can qualify as out-of-bounds.
- */
- if (info->access_addr + info->access_size < info->access_addr)
- return "out-of-bounds";
-
- if (addr_has_shadow(info->access_addr))
- return get_shadow_bug_type(info);
- return get_wild_bug_type(info);
-}
-
-#define DEFINE_ASAN_REPORT_LOAD(size) \
-void __asan_report_load##size##_noabort(unsigned long addr) \
-{ \
- kasan_report(addr, size, false, _RET_IP_); \
-} \
-EXPORT_SYMBOL(__asan_report_load##size##_noabort)
-
-#define DEFINE_ASAN_REPORT_STORE(size) \
-void __asan_report_store##size##_noabort(unsigned long addr) \
-{ \
- kasan_report(addr, size, true, _RET_IP_); \
-} \
-EXPORT_SYMBOL(__asan_report_store##size##_noabort)
-
-DEFINE_ASAN_REPORT_LOAD(1);
-DEFINE_ASAN_REPORT_LOAD(2);
-DEFINE_ASAN_REPORT_LOAD(4);
-DEFINE_ASAN_REPORT_LOAD(8);
-DEFINE_ASAN_REPORT_LOAD(16);
-DEFINE_ASAN_REPORT_STORE(1);
-DEFINE_ASAN_REPORT_STORE(2);
-DEFINE_ASAN_REPORT_STORE(4);
-DEFINE_ASAN_REPORT_STORE(8);
-DEFINE_ASAN_REPORT_STORE(16);
-
-void __asan_report_load_n_noabort(unsigned long addr, size_t size)
-{
- kasan_report(addr, size, false, _RET_IP_);
-}
-EXPORT_SYMBOL(__asan_report_load_n_noabort);
-
-void __asan_report_store_n_noabort(unsigned long addr, size_t size)
-{
- kasan_report(addr, size, true, _RET_IP_);
-}
-EXPORT_SYMBOL(__asan_report_store_n_noabort);
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
new file mode 100644
index 000000000000..06141bbc1e51
--- /dev/null
+++ b/mm/kasan/hw_tags.c
@@ -0,0 +1,396 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains core hardware tag-based KASAN code.
+ *
+ * Copyright (c) 2020 Google, Inc.
+ * Author: Andrey Konovalov <andreyknvl@google.com>
+ */
+
+#define pr_fmt(fmt) "kasan: " fmt
+
+#include <linux/init.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/static_key.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "kasan.h"
+
+enum kasan_arg {
+ KASAN_ARG_DEFAULT,
+ KASAN_ARG_OFF,
+ KASAN_ARG_ON,
+};
+
+enum kasan_arg_mode {
+ KASAN_ARG_MODE_DEFAULT,
+ KASAN_ARG_MODE_SYNC,
+ KASAN_ARG_MODE_ASYNC,
+ KASAN_ARG_MODE_ASYMM,
+};
+
+enum kasan_arg_vmalloc {
+ KASAN_ARG_VMALLOC_DEFAULT,
+ KASAN_ARG_VMALLOC_OFF,
+ KASAN_ARG_VMALLOC_ON,
+};
+
+static enum kasan_arg kasan_arg __ro_after_init;
+static enum kasan_arg_mode kasan_arg_mode __ro_after_init;
+static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata;
+
+/*
+ * Whether KASAN is enabled at all.
+ * The value remains false until KASAN is initialized by kasan_init_hw_tags().
+ */
+DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled);
+EXPORT_SYMBOL(kasan_flag_enabled);
+
+/*
+ * Whether the selected mode is synchronous, asynchronous, or asymmetric.
+ * Defaults to KASAN_MODE_SYNC.
+ */
+enum kasan_mode kasan_mode __ro_after_init;
+EXPORT_SYMBOL_GPL(kasan_mode);
+
+/* Whether to enable vmalloc tagging. */
+DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc);
+
+#define PAGE_ALLOC_SAMPLE_DEFAULT 1
+#define PAGE_ALLOC_SAMPLE_ORDER_DEFAULT 3
+
+/*
+ * Sampling interval of page_alloc allocation (un)poisoning.
+ * Defaults to no sampling.
+ */
+unsigned long kasan_page_alloc_sample = PAGE_ALLOC_SAMPLE_DEFAULT;
+
+/*
+ * Minimum order of page_alloc allocations to be affected by sampling.
+ * The default value is chosen to match both
+ * PAGE_ALLOC_COSTLY_ORDER and SKB_FRAG_PAGE_ORDER.
+ */
+unsigned int kasan_page_alloc_sample_order = PAGE_ALLOC_SAMPLE_ORDER_DEFAULT;
+
+DEFINE_PER_CPU(long, kasan_page_alloc_skip);
+
+/* kasan=off/on */
+static int __init early_kasan_flag(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "off"))
+ kasan_arg = KASAN_ARG_OFF;
+ else if (!strcmp(arg, "on"))
+ kasan_arg = KASAN_ARG_ON;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("kasan", early_kasan_flag);
+
+/* kasan.mode=sync/async/asymm */
+static int __init early_kasan_mode(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "sync"))
+ kasan_arg_mode = KASAN_ARG_MODE_SYNC;
+ else if (!strcmp(arg, "async"))
+ kasan_arg_mode = KASAN_ARG_MODE_ASYNC;
+ else if (!strcmp(arg, "asymm"))
+ kasan_arg_mode = KASAN_ARG_MODE_ASYMM;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("kasan.mode", early_kasan_mode);
+
+/* kasan.vmalloc=off/on */
+static int __init early_kasan_flag_vmalloc(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "off"))
+ kasan_arg_vmalloc = KASAN_ARG_VMALLOC_OFF;
+ else if (!strcmp(arg, "on"))
+ kasan_arg_vmalloc = KASAN_ARG_VMALLOC_ON;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("kasan.vmalloc", early_kasan_flag_vmalloc);
+
+static inline const char *kasan_mode_info(void)
+{
+ if (kasan_mode == KASAN_MODE_ASYNC)
+ return "async";
+ else if (kasan_mode == KASAN_MODE_ASYMM)
+ return "asymm";
+ else
+ return "sync";
+}
+
+/* kasan.page_alloc.sample=<sampling interval> */
+static int __init early_kasan_flag_page_alloc_sample(char *arg)
+{
+ int rv;
+
+ if (!arg)
+ return -EINVAL;
+
+ rv = kstrtoul(arg, 0, &kasan_page_alloc_sample);
+ if (rv)
+ return rv;
+
+ if (!kasan_page_alloc_sample || kasan_page_alloc_sample > LONG_MAX) {
+ kasan_page_alloc_sample = PAGE_ALLOC_SAMPLE_DEFAULT;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+early_param("kasan.page_alloc.sample", early_kasan_flag_page_alloc_sample);
+
+/* kasan.page_alloc.sample.order=<minimum page order> */
+static int __init early_kasan_flag_page_alloc_sample_order(char *arg)
+{
+ int rv;
+
+ if (!arg)
+ return -EINVAL;
+
+ rv = kstrtouint(arg, 0, &kasan_page_alloc_sample_order);
+ if (rv)
+ return rv;
+
+ if (kasan_page_alloc_sample_order > INT_MAX) {
+ kasan_page_alloc_sample_order = PAGE_ALLOC_SAMPLE_ORDER_DEFAULT;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+early_param("kasan.page_alloc.sample.order", early_kasan_flag_page_alloc_sample_order);
+
+/*
+ * kasan_init_hw_tags_cpu() is called for each CPU.
+ * Not marked as __init as a CPU can be hot-plugged after boot.
+ */
+void kasan_init_hw_tags_cpu(void)
+{
+ /*
+ * There's no need to check that the hardware is MTE-capable here,
+ * as this function is only called for MTE-capable hardware.
+ */
+
+ /*
+ * If KASAN is disabled via command line, don't initialize it.
+ * When this function is called, kasan_flag_enabled is not yet
+ * set by kasan_init_hw_tags(). Thus, check kasan_arg instead.
+ */
+ if (kasan_arg == KASAN_ARG_OFF)
+ return;
+
+ /*
+ * Enable async or asymm modes only when explicitly requested
+ * through the command line.
+ */
+ kasan_enable_hw_tags();
+}
+
+/* kasan_init_hw_tags() is called once on boot CPU. */
+void __init kasan_init_hw_tags(void)
+{
+ /* If hardware doesn't support MTE, don't initialize KASAN. */
+ if (!system_supports_mte())
+ return;
+
+ /* If KASAN is disabled via command line, don't initialize it. */
+ if (kasan_arg == KASAN_ARG_OFF)
+ return;
+
+ switch (kasan_arg_mode) {
+ case KASAN_ARG_MODE_DEFAULT:
+ /* Default is specified by kasan_mode definition. */
+ break;
+ case KASAN_ARG_MODE_SYNC:
+ kasan_mode = KASAN_MODE_SYNC;
+ break;
+ case KASAN_ARG_MODE_ASYNC:
+ kasan_mode = KASAN_MODE_ASYNC;
+ break;
+ case KASAN_ARG_MODE_ASYMM:
+ kasan_mode = KASAN_MODE_ASYMM;
+ break;
+ }
+
+ switch (kasan_arg_vmalloc) {
+ case KASAN_ARG_VMALLOC_DEFAULT:
+ /* Default is specified by kasan_flag_vmalloc definition. */
+ break;
+ case KASAN_ARG_VMALLOC_OFF:
+ static_branch_disable(&kasan_flag_vmalloc);
+ break;
+ case KASAN_ARG_VMALLOC_ON:
+ static_branch_enable(&kasan_flag_vmalloc);
+ break;
+ }
+
+ kasan_init_tags();
+
+ /* KASAN is now initialized, enable it. */
+ static_branch_enable(&kasan_flag_enabled);
+
+ pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n",
+ kasan_mode_info(),
+ kasan_vmalloc_enabled() ? "on" : "off",
+ kasan_stack_collection_enabled() ? "on" : "off");
+}
+
+#ifdef CONFIG_KASAN_VMALLOC
+
+static void unpoison_vmalloc_pages(const void *addr, u8 tag)
+{
+ struct vm_struct *area;
+ int i;
+
+ /*
+ * As hardware tag-based KASAN only tags VM_ALLOC vmalloc allocations
+ * (see the comment in __kasan_unpoison_vmalloc), all of the pages
+ * should belong to a single area.
+ */
+ area = find_vm_area((void *)addr);
+ if (WARN_ON(!area))
+ return;
+
+ for (i = 0; i < area->nr_pages; i++) {
+ struct page *page = area->pages[i];
+
+ page_kasan_tag_set(page, tag);
+ }
+}
+
+static void init_vmalloc_pages(const void *start, unsigned long size)
+{
+ const void *addr;
+
+ for (addr = start; addr < start + size; addr += PAGE_SIZE) {
+ struct page *page = vmalloc_to_page(addr);
+
+ clear_highpage_kasan_tagged(page);
+ }
+}
+
+void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
+ kasan_vmalloc_flags_t flags)
+{
+ u8 tag;
+ unsigned long redzone_start, redzone_size;
+
+ if (!kasan_vmalloc_enabled()) {
+ if (flags & KASAN_VMALLOC_INIT)
+ init_vmalloc_pages(start, size);
+ return (void *)start;
+ }
+
+ /*
+ * Don't tag non-VM_ALLOC mappings, as:
+ *
+ * 1. Unlike the software KASAN modes, hardware tag-based KASAN only
+ * supports tagging physical memory. Therefore, it can only tag a
+ * single mapping of normal physical pages.
+ * 2. Hardware tag-based KASAN can only tag memory mapped with special
+ * mapping protection bits, see arch_vmap_pgprot_tagged().
+ * As non-VM_ALLOC mappings can be mapped outside of vmalloc code,
+ * providing these bits would require tracking all non-VM_ALLOC
+ * mappers.
+ *
+ * Thus, for VM_ALLOC mappings, hardware tag-based KASAN only tags
+ * the first virtual mapping, which is created by vmalloc().
+ * Tagging the page_alloc memory backing that vmalloc() allocation is
+ * skipped, see ___GFP_SKIP_KASAN.
+ *
+ * For non-VM_ALLOC allocations, page_alloc memory is tagged as usual.
+ */
+ if (!(flags & KASAN_VMALLOC_VM_ALLOC)) {
+ WARN_ON(flags & KASAN_VMALLOC_INIT);
+ return (void *)start;
+ }
+
+ /*
+ * Don't tag executable memory.
+ * The kernel doesn't tolerate having the PC register tagged.
+ */
+ if (!(flags & KASAN_VMALLOC_PROT_NORMAL)) {
+ WARN_ON(flags & KASAN_VMALLOC_INIT);
+ return (void *)start;
+ }
+
+ tag = kasan_random_tag();
+ start = set_tag(start, tag);
+
+ /* Unpoison and initialize memory up to size. */
+ kasan_unpoison(start, size, flags & KASAN_VMALLOC_INIT);
+
+ /*
+ * Explicitly poison and initialize the in-page vmalloc() redzone.
+ * Unlike software KASAN modes, hardware tag-based KASAN doesn't
+ * unpoison memory when populating shadow for vmalloc() space.
+ */
+ redzone_start = round_up((unsigned long)start + size,
+ KASAN_GRANULE_SIZE);
+ redzone_size = round_up(redzone_start, PAGE_SIZE) - redzone_start;
+ kasan_poison((void *)redzone_start, redzone_size, KASAN_TAG_INVALID,
+ flags & KASAN_VMALLOC_INIT);
+
+ /*
+ * Set per-page tag flags to allow accessing physical memory for the
+ * vmalloc() mapping through page_address(vmalloc_to_page()).
+ */
+ unpoison_vmalloc_pages(start, tag);
+
+ return (void *)start;
+}
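
For illustration, the in-page redzone computed above with invented numbers:

/*
 * Worked example (values invented): for a page-aligned VM_ALLOC mapping
 * with size == 100, 16-byte MTE granules and 4K pages, redzone_start is
 * start + 112 and redzone_size is 4096 - 112 = 3984, i.e. the rest of the
 * page past the request is poisoned with KASAN_TAG_INVALID.
 */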
+
+void __kasan_poison_vmalloc(const void *start, unsigned long size)
+{
+ /*
+ * No tagging here.
+ * The physical pages backing the vmalloc() allocation are poisoned
+ * through the usual page_alloc paths.
+ */
+}
+
+#endif
+
+void kasan_enable_hw_tags(void)
+{
+ if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC)
+ hw_enable_tag_checks_async();
+ else if (kasan_arg_mode == KASAN_ARG_MODE_ASYMM)
+ hw_enable_tag_checks_asymm();
+ else
+ hw_enable_tag_checks_sync();
+}
+
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
+
+EXPORT_SYMBOL_GPL(kasan_enable_hw_tags);
+
+void kasan_force_async_fault(void)
+{
+ hw_force_async_tag_fault();
+}
+EXPORT_SYMBOL_GPL(kasan_force_async_fault);
+
+#endif
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index fe6be0be1f76..dcfec277e839 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -1,14 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * This file contains some kasan initialization code.
+ * This file contains KASAN shadow initialization code.
*
* Copyright (c) 2015 Samsung Electronics Co., Ltd.
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#include <linux/memblock.h>
@@ -46,7 +41,7 @@ static inline bool kasan_p4d_table(pgd_t pgd)
}
#endif
#if CONFIG_PGTABLE_LEVELS > 3
-pud_t kasan_early_shadow_pud[PTRS_PER_PUD] __page_aligned_bss;
+pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD] __page_aligned_bss;
static inline bool kasan_pud_table(p4d_t p4d)
{
return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud));
@@ -58,7 +53,7 @@ static inline bool kasan_pud_table(p4d_t p4d)
}
#endif
#if CONFIG_PGTABLE_LEVELS > 2
-pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD] __page_aligned_bss;
+pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD] __page_aligned_bss;
static inline bool kasan_pmd_table(pud_t pud)
{
return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd));
@@ -69,7 +64,8 @@ static inline bool kasan_pmd_table(pud_t pud)
return false;
}
#endif
-pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss;
+pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE + PTE_HWTABLE_PTRS]
+ __page_aligned_bss;
static inline bool kasan_pte_table(pmd_t pmd)
{
@@ -224,8 +220,8 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
/**
* kasan_populate_early_shadow - populate shadow memory region with
* kasan_early_shadow_page
- * @shadow_start - start of the memory range to populate
- * @shadow_end - end of the memory range to populate
+ * @shadow_start: start of the memory range to populate
+ * @shadow_end: end of the memory range to populate
*/
int __ref kasan_populate_early_shadow(const void *shadow_start,
const void *shadow_end)
@@ -290,7 +286,7 @@ static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd)
for (i = 0; i < PTRS_PER_PTE; i++) {
pte = pte_start + i;
- if (!pte_none(*pte))
+ if (!pte_none(ptep_get(pte)))
return;
}
@@ -347,16 +343,19 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr,
unsigned long end)
{
unsigned long next;
+ pte_t ptent;
for (; addr < end; addr = next, pte++) {
next = (addr + PAGE_SIZE) & PAGE_MASK;
if (next > end)
next = end;
- if (!pte_present(*pte))
+ ptent = ptep_get(pte);
+
+ if (!pte_present(ptent))
continue;
- if (WARN_ON(!kasan_early_shadow_page_entry(*pte)))
+ if (WARN_ON(!kasan_early_shadow_page_entry(ptent)))
continue;
pte_clear(&init_mm, addr, pte);
}
@@ -377,9 +376,10 @@ static void kasan_remove_pmd_table(pmd_t *pmd, unsigned long addr,
if (kasan_pte_table(*pmd)) {
if (IS_ALIGNED(addr, PMD_SIZE) &&
- IS_ALIGNED(next, PMD_SIZE))
+ IS_ALIGNED(next, PMD_SIZE)) {
pmd_clear(pmd);
- continue;
+ continue;
+ }
}
pte = pte_offset_kernel(pmd, addr);
kasan_remove_pte_table(pte, addr, next);
@@ -402,9 +402,10 @@ static void kasan_remove_pud_table(pud_t *pud, unsigned long addr,
if (kasan_pmd_table(*pud)) {
if (IS_ALIGNED(addr, PUD_SIZE) &&
- IS_ALIGNED(next, PUD_SIZE))
+ IS_ALIGNED(next, PUD_SIZE)) {
pud_clear(pud);
- continue;
+ continue;
+ }
}
pmd = pmd_offset(pud, addr);
pmd_base = pmd_offset(pud, 0);
@@ -428,9 +429,10 @@ static void kasan_remove_p4d_table(p4d_t *p4d, unsigned long addr,
if (kasan_pud_table(*p4d)) {
if (IS_ALIGNED(addr, P4D_SIZE) &&
- IS_ALIGNED(next, P4D_SIZE))
+ IS_ALIGNED(next, P4D_SIZE)) {
p4d_clear(p4d);
- continue;
+ continue;
+ }
}
pud = pud_offset(p4d, addr);
kasan_remove_pud_table(pud, addr, next);
@@ -446,9 +448,8 @@ void kasan_remove_zero_shadow(void *start, unsigned long size)
addr = (unsigned long)kasan_mem_to_shadow(start);
end = addr + (size >> KASAN_SHADOW_SCALE_SHIFT);
- if (WARN_ON((unsigned long)start %
- (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) ||
- WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)))
+ if (WARN_ON((unsigned long)start % KASAN_MEMORY_PER_SHADOW_PAGE) ||
+ WARN_ON(size % KASAN_MEMORY_PER_SHADOW_PAGE))
return;
for (; addr < end; addr = next) {
@@ -462,9 +463,10 @@ void kasan_remove_zero_shadow(void *start, unsigned long size)
if (kasan_p4d_table(*pgd)) {
if (IS_ALIGNED(addr, PGDIR_SIZE) &&
- IS_ALIGNED(next, PGDIR_SIZE))
+ IS_ALIGNED(next, PGDIR_SIZE)) {
pgd_clear(pgd);
- continue;
+ continue;
+ }
}
p4d = p4d_offset(pgd, addr);
@@ -481,14 +483,12 @@ int kasan_add_zero_shadow(void *start, unsigned long size)
shadow_start = kasan_mem_to_shadow(start);
shadow_end = shadow_start + (size >> KASAN_SHADOW_SCALE_SHIFT);
- if (WARN_ON((unsigned long)start %
- (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) ||
- WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)))
+ if (WARN_ON((unsigned long)start % KASAN_MEMORY_PER_SHADOW_PAGE) ||
+ WARN_ON(size % KASAN_MEMORY_PER_SHADOW_PAGE))
return -EINVAL;
ret = kasan_populate_early_shadow(shadow_start, shadow_end);
if (ret)
- kasan_remove_zero_shadow(shadow_start,
- size >> KASAN_SHADOW_SCALE_SHIFT);
+ kasan_remove_zero_shadow(start, size);
return ret;
}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index ac499456740f..2e973b36fe07 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -2,83 +2,234 @@
#ifndef __MM_KASAN_KASAN_H
#define __MM_KASAN_KASAN_H
+#include <linux/atomic.h>
#include <linux/kasan.h>
+#include <linux/kasan-tags.h>
+#include <linux/kfence.h>
#include <linux/stackdepot.h>
-#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
-#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1)
+#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
-#define KASAN_TAG_KERNEL 0xFF /* native kernel pointers tag */
-#define KASAN_TAG_INVALID 0xFE /* inaccessible memory tag */
-#define KASAN_TAG_MAX 0xFD /* maximum value for random tags */
+#include <linux/static_key.h>
+
+DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace);
+
+static inline bool kasan_stack_collection_enabled(void)
+{
+ return static_branch_unlikely(&kasan_flag_stacktrace);
+}
+
+#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
+
+static inline bool kasan_stack_collection_enabled(void)
+{
+ return true;
+}
+
+#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
+
+#ifdef CONFIG_KASAN_HW_TAGS
+
+#include "../slab.h"
+
+DECLARE_STATIC_KEY_TRUE(kasan_flag_vmalloc);
+
+enum kasan_mode {
+ KASAN_MODE_SYNC,
+ KASAN_MODE_ASYNC,
+ KASAN_MODE_ASYMM,
+};
+
+extern enum kasan_mode kasan_mode __ro_after_init;
+
+extern unsigned long kasan_page_alloc_sample;
+extern unsigned int kasan_page_alloc_sample_order;
+DECLARE_PER_CPU(long, kasan_page_alloc_skip);
+
+static inline bool kasan_vmalloc_enabled(void)
+{
+ return static_branch_likely(&kasan_flag_vmalloc);
+}
+
+static inline bool kasan_async_fault_possible(void)
+{
+ return kasan_mode == KASAN_MODE_ASYNC || kasan_mode == KASAN_MODE_ASYMM;
+}
+
+static inline bool kasan_sync_fault_possible(void)
+{
+ return kasan_mode == KASAN_MODE_SYNC || kasan_mode == KASAN_MODE_ASYMM;
+}
+
+static inline bool kasan_sample_page_alloc(unsigned int order)
+{
+ /* Fast-path for when sampling is disabled. */
+ if (kasan_page_alloc_sample == 1)
+ return true;
+
+ if (order < kasan_page_alloc_sample_order)
+ return true;
+
+ if (this_cpu_dec_return(kasan_page_alloc_skip) < 0) {
+ this_cpu_write(kasan_page_alloc_skip,
+ kasan_page_alloc_sample - 1);
+ return true;
+ }
+
+ return false;
+}
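
A worked pass through the sampling logic above, for illustration:

/*
 * Worked example (illustration only): with kasan_page_alloc_sample == 4 and
 * an allocation order >= kasan_page_alloc_sample_order, the per-CPU counter
 * goes 0 -> -1 (poison, reset to 3) -> 2 -> 1 -> 0 -> -1 (poison, ...),
 * i.e. roughly one in four such allocations is checked.
 */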
+
+#else /* CONFIG_KASAN_HW_TAGS */
+
+static inline bool kasan_async_fault_possible(void)
+{
+ return false;
+}
+
+static inline bool kasan_sync_fault_possible(void)
+{
+ return true;
+}
+
+static inline bool kasan_sample_page_alloc(unsigned int order)
+{
+ return true;
+}
+
+#endif /* CONFIG_KASAN_HW_TAGS */
#ifdef CONFIG_KASAN_GENERIC
-#define KASAN_FREE_PAGE 0xFF /* page was freed */
-#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
-#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */
-#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */
-#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */
+
+/* Generic KASAN uses per-object metadata to store stack traces. */
+static inline bool kasan_requires_meta(void)
+{
+ /*
+ * Technically, Generic KASAN always collects stack traces right now.
+ * However, let's use kasan_stack_collection_enabled() in case the
+ * kasan.stacktrace command-line argument is changed to affect
+ * Generic KASAN.
+ */
+ return kasan_stack_collection_enabled();
+}
+
+#else /* CONFIG_KASAN_GENERIC */
+
+/* Tag-based KASAN modes do not use per-object metadata. */
+static inline bool kasan_requires_meta(void)
+{
+ return false;
+}
+
+#endif /* CONFIG_KASAN_GENERIC */
+
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
+#define KASAN_GRANULE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
#else
-#define KASAN_FREE_PAGE KASAN_TAG_INVALID
-#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID
-#define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID
-#define KASAN_KMALLOC_FREE KASAN_TAG_INVALID
-#define KASAN_KMALLOC_FREETRACK KASAN_TAG_INVALID
+#include <asm/mte-kasan.h>
+#define KASAN_GRANULE_SIZE MTE_GRANULE_SIZE
#endif
-#define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */
-#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */
+#define KASAN_GRANULE_MASK (KASAN_GRANULE_SIZE - 1)
-/*
- * Stack redzone shadow values
- * (Those are compiler's ABI, don't change them)
- */
-#define KASAN_STACK_LEFT 0xF1
-#define KASAN_STACK_MID 0xF2
-#define KASAN_STACK_RIGHT 0xF3
-#define KASAN_STACK_PARTIAL 0xF4
+#define KASAN_MEMORY_PER_SHADOW_PAGE (KASAN_GRANULE_SIZE << PAGE_SHIFT)
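
For illustration, the value this macro takes in the generic mode:

/*
 * Example: with the generic mode's 8-byte granule and 4K pages this is
 * 8 << 12 = 32 KB, i.e. one page of shadow describes 32 KB of memory.
 */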
-/*
- * alloca redzone shadow values
- */
+#ifdef CONFIG_KASAN_GENERIC
+#define KASAN_PAGE_FREE 0xFF /* freed page */
+#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocation */
+#define KASAN_SLAB_REDZONE 0xFC /* redzone for slab object */
+#define KASAN_SLAB_FREE 0xFB /* freed slab object */
+#define KASAN_VMALLOC_INVALID 0xF8 /* inaccessible space in vmap area */
+#else
+#define KASAN_PAGE_FREE KASAN_TAG_INVALID
+#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID
+#define KASAN_SLAB_REDZONE KASAN_TAG_INVALID
+#define KASAN_SLAB_FREE KASAN_TAG_INVALID
+#define KASAN_VMALLOC_INVALID KASAN_TAG_INVALID /* only used for SW_TAGS */
+#endif
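+
+/*
+ * A reminder on the Generic mode shadow encoding: a shadow byte of 0 means
+ * that the whole granule is accessible, values 1..KASAN_GRANULE_SIZE-1 mean
+ * that only that many leading bytes are accessible, and the 0xFx/0xCx values
+ * in this header mark fully poisoned granules together with the reason.
+ */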
+
+#ifdef CONFIG_KASAN_GENERIC
+
+#define KASAN_SLAB_FREETRACK 0xFA /* freed slab object with free track */
+#define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */
+
+/* Stack redzone shadow values. Compiler ABI, do not change. */
+#define KASAN_STACK_LEFT 0xF1
+#define KASAN_STACK_MID 0xF2
+#define KASAN_STACK_RIGHT 0xF3
+#define KASAN_STACK_PARTIAL 0xF4
+
+/* alloca redzone shadow values. */
#define KASAN_ALLOCA_LEFT 0xCA
#define KASAN_ALLOCA_RIGHT 0xCB
+/* alloca redzone size. Compiler ABI, do not change. */
#define KASAN_ALLOCA_REDZONE_SIZE 32
-/*
- * Stack frame marker (compiler ABI).
- */
+/* Stack frame marker. Compiler ABI, do not change. */
#define KASAN_CURRENT_STACK_FRAME_MAGIC 0x41B58AB3
-/* Don't break randconfig/all*config builds */
+/* Dummy value to avoid breaking randconfig/all*config builds. */
#ifndef KASAN_ABI_VERSION
#define KASAN_ABI_VERSION 1
#endif
-struct kasan_access_info {
+#endif /* CONFIG_KASAN_GENERIC */
+
+/* Metadata layout customization. */
+#define META_BYTES_PER_BLOCK 1
+#define META_BLOCKS_PER_ROW 16
+#define META_BYTES_PER_ROW (META_BLOCKS_PER_ROW * META_BYTES_PER_BLOCK)
+#define META_MEM_BYTES_PER_ROW (META_BYTES_PER_ROW * KASAN_GRANULE_SIZE)
+#define META_ROWS_AROUND_ADDR 2
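+
+/*
+ * With the values above, each row of the memory state dump in a report shows
+ * META_BYTES_PER_ROW metadata bytes that cover META_MEM_BYTES_PER_ROW bytes
+ * of memory, and META_ROWS_AROUND_ADDR rows are printed on each side of the
+ * row containing the bad address.
+ */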
+
+#define KASAN_STACK_DEPTH 64
+
+struct kasan_track {
+ u32 pid;
+ depot_stack_handle_t stack;
+};
+
+enum kasan_report_type {
+ KASAN_REPORT_ACCESS,
+ KASAN_REPORT_INVALID_FREE,
+ KASAN_REPORT_DOUBLE_FREE,
+};
+
+struct kasan_report_info {
+ /* Filled in by kasan_report_*(). */
+ enum kasan_report_type type;
const void *access_addr;
- const void *first_bad_addr;
size_t access_size;
bool is_write;
unsigned long ip;
+
+ /* Filled in by the common reporting code. */
+ const void *first_bad_addr;
+ struct kmem_cache *cache;
+ void *object;
+ size_t alloc_size;
+
+ /* Filled in by the mode-specific reporting code. */
+ const char *bug_type;
+ struct kasan_track alloc_track;
+ struct kasan_track free_track;
};
-/* The layout of struct dictated by compiler */
+/* Do not change the struct layout: compiler ABI. */
struct kasan_source_location {
const char *filename;
int line_no;
int column_no;
};
-/* The layout of struct dictated by compiler */
+/* Do not change the struct layout: compiler ABI. */
struct kasan_global {
const void *beg; /* Address of the beginning of the global variable. */
size_t size; /* Size of the global variable. */
- size_t size_with_redzone; /* Size of the variable + size of the red zone. 32 bytes aligned */
+ size_t size_with_redzone; /* Size of the variable + size of the redzone. 32 bytes aligned. */
const void *name;
const void *module_name; /* Name of the module where the global variable is declared. */
- unsigned long has_dynamic_init; /* This needed for C++ */
+ unsigned long has_dynamic_init; /* This is needed for C++. */
#if KASAN_ABI_VERSION >= 4
struct kasan_source_location *location;
#endif
@@ -87,57 +238,58 @@ struct kasan_global {
#endif
};
-/**
- * Structures to keep alloc and free tracks *
- */
-
-#define KASAN_STACK_DEPTH 64
-
-struct kasan_track {
- u32 pid;
- depot_stack_handle_t stack;
-};
+/* Structures for keeping alloc and free meta. */
-#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
-#define KASAN_NR_FREE_STACKS 5
-#else
-#define KASAN_NR_FREE_STACKS 1
-#endif
+#ifdef CONFIG_KASAN_GENERIC
struct kasan_alloc_meta {
struct kasan_track alloc_track;
-#ifdef CONFIG_KASAN_GENERIC
- /*
- * call_rcu() call stack is stored into struct kasan_alloc_meta.
- * The free stack is stored into struct kasan_free_meta.
- */
+ /* Free track is stored in kasan_free_meta. */
depot_stack_handle_t aux_stack[2];
-#else
- struct kasan_track free_track[KASAN_NR_FREE_STACKS];
-#endif
-#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
- u8 free_pointer_tag[KASAN_NR_FREE_STACKS];
- u8 free_track_idx;
-#endif
};
struct qlist_node {
struct qlist_node *next;
};
+
+/*
+ * Free meta is stored either in the object itself or in the redzone after the
+ * object. In the former case, free meta offset is 0. In the latter case, the
+ * offset is between 0 and INT_MAX. INT_MAX marks that free meta is not present.
+ */
+#define KASAN_NO_FREE_META INT_MAX
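+
+/*
+ * Roughly how the offset is used (a sketch of kasan_get_free_meta()):
+ *
+ *	if (cache->kasan_info.free_meta_offset == KASAN_NO_FREE_META)
+ *		return NULL;
+ *	return (void *)object + cache->kasan_info.free_meta_offset;
+ */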
+
+/*
+ * Free meta is only used by Generic mode while the object is in quarantine.
+ * After that, the slab allocator stores the freelist pointer in the object.
+ */
struct kasan_free_meta {
- /* This field is used while the object is in the quarantine.
- * Otherwise it might be used for the allocator freelist.
- */
struct qlist_node quarantine_link;
-#ifdef CONFIG_KASAN_GENERIC
struct kasan_track free_track;
-#endif
};
-struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
- const void *object);
-struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
- const void *object);
+#endif /* CONFIG_KASAN_GENERIC */
+
+#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
+
+struct kasan_stack_ring_entry {
+ void *ptr;
+ size_t size;
+ u32 pid;
+ depot_stack_handle_t stack;
+ bool is_free;
+};
+
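+/*
+ * A ring buffer of alloc/free stack trace entries used by the tag-based
+ * modes. Writers claim slots by atomically advancing pos (wrapping modulo
+ * size); the reporting side takes the lock exclusively to read a consistent
+ * snapshot, see kasan_complete_mode_report_info().
+ */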
+struct kasan_stack_ring {
+ rwlock_t lock;
+ size_t size;
+ atomic64_t pos;
+ struct kasan_stack_ring_entry *entries;
+};
+
+#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
+
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
{
@@ -145,66 +297,87 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
<< KASAN_SHADOW_SCALE_SHIFT);
}
-static inline bool addr_has_shadow(const void *addr)
+static __always_inline bool addr_has_metadata(const void *addr)
{
- return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
+ return (kasan_reset_tag(addr) >=
+ kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
}
-void kasan_poison_shadow(const void *address, size_t size, u8 value);
-
/**
- * check_memory_region - Check memory region, and report if invalid access.
+ * kasan_check_range - Check memory region, and report if invalid access.
* @addr: the accessed address
* @size: the accessed size
* @write: true if access is a write access
* @ret_ip: return address
* @return: true if access was valid, false if invalid
*/
-bool check_memory_region(unsigned long addr, size_t size, bool write,
+bool kasan_check_range(const void *addr, size_t size, bool write,
unsigned long ret_ip);
-void *find_first_bad_addr(void *addr, size_t size);
-const char *get_bug_type(struct kasan_access_info *info);
+#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
-bool kasan_report(unsigned long addr, size_t size,
- bool is_write, unsigned long ip);
-void kasan_report_invalid_free(void *object, unsigned long ip);
+static __always_inline bool addr_has_metadata(const void *addr)
+{
+ return (is_vmalloc_addr(addr) || virt_addr_valid(addr));
+}
-struct page *kasan_addr_to_page(const void *addr);
+#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
-depot_stack_handle_t kasan_save_stack(gfp_t flags);
-void kasan_set_track(struct kasan_track *track, gfp_t flags);
-void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag);
-struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
- void *object, u8 tag);
+const void *kasan_find_first_bad_addr(const void *addr, size_t size);
+size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache);
+void kasan_complete_mode_report_info(struct kasan_report_info *info);
+void kasan_metadata_fetch_row(char *buffer, void *row);
-#if defined(CONFIG_KASAN_GENERIC) && \
- (defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
-void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
-void quarantine_reduce(void);
-void quarantine_remove_cache(struct kmem_cache *cache);
+#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
+void kasan_print_tags(u8 addr_tag, const void *addr);
#else
-static inline void quarantine_put(struct kasan_free_meta *info,
- struct kmem_cache *cache) { }
-static inline void quarantine_reduce(void) { }
-static inline void quarantine_remove_cache(struct kmem_cache *cache) { }
+static inline void kasan_print_tags(u8 addr_tag, const void *addr) { }
#endif
-#ifdef CONFIG_KASAN_SW_TAGS
+#if defined(CONFIG_KASAN_STACK)
+void kasan_print_address_stack_frame(const void *addr);
+#else
+static inline void kasan_print_address_stack_frame(const void *addr) { }
+#endif
-void print_tags(u8 addr_tag, const void *addr);
+#ifdef CONFIG_KASAN_GENERIC
+void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object);
+#else
+static inline void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) { }
+#endif
-u8 random_tag(void);
+bool kasan_report(const void *addr, size_t size,
+ bool is_write, unsigned long ip);
+void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type);
-#else
+struct slab *kasan_addr_to_slab(const void *addr);
-static inline void print_tags(u8 addr_tag, const void *addr) { }
+#ifdef CONFIG_KASAN_GENERIC
+void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size);
+void kasan_init_object_meta(struct kmem_cache *cache, const void *object);
+struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache,
+ const void *object);
+struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
+ const void *object);
+#else
+static inline void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size) { }
+static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { }
+#endif
-static inline u8 random_tag(void)
-{
- return 0;
-}
+depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc);
+void kasan_set_track(struct kasan_track *track, gfp_t flags);
+void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags);
+void kasan_save_free_info(struct kmem_cache *cache, void *object);
+#if defined(CONFIG_KASAN_GENERIC) && \
+ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
+bool kasan_quarantine_put(struct kmem_cache *cache, void *object);
+void kasan_quarantine_reduce(void);
+void kasan_quarantine_remove_cache(struct kmem_cache *cache);
+#else
+static inline bool kasan_quarantine_put(struct kmem_cache *cache, void *object) { return false; }
+static inline void kasan_quarantine_reduce(void) { }
+static inline void kasan_quarantine_remove_cache(struct kmem_cache *cache) { }
#endif
#ifndef arch_kasan_set_tag
@@ -213,87 +386,255 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
return addr;
}
#endif
-#ifndef arch_kasan_reset_tag
-#define arch_kasan_reset_tag(addr) ((void *)(addr))
-#endif
#ifndef arch_kasan_get_tag
#define arch_kasan_get_tag(addr) 0
#endif
#define set_tag(addr, tag) ((void *)arch_kasan_set_tag((addr), (tag)))
-#define reset_tag(addr) ((void *)arch_kasan_reset_tag(addr))
#define get_tag(addr) arch_kasan_get_tag(addr)
+#ifdef CONFIG_KASAN_HW_TAGS
+
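+/* Thin wrappers over the arch_*() hooks that back hardware tag-based KASAN. */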
+#define hw_enable_tag_checks_sync() arch_enable_tag_checks_sync()
+#define hw_enable_tag_checks_async() arch_enable_tag_checks_async()
+#define hw_enable_tag_checks_asymm() arch_enable_tag_checks_asymm()
+#define hw_suppress_tag_checks_start() arch_suppress_tag_checks_start()
+#define hw_suppress_tag_checks_stop() arch_suppress_tag_checks_stop()
+#define hw_force_async_tag_fault() arch_force_async_tag_fault()
+#define hw_get_random_tag() arch_get_random_tag()
+#define hw_get_mem_tag(addr) arch_get_mem_tag(addr)
+#define hw_set_mem_tag_range(addr, size, tag, init) \
+ arch_set_mem_tag_range((addr), (size), (tag), (init))
+
+void kasan_enable_hw_tags(void);
+
+#else /* CONFIG_KASAN_HW_TAGS */
+
+static inline void kasan_enable_hw_tags(void) { }
+
+#endif /* CONFIG_KASAN_HW_TAGS */
+
+#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
+void __init kasan_init_tags(void);
+#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
+
+#if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
+
+void kasan_force_async_fault(void);
+
+#else /* CONFIG_KASAN_HW_TAGS && CONFIG_KASAN_KUNIT_TEST */
+
+static inline void kasan_force_async_fault(void) { }
+
+#endif /* CONFIG_KASAN_HW_TAGS && CONFIG_KASAN_KUNIT_TEST */
+
+#ifdef CONFIG_KASAN_SW_TAGS
+u8 kasan_random_tag(void);
+#elif defined(CONFIG_KASAN_HW_TAGS)
+static inline u8 kasan_random_tag(void) { return hw_get_random_tag(); }
+#else
+static inline u8 kasan_random_tag(void) { return 0; }
+#endif
+
+#ifdef CONFIG_KASAN_HW_TAGS
+
+static inline void kasan_poison(const void *addr, size_t size, u8 value, bool init)
+{
+ addr = kasan_reset_tag(addr);
+
+ /* Skip KFENCE memory if called explicitly outside of sl*b. */
+ if (is_kfence_address(addr))
+ return;
+
+ if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK))
+ return;
+ if (WARN_ON(size & KASAN_GRANULE_MASK))
+ return;
+
+ hw_set_mem_tag_range((void *)addr, size, value, init);
+}
+
+static inline void kasan_unpoison(const void *addr, size_t size, bool init)
+{
+ u8 tag = get_tag(addr);
+
+ addr = kasan_reset_tag(addr);
+
+ /* Skip KFENCE memory if called explicitly outside of sl*b. */
+ if (is_kfence_address(addr))
+ return;
+
+ if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK))
+ return;
+ size = round_up(size, KASAN_GRANULE_SIZE);
+
+ hw_set_mem_tag_range((void *)addr, size, tag, init);
+}
+
+static inline bool kasan_byte_accessible(const void *addr)
+{
+ u8 ptr_tag = get_tag(addr);
+ u8 mem_tag = hw_get_mem_tag((void *)addr);
+
+ return ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag;
+}
+
+#else /* CONFIG_KASAN_HW_TAGS */
+
+/**
+ * kasan_poison - mark the memory range as inaccessible
+ * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
+ * @size - range size, must be aligned to KASAN_GRANULE_SIZE
+ * @value - value that's written to metadata for the range
+ * @init - whether to initialize the memory range (only for hardware tag-based)
+ *
+ * The size gets aligned to KASAN_GRANULE_SIZE before marking the range.
+ */
+void kasan_poison(const void *addr, size_t size, u8 value, bool init);
+
+/**
+ * kasan_unpoison - mark the memory range as accessible
+ * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
+ * @size - range size, can be unaligned
+ * @init - whether to initialize the memory range (only for hardware tag-based)
+ *
+ * For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before
+ * marking the range.
+ * For the generic mode, the last granule of the memory range gets partially
+ * unpoisoned based on the @size.
+ */
+void kasan_unpoison(const void *addr, size_t size, bool init);
+
+bool kasan_byte_accessible(const void *addr);
+
+#endif /* CONFIG_KASAN_HW_TAGS */
+
+#ifdef CONFIG_KASAN_GENERIC
+
+/**
+ * kasan_poison_last_granule - mark the last granule of the memory range as
+ * inaccessible
+ * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
+ * @size - range size
+ *
+ * This function is only available for the generic mode, as it's the only mode
+ * that has partially poisoned memory granules.
+ */
+void kasan_poison_last_granule(const void *address, size_t size);
+
+#else /* CONFIG_KASAN_GENERIC */
+
+static inline void kasan_poison_last_granule(const void *address, size_t size) { }
+
+#endif /* CONFIG_KASAN_GENERIC */
+
+#ifndef kasan_arch_is_ready
+static inline bool kasan_arch_is_ready(void) { return true; }
+#elif !defined(CONFIG_KASAN_GENERIC) || !defined(CONFIG_KASAN_OUTLINE)
+#error kasan_arch_is_ready only works in KASAN generic outline mode!
+#endif
+
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
+
+void kasan_kunit_test_suite_start(void);
+void kasan_kunit_test_suite_end(void);
+
+#else /* CONFIG_KASAN_KUNIT_TEST */
+
+static inline void kasan_kunit_test_suite_start(void) { }
+static inline void kasan_kunit_test_suite_end(void) { }
+
+#endif /* CONFIG_KASAN_KUNIT_TEST */
+
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST)
+
+bool kasan_save_enable_multi_shot(void);
+void kasan_restore_multi_shot(bool enabled);
+
+#endif
+
/*
* Exported functions for interfaces called from assembly or from generated
- * code. Declarations here to avoid warning about missing declarations.
+ * code. Declared here to avoid warnings about missing declarations.
*/
+
asmlinkage void kasan_unpoison_task_stack_below(const void *watermark);
-void __asan_register_globals(struct kasan_global *globals, size_t size);
-void __asan_unregister_globals(struct kasan_global *globals, size_t size);
+void __asan_register_globals(void *globals, ssize_t size);
+void __asan_unregister_globals(void *globals, ssize_t size);
void __asan_handle_no_return(void);
-void __asan_alloca_poison(unsigned long addr, size_t size);
-void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom);
-
-void __asan_load1(unsigned long addr);
-void __asan_store1(unsigned long addr);
-void __asan_load2(unsigned long addr);
-void __asan_store2(unsigned long addr);
-void __asan_load4(unsigned long addr);
-void __asan_store4(unsigned long addr);
-void __asan_load8(unsigned long addr);
-void __asan_store8(unsigned long addr);
-void __asan_load16(unsigned long addr);
-void __asan_store16(unsigned long addr);
-void __asan_loadN(unsigned long addr, size_t size);
-void __asan_storeN(unsigned long addr, size_t size);
-
-void __asan_load1_noabort(unsigned long addr);
-void __asan_store1_noabort(unsigned long addr);
-void __asan_load2_noabort(unsigned long addr);
-void __asan_store2_noabort(unsigned long addr);
-void __asan_load4_noabort(unsigned long addr);
-void __asan_store4_noabort(unsigned long addr);
-void __asan_load8_noabort(unsigned long addr);
-void __asan_store8_noabort(unsigned long addr);
-void __asan_load16_noabort(unsigned long addr);
-void __asan_store16_noabort(unsigned long addr);
-void __asan_loadN_noabort(unsigned long addr, size_t size);
-void __asan_storeN_noabort(unsigned long addr, size_t size);
-
-void __asan_report_load1_noabort(unsigned long addr);
-void __asan_report_store1_noabort(unsigned long addr);
-void __asan_report_load2_noabort(unsigned long addr);
-void __asan_report_store2_noabort(unsigned long addr);
-void __asan_report_load4_noabort(unsigned long addr);
-void __asan_report_store4_noabort(unsigned long addr);
-void __asan_report_load8_noabort(unsigned long addr);
-void __asan_report_store8_noabort(unsigned long addr);
-void __asan_report_load16_noabort(unsigned long addr);
-void __asan_report_store16_noabort(unsigned long addr);
-void __asan_report_load_n_noabort(unsigned long addr, size_t size);
-void __asan_report_store_n_noabort(unsigned long addr, size_t size);
-
-void __asan_set_shadow_00(const void *addr, size_t size);
-void __asan_set_shadow_f1(const void *addr, size_t size);
-void __asan_set_shadow_f2(const void *addr, size_t size);
-void __asan_set_shadow_f3(const void *addr, size_t size);
-void __asan_set_shadow_f5(const void *addr, size_t size);
-void __asan_set_shadow_f8(const void *addr, size_t size);
-
-void __hwasan_load1_noabort(unsigned long addr);
-void __hwasan_store1_noabort(unsigned long addr);
-void __hwasan_load2_noabort(unsigned long addr);
-void __hwasan_store2_noabort(unsigned long addr);
-void __hwasan_load4_noabort(unsigned long addr);
-void __hwasan_store4_noabort(unsigned long addr);
-void __hwasan_load8_noabort(unsigned long addr);
-void __hwasan_store8_noabort(unsigned long addr);
-void __hwasan_load16_noabort(unsigned long addr);
-void __hwasan_store16_noabort(unsigned long addr);
-void __hwasan_loadN_noabort(unsigned long addr, size_t size);
-void __hwasan_storeN_noabort(unsigned long addr, size_t size);
-
-void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size);
-
-#endif
+void __asan_alloca_poison(void *, ssize_t size);
+void __asan_allocas_unpoison(void *stack_top, ssize_t stack_bottom);
+
+void __asan_load1(void *);
+void __asan_store1(void *);
+void __asan_load2(void *);
+void __asan_store2(void *);
+void __asan_load4(void *);
+void __asan_store4(void *);
+void __asan_load8(void *);
+void __asan_store8(void *);
+void __asan_load16(void *);
+void __asan_store16(void *);
+void __asan_loadN(void *, ssize_t size);
+void __asan_storeN(void *, ssize_t size);
+
+void __asan_load1_noabort(void *);
+void __asan_store1_noabort(void *);
+void __asan_load2_noabort(void *);
+void __asan_store2_noabort(void *);
+void __asan_load4_noabort(void *);
+void __asan_store4_noabort(void *);
+void __asan_load8_noabort(void *);
+void __asan_store8_noabort(void *);
+void __asan_load16_noabort(void *);
+void __asan_store16_noabort(void *);
+void __asan_loadN_noabort(void *, ssize_t size);
+void __asan_storeN_noabort(void *, ssize_t size);
+
+void __asan_report_load1_noabort(void *);
+void __asan_report_store1_noabort(void *);
+void __asan_report_load2_noabort(void *);
+void __asan_report_store2_noabort(void *);
+void __asan_report_load4_noabort(void *);
+void __asan_report_store4_noabort(void *);
+void __asan_report_load8_noabort(void *);
+void __asan_report_store8_noabort(void *);
+void __asan_report_load16_noabort(void *);
+void __asan_report_store16_noabort(void *);
+void __asan_report_load_n_noabort(void *, ssize_t size);
+void __asan_report_store_n_noabort(void *, ssize_t size);
+
+void __asan_set_shadow_00(const void *addr, ssize_t size);
+void __asan_set_shadow_f1(const void *addr, ssize_t size);
+void __asan_set_shadow_f2(const void *addr, ssize_t size);
+void __asan_set_shadow_f3(const void *addr, ssize_t size);
+void __asan_set_shadow_f5(const void *addr, ssize_t size);
+void __asan_set_shadow_f8(const void *addr, ssize_t size);
+
+void *__asan_memset(void *addr, int c, ssize_t len);
+void *__asan_memmove(void *dest, const void *src, ssize_t len);
+void *__asan_memcpy(void *dest, const void *src, ssize_t len);
+
+void __hwasan_load1_noabort(void *);
+void __hwasan_store1_noabort(void *);
+void __hwasan_load2_noabort(void *);
+void __hwasan_store2_noabort(void *);
+void __hwasan_load4_noabort(void *);
+void __hwasan_store4_noabort(void *);
+void __hwasan_load8_noabort(void *);
+void __hwasan_store8_noabort(void *);
+void __hwasan_load16_noabort(void *);
+void __hwasan_store16_noabort(void *);
+void __hwasan_loadN_noabort(void *, ssize_t size);
+void __hwasan_storeN_noabort(void *, ssize_t size);
+
+void __hwasan_tag_memory(void *, u8 tag, ssize_t size);
+
+void *__hwasan_memset(void *addr, int c, ssize_t len);
+void *__hwasan_memmove(void *dest, const void *src, ssize_t len);
+void *__hwasan_memcpy(void *dest, const void *src, ssize_t len);
+
+void kasan_tag_mismatch(void *addr, unsigned long access_info,
+ unsigned long ret_ip);
+
+#endif /* __MM_KASAN_KASAN_H */
diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c
new file mode 100644
index 000000000000..b61cc6a42541
--- /dev/null
+++ b/mm/kasan/kasan_test.c
@@ -0,0 +1,1581 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ */
+
+#define pr_fmt(fmt) "kasan_test: " fmt
+
+#include <kunit/test.h>
+#include <linux/bitops.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/set_memory.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/tracepoint.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <trace/events/printk.h>
+
+#include <asm/page.h>
+
+#include "kasan.h"
+
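+/*
+ * The tag-based modes round allocations up to KASAN_GRANULE_SIZE, so accesses
+ * that go out of bounds by less than a granule stay within memory that has
+ * the same tag and are not detected. OOB_TAG_OFF pushes such accesses past
+ * the granule boundary for those modes; for the Generic mode it is 0.
+ */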
+#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE)
+
+static bool multishot;
+
+/* Fields set based on lines observed in the console. */
+static struct {
+ bool report_found;
+ bool async_fault;
+} test_status;
+
+/*
+ * Some tests use these global variables to store return values from function
+ * calls that could otherwise be eliminated by the compiler as dead code.
+ */
+void *kasan_ptr_result;
+int kasan_int_result;
+
+/* Console tracepoint probe: sets the test_status fields based on lines of interest. */
+static void probe_console(void *ignore, const char *buf, size_t len)
+{
+ if (strnstr(buf, "BUG: KASAN: ", len))
+ WRITE_ONCE(test_status.report_found, true);
+ else if (strnstr(buf, "Asynchronous fault: ", len))
+ WRITE_ONCE(test_status.async_fault, true);
+}
+
+static int kasan_suite_init(struct kunit_suite *suite)
+{
+ if (!kasan_enabled()) {
+ pr_err("Can't run KASAN tests with KASAN disabled");
+ return -1;
+ }
+
+ /* Stop failing KUnit tests on KASAN reports. */
+ kasan_kunit_test_suite_start();
+
+ /*
+ * Temporarily enable multi-shot mode. Otherwise, KASAN would only
+ * report the first detected bug and panic the kernel if panic_on_warn
+ * is enabled.
+ */
+ multishot = kasan_save_enable_multi_shot();
+
+ register_trace_console(probe_console, NULL);
+ return 0;
+}
+
+static void kasan_suite_exit(struct kunit_suite *suite)
+{
+ kasan_kunit_test_suite_end();
+ kasan_restore_multi_shot(multishot);
+ unregister_trace_console(probe_console, NULL);
+ tracepoint_synchronize_unregister();
+}
+
+static void kasan_test_exit(struct kunit *test)
+{
+ KUNIT_EXPECT_FALSE(test, READ_ONCE(test_status.report_found));
+}
+
+/**
+ * KUNIT_EXPECT_KASAN_FAIL() - check that the executed expression produces a
+ * KASAN report; causes a test failure otherwise. The report is detected via
+ * the console tracepoint probe and recorded in the test_status fields.
+ *
+ * For hardware tag-based KASAN, when a synchronous tag fault happens, tag
+ * checking is auto-disabled. When this happens, this test handler reenables
+ * tag checking. As tag checking can be only disabled or enabled per CPU,
+ * this handler disables migration (preemption).
+ *
+ * Since the compiler doesn't see that the expression can change the test_status
+ * fields, it can reorder or optimize away the accesses to those fields.
+ * Use READ/WRITE_ONCE() for the accesses and compiler barriers around the
+ * expression to prevent that.
+ *
+ * In between KUNIT_EXPECT_KASAN_FAIL checks, test_status.report_found is kept
+ * as false. This allows detecting KASAN reports that happen outside of the
+ * checks by asserting !test_status.report_found at the start of
+ * KUNIT_EXPECT_KASAN_FAIL and in kasan_test_exit.
+ */
+#define KUNIT_EXPECT_KASAN_FAIL(test, expression) do { \
+ if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \
+ kasan_sync_fault_possible()) \
+ migrate_disable(); \
+ KUNIT_EXPECT_FALSE(test, READ_ONCE(test_status.report_found)); \
+ barrier(); \
+ expression; \
+ barrier(); \
+ if (kasan_async_fault_possible()) \
+ kasan_force_async_fault(); \
+ if (!READ_ONCE(test_status.report_found)) { \
+ KUNIT_FAIL(test, KUNIT_SUBTEST_INDENT "KASAN failure " \
+ "expected in \"" #expression \
+ "\", but none occurred"); \
+ } \
+ if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \
+ kasan_sync_fault_possible()) { \
+ if (READ_ONCE(test_status.report_found) && \
+ !READ_ONCE(test_status.async_fault)) \
+ kasan_enable_hw_tags(); \
+ migrate_enable(); \
+ } \
+ WRITE_ONCE(test_status.report_found, false); \
+ WRITE_ONCE(test_status.async_fault, false); \
+} while (0)
+
+#define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \
+ if (!IS_ENABLED(config)) \
+ kunit_skip((test), "Test requires " #config "=y"); \
+} while (0)
+
+#define KASAN_TEST_NEEDS_CONFIG_OFF(test, config) do { \
+ if (IS_ENABLED(config)) \
+ kunit_skip((test), "Test requires " #config "=n"); \
+} while (0)
+
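+/*
+ * Some tests below exercise mem*() calls; skip them when the current
+ * configuration might leave those calls without KASAN checks (a rough
+ * summary of the conditions checked below).
+ */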
+#define KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test) do { \
+ if (IS_ENABLED(CONFIG_KASAN_HW_TAGS)) \
+ break; /* No compiler instrumentation. */ \
+ if (IS_ENABLED(CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX)) \
+ break; /* Should always be instrumented! */ \
+ if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) \
+ kunit_skip((test), "Test requires checked mem*()"); \
+} while (0)
+
+static void kmalloc_oob_right(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE - 5;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ /*
+ * An unaligned access past the requested kmalloc size.
+ * Only generic KASAN can precisely detect these.
+ */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 'x');
+
+ /*
+ * An aligned access into the first out-of-bounds granule that falls
+ * within the aligned kmalloc object.
+ */
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + 5] = 'y');
+
+ /* Out-of-bounds access past the aligned kmalloc object. */
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] =
+ ptr[size + KASAN_GRANULE_SIZE + 5]);
+
+ kfree(ptr);
+}
+
+static void kmalloc_oob_left(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 15;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, *ptr = *(ptr - 1));
+ kfree(ptr);
+}
+
+static void kmalloc_node_oob_right(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 4096;
+
+ ptr = kmalloc_node(size, GFP_KERNEL, 0);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]);
+ kfree(ptr);
+}
+
+/*
+ * These kmalloc_pagealloc_* tests try allocating a memory chunk that doesn't
+ * fit into a slab cache and therefore is allocated via the page allocator
+ * fallback. Since this kind of fallback is only implemented for SLUB, these
+ * tests are limited to that allocator.
+ */
+static void kmalloc_pagealloc_oob_right(struct kunit *test)
+{
+ char *ptr;
+ size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + OOB_TAG_OFF] = 0);
+
+ kfree(ptr);
+}
+
+static void kmalloc_pagealloc_uaf(struct kunit *test)
+{
+ char *ptr;
+ size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ kfree(ptr);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
+}
+
+static void kmalloc_pagealloc_invalid_free(struct kunit *test)
+{
+ char *ptr;
+ size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, kfree(ptr + 1));
+}
+
+static void pagealloc_oob_right(struct kunit *test)
+{
+ char *ptr;
+ struct page *pages;
+ size_t order = 4;
+ size_t size = (1UL << (PAGE_SHIFT + order));
+
+ /*
+ * With generic KASAN page allocations have no redzones, thus
+ * out-of-bounds detection is not guaranteed.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=210503.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ pages = alloc_pages(GFP_KERNEL, order);
+ ptr = page_address(pages);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]);
+ free_pages((unsigned long)ptr, order);
+}
+
+static void pagealloc_uaf(struct kunit *test)
+{
+ char *ptr;
+ struct page *pages;
+ size_t order = 4;
+
+ pages = alloc_pages(GFP_KERNEL, order);
+ ptr = page_address(pages);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ free_pages((unsigned long)ptr, order);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
+}
+
+static void kmalloc_large_oob_right(struct kunit *test)
+{
+ char *ptr;
+ size_t size = KMALLOC_MAX_CACHE_SIZE - 256;
+
+ /*
+ * Allocate a chunk that is large enough, but still fits into a slab
+ * and does not trigger the page allocator fallback in SLUB.
+ */
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 0);
+ kfree(ptr);
+}
+
+static void krealloc_more_oob_helper(struct kunit *test,
+ size_t size1, size_t size2)
+{
+ char *ptr1, *ptr2;
+ size_t middle;
+
+ KUNIT_ASSERT_LT(test, size1, size2);
+ middle = size1 + (size2 - size1) / 2;
+
+ ptr1 = kmalloc(size1, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+
+ ptr2 = krealloc(ptr1, size2, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+
+ /* Suppress -Warray-bounds warnings. */
+ OPTIMIZER_HIDE_VAR(ptr2);
+
+ /* All offsets up to size2 must be accessible. */
+ ptr2[size1 - 1] = 'x';
+ ptr2[size1] = 'x';
+ ptr2[middle] = 'x';
+ ptr2[size2 - 1] = 'x';
+
+ /* Generic mode is precise, so unaligned size2 must be inaccessible. */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2] = 'x');
+
+ /* For all modes first aligned offset after size2 must be inaccessible. */
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ ptr2[round_up(size2, KASAN_GRANULE_SIZE)] = 'x');
+
+ kfree(ptr2);
+}
+
+static void krealloc_less_oob_helper(struct kunit *test,
+ size_t size1, size_t size2)
+{
+ char *ptr1, *ptr2;
+ size_t middle;
+
+ KUNIT_ASSERT_LT(test, size2, size1);
+ middle = size2 + (size1 - size2) / 2;
+
+ ptr1 = kmalloc(size1, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+
+ ptr2 = krealloc(ptr1, size2, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+
+ /* Suppress -Warray-bounds warnings. */
+ OPTIMIZER_HIDE_VAR(ptr2);
+
+ /* Must be accessible for all modes. */
+ ptr2[size2 - 1] = 'x';
+
+ /* Generic mode is precise, so unaligned size2 must be inaccessible. */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2] = 'x');
+
+ /* For all modes first aligned offset after size2 must be inaccessible. */
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ ptr2[round_up(size2, KASAN_GRANULE_SIZE)] = 'x');
+
+ /*
+ * For all modes all size2, middle, and size1 should land in separate
+ * granules and thus the latter two offsets should be inaccessible.
+ */
+ KUNIT_EXPECT_LE(test, round_up(size2, KASAN_GRANULE_SIZE),
+ round_down(middle, KASAN_GRANULE_SIZE));
+ KUNIT_EXPECT_LE(test, round_up(middle, KASAN_GRANULE_SIZE),
+ round_down(size1, KASAN_GRANULE_SIZE));
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr2[middle] = 'x');
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size1 - 1] = 'x');
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size1] = 'x');
+
+ kfree(ptr2);
+}
+
+static void krealloc_more_oob(struct kunit *test)
+{
+ krealloc_more_oob_helper(test, 201, 235);
+}
+
+static void krealloc_less_oob(struct kunit *test)
+{
+ krealloc_less_oob_helper(test, 235, 201);
+}
+
+static void krealloc_pagealloc_more_oob(struct kunit *test)
+{
+ /* The page_alloc fallback is only implemented for SLUB. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB);
+
+ krealloc_more_oob_helper(test, KMALLOC_MAX_CACHE_SIZE + 201,
+ KMALLOC_MAX_CACHE_SIZE + 235);
+}
+
+static void krealloc_pagealloc_less_oob(struct kunit *test)
+{
+ /* The page_alloc fallback is only implemented for SLUB. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB);
+
+ krealloc_less_oob_helper(test, KMALLOC_MAX_CACHE_SIZE + 235,
+ KMALLOC_MAX_CACHE_SIZE + 201);
+}
+
+/*
+ * Check that krealloc() detects a use-after-free, returns NULL,
+ * and doesn't unpoison the freed object.
+ */
+static void krealloc_uaf(struct kunit *test)
+{
+ char *ptr1, *ptr2;
+ int size1 = 201;
+ int size2 = 235;
+
+ ptr1 = kmalloc(size1, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+ kfree(ptr1);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL));
+ KUNIT_ASSERT_NULL(test, ptr2);
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)ptr1);
+}
+
+static void kmalloc_oob_16(struct kunit *test)
+{
+ struct {
+ u64 words[2];
+ } *ptr1, *ptr2;
+
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
+ /* This test is specifically crafted for the generic mode. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+
+ ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+
+ ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+
+ OPTIMIZER_HIDE_VAR(ptr1);
+ OPTIMIZER_HIDE_VAR(ptr2);
+ KUNIT_EXPECT_KASAN_FAIL(test, *ptr1 = *ptr2);
+ kfree(ptr1);
+ kfree(ptr2);
+}
+
+static void kmalloc_uaf_16(struct kunit *test)
+{
+ struct {
+ u64 words[2];
+ } *ptr1, *ptr2;
+
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
+ ptr1 = kmalloc(sizeof(*ptr1), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+
+ ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+ kfree(ptr2);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *ptr1 = *ptr2);
+ kfree(ptr1);
+}
+
+/*
+ * Note: in the memset tests below, the written range touches both valid and
+ * invalid memory. This makes sure that the instrumentation checks the whole
+ * range and not only the starting address.
+ */
+
+static void kmalloc_oob_memset_2(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 1, 0, 2));
+ kfree(ptr);
+}
+
+static void kmalloc_oob_memset_4(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 3, 0, 4));
+ kfree(ptr);
+}
+
+static void kmalloc_oob_memset_8(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 7, 0, 8));
+ kfree(ptr);
+}
+
+static void kmalloc_oob_memset_16(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 15, 0, 16));
+ kfree(ptr);
+}
+
+static void kmalloc_oob_in_memset(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ memset(ptr, 0, size + KASAN_GRANULE_SIZE));
+ kfree(ptr);
+}
+
+static void kmalloc_memmove_negative_size(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 64;
+ size_t invalid_size = -2;
+
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
+ /*
+ * Hardware tag-based mode doesn't check memmove for negative size.
+ * As a result, this test introduces a side-effect memory corruption,
+ * which can result in a crash.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_HW_TAGS);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ memset((char *)ptr, 0, 64);
+ OPTIMIZER_HIDE_VAR(ptr);
+ OPTIMIZER_HIDE_VAR(invalid_size);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ memmove((char *)ptr, (char *)ptr + 4, invalid_size));
+ kfree(ptr);
+}
+
+static void kmalloc_memmove_invalid_size(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 64;
+ size_t invalid_size = size;
+
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ memset((char *)ptr, 0, 64);
+ OPTIMIZER_HIDE_VAR(ptr);
+ OPTIMIZER_HIDE_VAR(invalid_size);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ memmove((char *)ptr, (char *)ptr + 4, invalid_size));
+ kfree(ptr);
+}
+
+static void kmalloc_uaf(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 10;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ kfree(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[8]);
+}
+
+static void kmalloc_uaf_memset(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 33;
+
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
+ /*
+ * Only generic KASAN uses quarantine, which is required to avoid a
+ * kernel memory corruption this test causes.
+ */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ kfree(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr, 0, size));
+}
+
+static void kmalloc_uaf2(struct kunit *test)
+{
+ char *ptr1, *ptr2;
+ size_t size = 43;
+ int counter = 0;
+
+again:
+ ptr1 = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+
+ kfree(ptr1);
+
+ ptr2 = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+
+ /*
+ * For tag-based KASAN ptr1 and ptr2 tags might happen to be the same.
+ * Allow up to 16 attempts at generating different tags.
+ */
+ if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && ptr1 == ptr2 && counter++ < 16) {
+ kfree(ptr2);
+ goto again;
+ }
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[40]);
+ KUNIT_EXPECT_PTR_NE(test, ptr1, ptr2);
+
+ kfree(ptr2);
+}
+
+/*
+ * Check that KASAN detects use-after-free when another object was allocated in
+ * the same slot. Relevant for the tag-based modes, which do not use quarantine.
+ */
+static void kmalloc_uaf3(struct kunit *test)
+{
+ char *ptr1, *ptr2;
+ size_t size = 100;
+
+ /* This test is specifically crafted for tag-based modes. */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ ptr1 = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+ kfree(ptr1);
+
+ ptr2 = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+ kfree(ptr2);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[8]);
+}
+
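+/*
+ * Check that there is no false positive when a slab object is freed via a
+ * pointer recomputed from its struct page: the recomputed pointer carries
+ * the default tag under the tag-based modes.
+ */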
+static void kfree_via_page(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 8;
+ struct page *page;
+ unsigned long offset;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ page = virt_to_page(ptr);
+ offset = offset_in_page(ptr);
+ kfree(page_address(page) + offset);
+}
+
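+/* Same as kfree_via_page(), but the pointer is recomputed via the physical address. */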
+static void kfree_via_phys(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 8;
+ phys_addr_t phys;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ phys = virt_to_phys(ptr);
+ kfree(phys_to_virt(phys));
+}
+
+static void kmem_cache_oob(struct kunit *test)
+{
+ char *p;
+ size_t size = 200;
+ struct kmem_cache *cache;
+
+ cache = kmem_cache_create("test_cache", size, 0, 0, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ p = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ kmem_cache_destroy(cache);
+ return;
+ }
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *p = p[size + OOB_TAG_OFF]);
+
+ kmem_cache_free(cache, p);
+ kmem_cache_destroy(cache);
+}
+
+static void kmem_cache_accounted(struct kunit *test)
+{
+ int i;
+ char *p;
+ size_t size = 200;
+ struct kmem_cache *cache;
+
+ cache = kmem_cache_create("test_cache", size, 0, SLAB_ACCOUNT, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ /*
+ * Several allocations with a delay to allow for lazy per memcg kmem
+ * cache creation.
+ */
+ for (i = 0; i < 5; i++) {
+ p = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p)
+ goto free_cache;
+
+ kmem_cache_free(cache, p);
+ msleep(100);
+ }
+
+free_cache:
+ kmem_cache_destroy(cache);
+}
+
+static void kmem_cache_bulk(struct kunit *test)
+{
+ struct kmem_cache *cache;
+ size_t size = 200;
+ char *p[10];
+ bool ret;
+ int i;
+
+ cache = kmem_cache_create("test_cache", size, 0, 0, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ ret = kmem_cache_alloc_bulk(cache, GFP_KERNEL, ARRAY_SIZE(p), (void **)&p);
+ if (!ret) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ kmem_cache_destroy(cache);
+ return;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(p); i++)
+ p[i][0] = p[i][size - 1] = 42;
+
+ kmem_cache_free_bulk(cache, ARRAY_SIZE(p), (void **)&p);
+ kmem_cache_destroy(cache);
+}
+
+static char global_array[10];
+
+static void kasan_global_oob_right(struct kunit *test)
+{
+ /*
+ * Deliberate out-of-bounds access. To prevent CONFIG_UBSAN_LOCAL_BOUNDS
+ * from failing here and panicking the kernel, access the array via a
+ * volatile pointer, which will prevent the compiler from being able to
+ * determine the array bounds.
+ *
+ * This access uses a volatile pointer to char (char *volatile) rather
+ * than the more conventional pointer to volatile char (volatile char *)
+ * because we want to prevent the compiler from making inferences about
+ * the pointer itself (i.e. its array bounds), not the data that it
+ * refers to.
+ */
+ char *volatile array = global_array;
+ char *p = &array[ARRAY_SIZE(global_array) + 3];
+
+ /* Only generic mode instruments globals. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
+}
+
+static void kasan_global_oob_left(struct kunit *test)
+{
+ char *volatile array = global_array;
+ char *p = array - 3;
+
+ /*
+ * GCC is known to fail this test, skip it.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=215051.
+ */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_CC_IS_CLANG);
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
+}
+
+/* Check that ksize() does NOT unpoison the whole object. */
+static void ksize_unpoisons_memory(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE - 5;
+ size_t real_size;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ real_size = ksize(ptr);
+ KUNIT_EXPECT_GT(test, real_size, size);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+
+ /* These accesses shouldn't trigger a KASAN report. */
+ ptr[0] = 'x';
+ ptr[size - 1] = 'x';
+
+ /* These must trigger a KASAN report. */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]);
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size + 5]);
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[real_size - 1]);
+
+ kfree(ptr);
+}
+
+/*
+ * Check that a use-after-free is detected by ksize() and via normal accesses
+ * after it.
+ */
+static void ksize_uaf(struct kunit *test)
+{
+ char *ptr;
+ int size = 128 - KASAN_GRANULE_SIZE;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ kfree(ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ksize(ptr));
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]);
+}
+
+static void kasan_stack_oob(struct kunit *test)
+{
+ char stack_array[10];
+ /* See comment in kasan_global_oob_right. */
+ char *volatile array = stack_array;
+ char *p = &array[ARRAY_SIZE(stack_array) + OOB_TAG_OFF];
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
+}
+
+static void kasan_alloca_oob_left(struct kunit *test)
+{
+ volatile int i = 10;
+ char alloca_array[i];
+ /* See comment in kasan_global_oob_right. */
+ char *volatile array = alloca_array;
+ char *p = array - 1;
+
+ /* Only generic mode instruments dynamic allocas. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
+}
+
+static void kasan_alloca_oob_right(struct kunit *test)
+{
+ volatile int i = 10;
+ char alloca_array[i];
+ /* See comment in kasan_global_oob_right. */
+ char *volatile array = alloca_array;
+ char *p = array + i;
+
+ /* Only generic mode instruments dynamic allocas. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
+}
+
+static void kmem_cache_double_free(struct kunit *test)
+{
+ char *p;
+ size_t size = 200;
+ struct kmem_cache *cache;
+
+ cache = kmem_cache_create("test_cache", size, 0, 0, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ p = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ kmem_cache_destroy(cache);
+ return;
+ }
+
+ kmem_cache_free(cache, p);
+ KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_free(cache, p));
+ kmem_cache_destroy(cache);
+}
+
+static void kmem_cache_invalid_free(struct kunit *test)
+{
+ char *p;
+ size_t size = 200;
+ struct kmem_cache *cache;
+
+ cache = kmem_cache_create("test_cache", size, 0, SLAB_TYPESAFE_BY_RCU,
+ NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ p = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ kmem_cache_destroy(cache);
+ return;
+ }
+
+ /* Trigger an invalid free; the object doesn't get freed. */
+ KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_free(cache, p + 1));
+
+ /*
+ * Properly free the object to prevent the "Objects remaining in
+ * test_cache on __kmem_cache_shutdown" BUG failure.
+ */
+ kmem_cache_free(cache, p);
+
+ kmem_cache_destroy(cache);
+}
+
+static void empty_cache_ctor(void *object) { }
+
+static void kmem_cache_double_destroy(struct kunit *test)
+{
+ struct kmem_cache *cache;
+
+ /* Provide a constructor to prevent cache merging. */
+ cache = kmem_cache_create("test_cache", 200, 0, 0, empty_cache_ctor);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+ kmem_cache_destroy(cache);
+ KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_destroy(cache));
+}
+
+static void kasan_memchr(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 24;
+
+ /*
+ * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT);
+
+ if (OOB_TAG_OFF)
+ size = round_up(size, OOB_TAG_OFF);
+
+ ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ kasan_ptr_result = memchr(ptr, '1', size + 1));
+
+ kfree(ptr);
+}
+
+static void kasan_memcmp(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 24;
+ int arr[9];
+
+ /*
+ * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT);
+
+ if (OOB_TAG_OFF)
+ size = round_up(size, OOB_TAG_OFF);
+
+ ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ memset(arr, 0, sizeof(arr));
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ kasan_int_result = memcmp(ptr, arr, size+1));
+ kfree(ptr);
+}
+
+static void kasan_strings(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 24;
+
+ /*
+ * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT);
+
+ ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ kfree(ptr);
+
+ /*
+ * Try to cause only one invalid access (less spam in dmesg). For that,
+ * ptr needs to point to a zeroed byte. Skip the metadata that could be
+ * stored in the freed object so that ptr likely points to a zeroed byte.
+ */
+ ptr += 16;
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strchr(ptr, '1'));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strrchr(ptr, '1'));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strcmp(ptr, "2"));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strncmp(ptr, "2", 1));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strlen(ptr));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strnlen(ptr, 1));
+}
+
+static void kasan_bitops_modify(struct kunit *test, int nr, void *addr)
+{
+ KUNIT_EXPECT_KASAN_FAIL(test, set_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __set_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, clear_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __clear_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, clear_bit_unlock(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __clear_bit_unlock(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, change_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __change_bit(nr, addr));
+}
+
+static void kasan_bitops_test_and_modify(struct kunit *test, int nr, void *addr)
+{
+ KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __test_and_set_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit_lock(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, test_and_clear_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __test_and_clear_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, test_and_change_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __test_and_change_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = test_bit(nr, addr));
+
+#if defined(clear_bit_unlock_is_negative_byte)
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result =
+ clear_bit_unlock_is_negative_byte(nr, addr));
+#endif
+}
+
+static void kasan_bitops_generic(struct kunit *test)
+{
+ long *bits;
+
+ /* This test is specifically crafted for the generic mode. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+
+ /*
+ * Allocate 1 more byte, which causes kzalloc to round up to 16 bytes;
+ * this way we do not actually corrupt other memory.
+ */
+ bits = kzalloc(sizeof(*bits) + 1, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bits);
+
+ /*
+ * The calls below try to access a bit within the allocated memory; the
+ * accesses are nevertheless out-of-bounds, since bitops are defined to
+ * operate on the whole long the bit is in.
+ */
+ kasan_bitops_modify(test, BITS_PER_LONG, bits);
+
+ /*
+ * The calls below try to access a bit beyond the allocated memory.
+ */
+ kasan_bitops_test_and_modify(test, BITS_PER_LONG + BITS_PER_BYTE, bits);
+
+ kfree(bits);
+}
+
+static void kasan_bitops_tags(struct kunit *test)
+{
+ long *bits;
+
+ /* This test is specifically crafted for tag-based modes. */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ /* kmalloc-64 cache will be used and the last 16 bytes will be the redzone. */
+ bits = kzalloc(48, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bits);
+
+ /* Do the accesses past the 48 allocated bytes, but within the redzone. */
+ kasan_bitops_modify(test, BITS_PER_LONG, (void *)bits + 48);
+ kasan_bitops_test_and_modify(test, BITS_PER_LONG + BITS_PER_BYTE, (void *)bits + 48);
+
+ kfree(bits);
+}
+
+static void kmalloc_double_kzfree(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 16;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ kfree_sensitive(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, kfree_sensitive(ptr));
+}
+
+/*
+ * The two tests below check that Generic KASAN prints auxiliary stack traces
+ * for RCU callbacks and workqueues. The reports need to be inspected manually.
+ *
+ * These tests are still enabled for other KASAN modes to make sure that all
+ * modes report bad accesses in tested scenarios.
+ */
+
+static struct kasan_rcu_info {
+ int i;
+ struct rcu_head rcu;
+} *global_rcu_ptr;
+
+static void rcu_uaf_reclaim(struct rcu_head *rp)
+{
+ struct kasan_rcu_info *fp =
+ container_of(rp, struct kasan_rcu_info, rcu);
+
+ kfree(fp);
+ ((volatile struct kasan_rcu_info *)fp)->i;
+}
+
+static void rcu_uaf(struct kunit *test)
+{
+ struct kasan_rcu_info *ptr;
+
+ ptr = kmalloc(sizeof(struct kasan_rcu_info), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ global_rcu_ptr = rcu_dereference_protected(
+ (struct kasan_rcu_info __rcu *)ptr, NULL);
+
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ call_rcu(&global_rcu_ptr->rcu, rcu_uaf_reclaim);
+ rcu_barrier());
+}
+
+static void workqueue_uaf_work(struct work_struct *work)
+{
+ kfree(work);
+}
+
+static void workqueue_uaf(struct kunit *test)
+{
+ struct workqueue_struct *workqueue;
+ struct work_struct *work;
+
+ workqueue = create_workqueue("kasan_workqueue_test");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, workqueue);
+
+ work = kmalloc(sizeof(struct work_struct), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, work);
+
+ INIT_WORK(work, workqueue_uaf_work);
+ queue_work(workqueue, work);
+ destroy_workqueue(workqueue);
+
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ ((volatile struct work_struct *)work)->data);
+}
+
+static void vmalloc_helpers_tags(struct kunit *test)
+{
+ void *ptr;
+
+ /* This test is intended for tag-based modes. */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC);
+
+ ptr = vmalloc(PAGE_SIZE);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ /* Check that the returned pointer is tagged. */
+ KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+
+ /* Make sure exported vmalloc helpers handle tagged pointers. */
+ KUNIT_ASSERT_TRUE(test, is_vmalloc_addr(ptr));
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, vmalloc_to_page(ptr));
+
+#if !IS_MODULE(CONFIG_KASAN_KUNIT_TEST)
+ {
+ int rv;
+
+ /* Make sure vmalloc'ed memory permissions can be changed. */
+ rv = set_memory_ro((unsigned long)ptr, 1);
+ KUNIT_ASSERT_GE(test, rv, 0);
+ rv = set_memory_rw((unsigned long)ptr, 1);
+ KUNIT_ASSERT_GE(test, rv, 0);
+ }
+#endif
+
+ vfree(ptr);
+}
+
+static void vmalloc_oob(struct kunit *test)
+{
+ char *v_ptr, *p_ptr;
+ struct page *page;
+ size_t size = PAGE_SIZE / 2 - KASAN_GRANULE_SIZE - 5;
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC);
+
+ v_ptr = vmalloc(size);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr);
+
+ OPTIMIZER_HIDE_VAR(v_ptr);
+
+ /*
+ * We have to be careful not to hit the guard page in vmalloc tests.
+ * The MMU will catch that and crash us.
+ */
+
+ /* Make sure in-bounds accesses are valid. */
+ v_ptr[0] = 0;
+ v_ptr[size - 1] = 0;
+
+ /*
+ * An unaligned access past the requested vmalloc size.
+ * Only generic KASAN can precisely detect these.
+ */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]);
+
+ /* An aligned access into the first out-of-bounds granule. */
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size + 5]);
+
+ /* Check that in-bounds accesses to the physical page are valid. */
+ page = vmalloc_to_page(v_ptr);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page);
+ p_ptr = page_address(page);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr);
+ p_ptr[0] = 0;
+
+ vfree(v_ptr);
+
+ /*
+ * We can't check for use-after-unmap bugs in this nor in the following
+ * vmalloc tests, as the page might be fully unmapped and accessing it
+ * will crash the kernel.
+ */
+}
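/*
 * Worked numbers for the two expected faults above, as a standalone sketch
 * (not part of the patch; assumes PAGE_SIZE of 4096 and the generic mode's
 * 8-byte granule): size is 2035, so the last partially valid granule is
 * [2032, 2040) with 3 valid bytes. v_ptr[size] hits byte 2035 inside that
 * granule, which only byte-granular generic KASAN can catch, while
 * v_ptr[size + 5] hits byte 2040, the first fully poisoned granule, which
 * all modes catch.
 */
#include <stdio.h>

#define PAGE_SIZE          4096
#define KASAN_GRANULE_SIZE 8

int main(void)
{
        size_t size = PAGE_SIZE / 2 - KASAN_GRANULE_SIZE - 5;     /* 2035 */
        size_t granule_start = size - size % KASAN_GRANULE_SIZE;  /* 2032 */

        printf("unaligned OOB at %zu, in granule [%zu, %zu)\n",
               size, granule_start, granule_start + KASAN_GRANULE_SIZE);
        printf("aligned OOB at %zu, first fully poisoned granule\n", size + 5);
        return 0;
}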
+
+static void vmap_tags(struct kunit *test)
+{
+ char *p_ptr, *v_ptr;
+ struct page *p_page, *v_page;
+
+ /*
+ * This test is specifically crafted for the software tag-based mode,
+ * the only tag-based mode that poisons vmap mappings.
+ */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS);
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC);
+
+ p_page = alloc_pages(GFP_KERNEL, 1);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_page);
+ p_ptr = page_address(p_page);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr);
+
+ v_ptr = vmap(&p_page, 1, VM_MAP, PAGE_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr);
+
+ /*
+ * We can't check for out-of-bounds bugs in this nor in the following
+ * vmalloc tests, as allocations have page granularity and accessing
+ * the guard page will crash the kernel.
+ */
+
+ KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL);
+
+ /* Make sure that in-bounds accesses through both pointers work. */
+ *p_ptr = 0;
+ *v_ptr = 0;
+
+ /* Make sure vmalloc_to_page() correctly recovers the page pointer. */
+ v_page = vmalloc_to_page(v_ptr);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_page);
+ KUNIT_EXPECT_PTR_EQ(test, p_page, v_page);
+
+ vunmap(v_ptr);
+ free_pages((unsigned long)p_ptr, 1);
+}
+
+static void vm_map_ram_tags(struct kunit *test)
+{
+ char *p_ptr, *v_ptr;
+ struct page *page;
+
+ /*
+ * This test is specifically crafted for the software tag-based mode,
+ * the only tag-based mode that poisons vm_map_ram mappings.
+ */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS);
+
+ page = alloc_pages(GFP_KERNEL, 1);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page);
+ p_ptr = page_address(page);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr);
+
+ v_ptr = vm_map_ram(&page, 1, -1);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr);
+
+ KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL);
+
+ /* Make sure that in-bounds accesses through both pointers work. */
+ *p_ptr = 0;
+ *v_ptr = 0;
+
+ vm_unmap_ram(v_ptr, 1);
+ free_pages((unsigned long)p_ptr, 1);
+}
+
+static void vmalloc_percpu(struct kunit *test)
+{
+ char __percpu *ptr;
+ int cpu;
+
+ /*
+ * This test is specifically crafted for the software tag-based mode,
+ * the only tag-based mode that poisons percpu mappings.
+ */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS);
+
+ ptr = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
+
+ for_each_possible_cpu(cpu) {
+ char *c_ptr = per_cpu_ptr(ptr, cpu);
+
+ KUNIT_EXPECT_GE(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_KERNEL);
+
+ /* Make sure that in-bounds accesses don't crash the kernel. */
+ *c_ptr = 0;
+ }
+
+ free_percpu(ptr);
+}
+
+/*
+ * Check that the assigned pointer tag falls within the [KASAN_TAG_MIN,
+ * KASAN_TAG_KERNEL) range (note: excluding the match-all tag) for tag-based
+ * modes.
+ */
+static void match_all_not_assigned(struct kunit *test)
+{
+ char *ptr;
+ struct page *pages;
+ int i, size, order;
+
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ for (i = 0; i < 256; i++) {
+ size = get_random_u32_inclusive(1, 1024);
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+ kfree(ptr);
+ }
+
+ for (i = 0; i < 256; i++) {
+ order = get_random_u32_inclusive(1, 4);
+ pages = alloc_pages(GFP_KERNEL, order);
+ ptr = page_address(pages);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+ free_pages((unsigned long)ptr, order);
+ }
+
+ if (!IS_ENABLED(CONFIG_KASAN_VMALLOC))
+ return;
+
+ for (i = 0; i < 256; i++) {
+ size = get_random_u32_inclusive(1, 1024);
+ ptr = vmalloc(size);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+ vfree(ptr);
+ }
+}
+
+/* Check that 0xff works as a match-all pointer tag for tag-based modes. */
+static void match_all_ptr_tag(struct kunit *test)
+{
+ char *ptr;
+ u8 tag;
+
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ ptr = kmalloc(128, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ /* Backup the assigned tag. */
+ tag = get_tag(ptr);
+ KUNIT_EXPECT_NE(test, tag, (u8)KASAN_TAG_KERNEL);
+
+ /* Reset the tag to 0xff. */
+ ptr = set_tag(ptr, KASAN_TAG_KERNEL);
+
+ /* This access shouldn't trigger a KASAN report. */
+ *ptr = 0;
+
+ /* Recover the pointer tag and free. */
+ ptr = set_tag(ptr, tag);
+ kfree(ptr);
+}
+
+/* Check that there are no match-all memory tags for tag-based modes. */
+static void match_all_mem_tag(struct kunit *test)
+{
+ char *ptr;
+ int tag;
+
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ ptr = kmalloc(128, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_NE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+
+ /* For each possible tag value not matching the pointer tag. */
+ for (tag = KASAN_TAG_MIN; tag <= KASAN_TAG_KERNEL; tag++) {
+ if (tag == get_tag(ptr))
+ continue;
+
+ /* Mark the first memory granule with the chosen memory tag. */
+ kasan_poison(ptr, KASAN_GRANULE_SIZE, (u8)tag, false);
+
+ /* This access must cause a KASAN report. */
+ KUNIT_EXPECT_KASAN_FAIL(test, *ptr = 0);
+ }
+
+ /* Recover the memory tag and free. */
+ kasan_poison(ptr, KASAN_GRANULE_SIZE, get_tag(ptr), false);
+ kfree(ptr);
+}
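/*
 * Sketch of the tag check the three match-all tests above perform (not part
 * of the patch; assumes 64-bit pointers and the software tag-based layout
 * where the tag lives in the top byte of the pointer; the constants are the
 * KASAN tag values, with KASAN_TAG_KERNEL = 0xff and KASAN_TAG_MIN = 0x00
 * for the software mode).
 */
#include <stdint.h>
#include <stdio.h>

#define KASAN_TAG_KERNEL 0xffu  /* match-all tag, never assigned to objects */
#define KASAN_TAG_MIN    0x00u  /* lowest tag the allocator may assign */

static unsigned int get_tag(const void *ptr)
{
        return (unsigned int)((uintptr_t)ptr >> 56);    /* top byte */
}

int main(void)
{
        /* Hypothetical tagged kernel address: tag 0xf2 in the top byte. */
        const void *ptr = (const void *)0xf2ff800012345678ULL;
        unsigned int tag = get_tag(ptr);

        printf("tag 0x%02x is %s\n", tag,
               tag >= KASAN_TAG_MIN && tag < KASAN_TAG_KERNEL ?
               "a regular assigned tag" : "the match-all tag");
        return 0;
}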
+
+static struct kunit_case kasan_kunit_test_cases[] = {
+ KUNIT_CASE(kmalloc_oob_right),
+ KUNIT_CASE(kmalloc_oob_left),
+ KUNIT_CASE(kmalloc_node_oob_right),
+ KUNIT_CASE(kmalloc_pagealloc_oob_right),
+ KUNIT_CASE(kmalloc_pagealloc_uaf),
+ KUNIT_CASE(kmalloc_pagealloc_invalid_free),
+ KUNIT_CASE(pagealloc_oob_right),
+ KUNIT_CASE(pagealloc_uaf),
+ KUNIT_CASE(kmalloc_large_oob_right),
+ KUNIT_CASE(krealloc_more_oob),
+ KUNIT_CASE(krealloc_less_oob),
+ KUNIT_CASE(krealloc_pagealloc_more_oob),
+ KUNIT_CASE(krealloc_pagealloc_less_oob),
+ KUNIT_CASE(krealloc_uaf),
+ KUNIT_CASE(kmalloc_oob_16),
+ KUNIT_CASE(kmalloc_uaf_16),
+ KUNIT_CASE(kmalloc_oob_in_memset),
+ KUNIT_CASE(kmalloc_oob_memset_2),
+ KUNIT_CASE(kmalloc_oob_memset_4),
+ KUNIT_CASE(kmalloc_oob_memset_8),
+ KUNIT_CASE(kmalloc_oob_memset_16),
+ KUNIT_CASE(kmalloc_memmove_negative_size),
+ KUNIT_CASE(kmalloc_memmove_invalid_size),
+ KUNIT_CASE(kmalloc_uaf),
+ KUNIT_CASE(kmalloc_uaf_memset),
+ KUNIT_CASE(kmalloc_uaf2),
+ KUNIT_CASE(kmalloc_uaf3),
+ KUNIT_CASE(kfree_via_page),
+ KUNIT_CASE(kfree_via_phys),
+ KUNIT_CASE(kmem_cache_oob),
+ KUNIT_CASE(kmem_cache_accounted),
+ KUNIT_CASE(kmem_cache_bulk),
+ KUNIT_CASE(kasan_global_oob_right),
+ KUNIT_CASE(kasan_global_oob_left),
+ KUNIT_CASE(kasan_stack_oob),
+ KUNIT_CASE(kasan_alloca_oob_left),
+ KUNIT_CASE(kasan_alloca_oob_right),
+ KUNIT_CASE(ksize_unpoisons_memory),
+ KUNIT_CASE(ksize_uaf),
+ KUNIT_CASE(kmem_cache_double_free),
+ KUNIT_CASE(kmem_cache_invalid_free),
+ KUNIT_CASE(kmem_cache_double_destroy),
+ KUNIT_CASE(kasan_memchr),
+ KUNIT_CASE(kasan_memcmp),
+ KUNIT_CASE(kasan_strings),
+ KUNIT_CASE(kasan_bitops_generic),
+ KUNIT_CASE(kasan_bitops_tags),
+ KUNIT_CASE(kmalloc_double_kzfree),
+ KUNIT_CASE(rcu_uaf),
+ KUNIT_CASE(workqueue_uaf),
+ KUNIT_CASE(vmalloc_helpers_tags),
+ KUNIT_CASE(vmalloc_oob),
+ KUNIT_CASE(vmap_tags),
+ KUNIT_CASE(vm_map_ram_tags),
+ KUNIT_CASE(vmalloc_percpu),
+ KUNIT_CASE(match_all_not_assigned),
+ KUNIT_CASE(match_all_ptr_tag),
+ KUNIT_CASE(match_all_mem_tag),
+ {}
+};
+
+static struct kunit_suite kasan_kunit_test_suite = {
+ .name = "kasan",
+ .test_cases = kasan_kunit_test_cases,
+ .exit = kasan_test_exit,
+ .suite_init = kasan_suite_init,
+ .suite_exit = kasan_suite_exit,
+};
+
+kunit_test_suite(kasan_kunit_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c
new file mode 100644
index 000000000000..7be7bed456ef
--- /dev/null
+++ b/mm/kasan/kasan_test_module.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ */
+
+#define pr_fmt(fmt) "kasan test: %s " fmt, __func__
+
+#include <linux/mman.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "kasan.h"
+
+static noinline void __init copy_user_test(void)
+{
+ char *kmem;
+ char __user *usermem;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+ int __maybe_unused unused;
+
+ kmem = kmalloc(size, GFP_KERNEL);
+ if (!kmem)
+ return;
+
+ usermem = (char __user *)vm_mmap(NULL, 0, PAGE_SIZE,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0);
+ if (IS_ERR(usermem)) {
+ pr_err("Failed to allocate user memory\n");
+ kfree(kmem);
+ return;
+ }
+
+ OPTIMIZER_HIDE_VAR(size);
+
+ pr_info("out-of-bounds in copy_from_user()\n");
+ unused = copy_from_user(kmem, usermem, size + 1);
+
+ pr_info("out-of-bounds in copy_to_user()\n");
+ unused = copy_to_user(usermem, kmem, size + 1);
+
+ pr_info("out-of-bounds in __copy_from_user()\n");
+ unused = __copy_from_user(kmem, usermem, size + 1);
+
+ pr_info("out-of-bounds in __copy_to_user()\n");
+ unused = __copy_to_user(usermem, kmem, size + 1);
+
+ pr_info("out-of-bounds in __copy_from_user_inatomic()\n");
+ unused = __copy_from_user_inatomic(kmem, usermem, size + 1);
+
+ pr_info("out-of-bounds in __copy_to_user_inatomic()\n");
+ unused = __copy_to_user_inatomic(usermem, kmem, size + 1);
+
+ pr_info("out-of-bounds in strncpy_from_user()\n");
+ unused = strncpy_from_user(kmem, usermem, size + 1);
+
+ vm_munmap((unsigned long)usermem, PAGE_SIZE);
+ kfree(kmem);
+}
+
+static int __init test_kasan_module_init(void)
+{
+ /*
+ * Temporarily enable multi-shot mode. Otherwise, KASAN would only
+ * report the first detected bug and panic the kernel if panic_on_warn
+ * is enabled.
+ */
+ bool multishot = kasan_save_enable_multi_shot();
+
+ copy_user_test();
+
+ kasan_restore_multi_shot(multishot);
+ return -EAGAIN;
+}
+
+module_init(test_kasan_module_init);
+MODULE_LICENSE("GPL");
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 4c5375810449..152dca73f398 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -6,16 +6,6 @@
* Copyright (C) 2016 Google, Inc.
*
* Based on code by Dmitry Chernenkov.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
*/
#include <linux/gfp.h>
@@ -29,6 +19,7 @@
#include <linux/srcu.h>
#include <linux/string.h>
#include <linux/types.h>
+#include <linux/cpuhotplug.h>
#include "../slab.h"
#include "kasan.h"
@@ -36,13 +27,14 @@
/* Data structure and operations for quarantine queues. */
/*
- * Each queue is a signle-linked list, which also stores the total size of
+ * Each queue is a single-linked list, which also stores the total size of
* objects inside of it.
*/
struct qlist_head {
struct qlist_node *head;
struct qlist_node *tail;
size_t bytes;
+ bool offline;
};
#define QLIST_INIT { NULL, NULL, 0 }
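/*
 * Minimal userspace sketch of the qlist bookkeeping described above (not
 * part of the patch): a singly linked list of nodes plus a running byte
 * total, as used by the per-CPU and global quarantine queues. qlist_put()
 * below roughly mirrors what the kernel helper of the same name does.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct qlist_node { struct qlist_node *next; };

struct qlist_head {
        struct qlist_node *head;
        struct qlist_node *tail;
        size_t bytes;
        bool offline;
};

static void qlist_put(struct qlist_head *q, struct qlist_node *qlink, size_t size)
{
        if (!q->head)
                q->head = qlink;
        else
                q->tail->next = qlink;
        q->tail = qlink;
        qlink->next = NULL;
        q->bytes += size;       /* running total of quarantined object bytes */
}

int main(void)
{
        struct qlist_head q = { NULL, NULL, 0 };        /* QLIST_INIT; offline stays false */
        struct qlist_node a = { NULL }, b = { NULL };

        qlist_put(&q, &a, 64);
        qlist_put(&q, &b, 128);
        printf("quarantined: %zu bytes\n", q.bytes);    /* 192 */
        return 0;
}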
@@ -107,6 +99,15 @@ static unsigned long quarantine_size;
static DEFINE_RAW_SPINLOCK(quarantine_lock);
DEFINE_STATIC_SRCU(remove_cache_srcu);
+struct cpu_shrink_qlist {
+ raw_spinlock_t lock;
+ struct qlist_head qlist;
+};
+
+static DEFINE_PER_CPU(struct cpu_shrink_qlist, shrink_qlist) = {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(shrink_qlist.lock),
+};
+
/* Maximum size of the global queue. */
static unsigned long quarantine_max_size;
@@ -125,7 +126,7 @@ static unsigned long quarantine_batch_size;
static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink)
{
- return virt_to_head_page(qlink)->slab_cache;
+ return virt_to_slab(qlink)->slab_cache;
}
static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache)
@@ -140,12 +141,28 @@ static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache)
static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
{
void *object = qlink_to_object(qlink, cache);
+ struct kasan_free_meta *meta = kasan_get_free_meta(cache, object);
unsigned long flags;
if (IS_ENABLED(CONFIG_SLAB))
local_irq_save(flags);
- *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREE;
+ /*
+ * If init_on_free is enabled and KASAN's free metadata is stored in
+ * the object, zero the metadata. Otherwise, the object's memory will
+ * not be properly zeroed, as KASAN saves the metadata after the slab
+ * allocator zeroes the object.
+ */
+ if (slab_want_init_on_free(cache) &&
+ cache->kasan_info.free_meta_offset == 0)
+ memzero_explicit(meta, sizeof(*meta));
+
+ /*
+ * As the object now gets freed from the quarantine, assume that its
+ * free track is no longer valid.
+ */
+ *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREE;
+
___cache_free(cache, object, _THIS_IP_);
if (IS_ENABLED(CONFIG_SLAB))
@@ -171,24 +188,36 @@ static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache)
qlist_init(q);
}
-void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
+bool kasan_quarantine_put(struct kmem_cache *cache, void *object)
{
unsigned long flags;
struct qlist_head *q;
struct qlist_head temp = QLIST_INIT;
+ struct kasan_free_meta *meta = kasan_get_free_meta(cache, object);
+
+ /*
+ * If there's no metadata for this object, don't put it into
+ * quarantine.
+ */
+ if (!meta)
+ return false;
/*
* Note: irq must be disabled until after we move the batch to the
- * global quarantine. Otherwise quarantine_remove_cache() can miss
- * some objects belonging to the cache if they are in our local temp
- * list. quarantine_remove_cache() executes on_each_cpu() at the
- * beginning which ensures that it either sees the objects in per-cpu
- * lists or in the global quarantine.
+ * global quarantine. Otherwise kasan_quarantine_remove_cache() can
+ * miss some objects belonging to the cache if they are in our local
+ * temp list. kasan_quarantine_remove_cache() executes on_each_cpu()
+ * at the beginning which ensures that it either sees the objects in
+ * per-cpu lists or in the global quarantine.
*/
local_irq_save(flags);
q = this_cpu_ptr(&cpu_quarantine);
- qlist_put(q, &info->quarantine_link, cache->size);
+ if (q->offline) {
+ local_irq_restore(flags);
+ return false;
+ }
+ qlist_put(q, &meta->quarantine_link, cache->size);
if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) {
qlist_move_all(q, &temp);
@@ -209,9 +238,11 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
}
local_irq_restore(flags);
+
+ return true;
}
-void quarantine_reduce(void)
+void kasan_quarantine_reduce(void)
{
size_t total_size, new_quarantine_size, percpu_quarantines;
unsigned long flags;
@@ -223,7 +254,7 @@ void quarantine_reduce(void)
return;
/*
- * srcu critical section ensures that quarantine_remove_cache()
+ * srcu critical section ensures that kasan_quarantine_remove_cache()
* will not miss objects belonging to the cache while they are in our
* local to_free list. srcu is chosen because (1) it gives us private
* grace period domain that does not interfere with anything else,
@@ -286,32 +317,58 @@ static void qlist_move_cache(struct qlist_head *from,
}
}
-static void per_cpu_remove_cache(void *arg)
+static void __per_cpu_remove_cache(struct qlist_head *q, void *arg)
{
struct kmem_cache *cache = arg;
- struct qlist_head to_free = QLIST_INIT;
+ unsigned long flags;
+ struct cpu_shrink_qlist *sq;
+
+ sq = this_cpu_ptr(&shrink_qlist);
+ raw_spin_lock_irqsave(&sq->lock, flags);
+ qlist_move_cache(q, &sq->qlist, cache);
+ raw_spin_unlock_irqrestore(&sq->lock, flags);
+}
+
+static void per_cpu_remove_cache(void *arg)
+{
struct qlist_head *q;
q = this_cpu_ptr(&cpu_quarantine);
- qlist_move_cache(q, &to_free, cache);
- qlist_free_all(&to_free, cache);
+ /*
+ * Ensure the ordering between the write to q->offline and
+ * per_cpu_remove_cache(). This prevents cpu_quarantine from being
+ * corrupted by an interrupt.
+ */
+ if (READ_ONCE(q->offline))
+ return;
+ __per_cpu_remove_cache(q, arg);
}
/* Free all quarantined objects belonging to cache. */
-void quarantine_remove_cache(struct kmem_cache *cache)
+void kasan_quarantine_remove_cache(struct kmem_cache *cache)
{
unsigned long flags, i;
struct qlist_head to_free = QLIST_INIT;
+ int cpu;
+ struct cpu_shrink_qlist *sq;
/*
* Must be careful to not miss any objects that are being moved from
- * per-cpu list to the global quarantine in quarantine_put(),
- * nor objects being freed in quarantine_reduce(). on_each_cpu()
+ * per-cpu list to the global quarantine in kasan_quarantine_put(),
+ * nor objects being freed in kasan_quarantine_reduce(). on_each_cpu()
* achieves the first goal, while synchronize_srcu() achieves the
* second.
*/
on_each_cpu(per_cpu_remove_cache, cache, 1);
+ for_each_online_cpu(cpu) {
+ sq = per_cpu_ptr(&shrink_qlist, cpu);
+ raw_spin_lock_irqsave(&sq->lock, flags);
+ qlist_move_cache(&sq->qlist, &to_free, cache);
+ raw_spin_unlock_irqrestore(&sq->lock, flags);
+ }
+ qlist_free_all(&to_free, cache);
+
raw_spin_lock_irqsave(&quarantine_lock, flags);
for (i = 0; i < QUARANTINE_BATCHES; i++) {
if (qlist_empty(&global_quarantine[i]))
@@ -328,3 +385,36 @@ void quarantine_remove_cache(struct kmem_cache *cache)
synchronize_srcu(&remove_cache_srcu);
}
+
+static int kasan_cpu_online(unsigned int cpu)
+{
+ this_cpu_ptr(&cpu_quarantine)->offline = false;
+ return 0;
+}
+
+static int kasan_cpu_offline(unsigned int cpu)
+{
+ struct qlist_head *q;
+
+ q = this_cpu_ptr(&cpu_quarantine);
+ /*
+ * Ensure the ordering between the write to q->offline and
+ * qlist_free_all(). Otherwise, cpu_quarantine may be corrupted
+ * by an interrupt.
+ */
+ WRITE_ONCE(q->offline, true);
+ barrier();
+ qlist_free_all(q, NULL);
+ return 0;
+}
+
+static int __init kasan_cpu_quarantine_init(void)
+{
+ int ret = 0;
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/kasan:online",
+ kasan_cpu_online, kasan_cpu_offline);
+ if (ret < 0)
+ pr_err("kasan cpu quarantine register failed [%d]\n", ret);
+ return ret;
+}
+late_initcall(kasan_cpu_quarantine_init);
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 00a53f1355ae..ca4b6ff080a6 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -1,23 +1,20 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * This file contains common generic and tag-based KASAN error reporting code.
+ * This file contains common KASAN error reporting code.
*
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
*
* Some code borrowed from https://github.com/xairy/kasan-prototype by
* Andrey Konovalov <andreyknvl@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
+#include <kunit/test.h>
#include <linux/bitops.h>
#include <linux/ftrace.h>
#include <linux/init.h>
#include <linux/kernel.h>
+#include <linux/lockdep.h>
#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/sched.h>
@@ -30,25 +27,112 @@
#include <linux/module.h>
#include <linux/sched/task_stack.h>
#include <linux/uaccess.h>
+#include <trace/events/error_report.h>
#include <asm/sections.h>
-#include <kunit/test.h>
-
#include "kasan.h"
#include "../slab.h"
-/* Shadow layout customization. */
-#define SHADOW_BYTES_PER_BLOCK 1
-#define SHADOW_BLOCKS_PER_ROW 16
-#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK)
-#define SHADOW_ROWS_AROUND_ADDR 2
-
static unsigned long kasan_flags;
#define KASAN_BIT_REPORTED 0
#define KASAN_BIT_MULTI_SHOT 1
+enum kasan_arg_fault {
+ KASAN_ARG_FAULT_DEFAULT,
+ KASAN_ARG_FAULT_REPORT,
+ KASAN_ARG_FAULT_PANIC,
+ KASAN_ARG_FAULT_PANIC_ON_WRITE,
+};
+
+static enum kasan_arg_fault kasan_arg_fault __ro_after_init = KASAN_ARG_FAULT_DEFAULT;
+
+/* kasan.fault=report/panic */
+static int __init early_kasan_fault(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "report"))
+ kasan_arg_fault = KASAN_ARG_FAULT_REPORT;
+ else if (!strcmp(arg, "panic"))
+ kasan_arg_fault = KASAN_ARG_FAULT_PANIC;
+ else if (!strcmp(arg, "panic_on_write"))
+ kasan_arg_fault = KASAN_ARG_FAULT_PANIC_ON_WRITE;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("kasan.fault", early_kasan_fault);
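/*
 * Example boot command lines for the parameter parsed above (illustrative;
 * the semantics follow the switch in end_report() later in this file):
 *
 *   kasan.fault=report          report the bug and continue (the default)
 *   kasan.fault=panic           panic after printing the first report
 *   kasan.fault=panic_on_write  panic only if the bad access is a write
 */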
+
+static int __init kasan_set_multi_shot(char *str)
+{
+ set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
+ return 1;
+}
+__setup("kasan_multi_shot", kasan_set_multi_shot);
+
+/*
+ * This function is used to check whether KASAN reports are suppressed for
+ * software KASAN modes via kasan_disable/enable_current() critical sections.
+ *
+ * This is done to avoid:
+ * 1. False-positive reports when accessing slab metadata,
+ * 2. Deadlocking when poisoned memory is accessed by the reporting code.
+ *
+ * Hardware Tag-Based KASAN instead relies on:
+ * For #1: Resetting tags via kasan_reset_tag().
+ * For #2: Suppression of tag checks via CPU, see report_suppress_start/end().
+ */
+static bool report_suppressed_sw(void)
+{
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
+ if (current->kasan_depth)
+ return true;
+#endif
+ return false;
+}
+
+static void report_suppress_start(void)
+{
+#ifdef CONFIG_KASAN_HW_TAGS
+ /*
+ * Disable preemption for the duration of printing a KASAN report, as
+ * hw_suppress_tag_checks_start() disables checks on the current CPU.
+ */
+ preempt_disable();
+ hw_suppress_tag_checks_start();
+#else
+ kasan_disable_current();
+#endif
+}
+
+static void report_suppress_stop(void)
+{
+#ifdef CONFIG_KASAN_HW_TAGS
+ hw_suppress_tag_checks_stop();
+ preempt_enable();
+#else
+ kasan_enable_current();
+#endif
+}
+
+/*
+ * Used to avoid reporting more than one KASAN bug unless kasan_multi_shot
+ * is enabled. Note that KASAN tests effectively enable kasan_multi_shot
+ * for their duration.
+ */
+static bool report_enabled(void)
+{
+ if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
+ return true;
+ return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
+}
+
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST)
+
bool kasan_save_enable_multi_shot(void)
{
return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
@@ -62,147 +146,203 @@ void kasan_restore_multi_shot(bool enabled)
}
EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
-static int __init kasan_set_multi_shot(char *str)
+#endif
+
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
+
+/*
+ * Whether the KASAN KUnit test suite is currently being executed.
+ * Updated in kasan_test.c.
+ */
+static bool kasan_kunit_executing;
+
+void kasan_kunit_test_suite_start(void)
{
- set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
- return 1;
+ WRITE_ONCE(kasan_kunit_executing, true);
}
-__setup("kasan_multi_shot", kasan_set_multi_shot);
+EXPORT_SYMBOL_GPL(kasan_kunit_test_suite_start);
+
+void kasan_kunit_test_suite_end(void)
+{
+ WRITE_ONCE(kasan_kunit_executing, false);
+}
+EXPORT_SYMBOL_GPL(kasan_kunit_test_suite_end);
-static void print_error_description(struct kasan_access_info *info)
+static bool kasan_kunit_test_suite_executing(void)
{
- pr_err("BUG: KASAN: %s in %pS\n",
- get_bug_type(info), (void *)info->ip);
- pr_err("%s of size %zu at addr %px by task %s/%d\n",
- info->is_write ? "Write" : "Read", info->access_size,
- info->access_addr, current->comm, task_pid_nr(current));
+ return READ_ONCE(kasan_kunit_executing);
+}
+
+#else /* CONFIG_KASAN_KUNIT_TEST */
+
+static inline bool kasan_kunit_test_suite_executing(void) { return false; }
+
+#endif /* CONFIG_KASAN_KUNIT_TEST */
+
+#if IS_ENABLED(CONFIG_KUNIT)
+
+static void fail_non_kasan_kunit_test(void)
+{
+ struct kunit *test;
+
+ if (kasan_kunit_test_suite_executing())
+ return;
+
+ test = current->kunit_test;
+ if (test)
+ kunit_set_failure(test);
}
+#else /* CONFIG_KUNIT */
+
+static inline void fail_non_kasan_kunit_test(void) { }
+
+#endif /* CONFIG_KUNIT */
+
static DEFINE_SPINLOCK(report_lock);
-static void start_report(unsigned long *flags)
+static void start_report(unsigned long *flags, bool sync)
{
- /*
- * Make sure we don't end up in loop.
- */
- kasan_disable_current();
+ fail_non_kasan_kunit_test();
+ /* Respect the /proc/sys/kernel/traceoff_on_warning interface. */
+ disable_trace_on_warning();
+ /* Do not allow LOCKDEP mangling KASAN reports. */
+ lockdep_off();
+ /* Make sure we don't end up in loop. */
+ report_suppress_start();
spin_lock_irqsave(&report_lock, *flags);
pr_err("==================================================================\n");
}
-static void end_report(unsigned long *flags)
+static void end_report(unsigned long *flags, const void *addr, bool is_write)
{
+ if (addr)
+ trace_error_report_end(ERROR_DETECTOR_KASAN,
+ (unsigned long)addr);
pr_err("==================================================================\n");
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, *flags);
- if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) {
- /*
- * This thread may hit another WARN() in the panic path.
- * Resetting this prevents additional WARN() from panicking the
- * system on this thread. Other threads are blocked by the
- * panic_mutex in panic().
- */
- panic_on_warn = 0;
- panic("panic_on_warn set ...\n");
+ if (!test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
+ check_panic_on_warn("KASAN");
+ switch (kasan_arg_fault) {
+ case KASAN_ARG_FAULT_DEFAULT:
+ case KASAN_ARG_FAULT_REPORT:
+ break;
+ case KASAN_ARG_FAULT_PANIC:
+ panic("kasan.fault=panic set ...\n");
+ break;
+ case KASAN_ARG_FAULT_PANIC_ON_WRITE:
+ if (is_write)
+ panic("kasan.fault=panic_on_write set ...\n");
+ break;
}
- kasan_enable_current();
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+ lockdep_on();
+ report_suppress_stop();
}
-static void print_stack(depot_stack_handle_t stack)
+static void print_error_description(struct kasan_report_info *info)
{
- unsigned long *entries;
- unsigned int nr_entries;
+ pr_err("BUG: KASAN: %s in %pS\n", info->bug_type, (void *)info->ip);
- nr_entries = stack_depot_fetch(stack, &entries);
- stack_trace_print(entries, nr_entries, 0);
+ if (info->type != KASAN_REPORT_ACCESS) {
+ pr_err("Free of addr %px by task %s/%d\n",
+ info->access_addr, current->comm, task_pid_nr(current));
+ return;
+ }
+
+ if (info->access_size)
+ pr_err("%s of size %zu at addr %px by task %s/%d\n",
+ info->is_write ? "Write" : "Read", info->access_size,
+ info->access_addr, current->comm, task_pid_nr(current));
+ else
+ pr_err("%s at addr %px by task %s/%d\n",
+ info->is_write ? "Write" : "Read",
+ info->access_addr, current->comm, task_pid_nr(current));
}
static void print_track(struct kasan_track *track, const char *prefix)
{
pr_err("%s by task %u:\n", prefix, track->pid);
- if (track->stack) {
- print_stack(track->stack);
- } else {
+ if (track->stack)
+ stack_depot_print(track->stack);
+ else
pr_err("(stack is not available)\n");
- }
}
-struct page *kasan_addr_to_page(const void *addr)
+static inline struct page *addr_to_page(const void *addr)
{
- if ((addr >= (void *)PAGE_OFFSET) &&
- (addr < high_memory))
+ if (virt_addr_valid(addr))
return virt_to_head_page(addr);
return NULL;
}
-static void describe_object_addr(struct kmem_cache *cache, void *object,
- const void *addr)
+static void describe_object_addr(const void *addr, struct kasan_report_info *info)
{
unsigned long access_addr = (unsigned long)addr;
- unsigned long object_addr = (unsigned long)object;
- const char *rel_type;
+ unsigned long object_addr = (unsigned long)info->object;
+ const char *rel_type, *region_state = "";
int rel_bytes;
pr_err("The buggy address belongs to the object at %px\n"
" which belongs to the cache %s of size %d\n",
- object, cache->name, cache->object_size);
-
- if (!addr)
- return;
+ info->object, info->cache->name, info->cache->object_size);
if (access_addr < object_addr) {
rel_type = "to the left";
rel_bytes = object_addr - access_addr;
- } else if (access_addr >= object_addr + cache->object_size) {
+ } else if (access_addr >= object_addr + info->alloc_size) {
rel_type = "to the right";
- rel_bytes = access_addr - (object_addr + cache->object_size);
+ rel_bytes = access_addr - (object_addr + info->alloc_size);
} else {
rel_type = "inside";
rel_bytes = access_addr - object_addr;
}
+ /*
+ * Tag-Based modes use the stack ring to infer the bug type, but the
+ * memory region state description is generated based on the metadata.
+ * Thus, defining the region state as below can contradict the metadata.
+ * Fixing this requires further improvements, so only infer the state
+ * for the Generic mode.
+ */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC)) {
+ if (strcmp(info->bug_type, "slab-out-of-bounds") == 0)
+ region_state = "allocated ";
+ else if (strcmp(info->bug_type, "slab-use-after-free") == 0)
+ region_state = "freed ";
+ }
+
pr_err("The buggy address is located %d bytes %s of\n"
- " %d-byte region [%px, %px)\n",
- rel_bytes, rel_type, cache->object_size, (void *)object_addr,
- (void *)(object_addr + cache->object_size));
+ " %s%zu-byte region [%px, %px)\n",
+ rel_bytes, rel_type, region_state, info->alloc_size,
+ (void *)object_addr, (void *)(object_addr + info->alloc_size));
}
-static void describe_object(struct kmem_cache *cache, void *object,
- const void *addr, u8 tag)
+static void describe_object_stacks(struct kasan_report_info *info)
{
- struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
-
- if (cache->flags & SLAB_KASAN) {
- struct kasan_track *free_track;
-
- print_track(&alloc_info->alloc_track, "Allocated");
+ if (info->alloc_track.stack) {
+ print_track(&info->alloc_track, "Allocated");
pr_err("\n");
- free_track = kasan_get_free_track(cache, object, tag);
- if (free_track) {
- print_track(free_track, "Freed");
- pr_err("\n");
- }
+ }
-#ifdef CONFIG_KASAN_GENERIC
- if (alloc_info->aux_stack[0]) {
- pr_err("Last call_rcu():\n");
- print_stack(alloc_info->aux_stack[0]);
- pr_err("\n");
- }
- if (alloc_info->aux_stack[1]) {
- pr_err("Second to last call_rcu():\n");
- print_stack(alloc_info->aux_stack[1]);
- pr_err("\n");
- }
-#endif
+ if (info->free_track.stack) {
+ print_track(&info->free_track, "Freed");
+ pr_err("\n");
}
- describe_object_addr(cache, object, addr);
+ kasan_print_aux_stacks(info->cache, info->object);
+}
+
+static void describe_object(const void *addr, struct kasan_report_info *info)
+{
+ if (kasan_stack_collection_enabled())
+ describe_object_stacks(info);
+ describe_object_addr(addr, info);
}
static inline bool kernel_or_module_addr(const void *addr)
{
- if (addr >= (void *)_stext && addr < (void *)_end)
+ if (is_kernel((unsigned long)addr))
return true;
if (is_module_address((unsigned long)addr))
return true;
@@ -216,357 +356,270 @@ static inline bool init_task_stack_addr(const void *addr)
sizeof(init_thread_union.stack));
}
-static bool __must_check tokenize_frame_descr(const char **frame_descr,
- char *token, size_t max_tok_len,
- unsigned long *value)
+static void print_address_description(void *addr, u8 tag,
+ struct kasan_report_info *info)
{
- const char *sep = strchr(*frame_descr, ' ');
-
- if (sep == NULL)
- sep = *frame_descr + strlen(*frame_descr);
-
- if (token != NULL) {
- const size_t tok_len = sep - *frame_descr;
-
- if (tok_len + 1 > max_tok_len) {
- pr_err("KASAN internal error: frame description too long: %s\n",
- *frame_descr);
- return false;
- }
-
- /* Copy token (+ 1 byte for '\0'). */
- strlcpy(token, *frame_descr, tok_len + 1);
- }
-
- /* Advance frame_descr past separator. */
- *frame_descr = sep + 1;
-
- if (value != NULL && kstrtoul(token, 10, value)) {
- pr_err("KASAN internal error: not a valid number: %s\n", token);
- return false;
- }
-
- return true;
-}
-
-static void print_decoded_frame_descr(const char *frame_descr)
-{
- /*
- * We need to parse the following string:
- * "n alloc_1 alloc_2 ... alloc_n"
- * where alloc_i looks like
- * "offset size len name"
- * or "offset size len name:line".
- */
-
- char token[64];
- unsigned long num_objects;
-
- if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
- &num_objects))
- return;
+ struct page *page = addr_to_page(addr);
+ dump_stack_lvl(KERN_ERR);
pr_err("\n");
- pr_err("this frame has %lu %s:\n", num_objects,
- num_objects == 1 ? "object" : "objects");
-
- while (num_objects--) {
- unsigned long offset;
- unsigned long size;
-
- /* access offset */
- if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
- &offset))
- return;
- /* access size */
- if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
- &size))
- return;
- /* name length (unused) */
- if (!tokenize_frame_descr(&frame_descr, NULL, 0, NULL))
- return;
- /* object name */
- if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
- NULL))
- return;
-
- /* Strip line number; without filename it's not very helpful. */
- strreplace(token, ':', '\0');
-
- /* Finally, print object information. */
- pr_err(" [%lu, %lu) '%s'", offset, offset + size, token);
- }
-}
-
-static bool __must_check get_address_stack_frame_info(const void *addr,
- unsigned long *offset,
- const char **frame_descr,
- const void **frame_pc)
-{
- unsigned long aligned_addr;
- unsigned long mem_ptr;
- const u8 *shadow_bottom;
- const u8 *shadow_ptr;
- const unsigned long *frame;
-
- BUILD_BUG_ON(IS_ENABLED(CONFIG_STACK_GROWSUP));
-
- /*
- * NOTE: We currently only support printing frame information for
- * accesses to the task's own stack.
- */
- if (!object_is_on_stack(addr))
- return false;
- aligned_addr = round_down((unsigned long)addr, sizeof(long));
- mem_ptr = round_down(aligned_addr, KASAN_SHADOW_SCALE_SIZE);
- shadow_ptr = kasan_mem_to_shadow((void *)aligned_addr);
- shadow_bottom = kasan_mem_to_shadow(end_of_stack(current));
-
- while (shadow_ptr >= shadow_bottom && *shadow_ptr != KASAN_STACK_LEFT) {
- shadow_ptr--;
- mem_ptr -= KASAN_SHADOW_SCALE_SIZE;
+ if (info->cache && info->object) {
+ describe_object(addr, info);
+ pr_err("\n");
}
- while (shadow_ptr >= shadow_bottom && *shadow_ptr == KASAN_STACK_LEFT) {
- shadow_ptr--;
- mem_ptr -= KASAN_SHADOW_SCALE_SIZE;
+ if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) {
+ pr_err("The buggy address belongs to the variable:\n");
+ pr_err(" %pS\n", addr);
+ pr_err("\n");
}
- if (shadow_ptr < shadow_bottom)
- return false;
-
- frame = (const unsigned long *)(mem_ptr + KASAN_SHADOW_SCALE_SIZE);
- if (frame[0] != KASAN_CURRENT_STACK_FRAME_MAGIC) {
- pr_err("KASAN internal error: frame info validation failed; invalid marker: %lu\n",
- frame[0]);
- return false;
+ if (object_is_on_stack(addr)) {
+ /*
+ * Currently, KASAN supports printing frame information only
+ * for accesses to the task's own stack.
+ */
+ kasan_print_address_stack_frame(addr);
+ pr_err("\n");
}
- *offset = (unsigned long)addr - (unsigned long)frame;
- *frame_descr = (const char *)frame[1];
- *frame_pc = (void *)frame[2];
-
- return true;
-}
-
-static void print_address_stack_frame(const void *addr)
-{
- unsigned long offset;
- const char *frame_descr;
- const void *frame_pc;
-
- if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- return;
-
- if (!get_address_stack_frame_info(addr, &offset, &frame_descr,
- &frame_pc))
- return;
-
- /*
- * get_address_stack_frame_info only returns true if the given addr is
- * on the current task's stack.
- */
- pr_err("\n");
- pr_err("addr %px is located in stack of task %s/%d at offset %lu in frame:\n",
- addr, current->comm, task_pid_nr(current), offset);
- pr_err(" %pS\n", frame_pc);
+ if (is_vmalloc_addr(addr)) {
+ struct vm_struct *va = find_vm_area(addr);
- if (!frame_descr)
- return;
-
- print_decoded_frame_descr(frame_descr);
-}
-
-static void print_address_description(void *addr, u8 tag)
-{
- struct page *page = kasan_addr_to_page(addr);
-
- dump_stack();
- pr_err("\n");
-
- if (page && PageSlab(page)) {
- struct kmem_cache *cache = page->slab_cache;
- void *object = nearest_obj(cache, page, addr);
-
- describe_object(cache, object, addr, tag);
- }
+ if (va) {
+ pr_err("The buggy address belongs to the virtual mapping at\n"
+ " [%px, %px) created by:\n"
+ " %pS\n",
+ va->addr, va->addr + va->size, va->caller);
+ pr_err("\n");
- if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) {
- pr_err("The buggy address belongs to the variable:\n");
- pr_err(" %pS\n", addr);
+ page = vmalloc_to_page(addr);
+ }
}
if (page) {
- pr_err("The buggy address belongs to the page:\n");
+ pr_err("The buggy address belongs to the physical page:\n");
dump_page(page, "kasan: bad access detected");
+ pr_err("\n");
}
-
- print_address_stack_frame(addr);
}
-static bool row_is_guilty(const void *row, const void *guilty)
+static bool meta_row_is_guilty(const void *row, const void *addr)
{
- return (row <= guilty) && (guilty < row + SHADOW_BYTES_PER_ROW);
+ return (row <= addr) && (addr < row + META_MEM_BYTES_PER_ROW);
}
-static int shadow_pointer_offset(const void *row, const void *shadow)
+static int meta_pointer_offset(const void *row, const void *addr)
{
- /* The length of ">ff00ff00ff00ff00: " is
- * 3 + (BITS_PER_LONG/8)*2 chars.
+ /*
+ * Memory state around the buggy address:
+ * ff00ff00ff00ff00: 00 00 00 05 fe fe fe fe fe fe fe fe fe fe fe fe
+ * ...
+ *
+ * The length of ">ff00ff00ff00ff00: " is
+ * 3 + (BITS_PER_LONG / 8) * 2 chars.
+ * The length of each granule metadata is 2 bytes
+ * plus 1 byte for space.
*/
- return 3 + (BITS_PER_LONG/8)*2 + (shadow - row)*2 +
- (shadow - row) / SHADOW_BYTES_PER_BLOCK + 1;
+ return 3 + (BITS_PER_LONG / 8) * 2 +
+ (addr - row) / KASAN_GRANULE_SIZE * 3 + 1;
}
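/*
 * Worked example of the caret-offset arithmetic above, as a standalone
 * sketch (not part of the patch; assumes a 64-bit kernel and the generic
 * mode's 8-byte granule, so a row shows 16 metadata bytes; the sample shadow
 * values are made up). For an address 40 bytes into its row, the offset is
 * 3 + 16 + (40 / 8) * 3 + 1 = 35, which puts the '^' under the sixth
 * metadata byte.
 */
#include <stdio.h>

#define BITS_PER_LONG      64
#define KASAN_GRANULE_SIZE 8
#define META_BYTES_PER_ROW 16

static int meta_pointer_offset(unsigned long row, unsigned long addr)
{
        return 3 + (BITS_PER_LONG / 8) * 2 +
               (addr - row) / KASAN_GRANULE_SIZE * 3 + 1;
}

int main(void)
{
        unsigned long row = 0xffff000012345600UL;       /* made-up row base */
        unsigned long addr = row + 40;                  /* guilty address */
        unsigned char meta[META_BYTES_PER_ROW] = {
                0, 0, 0, 0, 0, 0xfb, 0xfb, 0xfb,
                0xfb, 0xfb, 0xfb, 0xfb, 0xfb, 0xfb, 0xfb, 0xfb,
        };
        int i;

        printf(">%016lx: ", row);
        for (i = 0; i < META_BYTES_PER_ROW; i++)
                printf("%02x ", meta[i]);
        printf("\n%*c\n", meta_pointer_offset(row, addr), '^');
        return 0;
}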
-static void print_shadow_for_address(const void *addr)
+static void print_memory_metadata(const void *addr)
{
int i;
- const void *shadow = kasan_mem_to_shadow(addr);
- const void *shadow_row;
+ void *row;
- shadow_row = (void *)round_down((unsigned long)shadow,
- SHADOW_BYTES_PER_ROW)
- - SHADOW_ROWS_AROUND_ADDR * SHADOW_BYTES_PER_ROW;
+ row = (void *)round_down((unsigned long)addr, META_MEM_BYTES_PER_ROW)
+ - META_ROWS_AROUND_ADDR * META_MEM_BYTES_PER_ROW;
pr_err("Memory state around the buggy address:\n");
- for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) {
- const void *kaddr = kasan_shadow_to_mem(shadow_row);
- char buffer[4 + (BITS_PER_LONG/8)*2];
- char shadow_buf[SHADOW_BYTES_PER_ROW];
+ for (i = -META_ROWS_AROUND_ADDR; i <= META_ROWS_AROUND_ADDR; i++) {
+ char buffer[4 + (BITS_PER_LONG / 8) * 2];
+ char metadata[META_BYTES_PER_ROW];
snprintf(buffer, sizeof(buffer),
- (i == 0) ? ">%px: " : " %px: ", kaddr);
+ (i == 0) ? ">%px: " : " %px: ", row);
+
/*
* We should not pass a shadow pointer to generic
* function, because generic functions may try to
* access kasan mapping for the passed address.
*/
- memcpy(shadow_buf, shadow_row, SHADOW_BYTES_PER_ROW);
+ kasan_metadata_fetch_row(&metadata[0], row);
+
print_hex_dump(KERN_ERR, buffer,
- DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1,
- shadow_buf, SHADOW_BYTES_PER_ROW, 0);
+ DUMP_PREFIX_NONE, META_BYTES_PER_ROW, 1,
+ metadata, META_BYTES_PER_ROW, 0);
- if (row_is_guilty(shadow_row, shadow))
- pr_err("%*c\n",
- shadow_pointer_offset(shadow_row, shadow),
- '^');
+ if (meta_row_is_guilty(row, addr))
+ pr_err("%*c\n", meta_pointer_offset(row, addr), '^');
- shadow_row += SHADOW_BYTES_PER_ROW;
+ row += META_MEM_BYTES_PER_ROW;
}
}
-static bool report_enabled(void)
+static void print_report(struct kasan_report_info *info)
{
- if (current->kasan_depth)
- return false;
- if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
- return true;
- return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
+ void *addr = kasan_reset_tag((void *)info->access_addr);
+ u8 tag = get_tag((void *)info->access_addr);
+
+ print_error_description(info);
+ if (addr_has_metadata(addr))
+ kasan_print_tags(tag, info->first_bad_addr);
+ pr_err("\n");
+
+ if (addr_has_metadata(addr)) {
+ print_address_description(addr, tag, info);
+ print_memory_metadata(info->first_bad_addr);
+ } else {
+ dump_stack_lvl(KERN_ERR);
+ }
}
-#if IS_ENABLED(CONFIG_KUNIT)
-static void kasan_update_kunit_status(struct kunit *cur_test)
+static void complete_report_info(struct kasan_report_info *info)
{
- struct kunit_resource *resource;
- struct kunit_kasan_expectation *kasan_data;
+ void *addr = kasan_reset_tag((void *)info->access_addr);
+ struct slab *slab;
- resource = kunit_find_named_resource(cur_test, "kasan_data");
-
- if (!resource) {
- kunit_set_failure(cur_test);
- return;
+ if (info->type == KASAN_REPORT_ACCESS)
+ info->first_bad_addr = kasan_find_first_bad_addr(
+ (void *)info->access_addr, info->access_size);
+ else
+ info->first_bad_addr = addr;
+
+ slab = kasan_addr_to_slab(addr);
+ if (slab) {
+ info->cache = slab->slab_cache;
+ info->object = nearest_obj(info->cache, slab, addr);
+
+ /* Try to determine allocation size based on the metadata. */
+ info->alloc_size = kasan_get_alloc_size(info->object, info->cache);
+ /* Fallback to the object size if failed. */
+ if (!info->alloc_size)
+ info->alloc_size = info->cache->object_size;
+ } else
+ info->cache = info->object = NULL;
+
+ switch (info->type) {
+ case KASAN_REPORT_INVALID_FREE:
+ info->bug_type = "invalid-free";
+ break;
+ case KASAN_REPORT_DOUBLE_FREE:
+ info->bug_type = "double-free";
+ break;
+ default:
+ /* bug_type filled in by kasan_complete_mode_report_info. */
+ break;
}
- kasan_data = (struct kunit_kasan_expectation *)resource->data;
- kasan_data->report_found = true;
- kunit_put_resource(resource);
+ /* Fill in mode-specific report info fields. */
+ kasan_complete_mode_report_info(info);
}
-#endif /* IS_ENABLED(CONFIG_KUNIT) */
-void kasan_report_invalid_free(void *object, unsigned long ip)
+void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_type type)
{
unsigned long flags;
- u8 tag = get_tag(object);
+ struct kasan_report_info info;
- object = reset_tag(object);
+ /*
+ * Do not check report_suppressed_sw(), as an invalid-free cannot be
+ * caused by accessing poisoned memory and thus should not be suppressed
+ * by kasan_disable/enable_current() critical sections.
+ *
+ * Note that for Hardware Tag-Based KASAN, kasan_report_invalid_free()
+ * is triggered by explicit tag checks and not by the ones performed by
+ * the CPU. Thus, reporting invalid-free is not suppressed as well.
+ */
+ if (unlikely(!report_enabled()))
+ return;
-#if IS_ENABLED(CONFIG_KUNIT)
- if (current->kunit_test)
- kasan_update_kunit_status(current->kunit_test);
-#endif /* IS_ENABLED(CONFIG_KUNIT) */
+ start_report(&flags, true);
- start_report(&flags);
- pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip);
- print_tags(tag, object);
- pr_err("\n");
- print_address_description(object, tag);
- pr_err("\n");
- print_shadow_for_address(object);
- end_report(&flags);
-}
+ memset(&info, 0, sizeof(info));
+ info.type = type;
+ info.access_addr = ptr;
+ info.access_size = 0;
+ info.is_write = false;
+ info.ip = ip;
-static void __kasan_report(unsigned long addr, size_t size, bool is_write,
- unsigned long ip)
-{
- struct kasan_access_info info;
- void *tagged_addr;
- void *untagged_addr;
- unsigned long flags;
+ complete_report_info(&info);
-#if IS_ENABLED(CONFIG_KUNIT)
- if (current->kunit_test)
- kasan_update_kunit_status(current->kunit_test);
-#endif /* IS_ENABLED(CONFIG_KUNIT) */
+ print_report(&info);
- disable_trace_on_warning();
+ /*
+ * An invalid free is considered a "write", since updating the
+ * allocator's metadata involves writes.
+ */
+ end_report(&flags, ptr, true);
+}
- tagged_addr = (void *)addr;
- untagged_addr = reset_tag(tagged_addr);
+/*
+ * kasan_report() is the only reporting function that uses
+ * user_access_save/restore(): kasan_report_invalid_free() cannot be called
+ * from a UACCESS region, and kasan_report_async() is not used on x86.
+ */
+bool kasan_report(const void *addr, size_t size, bool is_write,
+ unsigned long ip)
+{
+ bool ret = true;
+ unsigned long ua_flags = user_access_save();
+ unsigned long irq_flags;
+ struct kasan_report_info info;
+
+ if (unlikely(report_suppressed_sw()) || unlikely(!report_enabled())) {
+ ret = false;
+ goto out;
+ }
- info.access_addr = tagged_addr;
- if (addr_has_shadow(untagged_addr))
- info.first_bad_addr = find_first_bad_addr(tagged_addr, size);
- else
- info.first_bad_addr = untagged_addr;
+ start_report(&irq_flags, true);
+
+ memset(&info, 0, sizeof(info));
+ info.type = KASAN_REPORT_ACCESS;
+ info.access_addr = addr;
info.access_size = size;
info.is_write = is_write;
info.ip = ip;
- start_report(&flags);
+ complete_report_info(&info);
- print_error_description(&info);
- if (addr_has_shadow(untagged_addr))
- print_tags(get_tag(tagged_addr), info.first_bad_addr);
- pr_err("\n");
+ print_report(&info);
- if (addr_has_shadow(untagged_addr)) {
- print_address_description(untagged_addr, get_tag(tagged_addr));
- pr_err("\n");
- print_shadow_for_address(info.first_bad_addr);
- } else {
- dump_stack();
- }
+ end_report(&irq_flags, (void *)addr, is_write);
+
+out:
+ user_access_restore(ua_flags);
- end_report(&flags);
+ return ret;
}
-bool kasan_report(unsigned long addr, size_t size, bool is_write,
- unsigned long ip)
+#ifdef CONFIG_KASAN_HW_TAGS
+void kasan_report_async(void)
{
- unsigned long flags = user_access_save();
- bool ret = false;
-
- if (likely(report_enabled())) {
- __kasan_report(addr, size, is_write, ip);
- ret = true;
- }
+ unsigned long flags;
- user_access_restore(flags);
+ /*
+ * Do not check report_suppressed_sw(), as
+ * kasan_disable/enable_current() critical sections do not affect
+ * Hardware Tag-Based KASAN.
+ */
+ if (unlikely(!report_enabled()))
+ return;
- return ret;
+ start_report(&flags, false);
+ pr_err("BUG: KASAN: invalid-access\n");
+ pr_err("Asynchronous fault: no details available\n");
+ pr_err("\n");
+ dump_stack_lvl(KERN_ERR);
+ /*
+ * Conservatively set is_write=true, because no details are available.
+ * In this mode, kasan.fault=panic_on_write is like kasan.fault=panic.
+ */
+ end_report(&flags, NULL, true);
}
+#endif /* CONFIG_KASAN_HW_TAGS */
#ifdef CONFIG_KASAN_INLINE
/*
@@ -604,6 +657,6 @@ void kasan_non_canonical_hook(unsigned long addr)
else
bug_type = "maybe wild-memory-access";
pr_alert("KASAN: %s in range [0x%016lx-0x%016lx]\n", bug_type,
- orig_addr, orig_addr + KASAN_SHADOW_MASK);
+ orig_addr, orig_addr + KASAN_GRANULE_SIZE - 1);
}
#endif
diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c
new file mode 100644
index 000000000000..51a1e8a8877f
--- /dev/null
+++ b/mm/kasan/report_generic.c
@@ -0,0 +1,399 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains generic KASAN specific error reporting code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
+ * Andrey Konovalov <andreyknvl@gmail.com>
+ */
+
+#include <linux/bitops.h>
+#include <linux/ftrace.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/slab.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/kasan.h>
+#include <linux/module.h>
+
+#include <asm/sections.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+const void *kasan_find_first_bad_addr(const void *addr, size_t size)
+{
+ const void *p = addr;
+
+ if (!addr_has_metadata(p))
+ return p;
+
+ while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p)))
+ p += KASAN_GRANULE_SIZE;
+
+ return p;
+}
+
+size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache)
+{
+ size_t size = 0;
+ u8 *shadow;
+
+ /*
+ * Skip the addr_has_metadata check, as this function only operates on
+ * slab memory, which must have metadata.
+ */
+
+ /*
+ * The loop below returns 0 for freed objects, for which KASAN cannot
+ * calculate the allocation size based on the metadata.
+ */
+ shadow = (u8 *)kasan_mem_to_shadow(object);
+ while (size < cache->object_size) {
+ if (*shadow == 0)
+ size += KASAN_GRANULE_SIZE;
+ else if (*shadow >= 1 && *shadow <= KASAN_GRANULE_SIZE - 1)
+ return size + *shadow;
+ else
+ return size;
+ shadow++;
+ }
+
+ return cache->object_size;
+}
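/*
 * Standalone sketch of the shadow walk above (not part of the patch): in
 * generic KASAN a shadow byte of 0 marks a fully accessible 8-byte granule,
 * a value of 1..7 marks that many accessible bytes in the granule, and
 * anything else is a redzone or free marker. The sample shadow values below
 * are made up.
 */
#include <stdio.h>

#define KASAN_GRANULE_SIZE 8

static size_t get_alloc_size(const unsigned char *shadow, size_t object_size)
{
        size_t size = 0;

        while (size < object_size) {
                if (*shadow == 0)
                        size += KASAN_GRANULE_SIZE;
                else if (*shadow >= 1 && *shadow <= KASAN_GRANULE_SIZE - 1)
                        return size + *shadow;  /* partially used last granule */
                else
                        return size;            /* redzone or freed object */
                shadow++;
        }
        return object_size;
}

int main(void)
{
        /* 32-byte object: two full granules, 5 bytes in the third, then redzone. */
        const unsigned char shadow[] = { 0x00, 0x00, 0x05, 0xfc };

        printf("allocation size = %zu\n", get_alloc_size(shadow, 32));  /* 21 */
        return 0;
}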
+
+static const char *get_shadow_bug_type(struct kasan_report_info *info)
+{
+ const char *bug_type = "unknown-crash";
+ u8 *shadow_addr;
+
+ shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr);
+
+ /*
+ * If shadow byte value is in [0, KASAN_GRANULE_SIZE) we can look
+ * at the next shadow byte to determine the type of the bad access.
+ */
+ if (*shadow_addr > 0 && *shadow_addr <= KASAN_GRANULE_SIZE - 1)
+ shadow_addr++;
+
+ switch (*shadow_addr) {
+ case 0 ... KASAN_GRANULE_SIZE - 1:
+ /*
+ * In theory it's still possible to see these shadow values
+ * due to a data race in the kernel code.
+ */
+ bug_type = "out-of-bounds";
+ break;
+ case KASAN_PAGE_REDZONE:
+ case KASAN_SLAB_REDZONE:
+ bug_type = "slab-out-of-bounds";
+ break;
+ case KASAN_GLOBAL_REDZONE:
+ bug_type = "global-out-of-bounds";
+ break;
+ case KASAN_STACK_LEFT:
+ case KASAN_STACK_MID:
+ case KASAN_STACK_RIGHT:
+ case KASAN_STACK_PARTIAL:
+ bug_type = "stack-out-of-bounds";
+ break;
+ case KASAN_PAGE_FREE:
+ bug_type = "use-after-free";
+ break;
+ case KASAN_SLAB_FREE:
+ case KASAN_SLAB_FREETRACK:
+ bug_type = "slab-use-after-free";
+ break;
+ case KASAN_ALLOCA_LEFT:
+ case KASAN_ALLOCA_RIGHT:
+ bug_type = "alloca-out-of-bounds";
+ break;
+ case KASAN_VMALLOC_INVALID:
+ bug_type = "vmalloc-out-of-bounds";
+ break;
+ }
+
+ return bug_type;
+}
+
+static const char *get_wild_bug_type(struct kasan_report_info *info)
+{
+ const char *bug_type = "unknown-crash";
+
+ if ((unsigned long)info->access_addr < PAGE_SIZE)
+ bug_type = "null-ptr-deref";
+ else if ((unsigned long)info->access_addr < TASK_SIZE)
+ bug_type = "user-memory-access";
+ else
+ bug_type = "wild-memory-access";
+
+ return bug_type;
+}
+
+static const char *get_bug_type(struct kasan_report_info *info)
+{
+ /*
+ * If access_size is a negative number, then it has reason to be
+ * defined as out-of-bounds bug type.
+ *
+ * Casting negative numbers to size_t would indeed turn up as
+ * a large size_t and its value will be larger than ULONG_MAX/2,
+ * so that this can qualify as out-of-bounds.
+ */
+ if (info->access_addr + info->access_size < info->access_addr)
+ return "out-of-bounds";
+
+ if (addr_has_metadata(info->access_addr))
+ return get_shadow_bug_type(info);
+ return get_wild_bug_type(info);
+}
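/*
 * Illustration of the wrap-around check above with hypothetical values (not
 * part of the patch; assumes a 64-bit build): a negative size cast to size_t
 * makes addr + size wrap below addr, which get_bug_type() treats as
 * out-of-bounds.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uintptr_t addr = 0xffff000012345678ULL; /* made-up kernel address */
        size_t size = (size_t)-5;               /* e.g. a negative ssize_t */

        printf("addr + size < addr: %s\n", addr + size < addr ? "yes" : "no");
        return 0;
}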
+
+void kasan_complete_mode_report_info(struct kasan_report_info *info)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ struct kasan_free_meta *free_meta;
+
+ if (!info->bug_type)
+ info->bug_type = get_bug_type(info);
+
+ if (!info->cache || !info->object)
+ return;
+
+ alloc_meta = kasan_get_alloc_meta(info->cache, info->object);
+ if (alloc_meta)
+ memcpy(&info->alloc_track, &alloc_meta->alloc_track,
+ sizeof(info->alloc_track));
+
+ if (*(u8 *)kasan_mem_to_shadow(info->object) == KASAN_SLAB_FREETRACK) {
+ /* Free meta must be present with KASAN_SLAB_FREETRACK. */
+ free_meta = kasan_get_free_meta(info->cache, info->object);
+ memcpy(&info->free_track, &free_meta->free_track,
+ sizeof(info->free_track));
+ }
+}
+
+void kasan_metadata_fetch_row(char *buffer, void *row)
+{
+ memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW);
+}
+
+void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object)
+{
+ struct kasan_alloc_meta *alloc_meta;
+
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (!alloc_meta)
+ return;
+
+ if (alloc_meta->aux_stack[0]) {
+ pr_err("Last potentially related work creation:\n");
+ stack_depot_print(alloc_meta->aux_stack[0]);
+ pr_err("\n");
+ }
+ if (alloc_meta->aux_stack[1]) {
+ pr_err("Second to last potentially related work creation:\n");
+ stack_depot_print(alloc_meta->aux_stack[1]);
+ pr_err("\n");
+ }
+}
+
+#ifdef CONFIG_KASAN_STACK
+static bool __must_check tokenize_frame_descr(const char **frame_descr,
+ char *token, size_t max_tok_len,
+ unsigned long *value)
+{
+ const char *sep = strchr(*frame_descr, ' ');
+
+ if (sep == NULL)
+ sep = *frame_descr + strlen(*frame_descr);
+
+ if (token != NULL) {
+ const size_t tok_len = sep - *frame_descr;
+
+ if (tok_len + 1 > max_tok_len) {
+ pr_err("KASAN internal error: frame description too long: %s\n",
+ *frame_descr);
+ return false;
+ }
+
+ /* Copy token (+ 1 byte for '\0'). */
+ strscpy(token, *frame_descr, tok_len + 1);
+ }
+
+ /* Advance frame_descr past separator. */
+ *frame_descr = sep + 1;
+
+ if (value != NULL && kstrtoul(token, 10, value)) {
+ pr_err("KASAN internal error: not a valid number: %s\n", token);
+ return false;
+ }
+
+ return true;
+}
+
+static void print_decoded_frame_descr(const char *frame_descr)
+{
+ /*
+ * We need to parse the following string:
+ * "n alloc_1 alloc_2 ... alloc_n"
+ * where alloc_i looks like
+ * "offset size len name"
+ * or "offset size len name:line".
+ */
+
+ char token[64];
+ unsigned long num_objects;
+
+ if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
+ &num_objects))
+ return;
+
+ pr_err("\n");
+ pr_err("This frame has %lu %s:\n", num_objects,
+ num_objects == 1 ? "object" : "objects");
+
+ while (num_objects--) {
+ unsigned long offset;
+ unsigned long size;
+
+ /* access offset */
+ if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
+ &offset))
+ return;
+ /* access size */
+ if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
+ &size))
+ return;
+ /* name length (unused) */
+ if (!tokenize_frame_descr(&frame_descr, NULL, 0, NULL))
+ return;
+ /* object name */
+ if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
+ NULL))
+ return;
+
+ /* Strip line number; without filename it's not very helpful. */
+ strreplace(token, ':', '\0');
+
+ /* Finally, print object information. */
+ pr_err(" [%lu, %lu) '%s'", offset, offset + size, token);
+ }
+}
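/*
 * Hypothetical example of the frame descriptor format parsed above:
 *
 *   "2 32 8 4 addr 48 16 5 array:215"
 *
 * decodes to two objects in the frame; the per-object fields are offset,
 * size, name length (unused) and name, and the ":215" line number is
 * stripped before printing:
 *
 *   This frame has 2 objects:
 *    [32, 40) 'addr'
 *    [48, 64) 'array'
 */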
+
+/* Returns true only if the address is on the current task's stack. */
+static bool __must_check get_address_stack_frame_info(const void *addr,
+ unsigned long *offset,
+ const char **frame_descr,
+ const void **frame_pc)
+{
+ unsigned long aligned_addr;
+ unsigned long mem_ptr;
+ const u8 *shadow_bottom;
+ const u8 *shadow_ptr;
+ const unsigned long *frame;
+
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_STACK_GROWSUP));
+
+ aligned_addr = round_down((unsigned long)addr, sizeof(long));
+ mem_ptr = round_down(aligned_addr, KASAN_GRANULE_SIZE);
+ shadow_ptr = kasan_mem_to_shadow((void *)aligned_addr);
+ shadow_bottom = kasan_mem_to_shadow(end_of_stack(current));
+
+ while (shadow_ptr >= shadow_bottom && *shadow_ptr != KASAN_STACK_LEFT) {
+ shadow_ptr--;
+ mem_ptr -= KASAN_GRANULE_SIZE;
+ }
+
+ while (shadow_ptr >= shadow_bottom && *shadow_ptr == KASAN_STACK_LEFT) {
+ shadow_ptr--;
+ mem_ptr -= KASAN_GRANULE_SIZE;
+ }
+
+ if (shadow_ptr < shadow_bottom)
+ return false;
+
+ frame = (const unsigned long *)(mem_ptr + KASAN_GRANULE_SIZE);
+ if (frame[0] != KASAN_CURRENT_STACK_FRAME_MAGIC) {
+ pr_err("KASAN internal error: frame info validation failed; invalid marker: %lu\n",
+ frame[0]);
+ return false;
+ }
+
+ *offset = (unsigned long)addr - (unsigned long)frame;
+ *frame_descr = (const char *)frame[1];
+ *frame_pc = (void *)frame[2];
+
+ return true;
+}
+
+void kasan_print_address_stack_frame(const void *addr)
+{
+ unsigned long offset;
+ const char *frame_descr;
+ const void *frame_pc;
+
+ if (WARN_ON(!object_is_on_stack(addr)))
+ return;
+
+ pr_err("The buggy address belongs to stack of task %s/%d\n",
+ current->comm, task_pid_nr(current));
+
+ if (!get_address_stack_frame_info(addr, &offset, &frame_descr,
+ &frame_pc))
+ return;
+
+ pr_err(" and is located at offset %lu in frame:\n", offset);
+ pr_err(" %pS\n", frame_pc);
+
+ if (!frame_descr)
+ return;
+
+ print_decoded_frame_descr(frame_descr);
+}
+#endif /* CONFIG_KASAN_STACK */
+
+#define DEFINE_ASAN_REPORT_LOAD(size) \
+void __asan_report_load##size##_noabort(void *addr) \
+{ \
+ kasan_report(addr, size, false, _RET_IP_); \
+} \
+EXPORT_SYMBOL(__asan_report_load##size##_noabort)
+
+#define DEFINE_ASAN_REPORT_STORE(size) \
+void __asan_report_store##size##_noabort(void *addr) \
+{ \
+ kasan_report(addr, size, true, _RET_IP_); \
+} \
+EXPORT_SYMBOL(__asan_report_store##size##_noabort)
+
+DEFINE_ASAN_REPORT_LOAD(1);
+DEFINE_ASAN_REPORT_LOAD(2);
+DEFINE_ASAN_REPORT_LOAD(4);
+DEFINE_ASAN_REPORT_LOAD(8);
+DEFINE_ASAN_REPORT_LOAD(16);
+DEFINE_ASAN_REPORT_STORE(1);
+DEFINE_ASAN_REPORT_STORE(2);
+DEFINE_ASAN_REPORT_STORE(4);
+DEFINE_ASAN_REPORT_STORE(8);
+DEFINE_ASAN_REPORT_STORE(16);
+
+void __asan_report_load_n_noabort(void *addr, ssize_t size)
+{
+ kasan_report(addr, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_load_n_noabort);
+
+void __asan_report_store_n_noabort(void *addr, ssize_t size)
+{
+ kasan_report(addr, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_store_n_noabort);
diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c
new file mode 100644
index 000000000000..065e1b2fc484
--- /dev/null
+++ b/mm/kasan/report_hw_tags.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains hardware tag-based KASAN specific error reporting code.
+ *
+ * Copyright (c) 2020 Google, Inc.
+ * Author: Andrey Konovalov <andreyknvl@google.com>
+ */
+
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "kasan.h"
+
+const void *kasan_find_first_bad_addr(const void *addr, size_t size)
+{
+ /*
+ * Hardware Tag-Based KASAN only calls this function for normal memory
+ * accesses, and thus addr points precisely to the first bad address
+ * with an invalid (and present) memory tag. Therefore:
+ * 1. Return the address as is without walking memory tags.
+ * 2. Skip the addr_has_metadata check.
+ */
+ return kasan_reset_tag(addr);
+}
+
+size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache)
+{
+ size_t size = 0;
+ int i = 0;
+ u8 memory_tag;
+
+ /*
+ * Skip the addr_has_metadata check, as this function only operates on
+ * slab memory, which must have metadata.
+ */
+
+ /*
+ * The loop below returns 0 for freed objects, for which KASAN cannot
+ * calculate the allocation size based on the metadata.
+ */
+ while (size < cache->object_size) {
+ memory_tag = hw_get_mem_tag(object + i * KASAN_GRANULE_SIZE);
+ if (memory_tag != KASAN_TAG_INVALID)
+ size += KASAN_GRANULE_SIZE;
+ else
+ return size;
+ i++;
+ }
+
+ return cache->object_size;
+}
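+
+/*
+ * For illustration (assuming 16-byte MTE granules): if the granules at
+ * offsets 0, 16 and 32 of a 128-byte object carry valid tags and the granule
+ * at offset 48 carries KASAN_TAG_INVALID, the loop above reports an
+ * allocation size of 48; a fully valid object reports cache->object_size.
+ */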
+
+void kasan_metadata_fetch_row(char *buffer, void *row)
+{
+ int i;
+
+ for (i = 0; i < META_BYTES_PER_ROW; i++)
+ buffer[i] = hw_get_mem_tag(row + i * KASAN_GRANULE_SIZE);
+}
+
+void kasan_print_tags(u8 addr_tag, const void *addr)
+{
+ u8 memory_tag = hw_get_mem_tag((void *)addr);
+
+ pr_err("Pointer tag: [%02x], memory tag: [%02x]\n",
+ addr_tag, memory_tag);
+}
diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c
new file mode 100644
index 000000000000..689e94f9fe3c
--- /dev/null
+++ b/mm/kasan/report_sw_tags.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains software tag-based KASAN specific error reporting code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
+ * Andrey Konovalov <andreyknvl@gmail.com>
+ */
+
+#include <linux/bitops.h>
+#include <linux/ftrace.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/slab.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/kasan.h>
+#include <linux/module.h>
+
+#include <asm/sections.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+const void *kasan_find_first_bad_addr(const void *addr, size_t size)
+{
+ u8 tag = get_tag(addr);
+ void *p = kasan_reset_tag(addr);
+ void *end = p + size;
+
+ if (!addr_has_metadata(p))
+ return p;
+
+ while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p))
+ p += KASAN_GRANULE_SIZE;
+
+ return p;
+}
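+
+/*
+ * Illustrative example: for an access through a pointer carrying tag 0xab,
+ * the loop above steps KASAN_GRANULE_SIZE bytes at a time while the shadow
+ * byte for the current position still equals 0xab, and returns the first
+ * position whose shadow byte differs; that position is reported as the
+ * first bad address.
+ */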
+
+size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache)
+{
+ size_t size = 0;
+ u8 *shadow;
+
+ /*
+ * Skip the addr_has_metadata check, as this function only operates on
+ * slab memory, which must have metadata.
+ */
+
+ /*
+ * The loop below returns 0 for freed objects, for which KASAN cannot
+ * calculate the allocation size based on the metadata.
+ */
+ shadow = (u8 *)kasan_mem_to_shadow(object);
+ while (size < cache->object_size) {
+ if (*shadow != KASAN_TAG_INVALID)
+ size += KASAN_GRANULE_SIZE;
+ else
+ return size;
+ shadow++;
+ }
+
+ return cache->object_size;
+}
+
+void kasan_metadata_fetch_row(char *buffer, void *row)
+{
+ memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW);
+}
+
+void kasan_print_tags(u8 addr_tag, const void *addr)
+{
+ u8 *shadow = (u8 *)kasan_mem_to_shadow(addr);
+
+ pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", addr_tag, *shadow);
+}
+
+#ifdef CONFIG_KASAN_STACK
+void kasan_print_address_stack_frame(const void *addr)
+{
+ if (WARN_ON(!object_is_on_stack(addr)))
+ return;
+
+ pr_err("The buggy address belongs to stack of task %s/%d\n",
+ current->comm, task_pid_nr(current));
+}
+#endif
diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c
new file mode 100644
index 000000000000..8b8bfdb3cfdb
--- /dev/null
+++ b/mm/kasan/report_tags.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Copyright (c) 2020 Google, Inc.
+ */
+
+#include <linux/atomic.h>
+
+#include "kasan.h"
+
+extern struct kasan_stack_ring stack_ring;
+
+static const char *get_common_bug_type(struct kasan_report_info *info)
+{
+ /*
+	 * If access_size is a negative number, it makes sense to classify
+	 * this as an out-of-bounds bug type.
+	 *
+	 * Casting a negative number to size_t indeed turns it into a large
+	 * size_t value, larger than ULONG_MAX/2, so the access qualifies
+	 * as out-of-bounds.
+ */
+ if (info->access_addr + info->access_size < info->access_addr)
+ return "out-of-bounds";
+
+ return "invalid-access";
+}
+
+void kasan_complete_mode_report_info(struct kasan_report_info *info)
+{
+ unsigned long flags;
+ u64 pos;
+ struct kasan_stack_ring_entry *entry;
+ void *ptr;
+ u32 pid;
+ depot_stack_handle_t stack;
+ bool is_free;
+ bool alloc_found = false, free_found = false;
+
+ if ((!info->cache || !info->object) && !info->bug_type) {
+ info->bug_type = get_common_bug_type(info);
+ return;
+ }
+
+ write_lock_irqsave(&stack_ring.lock, flags);
+
+ pos = atomic64_read(&stack_ring.pos);
+
+ /*
+ * The loop below tries to find stack ring entries relevant to the
+ * buggy object. This is a best-effort process.
+ *
+ * First, another object with the same tag can be allocated in place of
+ * the buggy object. Also, since the number of entries is limited, the
+ * entries relevant to the buggy object can be overwritten.
+ */
+
+ for (u64 i = pos - 1; i != pos - 1 - stack_ring.size; i--) {
+ if (alloc_found && free_found)
+ break;
+
+ entry = &stack_ring.entries[i % stack_ring.size];
+
+ /* Paired with smp_store_release() in save_stack_info(). */
+ ptr = (void *)smp_load_acquire(&entry->ptr);
+
+ if (kasan_reset_tag(ptr) != info->object ||
+ get_tag(ptr) != get_tag(info->access_addr))
+ continue;
+
+ pid = READ_ONCE(entry->pid);
+ stack = READ_ONCE(entry->stack);
+ is_free = READ_ONCE(entry->is_free);
+
+ if (is_free) {
+ /*
+ * Second free of the same object.
+ * Give up on trying to find the alloc entry.
+ */
+ if (free_found)
+ break;
+
+ info->free_track.pid = pid;
+ info->free_track.stack = stack;
+ free_found = true;
+
+ /*
+ * If a free entry is found first, the bug is likely
+ * a use-after-free.
+ */
+ if (!info->bug_type)
+ info->bug_type = "slab-use-after-free";
+ } else {
+ /* Second alloc of the same object. Give up. */
+ if (alloc_found)
+ break;
+
+ info->alloc_track.pid = pid;
+ info->alloc_track.stack = stack;
+ alloc_found = true;
+
+ /*
+ * If an alloc entry is found first, the bug is likely
+ * an out-of-bounds.
+ */
+ if (!info->bug_type)
+ info->bug_type = "slab-out-of-bounds";
+ }
+ }
+
+ write_unlock_irqrestore(&stack_ring.lock, flags);
+
+ /* Assign the common bug type if no entries were found. */
+ if (!info->bug_type)
+ info->bug_type = get_common_bug_type(info);
+}
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
new file mode 100644
index 000000000000..dd772f9d0f08
--- /dev/null
+++ b/mm/kasan/shadow.c
@@ -0,0 +1,650 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains KASAN runtime code that manages shadow memory for
+ * generic and software tag-based KASAN modes.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
+ * Andrey Konovalov <andreyknvl@gmail.com>
+ */
+
+#include <linux/init.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/kfence.h>
+#include <linux/kmemleak.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+#include "kasan.h"
+
+bool __kasan_check_read(const volatile void *p, unsigned int size)
+{
+ return kasan_check_range((void *)p, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__kasan_check_read);
+
+bool __kasan_check_write(const volatile void *p, unsigned int size)
+{
+ return kasan_check_range((void *)p, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__kasan_check_write);
+
+#if !defined(CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX) && !defined(CONFIG_GENERIC_ENTRY)
+/*
+ * CONFIG_GENERIC_ENTRY relies on compiler-emitted mem*() calls to not be
+ * instrumented. KASAN-enabled toolchains should emit __asan_mem*() functions
+ * for the sites they want to instrument.
+ *
+ * If we have a compiler that can instrument memintrinsics, never override
+ * these, so that non-instrumented files can safely consider them as builtins.
+ */
+#undef memset
+void *memset(void *addr, int c, size_t len)
+{
+ if (!kasan_check_range(addr, len, true, _RET_IP_))
+ return NULL;
+
+ return __memset(addr, c, len);
+}
+
+#ifdef __HAVE_ARCH_MEMMOVE
+#undef memmove
+void *memmove(void *dest, const void *src, size_t len)
+{
+ if (!kasan_check_range(src, len, false, _RET_IP_) ||
+ !kasan_check_range(dest, len, true, _RET_IP_))
+ return NULL;
+
+ return __memmove(dest, src, len);
+}
+#endif
+
+#undef memcpy
+void *memcpy(void *dest, const void *src, size_t len)
+{
+ if (!kasan_check_range(src, len, false, _RET_IP_) ||
+ !kasan_check_range(dest, len, true, _RET_IP_))
+ return NULL;
+
+ return __memcpy(dest, src, len);
+}
+#endif
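+
+/*
+ * Example of the effect (illustrative): a mem*() call that touches even one
+ * byte outside a valid range fails the kasan_check_range() checks in the
+ * interceptors above and below, gets reported, and the interceptor returns
+ * NULL instead of performing the operation.
+ */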
+
+void *__asan_memset(void *addr, int c, ssize_t len)
+{
+ if (!kasan_check_range(addr, len, true, _RET_IP_))
+ return NULL;
+
+ return __memset(addr, c, len);
+}
+EXPORT_SYMBOL(__asan_memset);
+
+#ifdef __HAVE_ARCH_MEMMOVE
+void *__asan_memmove(void *dest, const void *src, ssize_t len)
+{
+ if (!kasan_check_range(src, len, false, _RET_IP_) ||
+ !kasan_check_range(dest, len, true, _RET_IP_))
+ return NULL;
+
+ return __memmove(dest, src, len);
+}
+EXPORT_SYMBOL(__asan_memmove);
+#endif
+
+void *__asan_memcpy(void *dest, const void *src, ssize_t len)
+{
+ if (!kasan_check_range(src, len, false, _RET_IP_) ||
+ !kasan_check_range(dest, len, true, _RET_IP_))
+ return NULL;
+
+ return __memcpy(dest, src, len);
+}
+EXPORT_SYMBOL(__asan_memcpy);
+
+#ifdef CONFIG_KASAN_SW_TAGS
+void *__hwasan_memset(void *addr, int c, ssize_t len) __alias(__asan_memset);
+EXPORT_SYMBOL(__hwasan_memset);
+#ifdef __HAVE_ARCH_MEMMOVE
+void *__hwasan_memmove(void *dest, const void *src, ssize_t len) __alias(__asan_memmove);
+EXPORT_SYMBOL(__hwasan_memmove);
+#endif
+void *__hwasan_memcpy(void *dest, const void *src, ssize_t len) __alias(__asan_memcpy);
+EXPORT_SYMBOL(__hwasan_memcpy);
+#endif
+
+void kasan_poison(const void *addr, size_t size, u8 value, bool init)
+{
+ void *shadow_start, *shadow_end;
+
+ if (!kasan_arch_is_ready())
+ return;
+
+ /*
+ * Perform shadow offset calculation based on untagged address, as
+ * some of the callers (e.g. kasan_poison_object_data) pass tagged
+ * addresses to this function.
+ */
+ addr = kasan_reset_tag(addr);
+
+ /* Skip KFENCE memory if called explicitly outside of sl*b. */
+ if (is_kfence_address(addr))
+ return;
+
+ if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK))
+ return;
+ if (WARN_ON(size & KASAN_GRANULE_MASK))
+ return;
+
+ shadow_start = kasan_mem_to_shadow(addr);
+ shadow_end = kasan_mem_to_shadow(addr + size);
+
+ __memset(shadow_start, value, shadow_end - shadow_start);
+}
+EXPORT_SYMBOL(kasan_poison);
+
+#ifdef CONFIG_KASAN_GENERIC
+void kasan_poison_last_granule(const void *addr, size_t size)
+{
+ if (!kasan_arch_is_ready())
+ return;
+
+ if (size & KASAN_GRANULE_MASK) {
+ u8 *shadow = (u8 *)kasan_mem_to_shadow(addr + size);
+ *shadow = size & KASAN_GRANULE_MASK;
+ }
+}
+#endif
+
+void kasan_unpoison(const void *addr, size_t size, bool init)
+{
+ u8 tag = get_tag(addr);
+
+ /*
+ * Perform shadow offset calculation based on untagged address, as
+ * some of the callers (e.g. kasan_unpoison_object_data) pass tagged
+ * addresses to this function.
+ */
+ addr = kasan_reset_tag(addr);
+
+ /*
+ * Skip KFENCE memory if called explicitly outside of sl*b. Also note
+ * that calls to ksize(), where size is not a multiple of machine-word
+ * size, would otherwise poison the invalid portion of the word.
+ */
+ if (is_kfence_address(addr))
+ return;
+
+ if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK))
+ return;
+
+ /* Unpoison all granules that cover the object. */
+ kasan_poison(addr, round_up(size, KASAN_GRANULE_SIZE), tag, false);
+
+ /* Partially poison the last granule for the generic mode. */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ kasan_poison_last_granule(addr, size);
+}
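+
+/*
+ * Worked example (illustrative, assuming KASAN_GRANULE_SIZE == 8, the generic
+ * mode granule): unpoisoning a 41-byte object marks five shadow bytes as
+ * fully accessible, and kasan_poison_last_granule() then writes
+ * 41 & KASAN_GRANULE_MASK == 1 into the sixth shadow byte, so only the first
+ * byte of the last granule is treated as accessible.
+ */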
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static bool shadow_mapped(unsigned long addr)
+{
+ pgd_t *pgd = pgd_offset_k(addr);
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ if (pgd_none(*pgd))
+ return false;
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
+ return false;
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
+ return false;
+
+ /*
+	 * We can't use pud_large() or pud_huge(): the former is arch-specific
+	 * and the latter depends on HUGETLB_PAGE. So let's abuse pud_bad():
+	 * if the pud is bad, it's bad because it's huge.
+ */
+ if (pud_bad(*pud))
+ return true;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ return false;
+
+ if (pmd_bad(*pmd))
+ return true;
+ pte = pte_offset_kernel(pmd, addr);
+ return !pte_none(ptep_get(pte));
+}
+
+static int __meminit kasan_mem_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct memory_notify *mem_data = data;
+ unsigned long nr_shadow_pages, start_kaddr, shadow_start;
+ unsigned long shadow_end, shadow_size;
+
+ nr_shadow_pages = mem_data->nr_pages >> KASAN_SHADOW_SCALE_SHIFT;
+ start_kaddr = (unsigned long)pfn_to_kaddr(mem_data->start_pfn);
+ shadow_start = (unsigned long)kasan_mem_to_shadow((void *)start_kaddr);
+ shadow_size = nr_shadow_pages << PAGE_SHIFT;
+ shadow_end = shadow_start + shadow_size;
+
+ if (WARN_ON(mem_data->nr_pages % KASAN_GRANULE_SIZE) ||
+ WARN_ON(start_kaddr % KASAN_MEMORY_PER_SHADOW_PAGE))
+ return NOTIFY_BAD;
+
+ switch (action) {
+ case MEM_GOING_ONLINE: {
+ void *ret;
+
+ /*
+		 * If the shadow is already mapped, it must have been mapped
+		 * during boot. This can happen when onlining previously
+		 * offlined memory.
+ */
+ if (shadow_mapped(shadow_start))
+ return NOTIFY_OK;
+
+ ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start,
+ shadow_end, GFP_KERNEL,
+ PAGE_KERNEL, VM_NO_GUARD,
+ pfn_to_nid(mem_data->start_pfn),
+ __builtin_return_address(0));
+ if (!ret)
+ return NOTIFY_BAD;
+
+ kmemleak_ignore(ret);
+ return NOTIFY_OK;
+ }
+ case MEM_CANCEL_ONLINE:
+ case MEM_OFFLINE: {
+ struct vm_struct *vm;
+
+ /*
+		 * shadow_start was either mapped during boot by kasan_init()
+		 * or during memory online by __vmalloc_node_range().
+		 * In the latter case we can use vfree() to free the shadow.
+		 * A non-NULL result from find_vm_area() tells us that it was
+		 * indeed the latter case.
+		 *
+		 * It's currently not possible to free shadow mapped during
+		 * boot by kasan_init(), because the code to do that hasn't
+		 * been written yet, so that memory is simply leaked.
+ */
+ vm = find_vm_area((void *)shadow_start);
+ if (vm)
+ vfree((void *)shadow_start);
+ }
+ }
+
+ return NOTIFY_OK;
+}
+
+static int __init kasan_memhotplug_init(void)
+{
+ hotplug_memory_notifier(kasan_mem_notifier, DEFAULT_CALLBACK_PRI);
+
+ return 0;
+}
+
+core_initcall(kasan_memhotplug_init);
+#endif
+
+#ifdef CONFIG_KASAN_VMALLOC
+
+void __init __weak kasan_populate_early_vm_area_shadow(void *start,
+ unsigned long size)
+{
+}
+
+static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
+ void *unused)
+{
+ unsigned long page;
+ pte_t pte;
+
+ if (likely(!pte_none(ptep_get(ptep))))
+ return 0;
+
+ page = __get_free_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
+ pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
+
+ spin_lock(&init_mm.page_table_lock);
+ if (likely(pte_none(ptep_get(ptep)))) {
+ set_pte_at(&init_mm, addr, ptep, pte);
+ page = 0;
+ }
+ spin_unlock(&init_mm.page_table_lock);
+ if (page)
+ free_page(page);
+ return 0;
+}
+
+int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
+{
+ unsigned long shadow_start, shadow_end;
+ int ret;
+
+ if (!kasan_arch_is_ready())
+ return 0;
+
+ if (!is_vmalloc_or_module_addr((void *)addr))
+ return 0;
+
+ shadow_start = (unsigned long)kasan_mem_to_shadow((void *)addr);
+ shadow_end = (unsigned long)kasan_mem_to_shadow((void *)addr + size);
+
+ /*
+	 * User Mode Linux maps enough shadow memory for all of virtual memory
+	 * at boot, so it doesn't need to allocate more for vmalloc; the shadow
+	 * only needs to be cleared here.
+ *
+ * The remaining CONFIG_UML checks in this file exist for the same
+ * reason.
+ */
+ if (IS_ENABLED(CONFIG_UML)) {
+ __memset((void *)shadow_start, KASAN_VMALLOC_INVALID, shadow_end - shadow_start);
+ return 0;
+ }
+
+ shadow_start = PAGE_ALIGN_DOWN(shadow_start);
+ shadow_end = PAGE_ALIGN(shadow_end);
+
+ ret = apply_to_page_range(&init_mm, shadow_start,
+ shadow_end - shadow_start,
+ kasan_populate_vmalloc_pte, NULL);
+ if (ret)
+ return ret;
+
+ flush_cache_vmap(shadow_start, shadow_end);
+
+ /*
+ * We need to be careful about inter-cpu effects here. Consider:
+ *
+ * CPU#0 CPU#1
+ * WRITE_ONCE(p, vmalloc(100)); while (x = READ_ONCE(p)) ;
+ * p[99] = 1;
+ *
+ * With compiler instrumentation, that ends up looking like this:
+ *
+ * CPU#0 CPU#1
+ * // vmalloc() allocates memory
+ * // let a = area->addr
+ * // we reach kasan_populate_vmalloc
+ * // and call kasan_unpoison:
+ * STORE shadow(a), unpoison_val
+ * ...
+ * STORE shadow(a+99), unpoison_val x = LOAD p
+ * // rest of vmalloc process <data dependency>
+ * STORE p, a LOAD shadow(x+99)
+ *
+ * If there is no barrier between the end of unpoisoning the shadow
+ * and the store of the result to p, the stores could be committed
+ * in a different order by CPU#0, and CPU#1 could erroneously observe
+ * poison in the shadow.
+ *
+ * We need some sort of barrier between the stores.
+ *
+ * In the vmalloc() case, this is provided by a smp_wmb() in
+ * clear_vm_uninitialized_flag(). In the per-cpu allocator and in
+ * get_vm_area() and friends, the caller gets shadow allocated but
+ * doesn't have any pages mapped into the virtual address space that
+ * has been reserved. Mapping those pages in will involve taking and
+ * releasing a page-table lock, which will provide the barrier.
+ */
+
+ return 0;
+}
+
+static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
+ void *unused)
+{
+ unsigned long page;
+
+ page = (unsigned long)__va(pte_pfn(ptep_get(ptep)) << PAGE_SHIFT);
+
+ spin_lock(&init_mm.page_table_lock);
+
+ if (likely(!pte_none(ptep_get(ptep)))) {
+ pte_clear(&init_mm, addr, ptep);
+ free_page(page);
+ }
+ spin_unlock(&init_mm.page_table_lock);
+
+ return 0;
+}
+
+/*
+ * Release the backing for the vmalloc region [start, end), which
+ * lies within the free region [free_region_start, free_region_end).
+ *
+ * This can be run lazily, long after the region was freed. It runs
+ * under vmap_area_lock, so it's not safe to interact with the vmalloc/vmap
+ * infrastructure.
+ *
+ * How does this work?
+ * -------------------
+ *
+ * We have a region that is page aligned, labeled as A.
+ * That might not map onto the shadow in a way that is page-aligned:
+ *
+ * start end
+ * v v
+ * |????????|????????|AAAAAAAA|AA....AA|AAAAAAAA|????????| < vmalloc
+ * -------- -------- -------- -------- --------
+ * | | | | |
+ * | | | /-------/ |
+ * \-------\|/------/ |/---------------/
+ * ||| ||
+ * |??AAAAAA|AAAAAAAA|AA??????| < shadow
+ * (1) (2) (3)
+ *
+ * First we align the start upwards and the end downwards, so that the
+ * shadow of the region aligns with shadow page boundaries. In the
+ * example, this gives us the shadow page (2). This is the shadow entirely
+ * covered by this allocation.
+ *
+ * Then we have the tricky bits. We want to know if we can free the
+ * partially covered shadow pages - (1) and (3) in the example. For this,
+ * we are given the start and end of the free region that contains this
+ * allocation. Extending our previous example, we could have:
+ *
+ * free_region_start free_region_end
+ * | start end |
+ * v v v v
+ * |FFFFFFFF|FFFFFFFF|AAAAAAAA|AA....AA|AAAAAAAA|FFFFFFFF| < vmalloc
+ * -------- -------- -------- -------- --------
+ * | | | | |
+ * | | | /-------/ |
+ * \-------\|/------/ |/---------------/
+ * ||| ||
+ * |FFAAAAAA|AAAAAAAA|AAF?????| < shadow
+ * (1) (2) (3)
+ *
+ * Once again, we align the start of the free region up, and the end of
+ * the free region down so that the shadow is page aligned. So we can free
+ * page (1) - we know no allocation currently uses anything in that page,
+ * because all of it is in the vmalloc free region. But we cannot free
+ * page (3), because we can't be sure that the rest of it is unused.
+ *
+ * We only consider pages that contain part of the original region for
+ * freeing: we don't try to free other pages from the free region or we'd
+ * end up trying to free huge chunks of virtual address space.
+ *
+ * Concurrency
+ * -----------
+ *
+ * How do we know that we're not freeing a page that is simultaneously
+ * being used for a fresh allocation in kasan_populate_vmalloc(_pte)?
+ *
+ * We _can_ have kasan_release_vmalloc and kasan_populate_vmalloc running
+ * at the same time. While we run under free_vmap_area_lock, the population
+ * code does not.
+ *
+ * free_vmap_area_lock instead operates to ensure that the larger range
+ * [free_region_start, free_region_end) is safe: because __alloc_vmap_area and
+ * the per-cpu region-finding algorithm both run under free_vmap_area_lock,
+ * no space identified as free will become used while we are running. This
+ * means that so long as we are careful with alignment and only free shadow
+ * pages entirely covered by the free region, we will not run into any
+ * trouble - any simultaneous allocations will be for disjoint regions.
+ */
+void kasan_release_vmalloc(unsigned long start, unsigned long end,
+ unsigned long free_region_start,
+ unsigned long free_region_end)
+{
+ void *shadow_start, *shadow_end;
+ unsigned long region_start, region_end;
+ unsigned long size;
+
+ if (!kasan_arch_is_ready())
+ return;
+
+ region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE);
+ region_end = ALIGN_DOWN(end, KASAN_MEMORY_PER_SHADOW_PAGE);
+
+ free_region_start = ALIGN(free_region_start, KASAN_MEMORY_PER_SHADOW_PAGE);
+
+ if (start != region_start &&
+ free_region_start < region_start)
+ region_start -= KASAN_MEMORY_PER_SHADOW_PAGE;
+
+ free_region_end = ALIGN_DOWN(free_region_end, KASAN_MEMORY_PER_SHADOW_PAGE);
+
+ if (end != region_end &&
+ free_region_end > region_end)
+ region_end += KASAN_MEMORY_PER_SHADOW_PAGE;
+
+ shadow_start = kasan_mem_to_shadow((void *)region_start);
+ shadow_end = kasan_mem_to_shadow((void *)region_end);
+
+ if (shadow_end > shadow_start) {
+ size = shadow_end - shadow_start;
+ if (IS_ENABLED(CONFIG_UML)) {
+ __memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start);
+ return;
+ }
+ apply_to_existing_page_range(&init_mm,
+ (unsigned long)shadow_start,
+ size, kasan_depopulate_vmalloc_pte,
+ NULL);
+ flush_tlb_kernel_range((unsigned long)shadow_start,
+ (unsigned long)shadow_end);
+ }
+}
+
+void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
+ kasan_vmalloc_flags_t flags)
+{
+ /*
+ * Software KASAN modes unpoison both VM_ALLOC and non-VM_ALLOC
+ * mappings, so the KASAN_VMALLOC_VM_ALLOC flag is ignored.
+ * Software KASAN modes can't optimize zeroing memory by combining it
+ * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored.
+ */
+
+ if (!kasan_arch_is_ready())
+ return (void *)start;
+
+ if (!is_vmalloc_or_module_addr(start))
+ return (void *)start;
+
+ /*
+ * Don't tag executable memory with the tag-based mode.
+ * The kernel doesn't tolerate having the PC register tagged.
+ */
+ if (IS_ENABLED(CONFIG_KASAN_SW_TAGS) &&
+ !(flags & KASAN_VMALLOC_PROT_NORMAL))
+ return (void *)start;
+
+ start = set_tag(start, kasan_random_tag());
+ kasan_unpoison(start, size, false);
+ return (void *)start;
+}
+
+/*
+ * Poison the shadow for a vmalloc region. Called as part of the
+ * freeing process at the time the region is freed.
+ */
+void __kasan_poison_vmalloc(const void *start, unsigned long size)
+{
+ if (!kasan_arch_is_ready())
+ return;
+
+ if (!is_vmalloc_or_module_addr(start))
+ return;
+
+ size = round_up(size, KASAN_GRANULE_SIZE);
+ kasan_poison(start, size, KASAN_VMALLOC_INVALID, false);
+}
+
+#else /* CONFIG_KASAN_VMALLOC */
+
+int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask)
+{
+ void *ret;
+ size_t scaled_size;
+ size_t shadow_size;
+ unsigned long shadow_start;
+
+ shadow_start = (unsigned long)kasan_mem_to_shadow(addr);
+ scaled_size = (size + KASAN_GRANULE_SIZE - 1) >>
+ KASAN_SHADOW_SCALE_SHIFT;
+ shadow_size = round_up(scaled_size, PAGE_SIZE);
+
+ if (WARN_ON(!PAGE_ALIGNED(shadow_start)))
+ return -EINVAL;
+
+ if (IS_ENABLED(CONFIG_UML)) {
+ __memset((void *)shadow_start, KASAN_SHADOW_INIT, shadow_size);
+ return 0;
+ }
+
+ ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
+ shadow_start + shadow_size,
+ GFP_KERNEL,
+ PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
+ __builtin_return_address(0));
+
+ if (ret) {
+ struct vm_struct *vm = find_vm_area(addr);
+ __memset(ret, KASAN_SHADOW_INIT, shadow_size);
+ vm->flags |= VM_KASAN;
+ kmemleak_ignore(ret);
+
+ if (vm->flags & VM_DEFER_KMEMLEAK)
+ kmemleak_vmalloc(vm, size, gfp_mask);
+
+ return 0;
+ }
+
+ return -ENOMEM;
+}
+
+void kasan_free_module_shadow(const struct vm_struct *vm)
+{
+ if (IS_ENABLED(CONFIG_UML))
+ return;
+
+ if (vm->flags & VM_KASAN)
+ vfree(kasan_mem_to_shadow(vm->addr));
+}
+
+#endif
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
new file mode 100644
index 000000000000..220b5d4c6876
--- /dev/null
+++ b/mm/kasan/sw_tags.c
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains core software tag-based KASAN code.
+ *
+ * Copyright (c) 2018 Google, Inc.
+ * Author: Andrey Konovalov <andreyknvl@google.com>
+ */
+
+#define pr_fmt(fmt) "kasan: " fmt
+
+#include <linux/export.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/kmemleak.h>
+#include <linux/linkage.h>
+#include <linux/memblock.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+#include <linux/bug.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+static DEFINE_PER_CPU(u32, prng_state);
+
+void __init kasan_init_sw_tags(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(prng_state, cpu) = (u32)get_cycles();
+
+ kasan_init_tags();
+
+ pr_info("KernelAddressSanitizer initialized (sw-tags, stacktrace=%s)\n",
+ kasan_stack_collection_enabled() ? "on" : "off");
+}
+
+/*
+ * If a preemption happens between this_cpu_read and this_cpu_write, the only
+ * side effect is that a few objects allocated in different contexts get the
+ * same tag. Since tag-based KASAN is meant to be used as a probabilistic
+ * bug-detection debug feature, this doesn't have a significant negative impact.
+ *
+ * Ideally, the tags would use strong randomness to prevent any attempts to
+ * predict them during explicit exploit attempts. But strong randomness is
+ * expensive, so a PRNG is used as an intentional trade-off. This non-atomic
+ * RMW sequence in fact has a positive effect, since interrupts that randomly
+ * skew the PRNG at unpredictable points only improve its output.
+ */
+u8 kasan_random_tag(void)
+{
+ u32 state = this_cpu_read(prng_state);
+
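+	/* LCG step with the classic Numerical Recipes constants; fast, but not cryptographically strong. */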
+ state = 1664525 * state + 1013904223;
+ this_cpu_write(prng_state, state);
+
+ return (u8)(state % (KASAN_TAG_MAX + 1));
+}
+
+bool kasan_check_range(const void *addr, size_t size, bool write,
+ unsigned long ret_ip)
+{
+ u8 tag;
+ u8 *shadow_first, *shadow_last, *shadow;
+ void *untagged_addr;
+
+ if (unlikely(size == 0))
+ return true;
+
+ if (unlikely(addr + size < addr))
+ return !kasan_report(addr, size, write, ret_ip);
+
+ tag = get_tag((const void *)addr);
+
+ /*
+ * Ignore accesses for pointers tagged with 0xff (native kernel
+ * pointer tag) to suppress false positives caused by kmap.
+ *
+ * Some kernel code was written to account for archs that don't keep
+ * high memory mapped all the time, but rather map and unmap particular
+ * pages when needed. Instead of storing a pointer to the kernel memory,
+ * this code saves the address of the page structure and offset within
+ * that page for later use. Those pages are then mapped and unmapped
+ * with kmap/kunmap when necessary and virt_to_page is used to get the
+ * virtual address of the page. For arm64 (that keeps the high memory
+ * mapped all the time), kmap is turned into a page_address call.
+	 *
+ * The issue is that with use of the page_address + virt_to_page
+ * sequence the top byte value of the original pointer gets lost (gets
+ * set to KASAN_TAG_KERNEL (0xFF)).
+ */
+ if (tag == KASAN_TAG_KERNEL)
+ return true;
+
+ untagged_addr = kasan_reset_tag((const void *)addr);
+ if (unlikely(!addr_has_metadata(untagged_addr)))
+ return !kasan_report(addr, size, write, ret_ip);
+ shadow_first = kasan_mem_to_shadow(untagged_addr);
+ shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1);
+ for (shadow = shadow_first; shadow <= shadow_last; shadow++) {
+ if (*shadow != tag) {
+ return !kasan_report(addr, size, write, ret_ip);
+ }
+ }
+
+ return true;
+}
+
+bool kasan_byte_accessible(const void *addr)
+{
+ u8 tag = get_tag(addr);
+ void *untagged_addr = kasan_reset_tag(addr);
+ u8 shadow_byte;
+
+ if (!addr_has_metadata(untagged_addr))
+ return false;
+
+ shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(untagged_addr));
+ return tag == KASAN_TAG_KERNEL || tag == shadow_byte;
+}
+
+#define DEFINE_HWASAN_LOAD_STORE(size) \
+ void __hwasan_load##size##_noabort(void *addr) \
+ { \
+ kasan_check_range(addr, size, false, _RET_IP_); \
+ } \
+ EXPORT_SYMBOL(__hwasan_load##size##_noabort); \
+ void __hwasan_store##size##_noabort(void *addr) \
+ { \
+ kasan_check_range(addr, size, true, _RET_IP_); \
+ } \
+ EXPORT_SYMBOL(__hwasan_store##size##_noabort)
+
+DEFINE_HWASAN_LOAD_STORE(1);
+DEFINE_HWASAN_LOAD_STORE(2);
+DEFINE_HWASAN_LOAD_STORE(4);
+DEFINE_HWASAN_LOAD_STORE(8);
+DEFINE_HWASAN_LOAD_STORE(16);
+
+void __hwasan_loadN_noabort(void *addr, ssize_t size)
+{
+ kasan_check_range(addr, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__hwasan_loadN_noabort);
+
+void __hwasan_storeN_noabort(void *addr, ssize_t size)
+{
+ kasan_check_range(addr, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__hwasan_storeN_noabort);
+
+void __hwasan_tag_memory(void *addr, u8 tag, ssize_t size)
+{
+ kasan_poison(addr, size, tag, false);
+}
+EXPORT_SYMBOL(__hwasan_tag_memory);
+
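+/*
+ * Handle a tag mismatch reported via a packed access_info word: judging by
+ * the decoding below, the low four bits encode log2 of the access size and
+ * bit 4 indicates whether the access was a write.
+ */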
+void kasan_tag_mismatch(void *addr, unsigned long access_info,
+ unsigned long ret_ip)
+{
+ kasan_report(addr, 1 << (access_info & 0xf), access_info & 0x10,
+ ret_ip);
+}
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index e02a36a51f42..7dcfe341d48e 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -1,200 +1,144 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * This file contains core tag-based KASAN code.
+ * This file contains common tag-based KASAN code.
*
* Copyright (c) 2018 Google, Inc.
- * Author: Andrey Konovalov <andreyknvl@google.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
+ * Copyright (c) 2020 Google, Inc.
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/export.h>
-#include <linux/interrupt.h>
+#include <linux/atomic.h>
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kernel.h>
-#include <linux/kmemleak.h>
-#include <linux/linkage.h>
#include <linux/memblock.h>
#include <linux/memory.h>
#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/printk.h>
-#include <linux/random.h>
-#include <linux/sched.h>
-#include <linux/sched/task_stack.h>
-#include <linux/slab.h>
-#include <linux/stacktrace.h>
+#include <linux/static_key.h>
#include <linux/string.h>
#include <linux/types.h>
-#include <linux/vmalloc.h>
-#include <linux/bug.h>
#include "kasan.h"
#include "../slab.h"
-static DEFINE_PER_CPU(u32, prng_state);
+#define KASAN_STACK_RING_SIZE_DEFAULT (32 << 10)
-void kasan_init_tags(void)
-{
- int cpu;
+enum kasan_arg_stacktrace {
+ KASAN_ARG_STACKTRACE_DEFAULT,
+ KASAN_ARG_STACKTRACE_OFF,
+ KASAN_ARG_STACKTRACE_ON,
+};
- for_each_possible_cpu(cpu)
- per_cpu(prng_state, cpu) = (u32)get_cycles();
-}
+static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata;
-/*
- * If a preemption happens between this_cpu_read and this_cpu_write, the only
- * side effect is that we'll give a few allocated in different contexts objects
- * the same tag. Since tag-based KASAN is meant to be used a probabilistic
- * bug-detection debug feature, this doesn't have significant negative impact.
- *
- * Ideally the tags use strong randomness to prevent any attempts to predict
- * them during explicit exploit attempts. But strong randomness is expensive,
- * and we did an intentional trade-off to use a PRNG. This non-atomic RMW
- * sequence has in fact positive effect, since interrupts that randomly skew
- * PRNG at unpredictable points do only good.
- */
-u8 random_tag(void)
+/* Whether to collect alloc/free stack traces. */
+DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace);
+
+/* Non-zero, as initial pointer values are 0. */
+#define STACK_RING_BUSY_PTR ((void *)1)
+
+struct kasan_stack_ring stack_ring = {
+ .lock = __RW_LOCK_UNLOCKED(stack_ring.lock)
+};
+
+/* kasan.stacktrace=off/on */
+static int __init early_kasan_flag_stacktrace(char *arg)
{
- u32 state = this_cpu_read(prng_state);
+ if (!arg)
+ return -EINVAL;
- state = 1664525 * state + 1013904223;
- this_cpu_write(prng_state, state);
+ if (!strcmp(arg, "off"))
+ kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_OFF;
+ else if (!strcmp(arg, "on"))
+ kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_ON;
+ else
+ return -EINVAL;
- return (u8)(state % (KASAN_TAG_MAX + 1));
+ return 0;
}
+early_param("kasan.stacktrace", early_kasan_flag_stacktrace);
-void *kasan_reset_tag(const void *addr)
+/* kasan.stack_ring_size=<number of entries> */
+static int __init early_kasan_flag_stack_ring_size(char *arg)
{
- return reset_tag(addr);
+ if (!arg)
+ return -EINVAL;
+
+ return kstrtoul(arg, 0, &stack_ring.size);
}
+early_param("kasan.stack_ring_size", early_kasan_flag_stack_ring_size);
-bool check_memory_region(unsigned long addr, size_t size, bool write,
- unsigned long ret_ip)
+void __init kasan_init_tags(void)
{
- u8 tag;
- u8 *shadow_first, *shadow_last, *shadow;
- void *untagged_addr;
+ switch (kasan_arg_stacktrace) {
+ case KASAN_ARG_STACKTRACE_DEFAULT:
+ /* Default is specified by kasan_flag_stacktrace definition. */
+ break;
+ case KASAN_ARG_STACKTRACE_OFF:
+ static_branch_disable(&kasan_flag_stacktrace);
+ break;
+ case KASAN_ARG_STACKTRACE_ON:
+ static_branch_enable(&kasan_flag_stacktrace);
+ break;
+ }
- if (unlikely(size == 0))
- return true;
+ if (kasan_stack_collection_enabled()) {
+ if (!stack_ring.size)
+ stack_ring.size = KASAN_STACK_RING_SIZE_DEFAULT;
+ stack_ring.entries = memblock_alloc(
+ sizeof(stack_ring.entries[0]) * stack_ring.size,
+ SMP_CACHE_BYTES);
+ if (WARN_ON(!stack_ring.entries))
+ static_branch_disable(&kasan_flag_stacktrace);
+ }
+}
- if (unlikely(addr + size < addr))
- return !kasan_report(addr, size, write, ret_ip);
+static void save_stack_info(struct kmem_cache *cache, void *object,
+ gfp_t gfp_flags, bool is_free)
+{
+ unsigned long flags;
+ depot_stack_handle_t stack;
+ u64 pos;
+ struct kasan_stack_ring_entry *entry;
+ void *old_ptr;
- tag = get_tag((const void *)addr);
+ stack = kasan_save_stack(gfp_flags, true);
/*
- * Ignore accesses for pointers tagged with 0xff (native kernel
- * pointer tag) to suppress false positives caused by kmap.
- *
- * Some kernel code was written to account for archs that don't keep
- * high memory mapped all the time, but rather map and unmap particular
- * pages when needed. Instead of storing a pointer to the kernel memory,
- * this code saves the address of the page structure and offset within
- * that page for later use. Those pages are then mapped and unmapped
- * with kmap/kunmap when necessary and virt_to_page is used to get the
- * virtual address of the page. For arm64 (that keeps the high memory
- * mapped all the time), kmap is turned into a page_address call.
-
- * The issue is that with use of the page_address + virt_to_page
- * sequence the top byte value of the original pointer gets lost (gets
- * set to KASAN_TAG_KERNEL (0xFF)).
+ * Prevent save_stack_info() from modifying stack ring
+ * when kasan_complete_mode_report_info() is walking it.
*/
- if (tag == KASAN_TAG_KERNEL)
- return true;
+ read_lock_irqsave(&stack_ring.lock, flags);
- untagged_addr = reset_tag((const void *)addr);
- if (unlikely(untagged_addr <
- kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
- return !kasan_report(addr, size, write, ret_ip);
- }
- shadow_first = kasan_mem_to_shadow(untagged_addr);
- shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1);
- for (shadow = shadow_first; shadow <= shadow_last; shadow++) {
- if (*shadow != tag) {
- return !kasan_report(addr, size, write, ret_ip);
- }
- }
+next:
+ pos = atomic64_fetch_add(1, &stack_ring.pos);
+ entry = &stack_ring.entries[pos % stack_ring.size];
- return true;
-}
+ /* Detect stack ring entry slots that are being written to. */
+ old_ptr = READ_ONCE(entry->ptr);
+ if (old_ptr == STACK_RING_BUSY_PTR)
+ goto next; /* Busy slot. */
+ if (!try_cmpxchg(&entry->ptr, &old_ptr, STACK_RING_BUSY_PTR))
+ goto next; /* Busy slot. */
-#define DEFINE_HWASAN_LOAD_STORE(size) \
- void __hwasan_load##size##_noabort(unsigned long addr) \
- { \
- check_memory_region(addr, size, false, _RET_IP_); \
- } \
- EXPORT_SYMBOL(__hwasan_load##size##_noabort); \
- void __hwasan_store##size##_noabort(unsigned long addr) \
- { \
- check_memory_region(addr, size, true, _RET_IP_); \
- } \
- EXPORT_SYMBOL(__hwasan_store##size##_noabort)
-
-DEFINE_HWASAN_LOAD_STORE(1);
-DEFINE_HWASAN_LOAD_STORE(2);
-DEFINE_HWASAN_LOAD_STORE(4);
-DEFINE_HWASAN_LOAD_STORE(8);
-DEFINE_HWASAN_LOAD_STORE(16);
-
-void __hwasan_loadN_noabort(unsigned long addr, unsigned long size)
-{
- check_memory_region(addr, size, false, _RET_IP_);
-}
-EXPORT_SYMBOL(__hwasan_loadN_noabort);
+ WRITE_ONCE(entry->size, cache->object_size);
+ WRITE_ONCE(entry->pid, current->pid);
+ WRITE_ONCE(entry->stack, stack);
+ WRITE_ONCE(entry->is_free, is_free);
-void __hwasan_storeN_noabort(unsigned long addr, unsigned long size)
-{
- check_memory_region(addr, size, true, _RET_IP_);
-}
-EXPORT_SYMBOL(__hwasan_storeN_noabort);
+ /*
+ * Paired with smp_load_acquire() in kasan_complete_mode_report_info().
+ */
+ smp_store_release(&entry->ptr, (s64)object);
-void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
-{
- kasan_poison_shadow((void *)addr, size, tag);
+ read_unlock_irqrestore(&stack_ring.lock, flags);
}
-EXPORT_SYMBOL(__hwasan_tag_memory);
-void kasan_set_free_info(struct kmem_cache *cache,
- void *object, u8 tag)
+void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
{
- struct kasan_alloc_meta *alloc_meta;
- u8 idx = 0;
-
- alloc_meta = get_alloc_info(cache, object);
-
-#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
- idx = alloc_meta->free_track_idx;
- alloc_meta->free_pointer_tag[idx] = tag;
- alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
-#endif
-
- kasan_set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
+ save_stack_info(cache, object, flags, false);
}
-struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
- void *object, u8 tag)
+void kasan_save_free_info(struct kmem_cache *cache, void *object)
{
- struct kasan_alloc_meta *alloc_meta;
- int i = 0;
-
- alloc_meta = get_alloc_info(cache, object);
-
-#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
- for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
- if (alloc_meta->free_pointer_tag[i] == tag)
- break;
- }
- if (i == KASAN_NR_FREE_STACKS)
- i = alloc_meta->free_track_idx;
-#endif
-
- return &alloc_meta->free_track[i];
+ save_stack_info(cache, object, 0, true);
}
diff --git a/mm/kasan/tags_report.c b/mm/kasan/tags_report.c
deleted file mode 100644
index bee43717d6f0..000000000000
--- a/mm/kasan/tags_report.c
+++ /dev/null
@@ -1,93 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * This file contains tag-based KASAN specific error reporting code.
- *
- * Copyright (c) 2014 Samsung Electronics Co., Ltd.
- * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
- *
- * Some code borrowed from https://github.com/xairy/kasan-prototype by
- * Andrey Konovalov <andreyknvl@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/bitops.h>
-#include <linux/ftrace.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/printk.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/stackdepot.h>
-#include <linux/stacktrace.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/kasan.h>
-#include <linux/module.h>
-
-#include <asm/sections.h>
-
-#include "kasan.h"
-#include "../slab.h"
-
-const char *get_bug_type(struct kasan_access_info *info)
-{
-#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
- struct kasan_alloc_meta *alloc_meta;
- struct kmem_cache *cache;
- struct page *page;
- const void *addr;
- void *object;
- u8 tag;
- int i;
-
- tag = get_tag(info->access_addr);
- addr = reset_tag(info->access_addr);
- page = kasan_addr_to_page(addr);
- if (page && PageSlab(page)) {
- cache = page->slab_cache;
- object = nearest_obj(cache, page, (void *)addr);
- alloc_meta = get_alloc_info(cache, object);
-
- for (i = 0; i < KASAN_NR_FREE_STACKS; i++)
- if (alloc_meta->free_pointer_tag[i] == tag)
- return "use-after-free";
- return "out-of-bounds";
- }
-
-#endif
- /*
- * If access_size is a negative number, then it has reason to be
- * defined as out-of-bounds bug type.
- *
- * Casting negative numbers to size_t would indeed turn up as
- * a large size_t and its value will be larger than ULONG_MAX/2,
- * so that this can qualify as out-of-bounds.
- */
- if (info->access_addr + info->access_size < info->access_addr)
- return "out-of-bounds";
-
- return "invalid-access";
-}
-
-void *find_first_bad_addr(void *addr, size_t size)
-{
- u8 tag = get_tag(addr);
- void *p = reset_tag(addr);
- void *end = p + size;
-
- while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p))
- p += KASAN_SHADOW_SCALE_SIZE;
- return p;
-}
-
-void print_tags(u8 addr_tag, const void *addr)
-{
- u8 *shadow = (u8 *)kasan_mem_to_shadow(addr);
-
- pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", addr_tag, *shadow);
-}
diff --git a/mm/kfence/.kunitconfig b/mm/kfence/.kunitconfig
new file mode 100644
index 000000000000..f3d65e939bfa
--- /dev/null
+++ b/mm/kfence/.kunitconfig
@@ -0,0 +1,6 @@
+CONFIG_KUNIT=y
+CONFIG_KFENCE=y
+CONFIG_KFENCE_KUNIT_TEST=y
+
+# Additional dependencies.
+CONFIG_FTRACE=y
diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile
new file mode 100644
index 000000000000..2de2a58d11a1
--- /dev/null
+++ b/mm/kfence/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y := core.o report.o
+
+CFLAGS_kfence_test.o := -fno-omit-frame-pointer -fno-optimize-sibling-calls
+obj-$(CONFIG_KFENCE_KUNIT_TEST) += kfence_test.o
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
new file mode 100644
index 000000000000..dad3c0eb70a0
--- /dev/null
+++ b/mm/kfence/core.c
@@ -0,0 +1,1182 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KFENCE guarded object allocator and fault handling.
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#define pr_fmt(fmt) "kfence: " fmt
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/debugfs.h>
+#include <linux/hash.h>
+#include <linux/irq_work.h>
+#include <linux/jhash.h>
+#include <linux/kcsan-checks.h>
+#include <linux/kfence.h>
+#include <linux/kmemleak.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/log2.h>
+#include <linux/memblock.h>
+#include <linux/moduleparam.h>
+#include <linux/notifier.h>
+#include <linux/panic_notifier.h>
+#include <linux/random.h>
+#include <linux/rcupdate.h>
+#include <linux/sched/clock.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+
+#include <asm/kfence.h>
+
+#include "kfence.h"
+
+/* Disables KFENCE on the first warning assuming an irrecoverable error. */
+#define KFENCE_WARN_ON(cond) \
+ ({ \
+ const bool __cond = WARN_ON(cond); \
+ if (unlikely(__cond)) { \
+ WRITE_ONCE(kfence_enabled, false); \
+ disabled_by_warn = true; \
+ } \
+ __cond; \
+ })
+
+/* === Data ================================================================= */
+
+static bool kfence_enabled __read_mostly;
+static bool disabled_by_warn __read_mostly;
+
+unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
+EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "kfence."
+
+static int kfence_enable_late(void);
+static int param_set_sample_interval(const char *val, const struct kernel_param *kp)
+{
+ unsigned long num;
+ int ret = kstrtoul(val, 0, &num);
+
+ if (ret < 0)
+ return ret;
+
+ /* Using 0 to indicate KFENCE is disabled. */
+ if (!num && READ_ONCE(kfence_enabled)) {
+ pr_info("disabled\n");
+ WRITE_ONCE(kfence_enabled, false);
+ }
+
+ *((unsigned long *)kp->arg) = num;
+
+ if (num && !READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING)
+ return disabled_by_warn ? -EINVAL : kfence_enable_late();
+ return 0;
+}
+
+static int param_get_sample_interval(char *buffer, const struct kernel_param *kp)
+{
+ if (!READ_ONCE(kfence_enabled))
+ return sprintf(buffer, "0\n");
+
+ return param_get_ulong(buffer, kp);
+}
+
+static const struct kernel_param_ops sample_interval_param_ops = {
+ .set = param_set_sample_interval,
+ .get = param_get_sample_interval,
+};
+module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
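+
+/*
+ * Usage note (based on the handlers above): writing 0 to this parameter,
+ * e.g. via /sys/module/kfence/parameters/sample_interval, disables KFENCE at
+ * runtime; writing a non-zero value after boot re-enables it, unless KFENCE
+ * was disabled by a warning.
+ */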
+
+/* Pool usage% threshold when currently covered allocations are skipped. */
+static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
+module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
+
+/* If true, use a deferrable timer. */
+static bool kfence_deferrable __read_mostly = IS_ENABLED(CONFIG_KFENCE_DEFERRABLE);
+module_param_named(deferrable, kfence_deferrable, bool, 0444);
+
+/* If true, check all canary bytes on panic. */
+static bool kfence_check_on_panic __read_mostly;
+module_param_named(check_on_panic, kfence_check_on_panic, bool, 0444);
+
+/* The pool of pages used for guard pages and objects. */
+char *__kfence_pool __read_mostly;
+EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
+
+/*
+ * Per-object metadata, with one-to-one mapping of object metadata to
+ * backing pages (in __kfence_pool).
+ */
+static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0);
+struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+
+/* Freelist with available objects. */
+static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist);
+static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */
+
+/*
+ * The static key to set up a KFENCE allocation; or if static keys are not used
+ * to gate allocations, to avoid a load and compare if KFENCE is disabled.
+ */
+DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
+
+/* Gates the allocation, ensuring only one succeeds in a given period. */
+atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
+
+/*
+ * A Counting Bloom filter of allocation coverage: limits currently covered
+ * allocations of the same source filling up the pool.
+ *
+ * Assuming a range of 15%-85% unique allocations in the pool at any point in
+ * time, the below parameters provide a probability of 0.02-0.33 for false
+ * positive hits respectively:
+ *
+ *	P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)))^HNUM
+ */
+#define ALLOC_COVERED_HNUM 2
+#define ALLOC_COVERED_ORDER (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2)
+#define ALLOC_COVERED_SIZE (1 << ALLOC_COVERED_ORDER)
+#define ALLOC_COVERED_HNEXT(h) hash_32(h, ALLOC_COVERED_ORDER)
+#define ALLOC_COVERED_MASK (ALLOC_COVERED_SIZE - 1)
+static atomic_t alloc_covered[ALLOC_COVERED_SIZE];
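+
+/*
+ * Worked example (illustrative): with CONFIG_KFENCE_NUM_OBJECTS == 255,
+ * ALLOC_COVERED_ORDER is const_ilog2(255) + 2 == 9, giving 512 counters, and
+ * each allocation stack hash updates ALLOC_COVERED_HNUM == 2 of them.
+ */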
+
+/* Stack depth used to determine uniqueness of an allocation. */
+#define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8)
+
+/*
+ * Randomness for stack hashes, making the same collisions across reboots and
+ * different machines less likely.
+ */
+static u32 stack_hash_seed __ro_after_init;
+
+/* Statistics counters for debugfs. */
+enum kfence_counter_id {
+ KFENCE_COUNTER_ALLOCATED,
+ KFENCE_COUNTER_ALLOCS,
+ KFENCE_COUNTER_FREES,
+ KFENCE_COUNTER_ZOMBIES,
+ KFENCE_COUNTER_BUGS,
+ KFENCE_COUNTER_SKIP_INCOMPAT,
+ KFENCE_COUNTER_SKIP_CAPACITY,
+ KFENCE_COUNTER_SKIP_COVERED,
+ KFENCE_COUNTER_COUNT,
+};
+static atomic_long_t counters[KFENCE_COUNTER_COUNT];
+static const char *const counter_names[] = {
+ [KFENCE_COUNTER_ALLOCATED] = "currently allocated",
+ [KFENCE_COUNTER_ALLOCS] = "total allocations",
+ [KFENCE_COUNTER_FREES] = "total frees",
+ [KFENCE_COUNTER_ZOMBIES] = "zombie allocations",
+ [KFENCE_COUNTER_BUGS] = "total bugs",
+ [KFENCE_COUNTER_SKIP_INCOMPAT] = "skipped allocations (incompatible)",
+ [KFENCE_COUNTER_SKIP_CAPACITY] = "skipped allocations (capacity)",
+ [KFENCE_COUNTER_SKIP_COVERED] = "skipped allocations (covered)",
+};
+static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
+
+/* === Internals ============================================================ */
+
+static inline bool should_skip_covered(void)
+{
+ unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;
+
+ return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh;
+}
+
+static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
+{
+ num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
+ num_entries = filter_irq_stacks(stack_entries, num_entries);
+ return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed);
+}
+
+/*
+ * Adds (or subtracts) count @val for allocation stack trace hash
+ * @alloc_stack_hash from Counting Bloom filter.
+ */
+static void alloc_covered_add(u32 alloc_stack_hash, int val)
+{
+ int i;
+
+ for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
+ atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
+ alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
+ }
+}
+
+/*
+ * Returns true if the allocation stack trace hash @alloc_stack_hash is
+ * currently contained (non-zero count) in Counting Bloom filter.
+ */
+static bool alloc_covered_contains(u32 alloc_stack_hash)
+{
+ int i;
+
+ for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
+ if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]))
+ return false;
+ alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
+ }
+
+ return true;
+}
+
+static bool kfence_protect(unsigned long addr)
+{
+ return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
+}
+
+static bool kfence_unprotect(unsigned long addr)
+{
+ return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false));
+}
+
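+/*
+ * Note on the layout assumed by the offset computation below: object i maps
+ * to pool page 2 * (i + 1), i.e. data pages sit at even page offsets starting
+ * at 2, with the first two pages and the odd pages in between serving as
+ * guard pages.
+ */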
+static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta)
+{
+ unsigned long offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2;
+ unsigned long pageaddr = (unsigned long)&__kfence_pool[offset];
+
+ /* The checks do not affect performance; only called from slow-paths. */
+
+ /* Only call with a pointer into kfence_metadata. */
+ if (KFENCE_WARN_ON(meta < kfence_metadata ||
+ meta >= kfence_metadata + CONFIG_KFENCE_NUM_OBJECTS))
+ return 0;
+
+ /*
+ * This metadata object only ever maps to 1 page; verify that the stored
+ * address is in the expected range.
+ */
+ if (KFENCE_WARN_ON(ALIGN_DOWN(meta->addr, PAGE_SIZE) != pageaddr))
+ return 0;
+
+ return pageaddr;
+}
+
+/*
+ * Update the object's metadata state, including updating the alloc/free stacks
+ * depending on the state transition.
+ */
+static noinline void
+metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next,
+ unsigned long *stack_entries, size_t num_stack_entries)
+{
+ struct kfence_track *track =
+ next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track;
+
+ lockdep_assert_held(&meta->lock);
+
+ if (stack_entries) {
+ memcpy(track->stack_entries, stack_entries,
+ num_stack_entries * sizeof(stack_entries[0]));
+ } else {
+ /*
+		 * Skip over 1 (this) function; noinline ensures we do not
+ * accidentally skip over the caller by never inlining.
+ */
+ num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
+ }
+ track->num_stack_entries = num_stack_entries;
+ track->pid = task_pid_nr(current);
+ track->cpu = raw_smp_processor_id();
+ track->ts_nsec = local_clock(); /* Same source as printk timestamps. */
+
+ /*
+ * Pairs with READ_ONCE() in
+ * kfence_shutdown_cache(),
+ * kfence_handle_page_fault().
+ */
+ WRITE_ONCE(meta->state, next);
+}
+
+/* Check canary byte at @addr. */
+static inline bool check_canary_byte(u8 *addr)
+{
+ struct kfence_metadata *meta;
+ unsigned long flags;
+
+ if (likely(*addr == KFENCE_CANARY_PATTERN_U8(addr)))
+ return true;
+
+ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
+
+ meta = addr_to_metadata((unsigned long)addr);
+ raw_spin_lock_irqsave(&meta->lock, flags);
+ kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_CORRUPTION);
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+ return false;
+}
+
+static inline void set_canary(const struct kfence_metadata *meta)
+{
+ const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
+ unsigned long addr = pageaddr;
+
+ /*
+ * The canary is written in u64 chunks and may therefore overlap the
+ * first or last bytes of the object memory; this is harmless, since
+ * the user is expected to initialize the object before use.
+ */
+ for (; addr < meta->addr; addr += sizeof(u64))
+ *((u64 *)addr) = KFENCE_CANARY_PATTERN_U64;
+
+ addr = ALIGN_DOWN(meta->addr + meta->size, sizeof(u64));
+ for (; addr - pageaddr < PAGE_SIZE; addr += sizeof(u64))
+ *((u64 *)addr) = KFENCE_CANARY_PATTERN_U64;
+}
+
+static inline void check_canary(const struct kfence_metadata *meta)
+{
+ const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
+ unsigned long addr = pageaddr;
+
+ /*
+ * We'll iterate over each canary byte per-side until a corrupted byte
+ * is found. However, we'll still iterate over the canary bytes to the
+ * right of the object even if there was an error in the canary bytes to
+ * the left of the object. Specifically, if check_canary_byte()
+ * generates an error, showing both sides might give more clues as to
+ * what the error is about when displaying which bytes were corrupted.
+ */
+
+ /* Apply to left of object. */
+ for (; meta->addr - addr >= sizeof(u64); addr += sizeof(u64)) {
+ if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64))
+ break;
+ }
+
+ /*
+ * If a u64 chunk was found corrupted, or if fewer than 8 canary bytes
+ * remain before the object, check the remaining bytes one by one so
+ * that the exact corrupted bytes are reported.
+ */
+ for (; addr < meta->addr; addr++) {
+ if (unlikely(!check_canary_byte((u8 *)addr)))
+ break;
+ }
+
+ /* Apply to right of object. */
+ for (addr = meta->addr + meta->size; addr % sizeof(u64) != 0; addr++) {
+ if (unlikely(!check_canary_byte((u8 *)addr)))
+ return;
+ }
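+ /*
+ * Check the rest of the canary in u64 chunks; on a mismatch, fall back
+ * to byte-wise checks to report the exact corrupted bytes.
+ */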
+ for (; addr - pageaddr < PAGE_SIZE; addr += sizeof(u64)) {
+ if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64)) {
+
+ for (; addr - pageaddr < PAGE_SIZE; addr++) {
+ if (!check_canary_byte((u8 *)addr))
+ return;
+ }
+ }
+ }
+}
+
+static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
+ unsigned long *stack_entries, size_t num_stack_entries,
+ u32 alloc_stack_hash)
+{
+ struct kfence_metadata *meta = NULL;
+ unsigned long flags;
+ struct slab *slab;
+ void *addr;
+ const bool random_right_allocate = get_random_u32_below(2);
+ const bool random_fault = CONFIG_KFENCE_STRESS_TEST_FAULTS &&
+ !get_random_u32_below(CONFIG_KFENCE_STRESS_TEST_FAULTS);
+
+ /* Try to obtain a free object. */
+ raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
+ if (!list_empty(&kfence_freelist)) {
+ meta = list_entry(kfence_freelist.next, struct kfence_metadata, list);
+ list_del_init(&meta->list);
+ }
+ raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
+ if (!meta) {
+ atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
+ return NULL;
+ }
+
+ if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
+ /*
+ * This is extremely unlikely -- we are reporting on a
+ * use-after-free, which locked meta->lock, and the reporting
+ * code via printk calls kmalloc() which ends up in
+ * kfence_alloc() and tries to grab the same object that we're
+ * reporting on. While it has never been observed, lockdep does
+ * report that there is a possibility of deadlock. Fix it by
+ * using trylock and bailing out gracefully.
+ */
+ raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
+ /* Put the object back on the freelist. */
+ list_add_tail(&meta->list, &kfence_freelist);
+ raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
+
+ return NULL;
+ }
+
+ meta->addr = metadata_to_pageaddr(meta);
+ /* Unprotect if we're reusing this page. */
+ if (meta->state == KFENCE_OBJECT_FREED)
+ kfence_unprotect(meta->addr);
+
+ /*
+ * Note: for allocations made before RNG initialization, the random source
+ * will always return zero, i.e. the object is placed on the left side of
+ * the page. We still benefit from enabling KFENCE as early as
+ * possible, even when the RNG is not yet available, as this will allow
+ * KFENCE to detect bugs due to earlier allocations. The only downside
+ * is that the out-of-bounds accesses detected are deterministic for
+ * such allocations.
+ */
+ if (random_right_allocate) {
+ /* Allocate on the "right" side, re-calculate address. */
+ meta->addr += PAGE_SIZE - size;
+ meta->addr = ALIGN_DOWN(meta->addr, cache->align);
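+ /* Aligning down may leave a gap between the object and the guard page. */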
+ }
+
+ addr = (void *)meta->addr;
+
+ /* Update remaining metadata. */
+ metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries);
+ /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
+ WRITE_ONCE(meta->cache, cache);
+ meta->size = size;
+ meta->alloc_stack_hash = alloc_stack_hash;
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+ alloc_covered_add(alloc_stack_hash, 1);
+
+ /* Set required slab fields. */
+ slab = virt_to_slab((void *)meta->addr);
+ slab->slab_cache = cache;
+#if defined(CONFIG_SLUB)
+ slab->objects = 1;
+#elif defined(CONFIG_SLAB)
+ slab->s_mem = addr;
+#endif
+
+ /* Memory initialization. */
+ set_canary(meta);
+
+ /*
+ * We check slab_want_init_on_alloc() ourselves, rather than letting
+ * SL*B do the initialization, as otherwise we might overwrite KFENCE's
+ * redzone.
+ */
+ if (unlikely(slab_want_init_on_alloc(gfp, cache)))
+ memzero_explicit(addr, size);
+ if (cache->ctor)
+ cache->ctor(addr);
+
+ if (random_fault)
+ kfence_protect(meta->addr); /* Random "faults" by protecting the object. */
+
+ atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]);
+ atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]);
+
+ return addr;
+}
+
+static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
+{
+ struct kcsan_scoped_access assert_page_exclusive;
+ unsigned long flags;
+ bool init;
+
+ raw_spin_lock_irqsave(&meta->lock, flags);
+
+ if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) {
+ /* Invalid or double-free, bail out. */
+ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
+ kfence_report_error((unsigned long)addr, false, NULL, meta,
+ KFENCE_ERROR_INVALID_FREE);
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+ return;
+ }
+
+ /* Detect racy use-after-free, or incorrect reallocation of this page by KFENCE. */
+ kcsan_begin_scoped_access((void *)ALIGN_DOWN((unsigned long)addr, PAGE_SIZE), PAGE_SIZE,
+ KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT,
+ &assert_page_exclusive);
+
+ if (CONFIG_KFENCE_STRESS_TEST_FAULTS)
+ kfence_unprotect((unsigned long)addr); /* To check canary bytes. */
+
+ /* Restore page protection if there was an OOB access. */
+ if (meta->unprotected_page) {
+ memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE);
+ kfence_protect(meta->unprotected_page);
+ meta->unprotected_page = 0;
+ }
+
+ /* Mark the object as freed. */
+ metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0);
+ init = slab_want_init_on_free(meta->cache);
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+ alloc_covered_add(meta->alloc_stack_hash, -1);
+
+ /* Check canary bytes for memory corruption. */
+ check_canary(meta);
+
+ /*
+ * Clear memory if init-on-free is set. While we protect the page, the
+ * data is still there, and after a use-after-free is detected, we
+ * unprotect the page, so the data is still accessible.
+ */
+ if (!zombie && unlikely(init))
+ memzero_explicit(addr, meta->size);
+
+ /* Protect to detect use-after-frees. */
+ kfence_protect((unsigned long)addr);
+
+ kcsan_end_scoped_access(&assert_page_exclusive);
+ if (!zombie) {
+ /* Add it to the tail of the freelist for reuse. */
+ raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
+ KFENCE_WARN_ON(!list_empty(&meta->list));
+ list_add_tail(&meta->list, &kfence_freelist);
+ raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
+
+ atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]);
+ atomic_long_inc(&counters[KFENCE_COUNTER_FREES]);
+ } else {
+ /* See kfence_shutdown_cache(). */
+ atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]);
+ }
+}
+
+static void rcu_guarded_free(struct rcu_head *h)
+{
+ struct kfence_metadata *meta = container_of(h, struct kfence_metadata, rcu_head);
+
+ kfence_guarded_free((void *)meta->addr, meta, false);
+}
+
+/*
+ * Initialization of the KFENCE pool after its allocation.
+ * Returns 0 on success; otherwise returns the address up to
+ * which partial initialization succeeded.
+ */
+static unsigned long kfence_init_pool(void)
+{
+ unsigned long addr = (unsigned long)__kfence_pool;
+ struct page *pages;
+ int i;
+
+ if (!arch_kfence_init_pool())
+ return addr;
+
+ pages = virt_to_page(__kfence_pool);
+
+ /*
+ * Set up object pages: they must have PG_slab set, to avoid freeing
+ * these as real pages.
+ *
+ * We also want to avoid inserting kfence_free() in the kfree()
+ * fast-path in SLUB, and therefore need to ensure kfree() correctly
+ * enters __slab_free() slow-path.
+ */
+ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
+ struct slab *slab = page_slab(nth_page(pages, i));
+
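+ /*
+ * Only object data pages (even index >= 2) are set up as slab pages;
+ * page 0 and the odd-indexed guard pages are skipped.
+ */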
+ if (!i || (i % 2))
+ continue;
+
+ __folio_set_slab(slab_folio(slab));
+#ifdef CONFIG_MEMCG
+ slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
+ MEMCG_DATA_OBJCGS;
+#endif
+ }
+
+ /*
+ * Protect the first 2 pages. The first page is mostly unnecessary, and
+ * merely serves as an extended guard page. However, adding one
+ * additional page in the beginning gives us an even number of pages,
+ * which simplifies the mapping of address to metadata index.
+ */
+ for (i = 0; i < 2; i++) {
+ if (unlikely(!kfence_protect(addr)))
+ return addr;
+
+ addr += PAGE_SIZE;
+ }
+
+ for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
+ struct kfence_metadata *meta = &kfence_metadata[i];
+
+ /* Initialize metadata. */
+ INIT_LIST_HEAD(&meta->list);
+ raw_spin_lock_init(&meta->lock);
+ meta->state = KFENCE_OBJECT_UNUSED;
+ meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). */
+ list_add_tail(&meta->list, &kfence_freelist);
+
+ /* Protect the right redzone. */
+ if (unlikely(!kfence_protect(addr + PAGE_SIZE)))
+ goto reset_slab;
+
+ addr += 2 * PAGE_SIZE;
+ }
+
+ return 0;
+
+reset_slab:
+ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
+ struct slab *slab = page_slab(nth_page(pages, i));
+
+ if (!i || (i % 2))
+ continue;
+#ifdef CONFIG_MEMCG
+ slab->memcg_data = 0;
+#endif
+ __folio_clear_slab(slab_folio(slab));
+ }
+
+ return addr;
+}
+
+static bool __init kfence_init_pool_early(void)
+{
+ unsigned long addr;
+
+ if (!__kfence_pool)
+ return false;
+
+ addr = kfence_init_pool();
+
+ if (!addr) {
+ /*
+ * The pool is live and will never be deallocated from this point on.
+ * Ignore the pool object from the kmemleak phys object tree, as it would
+ * otherwise overlap with allocations returned by kfence_alloc(), which
+ * are registered with kmemleak through the slab post-alloc hook.
+ */
+ kmemleak_ignore_phys(__pa(__kfence_pool));
+ return true;
+ }
+
+ /*
+ * Only release unprotected pages, and do not try to go back and change
+ * page attributes due to risk of failing to do so as well. If changing
+ * page attributes for some pages fails, it is very likely that it also
+ * fails for the first page, and therefore expect addr==__kfence_pool in
+ * most failure cases.
+ */
+ memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
+ __kfence_pool = NULL;
+ return false;
+}
+
+static bool kfence_init_pool_late(void)
+{
+ unsigned long addr, free_size;
+
+ addr = kfence_init_pool();
+
+ if (!addr)
+ return true;
+
+ /* Same as above. */
+ free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool);
+#ifdef CONFIG_CONTIG_ALLOC
+ free_contig_range(page_to_pfn(virt_to_page((void *)addr)), free_size / PAGE_SIZE);
+#else
+ free_pages_exact((void *)addr, free_size);
+#endif
+ __kfence_pool = NULL;
+ return false;
+}
+
+/* === DebugFS Interface ==================================================== */
+
+static int stats_show(struct seq_file *seq, void *v)
+{
+ int i;
+
+ seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled));
+ for (i = 0; i < KFENCE_COUNTER_COUNT; i++)
+ seq_printf(seq, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i]));
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(stats);
+
+/*
+ * debugfs seq_file operations for /sys/kernel/debug/kfence/objects.
+ * start_object() and next_object() return the object index + 1, because NULL is used
+ * to stop iteration.
+ */
+static void *start_object(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos < CONFIG_KFENCE_NUM_OBJECTS)
+ return (void *)((long)*pos + 1);
+ return NULL;
+}
+
+static void stop_object(struct seq_file *seq, void *v)
+{
+}
+
+static void *next_object(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ if (*pos < CONFIG_KFENCE_NUM_OBJECTS)
+ return (void *)((long)*pos + 1);
+ return NULL;
+}
+
+static int show_object(struct seq_file *seq, void *v)
+{
+ struct kfence_metadata *meta = &kfence_metadata[(long)v - 1];
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&meta->lock, flags);
+ kfence_print_object(seq, meta);
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+ seq_puts(seq, "---------------------------------\n");
+
+ return 0;
+}
+
+static const struct seq_operations objects_sops = {
+ .start = start_object,
+ .next = next_object,
+ .stop = stop_object,
+ .show = show_object,
+};
+DEFINE_SEQ_ATTRIBUTE(objects);
+
+static int kfence_debugfs_init(void)
+{
+ struct dentry *kfence_dir;
+
+ if (!READ_ONCE(kfence_enabled))
+ return 0;
+
+ kfence_dir = debugfs_create_dir("kfence", NULL);
+ debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops);
+ debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops);
+ return 0;
+}
+
+late_initcall(kfence_debugfs_init);
+
+/* === Panic Notifier ====================================================== */
+
+static void kfence_check_all_canary(void)
+{
+ int i;
+
+ for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
+ struct kfence_metadata *meta = &kfence_metadata[i];
+
+ if (meta->state == KFENCE_OBJECT_ALLOCATED)
+ check_canary(meta);
+ }
+}
+
+static int kfence_check_canary_callback(struct notifier_block *nb,
+ unsigned long reason, void *arg)
+{
+ kfence_check_all_canary();
+ return NOTIFY_OK;
+}
+
+static struct notifier_block kfence_check_canary_notifier = {
+ .notifier_call = kfence_check_canary_callback,
+};
+
+/* === Allocation Gate Timer ================================================ */
+
+static struct delayed_work kfence_timer;
+
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+/* Wait queue to wake up allocation-gate timer task. */
+static DECLARE_WAIT_QUEUE_HEAD(allocation_wait);
+
+static void wake_up_kfence_timer(struct irq_work *work)
+{
+ wake_up(&allocation_wait);
+}
+static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer);
+#endif
+
+/*
+ * Set up delayed work, which will enable and disable the static key. We need to
+ * use a work queue (rather than a simple timer), since enabling and disabling a
+ * static key cannot be done from an interrupt.
+ *
+ * Note: Toggling a static branch currently causes IPIs, and here we'll end up
+ * with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with
+ * more aggressive sampling intervals), we could get away with a variant that
+ * avoids IPIs, at the cost of not immediately capturing allocations if the
+ * instructions remain cached.
+ */
+static void toggle_allocation_gate(struct work_struct *work)
+{
+ if (!READ_ONCE(kfence_enabled))
+ return;
+
+ atomic_set(&kfence_allocation_gate, 0);
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+ /* Enable static key, and await allocation to happen. */
+ static_branch_enable(&kfence_allocation_key);
+
+ wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate));
+
+ /* Disable static key and reset timer. */
+ static_branch_disable(&kfence_allocation_key);
+#endif
+ queue_delayed_work(system_unbound_wq, &kfence_timer,
+ msecs_to_jiffies(kfence_sample_interval));
+}
+
+/* === Public interface ===================================================== */
+
+void __init kfence_alloc_pool(void)
+{
+ if (!kfence_sample_interval)
+ return;
+
+ /* If the pool has already been initialized by the arch, skip the below. */
+ if (__kfence_pool)
+ return;
+
+ __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
+
+ if (!__kfence_pool)
+ pr_err("failed to allocate pool\n");
+}
+
+static void kfence_init_enable(void)
+{
+ if (!IS_ENABLED(CONFIG_KFENCE_STATIC_KEYS))
+ static_branch_enable(&kfence_allocation_key);
+
+ if (kfence_deferrable)
+ INIT_DEFERRABLE_WORK(&kfence_timer, toggle_allocation_gate);
+ else
+ INIT_DELAYED_WORK(&kfence_timer, toggle_allocation_gate);
+
+ if (kfence_check_on_panic)
+ atomic_notifier_chain_register(&panic_notifier_list, &kfence_check_canary_notifier);
+
+ WRITE_ONCE(kfence_enabled, true);
+ queue_delayed_work(system_unbound_wq, &kfence_timer, 0);
+
+ pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
+ CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool,
+ (void *)(__kfence_pool + KFENCE_POOL_SIZE));
+}
+
+void __init kfence_init(void)
+{
+ stack_hash_seed = get_random_u32();
+
+ /* Setting kfence_sample_interval to 0 on boot disables KFENCE. */
+ if (!kfence_sample_interval)
+ return;
+
+ if (!kfence_init_pool_early()) {
+ pr_err("%s failed\n", __func__);
+ return;
+ }
+
+ kfence_init_enable();
+}
+
+static int kfence_init_late(void)
+{
+ const unsigned long nr_pages = KFENCE_POOL_SIZE / PAGE_SIZE;
+#ifdef CONFIG_CONTIG_ALLOC
+ struct page *pages;
+
+ pages = alloc_contig_pages(nr_pages, GFP_KERNEL, first_online_node, NULL);
+ if (!pages)
+ return -ENOMEM;
+ __kfence_pool = page_to_virt(pages);
+#else
+ if (nr_pages > MAX_ORDER_NR_PAGES) {
+ pr_warn("KFENCE_NUM_OBJECTS too large for buddy allocator\n");
+ return -EINVAL;
+ }
+ __kfence_pool = alloc_pages_exact(KFENCE_POOL_SIZE, GFP_KERNEL);
+ if (!__kfence_pool)
+ return -ENOMEM;
+#endif
+
+ if (!kfence_init_pool_late()) {
+ pr_err("%s failed\n", __func__);
+ return -EBUSY;
+ }
+
+ kfence_init_enable();
+ kfence_debugfs_init();
+
+ return 0;
+}
+
+static int kfence_enable_late(void)
+{
+ if (!__kfence_pool)
+ return kfence_init_late();
+
+ WRITE_ONCE(kfence_enabled, true);
+ queue_delayed_work(system_unbound_wq, &kfence_timer, 0);
+ pr_info("re-enabled\n");
+ return 0;
+}
+
+void kfence_shutdown_cache(struct kmem_cache *s)
+{
+ unsigned long flags;
+ struct kfence_metadata *meta;
+ int i;
+
+ for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
+ bool in_use;
+
+ meta = &kfence_metadata[i];
+
+ /*
+ * If we observe some inconsistent cache and state pair where we
+ * should have returned false here, cache destruction is racing
+ * with either kmem_cache_alloc() or kmem_cache_free(). Taking
+ * the lock will not help, as different critical section
+ * serialization will have the same outcome.
+ */
+ if (READ_ONCE(meta->cache) != s ||
+ READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED)
+ continue;
+
+ raw_spin_lock_irqsave(&meta->lock, flags);
+ in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED;
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+ if (in_use) {
+ /*
+ * This cache still has allocations, and we should not
+ * release them back into the freelist so they can still
+ * safely be used and retain the kernel's default
+ * behaviour of keeping the allocations alive (leak the
+ * cache); however, they effectively become "zombie
+ * allocations" as the KFENCE objects are the only ones
+ * still in use and the owning cache is being destroyed.
+ *
+ * We mark them freed, so that any subsequent use shows
+ * more useful error messages that will include stack
+ * traces of the user of the object, the original
+ * allocation, and caller to shutdown_cache().
+ */
+ kfence_guarded_free((void *)meta->addr, meta, /*zombie=*/true);
+ }
+ }
+
+ for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
+ meta = &kfence_metadata[i];
+
+ /* See above. */
+ if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_FREED)
+ continue;
+
+ raw_spin_lock_irqsave(&meta->lock, flags);
+ if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED)
+ meta->cache = NULL;
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+ }
+}
+
+void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
+{
+ unsigned long stack_entries[KFENCE_STACK_DEPTH];
+ size_t num_stack_entries;
+ u32 alloc_stack_hash;
+
+ /*
+ * Perform size check before switching kfence_allocation_gate, so that
+ * we don't disable KFENCE without making an allocation.
+ */
+ if (size > PAGE_SIZE) {
+ atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
+ return NULL;
+ }
+
+ /*
+ * Skip allocations from non-default zones, including DMA. We cannot
+ * guarantee that pages in the KFENCE pool will have the requested
+ * properties (e.g. reside in DMAable memory).
+ */
+ if ((flags & GFP_ZONEMASK) ||
+ (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
+ atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
+ return NULL;
+ }
+
+ /*
+ * Skip allocations for this slab, if KFENCE has been disabled for
+ * this slab.
+ */
+ if (s->flags & SLAB_SKIP_KFENCE)
+ return NULL;
+
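+ /*
+ * Only the first allocation after the timer reset the gate attempts a
+ * KFENCE allocation; all others bail out until the next reset.
+ */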
+ if (atomic_inc_return(&kfence_allocation_gate) > 1)
+ return NULL;
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+ /*
+ * waitqueue_active() is fully ordered after the update of
+ * kfence_allocation_gate per atomic_inc_return().
+ */
+ if (waitqueue_active(&allocation_wait)) {
+ /*
+ * Calling wake_up() here may deadlock when allocations happen
+ * from within timer code. Use an irq_work to defer it.
+ */
+ irq_work_queue(&wake_up_kfence_timer_work);
+ }
+#endif
+
+ if (!READ_ONCE(kfence_enabled))
+ return NULL;
+
+ num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
+
+ /*
+ * Do expensive check for coverage of allocation in slow-path after
+ * allocation_gate has already become non-zero, even though it might
+ * mean not making any allocation within a given sample interval.
+ *
+ * This ensures reasonable allocation coverage when the pool is almost
+ * full, including avoiding long-lived allocations of the same source
+ * filling up the pool (e.g. pagecache allocations).
+ */
+ alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries);
+ if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) {
+ atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
+ return NULL;
+ }
+
+ return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries,
+ alloc_stack_hash);
+}
+
+size_t kfence_ksize(const void *addr)
+{
+ const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
+
+ /*
+ * Read locklessly -- if there is a race with __kfence_alloc(), this is
+ * either a use-after-free or invalid access.
+ */
+ return meta ? meta->size : 0;
+}
+
+void *kfence_object_start(const void *addr)
+{
+ const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
+
+ /*
+ * Read locklessly -- if there is a race with __kfence_alloc(), this is
+ * either a use-after-free or invalid access.
+ */
+ return meta ? (void *)meta->addr : NULL;
+}
+
+void __kfence_free(void *addr)
+{
+ struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
+
+#ifdef CONFIG_MEMCG
+ KFENCE_WARN_ON(meta->objcg);
+#endif
+ /*
+ * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing
+ * the object, as the object page may be recycled for other-typed
+ * objects once it has been freed. meta->cache may be NULL if the cache
+ * was destroyed.
+ */
+ if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU)))
+ call_rcu(&meta->rcu_head, rcu_guarded_free);
+ else
+ kfence_guarded_free(addr, meta, false);
+}
+
+bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs)
+{
+ const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE;
+ struct kfence_metadata *to_report = NULL;
+ enum kfence_error_type error_type;
+ unsigned long flags;
+
+ if (!is_kfence_address((void *)addr))
+ return false;
+
+ if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime ... */
+ return kfence_unprotect(addr); /* ... unprotect and proceed. */
+
+ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
+
+ if (page_index % 2) {
+ /* This is a redzone, report a buffer overflow. */
+ struct kfence_metadata *meta;
+ int distance = 0;
+
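+ /*
+ * A guard page may have objects on both sides; report whichever
+ * allocated neighbour is closer to the faulting address.
+ */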
+ meta = addr_to_metadata(addr - PAGE_SIZE);
+ if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
+ to_report = meta;
+ /* Data race ok; distance calculation approximate. */
+ distance = addr - data_race(meta->addr + meta->size);
+ }
+
+ meta = addr_to_metadata(addr + PAGE_SIZE);
+ if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
+ /* Data race ok; distance calculation approximate. */
+ if (!to_report || distance > data_race(meta->addr) - addr)
+ to_report = meta;
+ }
+
+ if (!to_report)
+ goto out;
+
+ raw_spin_lock_irqsave(&to_report->lock, flags);
+ to_report->unprotected_page = addr;
+ error_type = KFENCE_ERROR_OOB;
+
+ /*
+ * If the object was freed before we took the lock we can still
+ * report this as an OOB -- the report will simply show the
+ * stacktrace of the free as well.
+ */
+ } else {
+ to_report = addr_to_metadata(addr);
+ if (!to_report)
+ goto out;
+
+ raw_spin_lock_irqsave(&to_report->lock, flags);
+ error_type = KFENCE_ERROR_UAF;
+ /*
+ * We may race with __kfence_alloc(), and it is possible that a
+ * freed object may be reallocated. We simply report this as a
+ * use-after-free, with the stack trace showing the place where
+ * the object was re-allocated.
+ */
+ }
+
+out:
+ if (to_report) {
+ kfence_report_error(addr, is_write, regs, to_report, error_type);
+ raw_spin_unlock_irqrestore(&to_report->lock, flags);
+ } else {
+ /* This may be a UAF or OOB access, but we can't be sure. */
+ kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID);
+ }
+
+ return kfence_unprotect(addr); /* Unprotect and let access proceed. */
+}
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
new file mode 100644
index 000000000000..392fb273e7bd
--- /dev/null
+++ b/mm/kfence/kfence.h
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Kernel Electric-Fence (KFENCE). For more info please see
+ * Documentation/dev-tools/kfence.rst.
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#ifndef MM_KFENCE_KFENCE_H
+#define MM_KFENCE_KFENCE_H
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include "../slab.h" /* for struct kmem_cache */
+
+/*
+ * Get the canary byte pattern for @addr. Use a pattern that varies based on the
+ * lower 3 bits of the address, to detect memory corruptions with higher
+ * probability, where similar constants are used.
+ */
+#define KFENCE_CANARY_PATTERN_U8(addr) ((u8)0xaa ^ (u8)((unsigned long)(addr) & 0x7))
+
+/*
+ * Define a continuous 8-byte canary starting from a multiple of 8. The canary
+ * of each byte only depends on the lowest three bits of its address, so the
+ * canary repeats every 8 bytes. Memory can therefore be filled and checked
+ * 64 bits (one u64) at a time instead of byte by byte, improving performance.
+ */
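+/*
+ * le64_to_cpu() ensures the stored byte at offset k equals k on both
+ * little- and big-endian kernels, matching KFENCE_CANARY_PATTERN_U8().
+ */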
+#define KFENCE_CANARY_PATTERN_U64 ((u64)0xaaaaaaaaaaaaaaaa ^ (u64)(le64_to_cpu(0x0706050403020100)))
+
+/* Maximum stack depth for reports. */
+#define KFENCE_STACK_DEPTH 64
+
+/* KFENCE object states. */
+enum kfence_object_state {
+ KFENCE_OBJECT_UNUSED, /* Object is unused. */
+ KFENCE_OBJECT_ALLOCATED, /* Object is currently allocated. */
+ KFENCE_OBJECT_FREED, /* Object was allocated, and then freed. */
+};
+
+/* Alloc/free tracking information. */
+struct kfence_track {
+ pid_t pid;
+ int cpu;
+ u64 ts_nsec;
+ int num_stack_entries;
+ unsigned long stack_entries[KFENCE_STACK_DEPTH];
+};
+
+/* KFENCE metadata per guarded allocation. */
+struct kfence_metadata {
+ struct list_head list; /* Freelist node; access under kfence_freelist_lock. */
+ struct rcu_head rcu_head; /* For delayed freeing. */
+
+ /*
+ * Lock protecting below data; to ensure consistency of the below data,
+ * since the following may execute concurrently: __kfence_alloc(),
+ * __kfence_free(), kfence_handle_page_fault(). However, note that we
+ * cannot grab the same metadata off the freelist twice, and multiple
+ * __kfence_alloc() cannot run concurrently on the same metadata.
+ */
+ raw_spinlock_t lock;
+
+ /* The current state of the object; see above. */
+ enum kfence_object_state state;
+
+ /*
+ * Allocated object address; cannot be calculated from size, because of
+ * alignment requirements.
+ *
+ * Invariant: ALIGN_DOWN(addr, PAGE_SIZE) is constant.
+ */
+ unsigned long addr;
+
+ /*
+ * The size of the original allocation.
+ */
+ size_t size;
+
+ /*
+ * The kmem_cache cache of the last allocation; NULL if never allocated
+ * or the cache has already been destroyed.
+ */
+ struct kmem_cache *cache;
+
+ /*
+ * In case of an invalid access, the page that was unprotected; we
+ * optimistically only store one address.
+ */
+ unsigned long unprotected_page;
+
+ /* Allocation and free stack information. */
+ struct kfence_track alloc_track;
+ struct kfence_track free_track;
+ /* For updating alloc_covered on frees. */
+ u32 alloc_stack_hash;
+#ifdef CONFIG_MEMCG
+ struct obj_cgroup *objcg;
+#endif
+};
+
+extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+
+static inline struct kfence_metadata *addr_to_metadata(unsigned long addr)
+{
+ long index;
+
+ /* The checks do not affect performance; only called from slow-paths. */
+
+ if (!is_kfence_address((void *)addr))
+ return NULL;
+
+ /*
+ * May be an invalid index if called with an address at the edge of
+ * __kfence_pool, in which case we would report an "invalid access"
+ * error.
+ */
+ index = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1;
+ if (index < 0 || index >= CONFIG_KFENCE_NUM_OBJECTS)
+ return NULL;
+
+ return &kfence_metadata[index];
+}
+
+/* KFENCE error types for report generation. */
+enum kfence_error_type {
+ KFENCE_ERROR_OOB, /* Detected an out-of-bounds access. */
+ KFENCE_ERROR_UAF, /* Detected a use-after-free access. */
+ KFENCE_ERROR_CORRUPTION, /* Detected a memory corruption on free. */
+ KFENCE_ERROR_INVALID, /* Invalid access of unknown type. */
+ KFENCE_ERROR_INVALID_FREE, /* Invalid free. */
+};
+
+void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs,
+ const struct kfence_metadata *meta, enum kfence_error_type type);
+
+void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta);
+
+#endif /* MM_KFENCE_KFENCE_H */
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
new file mode 100644
index 000000000000..9e008a336d9f
--- /dev/null
+++ b/mm/kfence/kfence_test.c
@@ -0,0 +1,851 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test cases for the KFENCE memory safety error detector. Since KFENCE's
+ * reports are obtained via the console, this is the output we verify. Each
+ * test case checks for the presence (or absence) of generated reports.
+ * Relies on the 'console' tracepoint to capture reports as they
+ * appear in the kernel log.
+ *
+ * Copyright (C) 2020, Google LLC.
+ * Author: Alexander Potapenko <glider@google.com>
+ * Marco Elver <elver@google.com>
+ */
+
+#include <kunit/test.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/kfence.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/tracepoint.h>
+#include <trace/events/printk.h>
+
+#include <asm/kfence.h>
+
+#include "kfence.h"
+
+/* May be overridden by <asm/kfence.h>. */
+#ifndef arch_kfence_test_address
+#define arch_kfence_test_address(addr) (addr)
+#endif
+
+#define KFENCE_TEST_REQUIRES(test, cond) do { \
+ if (!(cond)) \
+ kunit_skip((test), "Test requires: " #cond); \
+} while (0)
+
+/* Report as observed from console. */
+static struct {
+ spinlock_t lock;
+ int nlines;
+ char lines[2][256];
+} observed = {
+ .lock = __SPIN_LOCK_UNLOCKED(observed.lock),
+};
+
+/* Probe for console output: obtains observed lines of interest. */
+static void probe_console(void *ignore, const char *buf, size_t len)
+{
+ unsigned long flags;
+ int nlines;
+
+ spin_lock_irqsave(&observed.lock, flags);
+ nlines = observed.nlines;
+
+ if (strnstr(buf, "BUG: KFENCE: ", len) && strnstr(buf, "test_", len)) {
+ /*
+ * KFENCE report and related to the test.
+ *
+ * The provided @buf is not NUL-terminated; copy no more than
+ * @len bytes and let strscpy() add the missing NUL-terminator.
+ */
+ strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0])));
+ nlines = 1;
+ } else if (nlines == 1 && (strnstr(buf, "at 0x", len) || strnstr(buf, "of 0x", len))) {
+ strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0])));
+ }
+
+ WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */
+ spin_unlock_irqrestore(&observed.lock, flags);
+}
+
+/* Check if a report related to the test exists. */
+static bool report_available(void)
+{
+ return READ_ONCE(observed.nlines) == ARRAY_SIZE(observed.lines);
+}
+
+/* Information we expect in a report. */
+struct expect_report {
+ enum kfence_error_type type; /* The type of error. */
+ void *fn; /* Function pointer to expected function where access occurred. */
+ char *addr; /* Address at which the bad access occurred. */
+ bool is_write; /* Is access a write. */
+};
+
+static const char *get_access_type(const struct expect_report *r)
+{
+ return r->is_write ? "write" : "read";
+}
+
+/* Check observed report matches information in @r. */
+static bool report_matches(const struct expect_report *r)
+{
+ unsigned long addr = (unsigned long)r->addr;
+ bool ret = false;
+ unsigned long flags;
+ typeof(observed.lines) expect;
+ const char *end;
+ char *cur;
+
+ /* Double-checked locking. */
+ if (!report_available())
+ return false;
+
+ /* Generate expected report contents. */
+
+ /* Title */
+ cur = expect[0];
+ end = &expect[0][sizeof(expect[0]) - 1];
+ switch (r->type) {
+ case KFENCE_ERROR_OOB:
+ cur += scnprintf(cur, end - cur, "BUG: KFENCE: out-of-bounds %s",
+ get_access_type(r));
+ break;
+ case KFENCE_ERROR_UAF:
+ cur += scnprintf(cur, end - cur, "BUG: KFENCE: use-after-free %s",
+ get_access_type(r));
+ break;
+ case KFENCE_ERROR_CORRUPTION:
+ cur += scnprintf(cur, end - cur, "BUG: KFENCE: memory corruption");
+ break;
+ case KFENCE_ERROR_INVALID:
+ cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid %s",
+ get_access_type(r));
+ break;
+ case KFENCE_ERROR_INVALID_FREE:
+ cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid free");
+ break;
+ }
+
+ scnprintf(cur, end - cur, " in %pS", r->fn);
+ /* The exact offset won't match, remove it; also strip module name. */
+ cur = strchr(expect[0], '+');
+ if (cur)
+ *cur = '\0';
+
+ /* Access information */
+ cur = expect[1];
+ end = &expect[1][sizeof(expect[1]) - 1];
+
+ switch (r->type) {
+ case KFENCE_ERROR_OOB:
+ cur += scnprintf(cur, end - cur, "Out-of-bounds %s at", get_access_type(r));
+ addr = arch_kfence_test_address(addr);
+ break;
+ case KFENCE_ERROR_UAF:
+ cur += scnprintf(cur, end - cur, "Use-after-free %s at", get_access_type(r));
+ addr = arch_kfence_test_address(addr);
+ break;
+ case KFENCE_ERROR_CORRUPTION:
+ cur += scnprintf(cur, end - cur, "Corrupted memory at");
+ break;
+ case KFENCE_ERROR_INVALID:
+ cur += scnprintf(cur, end - cur, "Invalid %s at", get_access_type(r));
+ addr = arch_kfence_test_address(addr);
+ break;
+ case KFENCE_ERROR_INVALID_FREE:
+ cur += scnprintf(cur, end - cur, "Invalid free of");
+ break;
+ }
+
+ cur += scnprintf(cur, end - cur, " 0x%p", (void *)addr);
+
+ spin_lock_irqsave(&observed.lock, flags);
+ if (!report_available())
+ goto out; /* A new report is being captured. */
+
+ /* Finally match expected output to what we actually observed. */
+ ret = strstr(observed.lines[0], expect[0]) && strstr(observed.lines[1], expect[1]);
+out:
+ spin_unlock_irqrestore(&observed.lock, flags);
+ return ret;
+}
+
+/* ===== Test cases ===== */
+
+#define TEST_PRIV_WANT_MEMCACHE ((void *)1)
+
+/* Cache used by tests; if NULL, allocate from kmalloc instead. */
+static struct kmem_cache *test_cache;
+
+static size_t setup_test_cache(struct kunit *test, size_t size, slab_flags_t flags,
+ void (*ctor)(void *))
+{
+ if (test->priv != TEST_PRIV_WANT_MEMCACHE)
+ return size;
+
+ kunit_info(test, "%s: size=%zu, ctor=%ps\n", __func__, size, ctor);
+
+ /*
+ * Use SLAB_NO_MERGE to prevent merging with existing caches.
+ * Use SLAB_ACCOUNT to allocate via memcg, if enabled.
+ */
+ flags |= SLAB_NO_MERGE | SLAB_ACCOUNT;
+ test_cache = kmem_cache_create("test", size, 1, flags, ctor);
+ KUNIT_ASSERT_TRUE_MSG(test, test_cache, "could not create cache");
+
+ return size;
+}
+
+static void test_cache_destroy(void)
+{
+ if (!test_cache)
+ return;
+
+ kmem_cache_destroy(test_cache);
+ test_cache = NULL;
+}
+
+static inline size_t kmalloc_cache_alignment(size_t size)
+{
+ return kmalloc_caches[kmalloc_type(GFP_KERNEL)][__kmalloc_index(size, false)]->align;
+}
+
+/* Must always inline to match stack trace against caller. */
+static __always_inline void test_free(void *ptr)
+{
+ if (test_cache)
+ kmem_cache_free(test_cache, ptr);
+ else
+ kfree(ptr);
+}
+
+/*
+ * If this should be a KFENCE allocation, and on which side the allocation and
+ * the closest guard page should be.
+ */
+enum allocation_policy {
+ ALLOCATE_ANY, /* KFENCE, any side. */
+ ALLOCATE_LEFT, /* KFENCE, left side of page. */
+ ALLOCATE_RIGHT, /* KFENCE, right side of page. */
+ ALLOCATE_NONE, /* No KFENCE allocation. */
+};
+
+/*
+ * Try to get a guarded allocation from KFENCE. Uses either kmalloc() or the
+ * current test_cache if set up.
+ */
+static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocation_policy policy)
+{
+ void *alloc;
+ unsigned long timeout, resched_after;
+ const char *policy_name;
+
+ switch (policy) {
+ case ALLOCATE_ANY:
+ policy_name = "any";
+ break;
+ case ALLOCATE_LEFT:
+ policy_name = "left";
+ break;
+ case ALLOCATE_RIGHT:
+ policy_name = "right";
+ break;
+ case ALLOCATE_NONE:
+ policy_name = "none";
+ break;
+ }
+
+ kunit_info(test, "%s: size=%zu, gfp=%x, policy=%s, cache=%i\n", __func__, size, gfp,
+ policy_name, !!test_cache);
+
+ /*
+ * 100x the sample interval should be more than enough to ensure we get
+ * a KFENCE allocation eventually.
+ */
+ timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval);
+ /*
+ * Especially for non-preemption kernels, ensure the allocation-gate
+ * timer can catch up: after @resched_after, every failed allocation
+ * attempt yields, to ensure the allocation-gate timer is scheduled.
+ */
+ resched_after = jiffies + msecs_to_jiffies(kfence_sample_interval);
+ do {
+ if (test_cache)
+ alloc = kmem_cache_alloc(test_cache, gfp);
+ else
+ alloc = kmalloc(size, gfp);
+
+ if (is_kfence_address(alloc)) {
+ struct slab *slab = virt_to_slab(alloc);
+ struct kmem_cache *s = test_cache ?:
+ kmalloc_caches[kmalloc_type(GFP_KERNEL)][__kmalloc_index(size, false)];
+
+ /*
+ * Verify that various helpers return the right values
+ * even for KFENCE objects; these are required so that
+ * memcg accounting works correctly.
+ */
+ KUNIT_EXPECT_EQ(test, obj_to_index(s, slab, alloc), 0U);
+ KUNIT_EXPECT_EQ(test, objs_per_slab(s, slab), 1);
+
+ if (policy == ALLOCATE_ANY)
+ return alloc;
+ if (policy == ALLOCATE_LEFT && PAGE_ALIGNED(alloc))
+ return alloc;
+ if (policy == ALLOCATE_RIGHT && !PAGE_ALIGNED(alloc))
+ return alloc;
+ } else if (policy == ALLOCATE_NONE)
+ return alloc;
+
+ test_free(alloc);
+
+ if (time_after(jiffies, resched_after))
+ cond_resched();
+ } while (time_before(jiffies, timeout));
+
+ KUNIT_ASSERT_TRUE_MSG(test, false, "failed to allocate from KFENCE");
+ return NULL; /* Unreachable. */
+}
+
+static void test_out_of_bounds_read(struct kunit *test)
+{
+ size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_OOB,
+ .fn = test_out_of_bounds_read,
+ .is_write = false,
+ };
+ char *buf;
+
+ setup_test_cache(test, size, 0, NULL);
+
+ /*
+ * If we don't have our own cache, adjust based on alignment, so that we
+ * actually access guard pages on either side.
+ */
+ if (!test_cache)
+ size = kmalloc_cache_alignment(size);
+
+ /* Test both sides. */
+
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT);
+ expect.addr = buf - 1;
+ READ_ONCE(*expect.addr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+ test_free(buf);
+
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
+ expect.addr = buf + size;
+ READ_ONCE(*expect.addr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+ test_free(buf);
+}
+
+static void test_out_of_bounds_write(struct kunit *test)
+{
+ size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_OOB,
+ .fn = test_out_of_bounds_write,
+ .is_write = true,
+ };
+ char *buf;
+
+ setup_test_cache(test, size, 0, NULL);
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT);
+ expect.addr = buf - 1;
+ WRITE_ONCE(*expect.addr, 42);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+ test_free(buf);
+}
+
+static void test_use_after_free_read(struct kunit *test)
+{
+ const size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_UAF,
+ .fn = test_use_after_free_read,
+ .is_write = false,
+ };
+
+ setup_test_cache(test, size, 0, NULL);
+ expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ test_free(expect.addr);
+ READ_ONCE(*expect.addr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static void test_double_free(struct kunit *test)
+{
+ const size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_INVALID_FREE,
+ .fn = test_double_free,
+ };
+
+ setup_test_cache(test, size, 0, NULL);
+ expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ test_free(expect.addr);
+ test_free(expect.addr); /* Double-free. */
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static void test_invalid_addr_free(struct kunit *test)
+{
+ const size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_INVALID_FREE,
+ .fn = test_invalid_addr_free,
+ };
+ char *buf;
+
+ setup_test_cache(test, size, 0, NULL);
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ expect.addr = buf + 1; /* Free on invalid address. */
+ test_free(expect.addr); /* Invalid address free. */
+ test_free(buf); /* No error. */
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static void test_corruption(struct kunit *test)
+{
+ size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_CORRUPTION,
+ .fn = test_corruption,
+ };
+ char *buf;
+
+ setup_test_cache(test, size, 0, NULL);
+
+ /* Test both sides. */
+
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT);
+ expect.addr = buf + size;
+ WRITE_ONCE(*expect.addr, 42);
+ test_free(buf);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
+ expect.addr = buf - 1;
+ WRITE_ONCE(*expect.addr, 42);
+ test_free(buf);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * KFENCE is unable to detect an OOB if the allocation's alignment requirements
+ * leave a gap between the object and the guard page. Specifically, an
+ * allocation of e.g. 73 bytes is aligned on 8 and 128 bytes for SLUB or SLAB
+ * respectively. Therefore it is impossible for the allocated object to
+ * contiguously line up with the right guard page.
+ *
+ * However, we test that an access to memory beyond the gap results in KFENCE
+ * detecting an OOB access.
+ */
+static void test_kmalloc_aligned_oob_read(struct kunit *test)
+{
+ const size_t size = 73;
+ const size_t align = kmalloc_cache_alignment(size);
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_OOB,
+ .fn = test_kmalloc_aligned_oob_read,
+ .is_write = false,
+ };
+ char *buf;
+
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
+
+ /*
+ * The object is offset to the right, so there won't be an OOB to the
+ * left of it.
+ */
+ READ_ONCE(*(buf - 1));
+ KUNIT_EXPECT_FALSE(test, report_available());
+
+ /*
+ * @buf must be aligned on @align, therefore buf + size belongs to the
+ * same page -> no OOB.
+ */
+ READ_ONCE(*(buf + size));
+ KUNIT_EXPECT_FALSE(test, report_available());
+
+ /* Overflowing by @align bytes will result in an OOB. */
+ expect.addr = buf + size + align;
+ READ_ONCE(*expect.addr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+
+ test_free(buf);
+}
+
+static void test_kmalloc_aligned_oob_write(struct kunit *test)
+{
+ const size_t size = 73;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_CORRUPTION,
+ .fn = test_kmalloc_aligned_oob_write,
+ };
+ char *buf;
+
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
+ /*
+ * The object is offset to the right, so we won't get a page
+ * fault immediately after it.
+ */
+ expect.addr = buf + size;
+ WRITE_ONCE(*expect.addr, READ_ONCE(*expect.addr) + 1);
+ KUNIT_EXPECT_FALSE(test, report_available());
+ test_free(buf);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test cache shrinking and destroying with KFENCE. */
+static void test_shrink_memcache(struct kunit *test)
+{
+ const size_t size = 32;
+ void *buf;
+
+ setup_test_cache(test, size, 0, NULL);
+ KUNIT_EXPECT_TRUE(test, test_cache);
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ kmem_cache_shrink(test_cache);
+ test_free(buf);
+
+ KUNIT_EXPECT_FALSE(test, report_available());
+}
+
+static void ctor_set_x(void *obj)
+{
+ /* Every object has at least 8 bytes. */
+ memset(obj, 'x', 8);
+}
+
+/* Ensure that SL*B does not modify KFENCE objects on bulk free. */
+static void test_free_bulk(struct kunit *test)
+{
+ int iter;
+
+ for (iter = 0; iter < 5; iter++) {
+ const size_t size = setup_test_cache(test, get_random_u32_inclusive(8, 307),
+ 0, (iter & 1) ? ctor_set_x : NULL);
+ void *objects[] = {
+ test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT),
+ test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE),
+ test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT),
+ test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE),
+ test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE),
+ };
+
+ kmem_cache_free_bulk(test_cache, ARRAY_SIZE(objects), objects);
+ KUNIT_ASSERT_FALSE(test, report_available());
+ test_cache_destroy();
+ }
+}
+
+/* Test init-on-free works. */
+static void test_init_on_free(struct kunit *test)
+{
+ const size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_UAF,
+ .fn = test_init_on_free,
+ .is_write = false,
+ };
+ int i;
+
+ KFENCE_TEST_REQUIRES(test, IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON));
+ /* Assume it hasn't been disabled on command line. */
+
+ setup_test_cache(test, size, 0, NULL);
+ expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ for (i = 0; i < size; i++)
+ expect.addr[i] = i + 1;
+ test_free(expect.addr);
+
+ for (i = 0; i < size; i++) {
+ /*
+ * This may fail if the page was recycled by KFENCE and then
+ * written to again -- this, however, is near impossible with a
+ * default config.
+ */
+ KUNIT_EXPECT_EQ(test, expect.addr[i], (char)0);
+
+ if (!i) /* Only check the first access so the test doesn't fail if the page is ever re-protected. */
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+ }
+}
+
+/* Ensure that constructors work properly. */
+static void test_memcache_ctor(struct kunit *test)
+{
+ const size_t size = 32;
+ char *buf;
+ int i;
+
+ setup_test_cache(test, size, 0, ctor_set_x);
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+
+ for (i = 0; i < 8; i++)
+ KUNIT_EXPECT_EQ(test, buf[i], (char)'x');
+
+ test_free(buf);
+
+ KUNIT_EXPECT_FALSE(test, report_available());
+}
+
+/* Test that memory is zeroed if requested. */
+static void test_gfpzero(struct kunit *test)
+{
+ const size_t size = PAGE_SIZE; /* PAGE_SIZE so we can use ALLOCATE_ANY. */
+ char *buf1, *buf2;
+ int i;
+
+ /* Skip if we think it'd take too long. */
+ KFENCE_TEST_REQUIRES(test, kfence_sample_interval <= 100);
+
+ setup_test_cache(test, size, 0, NULL);
+ buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ for (i = 0; i < size; i++)
+ buf1[i] = i + 1;
+ test_free(buf1);
+
+ /* Try to get same address again -- this can take a while. */
+ for (i = 0;; i++) {
+ buf2 = test_alloc(test, size, GFP_KERNEL | __GFP_ZERO, ALLOCATE_ANY);
+ if (buf1 == buf2)
+ break;
+ test_free(buf2);
+
+ if (kthread_should_stop() || (i == CONFIG_KFENCE_NUM_OBJECTS)) {
+ kunit_warn(test, "giving up ... cannot get same object back\n");
+ return;
+ }
+ cond_resched();
+ }
+
+ for (i = 0; i < size; i++)
+ KUNIT_EXPECT_EQ(test, buf2[i], (char)0);
+
+ test_free(buf2);
+
+ KUNIT_EXPECT_FALSE(test, report_available());
+}
+
+static void test_invalid_access(struct kunit *test)
+{
+ const struct expect_report expect = {
+ .type = KFENCE_ERROR_INVALID,
+ .fn = test_invalid_access,
+ .addr = &__kfence_pool[10],
+ .is_write = false,
+ };
+
+ READ_ONCE(__kfence_pool[10]);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test SLAB_TYPESAFE_BY_RCU works. */
+static void test_memcache_typesafe_by_rcu(struct kunit *test)
+{
+ const size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_UAF,
+ .fn = test_memcache_typesafe_by_rcu,
+ .is_write = false,
+ };
+
+ setup_test_cache(test, size, SLAB_TYPESAFE_BY_RCU, NULL);
+ KUNIT_EXPECT_TRUE(test, test_cache); /* Want memcache. */
+
+ expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ *expect.addr = 42;
+
+ rcu_read_lock();
+ test_free(expect.addr);
+ KUNIT_EXPECT_EQ(test, *expect.addr, (char)42);
+ /*
+ * Up to this point, memory should not have been freed yet, and
+ * therefore there should be no KFENCE report from the above access.
+ */
+ rcu_read_unlock();
+
+ /* Above access to @expect.addr should not have generated a report! */
+ KUNIT_EXPECT_FALSE(test, report_available());
+
+ /* Only after rcu_barrier() is the memory guaranteed to be freed. */
+ rcu_barrier();
+
+ /* Expect use-after-free. */
+ KUNIT_EXPECT_EQ(test, *expect.addr, (char)42);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test krealloc(). */
+static void test_krealloc(struct kunit *test)
+{
+ const size_t size = 32;
+ const struct expect_report expect = {
+ .type = KFENCE_ERROR_UAF,
+ .fn = test_krealloc,
+ .addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY),
+ .is_write = false,
+ };
+ char *buf = expect.addr;
+ int i;
+
+ KUNIT_EXPECT_FALSE(test, test_cache);
+ KUNIT_EXPECT_EQ(test, ksize(buf), size); /* Precise size match after KFENCE alloc. */
+ for (i = 0; i < size; i++)
+ buf[i] = i + 1;
+
+ /* Check that we successfully change the size. */
+ buf = krealloc(buf, size * 3, GFP_KERNEL); /* Grow. */
+ /* Note: Might no longer be a KFENCE alloc. */
+ KUNIT_EXPECT_GE(test, ksize(buf), size * 3);
+ for (i = 0; i < size; i++)
+ KUNIT_EXPECT_EQ(test, buf[i], (char)(i + 1));
+ for (; i < size * 3; i++) /* Fill to extra bytes. */
+ buf[i] = i + 1;
+
+ buf = krealloc(buf, size * 2, GFP_KERNEL); /* Shrink. */
+ KUNIT_EXPECT_GE(test, ksize(buf), size * 2);
+ for (i = 0; i < size * 2; i++)
+ KUNIT_EXPECT_EQ(test, buf[i], (char)(i + 1));
+
+ buf = krealloc(buf, 0, GFP_KERNEL); /* Free. */
+ KUNIT_EXPECT_EQ(test, (unsigned long)buf, (unsigned long)ZERO_SIZE_PTR);
+ KUNIT_ASSERT_FALSE(test, report_available()); /* No reports yet! */
+
+ READ_ONCE(*expect.addr); /* Ensure krealloc() actually freed earlier KFENCE object. */
+ KUNIT_ASSERT_TRUE(test, report_matches(&expect));
+}
+
+/* Test that some objects from a bulk allocation belong to KFENCE pool. */
+static void test_memcache_alloc_bulk(struct kunit *test)
+{
+ const size_t size = 32;
+ bool pass = false;
+ unsigned long timeout;
+
+ setup_test_cache(test, size, 0, NULL);
+ KUNIT_EXPECT_TRUE(test, test_cache); /* Want memcache. */
+ /*
+ * 100x the sample interval should be more than enough to ensure we get
+ * a KFENCE allocation eventually.
+ */
+ timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval);
+ do {
+ void *objects[100];
+ int i, num = kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, ARRAY_SIZE(objects),
+ objects);
+ if (!num)
+ continue;
+ for (i = 0; i < ARRAY_SIZE(objects); i++) {
+ if (is_kfence_address(objects[i])) {
+ pass = true;
+ break;
+ }
+ }
+ kmem_cache_free_bulk(test_cache, num, objects);
+ /*
+ * kmem_cache_alloc_bulk() disables interrupts, and calling it
+ * in a tight loop may not give KFENCE a chance to switch the
+ * static branch. Call cond_resched() to let KFENCE chime in.
+ */
+ cond_resched();
+ } while (!pass && time_before(jiffies, timeout));
+
+ KUNIT_EXPECT_TRUE(test, pass);
+ KUNIT_EXPECT_FALSE(test, report_available());
+}
+
+/*
+ * KUnit does not provide a way to provide arguments to tests, and we encode
+ * additional info in the name. Set up 2 tests per test case, one using the
+ * default allocator, and another using a custom memcache (suffix '-memcache').
+ */
+#define KFENCE_KUNIT_CASE(test_name) \
+ { .run_case = test_name, .name = #test_name }, \
+ { .run_case = test_name, .name = #test_name "-memcache" }
+
+static struct kunit_case kfence_test_cases[] = {
+ KFENCE_KUNIT_CASE(test_out_of_bounds_read),
+ KFENCE_KUNIT_CASE(test_out_of_bounds_write),
+ KFENCE_KUNIT_CASE(test_use_after_free_read),
+ KFENCE_KUNIT_CASE(test_double_free),
+ KFENCE_KUNIT_CASE(test_invalid_addr_free),
+ KFENCE_KUNIT_CASE(test_corruption),
+ KFENCE_KUNIT_CASE(test_free_bulk),
+ KFENCE_KUNIT_CASE(test_init_on_free),
+ KUNIT_CASE(test_kmalloc_aligned_oob_read),
+ KUNIT_CASE(test_kmalloc_aligned_oob_write),
+ KUNIT_CASE(test_shrink_memcache),
+ KUNIT_CASE(test_memcache_ctor),
+ KUNIT_CASE(test_invalid_access),
+ KUNIT_CASE(test_gfpzero),
+ KUNIT_CASE(test_memcache_typesafe_by_rcu),
+ KUNIT_CASE(test_krealloc),
+ KUNIT_CASE(test_memcache_alloc_bulk),
+ {},
+};
+
+/* ===== End test cases ===== */
+
+static int test_init(struct kunit *test)
+{
+ unsigned long flags;
+ int i;
+
+ if (!__kfence_pool)
+ return -EINVAL;
+
+ spin_lock_irqsave(&observed.lock, flags);
+ for (i = 0; i < ARRAY_SIZE(observed.lines); i++)
+ observed.lines[i][0] = '\0';
+ observed.nlines = 0;
+ spin_unlock_irqrestore(&observed.lock, flags);
+
+ /* Any test with 'memcache' in its name will want a memcache. */
+ if (strstr(test->name, "memcache"))
+ test->priv = TEST_PRIV_WANT_MEMCACHE;
+ else
+ test->priv = NULL;
+
+ return 0;
+}
+
+static void test_exit(struct kunit *test)
+{
+ test_cache_destroy();
+}
+
+static int kfence_suite_init(struct kunit_suite *suite)
+{
+ register_trace_console(probe_console, NULL);
+ return 0;
+}
+
+static void kfence_suite_exit(struct kunit_suite *suite)
+{
+ unregister_trace_console(probe_console, NULL);
+ tracepoint_synchronize_unregister();
+}
+
+static struct kunit_suite kfence_test_suite = {
+ .name = "kfence",
+ .test_cases = kfence_test_cases,
+ .init = test_init,
+ .exit = test_exit,
+ .suite_init = kfence_suite_init,
+ .suite_exit = kfence_suite_exit,
+};
+
+kunit_test_suites(&kfence_test_suite);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Alexander Potapenko <glider@google.com>, Marco Elver <elver@google.com>");
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
new file mode 100644
index 000000000000..197430a5be4a
--- /dev/null
+++ b/mm/kfence/report.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KFENCE reporting.
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#include <linux/stdarg.h>
+
+#include <linux/kernel.h>
+#include <linux/lockdep.h>
+#include <linux/math.h>
+#include <linux/printk.h>
+#include <linux/sched/debug.h>
+#include <linux/seq_file.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <trace/events/error_report.h>
+
+#include <asm/kfence.h>
+
+#include "kfence.h"
+
+/* May be overridden by <asm/kfence.h>. */
+#ifndef ARCH_FUNC_PREFIX
+#define ARCH_FUNC_PREFIX ""
+#endif
+
+extern bool no_hash_pointers;
+
+/* Helper function to either print to a seq_file or to console. */
+__printf(2, 3)
+static void seq_con_printf(struct seq_file *seq, const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ if (seq)
+ seq_vprintf(seq, fmt, args);
+ else
+ vprintk(fmt, args);
+ va_end(args);
+}
+
+/*
+ * Get the number of stack entries to skip to get out of MM internals. @type is
+ * optional, and if set to NULL, assumes an allocation or free stack.
+ */
+static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries,
+ const enum kfence_error_type *type)
+{
+ char buf[64];
+ int skipnr, fallback = 0;
+
+ if (type) {
+ /* Depending on error type, find different stack entries. */
+ switch (*type) {
+ case KFENCE_ERROR_UAF:
+ case KFENCE_ERROR_OOB:
+ case KFENCE_ERROR_INVALID:
+ /*
+ * kfence_handle_page_fault() may be called with pt_regs
+ * set to NULL; in that case we'll simply show the full
+ * stack trace.
+ */
+ return 0;
+ case KFENCE_ERROR_CORRUPTION:
+ case KFENCE_ERROR_INVALID_FREE:
+ break;
+ }
+ }
+
+ for (skipnr = 0; skipnr < num_entries; skipnr++) {
+ int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]);
+
+ if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfence_") ||
+ str_has_prefix(buf, ARCH_FUNC_PREFIX "__kfence_") ||
+ str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmem_cache_free") ||
+ !strncmp(buf, ARCH_FUNC_PREFIX "__slab_free", len)) {
+ /*
+ * Remember a fallback in case tail calls from any of the
+ * entry points below into any of the functions above were
+ * optimized by the compiler, such that the stack trace
+ * omits the initial entry point.
+ */
+ fallback = skipnr + 1;
+ }
+
+ /*
+ * The below list should only include the initial entry points
+ * into the slab allocators. Includes the *_bulk() variants by
+ * checking prefixes.
+ */
+ if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") ||
+ str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") ||
+ str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") ||
+ str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_alloc"))
+ goto found;
+ }
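+ /* No entry point into the allocator found: fall back to just past the last internal frame. */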
+ if (fallback < num_entries)
+ return fallback;
+found:
+ skipnr++;
+ return skipnr < num_entries ? skipnr : 0;
+}
+
+static void kfence_print_stack(struct seq_file *seq, const struct kfence_metadata *meta,
+ bool show_alloc)
+{
+ const struct kfence_track *track = show_alloc ? &meta->alloc_track : &meta->free_track;
+ u64 ts_sec = track->ts_nsec;
+ unsigned long rem_nsec = do_div(ts_sec, NSEC_PER_SEC);
+
+ /* Timestamp matches printk timestamp format. */
+ seq_con_printf(seq, "%s by task %d on cpu %d at %lu.%06lus:\n",
+ show_alloc ? "allocated" : "freed", track->pid,
+ track->cpu, (unsigned long)ts_sec, rem_nsec / 1000);
+
+ if (track->num_stack_entries) {
+ /* Skip allocation/free internals stack. */
+ int i = get_stack_skipnr(track->stack_entries, track->num_stack_entries, NULL);
+
+ /* stack_trace_seq_print() does not exist; open code our own. */
+ for (; i < track->num_stack_entries; i++)
+ seq_con_printf(seq, " %pS\n", (void *)track->stack_entries[i]);
+ } else {
+ seq_con_printf(seq, " no %s stack\n", show_alloc ? "allocation" : "deallocation");
+ }
+}
+
+void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta)
+{
+ const int size = abs(meta->size);
+ const unsigned long start = meta->addr;
+ const struct kmem_cache *const cache = meta->cache;
+
+ lockdep_assert_held(&meta->lock);
+
+ if (meta->state == KFENCE_OBJECT_UNUSED) {
+ seq_con_printf(seq, "kfence-#%td unused\n", meta - kfence_metadata);
+ return;
+ }
+
+ seq_con_printf(seq, "kfence-#%td: 0x%p-0x%p, size=%d, cache=%s\n\n",
+ meta - kfence_metadata, (void *)start, (void *)(start + size - 1),
+ size, (cache && cache->name) ? cache->name : "<destroyed>");
+
+ kfence_print_stack(seq, meta, true);
+
+ if (meta->state == KFENCE_OBJECT_FREED) {
+ seq_con_printf(seq, "\n");
+ kfence_print_stack(seq, meta, false);
+ }
+}
+
+/*
+ * Show bytes at @address that differ from the expected canary values, up to
+ * @bytes_to_show.
+ */
+static void print_diff_canary(unsigned long address, size_t bytes_to_show,
+ const struct kfence_metadata *meta)
+{
+ const unsigned long show_until_addr = address + bytes_to_show;
+ const u8 *cur, *end;
+
+ /* Do not show the contents of the object, nor read into the following guard page. */
+ end = (const u8 *)(address < meta->addr ? min(show_until_addr, meta->addr)
+ : min(show_until_addr, PAGE_ALIGN(address)));
+
+ pr_cont("[");
+ for (cur = (const u8 *)address; cur < end; cur++) {
+ if (*cur == KFENCE_CANARY_PATTERN_U8(cur))
+ pr_cont(" .");
+ else if (no_hash_pointers)
+ pr_cont(" 0x%02x", *cur);
+ else /* Do not leak kernel memory in non-debug builds. */
+ pr_cont(" !");
+ }
+ pr_cont(" ]");
+}
+
+static const char *get_access_type(bool is_write)
+{
+ return is_write ? "write" : "read";
+}
+
+void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs,
+ const struct kfence_metadata *meta, enum kfence_error_type type)
+{
+ unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 };
+ const ptrdiff_t object_index = meta ? meta - kfence_metadata : -1;
+ int num_stack_entries;
+ int skipnr = 0;
+
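+ /* With regs, the trace starts at the faulting PC, so no frames need to be skipped. */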
+ if (regs) {
+ num_stack_entries = stack_trace_save_regs(regs, stack_entries, KFENCE_STACK_DEPTH, 0);
+ } else {
+ num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 1);
+ skipnr = get_stack_skipnr(stack_entries, num_stack_entries, &type);
+ }
+
+ /* Require non-NULL meta, except if KFENCE_ERROR_INVALID. */
+ if (WARN_ON(type != KFENCE_ERROR_INVALID && !meta))
+ return;
+
+ if (meta)
+ lockdep_assert_held(&meta->lock);
+ /*
+ * Because we may generate reports in printk-unfriendly parts of the
+ * kernel, such as scheduler code, the use of printk() could deadlock.
+ * Until such time that all printing code here is safe in all parts of
+ * the kernel, accept the risk, and just get our message out (given the
+ * system might already behave unpredictably due to the memory error).
+ * As such, also disable lockdep to hide warnings, and avoid disabling
+ * lockdep for the rest of the kernel.
+ */
+ lockdep_off();
+
+ pr_err("==================================================================\n");
+ /* Print report header. */
+ switch (type) {
+ case KFENCE_ERROR_OOB: {
+ const bool left_of_object = address < meta->addr;
+
+ pr_err("BUG: KFENCE: out-of-bounds %s in %pS\n\n", get_access_type(is_write),
+ (void *)stack_entries[skipnr]);
+ pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%td):\n",
+ get_access_type(is_write), (void *)address,
+ left_of_object ? meta->addr - address : address - meta->addr,
+ left_of_object ? "left" : "right", object_index);
+ break;
+ }
+ case KFENCE_ERROR_UAF:
+ pr_err("BUG: KFENCE: use-after-free %s in %pS\n\n", get_access_type(is_write),
+ (void *)stack_entries[skipnr]);
+ pr_err("Use-after-free %s at 0x%p (in kfence-#%td):\n",
+ get_access_type(is_write), (void *)address, object_index);
+ break;
+ case KFENCE_ERROR_CORRUPTION:
+ pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]);
+ pr_err("Corrupted memory at 0x%p ", (void *)address);
+ print_diff_canary(address, 16, meta);
+ pr_cont(" (in kfence-#%td):\n", object_index);
+ break;
+ case KFENCE_ERROR_INVALID:
+ pr_err("BUG: KFENCE: invalid %s in %pS\n\n", get_access_type(is_write),
+ (void *)stack_entries[skipnr]);
+ pr_err("Invalid %s at 0x%p:\n", get_access_type(is_write),
+ (void *)address);
+ break;
+ case KFENCE_ERROR_INVALID_FREE:
+ pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]);
+ pr_err("Invalid free of 0x%p (in kfence-#%td):\n", (void *)address,
+ object_index);
+ break;
+ }
+
+ /* Print stack trace and object info. */
+ stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, 0);
+
+ if (meta) {
+ pr_err("\n");
+ kfence_print_object(NULL, meta);
+ }
+
+ /* Print report footer. */
+ pr_err("\n");
+ if (no_hash_pointers && regs)
+ show_regs(regs);
+ else
+ dump_stack_print_info(KERN_ERR);
+ trace_error_report_end(ERROR_DETECTOR_KFENCE, address);
+ pr_err("==================================================================\n");
+
+ lockdep_on();
+
+ check_panic_on_warn("KFENCE");
+
+ /* We encountered a memory safety error, taint the kernel! */
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK);
+}
+
+#ifdef CONFIG_PRINTK
+static void kfence_to_kp_stack(const struct kfence_track *track, void **kp_stack)
+{
+ int i, j;
+
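+ /* Skip allocator-internal frames, copy up to KS_ADDRS_COUNT entries, and NULL-terminate if short. */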
+ i = get_stack_skipnr(track->stack_entries, track->num_stack_entries, NULL);
+ for (j = 0; i < track->num_stack_entries && j < KS_ADDRS_COUNT; ++i, ++j)
+ kp_stack[j] = (void *)track->stack_entries[i];
+ if (j < KS_ADDRS_COUNT)
+ kp_stack[j] = NULL;
+}
+
+bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
+{
+ struct kfence_metadata *meta = addr_to_metadata((unsigned long)object);
+ unsigned long flags;
+
+ if (!meta)
+ return false;
+
+ /*
+ * If state is UNUSED at least show the pointer requested; the rest
+ * would be garbage data.
+ */
+ kpp->kp_ptr = object;
+
+ /* Requesting info on a never-used object is almost certainly a bug. */
+ if (WARN_ON(meta->state == KFENCE_OBJECT_UNUSED))
+ return true;
+
+ raw_spin_lock_irqsave(&meta->lock, flags);
+
+ kpp->kp_slab = slab;
+ kpp->kp_slab_cache = meta->cache;
+ kpp->kp_objp = (void *)meta->addr;
+ kfence_to_kp_stack(&meta->alloc_track, kpp->kp_stack);
+ if (meta->state == KFENCE_OBJECT_FREED)
+ kfence_to_kp_stack(&meta->free_track, kpp->kp_free_stack);
+ /* get_stack_skipnr() ensures the first entry is outside the allocator. */
+ kpp->kp_ret = kpp->kp_stack[0];
+
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+ return true;
+}
+#endif
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 58b0d9c502a1..78c8d5d8b628 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -16,22 +16,27 @@
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
+#include <linux/page_table_check.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
+#include "mm_slot.h"
enum scan_result {
SCAN_FAIL,
SCAN_SUCCEED,
SCAN_PMD_NULL,
+ SCAN_PMD_NONE,
+ SCAN_PMD_MAPPED,
SCAN_EXCEED_NONE_PTE,
SCAN_EXCEED_SWAP_PTE,
SCAN_EXCEED_SHARED_PTE,
SCAN_PTE_NON_PRESENT,
SCAN_PTE_UFFD_WP,
+ SCAN_PTE_MAPPED_HUGEPAGE,
SCAN_PAGE_RO,
SCAN_LACK_REFERENCED_PAGE,
SCAN_PAGE_NULL,
@@ -45,12 +50,14 @@ enum scan_result {
SCAN_VMA_NULL,
SCAN_VMA_CHECK,
SCAN_ADDRESS_RANGE,
- SCAN_SWAP_CACHE_PAGE,
SCAN_DEL_PAGE_LRU,
SCAN_ALLOC_HUGE_PAGE_FAIL,
SCAN_CGROUP_CHARGE_FAIL,
SCAN_TRUNCATED,
SCAN_PAGE_HAS_PRIVATE,
+ SCAN_STORE_FAILED,
+ SCAN_COPY_MC,
+ SCAN_PAGE_FILLED,
};
#define CREATE_TRACE_POINTS
@@ -73,28 +80,38 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
* default collapse hugepages if there is at least one pte mapped like
* it would have happened if the vma was large enough during page
* fault.
+ *
+ * Note that these are only respected if collapse was initiated by khugepaged.
*/
static unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;
#define MM_SLOTS_HASH_BITS 10
-static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
+static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
static struct kmem_cache *mm_slot_cache __read_mostly;
#define MAX_PTE_MAPPED_THP 8
+struct collapse_control {
+ bool is_khugepaged;
+
+ /* Num pages scanned per node */
+ u32 node_load[MAX_NUMNODES];
+
+ /* nodemask for allocation fallback */
+ nodemask_t alloc_nmask;
+};
+
/**
- * struct mm_slot - hash lookup from mm to mm_slot
- * @hash: hash collision list
- * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
- * @mm: the mm that this information is valid for
+ * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
+ * @slot: hash lookup from mm to mm_slot
+ * @nr_pte_mapped_thp: number of pte-mapped THPs
+ * @pte_mapped_thp: array of addresses of the corresponding pte-mapped THPs
*/
-struct mm_slot {
- struct hlist_node hash;
- struct list_head mm_node;
- struct mm_struct *mm;
+struct khugepaged_mm_slot {
+ struct mm_slot slot;
/* pte-mapped THP in this mm */
int nr_pte_mapped_thp;
@@ -111,7 +128,7 @@ struct mm_slot {
*/
struct khugepaged_scan {
struct list_head mm_head;
- struct mm_slot *mm_slot;
+ struct khugepaged_mm_slot *mm_slot;
unsigned long address;
};
@@ -124,18 +141,18 @@ static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
+ return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
}
static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- unsigned long msecs;
+ unsigned int msecs;
int err;
- err = kstrtoul(buf, 10, &msecs);
- if (err || msecs > UINT_MAX)
+ err = kstrtouint(buf, 10, &msecs);
+ if (err)
return -EINVAL;
khugepaged_scan_sleep_millisecs = msecs;
@@ -145,25 +162,24 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
return count;
}
static struct kobj_attribute scan_sleep_millisecs_attr =
- __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
- scan_sleep_millisecs_store);
+ __ATTR_RW(scan_sleep_millisecs);
static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
+ return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
}
static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- unsigned long msecs;
+ unsigned int msecs;
int err;
- err = kstrtoul(buf, 10, &msecs);
- if (err || msecs > UINT_MAX)
+ err = kstrtouint(buf, 10, &msecs);
+ if (err)
return -EINVAL;
khugepaged_alloc_sleep_millisecs = msecs;
@@ -173,24 +189,23 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
return count;
}
static struct kobj_attribute alloc_sleep_millisecs_attr =
- __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
- alloc_sleep_millisecs_store);
+ __ATTR_RW(alloc_sleep_millisecs);
static ssize_t pages_to_scan_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
+ return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan);
}
static ssize_t pages_to_scan_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
+ unsigned int pages;
int err;
- unsigned long pages;
- err = kstrtoul(buf, 10, &pages);
- if (err || !pages || pages > UINT_MAX)
+ err = kstrtouint(buf, 10, &pages);
+ if (err || !pages)
return -EINVAL;
khugepaged_pages_to_scan = pages;
@@ -198,14 +213,13 @@ static ssize_t pages_to_scan_store(struct kobject *kobj,
return count;
}
static struct kobj_attribute pages_to_scan_attr =
- __ATTR(pages_to_scan, 0644, pages_to_scan_show,
- pages_to_scan_store);
+ __ATTR_RW(pages_to_scan);
static ssize_t pages_collapsed_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
+ return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed);
}
static struct kobj_attribute pages_collapsed_attr =
__ATTR_RO(pages_collapsed);
@@ -214,27 +228,26 @@ static ssize_t full_scans_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%u\n", khugepaged_full_scans);
+ return sysfs_emit(buf, "%u\n", khugepaged_full_scans);
}
static struct kobj_attribute full_scans_attr =
__ATTR_RO(full_scans);
-static ssize_t khugepaged_defrag_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t defrag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
{
return single_hugepage_flag_show(kobj, attr, buf,
- TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+ TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
-static ssize_t khugepaged_defrag_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
+static ssize_t defrag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
{
return single_hugepage_flag_store(kobj, attr, buf, count,
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static struct kobj_attribute khugepaged_defrag_attr =
- __ATTR(defrag, 0644, khugepaged_defrag_show,
- khugepaged_defrag_store);
+ __ATTR_RW(defrag);
/*
* max_ptes_none controls if khugepaged should collapse hugepages over
@@ -244,21 +257,21 @@ static struct kobj_attribute khugepaged_defrag_attr =
* runs. Increasing max_ptes_none will instead potentially reduce the
* free memory in the system during the khugepaged scan.
*/
-static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
- struct kobj_attribute *attr,
- char *buf)
+static ssize_t max_ptes_none_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
{
- return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
+ return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none);
}
-static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
+static ssize_t max_ptes_none_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
{
int err;
unsigned long max_ptes_none;
err = kstrtoul(buf, 10, &max_ptes_none);
- if (err || max_ptes_none > HPAGE_PMD_NR-1)
+ if (err || max_ptes_none > HPAGE_PMD_NR - 1)
return -EINVAL;
khugepaged_max_ptes_none = max_ptes_none;
@@ -266,25 +279,24 @@ static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
return count;
}
static struct kobj_attribute khugepaged_max_ptes_none_attr =
- __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
- khugepaged_max_ptes_none_store);
+ __ATTR_RW(max_ptes_none);
-static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
- struct kobj_attribute *attr,
- char *buf)
+static ssize_t max_ptes_swap_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
{
- return sprintf(buf, "%u\n", khugepaged_max_ptes_swap);
+ return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap);
}
-static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
+static ssize_t max_ptes_swap_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
{
int err;
unsigned long max_ptes_swap;
err = kstrtoul(buf, 10, &max_ptes_swap);
- if (err || max_ptes_swap > HPAGE_PMD_NR-1)
+ if (err || max_ptes_swap > HPAGE_PMD_NR - 1)
return -EINVAL;
khugepaged_max_ptes_swap = max_ptes_swap;
@@ -293,25 +305,24 @@ static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
}
static struct kobj_attribute khugepaged_max_ptes_swap_attr =
- __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
- khugepaged_max_ptes_swap_store);
+ __ATTR_RW(max_ptes_swap);
-static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj,
- struct kobj_attribute *attr,
- char *buf)
+static ssize_t max_ptes_shared_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
{
- return sprintf(buf, "%u\n", khugepaged_max_ptes_shared);
+ return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared);
}
-static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
+static ssize_t max_ptes_shared_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
{
int err;
unsigned long max_ptes_shared;
err = kstrtoul(buf, 10, &max_ptes_shared);
- if (err || max_ptes_shared > HPAGE_PMD_NR-1)
+ if (err || max_ptes_shared > HPAGE_PMD_NR - 1)
return -EINVAL;
khugepaged_max_ptes_shared = max_ptes_shared;
@@ -320,8 +331,7 @@ static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
}
static struct kobj_attribute khugepaged_max_ptes_shared_attr =
- __ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show,
- khugepaged_max_ptes_shared_store);
+ __ATTR_RW(max_ptes_shared);
static struct attribute *khugepaged_attr[] = {
&khugepaged_defrag_attr.attr,
@@ -363,9 +373,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
* register it here without waiting a page fault that
* may not happen any time soon.
*/
- if (!(*vm_flags & VM_NO_KHUGEPAGED) &&
- khugepaged_enter_vma_merge(vma, *vm_flags))
- return -ENOMEM;
+ khugepaged_enter_vma(vma, *vm_flags);
break;
case MADV_NOHUGEPAGE:
*vm_flags &= ~VM_HUGEPAGE;
@@ -384,8 +392,9 @@ int hugepage_madvise(struct vm_area_struct *vma,
int __init khugepaged_init(void)
{
mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
- sizeof(struct mm_slot),
- __alignof__(struct mm_slot), 0, NULL);
+ sizeof(struct khugepaged_mm_slot),
+ __alignof__(struct khugepaged_mm_slot),
+ 0, NULL);
if (!mm_slot_cache)
return -ENOMEM;
@@ -402,174 +411,123 @@ void __init khugepaged_destroy(void)
kmem_cache_destroy(mm_slot_cache);
}
-static inline struct mm_slot *alloc_mm_slot(void)
+static inline int hpage_collapse_test_exit(struct mm_struct *mm)
{
- if (!mm_slot_cache) /* initialization failed */
- return NULL;
- return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
-}
-
-static inline void free_mm_slot(struct mm_slot *mm_slot)
-{
- kmem_cache_free(mm_slot_cache, mm_slot);
-}
-
-static struct mm_slot *get_mm_slot(struct mm_struct *mm)
-{
- struct mm_slot *mm_slot;
-
- hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
- if (mm == mm_slot->mm)
- return mm_slot;
-
- return NULL;
-}
-
-static void insert_to_mm_slots_hash(struct mm_struct *mm,
- struct mm_slot *mm_slot)
-{
- mm_slot->mm = mm;
- hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
-}
-
-static inline int khugepaged_test_exit(struct mm_struct *mm)
-{
- return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm);
-}
-
-static bool hugepage_vma_check(struct vm_area_struct *vma,
- unsigned long vm_flags)
-{
- if ((!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
- (vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
- return false;
-
- if (shmem_file(vma->vm_file) ||
- (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
- vma->vm_file &&
- (vm_flags & VM_DENYWRITE))) {
- return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
- HPAGE_PMD_NR);
- }
- if (!vma->anon_vma || vma->vm_ops)
- return false;
- if (vma_is_temporary_stack(vma))
- return false;
- return !(vm_flags & VM_NO_KHUGEPAGED);
+ return atomic_read(&mm->mm_users) == 0;
}
-int __khugepaged_enter(struct mm_struct *mm)
+void __khugepaged_enter(struct mm_struct *mm)
{
- struct mm_slot *mm_slot;
+ struct khugepaged_mm_slot *mm_slot;
+ struct mm_slot *slot;
int wakeup;
- mm_slot = alloc_mm_slot();
+ /* __khugepaged_exit() must not run from under us */
+ VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
+ if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
+ return;
+
+ mm_slot = mm_slot_alloc(mm_slot_cache);
if (!mm_slot)
- return -ENOMEM;
+ return;
- /* __khugepaged_exit() must not run from under us */
- VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
- if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
- free_mm_slot(mm_slot);
- return 0;
- }
+ slot = &mm_slot->slot;
spin_lock(&khugepaged_mm_lock);
- insert_to_mm_slots_hash(mm, mm_slot);
+ mm_slot_insert(mm_slots_hash, mm, slot);
/*
* Insert just behind the scanning cursor, to let the area settle
* down a little.
*/
wakeup = list_empty(&khugepaged_scan.mm_head);
- list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
+ list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
spin_unlock(&khugepaged_mm_lock);
mmgrab(mm);
if (wakeup)
wake_up_interruptible(&khugepaged_wait);
-
- return 0;
}
-int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
- unsigned long vm_flags)
+void khugepaged_enter_vma(struct vm_area_struct *vma,
+ unsigned long vm_flags)
{
- unsigned long hstart, hend;
-
- /*
- * khugepaged only supports read-only files for non-shmem files.
- * khugepaged does not yet work on special mappings. And
- * file-private shmem THP is not supported.
- */
- if (!hugepage_vma_check(vma, vm_flags))
- return 0;
-
- hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
- hend = vma->vm_end & HPAGE_PMD_MASK;
- if (hstart < hend)
- return khugepaged_enter(vma, vm_flags);
- return 0;
+ if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
+ hugepage_flags_enabled()) {
+ if (hugepage_vma_check(vma, vm_flags, false, false, true))
+ __khugepaged_enter(vma->vm_mm);
+ }
}
void __khugepaged_exit(struct mm_struct *mm)
{
- struct mm_slot *mm_slot;
+ struct khugepaged_mm_slot *mm_slot;
+ struct mm_slot *slot;
int free = 0;
spin_lock(&khugepaged_mm_lock);
- mm_slot = get_mm_slot(mm);
+ slot = mm_slot_lookup(mm_slots_hash, mm);
+ mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
- hash_del(&mm_slot->hash);
- list_del(&mm_slot->mm_node);
+ hash_del(&slot->hash);
+ list_del(&slot->mm_node);
free = 1;
}
spin_unlock(&khugepaged_mm_lock);
if (free) {
clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
- free_mm_slot(mm_slot);
+ mm_slot_free(mm_slot_cache, mm_slot);
mmdrop(mm);
} else if (mm_slot) {
/*
* This is required to serialize against
- * khugepaged_test_exit() (which is guaranteed to run
- * under mmap sem read mode). Stop here (after we
- * return all pagetables will be destroyed) until
- * khugepaged has finished working on the pagetables
- * under the mmap_lock.
+ * hpage_collapse_test_exit() (which is guaranteed to run
+ * under mmap sem read mode). Stop here (after we return, all
+ * pagetables will be destroyed) until khugepaged has finished
+ * working on the pagetables under the mmap_lock.
*/
mmap_write_lock(mm);
mmap_write_unlock(mm);
}
}
+static void release_pte_folio(struct folio *folio)
+{
+ node_stat_mod_folio(folio,
+ NR_ISOLATED_ANON + folio_is_file_lru(folio),
+ -folio_nr_pages(folio));
+ folio_unlock(folio);
+ folio_putback_lru(folio);
+}
+
static void release_pte_page(struct page *page)
{
- mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_is_file_lru(page),
- -compound_nr(page));
- unlock_page(page);
- putback_lru_page(page);
+ release_pte_folio(page_folio(page));
}
static void release_pte_pages(pte_t *pte, pte_t *_pte,
struct list_head *compound_pagelist)
{
- struct page *page, *tmp;
+ struct folio *folio, *tmp;
while (--_pte >= pte) {
- pte_t pteval = *_pte;
+ pte_t pteval = ptep_get(_pte);
+ unsigned long pfn;
- page = pte_page(pteval);
- if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) &&
- !PageCompound(page))
- release_pte_page(page);
+ if (pte_none(pteval))
+ continue;
+ pfn = pte_pfn(pteval);
+ if (is_zero_pfn(pfn))
+ continue;
+ folio = pfn_folio(pfn);
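+ /* Isolated large folios sit on compound_pagelist and are released below. */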
+ if (folio_test_large(folio))
+ continue;
+ release_pte_folio(folio);
}
- list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
- list_del(&page->lru);
- release_pte_page(page);
+ list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) {
+ list_del(&folio->lru);
+ release_pte_folio(folio);
}
}
@@ -587,23 +545,27 @@ static bool is_refcount_suitable(struct page *page)
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte,
+ struct collapse_control *cc,
struct list_head *compound_pagelist)
{
struct page *page = NULL;
pte_t *_pte;
- int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
+ int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
bool writable = false;
- for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
+ for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
- pte_t pteval = *_pte;
+ pte_t pteval = ptep_get(_pte);
if (pte_none(pteval) || (pte_present(pteval) &&
is_zero_pfn(pte_pfn(pteval)))) {
+ ++none_or_zero;
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none) {
+ (!cc->is_khugepaged ||
+ none_or_zero <= khugepaged_max_ptes_none)) {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
+ count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
goto out;
}
}
@@ -611,18 +573,26 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_PTE_NON_PRESENT;
goto out;
}
+ if (pte_uffd_wp(pteval)) {
+ result = SCAN_PTE_UFFD_WP;
+ goto out;
+ }
page = vm_normal_page(vma, address, pteval);
- if (unlikely(!page)) {
+ if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out;
}
VM_BUG_ON_PAGE(!PageAnon(page), page);
- if (page_mapcount(page) > 1 &&
- ++shared > khugepaged_max_ptes_shared) {
- result = SCAN_EXCEED_SHARED_PTE;
- goto out;
+ if (page_mapcount(page) > 1) {
+ ++shared;
+ if (cc->is_khugepaged &&
+ shared > khugepaged_max_ptes_shared) {
+ result = SCAN_EXCEED_SHARED_PTE;
+ count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+ goto out;
+ }
}
if (PageCompound(page)) {
@@ -655,7 +625,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
*
* The page table that maps the page has been already unlinked
* from the page table tree and this process cannot get
- * an additinal pin on the page.
+ * an additional pin on the page.
*
* New pins can come later if the page is shared across fork,
* but not from this process. The other process cannot write to
@@ -666,22 +636,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_PAGE_COUNT;
goto out;
}
- if (!pte_write(pteval) && PageSwapCache(page) &&
- !reuse_swap_page(page, NULL)) {
- /*
- * Page is in the swap cache and cannot be re-used.
- * It cannot be collapsed into a THP.
- */
- unlock_page(page);
- result = SCAN_SWAP_CACHE_PAGE;
- goto out;
- }
/*
* Isolate the page to avoid collapsing an hugepage
* currently in use by the VM.
*/
- if (isolate_lru_page(page)) {
+ if (!isolate_lru_page(page)) {
unlock_page(page);
result = SCAN_DEL_PAGE_LRU;
goto out;
@@ -695,63 +655,63 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
if (PageCompound(page))
list_add_tail(&page->lru, compound_pagelist);
next:
- /* There should be enough young pte to collapse the page */
- if (pte_young(pteval) ||
- page_is_young(page) || PageReferenced(page) ||
- mmu_notifier_test_young(vma->vm_mm, address))
+ /*
+ * If collapse was initiated by khugepaged, check that there is
+ * enough young PTEs to justify collapsing the page.
+ */
+ if (cc->is_khugepaged &&
+ (pte_young(pteval) || page_is_young(page) ||
+ PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+ address)))
referenced++;
if (pte_write(pteval))
writable = true;
}
- if (likely(writable)) {
- if (likely(referenced)) {
- result = SCAN_SUCCEED;
- trace_mm_collapse_huge_page_isolate(page, none_or_zero,
- referenced, writable, result);
- return 1;
- }
- } else {
+
+ if (unlikely(!writable)) {
result = SCAN_PAGE_RO;
+ } else if (unlikely(cc->is_khugepaged && !referenced)) {
+ result = SCAN_LACK_REFERENCED_PAGE;
+ } else {
+ result = SCAN_SUCCEED;
+ trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ referenced, writable, result);
+ return result;
}
-
out:
release_pte_pages(pte, _pte, compound_pagelist);
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
referenced, writable, result);
- return 0;
+ return result;
}
-static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
- struct vm_area_struct *vma,
- unsigned long address,
- spinlock_t *ptl,
- struct list_head *compound_pagelist)
+static void __collapse_huge_page_copy_succeeded(pte_t *pte,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ spinlock_t *ptl,
+ struct list_head *compound_pagelist)
{
- struct page *src_page, *tmp;
+ struct page *src_page;
+ struct page *tmp;
pte_t *_pte;
- for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, page++, address += PAGE_SIZE) {
- pte_t pteval = *_pte;
+ pte_t pteval;
+ for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
+ _pte++, address += PAGE_SIZE) {
+ pteval = ptep_get(_pte);
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
- clear_user_highpage(page, address);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
if (is_zero_pfn(pte_pfn(pteval))) {
/*
* ptl mostly unnecessary.
*/
spin_lock(ptl);
- /*
- * paravirt calls inside pte_clear here are
- * superfluous.
- */
- pte_clear(vma->vm_mm, address, _pte);
+ ptep_clear(vma->vm_mm, address, _pte);
spin_unlock(ptl);
}
} else {
src_page = pte_page(pteval);
- copy_user_highpage(page, src_page, address, vma);
if (!PageCompound(src_page))
release_pte_page(src_page);
/*
@@ -760,12 +720,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
* inside page_remove_rmap().
*/
spin_lock(ptl);
- /*
- * paravirt calls inside pte_clear here are
- * superfluous.
- */
- pte_clear(vma->vm_mm, address, _pte);
- page_remove_rmap(src_page, false);
+ ptep_clear(vma->vm_mm, address, _pte);
+ page_remove_rmap(src_page, vma, false);
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
}
@@ -773,8 +729,94 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
list_del(&src_page->lru);
- release_pte_page(src_page);
+ mod_node_page_state(page_pgdat(src_page),
+ NR_ISOLATED_ANON + page_is_file_lru(src_page),
+ -compound_nr(src_page));
+ unlock_page(src_page);
+ free_swap_cache(src_page);
+ putback_lru_page(src_page);
+ }
+}
+
+static void __collapse_huge_page_copy_failed(pte_t *pte,
+ pmd_t *pmd,
+ pmd_t orig_pmd,
+ struct vm_area_struct *vma,
+ struct list_head *compound_pagelist)
+{
+ spinlock_t *pmd_ptl;
+
+ /*
+ * Re-establish the PMD to point to the original page table
+ * entry. Restoring PMD needs to be done prior to releasing
+ * pages. Since pages are still isolated and locked here,
+ * acquiring anon_vma_lock_write is unnecessary.
+ */
+ pmd_ptl = pmd_lock(vma->vm_mm, pmd);
+ pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd));
+ spin_unlock(pmd_ptl);
+ /*
+ * Release both raw and compound pages isolated
+ * in __collapse_huge_page_isolate.
+ */
+ release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
+}
+
+/*
+ * __collapse_huge_page_copy - attempts to copy memory contents from raw
+ * pages to a hugepage. Cleans up the raw pages if copying succeeds;
+ * otherwise restores the original page table and releases isolated raw pages.
+ * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC.
+ *
+ * @pte: starting of the PTEs to copy from
+ * @page: the new hugepage to copy contents to
+ * @pmd: pointer to the new hugepage's PMD
+ * @orig_pmd: the original raw pages' PMD
+ * @vma: the original raw pages' virtual memory area
+ * @address: starting address to copy
+ * @ptl: lock on raw pages' PTEs
+ * @compound_pagelist: list that stores compound pages
+ */
+static int __collapse_huge_page_copy(pte_t *pte,
+ struct page *page,
+ pmd_t *pmd,
+ pmd_t orig_pmd,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ spinlock_t *ptl,
+ struct list_head *compound_pagelist)
+{
+ struct page *src_page;
+ pte_t *_pte;
+ pte_t pteval;
+ unsigned long _address;
+ int result = SCAN_SUCCEED;
+
+ /*
+ * Copying pages' contents is subject to memory poison at any iteration.
+ */
+ for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR;
+ _pte++, page++, _address += PAGE_SIZE) {
+ pteval = ptep_get(_pte);
+ if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ clear_user_highpage(page, _address);
+ continue;
+ }
+ src_page = pte_page(pteval);
+ if (copy_mc_user_highpage(page, src_page, _address, vma) > 0) {
+ result = SCAN_COPY_MC;
+ break;
+ }
}
+
+ if (likely(result == SCAN_SUCCEED))
+ __collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
+ compound_pagelist);
+ else
+ __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
+ compound_pagelist);
+
+ return result;
}
static void khugepaged_alloc_sleep(void)
@@ -782,14 +824,16 @@ static void khugepaged_alloc_sleep(void)
DEFINE_WAIT(wait);
add_wait_queue(&khugepaged_wait, &wait);
- freezable_schedule_timeout_interruptible(
- msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
remove_wait_queue(&khugepaged_wait, &wait);
}
-static int khugepaged_node_load[MAX_NUMNODES];
+struct collapse_control khugepaged_collapse_control = {
+ .is_khugepaged = true,
+};
-static bool khugepaged_scan_abort(int nid)
+static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
{
int i;
@@ -797,15 +841,15 @@ static bool khugepaged_scan_abort(int nid)
* If node_reclaim_mode is disabled, then no extra effort is made to
* allocate memory locally.
*/
- if (!node_reclaim_mode)
+ if (!node_reclaim_enabled())
return false;
/* If there is a count for this node already, it must be acceptable */
- if (khugepaged_node_load[nid])
+ if (cc->node_load[nid])
return false;
for (i = 0; i < MAX_NUMNODES; i++) {
- if (!khugepaged_node_load[i])
+ if (!cc->node_load[i])
continue;
if (node_distance(nid, i) > node_reclaim_distance)
return true;
@@ -813,6 +857,10 @@ static bool khugepaged_scan_abort(int nid)
return false;
}
+#define khugepaged_defrag() \
+ (transparent_hugepage_flags & \
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
+
/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
{
@@ -820,250 +868,237 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
}
#ifdef CONFIG_NUMA
-static int khugepaged_find_target_node(void)
+static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
- static int last_khugepaged_target_node = NUMA_NO_NODE;
int nid, target_node = 0, max_value = 0;
/* find first node with max normal pages hit */
for (nid = 0; nid < MAX_NUMNODES; nid++)
- if (khugepaged_node_load[nid] > max_value) {
- max_value = khugepaged_node_load[nid];
+ if (cc->node_load[nid] > max_value) {
+ max_value = cc->node_load[nid];
target_node = nid;
}
- /* do some balance if several nodes have the same hit record */
- if (target_node <= last_khugepaged_target_node)
- for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
- nid++)
- if (max_value == khugepaged_node_load[nid]) {
- target_node = nid;
- break;
- }
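+ /* Allow allocation to fall back to any node that ties for the max hit count. */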
+ for_each_online_node(nid) {
+ if (max_value == cc->node_load[nid])
+ node_set(nid, cc->alloc_nmask);
+ }
- last_khugepaged_target_node = target_node;
return target_node;
}
-
-static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+#else
+static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
- if (IS_ERR(*hpage)) {
- if (!*wait)
- return false;
-
- *wait = false;
- *hpage = NULL;
- khugepaged_alloc_sleep();
- } else if (*hpage) {
- put_page(*hpage);
- *hpage = NULL;
- }
-
- return true;
+ return 0;
}
+#endif
-static struct page *
-khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
+static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node,
+ nodemask_t *nmask)
{
- VM_BUG_ON_PAGE(*hpage, *hpage);
-
- *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
+ *hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask);
if (unlikely(!*hpage)) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
- *hpage = ERR_PTR(-ENOMEM);
- return NULL;
+ return false;
}
prep_transhuge_page(*hpage);
count_vm_event(THP_COLLAPSE_ALLOC);
- return *hpage;
-}
-#else
-static int khugepaged_find_target_node(void)
-{
- return 0;
-}
-
-static inline struct page *alloc_khugepaged_hugepage(void)
-{
- struct page *page;
-
- page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
- HPAGE_PMD_ORDER);
- if (page)
- prep_transhuge_page(page);
- return page;
-}
-
-static struct page *khugepaged_alloc_hugepage(bool *wait)
-{
- struct page *hpage;
-
- do {
- hpage = alloc_khugepaged_hugepage();
- if (!hpage) {
- count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
- if (!*wait)
- return NULL;
-
- *wait = false;
- khugepaged_alloc_sleep();
- } else
- count_vm_event(THP_COLLAPSE_ALLOC);
- } while (unlikely(!hpage) && likely(khugepaged_enabled()));
-
- return hpage;
-}
-
-static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
-{
- /*
- * If the hpage allocated earlier was briefly exposed in page cache
- * before collapse_file() failed, it is possible that racing lookups
- * have not yet completed, and would then be unpleasantly surprised by
- * finding the hpage reused for the same mapping at a different offset.
- * Just release the previous allocation if there is any danger of that.
- */
- if (*hpage && page_count(*hpage) > 1) {
- put_page(*hpage);
- *hpage = NULL;
- }
-
- if (!*hpage)
- *hpage = khugepaged_alloc_hugepage(wait);
-
- if (unlikely(!*hpage))
- return false;
-
return true;
}
-static struct page *
-khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
-{
- VM_BUG_ON(!*hpage);
-
- return *hpage;
-}
-#endif
-
/*
* If mmap_lock temporarily dropped, revalidate vma
* before taking mmap_lock.
- * Return 0 if succeeds, otherwise return none-zero
- * value (scan code).
+ * Returns enum scan_result value.
*/
static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
- struct vm_area_struct **vmap)
+ bool expect_anon,
+ struct vm_area_struct **vmap,
+ struct collapse_control *cc)
{
struct vm_area_struct *vma;
- unsigned long hstart, hend;
- if (unlikely(khugepaged_test_exit(mm)))
+ if (unlikely(hpage_collapse_test_exit(mm)))
return SCAN_ANY_PROCESS;
*vmap = vma = find_vma(mm, address);
if (!vma)
return SCAN_VMA_NULL;
- hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
- hend = vma->vm_end & HPAGE_PMD_MASK;
- if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+ if (!transhuge_vma_suitable(vma, address))
return SCAN_ADDRESS_RANGE;
- if (!hugepage_vma_check(vma, vma->vm_flags))
+ if (!hugepage_vma_check(vma, vma->vm_flags, false, false,
+ cc->is_khugepaged))
return SCAN_VMA_CHECK;
- /* Anon VMA expected */
- if (!vma->anon_vma || vma->vm_ops)
- return SCAN_VMA_CHECK;
- return 0;
+ /*
+ * Anon VMA expected, but the address may be unmapped and then
+ * remapped to a file after khugepaged reacquired the mmap_lock.
+ *
+ * hugepage_vma_check may return true for qualified file
+ * vmas.
+ */
+ if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
+ return SCAN_PAGE_ANON;
+ return SCAN_SUCCEED;
+}
+
+static int find_pmd_or_thp_or_none(struct mm_struct *mm,
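+/*
+ * Classify the PMD covering @address: only a present PMD that is not huge,
+ * devmap or bad yields SCAN_SUCCEED; callers key off the other SCAN_PMD_*
+ * results.
+ */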
+ unsigned long address,
+ pmd_t **pmd)
+{
+ pmd_t pmde;
+
+ *pmd = mm_find_pmd(mm, address);
+ if (!*pmd)
+ return SCAN_PMD_NULL;
+
+ pmde = pmdp_get_lockless(*pmd);
+ if (pmd_none(pmde))
+ return SCAN_PMD_NONE;
+ if (!pmd_present(pmde))
+ return SCAN_PMD_NULL;
+ if (pmd_trans_huge(pmde))
+ return SCAN_PMD_MAPPED;
+ if (pmd_devmap(pmde))
+ return SCAN_PMD_NULL;
+ if (pmd_bad(pmde))
+ return SCAN_PMD_NULL;
+ return SCAN_SUCCEED;
+}
+
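+/* Recheck, after the mmap_lock was re-taken, that the previously found PMD is unchanged. */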
+static int check_pmd_still_valid(struct mm_struct *mm,
+ unsigned long address,
+ pmd_t *pmd)
+{
+ pmd_t *new_pmd;
+ int result = find_pmd_or_thp_or_none(mm, address, &new_pmd);
+
+ if (result != SCAN_SUCCEED)
+ return result;
+ if (new_pmd != pmd)
+ return SCAN_FAIL;
+ return SCAN_SUCCEED;
}
/*
* Bring missing pages in from swap, to complete THP collapse.
- * Only done if khugepaged_scan_pmd believes it is worthwhile.
+ * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
*
- * Called and returns without pte mapped or spinlocks held,
- * but with mmap_lock held to protect against vma changes.
+ * Called and returns without pte mapped or spinlocks held.
+ * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
*/
-
-static bool __collapse_huge_page_swapin(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- int referenced)
+static int __collapse_huge_page_swapin(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long haddr, pmd_t *pmd,
+ int referenced)
{
int swapped_in = 0;
vm_fault_t ret = 0;
- struct vm_fault vmf = {
- .vma = vma,
- .address = address,
- .flags = FAULT_FLAG_ALLOW_RETRY,
- .pmd = pmd,
- .pgoff = linear_page_index(vma, address),
- };
+ unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
+ int result;
+ pte_t *pte = NULL;
+ spinlock_t *ptl;
+
+ for (address = haddr; address < end; address += PAGE_SIZE) {
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = address,
+ .pgoff = linear_page_index(vma, address),
+ .flags = FAULT_FLAG_ALLOW_RETRY,
+ .pmd = pmd,
+ };
+
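+ /* Map the PTE page on first use, and again after do_swap_page() dropped it. */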
+ if (!pte++) {
+ pte = pte_offset_map_nolock(mm, pmd, address, &ptl);
+ if (!pte) {
+ mmap_read_unlock(mm);
+ result = SCAN_PMD_NULL;
+ goto out;
+ }
+ }
- vmf.pte = pte_offset_map(pmd, address);
- for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
- vmf.pte++, vmf.address += PAGE_SIZE) {
- vmf.orig_pte = *vmf.pte;
+ vmf.orig_pte = ptep_get_lockless(pte);
if (!is_swap_pte(vmf.orig_pte))
continue;
- swapped_in++;
+
+ vmf.pte = pte;
+ vmf.ptl = ptl;
ret = do_swap_page(&vmf);
+ /* Which unmaps pte (after perhaps re-checking the entry) */
+ pte = NULL;
- /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
+ /*
+ * do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
+ * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because
+ * we do not retry here; the swap entry will remain in the pagetable,
+ * resulting in a later failure.
+ */
if (ret & VM_FAULT_RETRY) {
- mmap_read_lock(mm);
- if (hugepage_vma_revalidate(mm, address, &vmf.vma)) {
- /* vma is no longer available, don't continue to swapin */
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
- return false;
- }
- /* check if the pmd is still valid */
- if (mm_find_pmd(mm, address) != pmd) {
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
- return false;
- }
+ /* Likely, but not guaranteed, that page lock failed */
+ result = SCAN_PAGE_LOCK;
+ goto out;
}
if (ret & VM_FAULT_ERROR) {
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
- return false;
+ mmap_read_unlock(mm);
+ result = SCAN_FAIL;
+ goto out;
}
- /* pte is unmapped now, we need to map it */
- vmf.pte = pte_offset_map(pmd, vmf.address);
+ swapped_in++;
}
- vmf.pte--;
- pte_unmap(vmf.pte);
- /* Drain LRU add pagevec to remove extra pin on the swapped in pages */
+ if (pte)
+ pte_unmap(pte);
+
+ /* Drain LRU cache to remove extra pin on the swapped in pages */
if (swapped_in)
lru_add_drain();
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
- return true;
+ result = SCAN_SUCCEED;
+out:
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
+ return result;
}
-static void collapse_huge_page(struct mm_struct *mm,
- unsigned long address,
- struct page **hpage,
- int node, int referenced, int unmapped)
+static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
+ struct collapse_control *cc)
+{
+ gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
+ GFP_TRANSHUGE);
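+ /* khugepaged honours its defrag setting; other collapse contexts use GFP_TRANSHUGE. */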
+ int node = hpage_collapse_find_target_node(cc);
+ struct folio *folio;
+
+ if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask))
+ return SCAN_ALLOC_HUGE_PAGE_FAIL;
+
+ folio = page_folio(*hpage);
+ if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
+ folio_put(folio);
+ *hpage = NULL;
+ return SCAN_CGROUP_CHARGE_FAIL;
+ }
+ count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);
+
+ return SCAN_SUCCEED;
+}
+
+static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
+ int referenced, int unmapped,
+ struct collapse_control *cc)
{
LIST_HEAD(compound_pagelist);
pmd_t *pmd, _pmd;
pte_t *pte;
pgtable_t pgtable;
- struct page *new_page;
+ struct page *hpage;
spinlock_t *pmd_ptl, *pte_ptl;
- int isolated = 0, result = 0;
+ int result = SCAN_FAIL;
struct vm_area_struct *vma;
struct mmu_notifier_range range;
- gfp_t gfp;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- /* Only allocate from the target node */
- gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
-
/*
* Before allocating the hugepage, release the mmap_lock read lock.
* The allocation can take potentially a long time if it involves
@@ -1071,41 +1106,34 @@ static void collapse_huge_page(struct mm_struct *mm,
* that. We will recheck the vma after taking it again in write mode.
*/
mmap_read_unlock(mm);
- new_page = khugepaged_alloc_page(hpage, gfp, node);
- if (!new_page) {
- result = SCAN_ALLOC_HUGE_PAGE_FAIL;
- goto out_nolock;
- }
- if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
- result = SCAN_CGROUP_CHARGE_FAIL;
+ result = alloc_charge_hpage(&hpage, mm, cc);
+ if (result != SCAN_SUCCEED)
goto out_nolock;
- }
- count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
mmap_read_lock(mm);
- result = hugepage_vma_revalidate(mm, address, &vma);
- if (result) {
+ result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
+ if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
}
- pmd = mm_find_pmd(mm, address);
- if (!pmd) {
- result = SCAN_PMD_NULL;
+ result = find_pmd_or_thp_or_none(mm, address, &pmd);
+ if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
}
- /*
- * __collapse_huge_page_swapin always returns with mmap_lock locked.
- * If it fails, we release mmap_lock and jump out_nolock.
- * Continuing to collapse causes inconsistency.
- */
- if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
- pmd, referenced)) {
- mmap_read_unlock(mm);
- goto out_nolock;
+ if (unmapped) {
+ /*
+ * __collapse_huge_page_swapin will return with mmap_lock
+ * released when it fails. So we jump out_nolock directly in
+ * that case. Continuing to collapse causes inconsistency.
+ */
+ result = __collapse_huge_page_swapin(mm, vma, address, pmd,
+ referenced);
+ if (result != SCAN_SUCCEED)
+ goto out_nolock;
}
mmap_read_unlock(mm);
@@ -1115,40 +1143,47 @@ static void collapse_huge_page(struct mm_struct *mm,
* handled by the anon_vma lock + PG_lock.
*/
mmap_write_lock(mm);
- result = hugepage_vma_revalidate(mm, address, &vma);
- if (result)
- goto out;
+ result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
+ if (result != SCAN_SUCCEED)
+ goto out_up_write;
/* check if the pmd is still valid */
- if (mm_find_pmd(mm, address) != pmd)
- goto out;
+ result = check_pmd_still_valid(mm, address, pmd);
+ if (result != SCAN_SUCCEED)
+ goto out_up_write;
+ vma_start_write(vma);
anon_vma_lock_write(vma->anon_vma);
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
- address, address + HPAGE_PMD_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
+ address + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
- pte = pte_offset_map(pmd, address);
- pte_ptl = pte_lockptr(mm, pmd);
-
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
- * After this gup_fast can't run anymore. This also removes
- * any huge TLB entry from the CPU so we won't allow
- * huge and small TLB entries for the same virtual address
- * to avoid the risk of CPU bugs in that area.
+ * This removes any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address to
+ * avoid the risk of CPU bugs in that area.
+ *
+ * Parallel fast GUP is fine since fast GUP will back off when
+ * it detects PMD is changed.
*/
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(&range);
+ tlb_remove_table_sync_one();
- spin_lock(pte_ptl);
- isolated = __collapse_huge_page_isolate(vma, address, pte,
- &compound_pagelist);
- spin_unlock(pte_ptl);
+ pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
+ if (pte) {
+ result = __collapse_huge_page_isolate(vma, address, pte, cc,
+ &compound_pagelist);
+ spin_unlock(pte_ptl);
+ } else {
+ result = SCAN_PMD_NULL;
+ }
- if (unlikely(!isolated)) {
- pte_unmap(pte);
+ if (unlikely(result != SCAN_SUCCEED)) {
+ if (pte)
+ pte_unmap(pte);
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
/*
@@ -1159,8 +1194,7 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
- result = SCAN_FAIL;
- goto out;
+ goto out_up_write;
}
/*
@@ -1169,54 +1203,54 @@ static void collapse_huge_page(struct mm_struct *mm,
*/
anon_vma_unlock_write(vma->anon_vma);
- __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
- &compound_pagelist);
+ result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd,
+ vma, address, pte_ptl,
+ &compound_pagelist);
pte_unmap(pte);
- __SetPageUptodate(new_page);
- pgtable = pmd_pgtable(_pmd);
-
- _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
- _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
+ if (unlikely(result != SCAN_SUCCEED))
+ goto out_up_write;
/*
- * spin_lock() below is not the equivalent of smp_wmb(), so
- * this is needed to avoid the copy_huge_page writes to become
- * visible after the set_pmd_at() write.
+ * spin_lock() below is not the equivalent of smp_wmb(), but
+ * the smp_wmb() inside __SetPageUptodate() can be reused to
+ * avoid the copy_huge_page writes to become visible after
+ * the set_pmd_at() write.
*/
- smp_wmb();
+ __SetPageUptodate(hpage);
+ pgtable = pmd_pgtable(_pmd);
+
+ _pmd = mk_huge_pmd(hpage, vma->vm_page_prot);
+ _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
- page_add_new_anon_rmap(new_page, vma, address, true);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ page_add_new_anon_rmap(hpage, vma, address);
+ lru_cache_add_inactive_or_unevictable(hpage, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
spin_unlock(pmd_ptl);
- *hpage = NULL;
+ hpage = NULL;
- khugepaged_pages_collapsed++;
result = SCAN_SUCCEED;
out_up_write:
mmap_write_unlock(mm);
out_nolock:
- if (!IS_ERR_OR_NULL(*hpage))
- mem_cgroup_uncharge(*hpage);
- trace_mm_collapse_huge_page(mm, isolated, result);
- return;
-out:
- goto out_up_write;
+ if (hpage)
+ put_page(hpage);
+ trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
+ return result;
}
-static int khugepaged_scan_pmd(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long address,
- struct page **hpage)
+static int hpage_collapse_scan_pmd(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, bool *mmap_locked,
+ struct collapse_control *cc)
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int ret = 0, result = 0, referenced = 0;
+ int result = SCAN_FAIL, referenced = 0;
int none_or_zero = 0, shared = 0;
struct page *page = NULL;
unsigned long _address;
@@ -1226,54 +1260,60 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- pmd = mm_find_pmd(mm, address);
- if (!pmd) {
+ result = find_pmd_or_thp_or_none(mm, address, &pmd);
+ if (result != SCAN_SUCCEED)
+ goto out;
+
+ memset(cc->node_load, 0, sizeof(cc->node_load));
+ nodes_clear(cc->alloc_nmask);
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!pte) {
result = SCAN_PMD_NULL;
goto out;
}
- memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
- for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
+ for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
- pte_t pteval = *_pte;
+ pte_t pteval = ptep_get(_pte);
if (is_swap_pte(pteval)) {
- if (++unmapped <= khugepaged_max_ptes_swap) {
+ ++unmapped;
+ if (!cc->is_khugepaged ||
+ unmapped <= khugepaged_max_ptes_swap) {
/*
* Always be strict with uffd-wp
* enabled swap entries. Please see
* comment below for pte_uffd_wp().
*/
- if (pte_swp_uffd_wp(pteval)) {
+ if (pte_swp_uffd_wp_any(pteval)) {
result = SCAN_PTE_UFFD_WP;
goto out_unmap;
}
continue;
} else {
result = SCAN_EXCEED_SWAP_PTE;
+ count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
goto out_unmap;
}
}
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ ++none_or_zero;
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none) {
+ (!cc->is_khugepaged ||
+ none_or_zero <= khugepaged_max_ptes_none)) {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
+ count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
goto out_unmap;
}
}
- if (!pte_present(pteval)) {
- result = SCAN_PTE_NON_PRESENT;
- goto out_unmap;
- }
if (pte_uffd_wp(pteval)) {
/*
* Don't collapse the page if any of the small
* PTEs are armed with uffd write protection.
* Here we can also mark the new huge pmd as
* write protected if any of the small ones is
- * marked but that could bring uknown
+ * marked but that could bring unknown
* userfault messages that falls outside of
* the registered range. So, just be simple.
*/
@@ -1284,31 +1324,35 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
writable = true;
page = vm_normal_page(vma, _address, pteval);
- if (unlikely(!page)) {
+ if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out_unmap;
}
- if (page_mapcount(page) > 1 &&
- ++shared > khugepaged_max_ptes_shared) {
- result = SCAN_EXCEED_SHARED_PTE;
- goto out_unmap;
+ if (page_mapcount(page) > 1) {
+ ++shared;
+ if (cc->is_khugepaged &&
+ shared > khugepaged_max_ptes_shared) {
+ result = SCAN_EXCEED_SHARED_PTE;
+ count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+ goto out_unmap;
+ }
}
page = compound_head(page);
/*
* Record which node the original page is from and save this
- * information to khugepaged_node_load[].
- * Khupaged will allocate hugepage from the node has the max
+ * information to cc->node_load[].
+ * Khugepaged will allocate hugepage from the node has the max
* hit record.
*/
node = page_to_nid(page);
- if (khugepaged_scan_abort(node)) {
+ if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
goto out_unmap;
}
- khugepaged_node_load[node]++;
+ cc->node_load[node]++;
if (!PageLRU(page)) {
result = SCAN_PAGE_LRU;
goto out_unmap;
@@ -1325,15 +1369,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
/*
* Check if the page has any GUP (or other external) pins.
*
- * Here the check is racy it may see totmal_mapcount > refcount
- * in some cases.
- * For example, one process with one forked child process.
- * The parent has the PMD split due to MADV_DONTNEED, then
- * the child is trying unmap the whole PMD, but khugepaged
- * may be scanning the parent between the child has
- * PageDoubleMap flag cleared and dec the mapcount. So
- * khugepaged may see total_mapcount > refcount.
- *
+ * Here the check may be racy:
+ * it may see total_mapcount > refcount in some cases?
* But such case is ephemeral we could always retry collapse
* later. However it may report false positive if the page
* has excessive GUP pins (i.e. 512). Anyway the same check
@@ -1343,43 +1380,51 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
- if (pte_young(pteval) ||
- page_is_young(page) || PageReferenced(page) ||
- mmu_notifier_test_young(vma->vm_mm, address))
+
+ /*
+ * If collapse was initiated by khugepaged, check that there is
+ * enough young pte to justify collapsing the page
+ */
+ if (cc->is_khugepaged &&
+ (pte_young(pteval) || page_is_young(page) ||
+ PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+ address)))
referenced++;
}
if (!writable) {
result = SCAN_PAGE_RO;
- } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) {
+ } else if (cc->is_khugepaged &&
+ (!referenced ||
+ (unmapped && referenced < HPAGE_PMD_NR / 2))) {
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
- ret = 1;
}
out_unmap:
pte_unmap_unlock(pte, ptl);
- if (ret) {
- node = khugepaged_find_target_node();
+ if (result == SCAN_SUCCEED) {
+ result = collapse_huge_page(mm, address, referenced,
+ unmapped, cc);
/* collapse_huge_page will return with the mmap_lock released */
- collapse_huge_page(mm, address, hpage, node,
- referenced, unmapped);
+ *mmap_locked = false;
}
out:
trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
none_or_zero, result, unmapped);
- return ret;
+ return result;
}
-static void collect_mm_slot(struct mm_slot *mm_slot)
+static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
{
- struct mm_struct *mm = mm_slot->mm;
+ struct mm_slot *slot = &mm_slot->slot;
+ struct mm_struct *mm = slot->mm;
lockdep_assert_held(&khugepaged_mm_lock);
- if (khugepaged_test_exit(mm)) {
+ if (hpage_collapse_test_exit(mm)) {
/* free mm_slot */
- hash_del(&mm_slot->hash);
- list_del(&mm_slot->mm_node);
+ hash_del(&slot->hash);
+ list_del(&slot->mm_node);
/*
* Not strictly needed because the mm exited already.
@@ -1388,7 +1433,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
*/
/* khugepaged_mm_lock actually not necessary for the below */
- free_mm_slot(mm_slot);
+ mm_slot_free(mm_slot_cache, mm_slot);
mmdrop(mm);
}
}
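
The hunks above and below convert khugepaged's private mm_slot bookkeeping to the generic mm_slot helpers: the subsystem-specific state now lives in a wrapper (struct khugepaged_mm_slot) that embeds the common struct mm_slot, a hash lookup such as mm_slot_lookup() returns the embedded slot, and mm_slot_entry() recovers the wrapper from it, i.e. a container_of() lookup. The following is a minimal standalone sketch of that pattern, not part of the patch; the wrapper layout and helper names are assumed from the conversions shown in these hunks, and the kernel types are replaced by trivial stand-ins so the sketch compiles as ordinary userspace C.

#include <stddef.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's mm_slot_entry()/container_of(). */
#define mm_slot_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct mm_slot {			/* generic part: hash/list linkage + mm */
	int mm;				/* placeholder for struct mm_struct * */
};

struct khugepaged_mm_slot {		/* khugepaged's wrapper (assumed layout) */
	struct mm_slot slot;
	int nr_pte_mapped_thp;
};

int main(void)
{
	struct khugepaged_mm_slot mm_slot = { .slot = { .mm = 42 } };
	/* A lookup like mm_slot_lookup() hands back only the generic slot... */
	struct mm_slot *slot = &mm_slot.slot;
	/* ...and the caller recovers its wrapper from the embedded member. */
	struct khugepaged_mm_slot *found =
		mm_slot_entry(slot, struct khugepaged_mm_slot, slot);

	printf("mm=%d nr_pte_mapped_thp=%d\n",
	       found->slot.mm, found->nr_pte_mapped_thp);
	return 0;
}

The same recovery step appears throughout the converted code, e.g. collect_mm_slot() above now dereferences &mm_slot->slot before touching the hash and list linkage.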
@@ -1397,82 +1442,229 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
/*
* Notify khugepaged that given addr of the mm is pte-mapped THP. Then
* khugepaged should try to collapse the page table.
+ *
+ * Note that following race exists:
+ * (1) khugepaged calls khugepaged_collapse_pte_mapped_thps() for mm_struct A,
+ * emptying the A's ->pte_mapped_thp[] array.
+ * (2) MADV_COLLAPSE collapses some file extent with target mm_struct B, and
+ * retract_page_tables() finds a VMA in mm_struct A mapping the same extent
+ * (at virtual address X) and adds an entry (for X) into mm_struct A's
+ * ->pte-mapped_thp[] array.
+ * (3) khugepaged calls khugepaged_collapse_scan_file() for mm_struct A at X,
+ * sees a pte-mapped THP (SCAN_PTE_MAPPED_HUGEPAGE) and adds an entry
+ * (for X) into mm_struct A's ->pte-mapped_thp[] array.
+ * Thus, it's possible the same address is added multiple times for the same
+ * mm_struct. Should this happen, we'll simply attempt
+ * collapse_pte_mapped_thp() multiple times for the same address, under the same
+ * exclusive mmap_lock, and assuming the first call is successful, subsequent
+ * attempts will return quickly (without grabbing any additional locks) when
+ * a huge pmd is found in find_pmd_or_thp_or_none(). Since this is a cheap
+ * check, and since this is a rare occurrence, the cost of preventing this
+ * "multiple-add" is thought to be more expensive than just handling it, should
+ * it occur.
*/
-static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
- unsigned long addr)
+static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr)
{
- struct mm_slot *mm_slot;
+ struct khugepaged_mm_slot *mm_slot;
+ struct mm_slot *slot;
+ bool ret = false;
VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
spin_lock(&khugepaged_mm_lock);
- mm_slot = get_mm_slot(mm);
- if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
+ slot = mm_slot_lookup(mm_slots_hash, mm);
+ mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
+ if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) {
mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
+ ret = true;
+ }
spin_unlock(&khugepaged_mm_lock);
- return 0;
+ return ret;
+}
+
+/* hpage must be locked, and mmap_lock must be held in write */
+static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmdp, struct page *hpage)
+{
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = addr,
+ .flags = 0,
+ .pmd = pmdp,
+ };
+
+ VM_BUG_ON(!PageTransHuge(hpage));
+ mmap_assert_write_locked(vma->vm_mm);
+
+ if (do_set_pmd(&vmf, hpage))
+ return SCAN_FAIL;
+
+ get_page(hpage);
+ return SCAN_SUCCEED;
+}
+
+/*
+ * A note about locking:
+ * Trying to take the page table spinlocks would be useless here because those
+ * are only used to synchronize:
+ *
+ * - modifying terminal entries (ones that point to a data page, not to another
+ * page table)
+ * - installing *new* non-terminal entries
+ *
+ * Instead, we need roughly the same kind of protection as free_pgtables() or
+ * mm_take_all_locks() (but only for a single VMA):
+ * The mmap lock together with this VMA's rmap locks covers all paths towards
+ * the page table entries we're messing with here, except for hardware page
+ * table walks and lockless_pages_from_mm().
+ */
+static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t pmd;
+ struct mmu_notifier_range range;
+
+ mmap_assert_write_locked(mm);
+ if (vma->vm_file)
+ lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem);
+ /*
+ * All anon_vmas attached to the VMA have the same root and are
+ * therefore locked by the same lock.
+ */
+ if (vma->anon_vma)
+ lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
+ addr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ pmd = pmdp_collapse_flush(vma, addr, pmdp);
+ tlb_remove_table_sync_one();
+ mmu_notifier_invalidate_range_end(&range);
+ mm_dec_nr_ptes(mm);
+ page_table_check_pte_clear_range(mm, addr, pmd);
+ pte_free(mm, pmd_pgtable(pmd));
}
/**
- * Try to collapse a pte-mapped THP for mm at address haddr.
+ * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
+ * address haddr.
+ *
+ * @mm: process address space where collapse happens
+ * @addr: THP collapse address
+ * @install_pmd: If a huge PMD should be installed
*
* This function checks whether all the PTEs in the PMD are pointing to the
* right THP. If so, retract the page table so the THP can refault in with
- * as pmd-mapped.
+ * as pmd-mapped. Possibly install a huge PMD mapping the THP.
*/
-void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
+int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
+ bool install_pmd)
{
unsigned long haddr = addr & HPAGE_PMD_MASK;
- struct vm_area_struct *vma = find_vma(mm, haddr);
+ struct vm_area_struct *vma = vma_lookup(mm, haddr);
struct page *hpage;
pte_t *start_pte, *pte;
- pmd_t *pmd, _pmd;
+ pmd_t *pmd;
spinlock_t *ptl;
- int count = 0;
+ int count = 0, result = SCAN_FAIL;
int i;
+ mmap_assert_write_locked(mm);
+
+ /* Fast check before locking page if already PMD-mapped */
+ result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
+ if (result == SCAN_PMD_MAPPED)
+ return result;
+
if (!vma || !vma->vm_file ||
- vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
- return;
+ !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
+ return SCAN_VMA_CHECK;
/*
- * This vm_flags may not have VM_HUGEPAGE if the page was not
- * collapsed by this mm. But we can still collapse if the page is
- * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
- * will not fail the vma for missing VM_HUGEPAGE
+ * If we are here, we've succeeded in replacing all the native pages
+ * in the page cache with a single hugepage. If a mm were to fault-in
+ * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
+ * and map it by a PMD, regardless of sysfs THP settings. As such, let's
+ * analogously elide sysfs THP settings here.
*/
- if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
- return;
+ if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
+ return SCAN_VMA_CHECK;
+
+ /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
+ if (userfaultfd_wp(vma))
+ return SCAN_PTE_UFFD_WP;
hpage = find_lock_page(vma->vm_file->f_mapping,
linear_page_index(vma, haddr));
if (!hpage)
- return;
+ return SCAN_PAGE_NULL;
- if (!PageHead(hpage))
+ if (!PageHead(hpage)) {
+ result = SCAN_FAIL;
goto drop_hpage;
+ }
- pmd = mm_find_pmd(mm, haddr);
- if (!pmd)
+ if (compound_order(hpage) != HPAGE_PMD_ORDER) {
+ result = SCAN_PAGE_COMPOUND;
goto drop_hpage;
+ }
+
+ switch (result) {
+ case SCAN_SUCCEED:
+ break;
+ case SCAN_PMD_NONE:
+ /*
+ * In MADV_COLLAPSE path, possible race with khugepaged where
+ * all pte entries have been removed and pmd cleared. If so,
+ * skip all the pte checks and just update the pmd mapping.
+ */
+ goto maybe_install_pmd;
+ default:
+ goto drop_hpage;
+ }
+
+ /* Lock the vma before taking i_mmap and page table locks */
+ vma_start_write(vma);
+
+ /*
+ * We need to lock the mapping so that from here on, only GUP-fast and
+ * hardware page walks can access the parts of the page tables that
+ * we're operating on.
+ * See collapse_and_free_pmd().
+ */
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ /*
+ * This spinlock should be unnecessary: Nobody else should be accessing
+ * the page tables under spinlock protection here, only
+ * lockless_pages_from_mm() and the hardware page walker can access page
+ * tables while all the high-level locks are held in write mode.
+ */
+ result = SCAN_FAIL;
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+ if (!start_pte)
+ goto drop_immap;
/* step 1: check all mapped PTEs are to the right huge page */
for (i = 0, addr = haddr, pte = start_pte;
i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
struct page *page;
+ pte_t ptent = ptep_get(pte);
/* empty pte, skip */
- if (pte_none(*pte))
+ if (pte_none(ptent))
continue;
/* page swapped out, abort */
- if (!pte_present(*pte))
+ if (!pte_present(ptent)) {
+ result = SCAN_PTE_NON_PRESENT;
goto abort;
+ }
- page = vm_normal_page(vma, addr, *pte);
-
+ page = vm_normal_page(vma, addr, ptent);
+ if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+ page = NULL;
/*
* Note that uprobe, debugger, or MAP_PRIVATE may change the
* page table, but the new page will not be a subpage of hpage.
@@ -1486,11 +1678,14 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
for (i = 0, addr = haddr, pte = start_pte;
i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
struct page *page;
+ pte_t ptent = ptep_get(pte);
- if (pte_none(*pte))
+ if (pte_none(ptent))
continue;
- page = vm_normal_page(vma, addr, *pte);
- page_remove_rmap(page, false);
+ page = vm_normal_page(vma, addr, ptent);
+ if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+ goto abort;
+ page_remove_rmap(page, vma, false);
}
pte_unmap_unlock(start_pte, ptl);
@@ -1501,62 +1696,81 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
}
- /* step 4: collapse pmd */
- ptl = pmd_lock(vma->vm_mm, pmd);
- _pmd = pmdp_collapse_flush(vma, haddr, pmd);
- spin_unlock(ptl);
- mm_dec_nr_ptes(mm);
- pte_free(mm, pmd_pgtable(_pmd));
+ /* step 4: remove pte entries */
+ /* we make no change to anon, but protect concurrent anon page lookup */
+ if (vma->anon_vma)
+ anon_vma_lock_write(vma->anon_vma);
+
+ collapse_and_free_pmd(mm, vma, haddr, pmd);
+
+ if (vma->anon_vma)
+ anon_vma_unlock_write(vma->anon_vma);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+
+maybe_install_pmd:
+ /* step 5: install pmd entry */
+ result = install_pmd
+ ? set_huge_pmd(vma, haddr, pmd, hpage)
+ : SCAN_SUCCEED;
drop_hpage:
unlock_page(hpage);
put_page(hpage);
- return;
+ return result;
abort:
pte_unmap_unlock(start_pte, ptl);
+drop_immap:
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
goto drop_hpage;
}
-static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot)
{
- struct mm_struct *mm = mm_slot->mm;
+ struct mm_slot *slot = &mm_slot->slot;
+ struct mm_struct *mm = slot->mm;
int i;
if (likely(mm_slot->nr_pte_mapped_thp == 0))
- return 0;
+ return;
if (!mmap_write_trylock(mm))
- return -EBUSY;
+ return;
- if (unlikely(khugepaged_test_exit(mm)))
+ if (unlikely(hpage_collapse_test_exit(mm)))
goto out;
for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
- collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]);
+ collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i], false);
out:
mm_slot->nr_pte_mapped_thp = 0;
mmap_write_unlock(mm);
- return 0;
}
-static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
+static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
+ struct mm_struct *target_mm,
+ unsigned long target_addr, struct page *hpage,
+ struct collapse_control *cc)
{
struct vm_area_struct *vma;
- struct mm_struct *mm;
- unsigned long addr;
- pmd_t *pmd, _pmd;
+ int target_result = SCAN_FAIL;
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ int result = SCAN_FAIL;
+ struct mm_struct *mm = NULL;
+ unsigned long addr = 0;
+ pmd_t *pmd;
+ bool is_target = false;
+
/*
* Check vma->anon_vma to exclude MAP_PRIVATE mappings that
* got written to. These VMAs are likely not worth investing
* mmap_write_lock(mm) as PMD-mapping is likely to be split
* later.
*
- * Not that vma->anon_vma check is racy: it can be set up after
+ * Note that vma->anon_vma check is racy: it can be set up after
* the check but before we took mmap_lock by the fault path.
* But page lock would prevent establishing any new ptes of the
* page, so we are safe.
@@ -1564,94 +1778,153 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* An alternative would be drop the check, but check that page
* table is clear before calling pmdp_collapse_flush() under
* ptl. It has higher chance to recover THP for the VMA, but
- * has higher cost too.
+ * has higher cost too. It would also probably require locking
+ * the anon_vma.
*/
- if (vma->anon_vma)
- continue;
+ if (READ_ONCE(vma->anon_vma)) {
+ result = SCAN_PAGE_ANON;
+ goto next;
+ }
addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
- if (addr & ~HPAGE_PMD_MASK)
- continue;
- if (vma->vm_end < addr + HPAGE_PMD_SIZE)
- continue;
+ if (addr & ~HPAGE_PMD_MASK ||
+ vma->vm_end < addr + HPAGE_PMD_SIZE) {
+ result = SCAN_VMA_CHECK;
+ goto next;
+ }
mm = vma->vm_mm;
- pmd = mm_find_pmd(mm, addr);
- if (!pmd)
- continue;
+ is_target = mm == target_mm && addr == target_addr;
+ result = find_pmd_or_thp_or_none(mm, addr, &pmd);
+ if (result != SCAN_SUCCEED)
+ goto next;
/*
* We need exclusive mmap_lock to retract page table.
*
* We use trylock due to lock inversion: we need to acquire
* mmap_lock while holding page lock. Fault path does it in
* reverse order. Trylock is a way to avoid deadlock.
+ *
+ * Also, it's not MADV_COLLAPSE's job to collapse other
+ * mappings - let khugepaged take care of them later.
*/
- if (mmap_write_trylock(mm)) {
- if (!khugepaged_test_exit(mm)) {
- spinlock_t *ptl = pmd_lock(mm, pmd);
- /* assume page table is clear */
- _pmd = pmdp_collapse_flush(vma, addr, pmd);
- spin_unlock(ptl);
- mm_dec_nr_ptes(mm);
- pte_free(mm, pmd_pgtable(_pmd));
+ result = SCAN_PTE_MAPPED_HUGEPAGE;
+ if ((cc->is_khugepaged || is_target) &&
+ mmap_write_trylock(mm)) {
+ /* trylock for the same lock inversion as above */
+ if (!vma_try_start_write(vma))
+ goto unlock_next;
+
+ /*
+ * Re-check whether we have an ->anon_vma, because
+ * collapse_and_free_pmd() requires that either no
+ * ->anon_vma exists or the anon_vma is locked.
+ * We already checked ->anon_vma above, but that check
+ * is racy because ->anon_vma can be populated under the
+ * mmap lock in read mode.
+ */
+ if (vma->anon_vma) {
+ result = SCAN_PAGE_ANON;
+ goto unlock_next;
+ }
+ /*
+ * When a vma is registered with uffd-wp, we can't
+ * recycle the pmd pgtable because there can be pte
+ * markers installed. Skip it only, so the rest mm/vma
+ * can still have the same file mapped hugely, however
+ * it'll always mapped in small page size for uffd-wp
+ * registered ranges.
+ */
+ if (hpage_collapse_test_exit(mm)) {
+ result = SCAN_ANY_PROCESS;
+ goto unlock_next;
}
+ if (userfaultfd_wp(vma)) {
+ result = SCAN_PTE_UFFD_WP;
+ goto unlock_next;
+ }
+ collapse_and_free_pmd(mm, vma, addr, pmd);
+ if (!cc->is_khugepaged && is_target)
+ result = set_huge_pmd(vma, addr, pmd, hpage);
+ else
+ result = SCAN_SUCCEED;
+
+unlock_next:
mmap_write_unlock(mm);
- } else {
- /* Try again later */
+ goto next;
+ }
+ /*
+ * Calling context will handle target mm/addr. Otherwise, let
+ * khugepaged try again later.
+ */
+ if (!is_target) {
khugepaged_add_pte_mapped_thp(mm, addr);
+ continue;
}
+next:
+ if (is_target)
+ target_result = result;
}
i_mmap_unlock_write(mapping);
+ return target_result;
}
/**
* collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
*
+ * @mm: process address space where collapse happens
+ * @addr: virtual collapse start address
+ * @file: file that collapse on
+ * @start: collapse start address
+ * @cc: collapse context and scratchpad
+ *
* Basic scheme is simple, details are more complex:
* - allocate and lock a new huge page;
- * - scan page cache replacing old pages with the new one
+ * - scan page cache, locking old pages
* + swap/gup in pages if necessary;
- * + fill in gaps;
- * + keep old pages around in case rollback is required;
+ * - copy data to new page
+ * - handle shmem holes
+ * + re-validate that holes weren't filled by someone else
+ * + check for userfaultfd
+ * - finalize updates to the page cache;
* - if replacing succeeds:
- * + copy data over;
- * + free old pages;
* + unlock huge page;
+ * + free old pages;
* - if replacing failed;
- * + put all pages back and unfreeze them;
- * + restore gaps in the page cache;
+ * + unlock old pages
* + unlock and free huge page;
*/
-static void collapse_file(struct mm_struct *mm,
- struct file *file, pgoff_t start,
- struct page **hpage, int node)
+static int collapse_file(struct mm_struct *mm, unsigned long addr,
+ struct file *file, pgoff_t start,
+ struct collapse_control *cc)
{
struct address_space *mapping = file->f_mapping;
- gfp_t gfp;
- struct page *new_page;
- pgoff_t index, end = start + HPAGE_PMD_NR;
+ struct page *hpage;
+ struct page *page;
+ struct page *tmp;
+ struct folio *folio;
+ pgoff_t index = 0, end = start + HPAGE_PMD_NR;
LIST_HEAD(pagelist);
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
int nr_none = 0, result = SCAN_SUCCEED;
bool is_shmem = shmem_file(file);
+ int nr = 0;
VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
- /* Only allocate from the target node */
- gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
-
- new_page = khugepaged_alloc_page(hpage, gfp, node);
- if (!new_page) {
- result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+ result = alloc_charge_hpage(&hpage, mm, cc);
+ if (result != SCAN_SUCCEED)
goto out;
- }
- if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
- result = SCAN_CGROUP_CHARGE_FAIL;
- goto out;
- }
- count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
+ __SetPageLocked(hpage);
+ if (is_shmem)
+ __SetPageSwapBacked(hpage);
+ hpage->index = start;
+ hpage->mapping = mapping;
- /* This will be less messy when we use multi-index entries */
+ /*
+ * Ensure we have slots for all the pages in the range. This is
+ * almost certainly a no-op because most of the pages must be present
+ */
do {
xas_lock_irq(&xas);
xas_create_range(&xas);
@@ -1660,25 +1933,13 @@ static void collapse_file(struct mm_struct *mm,
xas_unlock_irq(&xas);
if (!xas_nomem(&xas, GFP_KERNEL)) {
result = SCAN_FAIL;
- goto out;
+ goto rollback;
}
} while (1);
- __SetPageLocked(new_page);
- if (is_shmem)
- __SetPageSwapBacked(new_page);
- new_page->index = start;
- new_page->mapping = mapping;
-
- /*
- * At this point the new_page is locked and not up-to-date.
- * It's safe to insert it into the page cache, because nobody would
- * be able to map it or use it in another way until we unlock it.
- */
-
- xas_set(&xas, start);
for (index = start; index < end; index++) {
- struct page *page = xas_next(&xas);
+ xas_set(&xas, index);
+ page = xas_load(&xas);
VM_BUG_ON(index != xas.xa_index);
if (is_shmem) {
@@ -1693,13 +1954,11 @@ static void collapse_file(struct mm_struct *mm,
result = SCAN_TRUNCATED;
goto xa_locked;
}
- xas_set(&xas, index);
}
if (!shmem_charge(mapping->host, 1)) {
result = SCAN_FAIL;
goto xa_locked;
}
- xas_store(&xas, new_page);
nr_none++;
continue;
}
@@ -1707,11 +1966,14 @@ static void collapse_file(struct mm_struct *mm,
if (xa_is_value(page) || !PageUptodate(page)) {
xas_unlock_irq(&xas);
/* swap in or instantiate fallocated page */
- if (shmem_getpage(mapping->host, index, &page,
- SGP_NOHUGE)) {
+ if (shmem_get_folio(mapping->host, index,
+ &folio, SGP_NOALLOC)) {
result = SCAN_FAIL;
goto xa_unlocked;
}
+ /* drain lru cache to help isolate_lru_page() */
+ lru_add_drain();
+ page = folio_file_page(folio, index);
} else if (trylock_page(page)) {
get_page(page);
xas_unlock_irq(&xas);
@@ -1725,7 +1987,7 @@ static void collapse_file(struct mm_struct *mm,
page_cache_sync_readahead(mapping, &file->f_ra,
file, index,
end - index);
- /* drain pagevecs to help isolate_lru_page() */
+ /* drain lru cache to help isolate_lru_page() */
lru_add_drain();
page = find_lock_page(mapping, index);
if (unlikely(page == NULL)) {
@@ -1750,6 +2012,10 @@ static void collapse_file(struct mm_struct *mm,
filemap_flush(mapping);
result = SCAN_FAIL;
goto xa_unlocked;
+ } else if (PageWriteback(page)) {
+ xas_unlock_irq(&xas);
+ result = SCAN_FAIL;
+ goto xa_unlocked;
} else if (trylock_page(page)) {
get_page(page);
xas_unlock_irq(&xas);
@@ -1774,18 +2040,28 @@ static void collapse_file(struct mm_struct *mm,
/*
* If file was truncated then extended, or hole-punched, before
* we locked the first page, then a THP might be there already.
+ * This will be discovered on the first iteration.
*/
if (PageTransCompound(page)) {
- result = SCAN_PAGE_COMPOUND;
+ struct page *head = compound_head(page);
+
+ result = compound_order(head) == HPAGE_PMD_ORDER &&
+ head->index == start
+ /* Maybe PMD-mapped */
+ ? SCAN_PTE_MAPPED_HUGEPAGE
+ : SCAN_PAGE_COMPOUND;
goto out_unlock;
}
- if (page_mapping(page) != mapping) {
+ folio = page_folio(page);
+
+ if (folio_mapping(folio) != mapping) {
result = SCAN_TRUNCATED;
goto out_unlock;
}
- if (!is_shmem && PageDirty(page)) {
+ if (!is_shmem && (folio_test_dirty(folio) ||
+ folio_test_writeback(folio))) {
/*
* khugepaged only works on read-only fd, so this
* page is dirty because it hasn't been flushed
@@ -1795,34 +2071,37 @@ static void collapse_file(struct mm_struct *mm,
goto out_unlock;
}
- if (isolate_lru_page(page)) {
+ if (!folio_isolate_lru(folio)) {
result = SCAN_DEL_PAGE_LRU;
goto out_unlock;
}
- if (page_has_private(page) &&
- !try_to_release_page(page, GFP_KERNEL)) {
+ if (folio_has_private(folio) &&
+ !filemap_release_folio(folio, GFP_KERNEL)) {
result = SCAN_PAGE_HAS_PRIVATE;
- putback_lru_page(page);
+ folio_putback_lru(folio);
goto out_unlock;
}
- if (page_mapped(page))
- unmap_mapping_pages(mapping, index, 1, false);
+ if (folio_mapped(folio))
+ try_to_unmap(folio,
+ TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);
xas_lock_irq(&xas);
- xas_set(&xas, index);
- VM_BUG_ON_PAGE(page != xas_load(&xas), page);
- VM_BUG_ON_PAGE(page_mapped(page), page);
+ VM_BUG_ON_PAGE(page != xa_load(xas.xa, index), page);
/*
- * The page is expected to have page_count() == 3:
+ * We control three references to the page:
* - we hold a pin on it;
* - one reference from page cache;
* - one from isolate_lru_page;
+ * If those are the only references, then any new usage of the
+ * page will have to fetch it from the page cache. That requires
+ * locking the page to handle truncate, so any new usage will be
+ * blocked until we unlock page after collapse/during rollback.
*/
- if (!page_ref_freeze(page, 3)) {
+ if (page_count(page) != 3) {
result = SCAN_PAGE_COUNT;
xas_unlock_irq(&xas);
putback_lru_page(page);
@@ -1830,13 +2109,9 @@ static void collapse_file(struct mm_struct *mm,
}
/*
- * Add the page to the list to be able to undo the collapse if
- * something go wrong.
+ * Accumulate the pages that are being collapsed.
*/
list_add_tail(&page->lru, &pagelist);
-
- /* Finally, replace with the new page. */
- xas_store(&xas, new_page);
continue;
out_unlock:
unlock_page(page);
@@ -1844,116 +2119,207 @@ out_unlock:
goto xa_unlocked;
}
- if (is_shmem)
- __inc_node_page_state(new_page, NR_SHMEM_THPS);
- else {
- __inc_node_page_state(new_page, NR_FILE_THPS);
+ if (!is_shmem) {
filemap_nr_thps_inc(mapping);
- }
-
- if (nr_none) {
- __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
- if (is_shmem)
- __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
+ /*
+ * Paired with smp_mb() in do_dentry_open() to ensure
+ * i_writecount is up to date and the update to nr_thps is
+ * visible. Ensures the page cache will be truncated if the
+ * file is opened writable.
+ */
+ smp_mb();
+ if (inode_is_open_for_write(mapping->host)) {
+ result = SCAN_FAIL;
+ filemap_nr_thps_dec(mapping);
+ }
}
xa_locked:
xas_unlock_irq(&xas);
xa_unlocked:
- if (result == SCAN_SUCCEED) {
- struct page *page, *tmp;
+ /*
+ * If collapse is successful, flush must be done now before copying.
+ * If collapse is unsuccessful, does flush actually need to be done?
+ * Do it anyway, to clear the state.
+ */
+ try_to_unmap_flush();
- /*
- * Replacing old pages with new one has succeeded, now we
- * need to copy the content and free the old pages.
- */
- index = start;
- list_for_each_entry_safe(page, tmp, &pagelist, lru) {
- while (index < page->index) {
- clear_highpage(new_page + (index % HPAGE_PMD_NR));
- index++;
- }
- copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
- page);
- list_del(&page->lru);
- page->mapping = NULL;
- page_ref_unfreeze(page, 1);
- ClearPageActive(page);
- ClearPageUnevictable(page);
- unlock_page(page);
- put_page(page);
+ if (result != SCAN_SUCCEED)
+ goto rollback;
+
+ /*
+ * The old pages are locked, so they won't change anymore.
+ */
+ index = start;
+ list_for_each_entry(page, &pagelist, lru) {
+ while (index < page->index) {
+ clear_highpage(hpage + (index % HPAGE_PMD_NR));
index++;
}
- while (index < end) {
- clear_highpage(new_page + (index % HPAGE_PMD_NR));
- index++;
+ if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR), page) > 0) {
+ result = SCAN_COPY_MC;
+ goto rollback;
}
+ index++;
+ }
+ while (index < end) {
+ clear_highpage(hpage + (index % HPAGE_PMD_NR));
+ index++;
+ }
- SetPageUptodate(new_page);
- page_ref_add(new_page, HPAGE_PMD_NR - 1);
- if (is_shmem)
- set_page_dirty(new_page);
- lru_cache_add(new_page);
+ if (nr_none) {
+ struct vm_area_struct *vma;
+ int nr_none_check = 0;
+
+ i_mmap_lock_read(mapping);
+ xas_lock_irq(&xas);
+
+ xas_set(&xas, start);
+ for (index = start; index < end; index++) {
+ if (!xas_next(&xas)) {
+ xas_store(&xas, XA_RETRY_ENTRY);
+ if (xas_error(&xas)) {
+ result = SCAN_STORE_FAILED;
+ goto immap_locked;
+ }
+ nr_none_check++;
+ }
+ }
+
+ if (nr_none != nr_none_check) {
+ result = SCAN_PAGE_FILLED;
+ goto immap_locked;
+ }
/*
- * Remove pte page tables, so we can re-fault the page as huge.
+ * If userspace observed a missing page in a VMA with a MODE_MISSING
+ * userfaultfd, then it might expect a UFFD_EVENT_PAGEFAULT for that
+ * page. If so, we need to roll back to avoid suppressing such an
+ * event. Since wp/minor userfaultfds don't give userspace any
+ * guarantees that the kernel doesn't fill a missing page with a zero
+ * page, so they don't matter here.
+ *
+ * Any userfaultfds registered after this point will not be able to
+ * observe any missing pages due to the previously inserted retry
+ * entries.
*/
- retract_page_tables(mapping, start);
- *hpage = NULL;
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
+ if (userfaultfd_missing(vma)) {
+ result = SCAN_EXCEED_NONE_PTE;
+ goto immap_locked;
+ }
+ }
- khugepaged_pages_collapsed++;
- } else {
- struct page *page;
+immap_locked:
+ i_mmap_unlock_read(mapping);
+ if (result != SCAN_SUCCEED) {
+ xas_set(&xas, start);
+ for (index = start; index < end; index++) {
+ if (xas_next(&xas) == XA_RETRY_ENTRY)
+ xas_store(&xas, NULL);
+ }
- /* Something went wrong: roll back page cache changes */
+ xas_unlock_irq(&xas);
+ goto rollback;
+ }
+ } else {
xas_lock_irq(&xas);
- mapping->nrpages -= nr_none;
+ }
- if (is_shmem)
- shmem_uncharge(mapping->host, nr_none);
+ nr = thp_nr_pages(hpage);
+ if (is_shmem)
+ __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
+ else
+ __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
- xas_set(&xas, start);
- xas_for_each(&xas, page, end - 1) {
- page = list_first_entry_or_null(&pagelist,
- struct page, lru);
- if (!page || xas.xa_index < page->index) {
- if (!nr_none)
- break;
- nr_none--;
- /* Put holes back where they were */
- xas_store(&xas, NULL);
- continue;
- }
+ if (nr_none) {
+ __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
+ /* nr_none is always 0 for non-shmem. */
+ __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
+ }
- VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
+ /*
+ * Mark hpage as uptodate before inserting it into the page cache so
+ * that it isn't mistaken for an fallocated but unwritten page.
+ */
+ folio = page_folio(hpage);
+ folio_mark_uptodate(folio);
+ folio_ref_add(folio, HPAGE_PMD_NR - 1);
- /* Unfreeze the page. */
- list_del(&page->lru);
- page_ref_unfreeze(page, 2);
- xas_store(&xas, page);
- xas_pause(&xas);
- xas_unlock_irq(&xas);
- unlock_page(page);
- putback_lru_page(page);
- xas_lock_irq(&xas);
- }
- VM_BUG_ON(nr_none);
+ if (is_shmem)
+ folio_mark_dirty(folio);
+ folio_add_lru(folio);
+
+ /* Join all the small entries into a single multi-index entry. */
+ xas_set_order(&xas, start, HPAGE_PMD_ORDER);
+ xas_store(&xas, hpage);
+ WARN_ON_ONCE(xas_error(&xas));
+ xas_unlock_irq(&xas);
+
+ /*
+ * Remove pte page tables, so we can re-fault the page as huge.
+ */
+ result = retract_page_tables(mapping, start, mm, addr, hpage,
+ cc);
+ unlock_page(hpage);
+
+ /*
+ * The collapse has succeeded, so free the old pages.
+ */
+ list_for_each_entry_safe(page, tmp, &pagelist, lru) {
+ list_del(&page->lru);
+ page->mapping = NULL;
+ ClearPageActive(page);
+ ClearPageUnevictable(page);
+ unlock_page(page);
+ folio_put_refs(page_folio(page), 3);
+ }
+
+ goto out;
+
+rollback:
+ /* Something went wrong: roll back page cache changes */
+ if (nr_none) {
+ xas_lock_irq(&xas);
+ mapping->nrpages -= nr_none;
+ shmem_uncharge(mapping->host, nr_none);
xas_unlock_irq(&xas);
+ }
- new_page->mapping = NULL;
+ list_for_each_entry_safe(page, tmp, &pagelist, lru) {
+ list_del(&page->lru);
+ unlock_page(page);
+ putback_lru_page(page);
+ put_page(page);
+ }
+ /*
+ * Undo the updates of filemap_nr_thps_inc for non-SHMEM
+ * file only. This undo is not needed unless failure is
+ * due to SCAN_COPY_MC.
+ */
+ if (!is_shmem && result == SCAN_COPY_MC) {
+ filemap_nr_thps_dec(mapping);
+ /*
+ * Paired with smp_mb() in do_dentry_open() to
+ * ensure the update to nr_thps is visible.
+ */
+ smp_mb();
}
- unlock_page(new_page);
+ hpage->mapping = NULL;
+
+ unlock_page(hpage);
+ put_page(hpage);
out:
VM_BUG_ON(!list_empty(&pagelist));
- if (!IS_ERR_OR_NULL(*hpage))
- mem_cgroup_uncharge(*hpage);
- /* TODO: tracepoints */
+ trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result);
+ return result;
}
-static void khugepaged_scan_file(struct mm_struct *mm,
- struct file *file, pgoff_t start, struct page **hpage)
+static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
+ struct file *file, pgoff_t start,
+ struct collapse_control *cc)
{
struct page *page = NULL;
struct address_space *mapping = file->f_mapping;
@@ -1964,31 +2330,51 @@ static void khugepaged_scan_file(struct mm_struct *mm,
present = 0;
swap = 0;
- memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+ memset(cc->node_load, 0, sizeof(cc->node_load));
+ nodes_clear(cc->alloc_nmask);
rcu_read_lock();
xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
if (xas_retry(&xas, page))
continue;
if (xa_is_value(page)) {
- if (++swap > khugepaged_max_ptes_swap) {
+ ++swap;
+ if (cc->is_khugepaged &&
+ swap > khugepaged_max_ptes_swap) {
result = SCAN_EXCEED_SWAP_PTE;
+ count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
break;
}
continue;
}
+ /*
+ * TODO: khugepaged should compact smaller compound pages
+ * into a PMD sized page
+ */
if (PageTransCompound(page)) {
- result = SCAN_PAGE_COMPOUND;
+ struct page *head = compound_head(page);
+
+ result = compound_order(head) == HPAGE_PMD_ORDER &&
+ head->index == start
+ /* Maybe PMD-mapped */
+ ? SCAN_PTE_MAPPED_HUGEPAGE
+ : SCAN_PAGE_COMPOUND;
+ /*
+ * For SCAN_PTE_MAPPED_HUGEPAGE, further processing
+ * by the caller won't touch the page cache, and so
+ * it's safe to skip LRU and refcount checks before
+ * returning.
+ */
break;
}
node = page_to_nid(page);
- if (khugepaged_scan_abort(node)) {
+ if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
break;
}
- khugepaged_node_load[node]++;
+ cc->node_load[node]++;
if (!PageLRU(page)) {
result = SCAN_PAGE_LRU;
@@ -2017,54 +2403,67 @@ static void khugepaged_scan_file(struct mm_struct *mm,
rcu_read_unlock();
if (result == SCAN_SUCCEED) {
- if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+ if (cc->is_khugepaged &&
+ present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
+ count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
} else {
- node = khugepaged_find_target_node();
- collapse_file(mm, file, start, hpage, node);
+ result = collapse_file(mm, addr, file, start, cc);
}
}
- /* TODO: tracepoints */
+ trace_mm_khugepaged_scan_file(mm, page, file, present, swap, result);
+ return result;
}
#else
-static void khugepaged_scan_file(struct mm_struct *mm,
- struct file *file, pgoff_t start, struct page **hpage)
+static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
+ struct file *file, pgoff_t start,
+ struct collapse_control *cc)
{
BUILD_BUG();
}
-static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot)
{
- return 0;
+}
+
+static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr)
+{
+ return false;
}
#endif
-static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
- struct page **hpage)
+static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
+ struct collapse_control *cc)
__releases(&khugepaged_mm_lock)
__acquires(&khugepaged_mm_lock)
{
- struct mm_slot *mm_slot;
+ struct vma_iterator vmi;
+ struct khugepaged_mm_slot *mm_slot;
+ struct mm_slot *slot;
struct mm_struct *mm;
struct vm_area_struct *vma;
int progress = 0;
VM_BUG_ON(!pages);
lockdep_assert_held(&khugepaged_mm_lock);
+ *result = SCAN_FAIL;
- if (khugepaged_scan.mm_slot)
+ if (khugepaged_scan.mm_slot) {
mm_slot = khugepaged_scan.mm_slot;
- else {
- mm_slot = list_entry(khugepaged_scan.mm_head.next,
+ slot = &mm_slot->slot;
+ } else {
+ slot = list_entry(khugepaged_scan.mm_head.next,
struct mm_slot, mm_node);
+ mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
khugepaged_scan.address = 0;
khugepaged_scan.mm_slot = mm_slot;
}
spin_unlock(&khugepaged_mm_lock);
khugepaged_collapse_pte_mapped_thps(mm_slot);
- mm = mm_slot->mm;
+ mm = slot->mm;
/*
* Don't wait for semaphore (to avoid long wait times). Just move to
* the next mm on the list.
@@ -2072,39 +2471,38 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
vma = NULL;
if (unlikely(!mmap_read_trylock(mm)))
goto breakouterloop_mmap_lock;
- if (likely(!khugepaged_test_exit(mm)))
- vma = find_vma(mm, khugepaged_scan.address);
progress++;
- for (; vma; vma = vma->vm_next) {
+ if (unlikely(hpage_collapse_test_exit(mm)))
+ goto breakouterloop;
+
+ vma_iter_init(&vmi, mm, khugepaged_scan.address);
+ for_each_vma(vmi, vma) {
unsigned long hstart, hend;
cond_resched();
- if (unlikely(khugepaged_test_exit(mm))) {
+ if (unlikely(hpage_collapse_test_exit(mm))) {
progress++;
break;
}
- if (!hugepage_vma_check(vma, vma->vm_flags)) {
+ if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) {
skip:
progress++;
continue;
}
- hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
- hend = vma->vm_end & HPAGE_PMD_MASK;
- if (hstart >= hend)
- goto skip;
+ hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
+ hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
if (khugepaged_scan.address > hend)
goto skip;
if (khugepaged_scan.address < hstart)
khugepaged_scan.address = hstart;
VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
- if (shmem_file(vma->vm_file) && !shmem_huge_enabled(vma))
- goto skip;
while (khugepaged_scan.address < hend) {
- int ret;
+ bool mmap_locked = true;
+
cond_resched();
- if (unlikely(khugepaged_test_exit(mm)))
+ if (unlikely(hpage_collapse_test_exit(mm)))
goto breakouterloop;
VM_BUG_ON(khugepaged_scan.address < hstart ||
@@ -2116,19 +2514,48 @@ skip:
khugepaged_scan.address);
mmap_read_unlock(mm);
- ret = 1;
- khugepaged_scan_file(mm, file, pgoff, hpage);
+ *result = hpage_collapse_scan_file(mm,
+ khugepaged_scan.address,
+ file, pgoff, cc);
+ mmap_locked = false;
fput(file);
} else {
- ret = khugepaged_scan_pmd(mm, vma,
- khugepaged_scan.address,
- hpage);
+ *result = hpage_collapse_scan_pmd(mm, vma,
+ khugepaged_scan.address,
+ &mmap_locked,
+ cc);
+ }
+ switch (*result) {
+ case SCAN_PTE_MAPPED_HUGEPAGE: {
+ pmd_t *pmd;
+
+ *result = find_pmd_or_thp_or_none(mm,
+ khugepaged_scan.address,
+ &pmd);
+ if (*result != SCAN_SUCCEED)
+ break;
+ if (!khugepaged_add_pte_mapped_thp(mm,
+ khugepaged_scan.address))
+ break;
+ } fallthrough;
+ case SCAN_SUCCEED:
+ ++khugepaged_pages_collapsed;
+ break;
+ default:
+ break;
}
+
/* move to next address */
khugepaged_scan.address += HPAGE_PMD_SIZE;
progress += HPAGE_PMD_NR;
- if (ret)
- /* we released mmap_lock so break loop */
+ if (!mmap_locked)
+ /*
+ * We released mmap_lock so break loop. Note
+ * that we drop mmap_lock before all hugepage
+ * allocations, so if allocation fails, we are
+ * guaranteed to break here and report the
+ * correct result back to caller.
+ */
goto breakouterloop_mmap_lock;
if (progress >= pages)
goto breakouterloop;
@@ -2144,16 +2571,17 @@ breakouterloop_mmap_lock:
* Release the current mm_slot if this mm is about to die, or
* if we scanned all vmas of this mm.
*/
- if (khugepaged_test_exit(mm) || !vma) {
+ if (hpage_collapse_test_exit(mm) || !vma) {
/*
* Make sure that if mm_users is reaching zero while
* khugepaged runs here, khugepaged_exit will find
* mm_slot not pointing to the exiting mm.
*/
- if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
- khugepaged_scan.mm_slot = list_entry(
- mm_slot->mm_node.next,
- struct mm_slot, mm_node);
+ if (slot->mm_node.next != &khugepaged_scan.mm_head) {
+ slot = list_entry(slot->mm_node.next,
+ struct mm_slot, mm_node);
+ khugepaged_scan.mm_slot =
+ mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
khugepaged_scan.address = 0;
} else {
khugepaged_scan.mm_slot = NULL;
@@ -2169,7 +2597,7 @@ breakouterloop_mmap_lock:
static int khugepaged_has_work(void)
{
return !list_empty(&khugepaged_scan.mm_head) &&
- khugepaged_enabled();
+ hugepage_flags_enabled();
}
static int khugepaged_wait_event(void)
@@ -2178,21 +2606,16 @@ static int khugepaged_wait_event(void)
kthread_should_stop();
}
-static void khugepaged_do_scan(void)
+static void khugepaged_do_scan(struct collapse_control *cc)
{
- struct page *hpage = NULL;
unsigned int progress = 0, pass_through_head = 0;
- unsigned int pages = khugepaged_pages_to_scan;
+ unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
bool wait = true;
-
- barrier(); /* write khugepaged_pages_to_scan to local stack */
+ int result = SCAN_SUCCEED;
lru_add_drain_all();
- while (progress < pages) {
- if (!khugepaged_prealloc_page(&hpage, &wait))
- break;
-
+ while (true) {
cond_resched();
if (unlikely(kthread_should_stop() || try_to_freeze()))
@@ -2204,14 +2627,25 @@ static void khugepaged_do_scan(void)
if (khugepaged_has_work() &&
pass_through_head < 2)
progress += khugepaged_scan_mm_slot(pages - progress,
- &hpage);
+ &result, cc);
else
progress = pages;
spin_unlock(&khugepaged_mm_lock);
- }
- if (!IS_ERR_OR_NULL(hpage))
- put_page(hpage);
+ if (progress >= pages)
+ break;
+
+ if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) {
+ /*
+ * If fail to allocate the first time, try to sleep for
+ * a while. When hit again, cancel the scan.
+ */
+ if (!wait)
+ break;
+ wait = false;
+ khugepaged_alloc_sleep();
+ }
+ }
}
static bool khugepaged_should_wakeup(void)
@@ -2236,19 +2670,19 @@ static void khugepaged_wait_work(void)
return;
}
- if (khugepaged_enabled())
+ if (hugepage_flags_enabled())
wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
}
static int khugepaged(void *none)
{
- struct mm_slot *mm_slot;
+ struct khugepaged_mm_slot *mm_slot;
set_freezable();
set_user_nice(current, MAX_NICE);
while (!kthread_should_stop()) {
- khugepaged_do_scan();
+ khugepaged_do_scan(&khugepaged_collapse_control);
khugepaged_wait_work();
}
@@ -2267,6 +2701,11 @@ static void set_recommended_min_free_kbytes(void)
int nr_zones = 0;
unsigned long recommended_min;
+ if (!hugepage_flags_enabled()) {
+ calculate_min_free_kbytes();
+ goto update_wmarks;
+ }
+
for_each_populated_zone(zone) {
/*
* We don't need to worry about fragmentation of
@@ -2302,6 +2741,8 @@ static void set_recommended_min_free_kbytes(void)
min_free_kbytes = recommended_min;
}
+
+update_wmarks:
setup_per_zone_wmarks();
}
@@ -2310,7 +2751,7 @@ int start_stop_khugepaged(void)
int err = 0;
mutex_lock(&khugepaged_mutex);
- if (khugepaged_enabled()) {
+ if (hugepage_flags_enabled()) {
if (!khugepaged_thread)
khugepaged_thread = kthread_run(khugepaged, NULL,
"khugepaged");
@@ -2323,12 +2764,11 @@ int start_stop_khugepaged(void)
if (!list_empty(&khugepaged_scan.mm_head))
wake_up_interruptible(&khugepaged_wait);
-
- set_recommended_min_free_kbytes();
} else if (khugepaged_thread) {
kthread_stop(khugepaged_thread);
khugepaged_thread = NULL;
}
+ set_recommended_min_free_kbytes();
fail:
mutex_unlock(&khugepaged_mutex);
return err;
@@ -2337,7 +2777,152 @@ fail:
void khugepaged_min_free_kbytes_update(void)
{
mutex_lock(&khugepaged_mutex);
- if (khugepaged_enabled() && khugepaged_thread)
+ if (hugepage_flags_enabled() && khugepaged_thread)
set_recommended_min_free_kbytes();
mutex_unlock(&khugepaged_mutex);
}
+
+bool current_is_khugepaged(void)
+{
+ return kthread_func(current) == khugepaged;
+}
+
+static int madvise_collapse_errno(enum scan_result r)
+{
+ /*
+ * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide
+ * actionable feedback to caller, so they may take an appropriate
+ * fallback measure depending on the nature of the failure.
+ */
+ switch (r) {
+ case SCAN_ALLOC_HUGE_PAGE_FAIL:
+ return -ENOMEM;
+ case SCAN_CGROUP_CHARGE_FAIL:
+ case SCAN_EXCEED_NONE_PTE:
+ return -EBUSY;
+ /* Resource temporary unavailable - trying again might succeed */
+ case SCAN_PAGE_COUNT:
+ case SCAN_PAGE_LOCK:
+ case SCAN_PAGE_LRU:
+ case SCAN_DEL_PAGE_LRU:
+ case SCAN_PAGE_FILLED:
+ return -EAGAIN;
+ /*
+ * Other: Trying again likely not to succeed / error intrinsic to
+ * specified memory range. khugepaged likely won't be able to collapse
+ * either.
+ */
+ default:
+ return -EINVAL;
+ }
+}
+
+int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ struct collapse_control *cc;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long hstart, hend, addr;
+ int thps = 0, last_fail = SCAN_FAIL;
+ bool mmap_locked = true;
+
+ BUG_ON(vma->vm_start > start);
+ BUG_ON(vma->vm_end < end);
+
+ *prev = vma;
+
+ if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
+ return -EINVAL;
+
+ cc = kmalloc(sizeof(*cc), GFP_KERNEL);
+ if (!cc)
+ return -ENOMEM;
+ cc->is_khugepaged = false;
+
+ mmgrab(mm);
+ lru_add_drain_all();
+
+ hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = end & HPAGE_PMD_MASK;
+
+ for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
+ int result = SCAN_FAIL;
+
+ if (!mmap_locked) {
+ cond_resched();
+ mmap_read_lock(mm);
+ mmap_locked = true;
+ result = hugepage_vma_revalidate(mm, addr, false, &vma,
+ cc);
+ if (result != SCAN_SUCCEED) {
+ last_fail = result;
+ goto out_nolock;
+ }
+
+ hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
+ }
+ mmap_assert_locked(mm);
+ memset(cc->node_load, 0, sizeof(cc->node_load));
+ nodes_clear(cc->alloc_nmask);
+ if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
+ struct file *file = get_file(vma->vm_file);
+ pgoff_t pgoff = linear_page_index(vma, addr);
+
+ mmap_read_unlock(mm);
+ mmap_locked = false;
+ result = hpage_collapse_scan_file(mm, addr, file, pgoff,
+ cc);
+ fput(file);
+ } else {
+ result = hpage_collapse_scan_pmd(mm, vma, addr,
+ &mmap_locked, cc);
+ }
+ if (!mmap_locked)
+ *prev = NULL; /* Tell caller we dropped mmap_lock */
+
+handle_result:
+ switch (result) {
+ case SCAN_SUCCEED:
+ case SCAN_PMD_MAPPED:
+ ++thps;
+ break;
+ case SCAN_PTE_MAPPED_HUGEPAGE:
+ BUG_ON(mmap_locked);
+ BUG_ON(*prev);
+ mmap_write_lock(mm);
+ result = collapse_pte_mapped_thp(mm, addr, true);
+ mmap_write_unlock(mm);
+ goto handle_result;
+ /* Whitelisted set of results where continuing OK */
+ case SCAN_PMD_NULL:
+ case SCAN_PTE_NON_PRESENT:
+ case SCAN_PTE_UFFD_WP:
+ case SCAN_PAGE_RO:
+ case SCAN_LACK_REFERENCED_PAGE:
+ case SCAN_PAGE_NULL:
+ case SCAN_PAGE_COUNT:
+ case SCAN_PAGE_LOCK:
+ case SCAN_PAGE_COMPOUND:
+ case SCAN_PAGE_LRU:
+ case SCAN_DEL_PAGE_LRU:
+ last_fail = result;
+ break;
+ default:
+ last_fail = result;
+ /* Other error, exit */
+ goto out_maybelock;
+ }
+ }
+
+out_maybelock:
+ /* Caller expects us to hold mmap_lock on return */
+ if (!mmap_locked)
+ mmap_read_lock(mm);
+out_nolock:
+ mmap_assert_locked(mm);
+ mmdrop(mm);
+ kfree(cc);
+
+ return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
+ : madvise_collapse_errno(last_fail);
+}
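
The madvise_collapse() and madvise_collapse_errno() additions above give userspace a synchronous collapse primitive: the call returns 0 only when every PMD-sized region in the aligned [hstart, hend) range ends up PMD-mapped (thps == (hend - hstart) >> HPAGE_PMD_SHIFT), and otherwise maps the last failure to an errno, with -EAGAIN reserved for transient conditions (page count/LRU/lock races) that a retry may clear and -EINVAL for failures khugepaged would not overcome either. Below is a hedged userspace sketch of how a caller might use it; it is not part of the patch, it assumes a 2M PMD size and the MADV_COLLAPSE uapi value carried by kernels with this change, and it over-maps so a PMD-aligned window is guaranteed without relying on mmap alignment.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25		/* assumed uapi value; check mman-common.h */
#endif

int main(void)
{
	size_t pmd = 2UL << 20;		/* assumed PMD size: 2M on x86-64 */
	char *raw, *p;

	/* Over-map so a PMD-aligned, PMD-sized window exists inside the VMA. */
	raw = mmap(NULL, 2 * pmd, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (raw == MAP_FAILED)
		return 1;
	p = (char *)(((uintptr_t)raw + pmd - 1) & ~(uintptr_t)(pmd - 1));

	memset(p, 1, pmd);		/* fault the window in as small pages */

	/* Synchronous collapse: 0 only if the whole window is now PMD-mapped. */
	if (madvise(p, pmd, MADV_COLLAPSE)) {
		if (errno == EAGAIN)
			fprintf(stderr, "transient failure, retry may succeed\n");
		else
			perror("MADV_COLLAPSE");
	}
	munmap(raw, 2 * pmd);
	return 0;
}

Unlike MADV_HUGEPAGE, the request is serviced in the calling context rather than deferred to khugepaged, which is why madvise_collapse() builds its own collapse_control with is_khugepaged = false and skips the khugepaged_max_ptes_* heuristics seen earlier in this patch.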
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index c0014d3b91c1..a2d34226e3c8 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -13,15 +13,18 @@
*
* The following locks and mutexes are used by kmemleak:
*
- * - kmemleak_lock (raw_spinlock_t): protects the object_list modifications and
- * accesses to the object_tree_root. The object_list is the main list
- * holding the metadata (struct kmemleak_object) for the allocated memory
- * blocks. The object_tree_root is a red black tree used to look-up
- * metadata based on a pointer to the corresponding memory block. The
- * kmemleak_object structures are added to the object_list and
- * object_tree_root in the create_object() function called from the
- * kmemleak_alloc() callback and removed in delete_object() called from the
- * kmemleak_free() callback
+ * - kmemleak_lock (raw_spinlock_t): protects the object_list as well as
+ * del_state modifications and accesses to the object_tree_root (or
+ * object_phys_tree_root). The object_list is the main list holding the
+ * metadata (struct kmemleak_object) for the allocated memory blocks.
+ * The object_tree_root and object_phys_tree_root are red
+ * black trees used to look-up metadata based on a pointer to the
+ * corresponding memory block. The object_phys_tree_root is for objects
+ * allocated with physical address. The kmemleak_object structures are
+ * added to the object_list and object_tree_root (or object_phys_tree_root)
+ * in the create_object() function called from the kmemleak_alloc() (or
+ * kmemleak_alloc_phys()) callback and removed in delete_object() called from
+ * the kmemleak_free() callback
* - kmemleak_object.lock (raw_spinlock_t): protects a kmemleak_object.
* Accesses to the metadata (e.g. count) are protected by this lock. Note
* that some members of this structure may be protected by other means
@@ -77,6 +80,7 @@
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/stacktrace.h>
+#include <linux/stackdepot.h>
#include <linux/cache.h>
#include <linux/percpu.h>
#include <linux/memblock.h>
@@ -97,6 +101,7 @@
#include <linux/atomic.h>
#include <linux/kasan.h>
+#include <linux/kfence.h>
#include <linux/kmemleak.h>
#include <linux/memory_hotplug.h>
@@ -112,7 +117,8 @@
#define BYTES_PER_POINTER sizeof(void *)
/* GFP bitmask for kmemleak internal allocations */
-#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
+#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC | \
+ __GFP_NOLOCKDEP)) | \
__GFP_NORETRY | __GFP_NOMEMALLOC | \
__GFP_NOWARN)
@@ -143,6 +149,7 @@ struct kmemleak_object {
struct rcu_head rcu; /* object_list lockless traversal */
/* object usage count; object freed when use_count == 0 */
atomic_t use_count;
+ unsigned int del_state; /* deletion state */
unsigned long pointer;
size_t size;
/* pass surplus references to this pointer */
@@ -155,8 +162,7 @@ struct kmemleak_object {
u32 checksum;
/* memory ranges to be scanned inside an object (empty for all) */
struct hlist_head area_list;
- unsigned long trace[MAX_TRACE];
- unsigned int trace_len;
+ depot_stack_handle_t trace_handle;
unsigned long jiffies; /* creation timestamp */
pid_t pid; /* pid of the current task */
char comm[TASK_COMM_LEN]; /* executable name */
@@ -170,6 +176,13 @@ struct kmemleak_object {
#define OBJECT_NO_SCAN (1 << 2)
/* flag set to fully scan the object when scan_area allocation failed */
#define OBJECT_FULL_SCAN (1 << 3)
+/* flag set for object allocated with physical address */
+#define OBJECT_PHYS (1 << 4)
+
+/* set when __remove_object() called */
+#define DELSTATE_REMOVED (1 << 0)
+/* set to temporarily prevent deletion from object_list */
+#define DELSTATE_NO_DELETE (1 << 1)
#define HEX_PREFIX " "
/* number of bytes to print per line; must be 16 or 32 */
@@ -191,7 +204,9 @@ static int mem_pool_free_count = ARRAY_SIZE(mem_pool);
static LIST_HEAD(mem_pool_free_list);
/* search tree for object boundaries */
static struct rb_root object_tree_root = RB_ROOT;
-/* protecting the access to object_list and object_tree_root */
+/* search tree for object (with OBJECT_PHYS flag) boundaries */
+static struct rb_root object_phys_tree_root = RB_ROOT;
+/* protecting the access to object_list, object_tree_root (or object_phys_tree_root) */
static DEFINE_RAW_SPINLOCK(kmemleak_lock);
/* allocation caches for kmemleak internal data */
@@ -218,7 +233,7 @@ static struct task_struct *scan_thread;
static unsigned long jiffies_min_age;
static unsigned long jiffies_last_scan;
/* delay between automatic memory scannings */
-static signed long jiffies_scan_wait;
+static unsigned long jiffies_scan_wait;
/* enables or disables the task stacks scanning */
static int kmemleak_stack_scan = 1;
/* protects the memory scanning, parameters and debug/kmemleak file access */
@@ -283,13 +298,16 @@ static void hex_dump_object(struct seq_file *seq,
const u8 *ptr = (const u8 *)object->pointer;
size_t len;
+ if (WARN_ON_ONCE(object->flags & OBJECT_PHYS))
+ return;
+
/* limit the number of lines to HEX_MAX_LINES */
len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE);
warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len);
kasan_disable_current();
warn_or_seq_hex_dump(seq, DUMP_PREFIX_NONE, HEX_ROW_SIZE,
- HEX_GROUP_SIZE, ptr, len, HEX_ASCII);
+ HEX_GROUP_SIZE, kasan_reset_tag((void *)ptr), len, HEX_ASCII);
kasan_enable_current();
}
@@ -335,19 +353,22 @@ static void print_unreferenced(struct seq_file *seq,
struct kmemleak_object *object)
{
int i;
+ unsigned long *entries;
+ unsigned int nr_entries;
unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies);
+ nr_entries = stack_depot_fetch(object->trace_handle, &entries);
warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
- object->pointer, object->size);
+ object->pointer, object->size);
warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
- object->comm, object->pid, object->jiffies,
- msecs_age / 1000, msecs_age % 1000);
+ object->comm, object->pid, object->jiffies,
+ msecs_age / 1000, msecs_age % 1000);
hex_dump_object(seq, object);
warn_or_seq_printf(seq, " backtrace:\n");
- for (i = 0; i < object->trace_len; i++) {
- void *ptr = (void *)object->trace[i];
- warn_or_seq_printf(seq, " [<%p>] %pS\n", ptr, ptr);
+ for (i = 0; i < nr_entries; i++) {
+ void *ptr = (void *)entries[i];
+ warn_or_seq_printf(seq, " [<%pK>] %pS\n", ptr, ptr);
}
}
@@ -359,15 +380,16 @@ static void print_unreferenced(struct seq_file *seq,
static void dump_object_info(struct kmemleak_object *object)
{
pr_notice("Object 0x%08lx (size %zu):\n",
- object->pointer, object->size);
+ object->pointer, object->size);
pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
- object->comm, object->pid, object->jiffies);
+ object->comm, object->pid, object->jiffies);
pr_notice(" min_count = %d\n", object->min_count);
pr_notice(" count = %d\n", object->count);
pr_notice(" flags = 0x%x\n", object->flags);
pr_notice(" checksum = %u\n", object->checksum);
pr_notice(" backtrace:\n");
- stack_trace_print(object->trace, object->trace_len, 4);
+ if (object->trace_handle)
+ stack_depot_print(object->trace_handle);
}
/*
@@ -376,18 +398,25 @@ static void dump_object_info(struct kmemleak_object *object)
* beginning of the memory block are allowed. The kmemleak_lock must be held
* when calling this function.
*/
-static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
+static struct kmemleak_object *__lookup_object(unsigned long ptr, int alias,
+ bool is_phys)
{
- struct rb_node *rb = object_tree_root.rb_node;
+ struct rb_node *rb = is_phys ? object_phys_tree_root.rb_node :
+ object_tree_root.rb_node;
+ unsigned long untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
while (rb) {
- struct kmemleak_object *object =
- rb_entry(rb, struct kmemleak_object, rb_node);
- if (ptr < object->pointer)
+ struct kmemleak_object *object;
+ unsigned long untagged_objp;
+
+ object = rb_entry(rb, struct kmemleak_object, rb_node);
+ untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer);
+
+ if (untagged_ptr < untagged_objp)
rb = object->rb_node.rb_left;
- else if (object->pointer + object->size <= ptr)
+ else if (untagged_objp + object->size <= untagged_ptr)
rb = object->rb_node.rb_right;
- else if (object->pointer == ptr || alias)
+ else if (untagged_objp == untagged_ptr || alias)
return object;
else {
kmemleak_warn("Found object by alias at 0x%08lx\n",
@@ -399,6 +428,12 @@ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
return NULL;
}
+/* Look-up a kmemleak object which allocated with virtual address. */
+static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
+{
+ return __lookup_object(ptr, alias, false);
+}
+
/*
* Increment the object use_count. Return 1 if successful or 0 otherwise. Note
* that once an object's use_count reached 0, the RCU freeing was already
@@ -508,14 +543,15 @@ static void put_object(struct kmemleak_object *object)
/*
* Look up an object in the object search tree and increase its use_count.
*/
-static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
+static struct kmemleak_object *__find_and_get_object(unsigned long ptr, int alias,
+ bool is_phys)
{
unsigned long flags;
struct kmemleak_object *object;
rcu_read_lock();
raw_spin_lock_irqsave(&kmemleak_lock, flags);
- object = lookup_object(ptr, alias);
+ object = __lookup_object(ptr, alias, is_phys);
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
/* check whether the object is still available */
@@ -526,28 +562,41 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
return object;
}
+/* Look up and get an object which allocated with virtual address. */
+static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
+{
+ return __find_and_get_object(ptr, alias, false);
+}
+
/*
- * Remove an object from the object_tree_root and object_list. Must be called
- * with the kmemleak_lock held _if_ kmemleak is still enabled.
+ * Remove an object from the object_tree_root (or object_phys_tree_root)
+ * and object_list. Must be called with the kmemleak_lock held _if_ kmemleak
+ * is still enabled.
*/
static void __remove_object(struct kmemleak_object *object)
{
- rb_erase(&object->rb_node, &object_tree_root);
- list_del_rcu(&object->object_list);
+ rb_erase(&object->rb_node, object->flags & OBJECT_PHYS ?
+ &object_phys_tree_root :
+ &object_tree_root);
+ if (!(object->del_state & DELSTATE_NO_DELETE))
+ list_del_rcu(&object->object_list);
+ object->del_state |= DELSTATE_REMOVED;
}
/*
* Look up an object in the object search tree and remove it from both
- * object_tree_root and object_list. The returned object's use_count should be
- * at least 1, as initially set by create_object().
+ * object_tree_root (or object_phys_tree_root) and object_list. The
+ * returned object's use_count should be at least 1, as initially set
+ * by create_object().
*/
-static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias)
+static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias,
+ bool is_phys)
{
unsigned long flags;
struct kmemleak_object *object;
raw_spin_lock_irqsave(&kmemleak_lock, flags);
- object = lookup_object(ptr, alias);
+ object = __lookup_object(ptr, alias, is_phys);
if (object)
__remove_object(object);
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
@@ -555,31 +604,39 @@ static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int ali
return object;
}
-/*
- * Save stack trace to the given array of MAX_TRACE size.
- */
-static int __save_stack_trace(unsigned long *trace)
+static noinline depot_stack_handle_t set_track_prepare(void)
{
- return stack_trace_save(trace, MAX_TRACE, 2);
+ depot_stack_handle_t trace_handle;
+ unsigned long entries[MAX_TRACE];
+ unsigned int nr_entries;
+
+ if (!kmemleak_initialized)
+ return 0;
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
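+ /*
+ * Depot the trace with GFP_NOWAIT so this is safe in any allocation
+ * context; if the depot pool cannot grow, the handle is simply 0 and
+ * no backtrace is stored for this object.
+ */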
+ trace_handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
+
+ return trace_handle;
}
/*
* Create the metadata (struct kmemleak_object) corresponding to an allocated
- * memory block and add it to the object_list and object_tree_root.
+ * memory block and add it to the object_list and object_tree_root (or
+ * object_phys_tree_root).
*/
-static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
- int min_count, gfp_t gfp)
+static void __create_object(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp, bool is_phys)
{
unsigned long flags;
struct kmemleak_object *object, *parent;
struct rb_node **link, *rb_parent;
unsigned long untagged_ptr;
+ unsigned long untagged_objp;
object = mem_pool_alloc(gfp);
if (!object) {
pr_warn("Cannot allocate a kmemleak_object structure\n");
kmemleak_disable();
- return NULL;
+ return;
}
INIT_LIST_HEAD(&object->object_list);
@@ -587,17 +644,18 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
INIT_HLIST_HEAD(&object->area_list);
raw_spin_lock_init(&object->lock);
atomic_set(&object->use_count, 1);
- object->flags = OBJECT_ALLOCATED;
+ object->flags = OBJECT_ALLOCATED | (is_phys ? OBJECT_PHYS : 0);
object->pointer = ptr;
- object->size = size;
+ object->size = kfence_ksize((void *)ptr) ?: size;
object->excess_ref = 0;
object->min_count = min_count;
object->count = 0; /* white color initially */
object->jiffies = jiffies;
object->checksum = 0;
+ object->del_state = 0;
/* task information */
- if (in_irq()) {
+ if (in_hardirq()) {
object->pid = 0;
strncpy(object->comm, "hardirq", sizeof(object->comm));
} else if (in_serving_softirq()) {
@@ -615,21 +673,29 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
}
/* kernel backtrace */
- object->trace_len = __save_stack_trace(object->trace);
+ object->trace_handle = set_track_prepare();
raw_spin_lock_irqsave(&kmemleak_lock, flags);
untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
- min_addr = min(min_addr, untagged_ptr);
- max_addr = max(max_addr, untagged_ptr + size);
- link = &object_tree_root.rb_node;
+ /*
+ * Only update min_addr and max_addr for objects
+ * storing virtual addresses.
+ */
+ if (!is_phys) {
+ min_addr = min(min_addr, untagged_ptr);
+ max_addr = max(max_addr, untagged_ptr + size);
+ }
+ link = is_phys ? &object_phys_tree_root.rb_node :
+ &object_tree_root.rb_node;
rb_parent = NULL;
while (*link) {
rb_parent = *link;
parent = rb_entry(rb_parent, struct kmemleak_object, rb_node);
- if (ptr + size <= parent->pointer)
+ untagged_objp = (unsigned long)kasan_reset_tag((void *)parent->pointer);
+ if (untagged_ptr + size <= untagged_objp)
link = &parent->rb_node.rb_left;
- else if (parent->pointer + parent->size <= ptr)
+ else if (untagged_objp + parent->size <= untagged_ptr)
link = &parent->rb_node.rb_right;
else {
kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n",
@@ -640,17 +706,29 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
*/
dump_object_info(parent);
kmem_cache_free(object_cache, object);
- object = NULL;
goto out;
}
}
rb_link_node(&object->rb_node, rb_parent, link);
- rb_insert_color(&object->rb_node, &object_tree_root);
-
+ rb_insert_color(&object->rb_node, is_phys ? &object_phys_tree_root :
+ &object_tree_root);
list_add_tail_rcu(&object->object_list, &object_list);
out:
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
- return object;
+}
+
+/* Create a kmemleak object for memory allocated with a virtual address. */
+static void create_object(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp)
+{
+ __create_object(ptr, size, min_count, gfp, false);
+}
+
+/* Create a kmemleak object for memory allocated with a physical address. */
+static void create_object_phys(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp)
+{
+ __create_object(ptr, size, min_count, gfp, true);
}
/*
@@ -681,7 +759,7 @@ static void delete_object_full(unsigned long ptr)
{
struct kmemleak_object *object;
- object = find_and_remove_object(ptr, 0);
+ object = find_and_remove_object(ptr, 0, false);
if (!object) {
#ifdef DEBUG
kmemleak_warn("Freeing unknown object at 0x%08lx\n",
@@ -697,12 +775,12 @@ static void delete_object_full(unsigned long ptr)
* delete it. If the memory block is partially freed, the function may create
* additional metadata for the remaining parts of the block.
*/
-static void delete_object_part(unsigned long ptr, size_t size)
+static void delete_object_part(unsigned long ptr, size_t size, bool is_phys)
{
struct kmemleak_object *object;
unsigned long start, end;
- object = find_and_remove_object(ptr, 1);
+ object = find_and_remove_object(ptr, 1, is_phys);
if (!object) {
#ifdef DEBUG
kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
@@ -719,11 +797,11 @@ static void delete_object_part(unsigned long ptr, size_t size)
start = object->pointer;
end = object->pointer + object->size;
if (ptr > start)
- create_object(start, ptr - start, object->min_count,
- GFP_KERNEL);
+ __create_object(start, ptr - start, object->min_count,
+ GFP_KERNEL, is_phys);
if (ptr + size < end)
- create_object(ptr + size, end - ptr - size, object->min_count,
- GFP_KERNEL);
+ __create_object(ptr + size, end - ptr - size, object->min_count,
+ GFP_KERNEL, is_phys);
__delete_object(object);
}
@@ -744,11 +822,11 @@ static void paint_it(struct kmemleak_object *object, int color)
raw_spin_unlock_irqrestore(&object->lock, flags);
}
-static void paint_ptr(unsigned long ptr, int color)
+static void paint_ptr(unsigned long ptr, int color, bool is_phys)
{
struct kmemleak_object *object;
- object = find_and_get_object(ptr, 0);
+ object = __find_and_get_object(ptr, 0, is_phys);
if (!object) {
kmemleak_warn("Trying to color unknown object at 0x%08lx as %s\n",
ptr,
@@ -766,16 +844,16 @@ static void paint_ptr(unsigned long ptr, int color)
*/
static void make_gray_object(unsigned long ptr)
{
- paint_ptr(ptr, KMEMLEAK_GREY);
+ paint_ptr(ptr, KMEMLEAK_GREY, false);
}
/*
* Mark the object as black-colored so that it is ignored from scans and
* reporting.
*/
-static void make_black_object(unsigned long ptr)
+static void make_black_object(unsigned long ptr, bool is_phys)
{
- paint_ptr(ptr, KMEMLEAK_BLACK);
+ paint_ptr(ptr, KMEMLEAK_BLACK, is_phys);
}
/*
@@ -787,6 +865,8 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
unsigned long flags;
struct kmemleak_object *object;
struct kmemleak_scan_area *area = NULL;
+ unsigned long untagged_ptr;
+ unsigned long untagged_objp;
object = find_and_get_object(ptr, 1);
if (!object) {
@@ -795,6 +875,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
return;
}
+ untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
+ untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer);
+
if (scan_area_cache)
area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
@@ -806,8 +889,8 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
goto out_unlock;
}
if (size == SIZE_MAX) {
- size = object->pointer + object->size - ptr;
- } else if (ptr + size > object->pointer + object->size) {
+ size = untagged_objp + object->size - untagged_ptr;
+ } else if (untagged_ptr + size > untagged_objp + object->size) {
kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
dump_object_info(object);
kmem_cache_free(scan_area_cache, area);
@@ -976,7 +1059,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
pr_debug("%s(0x%p)\n", __func__, ptr);
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
- delete_object_part((unsigned long)ptr, size);
+ delete_object_part((unsigned long)ptr, size, false);
}
EXPORT_SYMBOL_GPL(kmemleak_free_part);
@@ -1027,7 +1110,7 @@ void __ref kmemleak_update_trace(const void *ptr)
}
raw_spin_lock_irqsave(&object->lock, flags);
- object->trace_len = __save_stack_trace(object->trace);
+ object->trace_handle = set_track_prepare();
raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
@@ -1064,7 +1147,7 @@ void __ref kmemleak_ignore(const void *ptr)
pr_debug("%s(0x%p)\n", __func__, ptr);
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
- make_black_object((unsigned long)ptr);
+ make_black_object((unsigned long)ptr, false);
}
EXPORT_SYMBOL(kmemleak_ignore);
@@ -1111,15 +1194,18 @@ EXPORT_SYMBOL(kmemleak_no_scan);
* address argument
* @phys: physical address of the object
* @size: size of the object
- * @min_count: minimum number of references to this object.
- * See kmemleak_alloc()
* @gfp: kmalloc() flags used for kmemleak internal memory allocations
*/
-void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count,
- gfp_t gfp)
+void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, gfp_t gfp)
{
- if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
- kmemleak_alloc(__va(phys), size, min_count, gfp);
+ pr_debug("%s(0x%pa, %zu)\n", __func__, &phys, size);
+
+ if (kmemleak_enabled)
+ /*
+ * Create the object with the OBJECT_PHYS flag and
+ * assume a min_count of 0.
+ */
+ create_object_phys((unsigned long)phys, size, 0, gfp);
}
EXPORT_SYMBOL(kmemleak_alloc_phys);
@@ -1132,22 +1218,12 @@ EXPORT_SYMBOL(kmemleak_alloc_phys);
*/
void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size)
{
- if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
- kmemleak_free_part(__va(phys), size);
-}
-EXPORT_SYMBOL(kmemleak_free_part_phys);
+ pr_debug("%s(0x%pa)\n", __func__, &phys);
-/**
- * kmemleak_not_leak_phys - similar to kmemleak_not_leak but taking a physical
- * address argument
- * @phys: physical address of the object
- */
-void __ref kmemleak_not_leak_phys(phys_addr_t phys)
-{
- if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
- kmemleak_not_leak(__va(phys));
+ if (kmemleak_enabled)
+ delete_object_part((unsigned long)phys, size, true);
}
-EXPORT_SYMBOL(kmemleak_not_leak_phys);
+EXPORT_SYMBOL(kmemleak_free_part_phys);
/**
* kmemleak_ignore_phys - similar to kmemleak_ignore but taking a physical
@@ -1156,8 +1232,10 @@ EXPORT_SYMBOL(kmemleak_not_leak_phys);
*/
void __ref kmemleak_ignore_phys(phys_addr_t phys)
{
- if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
- kmemleak_ignore(__va(phys));
+ pr_debug("%s(0x%pa)\n", __func__, &phys);
+
+ if (kmemleak_enabled)
+ make_black_object((unsigned long)phys, true);
}
EXPORT_SYMBOL(kmemleak_ignore_phys);
@@ -1168,9 +1246,12 @@ static bool update_checksum(struct kmemleak_object *object)
{
u32 old_csum = object->checksum;
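+ /*
+ * object->pointer of an OBJECT_PHYS object is a physical address and
+ * must not be dereferenced here, so such objects never get a checksum.
+ */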
+ if (WARN_ON_ONCE(object->flags & OBJECT_PHYS))
+ return false;
+
kasan_disable_current();
kcsan_disable_current();
- object->checksum = crc32(0, (void *)object->pointer, object->size);
+ object->checksum = crc32(0, kasan_reset_tag((void *)object->pointer), object->size);
kasan_enable_current();
kcsan_enable_current();
@@ -1202,7 +1283,7 @@ static void update_refs(struct kmemleak_object *object)
}
/*
- * Memory scanning is a long process and it needs to be interruptable. This
+ * Memory scanning is a long process and it needs to be interruptible. This
* function checks whether such interrupt condition occurred.
*/
static int scan_should_stop(void)
@@ -1245,7 +1326,7 @@ static void scan_block(void *_start, void *_end,
break;
kasan_disable_current();
- pointer = *ptr;
+ pointer = *(unsigned long *)kasan_reset_tag((void *)ptr);
kasan_enable_current();
untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer);
@@ -1321,6 +1402,7 @@ static void scan_object(struct kmemleak_object *object)
{
struct kmemleak_scan_area *area;
unsigned long flags;
+ void *obj_ptr;
/*
* Once the object->lock is acquired, the corresponding memory block
@@ -1332,10 +1414,15 @@ static void scan_object(struct kmemleak_object *object)
if (!(object->flags & OBJECT_ALLOCATED))
/* already freed object */
goto out;
+
+ obj_ptr = object->flags & OBJECT_PHYS ?
+ __va((phys_addr_t)object->pointer) :
+ (void *)object->pointer;
+
if (hlist_empty(&object->area_list) ||
object->flags & OBJECT_FULL_SCAN) {
- void *start = (void *)object->pointer;
- void *end = (void *)(object->pointer + object->size);
+ void *start = obj_ptr;
+ void *end = obj_ptr + object->size;
void *next;
do {
@@ -1393,15 +1480,44 @@ static void scan_gray_list(void)
}
/*
+ * Conditionally call cond_resched() in an object iteration loop while making
+ * sure that the given object won't go away without the RCU read lock by
+ * performing a get_object() if necessary.
+ */
+static void kmemleak_cond_resched(struct kmemleak_object *object)
+{
+ if (!get_object(object))
+ return; /* Try next object */
+
+ raw_spin_lock_irq(&kmemleak_lock);
+ if (object->del_state & DELSTATE_REMOVED)
+ goto unlock_put; /* Object removed */
+ object->del_state |= DELSTATE_NO_DELETE;
+ raw_spin_unlock_irq(&kmemleak_lock);
+
+ rcu_read_unlock();
+ cond_resched();
+ rcu_read_lock();
+
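+ /*
+ * If __remove_object() ran while we slept, it saw DELSTATE_NO_DELETE
+ * and skipped the list removal; complete it here before clearing the
+ * flag.
+ */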
+ raw_spin_lock_irq(&kmemleak_lock);
+ if (object->del_state & DELSTATE_REMOVED)
+ list_del_rcu(&object->object_list);
+ object->del_state &= ~DELSTATE_NO_DELETE;
+unlock_put:
+ raw_spin_unlock_irq(&kmemleak_lock);
+ put_object(object);
+}
+
+/*
* Scan data sections and all the referenced memory blocks allocated via the
* kernel's standard allocators. This function must be called with the
* scan_mutex held.
*/
static void kmemleak_scan(void)
{
- unsigned long flags;
struct kmemleak_object *object;
- int i;
+ struct zone *zone;
+ int __maybe_unused i;
int new_leaks = 0;
jiffies_last_scan = jiffies;
@@ -1409,7 +1525,7 @@ static void kmemleak_scan(void)
/* prepare the kmemleak_object's */
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- raw_spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irq(&object->lock);
#ifdef DEBUG
/*
* With a few exceptions there should be a maximum of
@@ -1421,12 +1537,26 @@ static void kmemleak_scan(void)
dump_object_info(object);
}
#endif
+
+ /* ignore objects outside lowmem (paint them black) */
+ if ((object->flags & OBJECT_PHYS) &&
+ !(object->flags & OBJECT_NO_SCAN)) {
+ unsigned long phys = object->pointer;
+
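+ /*
+ * Such objects are scanned via __va(), which is only valid for
+ * lowmem, so anything outside that range must be painted black.
+ */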
+ if (PHYS_PFN(phys) < min_low_pfn ||
+ PHYS_PFN(phys + object->size) >= max_low_pfn)
+ __paint_it(object, KMEMLEAK_BLACK);
+ }
+
/* reset the reference count (whiten the object) */
object->count = 0;
if (color_gray(object) && get_object(object))
list_add_tail(&object->gray_list, &gray_list);
- raw_spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irq(&object->lock);
+
+ if (need_resched())
+ kmemleak_cond_resched(object);
}
rcu_read_unlock();
@@ -1441,9 +1571,9 @@ static void kmemleak_scan(void)
* Struct page scanning for each node.
*/
get_online_mems();
- for_each_online_node(i) {
- unsigned long start_pfn = node_start_pfn(i);
- unsigned long end_pfn = node_end_pfn(i);
+ for_each_populated_zone(zone) {
+ unsigned long start_pfn = zone->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(zone);
unsigned long pfn;
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
@@ -1452,8 +1582,8 @@ static void kmemleak_scan(void)
if (!page)
continue;
- /* only scan pages belonging to this node */
- if (page_to_nid(page) != i)
+ /* only scan pages belonging to this zone */
+ if (page_zone(page) != zone)
continue;
/* only scan if page is in use */
if (page_count(page) == 0)
@@ -1494,14 +1624,24 @@ static void kmemleak_scan(void)
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- raw_spin_lock_irqsave(&object->lock, flags);
+ if (need_resched())
+ kmemleak_cond_resched(object);
+
+ /*
+ * This is racy but we can save the overhead of lock/unlock
+ * calls. The missed objects, if any, should be caught in
+ * the next scan.
+ */
+ if (!color_white(object))
+ continue;
+ raw_spin_lock_irq(&object->lock);
if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
&& update_checksum(object) && get_object(object)) {
/* color it gray temporarily */
object->count = object->min_count;
list_add_tail(&object->gray_list, &gray_list);
}
- raw_spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irq(&object->lock);
}
rcu_read_unlock();
@@ -1521,7 +1661,17 @@ static void kmemleak_scan(void)
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- raw_spin_lock_irqsave(&object->lock, flags);
+ if (need_resched())
+ kmemleak_cond_resched(object);
+
+ /*
+ * This is racy but we can save the overhead of lock/unlock
+ * calls. The missed objects, if any, should be caught in
+ * the next scan.
+ */
+ if (!color_white(object))
+ continue;
+ raw_spin_lock_irq(&object->lock);
if (unreferenced_object(object) &&
!(object->flags & OBJECT_REPORTED)) {
object->flags |= OBJECT_REPORTED;
@@ -1531,7 +1681,7 @@ static void kmemleak_scan(void)
new_leaks++;
}
- raw_spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irq(&object->lock);
}
rcu_read_unlock();
@@ -1566,7 +1716,7 @@ static int kmemleak_scan_thread(void *arg)
}
while (!kthread_should_stop()) {
- signed long timeout = jiffies_scan_wait;
+ signed long timeout = READ_ONCE(jiffies_scan_wait);
mutex_lock(&scan_mutex);
kmemleak_scan();
@@ -1733,15 +1883,14 @@ static int dump_str_object_info(const char *str)
static void kmemleak_clear(void)
{
struct kmemleak_object *object;
- unsigned long flags;
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- raw_spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irq(&object->lock);
if ((object->flags & OBJECT_REPORTED) &&
unreferenced_object(object))
__paint_it(object, KMEMLEAK_GREY);
- raw_spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irq(&object->lock);
}
rcu_read_unlock();
@@ -1806,14 +1955,20 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
else if (strncmp(buf, "scan=off", 8) == 0)
stop_scan_thread();
else if (strncmp(buf, "scan=", 5) == 0) {
- unsigned long secs;
+ unsigned secs;
+ unsigned long msecs;
- ret = kstrtoul(buf + 5, 0, &secs);
+ ret = kstrtouint(buf + 5, 0, &secs);
if (ret < 0)
goto out;
+
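+ /*
+ * msecs_to_jiffies() takes an unsigned int, so clamp very large
+ * intervals to UINT_MAX milliseconds (roughly 49 days).
+ */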
+ msecs = secs * MSEC_PER_SEC;
+ if (msecs > UINT_MAX)
+ msecs = UINT_MAX;
+
stop_scan_thread();
- if (secs) {
- jiffies_scan_wait = msecs_to_jiffies(secs * 1000);
+ if (msecs) {
+ WRITE_ONCE(jiffies_scan_wait, msecs_to_jiffies(msecs));
start_scan_thread();
}
} else if (strncmp(buf, "scan", 4) == 0)
@@ -1914,8 +2069,10 @@ static int __init kmemleak_boot_config(char *str)
return -EINVAL;
if (strcmp(str, "off") == 0)
kmemleak_disable();
- else if (strcmp(str, "on") == 0)
+ else if (strcmp(str, "on") == 0) {
kmemleak_skip_disable = 1;
+ stack_depot_request_early_init();
+ }
else
return -EINVAL;
return 0;
diff --git a/mm/kmsan/Makefile b/mm/kmsan/Makefile
new file mode 100644
index 000000000000..91cfdde642d1
--- /dev/null
+++ b/mm/kmsan/Makefile
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for KernelMemorySanitizer (KMSAN).
+#
+#
+obj-y := core.o instrumentation.o init.o hooks.o report.o shadow.o
+
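+# Never instrument the KMSAN runtime itself: the hooks that KMSAN, KCOV and
+# UBSAN would insert here could recurse back into this code.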
+KMSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+UBSAN_SANITIZE := n
+
+# Disable instrumentation of KMSAN runtime with other tools.
+CC_FLAGS_KMSAN_RUNTIME := -fno-stack-protector
+CC_FLAGS_KMSAN_RUNTIME += $(call cc-option,-fno-conserve-stack)
+CC_FLAGS_KMSAN_RUNTIME += -DDISABLE_BRANCH_PROFILING
+
+# Disable ftrace to avoid recursion.
+CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_hooks.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_init.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_instrumentation.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_shadow.o = $(CC_FLAGS_FTRACE)
+
+CFLAGS_core.o := $(CC_FLAGS_KMSAN_RUNTIME)
+CFLAGS_hooks.o := $(CC_FLAGS_KMSAN_RUNTIME)
+CFLAGS_init.o := $(CC_FLAGS_KMSAN_RUNTIME)
+CFLAGS_instrumentation.o := $(CC_FLAGS_KMSAN_RUNTIME)
+CFLAGS_report.o := $(CC_FLAGS_KMSAN_RUNTIME)
+CFLAGS_shadow.o := $(CC_FLAGS_KMSAN_RUNTIME)
+
+obj-$(CONFIG_KMSAN_KUNIT_TEST) += kmsan_test.o
+KMSAN_SANITIZE_kmsan_test.o := y
+CFLAGS_kmsan_test.o += $(call cc-disable-warning, uninitialized)
diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c
new file mode 100644
index 000000000000..3adb4c1d3b19
--- /dev/null
+++ b/mm/kmsan/core.c
@@ -0,0 +1,454 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KMSAN runtime library.
+ *
+ * Copyright (C) 2017-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include <asm/page.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
+#include <linux/highmem.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/kmsan_types.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/mmzone.h>
+#include <linux/percpu-defs.h>
+#include <linux/preempt.h>
+#include <linux/slab.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+
+#include "../slab.h"
+#include "kmsan.h"
+
+bool kmsan_enabled __read_mostly;
+
+/*
+ * Per-CPU KMSAN context to be used in interrupts, where current->kmsan is
+ * unavailable.
+ */
+DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx);
+
+void kmsan_internal_task_create(struct task_struct *task)
+{
+ struct kmsan_ctx *ctx = &task->kmsan_ctx;
+ struct thread_info *info = current_thread_info();
+
+ __memset(ctx, 0, sizeof(*ctx));
+ ctx->allow_reporting = true;
+ kmsan_internal_unpoison_memory(info, sizeof(*info), false);
+}
+
+void kmsan_internal_poison_memory(void *address, size_t size, gfp_t flags,
+ unsigned int poison_flags)
+{
+ u32 extra_bits =
+ kmsan_extra_bits(/*depth*/ 0, poison_flags & KMSAN_POISON_FREE);
+ bool checked = poison_flags & KMSAN_POISON_CHECK;
+ depot_stack_handle_t handle;
+
+ handle = kmsan_save_stack_with_flags(flags, extra_bits);
+ kmsan_internal_set_shadow_origin(address, size, -1, handle, checked);
+}
+
+void kmsan_internal_unpoison_memory(void *address, size_t size, bool checked)
+{
+ kmsan_internal_set_shadow_origin(address, size, 0, 0, checked);
+}
+
+depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags,
+ unsigned int extra)
+{
+ unsigned long entries[KMSAN_STACK_DEPTH];
+ unsigned int nr_entries;
+ depot_stack_handle_t handle;
+
+ nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0);
+
+ /* Don't sleep. */
+ flags &= ~(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM);
+
+ handle = __stack_depot_save(entries, nr_entries, flags, true);
+ return stack_depot_set_extra_bits(handle, extra);
+}
+
+/* Copy the metadata following the memmove() behavior. */
+void kmsan_internal_memmove_metadata(void *dst, void *src, size_t n)
+{
+ depot_stack_handle_t old_origin = 0, new_origin = 0;
+ int src_slots, dst_slots, i, iter, step, skip_bits;
+ depot_stack_handle_t *origin_src, *origin_dst;
+ void *shadow_src, *shadow_dst;
+ u32 *align_shadow_src, shadow;
+ bool backwards;
+
+ shadow_dst = kmsan_get_metadata(dst, KMSAN_META_SHADOW);
+ if (!shadow_dst)
+ return;
+ KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(dst, n));
+
+ shadow_src = kmsan_get_metadata(src, KMSAN_META_SHADOW);
+ if (!shadow_src) {
+ /*
+ * @src is untracked: zero out destination shadow, ignore the
+ * origins, we're done.
+ */
+ __memset(shadow_dst, 0, n);
+ return;
+ }
+ KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(src, n));
+
+ __memmove(shadow_dst, shadow_src, n);
+
+ origin_dst = kmsan_get_metadata(dst, KMSAN_META_ORIGIN);
+ origin_src = kmsan_get_metadata(src, KMSAN_META_ORIGIN);
+ KMSAN_WARN_ON(!origin_dst || !origin_src);
+ src_slots = (ALIGN((u64)src + n, KMSAN_ORIGIN_SIZE) -
+ ALIGN_DOWN((u64)src, KMSAN_ORIGIN_SIZE)) /
+ KMSAN_ORIGIN_SIZE;
+ dst_slots = (ALIGN((u64)dst + n, KMSAN_ORIGIN_SIZE) -
+ ALIGN_DOWN((u64)dst, KMSAN_ORIGIN_SIZE)) /
+ KMSAN_ORIGIN_SIZE;
+ KMSAN_WARN_ON((src_slots < 1) || (dst_slots < 1));
+ KMSAN_WARN_ON((src_slots - dst_slots > 1) ||
+ (dst_slots - src_slots < -1));
+
+ backwards = dst > src;
+ i = backwards ? min(src_slots, dst_slots) - 1 : 0;
+ iter = backwards ? -1 : 1;
+
+ align_shadow_src =
+ (u32 *)ALIGN_DOWN((u64)shadow_src, KMSAN_ORIGIN_SIZE);
+ for (step = 0; step < min(src_slots, dst_slots); step++, i += iter) {
+ KMSAN_WARN_ON(i < 0);
+ shadow = align_shadow_src[i];
+ if (i == 0) {
+ /*
+ * If @src isn't aligned on KMSAN_ORIGIN_SIZE, don't
+ * look at the first @src % KMSAN_ORIGIN_SIZE bytes
+ * of the first shadow slot.
+ */
+ skip_bits = ((u64)src % KMSAN_ORIGIN_SIZE) * 8;
+ shadow = (shadow >> skip_bits) << skip_bits;
+ }
+ if (i == src_slots - 1) {
+ /*
+ * If @src + n isn't aligned on
+ * KMSAN_ORIGIN_SIZE, don't look at the last
+ * (@src + n) % KMSAN_ORIGIN_SIZE bytes of the
+ * last shadow slot.
+ */
+ skip_bits = (((u64)src + n) % KMSAN_ORIGIN_SIZE) * 8;
+ shadow = (shadow << skip_bits) >> skip_bits;
+ }
+ /*
+ * Overwrite the origin only if the corresponding
+ * shadow is nonempty.
+ */
+ if (origin_src[i] && (origin_src[i] != old_origin) && shadow) {
+ old_origin = origin_src[i];
+ new_origin = kmsan_internal_chain_origin(old_origin);
+ /*
+ * kmsan_internal_chain_origin() may return
+ * NULL, but we don't want to lose the previous
+ * origin value.
+ */
+ if (!new_origin)
+ new_origin = old_origin;
+ }
+ if (shadow)
+ origin_dst[i] = new_origin;
+ else
+ origin_dst[i] = 0;
+ }
+ /*
+ * If dst_slots is greater than src_slots (i.e.
+ * dst_slots == src_slots + 1), there is an extra origin slot at the
+ * beginning or end of the destination buffer, for which we take the
+ * origin from the previous slot.
+ * This is only done if the part of the source shadow corresponding to
+ * that slot is non-zero.
+ *
+ * E.g. if we copy 8 aligned bytes that are marked as uninitialized
+ * and have origins o111 and o222, to an unaligned buffer with offset 1,
+ * these two origins are copied to three origin slots, so one of them
+ * needs to be duplicated, depending on the copy direction (@backwards)
+ *
+ * src shadow: |uuuu|uuuu|....|
+ * src origin: |o111|o222|....|
+ *
+ * backwards = 0:
+ * dst shadow: |.uuu|uuuu|u...|
+ * dst origin: |....|o111|o222| - fill the empty slot with o111
+ * backwards = 1:
+ * dst shadow: |.uuu|uuuu|u...|
+ * dst origin: |o111|o222|....| - fill the empty slot with o222
+ */
+ if (src_slots < dst_slots) {
+ if (backwards) {
+ shadow = align_shadow_src[src_slots - 1];
+ skip_bits = (((u64)dst + n) % KMSAN_ORIGIN_SIZE) * 8;
+ shadow = (shadow << skip_bits) >> skip_bits;
+ if (shadow)
+ /* src_slots > 0, therefore dst_slots is at least 2 */
+ origin_dst[dst_slots - 1] =
+ origin_dst[dst_slots - 2];
+ } else {
+ shadow = align_shadow_src[0];
+ skip_bits = ((u64)dst % KMSAN_ORIGIN_SIZE) * 8;
+ shadow = (shadow >> skip_bits) << skip_bits;
+ if (shadow)
+ origin_dst[0] = origin_dst[1];
+ }
+ }
+}
+
+depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id)
+{
+ unsigned long entries[3];
+ u32 extra_bits;
+ int depth;
+ bool uaf;
+ depot_stack_handle_t handle;
+
+ if (!id)
+ return id;
+ /*
+ * Make sure we have enough spare bits in @id to hold the UAF bit and
+ * the chain depth.
+ */
+ BUILD_BUG_ON(
+ (1 << STACK_DEPOT_EXTRA_BITS) <= (KMSAN_MAX_ORIGIN_DEPTH << 1));
+
+ extra_bits = stack_depot_get_extra_bits(id);
+ depth = kmsan_depth_from_eb(extra_bits);
+ uaf = kmsan_uaf_from_eb(extra_bits);
+
+ /*
+ * Stop chaining origins once the depth reached KMSAN_MAX_ORIGIN_DEPTH.
+ * This mostly happens when structures with uninitialized padding are
+ * copied around many times. Origin chains for such structures are
+ * usually periodic, and it does not make sense to fully store them.
+ */
+ if (depth == KMSAN_MAX_ORIGIN_DEPTH)
+ return id;
+
+ depth++;
+ extra_bits = kmsan_extra_bits(depth, uaf);
+
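+ /*
+ * A chained origin is depoted as a three-word record: a magic marker,
+ * the stack of the current copy, and the id of the previous origin.
+ */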
+ entries[0] = KMSAN_CHAIN_MAGIC_ORIGIN;
+ entries[1] = kmsan_save_stack_with_flags(__GFP_HIGH, 0);
+ entries[2] = id;
+ /*
+ * @entries is a local var in non-instrumented code, so KMSAN does not
+ * know it is initialized. Explicitly unpoison it to avoid false
+ * positives when __stack_depot_save() passes it to instrumented code.
+ */
+ kmsan_internal_unpoison_memory(entries, sizeof(entries), false);
+ handle = __stack_depot_save(entries, ARRAY_SIZE(entries), __GFP_HIGH,
+ true);
+ return stack_depot_set_extra_bits(handle, extra_bits);
+}
+
+void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b,
+ u32 origin, bool checked)
+{
+ u64 address = (u64)addr;
+ void *shadow_start;
+ u32 *origin_start;
+ size_t pad = 0;
+
+ KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size));
+ shadow_start = kmsan_get_metadata(addr, KMSAN_META_SHADOW);
+ if (!shadow_start) {
+ /*
+ * kmsan_metadata_is_contiguous() is true, so either all shadow
+ * and origin pages are NULL, or all are non-NULL.
+ */
+ if (checked) {
+ pr_err("%s: not memsetting %ld bytes starting at %px, because the shadow is NULL\n",
+ __func__, size, addr);
+ KMSAN_WARN_ON(true);
+ }
+ return;
+ }
+ __memset(shadow_start, b, size);
+
+ if (!IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) {
+ pad = address % KMSAN_ORIGIN_SIZE;
+ address -= pad;
+ size += pad;
+ }
+ size = ALIGN(size, KMSAN_ORIGIN_SIZE);
+ origin_start =
+ (u32 *)kmsan_get_metadata((void *)address, KMSAN_META_ORIGIN);
+
+ for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++)
+ origin_start[i] = origin;
+}
+
+struct page *kmsan_vmalloc_to_page_or_null(void *vaddr)
+{
+ struct page *page;
+
+ if (!kmsan_internal_is_vmalloc_addr(vaddr) &&
+ !kmsan_internal_is_module_addr(vaddr))
+ return NULL;
+ page = vmalloc_to_page(vaddr);
+ if (pfn_valid(page_to_pfn(page)))
+ return page;
+ else
+ return NULL;
+}
+
+void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr,
+ int reason)
+{
+ depot_stack_handle_t cur_origin = 0, new_origin = 0;
+ unsigned long addr64 = (unsigned long)addr;
+ depot_stack_handle_t *origin = NULL;
+ unsigned char *shadow = NULL;
+ int cur_off_start = -1;
+ int chunk_size;
+ size_t pos = 0;
+
+ if (!size)
+ return;
+ KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size));
+ while (pos < size) {
+ chunk_size = min(size - pos,
+ PAGE_SIZE - ((addr64 + pos) % PAGE_SIZE));
+ shadow = kmsan_get_metadata((void *)(addr64 + pos),
+ KMSAN_META_SHADOW);
+ if (!shadow) {
+ /*
+ * This page is untracked. If there were uninitialized
+ * bytes before, report them.
+ */
+ if (cur_origin) {
+ kmsan_enter_runtime();
+ kmsan_report(cur_origin, addr, size,
+ cur_off_start, pos - 1, user_addr,
+ reason);
+ kmsan_leave_runtime();
+ }
+ cur_origin = 0;
+ cur_off_start = -1;
+ pos += chunk_size;
+ continue;
+ }
+ for (int i = 0; i < chunk_size; i++) {
+ if (!shadow[i]) {
+ /*
+ * This byte is unpoisoned. If there were
+ * poisoned bytes before, report them.
+ */
+ if (cur_origin) {
+ kmsan_enter_runtime();
+ kmsan_report(cur_origin, addr, size,
+ cur_off_start, pos + i - 1,
+ user_addr, reason);
+ kmsan_leave_runtime();
+ }
+ cur_origin = 0;
+ cur_off_start = -1;
+ continue;
+ }
+ origin = kmsan_get_metadata((void *)(addr64 + pos + i),
+ KMSAN_META_ORIGIN);
+ KMSAN_WARN_ON(!origin);
+ new_origin = *origin;
+ /*
+ * Encountered new origin - report the previous
+ * uninitialized range.
+ */
+ if (cur_origin != new_origin) {
+ if (cur_origin) {
+ kmsan_enter_runtime();
+ kmsan_report(cur_origin, addr, size,
+ cur_off_start, pos + i - 1,
+ user_addr, reason);
+ kmsan_leave_runtime();
+ }
+ cur_origin = new_origin;
+ cur_off_start = pos + i;
+ }
+ }
+ pos += chunk_size;
+ }
+ KMSAN_WARN_ON(pos != size);
+ if (cur_origin) {
+ kmsan_enter_runtime();
+ kmsan_report(cur_origin, addr, size, cur_off_start, pos - 1,
+ user_addr, reason);
+ kmsan_leave_runtime();
+ }
+}
+
+bool kmsan_metadata_is_contiguous(void *addr, size_t size)
+{
+ char *cur_shadow = NULL, *next_shadow = NULL, *cur_origin = NULL,
+ *next_origin = NULL;
+ u64 cur_addr = (u64)addr, next_addr = cur_addr + PAGE_SIZE;
+ depot_stack_handle_t *origin_p;
+ bool all_untracked = false;
+
+ if (!size)
+ return true;
+
+ /* The whole range belongs to the same page. */
+ if (ALIGN_DOWN(cur_addr + size - 1, PAGE_SIZE) ==
+ ALIGN_DOWN(cur_addr, PAGE_SIZE))
+ return true;
+
+ cur_shadow = kmsan_get_metadata((void *)cur_addr, /*is_origin*/ false);
+ if (!cur_shadow)
+ all_untracked = true;
+ cur_origin = kmsan_get_metadata((void *)cur_addr, /*is_origin*/ true);
+ if (all_untracked && cur_origin)
+ goto report;
+
+ for (; next_addr < (u64)addr + size;
+ cur_addr = next_addr, cur_shadow = next_shadow,
+ cur_origin = next_origin, next_addr += PAGE_SIZE) {
+ next_shadow = kmsan_get_metadata((void *)next_addr, false);
+ next_origin = kmsan_get_metadata((void *)next_addr, true);
+ if (all_untracked) {
+ if (next_shadow || next_origin)
+ goto report;
+ if (!next_shadow && !next_origin)
+ continue;
+ }
+ if (((u64)cur_shadow == ((u64)next_shadow - PAGE_SIZE)) &&
+ ((u64)cur_origin == ((u64)next_origin - PAGE_SIZE)))
+ continue;
+ goto report;
+ }
+ return true;
+
+report:
+ pr_err("%s: attempting to access two shadow page ranges.\n", __func__);
+ pr_err("Access of size %ld at %px.\n", size, addr);
+ pr_err("Addresses belonging to different ranges: %px and %px\n",
+ (void *)cur_addr, (void *)next_addr);
+ pr_err("page[0].shadow: %px, page[1].shadow: %px\n", cur_shadow,
+ next_shadow);
+ pr_err("page[0].origin: %px, page[1].origin: %px\n", cur_origin,
+ next_origin);
+ origin_p = kmsan_get_metadata(addr, KMSAN_META_ORIGIN);
+ if (origin_p) {
+ pr_err("Origin: %08x\n", *origin_p);
+ kmsan_print_origin(*origin_p);
+ } else {
+ pr_err("Origin: unavailable\n");
+ }
+ return false;
+}
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
new file mode 100644
index 000000000000..ec0da72e65aa
--- /dev/null
+++ b/mm/kmsan/hooks.c
@@ -0,0 +1,424 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KMSAN hooks for kernel subsystems.
+ *
+ * These functions handle creation of KMSAN metadata for memory allocations.
+ *
+ * Copyright (C) 2018-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include <linux/cacheflush.h>
+#include <linux/dma-direction.h>
+#include <linux/gfp.h>
+#include <linux/kmsan.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/usb.h>
+
+#include "../internal.h"
+#include "../slab.h"
+#include "kmsan.h"
+
+/*
+ * Instrumented functions shouldn't be called under
+ * kmsan_enter_runtime()/kmsan_leave_runtime(), because this will lead to
+ * skipping effects of functions like memset() inside instrumented code.
+ */
+
+void kmsan_task_create(struct task_struct *task)
+{
+ kmsan_enter_runtime();
+ kmsan_internal_task_create(task);
+ kmsan_leave_runtime();
+}
+
+void kmsan_task_exit(struct task_struct *task)
+{
+ struct kmsan_ctx *ctx = &task->kmsan_ctx;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ ctx->allow_reporting = false;
+}
+
+void kmsan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags)
+{
+ if (unlikely(object == NULL))
+ return;
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ /*
+ * There's a ctor or this is an RCU cache - do nothing. The memory
+ * status hasn't changed since last use.
+ */
+ if (s->ctor || (s->flags & SLAB_TYPESAFE_BY_RCU))
+ return;
+
+ kmsan_enter_runtime();
+ if (flags & __GFP_ZERO)
+ kmsan_internal_unpoison_memory(object, s->object_size,
+ KMSAN_POISON_CHECK);
+ else
+ kmsan_internal_poison_memory(object, s->object_size, flags,
+ KMSAN_POISON_CHECK);
+ kmsan_leave_runtime();
+}
+
+void kmsan_slab_free(struct kmem_cache *s, void *object)
+{
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ /* RCU slabs could be legally used after free within the RCU period */
+ if (unlikely(s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)))
+ return;
+ /*
+ * If there's a constructor, freed memory must remain in the same state
+ * until the next allocation. We cannot save its state to detect
+ * use-after-free bugs; instead we just keep it unpoisoned.
+ */
+ if (s->ctor)
+ return;
+ kmsan_enter_runtime();
+ kmsan_internal_poison_memory(object, s->object_size, GFP_KERNEL,
+ KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
+ kmsan_leave_runtime();
+}
+
+void kmsan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
+{
+ if (unlikely(ptr == NULL))
+ return;
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ kmsan_enter_runtime();
+ if (flags & __GFP_ZERO)
+ kmsan_internal_unpoison_memory((void *)ptr, size,
+ /*checked*/ true);
+ else
+ kmsan_internal_poison_memory((void *)ptr, size, flags,
+ KMSAN_POISON_CHECK);
+ kmsan_leave_runtime();
+}
+
+void kmsan_kfree_large(const void *ptr)
+{
+ struct page *page;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ kmsan_enter_runtime();
+ page = virt_to_head_page((void *)ptr);
+ KMSAN_WARN_ON(ptr != page_address(page));
+ kmsan_internal_poison_memory((void *)ptr,
+ PAGE_SIZE << compound_order(page),
+ GFP_KERNEL,
+ KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
+ kmsan_leave_runtime();
+}
+
+static unsigned long vmalloc_shadow(unsigned long addr)
+{
+ return (unsigned long)kmsan_get_metadata((void *)addr,
+ KMSAN_META_SHADOW);
+}
+
+static unsigned long vmalloc_origin(unsigned long addr)
+{
+ return (unsigned long)kmsan_get_metadata((void *)addr,
+ KMSAN_META_ORIGIN);
+}
+
+void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end)
+{
+ __vunmap_range_noflush(vmalloc_shadow(start), vmalloc_shadow(end));
+ __vunmap_range_noflush(vmalloc_origin(start), vmalloc_origin(end));
+ flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end));
+ flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end));
+}
+
+/*
+ * This function creates new shadow/origin pages for the physical pages mapped
+ * into the virtual memory. If those physical pages already had shadow/origin,
+ * those are ignored.
+ */
+int kmsan_ioremap_page_range(unsigned long start, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int page_shift)
+{
+ gfp_t gfp_mask = GFP_KERNEL | __GFP_ZERO;
+ struct page *shadow, *origin;
+ unsigned long off = 0;
+ int nr, err = 0, clean = 0, mapped;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return 0;
+
+ nr = (end - start) / PAGE_SIZE;
+ kmsan_enter_runtime();
+ for (int i = 0; i < nr; i++, off += PAGE_SIZE, clean = i) {
+ shadow = alloc_pages(gfp_mask, 1);
+ origin = alloc_pages(gfp_mask, 1);
+ if (!shadow || !origin) {
+ err = -ENOMEM;
+ goto ret;
+ }
+ mapped = __vmap_pages_range_noflush(
+ vmalloc_shadow(start + off),
+ vmalloc_shadow(start + off + PAGE_SIZE), prot, &shadow,
+ PAGE_SHIFT);
+ if (mapped) {
+ err = mapped;
+ goto ret;
+ }
+ shadow = NULL;
+ mapped = __vmap_pages_range_noflush(
+ vmalloc_origin(start + off),
+ vmalloc_origin(start + off + PAGE_SIZE), prot, &origin,
+ PAGE_SHIFT);
+ if (mapped) {
+ __vunmap_range_noflush(
+ vmalloc_shadow(start + off),
+ vmalloc_shadow(start + off + PAGE_SIZE));
+ err = mapped;
+ goto ret;
+ }
+ origin = NULL;
+ }
+ /* Page mapping loop finished normally, nothing to clean up. */
+ clean = 0;
+
+ret:
+ if (clean > 0) {
+ /*
+ * Something went wrong. Clean up shadow/origin pages allocated
+ * on the last loop iteration, then delete mappings created
+ * during the previous iterations.
+ */
+ if (shadow)
+ __free_pages(shadow, 1);
+ if (origin)
+ __free_pages(origin, 1);
+ __vunmap_range_noflush(
+ vmalloc_shadow(start),
+ vmalloc_shadow(start + clean * PAGE_SIZE));
+ __vunmap_range_noflush(
+ vmalloc_origin(start),
+ vmalloc_origin(start + clean * PAGE_SIZE));
+ }
+ flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end));
+ flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end));
+ kmsan_leave_runtime();
+ return err;
+}
+
+void kmsan_iounmap_page_range(unsigned long start, unsigned long end)
+{
+ unsigned long v_shadow, v_origin;
+ struct page *shadow, *origin;
+ int nr;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ nr = (end - start) / PAGE_SIZE;
+ kmsan_enter_runtime();
+ v_shadow = (unsigned long)vmalloc_shadow(start);
+ v_origin = (unsigned long)vmalloc_origin(start);
+ for (int i = 0; i < nr;
+ i++, v_shadow += PAGE_SIZE, v_origin += PAGE_SIZE) {
+ shadow = kmsan_vmalloc_to_page_or_null((void *)v_shadow);
+ origin = kmsan_vmalloc_to_page_or_null((void *)v_origin);
+ __vunmap_range_noflush(v_shadow, vmalloc_shadow(end));
+ __vunmap_range_noflush(v_origin, vmalloc_origin(end));
+ if (shadow)
+ __free_pages(shadow, 1);
+ if (origin)
+ __free_pages(origin, 1);
+ }
+ flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end));
+ flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end));
+ kmsan_leave_runtime();
+}
+
+void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy,
+ size_t left)
+{
+ unsigned long ua_flags;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ /*
+ * At this point we've copied the memory already. It's hard to check it
+ * before copying, as the size of the actually copied buffer is unknown.
+ */
+
+ /* copy_to_user() may copy zero bytes. No need to check. */
+ if (!to_copy)
+ return;
+ /* Or maybe copy_to_user() failed to copy anything. */
+ if (to_copy <= left)
+ return;
+
+ ua_flags = user_access_save();
+ if ((u64)to < TASK_SIZE) {
+ /* This is a user memory access, check it. */
+ kmsan_internal_check_memory((void *)from, to_copy - left, to,
+ REASON_COPY_TO_USER);
+ } else {
+ /* Otherwise this is a kernel memory access. This happens when a
+ * compat syscall passes an argument allocated on the kernel
+ * stack to a real syscall.
+ * Don't check anything, just copy the shadow of the copied
+ * bytes.
+ */
+ kmsan_internal_memmove_metadata((void *)to, (void *)from,
+ to_copy - left);
+ }
+ user_access_restore(ua_flags);
+}
+EXPORT_SYMBOL(kmsan_copy_to_user);
+
+/* Helper function to check an URB. */
+void kmsan_handle_urb(const struct urb *urb, bool is_out)
+{
+ if (!urb)
+ return;
+ if (is_out)
+ kmsan_internal_check_memory(urb->transfer_buffer,
+ urb->transfer_buffer_length,
+ /*user_addr*/ 0, REASON_SUBMIT_URB);
+ else
+ kmsan_internal_unpoison_memory(urb->transfer_buffer,
+ urb->transfer_buffer_length,
+ /*checked*/ false);
+}
+EXPORT_SYMBOL_GPL(kmsan_handle_urb);
+
+static void kmsan_handle_dma_page(const void *addr, size_t size,
+ enum dma_data_direction dir)
+{
+ switch (dir) {
+ case DMA_BIDIRECTIONAL:
+ kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0,
+ REASON_ANY);
+ kmsan_internal_unpoison_memory((void *)addr, size,
+ /*checked*/ false);
+ break;
+ case DMA_TO_DEVICE:
+ kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0,
+ REASON_ANY);
+ break;
+ case DMA_FROM_DEVICE:
+ kmsan_internal_unpoison_memory((void *)addr, size,
+ /*checked*/ false);
+ break;
+ case DMA_NONE:
+ break;
+ }
+}
+
+/* Helper function to handle DMA data transfers. */
+void kmsan_handle_dma(struct page *page, size_t offset, size_t size,
+ enum dma_data_direction dir)
+{
+ u64 page_offset, to_go, addr;
+
+ if (PageHighMem(page))
+ return;
+ addr = (u64)page_address(page) + offset;
+ /*
+ * The kernel may occasionally give us adjacent DMA pages not belonging
+ * to the same allocation. Process them separately to avoid triggering
+ * internal KMSAN checks.
+ */
+ while (size > 0) {
+ page_offset = addr % PAGE_SIZE;
+ to_go = min(PAGE_SIZE - page_offset, (u64)size);
+ kmsan_handle_dma_page((void *)addr, to_go, dir);
+ addr += to_go;
+ size -= to_go;
+ }
+}
+
+void kmsan_handle_dma_sg(struct scatterlist *sg, int nents,
+ enum dma_data_direction dir)
+{
+ struct scatterlist *item;
+ int i;
+
+ for_each_sg(sg, item, nents, i)
+ kmsan_handle_dma(sg_page(item), item->offset, item->length,
+ dir);
+}
+
+/* Functions from kmsan-checks.h follow. */
+void kmsan_poison_memory(const void *address, size_t size, gfp_t flags)
+{
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ kmsan_enter_runtime();
+ /* The users may want to poison/unpoison random memory. */
+ kmsan_internal_poison_memory((void *)address, size, flags,
+ KMSAN_POISON_NOCHECK);
+ kmsan_leave_runtime();
+}
+EXPORT_SYMBOL(kmsan_poison_memory);
+
+void kmsan_unpoison_memory(const void *address, size_t size)
+{
+ unsigned long ua_flags;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ ua_flags = user_access_save();
+ kmsan_enter_runtime();
+ /* The users may want to poison/unpoison random memory. */
+ kmsan_internal_unpoison_memory((void *)address, size,
+ KMSAN_POISON_NOCHECK);
+ kmsan_leave_runtime();
+ user_access_restore(ua_flags);
+}
+EXPORT_SYMBOL(kmsan_unpoison_memory);
+
+/*
+ * Version of kmsan_unpoison_memory() that can be called from within the KMSAN
+ * runtime.
+ *
+ * Non-instrumented IRQ entry functions receive struct pt_regs from assembly
+ * code. Those regs need to be unpoisoned, otherwise using them will result in
+ * false positives.
+ * Using kmsan_unpoison_memory() is not an option in entry code, because the
+ * return value of in_task() is inconsistent - as a result, certain calls to
+ * kmsan_unpoison_memory() are ignored. kmsan_unpoison_entry_regs() ensures that
+ * the registers are unpoisoned even if kmsan_in_runtime() is true in the early
+ * entry code.
+ */
+void kmsan_unpoison_entry_regs(const struct pt_regs *regs)
+{
+ unsigned long ua_flags;
+
+ if (!kmsan_enabled)
+ return;
+
+ ua_flags = user_access_save();
+ kmsan_internal_unpoison_memory((void *)regs, sizeof(*regs),
+ KMSAN_POISON_NOCHECK);
+ user_access_restore(ua_flags);
+}
+
+void kmsan_check_memory(const void *addr, size_t size)
+{
+ if (!kmsan_enabled)
+ return;
+ return kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0,
+ REASON_ANY);
+}
+EXPORT_SYMBOL(kmsan_check_memory);
diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c
new file mode 100644
index 000000000000..ffedf4dbc49d
--- /dev/null
+++ b/mm/kmsan/init.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KMSAN initialization routines.
+ *
+ * Copyright (C) 2017-2021 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include "kmsan.h"
+
+#include <asm/sections.h>
+#include <linux/mm.h>
+#include <linux/memblock.h>
+
+#include "../internal.h"
+
+#define NUM_FUTURE_RANGES 128
+struct start_end_pair {
+ u64 start, end;
+};
+
+static struct start_end_pair start_end_pairs[NUM_FUTURE_RANGES] __initdata;
+static int future_index __initdata;
+
+/*
+ * Record a range of memory for which the metadata pages will be created once
+ * the page allocator becomes available.
+ */
+static void __init kmsan_record_future_shadow_range(void *start, void *end)
+{
+ u64 nstart = (u64)start, nend = (u64)end, cstart, cend;
+ bool merged = false;
+
+ KMSAN_WARN_ON(future_index == NUM_FUTURE_RANGES);
+ KMSAN_WARN_ON((nstart >= nend) || !nstart || !nend);
+ nstart = ALIGN_DOWN(nstart, PAGE_SIZE);
+ nend = ALIGN(nend, PAGE_SIZE);
+
+ /*
+ * Scan the existing ranges to see if any of them overlaps with
+ * [start, end). In that case, merge the two ranges instead of
+ * creating a new one.
+ * The number of ranges is less than 20, so there is no need to organize
+ * them into a more intelligent data structure.
+ */
+ for (int i = 0; i < future_index; i++) {
+ cstart = start_end_pairs[i].start;
+ cend = start_end_pairs[i].end;
+ if ((cstart < nstart && cend < nstart) ||
+ (cstart > nend && cend > nend))
+ /* ranges are disjoint - do not merge */
+ continue;
+ start_end_pairs[i].start = min(nstart, cstart);
+ start_end_pairs[i].end = max(nend, cend);
+ merged = true;
+ break;
+ }
+ if (merged)
+ return;
+ start_end_pairs[future_index].start = nstart;
+ start_end_pairs[future_index].end = nend;
+ future_index++;
+}
+
+/*
+ * Initialize the shadow for existing mappings during kernel initialization.
+ * These include kernel text/data sections, NODE_DATA and future ranges
+ * registered while creating other data (e.g. percpu).
+ *
+ * Allocations via memblock can only be done before slab is initialized.
+ */
+void __init kmsan_init_shadow(void)
+{
+ const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
+ phys_addr_t p_start, p_end;
+ u64 loop;
+ int nid;
+
+ for_each_reserved_mem_range(loop, &p_start, &p_end)
+ kmsan_record_future_shadow_range(phys_to_virt(p_start),
+ phys_to_virt(p_end));
+ /* Allocate shadow for .data */
+ kmsan_record_future_shadow_range(_sdata, _edata);
+
+ for_each_online_node(nid)
+ kmsan_record_future_shadow_range(
+ NODE_DATA(nid), (char *)NODE_DATA(nid) + nd_size);
+
+ for (int i = 0; i < future_index; i++)
+ kmsan_init_alloc_meta_for_range(
+ (void *)start_end_pairs[i].start,
+ (void *)start_end_pairs[i].end);
+}
+
+struct metadata_page_pair {
+ struct page *shadow, *origin;
+};
+static struct metadata_page_pair held_back[MAX_ORDER + 1] __initdata;
+
+/*
+ * Eager metadata allocation. When the memblock allocator is freeing pages to
+ * pagealloc, we use 2/3 of them as metadata for the remaining 1/3.
+ * We store the pointers to the returned blocks of pages in held_back[] grouped
+ * by their order: when kmsan_memblock_free_pages() is called for the first
+ * time with a certain order, it is reserved as a shadow block, for the second
+ * time - as an origin block. On the third time the incoming block receives its
+ * shadow and origin ranges from the previously saved shadow and origin blocks,
+ * after which held_back[order] can be used again.
+ *
+ * At the very end there may be leftover blocks in held_back[]. They are
+ * collected later by kmsan_memblock_discard().
+ */
+bool kmsan_memblock_free_pages(struct page *page, unsigned int order)
+{
+ struct page *shadow, *origin;
+
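+ /* First block freed at this order: hold it back as the shadow block. */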
+ if (!held_back[order].shadow) {
+ held_back[order].shadow = page;
+ return false;
+ }
+ if (!held_back[order].origin) {
+ held_back[order].origin = page;
+ return false;
+ }
+ shadow = held_back[order].shadow;
+ origin = held_back[order].origin;
+ kmsan_setup_meta(page, shadow, origin, order);
+
+ held_back[order].shadow = NULL;
+ held_back[order].origin = NULL;
+ return true;
+}
+
+#define MAX_BLOCKS 8
+struct smallstack {
+ struct page *items[MAX_BLOCKS];
+ int index;
+ int order;
+};
+
+static struct smallstack collect = {
+ .index = 0,
+ .order = MAX_ORDER,
+};
+
+static void smallstack_push(struct smallstack *stack, struct page *pages)
+{
+ KMSAN_WARN_ON(stack->index == MAX_BLOCKS);
+ stack->items[stack->index] = pages;
+ stack->index++;
+}
+#undef MAX_BLOCKS
+
+static struct page *smallstack_pop(struct smallstack *stack)
+{
+ struct page *ret;
+
+ KMSAN_WARN_ON(stack->index == 0);
+ stack->index--;
+ ret = stack->items[stack->index];
+ stack->items[stack->index] = NULL;
+ return ret;
+}
+
+static void do_collection(void)
+{
+ struct page *page, *shadow, *origin;
+
+ while (collect.index >= 3) {
+ page = smallstack_pop(&collect);
+ shadow = smallstack_pop(&collect);
+ origin = smallstack_pop(&collect);
+ kmsan_setup_meta(page, shadow, origin, collect.order);
+ __free_pages_core(page, collect.order);
+ }
+}
+
+static void collect_split(void)
+{
+ struct smallstack tmp = {
+ .order = collect.order - 1,
+ .index = 0,
+ };
+ struct page *page;
+
+ if (!collect.order)
+ return;
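+ /* Split each remaining order-N block into its two order-(N-1) halves. */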
+ while (collect.index) {
+ page = smallstack_pop(&collect);
+ smallstack_push(&tmp, &page[0]);
+ smallstack_push(&tmp, &page[1 << tmp.order]);
+ }
+ __memcpy(&collect, &tmp, sizeof(tmp));
+}
+
+/*
+ * Memblock is about to go away. Split the page blocks left over in held_back[]
+ * and return 1/3 of that memory to the system.
+ */
+static void kmsan_memblock_discard(void)
+{
+ /*
+ * For each order=N:
+ * - push held_back[N].shadow and .origin to @collect;
+ * - while there are >= 3 elements in @collect, do garbage collection:
+ * - pop 3 ranges from @collect;
+ * - use two of them as shadow and origin for the third one;
+ * - repeat;
+ * - split each remaining element from @collect into 2 ranges of
+ * order=N-1,
+ * - repeat.
+ */
+ collect.order = MAX_ORDER;
+ for (int i = MAX_ORDER; i >= 0; i--) {
+ if (held_back[i].shadow)
+ smallstack_push(&collect, held_back[i].shadow);
+ if (held_back[i].origin)
+ smallstack_push(&collect, held_back[i].origin);
+ held_back[i].shadow = NULL;
+ held_back[i].origin = NULL;
+ do_collection();
+ collect_split();
+ }
+}
+
+void __init kmsan_init_runtime(void)
+{
+ /* Assuming current is init_task */
+ kmsan_internal_task_create(current);
+ kmsan_memblock_discard();
+ pr_info("Starting KernelMemorySanitizer\n");
+ pr_info("ATTENTION: KMSAN is a debugging tool! Do not use it on production machines!\n");
+ kmsan_enabled = true;
+}
diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c
new file mode 100644
index 000000000000..cc3907a9c33a
--- /dev/null
+++ b/mm/kmsan/instrumentation.c
@@ -0,0 +1,333 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KMSAN compiler API.
+ *
+ * This file implements __msan_XXX hooks that Clang inserts into the code
+ * compiled with -fsanitize=kernel-memory.
+ * See Documentation/dev-tools/kmsan.rst for more information on how KMSAN
+ * instrumentation works.
+ *
+ * Copyright (C) 2017-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include "kmsan.h"
+#include <linux/gfp.h>
+#include <linux/kmsan_string.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+
+static inline bool is_bad_asm_addr(void *addr, uintptr_t size, bool is_store)
+{
+ if ((u64)addr < TASK_SIZE)
+ return true;
+ if (!kmsan_get_metadata(addr, KMSAN_META_SHADOW))
+ return true;
+ return false;
+}
+
+static inline struct shadow_origin_ptr
+get_shadow_origin_ptr(void *addr, u64 size, bool store)
+{
+ unsigned long ua_flags = user_access_save();
+ struct shadow_origin_ptr ret;
+
+ ret = kmsan_get_shadow_origin_ptr(addr, size, store);
+ user_access_restore(ua_flags);
+ return ret;
+}
+
+/*
+ * KMSAN instrumentation functions follow. They are not declared elsewhere in
+ * the kernel code, so they are preceded by prototypes, to silence
+ * -Wmissing-prototypes warnings.
+ */
+
+/* Get shadow and origin pointers for a memory load with non-standard size. */
+struct shadow_origin_ptr __msan_metadata_ptr_for_load_n(void *addr,
+ uintptr_t size);
+struct shadow_origin_ptr __msan_metadata_ptr_for_load_n(void *addr,
+ uintptr_t size)
+{
+ return get_shadow_origin_ptr(addr, size, /*store*/ false);
+}
+EXPORT_SYMBOL(__msan_metadata_ptr_for_load_n);
+
+/* Get shadow and origin pointers for a memory store with non-standard size. */
+struct shadow_origin_ptr __msan_metadata_ptr_for_store_n(void *addr,
+ uintptr_t size);
+struct shadow_origin_ptr __msan_metadata_ptr_for_store_n(void *addr,
+ uintptr_t size)
+{
+ return get_shadow_origin_ptr(addr, size, /*store*/ true);
+}
+EXPORT_SYMBOL(__msan_metadata_ptr_for_store_n);
+
+/*
+ * Declare functions that obtain shadow/origin pointers for loads and stores
+ * with fixed size.
+ */
+#define DECLARE_METADATA_PTR_GETTER(size) \
+ struct shadow_origin_ptr __msan_metadata_ptr_for_load_##size( \
+ void *addr); \
+ struct shadow_origin_ptr __msan_metadata_ptr_for_load_##size( \
+ void *addr) \
+ { \
+ return get_shadow_origin_ptr(addr, size, /*store*/ false); \
+ } \
+ EXPORT_SYMBOL(__msan_metadata_ptr_for_load_##size); \
+ struct shadow_origin_ptr __msan_metadata_ptr_for_store_##size( \
+ void *addr); \
+ struct shadow_origin_ptr __msan_metadata_ptr_for_store_##size( \
+ void *addr) \
+ { \
+ return get_shadow_origin_ptr(addr, size, /*store*/ true); \
+ } \
+ EXPORT_SYMBOL(__msan_metadata_ptr_for_store_##size)
+
+DECLARE_METADATA_PTR_GETTER(1);
+DECLARE_METADATA_PTR_GETTER(2);
+DECLARE_METADATA_PTR_GETTER(4);
+DECLARE_METADATA_PTR_GETTER(8);
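+
+/*
+ * Minimal sketch, for illustration only (the exact code is up to Clang): a
+ * hypothetical 4-byte store "p->field = x;" is conceptually rewritten as
+ *
+ * struct shadow_origin_ptr sop = __msan_metadata_ptr_for_store_4(&p->field);
+ * *(u32 *)sop.shadow = <shadow of x>;
+ * *(u32 *)sop.origin = <origin of x, if x is poisoned>;
+ * p->field = x;
+ *
+ * so the getters above sit on every instrumented memory access and must stay
+ * cheap and non-recursive.
+ */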
+
+/*
+ * Handle a memory store performed by inline assembly. KMSAN conservatively
+ * attempts to unpoison the outputs of asm() directives to prevent false
+ * positives caused by missed stores.
+ *
+ * __msan_instrument_asm_store() may be called for inline assembly code when
+ * entering or leaving IRQ. We omit the check for kmsan_in_runtime() to ensure
+ * the memory written to in these cases is also marked as initialized.
+ */
+void __msan_instrument_asm_store(void *addr, uintptr_t size);
+void __msan_instrument_asm_store(void *addr, uintptr_t size)
+{
+ unsigned long ua_flags;
+
+ if (!kmsan_enabled)
+ return;
+
+ ua_flags = user_access_save();
+ /*
+ * Most of the accesses are below 32 bytes. The two exceptions so far
+ * are clwb() (64 bytes) and FPU state (512 bytes).
+ * It's unlikely that the assembly will touch more than 512 bytes.
+ */
+ if (size > 512) {
+ WARN_ONCE(1, "assembly store size too big: %ld\n", size);
+ size = 8;
+ }
+ if (is_bad_asm_addr(addr, size, /*is_store*/ true)) {
+ user_access_restore(ua_flags);
+ return;
+ }
+ /* Unpoisoning the memory on best effort. */
+ kmsan_internal_unpoison_memory(addr, size, /*checked*/ false);
+ user_access_restore(ua_flags);
+}
+EXPORT_SYMBOL(__msan_instrument_asm_store);
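+
+/*
+ * Minimal sketch, for illustration only: for an asm statement with a
+ * hypothetical memory output, e.g.
+ *
+ * asm volatile("" : "=m"(var));
+ *
+ * the compiler conceptually emits
+ *
+ * __msan_instrument_asm_store(&var, sizeof(var));
+ *
+ * so the bytes written by the asm are treated as initialized instead of
+ * producing false positives later.
+ */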
+
+/*
+ * KMSAN instrumentation pass replaces LLVM memcpy, memmove and memset
+ * intrinsics with calls to respective __msan_ functions. We use
+ * get_param0_metadata() and set_retval_metadata() to store the shadow/origin
+ * values for the destination argument of these functions and use them for the
+ * functions' return values.
+ */
+static inline void get_param0_metadata(u64 *shadow,
+ depot_stack_handle_t *origin)
+{
+ struct kmsan_ctx *ctx = kmsan_get_context();
+
+ *shadow = *(u64 *)(ctx->cstate.param_tls);
+ *origin = ctx->cstate.param_origin_tls[0];
+}
+
+static inline void set_retval_metadata(u64 shadow, depot_stack_handle_t origin)
+{
+ struct kmsan_ctx *ctx = kmsan_get_context();
+
+ *(u64 *)(ctx->cstate.retval_tls) = shadow;
+ ctx->cstate.retval_origin_tls = origin;
+}
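+
+/*
+ * Minimal sketch, for illustration only: around a hypothetical call
+ * "q = memcpy(dst, src, len);" the instrumented caller conceptually does
+ *
+ * <first slot of param_tls/param_origin_tls> = metadata of the dst pointer;
+ * q = __msan_memcpy(dst, src, len);
+ * <metadata of q> = retval_tls/retval_origin_tls;
+ *
+ * i.e. get_param0_metadata() and set_retval_metadata() merely forward the
+ * metadata of the destination pointer to the returned pointer.
+ */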
+
+/* Handle llvm.memmove intrinsic. */
+void *__msan_memmove(void *dst, const void *src, uintptr_t n);
+void *__msan_memmove(void *dst, const void *src, uintptr_t n)
+{
+ depot_stack_handle_t origin;
+ void *result;
+ u64 shadow;
+
+ get_param0_metadata(&shadow, &origin);
+ result = __memmove(dst, src, n);
+ if (!n)
+ /* Some people call memmove() with zero length. */
+ return result;
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return result;
+
+ kmsan_enter_runtime();
+ kmsan_internal_memmove_metadata(dst, (void *)src, n);
+ kmsan_leave_runtime();
+
+ set_retval_metadata(shadow, origin);
+ return result;
+}
+EXPORT_SYMBOL(__msan_memmove);
+
+/* Handle llvm.memcpy intrinsic. */
+void *__msan_memcpy(void *dst, const void *src, uintptr_t n);
+void *__msan_memcpy(void *dst, const void *src, uintptr_t n)
+{
+ depot_stack_handle_t origin;
+ void *result;
+ u64 shadow;
+
+ get_param0_metadata(&shadow, &origin);
+ result = __memcpy(dst, src, n);
+ if (!n)
+ /* Some people call memcpy() with zero length. */
+ return result;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return result;
+
+ kmsan_enter_runtime();
+ /* Using memmove instead of memcpy doesn't affect correctness. */
+ kmsan_internal_memmove_metadata(dst, (void *)src, n);
+ kmsan_leave_runtime();
+
+ set_retval_metadata(shadow, origin);
+ return result;
+}
+EXPORT_SYMBOL(__msan_memcpy);
+
+/* Handle llvm.memset intrinsic. */
+void *__msan_memset(void *dst, int c, uintptr_t n);
+void *__msan_memset(void *dst, int c, uintptr_t n)
+{
+ depot_stack_handle_t origin;
+ void *result;
+ u64 shadow;
+
+ get_param0_metadata(&shadow, &origin);
+ result = __memset(dst, c, n);
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return result;
+
+ kmsan_enter_runtime();
+ /*
+ * Clang doesn't pass parameter metadata here, so it is impossible to
+ * use the shadow of @c to set up the shadow for @dst.
+ */
+ kmsan_internal_unpoison_memory(dst, n, /*checked*/ false);
+ kmsan_leave_runtime();
+
+ set_retval_metadata(shadow, origin);
+ return result;
+}
+EXPORT_SYMBOL(__msan_memset);
+
+/*
+ * Create a new origin from an old one. This is done when storing an
+ * uninitialized value to memory. When reporting an error, KMSAN unrolls and
+ * prints the whole chain of stores that preceded the use of this value.
+ */
+depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin);
+depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin)
+{
+ depot_stack_handle_t ret = 0;
+ unsigned long ua_flags;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return ret;
+
+ ua_flags = user_access_save();
+
+ /* Creating new origins may allocate memory. */
+ kmsan_enter_runtime();
+ ret = kmsan_internal_chain_origin(origin);
+ kmsan_leave_runtime();
+ user_access_restore(ua_flags);
+ return ret;
+}
+EXPORT_SYMBOL(__msan_chain_origin);
+
+/* Poison a local variable when entering a function. */
+void __msan_poison_alloca(void *address, uintptr_t size, char *descr);
+void __msan_poison_alloca(void *address, uintptr_t size, char *descr)
+{
+ depot_stack_handle_t handle;
+ unsigned long entries[4];
+ unsigned long ua_flags;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ ua_flags = user_access_save();
+ entries[0] = KMSAN_ALLOCA_MAGIC_ORIGIN;
+ entries[1] = (u64)descr;
+ entries[2] = (u64)__builtin_return_address(0);
+ /*
+ * With frame pointers enabled, it is possible to quickly fetch the
+ * second frame of the caller stack without calling the unwinder.
+ * Without them, simply do not bother.
+ */
+ if (IS_ENABLED(CONFIG_UNWINDER_FRAME_POINTER))
+ entries[3] = (u64)__builtin_return_address(1);
+ else
+ entries[3] = 0;
+
+ /* stack_depot_save() may allocate memory. */
+ kmsan_enter_runtime();
+ handle = stack_depot_save(entries, ARRAY_SIZE(entries), __GFP_HIGH);
+ kmsan_leave_runtime();
+
+ kmsan_internal_set_shadow_origin(address, size, -1, handle,
+ /*checked*/ true);
+ user_access_restore(ua_flags);
+}
+EXPORT_SYMBOL(__msan_poison_alloca);
+
+/* Unpoison a local variable. */
+void __msan_unpoison_alloca(void *address, uintptr_t size);
+void __msan_unpoison_alloca(void *address, uintptr_t size)
+{
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ kmsan_enter_runtime();
+ kmsan_internal_unpoison_memory(address, size, /*checked*/ true);
+ kmsan_leave_runtime();
+}
+EXPORT_SYMBOL(__msan_unpoison_alloca);
+
+/*
+ * Report that an uninitialized value with the given origin was used in a way
+ * that constituted undefined behavior.
+ */
+void __msan_warning(u32 origin);
+void __msan_warning(u32 origin)
+{
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ kmsan_enter_runtime();
+ kmsan_report(origin, /*address*/ 0, /*size*/ 0,
+ /*off_first*/ 0, /*off_last*/ 0, /*user_addr*/ 0,
+ REASON_ANY);
+ kmsan_leave_runtime();
+}
+EXPORT_SYMBOL(__msan_warning);
+
+/*
+ * At the beginning of an instrumented function, obtain the pointer to
+ * `struct kmsan_context_state` holding the metadata for function parameters.
+ */
+struct kmsan_context_state *__msan_get_context_state(void);
+struct kmsan_context_state *__msan_get_context_state(void)
+{
+ return &kmsan_get_context()->cstate;
+}
+EXPORT_SYMBOL(__msan_get_context_state);
diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h
new file mode 100644
index 000000000000..a14744205435
--- /dev/null
+++ b/mm/kmsan/kmsan.h
@@ -0,0 +1,211 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Functions used by the KMSAN runtime.
+ *
+ * Copyright (C) 2017-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#ifndef __MM_KMSAN_KMSAN_H
+#define __MM_KMSAN_KMSAN_H
+
+#include <asm/pgtable_64_types.h>
+#include <linux/irqflags.h>
+#include <linux/sched.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/nmi.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+
+#define KMSAN_ALLOCA_MAGIC_ORIGIN 0xabcd0100
+#define KMSAN_CHAIN_MAGIC_ORIGIN 0xabcd0200
+
+#define KMSAN_POISON_NOCHECK 0x0
+#define KMSAN_POISON_CHECK 0x1
+#define KMSAN_POISON_FREE 0x2
+
+#define KMSAN_ORIGIN_SIZE 4
+#define KMSAN_MAX_ORIGIN_DEPTH 7
+
+#define KMSAN_STACK_DEPTH 64
+
+#define KMSAN_META_SHADOW (false)
+#define KMSAN_META_ORIGIN (true)
+
+extern bool kmsan_enabled;
+extern int panic_on_kmsan;
+
+/*
+ * KMSAN performs a lot of consistency checks that are currently enabled by
+ * default. BUG_ON is normally discouraged in the kernel, unless used for
+ * debugging, but KMSAN itself is a debugging tool, so it makes little sense to
+ * recover if something goes wrong.
+ */
+#define KMSAN_WARN_ON(cond) \
+ ({ \
+ const bool __cond = WARN_ON(cond); \
+ if (unlikely(__cond)) { \
+ WRITE_ONCE(kmsan_enabled, false); \
+ if (panic_on_kmsan) { \
+ /* Can't call panic() here because */ \
+ /* of uaccess checks. */ \
+ BUG(); \
+ } \
+ } \
+ __cond; \
+ })
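+
+/*
+ * E.g. kmsan_enter_runtime() below uses KMSAN_WARN_ON(ctx->kmsan_in_runtime++)
+ * to both detect unexpected recursion and permanently disable KMSAN once the
+ * internal state can no longer be trusted.
+ */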
+
+/*
+ * A pair of metadata pointers to be returned by the instrumentation functions.
+ */
+struct shadow_origin_ptr {
+ void *shadow, *origin;
+};
+
+struct shadow_origin_ptr kmsan_get_shadow_origin_ptr(void *addr, u64 size,
+ bool store);
+void *kmsan_get_metadata(void *addr, bool is_origin);
+void __init kmsan_init_alloc_meta_for_range(void *start, void *end);
+
+enum kmsan_bug_reason {
+ REASON_ANY,
+ REASON_COPY_TO_USER,
+ REASON_SUBMIT_URB,
+};
+
+void kmsan_print_origin(depot_stack_handle_t origin);
+
+/**
+ * kmsan_report() - Report a use of uninitialized value.
+ * @origin: Stack ID of the uninitialized value.
+ * @address: Address at which the memory access happens.
+ * @size: Memory access size.
+ * @off_first: Offset (from @address) of the first byte to be reported.
+ * @off_last: Offset (from @address) of the last byte to be reported.
+ * @user_addr: When non-NULL, denotes the userspace address to which the kernel
+ * is leaking data.
+ * @reason: Error type from enum kmsan_bug_reason.
+ *
+ * kmsan_report() prints an error message for a consecutive group of bytes
+ * sharing the same origin. If an uninitialized value is used in a comparison,
+ * this function is called once without specifying the addresses. When checking
+ * a memory range, KMSAN may call kmsan_report() multiple times with the same
+ * @address, @size, @user_addr and @reason, but different @off_first and
+ * @off_last corresponding to different @origin values.
+ */
+void kmsan_report(depot_stack_handle_t origin, void *address, int size,
+ int off_first, int off_last, const void *user_addr,
+ enum kmsan_bug_reason reason);
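+
+/*
+ * Example (hypothetical): if bytes 2-3 and 6-7 of an 8-byte copy_to_user()
+ * source are uninitialized and have different origins, KMSAN calls
+ *
+ * kmsan_report(origin1, addr, 8, 2, 3, user_addr, REASON_COPY_TO_USER);
+ * kmsan_report(origin2, addr, 8, 6, 7, user_addr, REASON_COPY_TO_USER);
+ *
+ * i.e. one report per contiguous group of bytes sharing an origin.
+ */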
+
+DECLARE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx);
+
+static __always_inline struct kmsan_ctx *kmsan_get_context(void)
+{
+ return in_task() ? &current->kmsan_ctx : raw_cpu_ptr(&kmsan_percpu_ctx);
+}
+
+/*
+ * When a compiler hook or KMSAN runtime function is invoked, it may make a
+ * call to instrumented code and eventually call itself recursively. To avoid
+ * that, we guard the runtime entry regions with
+ * kmsan_enter_runtime()/kmsan_leave_runtime() and exit the hook if
+ * kmsan_in_runtime() is true.
+ *
+ * Non-runtime code may occasionally get executed in nested IRQs from the
+ * runtime code (e.g. when called via smp_call_function_single()). Because some
+ * KMSAN routines may take locks (e.g. for memory allocation), we conservatively
+ * bail out instead of calling them. To minimize the effect of this (potentially
+ * missing initialization events), kmsan_in_runtime() is not checked in
+ * non-blocking runtime functions.
+ */
+static __always_inline bool kmsan_in_runtime(void)
+{
+ if ((hardirq_count() >> HARDIRQ_SHIFT) > 1)
+ return true;
+ if (in_nmi())
+ return true;
+ return kmsan_get_context()->kmsan_in_runtime;
+}
+
+static __always_inline void kmsan_enter_runtime(void)
+{
+ struct kmsan_ctx *ctx;
+
+ ctx = kmsan_get_context();
+ KMSAN_WARN_ON(ctx->kmsan_in_runtime++);
+}
+
+static __always_inline void kmsan_leave_runtime(void)
+{
+ struct kmsan_ctx *ctx = kmsan_get_context();
+
+ KMSAN_WARN_ON(--ctx->kmsan_in_runtime);
+}
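+
+/*
+ * The typical hook pattern built on the helpers above (see e.g.
+ * __msan_memcpy()) is:
+ *
+ * if (!kmsan_enabled || kmsan_in_runtime())
+ *         return;
+ * kmsan_enter_runtime();
+ * <call kmsan_internal_*() helpers that may allocate or take locks>
+ * kmsan_leave_runtime();
+ */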
+
+depot_stack_handle_t kmsan_save_stack(void);
+depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags,
+ unsigned int extra_bits);
+
+/*
+ * Pack and unpack the origin chain depth and UAF flag to/from the extra bits
+ * provided by the stack depot.
+ * The UAF flag is stored in the lowest bit, followed by the depth in the upper
+ * bits.
+ * set_dsh_extra_bits() is responsible for clamping the value.
+ */
+static __always_inline unsigned int kmsan_extra_bits(unsigned int depth,
+ bool uaf)
+{
+ return (depth << 1) | uaf;
+}
+
+static __always_inline bool kmsan_uaf_from_eb(unsigned int extra_bits)
+{
+ return extra_bits & 1;
+}
+
+static __always_inline unsigned int kmsan_depth_from_eb(unsigned int extra_bits)
+{
+ return extra_bits >> 1;
+}
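+
+/*
+ * Example: an origin chained three times for a use-after-free access has
+ * depth == 3 and uaf == true, so
+ *
+ * kmsan_extra_bits(3, true) == (3 << 1) | 1 == 7,
+ * kmsan_uaf_from_eb(7) == true,
+ * kmsan_depth_from_eb(7) == 3.
+ */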
+
+/*
+ * kmsan_internal_ functions are supposed to be very simple and not require the
+ * kmsan_in_runtime() checks.
+ */
+void kmsan_internal_memmove_metadata(void *dst, void *src, size_t n);
+void kmsan_internal_poison_memory(void *address, size_t size, gfp_t flags,
+ unsigned int poison_flags);
+void kmsan_internal_unpoison_memory(void *address, size_t size, bool checked);
+void kmsan_internal_set_shadow_origin(void *address, size_t size, int b,
+ u32 origin, bool checked);
+depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id);
+
+void kmsan_internal_task_create(struct task_struct *task);
+
+bool kmsan_metadata_is_contiguous(void *addr, size_t size);
+void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr,
+ int reason);
+
+struct page *kmsan_vmalloc_to_page_or_null(void *vaddr);
+void kmsan_setup_meta(struct page *page, struct page *shadow,
+ struct page *origin, int order);
+
+/*
+ * kmsan_internal_is_module_addr() and kmsan_internal_is_vmalloc_addr() are
+ * non-instrumented versions of is_module_address() and is_vmalloc_addr() that
+ * are safe to call from KMSAN runtime without recursion.
+ */
+static inline bool kmsan_internal_is_module_addr(void *vaddr)
+{
+ return ((u64)vaddr >= MODULES_VADDR) && ((u64)vaddr < MODULES_END);
+}
+
+static inline bool kmsan_internal_is_vmalloc_addr(void *addr)
+{
+ return ((u64)addr >= VMALLOC_START) && ((u64)addr < VMALLOC_END);
+}
+
+#endif /* __MM_KMSAN_KMSAN_H */
diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c
new file mode 100644
index 000000000000..312989aa2865
--- /dev/null
+++ b/mm/kmsan/kmsan_test.c
@@ -0,0 +1,652 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test cases for KMSAN.
+ * Each test case checks for the presence (or absence) of generated reports.
+ * Relies on 'console' tracepoint to capture reports as they appear in the
+ * kernel log.
+ *
+ * Copyright (C) 2021-2022, Google LLC.
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include <kunit/test.h>
+#include "kmsan.h"
+
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/kmsan.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/tracepoint.h>
+#include <linux/vmalloc.h>
+#include <trace/events/printk.h>
+
+static DEFINE_PER_CPU(int, per_cpu_var);
+
+/* Report as observed from console. */
+static struct {
+ spinlock_t lock;
+ bool available;
+ bool ignore; /* Stop console output collection. */
+ char header[256];
+} observed = {
+ .lock = __SPIN_LOCK_UNLOCKED(observed.lock),
+};
+
+/* Probe for console output: obtains observed lines of interest. */
+static void probe_console(void *ignore, const char *buf, size_t len)
+{
+ unsigned long flags;
+
+ if (observed.ignore)
+ return;
+ spin_lock_irqsave(&observed.lock, flags);
+
+ if (strnstr(buf, "BUG: KMSAN: ", len)) {
+ /*
+ * A KMSAN report, assumed to be related to the test.
+ *
+ * The provided @buf is not NUL-terminated; copy no more than
+ * @len bytes and let strscpy() add the missing NUL-terminator.
+ */
+ strscpy(observed.header, buf,
+ min(len + 1, sizeof(observed.header)));
+ WRITE_ONCE(observed.available, true);
+ observed.ignore = true;
+ }
+ spin_unlock_irqrestore(&observed.lock, flags);
+}
+
+/* Check if a report related to the test exists. */
+static bool report_available(void)
+{
+ return READ_ONCE(observed.available);
+}
+
+/* Information we expect in a report. */
+struct expect_report {
+ const char *error_type; /* Error type. */
+ /*
+ * Kernel symbol from the error header, or NULL if no report is
+ * expected.
+ */
+ const char *symbol;
+};
+
+/* Check observed report matches information in @r. */
+static bool report_matches(const struct expect_report *r)
+{
+ typeof(observed.header) expected_header;
+ unsigned long flags;
+ bool ret = false;
+ const char *end;
+ char *cur;
+
+ /* Double-checked locking. */
+ if (!report_available() || !r->symbol)
+ return (!report_available() && !r->symbol);
+
+ /* Generate expected report contents. */
+
+ /* Title */
+ cur = expected_header;
+ end = &expected_header[sizeof(expected_header) - 1];
+
+ cur += scnprintf(cur, end - cur, "BUG: KMSAN: %s", r->error_type);
+
+ scnprintf(cur, end - cur, " in %s", r->symbol);
+ /* The exact offset won't match, remove it; also strip module name. */
+ cur = strchr(expected_header, '+');
+ if (cur)
+ *cur = '\0';
+
+ spin_lock_irqsave(&observed.lock, flags);
+ if (!report_available())
+ goto out; /* A new report is being captured. */
+
+ /* Finally match expected output to what we actually observed. */
+ ret = strstr(observed.header, expected_header);
+out:
+ spin_unlock_irqrestore(&observed.lock, flags);
+
+ return ret;
+}
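+
+/*
+ * E.g. for a hypothetical expectation { "uninit-value", "test_foo" } the
+ * expected header is "BUG: KMSAN: uninit-value in test_foo"; an observed
+ * header carrying a "+0x.../0x..." offset suffix still matches, because only
+ * the expected prefix is compared.
+ */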
+
+/* ===== Test cases ===== */
+
+/* Prevent replacing branch with select in LLVM. */
+static noinline void check_true(char *arg)
+{
+ pr_info("%s is true\n", arg);
+}
+
+static noinline void check_false(char *arg)
+{
+ pr_info("%s is false\n", arg);
+}
+
+#define USE(x) \
+ do { \
+ if (x) \
+ check_true(#x); \
+ else \
+ check_false(#x); \
+ } while (0)
+
+#define EXPECTATION_ETYPE_FN(e, reason, fn) \
+ struct expect_report e = { \
+ .error_type = reason, \
+ .symbol = fn, \
+ }
+
+#define EXPECTATION_NO_REPORT(e) EXPECTATION_ETYPE_FN(e, NULL, NULL)
+#define EXPECTATION_UNINIT_VALUE_FN(e, fn) \
+ EXPECTATION_ETYPE_FN(e, "uninit-value", fn)
+#define EXPECTATION_UNINIT_VALUE(e) EXPECTATION_UNINIT_VALUE_FN(e, __func__)
+#define EXPECTATION_USE_AFTER_FREE(e) \
+ EXPECTATION_ETYPE_FN(e, "use-after-free", __func__)
+
+/* Test case: ensure that kmalloc() returns uninitialized memory. */
+static void test_uninit_kmalloc(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE(expect);
+ int *ptr;
+
+ kunit_info(test, "uninitialized kmalloc test (UMR report)\n");
+ ptr = kmalloc(sizeof(*ptr), GFP_KERNEL);
+ USE(*ptr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that kmalloc'ed memory becomes initialized after memset().
+ */
+static void test_init_kmalloc(struct kunit *test)
+{
+ EXPECTATION_NO_REPORT(expect);
+ int *ptr;
+
+ kunit_info(test, "initialized kmalloc test (no reports)\n");
+ ptr = kmalloc(sizeof(*ptr), GFP_KERNEL);
+ memset(ptr, 0, sizeof(*ptr));
+ USE(*ptr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test case: ensure that kzalloc() returns initialized memory. */
+static void test_init_kzalloc(struct kunit *test)
+{
+ EXPECTATION_NO_REPORT(expect);
+ int *ptr;
+
+ kunit_info(test, "initialized kzalloc test (no reports)\n");
+ ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
+ USE(*ptr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test case: ensure that local variables are uninitialized by default. */
+static void test_uninit_stack_var(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE(expect);
+ volatile int cond;
+
+ kunit_info(test, "uninitialized stack variable (UMR report)\n");
+ USE(cond);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test case: ensure that local variables with initializers are initialized. */
+static void test_init_stack_var(struct kunit *test)
+{
+ EXPECTATION_NO_REPORT(expect);
+ volatile int cond = 1;
+
+ kunit_info(test, "initialized stack variable (no reports)\n");
+ USE(cond);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static noinline void two_param_fn_2(int arg1, int arg2)
+{
+ USE(arg1);
+ USE(arg2);
+}
+
+static noinline void one_param_fn(int arg)
+{
+ two_param_fn_2(arg, arg);
+ USE(arg);
+}
+
+static noinline void two_param_fn(int arg1, int arg2)
+{
+ int init = 0;
+
+ one_param_fn(init);
+ USE(arg1);
+ USE(arg2);
+}
+
+static void test_params(struct kunit *test)
+{
+#ifdef CONFIG_KMSAN_CHECK_PARAM_RETVAL
+ /*
+ * With eager param/retval checking enabled, KMSAN will report an error
+ * before the call to two_param_fn().
+ */
+ EXPECTATION_UNINIT_VALUE_FN(expect, "test_params");
+#else
+ EXPECTATION_UNINIT_VALUE_FN(expect, "two_param_fn");
+#endif
+ volatile int uninit, init = 1;
+
+ kunit_info(test,
+ "uninit passed through a function parameter (UMR report)\n");
+ two_param_fn(uninit, init);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static int signed_sum3(int a, int b, int c)
+{
+ return a + b + c;
+}
+
+/*
+ * Test case: ensure that uninitialized values are tracked through function
+ * arguments.
+ */
+static void test_uninit_multiple_params(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE(expect);
+ volatile char b = 3, c;
+ volatile int a;
+
+ kunit_info(test, "uninitialized local passed to fn (UMR report)\n");
+ USE(signed_sum3(a, b, c));
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Helper function to make an array uninitialized. */
+static noinline void do_uninit_local_array(char *array, int start, int stop)
+{
+ volatile char uninit;
+
+ for (int i = start; i < stop; i++)
+ array[i] = uninit;
+}
+
+/*
+ * Test case: ensure kmsan_check_memory() reports an error when checking
+ * uninitialized memory.
+ */
+static void test_uninit_kmsan_check_memory(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE_FN(expect, "test_uninit_kmsan_check_memory");
+ volatile char local_array[8];
+
+ kunit_info(
+ test,
+ "kmsan_check_memory() called on uninit local (UMR report)\n");
+ do_uninit_local_array((char *)local_array, 5, 7);
+
+ kmsan_check_memory((char *)local_array, 8);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: check that a virtual memory range created with vmap() from
+ * initialized pages is still considered as initialized.
+ */
+static void test_init_kmsan_vmap_vunmap(struct kunit *test)
+{
+ EXPECTATION_NO_REPORT(expect);
+ const int npages = 2;
+ struct page **pages;
+ void *vbuf;
+
+ kunit_info(test, "pages initialized via vmap (no reports)\n");
+
+ pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
+ for (int i = 0; i < npages; i++)
+ pages[i] = alloc_page(GFP_KERNEL);
+ vbuf = vmap(pages, npages, VM_MAP, PAGE_KERNEL);
+ memset(vbuf, 0xfe, npages * PAGE_SIZE);
+ for (int i = 0; i < npages; i++)
+ kmsan_check_memory(page_address(pages[i]), PAGE_SIZE);
+
+ if (vbuf)
+ vunmap(vbuf);
+ for (int i = 0; i < npages; i++) {
+ if (pages[i])
+ __free_page(pages[i]);
+ }
+ kfree(pages);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that memset() can initialize a buffer allocated via
+ * vmalloc().
+ */
+static void test_init_vmalloc(struct kunit *test)
+{
+ EXPECTATION_NO_REPORT(expect);
+ int npages = 8;
+ char *buf;
+
+ kunit_info(test, "vmalloc buffer can be initialized (no reports)\n");
+ buf = vmalloc(PAGE_SIZE * npages);
+ buf[0] = 1;
+ memset(buf, 0xfe, PAGE_SIZE * npages);
+ USE(buf[0]);
+ for (int i = 0; i < npages; i++)
+ kmsan_check_memory(&buf[PAGE_SIZE * i], PAGE_SIZE);
+ vfree(buf);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test case: ensure that use-after-free reporting works. */
+static void test_uaf(struct kunit *test)
+{
+ EXPECTATION_USE_AFTER_FREE(expect);
+ volatile int value;
+ volatile int *var;
+
+ kunit_info(test, "use-after-free in kmalloc-ed buffer (UMR report)\n");
+ var = kmalloc(80, GFP_KERNEL);
+ var[3] = 0xfeedface;
+ kfree((int *)var);
+ /* Copy the invalid value before checking it. */
+ value = var[3];
+ USE(value);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that uninitialized values are propagated through per-CPU
+ * memory.
+ */
+static void test_percpu_propagate(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE(expect);
+ volatile int uninit, check;
+
+ kunit_info(test,
+ "uninit local stored to per_cpu memory (UMR report)\n");
+
+ this_cpu_write(per_cpu_var, uninit);
+ check = this_cpu_read(per_cpu_var);
+ USE(check);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that passing uninitialized values to printk() leads to an
+ * error report.
+ */
+static void test_printk(struct kunit *test)
+{
+#ifdef CONFIG_KMSAN_CHECK_PARAM_RETVAL
+ /*
+ * With eager param/retval checking enabled, KMSAN will report an error
+ * before the call to pr_info().
+ */
+ EXPECTATION_UNINIT_VALUE_FN(expect, "test_printk");
+#else
+ EXPECTATION_UNINIT_VALUE_FN(expect, "number");
+#endif
+ volatile int uninit;
+
+ kunit_info(test, "uninit local passed to pr_info() (UMR report)\n");
+ pr_info("%px contains %d\n", &uninit, uninit);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Prevent the compiler from optimizing @var away. Without this, Clang may
+ * notice that @var is uninitialized and drop memcpy() calls that use it.
+ *
+ * There is OPTIMIZER_HIDE_VAR() in linux/compiler.h that we cannot use here,
+ * because it is implemented as inline assembly receiving @var as a parameter
+ * and will enforce a KMSAN check. The same is true for e.g. barrier_data(var).
+ */
+#define DO_NOT_OPTIMIZE(var) barrier()
+
+/*
+ * Test case: ensure that memcpy() correctly copies initialized values.
+ * Also serves as a regression test to ensure DO_NOT_OPTIMIZE() does not cause
+ * extra checks.
+ */
+static void test_init_memcpy(struct kunit *test)
+{
+ EXPECTATION_NO_REPORT(expect);
+ volatile int src;
+ volatile int dst = 0;
+
+ DO_NOT_OPTIMIZE(src);
+ src = 1;
+ kunit_info(
+ test,
+ "memcpy()ing aligned initialized src to aligned dst (no reports)\n");
+ memcpy((void *)&dst, (void *)&src, sizeof(src));
+ kmsan_check_memory((void *)&dst, sizeof(dst));
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that memcpy() correctly copies uninitialized values between
+ * aligned `src` and `dst`.
+ */
+static void test_memcpy_aligned_to_aligned(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE_FN(expect, "test_memcpy_aligned_to_aligned");
+ volatile int uninit_src;
+ volatile int dst = 0;
+
+ kunit_info(
+ test,
+ "memcpy()ing aligned uninit src to aligned dst (UMR report)\n");
+ DO_NOT_OPTIMIZE(uninit_src);
+ memcpy((void *)&dst, (void *)&uninit_src, sizeof(uninit_src));
+ kmsan_check_memory((void *)&dst, sizeof(dst));
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that memcpy() correctly copies uninitialized values between
+ * aligned `src` and unaligned `dst`.
+ *
+ * Copying an aligned 4-byte value to an unaligned one touches two
+ * aligned 4-byte values. This test case checks that KMSAN correctly reports an
+ * error on the first of the two values.
+ */
+static void test_memcpy_aligned_to_unaligned(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE_FN(expect, "test_memcpy_aligned_to_unaligned");
+ volatile int uninit_src;
+ volatile char dst[8] = { 0 };
+
+ kunit_info(
+ test,
+ "memcpy()ing aligned uninit src to unaligned dst (UMR report)\n");
+ DO_NOT_OPTIMIZE(uninit_src);
+ memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src));
+ kmsan_check_memory((void *)dst, 4);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that memcpy() correctly copies uninitialized values between
+ * aligned `src` and unaligned `dst`.
+ *
+ * Copying an aligned 4-byte value to an unaligned one touches two
+ * aligned 4-byte values. This test case checks that KMSAN correctly reports an
+ * error on the second of the two values.
+ */
+static void test_memcpy_aligned_to_unaligned2(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE_FN(expect,
+ "test_memcpy_aligned_to_unaligned2");
+ volatile int uninit_src;
+ volatile char dst[8] = { 0 };
+
+ kunit_info(
+ test,
+ "memcpy()ing aligned uninit src to unaligned dst - part 2 (UMR report)\n");
+ DO_NOT_OPTIMIZE(uninit_src);
+ memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src));
+ kmsan_check_memory((void *)&dst[4], sizeof(uninit_src));
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Generate test cases for memset16(), memset32(), memset64(). */
+#define DEFINE_TEST_MEMSETXX(size) \
+ static void test_memset##size(struct kunit *test) \
+ { \
+ EXPECTATION_NO_REPORT(expect); \
+ volatile uint##size##_t uninit; \
+ \
+ kunit_info(test, \
+ "memset" #size "() should initialize memory\n"); \
+ DO_NOT_OPTIMIZE(uninit); \
+ memset##size((uint##size##_t *)&uninit, 0, 1); \
+ kmsan_check_memory((void *)&uninit, sizeof(uninit)); \
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect)); \
+ }
+
+DEFINE_TEST_MEMSETXX(16)
+DEFINE_TEST_MEMSETXX(32)
+DEFINE_TEST_MEMSETXX(64)
+
+static noinline void fibonacci(int *array, int size, int start)
+{
+ if (start < 2 || (start == size))
+ return;
+ array[start] = array[start - 1] + array[start - 2];
+ fibonacci(array, size, start + 1);
+}
+
+static void test_long_origin_chain(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE_FN(expect, "test_long_origin_chain");
+ /* (KMSAN_MAX_ORIGIN_DEPTH * 2) recursive calls to fibonacci(). */
+ volatile int accum[KMSAN_MAX_ORIGIN_DEPTH * 2 + 2];
+ int last = ARRAY_SIZE(accum) - 1;
+
+ kunit_info(
+ test,
+ "origin chain exceeding KMSAN_MAX_ORIGIN_DEPTH (UMR report)\n");
+ /*
+ * We do not set accum[1] to 0, so the uninitializedness will be carried
+ * over to accum[2..last].
+ */
+ accum[0] = 1;
+ fibonacci((int *)accum, ARRAY_SIZE(accum), 2);
+ kmsan_check_memory((void *)&accum[last], sizeof(int));
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that saving/restoring/printing stacks to/from stackdepot
+ * does not trigger errors.
+ *
+ * KMSAN uses stackdepot to store origin stack traces, which is why we do not
+ * instrument lib/stackdepot.c. Yet stackdepot must properly mark its outputs as
+ * initialized because other kernel features (e.g. netdev tracker) may also
+ * access stackdepot from instrumented code.
+ */
+static void test_stackdepot_roundtrip(struct kunit *test)
+{
+ unsigned long src_entries[16], *dst_entries;
+ unsigned int src_nentries, dst_nentries;
+ EXPECTATION_NO_REPORT(expect);
+ depot_stack_handle_t handle;
+
+ kunit_info(test, "testing stackdepot roundtrip (no reports)\n");
+
+ src_nentries =
+ stack_trace_save(src_entries, ARRAY_SIZE(src_entries), 1);
+ handle = stack_depot_save(src_entries, src_nentries, GFP_KERNEL);
+ stack_depot_print(handle);
+ dst_nentries = stack_depot_fetch(handle, &dst_entries);
+ KUNIT_EXPECT_TRUE(test, src_nentries == dst_nentries);
+
+ kmsan_check_memory((void *)dst_entries,
+ sizeof(*dst_entries) * dst_nentries);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static struct kunit_case kmsan_test_cases[] = {
+ KUNIT_CASE(test_uninit_kmalloc),
+ KUNIT_CASE(test_init_kmalloc),
+ KUNIT_CASE(test_init_kzalloc),
+ KUNIT_CASE(test_uninit_stack_var),
+ KUNIT_CASE(test_init_stack_var),
+ KUNIT_CASE(test_params),
+ KUNIT_CASE(test_uninit_multiple_params),
+ KUNIT_CASE(test_uninit_kmsan_check_memory),
+ KUNIT_CASE(test_init_kmsan_vmap_vunmap),
+ KUNIT_CASE(test_init_vmalloc),
+ KUNIT_CASE(test_uaf),
+ KUNIT_CASE(test_percpu_propagate),
+ KUNIT_CASE(test_printk),
+ KUNIT_CASE(test_init_memcpy),
+ KUNIT_CASE(test_memcpy_aligned_to_aligned),
+ KUNIT_CASE(test_memcpy_aligned_to_unaligned),
+ KUNIT_CASE(test_memcpy_aligned_to_unaligned2),
+ KUNIT_CASE(test_memset16),
+ KUNIT_CASE(test_memset32),
+ KUNIT_CASE(test_memset64),
+ KUNIT_CASE(test_long_origin_chain),
+ KUNIT_CASE(test_stackdepot_roundtrip),
+ {},
+};
+
+/* ===== End test cases ===== */
+
+static int test_init(struct kunit *test)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&observed.lock, flags);
+ observed.header[0] = '\0';
+ observed.ignore = false;
+ observed.available = false;
+ spin_unlock_irqrestore(&observed.lock, flags);
+
+ return 0;
+}
+
+static void test_exit(struct kunit *test)
+{
+}
+
+static int kmsan_suite_init(struct kunit_suite *suite)
+{
+ register_trace_console(probe_console, NULL);
+ return 0;
+}
+
+static void kmsan_suite_exit(struct kunit_suite *suite)
+{
+ unregister_trace_console(probe_console, NULL);
+ tracepoint_synchronize_unregister();
+}
+
+static struct kunit_suite kmsan_test_suite = {
+ .name = "kmsan",
+ .test_cases = kmsan_test_cases,
+ .init = test_init,
+ .exit = test_exit,
+ .suite_init = kmsan_suite_init,
+ .suite_exit = kmsan_suite_exit,
+};
+kunit_test_suites(&kmsan_test_suite);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Alexander Potapenko <glider@google.com>");
diff --git a/mm/kmsan/report.c b/mm/kmsan/report.c
new file mode 100644
index 000000000000..02736ec757f2
--- /dev/null
+++ b/mm/kmsan/report.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KMSAN error reporting routines.
+ *
+ * Copyright (C) 2019-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include <linux/console.h>
+#include <linux/moduleparam.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/uaccess.h>
+
+#include "kmsan.h"
+
+static DEFINE_RAW_SPINLOCK(kmsan_report_lock);
+#define DESCR_SIZE 128
+/* Protected by kmsan_report_lock */
+static char report_local_descr[DESCR_SIZE];
+int panic_on_kmsan __read_mostly;
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "kmsan."
+module_param_named(panic, panic_on_kmsan, int, 0);
+
+/*
+ * Skip internal KMSAN frames.
+ */
+static int get_stack_skipnr(const unsigned long stack_entries[],
+ int num_entries)
+{
+ int len, skip;
+ char buf[64];
+
+ for (skip = 0; skip < num_entries; ++skip) {
+ len = scnprintf(buf, sizeof(buf), "%ps",
+ (void *)stack_entries[skip]);
+
+ /* Never show __msan_* or kmsan_* functions. */
+ if ((strnstr(buf, "__msan_", len) == buf) ||
+ (strnstr(buf, "kmsan_", len) == buf))
+ continue;
+
+ /*
+ * Not a runtime function: @skip is the number of entries to skip
+ * to get to the first frame of interest.
+ */
+ break;
+ }
+
+ return skip;
+}
+
+/*
+ * Currently the descriptions of locals generated by Clang look as follows:
+ * ----local_name@function_name
+ * We want to print only the name of the local, as other information in that
+ * description can be confusing.
+ * The meaningful part of the description is copied to a global buffer to avoid
+ * allocating memory.
+ */
+static char *pretty_descr(char *descr)
+{
+ int pos = 0, len = strlen(descr);
+
+ for (int i = 0; i < len; i++) {
+ if (descr[i] == '@')
+ break;
+ if (descr[i] == '-')
+ continue;
+ report_local_descr[pos] = descr[i];
+ if (pos + 1 == DESCR_SIZE)
+ break;
+ pos++;
+ }
+ report_local_descr[pos] = 0;
+ return report_local_descr;
+}
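+
+/*
+ * E.g. a hypothetical description "----my_local@my_function" is printed
+ * simply as "my_local".
+ */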
+
+void kmsan_print_origin(depot_stack_handle_t origin)
+{
+ unsigned long *entries = NULL, *chained_entries = NULL;
+ unsigned int nr_entries, chained_nr_entries, skipnr;
+ void *pc1 = NULL, *pc2 = NULL;
+ depot_stack_handle_t head;
+ unsigned long magic;
+ char *descr = NULL;
+ unsigned int depth;
+
+ if (!origin)
+ return;
+
+ while (true) {
+ nr_entries = stack_depot_fetch(origin, &entries);
+ depth = kmsan_depth_from_eb(stack_depot_get_extra_bits(origin));
+ magic = nr_entries ? entries[0] : 0;
+ if ((nr_entries == 4) && (magic == KMSAN_ALLOCA_MAGIC_ORIGIN)) {
+ descr = (char *)entries[1];
+ pc1 = (void *)entries[2];
+ pc2 = (void *)entries[3];
+ pr_err("Local variable %s created at:\n",
+ pretty_descr(descr));
+ if (pc1)
+ pr_err(" %pSb\n", pc1);
+ if (pc2)
+ pr_err(" %pSb\n", pc2);
+ break;
+ }
+ if ((nr_entries == 3) && (magic == KMSAN_CHAIN_MAGIC_ORIGIN)) {
+ /*
+ * Origin chains deeper than KMSAN_MAX_ORIGIN_DEPTH are
+ * not stored, so the output may be incomplete.
+ */
+ if (depth == KMSAN_MAX_ORIGIN_DEPTH)
+ pr_err("<Zero or more stacks not recorded to save memory>\n\n");
+ head = entries[1];
+ origin = entries[2];
+ pr_err("Uninit was stored to memory at:\n");
+ chained_nr_entries =
+ stack_depot_fetch(head, &chained_entries);
+ kmsan_internal_unpoison_memory(
+ chained_entries,
+ chained_nr_entries * sizeof(*chained_entries),
+ /*checked*/ false);
+ skipnr = get_stack_skipnr(chained_entries,
+ chained_nr_entries);
+ stack_trace_print(chained_entries + skipnr,
+ chained_nr_entries - skipnr, 0);
+ pr_err("\n");
+ continue;
+ }
+ pr_err("Uninit was created at:\n");
+ if (nr_entries) {
+ skipnr = get_stack_skipnr(entries, nr_entries);
+ stack_trace_print(entries + skipnr, nr_entries - skipnr,
+ 0);
+ } else {
+ pr_err("(stack is not available)\n");
+ }
+ break;
+ }
+}
+
+void kmsan_report(depot_stack_handle_t origin, void *address, int size,
+ int off_first, int off_last, const void *user_addr,
+ enum kmsan_bug_reason reason)
+{
+ unsigned long stack_entries[KMSAN_STACK_DEPTH];
+ int num_stack_entries, skipnr;
+ char *bug_type = NULL;
+ unsigned long ua_flags;
+ bool is_uaf;
+
+ if (!kmsan_enabled)
+ return;
+ if (!current->kmsan_ctx.allow_reporting)
+ return;
+ if (!origin)
+ return;
+
+ current->kmsan_ctx.allow_reporting = false;
+ ua_flags = user_access_save();
+ raw_spin_lock(&kmsan_report_lock);
+ pr_err("=====================================================\n");
+ is_uaf = kmsan_uaf_from_eb(stack_depot_get_extra_bits(origin));
+ switch (reason) {
+ case REASON_ANY:
+ bug_type = is_uaf ? "use-after-free" : "uninit-value";
+ break;
+ case REASON_COPY_TO_USER:
+ bug_type = is_uaf ? "kernel-infoleak-after-free" :
+ "kernel-infoleak";
+ break;
+ case REASON_SUBMIT_URB:
+ bug_type = is_uaf ? "kernel-usb-infoleak-after-free" :
+ "kernel-usb-infoleak";
+ break;
+ }
+
+ num_stack_entries =
+ stack_trace_save(stack_entries, KMSAN_STACK_DEPTH, 1);
+ skipnr = get_stack_skipnr(stack_entries, num_stack_entries);
+
+ pr_err("BUG: KMSAN: %s in %pSb\n", bug_type,
+ (void *)stack_entries[skipnr]);
+ stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr,
+ 0);
+ pr_err("\n");
+
+ kmsan_print_origin(origin);
+
+ if (size) {
+ pr_err("\n");
+ if (off_first == off_last)
+ pr_err("Byte %d of %d is uninitialized\n", off_first,
+ size);
+ else
+ pr_err("Bytes %d-%d of %d are uninitialized\n",
+ off_first, off_last, size);
+ }
+ if (address)
+ pr_err("Memory access of size %d starts at %px\n", size,
+ address);
+ if (user_addr && reason == REASON_COPY_TO_USER)
+ pr_err("Data copied to user address %px\n", user_addr);
+ pr_err("\n");
+ dump_stack_print_info(KERN_ERR);
+ pr_err("=====================================================\n");
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+ raw_spin_unlock(&kmsan_report_lock);
+ if (panic_on_kmsan)
+ panic("kmsan.panic set ...\n");
+ user_access_restore(ua_flags);
+ current->kmsan_ctx.allow_reporting = true;
+}
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
new file mode 100644
index 000000000000..b8bb95eea5e3
--- /dev/null
+++ b/mm/kmsan/shadow.c
@@ -0,0 +1,308 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KMSAN shadow implementation.
+ *
+ * Copyright (C) 2017-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include <asm/kmsan.h>
+#include <asm/tlbflush.h>
+#include <linux/cacheflush.h>
+#include <linux/memblock.h>
+#include <linux/mm_types.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+
+#include "../internal.h"
+#include "kmsan.h"
+
+#define shadow_page_for(page) ((page)->kmsan_shadow)
+
+#define origin_page_for(page) ((page)->kmsan_origin)
+
+static void *shadow_ptr_for(struct page *page)
+{
+ return page_address(shadow_page_for(page));
+}
+
+static void *origin_ptr_for(struct page *page)
+{
+ return page_address(origin_page_for(page));
+}
+
+static bool page_has_metadata(struct page *page)
+{
+ return shadow_page_for(page) && origin_page_for(page);
+}
+
+static void set_no_shadow_origin_page(struct page *page)
+{
+ shadow_page_for(page) = NULL;
+ origin_page_for(page) = NULL;
+}
+
+/*
+ * Dummy load and store pages to be used when the real metadata is unavailable.
+ * There are separate pages for loads and stores, so that every load returns
+ * zero and no store can affect subsequent loads.
+ */
+static char dummy_load_page[PAGE_SIZE] __aligned(PAGE_SIZE);
+static char dummy_store_page[PAGE_SIZE] __aligned(PAGE_SIZE);
+
+static unsigned long vmalloc_meta(void *addr, bool is_origin)
+{
+ unsigned long addr64 = (unsigned long)addr, off;
+
+ KMSAN_WARN_ON(is_origin && !IS_ALIGNED(addr64, KMSAN_ORIGIN_SIZE));
+ if (kmsan_internal_is_vmalloc_addr(addr)) {
+ off = addr64 - VMALLOC_START;
+ return off + (is_origin ? KMSAN_VMALLOC_ORIGIN_START :
+ KMSAN_VMALLOC_SHADOW_START);
+ }
+ if (kmsan_internal_is_module_addr(addr)) {
+ off = addr64 - MODULES_VADDR;
+ return off + (is_origin ? KMSAN_MODULES_ORIGIN_START :
+ KMSAN_MODULES_SHADOW_START);
+ }
+ return 0;
+}
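+
+/*
+ * E.g. for a vmalloc()ed pointer p, the shadow byte of *(char *)p lives at
+ *
+ * (char *)(KMSAN_VMALLOC_SHADOW_START + ((u64)p - VMALLOC_START))
+ *
+ * and its KMSAN_ORIGIN_SIZE-aligned origin slot at the same offset from
+ * KMSAN_VMALLOC_ORIGIN_START.
+ */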
+
+static struct page *virt_to_page_or_null(void *vaddr)
+{
+ if (kmsan_virt_addr_valid(vaddr))
+ return virt_to_page(vaddr);
+ else
+ return NULL;
+}
+
+struct shadow_origin_ptr kmsan_get_shadow_origin_ptr(void *address, u64 size,
+ bool store)
+{
+ struct shadow_origin_ptr ret;
+ void *shadow;
+
+ /*
+ * Even if we redirect this memory access to the dummy page, it will
+ * go out of bounds.
+ */
+ KMSAN_WARN_ON(size > PAGE_SIZE);
+
+ if (!kmsan_enabled)
+ goto return_dummy;
+
+ KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(address, size));
+ shadow = kmsan_get_metadata(address, KMSAN_META_SHADOW);
+ if (!shadow)
+ goto return_dummy;
+
+ ret.shadow = shadow;
+ ret.origin = kmsan_get_metadata(address, KMSAN_META_ORIGIN);
+ return ret;
+
+return_dummy:
+ if (store) {
+ /* Ignore this store. */
+ ret.shadow = dummy_store_page;
+ ret.origin = dummy_store_page;
+ } else {
+ /* This load will return zero. */
+ ret.shadow = dummy_load_page;
+ ret.origin = dummy_load_page;
+ }
+ return ret;
+}
+
+/*
+ * Obtain the shadow or origin pointer for the given address, or NULL if there's
+ * none. The caller must check that the return value is non-NULL where needed.
+ * The return value of this function should not depend on whether we're in the
+ * runtime or not.
+ */
+void *kmsan_get_metadata(void *address, bool is_origin)
+{
+ u64 addr = (u64)address, pad, off;
+ struct page *page;
+ void *ret;
+
+ if (is_origin && !IS_ALIGNED(addr, KMSAN_ORIGIN_SIZE)) {
+ pad = addr % KMSAN_ORIGIN_SIZE;
+ addr -= pad;
+ }
+ address = (void *)addr;
+ if (kmsan_internal_is_vmalloc_addr(address) ||
+ kmsan_internal_is_module_addr(address))
+ return (void *)vmalloc_meta(address, is_origin);
+
+ ret = arch_kmsan_get_meta_or_null(address, is_origin);
+ if (ret)
+ return ret;
+
+ page = virt_to_page_or_null(address);
+ if (!page)
+ return NULL;
+ if (!page_has_metadata(page))
+ return NULL;
+ off = addr % PAGE_SIZE;
+
+ return (is_origin ? origin_ptr_for(page) : shadow_ptr_for(page)) + off;
+}
+
+void kmsan_copy_page_meta(struct page *dst, struct page *src)
+{
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ if (!dst || !page_has_metadata(dst))
+ return;
+ if (!src || !page_has_metadata(src)) {
+ kmsan_internal_unpoison_memory(page_address(dst), PAGE_SIZE,
+ /*checked*/ false);
+ return;
+ }
+
+ kmsan_enter_runtime();
+ __memcpy(shadow_ptr_for(dst), shadow_ptr_for(src), PAGE_SIZE);
+ __memcpy(origin_ptr_for(dst), origin_ptr_for(src), PAGE_SIZE);
+ kmsan_leave_runtime();
+}
+EXPORT_SYMBOL(kmsan_copy_page_meta);
+
+void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags)
+{
+ bool initialized = (flags & __GFP_ZERO) || !kmsan_enabled;
+ struct page *shadow, *origin;
+ depot_stack_handle_t handle;
+ int pages = 1 << order;
+
+ if (!page)
+ return;
+
+ shadow = shadow_page_for(page);
+ origin = origin_page_for(page);
+
+ if (initialized) {
+ __memset(page_address(shadow), 0, PAGE_SIZE * pages);
+ __memset(page_address(origin), 0, PAGE_SIZE * pages);
+ return;
+ }
+
+ /* Pages allocated while in the KMSAN runtime are not poisoned here. */
+ if (kmsan_in_runtime())
+ return;
+
+ __memset(page_address(shadow), -1, PAGE_SIZE * pages);
+ kmsan_enter_runtime();
+ handle = kmsan_save_stack_with_flags(flags, /*extra_bits*/ 0);
+ kmsan_leave_runtime();
+ /*
+ * Addresses are page-aligned, pages are contiguous, so it's ok
+ * to just fill the origin pages with @handle.
+ */
+ for (int i = 0; i < PAGE_SIZE * pages / sizeof(handle); i++)
+ ((depot_stack_handle_t *)page_address(origin))[i] = handle;
+}
+
+void kmsan_free_page(struct page *page, unsigned int order)
+{
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ kmsan_enter_runtime();
+ kmsan_internal_poison_memory(page_address(page),
+ PAGE_SIZE << compound_order(page),
+ GFP_KERNEL,
+ KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
+ kmsan_leave_runtime();
+}
+
+int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
+ pgprot_t prot, struct page **pages,
+ unsigned int page_shift)
+{
+ unsigned long shadow_start, origin_start, shadow_end, origin_end;
+ struct page **s_pages, **o_pages;
+ int nr, mapped, err = 0;
+
+ if (!kmsan_enabled)
+ return 0;
+
+ shadow_start = vmalloc_meta((void *)start, KMSAN_META_SHADOW);
+ shadow_end = vmalloc_meta((void *)end, KMSAN_META_SHADOW);
+ if (!shadow_start)
+ return 0;
+
+ nr = (end - start) / PAGE_SIZE;
+ s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL);
+ o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL);
+ if (!s_pages || !o_pages) {
+ err = -ENOMEM;
+ goto ret;
+ }
+ for (int i = 0; i < nr; i++) {
+ s_pages[i] = shadow_page_for(pages[i]);
+ o_pages[i] = origin_page_for(pages[i]);
+ }
+ /* Metadata mappings use the default kernel protections (writable, NX). */
+ prot = PAGE_KERNEL;
+
+ origin_start = vmalloc_meta((void *)start, KMSAN_META_ORIGIN);
+ origin_end = vmalloc_meta((void *)end, KMSAN_META_ORIGIN);
+ kmsan_enter_runtime();
+ mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot,
+ s_pages, page_shift);
+ if (mapped) {
+ err = mapped;
+ kmsan_leave_runtime();
+ goto ret;
+ }
+ mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot,
+ o_pages, page_shift);
+ if (mapped) {
+ err = mapped;
+ kmsan_leave_runtime();
+ goto ret;
+ }
+ kmsan_leave_runtime();
+ flush_tlb_kernel_range(shadow_start, shadow_end);
+ flush_tlb_kernel_range(origin_start, origin_end);
+ flush_cache_vmap(shadow_start, shadow_end);
+ flush_cache_vmap(origin_start, origin_end);
+
+ret:
+ kfree(s_pages);
+ kfree(o_pages);
+ return err;
+}
+
+/* Allocate metadata for pages allocated at boot time. */
+void __init kmsan_init_alloc_meta_for_range(void *start, void *end)
+{
+ struct page *shadow_p, *origin_p;
+ void *shadow, *origin;
+ struct page *page;
+ u64 size;
+
+ start = (void *)ALIGN_DOWN((u64)start, PAGE_SIZE);
+ size = ALIGN((u64)end - (u64)start, PAGE_SIZE);
+ shadow = memblock_alloc(size, PAGE_SIZE);
+ origin = memblock_alloc(size, PAGE_SIZE);
+ for (u64 addr = 0; addr < size; addr += PAGE_SIZE) {
+ page = virt_to_page_or_null((char *)start + addr);
+ shadow_p = virt_to_page_or_null((char *)shadow + addr);
+ set_no_shadow_origin_page(shadow_p);
+ shadow_page_for(page) = shadow_p;
+ origin_p = virt_to_page_or_null((char *)origin + addr);
+ set_no_shadow_origin_page(origin_p);
+ origin_page_for(page) = origin_p;
+ }
+}
+
+void kmsan_setup_meta(struct page *page, struct page *shadow,
+ struct page *origin, int order)
+{
+ for (int i = 0; i < (1 << order); i++) {
+ set_no_shadow_origin_page(&shadow[i]);
+ set_no_shadow_origin_page(&origin[i]);
+ shadow_page_for(&page[i]) = &shadow[i];
+ origin_page_for(&page[i]) = &origin[i];
+ }
+}
diff --git a/mm/ksm.c b/mm/ksm.c
index 9afccc36dbd2..d7b5b95e936e 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -15,6 +15,7 @@
#include <linux/errno.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
@@ -38,9 +39,14 @@
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>
+#include <linux/pagewalk.h>
#include <asm/tlbflush.h>
#include "internal.h"
+#include "mm_slot.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/ksm.h>
#ifdef CONFIG_NUMA
#define NUMA(x) (x)
@@ -81,7 +87,7 @@
* different KSM page copy of that content
*
* Internally, the regular nodes, "dups" and "chains" are represented
- * using the same :c:type:`struct stable_node` structure.
+ * using the same struct ksm_stable_node structure.
*
* In addition to the stable tree, KSM uses a second data structure called the
* unstable tree: this tree holds pointers to pages which have been found to
@@ -111,17 +117,13 @@
*/
/**
- * struct mm_slot - ksm information per mm that is being scanned
- * @link: link to the mm_slots hash list
- * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
+ * struct ksm_mm_slot - ksm information per mm that is being scanned
+ * @slot: hash lookup from mm to mm_slot
* @rmap_list: head for this mm_slot's singly-linked list of rmap_items
- * @mm: the mm that this information is valid for
*/
-struct mm_slot {
- struct hlist_node link;
- struct list_head mm_list;
- struct rmap_item *rmap_list;
- struct mm_struct *mm;
+struct ksm_mm_slot {
+ struct mm_slot slot;
+ struct ksm_rmap_item *rmap_list;
};
/**
@@ -134,14 +136,14 @@ struct mm_slot {
* There is only the one ksm_scan instance of this cursor structure.
*/
struct ksm_scan {
- struct mm_slot *mm_slot;
+ struct ksm_mm_slot *mm_slot;
unsigned long address;
- struct rmap_item **rmap_list;
+ struct ksm_rmap_item **rmap_list;
unsigned long seqnr;
};
/**
- * struct stable_node - node of the stable rbtree
+ * struct ksm_stable_node - node of the stable rbtree
* @node: rb node of this ksm page in the stable tree
* @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
* @hlist_dup: linked into the stable_node->hlist with a stable_node chain
@@ -152,7 +154,7 @@ struct ksm_scan {
* @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
* @nid: NUMA node id of stable tree in which linked (may not match kpfn)
*/
-struct stable_node {
+struct ksm_stable_node {
union {
struct rb_node node; /* when node of stable tree */
struct { /* when listed for migration */
@@ -181,7 +183,7 @@ struct stable_node {
};
/**
- * struct rmap_item - reverse mapping item for virtual addresses
+ * struct ksm_rmap_item - reverse mapping item for virtual addresses
* @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
* @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
* @nid: NUMA node id of unstable tree in which linked (may not match page)
@@ -192,8 +194,8 @@ struct stable_node {
* @head: pointer to stable_node heading this list in the stable tree
* @hlist: link into hlist of rmap_items hanging off that stable_node
*/
-struct rmap_item {
- struct rmap_item *rmap_list;
+struct ksm_rmap_item {
+ struct ksm_rmap_item *rmap_list;
union {
struct anon_vma *anon_vma; /* when stable */
#ifdef CONFIG_NUMA
@@ -206,7 +208,7 @@ struct rmap_item {
union {
struct rb_node node; /* when node of unstable tree */
struct { /* when listed from stable tree */
- struct stable_node *head;
+ struct ksm_stable_node *head;
struct hlist_node hlist;
};
};
@@ -215,8 +217,6 @@ struct rmap_item {
#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
#define STABLE_FLAG 0x200 /* is listed from the stable tree */
-#define KSM_FLAG_MASK (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
- /* to mask all the flags */
/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
@@ -231,8 +231,8 @@ static LIST_HEAD(migrate_nodes);
#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
-static struct mm_slot ksm_mm_head = {
- .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
+static struct ksm_mm_slot ksm_mm_head = {
+ .slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node),
};
static struct ksm_scan ksm_scan = {
.mm_slot = &ksm_mm_head,
@@ -261,7 +261,7 @@ static unsigned long ksm_stable_node_chains;
static unsigned long ksm_stable_node_dups;
/* Delay in pruning stale stable_node_dups in the stable_node_chains */
-static int ksm_stable_node_chains_prune_millisecs = 2000;
+static unsigned int ksm_stable_node_chains_prune_millisecs = 2000;
/* Maximum number of page slots sharing a stable node */
static int ksm_max_page_sharing = 256;
@@ -299,21 +299,21 @@ static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);
-#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
+#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\
sizeof(struct __struct), __alignof__(struct __struct),\
(__flags), NULL)
static int __init ksm_slab_init(void)
{
- rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
+ rmap_item_cache = KSM_KMEM_CACHE(ksm_rmap_item, 0);
if (!rmap_item_cache)
goto out;
- stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
+ stable_node_cache = KSM_KMEM_CACHE(ksm_stable_node, 0);
if (!stable_node_cache)
goto out_free1;
- mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
+ mm_slot_cache = KSM_KMEM_CACHE(ksm_mm_slot, 0);
if (!mm_slot_cache)
goto out_free2;
@@ -335,18 +335,18 @@ static void __init ksm_slab_free(void)
mm_slot_cache = NULL;
}
-static __always_inline bool is_stable_node_chain(struct stable_node *chain)
+static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain)
{
return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
}
-static __always_inline bool is_stable_node_dup(struct stable_node *dup)
+static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup)
{
return dup->head == STABLE_NODE_DUP_HEAD;
}
-static inline void stable_node_chain_add_dup(struct stable_node *dup,
- struct stable_node *chain)
+static inline void stable_node_chain_add_dup(struct ksm_stable_node *dup,
+ struct ksm_stable_node *chain)
{
VM_BUG_ON(is_stable_node_dup(dup));
dup->head = STABLE_NODE_DUP_HEAD;
@@ -355,14 +355,14 @@ static inline void stable_node_chain_add_dup(struct stable_node *dup,
ksm_stable_node_dups++;
}
-static inline void __stable_node_dup_del(struct stable_node *dup)
+static inline void __stable_node_dup_del(struct ksm_stable_node *dup)
{
VM_BUG_ON(!is_stable_node_dup(dup));
hlist_del(&dup->hlist_dup);
ksm_stable_node_dups--;
}
-static inline void stable_node_dup_del(struct stable_node *dup)
+static inline void stable_node_dup_del(struct ksm_stable_node *dup)
{
VM_BUG_ON(is_stable_node_chain(dup));
if (is_stable_node_dup(dup))
@@ -374,9 +374,9 @@ static inline void stable_node_dup_del(struct stable_node *dup)
#endif
}
-static inline struct rmap_item *alloc_rmap_item(void)
+static inline struct ksm_rmap_item *alloc_rmap_item(void)
{
- struct rmap_item *rmap_item;
+ struct ksm_rmap_item *rmap_item;
rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
__GFP_NORETRY | __GFP_NOWARN);
@@ -385,14 +385,15 @@ static inline struct rmap_item *alloc_rmap_item(void)
return rmap_item;
}
-static inline void free_rmap_item(struct rmap_item *rmap_item)
+static inline void free_rmap_item(struct ksm_rmap_item *rmap_item)
{
ksm_rmap_items--;
+ rmap_item->mm->ksm_rmap_items--;
rmap_item->mm = NULL; /* debug safety */
kmem_cache_free(rmap_item_cache, rmap_item);
}
-static inline struct stable_node *alloc_stable_node(void)
+static inline struct ksm_stable_node *alloc_stable_node(void)
{
/*
* The allocation can take too long with GFP_KERNEL when memory is under
@@ -402,43 +403,13 @@ static inline struct stable_node *alloc_stable_node(void)
return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
}
-static inline void free_stable_node(struct stable_node *stable_node)
+static inline void free_stable_node(struct ksm_stable_node *stable_node)
{
VM_BUG_ON(stable_node->rmap_hlist_len &&
!is_stable_node_chain(stable_node));
kmem_cache_free(stable_node_cache, stable_node);
}
-static inline struct mm_slot *alloc_mm_slot(void)
-{
- if (!mm_slot_cache) /* initialization failed */
- return NULL;
- return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
-}
-
-static inline void free_mm_slot(struct mm_slot *mm_slot)
-{
- kmem_cache_free(mm_slot_cache, mm_slot);
-}
-
-static struct mm_slot *get_mm_slot(struct mm_struct *mm)
-{
- struct mm_slot *slot;
-
- hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
- if (slot->mm == mm)
- return slot;
-
- return NULL;
-}
-
-static void insert_to_mm_slots_hash(struct mm_struct *mm,
- struct mm_slot *mm_slot)
-{
- mm_slot->mm = mm;
- hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
-}
-
/*
* ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
* page tables after it has passed through ksm_exit() - which, if necessary,
@@ -452,47 +423,82 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
}
+static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
+ struct mm_walk *walk)
+{
+ struct page *page = NULL;
+ spinlock_t *ptl;
+ pte_t *pte;
+ pte_t ptent;
+ int ret;
+
+ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (!pte)
+ return 0;
+ ptent = ptep_get(pte);
+ if (pte_present(ptent)) {
+ page = vm_normal_page(walk->vma, addr, ptent);
+ } else if (!pte_none(ptent)) {
+ swp_entry_t entry = pte_to_swp_entry(ptent);
+
+ /*
+ * As KSM pages remain KSM pages until freed, no need to wait
+ * here for migration to end.
+ */
+ if (is_migration_entry(entry))
+ page = pfn_swap_entry_to_page(entry);
+ }
+ ret = page && PageKsm(page);
+ pte_unmap_unlock(pte, ptl);
+ return ret;
+}
+
+static const struct mm_walk_ops break_ksm_ops = {
+ .pmd_entry = break_ksm_pmd_entry,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static const struct mm_walk_ops break_ksm_lock_vma_ops = {
+ .pmd_entry = break_ksm_pmd_entry,
+ .walk_lock = PGWALK_WRLOCK,
+};
+
/*
- * We use break_ksm to break COW on a ksm page: it's a stripped down
+ * We use break_ksm to break COW on a ksm page by triggering unsharing,
+ * such that the ksm page will get replaced by an exclusive anonymous page.
*
- * if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1)
- * put_page(page);
- *
- * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
+ * We take great care only to touch a ksm page, in a VM_MERGEABLE vma,
* in case the application has unmapped and remapped mm,addr meanwhile.
* Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
- * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
+ * mmap of /dev/mem, where we would not want to touch it.
*
- * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
+ * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context
* of the process that owns 'vma'. We also do not want to enforce
* protection keys here anyway.
*/
-static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
+static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
{
- struct page *page;
vm_fault_t ret = 0;
+ const struct mm_walk_ops *ops = lock_vma ?
+ &break_ksm_lock_vma_ops : &break_ksm_ops;
do {
+ int ksm_page;
+
cond_resched();
- page = follow_page(vma, addr,
- FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
- if (IS_ERR_OR_NULL(page))
- break;
- if (PageKsm(page))
- ret = handle_mm_fault(vma, addr,
- FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
- NULL);
- else
- ret = VM_FAULT_WRITE;
- put_page(page);
- } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
+ ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL);
+ if (WARN_ON_ONCE(ksm_page < 0))
+ return ksm_page;
+ if (!ksm_page)
+ return 0;
+ ret = handle_mm_fault(vma, addr,
+ FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
+ NULL);
+ } while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
/*
- * We must loop because handle_mm_fault() may back out if there's
- * any difficulty e.g. if pte accessed bit gets updated concurrently.
- *
- * VM_FAULT_WRITE is what we have been hoping for: it indicates that
- * COW has been broken, even if the vma does not permit VM_WRITE;
- * but note that a concurrent fault might break PageKsm for us.
+ * We must loop until we no longer find a KSM page because
+ * handle_mm_fault() may back out if there's any difficulty e.g. if
+ * pte accessed bit gets updated concurrently.
*
* VM_FAULT_SIGBUS could occur if we race with truncation of the
* backing file, which also invalidates anonymous pages: that's
@@ -517,21 +523,41 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}
+static bool vma_ksm_compatible(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP |
+ VM_IO | VM_DONTEXPAND | VM_HUGETLB |
+ VM_MIXEDMAP))
+ return false; /* just ignore the advice */
+
+ if (vma_is_dax(vma))
+ return false;
+
+#ifdef VM_SAO
+ if (vma->vm_flags & VM_SAO)
+ return false;
+#endif
+#ifdef VM_SPARC_ADI
+ if (vma->vm_flags & VM_SPARC_ADI)
+ return false;
+#endif
+
+ return true;
+}
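
vma_ksm_compatible() above is the gatekeeper for both the madvise() path and the
process-wide MMF_VM_MERGE_ANY mode added further down: shared, PFN-mapped, IO,
hugetlb and DAX areas never become VM_MERGEABLE. A minimal userspace sketch of the
visible effect, assuming a kernel built with CONFIG_KSM; it is illustrative only.
Both madvise() calls succeed, but only the private anonymous mapping can actually
be marked mergeable:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 4096;
    /* Private anonymous memory: accepted by vma_ksm_compatible(). */
    void *anon = mmap(NULL, len, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    /* Shared mapping: VM_SHARED makes KSM silently ignore the advice. */
    void *shrd = mmap(NULL, len, PROT_READ | PROT_WRITE,
                      MAP_SHARED | MAP_ANONYMOUS, -1, 0);

    if (anon == MAP_FAILED || shrd == MAP_FAILED)
        return 1;

    /* Both report 0: incompatible VMAs just ignore MADV_MERGEABLE. */
    printf("anon   madvise(MADV_MERGEABLE) -> %d\n",
           madvise(anon, len, MADV_MERGEABLE));
    printf("shared madvise(MADV_MERGEABLE) -> %d\n",
           madvise(shrd, len, MADV_MERGEABLE));
    return 0;
}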
+
static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
unsigned long addr)
{
struct vm_area_struct *vma;
if (ksm_test_exit(mm))
return NULL;
- vma = find_vma(mm, addr);
- if (!vma || vma->vm_start > addr)
- return NULL;
- if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
+ vma = vma_lookup(mm, addr);
+ if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
return NULL;
return vma;
}
-static void break_cow(struct rmap_item *rmap_item)
+static void break_cow(struct ksm_rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
unsigned long addr = rmap_item->address;
@@ -546,11 +572,11 @@ static void break_cow(struct rmap_item *rmap_item)
mmap_read_lock(mm);
vma = find_mergeable_vma(mm, addr);
if (vma)
- break_ksm(vma, addr);
+ break_ksm(vma, addr, false);
mmap_read_unlock(mm);
}
-static struct page *get_mergeable_page(struct rmap_item *rmap_item)
+static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
unsigned long addr = rmap_item->address;
@@ -565,10 +591,13 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
page = follow_page(vma, addr, FOLL_GET);
if (IS_ERR_OR_NULL(page))
goto out;
+ if (is_zone_device_page(page))
+ goto out_putpage;
if (PageAnon(page)) {
flush_anon_page(vma, page, addr);
flush_dcache_page(page);
} else {
+out_putpage:
put_page(page);
out:
page = NULL;
@@ -588,10 +617,10 @@ static inline int get_kpfn_nid(unsigned long kpfn)
return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
}
-static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
+static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup,
struct rb_root *root)
{
- struct stable_node *chain = alloc_stable_node();
+ struct ksm_stable_node *chain = alloc_stable_node();
VM_BUG_ON(is_stable_node_chain(dup));
if (likely(chain)) {
INIT_HLIST_HEAD(&chain->hlist);
@@ -621,7 +650,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
return chain;
}
-static inline void free_stable_node_chain(struct stable_node *chain,
+static inline void free_stable_node_chain(struct ksm_stable_node *chain,
struct rb_root *root)
{
rb_erase(&chain->node, root);
@@ -629,18 +658,23 @@ static inline void free_stable_node_chain(struct stable_node *chain,
ksm_stable_node_chains--;
}
-static void remove_node_from_stable_tree(struct stable_node *stable_node)
+static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node)
{
- struct rmap_item *rmap_item;
+ struct ksm_rmap_item *rmap_item;
/* check it's not STABLE_NODE_CHAIN or negative */
BUG_ON(stable_node->rmap_hlist_len < 0);
hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
- if (rmap_item->hlist.next)
+ if (rmap_item->hlist.next) {
ksm_pages_sharing--;
- else
+ trace_ksm_remove_rmap_item(stable_node->kpfn, rmap_item, rmap_item->mm);
+ } else {
ksm_pages_shared--;
+ }
+
+ rmap_item->mm->ksm_merging_pages--;
+
VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
stable_node->rmap_hlist_len--;
put_anon_vma(rmap_item->anon_vma);
@@ -655,11 +689,10 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
* from &migrate_nodes. This will verify that future list.h changes
* don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it.
*/
-#if defined(GCC_VERSION) && GCC_VERSION >= 40903
BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
-#endif
+ trace_ksm_remove_ksm_page(stable_node->kpfn);
if (stable_node->head == &migrate_nodes)
list_del(&stable_node->list);
else
@@ -692,7 +725,7 @@ enum get_ksm_page_flags {
* a page to put something that might look like our key in page->mapping.
* is on its way to being freed; but it is an anomaly to bear in mind.
*/
-static struct page *get_ksm_page(struct stable_node *stable_node,
+static struct page *get_ksm_page(struct ksm_stable_node *stable_node,
enum get_ksm_page_flags flags)
{
struct page *page;
@@ -714,7 +747,7 @@ again:
* however, it might mean that the page is under page_ref_freeze().
* The __remove_mapping() case is easy, again the node is now stale;
* the same is in reuse_ksm_page() case; but if page is swapcache
- * in migrate_page_move_mapping(), it might still be our page,
+ * in folio_migrate_mapping(), it might still be our page,
* in which case it's essential to keep the node.
*/
while (!get_page_unless_zero(page)) {
@@ -757,7 +790,7 @@ stale:
/*
* We come here from above when page->mapping or !PageSwapCache
* suggests that the node is stale; but it might be under migration.
- * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
+ * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(),
* before checking whether node->kpfn has been changed.
*/
smp_rmb();
@@ -771,10 +804,10 @@ stale:
* Removing rmap_item from stable or unstable tree.
* This function will clean the information from the stable/unstable tree.
*/
-static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
+static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item)
{
if (rmap_item->address & STABLE_FLAG) {
- struct stable_node *stable_node;
+ struct ksm_stable_node *stable_node;
struct page *page;
stable_node = rmap_item->head;
@@ -790,10 +823,14 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
ksm_pages_sharing--;
else
ksm_pages_shared--;
+
+ rmap_item->mm->ksm_merging_pages--;
+
VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
stable_node->rmap_hlist_len--;
put_anon_vma(rmap_item->anon_vma);
+ rmap_item->head = NULL;
rmap_item->address &= PAGE_MASK;
} else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -817,11 +854,10 @@ out:
cond_resched(); /* we're called from many long loops */
}
-static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
- struct rmap_item **rmap_list)
+static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
{
while (*rmap_list) {
- struct rmap_item *rmap_item = *rmap_list;
+ struct ksm_rmap_item *rmap_item = *rmap_list;
*rmap_list = rmap_item->rmap_list;
remove_rmap_item_from_tree(rmap_item);
free_rmap_item(rmap_item);
@@ -842,7 +878,7 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
* in cmp_and_merge_page on one of the rmap_items we would be removing.
*/
static int unmerge_ksm_pages(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+ unsigned long start, unsigned long end, bool lock_vma)
{
unsigned long addr;
int err = 0;
@@ -853,19 +889,25 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
if (signal_pending(current))
err = -ERESTARTSYS;
else
- err = break_ksm(vma, addr);
+ err = break_ksm(vma, addr, lock_vma);
}
return err;
}
-static inline struct stable_node *page_stable_node(struct page *page)
+static inline struct ksm_stable_node *folio_stable_node(struct folio *folio)
{
- return PageKsm(page) ? page_rmapping(page) : NULL;
+ return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
+}
+
+static inline struct ksm_stable_node *page_stable_node(struct page *page)
+{
+ return folio_stable_node(page_folio(page));
}
static inline void set_page_stable_node(struct page *page,
- struct stable_node *stable_node)
+ struct ksm_stable_node *stable_node)
{
+ VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page);
page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
}
@@ -873,7 +915,7 @@ static inline void set_page_stable_node(struct page *page,
/*
* Only called through the sysfs control interface:
*/
-static int remove_stable_node(struct stable_node *stable_node)
+static int remove_stable_node(struct ksm_stable_node *stable_node)
{
struct page *page;
int err;
@@ -897,7 +939,7 @@ static int remove_stable_node(struct stable_node *stable_node)
* The stable node did not yet appear stale to get_ksm_page(),
* since that allows for an unmapped ksm page to be recognized
* right up until it is freed; but the node is safe to remove.
- * This page might be in a pagevec waiting to be freed,
+ * This page might be in an LRU cache waiting to be freed,
* or it might be PageSwapCache (perhaps under writeback),
* or it might have been removed from swapcache a moment ago.
*/
@@ -911,10 +953,10 @@ static int remove_stable_node(struct stable_node *stable_node)
return err;
}
-static int remove_stable_node_chain(struct stable_node *stable_node,
+static int remove_stable_node_chain(struct ksm_stable_node *stable_node,
struct rb_root *root)
{
- struct stable_node *dup;
+ struct ksm_stable_node *dup;
struct hlist_node *hlist_safe;
if (!is_stable_node_chain(stable_node)) {
@@ -938,14 +980,14 @@ static int remove_stable_node_chain(struct stable_node *stable_node,
static int remove_all_stable_nodes(void)
{
- struct stable_node *stable_node, *next;
+ struct ksm_stable_node *stable_node, *next;
int nid;
int err = 0;
for (nid = 0; nid < ksm_nr_node_ids; nid++) {
while (root_stable_tree[nid].rb_node) {
stable_node = rb_entry(root_stable_tree[nid].rb_node,
- struct stable_node, node);
+ struct ksm_stable_node, node);
if (remove_stable_node_chain(stable_node,
root_stable_tree + nid)) {
err = -EBUSY;
@@ -964,44 +1006,57 @@ static int remove_all_stable_nodes(void)
static int unmerge_and_remove_all_rmap_items(void)
{
- struct mm_slot *mm_slot;
+ struct ksm_mm_slot *mm_slot;
+ struct mm_slot *slot;
struct mm_struct *mm;
struct vm_area_struct *vma;
int err = 0;
spin_lock(&ksm_mmlist_lock);
- ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
- struct mm_slot, mm_list);
+ slot = list_entry(ksm_mm_head.slot.mm_node.next,
+ struct mm_slot, mm_node);
+ ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
spin_unlock(&ksm_mmlist_lock);
- for (mm_slot = ksm_scan.mm_slot;
- mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
- mm = mm_slot->mm;
+ for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head;
+ mm_slot = ksm_scan.mm_slot) {
+ VMA_ITERATOR(vmi, mm_slot->slot.mm, 0);
+
+ mm = mm_slot->slot.mm;
mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (ksm_test_exit(mm))
- break;
+
+ /*
+ * Exit right away if mm is exiting to avoid lockdep issue in
+ * the maple tree
+ */
+ if (ksm_test_exit(mm))
+ goto mm_exiting;
+
+ for_each_vma(vmi, vma) {
if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
continue;
err = unmerge_ksm_pages(vma,
- vma->vm_start, vma->vm_end);
+ vma->vm_start, vma->vm_end, false);
if (err)
goto error;
}
- remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
+mm_exiting:
+ remove_trailing_rmap_items(&mm_slot->rmap_list);
mmap_read_unlock(mm);
spin_lock(&ksm_mmlist_lock);
- ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
- struct mm_slot, mm_list);
+ slot = list_entry(mm_slot->slot.mm_node.next,
+ struct mm_slot, mm_node);
+ ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
if (ksm_test_exit(mm)) {
- hash_del(&mm_slot->link);
- list_del(&mm_slot->mm_list);
+ hash_del(&mm_slot->slot.hash);
+ list_del(&mm_slot->slot.mm_node);
spin_unlock(&ksm_mmlist_lock);
- free_mm_slot(mm_slot);
+ mm_slot_free(mm_slot_cache, mm_slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+ clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
mmdrop(mm);
} else
spin_unlock(&ksm_mmlist_lock);
@@ -1034,13 +1089,12 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
pte_t *orig_pte)
{
struct mm_struct *mm = vma->vm_mm;
- struct page_vma_mapped_walk pvmw = {
- .page = page,
- .vma = vma,
- };
+ DEFINE_PAGE_VMA_WALK(pvmw, page, vma, 0, 0);
int swapped;
int err = -EFAULT;
struct mmu_notifier_range range;
+ bool anon_exclusive;
+ pte_t entry;
pvmw.address = page_address_in_vma(page, vma);
if (pvmw.address == -EFAULT)
@@ -1048,8 +1102,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
BUG_ON(PageTransCompound(page));
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
- pvmw.address,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address,
pvmw.address + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
@@ -1058,17 +1111,16 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
goto out_unlock;
- if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
- (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
- mm_tlb_flush_pending(mm)) {
- pte_t entry;
-
+ anon_exclusive = PageAnonExclusive(page);
+ entry = ptep_get(pvmw.pte);
+ if (pte_write(entry) || pte_dirty(entry) ||
+ anon_exclusive || mm_tlb_flush_pending(mm)) {
swapped = PageSwapCache(page);
flush_cache_page(vma, pvmw.address, page_to_pfn(page));
/*
* Ok this is tricky, when get_user_pages_fast() runs it doesn't
* take any lock, therefore the check that we are going to make
- * with the pagecount against the mapcount is racey and
+ * with the pagecount against the mapcount is racy and
* O_DIRECT can happen right after the check.
* So we clear the pte and flush the tlb before the check
* this assures us that no O_DIRECT can happen after the check
@@ -1077,7 +1129,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
* No need to notify as we are downgrading page table to read
* only not changing it to point to a new page.
*
- * See Documentation/vm/mmu_notifier.rst
+ * See Documentation/mm/mmu_notifier.rst
*/
entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
/*
@@ -1088,16 +1140,23 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
set_pte_at(mm, pvmw.address, pvmw.pte, entry);
goto out_unlock;
}
+
+ /* See page_try_share_anon_rmap(): clear PTE first. */
+ if (anon_exclusive && page_try_share_anon_rmap(page)) {
+ set_pte_at(mm, pvmw.address, pvmw.pte, entry);
+ goto out_unlock;
+ }
+
if (pte_dirty(entry))
set_page_dirty(page);
+ entry = pte_mkclean(entry);
+
+ if (pte_write(entry))
+ entry = pte_wrprotect(entry);
- if (pte_protnone(entry))
- entry = pte_mkclean(pte_clear_savedwrite(entry));
- else
- entry = pte_mkclean(pte_wrprotect(entry));
set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
}
- *orig_pte = *pvmw.pte;
+ *orig_pte = entry;
err = 0;
out_unlock:
@@ -1121,7 +1180,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
struct page *kpage, pte_t orig_pte)
{
struct mm_struct *mm = vma->vm_mm;
+ struct folio *folio;
pmd_t *pmd;
+ pmd_t pmde;
pte_t *ptep;
pte_t newpte;
spinlock_t *ptl;
@@ -1136,16 +1197,28 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
pmd = mm_find_pmd(mm, addr);
if (!pmd)
goto out;
+ /*
+ * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
+ * without holding anon_vma lock for write. So when looking for a
+ * genuine pmde (in which to find pte), test present and !THP together.
+ */
+ pmde = pmdp_get_lockless(pmd);
+ if (!pmd_present(pmde) || pmd_trans_huge(pmde))
+ goto out;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
addr + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
- if (!pte_same(*ptep, orig_pte)) {
+ if (!ptep)
+ goto out_mn;
+ if (!pte_same(ptep_get(ptep), orig_pte)) {
pte_unmap_unlock(ptep, ptl);
goto out_mn;
}
+ VM_BUG_ON_PAGE(PageAnonExclusive(page), page);
+ VM_BUG_ON_PAGE(PageAnon(kpage) && PageAnonExclusive(kpage), kpage);
/*
* No need to check ksm_use_zero_pages here: we can only have a
@@ -1153,7 +1226,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
*/
if (!is_zero_pfn(page_to_pfn(kpage))) {
get_page(kpage);
- page_add_anon_rmap(kpage, vma, addr, false);
+ page_add_anon_rmap(kpage, vma, addr, RMAP_NONE);
newpte = mk_pte(kpage, vma->vm_page_prot);
} else {
newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
@@ -1167,20 +1240,21 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
dec_mm_counter(mm, MM_ANONPAGES);
}
- flush_cache_page(vma, addr, pte_pfn(*ptep));
+ flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep)));
/*
* No need to notify as we are replacing a read only page with another
* read only page with the same content.
*
- * See Documentation/vm/mmu_notifier.rst
+ * See Documentation/mm/mmu_notifier.rst
*/
ptep_clear_flush(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, newpte);
- page_remove_rmap(page, false);
- if (!page_mapped(page))
- try_to_free_swap(page);
- put_page(page);
+ folio = page_folio(page);
+ page_remove_rmap(page, vma, false);
+ if (!folio_mapped(folio))
+ folio_free_swap(folio);
+ folio_put(folio);
pte_unmap_unlock(ptep, ptl);
err = 0;
@@ -1252,16 +1326,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
err = replace_page(vma, page, kpage, orig_pte);
}
- if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
- munlock_vma_page(page);
- if (!PageMlocked(kpage)) {
- unlock_page(page);
- lock_page(kpage);
- mlock_vma_page(kpage);
- page = kpage; /* for final unlock */
- }
- }
-
out_unlock:
unlock_page(page);
out:
@@ -1274,7 +1338,7 @@ out:
*
* This function returns 0 if the pages were merged, -EFAULT otherwise.
*/
-static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
+static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item,
struct page *page, struct page *kpage)
{
struct mm_struct *mm = rmap_item->mm;
@@ -1298,6 +1362,8 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
get_anon_vma(vma->anon_vma);
out:
mmap_read_unlock(mm);
+ trace_ksm_merge_with_ksm_page(kpage, page_to_pfn(kpage ? kpage : page),
+ rmap_item, mm, err);
return err;
}
@@ -1311,9 +1377,9 @@ out:
* Note that this function upgrades page to ksm page: if one of the pages
* is already a ksm page, try_to_merge_with_ksm_page should be used.
*/
-static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
+static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item,
struct page *page,
- struct rmap_item *tree_rmap_item,
+ struct ksm_rmap_item *tree_rmap_item,
struct page *tree_page)
{
int err;
@@ -1333,7 +1399,7 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
}
static __always_inline
-bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
+bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset)
{
VM_BUG_ON(stable_node->rmap_hlist_len < 0);
/*
@@ -1347,17 +1413,17 @@ bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
}
static __always_inline
-bool is_page_sharing_candidate(struct stable_node *stable_node)
+bool is_page_sharing_candidate(struct ksm_stable_node *stable_node)
{
return __is_page_sharing_candidate(stable_node, 0);
}
-static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
- struct stable_node **_stable_node,
+static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup,
+ struct ksm_stable_node **_stable_node,
struct rb_root *root,
bool prune_stale_stable_nodes)
{
- struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
+ struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node;
struct hlist_node *hlist_safe;
struct page *_tree_page, *tree_page = NULL;
int nr = 0;
@@ -1438,7 +1504,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
*/
*_stable_node = found;
/*
- * Just for robustneess as stable_node is
+ * Just for robustness, as stable_node is
* otherwise left as a stable pointer, the
* compiler shall optimize it away at build
* time.
@@ -1471,7 +1537,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
return tree_page;
}
-static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
+static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stable_node,
struct rb_root *root)
{
if (!is_stable_node_chain(stable_node))
@@ -1498,12 +1564,12 @@ static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
* function and will be overwritten in all cases, the caller doesn't
* need to initialize it.
*/
-static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
- struct stable_node **_stable_node,
+static struct page *__stable_node_chain(struct ksm_stable_node **_stable_node_dup,
+ struct ksm_stable_node **_stable_node,
struct rb_root *root,
bool prune_stale_stable_nodes)
{
- struct stable_node *stable_node = *_stable_node;
+ struct ksm_stable_node *stable_node = *_stable_node;
if (!is_stable_node_chain(stable_node)) {
if (is_page_sharing_candidate(stable_node)) {
*_stable_node_dup = stable_node;
@@ -1520,18 +1586,18 @@ static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
prune_stale_stable_nodes);
}
-static __always_inline struct page *chain_prune(struct stable_node **s_n_d,
- struct stable_node **s_n,
+static __always_inline struct page *chain_prune(struct ksm_stable_node **s_n_d,
+ struct ksm_stable_node **s_n,
struct rb_root *root)
{
return __stable_node_chain(s_n_d, s_n, root, true);
}
-static __always_inline struct page *chain(struct stable_node **s_n_d,
- struct stable_node *s_n,
+static __always_inline struct page *chain(struct ksm_stable_node **s_n_d,
+ struct ksm_stable_node *s_n,
struct rb_root *root)
{
- struct stable_node *old_stable_node = s_n;
+ struct ksm_stable_node *old_stable_node = s_n;
struct page *tree_page;
tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
@@ -1555,8 +1621,8 @@ static struct page *stable_tree_search(struct page *page)
struct rb_root *root;
struct rb_node **new;
struct rb_node *parent;
- struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
- struct stable_node *page_node;
+ struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any;
+ struct ksm_stable_node *page_node;
page_node = page_stable_node(page);
if (page_node && page_node->head != &migrate_nodes) {
@@ -1576,7 +1642,7 @@ again:
int ret;
cond_resched();
- stable_node = rb_entry(*new, struct stable_node, node);
+ stable_node = rb_entry(*new, struct ksm_stable_node, node);
stable_node_any = NULL;
tree_page = chain_prune(&stable_node_dup, &stable_node, root);
/*
@@ -1586,7 +1652,7 @@ again:
* the rbtree instead as a regular stable_node (in
* order to collapse the stable_node chain if a single
* stable_node dup was found in it). In such case the
- * stable_node is overwritten by the calleee to point
+ * stable_node is overwritten by the callee to point
* to the stable_node_dup that was collapsed in the
* stable rbtree and stable_node will be equal to
* stable_node_dup like if the chain never existed.
@@ -1771,7 +1837,6 @@ chain_append:
* stable_node_dup is the dup to replace.
*/
if (stable_node_dup == stable_node) {
- VM_BUG_ON(is_stable_node_chain(stable_node_dup));
VM_BUG_ON(is_stable_node_dup(stable_node_dup));
/* chain is missing so create it */
stable_node = alloc_stable_node_chain(stable_node_dup,
@@ -1785,7 +1850,6 @@ chain_append:
* of the current nid for this page
* content.
*/
- VM_BUG_ON(!is_stable_node_chain(stable_node));
VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
VM_BUG_ON(page_node->head != &migrate_nodes);
list_del(&page_node->list);
@@ -1801,14 +1865,14 @@ chain_append:
* This function returns the stable tree node just allocated on success,
* NULL otherwise.
*/
-static struct stable_node *stable_tree_insert(struct page *kpage)
+static struct ksm_stable_node *stable_tree_insert(struct page *kpage)
{
int nid;
unsigned long kpfn;
struct rb_root *root;
struct rb_node **new;
struct rb_node *parent;
- struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
+ struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any;
bool need_chain = false;
kpfn = page_to_pfn(kpage);
@@ -1823,7 +1887,7 @@ again:
int ret;
cond_resched();
- stable_node = rb_entry(*new, struct stable_node, node);
+ stable_node = rb_entry(*new, struct ksm_stable_node, node);
stable_node_any = NULL;
tree_page = chain(&stable_node_dup, stable_node, root);
if (!stable_node_dup) {
@@ -1892,7 +1956,7 @@ again:
rb_insert_color(&stable_node_dup->node, root);
} else {
if (!is_stable_node_chain(stable_node)) {
- struct stable_node *orig = stable_node;
+ struct ksm_stable_node *orig = stable_node;
/* chain is missing so create it */
stable_node = alloc_stable_node_chain(orig, root);
if (!stable_node) {
@@ -1921,7 +1985,7 @@ again:
* the same walking algorithm in an rbtree.
*/
static
-struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
+struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item,
struct page *page,
struct page **tree_pagep)
{
@@ -1935,12 +1999,12 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
new = &root->rb_node;
while (*new) {
- struct rmap_item *tree_rmap_item;
+ struct ksm_rmap_item *tree_rmap_item;
struct page *tree_page;
int ret;
cond_resched();
- tree_rmap_item = rb_entry(*new, struct rmap_item, node);
+ tree_rmap_item = rb_entry(*new, struct ksm_rmap_item, node);
tree_page = get_mergeable_page(tree_rmap_item);
if (!tree_page)
return NULL;
@@ -1992,8 +2056,8 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
* rmap_items hanging off a given node of the stable tree, all sharing
* the same ksm page.
*/
-static void stable_tree_append(struct rmap_item *rmap_item,
- struct stable_node *stable_node,
+static void stable_tree_append(struct ksm_rmap_item *rmap_item,
+ struct ksm_stable_node *stable_node,
bool max_page_sharing_bypass)
{
/*
@@ -2022,6 +2086,8 @@ static void stable_tree_append(struct rmap_item *rmap_item,
ksm_pages_sharing++;
else
ksm_pages_shared++;
+
+ rmap_item->mm->ksm_merging_pages++;
}
/*
@@ -2033,12 +2099,12 @@ static void stable_tree_append(struct rmap_item *rmap_item,
* @page: the page that we are searching identical page to.
* @rmap_item: the reverse mapping into the virtual address of this page
*/
-static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
+static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
- struct rmap_item *tree_rmap_item;
+ struct ksm_rmap_item *tree_rmap_item;
struct page *tree_page = NULL;
- struct stable_node *stable_node;
+ struct ksm_stable_node *stable_node;
struct page *kpage;
unsigned int checksum;
int err;
@@ -2116,6 +2182,9 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
if (vma) {
err = try_to_merge_one_page(vma, page,
ZERO_PAGE(rmap_item->address));
+ trace_ksm_merge_one_page(
+ page_to_pfn(ZERO_PAGE(rmap_item->address)),
+ rmap_item, mm, err);
} else {
/*
* If the vma is out of date, we do not need to
@@ -2194,11 +2263,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
}
}
-static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
- struct rmap_item **rmap_list,
+static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot,
+ struct ksm_rmap_item **rmap_list,
unsigned long addr)
{
- struct rmap_item *rmap_item;
+ struct ksm_rmap_item *rmap_item;
while (*rmap_list) {
rmap_item = *rmap_list;
@@ -2214,7 +2283,8 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
rmap_item = alloc_rmap_item();
if (rmap_item) {
/* It has already been zeroed */
- rmap_item->mm = mm_slot->mm;
+ rmap_item->mm = mm_slot->slot.mm;
+ rmap_item->mm->ksm_rmap_items++;
rmap_item->address = addr;
rmap_item->rmap_list = *rmap_list;
*rmap_list = rmap_item;
@@ -2222,22 +2292,26 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
return rmap_item;
}
-static struct rmap_item *scan_get_next_rmap_item(struct page **page)
+static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
{
struct mm_struct *mm;
+ struct ksm_mm_slot *mm_slot;
struct mm_slot *slot;
struct vm_area_struct *vma;
- struct rmap_item *rmap_item;
+ struct ksm_rmap_item *rmap_item;
+ struct vma_iterator vmi;
int nid;
- if (list_empty(&ksm_mm_head.mm_list))
+ if (list_empty(&ksm_mm_head.slot.mm_node))
return NULL;
- slot = ksm_scan.mm_slot;
- if (slot == &ksm_mm_head) {
+ mm_slot = ksm_scan.mm_slot;
+ if (mm_slot == &ksm_mm_head) {
+ trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items);
+
/*
- * A number of pages can hang around indefinitely on per-cpu
- * pagevecs, raised page count preventing write_protect_page
+ * A number of pages can hang around indefinitely in per-cpu
+ * LRU cache, raised page count preventing write_protect_page
* from merging them. Though it doesn't really matter much,
* it is puzzling to see some stuck in pages_volatile until
* other activity jostles them out, and they also prevented
@@ -2254,7 +2328,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
* so prune them once before each full scan.
*/
if (!ksm_merge_across_nodes) {
- struct stable_node *stable_node, *next;
+ struct ksm_stable_node *stable_node, *next;
struct page *page;
list_for_each_entry_safe(stable_node, next,
@@ -2271,28 +2345,31 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
root_unstable_tree[nid] = RB_ROOT;
spin_lock(&ksm_mmlist_lock);
- slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
- ksm_scan.mm_slot = slot;
+ slot = list_entry(mm_slot->slot.mm_node.next,
+ struct mm_slot, mm_node);
+ mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
+ ksm_scan.mm_slot = mm_slot;
spin_unlock(&ksm_mmlist_lock);
/*
* Although we tested list_empty() above, a racing __ksm_exit
* of the last mm on the list may have removed it since then.
*/
- if (slot == &ksm_mm_head)
+ if (mm_slot == &ksm_mm_head)
return NULL;
next_mm:
ksm_scan.address = 0;
- ksm_scan.rmap_list = &slot->rmap_list;
+ ksm_scan.rmap_list = &mm_slot->rmap_list;
}
+ slot = &mm_slot->slot;
mm = slot->mm;
+ vma_iter_init(&vmi, mm, ksm_scan.address);
+
mmap_read_lock(mm);
if (ksm_test_exit(mm))
- vma = NULL;
- else
- vma = find_vma(mm, ksm_scan.address);
+ goto no_vmas;
- for (; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!(vma->vm_flags & VM_MERGEABLE))
continue;
if (ksm_scan.address < vma->vm_start)
@@ -2309,10 +2386,12 @@ next_mm:
cond_resched();
continue;
}
+ if (is_zone_device_page(*page))
+ goto next_page;
if (PageAnon(*page)) {
flush_anon_page(vma, *page, ksm_scan.address);
flush_dcache_page(*page);
- rmap_item = get_next_rmap_item(slot,
+ rmap_item = get_next_rmap_item(mm_slot,
ksm_scan.rmap_list, ksm_scan.address);
if (rmap_item) {
ksm_scan.rmap_list =
@@ -2323,6 +2402,7 @@ next_mm:
mmap_read_unlock(mm);
return rmap_item;
}
+next_page:
put_page(*page);
ksm_scan.address += PAGE_SIZE;
cond_resched();
@@ -2330,18 +2410,20 @@ next_mm:
}
if (ksm_test_exit(mm)) {
+no_vmas:
ksm_scan.address = 0;
- ksm_scan.rmap_list = &slot->rmap_list;
+ ksm_scan.rmap_list = &mm_slot->rmap_list;
}
/*
* Nuke all the rmap_items that are above this current rmap:
* because there were no VM_MERGEABLE vmas with such addresses.
*/
- remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
+ remove_trailing_rmap_items(ksm_scan.rmap_list);
spin_lock(&ksm_mmlist_lock);
- ksm_scan.mm_slot = list_entry(slot->mm_list.next,
- struct mm_slot, mm_list);
+ slot = list_entry(mm_slot->slot.mm_node.next,
+ struct mm_slot, mm_node);
+ ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
if (ksm_scan.address == 0) {
/*
* We've completed a full scan of all vmas, holding mmap_lock
@@ -2352,12 +2434,13 @@ next_mm:
* or when all VM_MERGEABLE areas have been unmapped (and
* mmap_lock then protects against race with MADV_MERGEABLE).
*/
- hash_del(&slot->link);
- list_del(&slot->mm_list);
+ hash_del(&mm_slot->slot.hash);
+ list_del(&mm_slot->slot.mm_node);
spin_unlock(&ksm_mmlist_lock);
- free_mm_slot(slot);
+ mm_slot_free(mm_slot_cache, mm_slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+ clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
mmap_read_unlock(mm);
mmdrop(mm);
} else {
@@ -2373,10 +2456,11 @@ next_mm:
}
/* Repeat until we've completed scanning the whole list */
- slot = ksm_scan.mm_slot;
- if (slot != &ksm_mm_head)
+ mm_slot = ksm_scan.mm_slot;
+ if (mm_slot != &ksm_mm_head)
goto next_mm;
+ trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items);
ksm_scan.seqnr++;
return NULL;
}
@@ -2387,7 +2471,7 @@ next_mm:
*/
static void ksm_do_scan(unsigned int scan_npages)
{
- struct rmap_item *rmap_item;
+ struct ksm_rmap_item *rmap_item;
struct page *page;
while (scan_npages-- && likely(!freezing(current))) {
@@ -2402,7 +2486,7 @@ static void ksm_do_scan(unsigned int scan_npages)
static int ksmd_should_run(void)
{
- return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
+ return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node);
}
static int ksm_scan_thread(void *nothing)
@@ -2434,6 +2518,136 @@ static int ksm_scan_thread(void *nothing)
return 0;
}
+static void __ksm_add_vma(struct vm_area_struct *vma)
+{
+ unsigned long vm_flags = vma->vm_flags;
+
+ if (vm_flags & VM_MERGEABLE)
+ return;
+
+ if (vma_ksm_compatible(vma))
+ vm_flags_set(vma, VM_MERGEABLE);
+}
+
+static int __ksm_del_vma(struct vm_area_struct *vma)
+{
+ int err;
+
+ if (!(vma->vm_flags & VM_MERGEABLE))
+ return 0;
+
+ if (vma->anon_vma) {
+ err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true);
+ if (err)
+ return err;
+ }
+
+ vm_flags_clear(vma, VM_MERGEABLE);
+ return 0;
+}
+/**
+ * ksm_add_vma - Mark vma as mergeable if compatible
+ *
+ * @vma: Pointer to vma
+ */
+void ksm_add_vma(struct vm_area_struct *vma)
+{
+ struct mm_struct *mm = vma->vm_mm;
+
+ if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ __ksm_add_vma(vma);
+}
+
+static void ksm_add_vmas(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ VMA_ITERATOR(vmi, mm, 0);
+ for_each_vma(vmi, vma)
+ __ksm_add_vma(vma);
+}
+
+static int ksm_del_vmas(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ int err;
+
+ VMA_ITERATOR(vmi, mm, 0);
+ for_each_vma(vmi, vma) {
+ err = __ksm_del_vma(vma);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+/**
+ * ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all
+ * compatible VMAs
+ *
+ * @mm: Pointer to mm
+ *
+ * Returns 0 on success, otherwise error code
+ */
+int ksm_enable_merge_any(struct mm_struct *mm)
+{
+ int err;
+
+ if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ return 0;
+
+ if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+ err = __ksm_enter(mm);
+ if (err)
+ return err;
+ }
+
+ set_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ ksm_add_vmas(mm);
+
+ return 0;
+}
+
+/**
+ * ksm_disable_merge_any - Disable merging on all compatible VMAs of the mm,
+ * previously enabled via ksm_enable_merge_any().
+ *
+ * Disabling merging implies unmerging any merged pages, like setting
+ * MADV_UNMERGEABLE would. If unmerging fails, the whole operation fails and
+ * merging on all compatible VMAs remains enabled.
+ *
+ * @mm: Pointer to mm
+ *
+ * Returns 0 on success, otherwise error code
+ */
+int ksm_disable_merge_any(struct mm_struct *mm)
+{
+ int err;
+
+ if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ return 0;
+
+ err = ksm_del_vmas(mm);
+ if (err) {
+ ksm_add_vmas(mm);
+ return err;
+ }
+
+ clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ return 0;
+}
+
+int ksm_disable(struct mm_struct *mm)
+{
+ mmap_assert_write_locked(mm);
+
+ if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
+ return 0;
+ if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ return ksm_disable_merge_any(mm);
+ return ksm_del_vmas(mm);
+}
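
The MMF_VM_MERGE_ANY handling above is meant to be driven per process from
userspace. A hedged sketch using the prctl() pair that accompanies this feature;
the PR_SET_MEMORY_MERGE/PR_GET_MEMORY_MERGE names and the fallback values 67/68
are taken as an assumption from the uapi headers, and the call may require
CAP_SYS_RESOURCE:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_MEMORY_MERGE
#define PR_SET_MEMORY_MERGE 67  /* assumed fallback, see include/uapi/linux/prctl.h */
#endif
#ifndef PR_GET_MEMORY_MERGE
#define PR_GET_MEMORY_MERGE 68  /* assumed fallback */
#endif

int main(void)
{
    /* Opt the whole process in: ends up in ksm_enable_merge_any(). */
    if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0))
        perror("PR_SET_MEMORY_MERGE");

    printf("merge-any enabled: %d\n",
           (int)prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0));

    /* Opting out goes through ksm_disable_merge_any(), which unmerges
     * any already-merged pages before clearing the flag. */
    if (prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0))
        perror("clear PR_SET_MEMORY_MERGE");
    return 0;
}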
+
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
unsigned long end, int advice, unsigned long *vm_flags)
{
@@ -2442,25 +2656,10 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
switch (advice) {
case MADV_MERGEABLE:
- /*
- * Be somewhat over-protective for now!
- */
- if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
- VM_PFNMAP | VM_IO | VM_DONTEXPAND |
- VM_HUGETLB | VM_MIXEDMAP))
- return 0; /* just ignore the advice */
-
- if (vma_is_dax(vma))
+ if (vma->vm_flags & VM_MERGEABLE)
return 0;
-
-#ifdef VM_SAO
- if (*vm_flags & VM_SAO)
+ if (!vma_ksm_compatible(vma))
return 0;
-#endif
-#ifdef VM_SPARC_ADI
- if (*vm_flags & VM_SPARC_ADI)
- return 0;
-#endif
if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
err = __ksm_enter(mm);
@@ -2476,7 +2675,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
return 0; /* just ignore the advice */
if (vma->anon_vma) {
- err = unmerge_ksm_pages(vma, start, end);
+ err = unmerge_ksm_pages(vma, start, end, true);
if (err)
return err;
}
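
For the classic per-VMA interface handled just above, a hedged end-to-end sketch;
it assumes CONFIG_KSM=y and that ksmd is running (/sys/kernel/mm/ksm/run set to 1),
and the sleep length is arbitrary. It maps identical pages, advises them mergeable,
dirties one page (which unshares a merged page via the fault path), and finally
forces the unmerge_ksm_pages()/break_ksm() path with MADV_UNMERGEABLE:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 64 * 4096;
    char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED)
        return 1;

    memset(buf, 0x5a, len);                  /* many identical, non-zero pages */

    if (madvise(buf, len, MADV_MERGEABLE))   /* -> ksm_madvise() above */
        perror("MADV_MERGEABLE");

    sleep(5);                                /* give ksmd a few scan passes */

    buf[0] = 1;                              /* write fault unshares one merged page */

    if (madvise(buf, len, MADV_UNMERGEABLE)) /* unmerge whatever is still shared */
        perror("MADV_UNMERGEABLE");

    munmap(buf, len);
    return 0;
}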
@@ -2491,18 +2690,21 @@ EXPORT_SYMBOL_GPL(ksm_madvise);
int __ksm_enter(struct mm_struct *mm)
{
- struct mm_slot *mm_slot;
+ struct ksm_mm_slot *mm_slot;
+ struct mm_slot *slot;
int needs_wakeup;
- mm_slot = alloc_mm_slot();
+ mm_slot = mm_slot_alloc(mm_slot_cache);
if (!mm_slot)
return -ENOMEM;
+ slot = &mm_slot->slot;
+
/* Check ksm_run too? Would need tighter locking */
- needs_wakeup = list_empty(&ksm_mm_head.mm_list);
+ needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node);
spin_lock(&ksm_mmlist_lock);
- insert_to_mm_slots_hash(mm, mm_slot);
+ mm_slot_insert(mm_slots_hash, mm, slot);
/*
* When KSM_RUN_MERGE (or KSM_RUN_STOP),
* insert just behind the scanning cursor, to let the area settle
@@ -2514,9 +2716,9 @@ int __ksm_enter(struct mm_struct *mm)
* missed: then we might as well insert at the end of the list.
*/
if (ksm_run & KSM_RUN_UNMERGE)
- list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
+ list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node);
else
- list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
+ list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node);
spin_unlock(&ksm_mmlist_lock);
set_bit(MMF_VM_MERGEABLE, &mm->flags);
@@ -2525,12 +2727,14 @@ int __ksm_enter(struct mm_struct *mm)
if (needs_wakeup)
wake_up_interruptible(&ksm_thread_wait);
+ trace_ksm_enter(mm);
return 0;
}
void __ksm_exit(struct mm_struct *mm)
{
- struct mm_slot *mm_slot;
+ struct ksm_mm_slot *mm_slot;
+ struct mm_slot *slot;
int easy_to_free = 0;
/*
@@ -2543,33 +2747,38 @@ void __ksm_exit(struct mm_struct *mm)
*/
spin_lock(&ksm_mmlist_lock);
- mm_slot = get_mm_slot(mm);
+ slot = mm_slot_lookup(mm_slots_hash, mm);
+ mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
if (mm_slot && ksm_scan.mm_slot != mm_slot) {
if (!mm_slot->rmap_list) {
- hash_del(&mm_slot->link);
- list_del(&mm_slot->mm_list);
+ hash_del(&slot->hash);
+ list_del(&slot->mm_node);
easy_to_free = 1;
} else {
- list_move(&mm_slot->mm_list,
- &ksm_scan.mm_slot->mm_list);
+ list_move(&slot->mm_node,
+ &ksm_scan.mm_slot->slot.mm_node);
}
}
spin_unlock(&ksm_mmlist_lock);
if (easy_to_free) {
- free_mm_slot(mm_slot);
+ mm_slot_free(mm_slot_cache, mm_slot);
+ clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
mmdrop(mm);
} else if (mm_slot) {
mmap_write_lock(mm);
mmap_write_unlock(mm);
}
+
+ trace_ksm_exit(mm);
}
struct page *ksm_might_need_to_copy(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
- struct anon_vma *anon_vma = page_anon_vma(page);
+ struct folio *folio = page_folio(page);
+ struct anon_vma *anon_vma = folio_anon_vma(folio);
struct page *new_page;
if (PageKsm(page)) {
@@ -2578,44 +2787,53 @@ struct page *ksm_might_need_to_copy(struct page *page,
return page; /* no need to copy it */
} else if (!anon_vma) {
return page; /* no need to copy it */
- } else if (anon_vma->root == vma->anon_vma->root &&
- page->index == linear_page_index(vma, address)) {
+ } else if (page->index == linear_page_index(vma, address) &&
+ anon_vma->root == vma->anon_vma->root) {
return page; /* still no need to copy it */
}
+ if (PageHWPoison(page))
+ return ERR_PTR(-EHWPOISON);
if (!PageUptodate(page))
return page; /* let do_swap_page report the error */
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
- if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) {
+ if (new_page &&
+ mem_cgroup_charge(page_folio(new_page), vma->vm_mm, GFP_KERNEL)) {
put_page(new_page);
new_page = NULL;
}
if (new_page) {
- copy_user_highpage(new_page, page, address, vma);
-
+ if (copy_mc_user_highpage(new_page, page, address, vma)) {
+ put_page(new_page);
+ memory_failure_queue(page_to_pfn(page), 0);
+ return ERR_PTR(-EHWPOISON);
+ }
SetPageDirty(new_page);
__SetPageUptodate(new_page);
__SetPageLocked(new_page);
+#ifdef CONFIG_SWAP
+ count_vm_event(KSM_SWPIN_COPY);
+#endif
}
return new_page;
}
-void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
+void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
{
- struct stable_node *stable_node;
- struct rmap_item *rmap_item;
+ struct ksm_stable_node *stable_node;
+ struct ksm_rmap_item *rmap_item;
int search_new_forks = 0;
- VM_BUG_ON_PAGE(!PageKsm(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio);
/*
* Rely on the page lock to protect against concurrent modifications
* to that page's node of the stable tree.
*/
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- stable_node = page_stable_node(page);
+ stable_node = folio_stable_node(folio);
if (!stable_node)
return;
again:
@@ -2625,7 +2843,13 @@ again:
struct vm_area_struct *vma;
cond_resched();
- anon_vma_lock_read(anon_vma);
+ if (!anon_vma_trylock_read(anon_vma)) {
+ if (rwc->try_lock) {
+ rwc->contended = true;
+ return;
+ }
+ anon_vma_lock_read(anon_vma);
+ }
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
unsigned long addr;
@@ -2634,7 +2858,7 @@ again:
vma = vmac->vma;
/* Ignore the stable/unstable/sqnr flags */
- addr = rmap_item->address & ~KSM_FLAG_MASK;
+ addr = rmap_item->address & PAGE_MASK;
if (addr < vma->vm_start || addr >= vma->vm_end)
continue;
@@ -2650,11 +2874,11 @@ again:
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
- if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
+ if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) {
anon_vma_unlock_read(anon_vma);
return;
}
- if (rwc->done && rwc->done(page)) {
+ if (rwc->done && rwc->done(folio)) {
anon_vma_unlock_read(anon_vma);
return;
}
@@ -2665,27 +2889,72 @@ again:
goto again;
}
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Collect processes when the error hits a ksm page.
+ */
+void collect_procs_ksm(struct page *page, struct list_head *to_kill,
+ int force_early)
+{
+ struct ksm_stable_node *stable_node;
+ struct ksm_rmap_item *rmap_item;
+ struct folio *folio = page_folio(page);
+ struct vm_area_struct *vma;
+ struct task_struct *tsk;
+
+ stable_node = folio_stable_node(folio);
+ if (!stable_node)
+ return;
+ hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
+ struct anon_vma *av = rmap_item->anon_vma;
+
+ anon_vma_lock_read(av);
+ read_lock(&tasklist_lock);
+ for_each_process(tsk) {
+ struct anon_vma_chain *vmac;
+ unsigned long addr;
+ struct task_struct *t =
+ task_early_kill(tsk, force_early);
+ if (!t)
+ continue;
+ anon_vma_interval_tree_foreach(vmac, &av->rb_root, 0,
+ ULONG_MAX)
+ {
+ vma = vmac->vma;
+ if (vma->vm_mm == t->mm) {
+ addr = rmap_item->address & PAGE_MASK;
+ add_to_kill_ksm(t, page, vma, to_kill,
+ addr);
+ }
+ }
+ }
+ read_unlock(&tasklist_lock);
+ anon_vma_unlock_read(av);
+ }
+}
+#endif
+
#ifdef CONFIG_MIGRATION
-void ksm_migrate_page(struct page *newpage, struct page *oldpage)
+void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
{
- struct stable_node *stable_node;
+ struct ksm_stable_node *stable_node;
- VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
- VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
- VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio);
+ VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio);
- stable_node = page_stable_node(newpage);
+ stable_node = folio_stable_node(folio);
if (stable_node) {
- VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
- stable_node->kpfn = page_to_pfn(newpage);
+ VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio);
+ stable_node->kpfn = folio_pfn(newfolio);
/*
- * newpage->mapping was set in advance; now we need smp_wmb()
+ * newfolio->mapping was set in advance; now we need smp_wmb()
* to make sure that the new stable_node->kpfn is visible
- * to get_ksm_page() before it can see that oldpage->mapping
- * has gone stale (or that PageSwapCache has been cleared).
+ * to get_ksm_page() before it can see that folio->mapping
+ * has gone stale (or that folio_test_swapcache has been cleared).
*/
smp_wmb();
- set_page_stable_node(oldpage, NULL);
+ set_page_stable_node(&folio->page, NULL);
}
}
#endif /* CONFIG_MIGRATION */
@@ -2701,7 +2970,7 @@ static void wait_while_offlining(void)
}
}
-static bool stable_node_dup_remove_range(struct stable_node *stable_node,
+static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node,
unsigned long start_pfn,
unsigned long end_pfn)
{
@@ -2717,12 +2986,12 @@ static bool stable_node_dup_remove_range(struct stable_node *stable_node,
return false;
}
-static bool stable_node_chain_remove_range(struct stable_node *stable_node,
+static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node,
unsigned long start_pfn,
unsigned long end_pfn,
struct rb_root *root)
{
- struct stable_node *dup;
+ struct ksm_stable_node *dup;
struct hlist_node *hlist_safe;
if (!is_stable_node_chain(stable_node)) {
@@ -2746,14 +3015,14 @@ static bool stable_node_chain_remove_range(struct stable_node *stable_node,
static void ksm_check_stable_tree(unsigned long start_pfn,
unsigned long end_pfn)
{
- struct stable_node *stable_node, *next;
+ struct ksm_stable_node *stable_node, *next;
struct rb_node *node;
int nid;
for (nid = 0; nid < ksm_nr_node_ids; nid++) {
node = rb_first(root_stable_tree + nid);
while (node) {
- stable_node = rb_entry(node, struct stable_node, node);
+ stable_node = rb_entry(node, struct ksm_stable_node, node);
if (stable_node_chain_remove_range(stable_node,
start_pfn, end_pfn,
root_stable_tree +
@@ -2819,6 +3088,14 @@ static void wait_while_offlining(void)
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
+#ifdef CONFIG_PROC_FS
+long ksm_process_profit(struct mm_struct *mm)
+{
+ return mm->ksm_merging_pages * PAGE_SIZE -
+ mm->ksm_rmap_items * sizeof(struct ksm_rmap_item);
+}
+#endif /* CONFIG_PROC_FS */
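
ksm_process_profit() above (and general_profit further down) is plain arithmetic:
bytes saved by merged pages minus the metadata spent on rmap items. The userspace
sketch below mirrors the same formula with made-up counters; RMAP_ITEM_SIZE is
only a rough guess, since sizeof(struct ksm_rmap_item) is not visible outside the
kernel, and real values would come from the per-mm counters this series adds:

#include <stdio.h>

#define PAGE_SIZE_BYTES 4096L
#define RMAP_ITEM_SIZE  64L   /* assumption, not the exact kernel value */

/* profit = merged pages * PAGE_SIZE - rmap items * sizeof(rmap_item) */
static long ksm_profit(long merging_pages, long rmap_items)
{
    return merging_pages * PAGE_SIZE_BYTES - rmap_items * RMAP_ITEM_SIZE;
}

int main(void)
{
    long merging_pages = 10000;   /* hypothetical ksm_merging_pages */
    long rmap_items = 25000;      /* hypothetical ksm_rmap_items */

    printf("estimated profit: %ld bytes\n",
           ksm_profit(merging_pages, rmap_items));
    return 0;
}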
+
#ifdef CONFIG_SYSFS
/*
* This all compiles without CONFIG_SYSFS, but is a waste of space.
@@ -2827,24 +3104,23 @@ static void wait_while_offlining(void)
#define KSM_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KSM_ATTR(_name) \
- static struct kobj_attribute _name##_attr = \
- __ATTR(_name, 0644, _name##_show, _name##_store)
+ static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
static ssize_t sleep_millisecs_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
+ return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs);
}
static ssize_t sleep_millisecs_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- unsigned long msecs;
+ unsigned int msecs;
int err;
- err = kstrtoul(buf, 10, &msecs);
- if (err || msecs > UINT_MAX)
+ err = kstrtouint(buf, 10, &msecs);
+ if (err)
return -EINVAL;
ksm_thread_sleep_millisecs = msecs;
@@ -2857,18 +3133,18 @@ KSM_ATTR(sleep_millisecs);
static ssize_t pages_to_scan_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
+ return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan);
}
static ssize_t pages_to_scan_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
+ unsigned int nr_pages;
int err;
- unsigned long nr_pages;
- err = kstrtoul(buf, 10, &nr_pages);
- if (err || nr_pages > UINT_MAX)
+ err = kstrtouint(buf, 10, &nr_pages);
+ if (err)
return -EINVAL;
ksm_thread_pages_to_scan = nr_pages;
@@ -2880,17 +3156,17 @@ KSM_ATTR(pages_to_scan);
static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%lu\n", ksm_run);
+ return sysfs_emit(buf, "%lu\n", ksm_run);
}
static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
+ unsigned int flags;
int err;
- unsigned long flags;
- err = kstrtoul(buf, 10, &flags);
- if (err || flags > UINT_MAX)
+ err = kstrtouint(buf, 10, &flags);
+ if (err)
return -EINVAL;
if (flags > KSM_RUN_UNMERGE)
return -EINVAL;
@@ -2927,9 +3203,9 @@ KSM_ATTR(run);
#ifdef CONFIG_NUMA
static ssize_t merge_across_nodes_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+ struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%u\n", ksm_merge_across_nodes);
+ return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes);
}
static ssize_t merge_across_nodes_store(struct kobject *kobj,
@@ -2984,9 +3260,9 @@ KSM_ATTR(merge_across_nodes);
#endif
static ssize_t use_zero_pages_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+ struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%u\n", ksm_use_zero_pages);
+ return sysfs_emit(buf, "%u\n", ksm_use_zero_pages);
}
static ssize_t use_zero_pages_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -3008,7 +3284,7 @@ KSM_ATTR(use_zero_pages);
static ssize_t max_page_sharing_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%u\n", ksm_max_page_sharing);
+ return sysfs_emit(buf, "%u\n", ksm_max_page_sharing);
}
static ssize_t max_page_sharing_store(struct kobject *kobj,
@@ -3049,21 +3325,21 @@ KSM_ATTR(max_page_sharing);
static ssize_t pages_shared_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lu\n", ksm_pages_shared);
+ return sysfs_emit(buf, "%lu\n", ksm_pages_shared);
}
KSM_ATTR_RO(pages_shared);
static ssize_t pages_sharing_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lu\n", ksm_pages_sharing);
+ return sysfs_emit(buf, "%lu\n", ksm_pages_sharing);
}
KSM_ATTR_RO(pages_sharing);
static ssize_t pages_unshared_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lu\n", ksm_pages_unshared);
+ return sysfs_emit(buf, "%lu\n", ksm_pages_unshared);
}
KSM_ATTR_RO(pages_unshared);
@@ -3080,21 +3356,33 @@ static ssize_t pages_volatile_show(struct kobject *kobj,
*/
if (ksm_pages_volatile < 0)
ksm_pages_volatile = 0;
- return sprintf(buf, "%ld\n", ksm_pages_volatile);
+ return sysfs_emit(buf, "%ld\n", ksm_pages_volatile);
}
KSM_ATTR_RO(pages_volatile);
+static ssize_t general_profit_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ long general_profit;
+
+ general_profit = ksm_pages_sharing * PAGE_SIZE -
+ ksm_rmap_items * sizeof(struct ksm_rmap_item);
+
+ return sysfs_emit(buf, "%ld\n", general_profit);
+}
+KSM_ATTR_RO(general_profit);
+
static ssize_t stable_node_dups_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lu\n", ksm_stable_node_dups);
+ return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups);
}
KSM_ATTR_RO(stable_node_dups);
static ssize_t stable_node_chains_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lu\n", ksm_stable_node_chains);
+ return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains);
}
KSM_ATTR_RO(stable_node_chains);
@@ -3103,7 +3391,7 @@ stable_node_chains_prune_millisecs_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
+ return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
}
static ssize_t
@@ -3111,11 +3399,11 @@ stable_node_chains_prune_millisecs_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- unsigned long msecs;
+ unsigned int msecs;
int err;
- err = kstrtoul(buf, 10, &msecs);
- if (err || msecs > UINT_MAX)
+ err = kstrtouint(buf, 10, &msecs);
+ if (err)
return -EINVAL;
ksm_stable_node_chains_prune_millisecs = msecs;
@@ -3127,7 +3415,7 @@ KSM_ATTR(stable_node_chains_prune_millisecs);
static ssize_t full_scans_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lu\n", ksm_scan.seqnr);
+ return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr);
}
KSM_ATTR_RO(full_scans);
@@ -3148,6 +3436,7 @@ static struct attribute *ksm_attrs[] = {
&stable_node_dups_attr.attr,
&stable_node_chains_prune_millisecs_attr.attr,
&use_zero_pages_attr.attr,
+ &general_profit_attr.attr,
NULL,
};
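
Every attribute in the table above is a file under /sys/kernel/mm/ksm/. A small
read-only helper for a few of them; general_profit only exists on kernels that
carry this patch, and error handling is deliberately minimal:

#include <stdio.h>

/* Read one KSM sysfs value; returns -1 if the file is missing or unreadable. */
static long read_ksm_attr(const char *name)
{
    char path[128];
    long val = -1;
    FILE *f;

    snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
    f = fopen(path, "r");
    if (!f)
        return -1;
    if (fscanf(f, "%ld", &val) != 1)
        val = -1;
    fclose(f);
    return val;
}

int main(void)
{
    printf("run            = %ld\n", read_ksm_attr("run"));
    printf("pages_shared   = %ld\n", read_ksm_attr("pages_shared"));
    printf("pages_sharing  = %ld\n", read_ksm_attr("pages_sharing"));
    printf("general_profit = %ld\n", read_ksm_attr("general_profit"));
    return 0;
}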
@@ -3192,7 +3481,7 @@ static int __init ksm_init(void)
#ifdef CONFIG_MEMORY_HOTREMOVE
/* There is no significance to this priority 100 */
- hotplug_memory_notifier(ksm_memory_callback, 100);
+ hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI);
#endif
return 0;
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 5aa6e44bc2ae..a05e5bef3b40 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -13,20 +13,32 @@
#include <linux/mutex.h>
#include <linux/memcontrol.h>
#include "slab.h"
+#include "internal.h"
#ifdef CONFIG_MEMCG_KMEM
-static LIST_HEAD(list_lrus);
+static LIST_HEAD(memcg_list_lrus);
static DEFINE_MUTEX(list_lrus_mutex);
+static inline bool list_lru_memcg_aware(struct list_lru *lru)
+{
+ return lru->memcg_aware;
+}
+
static void list_lru_register(struct list_lru *lru)
{
+ if (!list_lru_memcg_aware(lru))
+ return;
+
mutex_lock(&list_lrus_mutex);
- list_add(&lru->list, &list_lrus);
+ list_add(&lru->list, &memcg_list_lrus);
mutex_unlock(&list_lrus_mutex);
}
static void list_lru_unregister(struct list_lru *lru)
{
+ if (!list_lru_memcg_aware(lru))
+ return;
+
mutex_lock(&list_lrus_mutex);
list_del(&lru->list);
mutex_unlock(&list_lrus_mutex);
@@ -37,41 +49,33 @@ static int lru_shrinker_id(struct list_lru *lru)
return lru->shrinker_id;
}
-static inline bool list_lru_memcg_aware(struct list_lru *lru)
-{
- return lru->memcg_aware;
-}
-
static inline struct list_lru_one *
-list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
+list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
{
- struct list_lru_memcg *memcg_lrus;
- /*
- * Either lock or RCU protects the array of per cgroup lists
- * from relocation (see memcg_update_list_lru_node).
- */
- memcg_lrus = rcu_dereference_check(nlru->memcg_lrus,
- lockdep_is_held(&nlru->lock));
- if (memcg_lrus && idx >= 0)
- return memcg_lrus->lru[idx];
- return &nlru->lru;
+ if (list_lru_memcg_aware(lru) && idx >= 0) {
+ struct list_lru_memcg *mlru = xa_load(&lru->xa, idx);
+
+ return mlru ? &mlru->node[nid] : NULL;
+ }
+ return &lru->node[nid].lru;
}
static inline struct list_lru_one *
-list_lru_from_kmem(struct list_lru_node *nlru, void *ptr,
+list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr,
struct mem_cgroup **memcg_ptr)
{
+ struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l = &nlru->lru;
struct mem_cgroup *memcg = NULL;
- if (!nlru->memcg_lrus)
+ if (!list_lru_memcg_aware(lru))
goto out;
- memcg = mem_cgroup_from_obj(ptr);
+ memcg = mem_cgroup_from_slab_obj(ptr);
if (!memcg)
goto out;
- l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
+ l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
out:
if (memcg_ptr)
*memcg_ptr = memcg;
@@ -97,18 +101,18 @@ static inline bool list_lru_memcg_aware(struct list_lru *lru)
}
static inline struct list_lru_one *
-list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
+list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
{
- return &nlru->lru;
+ return &lru->node[nid].lru;
}
static inline struct list_lru_one *
-list_lru_from_kmem(struct list_lru_node *nlru, void *ptr,
+list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr,
struct mem_cgroup **memcg_ptr)
{
if (memcg_ptr)
*memcg_ptr = NULL;
- return &nlru->lru;
+ return &lru->node[nid].lru;
}
#endif /* CONFIG_MEMCG_KMEM */
@@ -121,12 +125,12 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
spin_lock(&nlru->lock);
if (list_empty(item)) {
- l = list_lru_from_kmem(nlru, item, &memcg);
+ l = list_lru_from_kmem(lru, nid, item, &memcg);
list_add_tail(item, &l->list);
/* Set shrinker bit if the first element was added */
if (!l->nr_items++)
- memcg_set_shrinker_bit(memcg, nid,
- lru_shrinker_id(lru));
+ set_shrinker_bit(memcg, nid,
+ lru_shrinker_id(lru));
nlru->nr_items++;
spin_unlock(&nlru->lock);
return true;
@@ -144,7 +148,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
spin_lock(&nlru->lock);
if (!list_empty(item)) {
- l = list_lru_from_kmem(nlru, item, NULL);
+ l = list_lru_from_kmem(lru, nid, item, NULL);
list_del_init(item);
l->nr_items--;
nlru->nr_items--;
@@ -174,15 +178,17 @@ EXPORT_SYMBOL_GPL(list_lru_isolate_move);
unsigned long list_lru_count_one(struct list_lru *lru,
int nid, struct mem_cgroup *memcg)
{
- struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
- unsigned long count;
+ long count;
rcu_read_lock();
- l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
- count = READ_ONCE(l->nr_items);
+ l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
+ count = l ? READ_ONCE(l->nr_items) : 0;
rcu_read_unlock();
+ if (unlikely(count < 0))
+ count = 0;
+
return count;
}
EXPORT_SYMBOL_GPL(list_lru_count_one);
@@ -197,17 +203,20 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid)
EXPORT_SYMBOL_GPL(list_lru_count_node);
static unsigned long
-__list_lru_walk_one(struct list_lru_node *nlru, int memcg_idx,
+__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk)
{
-
+ struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
struct list_head *item, *n;
unsigned long isolated = 0;
- l = list_lru_from_memcg_idx(nlru, memcg_idx);
restart:
+ l = list_lru_from_memcg_idx(lru, nid, memcg_idx);
+ if (!l)
+ goto out;
+
list_for_each_safe(item, n, &l->list) {
enum lru_status ret;
@@ -251,6 +260,7 @@ restart:
BUG();
}
}
+out:
return isolated;
}
@@ -263,8 +273,8 @@ list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
unsigned long ret;
spin_lock(&nlru->lock);
- ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg,
- nr_to_walk);
+ ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate,
+ cb_arg, nr_to_walk);
spin_unlock(&nlru->lock);
return ret;
}
@@ -279,8 +289,8 @@ list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
unsigned long ret;
spin_lock_irq(&nlru->lock);
- ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg,
- nr_to_walk);
+ ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate,
+ cb_arg, nr_to_walk);
spin_unlock_irq(&nlru->lock);
return ret;
}
@@ -290,16 +300,20 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
unsigned long *nr_to_walk)
{
long isolated = 0;
- int memcg_idx;
isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
nr_to_walk);
+
+#ifdef CONFIG_MEMCG_KMEM
if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
- for_each_memcg_cache_index(memcg_idx) {
+ struct list_lru_memcg *mlru;
+ unsigned long index;
+
+ xa_for_each(&lru->xa, index, mlru) {
struct list_lru_node *nlru = &lru->node[nid];
spin_lock(&nlru->lock);
- isolated += __list_lru_walk_one(nlru, memcg_idx,
+ isolated += __list_lru_walk_one(lru, nid, index,
isolate, cb_arg,
nr_to_walk);
spin_unlock(&nlru->lock);
@@ -308,6 +322,8 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
break;
}
}
+#endif
+
return isolated;
}
EXPORT_SYMBOL_GPL(list_lru_walk_node);
@@ -319,267 +335,220 @@ static void init_one_lru(struct list_lru_one *l)
}
#ifdef CONFIG_MEMCG_KMEM
-static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
- int begin, int end)
-{
- int i;
-
- for (i = begin; i < end; i++)
- kfree(memcg_lrus->lru[i]);
-}
-
-static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
- int begin, int end)
+static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp)
{
- int i;
+ int nid;
+ struct list_lru_memcg *mlru;
- for (i = begin; i < end; i++) {
- struct list_lru_one *l;
+ mlru = kmalloc(struct_size(mlru, node, nr_node_ids), gfp);
+ if (!mlru)
+ return NULL;
- l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL);
- if (!l)
- goto fail;
+ for_each_node(nid)
+ init_one_lru(&mlru->node[nid]);
- init_one_lru(l);
- memcg_lrus->lru[i] = l;
- }
- return 0;
-fail:
- __memcg_destroy_list_lru_node(memcg_lrus, begin, i);
- return -ENOMEM;
+ return mlru;
}
-static int memcg_init_list_lru_node(struct list_lru_node *nlru)
+static void memcg_list_lru_free(struct list_lru *lru, int src_idx)
{
- struct list_lru_memcg *memcg_lrus;
- int size = memcg_nr_cache_ids;
-
- memcg_lrus = kvmalloc(sizeof(*memcg_lrus) +
- size * sizeof(void *), GFP_KERNEL);
- if (!memcg_lrus)
- return -ENOMEM;
+ struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx);
- if (__memcg_init_list_lru_node(memcg_lrus, 0, size)) {
- kvfree(memcg_lrus);
- return -ENOMEM;
- }
- RCU_INIT_POINTER(nlru->memcg_lrus, memcg_lrus);
-
- return 0;
-}
-
-static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
-{
- struct list_lru_memcg *memcg_lrus;
/*
- * This is called when shrinker has already been unregistered,
- * and nobody can use it. So, there is no need to use kvfree_rcu_local().
+ * __list_lru_walk_one() can still walk this node's list, so we
+ * need kvfree_rcu() here: the walk runs under lru->node[nid]->lock,
+ * which serves as an RCU read-side critical section.
*/
- memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true);
- __memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids);
- kvfree(memcg_lrus);
+ if (mlru)
+ kvfree_rcu(mlru, rcu);
}
-static void kvfree_rcu_local(struct rcu_head *head)
+static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
- struct list_lru_memcg *mlru;
-
- mlru = container_of(head, struct list_lru_memcg, rcu);
- kvfree(mlru);
+ if (memcg_aware)
+ xa_init_flags(&lru->xa, XA_FLAGS_LOCK_IRQ);
+ lru->memcg_aware = memcg_aware;
}
-static int memcg_update_list_lru_node(struct list_lru_node *nlru,
- int old_size, int new_size)
+static void memcg_destroy_list_lru(struct list_lru *lru)
{
- struct list_lru_memcg *old, *new;
-
- BUG_ON(old_size > new_size);
+ XA_STATE(xas, &lru->xa, 0);
+ struct list_lru_memcg *mlru;
- old = rcu_dereference_protected(nlru->memcg_lrus,
- lockdep_is_held(&list_lrus_mutex));
- new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL);
- if (!new)
- return -ENOMEM;
+ if (!list_lru_memcg_aware(lru))
+ return;
- if (__memcg_init_list_lru_node(new, old_size, new_size)) {
- kvfree(new);
- return -ENOMEM;
+ xas_lock_irq(&xas);
+ xas_for_each(&xas, mlru, ULONG_MAX) {
+ kfree(mlru);
+ xas_store(&xas, NULL);
}
+ xas_unlock_irq(&xas);
+}
- memcpy(&new->lru, &old->lru, old_size * sizeof(void *));
+static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid,
+ int src_idx, struct mem_cgroup *dst_memcg)
+{
+ struct list_lru_node *nlru = &lru->node[nid];
+ int dst_idx = dst_memcg->kmemcg_id;
+ struct list_lru_one *src, *dst;
/*
- * The locking below allows readers that hold nlru->lock avoid taking
- * rcu_read_lock (see list_lru_from_memcg_idx).
- *
* Since list_lru_{add,del} may be called under an IRQ-safe lock,
* we have to use IRQ-safe primitives here to avoid deadlock.
*/
spin_lock_irq(&nlru->lock);
- rcu_assign_pointer(nlru->memcg_lrus, new);
- spin_unlock_irq(&nlru->lock);
-
- call_rcu(&old->rcu, kvfree_rcu_local);
- return 0;
-}
-static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru,
- int old_size, int new_size)
-{
- struct list_lru_memcg *memcg_lrus;
-
- memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus,
- lockdep_is_held(&list_lrus_mutex));
- /* do not bother shrinking the array back to the old size, because we
- * cannot handle allocation failures here */
- __memcg_destroy_list_lru_node(memcg_lrus, old_size, new_size);
-}
-
-static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
-{
- int i;
-
- lru->memcg_aware = memcg_aware;
+ src = list_lru_from_memcg_idx(lru, nid, src_idx);
+ if (!src)
+ goto out;
+ dst = list_lru_from_memcg_idx(lru, nid, dst_idx);
- if (!memcg_aware)
- return 0;
+ list_splice_init(&src->list, &dst->list);
- for_each_node(i) {
- if (memcg_init_list_lru_node(&lru->node[i]))
- goto fail;
- }
- return 0;
-fail:
- for (i = i - 1; i >= 0; i--) {
- if (!lru->node[i].memcg_lrus)
- continue;
- memcg_destroy_list_lru_node(&lru->node[i]);
+ if (src->nr_items) {
+ dst->nr_items += src->nr_items;
+ set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
+ src->nr_items = 0;
}
- return -ENOMEM;
+out:
+ spin_unlock_irq(&nlru->lock);
}
-static void memcg_destroy_list_lru(struct list_lru *lru)
+static void memcg_reparent_list_lru(struct list_lru *lru,
+ int src_idx, struct mem_cgroup *dst_memcg)
{
int i;
- if (!list_lru_memcg_aware(lru))
- return;
-
for_each_node(i)
- memcg_destroy_list_lru_node(&lru->node[i]);
-}
-
-static int memcg_update_list_lru(struct list_lru *lru,
- int old_size, int new_size)
-{
- int i;
-
- if (!list_lru_memcg_aware(lru))
- return 0;
-
- for_each_node(i) {
- if (memcg_update_list_lru_node(&lru->node[i],
- old_size, new_size))
- goto fail;
- }
- return 0;
-fail:
- for (i = i - 1; i >= 0; i--) {
- if (!lru->node[i].memcg_lrus)
- continue;
+ memcg_reparent_list_lru_node(lru, i, src_idx, dst_memcg);
- memcg_cancel_update_list_lru_node(&lru->node[i],
- old_size, new_size);
- }
- return -ENOMEM;
+ memcg_list_lru_free(lru, src_idx);
}
-static void memcg_cancel_update_list_lru(struct list_lru *lru,
- int old_size, int new_size)
+void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
- int i;
-
- if (!list_lru_memcg_aware(lru))
- return;
+ struct cgroup_subsys_state *css;
+ struct list_lru *lru;
+ int src_idx = memcg->kmemcg_id;
- for_each_node(i)
- memcg_cancel_update_list_lru_node(&lru->node[i],
- old_size, new_size);
-}
+ /*
+ * Change kmemcg_id of this cgroup and all its descendants to the
+ * parent's id, and then move all entries from this cgroup's list_lrus
+ * to the parent's list_lrus.
+ *
+ * After we have finished, all list_lrus corresponding to this cgroup
+ * are guaranteed to remain empty. So we can safely free this cgroup's
+ * list lrus in memcg_list_lru_free().
+ *
+ * Changing ->kmemcg_id to the parent's id also prevents
+ * memcg_list_lru_alloc() from allocating list_lrus for this cgroup
+ * after the memcg_list_lru_free() call.
+ */
+ rcu_read_lock();
+ css_for_each_descendant_pre(css, &memcg->css) {
+ struct mem_cgroup *child;
-int memcg_update_all_list_lrus(int new_size)
-{
- int ret = 0;
- struct list_lru *lru;
- int old_size = memcg_nr_cache_ids;
+ child = mem_cgroup_from_css(css);
+ WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id);
+ }
+ rcu_read_unlock();
mutex_lock(&list_lrus_mutex);
- list_for_each_entry(lru, &list_lrus, list) {
- ret = memcg_update_list_lru(lru, old_size, new_size);
- if (ret)
- goto fail;
- }
-out:
+ list_for_each_entry(lru, &memcg_list_lrus, list)
+ memcg_reparent_list_lru(lru, src_idx, parent);
mutex_unlock(&list_lrus_mutex);
- return ret;
-fail:
- list_for_each_entry_continue_reverse(lru, &list_lrus, list)
- memcg_cancel_update_list_lru(lru, old_size, new_size);
- goto out;
}
-static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
- int src_idx, struct mem_cgroup *dst_memcg)
+static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg,
+ struct list_lru *lru)
{
- struct list_lru_node *nlru = &lru->node[nid];
- int dst_idx = dst_memcg->kmemcg_id;
- struct list_lru_one *src, *dst;
- bool set;
+ int idx = memcg->kmemcg_id;
- /*
- * Since list_lru_{add,del} may be called under an IRQ-safe lock,
- * we have to use IRQ-safe primitives here to avoid deadlock.
- */
- spin_lock_irq(&nlru->lock);
-
- src = list_lru_from_memcg_idx(nlru, src_idx);
- dst = list_lru_from_memcg_idx(nlru, dst_idx);
-
- list_splice_init(&src->list, &dst->list);
- set = (!dst->nr_items && src->nr_items);
- dst->nr_items += src->nr_items;
- if (set)
- memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
- src->nr_items = 0;
-
- spin_unlock_irq(&nlru->lock);
+ return idx < 0 || xa_load(&lru->xa, idx);
}
-static void memcg_drain_list_lru(struct list_lru *lru,
- int src_idx, struct mem_cgroup *dst_memcg)
+int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
+ gfp_t gfp)
{
int i;
+ unsigned long flags;
+ struct list_lru_memcg_table {
+ struct list_lru_memcg *mlru;
+ struct mem_cgroup *memcg;
+ } *table;
+ XA_STATE(xas, &lru->xa, 0);
+
+ if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru))
+ return 0;
- if (!list_lru_memcg_aware(lru))
- return;
+ gfp &= GFP_RECLAIM_MASK;
+ table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp);
+ if (!table)
+ return -ENOMEM;
- for_each_node(i)
- memcg_drain_list_lru_node(lru, i, src_idx, dst_memcg);
-}
+ /*
+ * Because the list_lru can be reparented to the parent cgroup's
+ * list_lru, we should make sure that this cgroup and all its
+ * ancestors have allocated list_lru_memcg.
+ */
+ for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) {
+ if (memcg_list_lru_allocated(memcg, lru))
+ break;
-void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg)
-{
- struct list_lru *lru;
+ table[i].memcg = memcg;
+ table[i].mlru = memcg_init_list_lru_one(gfp);
+ if (!table[i].mlru) {
+ while (i--)
+ kfree(table[i].mlru);
+ kfree(table);
+ return -ENOMEM;
+ }
+ }
- mutex_lock(&list_lrus_mutex);
- list_for_each_entry(lru, &list_lrus, list)
- memcg_drain_list_lru(lru, src_idx, dst_memcg);
- mutex_unlock(&list_lrus_mutex);
+ xas_lock_irqsave(&xas, flags);
+ while (i--) {
+ int index = READ_ONCE(table[i].memcg->kmemcg_id);
+ struct list_lru_memcg *mlru = table[i].mlru;
+
+ xas_set(&xas, index);
+retry:
+ if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) {
+ kfree(mlru);
+ } else {
+ xas_store(&xas, mlru);
+ if (xas_error(&xas) == -ENOMEM) {
+ xas_unlock_irqrestore(&xas, flags);
+ if (xas_nomem(&xas, gfp))
+ xas_set_err(&xas, 0);
+ xas_lock_irqsave(&xas, flags);
+ /*
+ * The xas lock was dropped above, so this memcg may have
+ * been reparented in the meantime. Reload the memcg id;
+ * see the comments in memcg_reparent_list_lrus() for
+ * details.
+ */
+ index = READ_ONCE(table[i].memcg->kmemcg_id);
+ if (index < 0)
+ xas_set_err(&xas, 0);
+ else if (!xas_error(&xas) && index != xas.xa_index)
+ xas_set(&xas, index);
+ goto retry;
+ }
+ }
+ }
+ /* Here xas_nomem() is used to free the preallocated memory, not to allocate more. */
+ if (xas.xa_alloc)
+ xas_nomem(&xas, gfp);
+ xas_unlock_irqrestore(&xas, flags);
+ kfree(table);
+
+ return xas_error(&xas);
}
#else
-static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
+static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
- return 0;
}
static void memcg_destroy_list_lru(struct list_lru *lru)
@@ -591,7 +560,6 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
struct lock_class_key *key, struct shrinker *shrinker)
{
int i;
- int err = -ENOMEM;
#ifdef CONFIG_MEMCG_KMEM
if (shrinker)
@@ -599,11 +567,10 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
else
lru->shrinker_id = -1;
#endif
- memcg_get_cache_ids();
lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL);
if (!lru->node)
- goto out;
+ return -ENOMEM;
for_each_node(i) {
spin_lock_init(&lru->node[i].lock);
@@ -612,18 +579,10 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
init_one_lru(&lru->node[i].lru);
}
- err = memcg_init_list_lru(lru, memcg_aware);
- if (err) {
- kfree(lru->node);
- /* Do this so a list_lru_destroy() doesn't crash: */
- lru->node = NULL;
- goto out;
- }
-
+ memcg_init_list_lru(lru, memcg_aware);
list_lru_register(lru);
-out:
- memcg_put_cache_ids();
- return err;
+
+ return 0;
}
EXPORT_SYMBOL_GPL(__list_lru_init);
@@ -633,8 +592,6 @@ void list_lru_destroy(struct list_lru *lru)
if (!lru->node)
return;
- memcg_get_cache_ids();
-
list_lru_unregister(lru);
memcg_destroy_list_lru(lru);
@@ -644,6 +601,5 @@ void list_lru_destroy(struct list_lru *lru)
#ifdef CONFIG_MEMCG_KMEM
lru->shrinker_id = -1;
#endif
- memcg_put_cache_ids();
}
EXPORT_SYMBOL_GPL(list_lru_destroy);
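The list_lru rework above drops the grow-on-demand per-node arrays (memcg_update_all_list_lrus() and friends) in favour of a per-lru xarray indexed by kmemcg_id, populated lazily in memcg_list_lru_alloc() and torn down or reparented via memcg_list_lru_free()/memcg_reparent_list_lrus(). As a rough userspace illustration of that allocation strategy only (a toy, not kernel code; all names below are hypothetical):

#include <stdlib.h>

#define MAX_IDS 4096	/* toy stand-in for the xarray's sparse index space */

struct lru_one {
	long nr_items;
};

static struct lru_one *table[MAX_IDS];

/* Look up the per-id state, allocating it on first use
 * (loosely analogous to memcg_list_lru_alloc()). */
static struct lru_one *lru_get(int id)
{
	if (id < 0 || id >= MAX_IDS)
		return NULL;
	if (!table[id])
		table[id] = calloc(1, sizeof(*table[id]));
	return table[id];
}

/* Drop one id's state entirely when its owner goes away
 * (loosely analogous to memcg_list_lru_free()). */
static void lru_free(int id)
{
	if (id >= 0 && id < MAX_IDS) {
		free(table[id]);
		table[id] = NULL;
	}
}

int main(void)
{
	struct lru_one *l = lru_get(42);

	if (l)
		l->nr_items++;
	lru_free(42);
	return 0;
}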
diff --git a/mm/maccess.c b/mm/maccess.c
index 3bd70405f2d8..518a25667323 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -5,6 +5,7 @@
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
+#include <asm/tlb.h>
bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src,
size_t size)
@@ -12,8 +13,6 @@ bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src,
return true;
}
-#ifdef HAVE_GET_KERNEL_NOFAULT
-
#define copy_from_kernel_nofault_loop(dst, src, len, type, err_label) \
while (len >= sizeof(type)) { \
__get_kernel_nofault(dst, src, type, err_label); \
@@ -24,13 +23,21 @@ bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src,
long copy_from_kernel_nofault(void *dst, const void *src, size_t size)
{
+ unsigned long align = 0;
+
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
+ align = (unsigned long)dst | (unsigned long)src;
+
if (!copy_from_kernel_nofault_allowed(src, size))
return -ERANGE;
pagefault_disable();
- copy_from_kernel_nofault_loop(dst, src, size, u64, Efault);
- copy_from_kernel_nofault_loop(dst, src, size, u32, Efault);
- copy_from_kernel_nofault_loop(dst, src, size, u16, Efault);
+ if (!(align & 7))
+ copy_from_kernel_nofault_loop(dst, src, size, u64, Efault);
+ if (!(align & 3))
+ copy_from_kernel_nofault_loop(dst, src, size, u32, Efault);
+ if (!(align & 1))
+ copy_from_kernel_nofault_loop(dst, src, size, u16, Efault);
copy_from_kernel_nofault_loop(dst, src, size, u8, Efault);
pagefault_enable();
return 0;
@@ -50,10 +57,18 @@ EXPORT_SYMBOL_GPL(copy_from_kernel_nofault);
long copy_to_kernel_nofault(void *dst, const void *src, size_t size)
{
+ unsigned long align = 0;
+
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
+ align = (unsigned long)dst | (unsigned long)src;
+
pagefault_disable();
- copy_to_kernel_nofault_loop(dst, src, size, u64, Efault);
- copy_to_kernel_nofault_loop(dst, src, size, u32, Efault);
- copy_to_kernel_nofault_loop(dst, src, size, u16, Efault);
+ if (!(align & 7))
+ copy_to_kernel_nofault_loop(dst, src, size, u64, Efault);
+ if (!(align & 3))
+ copy_to_kernel_nofault_loop(dst, src, size, u32, Efault);
+ if (!(align & 1))
+ copy_to_kernel_nofault_loop(dst, src, size, u16, Efault);
copy_to_kernel_nofault_loop(dst, src, size, u8, Efault);
pagefault_enable();
return 0;
@@ -83,115 +98,9 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
return src - unsafe_addr;
Efault:
pagefault_enable();
- dst[-1] = '\0';
+ dst[0] = '\0';
return -EFAULT;
}
-#else /* HAVE_GET_KERNEL_NOFAULT */
-/**
- * copy_from_kernel_nofault(): safely attempt to read from kernel-space
- * @dst: pointer to the buffer that shall take the data
- * @src: address to read from
- * @size: size of the data chunk
- *
- * Safely read from kernel address @src to the buffer at @dst. If a kernel
- * fault happens, handle that and return -EFAULT. If @src is not a valid kernel
- * address, return -ERANGE.
- *
- * We ensure that the copy_from_user is executed in atomic context so that
- * do_page_fault() doesn't attempt to take mmap_lock. This makes
- * copy_from_kernel_nofault() suitable for use within regions where the caller
- * already holds mmap_lock, or other locks which nest inside mmap_lock.
- */
-long copy_from_kernel_nofault(void *dst, const void *src, size_t size)
-{
- long ret;
- mm_segment_t old_fs = get_fs();
-
- if (!copy_from_kernel_nofault_allowed(src, size))
- return -ERANGE;
-
- set_fs(KERNEL_DS);
- pagefault_disable();
- ret = __copy_from_user_inatomic(dst, (__force const void __user *)src,
- size);
- pagefault_enable();
- set_fs(old_fs);
-
- if (ret)
- return -EFAULT;
- return 0;
-}
-EXPORT_SYMBOL_GPL(copy_from_kernel_nofault);
-
-/**
- * copy_to_kernel_nofault(): safely attempt to write to a location
- * @dst: address to write to
- * @src: pointer to the data that shall be written
- * @size: size of the data chunk
- *
- * Safely write to address @dst from the buffer at @src. If a kernel fault
- * happens, handle that and return -EFAULT.
- */
-long copy_to_kernel_nofault(void *dst, const void *src, size_t size)
-{
- long ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- pagefault_disable();
- ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
- pagefault_enable();
- set_fs(old_fs);
-
- if (ret)
- return -EFAULT;
- return 0;
-}
-
-/**
- * strncpy_from_kernel_nofault: - Copy a NUL terminated string from unsafe
- * address.
- * @dst: Destination address, in kernel space. This buffer must be at
- * least @count bytes long.
- * @unsafe_addr: Unsafe address.
- * @count: Maximum number of bytes to copy, including the trailing NUL.
- *
- * Copies a NUL-terminated string from unsafe address to kernel buffer.
- *
- * On success, returns the length of the string INCLUDING the trailing NUL.
- *
- * If access fails, returns -EFAULT (some data may have been copied and the
- * trailing NUL added). If @unsafe_addr is not a valid kernel address, return
- * -ERANGE.
- *
- * If @count is smaller than the length of the string, copies @count-1 bytes,
- * sets the last byte of @dst buffer to NUL and returns @count.
- */
-long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
-{
- mm_segment_t old_fs = get_fs();
- const void *src = unsafe_addr;
- long ret;
-
- if (unlikely(count <= 0))
- return 0;
- if (!copy_from_kernel_nofault_allowed(unsafe_addr, count))
- return -ERANGE;
-
- set_fs(KERNEL_DS);
- pagefault_disable();
-
- do {
- ret = __get_user(*dst++, (const char __user __force *)src++);
- } while (dst[-1] && ret == 0 && src - unsafe_addr < count);
-
- dst[-1] = '\0';
- pagefault_enable();
- set_fs(old_fs);
-
- return ret ? -EFAULT : src - unsafe_addr;
-}
-#endif /* HAVE_GET_KERNEL_NOFAULT */
/**
* copy_from_user_nofault(): safely attempt to read from a user-space location
@@ -205,14 +114,16 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
long copy_from_user_nofault(void *dst, const void __user *src, size_t size)
{
long ret = -EFAULT;
- mm_segment_t old_fs = force_uaccess_begin();
- if (access_ok(src, size)) {
- pagefault_disable();
- ret = __copy_from_user_inatomic(dst, src, size);
- pagefault_enable();
- }
- force_uaccess_end(old_fs);
+ if (!__access_ok(src, size))
+ return ret;
+
+ if (!nmi_uaccess_okay())
+ return ret;
+
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst, src, size);
+ pagefault_enable();
if (ret)
return -EFAULT;
@@ -232,14 +143,12 @@ EXPORT_SYMBOL_GPL(copy_from_user_nofault);
long copy_to_user_nofault(void __user *dst, const void *src, size_t size)
{
long ret = -EFAULT;
- mm_segment_t old_fs = force_uaccess_begin();
if (access_ok(dst, size)) {
pagefault_disable();
ret = __copy_to_user_inatomic(dst, src, size);
pagefault_enable();
}
- force_uaccess_end(old_fs);
if (ret)
return -EFAULT;
@@ -268,17 +177,14 @@ EXPORT_SYMBOL_GPL(copy_to_user_nofault);
long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
long count)
{
- mm_segment_t old_fs;
long ret;
if (unlikely(count <= 0))
return 0;
- old_fs = force_uaccess_begin();
pagefault_disable();
ret = strncpy_from_user(dst, unsafe_addr, count);
pagefault_enable();
- force_uaccess_end(old_fs);
if (ret >= count) {
ret = count;
@@ -308,14 +214,17 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
*/
long strnlen_user_nofault(const void __user *unsafe_addr, long count)
{
- mm_segment_t old_fs;
int ret;
- old_fs = force_uaccess_begin();
pagefault_disable();
ret = strnlen_user(unsafe_addr, count);
pagefault_enable();
- force_uaccess_end(old_fs);
return ret;
}
+
+void __copy_overflow(int size, unsigned long count)
+{
+ WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
+}
+EXPORT_SYMBOL(__copy_overflow);
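The alignment handling added to copy_{from,to}_kernel_nofault() above ORs the two addresses together and uses the low bits of the result to pick the widest access size both pointers allow. A standalone userspace sketch of the same pattern (plain memcpy() stands in for __get_kernel_nofault(); copy_aligned() is a made-up name):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

static void copy_aligned(void *dst, const void *src, size_t len)
{
	uintptr_t align = (uintptr_t)dst | (uintptr_t)src;
	unsigned char *d = dst;
	const unsigned char *s = src;

	/* Only the OR of both addresses matters: a set low bit in either
	 * one rules out the corresponding access size. */
	if (!(align & 7))
		for (; len >= 8; len -= 8, d += 8, s += 8)
			memcpy(d, s, 8);
	if (!(align & 3))
		for (; len >= 4; len -= 4, d += 4, s += 4)
			memcpy(d, s, 4);
	if (!(align & 1))
		for (; len >= 2; len -= 2, d += 2, s += 2)
			memcpy(d, s, 2);
	for (; len; len--)
		*d++ = *s++;
}

int main(void)
{
	char src[13] = "hello, world";
	char dst[13];

	copy_aligned(dst, src, sizeof(src));
	return 0;
}

The kernel version only performs these checks when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is not enabled; otherwise align stays 0 and every size remains usable.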
diff --git a/mm/madvise.c b/mm/madvise.c
index 9b065d412e5f..ec30f48f8f2e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -17,6 +17,10 @@
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/mm_inline.h>
+#include <linux/string.h>
+#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
@@ -27,11 +31,11 @@
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>
-#include <linux/sched/mm.h>
#include <asm/tlb.h>
#include "internal.h"
+#include "swap.h"
struct madvise_walk_private {
struct mmu_gather *tlb;
@@ -49,9 +53,13 @@ static int madvise_need_mmap_write(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
case MADV_COLD:
case MADV_PAGEOUT:
case MADV_FREE:
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
+ case MADV_COLLAPSE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
@@ -59,83 +67,92 @@ static int madvise_need_mmap_write(int behavior)
}
}
+#ifdef CONFIG_ANON_VMA_NAME
+struct anon_vma_name *anon_vma_name_alloc(const char *name)
+{
+ struct anon_vma_name *anon_name;
+ size_t count;
+
+ /* Add 1 for NUL terminator at the end of the anon_name->name */
+ count = strlen(name) + 1;
+ anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
+ if (anon_name) {
+ kref_init(&anon_name->kref);
+ memcpy(anon_name->name, name, count);
+ }
+
+ return anon_name;
+}
+
+void anon_vma_name_free(struct kref *kref)
+{
+ struct anon_vma_name *anon_name =
+ container_of(kref, struct anon_vma_name, kref);
+ kfree(anon_name);
+}
+
+struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
+{
+ mmap_assert_locked(vma->vm_mm);
+
+ return vma->anon_name;
+}
+
+/* mmap_lock should be write-locked */
+static int replace_anon_vma_name(struct vm_area_struct *vma,
+ struct anon_vma_name *anon_name)
+{
+ struct anon_vma_name *orig_name = anon_vma_name(vma);
+
+ if (!anon_name) {
+ vma->anon_name = NULL;
+ anon_vma_name_put(orig_name);
+ return 0;
+ }
+
+ if (anon_vma_name_eq(orig_name, anon_name))
+ return 0;
+
+ vma->anon_name = anon_vma_name_reuse(anon_name);
+ anon_vma_name_put(orig_name);
+
+ return 0;
+}
+#else /* CONFIG_ANON_VMA_NAME */
+static int replace_anon_vma_name(struct vm_area_struct *vma,
+ struct anon_vma_name *anon_name)
+{
+ if (anon_name)
+ return -EINVAL;
+
+ return 0;
+}
+#endif /* CONFIG_ANON_VMA_NAME */
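The CONFIG_ANON_VMA_NAME helpers added above back the prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ...) interface. A minimal userspace sketch (assumes a kernel built with CONFIG_ANON_VMA_NAME; the fallback #defines match the uapi values):

#include <stdio.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#ifndef PR_SET_VMA
#define PR_SET_VMA		0x53564d41
#define PR_SET_VMA_ANON_NAME	0
#endif

int main(void)
{
	size_t len = 1 << 20;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	/* The name shows up as [anon:my buffer] in /proc/self/maps. */
	if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, p, len, "my buffer"))
		perror("prctl(PR_SET_VMA_ANON_NAME)");

	munmap(p, len);
	return 0;
}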
/*
- * We can potentially split a vm area into separate
- * areas, each area with its own behavior.
+ * Update the vm_flags on a region of a vma, splitting or merging it as
+ * necessary. Must be called with mmap_lock held for writing.
+ * The caller should ensure anon_name stability by raising its refcount,
+ * even when anon_name belongs to a valid vma, because this function
+ * might free that vma.
*/
-static long madvise_behavior(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end, int behavior)
+static int madvise_update_vma(struct vm_area_struct *vma,
+ struct vm_area_struct **prev, unsigned long start,
+ unsigned long end, unsigned long new_flags,
+ struct anon_vma_name *anon_name)
{
struct mm_struct *mm = vma->vm_mm;
- int error = 0;
+ int error;
pgoff_t pgoff;
- unsigned long new_flags = vma->vm_flags;
+ VMA_ITERATOR(vmi, mm, start);
- switch (behavior) {
- case MADV_NORMAL:
- new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
- break;
- case MADV_SEQUENTIAL:
- new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
- break;
- case MADV_RANDOM:
- new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
- break;
- case MADV_DONTFORK:
- new_flags |= VM_DONTCOPY;
- break;
- case MADV_DOFORK:
- if (vma->vm_flags & VM_IO) {
- error = -EINVAL;
- goto out;
- }
- new_flags &= ~VM_DONTCOPY;
- break;
- case MADV_WIPEONFORK:
- /* MADV_WIPEONFORK is only supported on anonymous memory. */
- if (vma->vm_file || vma->vm_flags & VM_SHARED) {
- error = -EINVAL;
- goto out;
- }
- new_flags |= VM_WIPEONFORK;
- break;
- case MADV_KEEPONFORK:
- new_flags &= ~VM_WIPEONFORK;
- break;
- case MADV_DONTDUMP:
- new_flags |= VM_DONTDUMP;
- break;
- case MADV_DODUMP:
- if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
- error = -EINVAL;
- goto out;
- }
- new_flags &= ~VM_DONTDUMP;
- break;
- case MADV_MERGEABLE:
- case MADV_UNMERGEABLE:
- error = ksm_madvise(vma, start, end, behavior, &new_flags);
- if (error)
- goto out_convert_errno;
- break;
- case MADV_HUGEPAGE:
- case MADV_NOHUGEPAGE:
- error = hugepage_madvise(vma, &new_flags, behavior);
- if (error)
- goto out_convert_errno;
- break;
- }
-
- if (new_flags == vma->vm_flags) {
+ if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
*prev = vma;
- goto out;
+ return 0;
}
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
- vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+ *prev = vma_merge(&vmi, mm, *prev, start, end, new_flags,
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx, anon_name);
if (*prev) {
vma = *prev;
goto success;
@@ -144,110 +161,116 @@ static long madvise_behavior(struct vm_area_struct *vma,
*prev = vma;
if (start != vma->vm_start) {
- if (unlikely(mm->map_count >= sysctl_max_map_count)) {
- error = -ENOMEM;
- goto out;
- }
- error = __split_vma(mm, vma, start, 1);
+ error = split_vma(&vmi, vma, start, 1);
if (error)
- goto out_convert_errno;
+ return error;
}
if (end != vma->vm_end) {
- if (unlikely(mm->map_count >= sysctl_max_map_count)) {
- error = -ENOMEM;
- goto out;
- }
- error = __split_vma(mm, vma, end, 0);
+ error = split_vma(&vmi, vma, end, 0);
if (error)
- goto out_convert_errno;
+ return error;
}
success:
/*
* vm_flags is protected by the mmap_lock held in write mode.
*/
- vma->vm_flags = new_flags;
+ vm_flags_reset(vma, new_flags);
+ if (!vma->vm_file || vma_is_anon_shmem(vma)) {
+ error = replace_anon_vma_name(vma, anon_name);
+ if (error)
+ return error;
+ }
-out_convert_errno:
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
-out:
- return error;
+ return 0;
}
#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
- unsigned long end, struct mm_walk *walk)
+ unsigned long end, struct mm_walk *walk)
{
- pte_t *orig_pte;
struct vm_area_struct *vma = walk->private;
- unsigned long index;
-
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
- return 0;
+ struct swap_iocb *splug = NULL;
+ pte_t *ptep = NULL;
+ spinlock_t *ptl;
+ unsigned long addr;
- for (index = start; index != end; index += PAGE_SIZE) {
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
pte_t pte;
swp_entry_t entry;
struct page *page;
- spinlock_t *ptl;
- orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
- pte = *(orig_pte + ((index - start) / PAGE_SIZE));
- pte_unmap_unlock(orig_pte, ptl);
+ if (!ptep++) {
+ ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!ptep)
+ break;
+ }
- if (pte_present(pte) || pte_none(pte))
+ pte = ptep_get(ptep);
+ if (!is_swap_pte(pte))
continue;
entry = pte_to_swp_entry(pte);
if (unlikely(non_swap_entry(entry)))
continue;
+ pte_unmap_unlock(ptep, ptl);
+ ptep = NULL;
+
page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
- vma, index, false);
+ vma, addr, false, &splug);
if (page)
put_page(page);
}
+ if (ptep)
+ pte_unmap_unlock(ptep, ptl);
+ swap_read_unplug(splug);
+ cond_resched();
+
return 0;
}
static const struct mm_walk_ops swapin_walk_ops = {
.pmd_entry = swapin_walk_pmd_entry,
+ .walk_lock = PGWALK_RDLOCK,
};
-static void force_shm_swapin_readahead(struct vm_area_struct *vma,
+static void shmem_swapin_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct address_space *mapping)
{
XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
- pgoff_t end_index = end / PAGE_SIZE;
+ pgoff_t end_index = linear_page_index(vma, end) - 1;
struct page *page;
+ struct swap_iocb *splug = NULL;
rcu_read_lock();
xas_for_each(&xas, page, end_index) {
- swp_entry_t swap;
+ unsigned long addr;
+ swp_entry_t entry;
if (!xa_is_value(page))
continue;
+ entry = radix_to_swp_entry(page);
+ /* There might be swapin error entries in shmem mapping. */
+ if (non_swap_entry(entry))
+ continue;
+
+ addr = vma->vm_start +
+ ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);
xas_pause(&xas);
rcu_read_unlock();
- swap = radix_to_swp_entry(page);
- page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
- NULL, 0, false);
+ page = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
+ vma, addr, false, &splug);
if (page)
put_page(page);
rcu_read_lock();
}
rcu_read_unlock();
-
- lru_add_drain(); /* Push any new pages onto the LRU now */
+ swap_read_unplug(splug);
}
#endif /* CONFIG_SWAP */
@@ -258,6 +281,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end)
{
+ struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
loff_t offset;
@@ -270,8 +294,8 @@ static long madvise_willneed(struct vm_area_struct *vma,
}
if (shmem_mapping(file->f_mapping)) {
- force_shm_swapin_readahead(vma, start, end,
- file->f_mapping);
+ shmem_swapin_range(vma, start, end, file->f_mapping);
+ lru_add_drain(); /* Push any new pages onto the LRU now */
return 0;
}
#else
@@ -294,13 +318,28 @@ static long madvise_willneed(struct vm_area_struct *vma,
get_file(file);
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
fput(file);
- mmap_read_lock(current->mm);
+ mmap_read_lock(mm);
return 0;
}
+static inline bool can_do_file_pageout(struct vm_area_struct *vma)
+{
+ if (!vma->vm_file)
+ return false;
+ /*
+ * Page out pagecache only for non-anonymous mappings that correspond
+ * to files the calling process could (if it tried) open for writing;
+ * otherwise we'd be including shared non-exclusive mappings, which
+ * opens a side channel.
+ */
+ return inode_owner_or_capable(&nop_mnt_idmap,
+ file_inode(vma->vm_file)) ||
+ file_permission(vma->vm_file, MAY_WRITE) == 0;
+}
+
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
@@ -310,14 +349,18 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
bool pageout = private->pageout;
struct mm_struct *mm = tlb->mm;
struct vm_area_struct *vma = walk->vma;
- pte_t *orig_pte, *pte, ptent;
+ pte_t *start_pte, *pte, ptent;
spinlock_t *ptl;
- struct page *page = NULL;
- LIST_HEAD(page_list);
+ struct folio *folio = NULL;
+ LIST_HEAD(folio_list);
+ bool pageout_anon_only_filter;
if (fatal_signal_pending(current))
return -EINTR;
+ pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
+ !can_do_file_pageout(vma);
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (pmd_trans_huge(*pmd)) {
pmd_t orig_pmd;
@@ -338,23 +381,26 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
goto huge_unlock;
}
- page = pmd_page(orig_pmd);
+ folio = pfn_folio(pmd_pfn(orig_pmd));
- /* Do not interfere with other mappings of this page */
- if (page_mapcount(page) != 1)
+ /* Do not interfere with other mappings of this folio */
+ if (folio_estimated_sharers(folio) != 1)
+ goto huge_unlock;
+
+ if (pageout_anon_only_filter && !folio_test_anon(folio))
goto huge_unlock;
if (next - addr != HPAGE_PMD_SIZE) {
int err;
- get_page(page);
+ folio_get(folio);
spin_unlock(ptl);
- lock_page(page);
- err = split_huge_page(page);
- unlock_page(page);
- put_page(page);
+ folio_lock(folio);
+ err = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
if (!err)
- goto regular_page;
+ goto regular_folio;
return 0;
}
@@ -366,34 +412,34 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
}
- ClearPageReferenced(page);
- test_and_clear_page_young(page);
+ folio_clear_referenced(folio);
+ folio_test_clear_young(folio);
if (pageout) {
- if (!isolate_lru_page(page)) {
- if (PageUnevictable(page))
- putback_lru_page(page);
+ if (folio_isolate_lru(folio)) {
+ if (folio_test_unevictable(folio))
+ folio_putback_lru(folio);
else
- list_add(&page->lru, &page_list);
+ list_add(&folio->lru, &folio_list);
}
} else
- deactivate_page(page);
+ folio_deactivate(folio);
huge_unlock:
spin_unlock(ptl);
if (pageout)
- reclaim_pages(&page_list);
+ reclaim_pages(&folio_list);
return 0;
}
-regular_page:
- if (pmd_trans_unstable(pmd))
- return 0;
+regular_folio:
#endif
tlb_change_page_size(tlb, PAGE_SIZE);
- orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!start_pte)
+ return 0;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
for (; addr < end; pte++, addr += PAGE_SIZE) {
- ptent = *pte;
+ ptent = ptep_get(pte);
if (pte_none(ptent))
continue;
@@ -401,42 +447,53 @@ regular_page:
if (!pte_present(ptent))
continue;
- page = vm_normal_page(vma, addr, ptent);
- if (!page)
+ folio = vm_normal_folio(vma, addr, ptent);
+ if (!folio || folio_is_zone_device(folio))
continue;
/*
* Creating a THP page is expensive so split it only if we
* are sure it's worth. Split it if we are only owner.
*/
- if (PageTransCompound(page)) {
- if (page_mapcount(page) != 1)
+ if (folio_test_large(folio)) {
+ int err;
+
+ if (folio_estimated_sharers(folio) != 1)
break;
- get_page(page);
- if (!trylock_page(page)) {
- put_page(page);
+ if (pageout_anon_only_filter && !folio_test_anon(folio))
break;
- }
- pte_unmap_unlock(orig_pte, ptl);
- if (split_huge_page(page)) {
- unlock_page(page);
- put_page(page);
+ if (!folio_trylock(folio))
+ break;
+ folio_get(folio);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+ start_pte = NULL;
+ err = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ if (err)
+ break;
+ start_pte = pte =
pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!start_pte)
break;
- }
- unlock_page(page);
- put_page(page);
- pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
pte--;
addr -= PAGE_SIZE;
continue;
}
- /* Do not interfere with other mappings of this page */
- if (page_mapcount(page) != 1)
+ /*
+ * Do not interfere with other mappings of this folio and
+ * non-LRU folio.
+ */
+ if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
+ continue;
+
+ if (pageout_anon_only_filter && !folio_test_anon(folio))
continue;
- VM_BUG_ON_PAGE(PageTransCompound(page), page);
+ VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
if (pte_young(ptent)) {
ptent = ptep_get_and_clear_full(mm, addr, pte,
@@ -447,28 +504,30 @@ regular_page:
}
/*
- * We are deactivating a page for accelerating reclaiming.
- * VM couldn't reclaim the page unless we clear PG_young.
+ * We are deactivating a folio for accelerating reclaiming.
+ * VM couldn't reclaim the folio unless we clear PG_young.
* As a side effect, it confuses idle-page tracking,
* which will miss the recent referenced history.
*/
- ClearPageReferenced(page);
- test_and_clear_page_young(page);
+ folio_clear_referenced(folio);
+ folio_test_clear_young(folio);
if (pageout) {
- if (!isolate_lru_page(page)) {
- if (PageUnevictable(page))
- putback_lru_page(page);
+ if (folio_isolate_lru(folio)) {
+ if (folio_test_unevictable(folio))
+ folio_putback_lru(folio);
else
- list_add(&page->lru, &page_list);
+ list_add(&folio->lru, &folio_list);
}
} else
- deactivate_page(page);
+ folio_deactivate(folio);
}
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(orig_pte, ptl);
+ if (start_pte) {
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+ }
if (pageout)
- reclaim_pages(&page_list);
+ reclaim_pages(&folio_list);
cond_resched();
return 0;
@@ -476,6 +535,7 @@ regular_page:
static const struct mm_walk_ops cold_walk_ops = {
.pmd_entry = madvise_cold_or_pageout_pte_range,
+ .walk_lock = PGWALK_RDLOCK,
};
static void madvise_cold_page_range(struct mmu_gather *tlb,
@@ -492,6 +552,11 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,
tlb_end_vma(tlb, vma);
}
+static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
+{
+ return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
+}
+
static long madvise_cold(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start_addr, unsigned long end_addr)
@@ -504,9 +569,9 @@ static long madvise_cold(struct vm_area_struct *vma,
return -EINVAL;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ tlb_gather_mmu(&tlb, mm);
madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
- tlb_finish_mmu(&tlb, start_addr, end_addr);
+ tlb_finish_mmu(&tlb);
return 0;
}
@@ -525,22 +590,6 @@ static void madvise_pageout_page_range(struct mmu_gather *tlb,
tlb_end_vma(tlb, vma);
}
-static inline bool can_do_pageout(struct vm_area_struct *vma)
-{
- if (vma_is_anonymous(vma))
- return true;
- if (!vma->vm_file)
- return false;
- /*
- * paging out pagecache only for non-anonymous mappings that correspond
- * to the files the calling process could (if tried) open for writing;
- * otherwise we'd be including shared non-exclusive mappings, which
- * opens a side channel.
- */
- return inode_owner_or_capable(file_inode(vma->vm_file)) ||
- inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
-}
-
static long madvise_pageout(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start_addr, unsigned long end_addr)
@@ -552,13 +601,20 @@ static long madvise_pageout(struct vm_area_struct *vma,
if (!can_madv_lru_vma(vma))
return -EINVAL;
- if (!can_do_pageout(vma))
+ /*
+ * If the VMA belongs to a private file mapping, there can be private
+ * dirty pages which can be paged out even if this process is neither
+ * the owner of nor write-capable on the file. We still allow private
+ * file mappings to page out their dirty anon pages.
+ */
+ if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
+ (vma->vm_flags & VM_MAYSHARE)))
return 0;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ tlb_gather_mmu(&tlb, mm);
madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
- tlb_finish_mmu(&tlb, start_addr, end_addr);
+ tlb_finish_mmu(&tlb);
return 0;
}
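For reference, the userspace side of madvise_cold() and madvise_pageout() above: MADV_COLD deactivates the range so it is reclaimed first under pressure, while MADV_PAGEOUT reclaims it immediately. A minimal sketch (both flags exist since Linux 5.4; the fallback #defines match the uapi values):

#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_COLD
#define MADV_COLD	20
#define MADV_PAGEOUT	21
#endif

int main(void)
{
	size_t len = 8 << 20;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	/* Deactivate the pages; they are reclaimed first when memory is tight. */
	if (madvise(p, len, MADV_COLD))
		perror("madvise(MADV_COLD)");

	/* Or reclaim them right away, swapping anonymous pages out. */
	if (madvise(p, len, MADV_PAGEOUT))
		perror("madvise(MADV_PAGEOUT)");

	munmap(p, len);
	return 0;
}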
@@ -571,25 +627,24 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
struct mm_struct *mm = tlb->mm;
struct vm_area_struct *vma = walk->vma;
spinlock_t *ptl;
- pte_t *orig_pte, *pte, ptent;
- struct page *page;
+ pte_t *start_pte, *pte, ptent;
+ struct folio *folio;
int nr_swap = 0;
unsigned long next;
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd))
if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
- goto next;
-
- if (pmd_trans_unstable(pmd))
- return 0;
+ return 0;
tlb_change_page_size(tlb, PAGE_SIZE);
- orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!start_pte)
+ return 0;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
for (; addr != end; pte++, addr += PAGE_SIZE) {
- ptent = *pte;
+ ptent = ptep_get(pte);
if (pte_none(ptent))
continue;
@@ -602,67 +657,72 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
swp_entry_t entry;
entry = pte_to_swp_entry(ptent);
- if (non_swap_entry(entry))
- continue;
- nr_swap--;
- free_swap_and_cache(entry);
- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ if (!non_swap_entry(entry)) {
+ nr_swap--;
+ free_swap_and_cache(entry);
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ } else if (is_hwpoison_entry(entry) ||
+ is_swapin_error_entry(entry)) {
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ }
continue;
}
- page = vm_normal_page(vma, addr, ptent);
- if (!page)
+ folio = vm_normal_folio(vma, addr, ptent);
+ if (!folio || folio_is_zone_device(folio))
continue;
/*
- * If pmd isn't transhuge but the page is THP and
+ * If pmd isn't transhuge but the folio is large and
* is owned by only this process, split it and
* deactivate all pages.
*/
- if (PageTransCompound(page)) {
- if (page_mapcount(page) != 1)
- goto out;
- get_page(page);
- if (!trylock_page(page)) {
- put_page(page);
- goto out;
- }
- pte_unmap_unlock(orig_pte, ptl);
- if (split_huge_page(page)) {
- unlock_page(page);
- put_page(page);
+ if (folio_test_large(folio)) {
+ int err;
+
+ if (folio_estimated_sharers(folio) != 1)
+ break;
+ if (!folio_trylock(folio))
+ break;
+ folio_get(folio);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+ start_pte = NULL;
+ err = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ if (err)
+ break;
+ start_pte = pte =
pte_offset_map_lock(mm, pmd, addr, &ptl);
- goto out;
- }
- unlock_page(page);
- put_page(page);
- pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!start_pte)
+ break;
+ arch_enter_lazy_mmu_mode();
pte--;
addr -= PAGE_SIZE;
continue;
}
- VM_BUG_ON_PAGE(PageTransCompound(page), page);
-
- if (PageSwapCache(page) || PageDirty(page)) {
- if (!trylock_page(page))
+ if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
+ if (!folio_trylock(folio))
continue;
/*
- * If page is shared with others, we couldn't clear
- * PG_dirty of the page.
+ * If folio is shared with others, we mustn't clear
+ * the folio's dirty flag.
*/
- if (page_mapcount(page) != 1) {
- unlock_page(page);
+ if (folio_mapcount(folio) != 1) {
+ folio_unlock(folio);
continue;
}
- if (PageSwapCache(page) && !try_to_free_swap(page)) {
- unlock_page(page);
+ if (folio_test_swapcache(folio) &&
+ !folio_free_swap(folio)) {
+ folio_unlock(folio);
continue;
}
- ClearPageDirty(page);
- unlock_page(page);
+ folio_clear_dirty(folio);
+ folio_unlock(folio);
}
if (pte_young(ptent) || pte_dirty(ptent)) {
@@ -680,24 +740,26 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
set_pte_at(mm, addr, pte, ptent);
tlb_remove_tlb_entry(tlb, pte, addr);
}
- mark_page_lazyfree(page);
+ folio_mark_lazyfree(folio);
}
-out:
+
if (nr_swap) {
if (current->mm == mm)
sync_mm_rss(mm);
-
add_mm_counter(mm, MM_SWAPENTS, nr_swap);
}
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(orig_pte, ptl);
+ if (start_pte) {
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+ }
cond_resched();
-next:
+
return 0;
}
static const struct mm_walk_ops madvise_free_walk_ops = {
.pmd_entry = madvise_free_pte_range,
+ .walk_lock = PGWALK_RDLOCK,
};
static int madvise_free_single_vma(struct vm_area_struct *vma,
@@ -717,11 +779,11 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
range.end = min(vma->vm_end, end_addr);
if (range.end <= vma->vm_start)
return -EINVAL;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
range.start, range.end);
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, range.start, range.end);
+ tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(&range);
@@ -730,7 +792,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
&madvise_free_walk_ops, &tlb);
tlb_end_vma(&tlb, vma);
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb, range.start, range.end);
+ tlb_finish_mmu(&tlb);
return 0;
}
@@ -739,8 +801,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
* data it wants to keep. Be sure to free swap resources too. The
- * zap_page_range call sets things up for shrink_active_list to actually free
- * these pages later if no one else has touched them in the meantime,
+ * zap_page_range_single call sets things up for shrink_active_list to actually
+ * free these pages later if no one else has touched them in the meantime,
* although we could add these pages to a global reuse list for
* shrink_active_list to pick up before reclaiming other pages.
*
@@ -757,44 +819,72 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
- zap_page_range(vma, start, end - start);
+ zap_page_range_single(vma, start, end - start, NULL);
return 0;
}
+static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long *end,
+ int behavior)
+{
+ if (!is_vm_hugetlb_page(vma)) {
+ unsigned int forbidden = VM_PFNMAP;
+
+ if (behavior != MADV_DONTNEED_LOCKED)
+ forbidden |= VM_LOCKED;
+
+ return !(vma->vm_flags & forbidden);
+ }
+
+ if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
+ return false;
+ if (start & ~huge_page_mask(hstate_vma(vma)))
+ return false;
+
+ /*
+ * Madvise callers expect the length to be rounded up to PAGE_SIZE
+ * boundaries, and may be unaware that this VMA uses huge pages.
+ * Avoid unexpected data loss by rounding down the number of
+ * huge pages freed.
+ */
+ *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));
+
+ return true;
+}
+
static long madvise_dontneed_free(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
int behavior)
{
+ struct mm_struct *mm = vma->vm_mm;
+
*prev = vma;
- if (!can_madv_lru_vma(vma))
+ if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
return -EINVAL;
+ if (start == end)
+ return 0;
+
if (!userfaultfd_remove(vma, start, end)) {
*prev = NULL; /* mmap_lock has been dropped, prev is stale */
- mmap_read_lock(current->mm);
- vma = find_vma(current->mm, start);
+ mmap_read_lock(mm);
+ vma = vma_lookup(mm, start);
if (!vma)
return -ENOMEM;
- if (start < vma->vm_start) {
- /*
- * This "vma" under revalidation is the one
- * with the lowest vma->vm_start where start
- * is also < vma->vm_end. If start <
- * vma->vm_start it means an hole materialized
- * in the user address space within the
- * virtual range passed to MADV_DONTNEED
- * or MADV_FREE.
- */
- return -ENOMEM;
- }
- if (!can_madv_lru_vma(vma))
+ /*
+ * Potential end adjustment for hugetlb vma is OK as
+ * the check below keeps end within vma.
+ */
+ if (!madvise_dontneed_free_valid_vma(vma, start, &end,
+ behavior))
return -EINVAL;
if (end > vma->vm_end) {
/*
* Don't fail if end > vma->vm_end. If the old
- * vma was splitted while the mmap_lock was
+ * vma was split while the mmap_lock was
* released the effect of the concurrent
* operation may not cause madvise() to
* have an undefined result. There may be an
@@ -809,7 +899,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
VM_WARN_ON(start >= end);
}
- if (behavior == MADV_DONTNEED)
+ if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
return madvise_dontneed_single_vma(vma, start, end);
else if (behavior == MADV_FREE)
return madvise_free_single_vma(vma, start, end);
@@ -817,6 +907,63 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
return -EINVAL;
}
+static long madvise_populate(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end,
+ int behavior)
+{
+ const bool write = behavior == MADV_POPULATE_WRITE;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long tmp_end;
+ int locked = 1;
+ long pages;
+
+ *prev = vma;
+
+ while (start < end) {
+ /*
+ * We might have temporarily dropped the lock. For example,
+ * our VMA might have been split.
+ */
+ if (!vma || start >= vma->vm_end) {
+ vma = vma_lookup(mm, start);
+ if (!vma)
+ return -ENOMEM;
+ }
+
+ tmp_end = min_t(unsigned long, end, vma->vm_end);
+ /* Populate (prefault) page tables readable/writable. */
+ pages = faultin_vma_page_range(vma, start, tmp_end, write,
+ &locked);
+ if (!locked) {
+ mmap_read_lock(mm);
+ locked = 1;
+ *prev = NULL;
+ vma = NULL;
+ }
+ if (pages < 0) {
+ switch (pages) {
+ case -EINTR:
+ return -EINTR;
+ case -EINVAL: /* Incompatible mappings / permissions. */
+ return -EINVAL;
+ case -EHWPOISON:
+ return -EHWPOISON;
+ case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
+ return -EFAULT;
+ default:
+ pr_warn_once("%s: unhandled return value: %ld\n",
+ __func__, pages);
+ fallthrough;
+ case -ENOMEM:
+ return -ENOMEM;
+ }
+ }
+ start += pages * PAGE_SIZE;
+ }
+ return 0;
+}
+
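madvise_populate() added above services MADV_POPULATE_READ/MADV_POPULATE_WRITE (Linux 5.14+), which prefault a range's page tables without touching every byte by hand. A minimal userspace sketch (the fallback #defines match the uapi values):

#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_POPULATE_READ
#define MADV_POPULATE_READ	22
#define MADV_POPULATE_WRITE	23
#endif

int main(void)
{
	size_t len = 16 << 20;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	/* Write-populate: allocate backing pages and map them writable,
	 * so later stores take no minor faults. */
	if (madvise(p, len, MADV_POPULATE_WRITE))
		perror("madvise(MADV_POPULATE_WRITE)");

	munmap(p, len);
	return 0;
}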
/*
* Application wants to free up the pages and associated backing store.
* This is effectively punching a hole into the middle of a file.
@@ -828,6 +975,7 @@ static long madvise_remove(struct vm_area_struct *vma,
loff_t offset;
int error;
struct file *f;
+ struct mm_struct *mm = vma->vm_mm;
*prev = NULL; /* tell sys_madvise we drop mmap_lock */
@@ -847,7 +995,7 @@ static long madvise_remove(struct vm_area_struct *vma,
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
/*
- * Filesystem's fallocate may need to take i_mutex. We need to
+ * Filesystem's fallocate may need to take i_rwsem. We need to
* explicitly grab a reference because the vma (and hence the
* vma's reference to the file) can go away as soon as we drop
* mmap_lock.
@@ -855,13 +1003,109 @@ static long madvise_remove(struct vm_area_struct *vma,
get_file(f);
if (userfaultfd_remove(vma, start, end)) {
/* mmap_lock was not released by userfaultfd_remove() */
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
}
error = vfs_fallocate(f,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
offset, end - start);
fput(f);
- mmap_read_lock(current->mm);
+ mmap_read_lock(mm);
+ return error;
+}
+
+/*
+ * Apply an madvise behavior to a region of a vma. madvise_update_vma
+ * will handle splitting a vm area into separate areas, each area with its own
+ * behavior.
+ */
+static int madvise_vma_behavior(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end,
+ unsigned long behavior)
+{
+ int error;
+ struct anon_vma_name *anon_name;
+ unsigned long new_flags = vma->vm_flags;
+
+ switch (behavior) {
+ case MADV_REMOVE:
+ return madvise_remove(vma, prev, start, end);
+ case MADV_WILLNEED:
+ return madvise_willneed(vma, prev, start, end);
+ case MADV_COLD:
+ return madvise_cold(vma, prev, start, end);
+ case MADV_PAGEOUT:
+ return madvise_pageout(vma, prev, start, end);
+ case MADV_FREE:
+ case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
+ return madvise_dontneed_free(vma, prev, start, end, behavior);
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
+ return madvise_populate(vma, prev, start, end, behavior);
+ case MADV_NORMAL:
+ new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
+ break;
+ case MADV_SEQUENTIAL:
+ new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
+ break;
+ case MADV_RANDOM:
+ new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
+ break;
+ case MADV_DONTFORK:
+ new_flags |= VM_DONTCOPY;
+ break;
+ case MADV_DOFORK:
+ if (vma->vm_flags & VM_IO)
+ return -EINVAL;
+ new_flags &= ~VM_DONTCOPY;
+ break;
+ case MADV_WIPEONFORK:
+ /* MADV_WIPEONFORK is only supported on anonymous memory. */
+ if (vma->vm_file || vma->vm_flags & VM_SHARED)
+ return -EINVAL;
+ new_flags |= VM_WIPEONFORK;
+ break;
+ case MADV_KEEPONFORK:
+ new_flags &= ~VM_WIPEONFORK;
+ break;
+ case MADV_DONTDUMP:
+ new_flags |= VM_DONTDUMP;
+ break;
+ case MADV_DODUMP:
+ if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
+ return -EINVAL;
+ new_flags &= ~VM_DONTDUMP;
+ break;
+ case MADV_MERGEABLE:
+ case MADV_UNMERGEABLE:
+ error = ksm_madvise(vma, start, end, behavior, &new_flags);
+ if (error)
+ goto out;
+ break;
+ case MADV_HUGEPAGE:
+ case MADV_NOHUGEPAGE:
+ error = hugepage_madvise(vma, &new_flags, behavior);
+ if (error)
+ goto out;
+ break;
+ case MADV_COLLAPSE:
+ return madvise_collapse(vma, prev, start, end);
+ }
+
+ anon_name = anon_vma_name(vma);
+ anon_vma_name_get(anon_name);
+ error = madvise_update_vma(vma, prev, start, end, new_flags,
+ anon_name);
+ anon_vma_name_put(anon_name);
+
+out:
+ /*
+ * madvise() returns EAGAIN if kernel resources, such as
+ * slab, are temporarily unavailable.
+ */
+ if (error == -ENOMEM)
+ error = -EAGAIN;
return error;
}
@@ -872,8 +1116,6 @@ static long madvise_remove(struct vm_area_struct *vma,
static int madvise_inject_error(int behavior,
unsigned long start, unsigned long end)
{
- struct page *page;
- struct zone *zone;
unsigned long size;
if (!capable(CAP_SYS_ADMIN))
@@ -882,6 +1124,7 @@ static int madvise_inject_error(int behavior,
for (; start < end; start += size) {
unsigned long pfn;
+ struct page *page;
int ret;
ret = get_user_pages_fast(start, 1, 0, &page);
@@ -896,65 +1139,26 @@ static int madvise_inject_error(int behavior,
*/
size = page_size(compound_head(page));
- if (PageHWPoison(page)) {
- put_page(page);
- continue;
- }
-
if (behavior == MADV_SOFT_OFFLINE) {
pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
- pfn, start);
-
+ pfn, start);
ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
- if (ret)
- return ret;
- continue;
+ } else {
+ pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
+ pfn, start);
+ ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED);
+ if (ret == -EOPNOTSUPP)
+ ret = 0;
}
- pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
- pfn, start);
-
- /*
- * Drop the page reference taken by get_user_pages_fast(). In
- * the absence of MF_COUNT_INCREASED the memory_failure()
- * routine is responsible for pinning the page to prevent it
- * from being released back to the page allocator.
- */
- put_page(page);
- ret = memory_failure(pfn, 0);
if (ret)
return ret;
}
- /* Ensure that all poisoned pages are removed from per-cpu lists */
- for_each_populated_zone(zone)
- drain_all_pages(zone);
-
return 0;
}
#endif
-static long
-madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
- unsigned long start, unsigned long end, int behavior)
-{
- switch (behavior) {
- case MADV_REMOVE:
- return madvise_remove(vma, prev, start, end);
- case MADV_WILLNEED:
- return madvise_willneed(vma, prev, start, end);
- case MADV_COLD:
- return madvise_cold(vma, prev, start, end);
- case MADV_PAGEOUT:
- return madvise_pageout(vma, prev, start, end);
- case MADV_FREE:
- case MADV_DONTNEED:
- return madvise_dontneed_free(vma, prev, start, end, behavior);
- default:
- return madvise_behavior(vma, prev, start, end, behavior);
- }
-}
-
static bool
madvise_behavior_valid(int behavior)
{
@@ -967,9 +1171,12 @@ madvise_behavior_valid(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
case MADV_FREE:
case MADV_COLD:
case MADV_PAGEOUT:
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
#ifdef CONFIG_KSM
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
@@ -977,6 +1184,7 @@ madvise_behavior_valid(int behavior)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
+ case MADV_COLLAPSE:
#endif
case MADV_DONTDUMP:
case MADV_DODUMP:
@@ -993,6 +1201,135 @@ madvise_behavior_valid(int behavior)
}
}
+static bool process_madvise_behavior_valid(int behavior)
+{
+ switch (behavior) {
+ case MADV_COLD:
+ case MADV_PAGEOUT:
+ case MADV_WILLNEED:
+ case MADV_COLLAPSE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * Walk the vmas in range [start,end), and call the visit function on each one.
+ * The visit function will get start and end parameters that cover the overlap
+ * between the current vma and the original range. Any unmapped regions in the
+ * original range will result in this function returning -ENOMEM while still
+ * calling the visit function on all of the existing vmas in the range.
+ * Must be called with the mmap_lock held for reading or writing.
+ */
+static
+int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
+ unsigned long end, unsigned long arg,
+ int (*visit)(struct vm_area_struct *vma,
+ struct vm_area_struct **prev, unsigned long start,
+ unsigned long end, unsigned long arg))
+{
+ struct vm_area_struct *vma;
+ struct vm_area_struct *prev;
+ unsigned long tmp;
+ int unmapped_error = 0;
+
+ /*
+ * If the interval [start,end) covers some unmapped address
+ * ranges, just ignore them, but return -ENOMEM at the end.
+ * - this differs from the handling in mlock() and similar calls.
+ */
+ vma = find_vma_prev(mm, start, &prev);
+ if (vma && start > vma->vm_start)
+ prev = vma;
+
+ for (;;) {
+ int error;
+
+ /* Still start < end. */
+ if (!vma)
+ return -ENOMEM;
+
+ /* Here start < (end|vma->vm_end). */
+ if (start < vma->vm_start) {
+ unmapped_error = -ENOMEM;
+ start = vma->vm_start;
+ if (start >= end)
+ break;
+ }
+
+ /* Here vma->vm_start <= start < (end|vma->vm_end) */
+ tmp = vma->vm_end;
+ if (end < tmp)
+ tmp = end;
+
+ /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+ error = visit(vma, &prev, start, tmp, arg);
+ if (error)
+ return error;
+ start = tmp;
+ if (prev && start < prev->vm_end)
+ start = prev->vm_end;
+ if (start >= end)
+ break;
+ if (prev)
+ vma = find_vma(mm, prev->vm_end);
+ else /* madvise_remove dropped mmap_lock */
+ vma = find_vma(mm, start);
+ }
+
+ return unmapped_error;
+}
+
+#ifdef CONFIG_ANON_VMA_NAME
+static int madvise_vma_anon_name(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end,
+ unsigned long anon_name)
+{
+ int error;
+
+ /* Only anonymous mappings can be named */
+ if (vma->vm_file && !vma_is_anon_shmem(vma))
+ return -EBADF;
+
+ error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
+ (struct anon_vma_name *)anon_name);
+
+ /*
+ * madvise() returns EAGAIN if kernel resources, such as
+ * slab, are temporarily unavailable.
+ */
+ if (error == -ENOMEM)
+ error = -EAGAIN;
+ return error;
+}
+
+int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
+ unsigned long len_in, struct anon_vma_name *anon_name)
+{
+ unsigned long end;
+ unsigned long len;
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+ /* Check to see whether len was rounded up from small -ve to zero */
+ if (len_in && !len)
+ return -EINVAL;
+
+ end = start + len;
+ if (end < start)
+ return -EINVAL;
+
+ if (end == start)
+ return 0;
+
+ return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
+ madvise_vma_anon_name);
+}
+#endif /* CONFIG_ANON_VMA_NAME */
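+
The madvise_walk_vmas()/madvise_update_vma() plumbing above is what backs anonymous VMA naming; from userspace the assumed entry point is prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ...) on kernels built with CONFIG_ANON_VMA_NAME. A hypothetical sketch:

/* Hypothetical userspace sketch: label an anonymous mapping. */
#include <stdio.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#ifndef PR_SET_VMA
#define PR_SET_VMA		0x53564d41
#define PR_SET_VMA_ANON_NAME	0
#endif

int main(void)
{
	size_t len = 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	/* On success the region shows up as [anon:my-buffer] in /proc/self/maps. */
	if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
		  (unsigned long)p, len, "my-buffer"))
		perror("prctl(PR_SET_VMA_ANON_NAME)");
	return 0;
}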
/*
* The madvise(2) system call.
*
@@ -1037,9 +1374,19 @@ madvise_behavior_valid(int behavior)
* MADV_NOHUGEPAGE - mark the given range as not worth being backed by
* transparent huge pages so the existing pages will not be
* coalesced into THP and new pages will not be allocated as THP.
+ * MADV_COLLAPSE - synchronously coalesce pages into new THP.
* MADV_DONTDUMP - the application wants to prevent pages in the given range
* from being included in its core dump.
* MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
+ * MADV_COLD - the application is not expected to use this memory soon,
+ * deactivate pages in this range so that they can be reclaimed
+ * easily under memory pressure.
+ * MADV_PAGEOUT - the application is not expected to use this memory soon,
+ * page out the pages in this range immediately.
+ * MADV_POPULATE_READ - populate (prefault) page tables readable by
+ * triggering read faults if required.
+ * MADV_POPULATE_WRITE - populate (prefault) page tables writable by
+ * triggering write faults if required.
*
* return values:
* zero - success
@@ -1054,36 +1401,31 @@ madvise_behavior_valid(int behavior)
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
*/
-int do_madvise(unsigned long start, size_t len_in, int behavior)
+int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
- unsigned long end, tmp;
- struct vm_area_struct *vma, *prev;
- int unmapped_error = 0;
- int error = -EINVAL;
+ unsigned long end;
+ int error;
int write;
size_t len;
struct blk_plug plug;
- start = untagged_addr(start);
-
if (!madvise_behavior_valid(behavior))
- return error;
+ return -EINVAL;
if (!PAGE_ALIGNED(start))
- return error;
+ return -EINVAL;
len = PAGE_ALIGN(len_in);
/* Check to see whether len was rounded up from small -ve to zero */
if (len_in && !len)
- return error;
+ return -EINVAL;
end = start + len;
if (end < start)
- return error;
+ return -EINVAL;
- error = 0;
if (end == start)
- return error;
+ return 0;
#ifdef CONFIG_MEMORY_FAILURE
if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
@@ -1092,84 +1434,98 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
write = madvise_need_mmap_write(behavior);
if (write) {
- if (mmap_write_lock_killable(current->mm))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
-
- /*
- * We may have stolen the mm from another process
- * that is undergoing core dumping.
- *
- * Right now that's io_ring, in the future it may
- * be remote process management and not "current"
- * at all.
- *
- * We need to fix core dumping to not do this,
- * but for now we have the mmget_still_valid()
- * model.
- */
- if (!mmget_still_valid(current->mm)) {
- mmap_write_unlock(current->mm);
- return -EINTR;
- }
} else {
- mmap_read_lock(current->mm);
+ mmap_read_lock(mm);
}
- /*
- * If the interval [start,end) covers some unmapped address
- * ranges, just ignore them, but return -ENOMEM at the end.
- * - different from the way of handling in mlock etc.
- */
- vma = find_vma_prev(current->mm, start, &prev);
- if (vma && start > vma->vm_start)
- prev = vma;
+ start = untagged_addr_remote(mm, start);
+ end = start + len;
blk_start_plug(&plug);
- for (;;) {
- /* Still start < end. */
- error = -ENOMEM;
- if (!vma)
- goto out;
-
- /* Here start < (end|vma->vm_end). */
- if (start < vma->vm_start) {
- unmapped_error = -ENOMEM;
- start = vma->vm_start;
- if (start >= end)
- goto out;
- }
-
- /* Here vma->vm_start <= start < (end|vma->vm_end) */
- tmp = vma->vm_end;
- if (end < tmp)
- tmp = end;
-
- /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
- error = madvise_vma(vma, &prev, start, tmp, behavior);
- if (error)
- goto out;
- start = tmp;
- if (prev && start < prev->vm_end)
- start = prev->vm_end;
- error = unmapped_error;
- if (start >= end)
- goto out;
- if (prev)
- vma = prev->vm_next;
- else /* madvise_remove dropped mmap_lock */
- vma = find_vma(current->mm, start);
- }
-out:
+ error = madvise_walk_vmas(mm, start, end, behavior,
+ madvise_vma_behavior);
blk_finish_plug(&plug);
if (write)
- mmap_write_unlock(current->mm);
+ mmap_write_unlock(mm);
else
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
return error;
}
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
- return do_madvise(start, len_in, behavior);
+ return do_madvise(current->mm, start, len_in, behavior);
+}
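+
For the MADV_COLD/MADV_PAGEOUT hints documented above, the calling convention is ordinary madvise(2) from the owning process; a hypothetical sketch (guarded because older headers may lack the constant):

/* Hypothetical sketch: mark a rarely used region as cold. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64 * 4096;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 0xaa, len);		/* populate the pages */
#ifdef MADV_COLD
	/* Deactivate the pages so reclaim prefers them under pressure. */
	if (madvise(buf, len, MADV_COLD))
		perror("madvise(MADV_COLD)");
#endif
	munmap(buf, len);
	return 0;
}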
+
+SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
+ size_t, vlen, int, behavior, unsigned int, flags)
+{
+ ssize_t ret;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ size_t total_len;
+ unsigned int f_flags;
+
+ if (flags != 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+ if (ret < 0)
+ goto out;
+
+ task = pidfd_get_task(pidfd, &f_flags);
+ if (IS_ERR(task)) {
+ ret = PTR_ERR(task);
+ goto free_iov;
+ }
+
+ if (!process_madvise_behavior_valid(behavior)) {
+ ret = -EINVAL;
+ goto release_task;
+ }
+
+ /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
+ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
+ if (IS_ERR_OR_NULL(mm)) {
+ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+ goto release_task;
+ }
+
+ /*
+ * Require CAP_SYS_NICE for influencing process performance. Note that
+ * only non-destructive hints are currently supported.
+ */
+ if (!capable(CAP_SYS_NICE)) {
+ ret = -EPERM;
+ goto release_mm;
+ }
+
+ total_len = iov_iter_count(&iter);
+
+ while (iov_iter_count(&iter)) {
+ ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
+ iter_iov_len(&iter), behavior);
+ if (ret < 0)
+ break;
+ iov_iter_advance(&iter, iter_iov_len(&iter));
+ }
+
+ ret = (total_len - iov_iter_count(&iter)) ? : ret;
+
+release_mm:
+ mmput(mm);
+release_task:
+ put_task_struct(task);
+free_iov:
+ kfree(iov);
+out:
+ return ret;
}
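+
The process_madvise(2) entry point above is reached via a pidfd. A hypothetical userspace sketch follows; the pid and address range are placeholders, MADV_PAGEOUT is assumed to be value 21 where the header lacks it, and SYS_pidfd_open/SYS_process_madvise are assumed to be present in <sys/syscall.h>:

/* Hypothetical sketch: ask the kernel to page out memory of a target task. */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21		/* assumed value for older headers */
#endif

int main(void)
{
	pid_t pid = 1234;			/* placeholder target pid */
	struct iovec iov = {
		.iov_base = (void *)0x7f0000000000ul,	/* placeholder range */
		.iov_len  = 2 * 1024 * 1024,
	};
	int pidfd = syscall(SYS_pidfd_open, pid, 0);

	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}
	/* Requires CAP_SYS_NICE plus PTRACE_MODE_READ access to the target. */
	if (syscall(SYS_process_madvise, pidfd, &iov, 1, MADV_PAGEOUT, 0) < 0)
		perror("process_madvise");
	close(pidfd);
	return 0;
}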
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index 2c7d03675903..a26dd8bcfcdb 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -3,6 +3,7 @@
#include <linux/hugetlb.h>
#include <linux/bitops.h>
#include <linux/mmu_notifier.h>
+#include <linux/mm_inline.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -23,7 +24,8 @@ struct wp_walk {
/**
* wp_pte - Write-protect a pte
* @pte: Pointer to the pte
- * @addr: The virtual page address
+ * @addr: The start of the virtual address range to write-protect
+ * @end: The end of the virtual address range to write-protect
* @walk: pagetable walk callback argument
*
* The function write-protects a pte and records the range in
@@ -33,7 +35,7 @@ static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct wp_walk *wpwalk = walk->private;
- pte_t ptent = *pte;
+ pte_t ptent = ptep_get(pte);
if (pte_write(ptent)) {
pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);
@@ -74,7 +76,8 @@ struct clean_walk {
* clean_record_pte - Clean a pte and record its address space offset in a
* bitmap
* @pte: Pointer to the pte
- * @addr: The virtual page address
+ * @addr: The start of the virtual address range to be cleaned
+ * @end: The end of the virtual address range to be cleaned
* @walk: pagetable walk callback argument
*
* The function cleans a pte and records the range in
@@ -88,7 +91,7 @@ static int clean_record_pte(pte_t *pte, unsigned long addr,
{
struct wp_walk *wpwalk = walk->private;
struct clean_walk *cwalk = to_clean_walk(wpwalk);
- pte_t ptent = *pte;
+ pte_t ptent = ptep_get(pte);
if (pte_dirty(ptent)) {
pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) +
@@ -123,21 +126,13 @@ static int clean_record_pte(pte_t *pte, unsigned long addr,
static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- pmd_t pmdval = pmd_read_atomic(pmd);
+ pmd_t pmdval = pmdp_get_lockless(pmd);
- if (!pmd_trans_unstable(&pmdval))
- return 0;
-
- if (pmd_none(pmdval)) {
- walk->action = ACTION_AGAIN;
- return 0;
- }
-
- /* Huge pmd, present or migrated */
- walk->action = ACTION_CONTINUE;
- if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
+ /* Do not split a huge pmd, present or migrated */
+ if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) {
WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval));
-
+ walk->action = ACTION_CONTINUE;
+ }
return 0;
}
@@ -153,21 +148,15 @@ static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
pud_t pudval = READ_ONCE(*pud);
- if (!pud_trans_unstable(&pudval))
- return 0;
-
- if (pud_none(pudval)) {
- walk->action = ACTION_AGAIN;
- return 0;
- }
-
- /* Huge pud */
- walk->action = ACTION_CONTINUE;
- if (pud_trans_huge(pudval) || pud_devmap(pudval))
+ /* Do not split a huge pud */
+ if (pud_trans_huge(pudval) || pud_devmap(pudval)) {
WARN_ON(pud_write(pudval) || pud_dirty(pudval));
-
+ walk->action = ACTION_CONTINUE;
+ }
+#endif
return 0;
}
@@ -186,7 +175,7 @@ static int wp_clean_pre_vma(unsigned long start, unsigned long end,
wpwalk->tlbflush_end = start;
mmu_notifier_range_init(&wpwalk->range, MMU_NOTIFY_PROTECTION_PAGE, 0,
- walk->vma, walk->mm, start, end);
+ walk->mm, start, end);
mmu_notifier_invalidate_range_start(&wpwalk->range);
flush_cache_range(walk->vma, start, end);
@@ -313,7 +302,7 @@ EXPORT_SYMBOL_GPL(wp_shared_mapping_range);
* pfn_mkwrite(). And then after a TLB flush following the write-protection
* pick up all dirty bits.
*
- * Note: This function currently skips transhuge page-table entries, since
+ * This function currently skips transhuge page-table entries, since
* it's intended for dirty-tracking on the PTE level. It will warn on
* encountering transhuge dirty entries, though, and can easily be extended
* to handle them as well.
diff --git a/mm/memblock.c b/mm/memblock.c
index 165f40a8a254..f9e61e565a53 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -29,6 +29,10 @@
# define INIT_MEMBLOCK_RESERVED_REGIONS INIT_MEMBLOCK_REGIONS
#endif
+#ifndef INIT_MEMBLOCK_MEMORY_REGIONS
+#define INIT_MEMBLOCK_MEMORY_REGIONS INIT_MEMBLOCK_REGIONS
+#endif
+
/**
* DOC: memblock overview
*
@@ -48,16 +52,16 @@
* boot regardless of the possible restrictions and memory hot(un)plug;
* the ``physmem`` type is only available on some architectures.
*
- * Each region is represented by :c:type:`struct memblock_region` that
+ * Each region is represented by struct memblock_region that
* defines the region extents, its attributes and NUMA node id on NUMA
- * systems. Every memory type is described by the :c:type:`struct
- * memblock_type` which contains an array of memory regions along with
+ * systems. Every memory type is described by the struct memblock_type
+ * which contains an array of memory regions along with
* the allocator metadata. The "memory" and "reserved" types are nicely
- * wrapped with :c:type:`struct memblock`. This structure is statically
+ * wrapped with struct memblock. This structure is statically
* initialized at build time. The region arrays are initially sized to
- * %INIT_MEMBLOCK_REGIONS for "memory" and %INIT_MEMBLOCK_RESERVED_REGIONS
- * for "reserved". The region array for "physmem" is initially sized to
- * %INIT_PHYSMEM_REGIONS.
+ * %INIT_MEMBLOCK_MEMORY_REGIONS for "memory" and
+ * %INIT_MEMBLOCK_RESERVED_REGIONS for "reserved". The region array
+ * for "physmem" is initially sized to %INIT_PHYSMEM_REGIONS.
* The memblock_allow_resize() enables automatic resizing of the region
* arrays during addition of new regions. This feature should be used
* with care so that memory allocated for the region array will not
@@ -92,7 +96,7 @@
* system initialization completes.
*/
-#ifndef CONFIG_NEED_MULTIPLE_NODES
+#ifndef CONFIG_NUMA
struct pglist_data __refdata contig_page_data;
EXPORT_SYMBOL(contig_page_data);
#endif
@@ -102,7 +106,7 @@ unsigned long min_low_pfn;
unsigned long max_pfn;
unsigned long long max_possible_pfn;
-static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
+static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS];
@@ -111,7 +115,7 @@ static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS
struct memblock memblock __initdata_memblock = {
.memory.regions = memblock_memory_init_regions,
.memory.cnt = 1, /* empty dummy entry */
- .memory.max = INIT_MEMBLOCK_REGIONS,
+ .memory.max = INIT_MEMBLOCK_MEMORY_REGIONS,
.memory.name = "memory",
.reserved.regions = memblock_reserved_init_regions,
@@ -152,10 +156,10 @@ static __refdata struct memblock_type *memblock_memory = &memblock.memory;
} while (0)
static int memblock_debug __initdata_memblock;
-static bool system_has_some_mirror __initdata_memblock = false;
+static bool system_has_some_mirror __initdata_memblock;
static int memblock_can_resize __initdata_memblock;
-static int memblock_memory_in_slab __initdata_memblock = 0;
-static int memblock_reserved_in_slab __initdata_memblock = 0;
+static int memblock_memory_in_slab __initdata_memblock;
+static int memblock_reserved_in_slab __initdata_memblock;
static enum memblock_flags __init_memblock choose_memblock_flags(void)
{
@@ -182,6 +186,8 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
{
unsigned long i;
+ memblock_cap_size(base, &size);
+
for (i = 0; i < type->cnt; i++)
if (memblock_addrs_overlap(base, size, type->regions[i].base,
type->regions[i].size))
@@ -275,14 +281,6 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
*
* Find @size free area aligned to @align in the specified range and node.
*
- * When allocation direction is bottom-up, the @start should be greater
- * than the end of the kernel image. Otherwise, it will be trimmed. The
- * reason is that we want the bottom-up allocation just near the kernel
- * image so it is highly likely that the allocated memory and the kernel
- * will reside in the same node.
- *
- * If bottom-up allocation failed, will try to allocate memory top-down.
- *
* Return:
* Found address on success, 0 on failure.
*/
@@ -291,50 +289,21 @@ static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
phys_addr_t end, int nid,
enum memblock_flags flags)
{
- phys_addr_t kernel_end, ret;
-
/* pump up @end */
if (end == MEMBLOCK_ALLOC_ACCESSIBLE ||
- end == MEMBLOCK_ALLOC_KASAN)
+ end == MEMBLOCK_ALLOC_NOLEAKTRACE)
end = memblock.current_limit;
/* avoid allocating the first page */
start = max_t(phys_addr_t, start, PAGE_SIZE);
end = max(start, end);
- kernel_end = __pa_symbol(_end);
-
- /*
- * try bottom-up allocation only when bottom-up mode
- * is set and @end is above the kernel image.
- */
- if (memblock_bottom_up() && end > kernel_end) {
- phys_addr_t bottom_up_start;
-
- /* make sure we will allocate above the kernel */
- bottom_up_start = max(start, kernel_end);
-
- /* ok, try bottom-up allocation first */
- ret = __memblock_find_range_bottom_up(bottom_up_start, end,
- size, align, nid, flags);
- if (ret)
- return ret;
-
- /*
- * we always limit bottom-up allocation above the kernel,
- * but top-down allocation doesn't have the limit, so
- * retrying top-down allocation may succeed when bottom-up
- * allocation failed.
- *
- * bottom-up allocation is expected to be fail very rarely,
- * so we use WARN_ONCE() here to see the stack trace if
- * fail happens.
- */
- WARN_ONCE(IS_ENABLED(CONFIG_MEMORY_HOTREMOVE),
- "memblock: bottom-up allocation failed, memory hotremove may be affected\n");
- }
- return __memblock_find_range_top_down(start, end, size, align, nid,
- flags);
+ if (memblock_bottom_up())
+ return __memblock_find_range_bottom_up(start, end, size, align,
+ nid, flags);
+ else
+ return __memblock_find_range_top_down(start, end, size, align,
+ nid, flags);
}
/**
@@ -350,7 +319,7 @@ static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
* Return:
* Found address on success, 0 on failure.
*/
-phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
+static phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
phys_addr_t align)
{
@@ -362,7 +331,7 @@ again:
NUMA_NO_NODE, flags);
if (!ret && (flags & MEMBLOCK_MIRROR)) {
- pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+ pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n",
&size);
flags &= ~MEMBLOCK_MIRROR;
goto again;
@@ -401,14 +370,20 @@ void __init memblock_discard(void)
addr = __pa(memblock.reserved.regions);
size = PAGE_ALIGN(sizeof(struct memblock_region) *
memblock.reserved.max);
- __memblock_free_late(addr, size);
+ if (memblock_reserved_in_slab)
+ kfree(memblock.reserved.regions);
+ else
+ memblock_free_late(addr, size);
}
if (memblock.memory.regions != memblock_memory_init_regions) {
addr = __pa(memblock.memory.regions);
size = PAGE_ALIGN(sizeof(struct memblock_region) *
memblock.memory.max);
- __memblock_free_late(addr, size);
+ if (memblock_memory_in_slab)
+ kfree(memblock.memory.regions);
+ else
+ memblock_free_late(addr, size);
}
memblock_memory = NULL;
@@ -507,7 +482,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
kfree(old_array);
else if (old_array != memblock_memory_init_regions &&
old_array != memblock_reserved_init_regions)
- memblock_free(__pa(old_array), old_alloc_size);
+ memblock_free(old_array, old_alloc_size);
/*
* Reserve the new array if that comes from the memblock. Otherwise, we
@@ -525,15 +500,19 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
/**
* memblock_merge_regions - merge neighboring compatible regions
* @type: memblock type to scan
- *
- * Scan @type and merge neighboring compatible regions.
+ * @start_rgn: start scanning from (@start_rgn - 1)
+ * @end_rgn: end scanning at (@end_rgn - 1)
+ *
+ * Scan @type and merge neighboring compatible regions in [@start_rgn - 1, @end_rgn)
*/
-static void __init_memblock memblock_merge_regions(struct memblock_type *type)
+static void __init_memblock memblock_merge_regions(struct memblock_type *type,
+ unsigned long start_rgn,
+ unsigned long end_rgn)
{
int i = 0;
-
- /* cnt never goes below 1 */
- while (i < type->cnt - 1) {
+ if (start_rgn)
+ i = start_rgn - 1;
+ end_rgn = min(end_rgn, type->cnt - 1);
+ while (i < end_rgn) {
struct memblock_region *this = &type->regions[i];
struct memblock_region *next = &type->regions[i + 1];
@@ -550,6 +529,7 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
/* move forward from next + 1, index of which is i + 2 */
memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next));
type->cnt--;
+ end_rgn--;
}
}
@@ -606,7 +586,7 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
bool insert = false;
phys_addr_t obase = base;
phys_addr_t end = base + memblock_cap_size(base, &size);
- int idx, nr_new;
+ int idx, nr_new, start_rgn = -1, end_rgn;
struct memblock_region *rgn;
if (!size)
@@ -622,6 +602,17 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
type->total_size = size;
return 0;
}
+
+ /*
+ * The worst case is when the new range overlaps all existing regions;
+ * then we'll need type->cnt + 1 empty regions in @type. So if
+ * type->cnt * 2 + 1 is less than or equal to type->max, we know
+ * that there are enough empty regions in @type, and we can insert
+ * regions directly.
+ */
+ if (type->cnt * 2 + 1 <= type->max)
+ insert = true;
+
repeat:
/*
* The following is executed twice. Once with %false @insert and
@@ -644,15 +635,19 @@ repeat:
* area, insert that portion.
*/
if (rbase > base) {
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
WARN_ON(nid != memblock_get_region_node(rgn));
#endif
WARN_ON(flags != rgn->flags);
nr_new++;
- if (insert)
+ if (insert) {
+ if (start_rgn == -1)
+ start_rgn = idx;
+ end_rgn = idx + 1;
memblock_insert_region(type, idx++, base,
rbase - base, nid,
flags);
+ }
}
/* area below @rend is dealt with, forget about it */
base = min(rend, end);
@@ -661,9 +656,13 @@ repeat:
/* insert the remaining portion */
if (base < end) {
nr_new++;
- if (insert)
+ if (insert) {
+ if (start_rgn == -1)
+ start_rgn = idx;
+ end_rgn = idx + 1;
memblock_insert_region(type, idx, base, end - base,
nid, flags);
+ }
}
if (!nr_new)
@@ -680,7 +679,7 @@ repeat:
insert = true;
goto repeat;
} else {
- memblock_merge_regions(type);
+ memblock_merge_regions(type, start_rgn, end_rgn);
return 0;
}
}
@@ -690,6 +689,7 @@ repeat:
* @base: base address of the new region
* @size: size of the new region
* @nid: nid of the new region
+ * @flags: flags of the new region
*
* Add new memblock region [@base, @base + @size) to the "memory"
* type. See memblock_add_range() description for mode details
@@ -698,9 +698,14 @@ repeat:
* 0 on success, -errno on failure.
*/
int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
- int nid)
+ int nid, enum memblock_flags flags)
{
- return memblock_add_range(&memblock.memory, base, size, nid, 0);
+ phys_addr_t end = base + size - 1;
+
+ memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__,
+ &base, &end, nid, flags, (void *)_RET_IP_);
+
+ return memblock_add_range(&memblock.memory, base, size, nid, flags);
}
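+
For orientation, the extra flags argument is what lets callers tag ranges at registration time, e.g. MEMBLOCK_DRIVER_MANAGED for memory a driver exposes itself. A hypothetical early-init sketch (addresses and sizes below are placeholders, not taken from this patch):

/* Hypothetical early-boot sketch of the new memblock_add_node() signature. */
#include <linux/memblock.h>
#include <linux/sizes.h>

static int __init register_example_ranges(void)
{
	/* Ordinary system RAM detected by the architecture. */
	memblock_add_node(0x80000000ULL, SZ_1G, 0, MEMBLOCK_NONE);

	/* Memory a driver manages itself; skipped by default iterators. */
	memblock_add_node(0xc0000000ULL, SZ_256M, 0, MEMBLOCK_DRIVER_MANAGED);

	return 0;
}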
/**
@@ -826,14 +831,28 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
}
/**
- * memblock_free - free boot memory block
- * @base: phys starting address of the boot memory block
+ * memblock_free - free boot memory allocation
+ * @ptr: starting address of the boot memory allocation
* @size: size of the boot memory block in bytes
*
* Free boot memory block previously allocated by memblock_alloc_xx() API.
* The freed memory will not be released to the buddy allocator.
*/
-int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
+void __init_memblock memblock_free(void *ptr, size_t size)
+{
+ if (ptr)
+ memblock_phys_free(__pa(ptr), size);
+}
+
+/**
+ * memblock_phys_free - free boot memory block
+ * @base: phys starting address of the boot memory block
+ * @size: size of the boot memory block in bytes
+ *
+ * Free boot memory block previously allocated by memblock_phys_alloc_xx() API.
+ * The freed memory will not be released to the buddy allocator.
+ */
+int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
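+
A short sketch of how the split between the virtual and physical free interfaces above is meant to be used (hypothetical init-time code, not taken from the patch):

/* Hypothetical sketch: the virtual vs. physical memblock free interfaces. */
#include <linux/memblock.h>

static void __init memblock_free_demo(void)
{
	void *buf = memblock_alloc(PAGE_SIZE, SMP_CACHE_BYTES);
	phys_addr_t pa = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);

	/* Virtual allocations are released with memblock_free()... */
	if (buf)
		memblock_free(buf, PAGE_SIZE);

	/* ...while physical allocations pair with memblock_phys_free(). */
	if (pa)
		memblock_phys_free(pa, PAGE_SIZE);
}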
@@ -871,7 +890,7 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
* @base: base address of the region
* @size: size of the region
* @set: set or clear the flag
- * @flag: the flag to udpate
+ * @flag: the flag to update
*
* This function isolates region [@base, @base + @size), and sets/clears flag
*
@@ -896,7 +915,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base,
r->flags &= ~flag;
}
- memblock_merge_regions(type);
+ memblock_merge_regions(type, start_rgn, end_rgn);
return 0;
}
@@ -933,6 +952,9 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
*/
int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
{
+ if (!mirrored_kernelcore)
+ return 0;
+
system_has_some_mirror = true;
return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR);
@@ -943,6 +965,14 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
* @base: the base phys addr of the region
* @size: the size of the region
*
+ * The memory regions marked with %MEMBLOCK_NOMAP will not be added to the
+ * direct mapping of the physical memory. These regions will still be
+ * covered by the memory map. The struct page representing NOMAP memory
+ * frames in the memory map will be PageReserved()
+ *
+ * Note: if the memory being marked %MEMBLOCK_NOMAP was allocated from
+ * memblock, the caller must inform kmemleak to ignore that memory
+ *
* Return: 0 on success, -errno on failure.
*/
int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
@@ -977,7 +1007,8 @@ static bool should_skip_region(struct memblock_type *type,
return true;
/* skip hotpluggable memory regions if needed */
- if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
+ if (movable_node_is_enabled() && memblock_is_hotpluggable(m) &&
+ !(flags & MEMBLOCK_HOTPLUG))
return true;
/* if we want mirror memory skip non-mirror memory regions */
@@ -988,6 +1019,10 @@ static bool should_skip_region(struct memblock_type *type,
if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m))
return true;
+ /* skip driver-managed memory unless we were asked for it explicitly */
+ if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m))
+ return true;
+
return false;
}
@@ -1242,7 +1277,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
struct memblock_type *type, int nid)
{
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
int start_rgn, end_rgn;
int i, ret;
@@ -1253,7 +1288,7 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
for (i = start_rgn; i < end_rgn; i++)
memblock_set_region_node(&type->regions[i], nid);
- memblock_merge_regions(type);
+ memblock_merge_regions(type, start_rgn, end_rgn);
#endif
return 0;
}
@@ -1280,11 +1315,10 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
{
int zone_nid = zone_to_nid(zone);
phys_addr_t spa, epa;
- int nid;
__next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
&memblock.memory, &memblock.reserved,
- &spa, &epa, &nid);
+ &spa, &epa, NULL);
while (*idx != U64_MAX) {
unsigned long epfn = PFN_DOWN(epa);
@@ -1311,7 +1345,7 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
__next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
&memblock.memory, &memblock.reserved,
- &spa, &epa, &nid);
+ &spa, &epa, NULL);
}
/* signal end of iteration */
@@ -1342,8 +1376,8 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
* from the regions with mirroring enabled and then retried from any
* memory region.
*
- * In addition, function sets the min_count to 0 using kmemleak_alloc_phys for
- * allocated boot memory block, so that it is never reported as leaks.
+ * In addition, the function calls kmemleak_alloc_phys() for the allocated
+ * boot memory block, so it is never reported as a leak.
*
* Return:
* Physical address of allocated memory block on success, %0 on failure.
@@ -1381,7 +1415,7 @@ again:
if (flags & MEMBLOCK_MIRROR) {
flags &= ~MEMBLOCK_MIRROR;
- pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+ pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n",
&size);
goto again;
}
@@ -1389,15 +1423,27 @@ again:
return 0;
done:
- /* Skip kmemleak for kasan_init() due to high volume. */
- if (end != MEMBLOCK_ALLOC_KASAN)
+ /*
+ * Skip kmemleak for those places like kasan_init() and
+ * early_pgtable_alloc() due to high volume.
+ */
+ if (end != MEMBLOCK_ALLOC_NOLEAKTRACE)
/*
- * The min_count is set to 0 so that memblock allocated
- * blocks are never reported as leaks. This is because many
- * of these blocks are only referred via the physical
- * address which is not looked up by kmemleak.
+ * Memblock allocated blocks are never reported as
+ * leaks. This is because many of these blocks are
+ * only referred via the physical address which is
+ * not looked up by kmemleak.
*/
- kmemleak_alloc_phys(found, size, 0, 0);
+ kmemleak_alloc_phys(found, size, 0);
+
+ /*
+ * Some Virtual Machine platforms, such as Intel TDX or AMD SEV-SNP,
+ * require memory to be accepted before it can be used by the
+ * guest.
+ *
+ * Accept the memory of the allocated buffer.
+ */
+ accept_memory(found, found + size);
return found;
}
@@ -1419,12 +1465,15 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size,
phys_addr_t start,
phys_addr_t end)
{
+ memblock_dbg("%s: %llu bytes align=0x%llx from=%pa max_addr=%pa %pS\n",
+ __func__, (u64)size, (u64)align, &start, &end,
+ (void *)_RET_IP_);
return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
false);
}
/**
- * memblock_phys_alloc_try_nid - allocate a memory block from specified MUMA node
+ * memblock_phys_alloc_try_nid - allocate a memory block from specified NUMA node
* @size: size of memory block to be allocated in bytes
* @align: alignment of the region and block's size
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
@@ -1517,18 +1566,12 @@ void * __init memblock_alloc_exact_nid_raw(
phys_addr_t min_addr, phys_addr_t max_addr,
int nid)
{
- void *ptr;
-
memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
__func__, (u64)size, (u64)align, nid, &min_addr,
&max_addr, (void *)_RET_IP_);
- ptr = memblock_alloc_internal(size, align,
- min_addr, max_addr, nid, true);
- if (ptr && size > 0)
- page_init_poison(ptr, size);
-
- return ptr;
+ return memblock_alloc_internal(size, align, min_addr, max_addr, nid,
+ true);
}
/**
@@ -1555,18 +1598,12 @@ void * __init memblock_alloc_try_nid_raw(
phys_addr_t min_addr, phys_addr_t max_addr,
int nid)
{
- void *ptr;
-
memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
__func__, (u64)size, (u64)align, nid, &min_addr,
&max_addr, (void *)_RET_IP_);
- ptr = memblock_alloc_internal(size, align,
- min_addr, max_addr, nid, false);
- if (ptr && size > 0)
- page_init_poison(ptr, size);
-
- return ptr;
+ return memblock_alloc_internal(size, align, min_addr, max_addr, nid,
+ false);
}
/**
@@ -1605,7 +1642,7 @@ void * __init memblock_alloc_try_nid(
}
/**
- * __memblock_free_late - free pages directly to buddy allocator
+ * memblock_free_late - free pages directly to buddy allocator
* @base: phys starting address of the boot memory block
* @size: size of the boot memory block in bytes
*
@@ -1613,7 +1650,7 @@ void * __init memblock_alloc_try_nid(
* down, but we are still initializing the system. Pages are released directly
* to the buddy allocator.
*/
-void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
+void __init memblock_free_late(phys_addr_t base, phys_addr_t size)
{
phys_addr_t cursor, end;
@@ -1706,6 +1743,11 @@ void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size)
if (!size)
return;
+ if (!memblock_memory->total_size) {
+ pr_warn("%s: No memory registered yet\n", __func__);
+ return;
+ }
+
ret = memblock_isolate_range(&memblock.memory, base, size,
&start_rgn, &end_rgn);
if (ret)
@@ -1828,7 +1870,6 @@ bool __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t siz
*/
bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
{
- memblock_cap_size(base, &size);
return memblock_overlaps_region(&memblock.reserved, base, size);
}
@@ -1883,7 +1924,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
size = rgn->size;
end = base + size - 1;
flags = rgn->flags;
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
if (memblock_get_region_node(rgn) != MAX_NUMNODES)
snprintf(nid_buf, sizeof(nid_buf), " on node %d",
memblock_get_region_node(rgn));
@@ -1926,12 +1967,101 @@ static int __init early_memblock(char *p)
}
early_param("memblock", early_memblock);
+static void __init free_memmap(unsigned long start_pfn, unsigned long end_pfn)
+{
+ struct page *start_pg, *end_pg;
+ phys_addr_t pg, pgend;
+
+ /*
+ * Convert start_pfn/end_pfn to a struct page pointer.
+ */
+ start_pg = pfn_to_page(start_pfn - 1) + 1;
+ end_pg = pfn_to_page(end_pfn - 1) + 1;
+
+ /*
+ * Convert to physical addresses, and round start upwards and end
+ * downwards.
+ */
+ pg = PAGE_ALIGN(__pa(start_pg));
+ pgend = __pa(end_pg) & PAGE_MASK;
+
+ /*
+ * If there are free pages between these, free the section of the
+ * memmap array.
+ */
+ if (pg < pgend)
+ memblock_phys_free(pg, pgend - pg);
+}
+
+/*
+ * The mem_map array can get very big. Free the unused area of the memory map.
+ */
+static void __init free_unused_memmap(void)
+{
+ unsigned long start, end, prev_end = 0;
+ int i;
+
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) ||
+ IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
+ return;
+
+ /*
+ * This relies on each bank being in address order.
+ * The banks are sorted previously in bootmem_init().
+ */
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) {
+#ifdef CONFIG_SPARSEMEM
+ /*
+ * Take care not to free memmap entries that don't exist
+ * due to SPARSEMEM sections which aren't present.
+ */
+ start = min(start, ALIGN(prev_end, PAGES_PER_SECTION));
+#endif
+ /*
+ * Align down here since many operations in VM subsystem
+ * presume that there are no holes in the memory map inside
+ * a pageblock
+ */
+ start = pageblock_start_pfn(start);
+
+ /*
+ * If we had a previous bank, and there is a space
+ * between the current bank and the previous, free it.
+ */
+ if (prev_end && prev_end < start)
+ free_memmap(prev_end, start);
+
+ /*
+ * Align up here since many operations in VM subsystem
+ * presume that there are no holes in the memory map inside
+ * a pageblock
+ */
+ prev_end = pageblock_align(end);
+ }
+
+#ifdef CONFIG_SPARSEMEM
+ if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION)) {
+ prev_end = pageblock_align(end);
+ free_memmap(prev_end, ALIGN(prev_end, PAGES_PER_SECTION));
+ }
+#endif
+}
+
static void __init __free_pages_memory(unsigned long start, unsigned long end)
{
int order;
while (start < end) {
- order = min(MAX_ORDER - 1UL, __ffs(start));
+ /*
+ * Free the pages in the largest chunks alignment allows.
+ *
+ * __ffs() behaviour is undefined for 0; since start == 0 is
+ * MAX_ORDER-aligned, set order to MAX_ORDER in that case.
+ */
+ if (start)
+ order = min_t(int, MAX_ORDER, __ffs(start));
+ else
+ order = MAX_ORDER;
while (start + (1UL << order) > end)
order--;
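+
To make the order computation above concrete: __ffs(start) gives the largest power-of-two alignment of the start pfn, which the loop then caps at MAX_ORDER and shrinks until the chunk fits before end. A hypothetical userspace illustration (__ffs() behaves like __builtin_ctzl() for non-zero values):

/* Hypothetical illustration of picking the largest aligned chunk. */
#include <stdio.h>

int main(void)
{
	unsigned long start = 0x2300;		/* example start pfn */
	int order = __builtin_ctzl(start);	/* lowest set bit: 8 */

	printf("pfn %#lx allows freeing a 2^%d = %lu page chunk\n",
	       start, order, 1UL << order);
	return 0;
}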
@@ -1957,6 +2087,37 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
return end_pfn - start_pfn;
}
+static void __init memmap_init_reserved_pages(void)
+{
+ struct memblock_region *region;
+ phys_addr_t start, end;
+ int nid;
+
+ /*
+ * set nid on all reserved pages and also treat struct
+ * pages for the NOMAP regions as PageReserved
+ */
+ for_each_mem_region(region) {
+ nid = memblock_get_region_node(region);
+ start = region->base;
+ end = start + region->size;
+
+ if (memblock_is_nomap(region))
+ reserve_bootmem_region(start, end, nid);
+
+ memblock_set_node(start, end, &memblock.reserved, nid);
+ }
+
+ /* initialize struct pages for the reserved regions */
+ for_each_reserved_mem_region(region) {
+ nid = memblock_get_region_node(region);
+ start = region->base;
+ end = start + region->size;
+
+ reserve_bootmem_region(start, end, nid);
+ }
+}
+
static unsigned long __init free_low_memory_core_early(void)
{
unsigned long count = 0;
@@ -1965,8 +2126,7 @@ static unsigned long __init free_low_memory_core_early(void)
memblock_clear_hotplug(0, -1);
- for_each_reserved_mem_range(i, &start, &end)
- reserve_bootmem_region(start, end);
+ memmap_init_reserved_pages();
/*
* We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
@@ -1982,7 +2142,7 @@ static unsigned long __init free_low_memory_core_early(void)
static int reset_managed_pages_done __initdata;
-void reset_node_managed_pages(pg_data_t *pgdat)
+static void __init reset_node_managed_pages(pg_data_t *pgdat)
{
struct zone *z;
@@ -2005,36 +2165,57 @@ void __init reset_all_zones_managed_pages(void)
/**
* memblock_free_all - release free pages to the buddy allocator
- *
- * Return: the number of pages actually released.
*/
-unsigned long __init memblock_free_all(void)
+void __init memblock_free_all(void)
{
unsigned long pages;
+ free_unused_memmap();
reset_all_zones_managed_pages();
pages = free_low_memory_core_early();
totalram_pages_add(pages);
-
- return pages;
}
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK)
+static const char * const flagname[] = {
+ [ilog2(MEMBLOCK_HOTPLUG)] = "HOTPLUG",
+ [ilog2(MEMBLOCK_MIRROR)] = "MIRROR",
+ [ilog2(MEMBLOCK_NOMAP)] = "NOMAP",
+ [ilog2(MEMBLOCK_DRIVER_MANAGED)] = "DRV_MNG",
+};
static int memblock_debug_show(struct seq_file *m, void *private)
{
struct memblock_type *type = m->private;
struct memblock_region *reg;
- int i;
+ int i, j, nid;
+ unsigned int count = ARRAY_SIZE(flagname);
phys_addr_t end;
for (i = 0; i < type->cnt; i++) {
reg = &type->regions[i];
end = reg->base + reg->size - 1;
+ nid = memblock_get_region_node(reg);
seq_printf(m, "%4d: ", i);
- seq_printf(m, "%pa..%pa\n", &reg->base, &end);
+ seq_printf(m, "%pa..%pa ", &reg->base, &end);
+ if (nid != MAX_NUMNODES)
+ seq_printf(m, "%4d ", nid);
+ else
+ seq_printf(m, "%4c ", 'x');
+ if (reg->flags) {
+ for (j = 0; j < count; j++) {
+ if (reg->flags & (1U << j)) {
+ seq_printf(m, "%s\n", flagname[j]);
+ break;
+ }
+ }
+ if (j == count)
+ seq_printf(m, "%s\n", "UNKNOWN");
+ } else {
+ seq_printf(m, "%s\n", "NONE");
+ }
}
return 0;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b98911c88bab..51507e514636 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -20,6 +20,9 @@
* Lockless page tracking & accounting
* Unified hierarchy configuration model
* Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
+ *
+ * Per memcg lru locking
+ * Copyright (C) 2020 Alibaba, Inc, Alex Shi
*/
#include <linux/page_counter.h>
@@ -50,19 +53,22 @@
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
+#include <linux/memremap.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
+#include <linux/sched/isolation.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"
+#include "swap.h"
#include <linux/uaccess.h>
@@ -73,18 +79,18 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
struct mem_cgroup *root_mem_cgroup __read_mostly;
+/* Active memory cgroup to use from an interrupt context */
+DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
+EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
+
/* Socket memory accounting disabled? */
-static bool cgroup_memory_nosocket;
+static bool cgroup_memory_nosocket __ro_after_init;
/* Kernel memory accounting disabled? */
-static bool cgroup_memory_nokmem;
+static bool cgroup_memory_nokmem __ro_after_init;
-/* Whether the swap controller is active */
-#ifdef CONFIG_MEMCG_SWAP
-bool cgroup_memory_noswap __read_mostly;
-#else
-#define cgroup_memory_noswap 1
-#endif
+/* BPF memory accounting disabled? */
+static bool cgroup_memory_nobpf __ro_after_init;
#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
@@ -93,7 +99,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
- return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
+ return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
}
#define THRESHOLDS_EVENTS_TARGET 128
@@ -201,7 +207,6 @@ static struct move_charge_struct {
enum res_type {
_MEM,
_MEMSWAP,
- _OOM_TYPE,
_KMEM,
_TCP,
};
@@ -209,8 +214,6 @@ enum res_type {
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
-/* Used for OOM nofiier */
-#define OOM_CONTROL (0)
/*
* Iteration constructs for visiting all cgroups (under a tree). If
@@ -227,7 +230,7 @@ enum res_type {
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
-static inline bool should_force_charge(void)
+static inline bool task_is_dying(void)
{
return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
(current->flags & PF_EXITING);
@@ -241,18 +244,25 @@ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
return &memcg->vmpressure;
}
-struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
+struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
{
- return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
+ return container_of(vmpr, struct mem_cgroup, vmpressure);
}
#ifdef CONFIG_MEMCG_KMEM
-extern spinlock_t css_set_lock;
+static DEFINE_SPINLOCK(objcg_lock);
+
+bool mem_cgroup_kmem_disabled(void)
+{
+ return cgroup_memory_nokmem;
+}
+
+static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
+ unsigned int nr_pages);
static void obj_cgroup_release(struct percpu_ref *ref)
{
struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
- struct mem_cgroup *memcg;
unsigned int nr_bytes;
unsigned int nr_pages;
unsigned long flags;
@@ -281,13 +291,12 @@ static void obj_cgroup_release(struct percpu_ref *ref)
WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
nr_pages = nr_bytes >> PAGE_SHIFT;
- spin_lock_irqsave(&css_set_lock, flags);
- memcg = obj_cgroup_memcg(objcg);
if (nr_pages)
- __memcg_kmem_uncharge(memcg, nr_pages);
+ obj_cgroup_uncharge_pages(objcg, nr_pages);
+
+ spin_lock_irqsave(&objcg_lock, flags);
list_del(&objcg->list);
- mem_cgroup_put(memcg);
- spin_unlock_irqrestore(&css_set_lock, flags);
+ spin_unlock_irqrestore(&objcg_lock, flags);
percpu_ref_exit(ref);
kfree_rcu(objcg, rcu);
@@ -319,218 +328,48 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
- spin_lock_irq(&css_set_lock);
-
- /* Move active objcg to the parent's list */
- xchg(&objcg->memcg, parent);
- css_get(&parent->css);
- list_add(&objcg->list, &parent->objcg_list);
+ spin_lock_irq(&objcg_lock);
- /* Move already reparented objcgs to the parent's list */
- list_for_each_entry(iter, &memcg->objcg_list, list) {
- css_get(&parent->css);
- xchg(&iter->memcg, parent);
- css_put(&memcg->css);
- }
+ /* 1) Ready to reparent active objcg. */
+ list_add(&objcg->list, &memcg->objcg_list);
+ /* 2) Reparent active objcg and already reparented objcgs to parent. */
+ list_for_each_entry(iter, &memcg->objcg_list, list)
+ WRITE_ONCE(iter->memcg, parent);
+ /* 3) Move already reparented objcgs to the parent's list */
list_splice(&memcg->objcg_list, &parent->objcg_list);
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irq(&objcg_lock);
percpu_ref_kill(&objcg->refcnt);
}
/*
- * This will be used as a shrinker list's index.
- * The main reason for not using cgroup id for this:
- * this works better in sparse environments, where we have a lot of memcgs,
- * but only a few kmem-limited. Or also, if we have, for instance, 200
- * memcgs, and none but the 200th is kmem-limited, we'd have to have a
- * 200 entry array for that.
- *
- * The current size of the caches array is stored in memcg_nr_cache_ids. It
- * will double each time we have to increase it.
- */
-static DEFINE_IDA(memcg_cache_ida);
-int memcg_nr_cache_ids;
-
-/* Protects memcg_nr_cache_ids */
-static DECLARE_RWSEM(memcg_cache_ids_sem);
-
-void memcg_get_cache_ids(void)
-{
- down_read(&memcg_cache_ids_sem);
-}
-
-void memcg_put_cache_ids(void)
-{
- up_read(&memcg_cache_ids_sem);
-}
-
-/*
- * MIN_SIZE is different than 1, because we would like to avoid going through
- * the alloc/free process all the time. In a small machine, 4 kmem-limited
- * cgroups is a reasonable guess. In the future, it could be a parameter or
- * tunable, but that is strictly not necessary.
- *
- * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
- * this constant directly from cgroup, but it is understandable that this is
- * better kept as an internal representation in cgroup.c. In any case, the
- * cgrp_id space is not getting any smaller, and we don't have to necessarily
- * increase ours as well if it increases.
- */
-#define MEMCG_CACHES_MIN_SIZE 4
-#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
-
-/*
* A lot of the calls to the cache allocation functions are expected to be
* inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
* conditional to this static branch, we'll have to allow modules that does
* kmem_cache_alloc and the such to see this symbol as well
*/
-DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
-EXPORT_SYMBOL(memcg_kmem_enabled_key);
-#endif
-
-static int memcg_shrinker_map_size;
-static DEFINE_MUTEX(memcg_shrinker_map_mutex);
-
-static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
-{
- kvfree(container_of(head, struct memcg_shrinker_map, rcu));
-}
-
-static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
- int size, int old_size)
-{
- struct memcg_shrinker_map *new, *old;
- int nid;
-
- lockdep_assert_held(&memcg_shrinker_map_mutex);
-
- for_each_node(nid) {
- old = rcu_dereference_protected(
- mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
- /* Not yet online memcg */
- if (!old)
- return 0;
-
- new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
- if (!new)
- return -ENOMEM;
-
- /* Set all old bits, clear all new bits */
- memset(new->map, (int)0xff, old_size);
- memset((void *)new->map + old_size, 0, size - old_size);
-
- rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
- call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
- }
-
- return 0;
-}
-
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
-{
- struct mem_cgroup_per_node *pn;
- struct memcg_shrinker_map *map;
- int nid;
-
- if (mem_cgroup_is_root(memcg))
- return;
-
- for_each_node(nid) {
- pn = mem_cgroup_nodeinfo(memcg, nid);
- map = rcu_dereference_protected(pn->shrinker_map, true);
- if (map)
- kvfree(map);
- rcu_assign_pointer(pn->shrinker_map, NULL);
- }
-}
-
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
- struct memcg_shrinker_map *map;
- int nid, size, ret = 0;
-
- if (mem_cgroup_is_root(memcg))
- return 0;
-
- mutex_lock(&memcg_shrinker_map_mutex);
- size = memcg_shrinker_map_size;
- for_each_node(nid) {
- map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
- if (!map) {
- memcg_free_shrinker_maps(memcg);
- ret = -ENOMEM;
- break;
- }
- rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
- }
- mutex_unlock(&memcg_shrinker_map_mutex);
-
- return ret;
-}
-
-int memcg_expand_shrinker_maps(int new_id)
-{
- int size, old_size, ret = 0;
- struct mem_cgroup *memcg;
+DEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key);
+EXPORT_SYMBOL(memcg_kmem_online_key);
- size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
- old_size = memcg_shrinker_map_size;
- if (size <= old_size)
- return 0;
-
- mutex_lock(&memcg_shrinker_map_mutex);
- if (!root_mem_cgroup)
- goto unlock;
-
- for_each_mem_cgroup(memcg) {
- if (mem_cgroup_is_root(memcg))
- continue;
- ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
- if (ret) {
- mem_cgroup_iter_break(NULL, memcg);
- goto unlock;
- }
- }
-unlock:
- if (!ret)
- memcg_shrinker_map_size = size;
- mutex_unlock(&memcg_shrinker_map_mutex);
- return ret;
-}
-
-void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
-{
- if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
- struct memcg_shrinker_map *map;
-
- rcu_read_lock();
- map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
- /* Pairs with smp mb in shrink_slab() */
- smp_mb__before_atomic();
- set_bit(shrinker_id, map->map);
- rcu_read_unlock();
- }
-}
+DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key);
+EXPORT_SYMBOL(memcg_bpf_enabled_key);
+#endif
/**
- * mem_cgroup_css_from_page - css of the memcg associated with a page
- * @page: page of interest
+ * mem_cgroup_css_from_folio - css of the memcg associated with a folio
+ * @folio: folio of interest
*
* If memcg is bound to the default hierarchy, css of the memcg associated
- * with @page is returned. The returned css remains associated with @page
+ * with @folio is returned. The returned css remains associated with @folio
* until it is released.
*
* If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
* is returned.
*/
-struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
+struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio)
{
- struct mem_cgroup *memcg;
-
- memcg = page->mem_cgroup;
+ struct mem_cgroup *memcg = folio_memcg(folio);
if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
memcg = root_mem_cgroup;
@@ -557,16 +396,8 @@ ino_t page_cgroup_ino(struct page *page)
unsigned long ino = 0;
rcu_read_lock();
- memcg = page->mem_cgroup;
-
- /*
- * The lowest bit set means that memcg isn't a valid
- * memcg pointer, but a obj_cgroups pointer.
- * In this case the page is shared and doesn't belong
- * to any specific memory cgroup.
- */
- if ((unsigned long) memcg & 0x1UL)
- memcg = NULL;
+ /* page_folio() is racy here, but the entire function is racy anyway */
+ memcg = folio_memcg_check(page_folio(page));
while (memcg && !(memcg->css.flags & CSS_ONLINE))
memcg = parent_mem_cgroup(memcg);
@@ -576,28 +407,6 @@ ino_t page_cgroup_ino(struct page *page)
return ino;
}
-static struct mem_cgroup_per_node *
-mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
-{
- int nid = page_to_nid(page);
-
- return memcg->nodeinfo[nid];
-}
-
-static struct mem_cgroup_tree_per_node *
-soft_limit_tree_node(int nid)
-{
- return soft_limit_tree.rb_tree_per_node[nid];
-}
-
-static struct mem_cgroup_tree_per_node *
-soft_limit_tree_from_page(struct page *page)
-{
- int nid = page_to_nid(page);
-
- return soft_limit_tree.rb_tree_per_node[nid];
-}
-
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
struct mem_cgroup_tree_per_node *mctz,
unsigned long new_usage_in_excess)
@@ -620,14 +429,9 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
if (mz->usage_in_excess < mz_node->usage_in_excess) {
p = &(*p)->rb_left;
rightmost = false;
- }
-
- /*
- * We can't avoid mem cgroups that are over their soft
- * limit by the same amount
- */
- else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+ } else {
p = &(*p)->rb_right;
+ }
}
if (rightmost)
@@ -673,13 +477,19 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
return excess;
}
-static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
{
unsigned long excess;
struct mem_cgroup_per_node *mz;
struct mem_cgroup_tree_per_node *mctz;
- mctz = soft_limit_tree_from_page(page);
+ if (lru_gen_enabled()) {
+ if (soft_limit_excess(memcg))
+ lru_gen_soft_reclaim(memcg, nid);
+ return;
+ }
+
+ mctz = soft_limit_tree.rb_tree_per_node[nid];
if (!mctz)
return;
/*
@@ -687,7 +497,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
* because their event counter is not touched.
*/
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
- mz = mem_cgroup_page_nodeinfo(memcg, page);
+ mz = memcg->nodeinfo[nid];
excess = soft_limit_excess(memcg);
/*
* We have to update the tree if mz is on RB-tree or
@@ -717,8 +527,8 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
int nid;
for_each_node(nid) {
- mz = mem_cgroup_nodeinfo(memcg, nid);
- mctz = soft_limit_tree_node(nid);
+ mz = memcg->nodeinfo[nid];
+ mctz = soft_limit_tree.rb_tree_per_node[nid];
if (mctz)
mem_cgroup_remove_exceeded(mz, mctz);
}
@@ -760,6 +570,193 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
return mz;
}
+/*
+ * memcg and lruvec stats flushing
+ *
+ * Many codepaths leading to stats updates or reads are performance sensitive,
+ * and adding stats flushing in such codepaths is not desirable. So, to
+ * optimize the flushing, the kernel does the following:
+ *
+ * 1) Periodically and asynchronously flush the stats every 2 seconds so that
+ * the rstat update tree does not grow unbounded.
+ *
+ * 2) Flush the stats synchronously on the reader side only when there are more
+ * than (MEMCG_CHARGE_BATCH * nr_cpus) update events. This optimization can let
+ * the stats be out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus), but only
+ * for 2 seconds due to (1).
+ */
+static void flush_memcg_stats_dwork(struct work_struct *w);
+static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
+static DEFINE_PER_CPU(unsigned int, stats_updates);
+static atomic_t stats_flush_ongoing = ATOMIC_INIT(0);
+static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
+static u64 flush_next_time;
+
+#define FLUSH_TIME (2UL*HZ)
+
+/*
+ * Accessors to ensure that preemption is disabled on PREEMPT_RT, because the
+ * code cannot rely on an acquired spinlock_t to disable it. These functions
+ * are never used in hardirq context on PREEMPT_RT and therefore disabling
+ * preemption is sufficient.
+ */
+static void memcg_stats_lock(void)
+{
+ preempt_disable_nested();
+ VM_WARN_ON_IRQS_ENABLED();
+}
+
+static void __memcg_stats_lock(void)
+{
+ preempt_disable_nested();
+}
+
+static void memcg_stats_unlock(void)
+{
+ preempt_enable_nested();
+}
+
+static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
+{
+ unsigned int x;
+
+ if (!val)
+ return;
+
+ cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+
+ x = __this_cpu_add_return(stats_updates, abs(val));
+ if (x > MEMCG_CHARGE_BATCH) {
+ /*
+ * If stats_flush_threshold already exceeds the threshold
+ * (> num_online_cpus()), the stats will be flushed by
+ * mem_cgroup_flush_stats(). Increasing this var further is
+ * redundant and simply adds overhead in the atomic update.
+ */
+ if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
+ atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
+ __this_cpu_write(stats_updates, 0);
+ }
+}
+
+static void do_flush_stats(void)
+{
+ /*
+ * We always flush the entire tree, so concurrent flushers can just
+ * skip. This avoids a thundering herd problem on the rstat global lock
+ * from memcg flushers (e.g. reclaim, refault, etc).
+ */
+ if (atomic_read(&stats_flush_ongoing) ||
+ atomic_xchg(&stats_flush_ongoing, 1))
+ return;
+
+ WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
+
+ cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
+
+ atomic_set(&stats_flush_threshold, 0);
+ atomic_set(&stats_flush_ongoing, 0);
+}
+
+void mem_cgroup_flush_stats(void)
+{
+ if (atomic_read(&stats_flush_threshold) > num_online_cpus())
+ do_flush_stats();
+}
+
+void mem_cgroup_flush_stats_ratelimited(void)
+{
+ if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
+ mem_cgroup_flush_stats();
+}
+
+static void flush_memcg_stats_dwork(struct work_struct *w)
+{
+ /*
+ * Always flush here so that flushing in latency-sensitive paths is
+ * as cheap as possible.
+ */
+ do_flush_stats();
+ queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
+}
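
As a concrete illustration of the flushing policy above, here is a minimal userspace C model (not part of the patch): per-CPU update counts feed a global threshold counter in units of MEMCG_CHARGE_BATCH, readers flush only once that counter exceeds the number of CPUs, and the periodic worker keeps staleness bounded. The batch size, CPU count and update pattern are assumptions of the sketch.

    #include <stdio.h>
    #include <stdlib.h>

    #define MEMCG_CHARGE_BATCH 64   /* assumed batch size for the sketch */
    #define NR_CPUS 4               /* stands in for num_online_cpus() */

    static unsigned int stats_updates[NR_CPUS]; /* models the per-CPU counter */
    static int stats_flush_threshold;           /* models the global atomic   */

    /* Mirrors the bookkeeping in memcg_rstat_updated(). */
    static void rstat_updated(int cpu, int val)
    {
            unsigned int x;

            if (!val)
                    return;

            x = stats_updates[cpu] += abs(val);
            if (x > MEMCG_CHARGE_BATCH) {
                    if (stats_flush_threshold <= NR_CPUS)
                            stats_flush_threshold += x / MEMCG_CHARGE_BATCH;
                    stats_updates[cpu] = 0;
            }
    }

    /* Mirrors the reader-side check in mem_cgroup_flush_stats(). */
    static void maybe_flush(void)
    {
            if (stats_flush_threshold > NR_CPUS) {
                    printf("flush (threshold=%d)\n", stats_flush_threshold);
                    stats_flush_threshold = 0;  /* as do_flush_stats() does */
            } else {
                    printf("skip  (threshold=%d)\n", stats_flush_threshold);
            }
    }

    int main(void)
    {
            for (int i = 0; i < 1000; i++)
                    rstat_updated(i % NR_CPUS, 1);
            maybe_flush();          /* flushes: every CPU crossed the batch size */

            rstat_updated(0, 10);
            maybe_flush();          /* skips: threshold is still <= NR_CPUS */
            return 0;
    }
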
+
+/* Subset of vm_event_item to report for memcg event stats */
+static const unsigned int memcg_vm_event_stat[] = {
+ PGPGIN,
+ PGPGOUT,
+ PGSCAN_KSWAPD,
+ PGSCAN_DIRECT,
+ PGSCAN_KHUGEPAGED,
+ PGSTEAL_KSWAPD,
+ PGSTEAL_DIRECT,
+ PGSTEAL_KHUGEPAGED,
+ PGFAULT,
+ PGMAJFAULT,
+ PGREFILL,
+ PGACTIVATE,
+ PGDEACTIVATE,
+ PGLAZYFREE,
+ PGLAZYFREED,
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+ ZSWPIN,
+ ZSWPOUT,
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ THP_FAULT_ALLOC,
+ THP_COLLAPSE_ALLOC,
+#endif
+};
+
+#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
+static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
+
+static void init_memcg_events(void)
+{
+ int i;
+
+ for (i = 0; i < NR_MEMCG_EVENTS; ++i)
+ mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1;
+}
+
+static inline int memcg_events_index(enum vm_event_item idx)
+{
+ return mem_cgroup_events_index[idx] - 1;
+}
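
The i + 1 stored by init_memcg_events() and the - 1 in memcg_events_index() pack the sparse vm_event_item space into a dense per-memcg array while letting zero mean "not tracked". A tiny standalone C sketch of the same mapping trick (the item numbers and array sizes are invented for illustration):

    #include <stdio.h>

    #define NR_ITEMS 8      /* stands in for NR_VM_EVENT_ITEMS */

    /* Sparse set of tracked items, like memcg_vm_event_stat[]. */
    static const unsigned int tracked[] = { 2, 5, 7 };
    static int index_map[NR_ITEMS];     /* 0 means "not tracked" */

    static void init_events(void)
    {
            for (unsigned int i = 0; i < sizeof(tracked) / sizeof(tracked[0]); i++)
                    index_map[tracked[i]] = i + 1;
    }

    static int events_index(unsigned int item)
    {
            return index_map[item] - 1; /* -1 for untracked items */
    }

    int main(void)
    {
            init_events();
            printf("item 5 -> dense slot %d\n", events_index(5)); /* 1  */
            printf("item 3 -> dense slot %d\n", events_index(3)); /* -1 */
            return 0;
    }
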
+
+struct memcg_vmstats_percpu {
+ /* Local (CPU and cgroup) page state & events */
+ long state[MEMCG_NR_STAT];
+ unsigned long events[NR_MEMCG_EVENTS];
+
+ /* Delta calculation for lockless upward propagation */
+ long state_prev[MEMCG_NR_STAT];
+ unsigned long events_prev[NR_MEMCG_EVENTS];
+
+ /* Cgroup1: threshold notifications & softlimit tree updates */
+ unsigned long nr_page_events;
+ unsigned long targets[MEM_CGROUP_NTARGETS];
+};
+
+struct memcg_vmstats {
+ /* Aggregated (CPU and subtree) page state & events */
+ long state[MEMCG_NR_STAT];
+ unsigned long events[NR_MEMCG_EVENTS];
+
+ /* Pending child counts during tree propagation */
+ long state_pending[MEMCG_NR_STAT];
+ unsigned long events_pending[NR_MEMCG_EVENTS];
+};
+
+unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
+{
+ long x = READ_ONCE(memcg->vmstats->state[idx]);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
+}
+
/**
* __mod_memcg_state - update cgroup memory statistics
* @memcg: the memory cgroup
@@ -768,39 +765,26 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
*/
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
- long x, threshold = MEMCG_CHARGE_BATCH;
-
if (mem_cgroup_disabled())
return;
- if (memcg_stat_item_in_bytes(idx))
- threshold <<= PAGE_SHIFT;
-
- x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
- if (unlikely(abs(x) > threshold)) {
- struct mem_cgroup *mi;
-
- /*
- * Batch local counters to keep them in sync with
- * the hierarchical ones.
- */
- __this_cpu_add(memcg->vmstats_local->stat[idx], x);
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- atomic_long_add(x, &mi->vmstats[idx]);
- x = 0;
- }
- __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
+ __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
+ memcg_rstat_updated(memcg, val);
}
-static struct mem_cgroup_per_node *
-parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
+/* idx can be of type enum memcg_stat_item or node_stat_item. */
+static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
- struct mem_cgroup *parent;
+ long x = 0;
+ int cpu;
- parent = parent_mem_cgroup(pn->memcg);
- if (!parent)
- return NULL;
- return mem_cgroup_nodeinfo(parent, nid);
+ for_each_possible_cpu(cpu)
+ x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
}
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
@@ -808,30 +792,39 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup *memcg;
- long x, threshold = MEMCG_CHARGE_BATCH;
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
+ /*
+ * The callers from rmap rely on disabled preemption because they never
+ * update their counters from in-interrupt context. For these
+ * counters we check that the update is never performed from an
+ * interrupt context, while other callers need to have interrupts disabled.
+ */
+ __memcg_stats_lock();
+ if (IS_ENABLED(CONFIG_DEBUG_VM)) {
+ switch (idx) {
+ case NR_ANON_MAPPED:
+ case NR_FILE_MAPPED:
+ case NR_ANON_THPS:
+ case NR_SHMEM_PMDMAPPED:
+ case NR_FILE_PMDMAPPED:
+ WARN_ON_ONCE(!in_task());
+ break;
+ default:
+ VM_WARN_ON_IRQS_ENABLED();
+ }
+ }
+
/* Update memcg */
- __mod_memcg_state(memcg, idx, val);
+ __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
/* Update lruvec */
- __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
-
- if (vmstat_item_in_bytes(idx))
- threshold <<= PAGE_SHIFT;
-
- x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
- if (unlikely(abs(x) > threshold)) {
- pg_data_t *pgdat = lruvec_pgdat(lruvec);
- struct mem_cgroup_per_node *pi;
+ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
- for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
- atomic_long_add(x, &pi->lruvec_stat[idx]);
- x = 0;
- }
- __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
+ memcg_rstat_updated(memcg, val);
+ memcg_stats_unlock();
}
/**
@@ -855,33 +848,50 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
__mod_memcg_lruvec_state(lruvec, idx, val);
}
-void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
+void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
+ int val)
{
- pg_data_t *pgdat = page_pgdat(virt_to_page(p));
+ struct page *head = compound_head(page); /* rmap on tail pages */
struct mem_cgroup *memcg;
+ pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
rcu_read_lock();
- memcg = mem_cgroup_from_obj(p);
-
+ memcg = page_memcg(head);
/* Untracked pages have no memcg, no lruvec. Update only the node */
- if (!memcg || memcg == root_mem_cgroup) {
+ if (!memcg) {
+ rcu_read_unlock();
__mod_node_page_state(pgdat, idx, val);
- } else {
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
- __mod_lruvec_state(lruvec, idx, val);
+ return;
}
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ __mod_lruvec_state(lruvec, idx, val);
rcu_read_unlock();
}
+EXPORT_SYMBOL(__mod_lruvec_page_state);
-void mod_memcg_obj_state(void *p, int idx, int val)
+void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
{
+ pg_data_t *pgdat = page_pgdat(virt_to_page(p));
struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
rcu_read_lock();
- memcg = mem_cgroup_from_obj(p);
- if (memcg)
- mod_memcg_state(memcg, idx, val);
+ memcg = mem_cgroup_from_slab_obj(p);
+
+ /*
+ * Untracked pages have no memcg, no lruvec. Update only the
+ * node. If the slab objects were reparented to the root memcg,
+ * freeing them still needs to update the per-memcg vmstats to
+ * keep them correct for the root memcg.
+ */
+ if (!memcg) {
+ __mod_node_page_state(pgdat, idx, val);
+ } else {
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ __mod_lruvec_state(lruvec, idx, val);
+ }
rcu_read_unlock();
}
@@ -889,49 +899,46 @@ void mod_memcg_obj_state(void *p, int idx, int val)
* __count_memcg_events - account VM events in a cgroup
* @memcg: the memory cgroup
* @idx: the event item
- * @count: the number of events that occured
+ * @count: the number of events that occurred
*/
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
unsigned long count)
{
- unsigned long x;
+ int index = memcg_events_index(idx);
- if (mem_cgroup_disabled())
+ if (mem_cgroup_disabled() || index < 0)
return;
- x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
- if (unlikely(x > MEMCG_CHARGE_BATCH)) {
- struct mem_cgroup *mi;
-
- /*
- * Batch local counters to keep them in sync with
- * the hierarchical ones.
- */
- __this_cpu_add(memcg->vmstats_local->events[idx], x);
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- atomic_long_add(x, &mi->vmevents[idx]);
- x = 0;
- }
- __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
+ memcg_stats_lock();
+ __this_cpu_add(memcg->vmstats_percpu->events[index], count);
+ memcg_rstat_updated(memcg, count);
+ memcg_stats_unlock();
}
static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
- return atomic_long_read(&memcg->vmevents[event]);
+ int index = memcg_events_index(event);
+
+ if (index < 0)
+ return 0;
+ return READ_ONCE(memcg->vmstats->events[index]);
}
static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
long x = 0;
int cpu;
+ int index = memcg_events_index(event);
+
+ if (index < 0)
+ return 0;
for_each_possible_cpu(cpu)
- x += per_cpu(memcg->vmstats_local->events[event], cpu);
+ x += per_cpu(memcg->vmstats_percpu->events[index], cpu);
return x;
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
- struct page *page,
int nr_pages)
{
/* pagein of a big page is an event. So, ignore page size */
@@ -974,8 +981,11 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
* Check events in order.
*
*/
-static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
+static void memcg_check_events(struct mem_cgroup *memcg, int nid)
{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return;
+
/* threshold event is triggered in finer grain than soft limit */
if (unlikely(mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_THRESH))) {
@@ -985,7 +995,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
MEM_CGROUP_TARGET_SOFTLIMIT);
mem_cgroup_threshold(memcg);
if (unlikely(do_softlimit))
- mem_cgroup_update_tree(memcg, page);
+ mem_cgroup_update_tree(memcg, nid);
}
}
@@ -1003,13 +1013,24 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
}
EXPORT_SYMBOL(mem_cgroup_from_task);
+static __always_inline struct mem_cgroup *active_memcg(void)
+{
+ if (!in_task())
+ return this_cpu_read(int_active_memcg);
+ else
+ return current->active_memcg;
+}
+
/**
* get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
* @mm: mm from which memcg should be extracted. It can be NULL.
*
- * Obtain a reference on mm->memcg and returns it if successful. Otherwise
- * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
- * returned.
+ * Obtain a reference on mm->memcg and return it if successful. If mm
+ * is NULL, then the memcg is chosen as follows:
+ * 1) The active memcg, if set.
+ * 2) current->mm->memcg, if available
+ * 3) root memcg
+ * If mem_cgroup is disabled, NULL is returned.
*/
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
@@ -1018,67 +1039,49 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
if (mem_cgroup_disabled())
return NULL;
+ /*
+ * Page cache insertions can happen without an
+ * actual mm context, e.g. during disk probing
+ * on boot, loopback IO, acct() writes etc.
+ *
+ * No need to css_get on root memcg as the reference
+ * counting is disabled on the root level in the
+ * cgroup core. See CSS_NO_REF.
+ */
+ if (unlikely(!mm)) {
+ memcg = active_memcg();
+ if (unlikely(memcg)) {
+ /* remote memcg must hold a ref */
+ css_get(&memcg->css);
+ return memcg;
+ }
+ mm = current->mm;
+ if (unlikely(!mm))
+ return root_mem_cgroup;
+ }
+
rcu_read_lock();
do {
- /*
- * Page cache insertions can happen withou an
- * actual mm context, e.g. during disk probing
- * on boot, loopback IO, acct() writes etc.
- */
- if (unlikely(!mm))
+ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!memcg))
memcg = root_mem_cgroup;
- else {
- memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (unlikely(!memcg))
- memcg = root_mem_cgroup;
- }
} while (!css_tryget(&memcg->css));
rcu_read_unlock();
return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);
-/**
- * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
- * @page: page from which memcg should be extracted.
- *
- * Obtain a reference on page->memcg and returns it if successful. Otherwise
- * root_mem_cgroup is returned.
- */
-struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
+static __always_inline bool memcg_kmem_bypass(void)
{
- struct mem_cgroup *memcg = page->mem_cgroup;
-
- if (mem_cgroup_disabled())
- return NULL;
-
- rcu_read_lock();
- /* Page should not get uncharged and freed memcg under us. */
- if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
- memcg = root_mem_cgroup;
- rcu_read_unlock();
- return memcg;
-}
-EXPORT_SYMBOL(get_mem_cgroup_from_page);
+ /* Allow remote memcg charging from any context. */
+ if (unlikely(active_memcg()))
+ return false;
-/**
- * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
- */
-static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
-{
- if (unlikely(current->active_memcg)) {
- struct mem_cgroup *memcg;
+ /* Memcg to charge can't be determined. */
+ if (!in_task() || !current->mm || (current->flags & PF_KTHREAD))
+ return true;
- rcu_read_lock();
- /* current->active_memcg must hold a ref. */
- if (WARN_ON_ONCE(!css_tryget(&current->active_memcg->css)))
- memcg = root_mem_cgroup;
- else
- memcg = current->active_memcg;
- rcu_read_unlock();
- return memcg;
- }
- return get_mem_cgroup_from_mm(current->mm);
+ return false;
}
/**
@@ -1113,24 +1116,21 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (!root)
root = root_mem_cgroup;
- if (prev && !reclaim)
- pos = prev;
-
- if (!root->use_hierarchy && root != root_mem_cgroup) {
- if (prev)
- goto out;
- return root;
- }
-
rcu_read_lock();
if (reclaim) {
struct mem_cgroup_per_node *mz;
- mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
+ mz = root->nodeinfo[reclaim->pgdat->node_id];
iter = &mz->iter;
- if (prev && reclaim->generation != iter->generation)
+ /*
+ * On start, join the current reclaim iteration cycle.
+ * Exit when a concurrent walker completes it.
+ */
+ if (!prev)
+ reclaim->generation = iter->generation;
+ else if (reclaim->generation != iter->generation)
goto out_unlock;
while (1) {
@@ -1147,6 +1147,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
*/
(void)cmpxchg(&iter->position, pos, NULL);
}
+ } else if (prev) {
+ pos = prev;
}
if (pos)
@@ -1171,15 +1173,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
* is provided by the caller, so we know it's alive
* and kicking, and don't take an extra reference.
*/
- memcg = mem_cgroup_from_css(css);
-
- if (css == &root->css)
- break;
-
- if (css_tryget(css))
+ if (css == &root->css || css_tryget(css)) {
+ memcg = mem_cgroup_from_css(css);
break;
-
- memcg = NULL;
+ }
}
if (reclaim) {
@@ -1195,13 +1192,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (!memcg)
iter->generation++;
- else if (!prev)
- reclaim->generation = iter->generation;
}
out_unlock:
rcu_read_unlock();
-out:
if (prev && prev != root)
css_put(&prev->css);
@@ -1230,7 +1224,7 @@ static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
int nid;
for_each_node(nid) {
- mz = mem_cgroup_nodeinfo(from, nid);
+ mz = from->nodeinfo[nid];
iter = &mz->iter;
cmpxchg(&iter->position, dead_memcg, NULL);
}
@@ -1247,12 +1241,12 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
} while ((memcg = parent_mem_cgroup(memcg)));
/*
- * When cgruop1 non-hierarchy mode is used,
+ * When cgroup1 non-hierarchy mode is used,
* parent_mem_cgroup() does not walk all the way up to the
* cgroup root (root_mem_cgroup). So we have to handle
* dead_memcg from cgroup root separately.
*/
- if (last != root_mem_cgroup)
+ if (!mem_cgroup_is_root(last))
__invalidate_reclaim_iterators(root_mem_cgroup,
dead_memcg);
}
@@ -1265,18 +1259,18 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
*
* This function iterates over tasks attached to @memcg or to any of its
* descendants and calls @fn for each task. If @fn returns a non-zero
- * value, the function breaks the iteration loop and returns the value.
- * Otherwise, it will iterate over all tasks and return 0.
+ * value, the function breaks the iteration loop. Otherwise, it iterates
+ * over all tasks.
*
* This function must not be called for the root memory cgroup.
*/
-int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
- int (*fn)(struct task_struct *, void *), void *arg)
+void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+ int (*fn)(struct task_struct *, void *), void *arg)
{
struct mem_cgroup *iter;
int ret = 0;
- BUG_ON(memcg == root_mem_cgroup);
+ BUG_ON(mem_cgroup_is_root(memcg));
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
@@ -1291,46 +1285,92 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
break;
}
}
- return ret;
}
+#ifdef CONFIG_DEBUG_VM
+void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
+{
+ struct mem_cgroup *memcg;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ memcg = folio_memcg(folio);
+
+ if (!memcg)
+ VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio);
+ else
+ VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
+}
+#endif
+
/**
- * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
- * @page: the page
- * @pgdat: pgdat of the page
+ * folio_lruvec_lock - Lock the lruvec for a folio.
+ * @folio: Pointer to the folio.
*
- * This function relies on page->mem_cgroup being stable - see the
- * access rules in commit_charge().
+ * These functions are safe to use under any of the following conditions:
+ * - folio locked
+ * - folio_test_lru false
+ * - folio_memcg_lock()
+ * - folio frozen (refcount of 0)
+ *
+ * Return: The lruvec this folio is on with its lock held.
*/
-struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
+struct lruvec *folio_lruvec_lock(struct folio *folio)
{
- struct mem_cgroup_per_node *mz;
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
+ struct lruvec *lruvec = folio_lruvec(folio);
- if (mem_cgroup_disabled()) {
- lruvec = &pgdat->__lruvec;
- goto out;
- }
+ spin_lock(&lruvec->lru_lock);
+ lruvec_memcg_debug(lruvec, folio);
- memcg = page->mem_cgroup;
- /*
- * Swapcache readahead pages are added to the LRU - and
- * possibly migrated - before they are charged.
- */
- if (!memcg)
- memcg = root_mem_cgroup;
+ return lruvec;
+}
+
+/**
+ * folio_lruvec_lock_irq - Lock the lruvec for a folio.
+ * @folio: Pointer to the folio.
+ *
+ * These functions are safe to use under any of the following conditions:
+ * - folio locked
+ * - folio_test_lru false
+ * - folio_memcg_lock()
+ * - folio frozen (refcount of 0)
+ *
+ * Return: The lruvec this folio is on with its lock held and interrupts
+ * disabled.
+ */
+struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
+{
+ struct lruvec *lruvec = folio_lruvec(folio);
+
+ spin_lock_irq(&lruvec->lru_lock);
+ lruvec_memcg_debug(lruvec, folio);
+
+ return lruvec;
+}
+
+/**
+ * folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
+ * @folio: Pointer to the folio.
+ * @flags: Pointer to irqsave flags.
+ *
+ * These functions are safe to use under any of the following conditions:
+ * - folio locked
+ * - folio_test_lru false
+ * - folio_memcg_lock()
+ * - folio frozen (refcount of 0)
+ *
+ * Return: The lruvec this folio is on with its lock held and interrupts
+ * disabled.
+ */
+struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
+ unsigned long *flags)
+{
+ struct lruvec *lruvec = folio_lruvec(folio);
+
+ spin_lock_irqsave(&lruvec->lru_lock, *flags);
+ lruvec_memcg_debug(lruvec, folio);
- mz = mem_cgroup_page_nodeinfo(memcg, page);
- lruvec = &mz->lruvec;
-out:
- /*
- * Since a node can be onlined after the mem_cgroup was created,
- * we have to be prepared to initialize lruvec->zone here;
- * and if offlined then reonlined, we need to reinitialize it.
- */
- if (unlikely(lruvec->pgdat != pgdat))
- lruvec->pgdat = pgdat;
return lruvec;
}
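
A hedged sketch of how a caller is expected to pair these helpers, assuming it already satisfies one of the stability conditions listed in the kernel-doc above (e.g. it holds the folio lock); the helper name touch_folio_lru() is invented for the example:

    /* Illustrative only: the caller must keep the folio's memcg stable. */
    static void touch_folio_lru(struct folio *folio)
    {
            unsigned long flags;
            struct lruvec *lruvec;

            lruvec = folio_lruvec_lock_irqsave(folio, &flags);
            /* ... manipulate the folio's position on the LRU lists here ... */
            spin_unlock_irqrestore(&lruvec->lru_lock, flags);
    }
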
@@ -1342,8 +1382,7 @@ out:
* @nr_pages: positive when adding or negative when removing
*
* This function must be called under lru_lock, just before a page is added
- * to or just after a page is removed from an lru list (that ordering being
- * so as to allow it to check that lru_size 0 is consistent with list_empty).
+ * to or just after a page is removed from an lru list.
*/
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
int zid, int nr_pages)
@@ -1450,77 +1489,86 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
struct memory_stat {
const char *name;
- unsigned int ratio;
unsigned int idx;
};
-static struct memory_stat memory_stats[] = {
- { "anon", PAGE_SIZE, NR_ANON_MAPPED },
- { "file", PAGE_SIZE, NR_FILE_PAGES },
- { "kernel_stack", 1024, NR_KERNEL_STACK_KB },
- { "percpu", 1, MEMCG_PERCPU_B },
- { "sock", PAGE_SIZE, MEMCG_SOCK },
- { "shmem", PAGE_SIZE, NR_SHMEM },
- { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
- { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
- { "file_writeback", PAGE_SIZE, NR_WRITEBACK },
+static const struct memory_stat memory_stats[] = {
+ { "anon", NR_ANON_MAPPED },
+ { "file", NR_FILE_PAGES },
+ { "kernel", MEMCG_KMEM },
+ { "kernel_stack", NR_KERNEL_STACK_KB },
+ { "pagetables", NR_PAGETABLE },
+ { "sec_pagetables", NR_SECONDARY_PAGETABLE },
+ { "percpu", MEMCG_PERCPU_B },
+ { "sock", MEMCG_SOCK },
+ { "vmalloc", MEMCG_VMALLOC },
+ { "shmem", NR_SHMEM },
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+ { "zswap", MEMCG_ZSWAP_B },
+ { "zswapped", MEMCG_ZSWAPPED },
+#endif
+ { "file_mapped", NR_FILE_MAPPED },
+ { "file_dirty", NR_FILE_DIRTY },
+ { "file_writeback", NR_WRITEBACK },
+#ifdef CONFIG_SWAP
+ { "swapcached", NR_SWAPCACHE },
+#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- /*
- * The ratio will be initialized in memory_stats_init(). Because
- * on some architectures, the macro of HPAGE_PMD_SIZE is not
- * constant(e.g. powerpc).
- */
- { "anon_thp", 0, NR_ANON_THPS },
+ { "anon_thp", NR_ANON_THPS },
+ { "file_thp", NR_FILE_THPS },
+ { "shmem_thp", NR_SHMEM_THPS },
#endif
- { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
- { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
- { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
- { "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
- { "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
-
- /*
- * Note: The slab_reclaimable and slab_unreclaimable must be
- * together and slab_reclaimable must be in front.
- */
- { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
- { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
+ { "inactive_anon", NR_INACTIVE_ANON },
+ { "active_anon", NR_ACTIVE_ANON },
+ { "inactive_file", NR_INACTIVE_FILE },
+ { "active_file", NR_ACTIVE_FILE },
+ { "unevictable", NR_UNEVICTABLE },
+ { "slab_reclaimable", NR_SLAB_RECLAIMABLE_B },
+ { "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B },
/* The memory events */
- { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
- { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
- { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
- { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
- { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
- { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
- { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
+ { "workingset_refault_anon", WORKINGSET_REFAULT_ANON },
+ { "workingset_refault_file", WORKINGSET_REFAULT_FILE },
+ { "workingset_activate_anon", WORKINGSET_ACTIVATE_ANON },
+ { "workingset_activate_file", WORKINGSET_ACTIVATE_FILE },
+ { "workingset_restore_anon", WORKINGSET_RESTORE_ANON },
+ { "workingset_restore_file", WORKINGSET_RESTORE_FILE },
+ { "workingset_nodereclaim", WORKINGSET_NODERECLAIM },
};
-static int __init memory_stats_init(void)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (memory_stats[i].idx == NR_ANON_THPS)
- memory_stats[i].ratio = HPAGE_PMD_SIZE;
-#endif
- VM_BUG_ON(!memory_stats[i].ratio);
- VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
+/* Translate stat items to the correct unit for memory.stat output */
+static int memcg_page_state_unit(int item)
+{
+ switch (item) {
+ case MEMCG_PERCPU_B:
+ case MEMCG_ZSWAP_B:
+ case NR_SLAB_RECLAIMABLE_B:
+ case NR_SLAB_UNRECLAIMABLE_B:
+ case WORKINGSET_REFAULT_ANON:
+ case WORKINGSET_REFAULT_FILE:
+ case WORKINGSET_ACTIVATE_ANON:
+ case WORKINGSET_ACTIVATE_FILE:
+ case WORKINGSET_RESTORE_ANON:
+ case WORKINGSET_RESTORE_FILE:
+ case WORKINGSET_NODERECLAIM:
+ return 1;
+ case NR_KERNEL_STACK_KB:
+ return SZ_1K;
+ default:
+ return PAGE_SIZE;
}
+}
- return 0;
+static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
+ int item)
+{
+ return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
}
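
For a concrete feel of the unit translation, here is a small standalone C analogue of memcg_page_state_unit(): byte-based counters pass through unchanged, kernel stacks are tracked in KiB, and everything else is converted from pages to bytes. The enum and the 4 KiB page size are assumptions of the sketch.

    #include <stdio.h>

    #define SKETCH_PAGE_SIZE 4096L  /* assumed; PAGE_SIZE is arch-dependent */

    enum item { ITEM_SLAB_B, ITEM_KERNEL_STACK_KB, ITEM_ANON_PAGES };

    /* Map an item to the unit its counter is kept in, like memcg_page_state_unit(). */
    static long unit_of(enum item item)
    {
            switch (item) {
            case ITEM_SLAB_B:
                    return 1;                   /* already byte-based */
            case ITEM_KERNEL_STACK_KB:
                    return 1024;                /* SZ_1K */
            default:
                    return SKETCH_PAGE_SIZE;    /* page-based counters */
            }
    }

    int main(void)
    {
            printf("slab 123456 -> %ld bytes\n", 123456 * unit_of(ITEM_SLAB_B));
            printf("kernel_stack 16 -> %ld bytes\n", 16 * unit_of(ITEM_KERNEL_STACK_KB));
            printf("anon 10 pages -> %ld bytes\n", 10 * unit_of(ITEM_ANON_PAGES));
            return 0;
    }
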
-pure_initcall(memory_stats_init);
-static char *memory_stat_format(struct mem_cgroup *memcg)
+static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
- struct seq_buf s;
int i;
- seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
- if (!s.buffer)
- return NULL;
-
/*
* Provide statistics on the state of the memory subsystem as
* well as cumulative event counters that show past behavior.
@@ -1531,55 +1579,54 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
*
* Current memory state:
*/
+ mem_cgroup_flush_stats();
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
u64 size;
- size = memcg_page_state(memcg, memory_stats[i].idx);
- size *= memory_stats[i].ratio;
- seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
+ size = memcg_page_state_output(memcg, memory_stats[i].idx);
+ seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size);
if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
- size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
- memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
- seq_buf_printf(&s, "slab %llu\n", size);
+ size += memcg_page_state_output(memcg,
+ NR_SLAB_RECLAIMABLE_B);
+ seq_buf_printf(s, "slab %llu\n", size);
}
}
/* Accumulated memory events */
-
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
- memcg_events(memcg, PGFAULT));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
- memcg_events(memcg, PGMAJFAULT));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
- memcg_events(memcg, PGREFILL));
- seq_buf_printf(&s, "pgscan %lu\n",
+ seq_buf_printf(s, "pgscan %lu\n",
memcg_events(memcg, PGSCAN_KSWAPD) +
- memcg_events(memcg, PGSCAN_DIRECT));
- seq_buf_printf(&s, "pgsteal %lu\n",
+ memcg_events(memcg, PGSCAN_DIRECT) +
+ memcg_events(memcg, PGSCAN_KHUGEPAGED));
+ seq_buf_printf(s, "pgsteal %lu\n",
memcg_events(memcg, PGSTEAL_KSWAPD) +
- memcg_events(memcg, PGSTEAL_DIRECT));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
- memcg_events(memcg, PGACTIVATE));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
- memcg_events(memcg, PGDEACTIVATE));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
- memcg_events(memcg, PGLAZYFREE));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
- memcg_events(memcg, PGLAZYFREED));
+ memcg_events(memcg, PGSTEAL_DIRECT) +
+ memcg_events(memcg, PGSTEAL_KHUGEPAGED));
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
- memcg_events(memcg, THP_FAULT_ALLOC));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
- memcg_events(memcg, THP_COLLAPSE_ALLOC));
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
+ if (memcg_vm_event_stat[i] == PGPGIN ||
+ memcg_vm_event_stat[i] == PGPGOUT)
+ continue;
+
+ seq_buf_printf(s, "%s %lu\n",
+ vm_event_name(memcg_vm_event_stat[i]),
+ memcg_events(memcg, memcg_vm_event_stat[i]));
+ }
/* The above should easily fit into one page */
- WARN_ON_ONCE(seq_buf_has_overflowed(&s));
+ WARN_ON_ONCE(seq_buf_has_overflowed(s));
+}
+
+static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
- return s.buffer;
+static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
+{
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ memcg_stat_format(memcg, s);
+ else
+ memcg1_stat_format(memcg, s);
+ WARN_ON_ONCE(seq_buf_has_overflowed(s));
}
#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -1615,7 +1662,11 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *
*/
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
- char *buf;
+ /* Use a static buffer, as the caller is holding oom_lock. */
+ static char buf[PAGE_SIZE];
+ struct seq_buf s;
+
+ lockdep_assert_held(&oom_lock);
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memory)),
@@ -1636,11 +1687,9 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
pr_info("Memory cgroup stats for ");
pr_cont_cgroup_path(memcg->css.cgroup);
pr_cont(":");
- buf = memory_stat_format(memcg);
- if (!buf)
- return;
- pr_info("%s", buf);
- kfree(buf);
+ seq_buf_init(&s, buf, sizeof(buf));
+ memory_stat_format(memcg, &s);
+ seq_buf_do_printk(&s, KERN_INFO);
}
/*
@@ -1650,17 +1699,17 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
unsigned long max = READ_ONCE(memcg->memory.max);
- if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
- if (mem_cgroup_swappiness(memcg))
- max += min(READ_ONCE(memcg->swap.max),
- (unsigned long)total_swap_pages);
- } else { /* v1 */
+ if (do_memsw_account()) {
if (mem_cgroup_swappiness(memcg)) {
/* Calculate swap excess capacity from memsw limit */
unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
max += min(swap, (unsigned long)total_swap_pages);
}
+ } else {
+ if (mem_cgroup_swappiness(memcg))
+ max += min(READ_ONCE(memcg->swap.max),
+ (unsigned long)total_swap_pages);
}
return max;
}
@@ -1692,7 +1741,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
* A few threads which were not waiting at mutex_lock_killable() can
* fail to bail out. Therefore, check again after holding oom_lock.
*/
- ret = should_force_charge() || out_of_memory(&oc);
+ ret = task_is_dying() || out_of_memory(&oc);
unlock:
mutex_unlock(&oom_lock);
@@ -1826,7 +1875,7 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
struct mem_cgroup *iter;
/*
- * Be careful about under_oom underflows becase a child memcg
+ * Be careful about under_oom underflows because a child memcg
* could have been added after mem_cgroup_mark_under_oom.
*/
spin_lock(&memcg_oom_lock);
@@ -1873,20 +1922,16 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
-enum oom_status {
- OOM_SUCCESS,
- OOM_FAILED,
- OOM_ASYNC,
- OOM_SKIPPED
-};
-
-static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+/*
+ * Returns true if it successfully killed one or more processes, though in
+ * some corner cases it can return true even without killing any process.
+ */
+static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
- enum oom_status ret;
- bool locked;
+ bool locked, ret;
if (order > PAGE_ALLOC_COSTLY_ORDER)
- return OOM_SKIPPED;
+ return false;
memcg_memory_event(memcg, MEMCG_OOM);
@@ -1908,15 +1953,14 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int
* Please note that mem_cgroup_out_of_memory might fail to find a
* victim and then we have to bail out from the charge path.
*/
- if (memcg->oom_kill_disable) {
- if (!current->in_user_fault)
- return OOM_SKIPPED;
- css_get(&memcg->css);
- current->memcg_in_oom = memcg;
- current->memcg_oom_gfp_mask = mask;
- current->memcg_oom_order = order;
-
- return OOM_ASYNC;
+ if (READ_ONCE(memcg->oom_kill_disable)) {
+ if (current->in_user_fault) {
+ css_get(&memcg->css);
+ current->memcg_in_oom = memcg;
+ current->memcg_oom_gfp_mask = mask;
+ current->memcg_oom_order = order;
+ }
+ return false;
}
mem_cgroup_mark_under_oom(memcg);
@@ -1927,10 +1971,7 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int
mem_cgroup_oom_notify(memcg);
mem_cgroup_unmark_under_oom(memcg);
- if (mem_cgroup_out_of_memory(memcg, mask, order))
- ret = OOM_SUCCESS;
- else
- ret = OOM_FAILED;
+ ret = mem_cgroup_out_of_memory(memcg, mask, order);
if (locked)
mem_cgroup_oom_unlock(memcg);
@@ -1982,26 +2023,12 @@ bool mem_cgroup_oom_synchronize(bool handle)
if (locked)
mem_cgroup_oom_notify(memcg);
- if (locked && !memcg->oom_kill_disable) {
- mem_cgroup_unmark_under_oom(memcg);
- finish_wait(&memcg_oom_waitq, &owait.wait);
- mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
- current->memcg_oom_order);
- } else {
- schedule();
- mem_cgroup_unmark_under_oom(memcg);
- finish_wait(&memcg_oom_waitq, &owait.wait);
- }
+ schedule();
+ mem_cgroup_unmark_under_oom(memcg);
+ finish_wait(&memcg_oom_waitq, &owait.wait);
- if (locked) {
+ if (locked)
mem_cgroup_oom_unlock(memcg);
- /*
- * There is no guarantee that an OOM-lock contender
- * sees the wakeups triggered by the OOM kill
- * uncharges. Wake any sleepers explicitely.
- */
- memcg_oom_recover(memcg);
- }
cleanup:
current->memcg_in_oom = NULL;
css_put(&memcg->css);
@@ -2033,7 +2060,7 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
rcu_read_lock();
memcg = mem_cgroup_from_task(victim);
- if (memcg == root_mem_cgroup)
+ if (mem_cgroup_is_root(memcg))
goto out;
/*
@@ -2050,7 +2077,7 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
* highest-level memory cgroup with oom.group set.
*/
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
- if (memcg->oom_group)
+ if (READ_ONCE(memcg->oom_group))
oom_group = memcg;
if (memcg == oom_domain)
@@ -2073,19 +2100,17 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
}
/**
- * lock_page_memcg - lock a page->mem_cgroup binding
- * @page: the page
+ * folio_memcg_lock - Bind a folio to its memcg.
+ * @folio: The folio.
*
- * This function protects unlocked LRU pages from being moved to
+ * This function prevents unlocked LRU folios from being moved to
* another cgroup.
*
- * It ensures lifetime of the returned memcg. Caller is responsible
- * for the lifetime of the page; __unlock_page_memcg() is available
- * when @page might get freed inside the locked section.
+ * It ensures lifetime of the bound memcg. The caller is responsible
+ * for the lifetime of the folio.
*/
-struct mem_cgroup *lock_page_memcg(struct page *page)
+void folio_memcg_lock(struct folio *folio)
{
- struct page *head = compound_head(page); /* rmap on tail pages */
struct mem_cgroup *memcg;
unsigned long flags;
@@ -2093,50 +2118,42 @@ struct mem_cgroup *lock_page_memcg(struct page *page)
* The RCU lock is held throughout the transaction. The fast
* path can get away without acquiring the memcg->move_lock
* because page moving starts with an RCU grace period.
- *
- * The RCU lock also protects the memcg from being freed when
- * the page state that is going to change is the only thing
- * preventing the page itself from being freed. E.g. writeback
- * doesn't hold a page reference and relies on PG_writeback to
- * keep off truncation, migration and so forth.
*/
rcu_read_lock();
if (mem_cgroup_disabled())
- return NULL;
+ return;
again:
- memcg = head->mem_cgroup;
+ memcg = folio_memcg(folio);
if (unlikely(!memcg))
- return NULL;
+ return;
+
+#ifdef CONFIG_PROVE_LOCKING
+ local_irq_save(flags);
+ might_lock(&memcg->move_lock);
+ local_irq_restore(flags);
+#endif
if (atomic_read(&memcg->moving_account) <= 0)
- return memcg;
+ return;
spin_lock_irqsave(&memcg->move_lock, flags);
- if (memcg != head->mem_cgroup) {
+ if (memcg != folio_memcg(folio)) {
spin_unlock_irqrestore(&memcg->move_lock, flags);
goto again;
}
/*
- * When charge migration first begins, we can have locked and
- * unlocked page stat updates happening concurrently. Track
- * the task who has the lock for unlock_page_memcg().
+ * When charge migration first begins, we can have multiple
+ * critical sections holding the fast-path RCU lock and one
+ * holding the slowpath move_lock. Track the task who has the
+ * move_lock for folio_memcg_unlock().
*/
memcg->move_lock_task = current;
memcg->move_lock_flags = flags;
-
- return memcg;
}
-EXPORT_SYMBOL(lock_page_memcg);
-/**
- * __unlock_page_memcg - unlock and unpin a memcg
- * @memcg: the memcg
- *
- * Unlock and unpin a memcg returned by lock_page_memcg().
- */
-void __unlock_page_memcg(struct mem_cgroup *memcg)
+static void __folio_memcg_unlock(struct mem_cgroup *memcg)
{
if (memcg && memcg->move_lock_task == current) {
unsigned long flags = memcg->move_lock_flags;
@@ -2151,47 +2168,59 @@ void __unlock_page_memcg(struct mem_cgroup *memcg)
}
/**
- * unlock_page_memcg - unlock a page->mem_cgroup binding
- * @page: the page
+ * folio_memcg_unlock - Release the binding between a folio and its memcg.
+ * @folio: The folio.
+ *
+ * This releases the binding created by folio_memcg_lock(). This does
+ * not change the accounting of this folio to its memcg, but it does
+ * permit others to change it.
*/
-void unlock_page_memcg(struct page *page)
+void folio_memcg_unlock(struct folio *folio)
{
- struct page *head = compound_head(page);
-
- __unlock_page_memcg(head->mem_cgroup);
+ __folio_memcg_unlock(folio_memcg(folio));
}
-EXPORT_SYMBOL(unlock_page_memcg);
struct memcg_stock_pcp {
+ local_lock_t stock_lock;
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
#ifdef CONFIG_MEMCG_KMEM
struct obj_cgroup *cached_objcg;
+ struct pglist_data *cached_pgdat;
unsigned int nr_bytes;
+ int nr_slab_reclaimable_b;
+ int nr_slab_unreclaimable_b;
#endif
struct work_struct work;
unsigned long flags;
#define FLUSHING_CACHED_CHARGE 0
};
-static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
+static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
+ .stock_lock = INIT_LOCAL_LOCK(stock_lock),
+};
static DEFINE_MUTEX(percpu_charge_mutex);
#ifdef CONFIG_MEMCG_KMEM
-static void drain_obj_stock(struct memcg_stock_pcp *stock);
+static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg);
+static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages);
#else
-static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
+static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
{
+ return NULL;
}
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg)
{
return false;
}
+static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
+{
+}
#endif
/**
@@ -2214,15 +2243,15 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
if (nr_pages > MEMCG_CHARGE_BATCH)
return ret;
- local_irq_save(flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
- if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
+ if (memcg == READ_ONCE(stock->cached) && stock->nr_pages >= nr_pages) {
stock->nr_pages -= nr_pages;
ret = true;
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
return ret;
}
@@ -2232,7 +2261,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
*/
static void drain_stock(struct memcg_stock_pcp *stock)
{
- struct mem_cgroup *old = stock->cached;
+ struct mem_cgroup *old = READ_ONCE(stock->cached);
if (!old)
return;
@@ -2245,51 +2274,59 @@ static void drain_stock(struct memcg_stock_pcp *stock)
}
css_put(&old->css);
- stock->cached = NULL;
+ WRITE_ONCE(stock->cached, NULL);
}
static void drain_local_stock(struct work_struct *dummy)
{
struct memcg_stock_pcp *stock;
+ struct obj_cgroup *old = NULL;
unsigned long flags;
/*
- * The only protection from memory hotplug vs. drain_stock races is
- * that we always operate on local CPU stock here with IRQ disabled
+ * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
+ * drain_stock races is that we always operate on local CPU stock
+ * here with IRQs disabled.
*/
- local_irq_save(flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
- drain_obj_stock(stock);
+ old = drain_obj_stock(stock);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ if (old)
+ obj_cgroup_put(old);
}
/*
* Cache charges(val) to local per_cpu area.
* This will be consumed by consume_stock() function, later.
*/
-static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
- unsigned long flags;
-
- local_irq_save(flags);
stock = this_cpu_ptr(&memcg_stock);
- if (stock->cached != memcg) { /* reset if necessary */
+ if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
drain_stock(stock);
css_get(&memcg->css);
- stock->cached = memcg;
+ WRITE_ONCE(stock->cached, memcg);
}
stock->nr_pages += nr_pages;
if (stock->nr_pages > MEMCG_CHARGE_BATCH)
drain_stock(stock);
+}
- local_irq_restore(flags);
+static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ unsigned long flags;
+
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ __refill_stock(memcg, nr_pages);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
}
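
The stock functions above implement a small per-CPU cache of pre-charged pages so the common case avoids touching the shared page counters. A standalone single-threaded C model of the consume/refill logic (the batch size is assumed; locking, css refcounting and the per-CPU array are omitted):

    #include <stdio.h>
    #include <stdbool.h>

    #define CHARGE_BATCH 64     /* assumed stand-in for MEMCG_CHARGE_BATCH */

    struct memcg { const char *name; };

    /* One entry per CPU in the kernel; a single one is enough for the sketch. */
    static struct {
            struct memcg *cached;
            unsigned int nr_pages;
    } stock;

    /* Give cached pages back to the page counter, as drain_stock() does. */
    static void drain_stock(void)
    {
            if (!stock.cached)
                    return;
            printf("uncharge %u pages back to %s\n", stock.nr_pages, stock.cached->name);
            stock.nr_pages = 0;
            stock.cached = NULL;
    }

    /* Fast path of try_charge(): consume locally cached charges if possible. */
    static bool consume_stock(struct memcg *memcg, unsigned int nr_pages)
    {
            if (nr_pages > CHARGE_BATCH)
                    return false;
            if (stock.cached == memcg && stock.nr_pages >= nr_pages) {
                    stock.nr_pages -= nr_pages;
                    return true;
            }
            return false;
    }

    /* Cache surplus charges locally; drop them if they belong to another memcg. */
    static void refill_stock(struct memcg *memcg, unsigned int nr_pages)
    {
            if (stock.cached != memcg) {
                    drain_stock();
                    stock.cached = memcg;
            }
            stock.nr_pages += nr_pages;
            if (stock.nr_pages > CHARGE_BATCH)
                    drain_stock();
    }

    int main(void)
    {
            struct memcg a = { "A" }, b = { "B" };

            refill_stock(&a, 32);                       /* cache 32 surplus pages for A   */
            printf("A 8: %d\n", consume_stock(&a, 8));  /* 1: served from the local stock */
            printf("B 8: %d\n", consume_stock(&b, 8));  /* 0: stock belongs to A          */
            refill_stock(&b, 16);                       /* drains A's 24, then caches B's 16 */
            return 0;
    }
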
/*
@@ -2309,18 +2346,19 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
* as well as workers from this path always operate on the local
* per-cpu data. CPU up doesn't touch memcg_stock at all.
*/
- curcpu = get_cpu();
+ migrate_disable();
+ curcpu = smp_processor_id();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *memcg;
bool flush = false;
rcu_read_lock();
- memcg = stock->cached;
+ memcg = READ_ONCE(stock->cached);
if (memcg && stock->nr_pages &&
mem_cgroup_is_descendant(memcg, root_memcg))
flush = true;
- if (obj_stock_flush_required(stock, root_memcg))
+ else if (obj_stock_flush_required(stock, root_memcg))
flush = true;
rcu_read_unlock();
@@ -2328,59 +2366,21 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
if (cpu == curcpu)
drain_local_stock(&stock->work);
- else
+ else if (!cpu_is_isolated(cpu))
schedule_work_on(cpu, &stock->work);
}
}
- put_cpu();
+ migrate_enable();
mutex_unlock(&percpu_charge_mutex);
}
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
struct memcg_stock_pcp *stock;
- struct mem_cgroup *memcg, *mi;
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
- for_each_mem_cgroup(memcg) {
- int i;
-
- for (i = 0; i < MEMCG_NR_STAT; i++) {
- int nid;
- long x;
-
- x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
- if (x)
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- atomic_long_add(x, &memcg->vmstats[i]);
-
- if (i >= NR_VM_NODE_STAT_ITEMS)
- continue;
-
- for_each_node(nid) {
- struct mem_cgroup_per_node *pn;
-
- pn = mem_cgroup_nodeinfo(memcg, nid);
- x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
- if (x)
- do {
- atomic_long_add(x, &pn->lruvec_stat[i]);
- } while ((pn = parent_nodeinfo(pn, nid)));
- }
- }
-
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
- long x;
-
- x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
- if (x)
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- atomic_long_add(x, &memcg->vmevents[i]);
- }
- }
-
return 0;
}
@@ -2401,7 +2401,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
psi_memstall_enter(&pflags);
nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
- gfp_mask, true);
+ gfp_mask,
+ MEMCG_RECLAIM_MAY_SWAP);
psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
@@ -2637,21 +2638,20 @@ out:
css_put(&memcg->css);
}
-static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
- unsigned int nr_pages)
+static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ unsigned int nr_pages)
{
unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
int nr_retries = MAX_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
struct page_counter *counter;
- enum oom_status oom_status;
unsigned long nr_reclaimed;
- bool may_swap = true;
+ bool passed_oom = false;
+ unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
bool drained = false;
+ bool raised_max_event = false;
unsigned long pflags;
- if (mem_cgroup_is_root(memcg))
- return 0;
retry:
if (consume_stock(memcg, nr_pages))
return 0;
@@ -2665,7 +2665,7 @@ retry:
mem_over_limit = mem_cgroup_from_counter(counter, memory);
} else {
mem_over_limit = mem_cgroup_from_counter(counter, memsw);
- may_swap = false;
+ reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
}
if (batch > nr_pages) {
@@ -2674,24 +2674,6 @@ retry:
}
/*
- * Memcg doesn't have a dedicated reserve for atomic
- * allocations. But like the global atomic pool, we need to
- * put the burden of reclaim on regular allocation requests
- * and let these go through as privileged allocations.
- */
- if (gfp_mask & __GFP_ATOMIC)
- goto force;
-
- /*
- * Unlike in global OOM situations, memcg is not in a physical
- * memory shortage. Allow dying and OOM-killed tasks to
- * bypass the last charges so that they can exit quickly and
- * free their memory.
- */
- if (unlikely(should_force_charge()))
- goto force;
-
- /*
* Prevent unbounded recursion when reclaim operations need to
* allocate memory. This might exceed the limits temporarily,
* but we prefer facilitating memory reclaim and getting back
@@ -2707,10 +2689,11 @@ retry:
goto nomem;
memcg_memory_event(mem_over_limit, MEMCG_MAX);
+ raised_max_event = true;
psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
- gfp_mask, may_swap);
+ gfp_mask, reclaim_options);
psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -2748,33 +2731,39 @@ retry:
if (gfp_mask & __GFP_RETRY_MAYFAIL)
goto nomem;
- if (gfp_mask & __GFP_NOFAIL)
- goto force;
-
- if (fatal_signal_pending(current))
- goto force;
+ /* Avoid endless loop for tasks bypassed by the oom killer */
+ if (passed_oom && task_is_dying())
+ goto nomem;
/*
* keep retrying as long as the memcg oom killer is able to make
* a forward progress or bypass the charge if the oom killer
* couldn't make any progress.
*/
- oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
- get_order(nr_pages * PAGE_SIZE));
- switch (oom_status) {
- case OOM_SUCCESS:
+ if (mem_cgroup_oom(mem_over_limit, gfp_mask,
+ get_order(nr_pages * PAGE_SIZE))) {
+ passed_oom = true;
nr_retries = MAX_RECLAIM_RETRIES;
goto retry;
- case OOM_FAILED:
- goto force;
- default:
- goto nomem;
}
nomem:
- if (!(gfp_mask & __GFP_NOFAIL))
+ /*
+ * Memcg doesn't have a dedicated reserve for atomic
+ * allocations. But like the global atomic pool, we need to
+ * put the burden of reclaim on regular allocation requests
+ * and let these go through as privileged allocations.
+ */
+ if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH)))
return -ENOMEM;
force:
/*
+ * If the allocation has to be enforced, don't forget to raise
+ * a MEMCG_MAX event.
+ */
+ if (!raised_max_event)
+ memcg_memory_event(mem_over_limit, MEMCG_MAX);
+
+ /*
* The allocation either can't fail or will lead to more memory
* being freed very soon. Allow memory usage go over the limit
* temporarily by force charging it.
@@ -2807,7 +2796,7 @@ done_restock:
READ_ONCE(memcg->swap.high);
/* Don't bother a random interrupted task */
- if (in_interrupt()) {
+ if (!in_task()) {
if (mem_high) {
schedule_work(&memcg->high_work);
break;
@@ -2831,11 +2820,24 @@ done_restock:
}
} while ((memcg = parent_mem_cgroup(memcg)));
+ if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
+ !(current->flags & PF_MEMALLOC) &&
+ gfpflags_allow_blocking(gfp_mask)) {
+ mem_cgroup_handle_over_high();
+ }
return 0;
}
-#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
-static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
+static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ unsigned int nr_pages)
+{
+ if (mem_cgroup_is_root(memcg))
+ return 0;
+
+ return try_charge_memcg(memcg, gfp_mask, nr_pages);
+}
+
+static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
if (mem_cgroup_is_root(memcg))
return;
@@ -2844,88 +2846,176 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
}
-#endif
-static void commit_charge(struct page *page, struct mem_cgroup *memcg)
+static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
- VM_BUG_ON_PAGE(page->mem_cgroup, page);
+ VM_BUG_ON_FOLIO(folio_memcg(folio), folio);
/*
- * Any of the following ensures page->mem_cgroup stability:
+ * Any of the following ensures page's memcg stability:
*
* - the page lock
* - LRU isolation
- * - lock_page_memcg()
+ * - folio_memcg_lock()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*/
- page->mem_cgroup = memcg;
+ folio->memcg_data = (unsigned long)memcg;
}
#ifdef CONFIG_MEMCG_KMEM
-int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
- gfp_t gfp)
+/*
+ * The allocated objcg pointers array is not accounted directly.
+ * Moreover, it should not come from a DMA buffer and is not readily
+ * reclaimable. So those GFP bits should be masked off.
+ */
+#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
+
+/*
+ * mod_objcg_mlstate() may be called with irq enabled, so
+ * mod_memcg_lruvec_state() should be used.
+ */
+static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
+ struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
+{
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
+ rcu_read_unlock();
+}
+
+int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
+ gfp_t gfp, bool new_slab)
{
- unsigned int objects = objs_per_slab_page(s, page);
+ unsigned int objects = objs_per_slab(s, slab);
+ unsigned long memcg_data;
void *vec;
+ gfp &= ~OBJCGS_CLEAR_MASK;
vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
- page_to_nid(page));
+ slab_nid(slab));
if (!vec)
return -ENOMEM;
- if (cmpxchg(&page->obj_cgroups, NULL,
- (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
+ memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
+ if (new_slab) {
+ /*
+ * If the slab is brand new and nobody can yet access its
+ * memcg_data, no synchronization is required and memcg_data can
+ * be simply assigned.
+ */
+ slab->memcg_data = memcg_data;
+ } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) {
+ /*
+ * If the slab is already in use, somebody can allocate and
+ * assign obj_cgroups in parallel. In this case the existing
+ * objcg vector should be reused.
+ */
kfree(vec);
- else
- kmemleak_not_leak(vec);
+ return 0;
+ }
+ kmemleak_not_leak(vec);
return 0;
}
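
The cmpxchg() on slab->memcg_data above follows a common "allocate, publish once, free the loser" idiom so that racing allocations attach exactly one objcg vector. A standalone C11 sketch of the same idiom (the struct vec payload and function names are invented):

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct vec { int nobjs; };

    static _Atomic(struct vec *) slab_vec;  /* models slab->memcg_data */

    /* Attach a vector exactly once, even if several threads race here. */
    static struct vec *get_or_alloc_vec(int nobjs)
    {
            struct vec *expected = NULL;
            struct vec *new = calloc(1, sizeof(*new));

            if (!new)
                    return NULL;
            new->nobjs = nobjs;

            if (!atomic_compare_exchange_strong(&slab_vec, &expected, new)) {
                    /* Somebody else won the race: reuse theirs, drop ours. */
                    free(new);
                    return expected;
            }
            return new;
    }

    int main(void)
    {
            struct vec *v1 = get_or_alloc_vec(32);
            struct vec *v2 = get_or_alloc_vec(64);  /* reuses the first vector */

            printf("same vector: %d, nobjs=%d\n", v1 == v2, v2->nobjs);
            free(v1);
            return 0;
    }
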
+static __always_inline
+struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
+{
+ /*
+ * Slab objects are accounted individually, not per-page.
+ * Memcg membership data for each individual object is saved in
+ * slab->memcg_data.
+ */
+ if (folio_test_slab(folio)) {
+ struct obj_cgroup **objcgs;
+ struct slab *slab;
+ unsigned int off;
+
+ slab = folio_slab(folio);
+ objcgs = slab_objcgs(slab);
+ if (!objcgs)
+ return NULL;
+
+ off = obj_to_index(slab->slab_cache, slab, p);
+ if (objcgs[off])
+ return obj_cgroup_memcg(objcgs[off]);
+
+ return NULL;
+ }
+
+ /*
+ * folio_memcg_check() is used here, because in theory we can encounter
+ * a folio where the slab flag has been cleared already, but
+ * slab->memcg_data has not been freed yet.
+ * folio_memcg_check() will guarantee that a proper memory
+ * cgroup pointer or NULL will be returned.
+ */
+ return folio_memcg_check(folio);
+}
+
/*
* Returns a pointer to the memory cgroup to which the kernel object is charged.
*
+ * A passed kernel object can be a slab object, vmalloc object or a generic
+ * kernel page, so different mechanisms for getting the memory cgroup pointer
+ * should be used.
+ *
+ * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
+ * can not know for sure how the kernel object is implemented.
+ * mem_cgroup_from_obj() can be safely used in such cases.
+ *
* The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
* cgroup_mutex, etc.
*/
struct mem_cgroup *mem_cgroup_from_obj(void *p)
{
- struct page *page;
+ struct folio *folio;
if (mem_cgroup_disabled())
return NULL;
- page = virt_to_head_page(p);
+ if (unlikely(is_vmalloc_addr(p)))
+ folio = page_folio(vmalloc_to_page(p));
+ else
+ folio = virt_to_folio(p);
- /*
- * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
- * or a pointer to obj_cgroup vector. In the latter case the lowest
- * bit of the pointer is set.
- * The page->mem_cgroup pointer can be asynchronously changed
- * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
- * from a valid memcg pointer to objcg vector or back.
- */
- if (!page->mem_cgroup)
+ return mem_cgroup_from_obj_folio(folio, p);
+}
+
+/*
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
+ * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects
+ * allocated using vmalloc().
+ *
+ * A passed kernel object must be a slab object or a generic kernel page.
+ *
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
+ * cgroup_mutex, etc.
+ */
+struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
+{
+ if (mem_cgroup_disabled())
return NULL;
- /*
- * Slab objects are accounted individually, not per-page.
- * Memcg membership data for each individual object is saved in
- * the page->obj_cgroups.
- */
- if (page_has_obj_cgroups(page)) {
- struct obj_cgroup *objcg;
- unsigned int off;
+ return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
+}
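A minimal caller sketch (not part of the patch; example_report_obj_memcg is a hypothetical helper) showing the lifetime rule from the comments above: the returned memcg is only stable while rcu_read_lock() or another lifetime guarantee is held.

static void example_report_obj_memcg(void *obj)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = mem_cgroup_from_obj(obj);	/* slab, vmalloc or page backed */
	if (memcg)
		pr_debug("object %p charged to memcg id %u\n",
			 obj, mem_cgroup_id(memcg));
	rcu_read_unlock();
}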
- off = obj_to_index(page->slab_cache, page, p);
- objcg = page_obj_cgroups(page)[off];
- if (objcg)
- return obj_cgroup_memcg(objcg);
+static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
+{
+ struct obj_cgroup *objcg = NULL;
- return NULL;
+ for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
+ objcg = rcu_dereference(memcg->objcg);
+ if (objcg && obj_cgroup_tryget(objcg))
+ break;
+ objcg = NULL;
}
-
- /* All other pages use page->mem_cgroup */
- return page->mem_cgroup;
+ return objcg;
}
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
@@ -2933,117 +3023,98 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg;
- if (unlikely(!current->mm && !current->active_memcg))
+ if (memcg_kmem_bypass())
return NULL;
rcu_read_lock();
- if (unlikely(current->active_memcg))
- memcg = rcu_dereference(current->active_memcg);
+ if (unlikely(active_memcg()))
+ memcg = active_memcg();
else
memcg = mem_cgroup_from_task(current);
-
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
- objcg = rcu_dereference(memcg->objcg);
- if (objcg && obj_cgroup_tryget(objcg))
- break;
- }
+ objcg = __get_obj_cgroup_from_memcg(memcg);
rcu_read_unlock();
-
return objcg;
}
-static int memcg_alloc_cache_id(void)
+struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
{
- int id, size;
- int err;
-
- id = ida_simple_get(&memcg_cache_ida,
- 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
- if (id < 0)
- return id;
-
- if (id < memcg_nr_cache_ids)
- return id;
-
- /*
- * There's no space for the new id in memcg_caches arrays,
- * so we have to grow them.
- */
- down_write(&memcg_cache_ids_sem);
+ struct obj_cgroup *objcg;
- size = 2 * (id + 1);
- if (size < MEMCG_CACHES_MIN_SIZE)
- size = MEMCG_CACHES_MIN_SIZE;
- else if (size > MEMCG_CACHES_MAX_SIZE)
- size = MEMCG_CACHES_MAX_SIZE;
+ if (!memcg_kmem_online())
+ return NULL;
- err = memcg_update_all_list_lrus(size);
- if (!err)
- memcg_nr_cache_ids = size;
+ if (PageMemcgKmem(page)) {
+ objcg = __folio_objcg(page_folio(page));
+ obj_cgroup_get(objcg);
+ } else {
+ struct mem_cgroup *memcg;
- up_write(&memcg_cache_ids_sem);
+ rcu_read_lock();
+ memcg = __folio_memcg(page_folio(page));
+ if (memcg)
+ objcg = __get_obj_cgroup_from_memcg(memcg);
+ else
+ objcg = NULL;
+ rcu_read_unlock();
+ }
+ return objcg;
+}
- if (err) {
- ida_simple_remove(&memcg_cache_ida, id);
- return err;
+static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
+{
+ mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+ if (nr_pages > 0)
+ page_counter_charge(&memcg->kmem, nr_pages);
+ else
+ page_counter_uncharge(&memcg->kmem, -nr_pages);
}
- return id;
}
-static void memcg_free_cache_id(int id)
+
+/*
+ * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from an objcg
+ * @objcg: object cgroup to uncharge
+ * @nr_pages: number of pages to uncharge
+ */
+static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
+ unsigned int nr_pages)
{
- ida_simple_remove(&memcg_cache_ida, id);
+ struct mem_cgroup *memcg;
+
+ memcg = get_mem_cgroup_from_objcg(objcg);
+
+ memcg_account_kmem(memcg, -nr_pages);
+ refill_stock(memcg, nr_pages);
+
+ css_put(&memcg->css);
}
-/**
- * __memcg_kmem_charge: charge a number of kernel pages to a memcg
- * @memcg: memory cgroup to charge
+/*
+ * obj_cgroup_charge_pages: charge a number of kernel pages to an objcg
+ * @objcg: object cgroup to charge
* @gfp: reclaim mode
* @nr_pages: number of pages to charge
*
* Returns 0 on success, an error code on failure.
*/
-int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
- unsigned int nr_pages)
+static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
+ unsigned int nr_pages)
{
- struct page_counter *counter;
+ struct mem_cgroup *memcg;
int ret;
- ret = try_charge(memcg, gfp, nr_pages);
- if (ret)
- return ret;
+ memcg = get_mem_cgroup_from_objcg(objcg);
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
- !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
-
- /*
- * Enforce __GFP_NOFAIL allocation because callers are not
- * prepared to see failures and likely do not have any failure
- * handling code.
- */
- if (gfp & __GFP_NOFAIL) {
- page_counter_charge(&memcg->kmem, nr_pages);
- return 0;
- }
- cancel_charge(memcg, nr_pages);
- return -ENOMEM;
- }
- return 0;
-}
+ ret = try_charge_memcg(memcg, gfp, nr_pages);
+ if (ret)
+ goto out;
-/**
- * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
- * @memcg: memcg to uncharge
- * @nr_pages: number of pages to uncharge
- */
-void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
-{
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
- page_counter_uncharge(&memcg->kmem, nr_pages);
+ memcg_account_kmem(memcg, nr_pages);
+out:
+ css_put(&memcg->css);
- page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
+ return ret;
}
/**
@@ -3056,22 +3127,19 @@ void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
*/
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
{
- struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
int ret = 0;
- if (memcg_kmem_bypass())
- return 0;
-
- memcg = get_mem_cgroup_from_current();
- if (!mem_cgroup_is_root(memcg)) {
- ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
+ objcg = get_obj_cgroup_from_current();
+ if (objcg) {
+ ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
if (!ret) {
- page->mem_cgroup = memcg;
- __SetPageKmemcg(page);
+ page->memcg_data = (unsigned long)objcg |
+ MEMCG_DATA_KMEM;
return 0;
}
+ obj_cgroup_put(objcg);
}
- css_put(&memcg->css);
return ret;
}
@@ -3082,20 +3150,83 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
*/
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
- struct mem_cgroup *memcg = page->mem_cgroup;
+ struct folio *folio = page_folio(page);
+ struct obj_cgroup *objcg;
unsigned int nr_pages = 1 << order;
- if (!memcg)
+ if (!folio_memcg_kmem(folio))
return;
- VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
- __memcg_kmem_uncharge(memcg, nr_pages);
- page->mem_cgroup = NULL;
- css_put(&memcg->css);
+ objcg = __folio_objcg(folio);
+ obj_cgroup_uncharge_pages(objcg, nr_pages);
+ folio->memcg_data = 0;
+ obj_cgroup_put(objcg);
+}
+
+void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
+{
+ struct memcg_stock_pcp *stock;
+ struct obj_cgroup *old = NULL;
+ unsigned long flags;
+ int *bytes;
+
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ stock = this_cpu_ptr(&memcg_stock);
+
+ /*
+ * Save vmstat data in stock and skip vmstat array update unless
+ * accumulating over a page of vmstat data or when pgdat or idx
+ * changes.
+ */
+ if (READ_ONCE(stock->cached_objcg) != objcg) {
+ old = drain_obj_stock(stock);
+ obj_cgroup_get(objcg);
+ stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
+ ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
+ WRITE_ONCE(stock->cached_objcg, objcg);
+ stock->cached_pgdat = pgdat;
+ } else if (stock->cached_pgdat != pgdat) {
+ /* Flush the existing cached vmstat data */
+ struct pglist_data *oldpg = stock->cached_pgdat;
+
+ if (stock->nr_slab_reclaimable_b) {
+ mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
+ stock->nr_slab_reclaimable_b);
+ stock->nr_slab_reclaimable_b = 0;
+ }
+ if (stock->nr_slab_unreclaimable_b) {
+ mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
+ stock->nr_slab_unreclaimable_b);
+ stock->nr_slab_unreclaimable_b = 0;
+ }
+ stock->cached_pgdat = pgdat;
+ }
+
+ bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
+ : &stock->nr_slab_unreclaimable_b;
+ /*
+ * Even for a large object >= PAGE_SIZE, the vmstat data will still be
+ * cached locally at least once before pushing it out.
+ */
+ if (!*bytes) {
+ *bytes = nr;
+ nr = 0;
+ } else {
+ *bytes += nr;
+ if (abs(*bytes) > PAGE_SIZE) {
+ nr = *bytes;
+ *bytes = 0;
+ } else {
+ nr = 0;
+ }
+ }
+ if (nr)
+ mod_objcg_mlstate(objcg, pgdat, idx, nr);
- /* slab pages do not have PageKmemcg flag set */
- if (PageKmemcg(page))
- __ClearPageKmemcg(page);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ if (old)
+ obj_cgroup_put(old);
}
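A standalone sketch of the batching rule above (illustration only; batch_vmstat_delta, cached_bytes and the 4096-byte page size are assumptions, not kernel code): per-cpu deltas are cached and only pushed to the shared vmstat counters once they exceed a page worth of bytes, and every delta is cached locally at least once.

#include <stdlib.h>

#define EXAMPLE_PAGE_SIZE 4096

static long cached_bytes;	/* stands in for stock->nr_slab_*_b on one cpu */

/* Returns the delta to apply to the shared counter now; 0 if still cached. */
static long batch_vmstat_delta(long nr)
{
	long flush = 0;

	if (!cached_bytes) {
		cached_bytes = nr;			/* cache at least once */
	} else {
		cached_bytes += nr;
		if (labs(cached_bytes) > EXAMPLE_PAGE_SIZE) {
			flush = cached_bytes;
			cached_bytes = 0;
		}
	}
	return flush;
}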
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
@@ -3104,34 +3235,39 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
unsigned long flags;
bool ret = false;
- local_irq_save(flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
- if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
+ if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
stock->nr_bytes -= nr_bytes;
ret = true;
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
return ret;
}
-static void drain_obj_stock(struct memcg_stock_pcp *stock)
+static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
{
- struct obj_cgroup *old = stock->cached_objcg;
+ struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);
if (!old)
- return;
+ return NULL;
if (stock->nr_bytes) {
unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
if (nr_pages) {
- rcu_read_lock();
- __memcg_kmem_uncharge(obj_cgroup_memcg(old), nr_pages);
- rcu_read_unlock();
+ struct mem_cgroup *memcg;
+
+ memcg = get_mem_cgroup_from_objcg(old);
+
+ memcg_account_kmem(memcg, -nr_pages);
+ __refill_stock(memcg, nr_pages);
+
+ css_put(&memcg->css);
}
/*
@@ -3148,17 +3284,41 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
stock->nr_bytes = 0;
}
- obj_cgroup_put(old);
- stock->cached_objcg = NULL;
+ /*
+ * Flush the vmstat data in current stock
+ */
+ if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
+ if (stock->nr_slab_reclaimable_b) {
+ mod_objcg_mlstate(old, stock->cached_pgdat,
+ NR_SLAB_RECLAIMABLE_B,
+ stock->nr_slab_reclaimable_b);
+ stock->nr_slab_reclaimable_b = 0;
+ }
+ if (stock->nr_slab_unreclaimable_b) {
+ mod_objcg_mlstate(old, stock->cached_pgdat,
+ NR_SLAB_UNRECLAIMABLE_B,
+ stock->nr_slab_unreclaimable_b);
+ stock->nr_slab_unreclaimable_b = 0;
+ }
+ stock->cached_pgdat = NULL;
+ }
+
+ WRITE_ONCE(stock->cached_objcg, NULL);
+ /*
+ * The `old' object needs to be released by the caller via
+ * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
+ */
+ return old;
}
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg)
{
+ struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
struct mem_cgroup *memcg;
- if (stock->cached_objcg) {
- memcg = obj_cgroup_memcg(stock->cached_objcg);
+ if (objcg) {
+ memcg = obj_cgroup_memcg(objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
return true;
}
@@ -3166,31 +3326,42 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
return false;
}
-static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
+ bool allow_uncharge)
{
struct memcg_stock_pcp *stock;
+ struct obj_cgroup *old = NULL;
unsigned long flags;
+ unsigned int nr_pages = 0;
- local_irq_save(flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
- if (stock->cached_objcg != objcg) { /* reset if necessary */
- drain_obj_stock(stock);
+ if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
+ old = drain_obj_stock(stock);
obj_cgroup_get(objcg);
- stock->cached_objcg = objcg;
- stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
+ WRITE_ONCE(stock->cached_objcg, objcg);
+ stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
+ ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
+ allow_uncharge = true; /* Allow uncharge when objcg changes */
}
stock->nr_bytes += nr_bytes;
- if (stock->nr_bytes > PAGE_SIZE)
- drain_obj_stock(stock);
+ if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
+ nr_pages = stock->nr_bytes >> PAGE_SHIFT;
+ stock->nr_bytes &= (PAGE_SIZE - 1);
+ }
- local_irq_restore(flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ if (old)
+ obj_cgroup_put(old);
+
+ if (nr_pages)
+ obj_cgroup_uncharge_pages(objcg, nr_pages);
}
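A small worked example of the trimming above (illustration only, PAGE_SIZE == 4096 assumed):

/*
 * stock->nr_bytes == 4000, then refill_obj_stock(objcg, 500, true):
 *   nr_bytes becomes 4500 > PAGE_SIZE, so nr_pages = 4500 >> 12 = 1 is
 *   uncharged via obj_cgroup_uncharge_pages() and nr_bytes is trimmed to
 *   4500 & 4095 = 404.
 * With allow_uncharge == false (a fresh pre-charge from obj_cgroup_charge())
 *   the full 4500 bytes stay in the stock for now.
 */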
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
{
- struct mem_cgroup *memcg;
unsigned int nr_pages, nr_bytes;
int ret;
@@ -3198,63 +3369,70 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
return 0;
/*
- * In theory, memcg->nr_charged_bytes can have enough
+ * In theory, objcg->nr_charged_bytes can have enough
* pre-charged bytes to satisfy the allocation. However,
- * flushing memcg->nr_charged_bytes requires two atomic
- * operations, and memcg->nr_charged_bytes can't be big,
- * so it's better to ignore it and try grab some new pages.
- * memcg->nr_charged_bytes will be flushed in
- * refill_obj_stock(), called from this function or
- * independently later.
+ * flushing objcg->nr_charged_bytes requires two atomic
+ * operations, and objcg->nr_charged_bytes can't be big.
+ * The shared objcg->nr_charged_bytes can also become a
+ * performance bottleneck if all tasks of the same memcg are
+ * trying to update it. So it's better to ignore it and try to
+ * grab some new pages. The stock's nr_bytes will be flushed to
+ * objcg->nr_charged_bytes later on when objcg changes.
+ *
+ * The stock's nr_bytes may contain enough pre-charged bytes
+ * to allow one less page to be charged, but we can't rely
+ * on the pre-charged bytes not being changed outside of
+ * consume_obj_stock() or refill_obj_stock(). So ignore those
+ * pre-charged bytes as well when charging pages. To avoid a
+ * page uncharge right after a page charge, we set the
+ * allow_uncharge flag to false when calling refill_obj_stock()
+ * to temporarily allow the pre-charged bytes to exceed the page
+ * size limit. The maximum reachable value of the pre-charged
+ * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data
+ * race.
*/
- rcu_read_lock();
- memcg = obj_cgroup_memcg(objcg);
- css_get(&memcg->css);
- rcu_read_unlock();
-
nr_pages = size >> PAGE_SHIFT;
nr_bytes = size & (PAGE_SIZE - 1);
if (nr_bytes)
nr_pages += 1;
- ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
+ ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
if (!ret && nr_bytes)
- refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
+ refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
- css_put(&memcg->css);
return ret;
}
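A worked example of the page/byte split above, assuming PAGE_SIZE == 4096 (illustration only, not part of the patch):

/*
 * obj_cgroup_charge(objcg, gfp, 700):
 *   nr_pages = 700 >> PAGE_SHIFT = 0 and nr_bytes = 700, so nr_pages is
 *   bumped to 1; one full page is charged via obj_cgroup_charge_pages(),
 *   then refill_obj_stock(objcg, 4096 - 700 = 3396, false) keeps the unused
 *   3396 bytes pre-charged in the per-cpu stock for later sub-page charges.
 */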
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
{
- refill_obj_stock(objcg, size);
+ refill_obj_stock(objcg, size, true);
}
#endif /* CONFIG_MEMCG_KMEM */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
/*
- * Because tail pages are not marked as "used", set it. We're under
- * pgdat->lru_lock and migration entries setup in all page mappings.
+ * Because page_memcg(head) is not set on tails, set it now.
*/
-void mem_cgroup_split_huge_fixup(struct page *head)
+void split_page_memcg(struct page *head, unsigned int nr)
{
- struct mem_cgroup *memcg = head->mem_cgroup;
+ struct folio *folio = page_folio(head);
+ struct mem_cgroup *memcg = folio_memcg(folio);
int i;
- if (mem_cgroup_disabled())
+ if (mem_cgroup_disabled() || !memcg)
return;
- for (i = 1; i < HPAGE_PMD_NR; i++) {
- css_get(&memcg->css);
- head[i].mem_cgroup = memcg;
- }
+ for (i = 1; i < nr; i++)
+ folio_page(folio, i)->memcg_data = folio->memcg_data;
+
+ if (folio_memcg_kmem(folio))
+ obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
+ else
+ css_get_many(&memcg->css, nr - 1);
}
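Illustration only, assuming a configuration with 4K base pages where HPAGE_PMD_NR == 512: splitting a PMD-sized THP duplicates the head's memcg_data into every tail and takes matching references.

/*
 * split_page_memcg(head, 512):
 *   folio_page(folio, 1..511)->memcg_data = folio->memcg_data;
 *   css_get_many(&memcg->css, 511);   (or obj_cgroup_get_many() for kmem)
 */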
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_SWAP
/**
* mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
* @entry: swap entry to be moved
@@ -3335,8 +3513,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
continue;
}
- if (!try_to_free_mem_cgroup_pages(memcg, 1,
- GFP_KERNEL, !memsw)) {
+ if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
+ memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
ret = -EBUSY;
break;
}
@@ -3358,12 +3536,14 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
int loop = 0;
struct mem_cgroup_tree_per_node *mctz;
unsigned long excess;
- unsigned long nr_scanned;
+
+ if (lru_gen_enabled())
+ return 0;
if (order > 0)
return 0;
- mctz = soft_limit_tree_node(pgdat->node_id);
+ mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
/*
* Do not even bother to check the largest node if the root
@@ -3386,13 +3566,10 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
if (!mz)
break;
- nr_scanned = 0;
reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
- gfp_mask, &nr_scanned);
+ gfp_mask, total_scanned);
nr_reclaimed += reclaimed;
- *total_scanned += nr_scanned;
spin_lock_irq(&mctz->lock);
- __mem_cgroup_remove_exceeded(mz, mctz);
/*
* If we failed to reclaim anything from this memory cgroup
@@ -3432,22 +3609,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
}
/*
- * Test whether @memcg has children, dead or alive. Note that this
- * function doesn't care whether @memcg has use_hierarchy enabled and
- * returns %true if there are child csses according to the cgroup
- * hierarchy. Testing use_hierarchy is the caller's responsibility.
- */
-static inline bool memcg_has_children(struct mem_cgroup *memcg)
-{
- bool ret;
-
- rcu_read_lock();
- ret = css_next_child(NULL, &memcg->css);
- rcu_read_unlock();
- return ret;
-}
-
-/*
* Reclaims as many pages from the given memcg as possible.
*
* Caller is responsible for holding css reference for memcg.
@@ -3463,19 +3624,12 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
/* try to free all pages in this cgroup */
while (nr_retries && page_counter_read(&memcg->memory)) {
- int progress;
-
if (signal_pending(current))
return -EINTR;
- progress = try_to_free_mem_cgroup_pages(memcg, 1,
- GFP_KERNEL, true);
- if (!progress) {
+ if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
+ MEMCG_RECLAIM_MAY_SWAP))
nr_retries--;
- /* maybe some writeback is necessary */
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- }
-
}
return 0;
@@ -3495,37 +3649,20 @@ static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- return mem_cgroup_from_css(css)->use_hierarchy;
+ return 1;
}
static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
- int retval = 0;
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
-
- if (memcg->use_hierarchy == val)
+ if (val == 1)
return 0;
- /*
- * If parent's use_hierarchy is set, we can't make any modifications
- * in the child subtrees. If it is unset, then the change can
- * occur, provided the current cgroup has no children.
- *
- * For the root cgroup, parent_mem is NULL, we allow value to be
- * set if there are no children.
- */
- if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
- (val == 1 || val == 0)) {
- if (!memcg_has_children(memcg))
- memcg->use_hierarchy = val;
- else
- retval = -EBUSY;
- } else
- retval = -EINVAL;
+ pr_warn_once("Non-hierarchical mode is deprecated. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
- return retval;
+ return -EINVAL;
}
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
@@ -3533,10 +3670,14 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
unsigned long val;
if (mem_cgroup_is_root(memcg)) {
- val = memcg_page_state(memcg, NR_FILE_PAGES) +
- memcg_page_state(memcg, NR_ANON_MAPPED);
+ /*
+ * Approximate root's usage from global state. This isn't
+ * perfect, but the root usage was always an approximation.
+ */
+ val = global_node_page_state(NR_FILE_PAGES) +
+ global_node_page_state(NR_ANON_MAPPED);
if (swap)
- val += memcg_page_state(memcg, MEMCG_SWAP);
+ val += total_swap_pages - get_nr_swap_pages();
} else {
if (!swap)
val = page_counter_read(&memcg->memory);
@@ -3591,111 +3732,56 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
case RES_FAILCNT:
return counter->failcnt;
case RES_SOFT_LIMIT:
- return (u64)memcg->soft_limit * PAGE_SIZE;
+ return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE;
default:
BUG();
}
}
-static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
-{
- unsigned long stat[MEMCG_NR_STAT] = {0};
- struct mem_cgroup *mi;
- int node, cpu, i;
-
- for_each_online_cpu(cpu)
- for (i = 0; i < MEMCG_NR_STAT; i++)
- stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
-
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- for (i = 0; i < MEMCG_NR_STAT; i++)
- atomic_long_add(stat[i], &mi->vmstats[i]);
-
- for_each_node(node) {
- struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
- struct mem_cgroup_per_node *pi;
-
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- stat[i] = 0;
-
- for_each_online_cpu(cpu)
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- stat[i] += per_cpu(
- pn->lruvec_stat_cpu->count[i], cpu);
-
- for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- atomic_long_add(stat[i], &pi->lruvec_stat[i]);
- }
-}
-
-static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
+/*
+ * This function doesn't do anything useful. Its only job is to provide a read
+ * handler for a file so that cgroup_file_mode() will add read permissions.
+ */
+static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m,
+ __always_unused void *v)
{
- unsigned long events[NR_VM_EVENT_ITEMS];
- struct mem_cgroup *mi;
- int cpu, i;
-
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- events[i] = 0;
-
- for_each_online_cpu(cpu)
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- events[i] += per_cpu(memcg->vmstats_percpu->events[i],
- cpu);
-
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- atomic_long_add(events[i], &mi->vmevents[i]);
+ return -EINVAL;
}
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
struct obj_cgroup *objcg;
- int memcg_id;
- if (cgroup_memory_nokmem)
+ if (mem_cgroup_kmem_disabled())
return 0;
- BUG_ON(memcg->kmemcg_id >= 0);
- BUG_ON(memcg->kmem_state);
-
- memcg_id = memcg_alloc_cache_id();
- if (memcg_id < 0)
- return memcg_id;
+ if (unlikely(mem_cgroup_is_root(memcg)))
+ return 0;
objcg = obj_cgroup_alloc();
- if (!objcg) {
- memcg_free_cache_id(memcg_id);
+ if (!objcg)
return -ENOMEM;
- }
+
objcg->memcg = memcg;
rcu_assign_pointer(memcg->objcg, objcg);
- static_branch_enable(&memcg_kmem_enabled_key);
+ static_branch_enable(&memcg_kmem_online_key);
- /*
- * A memory cgroup is considered kmem-online as soon as it gets
- * kmemcg_id. Setting the id after enabling static branching will
- * guarantee no one starts accounting before all call sites are
- * patched.
- */
- memcg->kmemcg_id = memcg_id;
- memcg->kmem_state = KMEM_ONLINE;
+ memcg->kmemcg_id = memcg->id.id;
return 0;
}
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
- struct cgroup_subsys_state *css;
- struct mem_cgroup *parent, *child;
- int kmemcg_id;
+ struct mem_cgroup *parent;
- if (memcg->kmem_state != KMEM_ONLINE)
+ if (mem_cgroup_kmem_disabled())
return;
- memcg->kmem_state = KMEM_ALLOCATED;
+ if (unlikely(mem_cgroup_is_root(memcg)))
+ return;
parent = parent_mem_cgroup(memcg);
if (!parent)
@@ -3703,37 +3789,13 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
memcg_reparent_objcgs(memcg, parent);
- kmemcg_id = memcg->kmemcg_id;
- BUG_ON(kmemcg_id < 0);
-
/*
- * Change kmemcg_id of this cgroup and all its descendants to the
- * parent's id, and then move all entries from this cgroup's list_lrus
- * to ones of the parent. After we have finished, all list_lrus
- * corresponding to this cgroup are guaranteed to remain empty. The
- * ordering is imposed by list_lru_node->lock taken by
- * memcg_drain_all_list_lrus().
+ * After we have finished memcg_reparent_objcgs(), all list_lrus
+ * corresponding to this cgroup are guaranteed to remain empty.
+ * The ordering is imposed by list_lru_node->lock taken by
+ * memcg_reparent_list_lrus().
*/
- rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
- css_for_each_descendant_pre(css, &memcg->css) {
- child = mem_cgroup_from_css(css);
- BUG_ON(child->kmemcg_id != kmemcg_id);
- child->kmemcg_id = parent->kmemcg_id;
- if (!memcg->use_hierarchy)
- break;
- }
- rcu_read_unlock();
-
- memcg_drain_all_list_lrus(kmemcg_id, parent);
-
- memcg_free_cache_id(kmemcg_id);
-}
-
-static void memcg_free_kmem(struct mem_cgroup *memcg)
-{
- /* css_alloc() failed, offlining didn't happen */
- if (unlikely(memcg->kmem_state == KMEM_ONLINE))
- memcg_offline_kmem(memcg);
+ memcg_reparent_list_lrus(memcg, parent);
}
#else
static int memcg_online_kmem(struct mem_cgroup *memcg)
@@ -3743,22 +3805,8 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
}
-static void memcg_free_kmem(struct mem_cgroup *memcg)
-{
-}
#endif /* CONFIG_MEMCG_KMEM */
-static int memcg_update_kmem_max(struct mem_cgroup *memcg,
- unsigned long max)
-{
- int ret;
-
- mutex_lock(&memcg_max_mutex);
- ret = page_counter_set_max(&memcg->kmem, max);
- mutex_unlock(&memcg_max_mutex);
- return ret;
-}
-
static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
{
int ret;
@@ -3824,10 +3872,8 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
ret = mem_cgroup_resize_max(memcg, nr_pages, true);
break;
case _KMEM:
- pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
- "Please report your usecase to linux-mm@kvack.org if you "
- "depend on this functionality.\n");
- ret = memcg_update_kmem_max(memcg, nr_pages);
+ /* kmem.limit_in_bytes is deprecated. */
+ ret = -EOPNOTSUPP;
break;
case _TCP:
ret = memcg_update_tcp_max(memcg, nr_pages);
@@ -3835,8 +3881,12 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
}
break;
case RES_SOFT_LIMIT:
- memcg->soft_limit = nr_pages;
- ret = 0;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ ret = -EOPNOTSUPP;
+ } else {
+ WRITE_ONCE(memcg->soft_limit, nr_pages);
+ ret = 0;
+ }
break;
}
return ret ?: nbytes;
@@ -3891,6 +3941,10 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
+
if (val & ~MOVE_MASK)
return -EINVAL;
@@ -3972,6 +4026,8 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
int nid;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ mem_cgroup_flush_stats();
+
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
seq_printf(m, "%s=%lu", stat->name,
mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
@@ -4009,6 +4065,8 @@ static const unsigned int memcg1_stats[] = {
NR_FILE_MAPPED,
NR_FILE_DIRTY,
NR_WRITEBACK,
+ WORKINGSET_REFAULT_ANON,
+ WORKINGSET_REFAULT_FILE,
MEMCG_SWAP,
};
@@ -4022,6 +4080,8 @@ static const char *const memcg1_stat_names[] = {
"mapped_file",
"dirty",
"writeback",
+ "workingset_refault_anon",
+ "workingset_refault_file",
"swap",
};
@@ -4033,36 +4093,34 @@ static const unsigned int memcg1_events[] = {
PGMAJFAULT,
};
-static int memcg_stat_show(struct seq_file *m, void *v)
+static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
- struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
unsigned long memory, memsw;
struct mem_cgroup *mi;
unsigned int i;
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
+ mem_cgroup_flush_stats();
+
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
unsigned long nr;
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
nr = memcg_page_state_local(memcg, memcg1_stats[i]);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (memcg1_stats[i] == NR_ANON_THPS)
- nr *= HPAGE_PMD_NR;
-#endif
- seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
+ seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i],
+ nr * memcg_page_state_unit(memcg1_stats[i]));
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
- seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
- memcg_events_local(memcg, memcg1_events[i]));
+ seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]),
+ memcg_events_local(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "%s %lu\n", lru_list_name(i),
- memcg_page_state_local(memcg, NR_LRU_BASE + i) *
- PAGE_SIZE);
+ seq_buf_printf(s, "%s %lu\n", lru_list_name(i),
+ memcg_page_state_local(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
/* Hierarchical information */
memory = memsw = PAGE_COUNTER_MAX;
@@ -4070,29 +4128,31 @@ static int memcg_stat_show(struct seq_file *m, void *v)
memory = min(memory, READ_ONCE(mi->memory.max));
memsw = min(memsw, READ_ONCE(mi->memsw.max));
}
- seq_printf(m, "hierarchical_memory_limit %llu\n",
- (u64)memory * PAGE_SIZE);
+ seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
+ (u64)memory * PAGE_SIZE);
if (do_memsw_account())
- seq_printf(m, "hierarchical_memsw_limit %llu\n",
- (u64)memsw * PAGE_SIZE);
+ seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
+ (u64)memsw * PAGE_SIZE);
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
+ unsigned long nr;
+
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
- seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
- (u64)memcg_page_state(memcg, memcg1_stats[i]) *
- PAGE_SIZE);
+ nr = memcg_page_state(memcg, memcg1_stats[i]);
+ seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
+ (u64)nr * memcg_page_state_unit(memcg1_stats[i]));
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
- seq_printf(m, "total_%s %llu\n",
- vm_event_name(memcg1_events[i]),
- (u64)memcg_events(memcg, memcg1_events[i]));
+ seq_buf_printf(s, "total_%s %llu\n",
+ vm_event_name(memcg1_events[i]),
+ (u64)memcg_events(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "total_%s %llu\n", lru_list_name(i),
- (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
- PAGE_SIZE);
+ seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i),
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
#ifdef CONFIG_DEBUG_VM
{
@@ -4102,17 +4162,15 @@ static int memcg_stat_show(struct seq_file *m, void *v)
unsigned long file_cost = 0;
for_each_online_pgdat(pgdat) {
- mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+ mz = memcg->nodeinfo[pgdat->node_id];
anon_cost += mz->lruvec.anon_cost;
file_cost += mz->lruvec.file_cost;
}
- seq_printf(m, "anon_cost %lu\n", anon_cost);
- seq_printf(m, "file_cost %lu\n", file_cost);
+ seq_buf_printf(s, "anon_cost %lu\n", anon_cost);
+ seq_buf_printf(s, "file_cost %lu\n", file_cost);
}
#endif
-
- return 0;
}
static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
@@ -4128,13 +4186,13 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- if (val > 100)
+ if (val > 200)
return -EINVAL;
- if (css->parent)
- memcg->swappiness = val;
+ if (!mem_cgroup_is_root(memcg))
+ WRITE_ONCE(memcg->swappiness, val);
else
- vm_swappiness = val;
+ WRITE_ONCE(vm_swappiness, val);
return 0;
}
@@ -4468,7 +4526,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
- seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
+ seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable));
seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
seq_printf(sf, "oom_kill %lu\n",
atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
@@ -4481,10 +4539,10 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
/* cannot set to root cgroup and only 0 and 1 are allowed */
- if (!css->parent || !((val == 0) || (val == 1)))
+ if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
return -EINVAL;
- memcg->oom_kill_disable = val;
+ WRITE_ONCE(memcg->oom_kill_disable, val);
if (!val)
memcg_oom_recover(memcg);
@@ -4520,22 +4578,6 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
return &memcg->cgwb_domain;
}
-/*
- * idx can be of type enum memcg_stat_item or node_stat_item.
- * Keep in sync with memcg_exact_page().
- */
-static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
-{
- long x = atomic_long_read(&memcg->vmstats[idx]);
- int cpu;
-
- for_each_online_cpu(cpu)
- x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
- if (x < 0)
- x = 0;
- return x;
-}
-
/**
* mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
* @wb: bdi_writeback in question
@@ -4561,13 +4603,14 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
+ mem_cgroup_flush_stats();
- *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
- *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
- memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
- *pheadroom = PAGE_COUNTER_MAX;
+ *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
+ *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
+ *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
+ memcg_page_state(memcg, NR_ACTIVE_FILE);
+ *pheadroom = PAGE_COUNTER_MAX;
while ((parent = parent_mem_cgroup(memcg))) {
unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
READ_ONCE(memcg->memory.high));
@@ -4582,7 +4625,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
* Foreign dirty flushing
*
* There's an inherent mismatch between memcg and writeback. The former
- * trackes ownership per-page while the latter per-inode. This was a
+ * tracks ownership per-page while the latter per-inode. This was a
* deliberate design decision because honoring per-page ownership in the
* writeback path is complicated, may lead to higher CPU and IO overheads
* and deemed unnecessary given that write-sharing an inode across
@@ -4597,9 +4640,9 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
* triggering background writeback. A will be slowed down without a way to
* make writeback of the dirty pages happen.
*
- * Conditions like the above can lead to a cgroup getting repatedly and
+ * Conditions like the above can lead to a cgroup getting repeatedly and
* severely throttled after making some progress after each
- * dirty_expire_interval while the underyling IO device is almost
+ * dirty_expire_interval while the underlying IO device is almost
* completely idle.
*
* Solving this problem completely requires matching the ownership tracking
@@ -4622,17 +4665,17 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
* As being wrong occasionally doesn't matter, updates and accesses to the
* records are lockless and racy.
*/
-void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
struct bdi_writeback *wb)
{
- struct mem_cgroup *memcg = page->mem_cgroup;
+ struct mem_cgroup *memcg = folio_memcg(folio);
struct memcg_cgwb_frn *frn;
u64 now = get_jiffies_64();
u64 oldest_at = now;
int oldest = -1;
int i;
- trace_track_foreign_dirty(page, wb);
+ trace_track_foreign_dirty(folio, wb);
/*
* Pick the slot to use. If there is already a slot for @wb, keep
@@ -4695,7 +4738,7 @@ void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
atomic_read(&frn->done.cnt) == 1) {
frn->at = 0;
trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
- cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
+ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id,
WB_REASON_FOREIGN_FLUSH,
&frn->done);
}
@@ -4821,10 +4864,14 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
unsigned int efd, cfd;
struct fd efile;
struct fd cfile;
+ struct dentry *cdentry;
const char *name;
char *endp;
int ret;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return -EOPNOTSUPP;
+
buf = strstrip(buf);
efd = simple_strtoul(buf, &endp, 10);
@@ -4867,11 +4914,21 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
/* the process need read permission on control file */
/* AV: shouldn't we check that it's been opened for read instead? */
- ret = inode_permission(file_inode(cfile.file), MAY_READ);
+ ret = file_permission(cfile.file, MAY_READ);
if (ret < 0)
goto out_put_cfile;
/*
+ * The control file must be a regular cgroup1 file. As a regular cgroup
+ * file can't be renamed, it's safe to access its name afterwards.
+ */
+ cdentry = cfile.file->f_path.dentry;
+ if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
+ ret = -EINVAL;
+ goto out_put_cfile;
+ }
+
+ /*
* Determine the event callbacks and set them in @event. This used
* to be done via struct cftype but cgroup core no longer knows
* about these events. The following is crude but the whole thing
@@ -4879,7 +4936,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
*
* DO NOT ADD NEW FILES.
*/
- name = cfile.file->f_path.dentry->d_name.name;
+ name = cdentry->d_name.name;
if (!strcmp(name, "memory.usage_in_bytes")) {
event->register_event = mem_cgroup_usage_register_event;
@@ -4903,7 +4960,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
* automatically removed on cgroup destruction but the removal is
* asynchronous, so take an extra ref on @css.
*/
- cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
+ cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
&memory_cgrp_subsys);
ret = -EINVAL;
if (IS_ERR(cfile_css))
@@ -4919,9 +4976,9 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
vfs_poll(efile.file, &event->pt);
- spin_lock(&memcg->event_list_lock);
+ spin_lock_irq(&memcg->event_list_lock);
list_add(&event->list, &memcg->event_list);
- spin_unlock(&memcg->event_list_lock);
+ spin_unlock_irq(&memcg->event_list_lock);
fdput(cfile);
fdput(efile);
@@ -4942,6 +4999,19 @@ out_kfree:
return ret;
}
+#if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
+static int mem_cgroup_slab_show(struct seq_file *m, void *p)
+{
+ /*
+ * Deprecated.
+ * Please take a look at tools/cgroup/memcg_slabinfo.py.
+ */
+ return 0;
+}
+#endif
+
+static int memory_stat_show(struct seq_file *m, void *v);
+
static struct cftype mem_cgroup_legacy_files[] = {
{
.name = "usage_in_bytes",
@@ -4974,7 +5044,7 @@ static struct cftype mem_cgroup_legacy_files[] = {
},
{
.name = "stat",
- .seq_show = memcg_stat_show,
+ .seq_show = memory_stat_show,
},
{
.name = "force_empty",
@@ -5004,10 +5074,10 @@ static struct cftype mem_cgroup_legacy_files[] = {
.name = "oom_control",
.seq_show = mem_cgroup_oom_control_read,
.write_u64 = mem_cgroup_oom_control_write,
- .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
},
{
.name = "pressure_level",
+ .seq_show = mem_cgroup_dummy_seq_show,
},
#ifdef CONFIG_NUMA
{
@@ -5042,7 +5112,7 @@ static struct cftype mem_cgroup_legacy_files[] = {
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
{
.name = "kmem.slabinfo",
- .seq_show = memcg_slab_show,
+ .seq_show = mem_cgroup_slab_show,
},
#endif
{
@@ -5138,42 +5208,45 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
return idr_find(&mem_cgroup_idr, id);
}
+#ifdef CONFIG_SHRINKER_DEBUG
+struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
+{
+ struct cgroup *cgrp;
+ struct cgroup_subsys_state *css;
+ struct mem_cgroup *memcg;
+
+ cgrp = cgroup_get_from_id(ino);
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
+
+ css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
+ if (css)
+ memcg = container_of(css, struct mem_cgroup, css);
+ else
+ memcg = ERR_PTR(-ENOENT);
+
+ cgroup_put(cgrp);
+
+ return memcg;
+}
+#endif
+
static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
- int tmp = node;
- /*
- * This routine is called against possible nodes.
- * But it's BUG to call kmalloc() against offline node.
- *
- * TODO: this routine can waste much memory for nodes which will
- * never be onlined. It's better to use memory hotplug callback
- * function.
- */
- if (!node_state(node, N_NORMAL_MEMORY))
- tmp = -1;
- pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
- if (!pn)
- return 1;
- pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
- GFP_KERNEL_ACCOUNT);
- if (!pn->lruvec_stat_local) {
- kfree(pn);
+ pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
+ if (!pn)
return 1;
- }
- pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
- GFP_KERNEL_ACCOUNT);
- if (!pn->lruvec_stat_cpu) {
- free_percpu(pn->lruvec_stat_local);
+ pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
+ GFP_KERNEL_ACCOUNT);
+ if (!pn->lruvec_stats_percpu) {
kfree(pn);
return 1;
}
lruvec_init(&pn->lruvec);
- pn->usage_in_excess = 0;
- pn->on_tree = false;
pn->memcg = memcg;
memcg->nodeinfo[node] = pn;
@@ -5187,8 +5260,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return;
- free_percpu(pn->lruvec_stat_cpu);
- free_percpu(pn->lruvec_stat_local);
+ free_percpu(pn->lruvec_stats_percpu);
kfree(pn);
}
@@ -5204,49 +5276,38 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
memcg_flush_percpu_vmevents(memcg);
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
+ kfree(memcg->vmstats);
free_percpu(memcg->vmstats_percpu);
- free_percpu(memcg->vmstats_local);
kfree(memcg);
}
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
+ lru_gen_exit_memcg(memcg);
memcg_wb_domain_exit(memcg);
- /*
- * Flush percpu vmstats and vmevents to guarantee the value correctness
- * on parent's and all ancestor levels.
- */
- memcg_flush_percpu_vmstats(memcg);
- memcg_flush_percpu_vmevents(memcg);
__mem_cgroup_free(memcg);
}
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
- unsigned int size;
int node;
int __maybe_unused i;
long error = -ENOMEM;
- size = sizeof(struct mem_cgroup);
- size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
-
- memcg = kzalloc(size, GFP_KERNEL);
+ memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
if (!memcg)
return ERR_PTR(error);
memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
- 1, MEM_CGROUP_ID_MAX,
- GFP_KERNEL);
+ 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL);
if (memcg->id.id < 0) {
error = memcg->id.id;
goto fail;
}
- memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
- GFP_KERNEL_ACCOUNT);
- if (!memcg->vmstats_local)
+ memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL);
+ if (!memcg->vmstats)
goto fail;
memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
@@ -5284,7 +5345,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
memcg->deferred_split_queue.split_queue_len = 0;
#endif
- idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
+ lru_gen_init_memcg(memcg);
return memcg;
fail:
mem_cgroup_id_remove(memcg);
@@ -5296,80 +5357,92 @@ static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
- struct mem_cgroup *memcg;
- long error = -ENOMEM;
+ struct mem_cgroup *memcg, *old_memcg;
- memalloc_use_memcg(parent);
+ old_memcg = set_active_memcg(parent);
memcg = mem_cgroup_alloc();
- memalloc_unuse_memcg();
+ set_active_memcg(old_memcg);
if (IS_ERR(memcg))
return ERR_CAST(memcg);
page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
- memcg->soft_limit = PAGE_COUNTER_MAX;
+ WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+ memcg->zswap_max = PAGE_COUNTER_MAX;
+#endif
page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
if (parent) {
- memcg->swappiness = mem_cgroup_swappiness(parent);
- memcg->oom_kill_disable = parent->oom_kill_disable;
- }
- if (parent && parent->use_hierarchy) {
- memcg->use_hierarchy = true;
+ WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
+ WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
+
page_counter_init(&memcg->memory, &parent->memory);
page_counter_init(&memcg->swap, &parent->swap);
page_counter_init(&memcg->kmem, &parent->kmem);
page_counter_init(&memcg->tcpmem, &parent->tcpmem);
} else {
+ init_memcg_events();
page_counter_init(&memcg->memory, NULL);
page_counter_init(&memcg->swap, NULL);
page_counter_init(&memcg->kmem, NULL);
page_counter_init(&memcg->tcpmem, NULL);
- /*
- * Deeper hierachy with use_hierarchy == false doesn't make
- * much sense so let cgroup subsystem know about this
- * unfortunate state in our controller.
- */
- if (parent != root_mem_cgroup)
- memory_cgrp_subsys.broken_hierarchy = true;
- }
- /* The following stuff does not apply to the root */
- if (!parent) {
root_mem_cgroup = memcg;
return &memcg->css;
}
- error = memcg_online_kmem(memcg);
- if (error)
- goto fail;
-
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_inc(&memcg_sockets_enabled_key);
+#if defined(CONFIG_MEMCG_KMEM)
+ if (!cgroup_memory_nobpf)
+ static_branch_inc(&memcg_bpf_enabled_key);
+#endif
+
return &memcg->css;
-fail:
- mem_cgroup_id_remove(memcg);
- mem_cgroup_free(memcg);
- return ERR_PTR(error);
}
static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ if (memcg_online_kmem(memcg))
+ goto remove_id;
+
/*
- * A memcg must be visible for memcg_expand_shrinker_maps()
+ * A memcg must be visible for expand_shrinker_info()
* by the time the maps are allocated. So, we allocate maps
* here, when for_each_mem_cgroup() can't skip it.
*/
- if (memcg_alloc_shrinker_maps(memcg)) {
- mem_cgroup_id_remove(memcg);
- return -ENOMEM;
- }
+ if (alloc_shrinker_info(memcg))
+ goto offline_kmem;
+
+ if (unlikely(mem_cgroup_is_root(memcg)))
+ queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
+ FLUSH_TIME);
+ lru_gen_online_memcg(memcg);
/* Online state pins memcg ID, memcg ID pins CSS */
refcount_set(&memcg->id.ref, 1);
css_get(css);
+
+ /*
+ * Ensure mem_cgroup_from_id() works once we're fully online.
+ *
+ * We could do this earlier and require callers to filter with
+ * css_tryget_online(). But right now there are no users that
+ * need earlier access, and the workingset code relies on the
+ * cgroup tree linkage (mem_cgroup_get_nr_swap_pages()). So
+ * publish it here at the end of onlining. This matches the
+ * regular ID destruction during offlining.
+ */
+ idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
+
return 0;
+offline_kmem:
+ memcg_offline_kmem(memcg);
+remove_id:
+ mem_cgroup_id_remove(memcg);
+ return -ENOMEM;
}
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
@@ -5382,18 +5455,20 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
* Notify userspace about cgroup removing only after rmdir of cgroup
* directory to avoid race between userspace and kernelspace.
*/
- spin_lock(&memcg->event_list_lock);
+ spin_lock_irq(&memcg->event_list_lock);
list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
list_del_init(&event->list);
schedule_work(&event->remove);
}
- spin_unlock(&memcg->event_list_lock);
+ spin_unlock_irq(&memcg->event_list_lock);
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
memcg_offline_kmem(memcg);
+ reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
+ lru_gen_offline_memcg(memcg);
drain_all_stock(memcg);
@@ -5405,6 +5480,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
invalidate_reclaim_iterators(memcg);
+ lru_gen_release_memcg(memcg);
}
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -5422,11 +5498,15 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
static_branch_dec(&memcg_sockets_enabled_key);
+#if defined(CONFIG_MEMCG_KMEM)
+ if (!cgroup_memory_nobpf)
+ static_branch_dec(&memcg_bpf_enabled_key);
+#endif
+
vmpressure_cleanup(&memcg->vmpressure);
cancel_work_sync(&memcg->high_work);
mem_cgroup_remove_from_trees(memcg);
- memcg_free_shrinker_maps(memcg);
- memcg_free_kmem(memcg);
+ free_shrinker_info(memcg);
mem_cgroup_free(memcg);
}
@@ -5454,11 +5534,97 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
- memcg->soft_limit = PAGE_COUNTER_MAX;
+ WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
memcg_wb_domain_size_changed(memcg);
}
+static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ struct memcg_vmstats_percpu *statc;
+ long delta, v;
+ int i, nid;
+
+ statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
+
+ for (i = 0; i < MEMCG_NR_STAT; i++) {
+ /*
+ * Collect the aggregated propagation counts of groups
+ * below us. We're in a per-cpu loop here and this is
+ * a global counter, so the first cycle will get them.
+ */
+ delta = memcg->vmstats->state_pending[i];
+ if (delta)
+ memcg->vmstats->state_pending[i] = 0;
+
+ /* Add CPU changes on this level since the last flush */
+ v = READ_ONCE(statc->state[i]);
+ if (v != statc->state_prev[i]) {
+ delta += v - statc->state_prev[i];
+ statc->state_prev[i] = v;
+ }
+
+ if (!delta)
+ continue;
+
+ /* Aggregate counts on this level and propagate upwards */
+ memcg->vmstats->state[i] += delta;
+ if (parent)
+ parent->vmstats->state_pending[i] += delta;
+ }
+
+ for (i = 0; i < NR_MEMCG_EVENTS; i++) {
+ delta = memcg->vmstats->events_pending[i];
+ if (delta)
+ memcg->vmstats->events_pending[i] = 0;
+
+ v = READ_ONCE(statc->events[i]);
+ if (v != statc->events_prev[i]) {
+ delta += v - statc->events_prev[i];
+ statc->events_prev[i] = v;
+ }
+
+ if (!delta)
+ continue;
+
+ memcg->vmstats->events[i] += delta;
+ if (parent)
+ parent->vmstats->events_pending[i] += delta;
+ }
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+ struct mem_cgroup_per_node *ppn = NULL;
+ struct lruvec_stats_percpu *lstatc;
+
+ if (parent)
+ ppn = parent->nodeinfo[nid];
+
+ lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ delta = pn->lruvec_stats.state_pending[i];
+ if (delta)
+ pn->lruvec_stats.state_pending[i] = 0;
+
+ v = READ_ONCE(lstatc->state[i]);
+ if (v != lstatc->state_prev[i]) {
+ delta += v - lstatc->state_prev[i];
+ lstatc->state_prev[i] = v;
+ }
+
+ if (!delta)
+ continue;
+
+ pn->lruvec_stats.state[i] += delta;
+ if (ppn)
+ ppn->lruvec_stats.state_pending[i] += delta;
+ }
+ }
+}
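A simplified, self-contained model of the flush above (illustration only; struct example_level and example_flush_one_counter are made-up names): each level folds its children's pending deltas plus its own per-cpu change since the last flush, then queues the sum as pending on its parent.

struct example_level {
	struct example_level *parent;
	long state;		/* aggregated value at this level */
	long pending;		/* deltas queued by children, not yet folded */
	long percpu_prev;	/* per-cpu counter snapshot at the last flush */
};

static void example_flush_one_counter(struct example_level *lvl, long percpu_now)
{
	long delta = lvl->pending;

	lvl->pending = 0;

	/* add what this level's cpu changed since the previous flush */
	delta += percpu_now - lvl->percpu_prev;
	lvl->percpu_prev = percpu_now;

	if (!delta)
		return;

	lvl->state += delta;
	if (lvl->parent)
		lvl->parent->pending += delta;	/* folded on the parent's flush */
}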
+
#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
static int mem_cgroup_do_precharge(unsigned long count)
@@ -5526,17 +5692,12 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
return NULL;
/*
- * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to
- * a device and because they are not accessible by CPU they are store
- * as special swap entry in the CPU page table.
+ * Handle device private pages that are not accessible by the CPU, but
+ * stored as special swap entries in the page table.
*/
if (is_device_private_entry(ent)) {
- page = device_private_entry_to_page(ent);
- /*
- * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
- * a refcount of 1 when free (unlike normal page)
- */
- if (!page_ref_add_unless(page, 1, 1))
+ page = pfn_swap_entry_to_page(ent);
+ if (!get_page_unless_zero(page))
return NULL;
return page;
}
@@ -5545,7 +5706,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
return NULL;
/*
- * Because lookup_swap_cache() updates some statistics counter,
+ * Because swap_cache_get_folio() updates some statistics counter,
* we call find_get_page() with swapper_space directly.
*/
page = find_get_page(swap_address_space(ent), swp_offset(ent));
@@ -5562,17 +5723,23 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
#endif
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
- unsigned long addr, pte_t ptent, swp_entry_t *entry)
+ unsigned long addr, pte_t ptent)
{
+ unsigned long index;
+ struct folio *folio;
+
if (!vma->vm_file) /* anonymous vma */
return NULL;
if (!(mc.flags & MOVE_FILE))
return NULL;
- /* page is moved even if it's not RSS of this task(page-faulted). */
+ /* folio is moved even if it's not RSS of this task (page-faulted). */
/* shmem/tmpfs may report page out on swap: account for that too. */
- return find_get_incore_page(vma->vm_file->f_mapping,
- linear_page_index(vma, addr));
+ index = linear_page_index(vma, addr);
+ folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index);
+ if (IS_ERR(folio))
+ return NULL;
+ return folio_file_page(folio, index);
}
/**
@@ -5582,7 +5749,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
*
- * The caller must make sure the page is not on LRU (isolate_page() is useful.)
+ * The page must be locked and not on the LRU.
*
* This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
* from old cgroup.
@@ -5592,61 +5759,54 @@ static int mem_cgroup_move_account(struct page *page,
struct mem_cgroup *from,
struct mem_cgroup *to)
{
+ struct folio *folio = page_folio(page);
struct lruvec *from_vec, *to_vec;
struct pglist_data *pgdat;
- unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
- int ret;
+ unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
+ int nid, ret;
VM_BUG_ON(from == to);
- VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON(compound && !PageTransHuge(page));
-
- /*
- * Prevent mem_cgroup_migrate() from looking at
- * page->mem_cgroup of its source page while we change it.
- */
- ret = -EBUSY;
- if (!trylock_page(page))
- goto out;
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ VM_BUG_ON(compound && !folio_test_large(folio));
ret = -EINVAL;
- if (page->mem_cgroup != from)
- goto out_unlock;
+ if (folio_memcg(folio) != from)
+ goto out;
- pgdat = page_pgdat(page);
+ pgdat = folio_pgdat(folio);
from_vec = mem_cgroup_lruvec(from, pgdat);
to_vec = mem_cgroup_lruvec(to, pgdat);
- lock_page_memcg(page);
+ folio_memcg_lock(folio);
- if (PageAnon(page)) {
- if (page_mapped(page)) {
+ if (folio_test_anon(folio)) {
+ if (folio_mapped(folio)) {
__mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
__mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
- if (PageTransHuge(page)) {
+ if (folio_test_transhuge(folio)) {
__mod_lruvec_state(from_vec, NR_ANON_THPS,
-nr_pages);
__mod_lruvec_state(to_vec, NR_ANON_THPS,
nr_pages);
}
-
}
} else {
__mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
__mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
- if (PageSwapBacked(page)) {
+ if (folio_test_swapbacked(folio)) {
__mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
__mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
}
- if (page_mapped(page)) {
+ if (folio_mapped(folio)) {
__mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
__mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
}
- if (PageDirty(page)) {
- struct address_space *mapping = page_mapping(page);
+ if (folio_test_dirty(folio)) {
+ struct address_space *mapping = folio_mapping(folio);
if (mapping_can_writeback(mapping)) {
__mod_lruvec_state(from_vec, NR_FILE_DIRTY,
@@ -5657,7 +5817,13 @@ static int mem_cgroup_move_account(struct page *page,
}
}
- if (PageWriteback(page)) {
+#ifdef CONFIG_SWAP
+ if (folio_test_swapcache(folio)) {
+ __mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages);
+ }
+#endif
+ if (folio_test_writeback(folio)) {
__mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
}
@@ -5665,13 +5831,13 @@ static int mem_cgroup_move_account(struct page *page,
/*
* All state has been migrated, let's switch to the new memcg.
*
- * It is safe to change page->mem_cgroup here because the page
+ * It is safe to change page's memcg here because the page
* is referenced, charged, isolated, and locked: we can't race
* with (un)charging, migration, LRU putback, or anything else
- * that would rely on a stable page->mem_cgroup.
+ * that would rely on a stable page's memory cgroup.
*
- * Note that lock_page_memcg is a memcg lock, not a page lock,
- * to save space. As soon as we switch page->mem_cgroup to a
+ * Note that folio_memcg_lock is a memcg lock, not a page lock,
+ * to save space. As soon as we switch page's memory cgroup to a
* new memcg that isn't locked, the above state can change
* concurrently again. Make sure we're truly done with it.
*/
@@ -5680,20 +5846,19 @@ static int mem_cgroup_move_account(struct page *page,
css_get(&to->css);
css_put(&from->css);
- page->mem_cgroup = to;
+ folio->memcg_data = (unsigned long)to;
- __unlock_page_memcg(from);
+ __folio_memcg_unlock(from);
ret = 0;
+ nid = folio_nid(folio);
local_irq_disable();
- mem_cgroup_charge_statistics(to, page, nr_pages);
- memcg_check_events(to, page);
- mem_cgroup_charge_statistics(from, page, -nr_pages);
- memcg_check_events(from, page);
+ mem_cgroup_charge_statistics(to, nr_pages);
+ memcg_check_events(to, nid);
+ mem_cgroup_charge_statistics(from, -nr_pages);
+ memcg_check_events(from, nid);
local_irq_enable();
-out_unlock:
- unlock_page(page);
out:
return ret;
}
@@ -5713,8 +5878,8 @@ out:
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
- * (so ZONE_DEVICE page and thus not on the lru).
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is device memory and
+ * thus not on the lru.
 * For now such a page is charged like a regular page would be, as for all
 * intents and purposes it is just special memory taking the place of a
 * regular page.
@@ -5733,10 +5898,37 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
if (pte_present(ptent))
page = mc_handle_present_pte(vma, addr, ptent);
+ else if (pte_none_mostly(ptent))
+ /*
+ * PTE markers should be treated as a none pte here, separated
+ * from other swap handling below.
+ */
+ page = mc_handle_file_pte(vma, addr, ptent);
else if (is_swap_pte(ptent))
page = mc_handle_swap_pte(vma, ptent, &ent);
- else if (pte_none(ptent))
- page = mc_handle_file_pte(vma, addr, ptent, &ent);
+
+ if (target && page) {
+ if (!trylock_page(page)) {
+ put_page(page);
+ return ret;
+ }
+ /*
+ * page_mapped() must be stable during the move. This
+ * pte is locked, so if it's present, the page cannot
+ * become unmapped. If it isn't, we have only partial
+ * control over the mapped state: the page lock will
+ * prevent new faults against pagecache and swapcache,
+ * so an unmapped page cannot become mapped. However,
+ * if the page is already mapped elsewhere, it can
+ * unmap, and there is nothing we can do about it.
+ * Alas, skip moving the page in this case.
+ */
+ if (!pte_present(ptent) && page_mapped(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ret;
+ }
+ }
if (!page && !ent.val)
return ret;
@@ -5746,15 +5938,19 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
* mem_cgroup_move_account() checks the page is valid or
* not under LRU exclusion.
*/
- if (page->mem_cgroup == mc.from) {
+ if (page_memcg(page) == mc.from) {
ret = MC_TARGET_PAGE;
- if (is_device_private_page(page))
+ if (is_device_private_page(page) ||
+ is_device_coherent_page(page))
ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
}
- if (!ret || !target)
+ if (!ret || !target) {
+ if (target)
+ unlock_page(page);
put_page(page);
+ }
}
/*
* There is a swap entry and a page doesn't exist or isn't charged.
@@ -5790,10 +5986,14 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
VM_BUG_ON_PAGE(!page || !PageHead(page), page);
if (!(mc.flags & MOVE_ANON))
return ret;
- if (page->mem_cgroup == mc.from) {
+ if (page_memcg(page) == mc.from) {
ret = MC_TARGET_PAGE;
if (target) {
get_page(page);
+ if (!trylock_page(page)) {
+ put_page(page);
+ return MC_TARGET_NONE;
+ }
target->page = page;
}
}
@@ -5828,11 +6028,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
return 0;
}
- if (pmd_trans_unstable(pmd))
- return 0;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!pte)
+ return 0;
for (; addr != end; pte++, addr += PAGE_SIZE)
- if (get_mctgt_type(vma, addr, *pte, NULL))
+ if (get_mctgt_type(vma, addr, ptep_get(pte), NULL))
mc.precharge++; /* increment precharge temporarily */
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
@@ -5842,6 +6042,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
static const struct mm_walk_ops precharge_walk_ops = {
.pmd_entry = mem_cgroup_count_precharge_pte_range,
+ .walk_lock = PGWALK_RDLOCK,
};
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
@@ -5849,7 +6050,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
unsigned long precharge;
mmap_read_lock(mm);
- walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
+ walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL);
mmap_read_unlock(mm);
precharge = mc.precharge;
@@ -5957,7 +6158,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
return 0;
/*
- * We are now commited to this value whatever it is. Changes in this
+ * We are now committed to this value whatever it is. Changes in this
* tunable will only affect upcoming migrations, not the current one.
* So we need to save it, and keep it going.
*/
@@ -6024,7 +6225,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
if (target_type == MC_TARGET_PAGE) {
page = target.page;
- if (!isolate_lru_page(page)) {
+ if (isolate_lru_page(page)) {
if (!mem_cgroup_move_account(page, true,
mc.from, mc.to)) {
mc.precharge -= HPAGE_PMD_NR;
@@ -6032,6 +6233,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
}
putback_lru_page(page);
}
+ unlock_page(page);
put_page(page);
} else if (target_type == MC_TARGET_DEVICE) {
page = target.page;
@@ -6040,18 +6242,19 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
}
+ unlock_page(page);
put_page(page);
}
spin_unlock(ptl);
return 0;
}
- if (pmd_trans_unstable(pmd))
- return 0;
retry:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!pte)
+ return 0;
for (; addr != end; addr += PAGE_SIZE) {
- pte_t ptent = *(pte++);
+ pte_t ptent = ptep_get(pte++);
bool device = false;
swp_entry_t ent;
@@ -6072,7 +6275,7 @@ retry:
*/
if (PageTransCompound(page))
goto put;
- if (!device && isolate_lru_page(page))
+ if (!device && !isolate_lru_page(page))
goto put;
if (!mem_cgroup_move_account(page, false,
mc.from, mc.to)) {
@@ -6082,7 +6285,8 @@ retry:
}
if (!device)
putback_lru_page(page);
-put: /* get_mctgt_type() gets the page */
+put: /* get_mctgt_type() gets & locks the page */
+ unlock_page(page);
put_page(page);
break;
case MC_TARGET_SWAP:
@@ -6118,13 +6322,14 @@ put: /* get_mctgt_type() gets the page */
static const struct mm_walk_ops charge_walk_ops = {
.pmd_entry = mem_cgroup_move_charge_pte_range,
+ .walk_lock = PGWALK_RDLOCK,
};
static void mem_cgroup_move_charge(void)
{
lru_add_drain_all();
/*
- * Signal lock_page_memcg() to take the memcg's move_lock
+ * Signal folio_memcg_lock() to take the memcg's move_lock
* while we're moving its pages to another memcg. Then wait
* for already started RCU-only updates to finish.
*/
@@ -6147,9 +6352,7 @@ retry:
* When we have consumed all precharges and failed in doing
* additional charge, the page walk just aborts.
*/
- walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
- NULL);
-
+ walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL);
mmap_read_unlock(mc.mm);
atomic_dec(&mc.from->moving_account);
}
@@ -6174,23 +6377,29 @@ static void mem_cgroup_move_task(void)
}
#endif
-/*
- * Cgroup retains root cgroups across [un]mount cycles making it necessary
- * to verify whether we're attached to the default hierarchy on each mount
- * attempt.
- */
-static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
+#ifdef CONFIG_LRU_GEN
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
+
+ /* find the first leader if there is any */
+ cgroup_taskset_for_each_leader(task, css, tset)
+ break;
+
+ if (!task)
+ return;
+
+ task_lock(task);
+ if (task->mm && READ_ONCE(task->mm->owner) == task)
+ lru_gen_migrate_mm(task->mm);
+ task_unlock(task);
+}
+#else
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
{
- /*
- * use_hierarchy is forced on the default hierarchy. cgroup core
- * guarantees that @root doesn't have any children, so turning it
- * on for the root memcg is enough.
- */
- if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
- root_mem_cgroup->use_hierarchy = true;
- else
- root_mem_cgroup->use_hierarchy = false;
}
+#endif /* CONFIG_LRU_GEN */
static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
{
@@ -6210,6 +6419,14 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
}
+static u64 memory_peak_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ return (u64)memcg->memory.watermark * PAGE_SIZE;
+}
+
static int memory_min_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
@@ -6276,6 +6493,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
if (err)
return err;
+ page_counter_set_high(&memcg->memory, high);
+
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
unsigned long reclaimed;
@@ -6293,16 +6512,13 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
}
reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
- GFP_KERNEL, true);
+ GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
if (!reclaimed && !nr_retries--)
break;
}
- page_counter_set_high(&memcg->memory, high);
-
memcg_wb_domain_size_changed(memcg);
-
return nbytes;
}
@@ -6345,7 +6561,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (nr_reclaims) {
if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
- GFP_KERNEL, true))
+ GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
nr_reclaims--;
continue;
}
@@ -6367,6 +6583,8 @@ static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
seq_printf(m, "oom_kill %lu\n",
atomic_long_read(&events[MEMCG_OOM_KILL]));
+ seq_printf(m, "oom_group_kill %lu\n",
+ atomic_long_read(&events[MEMCG_OOM_GROUP_KILL]));
}
static int memory_events_show(struct seq_file *m, void *v)
@@ -6388,22 +6606,32 @@ static int memory_events_local_show(struct seq_file *m, void *v)
static int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
- char *buf;
+ char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ struct seq_buf s;
- buf = memory_stat_format(memcg);
if (!buf)
return -ENOMEM;
+ seq_buf_init(&s, buf, PAGE_SIZE);
+ memory_stat_format(memcg, &s);
seq_puts(m, buf);
kfree(buf);
return 0;
}
#ifdef CONFIG_NUMA
+static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
+ int item)
+{
+ return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item);
+}
+
static int memory_numa_stat_show(struct seq_file *m, void *v)
{
int i;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ mem_cgroup_flush_stats();
+
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
int nid;
@@ -6416,8 +6644,8 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
struct lruvec *lruvec;
lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
- size = lruvec_page_state(lruvec, memory_stats[i].idx);
- size *= memory_stats[i].ratio;
+ size = lruvec_page_state_output(lruvec,
+ memory_stats[i].idx);
seq_printf(m, " N%d=%llu", nid, size);
}
seq_putc(m, '\n');
@@ -6431,7 +6659,7 @@ static int memory_oom_group_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
- seq_printf(m, "%d\n", memcg->oom_group);
+ seq_printf(m, "%d\n", READ_ONCE(memcg->oom_group));
return 0;
}
@@ -6453,7 +6681,49 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
if (oom_group != 0 && oom_group != 1)
return -EINVAL;
- memcg->oom_group = oom_group;
+ WRITE_ONCE(memcg->oom_group, oom_group);
+
+ return nbytes;
+}
+
+static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
+ unsigned long nr_to_reclaim, nr_reclaimed = 0;
+ unsigned int reclaim_options;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "", &nr_to_reclaim);
+ if (err)
+ return err;
+
+ reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
+ while (nr_reclaimed < nr_to_reclaim) {
+ unsigned long reclaimed;
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ /*
+ * This is the final attempt; drain the percpu lru caches in
+ * the hope of introducing more evictable pages for
+ * try_to_free_mem_cgroup_pages().
+ */
+ if (!nr_retries)
+ lru_add_drain_all();
+
+ reclaimed = try_to_free_mem_cgroup_pages(memcg,
+ nr_to_reclaim - nr_reclaimed,
+ GFP_KERNEL, reclaim_options);
+
+ if (!reclaimed && !nr_retries--)
+ return -EAGAIN;
+
+ nr_reclaimed += reclaimed;
+ }
return nbytes;
}
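The new memory.reclaim file accepts a byte count and triggers proactive reclaim until that much has been reclaimed or the retries run out (in which case the write returns -EAGAIN). A minimal userspace sketch, assuming cgroup v2 is mounted at /sys/fs/cgroup and a delegated group named "test" already exists (both assumptions, not part of the patch):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* Hypothetical cgroup path; adjust for your hierarchy. */
        const char *path = "/sys/fs/cgroup/test/memory.reclaim";
        const char *amount = "64M";     /* parsed by page_counter_memparse() */
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* A single write triggers reclaim; -EAGAIN means the target was not met. */
        if (write(fd, amount, strlen(amount)) < 0)
                perror("write memory.reclaim");
        close(fd);
        return 0;
}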
@@ -6465,6 +6735,11 @@ static struct cftype memory_files[] = {
.read_u64 = memory_current_read,
},
{
+ .name = "peak",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = memory_peak_read,
+ },
+ {
.name = "min",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_min_show,
@@ -6516,6 +6791,11 @@ static struct cftype memory_files[] = {
.seq_show = memory_oom_group_show,
.write = memory_oom_group_write,
},
+ {
+ .name = "reclaim",
+ .flags = CFTYPE_NS_DELEGATABLE,
+ .write = memory_reclaim,
+ },
{ } /* terminate */
};
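The memory.peak file added above is a read-only snapshot of the memory counter's watermark (in bytes); swap.peak, added further down for the swap counter, behaves the same way. A small sketch that reads it back, again assuming a cgroup v2 mount at /sys/fs/cgroup and an existing group named "test":

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Hypothetical path; the value is memcg->memory.watermark * PAGE_SIZE. */
        int fd = open("/sys/fs/cgroup/test/memory.peak", O_RDONLY);
        char buf[32];
        ssize_t n;

        if (fd < 0) {
                perror("open");
                return 1;
        }
        n = read(fd, buf, sizeof(buf) - 1);
        if (n > 0) {
                buf[n] = '\0';
                printf("peak memory usage: %s", buf);   /* bytes, newline-terminated */
        }
        close(fd);
        return 0;
}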
@@ -6526,10 +6806,11 @@ struct cgroup_subsys memory_cgrp_subsys = {
.css_released = mem_cgroup_css_released,
.css_free = mem_cgroup_css_free,
.css_reset = mem_cgroup_css_reset,
+ .css_rstat_flush = mem_cgroup_css_rstat_flush,
.can_attach = mem_cgroup_can_attach,
+ .attach = mem_cgroup_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.post_attach = mem_cgroup_move_task,
- .bind = mem_cgroup_bind,
.dfl_cftypes = memory_files,
.legacy_cftypes = mem_cgroup_legacy_files,
.early_init = 0,
@@ -6590,7 +6871,7 @@ static unsigned long effective_protection(unsigned long usage,
protected = min(usage, setting);
/*
* If all cgroups at this level combined claim and use more
- * protection then what the parent affords them, distribute
+ * protection than what the parent affords them, distribute
* shares in proportion to utilization.
*
* We are using actual utilization rather than the statically
@@ -6652,7 +6933,7 @@ static unsigned long effective_protection(unsigned long usage,
}
/**
- * mem_cgroup_protected - check if memory consumption is in the normal range
+ * mem_cgroup_calculate_protection - check if memory consumption is in the normal range
* @root: the top ancestor of the sub-tree being checked
* @memcg: the memory cgroup to check
*
@@ -6686,9 +6967,6 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
return;
parent = parent_mem_cgroup(memcg);
- /* No parent means a non-hierarchical mode on v1 memcg */
- if (!parent)
- return;
if (parent == root) {
memcg->memory.emin = READ_ONCE(memcg->memory.min);
@@ -6709,86 +6987,113 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
atomic_long_read(&parent->memory.children_low_usage)));
}
+static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
+ gfp_t gfp)
+{
+ long nr_pages = folio_nr_pages(folio);
+ int ret;
+
+ ret = try_charge(memcg, gfp, nr_pages);
+ if (ret)
+ goto out;
+
+ css_get(&memcg->css);
+ commit_charge(folio, memcg);
+
+ local_irq_disable();
+ mem_cgroup_charge_statistics(memcg, nr_pages);
+ memcg_check_events(memcg, folio_nid(folio));
+ local_irq_enable();
+out:
+ return ret;
+}
+
+int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
+{
+ struct mem_cgroup *memcg;
+ int ret;
+
+ memcg = get_mem_cgroup_from_mm(mm);
+ ret = charge_memcg(folio, memcg, gfp);
+ css_put(&memcg->css);
+
+ return ret;
+}
+
/**
- * mem_cgroup_charge - charge a newly allocated page to a cgroup
- * @page: page to charge
+ * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
+ * @folio: folio to charge.
* @mm: mm context of the victim
- * @gfp_mask: reclaim mode
+ * @gfp: reclaim mode
+ * @entry: swap entry for which the folio is allocated
*
- * Try to charge @page to the memcg that @mm belongs to, reclaiming
- * pages according to @gfp_mask if necessary.
+ * This function charges a folio allocated for swapin. Please call this before
+ * adding the folio to the swapcache.
*
* Returns 0 on success. Otherwise, an error code is returned.
*/
-int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
+int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
+ gfp_t gfp, swp_entry_t entry)
{
- unsigned int nr_pages = thp_nr_pages(page);
- struct mem_cgroup *memcg = NULL;
- int ret = 0;
+ struct mem_cgroup *memcg;
+ unsigned short id;
+ int ret;
if (mem_cgroup_disabled())
- goto out;
-
- if (PageSwapCache(page)) {
- swp_entry_t ent = { .val = page_private(page), };
- unsigned short id;
-
- /*
- * Every swap fault against a single page tries to charge the
- * page, bail as early as possible. shmem_unuse() encounters
- * already charged pages, too. page->mem_cgroup is protected
- * by the page lock, which serializes swap cache removal, which
- * in turn serializes uncharging.
- */
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (compound_head(page)->mem_cgroup)
- goto out;
-
- id = lookup_swap_cgroup_id(ent);
- rcu_read_lock();
- memcg = mem_cgroup_from_id(id);
- if (memcg && !css_tryget_online(&memcg->css))
- memcg = NULL;
- rcu_read_unlock();
- }
+ return 0;
- if (!memcg)
+ id = lookup_swap_cgroup_id(entry);
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(id);
+ if (!memcg || !css_tryget_online(&memcg->css))
memcg = get_mem_cgroup_from_mm(mm);
+ rcu_read_unlock();
- ret = try_charge(memcg, gfp_mask, nr_pages);
- if (ret)
- goto out_put;
-
- css_get(&memcg->css);
- commit_charge(page, memcg);
+ ret = charge_memcg(folio, memcg, gfp);
- local_irq_disable();
- mem_cgroup_charge_statistics(memcg, page, nr_pages);
- memcg_check_events(memcg, page);
- local_irq_enable();
+ css_put(&memcg->css);
+ return ret;
+}
- if (PageSwapCache(page)) {
- swp_entry_t entry = { .val = page_private(page) };
+/*
+ * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
+ * @entry: swap entry for which the page is charged
+ *
+ * Call this function after successfully adding the charged page to swapcache.
+ *
+ * Note: This function assumes the page for which the swap slot is being
+ * uncharged is an order-0 page.
+ */
+void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
+{
+ /*
+ * Cgroup1's unified memory+swap counter has been charged with the
+ * new swapcache page, finish the transfer by uncharging the swap
+ * slot. The swap slot would also get uncharged when it dies, but
+ * it can stick around indefinitely and we'd count the page twice
+ * the entire time.
+ *
+ * Cgroup2 has separate resource counters for memory and swap,
+ * so this is a non-issue here. Memory and swap charge lifetimes
+ * correspond 1:1 to page and swap slot lifetimes: we charge the
+ * page to memory here, and uncharge swap when the slot is freed.
+ */
+ if (!mem_cgroup_disabled() && do_memsw_account()) {
/*
* The swap entry might not get freed for a long time,
* let's not wait for it. The page already received a
* memory+swap charge, drop the swap entry duplicate.
*/
- mem_cgroup_uncharge_swap(entry, nr_pages);
+ mem_cgroup_uncharge_swap(entry, 1);
}
-
-out_put:
- css_put(&memcg->css);
-out:
- return ret;
}
struct uncharge_gather {
struct mem_cgroup *memcg;
- unsigned long nr_pages;
+ unsigned long nr_memory;
unsigned long pgpgout;
unsigned long nr_kmem;
- struct page *dummy_page;
+ int nid;
};
static inline void uncharge_gather_clear(struct uncharge_gather *ug)
@@ -6800,176 +7105,162 @@ static void uncharge_batch(const struct uncharge_gather *ug)
{
unsigned long flags;
- if (!mem_cgroup_is_root(ug->memcg)) {
- page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
+ if (ug->nr_memory) {
+ page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
if (do_memsw_account())
- page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
- page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
+ page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
+ if (ug->nr_kmem)
+ memcg_account_kmem(ug->memcg, -ug->nr_kmem);
memcg_oom_recover(ug->memcg);
}
local_irq_save(flags);
__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
- __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
- memcg_check_events(ug->memcg, ug->dummy_page);
+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
+ memcg_check_events(ug->memcg, ug->nid);
local_irq_restore(flags);
- /* drop reference from uncharge_page */
+ /* drop reference from uncharge_folio */
css_put(&ug->memcg->css);
}
-static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
{
- unsigned long nr_pages;
-
- VM_BUG_ON_PAGE(PageLRU(page), page);
+ long nr_pages;
+ struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
- if (!page->mem_cgroup)
- return;
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
/*
* Nobody should be changing or seriously looking at
- * page->mem_cgroup at this point, we have fully
- * exclusive access to the page.
+ * folio memcg or objcg at this point, we have fully
+ * exclusive access to the folio.
*/
+ if (folio_memcg_kmem(folio)) {
+ objcg = __folio_objcg(folio);
+ /*
+ * This get matches the put at the end of the function and
+ * kmem pages do not hold memcg references anymore.
+ */
+ memcg = get_mem_cgroup_from_objcg(objcg);
+ } else {
+ memcg = __folio_memcg(folio);
+ }
- if (ug->memcg != page->mem_cgroup) {
+ if (!memcg)
+ return;
+
+ if (ug->memcg != memcg) {
if (ug->memcg) {
uncharge_batch(ug);
uncharge_gather_clear(ug);
}
- ug->memcg = page->mem_cgroup;
+ ug->memcg = memcg;
+ ug->nid = folio_nid(folio);
/* pairs with css_put in uncharge_batch */
- css_get(&ug->memcg->css);
+ css_get(&memcg->css);
}
- nr_pages = compound_nr(page);
- ug->nr_pages += nr_pages;
+ nr_pages = folio_nr_pages(folio);
- if (!PageKmemcg(page)) {
- ug->pgpgout++;
- } else {
+ if (folio_memcg_kmem(folio)) {
+ ug->nr_memory += nr_pages;
ug->nr_kmem += nr_pages;
- __ClearPageKmemcg(page);
- }
- ug->dummy_page = page;
- page->mem_cgroup = NULL;
- css_put(&ug->memcg->css);
-}
-
-static void uncharge_list(struct list_head *page_list)
-{
- struct uncharge_gather ug;
- struct list_head *next;
-
- uncharge_gather_clear(&ug);
-
- /*
- * Note that the list can be a single page->lru; hence the
- * do-while loop instead of a simple list_for_each_entry().
- */
- next = page_list->next;
- do {
- struct page *page;
-
- page = list_entry(next, struct page, lru);
- next = page->lru.next;
+ folio->memcg_data = 0;
+ obj_cgroup_put(objcg);
+ } else {
+ /* LRU pages aren't accounted at the root level */
+ if (!mem_cgroup_is_root(memcg))
+ ug->nr_memory += nr_pages;
+ ug->pgpgout++;
- uncharge_page(page, &ug);
- } while (next != page_list);
+ folio->memcg_data = 0;
+ }
- if (ug.memcg)
- uncharge_batch(&ug);
+ css_put(&memcg->css);
}
-/**
- * mem_cgroup_uncharge - uncharge a page
- * @page: page to uncharge
- *
- * Uncharge a page previously charged with mem_cgroup_charge().
- */
-void mem_cgroup_uncharge(struct page *page)
+void __mem_cgroup_uncharge(struct folio *folio)
{
struct uncharge_gather ug;
- if (mem_cgroup_disabled())
- return;
-
- /* Don't touch page->lru of any random page, pre-check: */
- if (!page->mem_cgroup)
+ /* Don't touch folio->lru of any random page, pre-check: */
+ if (!folio_memcg(folio))
return;
uncharge_gather_clear(&ug);
- uncharge_page(page, &ug);
+ uncharge_folio(folio, &ug);
uncharge_batch(&ug);
}
/**
- * mem_cgroup_uncharge_list - uncharge a list of page
+ * __mem_cgroup_uncharge_list - uncharge a list of pages
* @page_list: list of pages to uncharge
*
* Uncharge a list of pages previously charged with
- * mem_cgroup_charge().
+ * __mem_cgroup_charge().
*/
-void mem_cgroup_uncharge_list(struct list_head *page_list)
+void __mem_cgroup_uncharge_list(struct list_head *page_list)
{
- if (mem_cgroup_disabled())
- return;
+ struct uncharge_gather ug;
+ struct folio *folio;
- if (!list_empty(page_list))
- uncharge_list(page_list);
+ uncharge_gather_clear(&ug);
+ list_for_each_entry(folio, page_list, lru)
+ uncharge_folio(folio, &ug);
+ if (ug.memcg)
+ uncharge_batch(&ug);
}
/**
- * mem_cgroup_migrate - charge a page's replacement
- * @oldpage: currently circulating page
- * @newpage: replacement page
+ * mem_cgroup_migrate - Charge a folio's replacement.
+ * @old: Currently circulating folio.
+ * @new: Replacement folio.
*
- * Charge @newpage as a replacement page for @oldpage. @oldpage will
+ * Charge @new as a replacement folio for @old. @old will
* be uncharged upon free.
*
- * Both pages must be locked, @newpage->mapping must be set up.
+ * Both folios must be locked, @new->mapping must be set up.
*/
-void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
+void mem_cgroup_migrate(struct folio *old, struct folio *new)
{
struct mem_cgroup *memcg;
- unsigned int nr_pages;
+ long nr_pages = folio_nr_pages(new);
unsigned long flags;
- VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
- VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
- VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
- VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
- newpage);
+ VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
+ VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
+ VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
+ VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new);
if (mem_cgroup_disabled())
return;
- /* Page cache replacement: new page already charged? */
- if (newpage->mem_cgroup)
+ /* Page cache replacement: new folio already charged? */
+ if (folio_memcg(new))
return;
- /* Swapcache readahead pages can get replaced before being charged */
- memcg = oldpage->mem_cgroup;
+ memcg = folio_memcg(old);
+ VM_WARN_ON_ONCE_FOLIO(!memcg, old);
if (!memcg)
return;
/* Force-charge the new page. The old one will be freed soon */
- nr_pages = thp_nr_pages(newpage);
-
- page_counter_charge(&memcg->memory, nr_pages);
- if (do_memsw_account())
- page_counter_charge(&memcg->memsw, nr_pages);
+ if (!mem_cgroup_is_root(memcg)) {
+ page_counter_charge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_charge(&memcg->memsw, nr_pages);
+ }
css_get(&memcg->css);
- commit_charge(newpage, memcg);
+ commit_charge(new, memcg);
local_irq_save(flags);
- mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
- memcg_check_events(memcg, newpage);
+ mem_cgroup_charge_statistics(memcg, nr_pages);
+ memcg_check_events(memcg, folio_nid(new));
local_irq_restore(flags);
}
@@ -6984,12 +7275,12 @@ void mem_cgroup_sk_alloc(struct sock *sk)
return;
/* Do not associate the sock with unrelated interrupted task's memcg. */
- if (in_interrupt())
+ if (!in_task())
return;
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
- if (memcg == root_mem_cgroup)
+ if (mem_cgroup_is_root(memcg))
goto out;
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
goto out;
@@ -7009,14 +7300,14 @@ void mem_cgroup_sk_free(struct sock *sk)
* mem_cgroup_charge_skmem - charge socket memory
* @memcg: memcg to charge
* @nr_pages: number of pages to charge
+ * @gfp_mask: reclaim mode
*
* Charges @nr_pages to @memcg. Returns %true if the charge fit within
- * @memcg's configured limit, %false if the charge had to be forced.
+ * @memcg's configured limit, %false if it doesn't.
*/
-bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
+bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
+ gfp_t gfp_mask)
{
- gfp_t gfp_mask = GFP_KERNEL;
-
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
struct page_counter *fail;
@@ -7024,21 +7315,19 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
memcg->tcpmem_pressure = 0;
return true;
}
- page_counter_charge(&memcg->tcpmem, nr_pages);
memcg->tcpmem_pressure = 1;
+ if (gfp_mask & __GFP_NOFAIL) {
+ page_counter_charge(&memcg->tcpmem, nr_pages);
+ return true;
+ }
return false;
}
- /* Don't block in the packet receive path */
- if (in_softirq())
- gfp_mask = GFP_NOWAIT;
-
- mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
-
- if (try_charge(memcg, gfp_mask, nr_pages) == 0)
+ if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
+ mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
return true;
+ }
- try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
return false;
}
@@ -7070,8 +7359,10 @@ static int __init cgroup_memory(char *s)
cgroup_memory_nosocket = true;
if (!strcmp(token, "nokmem"))
cgroup_memory_nokmem = true;
+ if (!strcmp(token, "nobpf"))
+ cgroup_memory_nobpf = true;
}
- return 0;
+ return 1;
}
__setup("cgroup.memory=", cgroup_memory);
@@ -7087,6 +7378,14 @@ static int __init mem_cgroup_init(void)
{
int cpu, node;
+ /*
+ * Currently an s32 type (see struct batched_lruvec_stat) is used for
+ * per-memcg-per-cpu caching of per-node statistics. For this to work
+ * correctly, the overfill threshold must not exceed
+ * S32_MAX / PAGE_SIZE.
+ */
+ BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);
+
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
memcg_hotplug_cpu_dead);
@@ -7097,8 +7396,7 @@ static int __init mem_cgroup_init(void)
for_each_node(node) {
struct mem_cgroup_tree_per_node *rtpn;
- rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
- node_online(node) ? node : NUMA_NO_NODE);
+ rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);
rtpn->rb_root = RB_ROOT;
rtpn->rb_rightmost = NULL;
@@ -7110,7 +7408,7 @@ static int __init mem_cgroup_init(void)
}
subsys_initcall(mem_cgroup_init);
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_SWAP
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
{
while (!refcount_inc_not_zero(&memcg->id.ref)) {
@@ -7118,7 +7416,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
* The root cgroup cannot be destroyed, so it's refcount must
* always be >= 1.
*/
- if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
+ if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
VM_BUG_ON(1);
break;
}
@@ -7131,26 +7429,29 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
/**
* mem_cgroup_swapout - transfer a memsw charge to swap
- * @page: page whose memsw charge to transfer
+ * @folio: folio whose memsw charge to transfer
* @entry: swap entry to move the charge to
*
- * Transfer the memsw charge of @page to @entry.
+ * Transfer the memsw charge of @folio to @entry.
*/
-void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
{
struct mem_cgroup *memcg, *swap_memcg;
unsigned int nr_entries;
unsigned short oldid;
- VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
- if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ if (mem_cgroup_disabled())
return;
- memcg = page->mem_cgroup;
+ if (!do_memsw_account())
+ return;
- /* Readahead page, never charged */
+ memcg = folio_memcg(folio);
+
+ VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
if (!memcg)
return;
@@ -7160,21 +7461,21 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* ancestor for the swap instead and transfer the memory+swap charge.
*/
swap_memcg = mem_cgroup_id_get_online(memcg);
- nr_entries = thp_nr_pages(page);
+ nr_entries = folio_nr_pages(folio);
/* Get references for the tail pages, too */
if (nr_entries > 1)
mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
nr_entries);
- VM_BUG_ON_PAGE(oldid, page);
+ VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
- page->mem_cgroup = NULL;
+ folio->memcg_data = 0;
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, nr_entries);
- if (!cgroup_memory_noswap && memcg != swap_memcg) {
+ if (memcg != swap_memcg) {
if (!mem_cgroup_is_root(swap_memcg))
page_counter_charge(&swap_memcg->memsw, nr_entries);
page_counter_uncharge(&memcg->memsw, nr_entries);
@@ -7186,35 +7487,36 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* important here to have the interrupts disabled because it is the
* only synchronisation we have for updating the per-CPU variables.
*/
- VM_BUG_ON(!irqs_disabled());
- mem_cgroup_charge_statistics(memcg, page, -nr_entries);
- memcg_check_events(memcg, page);
+ memcg_stats_lock();
+ mem_cgroup_charge_statistics(memcg, -nr_entries);
+ memcg_stats_unlock();
+ memcg_check_events(memcg, folio_nid(folio));
css_put(&memcg->css);
}
/**
- * mem_cgroup_try_charge_swap - try charging swap space for a page
- * @page: page being added to swap
+ * __mem_cgroup_try_charge_swap - try charging swap space for a folio
+ * @folio: folio being added to swap
* @entry: swap entry to charge
*
- * Try to charge @page's memcg for the swap space at @entry.
+ * Try to charge @folio's memcg for the swap space at @entry.
*
* Returns 0 on success, -ENOMEM on failure.
*/
-int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
{
- unsigned int nr_pages = thp_nr_pages(page);
+ unsigned int nr_pages = folio_nr_pages(folio);
struct page_counter *counter;
struct mem_cgroup *memcg;
unsigned short oldid;
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ if (do_memsw_account())
return 0;
- memcg = page->mem_cgroup;
+ memcg = folio_memcg(folio);
- /* Readahead page, never charged */
+ VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
if (!memcg)
return 0;
@@ -7225,7 +7527,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
memcg = mem_cgroup_id_get_online(memcg);
- if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
+ if (!mem_cgroup_is_root(memcg) &&
!page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
memcg_memory_event(memcg, MEMCG_SWAP_MAX);
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
@@ -7237,31 +7539,34 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
if (nr_pages > 1)
mem_cgroup_id_get_many(memcg, nr_pages - 1);
oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
- VM_BUG_ON_PAGE(oldid, page);
+ VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
return 0;
}
/**
- * mem_cgroup_uncharge_swap - uncharge swap space
+ * __mem_cgroup_uncharge_swap - uncharge swap space
* @entry: swap entry to uncharge
* @nr_pages: the amount of swap space to uncharge
*/
-void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
struct mem_cgroup *memcg;
unsigned short id;
+ if (mem_cgroup_disabled())
+ return;
+
id = swap_cgroup_record(entry, 0, nr_pages);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (memcg) {
- if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
- if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
- page_counter_uncharge(&memcg->swap, nr_pages);
- else
+ if (!mem_cgroup_is_root(memcg)) {
+ if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
+ else
+ page_counter_uncharge(&memcg->swap, nr_pages);
}
mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
mem_cgroup_id_put_many(memcg, nr_pages);
@@ -7273,31 +7578,31 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
long nr_swap_pages = get_nr_swap_pages();
- if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ if (mem_cgroup_disabled() || do_memsw_account())
return nr_swap_pages;
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
+ for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg))
nr_swap_pages = min_t(long, nr_swap_pages,
READ_ONCE(memcg->swap.max) -
page_counter_read(&memcg->swap));
return nr_swap_pages;
}
-bool mem_cgroup_swap_full(struct page *page)
+bool mem_cgroup_swap_full(struct folio *folio)
{
struct mem_cgroup *memcg;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (vm_swap_full())
return true;
- if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ if (do_memsw_account())
return false;
- memcg = page->mem_cgroup;
+ memcg = folio_memcg(folio);
if (!memcg)
return false;
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
unsigned long usage = page_counter_read(&memcg->swap);
if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
@@ -7310,10 +7615,9 @@ bool mem_cgroup_swap_full(struct page *page)
static int __init setup_swap_account(char *s)
{
- if (!strcmp(s, "1"))
- cgroup_memory_noswap = 0;
- else if (!strcmp(s, "0"))
- cgroup_memory_noswap = 1;
+ pr_warn_once("The swapaccount= commandline option is deprecated. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
return 1;
}
__setup("swapaccount=", setup_swap_account);
@@ -7326,6 +7630,14 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}
+static u64 swap_peak_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ return (u64)memcg->swap.watermark * PAGE_SIZE;
+}
+
static int swap_high_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
@@ -7405,6 +7717,11 @@ static struct cftype swap_files[] = {
.write = swap_max_write,
},
{
+ .name = "swap.peak",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = swap_peak_read,
+ },
+ {
.name = "swap.events",
.flags = CFTYPE_NOT_ON_ROOT,
.file_offset = offsetof(struct mem_cgroup, swap_events_file),
@@ -7440,27 +7757,160 @@ static struct cftype memsw_files[] = {
{ }, /* terminate */
};
-/*
- * If mem_cgroup_swap_init() is implemented as a subsys_initcall()
- * instead of a core_initcall(), this could mean cgroup_memory_noswap still
- * remains set to false even when memcg is disabled via "cgroup_disable=memory"
- * boot parameter. This may result in premature OOPS inside
- * mem_cgroup_get_nr_swap_pages() function in corner cases.
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+/**
+ * obj_cgroup_may_zswap - check if this cgroup can zswap
+ * @objcg: the object cgroup
+ *
+ * Check if the hierarchical zswap limit has been reached.
+ *
+ * This doesn't check for specific headroom, and it is not atomic
+ * either. But with zswap, the size of the allocation is only known
+ * once compression has occurred, and this optimistic pre-check avoids
+ * spending cycles on compression when there is already no room left
+ * or zswap is disabled altogether somewhere in the hierarchy.
+ */
+bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
+{
+ struct mem_cgroup *memcg, *original_memcg;
+ bool ret = true;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return true;
+
+ original_memcg = get_mem_cgroup_from_objcg(objcg);
+ for (memcg = original_memcg; !mem_cgroup_is_root(memcg);
+ memcg = parent_mem_cgroup(memcg)) {
+ unsigned long max = READ_ONCE(memcg->zswap_max);
+ unsigned long pages;
+
+ if (max == PAGE_COUNTER_MAX)
+ continue;
+ if (max == 0) {
+ ret = false;
+ break;
+ }
+
+ cgroup_rstat_flush(memcg->css.cgroup);
+ pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
+ if (pages < max)
+ continue;
+ ret = false;
+ break;
+ }
+ mem_cgroup_put(original_memcg);
+ return ret;
+}
+
+/**
+ * obj_cgroup_charge_zswap - charge compression backend memory
+ * @objcg: the object cgroup
+ * @size: size of compressed object
+ *
+ * This forces the charge after obj_cgroup_may_zswap() allowed
+ * compression and storage in zswap for this cgroup to go ahead.
+ */
+void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
+{
+ struct mem_cgroup *memcg;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return;
+
+ VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
+
+ /* PF_MEMALLOC context, charging must succeed */
+ if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
+ VM_WARN_ON_ONCE(1);
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
+ mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
+ rcu_read_unlock();
+}
+
+/**
+ * obj_cgroup_uncharge_zswap - uncharge compression backend memory
+ * @objcg: the object cgroup
+ * @size: size of compressed object
+ *
+ * Uncharges zswap memory on page in.
*/
+void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
+{
+ struct mem_cgroup *memcg;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return;
+
+ obj_cgroup_uncharge(objcg, size);
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
+ mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
+ rcu_read_unlock();
+}
+
+static u64 zswap_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ cgroup_rstat_flush(css->cgroup);
+ return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B);
+}
+
+static int zswap_max_show(struct seq_file *m, void *v)
+{
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->zswap_max));
+}
+
+static ssize_t zswap_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long max;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &max);
+ if (err)
+ return err;
+
+ xchg(&memcg->zswap_max, max);
+
+ return nbytes;
+}
+
+static struct cftype zswap_files[] = {
+ {
+ .name = "zswap.current",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = zswap_current_read,
+ },
+ {
+ .name = "zswap.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = zswap_max_show,
+ .write = zswap_max_write,
+ },
+ { } /* terminate */
+};
+#endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */
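zswap.current and zswap.max expose the MEMCG_ZSWAP_B state and the per-memcg compressed-pool limit checked by obj_cgroup_may_zswap() above. A small userspace sketch that caps and then inspects a group's zswap footprint, assuming cgroup v2 at /sys/fs/cgroup with an existing group "test" and a kernel built with CONFIG_MEMCG_KMEM and CONFIG_ZSWAP (all assumptions):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_file(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);

        if (fd < 0)
                return -1;
        /* zswap.max accepts "max" or a byte count, like the other limits. */
        if (write(fd, val, strlen(val)) < 0) {
                close(fd);
                return -1;
        }
        return close(fd);
}

int main(void)
{
        char buf[32];
        ssize_t n;
        int fd;

        /* Hypothetical cgroup path; adjust for your hierarchy. */
        if (write_file("/sys/fs/cgroup/test/zswap.max", "128M"))
                perror("zswap.max");

        fd = open("/sys/fs/cgroup/test/zswap.current", O_RDONLY);
        if (fd < 0) {
                perror("zswap.current");
                return 1;
        }
        n = read(fd, buf, sizeof(buf) - 1);
        if (n > 0) {
                buf[n] = '\0';
                printf("compressed bytes in zswap: %s", buf);
        }
        close(fd);
        return 0;
}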
+
static int __init mem_cgroup_swap_init(void)
{
- /* No memory control -> no swap control */
if (mem_cgroup_disabled())
- cgroup_memory_noswap = true;
-
- if (cgroup_memory_noswap)
return 0;
WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
-
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+ WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files));
+#endif
return 0;
}
-core_initcall(mem_cgroup_swap_init);
+subsys_initcall(mem_cgroup_swap_init);
-#endif /* CONFIG_MEMCG_SWAP */
+#endif /* CONFIG_SWAP */
diff --git a/mm/memfd.c b/mm/memfd.c
index 2647c898990c..2dba2cb6f0d0 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -18,6 +18,7 @@
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/memfd.h>
+#include <linux/pid_namespace.h>
#include <uapi/linux/memfd.h>
/*
@@ -31,20 +32,28 @@
static void memfd_tag_pins(struct xa_state *xas)
{
struct page *page;
- unsigned int tagged = 0;
+ int latency = 0;
+ int cache_count;
lru_add_drain();
xas_lock_irq(xas);
xas_for_each(xas, page, ULONG_MAX) {
- if (xa_is_value(page))
- continue;
- page = find_subpage(page, xas->xa_index);
- if (page_count(page) - page_mapcount(page) > 1)
+ cache_count = 1;
+ if (!xa_is_value(page) &&
+ PageTransHuge(page) && !PageHuge(page))
+ cache_count = HPAGE_PMD_NR;
+
+ if (!xa_is_value(page) &&
+ page_count(page) - total_mapcount(page) != cache_count)
xas_set_mark(xas, MEMFD_TAG_PINNED);
+ if (cache_count != 1)
+ xas_set(xas, page->index + cache_count);
- if (++tagged % XA_CHECK_SCHED)
+ latency += cache_count;
+ if (latency < XA_CHECK_SCHED)
continue;
+ latency = 0;
xas_pause(xas);
xas_unlock_irq(xas);
@@ -73,7 +82,8 @@ static int memfd_wait_for_pins(struct address_space *mapping)
error = 0;
for (scan = 0; scan <= LAST_SCAN; scan++) {
- unsigned int tagged = 0;
+ int latency = 0;
+ int cache_count;
if (!xas_marked(&xas, MEMFD_TAG_PINNED))
break;
@@ -87,10 +97,14 @@ static int memfd_wait_for_pins(struct address_space *mapping)
xas_lock_irq(&xas);
xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
bool clear = true;
- if (xa_is_value(page))
- continue;
- page = find_subpage(page, xas.xa_index);
- if (page_count(page) - page_mapcount(page) != 1) {
+
+ cache_count = 1;
+ if (!xa_is_value(page) &&
+ PageTransHuge(page) && !PageHuge(page))
+ cache_count = HPAGE_PMD_NR;
+
+ if (!xa_is_value(page) && cache_count !=
+ page_count(page) - total_mapcount(page)) {
/*
* On the last scan, we clean up all those tags
* we inserted; but make a note that we still
@@ -103,8 +117,11 @@ static int memfd_wait_for_pins(struct address_space *mapping)
}
if (clear)
xas_clear_mark(&xas, MEMFD_TAG_PINNED);
- if (++tagged % XA_CHECK_SCHED)
+
+ latency += cache_count;
+ if (latency < XA_CHECK_SCHED)
continue;
+ latency = 0;
xas_pause(&xas);
xas_unlock_irq(&xas);
@@ -131,6 +148,7 @@ static unsigned int *memfd_file_seals_ptr(struct file *file)
}
#define F_ALL_SEALS (F_SEAL_SEAL | \
+ F_SEAL_EXEC | \
F_SEAL_SHRINK | \
F_SEAL_GROW | \
F_SEAL_WRITE | \
@@ -159,6 +177,7 @@ static int memfd_add_seals(struct file *file, unsigned int seals)
* SEAL_SHRINK: Prevent the file from shrinking
* SEAL_GROW: Prevent the file from growing
* SEAL_WRITE: Prevent write access to the file
+ * SEAL_EXEC: Prevent modification of the exec bits in the file mode
*
* As we don't require any trust relationship between two parties, we
* must prevent seals from being removed. Therefore, sealing a file
@@ -203,6 +222,12 @@ static int memfd_add_seals(struct file *file, unsigned int seals)
}
}
+ /*
+ * SEAL_EXEC implies SEAL_WRITE, making W^X from the start.
+ */
+ if (seals & F_SEAL_EXEC && inode->i_mode & 0111)
+ seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE;
+
*file_seals |= seals;
error = 0;
@@ -218,16 +243,12 @@ static int memfd_get_seals(struct file *file)
return seals ? *seals : -EINVAL;
}
-long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
{
long error;
switch (cmd) {
case F_ADD_SEALS:
- /* disallow upper 32bit */
- if (arg > UINT_MAX)
- return -EINVAL;
-
error = memfd_add_seals(file, arg);
break;
case F_GET_SEALS:
@@ -245,7 +266,30 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)
+
+static int check_sysctl_memfd_noexec(unsigned int *flags)
+{
+#ifdef CONFIG_SYSCTL
+ struct pid_namespace *ns = task_active_pid_ns(current);
+ int sysctl = pidns_memfd_noexec_scope(ns);
+
+ if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
+ if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL)
+ *flags |= MFD_NOEXEC_SEAL;
+ else
+ *flags |= MFD_EXEC;
+ }
+
+ if (!(*flags & MFD_NOEXEC_SEAL) && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) {
+ pr_err_ratelimited(
+ "%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n",
+ current->comm, task_pid_nr(current), sysctl);
+ return -EACCES;
+ }
+#endif
+ return 0;
+}
SYSCALL_DEFINE2(memfd_create,
const char __user *, uname,
@@ -267,6 +311,20 @@ SYSCALL_DEFINE2(memfd_create,
return -EINVAL;
}
+ /* Invalid if both EXEC and NOEXEC_SEAL are set. */
+ if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
+ return -EINVAL;
+
+ if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
+ pr_warn_once(
+ "%s[%d]: memfd_create() called without MFD_EXEC or MFD_NOEXEC_SEAL set\n",
+ current->comm, task_pid_nr(current));
+ }
+
+ error = check_sysctl_memfd_noexec(&flags);
+ if (error < 0)
+ return error;
+
/* length includes terminating zero */
len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
if (len <= 0)
@@ -297,9 +355,7 @@ SYSCALL_DEFINE2(memfd_create,
}
if (flags & MFD_HUGETLB) {
- struct user_struct *user = NULL;
-
- file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
+ file = hugetlb_file_setup(name, 0, VM_NORESERVE,
HUGETLB_ANONHUGE_INODE,
(flags >> MFD_HUGE_SHIFT) &
MFD_HUGE_MASK);
@@ -312,9 +368,20 @@ SYSCALL_DEFINE2(memfd_create,
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
file->f_flags |= O_LARGEFILE;
- if (flags & MFD_ALLOW_SEALING) {
+ if (flags & MFD_NOEXEC_SEAL) {
+ struct inode *inode = file_inode(file);
+
+ inode->i_mode &= ~0111;
+ file_seals = memfd_file_seals_ptr(file);
+ if (file_seals) {
+ *file_seals &= ~F_SEAL_SEAL;
+ *file_seals |= F_SEAL_EXEC;
+ }
+ } else if (flags & MFD_ALLOW_SEALING) {
+ /* MFD_EXEC and MFD_ALLOW_SEALING are set */
file_seals = memfd_file_seals_ptr(file);
- *file_seals &= ~F_SEAL_SEAL;
+ if (file_seals)
+ *file_seals &= ~F_SEAL_SEAL;
}
fd_install(fd, file);
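The MFD_NOEXEC_SEAL path above clears the exec bits on the new inode and pre-applies F_SEAL_EXEC. A minimal userspace sketch creating such a memfd and reading the seals back; the fallback #defines are only for older uapi headers and mirror the values used in <linux/memfd.h> and <linux/fcntl.h>:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>

#ifndef MFD_NOEXEC_SEAL
#define MFD_NOEXEC_SEAL 0x0008U         /* from <linux/memfd.h> */
#endif
#ifndef F_SEAL_EXEC
#define F_SEAL_EXEC 0x0020              /* from <linux/fcntl.h> */
#endif

int main(void)
{
        int fd = memfd_create("demo", MFD_CLOEXEC | MFD_NOEXEC_SEAL);
        int seals;

        if (fd < 0) {
                perror("memfd_create");
                return 1;
        }
        seals = fcntl(fd, F_GET_SEALS);
        if (seals < 0) {
                perror("F_GET_SEALS");
                return 1;
        }
        /* F_SEAL_EXEC is already set; the file mode carries no exec bits. */
        printf("seals: %#x (exec-sealed: %s)\n", seals,
               (seals & F_SEAL_EXEC) ? "yes" : "no");
        return 0;
}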
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 990e3b2e37d5..fe121fdb05f7 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -6,16 +6,16 @@
* High level machine check handler. Handles pages reported by the
* hardware as being corrupted usually due to a multi-bit ECC memory or cache
* failure.
- *
+ *
 * In addition there is a "soft offline" entry point that allows us to stop
 * using suspicious, not-yet-corrupted pages without killing anything.
*
* Handles page cache pages in various states. The tricky part
- * here is that we can access any page asynchronously in respect to
- * other VM users, because memory failures could happen anytime and
- * anywhere. This could violate some of their assumptions. This is why
- * this code has to be extremely careful. Generally it tries to use
- * normal locking rules, as in get the standard locks, even if that means
+ * here is that we can access any page asynchronously in respect to
+ * other VM users, because memory failures could happen anytime and
+ * anywhere. This could violate some of their assumptions. This is why
+ * this code has to be extremely careful. Generally it tries to use
+ * normal locking rules, as in get the standard locks, even if that means
* the error handling takes potentially a long time.
*
* It can be very tempting to add handling for obscure cases here.
@@ -24,21 +24,25 @@
* - You have a test that can be added to mce-test
* https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
* - The case actually shows up as a frequent (top 10) page state in
- * tools/vm/page-types when running a real workload.
- *
+ * tools/mm/page-types when running a real workload.
+ *
* There are several operations here with exponential complexity because
- * of unsuitable VM data structures. For example the operation to map back
- * from RMAP chains to processes has to walk the complete process list and
+ * of unsuitable VM data structures. For example the operation to map back
+ * from RMAP chains to processes has to walk the complete process list and
* has non linear complexity with the number. But since memory corruptions
- * are rare we hope to get away with this. This avoids impacting the core
+ * are rare we hope to get away with this. This avoids impacting the core
* VM.
*/
+
+#define pr_fmt(fmt) "Memory failure: " fmt
+
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
+#include <linux/dax.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
@@ -56,16 +60,138 @@
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include <linux/page-isolation.h>
+#include <linux/pagewalk.h>
+#include <linux/shmem_fs.h>
+#include <linux/sysctl.h>
+#include "swap.h"
#include "internal.h"
#include "ras/ras_event.h"
-int sysctl_memory_failure_early_kill __read_mostly = 0;
+static int sysctl_memory_failure_early_kill __read_mostly;
-int sysctl_memory_failure_recovery __read_mostly = 1;
+static int sysctl_memory_failure_recovery __read_mostly = 1;
atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
-#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
+static bool hw_memory_failure __read_mostly = false;
+
+inline void num_poisoned_pages_inc(unsigned long pfn)
+{
+ atomic_long_inc(&num_poisoned_pages);
+ memblk_nr_poison_inc(pfn);
+}
+
+inline void num_poisoned_pages_sub(unsigned long pfn, long i)
+{
+ atomic_long_sub(i, &num_poisoned_pages);
+ if (pfn != -1UL)
+ memblk_nr_poison_sub(pfn, i);
+}
+
+/**
+ * MF_ATTR_RO - Create sysfs entry for each memory failure statistics.
+ * @_name: name of the file in the per NUMA sysfs directory.
+ */
+#define MF_ATTR_RO(_name) \
+static ssize_t _name##_show(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+{ \
+ struct memory_failure_stats *mf_stats = \
+ &NODE_DATA(dev->id)->mf_stats; \
+ return sprintf(buf, "%lu\n", mf_stats->_name); \
+} \
+static DEVICE_ATTR_RO(_name)
+
+MF_ATTR_RO(total);
+MF_ATTR_RO(ignored);
+MF_ATTR_RO(failed);
+MF_ATTR_RO(delayed);
+MF_ATTR_RO(recovered);
+
+static struct attribute *memory_failure_attr[] = {
+ &dev_attr_total.attr,
+ &dev_attr_ignored.attr,
+ &dev_attr_failed.attr,
+ &dev_attr_delayed.attr,
+ &dev_attr_recovered.attr,
+ NULL,
+};
+
+const struct attribute_group memory_failure_attr_group = {
+ .name = "memory_failure",
+ .attrs = memory_failure_attr,
+};
+
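The attribute group above is meant to hang off each NUMA node's sysfs directory, giving per-node counters of how memory failure handling ended. A short sketch that dumps them for node 0; the /sys/devices/system/node/node0/memory_failure/ location is an assumption based on where node device attributes normally appear:

#include <stdio.h>

int main(void)
{
        static const char *const stats[] = {
                "total", "ignored", "failed", "delayed", "recovered",
        };
        char path[128], buf[32];

        for (unsigned int i = 0; i < sizeof(stats) / sizeof(stats[0]); i++) {
                FILE *f;

                /* Assumed sysfs location of node 0's memory_failure group. */
                snprintf(path, sizeof(path),
                         "/sys/devices/system/node/node0/memory_failure/%s",
                         stats[i]);
                f = fopen(path, "r");
                if (!f)
                        continue;       /* older kernel or no MF support */
                if (fgets(buf, sizeof(buf), f))
                        printf("%-10s %s", stats[i], buf);
                fclose(f);
        }
        return 0;
}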
+static struct ctl_table memory_failure_table[] = {
+ {
+ .procname = "memory_failure_early_kill",
+ .data = &sysctl_memory_failure_early_kill,
+ .maxlen = sizeof(sysctl_memory_failure_early_kill),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "memory_failure_recovery",
+ .data = &sysctl_memory_failure_recovery,
+ .maxlen = sizeof(sysctl_memory_failure_recovery),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ { }
+};
+
+/*
+ * Return values:
+ * 1: the page is dissolved (if needed) and taken off from buddy,
+ * 0: the page is dissolved (if needed) and not taken off from buddy,
+ * < 0: failed to dissolve.
+ */
+static int __page_handle_poison(struct page *page)
+{
+ int ret;
+
+ zone_pcp_disable(page_zone(page));
+ ret = dissolve_free_huge_page(page);
+ if (!ret)
+ ret = take_page_off_buddy(page);
+ zone_pcp_enable(page_zone(page));
+
+ return ret;
+}
+
+static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
+{
+ if (hugepage_or_freepage) {
+ /*
+ * Doing this check for free pages is also fine since dissolve_free_huge_page
+ * returns 0 for non-hugetlb pages as well.
+ */
+ if (__page_handle_poison(page) <= 0)
+ /*
+ * We could fail to take the target page off the buddy list,
+ * for example due to a racy page allocation, but that's
+ * acceptable because a soft-offlined page is not broken,
+ * and whoever really wants to use it is free to take it.
+ */
+ return false;
+ }
+
+ SetPageHWPoison(page);
+ if (release)
+ put_page(page);
+ page_ref_inc(page);
+ num_poisoned_pages_inc(page_to_pfn(page));
+
+ return true;
+}
+
+#if IS_ENABLED(CONFIG_HWPOISON_INJECT)
u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
@@ -87,12 +213,6 @@ static int hwpoison_filter_dev(struct page *p)
hwpoison_filter_dev_minor == ~0U)
return 0;
- /*
- * page_mapping() does not accept slab pages.
- */
- if (PageSlab(p))
- return -EINVAL;
-
mapping = page_mapping(p);
if (mapping == NULL || mapping->host == NULL)
return -EINVAL;
@@ -212,66 +332,63 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
short addr_lsb = tk->size_shift;
int ret = 0;
- pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
+ pr_err("%#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
pfn, t->comm, t->pid);
- if (flags & MF_ACTION_REQUIRED) {
- WARN_ON_ONCE(t != current);
+ if ((flags & MF_ACTION_REQUIRED) && (t == current))
ret = force_sig_mceerr(BUS_MCEERR_AR,
- (void __user *)tk->addr, addr_lsb);
- } else {
+ (void __user *)tk->addr, addr_lsb);
+ else
/*
+ * Signal other processes sharing the page if they have
+ * PF_MCE_EARLY set.
* Don't use force here, it's convenient if the signal
* can be temporarily blocked.
* This could cause a loop when the user sets SIGBUS
* to SIG_IGN, but hopefully no one will do that?
*/
ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
- addr_lsb, t); /* synchronous? */
- }
+ addr_lsb, t);
if (ret < 0)
- pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
+ pr_info("Error sending signal to %s:%d: %d\n",
t->comm, t->pid, ret);
return ret;
}
/*
- * When a unknown page type is encountered drain as many buffers as possible
- * in the hope to turn the page into a LRU or free page, which we can handle.
+ * Unknown page type encountered. Try to turn it into an LRU page (which we
+ * can handle) via lru_add_drain_all().
*/
-void shake_page(struct page *p, int access)
+void shake_page(struct page *p)
{
if (PageHuge(p))
return;
if (!PageSlab(p)) {
lru_add_drain_all();
- if (PageLRU(p))
- return;
- drain_all_pages(page_zone(p));
if (PageLRU(p) || is_free_buddy_page(p))
return;
}
/*
- * Only call shrink_node_slabs here (which would also shrink
- * other caches) if access is not potentially fatal.
+ * TODO: Could shrink slab caches here once a lightweight range-based
+ * shrinker becomes available.
*/
- if (access)
- drop_slab_node(page_to_nid(p));
}
EXPORT_SYMBOL_GPL(shake_page);
-static unsigned long dev_pagemap_mapping_shift(struct page *page,
- struct vm_area_struct *vma)
+static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
+ unsigned long address)
{
- unsigned long address = vma_address(page, vma);
+ unsigned long ret = 0;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ pte_t ptent;
+ VM_BUG_ON_VMA(address == -EFAULT, vma);
pgd = pgd_offset(vma->vm_mm, address);
if (!pgd_present(*pgd))
return 0;
@@ -289,11 +406,13 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
if (pmd_devmap(*pmd))
return PMD_SHIFT;
pte = pte_offset_map(pmd, address);
- if (!pte_present(*pte))
+ if (!pte)
return 0;
- if (pte_devmap(*pte))
- return PAGE_SHIFT;
- return 0;
+ ptent = ptep_get(pte);
+ if (pte_present(ptent) && pte_devmap(ptent))
+ ret = PAGE_SHIFT;
+ pte_unmap(pte);
+ return ret;
}
/*
@@ -301,26 +420,36 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
* not much we can do. We just print a message and ignore otherwise.
*/
+#define FSDAX_INVALID_PGOFF ULONG_MAX
+
/*
* Schedule a process for later kill.
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
+ *
+ * Note: @fsdax_pgoff is used only when @p is a fsdax page and a
+ * filesystem with a memory failure handler has claimed the
+ * memory_failure event. In all other cases, page->index and
+ * page->mapping are sufficient for mapping the page back to its
+ * corresponding user virtual address.
*/
-static void add_to_kill(struct task_struct *tsk, struct page *p,
- struct vm_area_struct *vma,
- struct list_head *to_kill)
+static void __add_to_kill(struct task_struct *tsk, struct page *p,
+ struct vm_area_struct *vma, struct list_head *to_kill,
+ unsigned long ksm_addr, pgoff_t fsdax_pgoff)
{
struct to_kill *tk;
tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
if (!tk) {
- pr_err("Memory failure: Out of memory while machine check handling\n");
+ pr_err("Out of memory while machine check handling\n");
return;
}
- tk->addr = page_address_in_vma(p, vma);
- if (is_zone_device_page(p))
- tk->size_shift = dev_pagemap_mapping_shift(p, vma);
- else
+ tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(p, vma);
+ if (is_zone_device_page(p)) {
+ if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
+ tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
+ tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
+ } else
tk->size_shift = page_shift(compound_head(p));
/*
@@ -334,7 +463,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* has a mapping for the page.
*/
if (tk->addr == -EFAULT) {
- pr_info("Memory failure: Unable to find user space address %lx in %s\n",
+ pr_info("Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm);
} else if (tk->size_shift == 0) {
kfree(tk);
@@ -346,11 +475,39 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
list_add_tail(&tk->nd, to_kill);
}
+static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
+ struct vm_area_struct *vma,
+ struct list_head *to_kill)
+{
+ __add_to_kill(tsk, p, vma, to_kill, 0, FSDAX_INVALID_PGOFF);
+}
+
+#ifdef CONFIG_KSM
+static bool task_in_to_kill_list(struct list_head *to_kill,
+ struct task_struct *tsk)
+{
+ struct to_kill *tk, *next;
+
+ list_for_each_entry_safe(tk, next, to_kill, nd) {
+ if (tk->tsk == tsk)
+ return true;
+ }
+
+ return false;
+}
+void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
+ struct vm_area_struct *vma, struct list_head *to_kill,
+ unsigned long ksm_addr)
+{
+ if (!task_in_to_kill_list(to_kill, tsk))
+ __add_to_kill(tsk, p, vma, to_kill, ksm_addr, FSDAX_INVALID_PGOFF);
+}
+#endif
/*
* Kill the processes that have been collected earlier.
*
- * Only do anything when DOIT is set, otherwise just free the list
- * (this is used for clean pages which do not need killing)
+ * Only do anything when FORCEKILL is set, otherwise just free the
+ * list (this is used for clean pages which do not need killing)
* Also when FAIL is set do a force kill because something went
* wrong earlier.
*/
@@ -359,7 +516,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
{
struct to_kill *tk, *next;
- list_for_each_entry_safe (tk, next, to_kill, nd) {
+ list_for_each_entry_safe(tk, next, to_kill, nd) {
if (forcekill) {
/*
* In case something went wrong with munmapping
@@ -367,7 +524,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
* signal and then access the memory. Just kill it.
*/
if (fail || tk->addr == -EFAULT) {
- pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+ pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
pfn, tk->tsk->comm, tk->tsk->pid);
do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
tk->tsk, PIDTYPE_PID);
@@ -380,9 +537,10 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
* process anyways.
*/
else if (kill_proc(tk, pfn, flags) < 0)
- pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
+ pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
}
+ list_del(&tk->nd);
put_task_struct(tk->tsk);
kfree(tk);
}
@@ -416,26 +574,25 @@ static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
* Determine whether a given process is "early kill" process which expects
* to be signaled when some page under the process is hwpoisoned.
* Return task_struct of the dedicated thread (main thread unless explicitly
- * specified) if the process is "early kill," and otherwise returns NULL.
+ * specified) if the process is "early kill" and otherwise returns NULL.
*
- * Note that the above is true for Action Optional case, but not for Action
- * Required case where SIGBUS should sent only to the current thread.
+ * Note that the above is true for the Action Optional case. For the Action
+ * Required case, SIGBUS is only meaningful to the current thread, which needs
+ * to be signaled; for the other, non-current processes sharing the same error
+ * page the error is effectively Action Optional, so if such a process is
+ * "early kill" the task_struct of its dedicated thread is returned as well.
*/
-static struct task_struct *task_early_kill(struct task_struct *tsk,
- int force_early)
+struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
{
if (!tsk->mm)
return NULL;
- if (force_early) {
- /*
- * Comparing ->mm here because current task might represent
- * a subthread, while tsk always points to the main thread.
- */
- if (tsk->mm == current->mm)
- return current;
- else
- return NULL;
- }
+ /*
+ * Comparing ->mm here because current task might represent
+ * a subthread, while tsk always points to the main thread.
+ */
+ if (force_early && tsk->mm == current->mm)
+ return current;
+
return find_early_kill_thread(tsk);
}
@@ -445,12 +602,13 @@ static struct task_struct *task_early_kill(struct task_struct *tsk,
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
int force_early)
{
+ struct folio *folio = page_folio(page);
struct vm_area_struct *vma;
struct task_struct *tsk;
struct anon_vma *av;
pgoff_t pgoff;
- av = page_lock_anon_vma_read(page);
+ av = folio_lock_anon_vma_read(folio, NULL);
if (av == NULL) /* Not actually mapped anymore */
return;
@@ -465,14 +623,15 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
anon_vma_interval_tree_foreach(vmac, &av->rb_root,
pgoff, pgoff) {
vma = vmac->vma;
+ if (vma->vm_mm != t->mm)
+ continue;
if (!page_mapped_in_vma(page, vma))
continue;
- if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill);
+ add_to_kill_anon_file(t, page, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
- page_unlock_anon_vma_read(av);
+ anon_vma_unlock_read(av);
}
/*
@@ -504,12 +663,47 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* to be informed of all such data corruptions.
*/
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill);
+ add_to_kill_anon_file(t, page, vma, to_kill);
+ }
+ }
+ read_unlock(&tasklist_lock);
+ i_mmap_unlock_read(mapping);
+}
+
+#ifdef CONFIG_FS_DAX
+static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
+ struct vm_area_struct *vma,
+ struct list_head *to_kill, pgoff_t pgoff)
+{
+ __add_to_kill(tsk, p, vma, to_kill, 0, pgoff);
+}
+
+/*
+ * Collect processes when the error hit a fsdax page.
+ */
+static void collect_procs_fsdax(struct page *page,
+ struct address_space *mapping, pgoff_t pgoff,
+ struct list_head *to_kill)
+{
+ struct vm_area_struct *vma;
+ struct task_struct *tsk;
+
+ i_mmap_lock_read(mapping);
+ read_lock(&tasklist_lock);
+ for_each_process(tsk) {
+ struct task_struct *t = task_early_kill(tsk, true);
+
+ if (!t)
+ continue;
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ if (vma->vm_mm == t->mm)
+ add_to_kill_fsdax(t, page, vma, to_kill, pgoff);
}
}
read_unlock(&tasklist_lock);
i_mmap_unlock_read(mapping);
}
+#endif /* CONFIG_FS_DAX */
/*
* Collect the processes who have the corrupted page mapped to kill.
@@ -519,13 +713,163 @@ static void collect_procs(struct page *page, struct list_head *tokill,
{
if (!page->mapping)
return;
-
- if (PageAnon(page))
+ if (unlikely(PageKsm(page)))
+ collect_procs_ksm(page, tokill, force_early);
+ else if (PageAnon(page))
collect_procs_anon(page, tokill, force_early);
else
collect_procs_file(page, tokill, force_early);
}
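+/*
+ * State for walking a task's page tables in order to find the virtual
+ * address mapping a given poisoned pfn (see kill_accessing_process()).
+ */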
+struct hwp_walk {
+ struct to_kill tk;
+ unsigned long pfn;
+ int flags;
+};
+
+static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift)
+{
+ tk->addr = addr;
+ tk->size_shift = shift;
+}
+
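+/* Record addr/shift in @tk if this pte (or hwpoison swap entry) maps @poisoned_pfn. */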
+static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
+ unsigned long poisoned_pfn, struct to_kill *tk)
+{
+ unsigned long pfn = 0;
+
+ if (pte_present(pte)) {
+ pfn = pte_pfn(pte);
+ } else {
+ swp_entry_t swp = pte_to_swp_entry(pte);
+
+ if (is_hwpoison_entry(swp))
+ pfn = swp_offset_pfn(swp);
+ }
+
+ if (!pfn || pfn != poisoned_pfn)
+ return 0;
+
+ set_to_kill(tk, addr, shift);
+ return 1;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
+ struct hwp_walk *hwp)
+{
+ pmd_t pmd = *pmdp;
+ unsigned long pfn;
+ unsigned long hwpoison_vaddr;
+
+ if (!pmd_present(pmd))
+ return 0;
+ pfn = pmd_pfn(pmd);
+ if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) {
+ hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT);
+ set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT);
+ return 1;
+ }
+ return 0;
+}
+#else
+static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
+ struct hwp_walk *hwp)
+{
+ return 0;
+}
+#endif
+
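+/*
+ * Page-walk callback: check a huge PMD mapping first, then each PTE in the
+ * range, stopping as soon as the poisoned pfn is found.
+ */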
+static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct hwp_walk *hwp = walk->private;
+ int ret = 0;
+ pte_t *ptep, *mapped_pte;
+ spinlock_t *ptl;
+
+ ptl = pmd_trans_huge_lock(pmdp, walk->vma);
+ if (ptl) {
+ ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp);
+ spin_unlock(ptl);
+ goto out;
+ }
+
+ mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp,
+ addr, &ptl);
+ if (!ptep)
+ goto out;
+
+ for (; addr != end; ptep++, addr += PAGE_SIZE) {
+ ret = check_hwpoisoned_entry(ptep_get(ptep), addr, PAGE_SHIFT,
+ hwp->pfn, &hwp->tk);
+ if (ret == 1)
+ break;
+ }
+ pte_unmap_unlock(mapped_pte, ptl);
+out:
+ cond_resched();
+ return ret;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hwp_walk *hwp = walk->private;
+ pte_t pte = huge_ptep_get(ptep);
+ struct hstate *h = hstate_vma(walk->vma);
+
+ return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
+ hwp->pfn, &hwp->tk);
+}
+#else
+#define hwpoison_hugetlb_range NULL
+#endif
+
+static const struct mm_walk_ops hwp_walk_ops = {
+ .pmd_entry = hwpoison_pte_range,
+ .hugetlb_entry = hwpoison_hugetlb_range,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+/*
+ * Sends SIGBUS to the current process with error info.
+ *
+ * This function is intended to handle "Action Required" MCEs on already
+ * hardware poisoned pages. They could happen, for example, when
+ * memory_failure() failed to unmap the error page at the first call, or
+ * when multiple local machine checks happened on different CPUs.
+ *
+ * The MCE handler currently has no easy access to the error virtual address,
+ * so this function walks the page tables to find it. The returned virtual
+ * address is correct in most cases, but it could be wrong when the
+ * application process has multiple entries mapping the error page.
+ */
+static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
+ int flags)
+{
+ int ret;
+ struct hwp_walk priv = {
+ .pfn = pfn,
+ };
+ priv.tk.tsk = p;
+
+ if (!p->mm)
+ return -EFAULT;
+
+ mmap_read_lock(p->mm);
+ ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops,
+ (void *)&priv);
+ if (ret == 1 && priv.tk.addr)
+ kill_proc(&priv.tk, pfn, flags);
+ else
+ ret = 0;
+ mmap_read_unlock(p->mm);
+ return ret > 0 ? -EHWPOISON : -EFAULT;
+}
+
static const char *action_name[] = {
[MF_IGNORED] = "Ignored",
[MF_FAILED] = "Failed",
@@ -538,10 +882,8 @@ static const char * const action_page_types[] = {
[MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
[MF_MSG_SLAB] = "kernel slab page",
[MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
- [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned",
[MF_MSG_HUGE] = "huge page",
[MF_MSG_FREE_HUGE] = "free huge page",
- [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page",
[MF_MSG_UNMAP_FAILED] = "unmapping failed page",
[MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
[MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
@@ -553,8 +895,8 @@ static const char * const action_page_types[] = {
[MF_MSG_CLEAN_LRU] = "clean LRU page",
[MF_MSG_TRUNCATED_LRU] = "already truncated LRU page",
[MF_MSG_BUDDY] = "free buddy page",
- [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
[MF_MSG_DAX] = "dax page",
+ [MF_MSG_UNSPLIT_THP] = "unsplit thp",
[MF_MSG_UNKNOWN] = "unknown page",
};
@@ -566,7 +908,7 @@ static const char * const action_page_types[] = {
*/
static int delete_from_lru_cache(struct page *p)
{
- if (!isolate_lru_page(p)) {
+ if (isolate_lru_page(p)) {
/*
* Clear sensible page flags, so that the buddy system won't
* complain when the page is unpoison-and-freed.
@@ -578,7 +920,7 @@ static int delete_from_lru_cache(struct page *p)
* Poisoned page might never drop its ref count to 0 so we have
* to uncharge it manually from its memcg.
*/
- mem_cgroup_uncharge(p);
+ mem_cgroup_uncharge(page_folio(p));
/*
* drop the page count elevated by isolate_lru_page()
@@ -595,15 +937,14 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
int ret = MF_FAILED;
if (mapping->a_ops->error_remove_page) {
+ struct folio *folio = page_folio(p);
int err = mapping->a_ops->error_remove_page(mapping, p);
if (err != 0) {
- pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
- pfn, err);
- } else if (page_has_private(p) &&
- !try_to_release_page(p, GFP_NOIO)) {
- pr_info("Memory failure: %#lx: failed to release buffers\n",
- pfn);
+ pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
+ } else if (folio_has_private(folio) &&
+ !filemap_release_folio(folio, GFP_NOIO)) {
+ pr_info("%#lx: failed to release buffers\n", pfn);
} else {
ret = MF_RECOVERED;
}
@@ -615,38 +956,73 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
if (invalidate_inode_page(p))
ret = MF_RECOVERED;
else
- pr_info("Memory failure: %#lx: Failed to invalidate\n",
- pfn);
+ pr_info("%#lx: Failed to invalidate\n", pfn);
}
return ret;
}
+struct page_state {
+ unsigned long mask;
+ unsigned long res;
+ enum mf_action_page_type type;
+
+ /* Callback ->action() has to unlock the relevant page inside it. */
+ int (*action)(struct page_state *ps, struct page *p);
+};
+
+/*
+ * Return true if the page is still referenced by others, otherwise return
+ * false.
+ *
+ * extra_pins is true when one extra refcount is expected.
+ */
+static bool has_extra_refcount(struct page_state *ps, struct page *p,
+ bool extra_pins)
+{
+ int count = page_count(p) - 1;
+
+ if (extra_pins)
+ count -= 1;
+
+ if (count > 0) {
+ pr_err("%#lx: %s still referenced by %d users\n",
+ page_to_pfn(p), action_page_types[ps->type], count);
+ return true;
+ }
+
+ return false;
+}
+
/*
* Error hit kernel page.
* Do nothing, try to be lucky and not touch this instead. For a few cases we
* could be more sophisticated.
*/
-static int me_kernel(struct page *p, unsigned long pfn)
+static int me_kernel(struct page_state *ps, struct page *p)
{
+ unlock_page(p);
return MF_IGNORED;
}
/*
* Page in unknown state. Do nothing.
*/
-static int me_unknown(struct page *p, unsigned long pfn)
+static int me_unknown(struct page_state *ps, struct page *p)
{
- pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
+ pr_err("%#lx: Unknown page state\n", page_to_pfn(p));
+ unlock_page(p);
return MF_FAILED;
}
/*
* Clean (or cleaned) page cache page.
*/
-static int me_pagecache_clean(struct page *p, unsigned long pfn)
+static int me_pagecache_clean(struct page_state *ps, struct page *p)
{
+ int ret;
struct address_space *mapping;
+ bool extra_pins;
delete_from_lru_cache(p);
@@ -654,8 +1030,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* For anonymous pages we're done the only reference left
* should be the one m_f() holds.
*/
- if (PageAnon(p))
- return MF_RECOVERED;
+ if (PageAnon(p)) {
+ ret = MF_RECOVERED;
+ goto out;
+ }
/*
* Now truncate the page in the page cache. This is really
@@ -669,15 +1047,29 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
/*
* Page has been teared down in the meanwhile
*/
- return MF_FAILED;
+ ret = MF_FAILED;
+ goto out;
}
/*
+ * A shmem page is kept in the page cache instead of being truncated,
+ * so it is expected to have an extra refcount after error handling.
+ */
+ extra_pins = shmem_mapping(mapping);
+
+ /*
* Truncation is a bit tricky. Enable it per file system for now.
*
- * Open: to take i_mutex or not for this? Right now we don't.
+ * Open: to take i_rwsem or not for this? Right now we don't.
*/
- return truncate_error_page(p, pfn, mapping);
+ ret = truncate_error_page(p, page_to_pfn(p), mapping);
+ if (has_extra_refcount(ps, p, extra_pins))
+ ret = MF_FAILED;
+
+out:
+ unlock_page(p);
+
+ return ret;
}
/*
@@ -685,7 +1077,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* Issues: when the error hit a hole page the error is not properly
* propagated.
*/
-static int me_pagecache_dirty(struct page *p, unsigned long pfn)
+static int me_pagecache_dirty(struct page_state *ps, struct page *p)
{
struct address_space *mapping = page_mapping(p);
@@ -729,7 +1121,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
mapping_set_error(mapping, -EIO);
}
- return me_pagecache_clean(p, pfn);
+ return me_pagecache_clean(ps, p);
}
/*
@@ -739,7 +1131,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
* cache and swap cache(ie. page is freshly swapped in). So it could be
* referenced concurrently by 2 types of PTEs:
* normal PTEs and swap PTEs. We try to handle them consistently by calling
- * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
+ * try_to_unmap(!TTU_HWPOISON) to convert the normal PTEs to swap PTEs,
* and then
* - clear dirty bit to prevent IO
* - remove from LRU
@@ -751,26 +1143,41 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
* Clean swap cache pages can be directly isolated. A later page fault will
* bring in the known good data from disk.
*/
-static int me_swapcache_dirty(struct page *p, unsigned long pfn)
+static int me_swapcache_dirty(struct page_state *ps, struct page *p)
{
+ int ret;
+ bool extra_pins = false;
+
ClearPageDirty(p);
/* Trigger EIO in shmem: */
ClearPageUptodate(p);
- if (!delete_from_lru_cache(p))
- return MF_DELAYED;
- else
- return MF_FAILED;
+ ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
+ unlock_page(p);
+
+ if (ret == MF_DELAYED)
+ extra_pins = true;
+
+ if (has_extra_refcount(ps, p, extra_pins))
+ ret = MF_FAILED;
+
+ return ret;
}
-static int me_swapcache_clean(struct page *p, unsigned long pfn)
+static int me_swapcache_clean(struct page_state *ps, struct page *p)
{
- delete_from_swap_cache(p);
+ struct folio *folio = page_folio(p);
+ int ret;
- if (!delete_from_lru_cache(p))
- return MF_RECOVERED;
- else
- return MF_FAILED;
+ delete_from_swap_cache(folio);
+
+ ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
+ folio_unlock(folio);
+
+ if (has_extra_refcount(ps, p, false))
+ ret = MF_FAILED;
+
+ return ret;
}
/*
@@ -779,32 +1186,41 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
* - Error on hugepage is contained in hugepage unit (not in raw page unit.)
* To narrow down kill region to one page, we need to break up pmd.
*/
-static int me_huge_page(struct page *p, unsigned long pfn)
+static int me_huge_page(struct page_state *ps, struct page *p)
{
- int res = 0;
+ int res;
struct page *hpage = compound_head(p);
struct address_space *mapping;
+ bool extra_pins = false;
if (!PageHuge(hpage))
return MF_DELAYED;
mapping = page_mapping(hpage);
if (mapping) {
- res = truncate_error_page(hpage, pfn, mapping);
+ res = truncate_error_page(hpage, page_to_pfn(p), mapping);
+ /* The page is kept in page cache. */
+ extra_pins = true;
+ unlock_page(hpage);
} else {
unlock_page(hpage);
/*
- * migration entry prevents later access on error anonymous
- * hugepage, so we can free and dissolve it into buddy to
- * save healthy subpages.
+ * migration entry prevents later access on error hugepage,
+ * so we can free and dissolve it into buddy to save healthy
+ * subpages.
*/
- if (PageAnon(hpage))
- put_page(hpage);
- dissolve_free_huge_page(p);
- res = MF_RECOVERED;
- lock_page(hpage);
+ put_page(hpage);
+ if (__page_handle_poison(p) >= 0) {
+ page_ref_inc(p);
+ res = MF_RECOVERED;
+ } else {
+ res = MF_FAILED;
+ }
}
+ if (has_extra_refcount(ps, p, extra_pins))
+ res = MF_FAILED;
+
return res;
}
@@ -830,12 +1246,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
#define slab (1UL << PG_slab)
#define reserved (1UL << PG_reserved)
-static struct page_state {
- unsigned long mask;
- unsigned long res;
- enum mf_action_page_type type;
- int (*action)(struct page *p, unsigned long pfn);
-} error_states[] = {
+static struct page_state error_states[] = {
{ reserved, reserved, MF_MSG_KERNEL, me_kernel },
/*
* free pages are specially detected outside this table:
@@ -878,103 +1289,277 @@ static struct page_state {
#undef slab
#undef reserved
+static void update_per_node_mf_stats(unsigned long pfn,
+ enum mf_result result)
+{
+ int nid = MAX_NUMNODES;
+ struct memory_failure_stats *mf_stats = NULL;
+
+ nid = pfn_to_nid(pfn);
+ if (unlikely(nid < 0 || nid >= MAX_NUMNODES)) {
+ WARN_ONCE(1, "Memory failure: pfn=%#lx, invalid nid=%d", pfn, nid);
+ return;
+ }
+
+ mf_stats = &NODE_DATA(nid)->mf_stats;
+ switch (result) {
+ case MF_IGNORED:
+ ++mf_stats->ignored;
+ break;
+ case MF_FAILED:
+ ++mf_stats->failed;
+ break;
+ case MF_DELAYED:
+ ++mf_stats->delayed;
+ break;
+ case MF_RECOVERED:
+ ++mf_stats->recovered;
+ break;
+ default:
+ WARN_ONCE(1, "Memory failure: mf_result=%d is not properly handled", result);
+ break;
+ }
+ ++mf_stats->total;
+}
+
/*
* "Dirty/Clean" indication is not 100% accurate due to the possibility of
* setting PG_dirty outside page lock. See also comment above set_page_dirty().
*/
-static void action_result(unsigned long pfn, enum mf_action_page_type type,
- enum mf_result result)
+static int action_result(unsigned long pfn, enum mf_action_page_type type,
+ enum mf_result result)
{
trace_memory_failure_event(pfn, type, result);
- pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
+ num_poisoned_pages_inc(pfn);
+
+ update_per_node_mf_stats(pfn, result);
+
+ pr_err("%#lx: recovery action for %s: %s\n",
pfn, action_page_types[type], action_name[result]);
+
+ return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}
static int page_action(struct page_state *ps, struct page *p,
unsigned long pfn)
{
int result;
- int count;
-
- result = ps->action(p, pfn);
- count = page_count(p) - 1;
- if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
- count--;
- if (count > 0) {
- pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
- pfn, action_page_types[ps->type], count);
- result = MF_FAILED;
- }
- action_result(pfn, ps->type, result);
+ /* page p should be unlocked after returning from ps->action(). */
+ result = ps->action(ps, p);
/* Could do more checks here if page looks ok */
/*
* Could adjust zone counters here to correct for the missing page.
*/
- return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
+ return action_result(pfn, ps->type, result);
}
-/**
- * get_hwpoison_page() - Get refcount for memory error handling:
- * @page: raw error page (hit by memory error)
- *
- * Return: return 0 if failed to grab the refcount, otherwise true (some
- * non-zero value.)
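+/*
+ * A hwpoisoned page that has also been taken off the buddy freelist is
+ * marked by stashing MAGIC_HWPOISON in page->private, so that unpoisoning
+ * can detect and undo both operations.
+ */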
+static inline bool PageHWPoisonTakenOff(struct page *page)
+{
+ return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON;
+}
+
+void SetPageHWPoisonTakenOff(struct page *page)
+{
+ set_page_private(page, MAGIC_HWPOISON);
+}
+
+void ClearPageHWPoisonTakenOff(struct page *page)
+{
+ if (PageHWPoison(page))
+ set_page_private(page, 0);
+}
+
+/*
+ * Return true if the page type of the given page is supported by the
+ * hwpoison mechanism (though handling could still fail), otherwise false.
+ * This function does not return true for hugetlb or device memory pages, so
+ * it's assumed to be called only in contexts where such pages never occur.
*/
-int get_hwpoison_page(struct page *page)
+static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
+{
+ /* Soft offline could migrate non-LRU movable pages */
+ if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
+ return true;
+
+ return PageLRU(page) || is_free_buddy_page(page);
+}
+
+static int __get_hwpoison_page(struct page *page, unsigned long flags)
+{
+ struct folio *folio = page_folio(page);
+ int ret = 0;
+ bool hugetlb = false;
+
+ ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, false);
+ if (hugetlb)
+ return ret;
+
+ /*
+ * This check prevents calling folio_try_get() for any unsupported
+ * type of folio, in order to reduce the risk of unexpected races
+ * caused by taking a folio refcount.
+ */
+ if (!HWPoisonHandlable(&folio->page, flags))
+ return -EBUSY;
+
+ if (folio_try_get(folio)) {
+ if (folio == page_folio(page))
+ return 1;
+
+ pr_info("%#lx cannot catch tail\n", page_to_pfn(page));
+ folio_put(folio);
+ }
+
+ return 0;
+}
+
+static int get_any_page(struct page *p, unsigned long flags)
{
- struct page *head = compound_head(page);
+ int ret = 0, pass = 0;
+ bool count_increased = false;
+
+ if (flags & MF_COUNT_INCREASED)
+ count_increased = true;
+
+try_again:
+ if (!count_increased) {
+ ret = __get_hwpoison_page(p, flags);
+ if (!ret) {
+ if (page_count(p)) {
+ /* We raced with an allocation, retry. */
+ if (pass++ < 3)
+ goto try_again;
+ ret = -EBUSY;
+ } else if (!PageHuge(p) && !is_free_buddy_page(p)) {
+ /* We raced with put_page, retry. */
+ if (pass++ < 3)
+ goto try_again;
+ ret = -EIO;
+ }
+ goto out;
+ } else if (ret == -EBUSY) {
+ /*
+ * We raced with (possibly temporary) unhandlable
+ * page, retry.
+ */
+ if (pass++ < 3) {
+ shake_page(p);
+ goto try_again;
+ }
+ ret = -EIO;
+ goto out;
+ }
+ }
- if (!PageHuge(head) && PageTransHuge(head)) {
+ if (PageHuge(p) || HWPoisonHandlable(p, flags)) {
+ ret = 1;
+ } else {
/*
- * Non anonymous thp exists only in allocation/free time. We
- * can't handle such a case correctly, so let's give it up.
- * This should be better than triggering BUG_ON when kernel
- * tries to touch the "partially handled" page.
+ * A page we cannot handle. Check whether we can turn
+ * it into something we can handle.
*/
- if (!PageAnon(head)) {
- pr_err("Memory failure: %#lx: non anonymous thp\n",
- page_to_pfn(page));
- return 0;
+ if (pass++ < 3) {
+ put_page(p);
+ shake_page(p);
+ count_increased = false;
+ goto try_again;
}
+ put_page(p);
+ ret = -EIO;
}
+out:
+ if (ret == -EIO)
+ pr_err("%#lx: unhandlable page.\n", page_to_pfn(p));
- if (get_page_unless_zero(head)) {
- if (head == compound_head(page))
- return 1;
+ return ret;
+}
- pr_info("Memory failure: %#lx cannot catch tail\n",
- page_to_pfn(page));
- put_page(head);
- }
+static int __get_unpoison_page(struct page *page)
+{
+ struct folio *folio = page_folio(page);
+ int ret = 0;
+ bool hugetlb = false;
- return 0;
+ ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, true);
+ if (hugetlb)
+ return ret;
+
+ /*
+ * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison,
+ * but also isolated from buddy freelist, so need to identify the
+ * state and have to cancel both operations to unpoison.
+ */
+ if (PageHWPoisonTakenOff(page))
+ return -EHWPOISON;
+
+ return get_page_unless_zero(page) ? 1 : 0;
+}
+
+/**
+ * get_hwpoison_page() - Get refcount for memory error handling
+ * @p: Raw error page (hit by memory error)
+ * @flags: Flags controlling behavior of error handling
+ *
+ * get_hwpoison_page() takes a page refcount of an error page to handle memory
+ * error on it, after checking that the error page is in a well-defined state
+ * (defined as a page-type we can successfully handle the memory error on it,
+ * such as LRU page and hugetlb page).
+ *
+ * Memory error handling could be triggered at any time on any type of page,
+ * so it's prone to race with the typical memory management lifecycle (like
+ * allocation and free). To avoid such races, get_hwpoison_page() takes
+ * extra care with the error page's state (as done in __get_hwpoison_page()),
+ * and has some retry logic in get_any_page().
+ *
+ * When called from unpoison_memory(), the caller should already ensure that
+ * the given page has PG_hwpoison. So it's never reused for other page
+ * allocations, and __get_unpoison_page() never races with them.
+ *
+ * Return: 0 on failure,
+ * 1 on success for in-use pages in a well-defined state,
+ * -EIO for pages on which we can not handle memory errors,
+ * -EBUSY when get_hwpoison_page() has raced with page lifecycle
+ * operations like allocation and free,
+ * -EHWPOISON when the page is hwpoisoned and taken off from buddy.
+ */
+static int get_hwpoison_page(struct page *p, unsigned long flags)
+{
+ int ret;
+
+ zone_pcp_disable(page_zone(p));
+ if (flags & MF_UNPOISON)
+ ret = __get_unpoison_page(p);
+ else
+ ret = get_any_page(p, flags);
+ zone_pcp_enable(page_zone(p));
+
+ return ret;
}
-EXPORT_SYMBOL_GPL(get_hwpoison_page);
/*
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
*/
static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
- int flags, struct page **hpagep)
+ int flags, struct page *hpage)
{
- enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+ struct folio *folio = page_folio(hpage);
+ enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
struct address_space *mapping;
LIST_HEAD(tokill);
- bool unmap_success = true;
- int kill = 1, forcekill;
- struct page *hpage = *hpagep;
+ bool unmap_success;
+ int forcekill;
bool mlocked = PageMlocked(hpage);
/*
* Here we are interested only in user-mapped pages, so skip any
* other types of pages.
*/
- if (PageReserved(p) || PageSlab(p))
+ if (PageReserved(p) || PageSlab(p) || PageTable(p))
return true;
if (!(PageLRU(hpage) || PageHuge(p)))
return true;
@@ -986,15 +1571,9 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (!page_mapped(hpage))
return true;
- if (PageKsm(p)) {
- pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
- return false;
- }
-
if (PageSwapCache(p)) {
- pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
- pfn);
- ttu |= TTU_IGNORE_HWPOISON;
+ pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
+ ttu &= ~TTU_HWPOISON;
}
/*
@@ -1009,9 +1588,8 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (page_mkclean(hpage)) {
SetPageDirty(hpage);
} else {
- kill = 0;
- ttu |= TTU_IGNORE_HWPOISON;
- pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
+ ttu &= ~TTU_HWPOISON;
+ pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
pfn);
}
}
@@ -1020,41 +1598,30 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* First collect all the processes that have the page
* mapped in dirty form. This has to be done before try_to_unmap,
* because ttu takes the rmap data structures down.
- *
- * Error handling: We ignore errors here because
- * there's nothing that can be done.
*/
- if (kill)
- collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
+ collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
- if (!PageHuge(hpage)) {
- unmap_success = try_to_unmap(hpage, ttu);
- } else {
+ if (PageHuge(hpage) && !PageAnon(hpage)) {
/*
- * For hugetlb pages, try_to_unmap could potentially call
- * huge_pmd_unshare. Because of this, take semaphore in
- * write mode here and set TTU_RMAP_LOCKED to indicate we
- * have taken the lock at this higer level.
- *
- * Note that the call to hugetlb_page_mapping_lock_write
- * is necessary even if mapping is already set. It handles
- * ugliness of potentially having to drop page lock to obtain
- * i_mmap_rwsem.
+ * For hugetlb pages in shared mappings, try_to_unmap
+ * could potentially call huge_pmd_unshare. Because of
+ * this, take semaphore in write mode here and set
+ * TTU_RMAP_LOCKED to indicate we have taken the lock
+ * at this higher level.
*/
mapping = hugetlb_page_mapping_lock_write(hpage);
-
if (mapping) {
- unmap_success = try_to_unmap(hpage,
- ttu|TTU_RMAP_LOCKED);
+ try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
i_mmap_unlock_write(mapping);
- } else {
- pr_info("Memory failure: %#lx: could not find mapping for mapped huge page\n",
- pfn);
- unmap_success = false;
- }
+ } else
+ pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn);
+ } else {
+ try_to_unmap(folio, ttu);
}
+
+ unmap_success = !page_mapped(hpage);
if (!unmap_success)
- pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
+ pr_err("%#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
/*
@@ -1062,7 +1629,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* shake_page() again to ensure that it's flushed.
*/
if (mlocked)
- shake_page(hpage, 0);
+ shake_page(hpage);
/*
* Now that the dirty bit has been propagated to the
@@ -1074,7 +1641,8 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* use a more force-full uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
+ forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) ||
+ !unmap_success;
kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
return unmap_success;
@@ -1103,89 +1671,58 @@ static int identify_page_state(unsigned long pfn, struct page *p,
return page_action(ps, p, pfn);
}
-static int memory_failure_hugetlb(unsigned long pfn, int flags)
+static int try_to_split_thp_page(struct page *page)
{
- struct page *p = pfn_to_page(pfn);
- struct page *head = compound_head(p);
- int res;
- unsigned long page_flags;
+ int ret;
- if (TestSetPageHWPoison(head)) {
- pr_err("Memory failure: %#lx: already hardware poisoned\n",
- pfn);
- return 0;
- }
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
- num_poisoned_pages_inc();
+ if (unlikely(ret))
+ put_page(page);
- if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
- /*
- * Check "filter hit" and "race with other subpage."
- */
- lock_page(head);
- if (PageHWPoison(head)) {
- if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
- || (p != head && TestSetPageHWPoison(head))) {
- num_poisoned_pages_dec();
- unlock_page(head);
- return 0;
- }
- }
- unlock_page(head);
- dissolve_free_huge_page(p);
- action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED);
- return 0;
- }
+ return ret;
+}
- lock_page(head);
- page_flags = head->flags;
+static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
+ struct address_space *mapping, pgoff_t index, int flags)
+{
+ struct to_kill *tk;
+ unsigned long size = 0;
- if (!PageHWPoison(head)) {
- pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
- num_poisoned_pages_dec();
- unlock_page(head);
- put_hwpoison_page(head);
- return 0;
- }
+ list_for_each_entry(tk, to_kill, nd)
+ if (tk->size_shift)
+ size = max(size, 1UL << tk->size_shift);
- /*
- * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
- * simply disable it. In order to make it work properly, we need
- * make sure that:
- * - conversion of a pud that maps an error hugetlb into hwpoison
- * entry properly works, and
- * - other mm code walking over page table is aware of pud-aligned
- * hwpoison entries.
- */
- if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
- action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
- res = -EBUSY;
- goto out;
- }
+ if (size) {
+ /*
+ * Unmap the largest mapping to avoid breaking up device-dax
+ * mappings which are constant size. The actual size of the
+ * mapping being torn down is communicated in siginfo, see
+ * kill_proc()
+ */
+ loff_t start = (index << PAGE_SHIFT) & ~(size - 1);
- if (!hwpoison_user_mappings(p, pfn, flags, &head)) {
- action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
- res = -EBUSY;
- goto out;
+ unmap_mapping_range(mapping, start, size, 0);
}
- res = identify_page_state(pfn, p, page_flags);
-out:
- unlock_page(head);
- return res;
+ kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags);
}
-static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
+static int mf_generic_kill_procs(unsigned long long pfn, int flags,
struct dev_pagemap *pgmap)
{
struct page *page = pfn_to_page(pfn);
- const bool unmap_success = true;
- unsigned long size = 0;
- struct to_kill *tk;
- LIST_HEAD(tokill);
- int rc = -EBUSY;
- loff_t start;
+ LIST_HEAD(to_kill);
dax_entry_t cookie;
+ int rc = 0;
+
+ /*
+ * Pages instantiated by device-dax (not filesystem-dax)
+ * may be compound pages.
+ */
+ page = compound_head(page);
/*
* Prevent the inode from being freed while we are interrogating
@@ -1196,19 +1733,24 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
*/
cookie = dax_lock_page(page);
if (!cookie)
- goto out;
+ return -EBUSY;
if (hwpoison_filter(page)) {
- rc = 0;
+ rc = -EOPNOTSUPP;
goto unlock;
}
- if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ switch (pgmap->type) {
+ case MEMORY_DEVICE_PRIVATE:
+ case MEMORY_DEVICE_COHERENT:
/*
- * TODO: Handle HMM pages which may need coordination
+ * TODO: Handle device pages which may need coordination
* with device-side memory.
*/
+ rc = -ENXIO;
goto unlock;
+ default:
+ break;
}
/*
@@ -1224,25 +1766,342 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
* SIGBUS (i.e. MF_MUST_KILL)
*/
flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
- collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
+ collect_procs(page, &to_kill, true);
- list_for_each_entry(tk, &tokill, nd)
- if (tk->size_shift)
- size = max(size, 1UL << tk->size_shift);
- if (size) {
+ unmap_and_kill(&to_kill, pfn, page->mapping, page->index, flags);
+unlock:
+ dax_unlock_page(page, cookie);
+ return rc;
+}
+
+#ifdef CONFIG_FS_DAX
+/**
+ * mf_dax_kill_procs - Collect and kill processes who are using this file range
+ * @mapping: address_space of the file in use
+ * @index: start pgoff of the range within the file
+ * @count: length of the range, in units of PAGE_SIZE
+ * @mf_flags: memory failure flags
+ */
+int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
+ unsigned long count, int mf_flags)
+{
+ LIST_HEAD(to_kill);
+ dax_entry_t cookie;
+ struct page *page;
+ size_t end = index + count;
+
+ mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+
+ for (; index < end; index++) {
+ page = NULL;
+ cookie = dax_lock_mapping_entry(mapping, index, &page);
+ if (!cookie)
+ return -EBUSY;
+ if (!page)
+ goto unlock;
+
+ SetPageHWPoison(page);
+
+ collect_procs_fsdax(page, mapping, index, &to_kill);
+ unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
+ index, mf_flags);
+unlock:
+ dax_unlock_mapping_entry(mapping, index, cookie);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
+#endif /* CONFIG_FS_DAX */
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Struct raw_hwp_page represents information about a "raw error page",
+ * forming a singly linked list hung off the folio's ->_hugetlb_hwpoison field.
+ */
+struct raw_hwp_page {
+ struct llist_node node;
+ struct page *page;
+};
+
+static inline struct llist_head *raw_hwp_list_head(struct folio *folio)
+{
+ return (struct llist_head *)&folio->_hugetlb_hwpoison;
+}
+
+static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
+{
+ struct llist_head *head;
+ struct llist_node *t, *tnode;
+ unsigned long count = 0;
+
+ head = raw_hwp_list_head(folio);
+ llist_for_each_safe(tnode, t, head->first) {
+ struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
+
+ if (move_flag)
+ SetPageHWPoison(p->page);
+ else
+ num_poisoned_pages_sub(page_to_pfn(p->page), 1);
+ kfree(p);
+ count++;
+ }
+ llist_del_all(head);
+ return count;
+}
+
+static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
+{
+ struct llist_head *head;
+ struct raw_hwp_page *raw_hwp;
+ struct llist_node *t, *tnode;
+ int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0;
+
+ /*
+ * Once the hwpoisoned hugepage has lost its reliable raw error info,
+ * there is little point in keeping additional error info precisely,
+ * so skip adding additional raw error info.
+ */
+ if (folio_test_hugetlb_raw_hwp_unreliable(folio))
+ return -EHWPOISON;
+ head = raw_hwp_list_head(folio);
+ llist_for_each_safe(tnode, t, head->first) {
+ struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
+
+ if (p->page == page)
+ return -EHWPOISON;
+ }
+
+ raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
+ if (raw_hwp) {
+ raw_hwp->page = page;
+ llist_add(&raw_hwp->node, head);
+ /* the first error event will be counted in action_result(). */
+ if (ret)
+ num_poisoned_pages_inc(page_to_pfn(page));
+ } else {
+ /*
+ * Failed to save raw error info. We can no longer trace all
+ * hwpoisoned subpages, so we have to refuse to free/dissolve
+ * this hwpoisoned hugepage.
+ */
+ folio_set_hugetlb_raw_hwp_unreliable(folio);
/*
- * Unmap the largest mapping to avoid breaking up
- * device-dax mappings which are constant size. The
- * actual size of the mapping being torn down is
- * communicated in siginfo, see kill_proc()
+ * Once hugetlb_raw_hwp_unreliable is set, raw_hwp_page is not
+ * used any more, so free it.
*/
- start = (page->index << PAGE_SHIFT) & ~(size - 1);
- unmap_mapping_range(page->mapping, start, start + size, 0);
+ __folio_free_raw_hwp(folio, false);
}
- kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
- rc = 0;
-unlock:
- dax_unlock_page(page, cookie);
+ return ret;
+}
+
+static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag)
+{
+ /*
+ * hugetlb_vmemmap_optimized hugepages can't be freed because struct
+ * pages for tail pages are required but they don't exist.
+ */
+ if (move_flag && folio_test_hugetlb_vmemmap_optimized(folio))
+ return 0;
+
+ /*
+ * hugetlb_raw_hwp_unreliable hugepages shouldn't be unpoisoned by
+ * definition.
+ */
+ if (folio_test_hugetlb_raw_hwp_unreliable(folio))
+ return 0;
+
+ return __folio_free_raw_hwp(folio, move_flag);
+}
+
+void folio_clear_hugetlb_hwpoison(struct folio *folio)
+{
+ if (folio_test_hugetlb_raw_hwp_unreliable(folio))
+ return;
+ folio_clear_hwpoison(folio);
+ folio_free_raw_hwp(folio, true);
+}
+
+/*
+ * Called from hugetlb code with hugetlb_lock held.
+ *
+ * Return values:
+ * 0 - free hugepage
+ * 1 - in-use hugepage
+ * 2 - not a hugepage
+ * -EBUSY - the hugepage is busy (try to retry)
+ * -EHWPOISON - the hugepage is already hwpoisoned
+ */
+int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+ bool *migratable_cleared)
+{
+ struct page *page = pfn_to_page(pfn);
+ struct folio *folio = page_folio(page);
+ int ret = 2; /* fallback to normal page handling */
+ bool count_increased = false;
+
+ if (!folio_test_hugetlb(folio))
+ goto out;
+
+ if (flags & MF_COUNT_INCREASED) {
+ ret = 1;
+ count_increased = true;
+ } else if (folio_test_hugetlb_freed(folio)) {
+ ret = 0;
+ } else if (folio_test_hugetlb_migratable(folio)) {
+ ret = folio_try_get(folio);
+ if (ret)
+ count_increased = true;
+ } else {
+ ret = -EBUSY;
+ if (!(flags & MF_NO_RETRY))
+ goto out;
+ }
+
+ if (folio_set_hugetlb_hwpoison(folio, page)) {
+ ret = -EHWPOISON;
+ goto out;
+ }
+
+ /*
+ * Clearing hugetlb_migratable for hwpoisoned hugepages to prevent them
+ * from being migrated by memory hotremove.
+ */
+ if (count_increased && folio_test_hugetlb_migratable(folio)) {
+ folio_clear_hugetlb_migratable(folio);
+ *migratable_cleared = true;
+ }
+
+ return ret;
+out:
+ if (count_increased)
+ folio_put(folio);
+ return ret;
+}
+
+/*
+ * Taking a refcount on hugetlb pages needs extra care about race conditions
+ * with basic operations like hugepage allocation/free/demotion.
+ * So some of the prechecks for hwpoison (pinning, and testing/setting
+ * PageHWPoison) should be done within a single hugetlb_lock section.
+ */
+static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
+{
+ int res;
+ struct page *p = pfn_to_page(pfn);
+ struct folio *folio;
+ unsigned long page_flags;
+ bool migratable_cleared = false;
+
+ *hugetlb = 1;
+retry:
+ res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
+ if (res == 2) { /* fallback to normal page handling */
+ *hugetlb = 0;
+ return 0;
+ } else if (res == -EHWPOISON) {
+ pr_err("%#lx: already hardware poisoned\n", pfn);
+ if (flags & MF_ACTION_REQUIRED) {
+ folio = page_folio(p);
+ res = kill_accessing_process(current, folio_pfn(folio), flags);
+ }
+ return res;
+ } else if (res == -EBUSY) {
+ if (!(flags & MF_NO_RETRY)) {
+ flags |= MF_NO_RETRY;
+ goto retry;
+ }
+ return action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
+ }
+
+ folio = page_folio(p);
+ folio_lock(folio);
+
+ if (hwpoison_filter(p)) {
+ folio_clear_hugetlb_hwpoison(folio);
+ if (migratable_cleared)
+ folio_set_hugetlb_migratable(folio);
+ folio_unlock(folio);
+ if (res == 1)
+ folio_put(folio);
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * Handling a free hugepage. The possible race with hugepage allocation
+ * or demotion is prevented by the PageHWPoison flag.
+ */
+ if (res == 0) {
+ folio_unlock(folio);
+ if (__page_handle_poison(p) >= 0) {
+ page_ref_inc(p);
+ res = MF_RECOVERED;
+ } else {
+ res = MF_FAILED;
+ }
+ return action_result(pfn, MF_MSG_FREE_HUGE, res);
+ }
+
+ page_flags = folio->flags;
+
+ if (!hwpoison_user_mappings(p, pfn, flags, &folio->page)) {
+ folio_unlock(folio);
+ return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
+ }
+
+ return identify_page_state(pfn, p, page_flags);
+}
+
+#else
+static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
+{
+ return 0;
+}
+
+static inline unsigned long folio_free_raw_hwp(struct folio *folio, bool flag)
+{
+ return 0;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+/* Drop the extra refcount in case we come from madvise() */
+static void put_ref_page(unsigned long pfn, int flags)
+{
+ struct page *page;
+
+ if (!(flags & MF_COUNT_INCREASED))
+ return;
+
+ page = pfn_to_page(pfn);
+ if (page)
+ put_page(page);
+}
+
+static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
+ struct dev_pagemap *pgmap)
+{
+ int rc = -ENXIO;
+
+ put_ref_page(pfn, flags);
+
+ /* device metadata space is not recoverable */
+ if (!pgmap_pfn_valid(pgmap, pfn))
+ goto out;
+
+ /*
+ * Call driver's implementation to handle the memory failure, otherwise
+ * fall back to generic handler.
+ */
+ if (pgmap_has_memory_failure(pgmap)) {
+ rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
+ /*
+ * Fall back to generic handler too if operation is not
+ * supported inside the driver/device/filesystem.
+ */
+ if (rc != -EOPNOTSUPP)
+ goto out;
+ }
+
+ rc = mf_generic_kill_procs(pfn, flags, pgmap);
out:
/* drop pgmap ref acquired in caller */
put_dev_pagemap(pgmap);
@@ -1250,6 +2109,8 @@ out:
return rc;
}
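+/* Serializes the memory error handling paths (see memory_failure() below). */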
+static DEFINE_MUTEX(mf_mutex);
+
/**
* memory_failure - Handle memory failure of a page.
* @pfn: Page Number of the corrupted page
@@ -1266,47 +2127,69 @@ out:
*
* Must run in process context (e.g. a work queue) with interrupts
* enabled and no spinlocks hold.
+ *
+ * Return: 0 for successfully handled the memory error,
+ * -EOPNOTSUPP for hwpoison_filter() filtered the error event,
+ * < 0(except -EOPNOTSUPP) on failure.
*/
int memory_failure(unsigned long pfn, int flags)
{
struct page *p;
struct page *hpage;
- struct page *orig_head;
struct dev_pagemap *pgmap;
- int res;
+ int res = 0;
unsigned long page_flags;
+ bool retry = true;
+ int hugetlb = 0;
if (!sysctl_memory_failure_recovery)
panic("Memory failure on page %lx", pfn);
+ mutex_lock(&mf_mutex);
+
+ if (!(flags & MF_SW_SIMULATED))
+ hw_memory_failure = true;
+
p = pfn_to_online_page(pfn);
if (!p) {
+ res = arch_memory_failure(pfn, flags);
+ if (res == 0)
+ goto unlock_mutex;
+
if (pfn_valid(pfn)) {
pgmap = get_dev_pagemap(pfn, NULL);
- if (pgmap)
- return memory_failure_dev_pagemap(pfn, flags,
- pgmap);
+ if (pgmap) {
+ res = memory_failure_dev_pagemap(pfn, flags,
+ pgmap);
+ goto unlock_mutex;
+ }
}
- pr_err("Memory failure: %#lx: memory outside kernel control\n",
- pfn);
- return -ENXIO;
+ pr_err("%#lx: memory outside kernel control\n", pfn);
+ res = -ENXIO;
+ goto unlock_mutex;
}
- if (PageHuge(p))
- return memory_failure_hugetlb(pfn, flags);
+try_again:
+ res = try_memory_failure_hugetlb(pfn, flags, &hugetlb);
+ if (hugetlb)
+ goto unlock_mutex;
+
if (TestSetPageHWPoison(p)) {
- pr_err("Memory failure: %#lx: already hardware poisoned\n",
- pfn);
- return 0;
+ pr_err("%#lx: already hardware poisoned\n", pfn);
+ res = -EHWPOISON;
+ if (flags & MF_ACTION_REQUIRED)
+ res = kill_accessing_process(current, pfn, flags);
+ if (flags & MF_COUNT_INCREASED)
+ put_page(p);
+ goto unlock_mutex;
}
- orig_head = hpage = compound_head(p);
- num_poisoned_pages_inc();
+ hpage = compound_head(p);
/*
* We need/can do nothing about count=0 pages.
* 1) it's a free page, and therefore in safe hand:
- * prep_new_page() will be the gate keeper.
+ * check_new_page() will be the gate keeper.
* 2) it's part of a non-compound high order page.
* Implies some kernel user: cannot stop them from
* R/W the page; let's pray that the page has been
@@ -1314,34 +2197,53 @@ int memory_failure(unsigned long pfn, int flags)
* In fact it's dangerous to directly bump up page count from 0,
* that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
*/
- if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
- if (is_free_buddy_page(p)) {
- action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
- return 0;
- } else {
- action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
- return -EBUSY;
+ if (!(flags & MF_COUNT_INCREASED)) {
+ res = get_hwpoison_page(p, flags);
+ if (!res) {
+ if (is_free_buddy_page(p)) {
+ if (take_page_off_buddy(p)) {
+ page_ref_inc(p);
+ res = MF_RECOVERED;
+ } else {
+ /* We lost the race, try again */
+ if (retry) {
+ ClearPageHWPoison(p);
+ retry = false;
+ goto try_again;
+ }
+ res = MF_FAILED;
+ }
+ res = action_result(pfn, MF_MSG_BUDDY, res);
+ } else {
+ res = action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
+ }
+ goto unlock_mutex;
+ } else if (res < 0) {
+ res = action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
+ goto unlock_mutex;
}
}
if (PageTransHuge(hpage)) {
- lock_page(p);
- if (!PageAnon(p) || unlikely(split_huge_page(p))) {
- unlock_page(p);
- if (!PageAnon(p))
- pr_err("Memory failure: %#lx: non anonymous thp\n",
- pfn);
- else
- pr_err("Memory failure: %#lx: thp split failed\n",
- pfn);
- if (TestClearPageHWPoison(p))
- num_poisoned_pages_dec();
- put_hwpoison_page(p);
- return -EBUSY;
+ /*
+ * The flag must be set after the refcount is bumped,
+ * otherwise it may race with THP split.
+ * And the flag can't be set in get_hwpoison_page() since
+ * it is called by soft offline too and it is just called
+ * for !MF_COUNT_INCREASED. So here seems to be the best
+ * place.
+ *
+ * There is no need to care about the above error handling
+ * paths for get_hwpoison_page(), since they handle either a
+ * free page or an unhandlable page. The refcount is bumped
+ * iff the page is a valid handlable page.
+ */
+ SetPageHasHWPoisoned(hpage);
+ if (try_to_split_thp_page(p) < 0) {
+ res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
+ goto unlock_mutex;
}
- unlock_page(p);
VM_BUG_ON_PAGE(!page_count(p), p);
- hpage = compound_head(p);
}
/*
@@ -1352,26 +2254,27 @@ int memory_failure(unsigned long pfn, int flags)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
*/
- shake_page(p, 0);
- /* shake_page could have turned it free. */
- if (!PageLRU(p) && is_free_buddy_page(p)) {
- if (flags & MF_COUNT_INCREASED)
- action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
- else
- action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED);
- return 0;
- }
+ shake_page(p);
lock_page(p);
/*
- * The page could have changed compound pages during the locking.
- * If this happens just bail out.
+	 * We only intend to deal with non-compound pages here. However,
+	 * the page could have become part of a compound page due to a
+	 * race window. If this happens, we can try again in the hope of
+	 * handling the page in the next round.
*/
- if (PageCompound(p) && compound_head(p) != orig_head) {
- action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
- res = -EBUSY;
- goto out;
+ if (PageCompound(p)) {
+ if (retry) {
+ ClearPageHWPoison(p);
+ unlock_page(p);
+ put_page(p);
+ flags &= ~MF_COUNT_INCREASED;
+ retry = false;
+ goto try_again;
+ }
+ res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
+ goto unlock_page;
}
/*
@@ -1381,30 +2284,22 @@ int memory_failure(unsigned long pfn, int flags)
* page_remove_rmap() in try_to_unmap_one(). So to determine page status
* correctly, we save a copy of the page flags at this time.
*/
- if (PageHuge(p))
- page_flags = hpage->flags;
- else
- page_flags = p->flags;
+ page_flags = p->flags;
- /*
- * unpoison always clear PG_hwpoison inside page lock
- */
- if (!PageHWPoison(p)) {
- pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
- num_poisoned_pages_dec();
- unlock_page(p);
- put_hwpoison_page(p);
- return 0;
- }
if (hwpoison_filter(p)) {
- if (TestClearPageHWPoison(p))
- num_poisoned_pages_dec();
+ ClearPageHWPoison(p);
unlock_page(p);
- put_hwpoison_page(p);
- return 0;
+ put_page(p);
+ res = -EOPNOTSUPP;
+ goto unlock_mutex;
}
- if (!PageTransTail(p) && !PageLRU(p))
+ /*
+ * __munlock_folio() may clear a writeback page's LRU flag without
+	 * the page lock. We need to wait for writeback completion for this
+	 * page, or it may trigger a vfs BUG while evicting the inode.
+ */
+ if (!PageLRU(p) && !PageWriteback(p))
goto identify_page_state;
/*
@@ -1415,30 +2310,29 @@ int memory_failure(unsigned long pfn, int flags)
/*
* Now take care of user space mappings.
- * Abort on fail: __delete_from_page_cache() assumes unmapped page.
- *
- * When the raw error page is thp tail page, hpage points to the raw
- * page after thp split.
+ * Abort on fail: __filemap_remove_folio() assumes unmapped page.
*/
- if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) {
- action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
- res = -EBUSY;
- goto out;
+ if (!hwpoison_user_mappings(p, pfn, flags, p)) {
+ res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
+ goto unlock_page;
}
/*
* Torn down by someone else?
*/
if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
- action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
- res = -EBUSY;
- goto out;
+ res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
+ goto unlock_page;
}
identify_page_state:
res = identify_page_state(pfn, p, page_flags);
-out:
+ mutex_unlock(&mf_mutex);
+ return res;
+unlock_page:
unlock_page(p);
+unlock_mutex:
+ mutex_unlock(&mf_mutex);
return res;
}
EXPORT_SYMBOL_GPL(memory_failure);
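
For context, the sketch below (not part of the patch) shows how an architecture's machine-check path might hand a corrupted pfn to the interface above. The helper name and the log messages are assumptions for illustration; only memory_failure(), MF_ACTION_REQUIRED and the -EHWPOISON "already poisoned" return value are taken from the code above.

#include <linux/mm.h>
#include <linux/printk.h>

/* Hypothetical arch-side caller, sketched for illustration only. */
static void report_corrupted_pfn(unsigned long pfn, bool action_required)
{
	int flags = action_required ? MF_ACTION_REQUIRED : 0;
	int ret = memory_failure(pfn, flags);

	if (ret == -EHWPOISON)
		pr_info("pfn %#lx was already poisoned\n", pfn);
	else if (ret)
		pr_warn("recovery of pfn %#lx failed: %d\n", pfn, ret);
}
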
@@ -1490,7 +2384,7 @@ void memory_failure_queue(unsigned long pfn, int flags)
if (kfifo_put(&mf_cpu->fifo, entry))
schedule_work_on(smp_processor_id(), &mf_cpu->work);
else
- pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
+ pr_err("buffer overflow when queuing memory failure at %#lx\n",
pfn);
spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
put_cpu_var(memory_failure_cpu);
@@ -1543,10 +2437,14 @@ static int __init memory_failure_init(void)
INIT_WORK(&mf_cpu->work, memory_failure_work_func);
}
+ register_sysctl_init("vm", memory_failure_table);
+
return 0;
}
core_initcall(memory_failure_init);
+#undef pr_fmt
+#define pr_fmt(fmt) "" fmt
#define unpoison_pr_info(fmt, pfn, rs) \
({ \
if (__ratelimit(rs)) \
@@ -1567,9 +2465,11 @@ core_initcall(memory_failure_init);
*/
int unpoison_memory(unsigned long pfn)
{
- struct page *page;
+ struct folio *folio;
struct page *p;
- int freeit = 0;
+ int ret = -EBUSY, ghp;
+ unsigned long count = 1;
+ bool huge = false;
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -1577,342 +2477,211 @@ int unpoison_memory(unsigned long pfn)
return -ENXIO;
p = pfn_to_page(pfn);
- page = compound_head(p);
+ folio = page_folio(p);
- if (!PageHWPoison(p)) {
- unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
- pfn, &unpoison_rs);
- return 0;
- }
+ mutex_lock(&mf_mutex);
- if (page_count(page) > 1) {
- unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
+ if (hw_memory_failure) {
+ unpoison_pr_info("Unpoison: Disabled after HW memory failure %#lx\n",
pfn, &unpoison_rs);
- return 0;
+ ret = -EOPNOTSUPP;
+ goto unlock_mutex;
}
- if (page_mapped(page)) {
- unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
+ if (!PageHWPoison(p)) {
+ unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
pfn, &unpoison_rs);
- return 0;
+ goto unlock_mutex;
}
- if (page_mapping(page)) {
- unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
+ if (folio_ref_count(folio) > 1) {
+ unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
pfn, &unpoison_rs);
- return 0;
+ goto unlock_mutex;
}
+ if (folio_test_slab(folio) || PageTable(&folio->page) || folio_test_reserved(folio))
+ goto unlock_mutex;
+
/*
- * unpoison_memory() can encounter thp only when the thp is being
- * worked by memory_failure() and the page lock is not held yet.
- * In such case, we yield to memory_failure() and make unpoison fail.
+ * Note that folio->_mapcount is overloaded in SLAB, so the simple test
+ * in folio_mapped() has to be done after folio_test_slab() is checked.
*/
- if (!PageHuge(page) && PageTransHuge(page)) {
- unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
- pfn, &unpoison_rs);
- return 0;
- }
-
- if (!get_hwpoison_page(p)) {
- if (TestClearPageHWPoison(p))
- num_poisoned_pages_dec();
- unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
+ if (folio_mapped(folio)) {
+ unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
pfn, &unpoison_rs);
- return 0;
+ goto unlock_mutex;
}
- lock_page(page);
- /*
- * This test is racy because PG_hwpoison is set outside of page lock.
- * That's acceptable because that won't trigger kernel panic. Instead,
- * the PG_hwpoison page will be caught and isolated on the entrance to
- * the free buddy page pool.
- */
- if (TestClearPageHWPoison(page)) {
- unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
+ if (folio_mapping(folio)) {
+ unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
pfn, &unpoison_rs);
- num_poisoned_pages_dec();
- freeit = 1;
+ goto unlock_mutex;
}
- unlock_page(page);
-
- put_hwpoison_page(page);
- if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
- put_hwpoison_page(page);
-
- return 0;
-}
-EXPORT_SYMBOL(unpoison_memory);
-
-static struct page *new_page(struct page *p, unsigned long private)
-{
- struct migration_target_control mtc = {
- .nid = page_to_nid(p),
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
- };
-
- return alloc_migration_target(p, (unsigned long)&mtc);
-}
-
-/*
- * Safely get reference count of an arbitrary page.
- * Returns 0 for a free page, -EIO for a zero refcount page
- * that is not free, and 1 for any other page type.
- * For 1 the page is returned with increased page count, otherwise not.
- */
-static int __get_any_page(struct page *p, unsigned long pfn, int flags)
-{
- int ret;
- if (flags & MF_COUNT_INCREASED)
- return 1;
-
- /*
- * When the target page is a free hugepage, just remove it
- * from free hugepage list.
- */
- if (!get_hwpoison_page(p)) {
+ ghp = get_hwpoison_page(p, MF_UNPOISON);
+ if (!ghp) {
if (PageHuge(p)) {
- pr_info("%s: %#lx free huge page\n", __func__, pfn);
- ret = 0;
- } else if (is_free_buddy_page(p)) {
- pr_info("%s: %#lx free buddy page\n", __func__, pfn);
- ret = 0;
+ huge = true;
+ count = folio_free_raw_hwp(folio, false);
+ if (count == 0)
+ goto unlock_mutex;
+ }
+ ret = folio_test_clear_hwpoison(folio) ? 0 : -EBUSY;
+ } else if (ghp < 0) {
+ if (ghp == -EHWPOISON) {
+ ret = put_page_back_buddy(p) ? 0 : -EBUSY;
} else {
- pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
- __func__, pfn, p->flags);
- ret = -EIO;
+ ret = ghp;
+ unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
+ pfn, &unpoison_rs);
}
} else {
- /* Not a free page */
- ret = 1;
+ if (PageHuge(p)) {
+ huge = true;
+ count = folio_free_raw_hwp(folio, false);
+ if (count == 0) {
+ folio_put(folio);
+ goto unlock_mutex;
+ }
+ }
+
+ folio_put(folio);
+ if (TestClearPageHWPoison(p)) {
+ folio_put(folio);
+ ret = 0;
+ }
+ }
+
+unlock_mutex:
+ mutex_unlock(&mf_mutex);
+ if (!ret) {
+ if (!huge)
+ num_poisoned_pages_sub(pfn, 1);
+ unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
+ page_to_pfn(p), &unpoison_rs);
}
return ret;
}
+EXPORT_SYMBOL(unpoison_memory);
-static int get_any_page(struct page *page, unsigned long pfn, int flags)
+static bool isolate_page(struct page *page, struct list_head *pagelist)
{
- int ret = __get_any_page(page, pfn, flags);
+ bool isolated = false;
- if (ret == 1 && !PageHuge(page) &&
- !PageLRU(page) && !__PageMovable(page)) {
- /*
- * Try to free it.
- */
- put_hwpoison_page(page);
- shake_page(page, 1);
+ if (PageHuge(page)) {
+ isolated = isolate_hugetlb(page_folio(page), pagelist);
+ } else {
+ bool lru = !__PageMovable(page);
- /*
- * Did it turn free?
- */
- ret = __get_any_page(page, pfn, 0);
- if (ret == 1 && !PageLRU(page)) {
- /* Drop page reference which is from __get_any_page() */
- put_hwpoison_page(page);
- pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
- pfn, page->flags, &page->flags);
- return -EIO;
+ if (lru)
+ isolated = isolate_lru_page(page);
+ else
+ isolated = isolate_movable_page(page,
+ ISOLATE_UNEVICTABLE);
+
+ if (isolated) {
+ list_add(&page->lru, pagelist);
+ if (lru)
+ inc_node_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_lru(page));
}
}
- return ret;
+
+ /*
+	 * If we succeed in isolating the page, we have grabbed another refcount
+	 * on the page, so we can safely drop the one we got from get_any_page().
+	 * If we fail to isolate the page, it means that we cannot go further
+	 * and we will return an error, so drop the reference we got from
+	 * get_any_page() as well.
+ */
+ put_page(page);
+ return isolated;
}
-static int soft_offline_huge_page(struct page *page, int flags)
+/*
+ * soft_offline_in_use_page handles hugetlb pages and non-hugetlb pages.
+ * If the page is a non-dirty, unmapped page-cache page, it is simply invalidated.
+ * If the page is mapped, its contents are migrated over to a new page.
+ */
+static int soft_offline_in_use_page(struct page *page)
{
- int ret;
+ long ret = 0;
unsigned long pfn = page_to_pfn(page);
struct page *hpage = compound_head(page);
+ char const *msg_page[] = {"page", "hugepage"};
+ bool huge = PageHuge(page);
LIST_HEAD(pagelist);
+ struct migration_target_control mtc = {
+ .nid = NUMA_NO_NODE,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
- /*
- * This double-check of PageHWPoison is to avoid the race with
- * memory_failure(). See also comment in __soft_offline_page().
- */
- lock_page(hpage);
- if (PageHWPoison(hpage)) {
- unlock_page(hpage);
- put_hwpoison_page(hpage);
- pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
- return -EBUSY;
- }
- unlock_page(hpage);
-
- ret = isolate_huge_page(hpage, &pagelist);
- /*
- * get_any_page() and isolate_huge_page() takes a refcount each,
- * so need to drop one here.
- */
- put_hwpoison_page(hpage);
- if (!ret) {
- pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
- return -EBUSY;
- }
-
- ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
- MIGRATE_SYNC, MR_MEMORY_FAILURE);
- if (ret) {
- pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n",
- pfn, ret, page->flags, &page->flags);
- if (!list_empty(&pagelist))
- putback_movable_pages(&pagelist);
- if (ret > 0)
- ret = -EIO;
- } else {
- /*
- * We set PG_hwpoison only when the migration source hugepage
- * was successfully dissolved, because otherwise hwpoisoned
- * hugepage remains on free hugepage list, then userspace will
- * find it as SIGBUS by allocation failure. That's not expected
- * in soft-offlining.
- */
- ret = dissolve_free_huge_page(page);
- if (!ret) {
- if (set_hwpoison_free_buddy_page(page))
- num_poisoned_pages_inc();
- else
- ret = -EBUSY;
+ if (!huge && PageTransHuge(hpage)) {
+ if (try_to_split_thp_page(page)) {
+ pr_info("soft offline: %#lx: thp split failed\n", pfn);
+ return -EBUSY;
}
+ hpage = page;
}
- return ret;
-}
-
-static int __soft_offline_page(struct page *page, int flags)
-{
- int ret;
- unsigned long pfn = page_to_pfn(page);
- /*
- * Check PageHWPoison again inside page lock because PageHWPoison
- * is set by memory_failure() outside page lock. Note that
- * memory_failure() also double-checks PageHWPoison inside page lock,
- * so there's no race between soft_offline_page() and memory_failure().
- */
lock_page(page);
- wait_on_page_writeback(page);
+ if (!PageHuge(page))
+ wait_on_page_writeback(page);
if (PageHWPoison(page)) {
unlock_page(page);
- put_hwpoison_page(page);
+ put_page(page);
pr_info("soft offline: %#lx page already poisoned\n", pfn);
- return -EBUSY;
+ return 0;
}
- /*
- * Try to invalidate first. This should work for
- * non dirty unmapped page cache pages.
- */
- ret = invalidate_inode_page(page);
+
+ if (!PageHuge(page) && PageLRU(page) && !PageSwapCache(page))
+ /*
+ * Try to invalidate first. This should work for
+		 * non-dirty, unmapped page-cache pages.
+ */
+ ret = invalidate_inode_page(page);
unlock_page(page);
- /*
- * RED-PEN would be better to keep it isolated here, but we
- * would need to fix isolation locking first.
- */
- if (ret == 1) {
- put_hwpoison_page(page);
+
+ if (ret) {
pr_info("soft_offline: %#lx: invalidated\n", pfn);
- SetPageHWPoison(page);
- num_poisoned_pages_inc();
+ page_handle_poison(page, false, true);
return 0;
}
- /*
- * Simple invalidation didn't work.
- * Try to migrate to a new page instead. migrate.c
- * handles a large number of cases for us.
- */
- if (PageLRU(page))
- ret = isolate_lru_page(page);
- else
- ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
- /*
- * Drop page reference which is came from get_any_page()
- * successful isolate_lru_page() already took another one.
- */
- put_hwpoison_page(page);
- if (!ret) {
- LIST_HEAD(pagelist);
- /*
- * After isolated lru page, the PageLRU will be cleared,
- * so use !__PageMovable instead for LRU page's mapping
- * cannot have PAGE_MAPPING_MOVABLE.
- */
- if (!__PageMovable(page))
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_lru(page));
- list_add(&page->lru, &pagelist);
- ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
- MIGRATE_SYNC, MR_MEMORY_FAILURE);
- if (ret) {
+ if (isolate_page(hpage, &pagelist)) {
+ ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL);
+ if (!ret) {
+ bool release = !huge;
+
+ if (!page_handle_poison(page, huge, release))
+ ret = -EBUSY;
+ } else {
if (!list_empty(&pagelist))
putback_movable_pages(&pagelist);
- pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
- pfn, ret, page->flags, &page->flags);
+ pr_info("soft offline: %#lx: %s migration failed %ld, type %pGp\n",
+ pfn, msg_page[huge], ret, &page->flags);
if (ret > 0)
- ret = -EIO;
+ ret = -EBUSY;
}
} else {
- pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
- pfn, ret, page_count(page), page->flags, &page->flags);
+ pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %pGp\n",
+ pfn, msg_page[huge], page_count(page), &page->flags);
+ ret = -EBUSY;
}
return ret;
}
-static int soft_offline_in_use_page(struct page *page, int flags)
-{
- int ret;
- int mt;
- struct page *hpage = compound_head(page);
-
- if (!PageHuge(page) && PageTransHuge(hpage)) {
- lock_page(page);
- if (!PageAnon(page) || unlikely(split_huge_page(page))) {
- unlock_page(page);
- if (!PageAnon(page))
- pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
- else
- pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
- put_hwpoison_page(page);
- return -EBUSY;
- }
- unlock_page(page);
- }
-
- /*
- * Setting MIGRATE_ISOLATE here ensures that the page will be linked
- * to free list immediately (not via pcplist) when released after
- * successful page migration. Otherwise we can't guarantee that the
- * page is really free after put_page() returns, so
- * set_hwpoison_free_buddy_page() highly likely fails.
- */
- mt = get_pageblock_migratetype(page);
- set_pageblock_migratetype(page, MIGRATE_ISOLATE);
- if (PageHuge(page))
- ret = soft_offline_huge_page(page, flags);
- else
- ret = __soft_offline_page(page, flags);
- set_pageblock_migratetype(page, mt);
- return ret;
-}
-
-static int soft_offline_free_page(struct page *page)
-{
- int rc = dissolve_free_huge_page(page);
-
- if (!rc) {
- if (set_hwpoison_free_buddy_page(page))
- num_poisoned_pages_inc();
- else
- rc = -EBUSY;
- }
- return rc;
-}
-
/**
* soft_offline_page - Soft offline a page.
* @pfn: pfn to soft-offline
* @flags: flags. Same as memory_failure().
*
- * Returns 0 on success, otherwise negated errno.
+ * Returns 0 on success,
+ *         -EOPNOTSUPP if hwpoison_filter() filtered the error event,
+ *         < 0 for any other negated errno.
*
* Soft offline a page, by migration or invalidation,
* without killing anything. This is for the case when
@@ -1932,30 +2701,57 @@ static int soft_offline_free_page(struct page *page)
int soft_offline_page(unsigned long pfn, int flags)
{
int ret;
+ bool try_again = true;
struct page *page;
- if (!pfn_valid(pfn))
+ if (!pfn_valid(pfn)) {
+ WARN_ON_ONCE(flags & MF_COUNT_INCREASED);
return -ENXIO;
+ }
+
/* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
page = pfn_to_online_page(pfn);
- if (!page)
+ if (!page) {
+ put_ref_page(pfn, flags);
return -EIO;
+ }
+
+ mutex_lock(&mf_mutex);
if (PageHWPoison(page)) {
- pr_info("soft offline: %#lx page already poisoned\n", pfn);
- if (flags & MF_COUNT_INCREASED)
- put_hwpoison_page(page);
- return -EBUSY;
+ pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
+ put_ref_page(pfn, flags);
+ mutex_unlock(&mf_mutex);
+ return 0;
}
+retry:
get_online_mems();
- ret = get_any_page(page, pfn, flags);
+ ret = get_hwpoison_page(page, flags | MF_SOFT_OFFLINE);
put_online_mems();
- if (ret > 0)
- ret = soft_offline_in_use_page(page, flags);
- else if (ret == 0)
- ret = soft_offline_free_page(page);
+ if (hwpoison_filter(page)) {
+ if (ret > 0)
+ put_page(page);
+
+ mutex_unlock(&mf_mutex);
+ return -EOPNOTSUPP;
+ }
+
+ if (ret > 0) {
+ ret = soft_offline_in_use_page(page);
+ } else if (ret == 0) {
+ if (!page_handle_poison(page, true, false)) {
+ if (try_again) {
+ try_again = false;
+ flags &= ~MF_COUNT_INCREASED;
+ goto retry;
+ }
+ ret = -EBUSY;
+ }
+ }
+
+ mutex_unlock(&mf_mutex);
return ret;
}
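
From user space this path is typically reached via madvise(MADV_SOFT_OFFLINE), which requires CONFIG_MEMORY_FAILURE and CAP_SYS_ADMIN and ends up in soft_offline_page() above. A minimal test sketch follows; the buffer handling is illustrative and not taken from the patch.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* value from the uapi mman headers */
#endif

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	void *buf;

	if (posix_memalign(&buf, pagesize, pagesize))
		return 1;
	memset(buf, 0xaa, pagesize);	/* make sure the page is populated */

	/* Ask the kernel to soft-offline the page backing 'buf'. */
	if (madvise(buf, pagesize, MADV_SOFT_OFFLINE))
		perror("madvise(MADV_SOFT_OFFLINE)");

	free(buf);
	return 0;
}
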
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
new file mode 100644
index 000000000000..a516e303e304
--- /dev/null
+++ b/mm/memory-tiers.c
@@ -0,0 +1,731 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/slab.h>
+#include <linux/lockdep.h>
+#include <linux/sysfs.h>
+#include <linux/kobject.h>
+#include <linux/memory.h>
+#include <linux/memory-tiers.h>
+
+#include "internal.h"
+
+struct memory_tier {
+ /* hierarchy of memory tiers */
+ struct list_head list;
+ /* list of all memory types part of this tier */
+ struct list_head memory_types;
+ /*
+	 * start value of the abstract distance. A memory tier maps
+	 * to an abstract distance range of
+	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE.
+ */
+ int adistance_start;
+ struct device dev;
+ /* All the nodes that are part of all the lower memory tiers. */
+ nodemask_t lower_tier_mask;
+};
+
+struct demotion_nodes {
+ nodemask_t preferred;
+};
+
+struct node_memory_type_map {
+ struct memory_dev_type *memtype;
+ int map_count;
+};
+
+static DEFINE_MUTEX(memory_tier_lock);
+static LIST_HEAD(memory_tiers);
+static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
+static struct memory_dev_type *default_dram_type;
+
+static struct bus_type memory_tier_subsys = {
+ .name = "memory_tiering",
+ .dev_name = "memory_tier",
+};
+
+#ifdef CONFIG_MIGRATION
+static int top_tier_adistance;
+/*
+ * node_demotion[] examples:
+ *
+ * Example 1:
+ *
+ * Nodes 0 & 1 are CPU + DRAM nodes, nodes 2 & 3 are PMEM nodes.
+ *
+ * node distances:
+ * node 0 1 2 3
+ * 0 10 20 30 40
+ * 1 20 10 40 30
+ * 2 30 40 10 40
+ * 3 40 30 40 10
+ *
+ * memory_tiers0 = 0-1
+ * memory_tiers1 = 2-3
+ *
+ * node_demotion[0].preferred = 2
+ * node_demotion[1].preferred = 3
+ * node_demotion[2].preferred = <empty>
+ * node_demotion[3].preferred = <empty>
+ *
+ * Example 2:
+ *
+ * Nodes 0 & 1 are CPU + DRAM nodes, node 2 is a memory-only DRAM node.
+ *
+ * node distances:
+ * node 0 1 2
+ * 0 10 20 30
+ * 1 20 10 30
+ * 2 30 30 10
+ *
+ * memory_tiers0 = 0-2
+ *
+ * node_demotion[0].preferred = <empty>
+ * node_demotion[1].preferred = <empty>
+ * node_demotion[2].preferred = <empty>
+ *
+ * Example 3:
+ *
+ * Node 0 is a CPU + DRAM node, node 1 is an HBM node, node 2 is a PMEM node.
+ *
+ * node distances:
+ * node 0 1 2
+ * 0 10 20 30
+ * 1 20 10 40
+ * 2 30 40 10
+ *
+ * memory_tiers0 = 1
+ * memory_tiers1 = 0
+ * memory_tiers2 = 2
+ *
+ * node_demotion[0].preferred = 2
+ * node_demotion[1].preferred = 0
+ * node_demotion[2].preferred = <empty>
+ *
+ */
+static struct demotion_nodes *node_demotion __read_mostly;
+#endif /* CONFIG_MIGRATION */
+
+static inline struct memory_tier *to_memory_tier(struct device *device)
+{
+ return container_of(device, struct memory_tier, dev);
+}
+
+static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
+{
+ nodemask_t nodes = NODE_MASK_NONE;
+ struct memory_dev_type *memtype;
+
+ list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
+ nodes_or(nodes, nodes, memtype->nodes);
+
+ return nodes;
+}
+
+static void memory_tier_device_release(struct device *dev)
+{
+ struct memory_tier *tier = to_memory_tier(dev);
+ /*
+ * synchronize_rcu in clear_node_memory_tier makes sure
+ * we don't have rcu access to this memory tier.
+ */
+ kfree(tier);
+}
+
+static ssize_t nodelist_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int ret;
+ nodemask_t nmask;
+
+ mutex_lock(&memory_tier_lock);
+ nmask = get_memtier_nodemask(to_memory_tier(dev));
+ ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
+ mutex_unlock(&memory_tier_lock);
+ return ret;
+}
+static DEVICE_ATTR_RO(nodelist);
+
+static struct attribute *memtier_dev_attrs[] = {
+ &dev_attr_nodelist.attr,
+ NULL
+};
+
+static const struct attribute_group memtier_dev_group = {
+ .attrs = memtier_dev_attrs,
+};
+
+static const struct attribute_group *memtier_dev_groups[] = {
+ &memtier_dev_group,
+ NULL
+};
+
+static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
+{
+ int ret;
+ bool found_slot = false;
+ struct memory_tier *memtier, *new_memtier;
+ int adistance = memtype->adistance;
+ unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;
+
+ lockdep_assert_held_once(&memory_tier_lock);
+
+ adistance = round_down(adistance, memtier_adistance_chunk_size);
+ /*
+ * If the memtype is already part of a memory tier,
+ * just return that.
+ */
+ if (!list_empty(&memtype->tier_sibiling)) {
+ list_for_each_entry(memtier, &memory_tiers, list) {
+ if (adistance == memtier->adistance_start)
+ return memtier;
+ }
+ WARN_ON(1);
+ return ERR_PTR(-EINVAL);
+ }
+
+ list_for_each_entry(memtier, &memory_tiers, list) {
+ if (adistance == memtier->adistance_start) {
+ goto link_memtype;
+ } else if (adistance < memtier->adistance_start) {
+ found_slot = true;
+ break;
+ }
+ }
+
+ new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
+ if (!new_memtier)
+ return ERR_PTR(-ENOMEM);
+
+ new_memtier->adistance_start = adistance;
+ INIT_LIST_HEAD(&new_memtier->list);
+ INIT_LIST_HEAD(&new_memtier->memory_types);
+ if (found_slot)
+ list_add_tail(&new_memtier->list, &memtier->list);
+ else
+ list_add_tail(&new_memtier->list, &memory_tiers);
+
+ new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
+ new_memtier->dev.bus = &memory_tier_subsys;
+ new_memtier->dev.release = memory_tier_device_release;
+ new_memtier->dev.groups = memtier_dev_groups;
+
+ ret = device_register(&new_memtier->dev);
+ if (ret) {
+ list_del(&new_memtier->list);
+ put_device(&new_memtier->dev);
+ return ERR_PTR(ret);
+ }
+ memtier = new_memtier;
+
+link_memtype:
+ list_add(&memtype->tier_sibiling, &memtier->memory_types);
+ return memtier;
+}
+
+static struct memory_tier *__node_get_memory_tier(int node)
+{
+ pg_data_t *pgdat;
+
+ pgdat = NODE_DATA(node);
+ if (!pgdat)
+ return NULL;
+ /*
+ * Since we hold memory_tier_lock, we can avoid
+ * RCU read locks when accessing the details. No
+ * parallel updates are possible here.
+ */
+ return rcu_dereference_check(pgdat->memtier,
+ lockdep_is_held(&memory_tier_lock));
+}
+
+#ifdef CONFIG_MIGRATION
+bool node_is_toptier(int node)
+{
+ bool toptier;
+ pg_data_t *pgdat;
+ struct memory_tier *memtier;
+
+ pgdat = NODE_DATA(node);
+ if (!pgdat)
+ return false;
+
+ rcu_read_lock();
+ memtier = rcu_dereference(pgdat->memtier);
+ if (!memtier) {
+ toptier = true;
+ goto out;
+ }
+ if (memtier->adistance_start <= top_tier_adistance)
+ toptier = true;
+ else
+ toptier = false;
+out:
+ rcu_read_unlock();
+ return toptier;
+}
+
+void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
+{
+ struct memory_tier *memtier;
+
+ /*
+	 * pg_data_t.memtier updates include a synchronize_rcu()
+	 * which ensures that we either find NULL or a valid memtier
+	 * in NODE_DATA. Protect the access via rcu_read_lock().
+ */
+ rcu_read_lock();
+ memtier = rcu_dereference(pgdat->memtier);
+ if (memtier)
+ *targets = memtier->lower_tier_mask;
+ else
+ *targets = NODE_MASK_NONE;
+ rcu_read_unlock();
+}
+
+/**
+ * next_demotion_node() - Get the next node in the demotion path
+ * @node: The starting node to lookup the next node
+ *
+ * Return: node id for next memory node in the demotion path hierarchy
+ * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
+ * @node online or guarantee that it *continues* to be the next demotion
+ * target.
+ */
+int next_demotion_node(int node)
+{
+ struct demotion_nodes *nd;
+ int target;
+
+ if (!node_demotion)
+ return NUMA_NO_NODE;
+
+ nd = &node_demotion[node];
+
+ /*
+ * node_demotion[] is updated without excluding this
+ * function from running.
+ *
+ * Make sure to use RCU over entire code blocks if
+ * node_demotion[] reads need to be consistent.
+ */
+ rcu_read_lock();
+ /*
+ * If there are multiple target nodes, just select one
+ * target node randomly.
+ *
+	 * We could also use round-robin to select the target node, but
+	 * that would require another field in node_demotion[] to record
+	 * the last selected target node, which may cause cache ping-pong
+	 * as it keeps changing. Introducing per-cpu data to avoid that
+	 * caching issue seems more complicated. So selecting the target
+	 * node randomly seems better for now.
+ */
+ target = node_random(&nd->preferred);
+ rcu_read_unlock();
+
+ return target;
+}
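
As a usage sketch (an assumption modelled on a reclaim-side demotion caller, not code from this patch), the node id returned by next_demotion_node() is typically fed into a node-targeted allocation, with NUMA_NO_NODE meaning there is no lower tier to demote to:

#include <linux/gfp.h>
#include <linux/memory-tiers.h>
#include <linux/mm.h>

/* Illustrative only: how a demotion caller might consume next_demotion_node(). */
static struct page *alloc_demotion_target(struct page *page)
{
	int target_nid = next_demotion_node(page_to_nid(page));

	if (target_nid == NUMA_NO_NODE)
		return NULL;	/* terminal node: nowhere lower to demote to */

	return alloc_pages_node(target_nid, GFP_NOWAIT | __GFP_NOWARN, 0);
}
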
+
+static void disable_all_demotion_targets(void)
+{
+ struct memory_tier *memtier;
+ int node;
+
+ for_each_node_state(node, N_MEMORY) {
+ node_demotion[node].preferred = NODE_MASK_NONE;
+ /*
+		 * We are holding memory_tier_lock, so it is safe
+		 * to access pgdat->memtier.
+ */
+ memtier = __node_get_memory_tier(node);
+ if (memtier)
+ memtier->lower_tier_mask = NODE_MASK_NONE;
+ }
+ /*
+ * Ensure that the "disable" is visible across the system.
+ * Readers will see either a combination of before+disable
+ * state or disable+after. They will never see before and
+ * after state together.
+ */
+ synchronize_rcu();
+}
+
+/*
+ * Find an automatic demotion target for all memory
+ * nodes. Failing here is OK. It might just indicate
+ * being at the end of a chain.
+ */
+static void establish_demotion_targets(void)
+{
+ struct memory_tier *memtier;
+ struct demotion_nodes *nd;
+ int target = NUMA_NO_NODE, node;
+ int distance, best_distance;
+ nodemask_t tier_nodes, lower_tier;
+
+ lockdep_assert_held_once(&memory_tier_lock);
+
+ if (!node_demotion)
+ return;
+
+ disable_all_demotion_targets();
+
+ for_each_node_state(node, N_MEMORY) {
+ best_distance = -1;
+ nd = &node_demotion[node];
+
+ memtier = __node_get_memory_tier(node);
+ if (!memtier || list_is_last(&memtier->list, &memory_tiers))
+ continue;
+ /*
+ * Get the lower memtier to find the demotion node list.
+ */
+ memtier = list_next_entry(memtier, list);
+ tier_nodes = get_memtier_nodemask(memtier);
+ /*
+		 * find_next_best_node() uses the 'used' nodemask as a skip list.
+		 * Add all memory nodes except the selected memory tier's
+		 * nodelist to the skip list so that we find the best node from
+		 * the memtier nodelist.
+ */
+ nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);
+
+ /*
+		 * Find all the nodes in the memory tier node list with the same
+		 * best distance and add them to the preferred mask. We randomly
+		 * select between nodes in the preferred mask when allocating
+		 * pages during demotion.
+ */
+ do {
+ target = find_next_best_node(node, &tier_nodes);
+ if (target == NUMA_NO_NODE)
+ break;
+
+ distance = node_distance(node, target);
+ if (distance == best_distance || best_distance == -1) {
+ best_distance = distance;
+ node_set(target, nd->preferred);
+ } else {
+ break;
+ }
+ } while (1);
+ }
+ /*
+	 * Promotion is allowed from a memory tier to a higher
+	 * memory tier only if the lower tier doesn't include
+	 * compute. We want to skip promotion from a memory tier
+	 * if any node that is part of that tier has CPUs.
+	 * Once we detect such a memory tier, we consider it the
+	 * top tier from which promotion is not allowed.
+ */
+ list_for_each_entry_reverse(memtier, &memory_tiers, list) {
+ tier_nodes = get_memtier_nodemask(memtier);
+ nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
+ if (!nodes_empty(tier_nodes)) {
+ /*
+ * abstract distance below the max value of this memtier
+ * is considered toptier.
+ */
+ top_tier_adistance = memtier->adistance_start +
+ MEMTIER_CHUNK_SIZE - 1;
+ break;
+ }
+ }
+ /*
+	 * Now build the lower_tier mask for each memory tier, collecting the
+	 * node mask from all memory tiers below it. This allows us to fall back
+	 * demotion page allocation to a set of nodes that is closer to the
+	 * preferred node selected above.
+ */
+ lower_tier = node_states[N_MEMORY];
+ list_for_each_entry(memtier, &memory_tiers, list) {
+ /*
+		 * Keep removing the current tier's nodes from lower_tier.
+		 * This removes all nodes in the current and higher
+		 * memory tiers from the lower_tier mask.
+ */
+ tier_nodes = get_memtier_nodemask(memtier);
+ nodes_andnot(lower_tier, lower_tier, tier_nodes);
+ memtier->lower_tier_mask = lower_tier;
+ }
+}
+
+#else
+static inline void establish_demotion_targets(void) {}
+#endif /* CONFIG_MIGRATION */
+
+static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
+{
+ if (!node_memory_types[node].memtype)
+ node_memory_types[node].memtype = memtype;
+ /*
+	 * For each device getting added to the same NUMA node
+	 * with this specific memtype, bump the map count. We
+	 * only take the memtype device reference once, so that
+	 * changing a node's memtype can be done by dropping the
+	 * single reference count taken here.
+ */
+
+ if (node_memory_types[node].memtype == memtype) {
+ if (!node_memory_types[node].map_count++)
+ kref_get(&memtype->kref);
+ }
+}
+
+static struct memory_tier *set_node_memory_tier(int node)
+{
+ struct memory_tier *memtier;
+ struct memory_dev_type *memtype;
+ pg_data_t *pgdat = NODE_DATA(node);
+
+
+ lockdep_assert_held_once(&memory_tier_lock);
+
+ if (!node_state(node, N_MEMORY))
+ return ERR_PTR(-EINVAL);
+
+ __init_node_memory_type(node, default_dram_type);
+
+ memtype = node_memory_types[node].memtype;
+ node_set(node, memtype->nodes);
+ memtier = find_create_memory_tier(memtype);
+ if (!IS_ERR(memtier))
+ rcu_assign_pointer(pgdat->memtier, memtier);
+ return memtier;
+}
+
+static void destroy_memory_tier(struct memory_tier *memtier)
+{
+ list_del(&memtier->list);
+ device_unregister(&memtier->dev);
+}
+
+static bool clear_node_memory_tier(int node)
+{
+ bool cleared = false;
+ pg_data_t *pgdat;
+ struct memory_tier *memtier;
+
+ pgdat = NODE_DATA(node);
+ if (!pgdat)
+ return false;
+
+ /*
+ * Make sure that anybody looking at NODE_DATA who finds
+ * a valid memtier finds memory_dev_types with nodes still
+	 * linked to the memtier. We achieve this by waiting for the
+	 * RCU read section to finish using synchronize_rcu().
+	 * This also enables us to free the destroyed memory tier
+	 * with kfree() instead of kfree_rcu().
+ */
+ memtier = __node_get_memory_tier(node);
+ if (memtier) {
+ struct memory_dev_type *memtype;
+
+ rcu_assign_pointer(pgdat->memtier, NULL);
+ synchronize_rcu();
+ memtype = node_memory_types[node].memtype;
+ node_clear(node, memtype->nodes);
+ if (nodes_empty(memtype->nodes)) {
+ list_del_init(&memtype->tier_sibiling);
+ if (list_empty(&memtier->memory_types))
+ destroy_memory_tier(memtier);
+ }
+ cleared = true;
+ }
+ return cleared;
+}
+
+static void release_memtype(struct kref *kref)
+{
+ struct memory_dev_type *memtype;
+
+ memtype = container_of(kref, struct memory_dev_type, kref);
+ kfree(memtype);
+}
+
+struct memory_dev_type *alloc_memory_type(int adistance)
+{
+ struct memory_dev_type *memtype;
+
+ memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
+ if (!memtype)
+ return ERR_PTR(-ENOMEM);
+
+ memtype->adistance = adistance;
+ INIT_LIST_HEAD(&memtype->tier_sibiling);
+ memtype->nodes = NODE_MASK_NONE;
+ kref_init(&memtype->kref);
+ return memtype;
+}
+EXPORT_SYMBOL_GPL(alloc_memory_type);
+
+void destroy_memory_type(struct memory_dev_type *memtype)
+{
+ kref_put(&memtype->kref, release_memtype);
+}
+EXPORT_SYMBOL_GPL(destroy_memory_type);
+
+void init_node_memory_type(int node, struct memory_dev_type *memtype)
+{
+
+ mutex_lock(&memory_tier_lock);
+ __init_node_memory_type(node, memtype);
+ mutex_unlock(&memory_tier_lock);
+}
+EXPORT_SYMBOL_GPL(init_node_memory_type);
+
+void clear_node_memory_type(int node, struct memory_dev_type *memtype)
+{
+ mutex_lock(&memory_tier_lock);
+ if (node_memory_types[node].memtype == memtype)
+ node_memory_types[node].map_count--;
+ /*
+	 * If we unmapped all the attached devices from this node,
+ * clear the node memory type.
+ */
+ if (!node_memory_types[node].map_count) {
+ node_memory_types[node].memtype = NULL;
+ kref_put(&memtype->kref, release_memtype);
+ }
+ mutex_unlock(&memory_tier_lock);
+}
+EXPORT_SYMBOL_GPL(clear_node_memory_type);
+
+static int __meminit memtier_hotplug_callback(struct notifier_block *self,
+ unsigned long action, void *_arg)
+{
+ struct memory_tier *memtier;
+ struct memory_notify *arg = _arg;
+
+ /*
+ * Only update the node migration order when a node is
+ * changing status, like online->offline.
+ */
+ if (arg->status_change_nid < 0)
+ return notifier_from_errno(0);
+
+ switch (action) {
+ case MEM_OFFLINE:
+ mutex_lock(&memory_tier_lock);
+ if (clear_node_memory_tier(arg->status_change_nid))
+ establish_demotion_targets();
+ mutex_unlock(&memory_tier_lock);
+ break;
+ case MEM_ONLINE:
+ mutex_lock(&memory_tier_lock);
+ memtier = set_node_memory_tier(arg->status_change_nid);
+ if (!IS_ERR(memtier))
+ establish_demotion_targets();
+ mutex_unlock(&memory_tier_lock);
+ break;
+ }
+
+ return notifier_from_errno(0);
+}
+
+static int __init memory_tier_init(void)
+{
+ int ret, node;
+ struct memory_tier *memtier;
+
+ ret = subsys_virtual_register(&memory_tier_subsys, NULL);
+ if (ret)
+ panic("%s() failed to register memory tier subsystem\n", __func__);
+
+#ifdef CONFIG_MIGRATION
+ node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
+ GFP_KERNEL);
+ WARN_ON(!node_demotion);
+#endif
+ mutex_lock(&memory_tier_lock);
+ /*
+ * For now we can have 4 faster memory tiers with smaller adistance
+ * than default DRAM tier.
+ */
+ default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
+ if (IS_ERR(default_dram_type))
+ panic("%s() failed to allocate default DRAM tier\n", __func__);
+
+ /*
+ * Look at all the existing N_MEMORY nodes and add them to
+ * default memory tier or to a tier if we already have memory
+ * types assigned.
+ */
+ for_each_node_state(node, N_MEMORY) {
+ memtier = set_node_memory_tier(node);
+ if (IS_ERR(memtier))
+ /*
+ * Continue with memtiers we are able to setup
+ */
+ break;
+ }
+ establish_demotion_targets();
+ mutex_unlock(&memory_tier_lock);
+
+ hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
+ return 0;
+}
+subsys_initcall(memory_tier_init);
+
+bool numa_demotion_enabled = false;
+
+#ifdef CONFIG_MIGRATION
+#ifdef CONFIG_SYSFS
+static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%s\n",
+ numa_demotion_enabled ? "true" : "false");
+}
+
+static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ ssize_t ret;
+
+ ret = kstrtobool(buf, &numa_demotion_enabled);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static struct kobj_attribute numa_demotion_enabled_attr =
+ __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
+ numa_demotion_enabled_store);
+
+static struct attribute *numa_attrs[] = {
+ &numa_demotion_enabled_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group numa_attr_group = {
+ .attrs = numa_attrs,
+};
+
+static int __init numa_init_sysfs(void)
+{
+ int err;
+ struct kobject *numa_kobj;
+
+ numa_kobj = kobject_create_and_add("numa", mm_kobj);
+ if (!numa_kobj) {
+ pr_err("failed to create numa kobject\n");
+ return -ENOMEM;
+ }
+ err = sysfs_create_group(numa_kobj, &numa_attr_group);
+ if (err) {
+ pr_err("failed to register numa group\n");
+ goto delete_obj;
+ }
+ return 0;
+
+delete_obj:
+ kobject_put(numa_kobj);
+ return err;
+}
+subsys_initcall(numa_init_sysfs);
+#endif /* CONFIG_SYSFS */
+#endif
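
To show how the interface exported above is meant to be consumed, here is a rough driver-side sketch (not part of this patch) for a device exposing slower-than-DRAM memory: it registers a memory type for its NUMA node before that memory is onlined and tears it down on removal. The adistance value and the my_dev_* names are assumptions; only alloc_memory_type(), init_node_memory_type(), clear_node_memory_type() and destroy_memory_type() come from the file above.

#include <linux/err.h>
#include <linux/memory-tiers.h>

static struct memory_dev_type *my_memtype;

/* Hypothetical driver hook, called before onlining the node's memory. */
static int my_dev_register_tier(int nid)
{
	/* A larger abstract distance than DRAM places the node in a lower tier. */
	my_memtype = alloc_memory_type(MEMTIER_ADISTANCE_DRAM + MEMTIER_CHUNK_SIZE);
	if (IS_ERR(my_memtype))
		return PTR_ERR(my_memtype);

	init_node_memory_type(nid, my_memtype);
	return 0;
}

/* Hypothetical teardown path, called after the node's memory is offlined. */
static void my_dev_unregister_tier(int nid)
{
	clear_node_memory_type(nid, my_memtype);
	destroy_memory_type(my_memtype);
}
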
diff --git a/mm/memory.c b/mm/memory.c
index 2afb01ea1307..cdc4d4c1c858 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -41,6 +41,7 @@
#include <linux/kernel_stat.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
@@ -51,6 +52,7 @@
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
+#include <linux/kmsan.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
@@ -65,6 +67,7 @@
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
+#include <linux/memory-tiers.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
@@ -73,6 +76,8 @@
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
+#include <linux/sched/sysctl.h>
+#include <linux/net_mm.h>
#include <trace/events/kmem.h>
@@ -85,13 +90,13 @@
#include "pgalloc-track.h"
#include "internal.h"
+#include "swap.h"
#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-/* use the per-pgdat data instead for discontigmem - mbligh */
+#ifndef CONFIG_NUMA
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);
@@ -99,6 +104,22 @@ struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif
+static vm_fault_t do_fault(struct vm_fault *vmf);
+static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
+static bool vmf_pte_changed(struct vm_fault *vmf);
+
+/*
+ * Return true if the original pte was a uffd-wp pte marker (so the pte was
+ * wr-protected).
+ */
+static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
+{
+ if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
+ return false;
+
+ return pte_marker_uffd_wp(vmf->orig_pte);
+}
+
/*
* A number of key systems in x86 including ioremap() rely on the assumption
* that high_memory defines the upper bound on direct map memory, then end
@@ -122,15 +143,15 @@ int randomize_va_space __read_mostly =
2;
#endif
-#ifndef arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
+#ifndef arch_wants_old_prefaulted_pte
+static inline bool arch_wants_old_prefaulted_pte(void)
{
/*
- * Those arches which don't have hw access flag feature need to
- * implement their own helper. By default, "true" means pagefault
- * will be hit on old pte.
+ * Transitioning a PTE from 'old' to 'young' can be expensive on
+ * some architectures, even if it's performed in hardware. By
+ * default, "false" means prefaulted entries will be 'young'.
*/
- return true;
+ return false;
}
#endif
@@ -154,60 +175,13 @@ static int __init init_zero_pfn(void)
zero_pfn = page_to_pfn(ZERO_PAGE(0));
return 0;
}
-core_initcall(init_zero_pfn);
-
-void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
-{
- trace_rss_stat(mm, member, count);
-}
-
-#if defined(SPLIT_RSS_COUNTING)
-
-void sync_mm_rss(struct mm_struct *mm)
-{
- int i;
-
- for (i = 0; i < NR_MM_COUNTERS; i++) {
- if (current->rss_stat.count[i]) {
- add_mm_counter(mm, i, current->rss_stat.count[i]);
- current->rss_stat.count[i] = 0;
- }
- }
- current->rss_stat.events = 0;
-}
-
-static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
-{
- struct task_struct *task = current;
-
- if (likely(task->mm == mm))
- task->rss_stat.count[member] += val;
- else
- add_mm_counter(mm, member, val);
-}
-#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
-#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
-
-/* sync counter once per 64 page faults */
-#define TASK_RSS_EVENTS_THRESH (64)
-static void check_sync_rss_stat(struct task_struct *task)
-{
- if (unlikely(task != current))
- return;
- if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
- sync_mm_rss(task->mm);
-}
-#else /* SPLIT_RSS_COUNTING */
+early_initcall(init_zero_pfn);
-#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
-#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
-
-static void check_sync_rss_stat(struct task_struct *task)
+void mm_trace_rss_stat(struct mm_struct *mm, int member)
{
+ trace_rss_stat(mm, member);
}
-#endif /* SPLIT_RSS_COUNTING */
-
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
@@ -387,17 +361,28 @@ void free_pgd_range(struct mmu_gather *tlb,
} while (pgd++, addr = next, addr != end);
}
-void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
- unsigned long floor, unsigned long ceiling)
+void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
+ struct vm_area_struct *vma, unsigned long floor,
+ unsigned long ceiling, bool mm_wr_locked)
{
- while (vma) {
- struct vm_area_struct *next = vma->vm_next;
+ MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
+
+ do {
unsigned long addr = vma->vm_start;
+ struct vm_area_struct *next;
+
+ /*
+ * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
+ * be 0. This will underflow and is okay.
+ */
+ next = mas_find(&mas, ceiling - 1);
/*
* Hide vma from rmap and truncate_pagecache before freeing
* pgtables
*/
+ if (mm_wr_locked)
+ vma_start_write(vma);
unlink_anon_vmas(vma);
unlink_file_vma(vma);
@@ -411,7 +396,9 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
while (next && next->vm_start <= vma->vm_end + PMD_SIZE
&& !is_vm_hugetlb_page(next)) {
vma = next;
- next = vma->vm_next;
+ next = mas_find(&mas, ceiling - 1);
+ if (mm_wr_locked)
+ vma_start_write(vma);
unlink_anon_vmas(vma);
unlink_file_vma(vma);
}
@@ -419,38 +406,42 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
floor, next ? next->vm_start : ceiling);
}
vma = next;
+ } while (vma);
+}
+
+void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
+{
+ spinlock_t *ptl = pmd_lock(mm, pmd);
+
+ if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
+ mm_inc_nr_ptes(mm);
+ /*
+ * Ensure all pte setup (eg. pte page lock and page clearing) are
+ * visible before the pte is made visible to other CPUs by being
+ * put into page tables.
+ *
+ * The other side of the story is the pointer chasing in the page
+ * table walking code (when walking the page table without locking;
+ * ie. most of the time). Fortunately, these data accesses consist
+ * of a chain of data-dependent loads, meaning most CPUs (alpha
+ * being the notable exception) will already guarantee loads are
+ * seen in-order. See the alpha page table accessors for the
+ * smp_rmb() barriers in page table walking code.
+ */
+ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
+ pmd_populate(mm, pmd, *pte);
+ *pte = NULL;
}
+ spin_unlock(ptl);
}
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
- spinlock_t *ptl;
pgtable_t new = pte_alloc_one(mm);
if (!new)
return -ENOMEM;
- /*
- * Ensure all pte setup (eg. pte page lock and page clearing) are
- * visible before the pte is made visible to other CPUs by being
- * put into page tables.
- *
- * The other side of the story is the pointer chasing in the page
- * table walking code (when walking the page table without locking;
- * ie. most of the time). Fortunately, these data accesses consist
- * of a chain of data-dependent loads, meaning most CPUs (alpha
- * being the notable exception) will already guarantee loads are
- * seen in-order. See the alpha page table accessors for the
- * smp_rmb() barriers in page table walking code.
- */
- smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
-
- ptl = pmd_lock(mm, pmd);
- if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
- mm_inc_nr_ptes(mm);
- pmd_populate(mm, pmd, new);
- new = NULL;
- }
- spin_unlock(ptl);
+ pmd_install(mm, pmd, &new);
if (new)
pte_free(mm, new);
return 0;
@@ -462,10 +453,9 @@ int __pte_alloc_kernel(pmd_t *pmd)
if (!new)
return -ENOMEM;
- smp_wmb(); /* See comment in __pte_alloc */
-
spin_lock(&init_mm.page_table_lock);
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
+ smp_wmb(); /* See comment in pmd_install() */
pmd_populate_kernel(&init_mm, pmd, new);
new = NULL;
}
@@ -540,11 +530,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
dump_page(page, "bad pte");
pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
- pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
+ pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n",
vma->vm_file,
vma->vm_ops ? vma->vm_ops->fault : NULL,
vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
- mapping ? mapping->a_ops->readpage : NULL);
+ mapping ? mapping->a_ops->read_folio : NULL);
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
@@ -606,6 +596,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
if (is_zero_pfn(pfn))
return NULL;
if (pte_devmap(pte))
+ /*
+ * NOTE: New users of ZONE_DEVICE will not set pte_devmap()
+ * and will have refcounts incremented on their struct pages
+ * when they are inserted into PTEs, thus they are safe to
+ * return here. Legacy ZONE_DEVICE pages that set pte_devmap()
+ * do not have refcounts. Example of legacy ZONE_DEVICE is
+ * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
+ */
return NULL;
print_bad_pte(vma, addr, pte, NULL);
@@ -646,6 +644,16 @@ out:
return pfn_to_page(pfn);
}
+struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
+{
+ struct page *page = vm_normal_page(vma, addr, pte);
+
+ if (page)
+ return page_folio(page);
+ return NULL;
+}
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd)
@@ -688,6 +696,69 @@ out:
}
#endif
+static void restore_exclusive_pte(struct vm_area_struct *vma,
+ struct page *page, unsigned long address,
+ pte_t *ptep)
+{
+ pte_t orig_pte;
+ pte_t pte;
+ swp_entry_t entry;
+
+ orig_pte = ptep_get(ptep);
+ pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
+ if (pte_swp_soft_dirty(orig_pte))
+ pte = pte_mksoft_dirty(pte);
+
+ entry = pte_to_swp_entry(orig_pte);
+ if (pte_swp_uffd_wp(orig_pte))
+ pte = pte_mkuffd_wp(pte);
+ else if (is_writable_device_exclusive_entry(entry))
+ pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+
+ VM_BUG_ON(pte_write(pte) && !(PageAnon(page) && PageAnonExclusive(page)));
+
+ /*
+ * No need to take a page reference as one was already
+ * created when the swap entry was made.
+ */
+ if (PageAnon(page))
+ page_add_anon_rmap(page, vma, address, RMAP_NONE);
+ else
+ /*
+ * Currently device exclusive access only supports anonymous
+ * memory so the entry shouldn't point to a filebacked page.
+ */
+ WARN_ON_ONCE(1);
+
+ set_pte_at(vma->vm_mm, address, ptep, pte);
+
+ /*
+ * No need to invalidate - it was non-present before. However
+ * secondary CPUs may have mappings that need invalidating.
+ */
+ update_mmu_cache(vma, address, ptep);
+}
+
+/*
+ * Tries to restore an exclusive pte if the page lock can be acquired without
+ * sleeping.
+ */
+static int
+try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte));
+ struct page *page = pfn_swap_entry_to_page(entry);
+
+ if (trylock_page(page)) {
+ restore_exclusive_pte(vma, page, addr, src_pte);
+ unlock_page(page);
+ return 0;
+ }
+
+ return -EBUSY;
+}
+
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
@@ -696,17 +767,18 @@ out:
static unsigned long
copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
- unsigned long addr, int *rss)
+ pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
- unsigned long vm_flags = vma->vm_flags;
- pte_t pte = *src_pte;
+ unsigned long vm_flags = dst_vma->vm_flags;
+ pte_t orig_pte = ptep_get(src_pte);
+ pte_t pte = orig_pte;
struct page *page;
- swp_entry_t entry = pte_to_swp_entry(pte);
+ swp_entry_t entry = pte_to_swp_entry(orig_pte);
if (likely(!non_swap_entry(entry))) {
if (swap_duplicate(entry) < 0)
- return entry.val;
+ return -EIO;
/* make sure dst_mm is on swapoff's mmlist. */
if (unlikely(list_empty(&dst_mm->mmlist))) {
@@ -716,28 +788,35 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
&src_mm->mmlist);
spin_unlock(&mmlist_lock);
}
+ /* Mark the swap entry as shared. */
+ if (pte_swp_exclusive(orig_pte)) {
+ pte = pte_swp_clear_exclusive(orig_pte);
+ set_pte_at(src_mm, addr, src_pte, pte);
+ }
rss[MM_SWAPENTS]++;
} else if (is_migration_entry(entry)) {
- page = migration_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
rss[mm_counter(page)]++;
- if (is_write_migration_entry(entry) &&
+ if (!is_readable_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
/*
- * COW mappings require pages in both
- * parent and child to be set to read.
+ * COW mappings require pages in both parent and child
+ * to be set to read. A previously exclusive entry is
+ * now shared.
*/
- make_migration_entry_read(&entry);
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
pte = swp_entry_to_pte(entry);
- if (pte_swp_soft_dirty(*src_pte))
+ if (pte_swp_soft_dirty(orig_pte))
pte = pte_swp_mksoft_dirty(pte);
- if (pte_swp_uffd_wp(*src_pte))
+ if (pte_swp_uffd_wp(orig_pte))
pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
} else if (is_device_private_entry(entry)) {
- page = device_private_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
/*
* Update rss count even for unaddressable pages, as
@@ -750,7 +829,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
*/
get_page(page);
rss[mm_counter(page)]++;
- page_dup_rmap(page, false);
+ /* Cannot fail as these pages cannot get pinned. */
+ BUG_ON(page_try_dup_anon_rmap(page, false, src_vma));
/*
* We do not preserve soft-dirty information, because so
@@ -759,33 +839,43 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* when a device driver is involved (you cannot easily
* save and restore device driver state).
*/
- if (is_write_device_private_entry(entry) &&
+ if (is_writable_device_private_entry(entry) &&
is_cow_mapping(vm_flags)) {
- make_device_private_entry_read(&entry);
+ entry = make_readable_device_private_entry(
+ swp_offset(entry));
pte = swp_entry_to_pte(entry);
- if (pte_swp_uffd_wp(*src_pte))
+ if (pte_swp_uffd_wp(orig_pte))
pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
+ } else if (is_device_exclusive_entry(entry)) {
+ /*
+ * Make device exclusive entries present by restoring the
+ * original entry then copying as for a present pte. Device
+ * exclusive entries currently only support private writable
+ * (ie. COW) mappings.
+ */
+ VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
+ if (try_restore_exclusive_pte(src_pte, src_vma, addr))
+ return -EBUSY;
+ return -ENOENT;
+ } else if (is_pte_marker_entry(entry)) {
+ if (is_swapin_error_entry(entry) || userfaultfd_wp(dst_vma))
+ set_pte_at(dst_mm, addr, dst_pte, pte);
+ return 0;
}
+ if (!userfaultfd_wp(dst_vma))
+ pte = pte_swp_clear_uffd_wp(pte);
set_pte_at(dst_mm, addr, dst_pte, pte);
return 0;
}
/*
- * Copy a present and normal page if necessary.
+ * Copy a present and normal page.
*
- * NOTE! The usual case is that this doesn't need to do
- * anything, and can just return a positive value. That
- * will let the caller know that it can just increase
- * the page refcount and re-use the pte the traditional
- * way.
- *
- * But _if_ we need to copy it because it needs to be
- * pinned in the parent (and the child should get its own
- * copy rather than just a reference to the same page),
- * we'll do that here and return zero to let the caller
- * know we're done.
+ * NOTE! The usual case is that this isn't required;
+ * instead, the caller can just increase the page refcount
+ * and re-use the pte the traditional way.
*
* And if we need a pre-allocated page but don't yet have
* one, return a negative error to let the preallocation
@@ -795,34 +885,13 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
static inline int
copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
- struct page **prealloc, pte_t pte, struct page *page)
+ struct folio **prealloc, struct page *page)
{
- struct mm_struct *src_mm = src_vma->vm_mm;
- struct page *new_page;
-
- if (!is_cow_mapping(src_vma->vm_flags))
- return 1;
-
- /*
- * What we want to do is to check whether this page may
- * have been pinned by the parent process. If so,
- * instead of wrprotect the pte on both sides, we copy
- * the page immediately so that we'll always guarantee
- * the pinned page won't be randomly replaced in the
- * future.
- *
- * The page pinning checks are just "has this mm ever
- * seen pinning", along with the (inexact) check of
- * the page count. That might give false positives for
- * for pinning, but it will work correctly.
- */
- if (likely(!atomic_read(&src_mm->has_pinned)))
- return 1;
- if (likely(!page_maybe_dma_pinned(page)))
- return 1;
+ struct folio *new_folio;
+ pte_t pte;
- new_page = *prealloc;
- if (!new_page)
+ new_folio = *prealloc;
+ if (!new_folio)
return -EAGAIN;
/*
@@ -830,15 +899,18 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
* over and copy the page & arm it.
*/
*prealloc = NULL;
- copy_user_highpage(new_page, page, addr, src_vma);
- __SetPageUptodate(new_page);
- page_add_new_anon_rmap(new_page, dst_vma, addr, false);
- lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
- rss[mm_counter(new_page)]++;
+ copy_user_highpage(&new_folio->page, page, addr, src_vma);
+ __folio_mark_uptodate(new_folio);
+ folio_add_new_anon_rmap(new_folio, dst_vma, addr);
+ folio_add_lru_vma(new_folio, dst_vma);
+ rss[MM_ANONPAGES]++;
/* All done, just insert the new page copy in the child */
- pte = mk_pte(new_page, dst_vma->vm_page_prot);
+ pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot);
pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
+ if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte)))
+ /* Uffd-wp needs to be delivered to dest pte as well */
+ pte = pte_mkuffd_wp(pte);
set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
return 0;
}
@@ -850,25 +922,36 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
static inline int
copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
- struct page **prealloc)
+ struct folio **prealloc)
{
struct mm_struct *src_mm = src_vma->vm_mm;
unsigned long vm_flags = src_vma->vm_flags;
- pte_t pte = *src_pte;
+ pte_t pte = ptep_get(src_pte);
struct page *page;
+ struct folio *folio;
page = vm_normal_page(src_vma, addr, pte);
- if (page) {
- int retval;
-
- retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
- addr, rss, prealloc, pte, page);
- if (retval <= 0)
- return retval;
-
- get_page(page);
- page_dup_rmap(page, false);
- rss[mm_counter(page)]++;
+ if (page)
+ folio = page_folio(page);
+ if (page && folio_test_anon(folio)) {
+ /*
+ * If this page may have been pinned by the parent process,
+ * copy the page immediately for the child so that we'll always
+ * guarantee the pinned page won't be randomly replaced in the
+ * future.
+ */
+ folio_get(folio);
+ if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
+ /* Page may be pinned, we have to copy. */
+ folio_put(folio);
+ return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
+ addr, rss, prealloc, page);
+ }
+ rss[MM_ANONPAGES]++;
+ } else if (page) {
+ folio_get(folio);
+ page_dup_file_rmap(page, false);
+ rss[mm_counter_file(page)]++;
}
/*
@@ -879,6 +962,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
ptep_set_wrprotect(src_mm, addr, src_pte);
pte = pte_wrprotect(pte);
}
+ VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page));
/*
* If it's a shared mapping, mark it clean in
@@ -888,35 +972,29 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pte = pte_mkclean(pte);
pte = pte_mkold(pte);
- /*
- * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
- * does not have the VM_UFFD_WP, which means that the uffd
- * fork event is not enabled.
- */
- if (!(vm_flags & VM_UFFD_WP))
+ if (!userfaultfd_wp(dst_vma))
pte = pte_clear_uffd_wp(pte);
set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
return 0;
}
-static inline struct page *
-page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
- unsigned long addr)
+static inline struct folio *page_copy_prealloc(struct mm_struct *src_mm,
+ struct vm_area_struct *vma, unsigned long addr)
{
- struct page *new_page;
+ struct folio *new_folio;
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
- if (!new_page)
+ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
+ if (!new_folio)
return NULL;
- if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
- put_page(new_page);
+ if (mem_cgroup_charge(new_folio, src_mm, GFP_KERNEL)) {
+ folio_put(new_folio);
return NULL;
}
- cgroup_throttle_swaprate(new_page, GFP_KERNEL);
+ folio_throttle_swaprate(new_folio, GFP_KERNEL);
- return new_page;
+ return new_folio;
}
static int
@@ -928,23 +1006,36 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
struct mm_struct *src_mm = src_vma->vm_mm;
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
+ pte_t ptent;
spinlock_t *src_ptl, *dst_ptl;
int progress, ret = 0;
int rss[NR_MM_COUNTERS];
swp_entry_t entry = (swp_entry_t){0};
- struct page *prealloc = NULL;
+ struct folio *prealloc = NULL;
again:
progress = 0;
init_rss_vec(rss);
+ /*
+ * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the
+ * error handling here, assume that exclusive mmap_lock on dst and src
+ * protects anon from unexpected THP transitions; with shmem and file
+ * protected by mmap_lock-less collapse skipping areas with anon_vma
+ * (whereas vma_needs_copy() skips areas without anon_vma). A rework
+ * can remove such assumptions later, but this is good enough for now.
+ */
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte) {
ret = -ENOMEM;
goto out;
}
- src_pte = pte_offset_map(src_pmd, addr);
- src_ptl = pte_lockptr(src_mm, src_pmd);
+ src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
+ if (!src_pte) {
+ pte_unmap_unlock(dst_pte, dst_ptl);
+ /* ret == 0 */
+ goto out;
+ }
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
orig_src_pte = src_pte;
orig_dst_pte = dst_pte;
@@ -961,18 +1052,31 @@ again:
spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
break;
}
- if (pte_none(*src_pte)) {
+ ptent = ptep_get(src_pte);
+ if (pte_none(ptent)) {
progress++;
continue;
}
- if (unlikely(!pte_present(*src_pte))) {
- entry.val = copy_nonpresent_pte(dst_mm, src_mm,
- dst_pte, src_pte,
- src_vma, addr, rss);
- if (entry.val)
+ if (unlikely(!pte_present(ptent))) {
+ ret = copy_nonpresent_pte(dst_mm, src_mm,
+ dst_pte, src_pte,
+ dst_vma, src_vma,
+ addr, rss);
+ if (ret == -EIO) {
+ entry = pte_to_swp_entry(ptep_get(src_pte));
break;
- progress += 8;
- continue;
+ } else if (ret == -EBUSY) {
+ break;
+ } else if (!ret) {
+ progress += 8;
+ continue;
+ }
+
+ /*
+ * Device exclusive entry restored, continue by copying
+ * the now present pte.
+ */
+ WARN_ON_ONCE(ret != -ENOENT);
}
/* copy_present_pte() will clear `*prealloc' if consumed */
ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
@@ -990,38 +1094,43 @@ again:
* will allocate page according to address). This
* could only happen if one pinned pte changed.
*/
- put_page(prealloc);
+ folio_put(prealloc);
prealloc = NULL;
}
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
- spin_unlock(src_ptl);
- pte_unmap(orig_src_pte);
+ pte_unmap_unlock(orig_src_pte, src_ptl);
add_mm_rss_vec(dst_mm, rss);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
- if (entry.val) {
+ if (ret == -EIO) {
+ VM_WARN_ON_ONCE(!entry.val);
if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
ret = -ENOMEM;
goto out;
}
entry.val = 0;
- } else if (ret) {
- WARN_ON_ONCE(ret != -EAGAIN);
+ } else if (ret == -EBUSY) {
+ goto out;
+ } else if (ret == -EAGAIN) {
prealloc = page_copy_prealloc(src_mm, src_vma, addr);
if (!prealloc)
return -ENOMEM;
- /* We've captured and resolved the error. Reset, try again. */
- ret = 0;
+ } else if (ret) {
+ VM_WARN_ON_ONCE(1);
}
+
+ /* We've captured and resolved the error. Reset, try again. */
+ ret = 0;
+
if (addr != end)
goto again;
out:
if (unlikely(prealloc))
- put_page(prealloc);
+ folio_put(prealloc);
return ret;
}
@@ -1045,8 +1154,8 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
|| pmd_devmap(*src_pmd)) {
int err;
VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
- err = copy_huge_pmd(dst_mm, src_mm,
- dst_pmd, src_pmd, addr, src_vma);
+ err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
+ addr, dst_vma, src_vma);
if (err == -ENOMEM)
return -ENOMEM;
if (!err)
@@ -1123,6 +1232,38 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
return 0;
}
+/*
+ * Return true if the vma needs to copy the pgtable during this fork(). Return
+ * false when we can speed up fork() by allowing lazy page faults later until
+ * when the child accesses the memory range.
+ */
+static bool
+vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
+{
+ /*
+ * Always copy pgtables when dst_vma has uffd-wp enabled even if it's
+	 * file-backed (e.g. shmem): when uffd-wp is enabled, the pgtable carries
+	 * uffd-wp protection information that can't be recovered from the page
+	 * cache, so skipping the copy would lose it.
+ */
+ if (userfaultfd_wp(dst_vma))
+ return true;
+
+ if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
+ return true;
+
+ if (src_vma->anon_vma)
+ return true;
+
+ /*
+ * Don't copy ptes where a page fault will fill them correctly. Fork
+ * becomes much lighter when there are big shared or private readonly
+ * mappings. The tradeoff is that copy_page_range is more efficient
+ * than faulting.
+ */
+ return false;
+}
+
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
@@ -1136,18 +1277,11 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
bool is_cow;
int ret;
- /*
- * Don't copy ptes where a page fault will fill them correctly.
- * Fork becomes much lighter when there are big shared or private
- * readonly mappings. The tradeoff is that copy_page_range is more
- * efficient than faulting.
- */
- if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
- !src_vma->anon_vma)
+ if (!vma_needs_copy(dst_vma, src_vma))
return 0;
if (is_vm_hugetlb_page(src_vma))
- return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);
+ return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
/*
@@ -1169,8 +1303,17 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
if (is_cow) {
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
- 0, src_vma, src_mm, addr, end);
+ 0, src_mm, addr, end);
mmu_notifier_invalidate_range_start(&range);
+ /*
+ * Disabling preemption is not needed for the write side, as
+ * the read side doesn't spin, but goes to the mmap_lock.
+ *
+ * Use the raw variant of the seqcount_t write API to avoid
+ * lockdep complaining about preemptibility.
+ */
+ mmap_assert_write_locked(src_mm);
+ raw_write_seqcount_begin(&src_mm->write_protect_seq);
}
ret = 0;
@@ -1182,16 +1325,72 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
continue;
if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
addr, next))) {
+ untrack_pfn_clear(dst_vma);
ret = -ENOMEM;
break;
}
} while (dst_pgd++, src_pgd++, addr = next, addr != end);
- if (is_cow)
+ if (is_cow) {
+ raw_write_seqcount_end(&src_mm->write_protect_seq);
mmu_notifier_invalidate_range_end(&range);
+ }
return ret;
}
+/* Whether we should zap all COWed (private) pages too */
+static inline bool should_zap_cows(struct zap_details *details)
+{
+ /* By default, zap all pages */
+ if (!details)
+ return true;
+
+ /* Or, we zap COWed pages only if the caller wants to */
+ return details->even_cows;
+}
+
+/* Decide whether we should zap this page, given its struct page pointer */
+static inline bool should_zap_page(struct zap_details *details, struct page *page)
+{
+ /* If we can make a decision without *page.. */
+ if (should_zap_cows(details))
+ return true;
+
+ /* E.g. the caller passes NULL for the case of a zero page */
+ if (!page)
+ return true;
+
+ /* Otherwise we should only zap non-anon pages */
+ return !PageAnon(page);
+}
+
+static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
+{
+ if (!details)
+ return false;
+
+ return details->zap_flags & ZAP_FLAG_DROP_MARKER;
+}
+
+/*
+ * Make sure a zapped (none) pte is replaced with an uffd-wp swap-special
+ * pte marker when necessary. Must be called with the pgtable lock held.
+ */
+static inline void
+zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte,
+ struct zap_details *details, pte_t pteval)
+{
+ /* Zap on anonymous always means dropping everything */
+ if (vma_is_anonymous(vma))
+ return;
+
+ if (zap_drop_file_uffd_wp(details))
+ return;
+
+ pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+}
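/*
 * Editor's sketch (not part of the patch): the two zap_details flavours used
 * later in this file.  unmap_vmas() zaps everything, COWed private pages and
 * uffd-wp markers included; unmap_mapping_folio() limits the zap to one
 * non-anon folio and leaves private copies in place.
 */
static void zap_details_sketch(struct folio *folio)
{
	struct zap_details zap_all = {
		.zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
		.even_cows = true,
	};
	struct zap_details zap_one_folio = {
		.even_cows = false,
		.single_folio = folio,
		.zap_flags = ZAP_FLAG_DROP_MARKER,
	};

	(void)zap_all;
	(void)zap_one_folio;
}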
+
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
@@ -1206,14 +1405,17 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
swp_entry_t entry;
tlb_change_page_size(tlb, PAGE_SIZE);
-again:
init_rss_vec(rss);
- start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
- pte = start_pte;
+ start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return addr;
+
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
do {
- pte_t ptent = *pte;
+ pte_t ptent = ptep_get(pte);
+ struct page *page;
+
if (pte_none(ptent))
continue;
@@ -1221,39 +1423,38 @@ again:
break;
if (pte_present(ptent)) {
- struct page *page;
+ unsigned int delay_rmap;
page = vm_normal_page(vma, addr, ptent);
- if (unlikely(details) && page) {
- /*
- * unmap_shared_mapping_pages() wants to
- * invalidate cache without truncating:
- * unmap shared but keep private pages.
- */
- if (details->check_mapping &&
- details->check_mapping != page_rmapping(page))
- continue;
- }
+ if (unlikely(!should_zap_page(details, page)))
+ continue;
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
tlb_remove_tlb_entry(tlb, pte, addr);
+ zap_install_uffd_wp_if_needed(vma, addr, pte, details,
+ ptent);
if (unlikely(!page))
continue;
+ delay_rmap = 0;
if (!PageAnon(page)) {
if (pte_dirty(ptent)) {
- force_flush = 1;
set_page_dirty(page);
+ if (tlb_delay_rmap(tlb)) {
+ delay_rmap = 1;
+ force_flush = 1;
+ }
}
- if (pte_young(ptent) &&
- likely(!(vma->vm_flags & VM_SEQ_READ)))
+ if (pte_young(ptent) && likely(vma_has_recency(vma)))
mark_page_accessed(page);
}
rss[mm_counter(page)]--;
- page_remove_rmap(page, false);
- if (unlikely(page_mapcount(page) < 0))
- print_bad_pte(vma, addr, ptent, page);
- if (unlikely(__tlb_remove_page(tlb, page))) {
+ if (!delay_rmap) {
+ page_remove_rmap(page, vma, false);
+ if (unlikely(page_mapcount(page) < 0))
+ print_bad_pte(vma, addr, ptent, page);
+ }
+ if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
force_flush = 1;
addr += PAGE_SIZE;
break;
@@ -1262,67 +1463,72 @@ again:
}
entry = pte_to_swp_entry(ptent);
- if (is_device_private_entry(entry)) {
- struct page *page = device_private_entry_to_page(entry);
-
- if (unlikely(details && details->check_mapping)) {
- /*
- * unmap_shared_mapping_pages() wants to
- * invalidate cache without truncating:
- * unmap shared but keep private pages.
- */
- if (details->check_mapping !=
- page_rmapping(page))
- continue;
- }
-
- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ if (is_device_private_entry(entry) ||
+ is_device_exclusive_entry(entry)) {
+ page = pfn_swap_entry_to_page(entry);
+ if (unlikely(!should_zap_page(details, page)))
+ continue;
+ /*
+ * Both device private/exclusive mappings should only
+			 * work with anonymous pages so far, so we don't need to
+			 * consider the uffd-wp bit when zapping. For more information,
+ * see zap_install_uffd_wp_if_needed().
+ */
+ WARN_ON_ONCE(!vma_is_anonymous(vma));
rss[mm_counter(page)]--;
- page_remove_rmap(page, false);
+ if (is_device_private_entry(entry))
+ page_remove_rmap(page, vma, false);
put_page(page);
- continue;
- }
-
- /* If details->check_mapping, we leave swap entries. */
- if (unlikely(details))
- continue;
-
- if (!non_swap_entry(entry))
+ } else if (!non_swap_entry(entry)) {
+ /* Genuine swap entry, hence a private anon page */
+ if (!should_zap_cows(details))
+ continue;
rss[MM_SWAPENTS]--;
- else if (is_migration_entry(entry)) {
- struct page *page;
-
- page = migration_entry_to_page(entry);
+ if (unlikely(!free_swap_and_cache(entry)))
+ print_bad_pte(vma, addr, ptent, NULL);
+ } else if (is_migration_entry(entry)) {
+ page = pfn_swap_entry_to_page(entry);
+ if (!should_zap_page(details, page))
+ continue;
rss[mm_counter(page)]--;
+ } else if (pte_marker_entry_uffd_wp(entry)) {
+ /*
+ * For anon: always drop the marker; for file: only
+ * drop the marker if explicitly requested.
+ */
+ if (!vma_is_anonymous(vma) &&
+ !zap_drop_file_uffd_wp(details))
+ continue;
+ } else if (is_hwpoison_entry(entry) ||
+ is_swapin_error_entry(entry)) {
+ if (!should_zap_cows(details))
+ continue;
+ } else {
+ /* We should have covered all the swap entry types */
+ WARN_ON_ONCE(1);
}
- if (unlikely(!free_swap_and_cache(entry)))
- print_bad_pte(vma, addr, ptent, NULL);
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
} while (pte++, addr += PAGE_SIZE, addr != end);
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
/* Do the actual TLB flush before dropping ptl */
- if (force_flush)
+ if (force_flush) {
tlb_flush_mmu_tlbonly(tlb);
+ tlb_flush_rmaps(tlb, vma);
+ }
pte_unmap_unlock(start_pte, ptl);
/*
* If we forced a TLB flush (either due to running out of
* batch buffers or because we needed to flush dirty TLB
* entries before releasing the ptl), free the batched
- * memory too. Restart if we didn't do everything.
+ * memory too. Come back again if we didn't do everything.
*/
- if (force_flush) {
- force_flush = 0;
+ if (force_flush)
tlb_flush_mmu(tlb);
- }
-
- if (addr != end) {
- cond_resched();
- goto again;
- }
return addr;
}
@@ -1341,23 +1547,30 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
__split_huge_pmd(vma, pmd, addr, false, NULL);
- else if (zap_huge_pmd(tlb, vma, pmd, addr))
- goto next;
+ else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
+ addr = next;
+ continue;
+ }
/* fall through */
+ } else if (details && details->single_folio &&
+ folio_test_pmd_mappable(details->single_folio) &&
+ next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
+ spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
+ /*
+ * Take and drop THP pmd lock so that we cannot return
+ * prematurely, while zap_huge_pmd() has cleared *pmd,
+ * but not yet decremented compound_mapcount().
+ */
+ spin_unlock(ptl);
}
- /*
- * Here there can be other concurrent MADV_DONTNEED or
- * trans huge page faults running, and if the pmd is
- * none or trans huge it can change under us. This is
- * because MADV_DONTNEED holds the mmap_lock in read
- * mode.
- */
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
- goto next;
- next = zap_pte_range(tlb, vma, pmd, addr, next, details);
-next:
- cond_resched();
- } while (pmd++, addr = next, addr != end);
+ if (pmd_none(*pmd)) {
+ addr = next;
+ continue;
+ }
+ addr = zap_pte_range(tlb, vma, pmd, addr, next, details);
+ if (addr != next)
+ pmd--;
+ } while (pmd++, cond_resched(), addr != end);
return addr;
}
@@ -1434,7 +1647,7 @@ void unmap_page_range(struct mmu_gather *tlb,
static void unmap_single_vma(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr,
- struct zap_details *details)
+ struct zap_details *details, bool mm_wr_locked)
{
unsigned long start = max(vma->vm_start, start_addr);
unsigned long end;
@@ -1449,7 +1662,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
uprobe_munmap(vma, start, end);
if (unlikely(vma->vm_flags & VM_PFNMAP))
- untrack_pfn(vma, 0, 0);
+ untrack_pfn(vma, 0, 0, mm_wr_locked);
if (start != end) {
if (unlikely(is_vm_hugetlb_page(vma))) {
@@ -1465,9 +1678,10 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* safe to do nothing in this case.
*/
if (vma->vm_file) {
- i_mmap_lock_write(vma->vm_file->f_mapping);
- __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
+ zap_flags_t zap_flags = details ?
+ details->zap_flags : 0;
+ __unmap_hugepage_range_final(tlb, vma, start, end,
+ NULL, zap_flags);
}
} else
unmap_page_range(tlb, vma, start, end, details);
@@ -1477,6 +1691,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
* @tlb: address of the caller's struct mmu_gather
+ * @mt: the maple tree
* @vma: the starting vma
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
@@ -1492,44 +1707,26 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
-void unmap_vmas(struct mmu_gather *tlb,
+void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
struct vm_area_struct *vma, unsigned long start_addr,
- unsigned long end_addr)
+ unsigned long end_addr, bool mm_wr_locked)
{
struct mmu_notifier_range range;
+ struct zap_details details = {
+ .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
+ /* Careful - we need to zap private pages too! */
+ .even_cows = true,
+ };
+ MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
- mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
start_addr, end_addr);
mmu_notifier_invalidate_range_start(&range);
- for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
- unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
- mmu_notifier_invalidate_range_end(&range);
-}
-
-/**
- * zap_page_range - remove user pages in a given range
- * @vma: vm_area_struct holding the applicable pages
- * @start: starting address of pages to zap
- * @size: number of bytes to zap
- *
- * Caller must protect the VMA list
- */
-void zap_page_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long size)
-{
- struct mmu_notifier_range range;
- struct mmu_gather tlb;
-
- lru_add_drain();
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- start, start + size);
- tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
- update_hiwater_rss(vma->vm_mm);
- mmu_notifier_invalidate_range_start(&range);
- for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
- unmap_single_vma(&tlb, vma, start, range.end, NULL);
+ do {
+ unmap_single_vma(tlb, vma, start_addr, end_addr, &details,
+ mm_wr_locked);
+ } while ((vma = mas_find(&mas, end_addr - 1)) != NULL);
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb, start, range.end);
}
/**
@@ -1541,21 +1738,29 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
*
* The range must fit into one VMA.
*/
-static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
+ const unsigned long end = address + size;
struct mmu_notifier_range range;
struct mmu_gather tlb;
lru_add_drain();
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- address, address + size);
- tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
+ address, end);
+ if (is_vm_hugetlb_page(vma))
+ adjust_range_if_pmd_sharing_possible(vma, &range.start,
+ &range.end);
+ tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
- unmap_single_vma(&tlb, vma, address, range.end, details);
+ /*
+ * unmap 'address-end' not 'range.start-range.end' as range
+ * could have been expanded for hugetlb pmd sharing.
+ */
+ unmap_single_vma(&tlb, vma, address, end, details, false);
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb, address, range.end);
+ tlb_finish_mmu(&tlb);
}
/**
@@ -1572,7 +1777,7 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size)
{
- if (address < vma->vm_start || address + size > vma->vm_end ||
+ if (!range_in_vma(vma, address, address + size) ||
!(vma->vm_flags & VM_PFNMAP))
return;
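/*
 * Editor's sketch (not part of the patch, names are hypothetical): the
 * typical driver-side use of zap_vma_ptes() on a VM_PFNMAP mapping the
 * driver set up earlier, e.g. when the backing device memory goes away.
 */
static void mydrv_revoke_mapping(struct vm_area_struct *vma)
{
	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
}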
@@ -1620,16 +1825,16 @@ static int validate_page_before_insert(struct page *page)
return 0;
}
-static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
+static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
- if (!pte_none(*pte))
+ if (!pte_none(ptep_get(pte)))
return -EBUSY;
/* Ok, finally just insert the thing.. */
get_page(page);
- inc_mm_counter_fast(mm, mm_counter_file(page));
- page_add_file_rmap(page, false);
- set_pte_at(mm, addr, pte, mk_pte(page, prot));
+ inc_mm_counter(vma->vm_mm, mm_counter_file(page));
+ page_add_file_rmap(page, vma, false);
+ set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
return 0;
}
@@ -1643,7 +1848,6 @@ static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page, pgprot_t prot)
{
- struct mm_struct *mm = vma->vm_mm;
int retval;
pte_t *pte;
spinlock_t *ptl;
@@ -1652,17 +1856,17 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
if (retval)
goto out;
retval = -ENOMEM;
- pte = get_locked_pte(mm, addr, &ptl);
+ pte = get_locked_pte(vma->vm_mm, addr, &ptl);
if (!pte)
goto out;
- retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
+ retval = insert_page_into_pte_locked(vma, pte, addr, page, prot);
pte_unmap_unlock(pte, ptl);
out:
return retval;
}
#ifdef pte_index
-static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
+static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
int err;
@@ -1672,7 +1876,7 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
err = validate_page_before_insert(page);
if (err)
return err;
- return insert_page_into_pte_locked(mm, pte, addr, page, prot);
+ return insert_page_into_pte_locked(vma, pte, addr, page, prot);
}
/* insert_pages() amortizes the cost of spinlock operations
@@ -1708,8 +1912,12 @@ more:
const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
+ if (!start_pte) {
+ ret = -EFAULT;
+ goto out;
+ }
for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
- int err = insert_page_in_batch_locked(mm, pte,
+ int err = insert_page_in_batch_locked(vma, pte,
addr, pages[curr_page_idx], prot);
if (unlikely(err)) {
pte_unmap_unlock(start_pte, pte_lock);
@@ -1759,7 +1967,7 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
if (!(vma->vm_flags & VM_MIXEDMAP)) {
BUG_ON(mmap_read_trylock(vma->vm_mm));
BUG_ON(vma->vm_flags & VM_PFNMAP);
- vma->vm_flags |= VM_MIXEDMAP;
+ vm_flags_set(vma, VM_MIXEDMAP);
}
/* Defer page refcount checking till we're about to map that page. */
return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
@@ -1817,7 +2025,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
if (!(vma->vm_flags & VM_MIXEDMAP)) {
BUG_ON(mmap_read_trylock(vma->vm_mm));
BUG_ON(vma->vm_flags & VM_PFNMAP);
- vma->vm_flags |= VM_MIXEDMAP;
+ vm_flags_set(vma, VM_MIXEDMAP);
}
return insert_page(vma, addr, page, vma->vm_page_prot);
}
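/*
 * Editor's sketch (not part of the patch, identifiers are hypothetical): a
 * driver ->mmap() helper inserting individually allocated pages with
 * vm_insert_page(), which sets VM_MIXEDMAP on the vma as shown above.
 */
static int mydrv_mmap_pages(struct vm_area_struct *vma, struct page **pages,
			    unsigned long npages)
{
	unsigned long addr = vma->vm_start;
	unsigned long i;
	int err;

	for (i = 0; i < npages && addr < vma->vm_end; i++, addr += PAGE_SIZE) {
		err = vm_insert_page(vma, addr, pages[i]);
		if (err)
			return err;
	}
	return 0;
}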
@@ -1914,7 +2122,8 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
return VM_FAULT_OOM;
- if (!pte_none(*pte)) {
+ entry = ptep_get(pte);
+ if (!pte_none(entry)) {
if (mkwrite) {
/*
* For read faults on private mappings the PFN passed
@@ -1926,11 +2135,11 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
* allocation and mapping invalidation so just skip the
* update.
*/
- if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
- WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
+ if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) {
+ WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
goto out_unlock;
}
- entry = pte_mkyoung(*pte);
+ entry = pte_mkyoung(entry);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (ptep_set_access_flags(vma, addr, pte, entry, 1))
update_mmu_cache(vma, addr, pte);
@@ -1972,8 +2181,20 @@ out_unlock:
* vmf_insert_pfn_prot should only be used if using multiple VMAs is
* impractical.
*
- * See vmf_insert_mixed_prot() for a discussion of the implication of using
- * a value of @pgprot different from that of @vma->vm_page_prot.
+ * pgprot typically only differs from @vma->vm_page_prot when drivers set
+ * caching- and encryption bits different than those of @vma->vm_page_prot,
+ * because the caching- or encryption mode may not be known at mmap() time.
+ *
+ * This is ok as long as @vma->vm_page_prot is not used by the core vm
+ * to set caching and encryption bits for those vmas (except for COW pages).
+ * This is ensured by core vm only modifying these page table entries using
+ * functions that don't touch caching- or encryption bits, using pte_modify()
+ * if needed. (See for example mprotect()).
+ *
+ * Also when new page-table entries are created, this is only done using the
+ * fault() callback, and never using the value of vma->vm_page_prot,
+ * except for page-table entries that point to anonymous pages as the result
+ * of COW.
*
* Context: Process context. May allocate using %GFP_KERNEL.
* Return: vm_fault_t value.
@@ -2048,9 +2269,9 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
}
static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
- unsigned long addr, pfn_t pfn, pgprot_t pgprot,
- bool mkwrite)
+ unsigned long addr, pfn_t pfn, bool mkwrite)
{
+ pgprot_t pgprot = vma->vm_page_prot;
int err;
BUG_ON(!vm_mixed_ok(vma, pfn));
@@ -2093,43 +2314,10 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
return VM_FAULT_NOPAGE;
}
-/**
- * vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot
- * @vma: user vma to map to
- * @addr: target user address of this page
- * @pfn: source kernel pfn
- * @pgprot: pgprot flags for the inserted page
- *
- * This is exactly like vmf_insert_mixed(), except that it allows drivers
- * to override pgprot on a per-page basis.
- *
- * Typically this function should be used by drivers to set caching- and
- * encryption bits different than those of @vma->vm_page_prot, because
- * the caching- or encryption mode may not be known at mmap() time.
- * This is ok as long as @vma->vm_page_prot is not used by the core vm
- * to set caching and encryption bits for those vmas (except for COW pages).
- * This is ensured by core vm only modifying these page table entries using
- * functions that don't touch caching- or encryption bits, using pte_modify()
- * if needed. (See for example mprotect()).
- * Also when new page-table entries are created, this is only done using the
- * fault() callback, and never using the value of vma->vm_page_prot,
- * except for page-table entries that point to anonymous pages as the result
- * of COW.
- *
- * Context: Process context. May allocate using %GFP_KERNEL.
- * Return: vm_fault_t value.
- */
-vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn, pgprot_t pgprot)
-{
- return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
-}
-EXPORT_SYMBOL(vmf_insert_mixed_prot);
-
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn)
{
- return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
+ return __vm_insert_mixed(vma, addr, pfn, false);
}
EXPORT_SYMBOL(vmf_insert_mixed);
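/*
 * Editor's sketch (not part of the patch, names are hypothetical): after this
 * change the pgprot always comes from vma->vm_page_prot, so a fault handler
 * simply passes the pfn; pfn_to_pfn_t() is assumed from <linux/pfn_t.h>.
 */
static vm_fault_t mydrv_fault(struct vm_fault *vmf, unsigned long pfn)
{
	return vmf_insert_mixed(vmf->vma, vmf->address, pfn_to_pfn_t(pfn));
}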
@@ -2141,7 +2329,7 @@ EXPORT_SYMBOL(vmf_insert_mixed);
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
unsigned long addr, pfn_t pfn)
{
- return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
+ return __vm_insert_mixed(vma, addr, pfn, true);
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
@@ -2154,16 +2342,16 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
{
- pte_t *pte;
+ pte_t *pte, *mapped_pte;
spinlock_t *ptl;
int err = 0;
- pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+ mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
arch_enter_lazy_mmu_mode();
do {
- BUG_ON(!pte_none(*pte));
+ BUG_ON(!pte_none(ptep_get(pte)));
if (!pfn_modify_allowed(pfn, prot)) {
err = -EACCES;
break;
@@ -2172,7 +2360,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(pte - 1, ptl);
+ pte_unmap_unlock(mapped_pte, ptl);
return err;
}
@@ -2243,26 +2431,17 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
return 0;
}
-/**
- * remap_pfn_range - remap kernel memory to userspace
- * @vma: user vma to map to
- * @addr: target page aligned user address to start at
- * @pfn: page frame number of kernel physical memory address
- * @size: size of mapping area
- * @prot: page protection flags for this mapping
- *
- * Note: this is only safe if the mm semaphore is held when called.
- *
- * Return: %0 on success, negative error code otherwise.
+/*
+ * Variant of remap_pfn_range that does not call track_pfn_remap. The caller
+ * must have pre-validated the caching bits of the pgprot_t.
*/
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t prot)
+int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
{
pgd_t *pgd;
unsigned long next;
unsigned long end = addr + PAGE_ALIGN(size);
struct mm_struct *mm = vma->vm_mm;
- unsigned long remap_pfn = pfn;
int err;
if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
@@ -2292,11 +2471,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
vma->vm_pgoff = pfn;
}
- err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
- if (err)
- return -EINVAL;
-
- vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
+ vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
BUG_ON(addr >= end);
pfn -= addr >> PAGE_SHIFT;
@@ -2307,12 +2482,36 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
err = remap_p4d_range(mm, pgd, addr, next,
pfn + (addr >> PAGE_SHIFT), prot);
if (err)
- break;
+ return err;
} while (pgd++, addr = next, addr != end);
+ return 0;
+}
+
+/**
+ * remap_pfn_range - remap kernel memory to userspace
+ * @vma: user vma to map to
+ * @addr: target page aligned user address to start at
+ * @pfn: page frame number of kernel physical memory address
+ * @size: size of mapping area
+ * @prot: page protection flags for this mapping
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ int err;
+
+ err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
if (err)
- untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
+ return -EINVAL;
+ err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
+ if (err)
+ untrack_pfn(vma, pfn, PAGE_ALIGN(size), true);
return err;
}
EXPORT_SYMBOL(remap_pfn_range);
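/*
 * Editor's sketch (not part of the patch, identifiers are hypothetical): the
 * classic use of remap_pfn_range() from a driver's ->mmap(), mapping a
 * contiguous physical region starting at mydrv_phys_base into userspace.
 */
static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	return remap_pfn_range(vma, vma->vm_start,
			       mydrv_phys_base >> PAGE_SHIFT,
			       size, vma->vm_page_prot);
}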
@@ -2371,39 +2570,41 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
pte_fn_t fn, void *data, bool create,
pgtbl_mod_mask *mask)
{
- pte_t *pte;
+ pte_t *pte, *mapped_pte;
int err = 0;
spinlock_t *ptl;
if (create) {
- pte = (mm == &init_mm) ?
+ mapped_pte = pte = (mm == &init_mm) ?
pte_alloc_kernel_track(pmd, addr, mask) :
pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
} else {
- pte = (mm == &init_mm) ?
+ mapped_pte = pte = (mm == &init_mm) ?
pte_offset_kernel(pmd, addr) :
pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return -EINVAL;
}
- BUG_ON(pmd_huge(*pmd));
-
arch_enter_lazy_mmu_mode();
- do {
- if (create || !pte_none(*pte)) {
- err = fn(pte++, addr, data);
- if (err)
- break;
- }
- } while (addr += PAGE_SIZE, addr != end);
+ if (fn) {
+ do {
+ if (create || !pte_none(ptep_get(pte))) {
+ err = fn(pte++, addr, data);
+ if (err)
+ break;
+ }
+ } while (addr += PAGE_SIZE, addr != end);
+ }
*mask |= PGTBL_PTE_MODIFIED;
arch_leave_lazy_mmu_mode();
if (mm != &init_mm)
- pte_unmap_unlock(pte-1, ptl);
+ pte_unmap_unlock(mapped_pte, ptl);
return err;
}
@@ -2427,13 +2628,21 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
}
do {
next = pmd_addr_end(addr, end);
- if (create || !pmd_none_or_clear_bad(pmd)) {
- err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
- create, mask);
- if (err)
- break;
+ if (pmd_none(*pmd) && !create)
+ continue;
+ if (WARN_ON_ONCE(pmd_leaf(*pmd)))
+ return -EINVAL;
+ if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
+ if (!create)
+ continue;
+ pmd_clear_bad(pmd);
}
+ err = apply_to_pte_range(mm, pmd, addr, next,
+ fn, data, create, mask);
+ if (err)
+ break;
} while (pmd++, addr = next, addr != end);
+
return err;
}
@@ -2455,13 +2664,21 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
}
do {
next = pud_addr_end(addr, end);
- if (create || !pud_none_or_clear_bad(pud)) {
- err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
- create, mask);
- if (err)
- break;
+ if (pud_none(*pud) && !create)
+ continue;
+ if (WARN_ON_ONCE(pud_leaf(*pud)))
+ return -EINVAL;
+ if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
+ if (!create)
+ continue;
+ pud_clear_bad(pud);
}
+ err = apply_to_pmd_range(mm, pud, addr, next,
+ fn, data, create, mask);
+ if (err)
+ break;
} while (pud++, addr = next, addr != end);
+
return err;
}
@@ -2483,13 +2700,21 @@ static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
}
do {
next = p4d_addr_end(addr, end);
- if (create || !p4d_none_or_clear_bad(p4d)) {
- err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
- create, mask);
- if (err)
- break;
+ if (p4d_none(*p4d) && !create)
+ continue;
+ if (WARN_ON_ONCE(p4d_leaf(*p4d)))
+ return -EINVAL;
+ if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
+ if (!create)
+ continue;
+ p4d_clear_bad(p4d);
}
+ err = apply_to_pud_range(mm, p4d, addr, next,
+ fn, data, create, mask);
+ if (err)
+ break;
} while (p4d++, addr = next, addr != end);
+
return err;
}
@@ -2509,9 +2734,17 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
- if (!create && pgd_none_or_clear_bad(pgd))
+ if (pgd_none(*pgd) && !create)
continue;
- err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
+ if (WARN_ON_ONCE(pgd_leaf(*pgd)))
+ return -EINVAL;
+ if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
+ if (!create)
+ continue;
+ pgd_clear_bad(pgd);
+ }
+ err = apply_to_p4d_range(mm, pgd, addr, next,
+ fn, data, create, &mask);
if (err)
break;
} while (pgd++, addr = next, addr != end);
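/*
 * Editor's sketch (not part of the patch): a minimal pte_fn_t callback used
 * with apply_to_existing_page_range() (exported in this file), which only
 * visits already-populated PTEs.  With the new pmd/pud/p4d leaf checks above,
 * a huge-mapped range is rejected with -EINVAL instead of being walked.
 */
static int count_pte(pte_t *ptep, unsigned long addr, void *data)
{
	unsigned long *count = data;

	(*count)++;
	return 0;
}

static unsigned long count_mapped_ptes(unsigned long addr, unsigned long size)
{
	unsigned long count = 0;

	apply_to_existing_page_range(&init_mm, addr, size, count_pte, &count);
	return count;
}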
@@ -2555,36 +2788,43 @@ EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
* proceeding (but do_wp_page is only called after already making such a check;
* and do_anonymous_page can safely check later on).
*/
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
- pte_t *page_table, pte_t orig_pte)
+static inline int pte_unmap_same(struct vm_fault *vmf)
{
int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
if (sizeof(pte_t) > sizeof(unsigned long)) {
- spinlock_t *ptl = pte_lockptr(mm, pmd);
- spin_lock(ptl);
- same = pte_same(*page_table, orig_pte);
- spin_unlock(ptl);
+ spin_lock(vmf->ptl);
+ same = pte_same(ptep_get(vmf->pte), vmf->orig_pte);
+ spin_unlock(vmf->ptl);
}
#endif
- pte_unmap(page_table);
+ pte_unmap(vmf->pte);
+ vmf->pte = NULL;
return same;
}
-static inline bool cow_user_page(struct page *dst, struct page *src,
- struct vm_fault *vmf)
+/*
+ * Return:
+ *	0:		copy succeeded
+ *	-EHWPOISON:	copy failed due to hwpoison in the source page
+ *	-EAGAIN:	copy failed (some other reason)
+ */
+static inline int __wp_page_copy_user(struct page *dst, struct page *src,
+ struct vm_fault *vmf)
{
- bool ret;
+ int ret;
void *kaddr;
void __user *uaddr;
- bool locked = false;
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = vmf->address;
if (likely(src)) {
- copy_user_highpage(dst, src, addr, vma);
- return true;
+ if (copy_mc_user_highpage(dst, src, addr, vma)) {
+ memory_failure_queue(page_to_pfn(src), 0);
+ return -EHWPOISON;
+ }
+ return 0;
}
/*
@@ -2600,18 +2840,19 @@ static inline bool cow_user_page(struct page *dst, struct page *src,
* On architectures with software "accessed" bits, we would
* take a double page fault, so mark it accessed here.
*/
- if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
+ vmf->pte = NULL;
+ if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
pte_t entry;
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
- locked = true;
- if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
/*
 * Another thread has already handled the fault;
 * just update the local TLB.
*/
- update_mmu_tlb(vma, addr, vmf->pte);
- ret = false;
+ if (vmf->pte)
+ update_mmu_tlb(vma, addr, vmf->pte);
+ ret = -EAGAIN;
goto pte_unlock;
}
@@ -2627,16 +2868,16 @@ static inline bool cow_user_page(struct page *dst, struct page *src,
* zeroes.
*/
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
- if (locked)
+ if (vmf->pte)
goto warn;
/* Re-validate under PTL if the page is still mapped */
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
- locked = true;
- if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
/* The PTE changed under us, update local tlb */
- update_mmu_tlb(vma, addr, vmf->pte);
- ret = false;
+ if (vmf->pte)
+ update_mmu_tlb(vma, addr, vmf->pte);
+ ret = -EAGAIN;
goto pte_unlock;
}
@@ -2655,10 +2896,10 @@ warn:
}
}
- ret = true;
+ ret = 0;
pte_unlock:
- if (locked)
+ if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
kunmap_atomic(kaddr);
flush_dcache_page(dst);
@@ -2758,7 +2999,7 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
balance_dirty_pages_ratelimited(mapping);
if (fpin) {
fput(fpin);
- return VM_FAULT_RETRY;
+ return VM_FAULT_COMPLETED;
}
}
@@ -2779,6 +3020,10 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct page *page = vmf->page;
pte_t entry;
+
+ VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
+ VM_BUG_ON(page && PageAnon(page) && !PageAnonExclusive(page));
+
/*
* Clear the pages cpupid information as the existing
* information potentially belongs to a now completely
@@ -2797,7 +3042,8 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
}
/*
- * Handle the case of a page which we actually need to copy to a new page.
+ * Handle the case of a page which we actually need to copy to a new page,
+ * either due to COW or unsharing.
*
* Called with mmap_lock locked and the old page referenced, but
* without the ptl held.
@@ -2814,49 +3060,59 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
*/
static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
+ const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
- struct page *old_page = vmf->page;
- struct page *new_page = NULL;
+ struct folio *old_folio = NULL;
+ struct folio *new_folio = NULL;
pte_t entry;
int page_copied = 0;
struct mmu_notifier_range range;
+ int ret;
+
+ delayacct_wpcopy_start();
+ if (vmf->page)
+ old_folio = page_folio(vmf->page);
if (unlikely(anon_vma_prepare(vma)))
goto oom;
if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
- new_page = alloc_zeroed_user_highpage_movable(vma,
- vmf->address);
- if (!new_page)
+ new_folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
+ if (!new_folio)
goto oom;
} else {
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
- vmf->address);
- if (!new_page)
+ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
+ vmf->address, false);
+ if (!new_folio)
goto oom;
- if (!cow_user_page(new_page, old_page, vmf)) {
+ ret = __wp_page_copy_user(&new_folio->page, vmf->page, vmf);
+ if (ret) {
/*
 * COW failed: if the fault was resolved by another
 * thread, that's fine. If not, userspace will re-fault
 * on the same address and we will handle the fault on
 * the second attempt.
+ * The -EHWPOISON case will not be retried.
*/
- put_page(new_page);
- if (old_page)
- put_page(old_page);
- return 0;
+ folio_put(new_folio);
+ if (old_folio)
+ folio_put(old_folio);
+
+ delayacct_wpcopy_end();
+ return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
}
+ kmsan_copy_page_meta(&new_folio->page, vmf->page);
}
- if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
+ if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL))
goto oom_free_new;
- cgroup_throttle_swaprate(new_page, GFP_KERNEL);
+ folio_throttle_swaprate(new_folio, GFP_KERNEL);
- __SetPageUptodate(new_page);
+ __folio_mark_uptodate(new_folio);
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
vmf->address & PAGE_MASK,
(vmf->address & PAGE_MASK) + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
@@ -2865,37 +3121,46 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* Re-check the pte - we dropped the lock
*/
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
- if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
- if (old_page) {
- if (!PageAnon(old_page)) {
- dec_mm_counter_fast(mm,
- mm_counter_file(old_page));
- inc_mm_counter_fast(mm, MM_ANONPAGES);
+ if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+ if (old_folio) {
+ if (!folio_test_anon(old_folio)) {
+ dec_mm_counter(mm, mm_counter_file(&old_folio->page));
+ inc_mm_counter(mm, MM_ANONPAGES);
}
} else {
- inc_mm_counter_fast(mm, MM_ANONPAGES);
+ inc_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
- entry = mk_pte(new_page, vma->vm_page_prot);
+ entry = mk_pte(&new_folio->page, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (unlikely(unshare)) {
+ if (pte_soft_dirty(vmf->orig_pte))
+ entry = pte_mksoft_dirty(entry);
+ if (pte_uffd_wp(vmf->orig_pte))
+ entry = pte_mkuffd_wp(entry);
+ } else {
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ }
+
/*
* Clear the pte entry and flush it first, before updating the
- * pte with the new entry. This will avoid a race condition
- * seen in the presence of one thread doing SMC and another
- * thread doing COW.
+ * pte with the new entry, to keep TLBs on different CPUs in
+ * sync. This code used to set the new PTE then flush TLBs, but
+ * that left a window where the new PTE could be loaded into
+ * some TLBs while the old PTE remains in others.
*/
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
- page_add_new_anon_rmap(new_page, vma, vmf->address, false);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ folio_add_new_anon_rmap(new_folio, vma, vmf->address);
+ folio_add_lru_vma(new_folio, vma);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
* new page to be mapped directly into the secondary page table.
*/
+ BUG_ON(unshare && pte_write(entry));
set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
update_mmu_cache(vma, vmf->address, vmf->pte);
- if (old_page) {
+ if (old_folio) {
/*
* Only after switching the pte to the new page may
* we remove the mapcount here. Otherwise another
@@ -2918,44 +3183,41 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* mapcount is visible. So transitively, TLBs to
* old page will be flushed before it can be reused.
*/
- page_remove_rmap(old_page, false);
+ page_remove_rmap(vmf->page, vma, false);
}
/* Free the old page.. */
- new_page = old_page;
+ new_folio = old_folio;
page_copied = 1;
- } else {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ } else if (vmf->pte) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
}
- if (new_page)
- put_page(new_page);
-
- pte_unmap_unlock(vmf->pte, vmf->ptl);
/*
* No need to double call mmu_notifier->invalidate_range() callback as
* the above ptep_clear_flush_notify() did already call it.
*/
mmu_notifier_invalidate_range_only_end(&range);
- if (old_page) {
- /*
- * Don't let another task, with possibly unlocked vma,
- * keep the mlocked page.
- */
- if (page_copied && (vma->vm_flags & VM_LOCKED)) {
- lock_page(old_page); /* LRU manipulation */
- if (PageMlocked(old_page))
- munlock_vma_page(old_page);
- unlock_page(old_page);
- }
- put_page(old_page);
+
+ if (new_folio)
+ folio_put(new_folio);
+ if (old_folio) {
+ if (page_copied)
+ free_swap_cache(&old_folio->page);
+ folio_put(old_folio);
}
- return page_copied ? VM_FAULT_WRITE : 0;
+
+ delayacct_wpcopy_end();
+ return 0;
oom_free_new:
- put_page(new_page);
+ folio_put(new_folio);
oom:
- if (old_page)
- put_page(old_page);
+ if (old_folio)
+ folio_put(old_folio);
+
+ delayacct_wpcopy_end();
return VM_FAULT_OOM;
}
@@ -2972,7 +3234,7 @@ oom:
* The function expects the page to be locked or other protection against
* concurrent faults / writeback (such as DAX radix tree locks).
*
- * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before
+ * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
* we acquired PTE lock.
*/
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
@@ -2980,11 +3242,13 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
+ if (!vmf->pte)
+ return VM_FAULT_NOPAGE;
/*
* We might have raced with another page fault while we released the
* pte_offset_map_lock.
*/
- if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+ if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) {
update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return VM_FAULT_NOPAGE;
@@ -3012,14 +3276,14 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
return finish_mkwrite_fault(vmf);
}
wp_page_reuse(vmf);
- return VM_FAULT_WRITE;
+ return 0;
}
static vm_fault_t wp_page_shared(struct vm_fault *vmf)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
- vm_fault_t ret = VM_FAULT_WRITE;
+ vm_fault_t ret = 0;
get_page(vmf->page);
@@ -3050,18 +3314,22 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
}
/*
- * This routine handles present pages, when users try to write
- * to a shared page. It is done by copying the page to a new address
- * and decrementing the shared-page counter for the old page.
+ * This routine handles present pages, when
+ * * users try to write to a shared page (FAULT_FLAG_WRITE)
+ * * GUP wants to take a R/O pin on a possibly shared anonymous page
+ * (FAULT_FLAG_UNSHARE)
+ *
+ * It is done by copying the page to a new address and decrementing the
+ * shared-page counter for the old page.
*
* Note that this routine assumes that the protection checks have been
* done by the caller (the low-level page fault routine in most cases).
- * Thus we can safely just mark it writable once we've done any necessary
- * COW.
+ * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've
+ * done any necessary COW.
*
- * We also mark the page dirty at this point even though the page will
- * change only once the write actually happens. This avoids a few races,
- * and potentially makes it more efficient.
+ * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even
+ * though the page will change only once the write actually happens. This
+ * avoids a few races, and potentially makes it more efficient.
*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), with pte both mapped and locked.
@@ -3070,15 +3338,32 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
static vm_fault_t do_wp_page(struct vm_fault *vmf)
__releases(vmf->ptl)
{
+ const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
struct vm_area_struct *vma = vmf->vma;
+ struct folio *folio = NULL;
- if (userfaultfd_pte_wp(vma, *vmf->pte)) {
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- return handle_userfault(vmf, VM_UFFD_WP);
+ if (likely(!unshare)) {
+ if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return handle_userfault(vmf, VM_UFFD_WP);
+ }
+
+ /*
+ * Userfaultfd write-protect can defer flushes. Ensure the TLB
+ * is flushed in this case before copying.
+ */
+ if (unlikely(userfaultfd_wp(vmf->vma) &&
+ mm_tlb_flush_pending(vmf->vma->vm_mm)))
+ flush_tlb_page(vmf->vma, vmf->address);
}
vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
- if (!vmf->page) {
+
+ /*
+ * Shared mapping: we are guaranteed to have VM_WRITE and
+ * FAULT_FLAG_WRITE set at this point.
+ */
+ if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
* VM_PFNMAP VMA.
@@ -3086,49 +3371,78 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
* We should not cow pages in a shared writeable mapping.
* Just mark the pages writable and/or call ops->pfn_mkwrite.
*/
- if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
- (VM_WRITE|VM_SHARED))
+ if (!vmf->page)
return wp_pfn_shared(vmf);
-
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- return wp_page_copy(vmf);
+ return wp_page_shared(vmf);
}
+ if (vmf->page)
+ folio = page_folio(vmf->page);
+
/*
- * Take out anonymous pages first, anonymous shared vmas are
- * not dirty accountable.
+ * Private mapping: create an exclusive anonymous page copy if reuse
+ * is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
*/
- if (PageAnon(vmf->page)) {
- struct page *page = vmf->page;
+ if (folio && folio_test_anon(folio)) {
+ /*
+ * If the page is exclusive to this process we must reuse the
+ * page without further checks.
+ */
+ if (PageAnonExclusive(vmf->page))
+ goto reuse;
- /* PageKsm() doesn't necessarily raise the page refcount */
- if (PageKsm(page) || page_count(page) != 1)
+ /*
+ * We have to verify under folio lock: these early checks are
+ * just an optimization to avoid locking the folio and freeing
+ * the swapcache if there is little hope that we can reuse.
+ *
+ * KSM doesn't necessarily raise the folio refcount.
+ */
+ if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
+ goto copy;
+ if (!folio_test_lru(folio))
+ /*
+ * We cannot easily detect+handle references from
+ * remote LRU caches or references to LRU folios.
+ */
+ lru_add_drain();
+ if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
goto copy;
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto copy;
- if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
- unlock_page(page);
+ if (folio_test_swapcache(folio))
+ folio_free_swap(folio);
+ if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) {
+ folio_unlock(folio);
goto copy;
}
/*
- * Ok, we've got the only map reference, and the only
- * page count reference, and the page is locked,
- * it's dark out, and we're wearing sunglasses. Hit it.
+ * Ok, we've got the only folio reference from our mapping
+ * and the folio is locked, it's dark out, and we're wearing
+ * sunglasses. Hit it.
*/
- unlock_page(page);
+ page_move_anon_rmap(vmf->page, vma);
+ folio_unlock(folio);
+reuse:
+ if (unlikely(unshare)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
+ }
wp_page_reuse(vmf);
- return VM_FAULT_WRITE;
- } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
- (VM_WRITE|VM_SHARED))) {
- return wp_page_shared(vmf);
+ return 0;
}
copy:
/*
* Ok, we need to copy. Oh, well..
*/
- get_page(vmf->page);
+ if (folio)
+ folio_get(folio);
pte_unmap_unlock(vmf->pte, vmf->ptl);
+#ifdef CONFIG_KSM
+ if (folio && folio_test_ksm(folio))
+ count_vm_event(COW_KSM);
+#endif
return wp_page_copy(vmf);
}
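/*
 * Editor's sketch (not part of the patch): the userspace-visible effect of
 * the do_wp_page() write path after fork().  The child's first store hits a
 * write-protected present pte and gets its own copy; the parent's data stays
 * untouched.  Plain C, compiles with any libc.
 */
#if 0	/* illustrative userspace program, not kernel code */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	strcpy(p, "parent");
	if (fork() == 0) {
		strcpy(p, "child");	/* triggers the COW fault path */
		_exit(0);
	}
	wait(NULL);
	printf("%s\n", p);		/* still prints "parent" */
	return 0;
}
#endif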
@@ -3140,22 +3454,18 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
}
static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
+ pgoff_t first_index,
+ pgoff_t last_index,
struct zap_details *details)
{
struct vm_area_struct *vma;
pgoff_t vba, vea, zba, zea;
- vma_interval_tree_foreach(vma, root,
- details->first_index, details->last_index) {
-
+ vma_interval_tree_foreach(vma, root, first_index, last_index) {
vba = vma->vm_pgoff;
vea = vba + vma_pages(vma) - 1;
- zba = details->first_index;
- if (zba < vba)
- zba = vba;
- zea = details->last_index;
- if (zea > vea)
- zea = vea;
+ zba = max(first_index, vba);
+ zea = min(last_index, vea);
unmap_mapping_range_vma(vma,
((zba - vba) << PAGE_SHIFT) + vma->vm_start,
@@ -3165,6 +3475,40 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
}
/**
+ * unmap_mapping_folio() - Unmap single folio from processes.
+ * @folio: The locked folio to be unmapped.
+ *
+ * Unmap this folio from any userspace process which still has it mmaped.
+ * Typically, for efficiency, the range of nearby pages has already been
+ * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once
+ * truncation or invalidation holds the lock on a folio, it may find that
+ * the page has been remapped again: and then uses unmap_mapping_folio()
+ * to unmap it finally.
+ */
+void unmap_mapping_folio(struct folio *folio)
+{
+ struct address_space *mapping = folio->mapping;
+ struct zap_details details = { };
+ pgoff_t first_index;
+ pgoff_t last_index;
+
+ VM_BUG_ON(!folio_test_locked(folio));
+
+ first_index = folio->index;
+ last_index = folio->index + folio_nr_pages(folio) - 1;
+
+ details.even_cows = false;
+ details.single_folio = folio;
+ details.zap_flags = ZAP_FLAG_DROP_MARKER;
+
+ i_mmap_lock_read(mapping);
+ if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
+ unmap_mapping_range_tree(&mapping->i_mmap, first_index,
+ last_index, &details);
+ i_mmap_unlock_read(mapping);
+}
+
+/**
* unmap_mapping_pages() - Unmap pages from processes.
* @mapping: The address space containing pages to be unmapped.
* @start: Index of first page to be unmapped.
@@ -3180,18 +3524,20 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
pgoff_t nr, bool even_cows)
{
struct zap_details details = { };
+ pgoff_t first_index = start;
+ pgoff_t last_index = start + nr - 1;
- details.check_mapping = even_cows ? NULL : mapping;
- details.first_index = start;
- details.last_index = start + nr - 1;
- if (details.last_index < details.first_index)
- details.last_index = ULONG_MAX;
+ details.even_cows = even_cows;
+ if (last_index < first_index)
+ last_index = ULONG_MAX;
- i_mmap_lock_write(mapping);
+ i_mmap_lock_read(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
- unmap_mapping_range_tree(&mapping->i_mmap, &details);
- i_mmap_unlock_write(mapping);
+ unmap_mapping_range_tree(&mapping->i_mmap, first_index,
+ last_index, &details);
+ i_mmap_unlock_read(mapping);
}
+EXPORT_SYMBOL_GPL(unmap_mapping_pages);
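
Note the wrap-around guard above: if start + nr - 1 overflows past ULONG_MAX, the range is clamped so the unmap runs to the end of the mapping. A standalone sketch with hypothetical values:

#include <limits.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical request whose end index wraps around. */
	unsigned long start = 100, nr = ULONG_MAX;
	unsigned long first_index = start;
	unsigned long last_index = start + nr - 1;	/* wraps below start */

	if (last_index < first_index)
		last_index = ULONG_MAX;			/* unmap to the end */

	printf("%lu..%lu\n", first_index, last_index);
	return 0;
}
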
/**
* unmap_mapping_range - unmap the portion of all mmaps in the specified
@@ -3229,6 +3575,135 @@ void unmap_mapping_range(struct address_space *mapping,
EXPORT_SYMBOL(unmap_mapping_range);
/*
+ * Restore a potential device exclusive pte to a working pte entry
+ */
+static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
+{
+ struct folio *folio = page_folio(vmf->page);
+ struct vm_area_struct *vma = vmf->vma;
+ struct mmu_notifier_range range;
+
+ /*
+ * We need a reference to lock the folio because we don't hold
+ * the PTL so a racing thread can remove the device-exclusive
+ * entry and unmap it. If the folio is free the entry must
+ * have been removed already. If it happens to have already
+ * been re-allocated after being freed all we do is lock and
+ * unlock it.
+ */
+ if (!folio_try_get(folio))
+ return 0;
+
+ if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) {
+ folio_put(folio);
+ return VM_FAULT_RETRY;
+ }
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
+ vma->vm_mm, vmf->address & PAGE_MASK,
+ (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
+ mmu_notifier_invalidate_range_start(&range);
+
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
+ restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);
+
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ folio_unlock(folio);
+ folio_put(folio);
+
+ mmu_notifier_invalidate_range_end(&range);
+ return 0;
+}
+
+static inline bool should_try_to_free_swap(struct folio *folio,
+ struct vm_area_struct *vma,
+ unsigned int fault_flags)
+{
+ if (!folio_test_swapcache(folio))
+ return false;
+ if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) ||
+ folio_test_mlocked(folio))
+ return true;
+ /*
+ * If we want to map a page that's in the swapcache writable, we
+ * have to detect via the refcount if we're really the exclusive
+ * user. Try freeing the swapcache to get rid of the swapcache
+	 * reference only in case it's likely that we'll be the exclusive user.
+ */
+ return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
+ folio_ref_count(folio) == 2;
+}
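
The final test above treats a reference count of exactly two as "likely exclusive": one reference held by the fault handler plus the one held by the swapcache itself. A standalone model of the predicate, with an illustrative struct standing in for the folio state:

#include <stdbool.h>
#include <stdio.h>

struct swapped_folio {
	bool in_swapcache;
	bool ksm;
	bool mlocked_or_swap_full;	/* swap full, VM_LOCKED or mlocked folio */
	int  refcount;
};

static bool try_to_free_swap(const struct swapped_folio *f, bool write_fault)
{
	if (!f->in_swapcache)
		return false;
	if (f->mlocked_or_swap_full)
		return true;
	/* our reference + the swapcache's own reference == 2 -> likely exclusive */
	return write_fault && !f->ksm && f->refcount == 2;
}

int main(void)
{
	struct swapped_folio f = { .in_swapcache = true, .refcount = 2 };

	printf("%d\n", try_to_free_swap(&f, true));	/* 1 */
	printf("%d\n", try_to_free_swap(&f, false));	/* 0 */
	return 0;
}
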
+
+static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
+{
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (!vmf->pte)
+ return 0;
+ /*
+ * Be careful so that we will only recover a special uffd-wp pte into a
+ * none pte. Otherwise it means the pte could have changed, so retry.
+ *
+ * This should also cover the case where e.g. the pte changed
+ * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR.
+ * So is_pte_marker() check is not enough to safely drop the pte.
+ */
+ if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
+ pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
+}
+
+static vm_fault_t do_pte_missing(struct vm_fault *vmf)
+{
+ if (vma_is_anonymous(vmf->vma))
+ return do_anonymous_page(vmf);
+ else
+ return do_fault(vmf);
+}
+
+/*
+ * This is actually a page-missing access, but with uffd-wp special pte
+ * installed. It means this pte was wr-protected before being unmapped.
+ */
+static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
+{
+ /*
+ * Just in case there're leftover special ptes even after the region
+ * got unregistered - we can simply clear them.
+ */
+ if (unlikely(!userfaultfd_wp(vmf->vma)))
+ return pte_marker_clear(vmf);
+
+ return do_pte_missing(vmf);
+}
+
+static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
+{
+ swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
+ unsigned long marker = pte_marker_get(entry);
+
+ /*
+ * PTE markers should never be empty. If anything weird happened,
+ * the best thing to do is to kill the process along with its mm.
+ */
+ if (WARN_ON_ONCE(!marker))
+ return VM_FAULT_SIGBUS;
+
+ /* Higher priority than uffd-wp when data corrupted */
+ if (marker & PTE_MARKER_SWAPIN_ERROR)
+ return VM_FAULT_SIGBUS;
+
+ if (pte_marker_entry_uffd_wp(entry))
+ return pte_marker_handle_uffd_wp(vmf);
+
+ /* This is an unknown pte marker */
+ return VM_FAULT_SIGBUS;
+}
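
handle_pte_marker() gives swap-in error markers priority over uffd-wp markers, and treats an empty or unknown marker as a bug. A standalone sketch of that dispatch order; the marker bit values and result codes are illustrative, not the kernel's:

#include <stdio.h>

/* Illustrative marker bits; the kernel's actual values live in swapops.h. */
#define MARKER_UFFD_WP		(1UL << 0)
#define MARKER_SWAPIN_ERROR	(1UL << 1)

enum fault_result { FAULT_OK, FAULT_SIGBUS, FAULT_MISSING };

static enum fault_result handle_marker(unsigned long marker)
{
	if (!marker)
		return FAULT_SIGBUS;		/* empty markers are a bug */
	if (marker & MARKER_SWAPIN_ERROR)
		return FAULT_SIGBUS;		/* data corruption wins */
	if (marker & MARKER_UFFD_WP)
		return FAULT_MISSING;		/* handled as a missing page */
	return FAULT_SIGBUS;			/* unknown marker */
}

int main(void)
{
	printf("%d\n", handle_marker(MARKER_UFFD_WP | MARKER_SWAPIN_ERROR)); /* 1 = SIGBUS */
	printf("%d\n", handle_marker(MARKER_UFFD_WP));			      /* 2 = MISSING */
	return 0;
}
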
+
+/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with pte unmapped and unlocked.
@@ -3239,27 +3714,54 @@ EXPORT_SYMBOL(unmap_mapping_range);
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page = NULL, *swapcache;
+ struct folio *swapcache, *folio = NULL;
+ struct page *page;
+ struct swap_info_struct *si = NULL;
+ rmap_t rmap_flags = RMAP_NONE;
+ bool exclusive = false;
swp_entry_t entry;
pte_t pte;
int locked;
- int exclusive = 0;
vm_fault_t ret = 0;
void *shadow = NULL;
- if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
+ if (!pte_unmap_same(vmf))
goto out;
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ ret = VM_FAULT_RETRY;
+ goto out;
+ }
+
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
migration_entry_wait(vma->vm_mm, vmf->pmd,
vmf->address);
+ } else if (is_device_exclusive_entry(entry)) {
+ vmf->page = pfn_swap_entry_to_page(entry);
+ ret = remove_device_exclusive_entry(vmf);
} else if (is_device_private_entry(entry)) {
- vmf->page = device_private_entry_to_page(entry);
+ vmf->page = pfn_swap_entry_to_page(entry);
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (unlikely(!vmf->pte ||
+ !pte_same(ptep_get(vmf->pte),
+ vmf->orig_pte)))
+ goto unlock;
+
+ /*
+ * Get a page reference while we know the page can't be
+ * freed.
+ */
+ get_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
+ put_page(vmf->page);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
+ } else if (is_pte_marker_entry(entry)) {
+ ret = handle_pte_marker(vmf);
} else {
print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
ret = VM_FAULT_SIGBUS;
@@ -3267,59 +3769,64 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out;
}
+ /* Prevent swapoff from happening to us. */
+ si = get_swap_device(entry);
+ if (unlikely(!si))
+ goto out;
- delayacct_set_flag(DELAYACCT_PF_SWAPIN);
- page = lookup_swap_cache(entry, vma, vmf->address);
- swapcache = page;
-
- if (!page) {
- struct swap_info_struct *si = swp_swap_info(entry);
+ folio = swap_cache_get_folio(entry, vma, vmf->address);
+ if (folio)
+ page = folio_file_page(folio, swp_offset(entry));
+ swapcache = folio;
+ if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
/* skip swapcache */
- page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
- vmf->address);
- if (page) {
- int err;
-
- __SetPageLocked(page);
- __SetPageSwapBacked(page);
- set_page_private(page, entry.val);
-
- /* Tell memcg to use swap ownership records */
- SetPageSwapCache(page);
- err = mem_cgroup_charge(page, vma->vm_mm,
- GFP_KERNEL);
- ClearPageSwapCache(page);
- if (err) {
+ folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
+ vma, vmf->address, false);
+ page = &folio->page;
+ if (folio) {
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
+
+ if (mem_cgroup_swapin_charge_folio(folio,
+ vma->vm_mm, GFP_KERNEL,
+ entry)) {
ret = VM_FAULT_OOM;
goto out_page;
}
+ mem_cgroup_swapin_uncharge_swap(entry);
shadow = get_shadow_from_swap_cache(entry);
if (shadow)
- workingset_refault(page, shadow);
+ workingset_refault(folio, shadow);
- lru_cache_add(page);
- swap_readpage(page, true);
+ folio_add_lru(folio);
+
+ /* To provide entry to swap_readpage() */
+ folio_set_swap_entry(folio, entry);
+ swap_readpage(page, true, NULL);
+ folio->private = NULL;
}
} else {
page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
vmf);
- swapcache = page;
+ if (page)
+ folio = page_folio(page);
+ swapcache = folio;
}
- if (!page) {
+ if (!folio) {
/*
* Back out if somebody else faulted in this pte
* while we released the pte lock.
*/
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
- if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+ if (likely(vmf->pte &&
+ pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
ret = VM_FAULT_OOM;
- delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
goto unlock;
}
@@ -3333,94 +3840,172 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* owner processes (which may be unknown at hwpoison time)
*/
ret = VM_FAULT_HWPOISON;
- delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
goto out_release;
}
- locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
+ locked = folio_lock_or_retry(folio, vma->vm_mm, vmf->flags);
- delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
if (!locked) {
ret |= VM_FAULT_RETRY;
goto out_release;
}
- /*
- * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
- * release the swapcache from under us. The page pin, and pte_same
- * test below, are not enough to exclude that. Even if it is still
- * swapcache, we need to check that the page's swap has not changed.
- */
- if (unlikely((!PageSwapCache(page) ||
- page_private(page) != entry.val)) && swapcache)
- goto out_page;
-
- page = ksm_might_need_to_copy(page, vma, vmf->address);
- if (unlikely(!page)) {
- ret = VM_FAULT_OOM;
- page = swapcache;
- goto out_page;
+ if (swapcache) {
+ /*
+ * Make sure folio_free_swap() or swapoff did not release the
+ * swapcache from under us. The page pin, and pte_same test
+ * below, are not enough to exclude that. Even if it is still
+ * swapcache, we need to check that the page's swap has not
+ * changed.
+ */
+ if (unlikely(!folio_test_swapcache(folio) ||
+ page_private(page) != entry.val))
+ goto out_page;
+
+ /*
+ * KSM sometimes has to copy on read faults, for example, if
+ * page->index of !PageKSM() pages would be nonlinear inside the
+ * anon VMA -- PageKSM() is lost on actual swapout.
+ */
+ page = ksm_might_need_to_copy(page, vma, vmf->address);
+ if (unlikely(!page)) {
+ ret = VM_FAULT_OOM;
+ goto out_page;
+ } else if (unlikely(PTR_ERR(page) == -EHWPOISON)) {
+ ret = VM_FAULT_HWPOISON;
+ goto out_page;
+ }
+ folio = page_folio(page);
+
+ /*
+ * If we want to map a page that's in the swapcache writable, we
+ * have to detect via the refcount if we're really the exclusive
+ * owner. Try removing the extra reference from the local LRU
+ * caches if required.
+ */
+ if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
+ !folio_test_ksm(folio) && !folio_test_lru(folio))
+ lru_add_drain();
}
- cgroup_throttle_swaprate(page, GFP_KERNEL);
+ folio_throttle_swaprate(folio, GFP_KERNEL);
/*
* Back out if somebody else already faulted in this pte.
*/
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
- if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
+ if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
goto out_nomap;
- if (unlikely(!PageUptodate(page))) {
+ if (unlikely(!folio_test_uptodate(folio))) {
ret = VM_FAULT_SIGBUS;
goto out_nomap;
}
/*
- * The page isn't present yet, go ahead with the fault.
- *
- * Be careful about the sequence of operations here.
- * To get its accounting right, reuse_swap_page() must be called
- * while the page is counted on swap but not yet in mapcount i.e.
- * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
- * must be called after the swap_free(), or it will never succeed.
+ * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte
+ * must never point at an anonymous page in the swapcache that is
+ * PG_anon_exclusive. Sanity check that this holds and especially, that
+ * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity
+ * check after taking the PT lock and making sure that nobody
+ * concurrently faulted in this page and set PG_anon_exclusive.
+ */
+ BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
+ BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));
+
+ /*
+ * Check under PT lock (to protect against concurrent fork() sharing
+ * the swap entry concurrently) for certainly exclusive pages.
+ */
+ if (!folio_test_ksm(folio)) {
+ exclusive = pte_swp_exclusive(vmf->orig_pte);
+ if (folio != swapcache) {
+ /*
+ * We have a fresh page that is not exposed to the
+ * swapcache -> certainly exclusive.
+ */
+ exclusive = true;
+ } else if (exclusive && folio_test_writeback(folio) &&
+ data_race(si->flags & SWP_STABLE_WRITES)) {
+ /*
+ * This is tricky: not all swap backends support
+ * concurrent page modifications while under writeback.
+ *
+ * So if we stumble over such a page in the swapcache
+ * we must not set the page exclusive, otherwise we can
+ * map it writable without further checks and modify it
+ * while still under writeback.
+ *
+ * For these problematic swap backends, simply drop the
+ * exclusive marker: this is perfectly fine as we start
+ * writeback only if we fully unmapped the page and
+ * there are no unexpected references on the page after
+ * unmapping succeeded. After fully unmapped, no
+ * further GUP references (FOLL_GET and FOLL_PIN) can
+ * appear, so dropping the exclusive marker and mapping
+ * it only R/O is fine.
+ */
+ exclusive = false;
+ }
+ }
+
+ /*
+ * Some architectures may have to restore extra metadata to the page
+ * when reading from swap. This metadata may be indexed by swap entry
+ * so this must be called before swap_free().
*/
+ arch_swap_restore(entry, folio);
+
+ /*
+ * Remove the swap entry and conditionally try to free up the swapcache.
+ * We're already holding a reference on the page but haven't mapped it
+ * yet.
+ */
+ swap_free(entry);
+ if (should_try_to_free_swap(folio, vma, vmf->flags))
+ folio_free_swap(folio);
- inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
+ inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
+ dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
- pte = maybe_mkwrite(pte_mkdirty(pte), vma);
- vmf->flags &= ~FAULT_FLAG_WRITE;
- ret |= VM_FAULT_WRITE;
- exclusive = RMAP_EXCLUSIVE;
+
+ /*
+ * Same logic as in do_wp_page(); however, optimize for pages that are
+ * certainly not shared either because we just allocated them without
+ * exposing them to the swapcache or because the swap entry indicates
+ * exclusivity.
+ */
+ if (!folio_test_ksm(folio) &&
+ (exclusive || folio_ref_count(folio) == 1)) {
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+ vmf->flags &= ~FAULT_FLAG_WRITE;
+ }
+ rmap_flags |= RMAP_EXCLUSIVE;
}
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
- if (pte_swp_uffd_wp(vmf->orig_pte)) {
+ if (pte_swp_uffd_wp(vmf->orig_pte))
pte = pte_mkuffd_wp(pte);
- pte = pte_wrprotect(pte);
- }
- set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
- arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
vmf->orig_pte = pte;
/* ksm created a completely new copy */
- if (unlikely(page != swapcache && swapcache)) {
- page_add_new_anon_rmap(page, vma, vmf->address, false);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ if (unlikely(folio != swapcache && swapcache)) {
+ page_add_new_anon_rmap(page, vma, vmf->address);
+ folio_add_lru_vma(folio, vma);
} else {
- do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
+ page_add_anon_rmap(page, vma, vmf->address, rmap_flags);
}
- swap_free(entry);
- if (mem_cgroup_swap_full(page) ||
- (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
- try_to_free_swap(page);
- unlock_page(page);
- if (page != swapcache && swapcache) {
+ VM_BUG_ON(!folio_test_anon(folio) ||
+ (pte_write(pte) && !PageAnonExclusive(page)));
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+ arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
+
+ folio_unlock(folio);
+ if (folio != swapcache && swapcache) {
/*
* Hold the lock to avoid the swap entry to be reused
* until we take the PT lock for the pte_same() check
@@ -3429,8 +4014,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* so that the swap count won't change under a
* parallel locked swapcache.
*/
- unlock_page(swapcache);
- put_page(swapcache);
+ folio_unlock(swapcache);
+ folio_put(swapcache);
}
if (vmf->flags & FAULT_FLAG_WRITE) {
@@ -3443,19 +4028,25 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
- pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
+ if (si)
+ put_swap_device(si);
return ret;
out_nomap:
- pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
- unlock_page(page);
+ folio_unlock(folio);
out_release:
- put_page(page);
- if (page != swapcache && swapcache) {
- unlock_page(swapcache);
- put_page(swapcache);
+ folio_put(folio);
+ if (folio != swapcache && swapcache) {
+ folio_unlock(swapcache);
+ folio_put(swapcache);
}
+ if (si)
+ put_swap_device(si);
return ret;
}
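
The exclusivity decision in do_swap_page() above reduces to three cases: a fresh copy never exposed to the swapcache is always exclusive, KSM folios never are, and an exclusive swap pte is downgraded if the folio is under writeback on a backend that sets SWP_STABLE_WRITES. A standalone model of that decision, with an illustrative struct in place of the real folio and swap state:

#include <stdbool.h>
#include <stdio.h>

struct swapin_state {
	bool ksm;
	bool fresh_copy;	/* folio != swapcache: never exposed to swapcache */
	bool pte_swp_exclusive;	/* exclusivity recorded in the swap pte */
	bool under_writeback;
	bool stable_writes;	/* backend sets SWP_STABLE_WRITES */
};

/* Mirrors the "can we map this swapped-in page exclusively?" decision. */
static bool swapin_exclusive(const struct swapin_state *s)
{
	bool exclusive;

	if (s->ksm)
		return false;
	if (s->fresh_copy)
		return true;
	exclusive = s->pte_swp_exclusive;
	if (exclusive && s->under_writeback && s->stable_writes)
		return false;	/* backend can't cope with writes during writeback */
	return exclusive;
}

int main(void)
{
	struct swapin_state s = { .pte_swp_exclusive = true,
				  .under_writeback = true,
				  .stable_writes = true };

	printf("%d\n", swapin_exclusive(&s));	/* 0: downgraded, mapped R/O */
	return 0;
}
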
@@ -3466,8 +4057,9 @@ out_release:
*/
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
+ bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
struct vm_area_struct *vma = vmf->vma;
- struct page *page;
+ struct folio *folio;
vm_fault_t ret = 0;
pte_t entry;
@@ -3476,22 +4068,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
/*
- * Use pte_alloc() instead of pte_alloc_map(). We can't run
- * pte_offset_map() on pmds where a huge pmd might be created
- * from a different thread.
- *
- * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
- * parallel threads are excluded by other means.
- *
- * Here we only have mmap_read_lock(mm).
+ * Use pte_alloc() instead of pte_alloc_map(), so that OOM can
+ * be distinguished from a transient failure of pte_offset_map().
*/
if (pte_alloc(vma->vm_mm, vmf->pmd))
return VM_FAULT_OOM;
- /* See the comment in pte_alloc_one_map() */
- if (unlikely(pmd_trans_unstable(vmf->pmd)))
- return 0;
-
/* Use the zero-page for reads */
if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm)) {
@@ -3499,7 +4081,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vma->vm_page_prot));
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
- if (!pte_none(*vmf->pte)) {
+ if (!vmf->pte)
+ goto unlock;
+ if (vmf_pte_changed(vmf)) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
goto unlock;
}
@@ -3517,30 +4101,32 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
- if (!page)
+ folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
+ if (!folio)
goto oom;
- if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+ if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
goto oom_free_page;
- cgroup_throttle_swaprate(page, GFP_KERNEL);
+ folio_throttle_swaprate(folio, GFP_KERNEL);
/*
- * The memory barrier inside __SetPageUptodate makes sure that
+ * The memory barrier inside __folio_mark_uptodate makes sure that
* preceding stores to the page contents become visible before
* the set_pte_at() write.
*/
- __SetPageUptodate(page);
+ __folio_mark_uptodate(folio);
- entry = mk_pte(page, vma->vm_page_prot);
+ entry = mk_pte(&folio->page, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
- if (!pte_none(*vmf->pte)) {
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ if (!vmf->pte)
+ goto release;
+ if (vmf_pte_changed(vmf)) {
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
goto release;
}
@@ -3551,26 +4137,29 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
- put_page(page);
+ folio_put(folio);
return handle_userfault(vmf, VM_UFFD_MISSING);
}
- inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, vmf->address, false);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
+ folio_add_new_anon_rmap(folio, vma, vmf->address);
+ folio_add_lru_vma(folio, vma);
setpte:
+ if (uffd_wp)
+ entry = pte_mkuffd_wp(entry);
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
- pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
release:
- put_page(page);
+ folio_put(folio);
goto unlock;
oom_free_page:
- put_page(page);
+ folio_put(folio);
oom:
return VM_FAULT_OOM;
}
@@ -3604,7 +4193,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
- smp_wmb(); /* See comment in __pte_alloc() */
}
ret = vma->vm_ops->fault(vmf);
@@ -3613,11 +4201,20 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
return ret;
if (unlikely(PageHWPoison(vmf->page))) {
- if (ret & VM_FAULT_LOCKED)
- unlock_page(vmf->page);
- put_page(vmf->page);
+ struct page *page = vmf->page;
+ vm_fault_t poisonret = VM_FAULT_HWPOISON;
+ if (ret & VM_FAULT_LOCKED) {
+ if (page_mapped(page))
+ unmap_mapping_pages(page_mapping(page),
+ page->index, 1, false);
+ /* Retry if a clean page was removed from the cache. */
+ if (invalidate_inode_page(page))
+ poisonret = VM_FAULT_NOPAGE;
+ unlock_page(page);
+ }
+ put_page(page);
vmf->page = NULL;
- return VM_FAULT_HWPOISON;
+ return poisonret;
}
if (unlikely(!(ret & VM_FAULT_LOCKED)))
@@ -3628,66 +4225,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
return ret;
}
-/*
- * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
- * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
- * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
- * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
- */
-static int pmd_devmap_trans_unstable(pmd_t *pmd)
-{
- return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
-}
-
-static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
-{
- struct vm_area_struct *vma = vmf->vma;
-
- if (!pmd_none(*vmf->pmd))
- goto map_pte;
- if (vmf->prealloc_pte) {
- vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- if (unlikely(!pmd_none(*vmf->pmd))) {
- spin_unlock(vmf->ptl);
- goto map_pte;
- }
-
- mm_inc_nr_ptes(vma->vm_mm);
- pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
- spin_unlock(vmf->ptl);
- vmf->prealloc_pte = NULL;
- } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
- return VM_FAULT_OOM;
- }
-map_pte:
- /*
- * If a huge pmd materialized under us just retry later. Use
- * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
- * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
- * under us and then back to pmd_none, as a result of MADV_DONTNEED
- * running immediately after a huge pmd fault in a different thread of
- * this mm, in turn leading to a misleading pmd_trans_huge() retval.
- * All we have to ensure is that it is a regular pmd that we can walk
- * with pte_offset_map() and we can do that through an atomic read in
- * C, which is what pmd_trans_unstable() provides.
- */
- if (pmd_devmap_trans_unstable(vmf->pmd))
- return VM_FAULT_NOPAGE;
-
- /*
- * At this point we know that our vmf->pmd points to a page of ptes
- * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
- * for the duration of the fault. If a racing MADV_DONTNEED runs and
- * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
- * be valid and we will re-check to make sure the vmf->pte isn't
- * pte_none() under vmf->ptl protection when we return to
- * alloc_set_pte().
- */
- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
- &vmf->ptl);
- return 0;
-}
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
@@ -3702,30 +4239,39 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
vmf->prealloc_pte = NULL;
}
-static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t entry;
int i;
- vm_fault_t ret;
+ vm_fault_t ret = VM_FAULT_FALLBACK;
if (!transhuge_vma_suitable(vma, haddr))
- return VM_FAULT_FALLBACK;
+ return ret;
- ret = VM_FAULT_FALLBACK;
page = compound_head(page);
+ if (compound_order(page) != HPAGE_PMD_ORDER)
+ return ret;
+
+ /*
+	 * Just back off if any subpage of a THP is corrupted; otherwise
+	 * the corrupted page may be mapped by a PMD silently and escape the
+	 * check. Such a THP can only be PTE-mapped. Access to
+ * the corrupted subpage should trigger SIGBUS as expected.
+ */
+ if (unlikely(PageHasHWPoisoned(page)))
+ return ret;
/*
- * Archs like ppc64 need additonal space to store information
+ * Archs like ppc64 need additional space to store information
* related to pte entry. Use the preallocated table for that.
*/
if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
- smp_wmb(); /* See comment in __pte_alloc() */
}
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
@@ -3740,7 +4286,8 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
- page_add_file_rmap(page, true);
+ page_add_file_rmap(page, vma, true);
+
/*
* deposit and withdraw with pmd lock held
*/
@@ -3759,76 +4306,52 @@ out:
return ret;
}
#else
-static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
- BUILD_BUG();
- return 0;
+ return VM_FAULT_FALLBACK;
}
#endif
-/**
- * alloc_set_pte - setup new PTE entry for given page and add reverse page
- * mapping. If needed, the function allocates page table or use pre-allocated.
- *
- * @vmf: fault environment
- * @page: page to map
- *
- * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
- * return.
- *
- * Target users are page handler itself and implementations of
- * vm_ops->map_pages.
- *
- * Return: %0 on success, %VM_FAULT_ code in case of error.
- */
-vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
+void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
{
struct vm_area_struct *vma = vmf->vma;
+ bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
bool write = vmf->flags & FAULT_FLAG_WRITE;
+ bool prefault = vmf->address != addr;
pte_t entry;
- vm_fault_t ret;
-
- if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
- ret = do_set_pmd(vmf, page);
- if (ret != VM_FAULT_FALLBACK)
- return ret;
- }
-
- if (!vmf->pte) {
- ret = pte_alloc_one_map(vmf);
- if (ret)
- return ret;
- }
-
- /* Re-check under ptl */
- if (unlikely(!pte_none(*vmf->pte))) {
- update_mmu_tlb(vma, vmf->address, vmf->pte);
- return VM_FAULT_NOPAGE;
- }
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);
- entry = pte_sw_mkyoung(entry);
+
+ if (prefault && arch_wants_old_prefaulted_pte())
+ entry = pte_mkold(entry);
+ else
+ entry = pte_sw_mkyoung(entry);
+
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (unlikely(uffd_wp))
+ entry = pte_mkuffd_wp(entry);
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
- inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, vmf->address, false);
+ inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, addr);
lru_cache_add_inactive_or_unevictable(page, vma);
} else {
- inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
- page_add_file_rmap(page, false);
+ inc_mm_counter(vma->vm_mm, mm_counter_file(page));
+ page_add_file_rmap(page, vma, false);
}
- set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
+ set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
+}
- /* no need to invalidate: a not-present page won't be cached */
- update_mmu_cache(vma, vmf->address, vmf->pte);
+static bool vmf_pte_changed(struct vm_fault *vmf)
+{
+ if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
+ return !pte_same(ptep_get(vmf->pte), vmf->orig_pte);
- return 0;
+ return !pte_none(ptep_get(vmf->pte));
}
-
/**
* finish_fault - finish page fault once we have prepared the page to fault
*
@@ -3846,12 +4369,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
*/
vm_fault_t finish_fault(struct vm_fault *vmf)
{
+ struct vm_area_struct *vma = vmf->vma;
struct page *page;
- vm_fault_t ret = 0;
+ vm_fault_t ret;
/* Did we COW the page? */
- if ((vmf->flags & FAULT_FLAG_WRITE) &&
- !(vmf->vma->vm_flags & VM_SHARED))
+ if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
page = vmf->cow_page;
else
page = vmf->page;
@@ -3860,22 +4383,54 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
* check even for read faults because we might have lost our CoWed
* page
*/
- if (!(vmf->vma->vm_flags & VM_SHARED))
- ret = check_stable_address_space(vmf->vma->vm_mm);
- if (!ret)
- ret = alloc_set_pte(vmf, page);
- if (vmf->pte)
- pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (!(vma->vm_flags & VM_SHARED)) {
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ return ret;
+ }
+
+ if (pmd_none(*vmf->pmd)) {
+ if (PageTransCompound(page)) {
+ ret = do_set_pmd(vmf, page);
+ if (ret != VM_FAULT_FALLBACK)
+ return ret;
+ }
+
+ if (vmf->prealloc_pte)
+ pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
+ else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
+ return VM_FAULT_OOM;
+ }
+
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (!vmf->pte)
+ return VM_FAULT_NOPAGE;
+
+ /* Re-check under ptl */
+ if (likely(!vmf_pte_changed(vmf))) {
+ do_set_pte(vmf, page, vmf->address);
+
+ /* no need to invalidate: a not-present page won't be cached */
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+
+ ret = 0;
+ } else {
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
+ ret = VM_FAULT_NOPAGE;
+ }
+
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
}
-static unsigned long fault_around_bytes __read_mostly =
- rounddown_pow_of_two(65536);
+static unsigned long fault_around_pages __read_mostly =
+ 65536 >> PAGE_SHIFT;
#ifdef CONFIG_DEBUG_FS
static int fault_around_bytes_get(void *data, u64 *val)
{
- *val = fault_around_bytes;
+ *val = fault_around_pages << PAGE_SHIFT;
return 0;
}
@@ -3887,10 +4442,13 @@ static int fault_around_bytes_set(void *data, u64 val)
{
if (val / PAGE_SIZE > PTRS_PER_PTE)
return -EINVAL;
- if (val > PAGE_SIZE)
- fault_around_bytes = rounddown_pow_of_two(val);
- else
- fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
+
+ /*
+	 * The minimum value is 1 page; however, this results in no fault-around
+ * at all. See should_fault_around().
+ */
+ fault_around_pages = max(rounddown_pow_of_two(val) >> PAGE_SHIFT, 1UL);
+
return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
@@ -3913,80 +4471,66 @@ late_initcall(fault_around_debugfs);
* It uses vm_ops->map_pages() to map the pages, which skips the page if it's
* not ready to be mapped: not up-to-date, locked, etc.
*
- * This function is called with the page table lock taken. In the split ptlock
- * case the page table lock only protects only those entries which belong to
- * the page table corresponding to the fault address.
- *
- * This function doesn't cross the VMA boundaries, in order to call map_pages()
- * only once.
+ * This function doesn't cross VMA or page table boundaries, in order to call
+ * map_pages() and acquire a PTE lock only once.
*
- * fault_around_bytes defines how many bytes we'll try to map.
+ * fault_around_pages defines how many pages we'll try to map.
* do_fault_around() expects it to be set to a power of two less than or equal
* to PTRS_PER_PTE.
*
* The virtual address of the area that we map is naturally aligned to
- * fault_around_bytes rounded down to the machine page size
+ * fault_around_pages * PAGE_SIZE rounded down to the machine page size
* (and therefore to page order). This way it's easier to guarantee
* that we don't cross page table boundaries.
*/
static vm_fault_t do_fault_around(struct vm_fault *vmf)
{
- unsigned long address = vmf->address, nr_pages, mask;
- pgoff_t start_pgoff = vmf->pgoff;
- pgoff_t end_pgoff;
- int off;
- vm_fault_t ret = 0;
-
- nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
- mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
+ pgoff_t nr_pages = READ_ONCE(fault_around_pages);
+ pgoff_t pte_off = pte_index(vmf->address);
+ /* The page offset of vmf->address within the VMA. */
+ pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
+ pgoff_t from_pte, to_pte;
+ vm_fault_t ret;
- vmf->address = max(address & mask, vmf->vma->vm_start);
- off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
- start_pgoff -= off;
+ /* The PTE offset of the start address, clamped to the VMA. */
+ from_pte = max(ALIGN_DOWN(pte_off, nr_pages),
+ pte_off - min(pte_off, vma_off));
- /*
- * end_pgoff is either the end of the page table, the end of
- * the vma or nr_pages from start_pgoff, depending what is nearest.
- */
- end_pgoff = start_pgoff -
- ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
- PTRS_PER_PTE - 1;
- end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
- start_pgoff + nr_pages - 1);
+ /* The PTE offset of the end address, clamped to the VMA and PTE. */
+ to_pte = min3(from_pte + nr_pages, (pgoff_t)PTRS_PER_PTE,
+ pte_off + vma_pages(vmf->vma) - vma_off) - 1;
if (pmd_none(*vmf->pmd)) {
vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
if (!vmf->prealloc_pte)
- goto out;
- smp_wmb(); /* See comment in __pte_alloc() */
+ return VM_FAULT_OOM;
}
- vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
+ rcu_read_lock();
+ ret = vmf->vma->vm_ops->map_pages(vmf,
+ vmf->pgoff + from_pte - pte_off,
+ vmf->pgoff + to_pte - pte_off);
+ rcu_read_unlock();
- /* Huge page is mapped? Page fault is solved */
- if (pmd_trans_huge(*vmf->pmd)) {
- ret = VM_FAULT_NOPAGE;
- goto out;
- }
+ return ret;
+}
- /* ->map_pages() haven't done anything useful. Cold page cache? */
- if (!vmf->pte)
- goto out;
+/* Return true if we should do read fault-around, false otherwise */
+static inline bool should_fault_around(struct vm_fault *vmf)
+{
+ /* No ->map_pages? No way to fault around... */
+ if (!vmf->vma->vm_ops->map_pages)
+ return false;
- /* check if the page fault is solved */
- vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
- if (!pte_none(*vmf->pte))
- ret = VM_FAULT_NOPAGE;
- pte_unmap_unlock(vmf->pte, vmf->ptl);
-out:
- vmf->address = address;
- vmf->pte = NULL;
- return ret;
+ if (uffd_disable_fault_around(vmf->vma))
+ return false;
+
+ /* A single page implies no faulting 'around' at all. */
+ return fault_around_pages > 1;
}
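
do_fault_around() clamps the fault-around window to the VMA and to a single page table: from_pte is the faulting PTE index rounded down to the window size but never before the VMA start, and to_pte stops at whichever comes first of the window end, the page-table end, or the VMA end. A standalone worked example with hypothetical numbers:

#include <stdio.h>

#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))
#define min(a, b)		((a) < (b) ? (a) : (b))
#define max(a, b)		((a) > (b) ? (a) : (b))
#define min3(a, b, c)		min(min(a, b), c)

int main(void)
{
	/*
	 * Hypothetical fault: PTE index 37 within its page table, 5 pages into
	 * the VMA, VMA 100 pages long, 16-page window, 512 PTEs per table.
	 */
	unsigned long pte_off = 37, vma_off = 5, vma_pages = 100;
	unsigned long nr_pages = 16, ptrs_per_pte = 512;

	unsigned long from_pte = max(ALIGN_DOWN(pte_off, nr_pages),
				     pte_off - min(pte_off, vma_off));
	unsigned long to_pte = min3(from_pte + nr_pages, ptrs_per_pte,
				    pte_off + vma_pages - vma_off) - 1;

	printf("map PTEs %lu..%lu\n", from_pte, to_pte);	/* 32..47 */
	return 0;
}
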
static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret = 0;
/*
@@ -3994,7 +4538,7 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
* if page by the offset is not ready to be mapped (cold cache or
* something).
*/
- if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
+ if (should_fault_around(vmf)) {
ret = do_fault_around(vmf);
if (ret)
return ret;
@@ -4023,11 +4567,12 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
if (!vmf->cow_page)
return VM_FAULT_OOM;
- if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
+ if (mem_cgroup_charge(page_folio(vmf->cow_page), vma->vm_mm,
+ GFP_KERNEL)) {
put_page(vmf->cow_page);
return VM_FAULT_OOM;
}
- cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
+ folio_throttle_swaprate(page_folio(vmf->cow_page), GFP_KERNEL);
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -4088,7 +4633,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults).
* The mmap_lock may have been released depending on flags and our
- * return value. See filemap_fault() and __lock_page_or_retry().
+ * return value. See filemap_fault() and __folio_lock_or_retry().
* If mmap_lock is released, vma may become invalid (for example
* by other thread calling munmap()).
*/
@@ -4102,17 +4647,11 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
*/
if (!vma->vm_ops->fault) {
- /*
- * If we find a migration pmd entry or a none pmd entry, which
- * should never happen, return SIGBUS
- */
- if (unlikely(!pmd_present(*vmf->pmd)))
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (unlikely(!vmf->pte))
ret = VM_FAULT_SIGBUS;
else {
- vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
- vmf->pmd,
- vmf->address,
- &vmf->ptl);
/*
* Make sure this is not a temporary clearing of pte
* by holding ptl and checking again. A R/M/W update
@@ -4120,7 +4659,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
* we don't have concurrent modification by hardware
* followed by an update.
*/
- if (unlikely(pte_none(*vmf->pte)))
+ if (unlikely(pte_none(ptep_get(vmf->pte))))
ret = VM_FAULT_SIGBUS;
else
ret = VM_FAULT_NOPAGE;
@@ -4142,12 +4681,14 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
return ret;
}
-static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
- unsigned long addr, int page_nid,
- int *flags)
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, int page_nid, int *flags)
{
get_page(page);
+	/* Record the current PID accessing the VMA */
+ vma_set_access_pid_bit(vma);
+
count_vm_numa_event(NUMA_HINT_FAULTS);
if (page_nid == numa_node_id()) {
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
@@ -4162,11 +4703,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
int page_nid = NUMA_NO_NODE;
+ bool writable = false;
int last_cpupid;
int target_nid;
- bool migrated = false;
pte_t pte, old_pte;
- bool was_writable = pte_savedwrite(vmf->orig_pte);
int flags = 0;
/*
@@ -4174,36 +4714,32 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
* validation through pte_unmap_same(). It's of NUMA type but
* the pfn may be screwed if the read is non atomic.
*/
- vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
- if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+ if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
+ /* Get the normal PTE */
+ old_pte = ptep_get(vmf->pte);
+ pte = pte_modify(old_pte, vma->vm_page_prot);
+
/*
- * Make it present again, Depending on how arch implementes non
- * accessible ptes, some can allow access by kernel mode.
+ * Detect now whether the PTE could be writable; this information
+ * is only valid while holding the PT lock.
*/
- old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
- pte = pte_modify(old_pte, vma->vm_page_prot);
- pte = pte_mkyoung(pte);
- if (was_writable)
- pte = pte_mkwrite(pte);
- ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ writable = pte_write(pte);
+ if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
+ can_change_pte_writable(vma, vmf->address, pte))
+ writable = true;
page = vm_normal_page(vma, vmf->address, pte);
- if (!page) {
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- return 0;
- }
+ if (!page || is_zone_device_page(page))
+ goto out_map;
/* TODO: handle PTE-mapped THP */
- if (PageCompound(page)) {
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- return 0;
- }
+ if (PageCompound(page))
+ goto out_map;
/*
* Avoid grouping on RO pages in general. RO pages shouldn't hurt as
@@ -4213,7 +4749,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
* pte_dirty has unpredictable behaviour between PTE scan updates,
* background writeback, dirty balancing and application behaviour.
*/
- if (!pte_write(pte))
+ if (!writable)
flags |= TNF_NO_GROUP;
/*
@@ -4223,28 +4759,60 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
flags |= TNF_SHARED;
- last_cpupid = page_cpupid_last(page);
page_nid = page_to_nid(page);
+ /*
+	 * In memory tiering mode, the cpupid of a slow memory page is used
+	 * to record the page access time, so use the default value here.
+ */
+ if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ !node_is_toptier(page_nid))
+ last_cpupid = (-1 & LAST_CPUPID_MASK);
+ else
+ last_cpupid = page_cpupid_last(page);
target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
- pte_unmap_unlock(vmf->pte, vmf->ptl);
if (target_nid == NUMA_NO_NODE) {
put_page(page);
- goto out;
+ goto out_map;
}
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ writable = false;
/* Migrate to the requested node */
- migrated = migrate_misplaced_page(page, vma, target_nid);
- if (migrated) {
+ if (migrate_misplaced_page(page, vma, target_nid)) {
page_nid = target_nid;
flags |= TNF_MIGRATED;
- } else
+ } else {
flags |= TNF_MIGRATE_FAIL;
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (unlikely(!vmf->pte))
+ goto out;
+ if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ goto out;
+ }
+ goto out_map;
+ }
out:
if (page_nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, page_nid, 1, flags);
return 0;
+out_map:
+ /*
+ * Make it present again, depending on how arch implements
+ * non-accessible ptes, some can allow access by kernel mode.
+ */
+ old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
+ pte = pte_modify(old_pte, vma->vm_page_prot);
+ pte = pte_mkyoung(pte);
+ if (writable)
+ pte = pte_mkwrite(pte);
+ ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ goto out;
}
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
@@ -4257,18 +4825,24 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
}
/* `inline' is required to avoid gcc 4.1.2 build error */
-static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
+static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
{
+ const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
+ vm_fault_t ret;
+
if (vma_is_anonymous(vmf->vma)) {
- if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
+ if (likely(!unshare) &&
+ userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
return handle_userfault(vmf, VM_UFFD_WP);
- return do_huge_pmd_wp_page(vmf, orig_pmd);
+ return do_huge_pmd_wp_page(vmf);
}
- if (vmf->vma->vm_ops->huge_fault) {
- vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
- if (!(ret & VM_FAULT_FALLBACK))
- return ret;
+ if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+ if (vmf->vma->vm_ops->huge_fault) {
+ ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ }
}
/* COW or write-notify handled on pte level: split pmd. */
@@ -4283,29 +4857,33 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf)
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vmf->vma))
- goto split;
- if (vmf->vma->vm_ops->huge_fault) {
- vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
-
- if (!(ret & VM_FAULT_FALLBACK))
- return ret;
- }
-split:
- /* COW or write-notify not handled on PUD level: split pud.*/
- __split_huge_pud(vmf->vma, vmf->pud, vmf->address);
+ return VM_FAULT_FALLBACK;
+ if (vmf->vma->vm_ops->huge_fault)
+ return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
return VM_FAULT_FALLBACK;
}
static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ vm_fault_t ret;
+
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vmf->vma))
- return VM_FAULT_FALLBACK;
- if (vmf->vma->vm_ops->huge_fault)
- return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ goto split;
+ if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+ if (vmf->vma->vm_ops->huge_fault) {
+ ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ }
+ }
+split:
+ /* COW or write-notify not handled on PUD level: split pud.*/
+ __split_huge_pud(vmf->vma, vmf->pud, vmf->address);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
return VM_FAULT_FALLBACK;
}
@@ -4322,7 +4900,7 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
* concurrent faults).
*
* The mmap_lock may have been released depending on flags and our return value.
- * See filemap_fault() and __lock_page_or_retry().
+ * See filemap_fault() and __folio_lock_or_retry().
*/
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
@@ -4336,40 +4914,29 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
* concurrent faults and from rmap lookups.
*/
vmf->pte = NULL;
+ vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
} else {
- /* See comment in pte_alloc_one_map() */
- if (pmd_devmap_trans_unstable(vmf->pmd))
- return 0;
/*
* A regular pmd is established and it can't morph into a huge
- * pmd from under us anymore at this point because we hold the
- * mmap_lock read mode and khugepaged takes it in write mode.
- * So now it's safe to run pte_offset_map().
+ * pmd by anon khugepaged, since that takes mmap_lock in write
+ * mode; but shmem or file collapse to THP could still morph
+ * it into a huge pmd: just retry later if so.
*/
- vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
- vmf->orig_pte = *vmf->pte;
+ vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (unlikely(!vmf->pte))
+ return 0;
+ vmf->orig_pte = ptep_get_lockless(vmf->pte);
+ vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;
- /*
- * some architectures can have larger ptes than wordsize,
- * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
- * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
- * accesses. The code below just needs a consistent view
- * for the ifs and we later double check anyway with the
- * ptl lock held. So here a barrier will do.
- */
- barrier();
if (pte_none(vmf->orig_pte)) {
pte_unmap(vmf->pte);
vmf->pte = NULL;
}
}
- if (!vmf->pte) {
- if (vma_is_anonymous(vmf->vma))
- return do_anonymous_page(vmf);
- else
- return do_fault(vmf);
- }
+ if (!vmf->pte)
+ return do_pte_missing(vmf);
if (!pte_present(vmf->orig_pte))
return do_swap_page(vmf);
@@ -4377,17 +4944,17 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
return do_numa_page(vmf);
- vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
entry = vmf->orig_pte;
- if (unlikely(!pte_same(*vmf->pte, entry))) {
+ if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
goto unlock;
}
- if (vmf->flags & FAULT_FLAG_WRITE) {
+ if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
if (!pte_write(entry))
return do_wp_page(vmf);
- entry = pte_mkdirty(entry);
+ else if (likely(vmf->flags & FAULT_FLAG_WRITE))
+ entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
@@ -4404,7 +4971,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
* with threads.
*/
if (vmf->flags & FAULT_FLAG_WRITE)
- flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
+ flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
+ vmf->pte);
}
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -4415,7 +4983,7 @@ unlock:
* By the time we get here, we already hold the mm semaphore
*
* The mmap_lock may have been released depending on flags and our
- * return value. See filemap_fault() and __lock_page_or_retry().
+ * return value. See filemap_fault() and __folio_lock_or_retry().
*/
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
@@ -4423,12 +4991,13 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
struct vm_fault vmf = {
.vma = vma,
.address = address & PAGE_MASK,
+ .real_address = address,
.flags = flags,
.pgoff = linear_page_index(vma, address),
.gfp_mask = __get_fault_gfp_mask(vma),
};
- unsigned int dirty = flags & FAULT_FLAG_WRITE;
struct mm_struct *mm = vma->vm_mm;
+ unsigned long vm_flags = vma->vm_flags;
pgd_t *pgd;
p4d_t *p4d;
vm_fault_t ret;
@@ -4442,7 +5011,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
if (!vmf.pud)
return VM_FAULT_OOM;
retry_pud:
- if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
+ if (pud_none(*vmf.pud) &&
+ hugepage_vma_check(vma, vm_flags, false, true, true)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -4452,9 +5022,11 @@ retry_pud:
barrier();
if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
- /* NUMA case for anonymous PUDs would go here */
-
- if (dirty && !pud_write(orig_pud)) {
+ /*
+ * TODO once we support anonymous PUDs: NUMA case and
+ * FAULT_FLAG_UNSHARE handling.
+ */
+ if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) {
ret = wp_huge_pud(&vmf, orig_pud);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -4473,31 +5045,32 @@ retry_pud:
if (pud_trans_unstable(vmf.pud))
goto retry_pud;
- if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
+ if (pmd_none(*vmf.pmd) &&
+ hugepage_vma_check(vma, vm_flags, false, true, true)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- pmd_t orig_pmd = *vmf.pmd;
+ vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);
- barrier();
- if (unlikely(is_swap_pmd(orig_pmd))) {
+ if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(orig_pmd));
- if (is_pmd_migration_entry(orig_pmd))
+ !is_pmd_migration_entry(vmf.orig_pmd));
+ if (is_pmd_migration_entry(vmf.orig_pmd))
pmd_migration_entry_wait(mm, vmf.pmd);
return 0;
}
- if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
- if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
- return do_huge_pmd_numa_page(&vmf, orig_pmd);
+ if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
+ if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
+ return do_huge_pmd_numa_page(&vmf);
- if (dirty && !pmd_write(orig_pmd)) {
- ret = wp_huge_pmd(&vmf, orig_pmd);
+ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
+ !pmd_write(vmf.orig_pmd)) {
+ ret = wp_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- huge_pmd_set_accessed(&vmf, orig_pmd);
+ huge_pmd_set_accessed(&vmf);
return 0;
}
}
@@ -4507,7 +5080,7 @@ retry_pud:
}
/**
- * mm_account_fault - Do page fault accountings
+ * mm_account_fault - Do page fault accounting
*
* @regs: the pt_regs struct pointer. When set to NULL, will skip accounting
* of perf event counters, but we'll still do the per-task accounting to
@@ -4516,29 +5089,36 @@ retry_pud:
* @flags: the fault flags.
* @ret: the fault retcode.
*
- * This will take care of most of the page fault accountings. Meanwhile, it
+ * This will take care of most of the page fault accounting. Meanwhile, it
* will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
- * updates. However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
+ * updates. However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
* still be in per-arch page fault handlers at the entry of page fault.
*/
-static inline void mm_account_fault(struct pt_regs *regs,
+static inline void mm_account_fault(struct mm_struct *mm, struct pt_regs *regs,
unsigned long address, unsigned int flags,
vm_fault_t ret)
{
bool major;
+ /* Incomplete faults will be accounted upon completion. */
+ if (ret & VM_FAULT_RETRY)
+ return;
+
/*
- * We don't do accounting for some specific faults:
- *
- * - Unsuccessful faults (e.g. when the address wasn't valid). That
- * includes arch_vma_access_permitted() failing before reaching here.
- * So this is not a "this many hardware page faults" counter. We
- * should use the hw profiling for that.
- *
- * - Incomplete faults (VM_FAULT_RETRY). They will only be counted
- * once they're completed.
+ * To preserve the behavior of older kernels, PGFAULT counters record
+ * both successful and failed faults, as opposed to perf counters,
+ * which ignore failed cases.
+ */
+ count_vm_event(PGFAULT);
+ count_memcg_event_mm(mm, PGFAULT);
+
+ /*
+ * Do not account for unsuccessful faults (e.g. when the address wasn't
+ * valid). That includes arch_vma_access_permitted() failing before
+ * reaching here. So this is not a "this many hardware page faults"
+ * counter. We should use the hw profiling for that.
*/
- if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
+ if (ret & VM_FAULT_ERROR)
return;
/*
@@ -4567,29 +5147,76 @@ static inline void mm_account_fault(struct pt_regs *regs,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}
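
The accounting above follows a fixed order: retried faults are skipped entirely, PGFAULT counts both successful and failed faults, the perf counters skip errors, and the remaining faults split into major and minor. A standalone sketch of that order; the flag values and counter struct are illustrative, and the major/minor split is simplified to a single VM_FAULT_MAJOR-style bit:

#include <stdio.h>

/* Illustrative flag values; the real ones come from <linux/mm_types.h>. */
#define FAULT_RETRY	(1u << 0)
#define FAULT_ERROR	(1u << 1)
#define FAULT_MAJOR	(1u << 2)

struct counters { unsigned long pgfault, perf_maj, perf_min; };

static void account_fault(struct counters *c, unsigned int ret)
{
	if (ret & FAULT_RETRY)
		return;				/* accounted once completed */

	c->pgfault++;				/* successful and failed faults */

	if (ret & FAULT_ERROR)
		return;				/* perf ignores failed faults */

	if (ret & FAULT_MAJOR)
		c->perf_maj++;
	else
		c->perf_min++;
}

int main(void)
{
	struct counters c = { 0 };

	account_fault(&c, 0);			/* minor fault */
	account_fault(&c, FAULT_MAJOR);		/* major fault */
	account_fault(&c, FAULT_ERROR);		/* counted, but no perf event */
	account_fault(&c, FAULT_RETRY);		/* not counted at all */

	printf("pgfault=%lu maj=%lu min=%lu\n", c.pgfault, c.perf_maj, c.perf_min);
	return 0;				/* pgfault=3 maj=1 min=1 */
}
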
+#ifdef CONFIG_LRU_GEN
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+ /* the LRU algorithm only applies to accesses with recency */
+ current->in_lru_fault = vma_has_recency(vma);
+}
+
+static void lru_gen_exit_fault(void)
+{
+ current->in_lru_fault = false;
+}
+#else
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+}
+
+static void lru_gen_exit_fault(void)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
+static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
+ unsigned int *flags)
+{
+ if (unlikely(*flags & FAULT_FLAG_UNSHARE)) {
+ if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE))
+ return VM_FAULT_SIGSEGV;
+ /*
+ * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's
+ * just treat it like an ordinary read-fault otherwise.
+ */
+ if (!is_cow_mapping(vma->vm_flags))
+ *flags &= ~FAULT_FLAG_UNSHARE;
+ } else if (*flags & FAULT_FLAG_WRITE) {
+ /* Write faults on read-only mappings are impossible ... */
+ if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)))
+ return VM_FAULT_SIGSEGV;
+ /* ... and FOLL_FORCE only applies to COW mappings. */
+ if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) &&
+ !is_cow_mapping(vma->vm_flags)))
+ return VM_FAULT_SIGSEGV;
+ }
+ return 0;
+}
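
sanitize_fault_flags() rejects FAULT_FLAG_UNSHARE combined with FAULT_FLAG_WRITE, silently drops UNSHARE on non-COW mappings, and refuses write faults on mappings that can never become writable. A standalone model of those rules; the flag bits and the is_cow() helper are illustrative stand-ins for the kernel definitions:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative bits; real definitions live in <linux/mm.h>. */
#define FF_WRITE	(1u << 0)
#define FF_UNSHARE	(1u << 1)

#define VM_WRITE	(1u << 0)
#define VM_MAYWRITE	(1u << 1)
#define VM_SHARED	(1u << 2)

static bool is_cow(unsigned int vm_flags)
{
	return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}

/* Returns false (i.e. SIGSEGV) for impossible combinations. */
static bool sanitize(unsigned int vm_flags, unsigned int *fault_flags)
{
	if (*fault_flags & FF_UNSHARE) {
		if (*fault_flags & FF_WRITE)
			return false;
		if (!is_cow(vm_flags))
			*fault_flags &= ~FF_UNSHARE;	/* ordinary read fault */
	} else if (*fault_flags & FF_WRITE) {
		if (!(vm_flags & VM_MAYWRITE))
			return false;
		if (!(vm_flags & VM_WRITE) && !is_cow(vm_flags))
			return false;			/* FOLL_FORCE only on COW */
	}
	return true;
}

int main(void)
{
	unsigned int ff = FF_UNSHARE;

	printf("%d\n", sanitize(VM_SHARED | VM_MAYWRITE, &ff));	/* 1, UNSHARE dropped */
	printf("unshare left: %u\n", ff & FF_UNSHARE);			/* 0 */
	return 0;
}
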
+
/*
* By the time we get here, we already hold the mm semaphore
*
* The mmap_lock may have been released depending on flags and our
- * return value. See filemap_fault() and __lock_page_or_retry().
+ * return value. See filemap_fault() and __folio_lock_or_retry().
*/
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags, struct pt_regs *regs)
{
+ /* If the fault handler drops the mmap_lock, vma may be freed */
+ struct mm_struct *mm = vma->vm_mm;
vm_fault_t ret;
__set_current_state(TASK_RUNNING);
- count_vm_event(PGFAULT);
- count_memcg_event_mm(vma->vm_mm, PGFAULT);
-
- /* do counter updates before entering really critical section. */
- check_sync_rss_stat(current);
+ ret = sanitize_fault_flags(vma, &flags);
+ if (ret)
+ goto out;
if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
- flags & FAULT_FLAG_REMOTE))
- return VM_FAULT_SIGSEGV;
+ flags & FAULT_FLAG_REMOTE)) {
+ ret = VM_FAULT_SIGSEGV;
+ goto out;
+ }
/*
* Enable the memcg OOM handling for faults triggered in user
@@ -4598,11 +5225,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
if (flags & FAULT_FLAG_USER)
mem_cgroup_enter_user_fault();
+ lru_gen_enter_fault(vma);
+
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
ret = __handle_mm_fault(vma, address, flags);
+ lru_gen_exit_fault();
+
if (flags & FAULT_FLAG_USER) {
mem_cgroup_exit_user_fault();
/*
@@ -4614,13 +5245,194 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
mem_cgroup_oom_synchronize(false);
}
-
- mm_account_fault(regs, address, flags, ret);
+out:
+ mm_account_fault(mm, regs, address, flags, ret);
return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
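handle_mm_fault() now latches vma->vm_mm before the fault path can drop mmap_lock and defers all accounting until a fault actually completes, which is what a retrying caller relies on. A minimal sketch of such a caller follows, assuming a hypothetical arch entry point; the function name, the omitted signal delivery and the omitted stack-expansion handling (see lock_mm_and_find_vma() below) are illustrative only and not part of this patch.

#include <linux/mm.h>

/* Sketch only: drive handle_mm_fault() and retry incomplete faults. */
static void arch_handle_page_fault(struct pt_regs *regs, unsigned long address,
				   bool is_write)
{
	struct mm_struct *mm = current->mm;
	unsigned int flags = FAULT_FLAG_DEFAULT;
	struct vm_area_struct *vma;
	vm_fault_t fault;

	if (is_write)
		flags |= FAULT_FLAG_WRITE;
	if (user_mode(regs))
		flags |= FAULT_FLAG_USER;

retry:
	mmap_read_lock(mm);
	vma = vma_lookup(mm, address);
	if (!vma) {
		mmap_read_unlock(mm);
		return;				/* SIGSEGV in a real handler */
	}

	fault = handle_mm_fault(vma, address, flags, regs);
	if (fault & VM_FAULT_COMPLETED)
		return;				/* mmap_lock already dropped */
	if (fault & VM_FAULT_RETRY) {
		/*
		 * mmap_lock was dropped and this attempt was not accounted;
		 * see mm_account_fault() above.
		 */
		flags |= FAULT_FLAG_TRIED;
		goto retry;
	}
	mmap_read_unlock(mm);
	/* VM_FAULT_ERROR handling (OOM, SIGBUS, SIGSEGV) omitted */
}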
+#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
+#include <linux/extable.h>
+
+static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+ if (likely(mmap_read_trylock(mm)))
+ return true;
+
+ if (regs && !user_mode(regs)) {
+ unsigned long ip = instruction_pointer(regs);
+ if (!search_exception_tables(ip))
+ return false;
+ }
+
+ return !mmap_read_lock_killable(mm);
+}
+
+static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
+{
+ /*
+ * We don't have this operation yet.
+ *
+ * It should be easy enough to do: it's basically an
+ * atomic_long_try_cmpxchg_acquire()
+ * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
+ * it also needs the proper lockdep magic etc.
+ */
+ return false;
+}
+
+static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+ mmap_read_unlock(mm);
+ if (regs && !user_mode(regs)) {
+ unsigned long ip = instruction_pointer(regs);
+ if (!search_exception_tables(ip))
+ return false;
+ }
+ return !mmap_write_lock_killable(mm);
+}
+
+/*
+ * Helper for page fault handling.
+ *
+ * This is kind of equivalent to "mmap_read_lock()" followed
+ * by "find_extend_vma()", except it's a lot more careful about
+ * the locking (and will drop the lock on failure).
+ *
+ * For example, if we have a kernel bug that causes a page
+ * fault, we don't want to just use mmap_read_lock() to get
+ * the mm lock, because that would deadlock if the bug were
+ * to happen while we're holding the mm lock for writing.
+ *
+ * So this checks the exception tables on kernel faults in
+ * order to only do this all for instructions that are actually
+ * expected to fault.
+ *
+ * We can also actually take the mm lock for writing if we
+ * need to extend the vma, which helps the VM layer a lot.
+ */
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+ unsigned long addr, struct pt_regs *regs)
+{
+ struct vm_area_struct *vma;
+
+ if (!get_mmap_lock_carefully(mm, regs))
+ return NULL;
+
+ vma = find_vma(mm, addr);
+ if (likely(vma && (vma->vm_start <= addr)))
+ return vma;
+
+ /*
+ * Well, dang. We might still be successful, but only
+ * if we can extend a vma to do so.
+ */
+ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
+ mmap_read_unlock(mm);
+ return NULL;
+ }
+
+ /*
+ * We can try to upgrade the mmap lock atomically,
+ * in which case we can continue to use the vma
+ * we already looked up.
+ *
+ * Otherwise we'll have to drop the mmap lock and
+ * re-take it, and also look up the vma again,
+ * re-checking it.
+ */
+ if (!mmap_upgrade_trylock(mm)) {
+ if (!upgrade_mmap_lock_carefully(mm, regs))
+ return NULL;
+
+ vma = find_vma(mm, addr);
+ if (!vma)
+ goto fail;
+ if (vma->vm_start <= addr)
+ goto success;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto fail;
+ }
+
+ if (expand_stack_locked(vma, addr))
+ goto fail;
+
+success:
+ mmap_write_downgrade(mm);
+ return vma;
+
+fail:
+ mmap_write_unlock(mm);
+ return NULL;
+}
+#endif
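lock_mm_and_find_vma() is meant to replace the open-coded mmap_read_lock() + find_vma() + stack-expansion sequence in arch fault handlers. A hedged sketch of the intended call site, assuming a small wrapper around handle_mm_fault(); the wrapper name and the error handling are invented for illustration.

#include <linux/mm.h>

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
/* Sketch only: fault with the VMA looked up and mmap_lock held by the helper. */
static vm_fault_t fault_with_locked_vma(struct pt_regs *regs,
					unsigned long address,
					unsigned int flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	vm_fault_t fault;

	/* Takes mmap_lock for read (briefly for write when extending the stack). */
	vma = lock_mm_and_find_vma(mm, address, regs);
	if (!vma)
		return VM_FAULT_SIGSEGV;	/* lock already dropped */

	fault = handle_mm_fault(vma, address, flags, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		mmap_read_unlock(mm);
	return fault;
}
#endif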
+
+#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
+ * stable and not isolated. If the VMA is not found or is being modified the
+ * function returns NULL.
+ */
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+ unsigned long address)
+{
+ MA_STATE(mas, &mm->mm_mt, address, address);
+ struct vm_area_struct *vma;
+
+ rcu_read_lock();
+retry:
+ vma = mas_walk(&mas);
+ if (!vma)
+ goto inval;
+
+ /* Only anonymous and tcp vmas are supported for now */
+ if (!vma_is_anonymous(vma) && !vma_is_tcp(vma))
+ goto inval;
+
+ if (!vma_start_read(vma))
+ goto inval;
+
+ /*
+ * find_mergeable_anon_vma uses adjacent vmas which are not locked.
+ * This check must happen after vma_start_read(); otherwise, a
+ * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA
+ * from its anon_vma.
+ */
+ if (unlikely(!vma->anon_vma && !vma_is_tcp(vma)))
+ goto inval_end_read;
+
+ /*
+ * Due to the possibility of the userfault handler dropping mmap_lock, avoid
+ * it for now and fall back to page fault handling under mmap_lock.
+ */
+ if (userfaultfd_armed(vma))
+ goto inval_end_read;
+
+ /* Check since vm_start/vm_end might change before we lock the VMA */
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ goto inval_end_read;
+
+ /* Check if the VMA got isolated after we found it */
+ if (vma->detached) {
+ vma_end_read(vma);
+ count_vm_vma_lock_event(VMA_LOCK_MISS);
+ /* The area was replaced with another one */
+ goto retry;
+ }
+
+ rcu_read_unlock();
+ return vma;
+
+inval_end_read:
+ vma_end_read(vma);
+inval:
+ rcu_read_unlock();
+ count_vm_vma_lock_event(VMA_LOCK_ABORT);
+ return NULL;
+}
+#endif /* CONFIG_PER_VMA_LOCK */
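lock_vma_under_rcu() enables a fast path that handles most anonymous faults without touching mmap_lock at all. A sketch of how a fault handler might use it and fall back to the mmap_lock path when the lockless lookup or the fault itself cannot complete; FAULT_FLAG_VMA_LOCK and VMA_LOCK_SUCCESS are assumed to exist elsewhere in the per-VMA-lock series and are not shown in this hunk.

#include <linux/mm.h>

#ifdef CONFIG_PER_VMA_LOCK
/*
 * Sketch only: returns true if the fault was handled under the per-VMA
 * lock; false means the caller should retry under mmap_lock.
 */
static bool try_vma_locked_fault(struct mm_struct *mm, unsigned long address,
				 unsigned int flags, struct pt_regs *regs,
				 vm_fault_t *fault)
{
	struct vm_area_struct *vma;

	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		return false;

	*fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	/* On retry/completion the fault path released the VMA lock itself. */
	if (!(*fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);

	if (*fault & VM_FAULT_RETRY)
		return false;		/* fall back to the mmap_lock path */

	count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
	return true;
}
#endif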
+
#ifndef __PAGETABLE_P4D_FOLDED
/*
* Allocate p4d page table.
@@ -4632,13 +5444,13 @@ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
if (!new)
return -ENOMEM;
- smp_wmb(); /* See comment in __pte_alloc */
-
spin_lock(&mm->page_table_lock);
- if (pgd_present(*pgd)) /* Another has populated it */
+ if (pgd_present(*pgd)) { /* Another has populated it */
p4d_free(mm, new);
- else
+ } else {
+ smp_wmb(); /* See comment in pmd_install() */
pgd_populate(mm, pgd, new);
+ }
spin_unlock(&mm->page_table_lock);
return 0;
}
@@ -4655,11 +5467,10 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
if (!new)
return -ENOMEM;
- smp_wmb(); /* See comment in __pte_alloc */
-
spin_lock(&mm->page_table_lock);
if (!p4d_present(*p4d)) {
mm_inc_nr_puds(mm);
+ smp_wmb(); /* See comment in pmd_install() */
p4d_populate(mm, p4d, new);
} else /* Another has populated it */
pud_free(mm, new);
@@ -4680,22 +5491,42 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
if (!new)
return -ENOMEM;
- smp_wmb(); /* See comment in __pte_alloc */
-
ptl = pud_lock(mm, pud);
if (!pud_present(*pud)) {
mm_inc_nr_pmds(mm);
+ smp_wmb(); /* See comment in pmd_install() */
pud_populate(mm, pud, new);
- } else /* Another has populated it */
+ } else { /* Another has populated it */
pmd_free(mm, new);
+ }
spin_unlock(ptl);
return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */
-static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range,
- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+/**
+ * follow_pte - look up PTE at a user virtual address
+ * @mm: the mm_struct of the target address space
+ * @address: user virtual address
+ * @ptepp: location to store found PTE
+ * @ptlp: location to store the lock for the PTE
+ *
+ * On a successful return, the pointer to the PTE is stored in @ptepp;
+ * the corresponding lock is taken and its location is stored in @ptlp.
+ * The contents of the PTE are only stable until @ptlp is released;
+ * any further use, if any, must be protected against invalidation
+ * with MMU notifiers.
+ *
+ * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
+ * should be taken for read.
+ *
+ * KVM uses this function. While it is arguably less bad than ``follow_pfn``,
+ * it is not a good general-purpose API.
+ *
+ * Return: zero on success, -ve otherwise.
+ */
+int follow_pte(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, spinlock_t **ptlp)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -4718,73 +5549,19 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
pmd = pmd_offset(pud, address);
VM_BUG_ON(pmd_trans_huge(*pmd));
- if (pmd_huge(*pmd)) {
- if (!pmdpp)
- goto out;
-
- if (range) {
- mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
- NULL, mm, address & PMD_MASK,
- (address & PMD_MASK) + PMD_SIZE);
- mmu_notifier_invalidate_range_start(range);
- }
- *ptlp = pmd_lock(mm, pmd);
- if (pmd_huge(*pmd)) {
- *pmdpp = pmd;
- return 0;
- }
- spin_unlock(*ptlp);
- if (range)
- mmu_notifier_invalidate_range_end(range);
- }
-
- if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
- goto out;
-
- if (range) {
- mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
- address & PAGE_MASK,
- (address & PAGE_MASK) + PAGE_SIZE);
- mmu_notifier_invalidate_range_start(range);
- }
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
- if (!pte_present(*ptep))
+ if (!ptep)
+ goto out;
+ if (!pte_present(ptep_get(ptep)))
goto unlock;
*ptepp = ptep;
return 0;
unlock:
pte_unmap_unlock(ptep, *ptlp);
- if (range)
- mmu_notifier_invalidate_range_end(range);
out:
return -EINVAL;
}
-
-static inline int follow_pte(struct mm_struct *mm, unsigned long address,
- pte_t **ptepp, spinlock_t **ptlp)
-{
- int res;
-
- /* (void) is needed to make gcc happy */
- (void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, NULL,
- ptepp, NULL, ptlp)));
- return res;
-}
-
-int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range,
- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
-{
- int res;
-
- /* (void) is needed to make gcc happy */
- (void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, range,
- ptepp, pmdpp, ptlp)));
- return res;
-}
-EXPORT_SYMBOL(follow_pte_pmd);
+EXPORT_SYMBOL_GPL(follow_pte);
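The kerneldoc above stresses that the returned PTE is only stable while *ptlp is held. A minimal sketch of the expected calling pattern, assuming a hypothetical helper that resolves the PFN of an IO/PFN mapping (this is essentially what follow_pfn() below does internally):

#include <linux/mm.h>

/* Sketch only: read a PTE-derived value strictly under the PTE lock. */
static int read_mapping_pfn(struct vm_area_struct *vma, unsigned long address,
			    unsigned long *pfn)
{
	spinlock_t *ptl;
	pte_t *ptep;
	int ret;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		return -EINVAL;

	ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);	/* mmap_lock held for read */
	if (ret)
		return ret;

	*pfn = pte_pfn(ptep_get(ptep));		/* only stable under ptl */
	pte_unmap_unlock(ptep, ptl);
	return 0;
}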
/**
* follow_pfn - look up PFN at a user virtual address
@@ -4794,6 +5571,9 @@ EXPORT_SYMBOL(follow_pte_pmd);
*
* Only IO mappings and raw PFN mappings are allowed.
*
+ * This function does not allow the caller to read the permissions
+ * of the PTE. Do not use it.
+ *
* Return: zero and the pfn at @pfn on success, -ve otherwise.
*/
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
@@ -4809,7 +5589,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
if (ret)
return ret;
- *pfn = pte_pfn(*ptep);
+ *pfn = pte_pfn(ptep_get(ptep));
pte_unmap_unlock(ptep, ptl);
return 0;
}
@@ -4829,7 +5609,7 @@ int follow_phys(struct vm_area_struct *vma,
if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
goto out;
- pte = *ptep;
+ pte = ptep_get(ptep);
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
@@ -4844,72 +5624,126 @@ out:
return ret;
}
+/**
+ * generic_access_phys - generic implementation for iomem mmap access
+ * @vma: the vma to access
+ * @addr: userspace address, not relative offset within @vma
+ * @buf: buffer to read/write
+ * @len: length of transfer
+ * @write: set to FOLL_WRITE when writing, otherwise reading
+ *
+ * This is a generic implementation for &vm_operations_struct.access for an
+ * iomem mapping. This callback is used by access_process_vm() when the @vma is
+ * not page based.
+ */
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write)
{
resource_size_t phys_addr;
unsigned long prot = 0;
void __iomem *maddr;
- int offset = addr & (PAGE_SIZE-1);
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+ int offset = offset_in_page(addr);
+ int ret = -EINVAL;
+
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ return -EINVAL;
+
+retry:
+ if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
+ return -EINVAL;
+ pte = ptep_get(ptep);
+ pte_unmap_unlock(ptep, ptl);
- if (follow_phys(vma, addr, write, &prot, &phys_addr))
+ prot = pgprot_val(pte_pgprot(pte));
+ phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
+
+ if ((write & FOLL_WRITE) && !pte_write(pte))
return -EINVAL;
maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
if (!maddr)
return -ENOMEM;
+ if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
+ goto out_unmap;
+
+ if (!pte_same(pte, ptep_get(ptep))) {
+ pte_unmap_unlock(ptep, ptl);
+ iounmap(maddr);
+
+ goto retry;
+ }
+
if (write)
memcpy_toio(maddr + offset, buf, len);
else
memcpy_fromio(buf, maddr + offset, len);
+ ret = len;
+ pte_unmap_unlock(ptep, ptl);
+out_unmap:
iounmap(maddr);
- return len;
+ return ret;
}
EXPORT_SYMBOL_GPL(generic_access_phys);
#endif
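generic_access_phys() is meant to be wired into vm_operations_struct.access so that access_process_vm() can reach VM_IO/VM_PFNMAP mappings. A sketch of a hypothetical driver hooking it up; the ops, the mmap handler and the omitted remap_pfn_range() call are illustrative only.

#include <linux/fs.h>
#include <linux/mm.h>

/* Sketch only: expose an iomem mapping that ptrace() can peek/poke. */
static const struct vm_operations_struct my_iomem_vm_ops = {
	.access = generic_access_phys,
};

static int my_iomem_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	vma->vm_ops = &my_iomem_vm_ops;
	/* remap_pfn_range() of the device region omitted */
	return 0;
}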
/*
- * Access another process' address space as given in mm. If non-NULL, use the
- * given task for page fault accounting.
+ * Access another process' address space as given in mm.
*/
-int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long addr, void *buf, int len, unsigned int gup_flags)
+int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
+ int len, unsigned int gup_flags)
{
- struct vm_area_struct *vma;
void *old_buf = buf;
int write = gup_flags & FOLL_WRITE;
if (mmap_read_lock_killable(mm))
return 0;
+ /* Untag the address before looking up the VMA */
+ addr = untagged_addr_remote(mm, addr);
+
+ /* Avoid triggering the temporary warning in __get_user_pages */
+ if (!vma_lookup(mm, addr) && !expand_stack(mm, addr))
+ return 0;
+
/* ignore errors, just check how much was successfully transferred */
while (len) {
- int bytes, ret, offset;
+ int bytes, offset;
void *maddr;
- struct page *page = NULL;
+ struct vm_area_struct *vma = NULL;
+ struct page *page = get_user_page_vma_remote(mm, addr,
+ gup_flags, &vma);
+
+ if (IS_ERR_OR_NULL(page)) {
+ /* We might need to expand the stack to access it */
+ vma = vma_lookup(mm, addr);
+ if (!vma) {
+ vma = expand_stack(mm, addr);
+
+ /* mmap_lock was dropped on failure */
+ if (!vma)
+ return buf - old_buf;
+
+ /* Try again if stack expansion worked */
+ continue;
+ }
+
- ret = get_user_pages_remote(mm, addr, 1,
- gup_flags, &page, &vma, NULL);
- if (ret <= 0) {
-#ifndef CONFIG_HAVE_IOREMAP_PROT
- break;
-#else
/*
* Check if this is a VM_IO | VM_PFNMAP VMA, which
* we can access using slightly different code.
*/
- vma = find_vma(mm, addr);
- if (!vma || vma->vm_start > addr)
- break;
+ bytes = 0;
+#ifdef CONFIG_HAVE_IOREMAP_PROT
if (vma->vm_ops && vma->vm_ops->access)
- ret = vma->vm_ops->access(vma, addr, buf,
- len, write);
- if (ret <= 0)
- break;
- bytes = ret;
+ bytes = vma->vm_ops->access(vma, addr, buf,
+ len, write);
#endif
+ if (bytes <= 0)
+ break;
} else {
bytes = len;
offset = addr & (PAGE_SIZE-1);
@@ -4952,7 +5786,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, unsigned int gup_flags)
{
- return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
+ return __access_remote_vm(mm, addr, buf, len, gup_flags);
}
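For completeness, a short sketch of a caller: this is the shape of the ptrace()/ /proc/<pid>/mem style readers built on top of these helpers. The peek_remote() wrapper is invented for illustration.

#include <linux/sched/mm.h>

/* Sketch only: read @len bytes from another task's address space. */
static int peek_remote(struct task_struct *tsk, unsigned long addr,
		       void *buf, int len)
{
	struct mm_struct *mm = get_task_mm(tsk);
	int copied;

	if (!mm)
		return -ESRCH;
	copied = access_remote_vm(mm, addr, buf, len, 0);	/* 0: read-only */
	mmput(mm);
	return copied;
}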
/*
@@ -4970,7 +5804,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr,
if (!mm)
return 0;
- ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
+ ret = __access_remote_vm(mm, addr, buf, len, gup_flags);
mmput(mm);
@@ -5014,17 +5848,9 @@ void print_vma_addr(char *prefix, unsigned long ip)
#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void __might_fault(const char *file, int line)
{
- /*
- * Some code (nfs/sunrpc) uses socket ops on kernel memory while
- * holding the mmap_lock, this is safe because kernel memory doesn't
- * get paged out, therefore we'll never actually fault, and the
- * below annotations will generate false positives.
- */
- if (uaccess_kernel())
- return;
if (pagefault_disabled())
return;
- __might_sleep(file, line, 0);
+ __might_sleep(file, line);
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
if (current->mm)
might_lock_read(&current->mm->mmap_lock);
@@ -5039,12 +5865,12 @@ EXPORT_SYMBOL(__might_fault);
* operation. The target subpage will be processed last to keep its
* cache lines hot.
*/
-static inline void process_huge_page(
+static inline int process_huge_page(
unsigned long addr_hint, unsigned int pages_per_huge_page,
- void (*process_subpage)(unsigned long addr, int idx, void *arg),
+ int (*process_subpage)(unsigned long addr, int idx, void *arg),
void *arg)
{
- int i, n, base, l;
+ int i, n, base, l, ret;
unsigned long addr = addr_hint &
~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
@@ -5058,7 +5884,9 @@ static inline void process_huge_page(
/* Process subpages at the end of huge page */
for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
cond_resched();
- process_subpage(addr + i * PAGE_SIZE, i, arg);
+ ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
+ if (ret)
+ return ret;
}
} else {
/* If target subpage in second half of huge page */
@@ -5067,7 +5895,9 @@ static inline void process_huge_page(
/* Process subpages at the begin of huge page */
for (i = 0; i < base; i++) {
cond_resched();
- process_subpage(addr + i * PAGE_SIZE, i, arg);
+ ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
+ if (ret)
+ return ret;
}
}
/*
@@ -5079,10 +5909,15 @@ static inline void process_huge_page(
int right_idx = base + 2 * l - 1 - i;
cond_resched();
- process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
+ ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
+ if (ret)
+ return ret;
cond_resched();
- process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
+ ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
+ if (ret)
+ return ret;
}
+ return 0;
}
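The visiting order is easier to see with concrete numbers. A small stand-alone user-space demo (plain C, compiles with any compiler) mirrors the index arithmetic above; the 8-subpage "huge page" and target index 5 are made-up inputs, and the expected output is 0 1 2 7 3 6 4 5, with the target visited last.

#include <stdio.h>

/* Demo only: print the order process_huge_page() visits subpage indices. */
static void visit_order(int pages, int target)
{
	int i, base, l;

	if (2 * target <= pages) {
		base = 0;
		l = target;
		for (i = pages - 1; i >= 2 * target; i--)	/* tail first */
			printf("%d ", i);
	} else {
		base = pages - 2 * (pages - target);
		l = pages - target;
		for (i = 0; i < base; i++)			/* head first */
			printf("%d ", i);
	}
	for (i = 0; i < l; i++) {				/* converge on the target */
		printf("%d ", base + i);
		printf("%d ", base + 2 * l - 1 - i);
	}
	printf("\n");
}

int main(void)
{
	visit_order(8, 5);
	return 0;
}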
static void clear_gigantic_page(struct page *page,
@@ -5090,21 +5925,22 @@ static void clear_gigantic_page(struct page *page,
unsigned int pages_per_huge_page)
{
int i;
- struct page *p = page;
+ struct page *p;
might_sleep();
- for (i = 0; i < pages_per_huge_page;
- i++, p = mem_map_next(p, page, i)) {
+ for (i = 0; i < pages_per_huge_page; i++) {
+ p = nth_page(page, i);
cond_resched();
clear_user_highpage(p, addr + i * PAGE_SIZE);
}
}
-static void clear_subpage(unsigned long addr, int idx, void *arg)
+static int clear_subpage(unsigned long addr, int idx, void *arg)
{
struct page *page = arg;
clear_user_highpage(page + idx, addr);
+ return 0;
}
void clear_huge_page(struct page *page,
@@ -5121,23 +5957,27 @@ void clear_huge_page(struct page *page,
process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
}
-static void copy_user_gigantic_page(struct page *dst, struct page *src,
- unsigned long addr,
- struct vm_area_struct *vma,
- unsigned int pages_per_huge_page)
+static int copy_user_gigantic_page(struct folio *dst, struct folio *src,
+ unsigned long addr,
+ struct vm_area_struct *vma,
+ unsigned int pages_per_huge_page)
{
int i;
- struct page *dst_base = dst;
- struct page *src_base = src;
+ struct page *dst_page;
+ struct page *src_page;
- for (i = 0; i < pages_per_huge_page; ) {
- cond_resched();
- copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+ for (i = 0; i < pages_per_huge_page; i++) {
+ dst_page = folio_page(dst, i);
+ src_page = folio_page(src, i);
- i++;
- dst = mem_map_next(dst, dst_base, i);
- src = mem_map_next(src, src_base, i);
+ cond_resched();
+ if (copy_mc_user_highpage(dst_page, src_page,
+ addr + i*PAGE_SIZE, vma)) {
+ memory_failure_queue(page_to_pfn(src_page), 0);
+ return -EHWPOISON;
+ }
}
+ return 0;
}
struct copy_subpage_arg {
@@ -5146,62 +5986,63 @@ struct copy_subpage_arg {
struct vm_area_struct *vma;
};
-static void copy_subpage(unsigned long addr, int idx, void *arg)
+static int copy_subpage(unsigned long addr, int idx, void *arg)
{
struct copy_subpage_arg *copy_arg = arg;
- copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
- addr, copy_arg->vma);
+ if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
+ addr, copy_arg->vma)) {
+ memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0);
+ return -EHWPOISON;
+ }
+ return 0;
}
-void copy_user_huge_page(struct page *dst, struct page *src,
- unsigned long addr_hint, struct vm_area_struct *vma,
- unsigned int pages_per_huge_page)
+int copy_user_large_folio(struct folio *dst, struct folio *src,
+ unsigned long addr_hint, struct vm_area_struct *vma)
{
+ unsigned int pages_per_huge_page = folio_nr_pages(dst);
unsigned long addr = addr_hint &
~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
struct copy_subpage_arg arg = {
- .dst = dst,
- .src = src,
+ .dst = &dst->page,
+ .src = &src->page,
.vma = vma,
};
- if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
- copy_user_gigantic_page(dst, src, addr, vma,
- pages_per_huge_page);
- return;
- }
+ if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES))
+ return copy_user_gigantic_page(dst, src, addr, vma,
+ pages_per_huge_page);
- process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
+ return process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
}
-long copy_huge_page_from_user(struct page *dst_page,
- const void __user *usr_src,
- unsigned int pages_per_huge_page,
- bool allow_pagefault)
+long copy_folio_from_user(struct folio *dst_folio,
+ const void __user *usr_src,
+ bool allow_pagefault)
{
- void *src = (void *)usr_src;
- void *page_kaddr;
+ void *kaddr;
unsigned long i, rc = 0;
- unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
-
- for (i = 0; i < pages_per_huge_page; i++) {
- if (allow_pagefault)
- page_kaddr = kmap(dst_page + i);
- else
- page_kaddr = kmap_atomic(dst_page + i);
- rc = copy_from_user(page_kaddr,
- (const void __user *)(src + i * PAGE_SIZE),
- PAGE_SIZE);
- if (allow_pagefault)
- kunmap(dst_page + i);
- else
- kunmap_atomic(page_kaddr);
+ unsigned int nr_pages = folio_nr_pages(dst_folio);
+ unsigned long ret_val = nr_pages * PAGE_SIZE;
+ struct page *subpage;
+
+ for (i = 0; i < nr_pages; i++) {
+ subpage = folio_page(dst_folio, i);
+ kaddr = kmap_local_page(subpage);
+ if (!allow_pagefault)
+ pagefault_disable();
+ rc = copy_from_user(kaddr, usr_src + i * PAGE_SIZE, PAGE_SIZE);
+ if (!allow_pagefault)
+ pagefault_enable();
+ kunmap_local(kaddr);
ret_val -= (PAGE_SIZE - rc);
if (rc)
break;
+ flush_dcache_page(subpage);
+
cond_resched();
}
return ret_val;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 8e9e2d44cdad..3f231cf1b410 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -13,7 +13,6 @@
#include <linux/pagemap.h>
#include <linux/compiler.h>
#include <linux/export.h>
-#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
@@ -21,7 +20,6 @@
#include <linux/memory.h>
#include <linux/memremap.h>
#include <linux/memory_hotplug.h>
-#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
@@ -36,12 +34,99 @@
#include <linux/memblock.h>
#include <linux/compaction.h>
#include <linux/rmap.h>
+#include <linux/module.h>
#include <asm/tlbflush.h>
#include "internal.h"
#include "shuffle.h"
+#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
+/*
+ * memory_hotplug.memmap_on_memory parameter
+ */
+static bool memmap_on_memory __ro_after_init;
+module_param(memmap_on_memory, bool, 0444);
+MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
+
+static inline bool mhp_memmap_on_memory(void)
+{
+ return memmap_on_memory;
+}
+#else
+static inline bool mhp_memmap_on_memory(void)
+{
+ return false;
+}
+#endif
+
+enum {
+ ONLINE_POLICY_CONTIG_ZONES = 0,
+ ONLINE_POLICY_AUTO_MOVABLE,
+};
+
+static const char * const online_policy_to_str[] = {
+ [ONLINE_POLICY_CONTIG_ZONES] = "contig-zones",
+ [ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable",
+};
+
+static int set_online_policy(const char *val, const struct kernel_param *kp)
+{
+ int ret = sysfs_match_string(online_policy_to_str, val);
+
+ if (ret < 0)
+ return ret;
+ *((int *)kp->arg) = ret;
+ return 0;
+}
+
+static int get_online_policy(char *buffer, const struct kernel_param *kp)
+{
+ return sprintf(buffer, "%s\n", online_policy_to_str[*((int *)kp->arg)]);
+}
+
+/*
+ * memory_hotplug.online_policy: configure online behavior when onlining without
+ * specifying a zone (MMOP_ONLINE)
+ *
+ * "contig-zones": keep zone contiguous
+ * "auto-movable": online memory to ZONE_MOVABLE if the configuration
+ * (auto_movable_ratio, auto_movable_numa_aware) allows for it
+ */
+static int online_policy __read_mostly = ONLINE_POLICY_CONTIG_ZONES;
+static const struct kernel_param_ops online_policy_ops = {
+ .set = set_online_policy,
+ .get = get_online_policy,
+};
+module_param_cb(online_policy, &online_policy_ops, &online_policy, 0644);
+MODULE_PARM_DESC(online_policy,
+ "Set the online policy (\"contig-zones\", \"auto-movable\") "
+ "Default: \"contig-zones\"");
+
+/*
+ * memory_hotplug.auto_movable_ratio: specify maximum MOVABLE:KERNEL ratio
+ *
+ * The ratio represents an upper limit and the kernel might decide to not
+ * online some memory to ZONE_MOVABLE -- e.g., because hotplugged KERNEL memory
+ * doesn't allow for more MOVABLE memory.
+ */
+static unsigned int auto_movable_ratio __read_mostly = 301;
+module_param(auto_movable_ratio, uint, 0644);
+MODULE_PARM_DESC(auto_movable_ratio,
+ "Set the maximum ratio of MOVABLE:KERNEL memory in the system "
+ "in percent for \"auto-movable\" online policy. Default: 301");
+
+/*
+ * memory_hotplug.auto_movable_numa_aware: consider numa node stats
+ */
+#ifdef CONFIG_NUMA
+static bool auto_movable_numa_aware __read_mostly = true;
+module_param(auto_movable_numa_aware, bool, 0644);
+MODULE_PARM_DESC(auto_movable_numa_aware,
+ "Consider numa node stats in addition to global stats in "
+ "\"auto-movable\" online policy. Default: true");
+#endif /* CONFIG_NUMA */
+
/*
* online_page_callback contains pointer to current page onlining function.
* Initially it is generic_online_page(). If it is required it could be
@@ -67,17 +152,17 @@ void put_online_mems(void)
bool movable_node_enabled = false;
#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
-int memhp_default_online_type = MMOP_OFFLINE;
+int mhp_default_online_type = MMOP_OFFLINE;
#else
-int memhp_default_online_type = MMOP_ONLINE;
+int mhp_default_online_type = MMOP_ONLINE;
#endif
static int __init setup_memhp_default_state(char *str)
{
- const int online_type = memhp_online_type_from_str(str);
+ const int online_type = mhp_online_type_from_str(str);
if (online_type >= 0)
- memhp_default_online_type = online_type;
+ mhp_default_online_type = online_type;
return 1;
}
@@ -105,7 +190,10 @@ static struct resource *register_memory_resource(u64 start, u64 size,
unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
if (strcmp(resource_name, "System RAM"))
- flags |= IORESOURCE_MEM_DRIVER_MANAGED;
+ flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED;
+
+ if (!mhp_range_allowed(start, size, true))
+ return ERR_PTR(-E2BIG);
/*
* Make sure value parsed from 'mem=' only restricts memory adding
@@ -140,125 +228,7 @@ static void release_memory_resource(struct resource *res)
kfree(res);
}
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
-void get_page_bootmem(unsigned long info, struct page *page,
- unsigned long type)
-{
- page->freelist = (void *)type;
- SetPagePrivate(page);
- set_page_private(page, info);
- page_ref_inc(page);
-}
-
-void put_page_bootmem(struct page *page)
-{
- unsigned long type;
-
- type = (unsigned long) page->freelist;
- BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
- type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
-
- if (page_ref_dec_return(page) == 1) {
- page->freelist = NULL;
- ClearPagePrivate(page);
- set_page_private(page, 0);
- INIT_LIST_HEAD(&page->lru);
- free_reserved_page(page);
- }
-}
-
-#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
-static void register_page_bootmem_info_section(unsigned long start_pfn)
-{
- unsigned long mapsize, section_nr, i;
- struct mem_section *ms;
- struct page *page, *memmap;
- struct mem_section_usage *usage;
-
- section_nr = pfn_to_section_nr(start_pfn);
- ms = __nr_to_section(section_nr);
-
- /* Get section's memmap address */
- memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
-
- /*
- * Get page for the memmap's phys address
- * XXX: need more consideration for sparse_vmemmap...
- */
- page = virt_to_page(memmap);
- mapsize = sizeof(struct page) * PAGES_PER_SECTION;
- mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
-
- /* remember memmap's page */
- for (i = 0; i < mapsize; i++, page++)
- get_page_bootmem(section_nr, page, SECTION_INFO);
-
- usage = ms->usage;
- page = virt_to_page(usage);
-
- mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
-
- for (i = 0; i < mapsize; i++, page++)
- get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
-
-}
-#else /* CONFIG_SPARSEMEM_VMEMMAP */
-static void register_page_bootmem_info_section(unsigned long start_pfn)
-{
- unsigned long mapsize, section_nr, i;
- struct mem_section *ms;
- struct page *page, *memmap;
- struct mem_section_usage *usage;
-
- section_nr = pfn_to_section_nr(start_pfn);
- ms = __nr_to_section(section_nr);
-
- memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
-
- register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
-
- usage = ms->usage;
- page = virt_to_page(usage);
-
- mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
-
- for (i = 0; i < mapsize; i++, page++)
- get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
-}
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
-
-void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
-{
- unsigned long i, pfn, end_pfn, nr_pages;
- int node = pgdat->node_id;
- struct page *page;
-
- nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
- page = virt_to_page(pgdat);
-
- for (i = 0; i < nr_pages; i++, page++)
- get_page_bootmem(node, page, NODE_INFO);
-
- pfn = pgdat->node_start_pfn;
- end_pfn = pgdat_end_pfn(pgdat);
-
- /* register section info */
- for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
- /*
- * Some platforms can assign the same pfn to multiple nodes - on
- * node0 as well as nodeN. To avoid registering a pfn against
- * multiple nodes we check that this pfn does not already
- * reside in some other nodes.
- */
- if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
- register_page_bootmem_info_section(pfn);
- }
-}
-#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
-
-static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
- const char *reason)
+static int check_pfn_span(unsigned long pfn, unsigned long nr_pages)
{
/*
* Disallow all operations smaller than a sub-section and only
@@ -275,37 +245,59 @@ static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
min_align = PAGES_PER_SUBSECTION;
else
min_align = PAGES_PER_SECTION;
- if (!IS_ALIGNED(pfn, min_align)
- || !IS_ALIGNED(nr_pages, min_align)) {
- WARN(1, "Misaligned __%s_pages start: %#lx end: #%lx\n",
- reason, pfn, pfn + nr_pages - 1);
+ if (!IS_ALIGNED(pfn | nr_pages, min_align))
return -EINVAL;
- }
return 0;
}
-static int check_hotplug_memory_addressable(unsigned long pfn,
- unsigned long nr_pages)
+/*
+ * Return the page for a valid pfn only if the page is online. All pfn
+ * walkers which rely on the fully initialized page->flags and others
+ * should use this rather than pfn_valid && pfn_to_page.
+ */
+struct page *pfn_to_online_page(unsigned long pfn)
{
- const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1;
+ unsigned long nr = pfn_to_section_nr(pfn);
+ struct dev_pagemap *pgmap;
+ struct mem_section *ms;
- if (max_addr >> MAX_PHYSMEM_BITS) {
- const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1;
- WARN(1,
- "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n",
- (u64)PFN_PHYS(pfn), max_addr, max_allowed);
- return -E2BIG;
- }
+ if (nr >= NR_MEM_SECTIONS)
+ return NULL;
- return 0;
+ ms = __nr_to_section(nr);
+ if (!online_section(ms))
+ return NULL;
+
+ /*
+ * Save some code text when online_section() +
+ * pfn_section_valid() are sufficient.
+ */
+ if (IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) && !pfn_valid(pfn))
+ return NULL;
+
+ if (!pfn_section_valid(ms, pfn))
+ return NULL;
+
+ if (!online_device_section(ms))
+ return pfn_to_page(pfn);
+
+ /*
+ * Slowpath: when ZONE_DEVICE collides with
+ * ZONE_{NORMAL,MOVABLE} within the same section some pfns in
+ * the section may be 'offline' but 'valid'. Only
+ * get_dev_pagemap() can determine sub-section online status.
+ */
+ pgmap = get_dev_pagemap(pfn, NULL);
+ put_dev_pagemap(pgmap);
+
+ /* The presence of a pgmap indicates ZONE_DEVICE offline pfn */
+ if (pgmap)
+ return NULL;
+
+ return pfn_to_page(pfn);
}
+EXPORT_SYMBOL_GPL(pfn_to_online_page);
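pfn_to_online_page() is the helper pfn walkers should use instead of the bare pfn_valid()/pfn_to_page() pair. A minimal sketch of such a walker; the function name and what it counts are invented for illustration.

#include <linux/memory_hotplug.h>
#include <linux/mmzone.h>

/* Sketch only: walk a pfn range without touching offline/ZONE_DEVICE memmaps. */
static unsigned long count_online_pfns(unsigned long start_pfn,
				       unsigned long nr_pages)
{
	unsigned long pfn, online = 0;

	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
		struct page *page = pfn_to_online_page(pfn);

		if (!page)
			continue;	/* invalid, offline or ZONE_DEVICE-only */
		online++;
	}
	return online;
}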
-/*
- * Reasonably generic function for adding memory. It is
- * expected that archs that support memory hotplug will
- * call this function after deciding the zone to which to
- * add the new pages.
- */
int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
struct mhp_params *params)
{
@@ -314,12 +306,10 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
int err;
struct vmem_altmap *altmap = params->altmap;
- if (WARN_ON_ONCE(!params->pgprot.pgprot))
+ if (WARN_ON_ONCE(!pgprot_val(params->pgprot)))
return -EINVAL;
- err = check_hotplug_memory_addressable(pfn, nr_pages);
- if (err)
- return err;
+ VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false));
if (altmap) {
/*
@@ -333,15 +323,17 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
altmap->alloc = 0;
}
- err = check_pfn_span(pfn, nr_pages, "add");
- if (err)
- return err;
+ if (check_pfn_span(pfn, nr_pages)) {
+ WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1);
+ return -EINVAL;
+ }
for (; pfn < end_pfn; pfn += cur_nr_pages) {
/* Select all remaining pages up to the next section boundary */
cur_nr_pages = min(end_pfn - pfn,
SECTION_ALIGN_UP(pfn + 1) - pfn);
- err = sparse_add_section(nid, pfn, cur_nr_pages, altmap);
+ err = sparse_add_section(nid, pfn, cur_nr_pages, altmap,
+ params->pgmap);
if (err)
break;
cond_resched();
@@ -350,24 +342,6 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
return err;
}
-#ifdef CONFIG_NUMA
-int __weak memory_add_physaddr_to_nid(u64 start)
-{
- pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n",
- start);
- return 0;
-}
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
-
-int __weak phys_to_target_node(u64 start)
-{
- pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
- start);
- return 0;
-}
-EXPORT_SYMBOL_GPL(phys_to_target_node);
-#endif
-
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
@@ -420,7 +394,6 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
unsigned long pfn;
int nid = zone_to_nid(zone);
- zone_span_writelock(zone);
if (zone->zone_start_pfn == start_pfn) {
/*
* If the section is the smallest section in the zone, it needs
@@ -453,7 +426,6 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
zone->spanned_pages = 0;
}
}
- zone_span_writeunlock(zone);
}
static void update_pgdat_span(struct pglist_data *pgdat)
@@ -463,20 +435,19 @@ static void update_pgdat_span(struct pglist_data *pgdat)
for (zone = pgdat->node_zones;
zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
- unsigned long zone_end_pfn = zone->zone_start_pfn +
- zone->spanned_pages;
+ unsigned long end_pfn = zone_end_pfn(zone);
/* No need to lock the zones, they can't change. */
if (!zone->spanned_pages)
continue;
if (!node_end_pfn) {
node_start_pfn = zone->zone_start_pfn;
- node_end_pfn = zone_end_pfn;
+ node_end_pfn = end_pfn;
continue;
}
- if (zone_end_pfn > node_end_pfn)
- node_end_pfn = zone_end_pfn;
+ if (end_pfn > node_end_pfn)
+ node_end_pfn = end_pfn;
if (zone->zone_start_pfn < node_start_pfn)
node_start_pfn = zone->zone_start_pfn;
}
@@ -491,7 +462,7 @@ void __ref remove_pfn_range_from_zone(struct zone *zone,
{
const unsigned long end_pfn = start_pfn + nr_pages;
struct pglist_data *pgdat = zone->zone_pgdat;
- unsigned long pfn, cur_nr_pages, flags;
+ unsigned long pfn, cur_nr_pages;
/* Poison struct pages because they are now uninitialized again. */
for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
@@ -504,38 +475,22 @@ void __ref remove_pfn_range_from_zone(struct zone *zone,
sizeof(struct page) * cur_nr_pages);
}
-#ifdef CONFIG_ZONE_DEVICE
/*
* Zone shrinking code cannot properly deal with ZONE_DEVICE. So
* we will not try to shrink the zones - which is okay as
* set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
*/
- if (zone_idx(zone) == ZONE_DEVICE)
+ if (zone_is_zone_device(zone))
return;
-#endif
clear_zone_contiguous(zone);
- pgdat_resize_lock(zone->zone_pgdat, &flags);
shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
update_pgdat_span(pgdat);
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
set_zone_contiguous(zone);
}
-static void __remove_section(unsigned long pfn, unsigned long nr_pages,
- unsigned long map_offset,
- struct vmem_altmap *altmap)
-{
- struct mem_section *ms = __pfn_to_section(pfn);
-
- if (WARN_ON_ONCE(!valid_section(ms)))
- return;
-
- sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
-}
-
/**
* __remove_pages() - remove sections of pages
* @pfn: starting pageframe (must be aligned to start of a section)
@@ -552,20 +507,18 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages,
{
const unsigned long end_pfn = pfn + nr_pages;
unsigned long cur_nr_pages;
- unsigned long map_offset = 0;
-
- map_offset = vmem_altmap_offset(altmap);
- if (check_pfn_span(pfn, nr_pages, "remove"))
+ if (check_pfn_span(pfn, nr_pages)) {
+ WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1);
return;
+ }
for (; pfn < end_pfn; pfn += cur_nr_pages) {
cond_resched();
/* Select all remaining pages up to the next section boundary */
cur_nr_pages = min(end_pfn - pfn,
SECTION_ALIGN_UP(pfn + 1) - pfn);
- __remove_section(pfn, cur_nr_pages, map_offset, altmap);
- map_offset = 0;
+ sparse_remove_section(pfn, cur_nr_pages, altmap);
}
}
@@ -614,42 +567,46 @@ void generic_online_page(struct page *page, unsigned int order)
* so we should map it first. This is better than introducing a special
* case in page freeing fast path.
*/
- if (debug_pagealloc_enabled_static())
- kernel_map_pages(page, 1 << order, 1);
+ debug_pagealloc_map_pages(page, 1 << order);
__free_pages_core(page, order);
totalram_pages_add(1UL << order);
-#ifdef CONFIG_HIGHMEM
- if (PageHighMem(page))
- totalhigh_pages_add(1UL << order);
-#endif
}
EXPORT_SYMBOL_GPL(generic_online_page);
-static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
- void *arg)
+static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
{
const unsigned long end_pfn = start_pfn + nr_pages;
unsigned long pfn;
- int order;
/*
- * Online the pages. The callback might decide to keep some pages
- * PG_reserved (to add them to the buddy later), but we still account
- * them as being online/belonging to this zone ("present").
+ * Online the pages in MAX_ORDER aligned chunks. The callback might
+ * decide to not expose all pages to the buddy (e.g., expose them
+ * later). We account all pages as being online and belonging to this
+ * zone ("present").
+ * When using memmap_on_memory, the range might not be aligned to
+ * MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect
+ * this and the first chunk to online will be pageblock_nr_pages.
*/
- for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) {
- order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn)));
- /* __free_pages_core() wants pfns to be aligned to the order */
- if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order)))
- order = 0;
+ for (pfn = start_pfn; pfn < end_pfn;) {
+ int order;
+
+ /*
+ * Free the pages to be onlined in the largest chunks that
+ * alignment allows.
+ *
+ * __ffs() behaviour is undefined for 0. start == 0 is
+ * MAX_ORDER-aligned; set order to MAX_ORDER in that case.
+ */
+ if (pfn)
+ order = min_t(int, MAX_ORDER, __ffs(pfn));
+ else
+ order = MAX_ORDER;
+
(*online_page_callback)(pfn_to_page(pfn), order);
+ pfn += (1UL << order);
}
/* mark all involved sections as online */
online_mem_sections(start_pfn, end_pfn);
-
- *(unsigned long *)arg += nr_pages;
- return 0;
}
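The chunking is clearer with numbers. A stand-alone user-space demo of the __ffs()-driven order selection above; MAX_ORDER == 10 and the example range (a 128 MiB block whose first 2 MiB hold the vmemmap, so onlining starts pageblock- but not MAX_ORDER-aligned) are assumptions for illustration.

#include <stdio.h>

#define MAX_ORDER	10	/* assumed; the real value is config dependent */

/* Demo only: mirror the chunk-size selection in online_pages_range(). */
int main(void)
{
	unsigned long pfn = 0x88200;	/* pageblock (order-9) aligned start */
	unsigned long end = 0x90000;

	while (pfn < end) {
		int order = pfn ? __builtin_ctzl(pfn) : MAX_ORDER;

		if (order > MAX_ORDER)
			order = MAX_ORDER;
		printf("pfn %#lx: online a 2^%d page chunk\n", pfn, order);
		pfn += 1UL << order;
	}
	return 0;
}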
/* check which state of node_states will be changed when online memory */
@@ -660,16 +617,11 @@ static void node_states_check_changes_online(unsigned long nr_pages,
arg->status_change_nid = NUMA_NO_NODE;
arg->status_change_nid_normal = NUMA_NO_NODE;
- arg->status_change_nid_high = NUMA_NO_NODE;
if (!node_state(nid, N_MEMORY))
arg->status_change_nid = nid;
if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
arg->status_change_nid_normal = nid;
-#ifdef CONFIG_HIGHMEM
- if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
- arg->status_change_nid_high = nid;
-#endif
}
static void node_states_set_node(int node, struct memory_notify *arg)
@@ -677,9 +629,6 @@ static void node_states_set_node(int node, struct memory_notify *arg)
if (arg->status_change_nid_normal >= 0)
node_set_state(node, N_NORMAL_MEMORY);
- if (arg->status_change_nid_high >= 0)
- node_set_state(node, N_HIGH_MEMORY);
-
if (arg->status_change_nid >= 0)
node_set_state(node, N_MEMORY);
}
@@ -706,29 +655,55 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
}
+
+#ifdef CONFIG_ZONE_DEVICE
+static void section_taint_zone_device(unsigned long pfn)
+{
+ struct mem_section *ms = __pfn_to_section(pfn);
+
+ ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE;
+}
+#else
+static inline void section_taint_zone_device(unsigned long pfn)
+{
+}
+#endif
+
/*
* Associate the pfn range with the given zone, initializing the memmaps
* and resizing the pgdat/zone data to span the added pages. After this
* call, all affected pages are PG_reserved.
+ *
+ * All aligned pageblocks are initialized to the specified migratetype
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
*/
void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap)
+ unsigned long nr_pages,
+ struct vmem_altmap *altmap, int migratetype)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nid = pgdat->node_id;
- unsigned long flags;
clear_zone_contiguous(zone);
- /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
- pgdat_resize_lock(pgdat, &flags);
- zone_span_writelock(zone);
if (zone_is_empty(zone))
init_currently_empty_zone(zone, start_pfn, nr_pages);
resize_zone_range(zone, start_pfn, nr_pages);
- zone_span_writeunlock(zone);
resize_pgdat_range(pgdat, start_pfn, nr_pages);
- pgdat_resize_unlock(pgdat, &flags);
+
+ /*
+ * Subsection population requires care in pfn_to_online_page().
+ * Set the taint to enable the slow path detection of
+ * ZONE_DEVICE pages in an otherwise ZONE_{NORMAL,MOVABLE}
+ * section.
+ */
+ if (zone_is_zone_device(zone)) {
+ if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION))
+ section_taint_zone_device(start_pfn);
+ if (!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))
+ section_taint_zone_device(start_pfn + nr_pages);
+ }
/*
* TODO now we have a visible range of pages which are not associated
@@ -736,12 +711,115 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
* expects the zone spans the pfn range. All the pages in the range
* are reserved so nobody should be touching them so we should be safe
*/
- memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn,
- MEMINIT_HOTPLUG, altmap);
+ memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0,
+ MEMINIT_HOTPLUG, altmap, migratetype);
set_zone_contiguous(zone);
}
+struct auto_movable_stats {
+ unsigned long kernel_early_pages;
+ unsigned long movable_pages;
+};
+
+static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
+ struct zone *zone)
+{
+ if (zone_idx(zone) == ZONE_MOVABLE) {
+ stats->movable_pages += zone->present_pages;
+ } else {
+ stats->kernel_early_pages += zone->present_early_pages;
+#ifdef CONFIG_CMA
+ /*
+ * CMA pages (never on hotplugged memory) behave like
+ * ZONE_MOVABLE.
+ */
+ stats->movable_pages += zone->cma_pages;
+ stats->kernel_early_pages -= zone->cma_pages;
+#endif /* CONFIG_CMA */
+ }
+}
+struct auto_movable_group_stats {
+ unsigned long movable_pages;
+ unsigned long req_kernel_early_pages;
+};
+
+static int auto_movable_stats_account_group(struct memory_group *group,
+ void *arg)
+{
+ const int ratio = READ_ONCE(auto_movable_ratio);
+ struct auto_movable_group_stats *stats = arg;
+ long pages;
+
+ /*
+ * We don't support modifying the config while the auto-movable online
+ * policy is already enabled. Just avoid the division by zero below.
+ */
+ if (!ratio)
+ return 0;
+
+ /*
+ * Calculate how many early kernel pages this group requires to
+ * satisfy the configured zone ratio.
+ */
+ pages = group->present_movable_pages * 100 / ratio;
+ pages -= group->present_kernel_pages;
+
+ if (pages > 0)
+ stats->req_kernel_early_pages += pages;
+ stats->movable_pages += group->present_movable_pages;
+ return 0;
+}
+
+static bool auto_movable_can_online_movable(int nid, struct memory_group *group,
+ unsigned long nr_pages)
+{
+ unsigned long kernel_early_pages, movable_pages;
+ struct auto_movable_group_stats group_stats = {};
+ struct auto_movable_stats stats = {};
+ pg_data_t *pgdat = NODE_DATA(nid);
+ struct zone *zone;
+ int i;
+
+ /* Walk all relevant zones and collect MOVABLE vs. KERNEL stats. */
+ if (nid == NUMA_NO_NODE) {
+ /* TODO: cache values */
+ for_each_populated_zone(zone)
+ auto_movable_stats_account_zone(&stats, zone);
+ } else {
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ zone = pgdat->node_zones + i;
+ if (populated_zone(zone))
+ auto_movable_stats_account_zone(&stats, zone);
+ }
+ }
+
+ kernel_early_pages = stats.kernel_early_pages;
+ movable_pages = stats.movable_pages;
+
+ /*
+ * Kernel memory inside dynamic memory group allows for more MOVABLE
+ * memory within the same group. Remove the effect of all but the
+ * current group from the stats.
+ */
+ walk_dynamic_memory_groups(nid, auto_movable_stats_account_group,
+ group, &group_stats);
+ if (kernel_early_pages <= group_stats.req_kernel_early_pages)
+ return false;
+ kernel_early_pages -= group_stats.req_kernel_early_pages;
+ movable_pages -= group_stats.movable_pages;
+
+ if (group && group->is_dynamic)
+ kernel_early_pages += group->present_kernel_pages;
+
+ /*
+ * Test if we could online the given number of pages to ZONE_MOVABLE
+ * and still stay in the configured ratio.
+ */
+ movable_pages += nr_pages;
+ return movable_pages <= (auto_movable_ratio * kernel_early_pages) / 100;
+}
+
/*
* Returns a default kernel memory zone for the given pfn range.
* If no kernel zone covers this pfn range it will automatically go
@@ -753,7 +831,7 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn
struct pglist_data *pgdat = NODE_DATA(nid);
int zid;
- for (zid = 0; zid <= ZONE_NORMAL; zid++) {
+ for (zid = 0; zid < ZONE_NORMAL; zid++) {
struct zone *zone = &pgdat->node_zones[zid];
if (zone_intersects(zone, start_pfn, nr_pages))
@@ -763,6 +841,117 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn
return &pgdat->node_zones[ZONE_NORMAL];
}
+/*
+ * Determine to which zone to online memory dynamically based on user
+ * configuration and system stats. We care about the following ratio:
+ *
+ * MOVABLE : KERNEL
+ *
+ * Whereby MOVABLE is memory in ZONE_MOVABLE and KERNEL is memory in
+ * one of the kernel zones. CMA pages inside one of the kernel zones really
+ * behaves like ZONE_MOVABLE, so we treat them accordingly.
+ *
+ * We don't allow for hotplugged memory in a KERNEL zone to increase the
+ * amount of MOVABLE memory we can have, so we end up with:
+ *
+ * MOVABLE : KERNEL_EARLY
+ *
+ * Whereby KERNEL_EARLY is memory in one of the kernel zones, available since
+ * boot. We base our calculation on KERNEL_EARLY internally, because:
+ *
+ * a) Hotplugged memory in one of the kernel zones can sometimes still get
+ * hotunplugged, especially when hot(un)plugging individual memory blocks.
+ * There is no coordination across memory devices, therefore "automatic"
+ * hotunplugging, as implemented in hypervisors, could result in zone
+ * imbalances.
+ * b) Early/boot memory in one of the kernel zones can usually not get
+ * hotunplugged again (e.g., no firmware interface to unplug, fragmented
+ * with unmovable allocations). While there are corner cases where it might
+ * still work, it is barely relevant in practice.
+ *
+ * Exceptions are dynamic memory groups, which allow for more MOVABLE
+ * memory within the same memory group -- because in that case, there is
+ * coordination within the single memory device managed by a single driver.
+ *
+ * We rely on "present pages" instead of "managed pages", as the latter is
+ * highly unreliable and dynamic in virtualized environments, and does not
+ * consider boot time allocations. For example, memory ballooning adjusts the
+ * managed pages when inflating/deflating the balloon, and balloon compaction
+ * can even migrate inflated pages between zones.
+ *
+ * Using "present pages" is better but some things to keep in mind are:
+ *
+ * a) Some memblock allocations, such as for the crashkernel area, are
+ * effectively unused by the kernel, yet they account to "present pages".
+ * Fortunately, these allocations are comparatively small in relevant setups
+ * (e.g., fraction of system memory).
+ * b) Some hotplugged memory blocks in virtualized environments, especially
+ *    hotplugged by virtio-mem, look like they are completely present; however,
+ *    only parts of the memory block are actually currently usable.
+ * "present pages" is an upper limit that can get reached at runtime. As
+ * we base our calculations on KERNEL_EARLY, this is not an issue.
+ */
+static struct zone *auto_movable_zone_for_pfn(int nid,
+ struct memory_group *group,
+ unsigned long pfn,
+ unsigned long nr_pages)
+{
+ unsigned long online_pages = 0, max_pages, end_pfn;
+ struct page *page;
+
+ if (!auto_movable_ratio)
+ goto kernel_zone;
+
+ if (group && !group->is_dynamic) {
+ max_pages = group->s.max_pages;
+ online_pages = group->present_movable_pages;
+
+ /* If anything is !MOVABLE online the rest !MOVABLE. */
+ if (group->present_kernel_pages)
+ goto kernel_zone;
+ } else if (!group || group->d.unit_pages == nr_pages) {
+ max_pages = nr_pages;
+ } else {
+ max_pages = group->d.unit_pages;
+ /*
+ * Take a look at all online sections in the current unit.
+ * We can safely assume that all pages within a section belong
+ * to the same zone, because dynamic memory groups only deal
+ * with hotplugged memory.
+ */
+ pfn = ALIGN_DOWN(pfn, group->d.unit_pages);
+ end_pfn = pfn + group->d.unit_pages;
+ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ page = pfn_to_online_page(pfn);
+ if (!page)
+ continue;
+ /* If anything is !MOVABLE online the rest !MOVABLE. */
+ if (!is_zone_movable_page(page))
+ goto kernel_zone;
+ online_pages += PAGES_PER_SECTION;
+ }
+ }
+
+ /*
+ * Online MOVABLE if we could *currently* online all remaining parts
+ * MOVABLE. We expect to (add+) online them immediately next, so if
+ * nobody interferes, all will be MOVABLE if possible.
+ */
+ nr_pages = max_pages - online_pages;
+ if (!auto_movable_can_online_movable(NUMA_NO_NODE, group, nr_pages))
+ goto kernel_zone;
+
+#ifdef CONFIG_NUMA
+ if (auto_movable_numa_aware &&
+ !auto_movable_can_online_movable(nid, group, nr_pages))
+ goto kernel_zone;
+#endif /* CONFIG_NUMA */
+
+ return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
+kernel_zone:
+ return default_kernel_zone_for_pfn(nid, pfn, nr_pages);
+}
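The ratio test in auto_movable_can_online_movable() reduces to "would (movable + nr_pages) still fit within ratio percent of early KERNEL memory". A stand-alone user-space demo with the default ratio of 301; the memory sizes (4 GiB early KERNEL, 8 GiB already MOVABLE) are made up for illustration.

#include <stdbool.h>
#include <stdio.h>

#define PAGES(gib)	((unsigned long)(gib) << (30 - 12))	/* 4 KiB pages */

/* Demo only: the core check, mirroring auto_movable_can_online_movable(). */
static bool can_online_movable(unsigned long kernel_early_pages,
			       unsigned long movable_pages,
			       unsigned long nr_pages, unsigned int ratio)
{
	movable_pages += nr_pages;
	return movable_pages <= (ratio * kernel_early_pages) / 100;
}

int main(void)
{
	unsigned long kernel_early = PAGES(4);	/* limit: 3.01 * 4 GiB ~= 12 GiB */
	unsigned long movable = PAGES(8);

	/* 8 + 2 = 10 GiB MOVABLE stays under the limit */
	printf("online 2 GiB to ZONE_MOVABLE: %s\n",
	       can_online_movable(kernel_early, movable, PAGES(2), 301) ?
	       "allowed" : "denied");
	/* 8 + 5 = 13 GiB MOVABLE exceeds it */
	printf("online 5 GiB to ZONE_MOVABLE: %s\n",
	       can_online_movable(kernel_early, movable, PAGES(5), 301) ?
	       "allowed" : "denied");
	return 0;
}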
+
static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
unsigned long nr_pages)
{
@@ -787,7 +976,8 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn
return movable_node_enabled ? movable_zone : kernel_zone;
}
-struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
+struct zone *zone_for_pfn_range(int online_type, int nid,
+ struct memory_group *group, unsigned long start_pfn,
unsigned long nr_pages)
{
if (online_type == MMOP_ONLINE_KERNEL)
@@ -796,24 +986,107 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
if (online_type == MMOP_ONLINE_MOVABLE)
return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
+ if (online_policy == ONLINE_POLICY_AUTO_MOVABLE)
+ return auto_movable_zone_for_pfn(nid, group, start_pfn, nr_pages);
+
return default_zone_for_pfn(nid, start_pfn, nr_pages);
}
+/*
+ * This function should only be called by memory_block_{online,offline},
+ * and {online,offline}_pages.
+ */
+void adjust_present_page_count(struct page *page, struct memory_group *group,
+ long nr_pages)
+{
+ struct zone *zone = page_zone(page);
+ const bool movable = zone_idx(zone) == ZONE_MOVABLE;
+
+ /*
+ * We only support onlining/offlining/adding/removing of complete
+ * memory blocks; therefore, all of it is either early or hotplugged.
+ */
+ if (early_section(__pfn_to_section(page_to_pfn(page))))
+ zone->present_early_pages += nr_pages;
+ zone->present_pages += nr_pages;
+ zone->zone_pgdat->node_present_pages += nr_pages;
+
+ if (group && movable)
+ group->present_movable_pages += nr_pages;
+ else if (group && !movable)
+ group->present_kernel_pages += nr_pages;
+}
+
+int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
+ struct zone *zone)
+{
+ unsigned long end_pfn = pfn + nr_pages;
+ int ret, i;
+
+ ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
+ if (ret)
+ return ret;
+
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+
+ for (i = 0; i < nr_pages; i++)
+ SetPageVmemmapSelfHosted(pfn_to_page(pfn + i));
+
+ /*
+ * It might be that the vmemmap_pages fully span sections. If that is
+ * the case, mark those sections online here as otherwise they will be
+ * left offline.
+ */
+ if (nr_pages >= PAGES_PER_SECTION)
+ online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
+
+ return ret;
+}
+
+void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
+{
+ unsigned long end_pfn = pfn + nr_pages;
+
+ /*
+ * It might be that the vmemmap_pages fully span sections. If that is
+ * the case, mark those sections offline here as otherwise they will be
+ * left online.
+ */
+ if (nr_pages >= PAGES_PER_SECTION)
+ offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
+
+ /*
+ * The pages associated with this vmemmap have been offlined, so
+ * we can reset its state here.
+ */
+ remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages);
+ kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
+}
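These two helpers bracket the onlining of a memory block that carries its own vmemmap. A hedged sketch of the expected caller ordering follows (the real caller lives in the memory block onlining code in the driver core; the function and variable names here, and the omitted present-page accounting for the vmemmap pages, are illustrative only):

#include <linux/memory_hotplug.h>

/* Sketch only: online a block whose first nr_vmemmap_pages back the vmemmap. */
static int online_block_with_vmemmap(unsigned long start_pfn,
				     unsigned long block_pages,
				     unsigned long nr_vmemmap_pages,
				     struct zone *zone,
				     struct memory_group *group)
{
	int ret;

	/* Initialize and online the pages backing the vmemmap itself. */
	ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
	if (ret)
		return ret;

	/* Online the remaining, pageblock-aligned part of the block. */
	ret = online_pages(start_pfn + nr_vmemmap_pages,
			   block_pages - nr_vmemmap_pages, zone, group);
	if (ret)
		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
	return ret;
}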
+
int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
- int online_type, int nid)
+ struct zone *zone, struct memory_group *group)
{
unsigned long flags;
- unsigned long onlined_pages = 0;
- struct zone *zone;
int need_zonelists_rebuild = 0;
+ const int nid = zone_to_nid(zone);
int ret;
struct memory_notify arg;
+ /*
+ * {on,off}lining is constrained to full memory sections (or more
+ * precisely to memory blocks from the user space POV).
+ * memmap_on_memory is an exception because it reserves initial part
+ * of the physical memory space for vmemmaps. That space is pageblock
+ * aligned.
+ */
+ if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(pfn) ||
+ !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
+ return -EINVAL;
+
mem_hotplug_begin();
/* associate pfn range with the zone */
- zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
- move_pfn_range_to_zone(zone, pfn, nr_pages, NULL);
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
@@ -825,6 +1098,14 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
goto failed_addition;
/*
+ * Fixup the number of isolated pageblocks before marking the sections
+ * online, such that undo_isolate_page_range() works correctly.
+ */
+ spin_lock_irqsave(&zone->lock, flags);
+ zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ /*
* If this zone is not populated, then it is not in zonelist.
* This means the page allocator ignores this zone.
* So, zonelist must be updated after online.
@@ -834,36 +1115,25 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
setup_zone_pageset(zone);
}
- ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
- online_pages_range);
- if (ret) {
- /* not a single memory resource was applicable */
- if (need_zonelists_rebuild)
- zone_pcp_reset(zone);
- goto failed_addition;
- }
+ online_pages_range(pfn, nr_pages);
+ adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
- zone->present_pages += onlined_pages;
+ node_states_set_node(nid, &arg);
+ if (need_zonelists_rebuild)
+ build_all_zonelists(NULL);
- pgdat_resize_lock(zone->zone_pgdat, &flags);
- zone->zone_pgdat->node_present_pages += onlined_pages;
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
+ /* Basic onlining is complete, allow allocation of onlined pages. */
+ undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
/*
- * When exposing larger, physically contiguous memory areas to the
- * buddy, shuffling in the buddy (when freeing onlined pages, putting
- * them either to the head or the tail of the freelist) is only helpful
- * for maintaining the shuffle, but not for creating the initial
- * shuffle. Shuffle the whole zone to make sure the just onlined pages
- * are properly distributed across the whole freelist.
+ * Freshly onlined pages aren't shuffled (e.g., all pages are placed to
+ * the tail of the freelist when undoing isolation). Shuffle the whole
+ * zone to make sure the just onlined pages are properly distributed
+ * across the whole freelist - to create an initial shuffle.
*/
shuffle_zone(zone);
- node_states_set_node(nid, &arg);
- if (need_zonelists_rebuild)
- build_all_zonelists(NULL);
- zone_pcp_update(zone);
-
+ /* reinitialise watermarks and update pcp limits */
init_per_zone_wmark_min();
kswapd_run(nid);
@@ -884,56 +1154,22 @@ failed_addition:
mem_hotplug_done();
return ret;
}
-#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
-
-static void reset_node_present_pages(pg_data_t *pgdat)
-{
- struct zone *z;
-
- for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
- z->present_pages = 0;
-
- pgdat->node_present_pages = 0;
-}
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-static pg_data_t __ref *hotadd_new_pgdat(int nid)
+static pg_data_t __ref *hotadd_init_pgdat(int nid)
{
struct pglist_data *pgdat;
+ /*
+ * NODE_DATA is preallocated (free_area_init) but its internal
+ * state is not allocated completely. Add missing pieces.
+ * Completely offline nodes stay around and they just need
+ * reinitialization.
+ */
pgdat = NODE_DATA(nid);
- if (!pgdat) {
- pgdat = arch_alloc_nodedata(nid);
- if (!pgdat)
- return NULL;
-
- pgdat->per_cpu_nodestats =
- alloc_percpu(struct per_cpu_nodestat);
- arch_refresh_nodedata(nid, pgdat);
- } else {
- int cpu;
- /*
- * Reset the nr_zones, order and highest_zoneidx before reuse.
- * Note that kswapd will init kswapd_highest_zoneidx properly
- * when it starts in the near future.
- */
- pgdat->nr_zones = 0;
- pgdat->kswapd_order = 0;
- pgdat->kswapd_highest_zoneidx = 0;
- for_each_online_cpu(cpu) {
- struct per_cpu_nodestat *p;
-
- p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
- memset(p, 0, sizeof(*p));
- }
- }
-
- /* we can use NODE_DATA(nid) from here */
- pgdat->node_id = nid;
- pgdat->node_start_pfn = 0;
/* init node's zones as empty zones, we don't have any present pages.*/
- free_area_init_core_hotplug(nid);
+ free_area_init_core_hotplug(pgdat);
/*
* The node we allocated has no zone fallback lists. For avoiding
@@ -941,29 +1177,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid)
*/
build_all_zonelists(pgdat);
- /*
- * When memory is hot-added, all the memory is in offline state. So
- * clear all zones' present_pages because they will be updated in
- * online_pages() and offline_pages().
- */
- reset_node_managed_pages(pgdat);
- reset_node_present_pages(pgdat);
-
return pgdat;
}
-static void rollback_node_hotadd(int nid)
-{
- pg_data_t *pgdat = NODE_DATA(nid);
-
- arch_refresh_nodedata(nid, NULL);
- free_percpu(pgdat->per_cpu_nodestats);
- arch_free_nodedata(pgdat);
-}
-
-
-/**
- * try_online_node - online a node if offlined
+/*
+ * __try_online_node - online a node if offlined
* @nid: the node ID
* @set_node_online: Whether we want to online the node
* called by cpu_up() to online a node without onlined memory.
@@ -981,7 +1199,7 @@ static int __try_online_node(int nid, bool set_node_online)
if (node_online(nid))
return 0;
- pgdat = hotadd_new_pgdat(nid);
+ pgdat = hotadd_init_pgdat(nid);
if (!pgdat) {
pr_err("Cannot online node %d due to NULL pgdat\n", nid);
ret = -ENOMEM;
@@ -1025,19 +1243,60 @@ static int check_hotplug_memory_range(u64 start, u64 size)
static int online_memory_block(struct memory_block *mem, void *arg)
{
- mem->online_type = memhp_default_online_type;
+ mem->online_type = mhp_default_online_type;
return device_online(&mem->dev);
}
+bool mhp_supports_memmap_on_memory(unsigned long size)
+{
+ unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
+ unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
+ unsigned long remaining_size = size - vmemmap_size;
+
+ /*
+ * Besides having arch support and the feature enabled at runtime, we
+ * need a few more assumptions to hold true:
+ *
+ * a) We span a single memory block: memory onlining/offlining happens
+ * in memory block granularity. We don't want the vmemmap of online
+ * memory blocks to reside on offline memory blocks. In the future,
+ * we might want to support variable-sized memory blocks to make the
+ * feature more versatile.
+ *
+ * b) The vmemmap pages span complete PMDs: We don't want vmemmap code
+ * to populate memory from the altmap for unrelated parts (i.e.,
+ * other memory blocks)
+ *
+ * c) The vmemmap pages (and thereby the pages that will be exposed to
+ * the buddy) have to cover full pageblocks: memory onlining/offlining
+ * code requires applicable ranges to be page-aligned, for example, to
+ * set the migratetypes properly.
+ *
+ * TODO: Although we have a check here to make sure that vmemmap pages
+ * fully populate a PMD, it is not the right place to check for
+ * this. A much better solution involves improving vmemmap code
+ * to fall back to base pages when trying to populate vmemmap using
+ * altmap as an alternative source of memory, and we do not exactly
+ * populate a single PMD.
+ */
+ return mhp_memmap_on_memory() &&
+ size == memory_block_size_bytes() &&
+ IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
+ IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
+}
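To make the checks above concrete (again, not part of the patch): with assumed values of a 128 MiB memory block, 4 KiB base pages, a 64-byte struct page, and 2 MiB PMDs/pageblocks, the vmemmap is exactly 2 MiB, so both alignment conditions hold. All of these numbers are configuration-dependent assumptions.

#include <stdio.h>

int main(void)
{
	unsigned long long size = 128ULL << 20;			/* assumed block size: 128 MiB */
	unsigned long long page_size = 4096;			/* assumed base page size */
	unsigned long long struct_page_size = 64;		/* assumed sizeof(struct page) */
	unsigned long long pmd_size = 2ULL << 20;		/* assumed PMD mapping size */
	unsigned long long pageblock_bytes = 2ULL << 20;	/* assumed pageblock size */

	unsigned long long nr_pages = size / page_size;			/* 32768 */
	unsigned long long vmemmap_size = nr_pages * struct_page_size;	/* 2 MiB */
	unsigned long long remaining = size - vmemmap_size;		/* 126 MiB */

	printf("vmemmap spans full PMDs: %d\n", vmemmap_size % pmd_size == 0);
	printf("remaining covers full pageblocks: %d\n",
	       remaining % pageblock_bytes == 0);
	return 0;
}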
+
/*
* NOTE: The caller must call lock_device_hotplug() to serialize hotplug
* and online/offline operations (triggered e.g. by sysfs).
*
* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
*/
-int __ref add_memory_resource(int nid, struct resource *res)
+int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
- struct mhp_params params = { .pgprot = PAGE_KERNEL };
+ struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
+ enum memblock_flags memblock_flags = MEMBLOCK_NONE;
+ struct vmem_altmap mhp_altmap = {};
+ struct memory_group *group = NULL;
u64 start, size;
bool new_node = false;
int ret;
@@ -1049,6 +1308,13 @@ int __ref add_memory_resource(int nid, struct resource *res)
if (ret)
return ret;
+ if (mhp_flags & MHP_NID_IS_MGID) {
+ group = memory_group_find_by_id(nid);
+ if (!group)
+ return -EINVAL;
+ nid = group->nid;
+ }
+
if (!node_possible(nid)) {
WARN(1, "node %d was absent from the node_possible_map\n", nid);
return -EINVAL;
@@ -1056,23 +1322,42 @@ int __ref add_memory_resource(int nid, struct resource *res)
mem_hotplug_begin();
- if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
- memblock_add_node(start, size, nid);
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
+ if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED)
+ memblock_flags = MEMBLOCK_DRIVER_MANAGED;
+ ret = memblock_add_node(start, size, nid, memblock_flags);
+ if (ret)
+ goto error_mem_hotplug_end;
+ }
ret = __try_online_node(nid, false);
if (ret < 0)
goto error;
new_node = ret;
+ /*
+ * Self hosted memmap array
+ */
+ if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
+ if (!mhp_supports_memmap_on_memory(size)) {
+ ret = -EINVAL;
+ goto error;
+ }
+ mhp_altmap.free = PHYS_PFN(size);
+ mhp_altmap.base_pfn = PHYS_PFN(start);
+ params.altmap = &mhp_altmap;
+ }
+
/* call arch's memory hotadd */
ret = arch_add_memory(nid, start, size, &params);
if (ret < 0)
goto error;
/* create memory block devices after memory was added */
- ret = create_memory_block_devices(start, size);
+ ret = create_memory_block_devices(start, size, mhp_altmap.alloc,
+ group);
if (ret) {
- arch_remove_memory(nid, start, size, NULL);
+ arch_remove_memory(start, size, NULL);
goto error;
}
@@ -1087,10 +1372,9 @@ int __ref add_memory_resource(int nid, struct resource *res)
BUG_ON(ret);
}
- /* link memory sections under this node.*/
- ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
- MEMINIT_HOTPLUG);
- BUG_ON(ret);
+ register_memory_blocks_under_node(nid, PFN_DOWN(start),
+ PFN_UP(start + size - 1),
+ MEMINIT_HOTPLUG);
/* create new memmap entry */
if (!strcmp(res->name, "System RAM"))
@@ -1099,23 +1383,28 @@ int __ref add_memory_resource(int nid, struct resource *res)
/* device_online() will take the lock when calling online_pages() */
mem_hotplug_done();
+ /*
+ * In case we're allowed to merge the resource, flag it and trigger
+ * merging now that adding succeeded.
+ */
+ if (mhp_flags & MHP_MERGE_RESOURCE)
+ merge_system_ram_resource(res);
+
/* online pages if requested */
- if (memhp_default_online_type != MMOP_OFFLINE)
+ if (mhp_default_online_type != MMOP_OFFLINE)
walk_memory_blocks(start, size, NULL, online_memory_block);
return ret;
error:
- /* rollback pgdat allocation and others */
- if (new_node)
- rollback_node_hotadd(nid);
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
memblock_remove(start, size);
+error_mem_hotplug_end:
mem_hotplug_done();
return ret;
}
/* requires device_hotplug_lock, see add_memory_resource() */
-int __ref __add_memory(int nid, u64 start, u64 size)
+int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
struct resource *res;
int ret;
@@ -1124,18 +1413,18 @@ int __ref __add_memory(int nid, u64 start, u64 size)
if (IS_ERR(res))
return PTR_ERR(res);
- ret = add_memory_resource(nid, res);
+ ret = add_memory_resource(nid, res, mhp_flags);
if (ret < 0)
release_memory_resource(res);
return ret;
}
-int add_memory(int nid, u64 start, u64 size)
+int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
int rc;
lock_device_hotplug();
- rc = __add_memory(nid, start, size);
+ rc = __add_memory(nid, start, size, mhp_flags);
unlock_device_hotplug();
return rc;
@@ -1157,14 +1446,14 @@ EXPORT_SYMBOL_GPL(add_memory);
*
* For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
* memory map") are created. Also, the created memory resource is flagged
- * with IORESOURCE_MEM_DRIVER_MANAGED, so in-kernel users can special-case
+ * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case
* this memory as well (esp., not place kexec images onto it).
*
* The resource_name (visible via /proc/iomem) has to have the format
* "System RAM ($DRIVER)".
*/
int add_memory_driver_managed(int nid, u64 start, u64 size,
- const char *resource_name)
+ const char *resource_name, mhp_t mhp_flags)
{
struct resource *res;
int rc;
@@ -1182,7 +1471,7 @@ int add_memory_driver_managed(int nid, u64 start, u64 size,
goto out_unlock;
}
- rc = add_memory_resource(nid, res);
+ rc = add_memory_resource(nid, res, mhp_flags);
if (rc < 0)
release_memory_resource(res);
@@ -1192,46 +1481,62 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(add_memory_driver_managed);
-#ifdef CONFIG_MEMORY_HOTREMOVE
/*
- * Confirm all pages in a range [start, end) belong to the same zone (skipping
- * memory holes). When true, return the zone.
+ * Platforms should define arch_get_mappable_range() that provides
+ * maximum possible addressable physical memory range for which the
+ * linear mapping could be created. The platform returned address
+ * range must adhere to these following semantics.
+ *
+ * - range.start <= range.end
+ * - Range includes both end points [range.start..range.end]
+ *
+ * There is also a fallback definition provided here, allowing the
+ * entire possible physical address range in case any platform does
+ * not define arch_get_mappable_range().
*/
-struct zone *test_pages_in_a_zone(unsigned long start_pfn,
- unsigned long end_pfn)
+struct range __weak arch_get_mappable_range(void)
{
- unsigned long pfn, sec_end_pfn;
- struct zone *zone = NULL;
- struct page *page;
- int i;
- for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
- pfn < end_pfn;
- pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
- /* Make sure the memory section is present first */
- if (!present_section_nr(pfn_to_section_nr(pfn)))
- continue;
- for (; pfn < sec_end_pfn && pfn < end_pfn;
- pfn += MAX_ORDER_NR_PAGES) {
- i = 0;
- /* This is just a CONFIG_HOLES_IN_ZONE check.*/
- while ((i < MAX_ORDER_NR_PAGES) &&
- !pfn_valid_within(pfn + i))
- i++;
- if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
- continue;
- /* Check if we got outside of the zone */
- if (zone && !zone_spans_pfn(zone, pfn + i))
- return NULL;
- page = pfn_to_page(pfn + i);
- if (zone && page_zone(page) != zone)
- return NULL;
- zone = page_zone(page);
+ struct range mhp_range = {
+ .start = 0UL,
+ .end = -1ULL,
+ };
+ return mhp_range;
+}
+
+struct range mhp_get_pluggable_range(bool need_mapping)
+{
+ const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
+ struct range mhp_range;
+
+ if (need_mapping) {
+ mhp_range = arch_get_mappable_range();
+ if (mhp_range.start > max_phys) {
+ mhp_range.start = 0;
+ mhp_range.end = 0;
}
+ mhp_range.end = min_t(u64, mhp_range.end, max_phys);
+ } else {
+ mhp_range.start = 0;
+ mhp_range.end = max_phys;
}
+ return mhp_range;
+}
+EXPORT_SYMBOL_GPL(mhp_get_pluggable_range);
+
+bool mhp_range_allowed(u64 start, u64 size, bool need_mapping)
+{
+ struct range mhp_range = mhp_get_pluggable_range(need_mapping);
+ u64 end = start + size;
- return zone;
+ if (start < end && start >= mhp_range.start && (end - 1) <= mhp_range.end)
+ return true;
+
+ pr_warn("Hotplug memory [%#llx-%#llx] exceeds maximum addressable range [%#llx-%#llx]\n",
+ start, end, mhp_range.start, mhp_range.end);
+ return false;
}
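A minimal userspace sketch (not part of the patch) of the inclusive range check that mhp_range_allowed() performs; the 46-bit addressable limit stands in for MAX_PHYSMEM_BITS and is an assumption for the example.

#include <stdbool.h>
#include <stdio.h>

struct phys_range { unsigned long long start, end; };	/* end is inclusive */

static bool range_allowed(unsigned long long start, unsigned long long size,
			  struct phys_range limit)
{
	unsigned long long end = start + size;

	/* [start, end - 1] must lie entirely within [limit.start, limit.end] */
	return start < end && start >= limit.start && (end - 1) <= limit.end;
}

int main(void)
{
	struct phys_range limit = { 0, (1ULL << 46) - 1 };	/* assumed 46-bit limit */

	printf("%d\n", range_allowed(1ULL << 30, 1ULL << 30, limit));	/* 1: fits */
	printf("%d\n", range_allowed(limit.end, 4096, limit));		/* 0: spills over */
	return 0;
}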
+#ifdef CONFIG_MEMORY_HOTREMOVE
/*
* Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
* non-lru movable pages and hugepages). Will skip over most unmovable
@@ -1272,7 +1577,14 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
if (!PageHuge(page))
continue;
head = compound_head(page);
- if (page_huge_active(head))
+ /*
+ * This test is racy as we hold no reference or lock. The
+ * hugetlb page could have been free'ed and head is no longer
+ * a hugetlb page before the following check. In such unlikely
+ * cases false positives and negatives are possible. Calling
+ * code must deal with these scenarios.
+ */
+ if (HPageMigratable(head))
goto found;
skip = compound_nr(head) - (page - head);
pfn += skip - 1;
@@ -1283,44 +1595,27 @@ found:
return 0;
}
-static struct page *new_node_page(struct page *page, unsigned long private)
-{
- nodemask_t nmask = node_states[N_MEMORY];
- struct migration_target_control mtc = {
- .nid = page_to_nid(page),
- .nmask = &nmask,
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
- };
-
- /*
- * try to allocate from a different node but reuse this node if there
- * are no other online nodes to be used (e.g. we are offlining a part
- * of the only existing node)
- */
- node_clear(mtc.nid, nmask);
- if (nodes_empty(nmask))
- node_set(mtc.nid, nmask);
-
- return alloc_migration_target(page, (unsigned long)&mtc);
-}
-
-static int
-do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
+static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
struct page *page, *head;
- int ret = 0;
LIST_HEAD(source);
+ static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ struct folio *folio;
+ bool isolated;
+
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
- head = compound_head(page);
+ folio = page_folio(page);
+ head = &folio->page;
if (PageHuge(page)) {
pfn = page_to_pfn(head) + compound_nr(head) - 1;
- isolate_huge_page(head, &source);
+ isolate_hugetlb(folio, &source);
continue;
} else if (PageTransHuge(page))
pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;
@@ -1333,10 +1628,10 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
* the unmap as the catch all safety net).
*/
if (PageHWPoison(page)) {
- if (WARN_ON(PageLRU(page)))
- isolate_lru_page(page);
- if (page_mapped(page))
- try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
+ if (WARN_ON(folio_test_lru(folio)))
+ folio_isolate_lru(folio);
+ if (folio_mapped(folio))
+ try_to_unmap(folio, TTU_IGNORE_MLOCK);
continue;
}
@@ -1347,58 +1642,58 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
* LRU and non-lru movable pages.
*/
if (PageLRU(page))
- ret = isolate_lru_page(page);
+ isolated = isolate_lru_page(page);
else
- ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
- if (!ret) { /* Success */
+ isolated = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
+ if (isolated) {
list_add_tail(&page->lru, &source);
if (!__PageMovable(page))
inc_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_lru(page));
} else {
- pr_warn("failed to isolate pfn %lx\n", pfn);
- dump_page(page, "isolation failed");
+ if (__ratelimit(&migrate_rs)) {
+ pr_warn("failed to isolate pfn %lx\n", pfn);
+ dump_page(page, "isolation failed");
+ }
}
put_page(page);
}
if (!list_empty(&source)) {
- /* Allocate a new page from the nearest neighbor node */
- ret = migrate_pages(&source, new_node_page, NULL, 0,
- MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
+ nodemask_t nmask = node_states[N_MEMORY];
+ struct migration_target_control mtc = {
+ .nmask = &nmask,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
+ int ret;
+
+ /*
+ * We have checked that migration range is on a single zone so
+ * we can use the nid of the first page to all the others.
+ */
+ mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
+
+ /*
+ * try to allocate from a different node but reuse this node
+ * if there are no other online nodes to be used (e.g. we are
+ * offlining a part of the only existing node)
+ */
+ node_clear(mtc.nid, nmask);
+ if (nodes_empty(nmask))
+ node_set(mtc.nid, nmask);
+ ret = migrate_pages(&source, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL);
if (ret) {
list_for_each_entry(page, &source, lru) {
- pr_warn("migrating pfn %lx failed ret:%d ",
- page_to_pfn(page), ret);
- dump_page(page, "migration failure");
+ if (__ratelimit(&migrate_rs)) {
+ pr_warn("migrating pfn %lx failed ret:%d\n",
+ page_to_pfn(page), ret);
+ dump_page(page, "migration failure");
+ }
}
putback_movable_pages(&source);
}
}
-
- return ret;
-}
-
-/* Mark all sections offline and remove all free pages from the buddy. */
-static int
-offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
- void *data)
-{
- unsigned long *offlined_pages = (unsigned long *)data;
-
- *offlined_pages += __offline_isolated_pages(start, start + nr_pages);
- return 0;
-}
-
-/*
- * Check all pages in range, recorded as memory resource, are isolated.
- */
-static int
-check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
- void *data)
-{
- return test_pages_isolated(start_pfn, start_pfn + nr_pages,
- MEMORY_OFFLINE);
}
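The target-node choice in the migration path above boils down to "any other node with memory, else reuse the source node". A toy bitmask version (not kernel code; node count and mask values are made up) looks like this.

#include <stdio.h>

int main(void)
{
	unsigned int nodes_with_memory = 0x1;	/* assume only node 0 has memory */
	unsigned int nmask = nodes_with_memory;
	int source_nid = 0;			/* node being offlined */

	/* Prefer allocating on any other node ... */
	nmask &= ~(1u << source_nid);
	/* ... but fall back to the source node if it was the only candidate. */
	if (!nmask)
		nmask |= 1u << source_nid;

	printf("allocation mask: 0x%x\n", nmask);	/* prints 0x1 here */
	return 0;
}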
static int __init cmdline_parse_movable_node(char *p)
@@ -1418,7 +1713,6 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
arg->status_change_nid = NUMA_NO_NODE;
arg->status_change_nid_normal = NUMA_NO_NODE;
- arg->status_change_nid_high = NUMA_NO_NODE;
/*
* Check whether node_states[N_NORMAL_MEMORY] will be changed.
@@ -1433,24 +1727,9 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
arg->status_change_nid_normal = zone_to_nid(zone);
-#ifdef CONFIG_HIGHMEM
- /*
- * node_states[N_HIGH_MEMORY] contains nodes which
- * have normal memory or high memory.
- * Here we add the present_pages belonging to ZONE_HIGHMEM.
- * If the zone is within the range of [0..ZONE_HIGHMEM), and
- * we determine that the zones in that range become empty,
- * we need to clear the node for N_HIGH_MEMORY.
- */
- present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages;
- if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages)
- arg->status_change_nid_high = zone_to_nid(zone);
-#endif
-
/*
- * We have accounted the pages from [0..ZONE_NORMAL), and
- * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM
- * as well.
+ * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM
+ * does not apply as we don't support 32bit.
* Here we count the possible pages from ZONE_MOVABLE.
* If after having accounted all the pages, we see that the nr_pages
* to be offlined is over or equal to the accounted pages,
@@ -1468,9 +1747,6 @@ static void node_states_clear_node(int node, struct memory_notify *arg)
if (arg->status_change_nid_normal >= 0)
node_clear_state(node, N_NORMAL_MEMORY);
- if (arg->status_change_nid_high >= 0)
- node_clear_state(node, N_HIGH_MEMORY);
-
if (arg->status_change_nid >= 0)
node_clear_state(node, N_MEMORY);
}
@@ -1484,16 +1760,27 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
return 0;
}
-static int __ref __offline_pages(unsigned long start_pfn,
- unsigned long end_pfn)
+int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
+ struct zone *zone, struct memory_group *group)
{
- unsigned long pfn, nr_pages = 0;
- unsigned long offlined_pages = 0;
- int ret, node, nr_isolate_pageblock;
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn, system_ram_pages = 0;
+ const int node = zone_to_nid(zone);
unsigned long flags;
- struct zone *zone;
struct memory_notify arg;
char *reason;
+ int ret;
+
+ /*
+ * {on,off}lining is constrained to full memory sections (or more
+ * precisely to memory blocks from the user space POV).
+ * memmap_on_memory is an exception because it reserves the initial part
+ * of the physical memory space for vmemmaps. That space is pageblock
+ * aligned.
+ */
+ if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(start_pfn) ||
+ !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)))
+ return -EINVAL;
mem_hotplug_begin();
@@ -1505,33 +1792,42 @@ static int __ref __offline_pages(unsigned long start_pfn,
* memory holes PG_reserved, don't need pfn_valid() checks, and can
* avoid using walk_system_ram_range() later.
*/
- walk_system_ram_range(start_pfn, end_pfn - start_pfn, &nr_pages,
+ walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
count_system_ram_pages_cb);
- if (nr_pages != end_pfn - start_pfn) {
+ if (system_ram_pages != nr_pages) {
ret = -EINVAL;
reason = "memory holes";
goto failed_removal;
}
- /* This makes hotplug much easier...and readable.
- we assume this for now. .*/
- zone = test_pages_in_a_zone(start_pfn, end_pfn);
- if (!zone) {
+ /*
+ * We only support offlining of memory blocks managed by a single zone,
+ * checked by calling code. This is just a sanity check that we might
+ * want to remove in the future.
+ */
+ if (WARN_ON_ONCE(page_zone(pfn_to_page(start_pfn)) != zone ||
+ page_zone(pfn_to_page(end_pfn - 1)) != zone)) {
ret = -EINVAL;
reason = "multizone range";
goto failed_removal;
}
- node = zone_to_nid(zone);
+
+ /*
+ * Disable pcplists so that page isolation cannot race with freeing
+ * in a way that pages from isolated pageblock are left on pcplists.
+ */
+ zone_pcp_disable(zone);
+ lru_cache_disable();
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE,
- MEMORY_OFFLINE | REPORT_FAILURE);
- if (ret < 0) {
+ MEMORY_OFFLINE | REPORT_FAILURE,
+ GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
+ if (ret) {
reason = "failure to isolate range";
- goto failed_removal;
+ goto failed_removal_pcplists_disabled;
}
- nr_isolate_pageblock = ret;
arg.start_pfn = start_pfn;
arg.nr_pages = nr_pages;
@@ -1554,7 +1850,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
}
cond_resched();
- lru_add_drain_all();
ret = scan_movable_pages(pfn, end_pfn, &pfn);
if (!ret) {
@@ -1581,59 +1876,43 @@ static int __ref __offline_pages(unsigned long start_pfn,
reason = "failure to dissolve huge pages";
goto failed_removal_isolated;
}
- /* check again */
- ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
- NULL, check_pages_isolated_cb);
- /*
- * per-cpu pages are drained in start_isolate_page_range, but if
- * there are still pages that are not free, make sure that we
- * drain again, because when we isolated range we might
- * have raced with another thread that was adding pages to pcp
- * list.
- *
- * Forward progress should be still guaranteed because
- * pages on the pcp list can only belong to MOVABLE_ZONE
- * because has_unmovable_pages explicitly checks for
- * PageBuddy on freed pages on other zones.
- */
- if (ret)
- drain_all_pages(zone);
+
+ ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
+
} while (ret);
- /* Ok, all of our target is isolated.
- We cannot do rollback at this point. */
- walk_system_ram_range(start_pfn, end_pfn - start_pfn,
- &offlined_pages, offline_isolated_pages_cb);
- pr_info("Offlined Pages %ld\n", offlined_pages);
+ /* Mark all sections offline and remove free pages from the buddy. */
+ __offline_isolated_pages(start_pfn, end_pfn);
+ pr_debug("Offlined Pages %ld\n", nr_pages);
+
/*
- * Onlining will reset pagetype flags and makes migrate type
- * MOVABLE, so just need to decrease the number of isolated
- * pageblocks zone counter here.
+ * The memory sections are marked offline, and the pageblock flags
+ * effectively stale; nobody should be touching them. Fixup the number
+ * of isolated pageblocks, memory onlining will properly revert this.
*/
spin_lock_irqsave(&zone->lock, flags);
- zone->nr_isolate_pageblock -= nr_isolate_pageblock;
+ zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
spin_unlock_irqrestore(&zone->lock, flags);
- /* removal success */
- adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
- zone->present_pages -= offlined_pages;
+ lru_cache_enable();
+ zone_pcp_enable(zone);
- pgdat_resize_lock(zone->zone_pgdat, &flags);
- zone->zone_pgdat->node_present_pages -= offlined_pages;
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
+ /* removal success */
+ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
+ adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages);
+ /* reinitialise watermarks and update pcp limits */
init_per_zone_wmark_min();
if (!populated_zone(zone)) {
zone_pcp_reset(zone);
build_all_zonelists(NULL);
- } else
- zone_pcp_update(zone);
+ }
node_states_clear_node(node, &arg);
if (arg.status_change_nid >= 0) {
- kswapd_stop(node);
kcompactd_stop(node);
+ kswapd_stop(node);
}
writeback_set_ratelimit();
@@ -1644,28 +1923,27 @@ static int __ref __offline_pages(unsigned long start_pfn,
return 0;
failed_removal_isolated:
+ /* pushback to free area */
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
memory_notify(MEM_CANCEL_OFFLINE, &arg);
+failed_removal_pcplists_disabled:
+ lru_cache_enable();
+ zone_pcp_enable(zone);
failed_removal:
pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
(unsigned long long) start_pfn << PAGE_SHIFT,
((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
reason);
- /* pushback to free area */
mem_hotplug_done();
return ret;
}
-int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
-{
- return __offline_pages(start_pfn, start_pfn + nr_pages);
-}
-
static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
- int ret = !is_memblock_offlined(mem);
+ int *nid = arg;
- if (unlikely(ret)) {
+ *nid = mem->nid;
+ if (unlikely(mem->state != MEM_OFFLINE)) {
phys_addr_t beginpa, endpa;
beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
@@ -1678,12 +1956,20 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
return 0;
}
-static int check_cpu_on_node(pg_data_t *pgdat)
+static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
+{
+ /*
+ * If not set, continue with the next block.
+ */
+ return mem->nr_vmemmap_pages;
+}
+
+static int check_cpu_on_node(int nid)
{
int cpu;
for_each_present_cpu(cpu) {
- if (cpu_to_node(cpu) == pgdat->node_id)
+ if (cpu_to_node(cpu) == nid)
/*
* the cpu on this node isn't removed, and we can't
* offline this node.
@@ -1717,7 +2003,6 @@ static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg)
*/
void try_offline_node(int nid)
{
- pg_data_t *pgdat = NODE_DATA(nid);
int rc;
/*
@@ -1725,7 +2010,7 @@ void try_offline_node(int nid)
* offline it. A node spans memory after move_pfn_range_to_zone(),
* e.g., after the memory block was onlined.
*/
- if (pgdat->node_spanned_pages)
+ if (node_spanned_pages(nid))
return;
/*
@@ -1737,7 +2022,7 @@ void try_offline_node(int nid)
if (rc)
return;
- if (check_cpu_on_node(pgdat))
+ if (check_cpu_on_node(nid))
return;
/*
@@ -1749,29 +2034,12 @@ void try_offline_node(int nid)
}
EXPORT_SYMBOL(try_offline_node);
-static void __release_memory_resource(resource_size_t start,
- resource_size_t size)
+static int __ref try_remove_memory(u64 start, u64 size)
{
- int ret;
-
- /*
- * When removing memory in the same granularity as it was added,
- * this function never fails. It might only fail if resources
- * have to be adjusted or split. We'll ignore the error, as
- * removing of memory cannot fail.
- */
- ret = release_mem_region_adjustable(&iomem_resource, start, size);
- if (ret) {
- resource_size_t endres = start + size - 1;
-
- pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
- &start, &endres, ret);
- }
-}
-
-static int __ref try_remove_memory(int nid, u64 start, u64 size)
-{
- int rc = 0;
+ struct vmem_altmap mhp_altmap = {};
+ struct vmem_altmap *altmap = NULL;
+ unsigned long nr_vmemmap_pages;
+ int rc = 0, nid = NUMA_NO_NODE;
BUG_ON(check_hotplug_memory_range(start, size));
@@ -1779,11 +2047,40 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
* All memory blocks must be offlined before removing memory. Check
* whether all memory blocks in question are offline and return error
* if this is not the case.
+ *
+ * While at it, determine the nid. Note that if we'd have mixed nodes,
+ * we'd only try to offline the last determined one -- which is good
+ * enough for the cases we care about.
*/
- rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb);
+ rc = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb);
if (rc)
return rc;
+ /*
+ * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
+ * the same granularity it was added - a single memory block.
+ */
+ if (mhp_memmap_on_memory()) {
+ nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
+ get_nr_vmemmap_pages_cb);
+ if (nr_vmemmap_pages) {
+ if (size != memory_block_size_bytes()) {
+ pr_warn("Refuse to remove %#llx - %#llx,"
+ "wrong granularity\n",
+ start, start + size);
+ return -EINVAL;
+ }
+
+ /*
+ * Let remove_pmd_table->free_hugepage_table do the
+ * right thing if we used vmem_altmap when hot-adding
+ * the range.
+ */
+ mhp_altmap.alloc = nr_vmemmap_pages;
+ altmap = &mhp_altmap;
+ }
+ }
+
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
@@ -1795,24 +2092,24 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
mem_hotplug_begin();
- arch_remove_memory(nid, start, size, NULL);
+ arch_remove_memory(start, size, altmap);
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
- memblock_free(start, size);
+ memblock_phys_free(start, size);
memblock_remove(start, size);
}
- __release_memory_resource(start, size);
+ release_mem_region_adjustable(start, size);
- try_offline_node(nid);
+ if (nid != NUMA_NO_NODE)
+ try_offline_node(nid);
mem_hotplug_done();
return 0;
}
/**
- * remove_memory
- * @nid: the node ID
+ * __remove_memory - Remove memory if every memory block is offline
* @start: physical address of the region to remove
* @size: size of the region to remove
*
@@ -1820,14 +2117,14 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
* and online/offline operations before this call, as required by
* try_offline_node().
*/
-void __remove_memory(int nid, u64 start, u64 size)
+void __remove_memory(u64 start, u64 size)
{
/*
* trigger BUG() if some memory is not offlined prior to calling this
* function
*/
- if (try_remove_memory(nid, start, size))
+ if (try_remove_memory(start, size))
BUG();
}
@@ -1835,51 +2132,124 @@ void __remove_memory(int nid, u64 start, u64 size)
* Remove memory if every memory block is offline, otherwise return -EBUSY if
* some memory is not offline
*/
-int remove_memory(int nid, u64 start, u64 size)
+int remove_memory(u64 start, u64 size)
{
int rc;
lock_device_hotplug();
- rc = try_remove_memory(nid, start, size);
+ rc = try_remove_memory(start, size);
unlock_device_hotplug();
return rc;
}
EXPORT_SYMBOL_GPL(remove_memory);
+static int try_offline_memory_block(struct memory_block *mem, void *arg)
+{
+ uint8_t online_type = MMOP_ONLINE_KERNEL;
+ uint8_t **online_types = arg;
+ struct page *page;
+ int rc;
+
+ /*
+ * Sense the online_type via the zone of the memory block. Offlining
+ * with multiple zones within one memory block will be rejected
+ * by offlining code ... so we don't care about that.
+ */
+ page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
+ if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE)
+ online_type = MMOP_ONLINE_MOVABLE;
+
+ rc = device_offline(&mem->dev);
+ /*
+ * Default is MMOP_OFFLINE - change it only if offlining succeeded,
+ * so try_reonline_memory_block() can do the right thing.
+ */
+ if (!rc)
+ **online_types = online_type;
+
+ (*online_types)++;
+ /* Ignore if already offline. */
+ return rc < 0 ? rc : 0;
+}
+
+static int try_reonline_memory_block(struct memory_block *mem, void *arg)
+{
+ uint8_t **online_types = arg;
+ int rc;
+
+ if (**online_types != MMOP_OFFLINE) {
+ mem->online_type = **online_types;
+ rc = device_online(&mem->dev);
+ if (rc < 0)
+ pr_warn("%s: Failed to re-online memory: %d",
+ __func__, rc);
+ }
+
+ /* Continue processing all remaining memory blocks. */
+ (*online_types)++;
+ return 0;
+}
+
/*
- * Try to offline and remove a memory block. Might take a long time to
- * finish in case memory is still in use. Primarily useful for memory devices
- * that logically unplugged all memory (so it's no longer in use) and want to
- * offline + remove the memory block.
+ * Try to offline and remove memory. Might take a long time to finish in case
+ * memory is still in use. Primarily useful for memory devices that logically
+ * unplugged all memory (so it's no longer in use) and want to offline + remove
+ * that memory.
*/
-int offline_and_remove_memory(int nid, u64 start, u64 size)
+int offline_and_remove_memory(u64 start, u64 size)
{
- struct memory_block *mem;
- int rc = -EINVAL;
+ const unsigned long mb_count = size / memory_block_size_bytes();
+ uint8_t *online_types, *tmp;
+ int rc;
if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
- size != memory_block_size_bytes())
- return rc;
+ !IS_ALIGNED(size, memory_block_size_bytes()) || !size)
+ return -EINVAL;
+
+ /*
+ * We'll remember the old online type of each memory block, so we can
+ * try to revert whatever we did when offlining one memory block fails
+ * after offlining some others succeeded.
+ */
+ online_types = kmalloc_array(mb_count, sizeof(*online_types),
+ GFP_KERNEL);
+ if (!online_types)
+ return -ENOMEM;
+ /*
+ * Initialize all states to MMOP_OFFLINE, so when we abort processing in
+ * try_offline_memory_block(), we'll skip all unprocessed blocks in
+ * try_reonline_memory_block().
+ */
+ memset(online_types, MMOP_OFFLINE, mb_count);
lock_device_hotplug();
- mem = find_memory_block(__pfn_to_section(PFN_DOWN(start)));
- if (mem)
- rc = device_offline(&mem->dev);
- /* Ignore if the device is already offline. */
- if (rc > 0)
- rc = 0;
+
+ tmp = online_types;
+ rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block);
/*
- * In case we succeeded to offline the memory block, remove it.
+ * In case we succeeded to offline all memory, remove it.
* This cannot fail as it cannot get onlined in the meantime.
*/
if (!rc) {
- rc = try_remove_memory(nid, start, size);
- WARN_ON_ONCE(rc);
+ rc = try_remove_memory(start, size);
+ if (rc)
+ pr_err("%s: Failed to remove memory: %d", __func__, rc);
+ }
+
+ /*
+ * Rollback what we did. While memory onlining might theoretically fail
+ * (nacked by a notifier), it barely ever happens.
+ */
+ if (rc) {
+ tmp = online_types;
+ walk_memory_blocks(start, size, &tmp,
+ try_reonline_memory_block);
}
unlock_device_hotplug();
+ kfree(online_types);
return rc;
}
EXPORT_SYMBOL_GPL(offline_and_remove_memory);
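The rollback scheme in offline_and_remove_memory() can be illustrated with this simplified, userspace-style sketch (not the kernel implementation): record a per-block online type only after offlining succeeds, and on failure re-online exactly those blocks.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum { MMOP_OFFLINE = 0, MMOP_ONLINE_MOVABLE = 2 };

/* Pretend block 2 refuses to go offline (-EBUSY). */
static int offline_block(int idx) { return idx == 2 ? -16 : 0; }

int main(void)
{
	uint8_t online_types[4];
	int i, rc = 0;

	/* Default to MMOP_OFFLINE so unprocessed blocks are skipped on rollback. */
	memset(online_types, MMOP_OFFLINE, sizeof(online_types));

	for (i = 0; i < 4 && !rc; i++) {
		rc = offline_block(i);
		if (!rc)
			online_types[i] = MMOP_ONLINE_MOVABLE;	/* record only on success */
	}

	if (rc) {
		/* Roll back: re-online only the blocks that were actually offlined. */
		for (i = 0; i < 4; i++)
			if (online_types[i] != MMOP_OFFLINE)
				printf("re-online block %d\n", i);
	}
	return 0;
}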
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3fde772ef5ef..ec2eaceffd74 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -31,6 +31,9 @@
* but useful to set in a VMA when you have a non default
* process policy.
*
+ * preferred many Try a set of nodes first before normal fallback. This is
+ * similar to preferred without the special case.
+ *
* default Allocate on the local node first, or when on a VMA
* use the process policy. This is what Linux always did
* in a NUMA aware kernel and still does by, ahem, default.
@@ -101,6 +104,7 @@
#include <linux/swapops.h>
#include <asm/tlbflush.h>
+#include <asm/tlb.h>
#include <linux/uaccess.h>
#include "internal.h"
@@ -121,8 +125,7 @@ enum zone_type policy_zone = 0;
*/
static struct mempolicy default_policy = {
.refcnt = ATOMIC_INIT(1), /* never free it */
- .mode = MPOL_PREFERRED,
- .flags = MPOL_F_LOCAL,
+ .mode = MPOL_LOCAL,
};
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
@@ -132,6 +135,8 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES];
* @node: Node id to start the search
*
* Lookup the next closest node by distance if @node is not online.
+ *
+ * Return: this @node if it is online, otherwise the closest node by distance
*/
int numa_map_to_online_node(int node)
{
@@ -190,38 +195,28 @@ static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
nodes_onto(*ret, tmp, *rel);
}
-static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
+static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
if (nodes_empty(*nodes))
return -EINVAL;
- pol->v.nodes = *nodes;
+ pol->nodes = *nodes;
return 0;
}
static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
- if (!nodes)
- pol->flags |= MPOL_F_LOCAL; /* local allocation */
- else if (nodes_empty(*nodes))
- return -EINVAL; /* no allowed nodes */
- else
- pol->v.preferred_node = first_node(*nodes);
- return 0;
-}
-
-static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
-{
if (nodes_empty(*nodes))
return -EINVAL;
- pol->v.nodes = *nodes;
+
+ nodes_clear(pol->nodes);
+ node_set(first_node(*nodes), pol->nodes);
return 0;
}
/*
* mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
* any, for the new policy. mpol_new() has already validated the nodes
- * parameter with respect to the policy mode and flags. But, we need to
- * handle an empty nodemask with MPOL_PREFERRED here.
+ * parameter with respect to the policy mode and flags.
*
* Must be called holding task's alloc_lock to protect task's mems_allowed
* and mempolicy. May also be called holding the mmap_lock for write.
@@ -231,33 +226,31 @@ static int mpol_set_nodemask(struct mempolicy *pol,
{
int ret;
- /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
- if (pol == NULL)
+ /*
+ * Default (pol==NULL) and local memory policies are not subject
+ * to any remapping. They also do not need any special
+ * constructor.
+ */
+ if (!pol || pol->mode == MPOL_LOCAL)
return 0;
+
/* Check N_MEMORY */
nodes_and(nsc->mask1,
cpuset_current_mems_allowed, node_states[N_MEMORY]);
VM_BUG_ON(!nodes);
- if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
- nodes = NULL; /* explicit local allocation */
- else {
- if (pol->flags & MPOL_F_RELATIVE_NODES)
- mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
- else
- nodes_and(nsc->mask2, *nodes, nsc->mask1);
- if (mpol_store_user_nodemask(pol))
- pol->w.user_nodemask = *nodes;
- else
- pol->w.cpuset_mems_allowed =
- cpuset_current_mems_allowed;
- }
+ if (pol->flags & MPOL_F_RELATIVE_NODES)
+ mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
+ else
+ nodes_and(nsc->mask2, *nodes, nsc->mask1);
- if (nodes)
- ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
+ if (mpol_store_user_nodemask(pol))
+ pol->w.user_nodemask = *nodes;
else
- ret = mpol_ops[pol->mode].create(pol, NULL);
+ pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
+
+ ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
return ret;
}
@@ -290,13 +283,14 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
if (((flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES)))
return ERR_PTR(-EINVAL);
+
+ mode = MPOL_LOCAL;
}
} else if (mode == MPOL_LOCAL) {
if (!nodes_empty(*nodes) ||
(flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES))
return ERR_PTR(-EINVAL);
- mode = MPOL_PREFERRED;
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -305,6 +299,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
atomic_set(&policy->refcnt, 1);
policy->mode = mode;
policy->flags = flags;
+ policy->home_node = NUMA_NO_NODE;
return policy;
}
@@ -330,7 +325,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
else if (pol->flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
else {
- nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
+ nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
*nodes);
pol->w.cpuset_mems_allowed = *nodes;
}
@@ -338,31 +333,13 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
if (nodes_empty(tmp))
tmp = *nodes;
- pol->v.nodes = tmp;
+ pol->nodes = tmp;
}
static void mpol_rebind_preferred(struct mempolicy *pol,
const nodemask_t *nodes)
{
- nodemask_t tmp;
-
- if (pol->flags & MPOL_F_STATIC_NODES) {
- int node = first_node(pol->w.user_nodemask);
-
- if (node_isset(node, *nodes)) {
- pol->v.preferred_node = node;
- pol->flags &= ~MPOL_F_LOCAL;
- } else
- pol->flags |= MPOL_F_LOCAL;
- } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
- mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
- pol->v.preferred_node = first_node(tmp);
- } else if (!(pol->flags & MPOL_F_LOCAL)) {
- pol->v.preferred_node = node_remap(pol->v.preferred_node,
- pol->w.cpuset_mems_allowed,
- *nodes);
- pol->w.cpuset_mems_allowed = *nodes;
- }
+ pol->w.cpuset_mems_allowed = *nodes;
}
/*
@@ -374,9 +351,9 @@ static void mpol_rebind_preferred(struct mempolicy *pol,
*/
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
- if (!pol)
+ if (!pol || pol->mode == MPOL_LOCAL)
return;
- if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
+ if (!mpol_store_user_nodemask(pol) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
@@ -404,10 +381,13 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
mmap_write_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next)
+ for_each_vma(vmi, vma) {
+ vma_start_write(vma);
mpol_rebind_policy(vma->vm_policy, new);
+ }
mmap_write_unlock(mm);
}
@@ -416,7 +396,7 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
.rebind = mpol_rebind_default,
},
[MPOL_INTERLEAVE] = {
- .create = mpol_new_interleave,
+ .create = mpol_new_nodemask,
.rebind = mpol_rebind_nodemask,
},
[MPOL_PREFERRED] = {
@@ -424,12 +404,19 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
.rebind = mpol_rebind_preferred,
},
[MPOL_BIND] = {
- .create = mpol_new_bind,
+ .create = mpol_new_nodemask,
.rebind = mpol_rebind_nodemask,
},
+ [MPOL_LOCAL] = {
+ .rebind = mpol_rebind_default,
+ },
+ [MPOL_PREFERRED_MANY] = {
+ .create = mpol_new_nodemask,
+ .rebind = mpol_rebind_preferred,
+ },
};
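The mpol_ops array above is a mode-indexed dispatch table whose entries pair a create handler (which may be absent) with a rebind handler. A compact sketch of that pattern, with made-up names and modes, is shown below; it is illustrative only, not the kernel code.

#include <stdio.h>

enum toy_mode { TOY_DEFAULT, TOY_INTERLEAVE, TOY_PREFERRED, TOY_MAX };

struct toy_ops {
	int (*create)(const char *nodes);	/* may be NULL: no nodemask needed */
	void (*rebind)(const char *nodes);
};

static int create_nodemask(const char *nodes) { printf("create %s\n", nodes); return 0; }
static void rebind_nodemask(const char *nodes) { printf("rebind %s\n", nodes); }
static void rebind_default(const char *nodes) { (void)nodes; /* nothing to do */ }

static const struct toy_ops toy_ops[TOY_MAX] = {
	[TOY_DEFAULT]    = { .rebind = rebind_default },
	[TOY_INTERLEAVE] = { .create = create_nodemask, .rebind = rebind_nodemask },
	[TOY_PREFERRED]  = { .create = create_nodemask, .rebind = rebind_nodemask },
};

int main(void)
{
	enum toy_mode mode = TOY_INTERLEAVE;

	if (toy_ops[mode].create)
		toy_ops[mode].create("0-3");
	toy_ops[mode].rebind("0-3");
	return 0;
}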
-static int migrate_page_add(struct page *page, struct list_head *pagelist,
+static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
unsigned long flags);
struct queue_pages {
@@ -442,36 +429,36 @@ struct queue_pages {
};
/*
- * Check if the page's nid is in qp->nmask.
+ * Check if the folio's nid is in qp->nmask.
*
* If MPOL_MF_INVERT is set in qp->flags, check if the nid is
* in the invert of qp->nmask.
*/
-static inline bool queue_pages_required(struct page *page,
+static inline bool queue_folio_required(struct folio *folio,
struct queue_pages *qp)
{
- int nid = page_to_nid(page);
+ int nid = folio_nid(folio);
unsigned long flags = qp->flags;
return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}
/*
- * queue_pages_pmd() has four possible return values:
- * 0 - pages are placed on the right node or queued successfully.
- * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
+ * queue_folios_pmd() has three possible return values:
+ * 0 - folios are placed on the right node or queued successfully, or
+ * special page is met, i.e. huge zero page.
+ * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
* specified.
- * 2 - THP was split.
* -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
- * existing page was already on a node that does not follow the
+ * existing folio was already on a node that does not follow the
* policy.
*/
-static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
+static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
unsigned long end, struct mm_walk *walk)
__releases(ptl)
{
int ret = 0;
- struct page *page;
+ struct folio *folio;
struct queue_pages *qp = walk->private;
unsigned long flags;
@@ -479,21 +466,19 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
ret = -EIO;
goto unlock;
}
- page = pmd_page(*pmd);
- if (is_huge_zero_page(page)) {
- spin_unlock(ptl);
- __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
- ret = 2;
- goto out;
+ folio = pfn_folio(pmd_pfn(*pmd));
+ if (is_huge_zero_page(&folio->page)) {
+ walk->action = ACTION_CONTINUE;
+ goto unlock;
}
- if (!queue_pages_required(page, qp))
+ if (!queue_folio_required(folio, qp))
goto unlock;
flags = qp->flags;
- /* go to thp migration */
+ /* go to folio migration */
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
if (!vma_migratable(walk->vma) ||
- migrate_page_add(page, qp->pagelist, flags)) {
+ migrate_folio_add(folio, qp->pagelist, flags)) {
ret = 1;
goto unlock;
}
@@ -501,7 +486,6 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
ret = -EIO;
unlock:
spin_unlock(ptl);
-out:
return ret;
}
@@ -509,50 +493,49 @@ out:
* Scan through pages checking if pages follow certain conditions,
* and move them to the pagelist if they do.
*
- * queue_pages_pte_range() has three possible return values:
- * 0 - pages are placed on the right node or queued successfully.
- * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
+ * queue_folios_pte_range() has three possible return values:
+ * 0 - folios are placed on the right node or queued successfully, or
+ * special page is met, i.e. zero page.
+ * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
* specified.
- * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
+ * -EIO - only MPOL_MF_STRICT was specified and an existing folio was already
* on a node that does not follow the policy.
*/
-static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
+static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
- struct page *page;
+ struct folio *folio;
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
- int ret;
bool has_unmovable = false;
- pte_t *pte;
+ pte_t *pte, *mapped_pte;
+ pte_t ptent;
spinlock_t *ptl;
ptl = pmd_trans_huge_lock(pmd, vma);
- if (ptl) {
- ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
- if (ret != 2)
- return ret;
- }
- /* THP was split, fall through to pte walk */
+ if (ptl)
+ return queue_folios_pmd(pmd, ptl, addr, end, walk);
- if (pmd_trans_unstable(pmd))
+ mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
return 0;
-
- pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ }
for (; addr != end; pte++, addr += PAGE_SIZE) {
- if (!pte_present(*pte))
+ ptent = ptep_get(pte);
+ if (!pte_present(ptent))
continue;
- page = vm_normal_page(vma, addr, *pte);
- if (!page)
+ folio = vm_normal_folio(vma, addr, ptent);
+ if (!folio || folio_is_zone_device(folio))
continue;
/*
- * vm_normal_page() filters out zero pages, but there might
- * still be PageReserved pages to skip, perhaps in a VDSO.
+ * vm_normal_folio() filters out zero pages, but there might
+ * still be reserved folios to skip, perhaps in a VDSO.
*/
- if (PageReserved(page))
+ if (folio_test_reserved(folio))
continue;
- if (!queue_pages_required(page, qp))
+ if (!queue_folio_required(folio, qp))
continue;
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
/* MPOL_MF_STRICT must be specified if we get here */
@@ -566,12 +549,12 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
* temporary off LRU pages in the range. Still
* need migrate other LRU pages.
*/
- if (migrate_page_add(page, qp->pagelist, flags))
+ if (migrate_folio_add(folio, qp->pagelist, flags))
has_unmovable = true;
} else
break;
}
- pte_unmap_unlock(pte - 1, ptl);
+ pte_unmap_unlock(mapped_pte, ptl);
cond_resched();
if (has_unmovable)
@@ -580,7 +563,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
return addr != end ? -EIO : 0;
}
-static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
+static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
@@ -588,7 +571,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
#ifdef CONFIG_HUGETLB_PAGE
struct queue_pages *qp = walk->private;
unsigned long flags = (qp->flags & MPOL_MF_VALID);
- struct page *page;
+ struct folio *folio;
spinlock_t *ptl;
pte_t entry;
@@ -596,13 +579,13 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
entry = huge_ptep_get(pte);
if (!pte_present(entry))
goto unlock;
- page = pte_page(entry);
- if (!queue_pages_required(page, qp))
+ folio = pfn_folio(pte_pfn(entry));
+ if (!queue_folio_required(folio, qp))
goto unlock;
if (flags == MPOL_MF_STRICT) {
/*
- * STRICT alone means only detecting misplaced page and no
+ * STRICT alone means only detecting misplaced folio and no
* need to further check other vma.
*/
ret = -EIO;
@@ -613,20 +596,28 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
/*
* Must be STRICT with MOVE*, otherwise .test_walk() would have
* stopped walking current vma.
- * Detecting misplaced page but allow migrating pages which
+ * Detecting misplaced folio but allow migrating folios which
* have been queued.
*/
ret = 1;
goto unlock;
}
- /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
+ /*
+ * With MPOL_MF_MOVE, we try to migrate only unshared folios. If it
+ * is shared it is likely not worth migrating.
+ *
+ * To check if the folio is shared, ideally we want to make sure
+ * every page is mapped to the same process. Doing that is very
+ * expensive, so check the estimated mapcount of the folio instead.
+ */
if (flags & (MPOL_MF_MOVE_ALL) ||
- (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
- if (!isolate_huge_page(page, qp->pagelist) &&
+ (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 &&
+ !hugetlb_pmd_shared(pte))) {
+ if (!isolate_hugetlb(folio, qp->pagelist) &&
(flags & MPOL_MF_STRICT))
/*
- * Failed to isolate page but allow migrating pages
+ * Failed to isolate folio but allow migrating pages
* which have been queued.
*/
ret = 1;
@@ -652,12 +643,17 @@ unlock:
unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
- int nr_updated;
+ struct mmu_gather tlb;
+ long nr_updated;
+
+ tlb_gather_mmu(&tlb, vma->vm_mm);
- nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
- if (nr_updated)
+ nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
+ if (nr_updated > 0)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
+ tlb_finish_mmu(&tlb);
+
return nr_updated;
}
#else
@@ -671,13 +667,13 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
static int queue_pages_test_walk(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
- struct vm_area_struct *vma = walk->vma;
+ struct vm_area_struct *next, *vma = walk->vma;
struct queue_pages *qp = walk->private;
unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
/* range check first */
- VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
+ VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
if (!qp->first) {
qp->first = vma;
@@ -686,9 +682,10 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
/* hole at head side of range */
return -EFAULT;
}
+ next = find_vma(vma->vm_mm, vma->vm_end);
if (!(flags & MPOL_MF_DISCONTIG_OK) &&
((vma->vm_end < qp->end) &&
- (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
+ (!next || vma->vm_end < next->vm_start)))
/* hole at middle or tail of range */
return -EFAULT;
@@ -718,9 +715,17 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
}
static const struct mm_walk_ops queue_pages_walk_ops = {
- .hugetlb_entry = queue_pages_hugetlb,
- .pmd_entry = queue_pages_pte_range,
+ .hugetlb_entry = queue_folios_hugetlb,
+ .pmd_entry = queue_folios_pte_range,
.test_walk = queue_pages_test_walk,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
+ .hugetlb_entry = queue_folios_hugetlb,
+ .pmd_entry = queue_folios_pte_range,
+ .test_walk = queue_pages_test_walk,
+ .walk_lock = PGWALK_WRLOCK,
};
/*
@@ -741,7 +746,7 @@ static const struct mm_walk_ops queue_pages_walk_ops = {
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
nodemask_t *nodes, unsigned long flags,
- struct list_head *pagelist)
+ struct list_head *pagelist, bool lock_vma)
{
int err;
struct queue_pages qp = {
@@ -752,8 +757,10 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
.end = end,
.first = NULL,
};
+ const struct mm_walk_ops *ops = lock_vma ?
+ &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
- err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
+ err = walk_page_range(mm, start, end, ops, &qp);
if (!qp.first)
/* whole range in hole */
@@ -773,6 +780,8 @@ static int vma_replace_policy(struct vm_area_struct *vma,
struct mempolicy *old;
struct mempolicy *new;
+ vma_assert_write_locked(vma);
+
pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
vma->vm_start, vma->vm_end, vma->vm_pgoff,
vma->vm_ops, vma->vm_file,
@@ -798,64 +807,52 @@ static int vma_replace_policy(struct vm_area_struct *vma,
return err;
}
-/* Step 2: apply policy to a range and do splits. */
-static int mbind_range(struct mm_struct *mm, unsigned long start,
- unsigned long end, struct mempolicy *new_pol)
+/* Split or merge the VMA (if required) and apply the new policy */
+static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ struct vm_area_struct **prev, unsigned long start,
+ unsigned long end, struct mempolicy *new_pol)
{
- struct vm_area_struct *next;
- struct vm_area_struct *prev;
- struct vm_area_struct *vma;
- int err = 0;
+ struct vm_area_struct *merged;
+ unsigned long vmstart, vmend;
pgoff_t pgoff;
- unsigned long vmstart;
- unsigned long vmend;
+ int err;
- vma = find_vma(mm, start);
- VM_BUG_ON(!vma);
+ vmend = min(end, vma->vm_end);
+ if (start > vma->vm_start) {
+ *prev = vma;
+ vmstart = start;
+ } else {
+ vmstart = vma->vm_start;
+ }
- prev = vma->vm_prev;
- if (start > vma->vm_start)
- prev = vma;
+ if (mpol_equal(vma_policy(vma), new_pol)) {
+ *prev = vma;
+ return 0;
+ }
- for (; vma && vma->vm_start < end; prev = vma, vma = next) {
- next = vma->vm_next;
- vmstart = max(start, vma->vm_start);
- vmend = min(end, vma->vm_end);
+ pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT);
+ merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags,
+ vma->anon_vma, vma->vm_file, pgoff, new_pol,
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+ if (merged) {
+ *prev = merged;
+ return vma_replace_policy(merged, new_pol);
+ }
- if (mpol_equal(vma_policy(vma), new_pol))
- continue;
+ if (vma->vm_start != vmstart) {
+ err = split_vma(vmi, vma, vmstart, 1);
+ if (err)
+ return err;
+ }
- pgoff = vma->vm_pgoff +
- ((vmstart - vma->vm_start) >> PAGE_SHIFT);
- prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff,
- new_pol, vma->vm_userfaultfd_ctx);
- if (prev) {
- vma = prev;
- next = vma->vm_next;
- if (mpol_equal(vma_policy(vma), new_pol))
- continue;
- /* vma_merge() joined vma && vma->next, case 8 */
- goto replace;
- }
- if (vma->vm_start != vmstart) {
- err = split_vma(vma->vm_mm, vma, vmstart, 1);
- if (err)
- goto out;
- }
- if (vma->vm_end != vmend) {
- err = split_vma(vma->vm_mm, vma, vmend, 0);
- if (err)
- goto out;
- }
- replace:
- err = vma_replace_policy(vma, new_pol);
+ if (vma->vm_end != vmend) {
+ err = split_vma(vmi, vma, vmend, 0);
if (err)
- goto out;
+ return err;
}
- out:
- return err;
+ *prev = vma;
+ return vma_replace_policy(vma, new_pol);
}
/* Set the process memory policy */
@@ -875,12 +872,14 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
goto out;
}
+ task_lock(current);
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
+ task_unlock(current);
mpol_put(new);
goto out;
}
- task_lock(current);
+
old = current->mempolicy;
current->mempolicy = new;
if (new && new->mode == MPOL_INTERLEAVE)
@@ -907,12 +906,12 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
switch (p->mode) {
case MPOL_BIND:
case MPOL_INTERLEAVE:
- *nodes = p->v.nodes;
- break;
case MPOL_PREFERRED:
- if (!(p->flags & MPOL_F_LOCAL))
- node_set(p->v.preferred_node, *nodes);
- /* else return empty node mask for local allocation */
+ case MPOL_PREFERRED_MANY:
+ *nodes = p->nodes;
+ break;
+ case MPOL_LOCAL:
+ /* return empty node mask for local allocation */
break;
default:
BUG();
@@ -922,17 +921,14 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
struct page *p = NULL;
- int err;
+ int ret;
- int locked = 1;
- err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
- if (err > 0) {
- err = page_to_nid(p);
+ ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
+ if (ret > 0) {
+ ret = page_to_nid(p);
put_page(p);
}
- if (locked)
- mmap_read_unlock(mm);
- return err;
+ return ret;
}
/* Retrieve NUMA policy */
@@ -965,7 +961,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
* want to return MPOL_DEFAULT in this case.
*/
mmap_read_lock(mm);
- vma = find_vma_intersection(mm, addr, addr+1);
+ vma = vma_lookup(mm, addr);
if (!vma) {
mmap_read_unlock(mm);
return -EFAULT;
@@ -983,21 +979,21 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
/*
- * Take a refcount on the mpol, lookup_node()
- * wil drop the mmap_lock, so after calling
- * lookup_node() only "pol" remains valid, "vma"
- * is stale.
+ * Take a refcount on the mpol, because we are about to
+ * drop the mmap_lock, after which only "pol" remains
+ * valid and "vma" is stale.
*/
pol_refcount = pol;
vma = NULL;
mpol_get(pol);
+ mmap_read_unlock(mm);
err = lookup_node(mm, addr);
if (err < 0)
goto out;
*policy = err;
} else if (pol == current->mempolicy &&
pol->mode == MPOL_INTERLEAVE) {
- *policy = next_node_in(current->il_prev, pol->v.nodes);
+ *policy = next_node_in(current->il_prev, pol->nodes);
} else {
err = -EINVAL;
goto out;
@@ -1033,27 +1029,28 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
}
#ifdef CONFIG_MIGRATION
-/*
- * page migration, thp tail pages can be passed.
- */
-static int migrate_page_add(struct page *page, struct list_head *pagelist,
+static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
unsigned long flags)
{
- struct page *head = compound_head(page);
/*
- * Avoid migrating a page that is shared with others.
+ * We try to migrate only unshared folios. If the folio is shared,
+ * it is likely not worth migrating.
+ *
+ * To check if the folio is shared, ideally we want to make sure
+ * every page is mapped to the same process. Doing that is very
+ * expensive, so check the estimated mapcount of the folio instead.
*/
- if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
- if (!isolate_lru_page(head)) {
- list_add_tail(&head->lru, pagelist);
- mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON + page_is_file_lru(head),
- thp_nr_pages(head));
+ if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) {
+ if (folio_isolate_lru(folio)) {
+ list_add_tail(&folio->lru, foliolist);
+ node_stat_mod_folio(folio,
+ NR_ISOLATED_ANON + folio_is_file_lru(folio),
+ folio_nr_pages(folio));
} else if (flags & MPOL_MF_STRICT) {
/*
- * Non-movable page may reach here. And, there may be
- * temporary off LRU pages or non-LRU movable pages.
- * Treat them as unmovable pages since they can't be
+ * A non-movable folio may reach here. Also, there may be
+ * folios temporarily off the LRU or non-LRU movable folios.
+ * Treat them as unmovable folios since they can't be
* isolated, so they can't be moved at the moment. It
* should return -EIO for this case too.
*/
@@ -1072,6 +1069,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
int flags)
{
nodemask_t nmask;
+ struct vm_area_struct *vma;
LIST_HEAD(pagelist);
int err = 0;
struct migration_target_control mtc = {
@@ -1087,13 +1085,14 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
* need migration. Between passing in the full user address
* space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
*/
+ vma = find_vma(mm, 0);
VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
- queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
- flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+ queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
+ flags | MPOL_MF_DISCONTIG_OK, &pagelist, false);
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, alloc_migration_target, NULL,
- (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
if (err)
putback_movable_pages(&pagelist);
}
@@ -1111,12 +1110,10 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to, int flags)
{
int busy = 0;
- int err;
+ int err = 0;
nodemask_t tmp;
- err = migrate_prep();
- if (err)
- return err;
+ lru_cache_disable();
mmap_read_lock(mm);
@@ -1153,7 +1150,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
tmp = *from;
while (!nodes_empty(tmp)) {
- int s,d;
+ int s, d;
int source = NUMA_NO_NODE;
int dest = 0;
@@ -1200,6 +1197,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
break;
}
mmap_read_unlock(mm);
+
+ lru_cache_enable();
if (err < 0)
return err;
return busy;
@@ -1213,41 +1212,36 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
* list of pages handed to migrate_pages()--which is how we get here--
* is in virtual address order.
*/
-static struct page *new_page(struct page *page, unsigned long start)
+static struct folio *new_folio(struct folio *src, unsigned long start)
{
struct vm_area_struct *vma;
unsigned long address;
+ VMA_ITERATOR(vmi, current->mm, start);
+ gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
- vma = find_vma(current->mm, start);
- while (vma) {
- address = page_address_in_vma(page, vma);
+ for_each_vma(vmi, vma) {
+ address = page_address_in_vma(&src->page, vma);
if (address != -EFAULT)
break;
- vma = vma->vm_next;
}
- if (PageHuge(page)) {
- return alloc_huge_page_vma(page_hstate(compound_head(page)),
+ if (folio_test_hugetlb(src)) {
+ return alloc_hugetlb_folio_vma(folio_hstate(src),
vma, address);
- } else if (PageTransHuge(page)) {
- struct page *thp;
-
- thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
- HPAGE_PMD_ORDER);
- if (!thp)
- return NULL;
- prep_transhuge_page(thp);
- return thp;
}
+
+ if (folio_test_large(src))
+ gfp = GFP_TRANSHUGE;
+
/*
- * if !vma, alloc_page_vma() will use task or system default policy
+ * if !vma, vma_alloc_folio() will use task or system default policy
*/
- return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
- vma, address);
+ return vma_alloc_folio(gfp, folio_order(src), vma, address,
+ folio_test_large(src));
}
#else
-static int migrate_page_add(struct page *page, struct list_head *pagelist,
+static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
unsigned long flags)
{
return -EIO;
@@ -1259,7 +1253,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
return -ENOSYS;
}
-static struct page *new_page(struct page *page, unsigned long start)
+static struct folio *new_folio(struct folio *src, unsigned long start)
{
return NULL;
}
@@ -1270,6 +1264,8 @@ static long do_mbind(unsigned long start, unsigned long len,
nodemask_t *nmask, unsigned long flags)
{
struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev;
+ struct vma_iterator vmi;
struct mempolicy *new;
unsigned long end;
int err;
@@ -1287,7 +1283,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (mode == MPOL_DEFAULT)
flags &= ~MPOL_MF_STRICT;
- len = (len + PAGE_SIZE - 1) & PAGE_MASK;
+ len = PAGE_ALIGN(len);
end = start + len;
if (end < start)
@@ -1315,9 +1311,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
- err = migrate_prep();
- if (err)
- goto mpol_out;
+ lru_cache_disable();
}
{
NODEMASK_SCRATCH(scratch);
@@ -1333,23 +1327,33 @@ static long do_mbind(unsigned long start, unsigned long len,
if (err)
goto mpol_out;
+ /*
+ * Lock the VMAs before scanning for pages to migrate, to ensure we don't
+ * miss a concurrently inserted page.
+ */
ret = queue_pages_range(mm, start, end, nmask,
- flags | MPOL_MF_INVERT, &pagelist);
+ flags | MPOL_MF_INVERT, &pagelist, true);
if (ret < 0) {
err = ret;
goto up_out;
}
- err = mbind_range(mm, start, end, new);
+ vma_iter_init(&vmi, mm, start);
+ prev = vma_prev(&vmi);
+ for_each_vma_range(vmi, vma, end) {
+ err = mbind_range(&vmi, vma, &prev, start, end, new);
+ if (err)
+ break;
+ }
if (!err) {
int nr_failed = 0;
if (!list_empty(&pagelist)) {
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
- nr_failed = migrate_pages(&pagelist, new_page, NULL,
- start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
+ nr_failed = migrate_pages(&pagelist, new_folio, NULL,
+ start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
if (nr_failed)
putback_movable_pages(&pagelist);
}
@@ -1365,22 +1369,41 @@ up_out:
mmap_write_unlock(mm);
mpol_out:
mpol_put(new);
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ lru_cache_enable();
return err;
}
/*
* User space interface with variable sized bitmaps for nodelists.
*/
+static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
+ unsigned long maxnode)
+{
+ unsigned long nlongs = BITS_TO_LONGS(maxnode);
+ int ret;
+
+ if (in_compat_syscall())
+ ret = compat_get_bitmap(mask,
+ (const compat_ulong_t __user *)nmask,
+ maxnode);
+ else
+ ret = copy_from_user(mask, nmask,
+ nlongs * sizeof(unsigned long));
+
+ if (ret)
+ return -EFAULT;
+
+ if (maxnode % BITS_PER_LONG)
+ mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
+
+ return 0;
+}
/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
unsigned long maxnode)
{
- unsigned long k;
- unsigned long t;
- unsigned long nlongs;
- unsigned long endmask;
-
--maxnode;
nodes_clear(*nodes);
if (maxnode == 0 || !nmask)
@@ -1388,49 +1411,29 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
return -EINVAL;
- nlongs = BITS_TO_LONGS(maxnode);
- if ((maxnode % BITS_PER_LONG) == 0)
- endmask = ~0UL;
- else
- endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
-
/*
* When the user specified more nodes than supported just check
- * if the non supported part is all zero.
- *
- * If maxnode have more longs than MAX_NUMNODES, check
- * the bits in that area first. And then go through to
- * check the rest bits which equal or bigger than MAX_NUMNODES.
- * Otherwise, just check bits [MAX_NUMNODES, maxnode).
+ * if the unsupported part is all zero, one word at a time,
+ * starting at the end.
*/
- if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
- for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
- if (get_user(t, nmask + k))
- return -EFAULT;
- if (k == nlongs - 1) {
- if (t & endmask)
- return -EINVAL;
- } else if (t)
- return -EINVAL;
- }
- nlongs = BITS_TO_LONGS(MAX_NUMNODES);
- endmask = ~0UL;
- }
+ while (maxnode > MAX_NUMNODES) {
+ unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
+ unsigned long t;
- if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
- unsigned long valid_mask = endmask;
-
- valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
- if (get_user(t, nmask + nlongs - 1))
+ if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
return -EFAULT;
- if (t & valid_mask)
+
+ if (maxnode - bits >= MAX_NUMNODES) {
+ maxnode -= bits;
+ } else {
+ maxnode = MAX_NUMNODES;
+ t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
+ }
+ if (t)
return -EINVAL;
}
- if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
- return -EFAULT;
- nodes_addr(*nodes)[nlongs-1] &= endmask;
- return 0;
+ return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}
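For illustration only (not part of this patch): the relaxed check above means userspace may pass a node mask wider than the kernel's MAX_NUMNODES, provided every bit beyond the supported range is clear. A minimal sketch using the standard libnuma wrapper (link with -lnuma):

#include <numaif.h>	/* set_mempolicy(), MPOL_BIND */
#include <stdio.h>

int main(void)
{
	/* 4096-bit mask, wider than MAX_NUMNODES on typical configs. */
	unsigned long mask[4096 / (8 * sizeof(unsigned long))] = { 0 };

	mask[0] = 1UL << 0;	/* bind to node 0; all upper words stay zero */

	/* get_nodes() walks the oversized tail one word at a time and only
	 * rejects the call if a bit above MAX_NUMNODES is set. */
	if (set_mempolicy(MPOL_BIND, mask, 4096))
		perror("set_mempolicy");
	return 0;
}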
/* Copy a kernel node mask to user space */
@@ -1439,6 +1442,10 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
{
unsigned long copy = ALIGN(maxnode-1, 64) / 8;
unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
+ bool compat = in_compat_syscall();
+
+ if (compat)
+ nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
if (copy > nbytes) {
if (copy > PAGE_SIZE)
@@ -1446,30 +1453,118 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
if (clear_user((char __user *)mask + nbytes, copy - nbytes))
return -EFAULT;
copy = nbytes;
+ maxnode = nr_node_ids;
}
+
+ if (compat)
+ return compat_put_bitmap((compat_ulong_t __user *)mask,
+ nodes_addr(*nodes), maxnode);
+
return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
+/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
+static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
+{
+ *flags = *mode & MPOL_MODE_FLAGS;
+ *mode &= ~MPOL_MODE_FLAGS;
+
+ if ((unsigned int)(*mode) >= MPOL_MAX)
+ return -EINVAL;
+ if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
+ return -EINVAL;
+ if (*flags & MPOL_F_NUMA_BALANCING) {
+ if (*mode != MPOL_BIND)
+ return -EINVAL;
+ *flags |= (MPOL_F_MOF | MPOL_F_MORON);
+ }
+ return 0;
+}
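For context, a hedged userspace sketch of what sanitize_mpol_flags() unpacks: the optional mode flags travel OR'ed into the mode argument of set_mempolicy(2)/mbind(2). The fallback #define is an assumption for older numaif.h headers; the value matches the uapi <linux/mempolicy.h>:

#include <numaif.h>	/* set_mempolicy(), MPOL_INTERLEAVE (link with -lnuma) */

#ifndef MPOL_F_STATIC_NODES
#define MPOL_F_STATIC_NODES	(1 << 15)	/* assumed from uapi <linux/mempolicy.h> */
#endif

static void example_interleave_static(void)
{
	unsigned long nodes = (1UL << 0) | (1UL << 1);	/* nodes 0 and 1 */

	/* The flag rides in the high bits of the mode argument; the kernel
	 * splits it back out into mode_flags in sanitize_mpol_flags(). */
	set_mempolicy(MPOL_INTERLEAVE | MPOL_F_STATIC_NODES, &nodes, 64);
}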
+
static long kernel_mbind(unsigned long start, unsigned long len,
unsigned long mode, const unsigned long __user *nmask,
unsigned long maxnode, unsigned int flags)
{
+ unsigned short mode_flags;
nodemask_t nodes;
+ int lmode = mode;
int err;
- unsigned short mode_flags;
start = untagged_addr(start);
- mode_flags = mode & MPOL_MODE_FLAGS;
- mode &= ~MPOL_MODE_FLAGS;
- if (mode >= MPOL_MAX)
- return -EINVAL;
- if ((mode_flags & MPOL_F_STATIC_NODES) &&
- (mode_flags & MPOL_F_RELATIVE_NODES))
- return -EINVAL;
+ err = sanitize_mpol_flags(&lmode, &mode_flags);
+ if (err)
+ return err;
+
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
- return do_mbind(start, len, mode, mode_flags, &nodes, flags);
+
+ return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
+}
+
+SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
+ unsigned long, home_node, unsigned long, flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev;
+ struct mempolicy *new, *old;
+ unsigned long end;
+ int err = -ENOENT;
+ VMA_ITERATOR(vmi, mm, start);
+
+ start = untagged_addr(start);
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ /*
+ * flags is reserved for future extensions.
+ */
+ if (flags != 0)
+ return -EINVAL;
+
+ /*
+ * Check home_node is online to avoid accessing uninitialized
+ * NODE_DATA.
+ */
+ if (home_node >= MAX_NUMNODES || !node_online(home_node))
+ return -EINVAL;
+
+ len = PAGE_ALIGN(len);
+ end = start + len;
+
+ if (end < start)
+ return -EINVAL;
+ if (end == start)
+ return 0;
+ mmap_write_lock(mm);
+ prev = vma_prev(&vmi);
+ for_each_vma_range(vmi, vma, end) {
+ /*
+ * If any vma in the range has a policy other than MPOL_BIND
+ * or MPOL_PREFERRED_MANY, we return an error. We don't reset
+ * the home node for vmas we have already updated.
+ */
+ old = vma_policy(vma);
+ if (!old)
+ continue;
+ if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
+ err = -EOPNOTSUPP;
+ break;
+ }
+ new = mpol_dup(old);
+ if (IS_ERR(new)) {
+ err = PTR_ERR(new);
+ break;
+ }
+
+ vma_start_write(vma);
+ new->home_node = home_node;
+ err = mbind_range(&vmi, vma, &prev, start, end, new);
+ mpol_put(new);
+ if (err)
+ break;
+ }
+ mmap_write_unlock(mm);
+ return err;
}
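A minimal userspace sketch for driving the new syscall; no libc wrapper is assumed, and the fallback syscall number is an assumption to be checked against your uapi headers:

#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_set_mempolicy_home_node
#define __NR_set_mempolicy_home_node 450	/* assumed; verify in your headers */
#endif

/* The range must already carry an MPOL_BIND or MPOL_PREFERRED_MANY policy
 * (e.g. set via mbind(2)), and flags must currently be 0. */
static long set_home_node(void *start, unsigned long len, unsigned long node)
{
	return syscall(__NR_set_mempolicy_home_node,
		       (unsigned long)start, len, node, 0UL);
}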
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
@@ -1483,20 +1578,20 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
unsigned long maxnode)
{
- int err;
+ unsigned short mode_flags;
nodemask_t nodes;
- unsigned short flags;
+ int lmode = mode;
+ int err;
+
+ err = sanitize_mpol_flags(&lmode, &mode_flags);
+ if (err)
+ return err;
- flags = mode & MPOL_MODE_FLAGS;
- mode &= ~MPOL_MODE_FLAGS;
- if ((unsigned int)mode >= MPOL_MAX)
- return -EINVAL;
- if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
- return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
- return do_set_mempolicy(mode, flags, &nodes);
+
+ return do_set_mempolicy(lmode, mode_flags, &nodes);
}
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
@@ -1638,116 +1733,6 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}
-#ifdef CONFIG_COMPAT
-
-COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
- compat_ulong_t __user *, nmask,
- compat_ulong_t, maxnode,
- compat_ulong_t, addr, compat_ulong_t, flags)
-{
- long err;
- unsigned long __user *nm = NULL;
- unsigned long nr_bits, alloc_size;
- DECLARE_BITMAP(bm, MAX_NUMNODES);
-
- nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
- alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
-
- if (nmask)
- nm = compat_alloc_user_space(alloc_size);
-
- err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
-
- if (!err && nmask) {
- unsigned long copy_size;
- copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
- err = copy_from_user(bm, nm, copy_size);
- /* ensure entire bitmap is zeroed */
- err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
- err |= compat_put_bitmap(nmask, bm, nr_bits);
- }
-
- return err;
-}
-
-COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
- compat_ulong_t, maxnode)
-{
- unsigned long __user *nm = NULL;
- unsigned long nr_bits, alloc_size;
- DECLARE_BITMAP(bm, MAX_NUMNODES);
-
- nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
- alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
-
- if (nmask) {
- if (compat_get_bitmap(bm, nmask, nr_bits))
- return -EFAULT;
- nm = compat_alloc_user_space(alloc_size);
- if (copy_to_user(nm, bm, alloc_size))
- return -EFAULT;
- }
-
- return kernel_set_mempolicy(mode, nm, nr_bits+1);
-}
-
-COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
- compat_ulong_t, mode, compat_ulong_t __user *, nmask,
- compat_ulong_t, maxnode, compat_ulong_t, flags)
-{
- unsigned long __user *nm = NULL;
- unsigned long nr_bits, alloc_size;
- nodemask_t bm;
-
- nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
- alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
-
- if (nmask) {
- if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
- return -EFAULT;
- nm = compat_alloc_user_space(alloc_size);
- if (copy_to_user(nm, nodes_addr(bm), alloc_size))
- return -EFAULT;
- }
-
- return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
-}
-
-COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
- compat_ulong_t, maxnode,
- const compat_ulong_t __user *, old_nodes,
- const compat_ulong_t __user *, new_nodes)
-{
- unsigned long __user *old = NULL;
- unsigned long __user *new = NULL;
- nodemask_t tmp_mask;
- unsigned long nr_bits;
- unsigned long size;
-
- nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
- size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
- if (old_nodes) {
- if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
- return -EFAULT;
- old = compat_alloc_user_space(new_nodes ? size * 2 : size);
- if (new_nodes)
- new = old + size / sizeof(unsigned long);
- if (copy_to_user(old, nodes_addr(tmp_mask), size))
- return -EFAULT;
- }
- if (new_nodes) {
- if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
- return -EFAULT;
- if (new == NULL)
- new = compat_alloc_user_space(size);
- if (copy_to_user(new, nodes_addr(tmp_mask), size))
- return -EFAULT;
- }
- return kernel_migrate_pages(pid, nr_bits + 1, old, new);
-}
-
-#endif /* CONFIG_COMPAT */
-
bool vma_migratable(struct vm_area_struct *vma)
{
if (vma->vm_flags & (VM_IO | VM_PFNMAP))
@@ -1846,21 +1831,21 @@ bool vma_policy_mof(struct vm_area_struct *vma)
return pol->flags & MPOL_F_MOF;
}
-static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
+bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
enum zone_type dynamic_policy_zone = policy_zone;
BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
/*
- * if policy->v.nodes has movable memory only,
+ * if policy->nodes has movable memory only,
* we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
*
- * policy->v.nodes is intersect with node_states[N_MEMORY].
- * so if the following test faile, it implies
- * policy->v.nodes has movable memory only.
+ * policy->nodes intersects with node_states[N_MEMORY],
+ * so if the following test fails, it implies
+ * policy->nodes has movable memory only.
*/
- if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
+ if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
dynamic_policy_zone = ZONE_MOVABLE;
return zone >= dynamic_policy_zone;
@@ -1872,21 +1857,32 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
*/
nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
+ int mode = policy->mode;
+
/* Lower zones don't get a nodemask applied for MPOL_BIND */
- if (unlikely(policy->mode == MPOL_BIND) &&
- apply_policy_zone(policy, gfp_zone(gfp)) &&
- cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
- return &policy->v.nodes;
+ if (unlikely(mode == MPOL_BIND) &&
+ apply_policy_zone(policy, gfp_zone(gfp)) &&
+ cpuset_nodemask_valid_mems_allowed(&policy->nodes))
+ return &policy->nodes;
+
+ if (mode == MPOL_PREFERRED_MANY)
+ return &policy->nodes;
return NULL;
}
-/* Return the node id preferred by the given mempolicy, or the given id */
+/*
+ * Return the preferred node id for 'prefer' mempolicy, and return
+ * the given id for all other policies.
+ *
+ * policy_node() is always coupled with policy_nodemask(), which
+ * secures the nodemask limit for 'bind' and 'prefer-many' policy.
+ */
static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
{
- if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
- nd = policy->v.preferred_node;
- else {
+ if (policy->mode == MPOL_PREFERRED) {
+ nd = first_node(policy->nodes);
+ } else {
/*
* __GFP_THISNODE shouldn't even be used with the bind policy
* because we might easily break the expectation to stay on the
@@ -1895,6 +1891,11 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
}
+ if ((policy->mode == MPOL_BIND ||
+ policy->mode == MPOL_PREFERRED_MANY) &&
+ policy->home_node != NUMA_NO_NODE)
+ return policy->home_node;
+
return nd;
}
@@ -1904,7 +1905,7 @@ static unsigned interleave_nodes(struct mempolicy *policy)
unsigned next;
struct task_struct *me = current;
- next = next_node_in(me->il_prev, policy->v.nodes);
+ next = next_node_in(me->il_prev, policy->nodes);
if (next < MAX_NUMNODES)
me->il_prev = next;
return next;
@@ -1919,24 +1920,23 @@ unsigned int mempolicy_slab_node(void)
struct mempolicy *policy;
int node = numa_mem_id();
- if (in_interrupt())
+ if (!in_task())
return node;
policy = current->mempolicy;
- if (!policy || policy->flags & MPOL_F_LOCAL)
+ if (!policy)
return node;
switch (policy->mode) {
case MPOL_PREFERRED:
- /*
- * handled MPOL_F_LOCAL above
- */
- return policy->v.preferred_node;
+ return first_node(policy->nodes);
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
- case MPOL_BIND: {
+ case MPOL_BIND:
+ case MPOL_PREFERRED_MANY:
+ {
struct zoneref *z;
/*
@@ -1947,9 +1947,11 @@ unsigned int mempolicy_slab_node(void)
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
z = first_zones_zonelist(zonelist, highest_zoneidx,
- &policy->v.nodes);
+ &policy->nodes);
return z->zone ? zone_to_nid(z->zone) : node;
}
+ case MPOL_LOCAL:
+ return node;
default:
BUG();
@@ -1958,22 +1960,31 @@ unsigned int mempolicy_slab_node(void)
/*
* Do static interleaving for a VMA with known offset @n. Returns the n'th
- * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
+ * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
* number of present nodes.
*/
static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
{
- unsigned nnodes = nodes_weight(pol->v.nodes);
- unsigned target;
+ nodemask_t nodemask = pol->nodes;
+ unsigned int target, nnodes;
int i;
int nid;
+ /*
+ * The barrier will stabilize the nodemask in a register or on
+ * the stack so that it will stop changing under the code.
+ *
+ * Between first_node() and next_node(), pol->nodes could be changed
+ * by other threads, so we copy pol->nodes to a local variable
+ * on the stack.
+ */
+ barrier();
+ nnodes = nodes_weight(nodemask);
if (!nnodes)
return numa_node_id();
target = (unsigned int)n % nnodes;
- nid = first_node(pol->v.nodes);
+ nid = first_node(nodemask);
for (i = 0; i < target; i++)
- nid = next_node(nid, pol->v.nodes);
+ nid = next_node(nid, nodemask);
return nid;
}
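A quick worked example of the static interleave above, under the assumption pol->nodes = {0,2,5}: nnodes is 3, so for n = 7 the target is 7 % 3 = 1; the loop starts at first_node() = 0 and advances once to node 2, which is the node used for that offset.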
@@ -2006,12 +2017,12 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
* @addr: address in @vma for shared policy lookup and interleave policy
* @gfp_flags: for requested zone
* @mpol: pointer to mempolicy pointer for reference counted mempolicy
- * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
+ * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
*
* Returns a nid suitable for a huge page allocation and a pointer
* to the struct mempolicy for conditional unref after allocation.
- * If the effective policy is 'BIND, returns a pointer to the mempolicy's
- * @nodemask for filtering the zonelist.
+ * If the effective policy is 'bind' or 'prefer-many', returns a pointer
+ * to the mempolicy's @nodemask for filtering the zonelist.
*
* Must be protected by read_mems_allowed_begin()
*/
@@ -2019,17 +2030,19 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask)
{
int nid;
+ int mode;
*mpol = get_vma_policy(vma, addr);
- *nodemask = NULL; /* assume !MPOL_BIND */
+ *nodemask = NULL;
+ mode = (*mpol)->mode;
- if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
+ if (unlikely(mode == MPOL_INTERLEAVE)) {
nid = interleave_nid(*mpol, vma, addr,
huge_page_shift(hstate_vma(vma)));
} else {
nid = policy_node(gfp_flags, *mpol, numa_node_id());
- if ((*mpol)->mode == MPOL_BIND)
- *nodemask = &(*mpol)->v.nodes;
+ if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
+ *nodemask = &(*mpol)->nodes;
}
return nid;
}
@@ -2053,7 +2066,6 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
struct mempolicy *mempolicy;
- int nid;
if (!(mask && current->mempolicy))
return false;
@@ -2062,16 +2074,14 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
mempolicy = current->mempolicy;
switch (mempolicy->mode) {
case MPOL_PREFERRED:
- if (mempolicy->flags & MPOL_F_LOCAL)
- nid = numa_node_id();
- else
- nid = mempolicy->v.preferred_node;
- init_nodemask_of_node(mask, nid);
- break;
-
+ case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
- *mask = mempolicy->v.nodes;
+ *mask = mempolicy->nodes;
+ break;
+
+ case MPOL_LOCAL:
+ init_nodemask_of_node(mask, numa_node_id());
break;
default:
@@ -2084,16 +2094,16 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
#endif
/*
- * mempolicy_nodemask_intersects
+ * mempolicy_in_oom_domain
*
- * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
- * policy. Otherwise, check for intersection between mask and the policy
- * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local'
- * policy, always return true since it may allocate elsewhere on fallback.
+ * If tsk's mempolicy is "bind", check for intersection between mask and
+ * the policy nodemask. Otherwise, return true for all other policies
+ * including "interleave", as a tsk with "interleave" policy may have
+ * memory allocated from all nodes in the system.
*
* Takes task_lock(tsk) to prevent freeing of its mempolicy.
*/
-bool mempolicy_nodemask_intersects(struct task_struct *tsk,
+bool mempolicy_in_oom_domain(struct task_struct *tsk,
const nodemask_t *mask)
{
struct mempolicy *mempolicy;
@@ -2101,29 +2111,13 @@ bool mempolicy_nodemask_intersects(struct task_struct *tsk,
if (!mask)
return ret;
+
task_lock(tsk);
mempolicy = tsk->mempolicy;
- if (!mempolicy)
- goto out;
-
- switch (mempolicy->mode) {
- case MPOL_PREFERRED:
- /*
- * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
- * allocate from, they may fallback to other nodes when oom.
- * Thus, it's possible for tsk to have allocated memory from
- * nodes in mask.
- */
- break;
- case MPOL_BIND:
- case MPOL_INTERLEAVE:
- ret = nodes_intersects(mempolicy->v.nodes, *mask);
- break;
- default:
- BUG();
- }
-out:
+ if (mempolicy && mempolicy->mode == MPOL_BIND)
+ ret = nodes_intersects(mempolicy->nodes, *mask);
task_unlock(tsk);
+
return ret;
}
@@ -2134,58 +2128,89 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
{
struct page *page;
- page = __alloc_pages(gfp, order, nid);
+ page = __alloc_pages(gfp, order, nid, NULL);
/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
if (!static_branch_likely(&vm_numa_stat_key))
return page;
if (page && page_to_nid(page) == nid) {
preempt_disable();
- __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
+ __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
preempt_enable();
}
return page;
}
+static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
+ int nid, struct mempolicy *pol)
+{
+ struct page *page;
+ gfp_t preferred_gfp;
+
+ /*
+ * This is a two pass approach. The first pass will only try the
+ * preferred nodes but skip the direct reclaim and allow the
+ * allocation to fail, while the second pass will try all the
+ * nodes in the system.
+ */
+ preferred_gfp = gfp | __GFP_NOWARN;
+ preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
+ page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
+ if (!page)
+ page = __alloc_pages(gfp, order, nid, NULL);
+
+ return page;
+}
+
/**
- * alloc_pages_vma - Allocate a page for a VMA.
+ * vma_alloc_folio - Allocate a folio for a VMA.
+ * @gfp: GFP flags.
+ * @order: Order of the folio.
+ * @vma: Pointer to VMA or NULL if not available.
+ * @addr: Virtual address of the allocation. Must be inside @vma.
+ * @hugepage: For hugepages try only the preferred node if possible.
*
- * @gfp:
- * %GFP_USER user allocation.
- * %GFP_KERNEL kernel allocations,
- * %GFP_HIGHMEM highmem/user allocations,
- * %GFP_FS allocation should not call back into a file system.
- * %GFP_ATOMIC don't sleep.
+ * Allocate a folio for a specific address in @vma, using the appropriate
+ * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock
+ * of the mm_struct of the VMA to prevent it from going away. Should be
+ * used for all allocations for folios that will be mapped into user space.
*
- * @order:Order of the GFP allocation.
- * @vma: Pointer to VMA or NULL if not available.
- * @addr: Virtual Address of the allocation. Must be inside the VMA.
- * @node: Which node to prefer for allocation (modulo policy).
- * @hugepage: for hugepages try only the preferred node if possible
- *
- * This function allocates a page from the kernel page pool and applies
- * a NUMA policy associated with the VMA or the current process.
- * When VMA is not NULL caller must read-lock the mmap_lock of the
- * mm_struct of the VMA to prevent it from going away. Should be used for
- * all allocations for pages that will be mapped into user space. Returns
- * NULL when no page can be allocated.
+ * Return: The folio on success or NULL if allocation fails.
*/
-struct page *
-alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
- unsigned long addr, int node, bool hugepage)
+struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
+ unsigned long addr, bool hugepage)
{
struct mempolicy *pol;
- struct page *page;
+ int node = numa_node_id();
+ struct folio *folio;
int preferred_nid;
nodemask_t *nmask;
pol = get_vma_policy(vma, addr);
if (pol->mode == MPOL_INTERLEAVE) {
+ struct page *page;
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
mpol_cond_put(pol);
+ gfp |= __GFP_COMP;
page = alloc_page_interleave(gfp, order, nid);
+ if (page && order > 1)
+ prep_transhuge_page(page);
+ folio = (struct folio *)page;
+ goto out;
+ }
+
+ if (pol->mode == MPOL_PREFERRED_MANY) {
+ struct page *page;
+
+ node = policy_node(gfp, pol, node);
+ gfp |= __GFP_COMP;
+ page = alloc_pages_preferred_many(gfp, order, node, pol);
+ mpol_cond_put(pol);
+ if (page && order > 1)
+ prep_transhuge_page(page);
+ folio = (struct folio *)page;
goto out;
}
@@ -2199,11 +2224,11 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
* node and don't fall back to other nodes, as the cost of
* remote accesses would likely offset THP benefits.
*
- * If the policy is interleave, or does not allow the current
+ * If the policy is interleave or does not allow the current
* node in its nodemask, we allocate the standard way.
*/
- if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
- hpage_node = pol->v.preferred_node;
+ if (pol->mode == MPOL_PREFERRED)
+ hpage_node = first_node(pol->nodes);
nmask = policy_nodemask(gfp, pol);
if (!nmask || node_isset(hpage_node, *nmask)) {
@@ -2212,8 +2237,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
* First, try to allocate THP only on local node, but
* don't reclaim unnecessarily, just compact.
*/
- page = __alloc_pages_node(hpage_node,
- gfp | __GFP_THISNODE | __GFP_NORETRY, order);
+ folio = __folio_alloc_node(gfp | __GFP_THISNODE |
+ __GFP_NORETRY, order, hpage_node);
/*
* If hugepage allocations are configured to always
@@ -2221,9 +2246,9 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
* to prefer hugepage backing, retry allowing remote
* memory with both reclaim and compact as well.
*/
- if (!page && (gfp & __GFP_DIRECT_RECLAIM))
- page = __alloc_pages_node(hpage_node,
- gfp, order);
+ if (!folio && (gfp & __GFP_DIRECT_RECLAIM))
+ folio = __folio_alloc(gfp, order, hpage_node,
+ nmask);
goto out;
}
@@ -2231,29 +2256,28 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
nmask = policy_nodemask(gfp, pol);
preferred_nid = policy_node(gfp, pol, node);
- page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
+ folio = __folio_alloc(gfp, order, preferred_nid, nmask);
mpol_cond_put(pol);
out:
- return page;
+ return folio;
}
-EXPORT_SYMBOL(alloc_pages_vma);
+EXPORT_SYMBOL(vma_alloc_folio);
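A hedged sketch of a fault-path caller of the folio API documented above; the surrounding handler, its locking context, and the error handling are illustrative assumptions, not code from this patch (needs <linux/mm.h> and <linux/gfp.h>):

static vm_fault_t example_fault_alloc(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;	/* mmap_lock held by the fault path */
	struct folio *folio;

	/* Order-0 user folio, placed according to the VMA's NUMA policy. */
	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address, false);
	if (!folio)
		return VM_FAULT_OOM;

	/* ... map the folio into the page tables and return ... */
	return 0;
}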
/**
- * alloc_pages_current - Allocate pages.
+ * alloc_pages - Allocate pages.
+ * @gfp: GFP flags.
+ * @order: Power of two of number of pages to allocate.
*
- * @gfp:
- * %GFP_USER user allocation,
- * %GFP_KERNEL kernel allocation,
- * %GFP_HIGHMEM highmem allocation,
- * %GFP_FS don't call back into a file system.
- * %GFP_ATOMIC don't sleep.
- * @order: Power of two of allocation size in pages. 0 is a single page.
+ * Allocate 1 << @order contiguous pages. The physical address of the
+ * first page is naturally aligned (eg an order-3 allocation will be aligned
+ * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
+ * process is honoured when in process context.
*
- * Allocate a page from the kernel page pool. When not in
- * interrupt context and apply the current process NUMA policy.
- * Returns NULL when no page can be allocated.
+ * Context: Can be called from any context, providing the appropriate GFP
+ * flags are used.
+ * Return: The page on success or NULL if allocation fails.
*/
-struct page *alloc_pages_current(gfp_t gfp, unsigned order)
+struct page *alloc_pages(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = &default_policy;
struct page *page;
@@ -2267,14 +2291,109 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
*/
if (pol->mode == MPOL_INTERLEAVE)
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
+ else if (pol->mode == MPOL_PREFERRED_MANY)
+ page = alloc_pages_preferred_many(gfp, order,
+ policy_node(gfp, pol, numa_node_id()), pol);
else
- page = __alloc_pages_nodemask(gfp, order,
+ page = __alloc_pages(gfp, order,
policy_node(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
return page;
}
-EXPORT_SYMBOL(alloc_pages_current);
+EXPORT_SYMBOL(alloc_pages);
+
+struct folio *folio_alloc(gfp_t gfp, unsigned order)
+{
+ struct page *page = alloc_pages(gfp | __GFP_COMP, order);
+
+ if (page && order > 1)
+ prep_transhuge_page(page);
+ return (struct folio *)page;
+}
+EXPORT_SYMBOL(folio_alloc);
+
+static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
+ struct mempolicy *pol, unsigned long nr_pages,
+ struct page **page_array)
+{
+ int nodes;
+ unsigned long nr_pages_per_node;
+ int delta;
+ int i;
+ unsigned long nr_allocated;
+ unsigned long total_allocated = 0;
+
+ nodes = nodes_weight(pol->nodes);
+ nr_pages_per_node = nr_pages / nodes;
+ delta = nr_pages - nodes * nr_pages_per_node;
+
+ for (i = 0; i < nodes; i++) {
+ if (delta) {
+ nr_allocated = __alloc_pages_bulk(gfp,
+ interleave_nodes(pol), NULL,
+ nr_pages_per_node + 1, NULL,
+ page_array);
+ delta--;
+ } else {
+ nr_allocated = __alloc_pages_bulk(gfp,
+ interleave_nodes(pol), NULL,
+ nr_pages_per_node, NULL, page_array);
+ }
+
+ page_array += nr_allocated;
+ total_allocated += nr_allocated;
+ }
+
+ return total_allocated;
+}
+
+static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
+ struct mempolicy *pol, unsigned long nr_pages,
+ struct page **page_array)
+{
+ gfp_t preferred_gfp;
+ unsigned long nr_allocated = 0;
+
+ preferred_gfp = gfp | __GFP_NOWARN;
+ preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
+
+ nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
+ nr_pages, NULL, page_array);
+
+ if (nr_allocated < nr_pages)
+ nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
+ nr_pages - nr_allocated, NULL,
+ page_array + nr_allocated);
+ return nr_allocated;
+}
+
+/*
+ * Bulk page allocation and the mempolicy need to be considered together
+ * in some situations, such as vmalloc().
+ *
+ * Doing so can accelerate memory allocation, especially for interleaved
+ * allocations.
+ */
+unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
+ unsigned long nr_pages, struct page **page_array)
+{
+ struct mempolicy *pol = &default_policy;
+
+ if (!in_interrupt() && !(gfp & __GFP_THISNODE))
+ pol = get_task_policy(current);
+
+ if (pol->mode == MPOL_INTERLEAVE)
+ return alloc_pages_bulk_array_interleave(gfp, pol,
+ nr_pages, page_array);
+
+ if (pol->mode == MPOL_PREFERRED_MANY)
+ return alloc_pages_bulk_array_preferred_many(gfp,
+ numa_node_id(), pol, nr_pages, page_array);
+
+ return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
+ policy_nodemask(gfp, pol), nr_pages, NULL,
+ page_array);
+}
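As an illustrative sketch of a bulk caller in the vmalloc style the comment above has in mind (the helper name and error handling are assumptions):

/* pages[] must be zeroed by the caller; the bulk helper may return fewer
 * pages than requested, so keep retrying from the first unfilled slot. */
static int example_bulk_fill(struct page **pages, unsigned long nr)
{
	unsigned long got = 0;

	while (got < nr) {
		unsigned long n;

		n = alloc_pages_bulk_array_mempolicy(GFP_KERNEL,
						     nr - got, pages + got);
		if (!n)
			return -ENOMEM;
		got += n;
	}
	return 0;
}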
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
@@ -2330,6 +2449,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
return false;
if (a->flags != b->flags)
return false;
+ if (a->home_node != b->home_node)
+ return false;
if (mpol_store_user_nodemask(a))
if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
return false;
@@ -2337,12 +2458,11 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
switch (a->mode) {
case MPOL_BIND:
case MPOL_INTERLEAVE:
- return !!nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
- /* a's ->flags is the same as b's */
- if (a->flags & MPOL_F_LOCAL)
- return true;
- return a->v.preferred_node == b->v.preferred_node;
+ case MPOL_PREFERRED_MANY:
+ return !!nodes_equal(a->nodes, b->nodes);
+ case MPOL_LOCAL:
+ return true;
default:
BUG();
return false;
@@ -2451,14 +2571,11 @@ static void sp_free(struct sp_node *n)
* @addr: virtual address where page mapped
*
* Lookup current policy node id for vma,addr and "compare to" page's
- * node id.
- *
- * Returns:
- * -1 - not misplaced, page is in the right node
- * node - node id where the page should be
- *
- * Policy determination "mimics" alloc_page_vma().
+ * node id. Policy determination "mimics" alloc_page_vma().
* Called from fault path where we know the vma and faulting address.
+ *
+ * Return: NUMA_NO_NODE if the page is in a node that is valid for this
+ * policy, or a suitable node ID to allocate a replacement page from.
*/
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
@@ -2469,7 +2586,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
int thiscpu = raw_smp_processor_id();
int thisnid = cpu_to_node(thiscpu);
int polnid = NUMA_NO_NODE;
- int ret = -1;
+ int ret = NUMA_NO_NODE;
pol = get_vma_policy(vma, addr);
if (!(pol->flags & MPOL_F_MOF))
@@ -2483,26 +2600,36 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
break;
case MPOL_PREFERRED:
- if (pol->flags & MPOL_F_LOCAL)
- polnid = numa_node_id();
- else
- polnid = pol->v.preferred_node;
+ if (node_isset(curnid, pol->nodes))
+ goto out;
+ polnid = first_node(pol->nodes);
+ break;
+
+ case MPOL_LOCAL:
+ polnid = numa_node_id();
break;
case MPOL_BIND:
+ /* Optimize placement among multiple nodes via NUMA balancing */
+ if (pol->flags & MPOL_F_MORON) {
+ if (node_isset(thisnid, pol->nodes))
+ break;
+ goto out;
+ }
+ fallthrough;
+ case MPOL_PREFERRED_MANY:
/*
- * allows binding to multiple nodes.
* use current page if in policy nodemask,
* else select nearest allowed node, if any.
* If no allowed nodes, use current [!misplaced].
*/
- if (node_isset(curnid, pol->v.nodes))
+ if (node_isset(curnid, pol->nodes))
goto out;
z = first_zones_zonelist(
node_zonelist(numa_node_id(), GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
- &pol->v.nodes);
+ &pol->nodes);
polnid = zone_to_nid(z->zone);
break;
@@ -2642,6 +2769,7 @@ alloc_new:
mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!mpol_new)
goto err_out;
+ atomic_set(&mpol_new->refcnt, 1);
goto restart;
}
@@ -2705,7 +2833,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
vma->vm_pgoff,
sz, npol ? npol->mode : -1,
npol ? npol->flags : -1,
- npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
+ npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
if (npol) {
new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -2803,7 +2931,7 @@ void __init numa_policy_init(void)
.refcnt = ATOMIC_INIT(1),
.mode = MPOL_PREFERRED,
.flags = MPOL_F_MOF | MPOL_F_MORON,
- .v = { .preferred_node = nid, },
+ .nodes = nodemask_of_node(nid),
};
}
@@ -2847,9 +2975,6 @@ void numa_default_policy(void)
* Parse and format mempolicy from/to strings
*/
-/*
- * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
- */
static const char * const policy_modes[] =
{
[MPOL_DEFAULT] = "default",
@@ -2857,6 +2982,7 @@ static const char * const policy_modes[] =
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
[MPOL_LOCAL] = "local",
+ [MPOL_PREFERRED_MANY] = "prefer (many)",
};
@@ -2869,7 +2995,7 @@ static const char * const policy_modes[] =
* Format of input:
* <mode>[=<flags>][:<nodelist>]
*
- * On success, returns 0, else 1
+ * Return: %0 on success, else %1
*/
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
@@ -2927,7 +3053,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
*/
if (nodelist)
goto out;
- mode = MPOL_PREFERRED;
break;
case MPOL_DEFAULT:
/*
@@ -2936,6 +3061,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
if (!nodelist)
err = 0;
goto out;
+ case MPOL_PREFERRED_MANY:
case MPOL_BIND:
/*
* Insist on a nodelist
@@ -2966,12 +3092,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
* Save nodes for mpol_to_str() to show the tmpfs mount options
* for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
*/
- if (mode != MPOL_PREFERRED)
- new->v.nodes = nodes;
- else if (nodelist)
- new->v.preferred_node = first_node(nodes);
- else
- new->flags |= MPOL_F_LOCAL;
+ if (mode != MPOL_PREFERRED) {
+ new->nodes = nodes;
+ } else if (nodelist) {
+ nodes_clear(new->nodes);
+ node_set(first_node(nodes), new->nodes);
+ } else {
+ new->mode = MPOL_LOCAL;
+ }
/*
* Save nodes for contextualization: this will be used to "clone"
@@ -3017,16 +3145,13 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
switch (mode) {
case MPOL_DEFAULT:
+ case MPOL_LOCAL:
break;
case MPOL_PREFERRED:
- if (flags & MPOL_F_LOCAL)
- mode = MPOL_LOCAL;
- else
- node_set(pol->v.preferred_node, nodes);
- break;
+ case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
- nodes = pol->v.nodes;
+ nodes = pol->nodes;
break;
default:
WARN_ON_ONCE(1);
diff --git a/mm/mempool.c b/mm/mempool.c
index f473cdddaff0..734bcf5afbb7 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -17,7 +17,6 @@
#include <linux/kmemleak.h>
#include <linux/export.h>
#include <linux/mempool.h>
-#include <linux/blkdev.h>
#include <linux/writeback.h>
#include "slab.h"
@@ -58,8 +57,10 @@ static void __check_element(mempool_t *pool, void *element, size_t size)
static void check_element(mempool_t *pool, void *element)
{
/* Mempools backed by slab allocator */
- if (pool->free == mempool_free_slab || pool->free == mempool_kfree) {
- __check_element(pool, element, ksize(element));
+ if (pool->free == mempool_kfree) {
+ __check_element(pool, element, (size_t)pool->pool_data);
+ } else if (pool->free == mempool_free_slab) {
+ __check_element(pool, element, kmem_cache_size(pool->pool_data));
} else if (pool->free == mempool_free_pages) {
/* Mempools backed by page allocator */
int order = (int)(long)pool->pool_data;
@@ -81,8 +82,10 @@ static void __poison_element(void *element, size_t size)
static void poison_element(mempool_t *pool, void *element)
{
/* Mempools backed by slab allocator */
- if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) {
- __poison_element(element, ksize(element));
+ if (pool->alloc == mempool_kmalloc) {
+ __poison_element(element, (size_t)pool->pool_data);
+ } else if (pool->alloc == mempool_alloc_slab) {
+ __poison_element(element, kmem_cache_size(pool->pool_data));
} else if (pool->alloc == mempool_alloc_pages) {
/* Mempools backed by page allocator */
int order = (int)(long)pool->pool_data;
@@ -104,17 +107,21 @@ static inline void poison_element(mempool_t *pool, void *element)
static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
{
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
- kasan_poison_kfree(element, _RET_IP_);
+ kasan_slab_free_mempool(element);
else if (pool->alloc == mempool_alloc_pages)
- kasan_free_pages(element, (unsigned long)pool->pool_data);
+ kasan_poison_pages(element, (unsigned long)pool->pool_data,
+ false);
}
static void kasan_unpoison_element(mempool_t *pool, void *element)
{
- if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
- kasan_unpoison_slab(element);
+ if (pool->alloc == mempool_kmalloc)
+ kasan_unpoison_range(element, (size_t)pool->pool_data);
+ else if (pool->alloc == mempool_alloc_slab)
+ kasan_unpoison_range(element, kmem_cache_size(pool->pool_data));
else if (pool->alloc == mempool_alloc_pages)
- kasan_alloc_pages(element, (unsigned long)pool->pool_data);
+ kasan_unpoison_pages(element, (unsigned long)pool->pool_data,
+ false);
}
static __always_inline void add_element(mempool_t *pool, void *element)
@@ -251,7 +258,7 @@ EXPORT_SYMBOL(mempool_init);
mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data)
{
- return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,
+ return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data,
GFP_KERNEL, NUMA_NO_NODE);
}
EXPORT_SYMBOL(mempool_create);
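For reference, a minimal sketch of the slab-backed pairing that the check_element()/poison_element() changes above key off; the cache name and object size are illustrative (needs <linux/mempool.h> and <linux/slab.h>):

static struct kmem_cache *example_cache;
static mempool_t *example_pool;

static int __init example_pool_init(void)
{
	/* With mempool_alloc_slab/mempool_free_slab, pool->pool_data is the
	 * kmem_cache, which is why the debug hooks can now use
	 * kmem_cache_size() instead of ksize() on an element. */
	example_cache = kmem_cache_create("example_obj", 256, 0,
					  SLAB_HWCACHE_ALIGN, NULL);
	if (!example_cache)
		return -ENOMEM;

	example_pool = mempool_create(16, mempool_alloc_slab,
				      mempool_free_slab, example_cache);
	if (!example_pool) {
		kmem_cache_destroy(example_cache);
		return -ENOMEM;
	}
	return 0;
}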
@@ -378,7 +385,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
gfp_t gfp_temp;
VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
- might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
+ might_alloc(gfp_mask);
gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
diff --git a/mm/memremap.c b/mm/memremap.c
index 198083453182..bee85560a243 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -1,10 +1,10 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
#include <linux/device.h>
#include <linux/io.h>
#include <linux/kasan.h>
#include <linux/memory_hotplug.h>
-#include <linux/mm.h>
+#include <linux/memremap.h>
#include <linux/pfn_t.h>
#include <linux/swap.h>
#include <linux/mmzone.h>
@@ -12,6 +12,7 @@
#include <linux/types.h>
#include <linux/wait_bit.h>
#include <linux/xarray.h>
+#include "internal.h"
static DEFINE_XARRAY(pgmap_array);
@@ -37,35 +38,29 @@ unsigned long memremap_compat_align(void)
EXPORT_SYMBOL_GPL(memremap_compat_align);
#endif
-#ifdef CONFIG_DEV_PAGEMAP_OPS
+#ifdef CONFIG_FS_DAX
DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
EXPORT_SYMBOL(devmap_managed_key);
-static void devmap_managed_enable_put(void)
+static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
{
- static_branch_dec(&devmap_managed_key);
+ if (pgmap->type == MEMORY_DEVICE_FS_DAX)
+ static_branch_dec(&devmap_managed_key);
}
-static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
+static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
{
- if (pgmap->type == MEMORY_DEVICE_PRIVATE &&
- (!pgmap->ops || !pgmap->ops->page_free)) {
- WARN(1, "Missing page_free method\n");
- return -EINVAL;
- }
-
- static_branch_inc(&devmap_managed_key);
- return 0;
+ if (pgmap->type == MEMORY_DEVICE_FS_DAX)
+ static_branch_inc(&devmap_managed_key);
}
#else
-static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
+static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
{
- return -EINVAL;
}
-static void devmap_managed_enable_put(void)
+static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
{
}
-#endif /* CONFIG_DEV_PAGEMAP_OPS */
+#endif /* CONFIG_FS_DAX */
static void pgmap_array_delete(struct range *range)
{
@@ -84,59 +79,43 @@ static unsigned long pfn_first(struct dev_pagemap *pgmap, int range_id)
return pfn + vmem_altmap_offset(pgmap_altmap(pgmap));
}
-static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
+bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
{
- const struct range *range = &pgmap->ranges[range_id];
+ int i;
- return (range->start + range_len(range)) >> PAGE_SHIFT;
-}
+ for (i = 0; i < pgmap->nr_range; i++) {
+ struct range *range = &pgmap->ranges[i];
-static unsigned long pfn_next(unsigned long pfn)
-{
- if (pfn % 1024 == 0)
- cond_resched();
- return pfn + 1;
-}
+ if (pfn >= PHYS_PFN(range->start) &&
+ pfn <= PHYS_PFN(range->end))
+ return pfn >= pfn_first(pgmap, i);
+ }
-#define for_each_device_pfn(pfn, map, i) \
- for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
+ return false;
+}
-static void dev_pagemap_kill(struct dev_pagemap *pgmap)
+static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
{
- if (pgmap->ops && pgmap->ops->kill)
- pgmap->ops->kill(pgmap);
- else
- percpu_ref_kill(pgmap->ref);
+ const struct range *range = &pgmap->ranges[range_id];
+
+ return (range->start + range_len(range)) >> PAGE_SHIFT;
}
-static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
+static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id)
{
- if (pgmap->ops && pgmap->ops->cleanup) {
- pgmap->ops->cleanup(pgmap);
- } else {
- wait_for_completion(&pgmap->done);
- percpu_ref_exit(pgmap->ref);
- }
- /*
- * Undo the pgmap ref assignment for the internal case as the
- * caller may re-enable the same pgmap.
- */
- if (pgmap->ref == &pgmap->internal_ref)
- pgmap->ref = NULL;
+ return (pfn_end(pgmap, range_id) -
+ pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift;
}
static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
{
struct range *range = &pgmap->ranges[range_id];
struct page *first_page;
- int nid;
/* make sure to access a memmap that was actually initialized */
first_page = pfn_to_page(pfn_first(pgmap, range_id));
/* pages are dead and unused, undo the arch mapping */
- nid = page_to_nid(first_page);
-
mem_hotplug_begin();
remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(range->start),
PHYS_PFN(range_len(range)));
@@ -144,32 +123,34 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
__remove_pages(PHYS_PFN(range->start),
PHYS_PFN(range_len(range)), NULL);
} else {
- arch_remove_memory(nid, range->start, range_len(range),
+ arch_remove_memory(range->start, range_len(range),
pgmap_altmap(pgmap));
kasan_remove_zero_shadow(__va(range->start), range_len(range));
}
mem_hotplug_done();
- untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
+ untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true);
pgmap_array_delete(range);
}
void memunmap_pages(struct dev_pagemap *pgmap)
{
- unsigned long pfn;
int i;
- dev_pagemap_kill(pgmap);
- for (i = 0; i < pgmap->nr_range; i++)
- for_each_device_pfn(pfn, pgmap, i)
- put_page(pfn_to_page(pfn));
- dev_pagemap_cleanup(pgmap);
+ percpu_ref_kill(&pgmap->ref);
+ if (pgmap->type != MEMORY_DEVICE_PRIVATE &&
+ pgmap->type != MEMORY_DEVICE_COHERENT)
+ for (i = 0; i < pgmap->nr_range; i++)
+ percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
+
+ wait_for_completion(&pgmap->done);
for (i = 0; i < pgmap->nr_range; i++)
pageunmap_range(pgmap, i);
+ percpu_ref_exit(&pgmap->ref);
WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
- devmap_managed_enable_put();
+ devmap_managed_enable_put(pgmap);
}
EXPORT_SYMBOL_GPL(memunmap_pages);
@@ -180,8 +161,7 @@ static void devm_memremap_pages_release(void *data)
static void dev_pagemap_percpu_release(struct percpu_ref *ref)
{
- struct dev_pagemap *pgmap =
- container_of(ref, struct dev_pagemap, internal_ref);
+ struct dev_pagemap *pgmap = container_of(ref, struct dev_pagemap, ref);
complete(&pgmap->done);
}
@@ -189,6 +169,7 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref)
static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
int range_id, int nid)
{
+ const bool is_private = pgmap->type == MEMORY_DEVICE_PRIVATE;
struct range *range = &pgmap->ranges[range_id];
struct dev_pagemap *conflict_pgmap;
int error, is_ram;
@@ -234,6 +215,11 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
if (error)
goto err_pfn_remap;
+ if (!mhp_range_allowed(range->start, range_len(range), !is_private)) {
+ error = -EINVAL;
+ goto err_kasan;
+ }
+
mem_hotplug_begin();
/*
@@ -247,7 +233,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
* the CPU, we do want the linear mapping and thus use
* arch_add_memory().
*/
- if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ if (is_private) {
error = add_pages(nid, PHYS_PFN(range->start),
PHYS_PFN(range_len(range)), params);
} else {
@@ -266,7 +252,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
move_pfn_range_to_zone(zone, PHYS_PFN(range->start),
- PHYS_PFN(range_len(range)), params->altmap);
+ PHYS_PFN(range_len(range)), params->altmap,
+ MIGRATE_MOVABLE);
}
mem_hotplug_done();
@@ -280,14 +267,16 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
PHYS_PFN(range->start),
PHYS_PFN(range_len(range)), pgmap);
- percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
- - pfn_first(pgmap, range_id));
+ if (pgmap->type != MEMORY_DEVICE_PRIVATE &&
+ pgmap->type != MEMORY_DEVICE_COHERENT)
+ percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id));
return 0;
err_add_memory:
- kasan_remove_zero_shadow(__va(range->start), range_len(range));
+ if (!is_private)
+ kasan_remove_zero_shadow(__va(range->start), range_len(range));
err_kasan:
- untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
+ untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true);
err_pfn_remap:
pgmap_array_delete(range);
return error;
@@ -295,18 +284,18 @@ err_pfn_remap:
/*
- * Not device managed version of dev_memremap_pages, undone by
- * memunmap_pages(). Please use dev_memremap_pages if you have a struct
+ * Not device managed version of devm_memremap_pages, undone by
+ * memunmap_pages(). Please use devm_memremap_pages if you have a struct
* device available.
*/
void *memremap_pages(struct dev_pagemap *pgmap, int nid)
{
struct mhp_params params = {
.altmap = pgmap_altmap(pgmap),
+ .pgmap = pgmap,
.pgprot = PAGE_KERNEL,
};
const int nr_range = pgmap->nr_range;
- bool need_devmap_managed = true;
int error, i;
if (WARN_ONCE(!nr_range, "nr_range must be specified\n"))
@@ -322,52 +311,49 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
WARN(1, "Missing migrate_to_ram method\n");
return ERR_PTR(-EINVAL);
}
+ if (!pgmap->ops->page_free) {
+ WARN(1, "Missing page_free method\n");
+ return ERR_PTR(-EINVAL);
+ }
+ if (!pgmap->owner) {
+ WARN(1, "Missing owner\n");
+ return ERR_PTR(-EINVAL);
+ }
+ break;
+ case MEMORY_DEVICE_COHERENT:
+ if (!pgmap->ops->page_free) {
+ WARN(1, "Missing page_free method\n");
+ return ERR_PTR(-EINVAL);
+ }
if (!pgmap->owner) {
WARN(1, "Missing owner\n");
return ERR_PTR(-EINVAL);
}
break;
case MEMORY_DEVICE_FS_DAX:
- if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
- IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
+ if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
WARN(1, "File system DAX not supported\n");
return ERR_PTR(-EINVAL);
}
+ params.pgprot = pgprot_decrypted(params.pgprot);
break;
case MEMORY_DEVICE_GENERIC:
- need_devmap_managed = false;
break;
case MEMORY_DEVICE_PCI_P2PDMA:
params.pgprot = pgprot_noncached(params.pgprot);
- need_devmap_managed = false;
break;
default:
WARN(1, "Invalid pgmap type %d\n", pgmap->type);
break;
}
- if (!pgmap->ref) {
- if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
- return ERR_PTR(-EINVAL);
-
- init_completion(&pgmap->done);
- error = percpu_ref_init(&pgmap->internal_ref,
- dev_pagemap_percpu_release, 0, GFP_KERNEL);
- if (error)
- return ERR_PTR(error);
- pgmap->ref = &pgmap->internal_ref;
- } else {
- if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
- WARN(1, "Missing reference count teardown definition\n");
- return ERR_PTR(-EINVAL);
- }
- }
+ init_completion(&pgmap->done);
+ error = percpu_ref_init(&pgmap->ref, dev_pagemap_percpu_release, 0,
+ GFP_KERNEL);
+ if (error)
+ return ERR_PTR(error);
- if (need_devmap_managed) {
- error = devmap_managed_enable_get(pgmap);
- if (error)
- return ERR_PTR(error);
- }
+ devmap_managed_enable_get(pgmap);
/*
* Clear the pgmap nr_range as it will be incremented for each
@@ -399,7 +385,7 @@ EXPORT_SYMBOL_GPL(memremap_pages);
* @pgmap: pointer to a struct dev_pagemap
*
* Notes:
- * 1/ At a minimum the res and type members of @pgmap must be initialized
+ * 1/ At a minimum the range and type members of @pgmap must be initialized
* by the caller before passing it to this function
*
* 2/ The altmap field may optionally be initialized, in which case
@@ -474,7 +460,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
/* fall back to slow path lookup */
rcu_read_lock();
pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
- if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
+ if (pgmap && !percpu_ref_tryget_live_rcu(&pgmap->ref))
pgmap = NULL;
rcu_read_unlock();
@@ -482,21 +468,24 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
}
EXPORT_SYMBOL_GPL(get_dev_pagemap);
-#ifdef CONFIG_DEV_PAGEMAP_OPS
-void free_devmap_managed_page(struct page *page)
+void free_zone_device_page(struct page *page)
{
- /* notify page idle for dax */
- if (!is_device_private_page(page)) {
- wake_up_var(&page->_refcount);
+ if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free))
return;
- }
- __ClearPageWaiters(page);
+ mem_cgroup_uncharge(page_folio(page));
- mem_cgroup_uncharge(page);
+ /*
+ * Note: we don't expect anonymous compound pages yet. Once they are
+ * supported and we can PTE-map them similarly to THP, we'd have to clear
+ * PG_anon_exclusive on all tail pages.
+ */
+ VM_BUG_ON_PAGE(PageAnon(page) && PageCompound(page), page);
+ if (PageAnon(page))
+ __ClearPageAnonExclusive(page);
/*
- * When a device_private page is freed, the page->mapping field
+ * When a device managed page is freed, the page->mapping field
* may still contain a (stale) mapping value. For example, the
* lower bits of page->mapping may still identify the page as an
* anonymous page. Ultimately, this entire field is just stale
@@ -518,5 +507,44 @@ void free_devmap_managed_page(struct page *page)
*/
page->mapping = NULL;
page->pgmap->ops->page_free(page);
+
+ if (page->pgmap->type != MEMORY_DEVICE_PRIVATE &&
+ page->pgmap->type != MEMORY_DEVICE_COHERENT)
+ /*
+ * Reset the page count to 1 to prepare for handing out the page
+ * again.
+ */
+ set_page_count(page, 1);
+ else
+ put_dev_pagemap(page->pgmap);
+}
+
+void zone_device_page_init(struct page *page)
+{
+ /*
+ * Drivers shouldn't be allocating pages after calling
+ * memunmap_pages().
+ */
+ WARN_ON_ONCE(!percpu_ref_tryget_live(&page->pgmap->ref));
+ set_page_count(page, 1);
+ lock_page(page);
+}
+EXPORT_SYMBOL_GPL(zone_device_page_init);
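For illustration only (this sketch is not part of the diff above): after this change a driver re-initialises a ZONE_DEVICE page it is about to hand out with zone_device_page_init(), which takes the pgmap reference and returns the page locked with a count of one, instead of touching the refcount itself. A minimal sketch; the my_ prefixed function is hypothetical and the caller is assumed to have found the page on its own free list:

#include <linux/memremap.h>
#include <linux/mm.h>

/* Sketch only: my_devmem_prepare_page() is a hypothetical driver helper. */
static struct page *my_devmem_prepare_page(struct page *page)
{
	if (!page)
		return NULL;
	/* Pins the pgmap; the page comes back locked with count == 1. */
	zone_device_page_init(page);
	return page;
}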
+
+#ifdef CONFIG_FS_DAX
+bool __put_devmap_managed_page_refs(struct page *page, int refs)
+{
+ if (page->pgmap->type != MEMORY_DEVICE_FS_DAX)
+ return false;
+
+ /*
+ * fsdax page refcounts are 1-based, rather than 0-based: if
+ * refcount is 1, then the page is free and the refcount is
+ * stable because nobody holds a reference on the page.
+ */
+ if (page_ref_sub_return(page, refs) == 1)
+ wake_up_var(&page->_refcount);
+ return true;
}
-#endif /* CONFIG_DEV_PAGEMAP_OPS */
+EXPORT_SYMBOL(__put_devmap_managed_page_refs);
+#endif /* CONFIG_FS_DAX */
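For illustration only (this sketch is not part of the diff above): with the percpu reference now embedded in struct dev_pagemap, a memremap_pages() caller no longer supplies its own ref or kill/cleanup ops. It fills in range, nr_range, type, owner and the ops table (page_free, plus migrate_to_ram for device-private memory) and tears everything down with memunmap_pages(). A minimal sketch under those assumptions; everything prefixed my_ is hypothetical:

#include <linux/memremap.h>
#include <linux/err.h>

struct my_devmem {				/* hypothetical driver state */
	struct dev_pagemap pagemap;
};

/* Hypothetical callbacks, defined elsewhere in the driver. */
static void my_page_free(struct page *page);
static vm_fault_t my_migrate_to_ram(struct vm_fault *vmf);

static const struct dev_pagemap_ops my_pagemap_ops = {
	.page_free	= my_page_free,
	.migrate_to_ram	= my_migrate_to_ram,
};

static int my_devmem_register(struct my_devmem *dmem, u64 start, u64 end, int nid)
{
	void *addr;

	dmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
	dmem->pagemap.range.start = start;
	dmem->pagemap.range.end = end;
	dmem->pagemap.nr_range = 1;
	dmem->pagemap.ops = &my_pagemap_ops;
	dmem->pagemap.owner = dmem;	/* any cookie identifying this driver */

	addr = memremap_pages(&dmem->pagemap, nid);
	if (IS_ERR(addr))
		return PTR_ERR(addr);
	return 0;
}

/* On teardown: memunmap_pages(&dmem->pagemap); */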
diff --git a/mm/memtest.c b/mm/memtest.c
index f53ace709ccd..57149dfee438 100644
--- a/mm/memtest.c
+++ b/mm/memtest.c
@@ -4,6 +4,9 @@
#include <linux/init.h>
#include <linux/memblock.h>
+bool early_memtest_done;
+phys_addr_t early_memtest_bad_size;
+
static u64 patterns[] __initdata = {
/* The first entry has to be 0 to leave memtest with zeroed memory */
0,
@@ -30,6 +33,7 @@ static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr
pr_info(" %016llx bad mem addr %pa - %pa reserved\n",
cpu_to_be64(pattern), &start_bad, &end_bad);
memblock_reserve(start_bad, end_bad - start_bad);
+ early_memtest_bad_size += (end_bad - start_bad);
}
static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size)
@@ -61,6 +65,8 @@ static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size
}
if (start_bad)
reserve_bad_mem(pattern, start_bad, last_bad + incr);
+
+ early_memtest_done = true;
}
static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
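For illustration only (not part of the diff above): the two new globals simply record whether early_memtest() ran and how much memory it ended up reserving as bad, so later code can report the outcome. A hypothetical consumer, assuming the globals are declared in a shared header:

#include <linux/printk.h>
#include <linux/types.h>

extern bool early_memtest_done;
extern phys_addr_t early_memtest_bad_size;

/* Sketch only: my_report_memtest() is a hypothetical caller. */
static void my_report_memtest(void)
{
	if (!early_memtest_done)
		return;
	pr_info("early_memtest: %pa bytes of bad memory reserved\n",
		&early_memtest_bad_size);
}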
diff --git a/mm/migrate.c b/mm/migrate.c
index f94d7c7eeddf..24baad2571e3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -21,7 +21,6 @@
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
-#include <linux/pagevec.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/topology.h>
@@ -38,54 +37,30 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
-#include <linux/pagewalk.h>
#include <linux/pfn_t.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
#include <linux/balloon_compaction.h>
-#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
#include <linux/ptrace.h>
#include <linux/oom.h>
+#include <linux/memory.h>
+#include <linux/random.h>
+#include <linux/sched/sysctl.h>
+#include <linux/memory-tiers.h>
#include <asm/tlbflush.h>
-#define CREATE_TRACE_POINTS
#include <trace/events/migrate.h>
#include "internal.h"
-/*
- * migrate_prep() needs to be called before we start compiling a list of pages
- * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
- * undesirable, use migrate_prep_local()
- */
-int migrate_prep(void)
-{
- /*
- * Clear the LRU lists so pages can be isolated.
- * Note that pages may be moved off the LRU after we have
- * drained them. Those pages will fail to migrate like other
- * pages that may be busy.
- */
- lru_add_drain_all();
-
- return 0;
-}
-
-/* Do the necessary work of migrate_prep but not if it involves other CPUs */
-int migrate_prep_local(void)
-{
- lru_add_drain();
-
- return 0;
-}
-
-int isolate_movable_page(struct page *page, isolate_mode_t mode)
+bool isolate_movable_page(struct page *page, isolate_mode_t mode)
{
- struct address_space *mapping;
+ struct folio *folio = folio_get_nontail_page(page);
+ const struct movable_operations *mops;
/*
* Avoid burning cycles with pages that are yet under __free_pages(),
@@ -96,16 +71,25 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode)
* the put_page() at the end of this block will take care of
* release this page, thus avoiding a nasty leakage.
*/
- if (unlikely(!get_page_unless_zero(page)))
+ if (!folio)
goto out;
+ if (unlikely(folio_test_slab(folio)))
+ goto out_putfolio;
+ /* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */
+ smp_rmb();
/*
- * Check PageMovable before holding a PG_lock because page's owner
- * assumes anybody doesn't touch PG_lock of newly allocated page
- * so unconditionally grabbing the lock ruins page's owner side.
+ * Check movable flag before taking the page lock because
+ * we use non-atomic bitops on newly allocated page flags so
+ * unconditionally grabbing the lock ruins page's owner side.
*/
- if (unlikely(!__PageMovable(page)))
- goto out_putpage;
+ if (unlikely(!__folio_test_movable(folio)))
+ goto out_putfolio;
+ /* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */
+ smp_rmb();
+ if (unlikely(folio_test_slab(folio)))
+ goto out_putfolio;
+
/*
* As movable pages are not isolated from LRU lists, concurrent
* compaction threads can race against page migration functions
@@ -117,45 +101,39 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode)
* lets be sure we have the page lock
* before proceeding with the movable page isolation steps.
*/
- if (unlikely(!trylock_page(page)))
- goto out_putpage;
+ if (unlikely(!folio_trylock(folio)))
+ goto out_putfolio;
- if (!PageMovable(page) || PageIsolated(page))
+ if (!folio_test_movable(folio) || folio_test_isolated(folio))
goto out_no_isolated;
- mapping = page_mapping(page);
- VM_BUG_ON_PAGE(!mapping, page);
+ mops = folio_movable_ops(folio);
+ VM_BUG_ON_FOLIO(!mops, folio);
- if (!mapping->a_ops->isolate_page(page, mode))
+ if (!mops->isolate_page(&folio->page, mode))
goto out_no_isolated;
/* Driver shouldn't use PG_isolated bit of page->flags */
- WARN_ON_ONCE(PageIsolated(page));
- __SetPageIsolated(page);
- unlock_page(page);
+ WARN_ON_ONCE(folio_test_isolated(folio));
+ folio_set_isolated(folio);
+ folio_unlock(folio);
- return 0;
+ return true;
out_no_isolated:
- unlock_page(page);
-out_putpage:
- put_page(page);
+ folio_unlock(folio);
+out_putfolio:
+ folio_put(folio);
out:
- return -EBUSY;
+ return false;
}
-/* It should be called on page which is PG_movable */
-void putback_movable_page(struct page *page)
+static void putback_movable_folio(struct folio *folio)
{
- struct address_space *mapping;
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageMovable(page), page);
- VM_BUG_ON_PAGE(!PageIsolated(page), page);
+ const struct movable_operations *mops = folio_movable_ops(folio);
- mapping = page_mapping(page);
- mapping->a_ops->putback_page(page);
- __ClearPageIsolated(page);
+ mops->putback_page(&folio->page);
+ folio_clear_isolated(folio);
}
/*
@@ -164,37 +142,37 @@ void putback_movable_page(struct page *page)
*
* This function shall be used whenever the isolated pageset has been
* built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
- * and isolate_huge_page().
+ * and isolate_hugetlb().
*/
void putback_movable_pages(struct list_head *l)
{
- struct page *page;
- struct page *page2;
+ struct folio *folio;
+ struct folio *folio2;
- list_for_each_entry_safe(page, page2, l, lru) {
- if (unlikely(PageHuge(page))) {
- putback_active_hugepage(page);
+ list_for_each_entry_safe(folio, folio2, l, lru) {
+ if (unlikely(folio_test_hugetlb(folio))) {
+ folio_putback_active_hugetlb(folio);
continue;
}
- list_del(&page->lru);
+ list_del(&folio->lru);
/*
- * We isolated non-lru movable page so here we can use
- * __PageMovable because LRU page's mapping cannot have
+ * We isolated non-lru movable folio so here we can use
+ * __PageMovable because LRU folio's mapping cannot have
* PAGE_MAPPING_MOVABLE.
*/
- if (unlikely(__PageMovable(page))) {
- VM_BUG_ON_PAGE(!PageIsolated(page), page);
- lock_page(page);
- if (PageMovable(page))
- putback_movable_page(page);
+ if (unlikely(__folio_test_movable(folio))) {
+ VM_BUG_ON_FOLIO(!folio_test_isolated(folio), folio);
+ folio_lock(folio);
+ if (folio_test_movable(folio))
+ putback_movable_folio(folio);
else
- __ClearPageIsolated(page);
- unlock_page(page);
- put_page(page);
+ folio_clear_isolated(folio);
+ folio_unlock(folio);
+ folio_put(folio);
} else {
- mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_lru(page), -thp_nr_pages(page));
- putback_lru_page(page);
+ node_stat_mod_folio(folio, NR_ISOLATED_ANON +
+ folio_is_file_lru(folio), -folio_nr_pages(folio));
+ folio_putback_lru(folio);
}
}
}
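For illustration only (not part of the diff above): isolate_movable_page() now reports success as a bool, so a caller collects the pages it managed to isolate on its own list and, after migration or on bail-out, returns whatever is left with putback_movable_pages(). A sketch, where the list and the surrounding loop belong to the hypothetical caller:

#include <linux/mm.h>
#include <linux/migrate.h>
#include <linux/list.h>

/* Sketch only: my_isolate_for_migration() is hypothetical. */
static void my_isolate_for_migration(struct page *page, isolate_mode_t mode,
				     struct list_head *pagelist)
{
	/* Non-LRU movable pages only; LRU pages go through the LRU isolation path. */
	if (__PageMovable(page) && isolate_movable_page(page, mode))
		list_add(&page->lru, pagelist);
}

/* Later, after migration or on failure: putback_movable_pages(pagelist); */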
@@ -202,83 +180,93 @@ void putback_movable_pages(struct list_head *l)
/*
* Restore a potential migration pte to a working pte entry
*/
-static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
- unsigned long addr, void *old)
+static bool remove_migration_pte(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long addr, void *old)
{
- struct page_vma_mapped_walk pvmw = {
- .page = old,
- .vma = vma,
- .address = addr,
- .flags = PVMW_SYNC | PVMW_MIGRATION,
- };
- struct page *new;
- pte_t pte;
- swp_entry_t entry;
+ DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
- VM_BUG_ON_PAGE(PageTail(page), page);
while (page_vma_mapped_walk(&pvmw)) {
- if (PageKsm(page))
- new = page;
- else
- new = page - pvmw.page->index +
- linear_page_index(vma, pvmw.address);
+ rmap_t rmap_flags = RMAP_NONE;
+ pte_t old_pte;
+ pte_t pte;
+ swp_entry_t entry;
+ struct page *new;
+ unsigned long idx = 0;
+
+ /* pgoff is invalid for ksm pages, but they are never large */
+ if (folio_test_large(folio) && !folio_test_hugetlb(folio))
+ idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff;
+ new = folio_page(folio, idx);
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* PMD-mapped THP migration entry */
if (!pvmw.pte) {
- VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
+ VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
+ !folio_test_pmd_mappable(folio), folio);
remove_migration_pmd(&pvmw, new);
continue;
}
#endif
- get_page(new);
- pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
- if (pte_swp_soft_dirty(*pvmw.pte))
+ folio_get(folio);
+ pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
+ old_pte = ptep_get(pvmw.pte);
+ if (pte_swp_soft_dirty(old_pte))
pte = pte_mksoft_dirty(pte);
- /*
- * Recheck VMA as permissions can change since migration started
- */
- entry = pte_to_swp_entry(*pvmw.pte);
- if (is_write_migration_entry(entry))
- pte = maybe_mkwrite(pte, vma);
- else if (pte_swp_uffd_wp(*pvmw.pte))
+ entry = pte_to_swp_entry(old_pte);
+ if (!is_migration_entry_young(entry))
+ pte = pte_mkold(pte);
+ if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
+ pte = pte_mkdirty(pte);
+ if (is_writable_migration_entry(entry))
+ pte = pte_mkwrite(pte);
+ else if (pte_swp_uffd_wp(old_pte))
pte = pte_mkuffd_wp(pte);
+ if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
+ rmap_flags |= RMAP_EXCLUSIVE;
+
if (unlikely(is_device_private_page(new))) {
- entry = make_device_private_entry(new, pte_write(pte));
+ if (pte_write(pte))
+ entry = make_writable_device_private_entry(
+ page_to_pfn(new));
+ else
+ entry = make_readable_device_private_entry(
+ page_to_pfn(new));
pte = swp_entry_to_pte(entry);
- if (pte_swp_soft_dirty(*pvmw.pte))
+ if (pte_swp_soft_dirty(old_pte))
pte = pte_swp_mksoft_dirty(pte);
- if (pte_swp_uffd_wp(*pvmw.pte))
+ if (pte_swp_uffd_wp(old_pte))
pte = pte_swp_mkuffd_wp(pte);
}
#ifdef CONFIG_HUGETLB_PAGE
- if (PageHuge(new)) {
- pte = pte_mkhuge(pte);
- pte = arch_make_huge_pte(pte, vma, new, 0);
- set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
- if (PageAnon(new))
- hugepage_add_anon_rmap(new, vma, pvmw.address);
+ if (folio_test_hugetlb(folio)) {
+ unsigned int shift = huge_page_shift(hstate_vma(vma));
+
+ pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
+ if (folio_test_anon(folio))
+ hugepage_add_anon_rmap(new, vma, pvmw.address,
+ rmap_flags);
else
- page_dup_rmap(new, true);
+ page_dup_file_rmap(new, true);
+ set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
} else
#endif
{
- set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
-
- if (PageAnon(new))
- page_add_anon_rmap(new, vma, pvmw.address, false);
+ if (folio_test_anon(folio))
+ page_add_anon_rmap(new, vma, pvmw.address,
+ rmap_flags);
else
- page_add_file_rmap(new, false);
+ page_add_file_rmap(new, vma, false);
+ set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
}
- if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
- mlock_vma_page(new);
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_drain_local();
- if (PageTransHuge(page) && PageMlocked(page))
- clear_page_mlock(page);
+ trace_remove_migration_pte(pvmw.address, pte_val(pte),
+ compound_order(new));
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, pvmw.address, pvmw.pte);
@@ -291,17 +279,17 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
* Get rid of all migration entries and replace them by
* references to the indicated page.
*/
-void remove_migration_ptes(struct page *old, struct page *new, bool locked)
+void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
{
struct rmap_walk_control rwc = {
.rmap_one = remove_migration_pte,
- .arg = old,
+ .arg = src,
};
if (locked)
- rmap_walk_locked(new, &rwc);
+ rmap_walk_locked(dst, &rwc);
else
- rmap_walk(new, &rwc);
+ rmap_walk(dst, &rwc);
}
/*
@@ -309,15 +297,21 @@ void remove_migration_ptes(struct page *old, struct page *new, bool locked)
* get to the page and wait until migration is finished.
* When we return from this function the fault will be retried.
*/
-void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
- spinlock_t *ptl)
+void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long address)
{
+ spinlock_t *ptl;
+ pte_t *ptep;
pte_t pte;
swp_entry_t entry;
- struct page *page;
- spin_lock(ptl);
- pte = *ptep;
+ ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!ptep)
+ return;
+
+ pte = ptep_get(ptep);
+ pte_unmap(ptep);
+
if (!is_swap_pte(pte))
goto out;
@@ -325,70 +319,71 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
if (!is_migration_entry(entry))
goto out;
- page = migration_entry_to_page(entry);
-
- /*
- * Once page cache replacement of page migration started, page_count
- * is zero; but we must not call put_and_wait_on_page_locked() without
- * a ref. Use get_page_unless_zero(), and just fault again if it fails.
- */
- if (!get_page_unless_zero(page))
- goto out;
- pte_unmap_unlock(ptep, ptl);
- put_and_wait_on_page_locked(page);
+ migration_entry_wait_on_locked(entry, ptl);
return;
out:
- pte_unmap_unlock(ptep, ptl);
+ spin_unlock(ptl);
}
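For illustration only (not part of the diff above): migration_entry_wait() now takes the pmd and address and maps/locks the pte itself, so a fault-path caller that hits a migration entry simply waits and lets the fault be retried. A sketch loosely modelled on the swap-fault path, not copied from it:

#include <linux/mm.h>
#include <linux/swapops.h>

/* Sketch only: a hypothetical helper called from a fault handler. */
static vm_fault_t my_wait_for_migration(struct vm_fault *vmf)
{
	swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);

	if (!is_migration_entry(entry))
		return VM_FAULT_SIGBUS;	/* hypothetical fallback for this sketch */

	migration_entry_wait(vmf->vma->vm_mm, vmf->pmd, vmf->address);
	return 0;	/* the access is retried once migration has completed */
}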
-void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
- unsigned long address)
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * The vma read lock must be held upon entry. Holding that lock prevents either
+ * the pte or the ptl from being freed.
+ *
+ * This function will release the vma lock before returning.
+ */
+void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *ptep)
{
- spinlock_t *ptl = pte_lockptr(mm, pmd);
- pte_t *ptep = pte_offset_map(pmd, address);
- __migration_entry_wait(mm, ptep, ptl);
-}
+ spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep);
+ pte_t pte;
-void migration_entry_wait_huge(struct vm_area_struct *vma,
- struct mm_struct *mm, pte_t *pte)
-{
- spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
- __migration_entry_wait(mm, pte, ptl);
+ hugetlb_vma_assert_locked(vma);
+ spin_lock(ptl);
+ pte = huge_ptep_get(ptep);
+
+ if (unlikely(!is_hugetlb_entry_migration(pte))) {
+ spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
+ } else {
+ /*
+ * If migration entry existed, safe to release vma lock
+ * here because the pgtable page won't be freed without the
+ * pgtable lock released. See comment right above pgtable
+ * lock release in migration_entry_wait_on_locked().
+ */
+ hugetlb_vma_unlock_read(vma);
+ migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl);
+ }
}
+#endif
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
{
spinlock_t *ptl;
- struct page *page;
ptl = pmd_lock(mm, pmd);
if (!is_pmd_migration_entry(*pmd))
goto unlock;
- page = migration_entry_to_page(pmd_to_swp_entry(*pmd));
- if (!get_page_unless_zero(page))
- goto unlock;
- spin_unlock(ptl);
- put_and_wait_on_page_locked(page);
+ migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), ptl);
return;
unlock:
spin_unlock(ptl);
}
#endif
-static int expected_page_refs(struct address_space *mapping, struct page *page)
+static int folio_expected_refs(struct address_space *mapping,
+ struct folio *folio)
{
- int expected_count = 1;
+ int refs = 1;
+ if (!mapping)
+ return refs;
- /*
- * Device private pages have an extra refcount as they are
- * ZONE_DEVICE pages.
- */
- expected_count += is_device_private_page(page);
- if (mapping)
- expected_count += thp_nr_pages(page) + page_has_private(page);
+ refs += folio_nr_pages(folio);
+ if (folio_test_private(folio))
+ refs++;
- return expected_count;
+ return refs;
}
/*
@@ -399,82 +394,70 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
* 2 for pages with a mapping
* 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
*/
-int migrate_page_move_mapping(struct address_space *mapping,
- struct page *newpage, struct page *page, int extra_count)
+int folio_migrate_mapping(struct address_space *mapping,
+ struct folio *newfolio, struct folio *folio, int extra_count)
{
- XA_STATE(xas, &mapping->i_pages, page_index(page));
+ XA_STATE(xas, &mapping->i_pages, folio_index(folio));
struct zone *oldzone, *newzone;
int dirty;
- int expected_count = expected_page_refs(mapping, page) + extra_count;
+ int expected_count = folio_expected_refs(mapping, folio) + extra_count;
+ long nr = folio_nr_pages(folio);
if (!mapping) {
/* Anonymous page without mapping */
- if (page_count(page) != expected_count)
+ if (folio_ref_count(folio) != expected_count)
return -EAGAIN;
/* No turning back from here */
- newpage->index = page->index;
- newpage->mapping = page->mapping;
- if (PageSwapBacked(page))
- __SetPageSwapBacked(newpage);
+ newfolio->index = folio->index;
+ newfolio->mapping = folio->mapping;
+ if (folio_test_swapbacked(folio))
+ __folio_set_swapbacked(newfolio);
return MIGRATEPAGE_SUCCESS;
}
- oldzone = page_zone(page);
- newzone = page_zone(newpage);
+ oldzone = folio_zone(folio);
+ newzone = folio_zone(newfolio);
xas_lock_irq(&xas);
- if (page_count(page) != expected_count || xas_load(&xas) != page) {
- xas_unlock_irq(&xas);
- return -EAGAIN;
- }
-
- if (!page_ref_freeze(page, expected_count)) {
+ if (!folio_ref_freeze(folio, expected_count)) {
xas_unlock_irq(&xas);
return -EAGAIN;
}
/*
- * Now we know that no one else is looking at the page:
+ * Now we know that no one else is looking at the folio:
* no turning back from here.
*/
- newpage->index = page->index;
- newpage->mapping = page->mapping;
- page_ref_add(newpage, thp_nr_pages(page)); /* add cache reference */
- if (PageSwapBacked(page)) {
- __SetPageSwapBacked(newpage);
- if (PageSwapCache(page)) {
- SetPageSwapCache(newpage);
- set_page_private(newpage, page_private(page));
+ newfolio->index = folio->index;
+ newfolio->mapping = folio->mapping;
+ folio_ref_add(newfolio, nr); /* add cache reference */
+ if (folio_test_swapbacked(folio)) {
+ __folio_set_swapbacked(newfolio);
+ if (folio_test_swapcache(folio)) {
+ folio_set_swapcache(newfolio);
+ newfolio->private = folio_get_private(folio);
}
} else {
- VM_BUG_ON_PAGE(PageSwapCache(page), page);
+ VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
}
/* Move dirty while page refs frozen and newpage not yet exposed */
- dirty = PageDirty(page);
+ dirty = folio_test_dirty(folio);
if (dirty) {
- ClearPageDirty(page);
- SetPageDirty(newpage);
+ folio_clear_dirty(folio);
+ folio_set_dirty(newfolio);
}
- xas_store(&xas, newpage);
- if (PageTransHuge(page)) {
- int i;
-
- for (i = 1; i < HPAGE_PMD_NR; i++) {
- xas_next(&xas);
- xas_store(&xas, newpage);
- }
- }
+ xas_store(&xas, newfolio);
/*
* Drop cache reference from old page by unfreezing
* to one less reference.
* We know this isn't the last reference.
*/
- page_ref_unfreeze(page, expected_count - thp_nr_pages(page));
+ folio_ref_unfreeze(folio, expected_count - nr);
xas_unlock(&xas);
/* Leave irq disabled to prevent preemption while updating stats */
@@ -493,59 +476,65 @@ int migrate_page_move_mapping(struct address_space *mapping,
struct lruvec *old_lruvec, *new_lruvec;
struct mem_cgroup *memcg;
- memcg = page_memcg(page);
+ memcg = folio_memcg(folio);
old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
- __dec_lruvec_state(old_lruvec, NR_FILE_PAGES);
- __inc_lruvec_state(new_lruvec, NR_FILE_PAGES);
- if (PageSwapBacked(page) && !PageSwapCache(page)) {
- __dec_lruvec_state(old_lruvec, NR_SHMEM);
- __inc_lruvec_state(new_lruvec, NR_SHMEM);
+ __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
+ __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
+ if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
+ __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
+ __mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
+
+ if (folio_test_pmd_mappable(folio)) {
+ __mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr);
+ __mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr);
+ }
+ }
+#ifdef CONFIG_SWAP
+ if (folio_test_swapcache(folio)) {
+ __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
+ __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
}
+#endif
if (dirty && mapping_can_writeback(mapping)) {
- __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
- __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
- __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
- __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
+ __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
+ __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
+ __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
+ __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
}
}
local_irq_enable();
return MIGRATEPAGE_SUCCESS;
}
-EXPORT_SYMBOL(migrate_page_move_mapping);
+EXPORT_SYMBOL(folio_migrate_mapping);
/*
* The expected number of remaining references is the same as that
- * of migrate_page_move_mapping().
+ * of folio_migrate_mapping().
*/
int migrate_huge_page_move_mapping(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct folio *dst, struct folio *src)
{
- XA_STATE(xas, &mapping->i_pages, page_index(page));
+ XA_STATE(xas, &mapping->i_pages, folio_index(src));
int expected_count;
xas_lock_irq(&xas);
- expected_count = 2 + page_has_private(page);
- if (page_count(page) != expected_count || xas_load(&xas) != page) {
- xas_unlock_irq(&xas);
- return -EAGAIN;
- }
-
- if (!page_ref_freeze(page, expected_count)) {
+ expected_count = 2 + folio_has_private(src);
+ if (!folio_ref_freeze(src, expected_count)) {
xas_unlock_irq(&xas);
return -EAGAIN;
}
- newpage->index = page->index;
- newpage->mapping = page->mapping;
+ dst->index = src->index;
+ dst->mapping = src->mapping;
- get_page(newpage);
+ folio_get(dst);
- xas_store(&xas, newpage);
+ xas_store(&xas, dst);
- page_ref_unfreeze(page, expected_count - 1);
+ folio_ref_unfreeze(src, expected_count - 1);
xas_unlock_irq(&xas);
@@ -553,167 +542,147 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
}
/*
- * Gigantic pages are so large that we do not guarantee that page++ pointer
- * arithmetic will work across the entire page. We need something more
- * specialized.
+ * Copy the flags and some other ancillary information
*/
-static void __copy_gigantic_page(struct page *dst, struct page *src,
- int nr_pages)
-{
- int i;
- struct page *dst_base = dst;
- struct page *src_base = src;
-
- for (i = 0; i < nr_pages; ) {
- cond_resched();
- copy_highpage(dst, src);
-
- i++;
- dst = mem_map_next(dst, dst_base, i);
- src = mem_map_next(src, src_base, i);
- }
-}
-
-static void copy_huge_page(struct page *dst, struct page *src)
+void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
{
- int i;
- int nr_pages;
-
- if (PageHuge(src)) {
- /* hugetlbfs page */
- struct hstate *h = page_hstate(src);
- nr_pages = pages_per_huge_page(h);
-
- if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
- __copy_gigantic_page(dst, src, nr_pages);
- return;
- }
- } else {
- /* thp page */
- BUG_ON(!PageTransHuge(src));
- nr_pages = thp_nr_pages(src);
- }
+ int cpupid;
- for (i = 0; i < nr_pages; i++) {
- cond_resched();
- copy_highpage(dst + i, src + i);
- }
-}
+ if (folio_test_error(folio))
+ folio_set_error(newfolio);
+ if (folio_test_referenced(folio))
+ folio_set_referenced(newfolio);
+ if (folio_test_uptodate(folio))
+ folio_mark_uptodate(newfolio);
+ if (folio_test_clear_active(folio)) {
+ VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
+ folio_set_active(newfolio);
+ } else if (folio_test_clear_unevictable(folio))
+ folio_set_unevictable(newfolio);
+ if (folio_test_workingset(folio))
+ folio_set_workingset(newfolio);
+ if (folio_test_checked(folio))
+ folio_set_checked(newfolio);
+ /*
+ * PG_anon_exclusive (-> PG_mappedtodisk) is always migrated via
+ * migration entries. We can still have PG_anon_exclusive set on an
+ * effectively unmapped and unreferenced first sub-pages of an
+ * anonymous THP: we can simply copy it here via PG_mappedtodisk.
+ */
+ if (folio_test_mappedtodisk(folio))
+ folio_set_mappedtodisk(newfolio);
-/*
- * Copy the page to its new location
- */
-void migrate_page_states(struct page *newpage, struct page *page)
-{
- int cpupid;
+ /* Move dirty on pages not done by folio_migrate_mapping() */
+ if (folio_test_dirty(folio))
+ folio_set_dirty(newfolio);
- if (PageError(page))
- SetPageError(newpage);
- if (PageReferenced(page))
- SetPageReferenced(newpage);
- if (PageUptodate(page))
- SetPageUptodate(newpage);
- if (TestClearPageActive(page)) {
- VM_BUG_ON_PAGE(PageUnevictable(page), page);
- SetPageActive(newpage);
- } else if (TestClearPageUnevictable(page))
- SetPageUnevictable(newpage);
- if (PageWorkingset(page))
- SetPageWorkingset(newpage);
- if (PageChecked(page))
- SetPageChecked(newpage);
- if (PageMappedToDisk(page))
- SetPageMappedToDisk(newpage);
-
- /* Move dirty on pages not done by migrate_page_move_mapping() */
- if (PageDirty(page))
- SetPageDirty(newpage);
-
- if (page_is_young(page))
- set_page_young(newpage);
- if (page_is_idle(page))
- set_page_idle(newpage);
+ if (folio_test_young(folio))
+ folio_set_young(newfolio);
+ if (folio_test_idle(folio))
+ folio_set_idle(newfolio);
/*
* Copy NUMA information to the new page, to prevent over-eager
* future migrations of this same page.
*/
- cpupid = page_cpupid_xchg_last(page, -1);
- page_cpupid_xchg_last(newpage, cpupid);
+ cpupid = page_cpupid_xchg_last(&folio->page, -1);
+ /*
+ * For memory tiering mode, when migrating between slow and fast
+ * memory nodes, reset cpupid, because it is used to record
+ * page access time in slow memory nodes.
+ */
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) {
+ bool f_toptier = node_is_toptier(page_to_nid(&folio->page));
+ bool t_toptier = node_is_toptier(page_to_nid(&newfolio->page));
- ksm_migrate_page(newpage, page);
+ if (f_toptier != t_toptier)
+ cpupid = -1;
+ }
+ page_cpupid_xchg_last(&newfolio->page, cpupid);
+
+ folio_migrate_ksm(newfolio, folio);
/*
* Please do not reorder this without considering how mm/ksm.c's
* get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
*/
- if (PageSwapCache(page))
- ClearPageSwapCache(page);
- ClearPagePrivate(page);
- set_page_private(page, 0);
+ if (folio_test_swapcache(folio))
+ folio_clear_swapcache(folio);
+ folio_clear_private(folio);
+
+ /* page->private contains hugetlb specific flags */
+ if (!folio_test_hugetlb(folio))
+ folio->private = NULL;
/*
* If any waiters have accumulated on the new page then
* wake them up.
*/
- if (PageWriteback(newpage))
- end_page_writeback(newpage);
+ if (folio_test_writeback(newfolio))
+ folio_end_writeback(newfolio);
/*
* PG_readahead shares the same bit with PG_reclaim. The above
* end_page_writeback() may clear PG_readahead mistakenly, so set the
* bit after that.
*/
- if (PageReadahead(page))
- SetPageReadahead(newpage);
+ if (folio_test_readahead(folio))
+ folio_set_readahead(newfolio);
- copy_page_owner(page, newpage);
+ folio_copy_owner(newfolio, folio);
- if (!PageHuge(page))
- mem_cgroup_migrate(page, newpage);
+ if (!folio_test_hugetlb(folio))
+ mem_cgroup_migrate(folio, newfolio);
}
-EXPORT_SYMBOL(migrate_page_states);
+EXPORT_SYMBOL(folio_migrate_flags);
-void migrate_page_copy(struct page *newpage, struct page *page)
+void folio_migrate_copy(struct folio *newfolio, struct folio *folio)
{
- if (PageHuge(page) || PageTransHuge(page))
- copy_huge_page(newpage, page);
- else
- copy_highpage(newpage, page);
-
- migrate_page_states(newpage, page);
+ folio_copy(newfolio, folio);
+ folio_migrate_flags(newfolio, folio);
}
-EXPORT_SYMBOL(migrate_page_copy);
+EXPORT_SYMBOL(folio_migrate_copy);
/************************************************************
* Migration functions
***********************************************************/
-/*
- * Common logic to directly migrate a single LRU page suitable for
- * pages that do not use PagePrivate/PagePrivate2.
- *
- * Pages are locked upon entry and exit.
- */
-int migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page,
- enum migrate_mode mode)
+int migrate_folio_extra(struct address_space *mapping, struct folio *dst,
+ struct folio *src, enum migrate_mode mode, int extra_count)
{
int rc;
- BUG_ON(PageWriteback(page)); /* Writeback must be complete */
+ BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */
- rc = migrate_page_move_mapping(mapping, newpage, page, 0);
+ rc = folio_migrate_mapping(mapping, dst, src, extra_count);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
if (mode != MIGRATE_SYNC_NO_COPY)
- migrate_page_copy(newpage, page);
+ folio_migrate_copy(dst, src);
else
- migrate_page_states(newpage, page);
+ folio_migrate_flags(dst, src);
return MIGRATEPAGE_SUCCESS;
}
-EXPORT_SYMBOL(migrate_page);
+
+/**
+ * migrate_folio() - Simple folio migration.
+ * @mapping: The address_space containing the folio.
+ * @dst: The folio to migrate the data to.
+ * @src: The folio containing the current data.
+ * @mode: How to migrate the page.
+ *
+ * Common logic to directly migrate a single LRU folio suitable for
+ * folios that do not use PagePrivate/PagePrivate2.
+ *
+ * Folios are locked upon entry and exit.
+ */
+int migrate_folio(struct address_space *mapping, struct folio *dst,
+ struct folio *src, enum migrate_mode mode)
+{
+ return migrate_folio_extra(mapping, dst, src, mode, 0);
+}
+EXPORT_SYMBOL(migrate_folio);
#ifdef CONFIG_BLOCK
/* Returns true if all buffers are successfully locked */
@@ -721,56 +690,51 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head,
enum migrate_mode mode)
{
struct buffer_head *bh = head;
+ struct buffer_head *failed_bh;
- /* Simple case, sync compaction */
- if (mode != MIGRATE_ASYNC) {
- do {
- lock_buffer(bh);
- bh = bh->b_this_page;
-
- } while (bh != head);
-
- return true;
- }
-
- /* async case, we cannot block on lock_buffer so use trylock_buffer */
do {
if (!trylock_buffer(bh)) {
- /*
- * We failed to lock the buffer and cannot stall in
- * async migration. Release the taken locks
- */
- struct buffer_head *failed_bh = bh;
- bh = head;
- while (bh != failed_bh) {
- unlock_buffer(bh);
- bh = bh->b_this_page;
- }
- return false;
+ if (mode == MIGRATE_ASYNC)
+ goto unlock;
+ if (mode == MIGRATE_SYNC_LIGHT && !buffer_uptodate(bh))
+ goto unlock;
+ lock_buffer(bh);
}
bh = bh->b_this_page;
} while (bh != head);
+
return true;
+
+unlock:
+ /* We failed to lock the buffer and cannot stall. */
+ failed_bh = bh;
+ bh = head;
+ while (bh != failed_bh) {
+ unlock_buffer(bh);
+ bh = bh->b_this_page;
+ }
+
+ return false;
}
-static int __buffer_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page, enum migrate_mode mode,
+static int __buffer_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src, enum migrate_mode mode,
bool check_refs)
{
struct buffer_head *bh, *head;
int rc;
int expected_count;
- if (!page_has_buffers(page))
- return migrate_page(mapping, newpage, page, mode);
+ head = folio_buffers(src);
+ if (!head)
+ return migrate_folio(mapping, dst, src, mode);
/* Check whether page does not have extra refs before we do more work */
- expected_count = expected_page_refs(mapping, page);
- if (page_count(page) != expected_count)
+ expected_count = folio_expected_refs(mapping, src);
+ if (folio_ref_count(src) != expected_count)
return -EAGAIN;
- head = page_buffers(page);
if (!buffer_migrate_lock_buffers(head, mode))
return -EAGAIN;
@@ -801,23 +765,22 @@ recheck_buffers:
}
}
- rc = migrate_page_move_mapping(mapping, newpage, page, 0);
+ rc = folio_migrate_mapping(mapping, dst, src, 0);
if (rc != MIGRATEPAGE_SUCCESS)
goto unlock_buffers;
- attach_page_private(newpage, detach_page_private(page));
+ folio_attach_private(dst, folio_detach_private(src));
bh = head;
do {
- set_bh_page(bh, newpage, bh_offset(bh));
+ set_bh_page(bh, &dst->page, bh_offset(bh));
bh = bh->b_this_page;
-
} while (bh != head);
if (mode != MIGRATE_SYNC_NO_COPY)
- migrate_page_copy(newpage, page);
+ folio_migrate_copy(dst, src);
else
- migrate_page_states(newpage, page);
+ folio_migrate_flags(dst, src);
rc = MIGRATEPAGE_SUCCESS;
unlock_buffers:
@@ -827,41 +790,79 @@ unlock_buffers:
do {
unlock_buffer(bh);
bh = bh->b_this_page;
-
} while (bh != head);
return rc;
}
-/*
- * Migration function for pages with buffers. This function can only be used
- * if the underlying filesystem guarantees that no other references to "page"
- * exist. For example attached buffer heads are accessed only under page lock.
+/**
+ * buffer_migrate_folio() - Migration function for folios with buffers.
+ * @mapping: The address space containing @src.
+ * @dst: The folio to migrate to.
+ * @src: The folio to migrate from.
+ * @mode: How to migrate the folio.
+ *
+ * This function can only be used if the underlying filesystem guarantees
+ * that no other references to @src exist. For example attached buffer
+ * heads are accessed only under the folio lock. If your filesystem cannot
+ * provide this guarantee, buffer_migrate_folio_norefs() may be more
+ * appropriate.
+ *
+ * Return: 0 on success or a negative errno on failure.
*/
-int buffer_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page, enum migrate_mode mode)
+int buffer_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src, enum migrate_mode mode)
{
- return __buffer_migrate_page(mapping, newpage, page, mode, false);
+ return __buffer_migrate_folio(mapping, dst, src, mode, false);
}
-EXPORT_SYMBOL(buffer_migrate_page);
+EXPORT_SYMBOL(buffer_migrate_folio);
-/*
- * Same as above except that this variant is more careful and checks that there
- * are also no buffer head references. This function is the right one for
- * mappings where buffer heads are directly looked up and referenced (such as
- * block device mappings).
+/**
+ * buffer_migrate_folio_norefs() - Migration function for folios with buffers.
+ * @mapping: The address space containing @src.
+ * @dst: The folio to migrate to.
+ * @src: The folio to migrate from.
+ * @mode: How to migrate the folio.
+ *
+ * Like buffer_migrate_folio() except that this variant is more careful
+ * and checks that there are also no buffer head references. This function
+ * is the right one for mappings where buffer heads are directly looked
+ * up and referenced (such as block device mappings).
+ *
+ * Return: 0 on success or a negative errno on failure.
*/
-int buffer_migrate_page_norefs(struct address_space *mapping,
- struct page *newpage, struct page *page, enum migrate_mode mode)
+int buffer_migrate_folio_norefs(struct address_space *mapping,
+ struct folio *dst, struct folio *src, enum migrate_mode mode)
{
- return __buffer_migrate_page(mapping, newpage, page, mode, true);
+ return __buffer_migrate_folio(mapping, dst, src, mode, true);
}
+EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs);
#endif
+int filemap_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src, enum migrate_mode mode)
+{
+ int ret;
+
+ ret = folio_migrate_mapping(mapping, dst, src, 0);
+ if (ret != MIGRATEPAGE_SUCCESS)
+ return ret;
+
+ if (folio_get_private(src))
+ folio_attach_private(dst, folio_detach_private(src));
+
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ folio_migrate_copy(dst, src);
+ else
+ folio_migrate_flags(dst, src);
+ return MIGRATEPAGE_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(filemap_migrate_folio);
+
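For illustration only (not part of the diff above): with the address_space operation renamed from migratepage to migrate_folio, most filesystems simply point it at one of the generic helpers — migrate_folio() if they never attach private data, or filemap_migrate_folio() if they do. A hypothetical example:

#include <linux/fs.h>
#include <linux/migrate.h>

/* Sketch only: my_aops stands in for a real filesystem's a_ops. */
static const struct address_space_operations my_aops = {
	/* ... read/write/writepage ops elided ... */
	.migrate_folio	= filemap_migrate_folio,
};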
/*
- * Writeback a page to clean the dirty state
+ * Writeback a folio to clean the dirty state
*/
-static int writeout(struct address_space *mapping, struct page *page)
+static int writeout(struct address_space *mapping, struct folio *folio)
{
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
@@ -876,25 +877,25 @@ static int writeout(struct address_space *mapping, struct page *page)
/* No write method for the address space */
return -EINVAL;
- if (!clear_page_dirty_for_io(page))
+ if (!folio_clear_dirty_for_io(folio))
/* Someone else already triggered a write */
return -EAGAIN;
/*
- * A dirty page may imply that the underlying filesystem has
- * the page on some queue. So the page must be clean for
- * migration. Writeout may mean we loose the lock and the
- * page state is no longer what we checked for earlier.
+ * A dirty folio may imply that the underlying filesystem has
+ * the folio on some queue. So the folio must be clean for
+ * migration. Writeout may mean we lose the lock and the
+ * folio state is no longer what we checked for earlier.
* At this point we know that the migration attempt cannot
* be successful.
*/
- remove_migration_ptes(page, page, false);
+ remove_migration_ptes(folio, folio, false);
- rc = mapping->a_ops->writepage(page, &wbc);
+ rc = mapping->a_ops->writepage(&folio->page, &wbc);
if (rc != AOP_WRITEPAGE_ACTIVATE)
/* unlocked. Relock */
- lock_page(page);
+ folio_lock(folio);
return (rc < 0) ? -EIO : -EAGAIN;
}
@@ -902,11 +903,11 @@ static int writeout(struct address_space *mapping, struct page *page)
/*
* Default handling if a filesystem does not provide a migration function.
*/
-static int fallback_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page, enum migrate_mode mode)
+static int fallback_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src, enum migrate_mode mode)
{
- if (PageDirty(page)) {
- /* Only writeback pages in full synchronous migration */
+ if (folio_test_dirty(src)) {
+ /* Only writeback folios in full synchronous migration */
switch (mode) {
case MIGRATE_SYNC:
case MIGRATE_SYNC_NO_COPY:
@@ -914,18 +915,18 @@ static int fallback_migrate_page(struct address_space *mapping,
default:
return -EBUSY;
}
- return writeout(mapping, page);
+ return writeout(mapping, src);
}
/*
* Buffers may be managed in a filesystem specific way.
* We must have no buffers or drop them.
*/
- if (page_has_private(page) &&
- !try_to_release_page(page, GFP_KERNEL))
+ if (folio_test_private(src) &&
+ !filemap_release_folio(src, GFP_KERNEL))
return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
- return migrate_page(mapping, newpage, page, mode);
+ return migrate_folio(mapping, dst, src, mode);
}
/*
@@ -939,93 +940,193 @@ static int fallback_migrate_page(struct address_space *mapping,
* < 0 - error code
* MIGRATEPAGE_SUCCESS - success
*/
-static int move_to_new_page(struct page *newpage, struct page *page,
+static int move_to_new_folio(struct folio *dst, struct folio *src,
enum migrate_mode mode)
{
- struct address_space *mapping;
int rc = -EAGAIN;
- bool is_lru = !__PageMovable(page);
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+ bool is_lru = !__PageMovable(&src->page);
- mapping = page_mapping(page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(src), src);
+ VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst);
if (likely(is_lru)) {
+ struct address_space *mapping = folio_mapping(src);
+
if (!mapping)
- rc = migrate_page(mapping, newpage, page, mode);
- else if (mapping->a_ops->migratepage)
+ rc = migrate_folio(mapping, dst, src, mode);
+ else if (mapping->a_ops->migrate_folio)
/*
- * Most pages have a mapping and most filesystems
- * provide a migratepage callback. Anonymous pages
+ * Most folios have a mapping and most filesystems
+ * provide a migrate_folio callback. Anonymous folios
* are part of swap space which also has its own
- * migratepage callback. This is the most common path
+ * migrate_folio callback. This is the most common path
* for page migration.
*/
- rc = mapping->a_ops->migratepage(mapping, newpage,
- page, mode);
+ rc = mapping->a_ops->migrate_folio(mapping, dst, src,
+ mode);
else
- rc = fallback_migrate_page(mapping, newpage,
- page, mode);
+ rc = fallback_migrate_folio(mapping, dst, src, mode);
} else {
+ const struct movable_operations *mops;
+
/*
* In case of non-lru page, it could be released after
* isolation step. In that case, we shouldn't try migration.
*/
- VM_BUG_ON_PAGE(!PageIsolated(page), page);
- if (!PageMovable(page)) {
+ VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
+ if (!folio_test_movable(src)) {
rc = MIGRATEPAGE_SUCCESS;
- __ClearPageIsolated(page);
+ folio_clear_isolated(src);
goto out;
}
- rc = mapping->a_ops->migratepage(mapping, newpage,
- page, mode);
+ mops = folio_movable_ops(src);
+ rc = mops->migrate_page(&dst->page, &src->page, mode);
WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
- !PageIsolated(page));
+ !folio_test_isolated(src));
}
/*
- * When successful, old pagecache page->mapping must be cleared before
- * page is freed; but stats require that PageAnon be left as PageAnon.
+ * When successful, old pagecache src->mapping must be cleared before
+ * src is freed; but stats require that PageAnon be left as PageAnon.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- if (__PageMovable(page)) {
- VM_BUG_ON_PAGE(!PageIsolated(page), page);
+ if (__PageMovable(&src->page)) {
+ VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
/*
* We clear PG_movable under page_lock so any compactor
* cannot try to migrate this page.
*/
- __ClearPageIsolated(page);
+ folio_clear_isolated(src);
}
/*
- * Anonymous and movable page->mapping will be cleared by
+ * Anonymous and movable src->mapping will be cleared by
* free_pages_prepare so don't reset it here for keeping
* the type to work PageAnon, for example.
*/
- if (!PageMappingFlags(page))
- page->mapping = NULL;
-
- if (likely(!is_zone_device_page(newpage)))
- flush_dcache_page(newpage);
+ if (!folio_mapping_flags(src))
+ src->mapping = NULL;
+ if (likely(!folio_is_zone_device(dst)))
+ flush_dcache_folio(dst);
}
out:
return rc;
}
-static int __unmap_and_move(struct page *page, struct page *newpage,
- int force, enum migrate_mode mode)
+/*
+ * To record some information during migration, we use some unused
+ * fields (mapping and private) of struct folio of the newly allocated
+ * destination folio. This is safe because nobody is using them
+ * except us.
+ */
+union migration_ptr {
+ struct anon_vma *anon_vma;
+ struct address_space *mapping;
+};
+static void __migrate_folio_record(struct folio *dst,
+ unsigned long page_was_mapped,
+ struct anon_vma *anon_vma)
+{
+ union migration_ptr ptr = { .anon_vma = anon_vma };
+ dst->mapping = ptr.mapping;
+ dst->private = (void *)page_was_mapped;
+}
+
+static void __migrate_folio_extract(struct folio *dst,
+ int *page_was_mappedp,
+ struct anon_vma **anon_vmap)
+{
+ union migration_ptr ptr = { .mapping = dst->mapping };
+ *anon_vmap = ptr.anon_vma;
+ *page_was_mappedp = (unsigned long)dst->private;
+ dst->mapping = NULL;
+ dst->private = NULL;
+}
+
+/* Restore the source folio to the original state upon failure */
+static void migrate_folio_undo_src(struct folio *src,
+ int page_was_mapped,
+ struct anon_vma *anon_vma,
+ bool locked,
+ struct list_head *ret)
+{
+ if (page_was_mapped)
+ remove_migration_ptes(src, src, false);
+ /* Drop an anon_vma reference if we took one */
+ if (anon_vma)
+ put_anon_vma(anon_vma);
+ if (locked)
+ folio_unlock(src);
+ if (ret)
+ list_move_tail(&src->lru, ret);
+}
+
+/* Restore the destination folio to the original state upon failure */
+static void migrate_folio_undo_dst(struct folio *dst, bool locked,
+ free_folio_t put_new_folio, unsigned long private)
+{
+ if (locked)
+ folio_unlock(dst);
+ if (put_new_folio)
+ put_new_folio(dst, private);
+ else
+ folio_put(dst);
+}
+
+/* Cleanup src folio upon migration success */
+static void migrate_folio_done(struct folio *src,
+ enum migrate_reason reason)
{
+ /*
+ * Compaction can also migrate non-LRU pages, which are
+ * not accounted to NR_ISOLATED_*. They can be recognized
+ * as __PageMovable.
+ */
+ if (likely(!__folio_test_movable(src)))
+ mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
+ folio_is_file_lru(src), -folio_nr_pages(src));
+
+ if (reason != MR_MEMORY_FAILURE)
+ /* We release the page in page_handle_poison. */
+ folio_put(src);
+}
+
+/* Obtain the lock on page, remove all ptes. */
+static int migrate_folio_unmap(new_folio_t get_new_folio,
+ free_folio_t put_new_folio, unsigned long private,
+ struct folio *src, struct folio **dstp, enum migrate_mode mode,
+ enum migrate_reason reason, struct list_head *ret)
+{
+ struct folio *dst;
int rc = -EAGAIN;
int page_was_mapped = 0;
struct anon_vma *anon_vma = NULL;
- bool is_lru = !__PageMovable(page);
+ bool is_lru = !__PageMovable(&src->page);
+ bool locked = false;
+ bool dst_locked = false;
+
+ if (folio_ref_count(src) == 1) {
+ /* Folio was freed from under us. So we are done. */
+ folio_clear_active(src);
+ folio_clear_unevictable(src);
+ /* free_pages_prepare() will clear PG_isolated. */
+ list_del(&src->lru);
+ migrate_folio_done(src, reason);
+ return MIGRATEPAGE_SUCCESS;
+ }
+
+ dst = get_new_folio(src, private);
+ if (!dst)
+ return -ENOMEM;
+ *dstp = dst;
+
+ dst->private = NULL;
- if (!trylock_page(page)) {
- if (!force || mode == MIGRATE_ASYNC)
+ if (!folio_trylock(src)) {
+ if (mode == MIGRATE_ASYNC)
goto out;
/*
@@ -1044,10 +1145,19 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
if (current->flags & PF_MEMALLOC)
goto out;
- lock_page(page);
+ /*
+ * In "light" mode, we can wait for transient locks (eg
+ * inserting a page into the page table), but it's not
+ * worth waiting for I/O.
+ */
+ if (mode == MIGRATE_SYNC_LIGHT && !folio_test_uptodate(src))
+ goto out;
+
+ folio_lock(src);
}
+ locked = true;
- if (PageWriteback(page)) {
+ if (folio_test_writeback(src)) {
/*
* Only in the case of a full synchronous migration is it
* necessary to wait for PageWriteback. In the async case,
@@ -1060,201 +1170,164 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
break;
default:
rc = -EBUSY;
- goto out_unlock;
+ goto out;
}
- if (!force)
- goto out_unlock;
- wait_on_page_writeback(page);
+ folio_wait_writeback(src);
}
/*
- * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
- * we cannot notice that anon_vma is freed while we migrates a page.
+ * By try_to_migrate(), src->mapcount goes down to 0 here. In this case,
+ * we cannot notice that anon_vma is freed while we migrate a page.
* This get_anon_vma() delays freeing anon_vma pointer until the end
* of migration. File cache pages are no problem because of page_lock()
* File Caches may use write_page() or lock_page() in migration, then,
* just care Anon page here.
*
- * Only page_get_anon_vma() understands the subtleties of
+ * Only folio_get_anon_vma() understands the subtleties of
* getting a hold on an anon_vma from outside one of its mms.
* But if we cannot get anon_vma, then we won't need it anyway,
* because that implies that the anon page is no longer mapped
* (and cannot be remapped so long as we hold the page lock).
*/
- if (PageAnon(page) && !PageKsm(page))
- anon_vma = page_get_anon_vma(page);
+ if (folio_test_anon(src) && !folio_test_ksm(src))
+ anon_vma = folio_get_anon_vma(src);
/*
* Block others from accessing the new page when we get around to
* establishing additional references. We are usually the only one
- * holding a reference to newpage at this point. We used to have a BUG
- * here if trylock_page(newpage) fails, but would like to allow for
- * cases where there might be a race with the previous use of newpage.
+ * holding a reference to dst at this point. We used to have a BUG
+ * here if folio_trylock(dst) fails, but would like to allow for
+ * cases where there might be a race with the previous use of dst.
* This is much like races on refcount of oldpage: just don't BUG().
*/
- if (unlikely(!trylock_page(newpage)))
- goto out_unlock;
+ if (unlikely(!folio_trylock(dst)))
+ goto out;
+ dst_locked = true;
if (unlikely(!is_lru)) {
- rc = move_to_new_page(newpage, page, mode);
- goto out_unlock_both;
+ __migrate_folio_record(dst, page_was_mapped, anon_vma);
+ return MIGRATEPAGE_UNMAP;
}
/*
* Corner case handling:
* 1. When a new swap-cache page is read into, it is added to the LRU
* and treated as swapcache but it has no rmap yet.
- * Calling try_to_unmap() against a page->mapping==NULL page will
+ * Calling try_to_unmap() against a src->mapping==NULL page will
* trigger a BUG. So handle it here.
- * 2. An orphaned page (see truncate_complete_page) might have
+ * 2. An orphaned page (see truncate_cleanup_page) might have
* fs-private metadata. The page can be picked up due to memory
* offlining. Everywhere else except page reclaim, the page is
* invisible to the vm, so the page can not be migrated. So try to
* free the metadata, so the page can be freed.
*/
- if (!page->mapping) {
- VM_BUG_ON_PAGE(PageAnon(page), page);
- if (page_has_private(page)) {
- try_to_free_buffers(page);
- goto out_unlock_both;
+ if (!src->mapping) {
+ if (folio_test_private(src)) {
+ try_to_free_buffers(src);
+ goto out;
}
- } else if (page_mapped(page)) {
+ } else if (folio_mapped(src)) {
/* Establish migration ptes */
- VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
- page);
- try_to_unmap(page,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ VM_BUG_ON_FOLIO(folio_test_anon(src) &&
+ !folio_test_ksm(src) && !anon_vma, src);
+ try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0);
page_was_mapped = 1;
}
- if (!page_mapped(page))
- rc = move_to_new_page(newpage, page, mode);
-
- if (page_was_mapped)
- remove_migration_ptes(page,
- rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);
+ if (!folio_mapped(src)) {
+ __migrate_folio_record(dst, page_was_mapped, anon_vma);
+ return MIGRATEPAGE_UNMAP;
+ }
-out_unlock_both:
- unlock_page(newpage);
-out_unlock:
- /* Drop an anon_vma reference if we took one */
- if (anon_vma)
- put_anon_vma(anon_vma);
- unlock_page(page);
out:
/*
- * If migration is successful, decrease refcount of the newpage
- * which will not free the page because new page owner increased
- * refcounter. As well, if it is LRU page, add the page to LRU
- * list in here. Use the old state of the isolated source page to
- * determine if we migrated a LRU page. newpage was already unlocked
- * and possibly modified by its owner - don't rely on the page
- * state.
+ * A folio that has not been unmapped will be restored to
+	 * the right list unless we want to retry.
*/
- if (rc == MIGRATEPAGE_SUCCESS) {
- if (unlikely(!is_lru))
- put_page(newpage);
- else
- putback_lru_page(newpage);
- }
+ if (rc == -EAGAIN)
+ ret = NULL;
+
+ migrate_folio_undo_src(src, page_was_mapped, anon_vma, locked, ret);
+ migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private);
return rc;
}
-/*
- * Obtain the lock on page, remove all ptes and migrate the page
- * to the newly allocated page in newpage.
- */
-static int unmap_and_move(new_page_t get_new_page,
- free_page_t put_new_page,
- unsigned long private, struct page *page,
- int force, enum migrate_mode mode,
- enum migrate_reason reason)
+/* Migrate the folio to the newly allocated folio in dst. */
+static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
+ struct folio *src, struct folio *dst,
+ enum migrate_mode mode, enum migrate_reason reason,
+ struct list_head *ret)
{
- int rc = MIGRATEPAGE_SUCCESS;
- struct page *newpage = NULL;
+ int rc;
+ int page_was_mapped = 0;
+ struct anon_vma *anon_vma = NULL;
+ bool is_lru = !__PageMovable(&src->page);
+ struct list_head *prev;
- if (!thp_migration_supported() && PageTransHuge(page))
- return -ENOMEM;
+ __migrate_folio_extract(dst, &page_was_mapped, &anon_vma);
+ prev = dst->lru.prev;
+ list_del(&dst->lru);
- if (page_count(page) == 1) {
- /* page was freed from under us. So we are done. */
- ClearPageActive(page);
- ClearPageUnevictable(page);
- if (unlikely(__PageMovable(page))) {
- lock_page(page);
- if (!PageMovable(page))
- __ClearPageIsolated(page);
- unlock_page(page);
- }
+ rc = move_to_new_folio(dst, src, mode);
+ if (rc)
goto out;
- }
- newpage = get_new_page(page, private);
- if (!newpage)
- return -ENOMEM;
+ if (unlikely(!is_lru))
+ goto out_unlock_both;
- rc = __unmap_and_move(page, newpage, force, mode);
- if (rc == MIGRATEPAGE_SUCCESS)
- set_page_owner_migrate_reason(newpage, reason);
+ /*
+ * When successful, push dst to LRU immediately: so that if it
+ * turns out to be an mlocked page, remove_migration_ptes() will
+ * automatically build up the correct dst->mlock_count for it.
+ *
+ * We would like to do something similar for the old page, when
+ * unsuccessful, and other cases when a page has been temporarily
+ * isolated from the unevictable LRU: but this case is the easiest.
+ */
+ folio_add_lru(dst);
+ if (page_was_mapped)
+ lru_add_drain();
-out:
- if (rc != -EAGAIN) {
- /*
- * A page that has been migrated has all references
- * removed and will be freed. A page that has not been
- * migrated will have kept its references and be restored.
- */
- list_del(&page->lru);
+ if (page_was_mapped)
+ remove_migration_ptes(src, dst, false);
- /*
- * Compaction can migrate also non-LRU pages which are
- * not accounted to NR_ISOLATED_*. They can be recognized
- * as __PageMovable
- */
- if (likely(!__PageMovable(page)))
- mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_lru(page), -thp_nr_pages(page));
- }
+out_unlock_both:
+ folio_unlock(dst);
+ set_page_owner_migrate_reason(&dst->page, reason);
+ /*
+	 * If migration is successful, decrease the refcount of dst,
+	 * which will not free the folio because the new page owner
+	 * already holds a reference.
+ */
+ folio_put(dst);
/*
- * If migration is successful, releases reference grabbed during
- * isolation. Otherwise, restore the page to right list unless
- * we want to retry.
+ * A folio that has been migrated has all references removed
+ * and will be freed.
*/
- if (rc == MIGRATEPAGE_SUCCESS) {
- put_page(page);
- if (reason == MR_MEMORY_FAILURE) {
- /*
- * Set PG_HWPoison on just freed page
- * intentionally. Although it's rather weird,
- * it's how HWPoison flag works at the moment.
- */
- if (set_hwpoison_free_buddy_page(page))
- num_poisoned_pages_inc();
- }
- } else {
- if (rc != -EAGAIN) {
- if (likely(!__PageMovable(page))) {
- putback_lru_page(page);
- goto put_new;
- }
+ list_del(&src->lru);
+ /* Drop an anon_vma reference if we took one */
+ if (anon_vma)
+ put_anon_vma(anon_vma);
+ folio_unlock(src);
+ migrate_folio_done(src, reason);
- lock_page(page);
- if (PageMovable(page))
- putback_movable_page(page);
- else
- __ClearPageIsolated(page);
- unlock_page(page);
- put_page(page);
- }
-put_new:
- if (put_new_page)
- put_new_page(newpage, private);
- else
- put_page(newpage);
+ return rc;
+out:
+ /*
+ * A folio that has not been migrated will be restored to
+	 * the right list unless we want to retry.
+ */
+ if (rc == -EAGAIN) {
+ list_add(&dst->lru, prev);
+ __migrate_folio_record(dst, page_was_mapped, anon_vma);
+ return rc;
}
+ migrate_folio_undo_src(src, page_was_mapped, anon_vma, true, ret);
+ migrate_folio_undo_dst(dst, true, put_new_folio, private);
+
return rc;
}
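/*
 * The __migrate_folio_record()/__migrate_folio_extract() helpers called in the
 * unmap and move paths above are not part of these hunks. A minimal sketch of
 * how the unmap-phase state could be parked on the destination folio, assuming
 * dst->mapping and dst->private are unused until the move phase:
 */
static void __migrate_folio_record(struct folio *dst,
				   unsigned long page_was_mapped,
				   struct anon_vma *anon_vma)
{
	/* dst is not yet exposed, so its fields are free for bookkeeping */
	dst->mapping = (void *)anon_vma;
	dst->private = (void *)page_was_mapped;
}

static void __migrate_folio_extract(struct folio *dst,
				    int *page_was_mapped,
				    struct anon_vma **anon_vmap)
{
	/* Recover the state saved at unmap time and clear the fields */
	*anon_vmap = (struct anon_vma *)dst->mapping;
	*page_was_mapped = (unsigned long)dst->private;
	dst->mapping = NULL;
	dst->private = NULL;
}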
@@ -1276,34 +1349,28 @@ put_new:
* because then pte is replaced with migration swap entry and direct I/O code
* will wait in the page fault for migration to complete.
*/
-static int unmap_and_move_huge_page(new_page_t get_new_page,
- free_page_t put_new_page, unsigned long private,
- struct page *hpage, int force,
- enum migrate_mode mode, int reason)
+static int unmap_and_move_huge_page(new_folio_t get_new_folio,
+ free_folio_t put_new_folio, unsigned long private,
+ struct folio *src, int force, enum migrate_mode mode,
+ int reason, struct list_head *ret)
{
+ struct folio *dst;
int rc = -EAGAIN;
int page_was_mapped = 0;
- struct page *new_hpage;
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
- /*
- * Migratability of hugepages depends on architectures and their size.
- * This check is necessary because some callers of hugepage migration
- * like soft offline and memory hotremove don't walk through page
- * tables or check whether the hugepage is pmd-based or not before
- * kicking migration.
- */
- if (!hugepage_migration_supported(page_hstate(hpage))) {
- putback_active_hugepage(hpage);
- return -ENOSYS;
+ if (folio_ref_count(src) == 1) {
+ /* page was freed from under us. So we are done. */
+ folio_putback_active_hugetlb(src);
+ return MIGRATEPAGE_SUCCESS;
}
- new_hpage = get_new_page(hpage, private);
- if (!new_hpage)
+ dst = get_new_folio(src, private);
+ if (!dst)
return -ENOMEM;
- if (!trylock_page(hpage)) {
+ if (!folio_trylock(src)) {
if (!force)
goto out;
switch (mode) {
@@ -1313,237 +1380,607 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
default:
goto out;
}
- lock_page(hpage);
+ folio_lock(src);
}
/*
* Check for pages which are in the process of being freed. Without
- * page_mapping() set, hugetlbfs specific move page routine will not
+ * folio_mapping() set, hugetlbfs specific move page routine will not
* be called and we could leak usage counts for subpools.
*/
- if (page_private(hpage) && !page_mapping(hpage)) {
+ if (hugetlb_folio_subpool(src) && !folio_mapping(src)) {
rc = -EBUSY;
goto out_unlock;
}
- if (PageAnon(hpage))
- anon_vma = page_get_anon_vma(hpage);
+ if (folio_test_anon(src))
+ anon_vma = folio_get_anon_vma(src);
- if (unlikely(!trylock_page(new_hpage)))
+ if (unlikely(!folio_trylock(dst)))
goto put_anon;
- if (page_mapped(hpage)) {
- /*
- * try_to_unmap could potentially call huge_pmd_unshare.
- * Because of this, take semaphore in write mode here and
- * set TTU_RMAP_LOCKED to let lower levels know we have
- * taken the lock.
- */
- mapping = hugetlb_page_mapping_lock_write(hpage);
- if (unlikely(!mapping))
- goto unlock_put_anon;
+ if (folio_mapped(src)) {
+ enum ttu_flags ttu = 0;
+
+ if (!folio_test_anon(src)) {
+ /*
+ * In shared mappings, try_to_unmap could potentially
+ * call huge_pmd_unshare. Because of this, take
+ * semaphore in write mode here and set TTU_RMAP_LOCKED
+ * to let lower levels know we have taken the lock.
+ */
+ mapping = hugetlb_page_mapping_lock_write(&src->page);
+ if (unlikely(!mapping))
+ goto unlock_put_anon;
+
+ ttu = TTU_RMAP_LOCKED;
+ }
- try_to_unmap(hpage,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
- TTU_RMAP_LOCKED);
+ try_to_migrate(src, ttu);
page_was_mapped = 1;
- /*
- * Leave mapping locked until after subsequent call to
- * remove_migration_ptes()
- */
+
+ if (ttu & TTU_RMAP_LOCKED)
+ i_mmap_unlock_write(mapping);
}
- if (!page_mapped(hpage))
- rc = move_to_new_page(new_hpage, hpage, mode);
+ if (!folio_mapped(src))
+ rc = move_to_new_folio(dst, src, mode);
- if (page_was_mapped) {
- remove_migration_ptes(hpage,
- rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, true);
- i_mmap_unlock_write(mapping);
- }
+ if (page_was_mapped)
+ remove_migration_ptes(src,
+ rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
unlock_put_anon:
- unlock_page(new_hpage);
+ folio_unlock(dst);
put_anon:
if (anon_vma)
put_anon_vma(anon_vma);
if (rc == MIGRATEPAGE_SUCCESS) {
- move_hugetlb_state(hpage, new_hpage, reason);
- put_new_page = NULL;
+ move_hugetlb_state(src, dst, reason);
+ put_new_folio = NULL;
}
out_unlock:
- unlock_page(hpage);
+ folio_unlock(src);
out:
- if (rc != -EAGAIN)
- putback_active_hugepage(hpage);
+ if (rc == MIGRATEPAGE_SUCCESS)
+ folio_putback_active_hugetlb(src);
+ else if (rc != -EAGAIN)
+ list_move_tail(&src->lru, ret);
/*
* If migration was not successful and there's a freeing callback, use
 * it. Otherwise, folio_putback_active_hugetlb() will drop the reference
 * grabbed during allocation.
*/
- if (put_new_page)
- put_new_page(new_hpage, private);
+ if (put_new_folio)
+ put_new_folio(dst, private);
else
- putback_active_hugepage(new_hpage);
+ folio_putback_active_hugetlb(dst);
+
+ return rc;
+}
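/*
 * The new_folio_t/free_folio_t callback types used by the functions above and
 * below are declared in include/linux/migrate.h and are not shown in these
 * hunks; their converted shape is roughly as follows (a sketch, not the
 * authoritative header):
 */
typedef struct folio *new_folio_t(struct folio *src, unsigned long private);
typedef void free_folio_t(struct folio *folio, unsigned long private);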
+
+static inline int try_split_folio(struct folio *folio, struct list_head *split_folios)
+{
+ int rc;
+
+ folio_lock(folio);
+ rc = split_folio_to_list(folio, split_folios);
+ folio_unlock(folio);
+ if (!rc)
+ list_move_tail(&folio->lru, split_folios);
return rc;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define NR_MAX_BATCHED_MIGRATION HPAGE_PMD_NR
+#else
+#define NR_MAX_BATCHED_MIGRATION 512
+#endif
+#define NR_MAX_MIGRATE_PAGES_RETRY 10
+#define NR_MAX_MIGRATE_ASYNC_RETRY 3
+#define NR_MAX_MIGRATE_SYNC_RETRY \
+ (NR_MAX_MIGRATE_PAGES_RETRY - NR_MAX_MIGRATE_ASYNC_RETRY)
+
+struct migrate_pages_stats {
+ int nr_succeeded; /* Normal and large folios migrated successfully, in
+ units of base pages */
+ int nr_failed_pages; /* Normal and large folios failed to be migrated, in
+ units of base pages. Untried folios aren't counted */
+ int nr_thp_succeeded; /* THP migrated successfully */
+ int nr_thp_failed; /* THP failed to be migrated */
+ int nr_thp_split; /* THP split before migrating */
+};
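/*
 * Not part of the patch: a minimal illustration of the unit convention above.
 * nr_succeeded/nr_failed_pages are in units of base pages, while the nr_thp_*
 * counters count whole THPs, so one successfully migrated PMD-sized THP bumps
 * nr_succeeded by HPAGE_PMD_NR but nr_thp_succeeded by only 1. The
 * example_count_success() name is hypothetical.
 */
static void example_count_success(struct migrate_pages_stats *stats,
				  struct folio *folio)
{
	bool is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);

	stats->nr_succeeded += folio_nr_pages(folio);	/* base pages */
	stats->nr_thp_succeeded += is_thp;		/* whole THPs */
}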
+
/*
- * migrate_pages - migrate the pages specified in a list, to the free pages
- * supplied as the target for the page migration
- *
- * @from: The list of pages to be migrated.
- * @get_new_page: The function used to allocate free pages to be used
- * as the target of the page migration.
- * @put_new_page: The function used to free target pages if migration
- * fails, or NULL if no special handling is necessary.
- * @private: Private data to be passed on to get_new_page()
- * @mode: The migration mode that specifies the constraints for
- * page migration, if any.
- * @reason: The reason for page migration.
- *
- * The function returns after 10 attempts or if no pages are movable any more
- * because the list has become empty or no retryable pages exist any more.
- * The caller should call putback_movable_pages() to return pages to the LRU
- * or free list only if ret != 0.
+ * Returns the number of hugetlb folios that were not migrated, or an error code.
+ * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts, or earlier if
+ * no hugetlb folios are movable any more because the list has become empty or
+ * no retryable hugetlb folios remain. It is the caller's responsibility to call
+ * putback_movable_pages() only if ret != 0.
+ */
+static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
+ free_folio_t put_new_folio, unsigned long private,
+ enum migrate_mode mode, int reason,
+ struct migrate_pages_stats *stats,
+ struct list_head *ret_folios)
+{
+ int retry = 1;
+ int nr_failed = 0;
+ int nr_retry_pages = 0;
+ int pass = 0;
+ struct folio *folio, *folio2;
+ int rc, nr_pages;
+
+ for (pass = 0; pass < NR_MAX_MIGRATE_PAGES_RETRY && retry; pass++) {
+ retry = 0;
+ nr_retry_pages = 0;
+
+ list_for_each_entry_safe(folio, folio2, from, lru) {
+ if (!folio_test_hugetlb(folio))
+ continue;
+
+ nr_pages = folio_nr_pages(folio);
+
+ cond_resched();
+
+ /*
+			 * Migratability of hugepages depends on the architecture
+			 * and the hugepage size. This check is necessary because
+			 * some callers of hugepage migration, like soft offline
+			 * and memory hotremove, don't walk through page tables or
+			 * check whether the hugepage is pmd-based before kicking
+			 * off migration.
+ */
+ if (!hugepage_migration_supported(folio_hstate(folio))) {
+ nr_failed++;
+ stats->nr_failed_pages += nr_pages;
+ list_move_tail(&folio->lru, ret_folios);
+ continue;
+ }
+
+ rc = unmap_and_move_huge_page(get_new_folio,
+ put_new_folio, private,
+ folio, pass > 2, mode,
+ reason, ret_folios);
+ /*
+ * The rules are:
+ * Success: hugetlb folio will be put back
+ * -EAGAIN: stay on the from list
+ * -ENOMEM: stay on the from list
+ * Other errno: put on ret_folios list
+ */
+ switch(rc) {
+ case -ENOMEM:
+ /*
+ * When memory is low, don't bother to try to migrate
+ * other folios, just exit.
+ */
+ stats->nr_failed_pages += nr_pages + nr_retry_pages;
+ return -ENOMEM;
+ case -EAGAIN:
+ retry++;
+ nr_retry_pages += nr_pages;
+ break;
+ case MIGRATEPAGE_SUCCESS:
+ stats->nr_succeeded += nr_pages;
+ break;
+ default:
+ /*
+ * Permanent failure (-EBUSY, etc.):
+ * unlike -EAGAIN case, the failed folio is
+ * removed from migration folio list and not
+ * retried in the next outer loop.
+ */
+ nr_failed++;
+ stats->nr_failed_pages += nr_pages;
+ break;
+ }
+ }
+ }
+ /*
+	 * nr_failed is the number of hugetlb folios that failed to migrate. After
+ * NR_MAX_MIGRATE_PAGES_RETRY attempts, give up and count retried hugetlb
+ * folios as failed.
+ */
+ nr_failed += retry;
+ stats->nr_failed_pages += nr_retry_pages;
+
+ return nr_failed;
+}
+
+/*
+ * migrate_pages_batch() first unmaps as many folios on the from list as
+ * possible, then moves the unmapped folios.
*
- * Returns the number of pages that were not migrated, or an error code.
+ * We only batch migration when mode == MIGRATE_ASYNC, to avoid waiting on
+ * a lock or bit while we already have more than one folio locked, which may
+ * cause a deadlock (e.g., for the loop device). So, if mode != MIGRATE_ASYNC,
+ * the length of the from list must be <= 1.
*/
-int migrate_pages(struct list_head *from, new_page_t get_new_page,
- free_page_t put_new_page, unsigned long private,
- enum migrate_mode mode, int reason)
+static int migrate_pages_batch(struct list_head *from,
+ new_folio_t get_new_folio, free_folio_t put_new_folio,
+ unsigned long private, enum migrate_mode mode, int reason,
+ struct list_head *ret_folios, struct list_head *split_folios,
+ struct migrate_pages_stats *stats, int nr_pass)
{
int retry = 1;
int thp_retry = 1;
int nr_failed = 0;
- int nr_succeeded = 0;
- int nr_thp_succeeded = 0;
- int nr_thp_failed = 0;
- int nr_thp_split = 0;
+ int nr_retry_pages = 0;
int pass = 0;
bool is_thp = false;
- struct page *page;
- struct page *page2;
- int swapwrite = current->flags & PF_SWAPWRITE;
- int rc, nr_subpages;
+ struct folio *folio, *folio2, *dst = NULL, *dst2;
+ int rc, rc_saved = 0, nr_pages;
+ LIST_HEAD(unmap_folios);
+ LIST_HEAD(dst_folios);
+ bool nosplit = (reason == MR_NUMA_MISPLACED);
- if (!swapwrite)
- current->flags |= PF_SWAPWRITE;
+ VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC &&
+ !list_empty(from) && !list_is_singular(from));
- for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
+ for (pass = 0; pass < nr_pass && retry; pass++) {
retry = 0;
thp_retry = 0;
+ nr_retry_pages = 0;
+
+ list_for_each_entry_safe(folio, folio2, from, lru) {
+ is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
+ nr_pages = folio_nr_pages(folio);
- list_for_each_entry_safe(page, page2, from, lru) {
-retry:
- /*
- * THP statistics is based on the source huge page.
- * Capture required information that might get lost
- * during migration.
- */
- is_thp = PageTransHuge(page) && !PageHuge(page);
- nr_subpages = thp_nr_pages(page);
cond_resched();
- if (PageHuge(page))
- rc = unmap_and_move_huge_page(get_new_page,
- put_new_page, private, page,
- pass > 2, mode, reason);
- else
- rc = unmap_and_move(get_new_page, put_new_page,
- private, page, pass > 2, mode,
- reason);
+ /*
+			 * Large folio migration might be unsupported or the
+			 * allocation might fail, so we should retry on the
+			 * same folio after splitting the large folio into
+			 * normal folios.
+ *
+ * Split folios are put in split_folios, and
+ * we will migrate them after the rest of the
+ * list is processed.
+ */
+ if (!thp_migration_supported() && is_thp) {
+ nr_failed++;
+ stats->nr_thp_failed++;
+ if (!try_split_folio(folio, split_folios)) {
+ stats->nr_thp_split++;
+ continue;
+ }
+ stats->nr_failed_pages += nr_pages;
+ list_move_tail(&folio->lru, ret_folios);
+ continue;
+ }
+ rc = migrate_folio_unmap(get_new_folio, put_new_folio,
+ private, folio, &dst, mode, reason,
+ ret_folios);
+ /*
+ * The rules are:
+ * Success: folio will be freed
+ * Unmap: folio will be put on unmap_folios list,
+ * dst folio put on dst_folios list
+ * -EAGAIN: stay on the from list
+ * -ENOMEM: stay on the from list
+ * Other errno: put on ret_folios list
+ */
switch(rc) {
case -ENOMEM:
/*
- * THP migration might be unsupported or the
- * allocation could've failed so we should
- * retry on the same page with the THP split
- * to base pages.
- *
- * Head page is retried immediately and tail
- * pages are added to the tail of the list so
- * we encounter them after the rest of the list
- * is processed.
+				 * When memory is low, don't bother trying to migrate
+				 * the remaining folios; move the already-unmapped
+				 * folios, then exit.
*/
- if (is_thp) {
- lock_page(page);
- rc = split_huge_page_to_list(page, from);
- unlock_page(page);
- if (!rc) {
- list_safe_reset_next(page, page2, lru);
- nr_thp_split++;
- goto retry;
+ nr_failed++;
+ stats->nr_thp_failed += is_thp;
+ /* Large folio NUMA faulting doesn't split to retry. */
+ if (folio_test_large(folio) && !nosplit) {
+ int ret = try_split_folio(folio, split_folios);
+
+ if (!ret) {
+ stats->nr_thp_split += is_thp;
+ break;
+ } else if (reason == MR_LONGTERM_PIN &&
+ ret == -EAGAIN) {
+ /*
+ * Try again to split large folio to
+ * mitigate the failure of longterm pinning.
+ */
+ retry++;
+ thp_retry += is_thp;
+ nr_retry_pages += nr_pages;
+ /* Undo duplicated failure counting. */
+ nr_failed--;
+ stats->nr_thp_failed -= is_thp;
+ break;
}
+ }
- nr_thp_failed++;
- nr_failed += nr_subpages;
+ stats->nr_failed_pages += nr_pages + nr_retry_pages;
+			/* nr_failed isn't updated here: it is unused once rc_saved is set */
+ stats->nr_thp_failed += thp_retry;
+ rc_saved = rc;
+ if (list_empty(&unmap_folios))
goto out;
- }
- nr_failed++;
- goto out;
+ else
+ goto move;
case -EAGAIN:
- if (is_thp) {
- thp_retry++;
- break;
- }
retry++;
+ thp_retry += is_thp;
+ nr_retry_pages += nr_pages;
break;
case MIGRATEPAGE_SUCCESS:
- if (is_thp) {
- nr_thp_succeeded++;
- nr_succeeded += nr_subpages;
- break;
- }
- nr_succeeded++;
+ stats->nr_succeeded += nr_pages;
+ stats->nr_thp_succeeded += is_thp;
+ break;
+ case MIGRATEPAGE_UNMAP:
+ list_move_tail(&folio->lru, &unmap_folios);
+ list_add_tail(&dst->lru, &dst_folios);
break;
default:
/*
- * Permanent failure (-EBUSY, -ENOSYS, etc.):
- * unlike -EAGAIN case, the failed page is
- * removed from migration page list and not
+ * Permanent failure (-EBUSY, etc.):
+ * unlike -EAGAIN case, the failed folio is
+ * removed from migration folio list and not
* retried in the next outer loop.
*/
- if (is_thp) {
- nr_thp_failed++;
- nr_failed += nr_subpages;
- break;
- }
nr_failed++;
+ stats->nr_thp_failed += is_thp;
+ stats->nr_failed_pages += nr_pages;
break;
}
}
}
- nr_failed += retry + thp_retry;
- nr_thp_failed += thp_retry;
- rc = nr_failed;
-out:
- count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
- count_vm_events(PGMIGRATE_FAIL, nr_failed);
- count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
- count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
- count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
- trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
- nr_thp_failed, nr_thp_split, mode, reason);
+ nr_failed += retry;
+ stats->nr_thp_failed += thp_retry;
+ stats->nr_failed_pages += nr_retry_pages;
+move:
+ /* Flush TLBs for all unmapped folios */
+ try_to_unmap_flush();
- if (!swapwrite)
- current->flags &= ~PF_SWAPWRITE;
+ retry = 1;
+ for (pass = 0; pass < nr_pass && retry; pass++) {
+ retry = 0;
+ thp_retry = 0;
+ nr_retry_pages = 0;
+
+ dst = list_first_entry(&dst_folios, struct folio, lru);
+ dst2 = list_next_entry(dst, lru);
+ list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
+ is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
+ nr_pages = folio_nr_pages(folio);
+
+ cond_resched();
+
+ rc = migrate_folio_move(put_new_folio, private,
+ folio, dst, mode,
+ reason, ret_folios);
+ /*
+ * The rules are:
+ * Success: folio will be freed
+ * -EAGAIN: stay on the unmap_folios list
+ * Other errno: put on ret_folios list
+ */
+ switch(rc) {
+ case -EAGAIN:
+ retry++;
+ thp_retry += is_thp;
+ nr_retry_pages += nr_pages;
+ break;
+ case MIGRATEPAGE_SUCCESS:
+ stats->nr_succeeded += nr_pages;
+ stats->nr_thp_succeeded += is_thp;
+ break;
+ default:
+ nr_failed++;
+ stats->nr_thp_failed += is_thp;
+ stats->nr_failed_pages += nr_pages;
+ break;
+ }
+ dst = dst2;
+ dst2 = list_next_entry(dst, lru);
+ }
+ }
+ nr_failed += retry;
+ stats->nr_thp_failed += thp_retry;
+ stats->nr_failed_pages += nr_retry_pages;
+
+ rc = rc_saved ? : nr_failed;
+out:
+ /* Cleanup remaining folios */
+ dst = list_first_entry(&dst_folios, struct folio, lru);
+ dst2 = list_next_entry(dst, lru);
+ list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
+ int page_was_mapped = 0;
+ struct anon_vma *anon_vma = NULL;
+
+ __migrate_folio_extract(dst, &page_was_mapped, &anon_vma);
+ migrate_folio_undo_src(folio, page_was_mapped, anon_vma,
+ true, ret_folios);
+ list_del(&dst->lru);
+ migrate_folio_undo_dst(dst, true, put_new_folio, private);
+ dst = dst2;
+ dst2 = list_next_entry(dst, lru);
+ }
return rc;
}
-struct page *alloc_migration_target(struct page *page, unsigned long private)
+static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio,
+ free_folio_t put_new_folio, unsigned long private,
+ enum migrate_mode mode, int reason,
+ struct list_head *ret_folios, struct list_head *split_folios,
+ struct migrate_pages_stats *stats)
+{
+ int rc, nr_failed = 0;
+ LIST_HEAD(folios);
+ struct migrate_pages_stats astats;
+
+ memset(&astats, 0, sizeof(astats));
+	/* First, try to migrate in batch with MIGRATE_ASYNC mode */
+ rc = migrate_pages_batch(from, get_new_folio, put_new_folio, private, MIGRATE_ASYNC,
+ reason, &folios, split_folios, &astats,
+ NR_MAX_MIGRATE_ASYNC_RETRY);
+ stats->nr_succeeded += astats.nr_succeeded;
+ stats->nr_thp_succeeded += astats.nr_thp_succeeded;
+ stats->nr_thp_split += astats.nr_thp_split;
+ if (rc < 0) {
+ stats->nr_failed_pages += astats.nr_failed_pages;
+ stats->nr_thp_failed += astats.nr_thp_failed;
+ list_splice_tail(&folios, ret_folios);
+ return rc;
+ }
+ stats->nr_thp_failed += astats.nr_thp_split;
+ nr_failed += astats.nr_thp_split;
+ /*
+	 * Fall back to migrating all failed folios one by one synchronously. All
+	 * failed folios except split THPs will be retried, so their earlier
+	 * failure isn't counted.
+ */
+ list_splice_tail_init(&folios, from);
+ while (!list_empty(from)) {
+ list_move(from->next, &folios);
+ rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio,
+ private, mode, reason, ret_folios,
+ split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY);
+ list_splice_tail_init(&folios, ret_folios);
+ if (rc < 0)
+ return rc;
+ nr_failed += rc;
+ }
+
+ return nr_failed;
+}
+
+/*
+ * migrate_pages - migrate the folios specified in a list, to the free folios
+ * supplied as the target for the page migration
+ *
+ * @from: The list of folios to be migrated.
+ * @get_new_folio: The function used to allocate free folios to be used
+ * as the target of the folio migration.
+ * @put_new_folio: The function used to free target folios if migration
+ * fails, or NULL if no special handling is necessary.
+ * @private: Private data to be passed on to get_new_folio()
+ * @mode: The migration mode that specifies the constraints for
+ * folio migration, if any.
+ * @reason: The reason for folio migration.
+ * @ret_succeeded: Set to the number of folios migrated successfully if
+ * the caller passes a non-NULL pointer.
+ *
+ * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts, or earlier if
+ * no folios are movable any more because the list has become empty or no
+ * retryable folios remain. It is the caller's responsibility to call
+ * putback_movable_pages() only if ret != 0.
+ *
+ * Returns the number of {normal, large, hugetlb} folios that were not migrated,
+ * or an error code. A split large folio is counted as one non-migrated large
+ * folio, no matter how many of its split folios are migrated successfully.
+ */
+int migrate_pages(struct list_head *from, new_folio_t get_new_folio,
+ free_folio_t put_new_folio, unsigned long private,
+ enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
+{
+ int rc, rc_gather;
+ int nr_pages;
+ struct folio *folio, *folio2;
+ LIST_HEAD(folios);
+ LIST_HEAD(ret_folios);
+ LIST_HEAD(split_folios);
+ struct migrate_pages_stats stats;
+
+ trace_mm_migrate_pages_start(mode, reason);
+
+ memset(&stats, 0, sizeof(stats));
+
+ rc_gather = migrate_hugetlbs(from, get_new_folio, put_new_folio, private,
+ mode, reason, &stats, &ret_folios);
+ if (rc_gather < 0)
+ goto out;
+
+again:
+ nr_pages = 0;
+ list_for_each_entry_safe(folio, folio2, from, lru) {
+		/* Retried hugetlb folios will be kept in the list */
+ if (folio_test_hugetlb(folio)) {
+ list_move_tail(&folio->lru, &ret_folios);
+ continue;
+ }
+
+ nr_pages += folio_nr_pages(folio);
+ if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
+ break;
+ }
+ if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
+ list_cut_before(&folios, from, &folio2->lru);
+ else
+ list_splice_init(from, &folios);
+ if (mode == MIGRATE_ASYNC)
+ rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio,
+ private, mode, reason, &ret_folios,
+ &split_folios, &stats,
+ NR_MAX_MIGRATE_PAGES_RETRY);
+ else
+ rc = migrate_pages_sync(&folios, get_new_folio, put_new_folio,
+ private, mode, reason, &ret_folios,
+ &split_folios, &stats);
+ list_splice_tail_init(&folios, &ret_folios);
+ if (rc < 0) {
+ rc_gather = rc;
+ list_splice_tail(&split_folios, &ret_folios);
+ goto out;
+ }
+ if (!list_empty(&split_folios)) {
+ /*
+		 * Failure isn't counted since all split folios of a large folio
+		 * are already counted as 1 failure. And we only try to migrate
+		 * with minimal effort, forcing MIGRATE_ASYNC mode with one retry.
+ */
+ migrate_pages_batch(&split_folios, get_new_folio,
+ put_new_folio, private, MIGRATE_ASYNC, reason,
+ &ret_folios, NULL, &stats, 1);
+ list_splice_tail_init(&split_folios, &ret_folios);
+ }
+ rc_gather += rc;
+ if (!list_empty(from))
+ goto again;
+out:
+ /*
+	 * Put the permanently failed folios back on the migration list; they
+	 * will be put back on the right list by the caller.
+ */
+ list_splice(&ret_folios, from);
+
+ /*
+	 * Return 0 if all the split folios of the large folios that failed
+	 * to migrate were themselves migrated successfully.
+ */
+ if (list_empty(from))
+ rc_gather = 0;
+
+ count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded);
+ count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages);
+ count_vm_events(THP_MIGRATION_SUCCESS, stats.nr_thp_succeeded);
+ count_vm_events(THP_MIGRATION_FAIL, stats.nr_thp_failed);
+ count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split);
+ trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages,
+ stats.nr_thp_succeeded, stats.nr_thp_failed,
+ stats.nr_thp_split, mode, reason);
+
+ if (ret_succeeded)
+ *ret_succeeded = stats.nr_succeeded;
+
+ return rc_gather;
+}
+
+struct folio *alloc_migration_target(struct folio *src, unsigned long private)
{
struct migration_target_control *mtc;
gfp_t gfp_mask;
unsigned int order = 0;
- struct page *new_page = NULL;
int nid;
int zidx;
@@ -1551,34 +1988,30 @@ struct page *alloc_migration_target(struct page *page, unsigned long private)
gfp_mask = mtc->gfp_mask;
nid = mtc->nid;
if (nid == NUMA_NO_NODE)
- nid = page_to_nid(page);
+ nid = folio_nid(src);
- if (PageHuge(page)) {
- struct hstate *h = page_hstate(compound_head(page));
+ if (folio_test_hugetlb(src)) {
+ struct hstate *h = folio_hstate(src);
gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
- return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
+ return alloc_hugetlb_folio_nodemask(h, nid,
+ mtc->nmask, gfp_mask);
}
- if (PageTransHuge(page)) {
+ if (folio_test_large(src)) {
/*
* clear __GFP_RECLAIM to make the migration callback
* consistent with regular THP allocations.
*/
gfp_mask &= ~__GFP_RECLAIM;
gfp_mask |= GFP_TRANSHUGE;
- order = HPAGE_PMD_ORDER;
+ order = folio_order(src);
}
- zidx = zone_idx(page_zone(page));
+ zidx = zone_idx(folio_zone(src));
if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
gfp_mask |= __GFP_HIGHMEM;
- new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask);
-
- if (new_page && PageTransHuge(new_page))
- prep_transhuge_page(new_page);
-
- return new_page;
+ return __folio_alloc(gfp_mask, order, nid, mtc->nmask);
}
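/*
 * Not part of the patch: a hypothetical caller sketch, modelled on existing
 * users such as memory hotremove, showing the new migrate_pages() signature
 * together with alloc_migration_target() and the optional ret_succeeded
 * counter. The example_migrate_list_to_node() name and the gfp choice are
 * illustrative only.
 */
static int example_migrate_list_to_node(struct list_head *pagelist, int nid)
{
	unsigned int nr_succeeded = 0;
	struct migration_target_control mtc = {
		.nid = nid,
		.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
	};
	int ret;

	/* Migrate the isolated folios; restore the leftovers on failure */
	ret = migrate_pages(pagelist, alloc_migration_target, NULL,
			    (unsigned long)&mtc, MIGRATE_SYNC,
			    MR_MEMORY_HOTPLUG, &nr_succeeded);
	if (ret)
		putback_movable_pages(pagelist);

	pr_debug("migrated %u base pages to node %d\n", nr_succeeded, nid);
	return ret;
}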
#ifdef CONFIG_NUMA
@@ -1604,7 +2037,7 @@ static int do_move_pages_to_node(struct mm_struct *mm,
};
err = migrate_pages(pagelist, alloc_migration_target, NULL,
- (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
if (err)
putback_movable_pages(pagelist);
return err;
@@ -1619,23 +2052,25 @@ static int do_move_pages_to_node(struct mm_struct *mm,
* target node
* 1 - when it has been queued
*/
-static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
+static int add_page_for_migration(struct mm_struct *mm, const void __user *p,
int node, struct list_head *pagelist, bool migrate_all)
{
struct vm_area_struct *vma;
+ unsigned long addr;
struct page *page;
- unsigned int follflags;
int err;
+ bool isolated;
mmap_read_lock(mm);
+ addr = (unsigned long)untagged_addr_remote(mm, p);
+
err = -EFAULT;
- vma = find_vma(mm, addr);
- if (!vma || addr < vma->vm_start || !vma_migratable(vma))
+ vma = vma_lookup(mm, addr);
+ if (!vma || !vma_migratable(vma))
goto out;
/* FOLL_DUMP to ignore special (like zero) pages */
- follflags = FOLL_GET | FOLL_DUMP;
- page = follow_page(vma, addr, follflags);
+ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
err = PTR_ERR(page);
if (IS_ERR(page))
@@ -1645,6 +2080,9 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
if (!page)
goto out;
+ if (is_zone_device_page(page))
+ goto out_putpage;
+
err = 0;
if (page_to_nid(page) == node)
goto out_putpage;
@@ -1655,16 +2093,18 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
if (PageHuge(page)) {
if (PageHead(page)) {
- isolate_huge_page(page, pagelist);
- err = 1;
+ isolated = isolate_hugetlb(page_folio(page), pagelist);
+ err = isolated ? 1 : -EBUSY;
}
} else {
struct page *head;
head = compound_head(page);
- err = isolate_lru_page(head);
- if (err)
+ isolated = isolate_lru_page(head);
+ if (!isolated) {
+ err = -EBUSY;
goto out_putpage;
+ }
err = 1;
list_add_tail(&head->lru, pagelist);
@@ -1699,12 +2139,12 @@ static int move_pages_and_store_status(struct mm_struct *mm, int node,
* Positive err means the number of failed
* pages to migrate. Since we are going to
* abort and return the number of non-migrated
- * pages, so need to incude the rest of the
+		 * pages, we need to include the rest of the
* nr_pages that have not been attempted as
* well.
*/
if (err > 0)
- err += nr_pages - i - 1;
+ err += nr_pages - i;
return err;
}
return store_status(status, start, node, i - start);
@@ -1725,11 +2165,10 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
int start, i;
int err = 0, err1;
- migrate_prep();
+ lru_cache_disable();
for (i = start = 0; i < nr_pages; i++) {
const void __user *p;
- unsigned long addr;
int node;
err = -EFAULT;
@@ -1737,7 +2176,6 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
goto out_flush;
if (get_user(node, nodes + i))
goto out_flush;
- addr = (unsigned long)untagged_addr(p);
err = -ENODEV;
if (node < 0 || node >= MAX_NUMNODES)
@@ -1765,8 +2203,8 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
* Errors in the page lookup or isolation are not fatal and we simply
* report them via status
*/
- err = add_page_for_migration(mm, addr, current_node,
- &pagelist, flags & MPOL_MF_MOVE_ALL);
+ err = add_page_for_migration(mm, p, current_node, &pagelist,
+ flags & MPOL_MF_MOVE_ALL);
if (err > 0) {
/* The page is successfully queued for migration */
@@ -1774,6 +2212,13 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
}
/*
+	 * The move_pages() man page does not list -EEXIST as a possible
+	 * error, so use -EFAULT instead.
+ */
+ if (err == -EEXIST)
+ err = -EFAULT;
+
+ /*
* If the page is already on the target node (!err), store the
* node, otherwise, store the err.
*/
@@ -1783,8 +2228,12 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
err = move_pages_and_store_status(mm, current_node, &pagelist,
status, start, i, nr_pages);
- if (err)
+ if (err) {
+ /* We have accounted for page i */
+ if (err > 0)
+ err--;
goto out;
+ }
current_node = NUMA_NO_NODE;
}
out_flush:
@@ -1794,6 +2243,7 @@ out_flush:
if (err >= 0)
err = err1;
out:
+ lru_cache_enable();
return err;
}
@@ -1813,18 +2263,25 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
struct page *page;
int err = -EFAULT;
- vma = find_vma(mm, addr);
- if (!vma || addr < vma->vm_start)
+ vma = vma_lookup(mm, addr);
+ if (!vma)
goto set_status;
/* FOLL_DUMP to ignore special (like zero) pages */
- page = follow_page(vma, addr, FOLL_DUMP);
+ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
err = PTR_ERR(page);
if (IS_ERR(page))
goto set_status;
- err = page ? page_to_nid(page) : -ENOENT;
+ err = -ENOENT;
+ if (!page)
+ goto set_status;
+
+ if (!is_zone_device_page(page))
+ err = page_to_nid(page);
+
+ put_page(page);
set_status:
*status = err;
@@ -1835,6 +2292,23 @@ set_status:
mmap_read_unlock(mm);
}
+static int get_compat_pages_array(const void __user *chunk_pages[],
+ const void __user * __user *pages,
+ unsigned long chunk_nr)
+{
+ compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages;
+ compat_uptr_t p;
+ int i;
+
+ for (i = 0; i < chunk_nr; i++) {
+ if (get_user(p, pages32 + i))
+ return -EFAULT;
+ chunk_pages[i] = compat_ptr(p);
+ }
+
+ return 0;
+}
+
/*
* Determine the nodes of a user array of pages and store it in
* a user array of status.
@@ -1843,19 +2317,22 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
const void __user * __user *pages,
int __user *status)
{
-#define DO_PAGES_STAT_CHUNK_NR 16
+#define DO_PAGES_STAT_CHUNK_NR 16UL
const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
int chunk_status[DO_PAGES_STAT_CHUNK_NR];
while (nr_pages) {
- unsigned long chunk_nr;
-
- chunk_nr = nr_pages;
- if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
- chunk_nr = DO_PAGES_STAT_CHUNK_NR;
+ unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR);
- if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
- break;
+ if (in_compat_syscall()) {
+ if (get_compat_pages_array(chunk_pages, pages,
+ chunk_nr))
+ break;
+ } else {
+ if (copy_from_user(chunk_pages, pages,
+ chunk_nr * sizeof(*chunk_pages)))
+ break;
+ }
do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
@@ -1869,33 +2346,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
return nr_pages ? -EFAULT : 0;
}
-/*
- * Move a list of pages in the address space of the currently executing
- * process.
- */
-static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
- const void __user * __user *pages,
- const int __user *nodes,
- int __user *status, int flags)
+static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
{
struct task_struct *task;
struct mm_struct *mm;
- int err;
- nodemask_t task_nodes;
- /* Check flags */
- if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
- return -EINVAL;
-
- if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
- return -EPERM;
+ /*
+	 * There is no need to check if the current process has the right to
+	 * modify the specified process when they are the same.
+ */
+ if (!pid) {
+ mmget(current->mm);
+ *mem_nodes = cpuset_mems_allowed(current);
+ return current->mm;
+ }
/* Find the mm_struct */
rcu_read_lock();
- task = pid ? find_task_by_vpid(pid) : current;
+ task = find_task_by_vpid(pid);
if (!task) {
rcu_read_unlock();
- return -ESRCH;
+ return ERR_PTR(-ESRCH);
}
get_task_struct(task);
@@ -1905,22 +2376,47 @@ static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
*/
if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
rcu_read_unlock();
- err = -EPERM;
+ mm = ERR_PTR(-EPERM);
goto out;
}
rcu_read_unlock();
- err = security_task_movememory(task);
- if (err)
+ mm = ERR_PTR(security_task_movememory(task));
+ if (IS_ERR(mm))
goto out;
-
- task_nodes = cpuset_mems_allowed(task);
+ *mem_nodes = cpuset_mems_allowed(task);
mm = get_task_mm(task);
+out:
put_task_struct(task);
-
if (!mm)
+ mm = ERR_PTR(-EINVAL);
+ return mm;
+}
+
+/*
+ * Move a list of pages in the address space of the currently executing
+ * process.
+ */
+static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
+ const void __user * __user *pages,
+ const int __user *nodes,
+ int __user *status, int flags)
+{
+ struct mm_struct *mm;
+ int err;
+ nodemask_t task_nodes;
+
+ /* Check flags */
+ if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
return -EINVAL;
+ if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+ return -EPERM;
+
+ mm = find_mm_struct(pid, &task_nodes);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+
if (nodes)
err = do_pages_move(mm, task_nodes, nr_pages, pages,
nodes, status, flags);
@@ -1929,10 +2425,6 @@ static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
mmput(mm);
return err;
-
-out:
- put_task_struct(task);
- return err;
}
SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
@@ -1943,32 +2435,10 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
}
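/*
 * Not part of the patch: a hypothetical userspace sketch of the syscall above.
 * With nodes == NULL, move_pages(2) only reports, via the status array, the
 * node each page currently resides on, exercising the do_pages_stat() path.
 * Build with -lnuma for the libnuma syscall wrapper.
 */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	void *pages[1];
	int status[1] = { -1 };

	pages[0] = malloc(4096);
	if (!pages[0])
		return 1;
	*(volatile char *)pages[0] = 0;		/* fault the page in */

	if (move_pages(0 /* self */, 1, pages, NULL, status, 0))
		perror("move_pages");
	else
		printf("page resides on node %d\n", status[0]);

	free(pages[0]);
	return 0;
}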
-#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
- compat_uptr_t __user *, pages32,
- const int __user *, nodes,
- int __user *, status,
- int, flags)
-{
- const void __user * __user *pages;
- int i;
-
- pages = compat_alloc_user_space(nr_pages * sizeof(void *));
- for (i = 0; i < nr_pages; i++) {
- compat_uptr_t p;
-
- if (get_user(p, pages32 + i) ||
- put_user(compat_ptr(p), pages + i))
- return -EFAULT;
- }
- return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
-}
-#endif /* CONFIG_COMPAT */
-
#ifdef CONFIG_NUMA_BALANCING
/*
* Returns true if this is a safe migration target node for misplaced NUMA
- * pages. Currently it only checks the watermarks which crude
+ * pages. Currently it only checks the watermarks, which is crude.
*/
static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
unsigned long nr_migrate_pages)
@@ -1978,7 +2448,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
for (z = pgdat->nr_zones - 1; z >= 0; z--) {
struct zone *zone = pgdat->node_zones + z;
- if (!populated_zone(zone))
+ if (!managed_zone(zone))
continue;
/* Avoid waking kswapd by allocating pages_to_migrate pages. */
@@ -1992,49 +2462,53 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
return false;
}
-static struct page *alloc_misplaced_dst_page(struct page *page,
+static struct folio *alloc_misplaced_dst_folio(struct folio *src,
unsigned long data)
{
int nid = (int) data;
- struct page *newpage;
-
- newpage = __alloc_pages_node(nid,
- (GFP_HIGHUSER_MOVABLE |
- __GFP_THISNODE | __GFP_NOMEMALLOC |
- __GFP_NORETRY | __GFP_NOWARN) &
- ~__GFP_RECLAIM, 0);
+ int order = folio_order(src);
+ gfp_t gfp = __GFP_THISNODE;
- return newpage;
+ if (order > 0)
+ gfp |= GFP_TRANSHUGE_LIGHT;
+ else {
+ gfp |= GFP_HIGHUSER_MOVABLE | __GFP_NOMEMALLOC | __GFP_NORETRY |
+ __GFP_NOWARN;
+ gfp &= ~__GFP_RECLAIM;
+ }
+ return __folio_alloc_node(gfp, order, nid);
}
static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
- int page_lru;
+ int nr_pages = thp_nr_pages(page);
+ int order = compound_order(page);
- VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
+ VM_BUG_ON_PAGE(order && !PageTransHuge(page), page);
- /* Avoid migrating to a node that is nearly full */
- if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
+ /* Do not migrate THP mapped by multiple processes */
+ if (PageTransHuge(page) && total_mapcount(page) > 1)
return 0;
- if (isolate_lru_page(page))
- return 0;
+ /* Avoid migrating to a node that is nearly full */
+ if (!migrate_balanced_pgdat(pgdat, nr_pages)) {
+ int z;
- /*
- * migrate_misplaced_transhuge_page() skips page migration's usual
- * check on page_count(), so we must do it here, now that the page
- * has been isolated: a GUP pin, or any other pin, prevents migration.
- * The expected page count is 3: 1 for page's mapcount and 1 for the
- * caller's pin and 1 for the reference taken by isolate_lru_page().
- */
- if (PageTransHuge(page) && page_count(page) != 3) {
- putback_lru_page(page);
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING))
+ return 0;
+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+ if (managed_zone(pgdat->node_zones + z))
+ break;
+ }
+ wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE);
return 0;
}
- page_lru = page_is_file_lru(page);
- mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
- thp_nr_pages(page));
+ if (!isolate_lru_page(page))
+ return 0;
+
+ mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page),
+ nr_pages);
/*
* Isolating the page has taken another reference, so the
@@ -2045,12 +2519,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
return 1;
}
-bool pmd_trans_migrating(pmd_t pmd)
-{
- struct page *page = pmd_page(pmd);
- return PageLocked(page);
-}
-
/*
* Attempt to migrate a misplaced page to the specified destination
* node. Caller is expected to have an elevated reference count on
@@ -2062,7 +2530,9 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
pg_data_t *pgdat = NODE_DATA(node);
int isolated;
int nr_remaining;
+ unsigned int nr_succeeded;
LIST_HEAD(migratepages);
+ int nr_pages = thp_nr_pages(page);
/*
* Don't migrate file pages that are mapped in multiple processes
@@ -2084,1013 +2554,30 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
goto out;
list_add(&page->lru, &migratepages);
- nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
+ nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio,
NULL, node, MIGRATE_ASYNC,
- MR_NUMA_MISPLACED);
+ MR_NUMA_MISPLACED, &nr_succeeded);
if (nr_remaining) {
if (!list_empty(&migratepages)) {
list_del(&page->lru);
- dec_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_lru(page));
+ mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+ page_is_file_lru(page), -nr_pages);
putback_lru_page(page);
}
isolated = 0;
- } else
- count_vm_numa_event(NUMA_PAGE_MIGRATE);
- BUG_ON(!list_empty(&migratepages));
- return isolated;
-
-out:
- put_page(page);
- return 0;
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
-#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
-/*
- * Migrates a THP to a given target node. page must be locked and is unlocked
- * before returning.
- */
-int migrate_misplaced_transhuge_page(struct mm_struct *mm,
- struct vm_area_struct *vma,
- pmd_t *pmd, pmd_t entry,
- unsigned long address,
- struct page *page, int node)
-{
- spinlock_t *ptl;
- pg_data_t *pgdat = NODE_DATA(node);
- int isolated = 0;
- struct page *new_page = NULL;
- int page_lru = page_is_file_lru(page);
- unsigned long start = address & HPAGE_PMD_MASK;
-
- new_page = alloc_pages_node(node,
- (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
- HPAGE_PMD_ORDER);
- if (!new_page)
- goto out_fail;
- prep_transhuge_page(new_page);
-
- isolated = numamigrate_isolate_page(pgdat, page);
- if (!isolated) {
- put_page(new_page);
- goto out_fail;
}
-
- /* Prepare a page as a migration target */
- __SetPageLocked(new_page);
- if (PageSwapBacked(page))
- __SetPageSwapBacked(new_page);
-
- /* anon mapping, we can simply copy page->mapping to the new page: */
- new_page->mapping = page->mapping;
- new_page->index = page->index;
- /* flush the cache before copying using the kernel virtual address */
- flush_cache_range(vma, start, start + HPAGE_PMD_SIZE);
- migrate_page_copy(new_page, page);
- WARN_ON(PageLRU(new_page));
-
- /* Recheck the target PMD */
- ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
- spin_unlock(ptl);
-
- /* Reverse changes made by migrate_page_copy() */
- if (TestClearPageActive(new_page))
- SetPageActive(page);
- if (TestClearPageUnevictable(new_page))
- SetPageUnevictable(page);
-
- unlock_page(new_page);
- put_page(new_page); /* Free it */
-
- /* Retake the callers reference and putback on LRU */
- get_page(page);
- putback_lru_page(page);
- mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
-
- goto out_unlock;
+ if (nr_succeeded) {
+ count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
+ if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node))
+ mod_node_page_state(pgdat, PGPROMOTE_SUCCESS,
+ nr_succeeded);
}
-
- entry = mk_huge_pmd(new_page, vma->vm_page_prot);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-
- /*
- * Overwrite the old entry under pagetable lock and establish
- * the new PTE. Any parallel GUP will either observe the old
- * page blocking on the page lock, block on the page table
- * lock or observe the new page. The SetPageUptodate on the
- * new page and page_add_new_anon_rmap guarantee the copy is
- * visible before the pagetable update.
- */
- page_add_anon_rmap(new_page, vma, start, true);
- /*
- * At this point the pmd is numa/protnone (i.e. non present) and the TLB
- * has already been flushed globally. So no TLB can be currently
- * caching this non present pmd mapping. There's no need to clear the
- * pmd before doing set_pmd_at(), nor to flush the TLB after
- * set_pmd_at(). Clearing the pmd here would introduce a race
- * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
- * mmap_lock for reading. If the pmd is set to NULL at any given time,
- * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
- * pmd.
- */
- set_pmd_at(mm, start, pmd, entry);
- update_mmu_cache_pmd(vma, address, &entry);
-
- page_ref_unfreeze(page, 2);
- mlock_migrate_page(new_page, page);
- page_remove_rmap(page, true);
- set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
-
- spin_unlock(ptl);
-
- /* Take an "isolate" reference and put new page on the LRU. */
- get_page(new_page);
- putback_lru_page(new_page);
-
- unlock_page(new_page);
- unlock_page(page);
- put_page(page); /* Drop the rmap reference */
- put_page(page); /* Drop the LRU isolation reference */
-
- count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
- count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
-
- mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_lru,
- -HPAGE_PMD_NR);
+ BUG_ON(!list_empty(&migratepages));
return isolated;
-out_fail:
- count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
- ptl = pmd_lock(mm, pmd);
- if (pmd_same(*pmd, entry)) {
- entry = pmd_modify(entry, vma->vm_page_prot);
- set_pmd_at(mm, start, pmd, entry);
- update_mmu_cache_pmd(vma, address, &entry);
- }
- spin_unlock(ptl);
-
-out_unlock:
- unlock_page(page);
+out:
put_page(page);
return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
-
#endif /* CONFIG_NUMA */
-
-#ifdef CONFIG_DEVICE_PRIVATE
-static int migrate_vma_collect_hole(unsigned long start,
- unsigned long end,
- __always_unused int depth,
- struct mm_walk *walk)
-{
- struct migrate_vma *migrate = walk->private;
- unsigned long addr;
-
- /* Only allow populating anonymous memory. */
- if (!vma_is_anonymous(walk->vma)) {
- for (addr = start; addr < end; addr += PAGE_SIZE) {
- migrate->src[migrate->npages] = 0;
- migrate->dst[migrate->npages] = 0;
- migrate->npages++;
- }
- return 0;
- }
-
- for (addr = start; addr < end; addr += PAGE_SIZE) {
- migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
- migrate->dst[migrate->npages] = 0;
- migrate->npages++;
- migrate->cpages++;
- }
-
- return 0;
-}
-
-static int migrate_vma_collect_skip(unsigned long start,
- unsigned long end,
- struct mm_walk *walk)
-{
- struct migrate_vma *migrate = walk->private;
- unsigned long addr;
-
- for (addr = start; addr < end; addr += PAGE_SIZE) {
- migrate->dst[migrate->npages] = 0;
- migrate->src[migrate->npages++] = 0;
- }
-
- return 0;
-}
-
-static int migrate_vma_collect_pmd(pmd_t *pmdp,
- unsigned long start,
- unsigned long end,
- struct mm_walk *walk)
-{
- struct migrate_vma *migrate = walk->private;
- struct vm_area_struct *vma = walk->vma;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long addr = start, unmapped = 0;
- spinlock_t *ptl;
- pte_t *ptep;
-
-again:
- if (pmd_none(*pmdp))
- return migrate_vma_collect_hole(start, end, -1, walk);
-
- if (pmd_trans_huge(*pmdp)) {
- struct page *page;
-
- ptl = pmd_lock(mm, pmdp);
- if (unlikely(!pmd_trans_huge(*pmdp))) {
- spin_unlock(ptl);
- goto again;
- }
-
- page = pmd_page(*pmdp);
- if (is_huge_zero_page(page)) {
- spin_unlock(ptl);
- split_huge_pmd(vma, pmdp, addr);
- if (pmd_trans_unstable(pmdp))
- return migrate_vma_collect_skip(start, end,
- walk);
- } else {
- int ret;
-
- get_page(page);
- spin_unlock(ptl);
- if (unlikely(!trylock_page(page)))
- return migrate_vma_collect_skip(start, end,
- walk);
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
- if (ret)
- return migrate_vma_collect_skip(start, end,
- walk);
- if (pmd_none(*pmdp))
- return migrate_vma_collect_hole(start, end, -1,
- walk);
- }
- }
-
- if (unlikely(pmd_bad(*pmdp)))
- return migrate_vma_collect_skip(start, end, walk);
-
- ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
- arch_enter_lazy_mmu_mode();
-
- for (; addr < end; addr += PAGE_SIZE, ptep++) {
- unsigned long mpfn = 0, pfn;
- struct page *page;
- swp_entry_t entry;
- pte_t pte;
-
- pte = *ptep;
-
- if (pte_none(pte)) {
- if (vma_is_anonymous(vma)) {
- mpfn = MIGRATE_PFN_MIGRATE;
- migrate->cpages++;
- }
- goto next;
- }
-
- if (!pte_present(pte)) {
- /*
- * Only care about unaddressable device page special
- * page table entry. Other special swap entries are not
- * migratable, and we ignore regular swapped page.
- */
- entry = pte_to_swp_entry(pte);
- if (!is_device_private_entry(entry))
- goto next;
-
- page = device_private_entry_to_page(entry);
- if (!(migrate->flags &
- MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
- page->pgmap->owner != migrate->pgmap_owner)
- goto next;
-
- mpfn = migrate_pfn(page_to_pfn(page)) |
- MIGRATE_PFN_MIGRATE;
- if (is_write_device_private_entry(entry))
- mpfn |= MIGRATE_PFN_WRITE;
- } else {
- if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
- goto next;
- pfn = pte_pfn(pte);
- if (is_zero_pfn(pfn)) {
- mpfn = MIGRATE_PFN_MIGRATE;
- migrate->cpages++;
- goto next;
- }
- page = vm_normal_page(migrate->vma, addr, pte);
- mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
- mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
- }
-
- /* FIXME support THP */
- if (!page || !page->mapping || PageTransCompound(page)) {
- mpfn = 0;
- goto next;
- }
-
- /*
- * By getting a reference on the page we pin it and that blocks
- * any kind of migration. Side effect is that it "freezes" the
- * pte.
- *
- * We drop this reference after isolating the page from the lru
- * for non device page (device page are not on the lru and thus
- * can't be dropped from it).
- */
- get_page(page);
- migrate->cpages++;
-
- /*
- * Optimize for the common case where page is only mapped once
- * in one process. If we can lock the page, then we can safely
- * set up a special migration page table entry now.
- */
- if (trylock_page(page)) {
- pte_t swp_pte;
-
- mpfn |= MIGRATE_PFN_LOCKED;
- ptep_get_and_clear(mm, addr, ptep);
-
- /* Setup special migration page table entry */
- entry = make_migration_entry(page, mpfn &
- MIGRATE_PFN_WRITE);
- swp_pte = swp_entry_to_pte(entry);
- if (pte_present(pte)) {
- if (pte_soft_dirty(pte))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pte))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
- } else {
- if (pte_swp_soft_dirty(pte))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_swp_uffd_wp(pte))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
- }
- set_pte_at(mm, addr, ptep, swp_pte);
-
- /*
- * This is like regular unmap: we remove the rmap and
- * drop page refcount. Page won't be freed, as we took
- * a reference just above.
- */
- page_remove_rmap(page, false);
- put_page(page);
-
- if (pte_present(pte))
- unmapped++;
- }
-
-next:
- migrate->dst[migrate->npages] = 0;
- migrate->src[migrate->npages++] = mpfn;
- }
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(ptep - 1, ptl);
-
- /* Only flush the TLB if we actually modified any entries */
- if (unmapped)
- flush_tlb_range(walk->vma, start, end);
-
- return 0;
-}
-
-static const struct mm_walk_ops migrate_vma_walk_ops = {
- .pmd_entry = migrate_vma_collect_pmd,
- .pte_hole = migrate_vma_collect_hole,
-};
-
-/*
- * migrate_vma_collect() - collect pages over a range of virtual addresses
- * @migrate: migrate struct containing all migration information
- *
- * This will walk the CPU page table. For each virtual address backed by a
- * valid page, it updates the src array and takes a reference on the page, in
- * order to pin the page until we lock it and unmap it.
- */
-static void migrate_vma_collect(struct migrate_vma *migrate)
-{
- struct mmu_notifier_range range;
-
- /*
- * Note that the pgmap_owner is passed to the mmu notifier callback so
- * that the registered device driver can skip invalidating device
- * private page mappings that won't be migrated.
- */
- mmu_notifier_range_init_migrate(&range, 0, migrate->vma,
- migrate->vma->vm_mm, migrate->start, migrate->end,
- migrate->pgmap_owner);
- mmu_notifier_invalidate_range_start(&range);
-
- walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
- &migrate_vma_walk_ops, migrate);
-
- mmu_notifier_invalidate_range_end(&range);
- migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
-}
-
-/*
- * migrate_vma_check_page() - check if page is pinned or not
- * @page: struct page to check
- *
- * Pinned pages cannot be migrated. This is the same test as in
- * migrate_page_move_mapping(), except that here we allow migration of a
- * ZONE_DEVICE page.
- */
-static bool migrate_vma_check_page(struct page *page)
-{
- /*
- * One extra ref because caller holds an extra reference, either from
- * isolate_lru_page() for a regular page, or migrate_vma_collect() for
- * a device page.
- */
- int extra = 1;
-
- /*
- * FIXME support THP (transparent huge page), it is bit more complex to
- * check them than regular pages, because they can be mapped with a pmd
- * or with a pte (split pte mapping).
- */
- if (PageCompound(page))
- return false;
-
- /* Page from ZONE_DEVICE have one extra reference */
- if (is_zone_device_page(page)) {
- /*
- * Private page can never be pin as they have no valid pte and
- * GUP will fail for those. Yet if there is a pending migration
- * a thread might try to wait on the pte migration entry and
- * will bump the page reference count. Sadly there is no way to
- * differentiate a regular pin from migration wait. Hence to
- * avoid 2 racing thread trying to migrate back to CPU to enter
- * infinite loop (one stoping migration because the other is
- * waiting on pte migration entry). We always return true here.
- *
- * FIXME proper solution is to rework migration_entry_wait() so
- * it does not need to take a reference on page.
- */
- return is_device_private_page(page);
- }
-
- /* For file-backed pages */
- if (page_mapping(page))
- extra += 1 + page_has_private(page);
-
- if ((page_count(page) - extra) > page_mapcount(page))
- return false;
-
- return true;
-}
-
-/*
- * migrate_vma_prepare() - lock pages and isolate them from the lru
- * @migrate: migrate struct containing all migration information
- *
- * This locks pages that have been collected by migrate_vma_collect(). Once each
- * page is locked it is isolated from the lru (for non-device pages). Finally,
- * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
- * migrated by concurrent kernel threads.
- */
-static void migrate_vma_prepare(struct migrate_vma *migrate)
-{
- const unsigned long npages = migrate->npages;
- const unsigned long start = migrate->start;
- unsigned long addr, i, restore = 0;
- bool allow_drain = true;
-
- lru_add_drain();
-
- for (i = 0; (i < npages) && migrate->cpages; i++) {
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
- bool remap = true;
-
- if (!page)
- continue;
-
- if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
- /*
- * Because we are migrating several pages there can be
- * a deadlock between two concurrent migrations where each
- * is waiting on the other's page lock.
- *
- * Make migrate_vma() a best-effort operation and back off
- * for any page we cannot lock right away.
- */
- if (!trylock_page(page)) {
- migrate->src[i] = 0;
- migrate->cpages--;
- put_page(page);
- continue;
- }
- remap = false;
- migrate->src[i] |= MIGRATE_PFN_LOCKED;
- }
-
- /* ZONE_DEVICE pages are not on LRU */
- if (!is_zone_device_page(page)) {
- if (!PageLRU(page) && allow_drain) {
- /* Drain CPU's pagevec */
- lru_add_drain_all();
- allow_drain = false;
- }
-
- if (isolate_lru_page(page)) {
- if (remap) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- migrate->cpages--;
- restore++;
- } else {
- migrate->src[i] = 0;
- unlock_page(page);
- migrate->cpages--;
- put_page(page);
- }
- continue;
- }
-
- /* Drop the reference we took in collect */
- put_page(page);
- }
-
- if (!migrate_vma_check_page(page)) {
- if (remap) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- migrate->cpages--;
- restore++;
-
- if (!is_zone_device_page(page)) {
- get_page(page);
- putback_lru_page(page);
- }
- } else {
- migrate->src[i] = 0;
- unlock_page(page);
- migrate->cpages--;
-
- if (!is_zone_device_page(page))
- putback_lru_page(page);
- else
- put_page(page);
- }
- }
- }
-
- for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
-
- if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
- continue;
-
- remove_migration_pte(page, migrate->vma, addr, page);
-
- migrate->src[i] = 0;
- unlock_page(page);
- put_page(page);
- restore--;
- }
-}
-
-/*
- * migrate_vma_unmap() - replace page mapping with special migration pte entry
- * @migrate: migrate struct containing all migration information
- *
- * Replace page mapping (CPU page table pte) with a special migration pte entry
- * and check again if it has been pinned. Pinned pages are restored because we
- * cannot migrate them.
- *
- * This is the last step before we call the device driver callback to allocate
- * destination memory and copy contents of original page over to new page.
- */
-static void migrate_vma_unmap(struct migrate_vma *migrate)
-{
- int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
- const unsigned long npages = migrate->npages;
- const unsigned long start = migrate->start;
- unsigned long addr, i, restore = 0;
-
- for (i = 0; i < npages; i++) {
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
-
- if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
- continue;
-
- if (page_mapped(page)) {
- try_to_unmap(page, flags);
- if (page_mapped(page))
- goto restore;
- }
-
- if (migrate_vma_check_page(page))
- continue;
-
-restore:
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- migrate->cpages--;
- restore++;
- }
-
- for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
-
- if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
- continue;
-
- remove_migration_ptes(page, page, false);
-
- migrate->src[i] = 0;
- unlock_page(page);
- restore--;
-
- if (is_zone_device_page(page))
- put_page(page);
- else
- putback_lru_page(page);
- }
-}
-
-/**
- * migrate_vma_setup() - prepare to migrate a range of memory
- * @args: contains the vma, start, and pfns arrays for the migration
- *
- * Returns: negative errno on failures, 0 when 0 or more pages were migrated
- * without an error.
- *
- * Prepare to migrate a virtual address range by collecting all the pages
- * backing each virtual address in the range, saving them inside the src
- * array. Then lock those pages and unmap them. Once the pages are locked
- * and unmapped, check whether each page is pinned or not. Pages that aren't
- * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
- * corresponding src array entry. It then restores any pages that are pinned,
- * by remapping and unlocking those pages.
- *
- * The caller should then allocate destination memory and copy source memory to
- * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
- * flag set). Once these are allocated and copied, the caller must update each
- * corresponding entry in the dst array with the pfn value of the destination
- * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set
- * (destination pages must have their struct pages locked, via lock_page()).
- *
- * Note that the caller does not have to migrate all the pages that are marked
- * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
- * device memory to system memory. If the caller cannot migrate a device page
- * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
- * consequences for the userspace process, so it must be avoided if at all
- * possible.
- *
- * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
- * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array, thus
- * allowing the caller to allocate device memory for those unbacked virtual
- * addresses. For this the caller simply has to allocate device memory and
- * properly set the destination entry like for regular migration. Note that
- * this can still fail, and thus the device driver must check if the
- * migration was successful for those entries after calling migrate_vma_pages(),
- * just like for regular migration.
- *
- * After that, the callers must call migrate_vma_pages() to go over each entry
- * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
- * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
- * then migrate_vma_pages() migrates struct page information from the source
- * struct page to the destination struct page. If it fails to migrate the
- * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
- * src array.
- *
- * At this point all successfully migrated pages have an entry in the src
- * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
- * array entry with MIGRATE_PFN_VALID flag set.
- *
- * Once migrate_vma_pages() returns the caller may inspect which pages were
- * successfully migrated, and which were not. Successfully migrated pages will
- * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
- *
- * It is safe to update device page table after migrate_vma_pages() because
- * both destination and source page are still locked, and the mmap_lock is held
- * in read mode (hence no one can unmap the range being migrated).
- *
- * Once the caller is done cleaning up things and updating its page table (if it
- * chose to do so, this is not an obligation) it finally calls
- * migrate_vma_finalize() to update the CPU page table to point to new pages
- * for successfully migrated pages or otherwise restore the CPU page table to
- * point to the original source pages.
- */
-int migrate_vma_setup(struct migrate_vma *args)
-{
- long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
-
- args->start &= PAGE_MASK;
- args->end &= PAGE_MASK;
- if (!args->vma || is_vm_hugetlb_page(args->vma) ||
- (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
- return -EINVAL;
- if (nr_pages <= 0)
- return -EINVAL;
- if (args->start < args->vma->vm_start ||
- args->start >= args->vma->vm_end)
- return -EINVAL;
- if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
- return -EINVAL;
- if (!args->src || !args->dst)
- return -EINVAL;
-
- memset(args->src, 0, sizeof(*args->src) * nr_pages);
- args->cpages = 0;
- args->npages = 0;
-
- migrate_vma_collect(args);
-
- if (args->cpages)
- migrate_vma_prepare(args);
- if (args->cpages)
- migrate_vma_unmap(args);
-
- /*
- * At this point pages are locked and unmapped, and thus they have
- * stable content and can safely be copied to destination memory that
- * is allocated by the drivers.
- */
- return 0;
-
-}
-EXPORT_SYMBOL(migrate_vma_setup);
-
-/*
- * This code closely matches the code in:
- * __handle_mm_fault()
- * handle_pte_fault()
- * do_anonymous_page()
- * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
- * private page.
- */
-static void migrate_vma_insert_page(struct migrate_vma *migrate,
- unsigned long addr,
- struct page *page,
- unsigned long *src,
- unsigned long *dst)
-{
- struct vm_area_struct *vma = migrate->vma;
- struct mm_struct *mm = vma->vm_mm;
- bool flush = false;
- spinlock_t *ptl;
- pte_t entry;
- pgd_t *pgdp;
- p4d_t *p4dp;
- pud_t *pudp;
- pmd_t *pmdp;
- pte_t *ptep;
-
- /* Only allow populating anonymous memory */
- if (!vma_is_anonymous(vma))
- goto abort;
-
- pgdp = pgd_offset(mm, addr);
- p4dp = p4d_alloc(mm, pgdp, addr);
- if (!p4dp)
- goto abort;
- pudp = pud_alloc(mm, p4dp, addr);
- if (!pudp)
- goto abort;
- pmdp = pmd_alloc(mm, pudp, addr);
- if (!pmdp)
- goto abort;
-
- if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
- goto abort;
-
- /*
- * Use pte_alloc() instead of pte_alloc_map(). We can't run
- * pte_offset_map() on pmds where a huge pmd might be created
- * from a different thread.
- *
- * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
- * parallel threads are excluded by other means.
- *
- * Here we only have mmap_read_lock(mm).
- */
- if (pte_alloc(mm, pmdp))
- goto abort;
-
- /* See the comment in pte_alloc_one_map() */
- if (unlikely(pmd_trans_unstable(pmdp)))
- goto abort;
-
- if (unlikely(anon_vma_prepare(vma)))
- goto abort;
- if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
- goto abort;
-
- /*
- * The memory barrier inside __SetPageUptodate makes sure that
- * preceding stores to the page contents become visible before
- * the set_pte_at() write.
- */
- __SetPageUptodate(page);
-
- if (is_zone_device_page(page)) {
- if (is_device_private_page(page)) {
- swp_entry_t swp_entry;
-
- swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
- entry = swp_entry_to_pte(swp_entry);
- }
- } else {
- entry = mk_pte(page, vma->vm_page_prot);
- if (vma->vm_flags & VM_WRITE)
- entry = pte_mkwrite(pte_mkdirty(entry));
- }
-
- ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
-
- if (check_stable_address_space(mm))
- goto unlock_abort;
-
- if (pte_present(*ptep)) {
- unsigned long pfn = pte_pfn(*ptep);
-
- if (!is_zero_pfn(pfn))
- goto unlock_abort;
- flush = true;
- } else if (!pte_none(*ptep))
- goto unlock_abort;
-
- /*
- * Check for userfaultfd but do not deliver the fault. Instead,
- * just back off.
- */
- if (userfaultfd_missing(vma))
- goto unlock_abort;
-
- inc_mm_counter(mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, addr, false);
- if (!is_zone_device_page(page))
- lru_cache_add_inactive_or_unevictable(page, vma);
- get_page(page);
-
- if (flush) {
- flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush_notify(vma, addr, ptep);
- set_pte_at_notify(mm, addr, ptep, entry);
- update_mmu_cache(vma, addr, ptep);
- } else {
- /* No need to invalidate - it was non-present before */
- set_pte_at(mm, addr, ptep, entry);
- update_mmu_cache(vma, addr, ptep);
- }
-
- pte_unmap_unlock(ptep, ptl);
- *src = MIGRATE_PFN_MIGRATE;
- return;
-
-unlock_abort:
- pte_unmap_unlock(ptep, ptl);
-abort:
- *src &= ~MIGRATE_PFN_MIGRATE;
-}
-
-/**
- * migrate_vma_pages() - migrate meta-data from src page to dst page
- * @migrate: migrate struct containing all migration information
- *
- * This migrates struct page meta-data from source struct page to destination
- * struct page. This effectively finishes the migration from source page to the
- * destination page.
- */
-void migrate_vma_pages(struct migrate_vma *migrate)
-{
- const unsigned long npages = migrate->npages;
- const unsigned long start = migrate->start;
- struct mmu_notifier_range range;
- unsigned long addr, i;
- bool notified = false;
-
- for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
- struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
- struct address_space *mapping;
- int r;
-
- if (!newpage) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
- }
-
- if (!page) {
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
- continue;
- if (!notified) {
- notified = true;
-
- mmu_notifier_range_init(&range,
- MMU_NOTIFY_CLEAR, 0,
- NULL,
- migrate->vma->vm_mm,
- addr, migrate->end);
- mmu_notifier_invalidate_range_start(&range);
- }
- migrate_vma_insert_page(migrate, addr, newpage,
- &migrate->src[i],
- &migrate->dst[i]);
- continue;
- }
-
- mapping = page_mapping(page);
-
- if (is_zone_device_page(newpage)) {
- if (is_device_private_page(newpage)) {
- /*
- * For now we only support private anonymous memory
- * when migrating to un-addressable device memory.
- */
- if (mapping) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
- }
- } else {
- /*
- * Other types of ZONE_DEVICE page are not
- * supported.
- */
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
- }
- }
-
- r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
- if (r != MIGRATEPAGE_SUCCESS)
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- }
-
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
- * did already call it.
- */
- if (notified)
- mmu_notifier_invalidate_range_only_end(&range);
-}
-EXPORT_SYMBOL(migrate_vma_pages);
-
-/**
- * migrate_vma_finalize() - restore CPU page table entry
- * @migrate: migrate struct containing all migration information
- *
- * This replaces the special migration pte entry with either a mapping to the
- * new page if migration was successful for that page, or to the original page
- * otherwise.
- *
- * This also unlocks the pages and puts them back on the lru, or drops the extra
- * refcount, for device pages.
- */
-void migrate_vma_finalize(struct migrate_vma *migrate)
-{
- const unsigned long npages = migrate->npages;
- unsigned long i;
-
- for (i = 0; i < npages; i++) {
- struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
-
- if (!page) {
- if (newpage) {
- unlock_page(newpage);
- put_page(newpage);
- }
- continue;
- }
-
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
- if (newpage) {
- unlock_page(newpage);
- put_page(newpage);
- }
- newpage = page;
- }
-
- remove_migration_ptes(page, newpage, false);
- unlock_page(page);
-
- if (is_zone_device_page(page))
- put_page(page);
- else
- putback_lru_page(page);
-
- if (newpage != page) {
- unlock_page(newpage);
- if (is_zone_device_page(newpage))
- put_page(newpage);
- else
- putback_lru_page(newpage);
- }
- }
-}
-EXPORT_SYMBOL(migrate_vma_finalize);
-#endif /* CONFIG_DEVICE_PRIVATE */
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
new file mode 100644
index 000000000000..d5f492356e3e
--- /dev/null
+++ b/mm/migrate_device.c
@@ -0,0 +1,956 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device Memory Migration functionality.
+ *
+ * Originally written by Jérôme Glisse.
+ */
+#include <linux/export.h>
+#include <linux/memremap.h>
+#include <linux/migrate.h>
+#include <linux/mm.h>
+#include <linux/mm_inline.h>
+#include <linux/mmu_notifier.h>
+#include <linux/oom.h>
+#include <linux/pagewalk.h>
+#include <linux/rmap.h>
+#include <linux/swapops.h>
+#include <asm/tlbflush.h>
+#include "internal.h"
+
+static int migrate_vma_collect_skip(unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ unsigned long addr;
+
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ migrate->dst[migrate->npages] = 0;
+ migrate->src[migrate->npages++] = 0;
+ }
+
+ return 0;
+}
+
+static int migrate_vma_collect_hole(unsigned long start,
+ unsigned long end,
+ __always_unused int depth,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ unsigned long addr;
+
+ /* Only allow populating anonymous memory. */
+ if (!vma_is_anonymous(walk->vma))
+ return migrate_vma_collect_skip(start, end, walk);
+
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
+ migrate->dst[migrate->npages] = 0;
+ migrate->npages++;
+ migrate->cpages++;
+ }
+
+ return 0;
+}
+
+static int migrate_vma_collect_pmd(pmd_t *pmdp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr = start, unmapped = 0;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+again:
+ if (pmd_none(*pmdp))
+ return migrate_vma_collect_hole(start, end, -1, walk);
+
+ if (pmd_trans_huge(*pmdp)) {
+ struct page *page;
+
+ ptl = pmd_lock(mm, pmdp);
+ if (unlikely(!pmd_trans_huge(*pmdp))) {
+ spin_unlock(ptl);
+ goto again;
+ }
+
+ page = pmd_page(*pmdp);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmdp, addr);
+ } else {
+ int ret;
+
+ get_page(page);
+ spin_unlock(ptl);
+ if (unlikely(!trylock_page(page)))
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (ret)
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ }
+ }
+
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (!ptep)
+ goto again;
+ arch_enter_lazy_mmu_mode();
+
+ for (; addr < end; addr += PAGE_SIZE, ptep++) {
+ unsigned long mpfn = 0, pfn;
+ struct page *page;
+ swp_entry_t entry;
+ pte_t pte;
+
+ pte = ptep_get(ptep);
+
+ if (pte_none(pte)) {
+ if (vma_is_anonymous(vma)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ }
+ goto next;
+ }
+
+ if (!pte_present(pte)) {
+ /*
+ * Only care about unaddressable device page special
+ * page table entries. Other special swap entries are not
+ * migratable, and we ignore regular swapped pages.
+ */
+ entry = pte_to_swp_entry(pte);
+ if (!is_device_private_entry(entry))
+ goto next;
+
+ page = pfn_swap_entry_to_page(entry);
+ if (!(migrate->flags &
+ MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
+ page->pgmap->owner != migrate->pgmap_owner)
+ goto next;
+
+ mpfn = migrate_pfn(page_to_pfn(page)) |
+ MIGRATE_PFN_MIGRATE;
+ if (is_writable_device_private_entry(entry))
+ mpfn |= MIGRATE_PFN_WRITE;
+ } else {
+ pfn = pte_pfn(pte);
+ if (is_zero_pfn(pfn) &&
+ (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ goto next;
+ }
+ page = vm_normal_page(migrate->vma, addr, pte);
+ if (page && !is_zone_device_page(page) &&
+ !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
+ goto next;
+ else if (page && is_device_coherent_page(page) &&
+ (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
+ page->pgmap->owner != migrate->pgmap_owner))
+ goto next;
+ mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+ mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
+ }
+
+ /* FIXME support THP */
+ if (!page || !page->mapping || PageTransCompound(page)) {
+ mpfn = 0;
+ goto next;
+ }
+
+ /*
+ * By getting a reference on the page we pin it and that blocks
+ * any kind of migration. A side effect is that it "freezes" the
+ * pte.
+ *
+ * We drop this reference after isolating the page from the lru
+ * for non-device pages (device pages are not on the lru and thus
+ * can't be dropped from it).
+ */
+ get_page(page);
+
+ /*
+ * We rely on trylock_page() to avoid deadlock between
+ * concurrent migrations where each is waiting on the other's
+ * page lock. If we can't immediately lock the page we fail this
+ * migration as it is only best effort anyway.
+ *
+ * If we can lock the page it's safe to set up a migration entry
+ * now. In the common case where the page is mapped once in a
+ * single process setting up the migration entry now is an
+ * optimisation to avoid walking the rmap later with
+ * try_to_migrate().
+ */
+ if (trylock_page(page)) {
+ bool anon_exclusive;
+ pte_t swp_pte;
+
+ flush_cache_page(vma, addr, pte_pfn(pte));
+ anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
+ if (anon_exclusive) {
+ pte = ptep_clear_flush(vma, addr, ptep);
+
+ if (page_try_share_anon_rmap(page)) {
+ set_pte_at(mm, addr, ptep, pte);
+ unlock_page(page);
+ put_page(page);
+ mpfn = 0;
+ goto next;
+ }
+ } else {
+ pte = ptep_get_and_clear(mm, addr, ptep);
+ }
+
+ migrate->cpages++;
+
+ /* Set the dirty flag on the folio now the pte is gone. */
+ if (pte_dirty(pte))
+ folio_mark_dirty(page_folio(page));
+
+ /* Setup special migration page table entry */
+ if (mpfn & MIGRATE_PFN_WRITE)
+ entry = make_writable_migration_entry(
+ page_to_pfn(page));
+ else if (anon_exclusive)
+ entry = make_readable_exclusive_migration_entry(
+ page_to_pfn(page));
+ else
+ entry = make_readable_migration_entry(
+ page_to_pfn(page));
+ if (pte_present(pte)) {
+ if (pte_young(pte))
+ entry = make_migration_entry_young(entry);
+ if (pte_dirty(pte))
+ entry = make_migration_entry_dirty(entry);
+ }
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_present(pte)) {
+ if (pte_soft_dirty(pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pte))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ } else {
+ if (pte_swp_soft_dirty(pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pte))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ }
+ set_pte_at(mm, addr, ptep, swp_pte);
+
+ /*
+ * This is like regular unmap: we remove the rmap and
+ * drop page refcount. Page won't be freed, as we took
+ * a reference just above.
+ */
+ page_remove_rmap(page, vma, false);
+ put_page(page);
+
+ if (pte_present(pte))
+ unmapped++;
+ } else {
+ put_page(page);
+ mpfn = 0;
+ }
+
+next:
+ migrate->dst[migrate->npages] = 0;
+ migrate->src[migrate->npages++] = mpfn;
+ }
+
+ /* Only flush the TLB if we actually modified any entries */
+ if (unmapped)
+ flush_tlb_range(walk->vma, start, end);
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep - 1, ptl);
+
+ return 0;
+}
+
+static const struct mm_walk_ops migrate_vma_walk_ops = {
+ .pmd_entry = migrate_vma_collect_pmd,
+ .pte_hole = migrate_vma_collect_hole,
+ .walk_lock = PGWALK_RDLOCK,
+};
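For reference, the src and dst arrays filled by the collect code above do not hold raw pfns or page pointers; each slot is an encoded value produced by migrate_pfn() with MIGRATE_PFN_* flag bits or'd in. A minimal sketch of the round trip, assuming the helpers and flag bits from include/linux/migrate.h; the function itself is purely illustrative:

/* Illustrative only: how one array slot round-trips. */
static void example_mpfn_round_trip(struct page *page)
{
	unsigned long mpfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE;

	/* migrate_pfn_to_page() undoes the encoding; it returns NULL for hole entries */
	WARN_ON(migrate_pfn_to_page(mpfn) != page);
	WARN_ON(!(mpfn & MIGRATE_PFN_MIGRATE));
}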
+
+/*
+ * migrate_vma_collect() - collect pages over a range of virtual addresses
+ * @migrate: migrate struct containing all migration information
+ *
+ * This will walk the CPU page table. For each virtual address backed by a
+ * valid page, it updates the src array and takes a reference on the page, in
+ * order to pin the page until we lock it and unmap it.
+ */
+static void migrate_vma_collect(struct migrate_vma *migrate)
+{
+ struct mmu_notifier_range range;
+
+ /*
+ * Note that the pgmap_owner is passed to the mmu notifier callback so
+ * that the registered device driver can skip invalidating device
+ * private page mappings that won't be migrated.
+ */
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
+ migrate->vma->vm_mm, migrate->start, migrate->end,
+ migrate->pgmap_owner);
+ mmu_notifier_invalidate_range_start(&range);
+
+ walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
+ &migrate_vma_walk_ops, migrate);
+
+ mmu_notifier_invalidate_range_end(&range);
+ migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
+}
+
+/*
+ * migrate_vma_check_page() - check if page is pinned or not
+ * @page: struct page to check
+ *
+ * Pinned pages cannot be migrated. This is the same test as in
+ * folio_migrate_mapping(), except that here we allow migration of a
+ * ZONE_DEVICE page.
+ */
+static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
+{
+ /*
+ * One extra ref because caller holds an extra reference, either from
+ * isolate_lru_page() for a regular page, or migrate_vma_collect() for
+ * a device page.
+ */
+ int extra = 1 + (page == fault_page);
+
+ /*
+ * FIXME support THP (transparent huge page), it is a bit more complex to
+ * check them than regular pages, because they can be mapped with a pmd
+ * or with a pte (split pte mapping).
+ */
+ if (PageCompound(page))
+ return false;
+
+ /* Pages from ZONE_DEVICE have one extra reference */
+ if (is_zone_device_page(page))
+ extra++;
+
+ /* For file-backed pages */
+ if (page_mapping(page))
+ extra += 1 + page_has_private(page);
+
+ if ((page_count(page) - extra) > page_mapcount(page))
+ return false;
+
+ return true;
+}
+
+/*
+ * Unmaps pages for migration. Returns number of source pfns marked as
+ * migrating.
+ */
+static unsigned long migrate_device_unmap(unsigned long *src_pfns,
+ unsigned long npages,
+ struct page *fault_page)
+{
+ unsigned long i, restore = 0;
+ bool allow_drain = true;
+ unsigned long unmapped = 0;
+
+ lru_add_drain();
+
+ for (i = 0; i < npages; i++) {
+ struct page *page = migrate_pfn_to_page(src_pfns[i]);
+ struct folio *folio;
+
+ if (!page) {
+ if (src_pfns[i] & MIGRATE_PFN_MIGRATE)
+ unmapped++;
+ continue;
+ }
+
+ /* ZONE_DEVICE pages are not on LRU */
+ if (!is_zone_device_page(page)) {
+ if (!PageLRU(page) && allow_drain) {
+ /* Drain CPU's lru cache */
+ lru_add_drain_all();
+ allow_drain = false;
+ }
+
+ if (!isolate_lru_page(page)) {
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+ restore++;
+ continue;
+ }
+
+ /* Drop the reference we took in collect */
+ put_page(page);
+ }
+
+ folio = page_folio(page);
+ if (folio_mapped(folio))
+ try_to_migrate(folio, 0);
+
+ if (page_mapped(page) ||
+ !migrate_vma_check_page(page, fault_page)) {
+ if (!is_zone_device_page(page)) {
+ get_page(page);
+ putback_lru_page(page);
+ }
+
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+ restore++;
+ continue;
+ }
+
+ unmapped++;
+ }
+
+ for (i = 0; i < npages && restore; i++) {
+ struct page *page = migrate_pfn_to_page(src_pfns[i]);
+ struct folio *folio;
+
+ if (!page || (src_pfns[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ folio = page_folio(page);
+ remove_migration_ptes(folio, folio, false);
+
+ src_pfns[i] = 0;
+ folio_unlock(folio);
+ folio_put(folio);
+ restore--;
+ }
+
+ return unmapped;
+}
+
+/*
+ * migrate_vma_unmap() - replace page mapping with special migration pte entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * Isolate pages from the LRU and replace mappings (CPU page table pte) with a
+ * special migration pte entry and check if it has been pinned. Pinned pages are
+ * restored because we cannot migrate them.
+ *
+ * This is the last step before we call the device driver callback to allocate
+ * destination memory and copy contents of original page over to new page.
+ */
+static void migrate_vma_unmap(struct migrate_vma *migrate)
+{
+ migrate->cpages = migrate_device_unmap(migrate->src, migrate->npages,
+ migrate->fault_page);
+}
+
+/**
+ * migrate_vma_setup() - prepare to migrate a range of memory
+ * @args: contains the vma, start, and pfns arrays for the migration
+ *
+ * Returns: negative errno on failures, 0 when 0 or more pages were migrated
+ * without an error.
+ *
+ * Prepare to migrate a virtual address range by collecting all the pages
+ * backing each virtual address in the range, saving them inside the src
+ * array. Then lock those pages and unmap them. Once the pages are locked
+ * and unmapped, check whether each page is pinned or not. Pages that aren't
+ * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
+ * corresponding src array entry. It then restores any pages that are pinned,
+ * by remapping and unlocking those pages.
+ *
+ * The caller should then allocate destination memory and copy source memory to
+ * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
+ * flag set). Once these are allocated and copied, the caller must update each
+ * corresponding entry in the dst array with the pfn value of the destination
+ * page and with MIGRATE_PFN_VALID. Destination pages must be locked via
+ * lock_page().
+ *
+ * Note that the caller does not have to migrate all the pages that are marked
+ * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
+ * device memory to system memory. If the caller cannot migrate a device page
+ * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
+ * consequences for the userspace process, so it must be avoided if at all
+ * possible.
+ *
+ * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
+ * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
+ * allowing the caller to allocate device memory for those unbacked virtual
+ * addresses. For this the caller simply has to allocate device memory and
+ * properly set the destination entry like for regular migration. Note that
+ * this can still fail, and thus inside the device driver you must check if the
+ * migration was successful for those entries after calling migrate_vma_pages(),
+ * just like for regular migration.
+ *
+ * After that, the callers must call migrate_vma_pages() to go over each entry
+ * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
+ * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
+ * then migrate_vma_pages() migrates struct page information from the source
+ * struct page to the destination struct page. If it fails to migrate the
+ * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
+ * src array.
+ *
+ * At this point all successfully migrated pages have an entry in the src
+ * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
+ * array entry with MIGRATE_PFN_VALID flag set.
+ *
+ * Once migrate_vma_pages() returns the caller may inspect which pages were
+ * successfully migrated, and which were not. Successfully migrated pages will
+ * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
+ *
+ * It is safe to update device page table after migrate_vma_pages() because
+ * both destination and source page are still locked, and the mmap_lock is held
+ * in read mode (hence no one can unmap the range being migrated).
+ *
+ * Once the caller is done cleaning up things and updating its page table (if it
+ * chose to do so, this is not an obligation) it finally calls
+ * migrate_vma_finalize() to update the CPU page table to point to new pages
+ * for successfully migrated pages or otherwise restore the CPU page table to
+ * point to the original source pages.
+ */
+int migrate_vma_setup(struct migrate_vma *args)
+{
+ long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
+
+ args->start &= PAGE_MASK;
+ args->end &= PAGE_MASK;
+ if (!args->vma || is_vm_hugetlb_page(args->vma) ||
+ (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
+ return -EINVAL;
+ if (nr_pages <= 0)
+ return -EINVAL;
+ if (args->start < args->vma->vm_start ||
+ args->start >= args->vma->vm_end)
+ return -EINVAL;
+ if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
+ return -EINVAL;
+ if (!args->src || !args->dst)
+ return -EINVAL;
+ if (args->fault_page && !is_device_private_page(args->fault_page))
+ return -EINVAL;
+
+ memset(args->src, 0, sizeof(*args->src) * nr_pages);
+ args->cpages = 0;
+ args->npages = 0;
+
+ migrate_vma_collect(args);
+
+ if (args->cpages)
+ migrate_vma_unmap(args);
+
+ /*
+ * At this point pages are locked and unmapped, and thus they have
+ * stable content and can safely be copied to destination memory that
+ * is allocated by the drivers.
+ */
+ return 0;
+
+}
+EXPORT_SYMBOL(migrate_vma_setup);
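To make the protocol described in the comment above concrete, here is a rough sketch of the driver side for a single page, assuming the caller already holds mmap_read_lock(). my_dev_alloc_page() and my_dev_copy_to_device() are hypothetical driver helpers, not part of this API, and error handling is trimmed; treat this as a sketch, not a reference implementation.

/* Illustrative sketch only; not taken from any in-tree driver. */
static int example_migrate_one_page_to_device(struct vm_area_struct *vma,
					      unsigned long addr, void *owner)
{
	unsigned long src = 0, dst = 0;
	struct migrate_vma args = {
		.vma		= vma,
		.start		= addr & PAGE_MASK,
		.end		= (addr & PAGE_MASK) + PAGE_SIZE,
		.src		= &src,
		.dst		= &dst,
		.pgmap_owner	= owner,
		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
	};
	int ret = migrate_vma_setup(&args);

	if (ret)
		return ret;
	if (src & MIGRATE_PFN_MIGRATE) {
		struct page *spage = migrate_pfn_to_page(src);
		struct page *dpage = my_dev_alloc_page();	/* hypothetical */

		if (dpage) {
			lock_page(dpage);	/* destination pages must be locked */
			if (spage)
				my_dev_copy_to_device(dpage, spage);	/* hypothetical */
			dst = migrate_pfn(page_to_pfn(dpage));
		}
	}
	migrate_vma_pages(&args);	/* moves struct page metadata */
	migrate_vma_finalize(&args);	/* repoints or restores the CPU ptes */
	return 0;
}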
+
+/*
+ * This code closely matches the code in:
+ * __handle_mm_fault()
+ * handle_pte_fault()
+ * do_anonymous_page()
+ * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
+ * private or coherent page.
+ */
+static void migrate_vma_insert_page(struct migrate_vma *migrate,
+ unsigned long addr,
+ struct page *page,
+ unsigned long *src)
+{
+ struct vm_area_struct *vma = migrate->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ bool flush = false;
+ spinlock_t *ptl;
+ pte_t entry;
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+ pte_t orig_pte;
+
+ /* Only allow populating anonymous memory */
+ if (!vma_is_anonymous(vma))
+ goto abort;
+
+ pgdp = pgd_offset(mm, addr);
+ p4dp = p4d_alloc(mm, pgdp, addr);
+ if (!p4dp)
+ goto abort;
+ pudp = pud_alloc(mm, p4dp, addr);
+ if (!pudp)
+ goto abort;
+ pmdp = pmd_alloc(mm, pudp, addr);
+ if (!pmdp)
+ goto abort;
+ if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
+ goto abort;
+ if (pte_alloc(mm, pmdp))
+ goto abort;
+ if (unlikely(anon_vma_prepare(vma)))
+ goto abort;
+ if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
+ goto abort;
+
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+
+ if (is_device_private_page(page)) {
+ swp_entry_t swp_entry;
+
+ if (vma->vm_flags & VM_WRITE)
+ swp_entry = make_writable_device_private_entry(
+ page_to_pfn(page));
+ else
+ swp_entry = make_readable_device_private_entry(
+ page_to_pfn(page));
+ entry = swp_entry_to_pte(swp_entry);
+ } else {
+ if (is_zone_device_page(page) &&
+ !is_device_coherent_page(page)) {
+ pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
+ goto abort;
+ }
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
+ }
+
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (!ptep)
+ goto abort;
+ orig_pte = ptep_get(ptep);
+
+ if (check_stable_address_space(mm))
+ goto unlock_abort;
+
+ if (pte_present(orig_pte)) {
+ unsigned long pfn = pte_pfn(orig_pte);
+
+ if (!is_zero_pfn(pfn))
+ goto unlock_abort;
+ flush = true;
+ } else if (!pte_none(orig_pte))
+ goto unlock_abort;
+
+ /*
+ * Check for userfaultfd but do not deliver the fault. Instead,
+ * just back off.
+ */
+ if (userfaultfd_missing(vma))
+ goto unlock_abort;
+
+ inc_mm_counter(mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, addr);
+ if (!is_zone_device_page(page))
+ lru_cache_add_inactive_or_unevictable(page, vma);
+ get_page(page);
+
+ if (flush) {
+ flush_cache_page(vma, addr, pte_pfn(orig_pte));
+ ptep_clear_flush_notify(vma, addr, ptep);
+ set_pte_at_notify(mm, addr, ptep, entry);
+ update_mmu_cache(vma, addr, ptep);
+ } else {
+ /* No need to invalidate - it was non-present before */
+ set_pte_at(mm, addr, ptep, entry);
+ update_mmu_cache(vma, addr, ptep);
+ }
+
+ pte_unmap_unlock(ptep, ptl);
+ *src = MIGRATE_PFN_MIGRATE;
+ return;
+
+unlock_abort:
+ pte_unmap_unlock(ptep, ptl);
+abort:
+ *src &= ~MIGRATE_PFN_MIGRATE;
+}
+
+static void __migrate_device_pages(unsigned long *src_pfns,
+ unsigned long *dst_pfns, unsigned long npages,
+ struct migrate_vma *migrate)
+{
+ struct mmu_notifier_range range;
+ unsigned long i;
+ bool notified = false;
+
+ for (i = 0; i < npages; i++) {
+ struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
+ struct page *page = migrate_pfn_to_page(src_pfns[i]);
+ struct address_space *mapping;
+ int r;
+
+ if (!newpage) {
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+
+ if (!page) {
+ unsigned long addr;
+
+ if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ /*
+ * The only time there is no vma is when called from
+ * migrate_device_coherent_page(). However this isn't
+ * called if the page could not be unmapped.
+ */
+ VM_BUG_ON(!migrate);
+ addr = migrate->start + i*PAGE_SIZE;
+ if (!notified) {
+ notified = true;
+
+ mmu_notifier_range_init_owner(&range,
+ MMU_NOTIFY_MIGRATE, 0,
+ migrate->vma->vm_mm, addr, migrate->end,
+ migrate->pgmap_owner);
+ mmu_notifier_invalidate_range_start(&range);
+ }
+ migrate_vma_insert_page(migrate, addr, newpage,
+ &src_pfns[i]);
+ continue;
+ }
+
+ mapping = page_mapping(page);
+
+ if (is_device_private_page(newpage) ||
+ is_device_coherent_page(newpage)) {
+ /*
+ * For now only support anonymous memory migrating to
+ * device private or coherent memory.
+ */
+ if (mapping) {
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+ } else if (is_zone_device_page(newpage)) {
+ /*
+ * Other types of ZONE_DEVICE page are not supported.
+ */
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+
+ if (migrate && migrate->fault_page == page)
+ r = migrate_folio_extra(mapping, page_folio(newpage),
+ page_folio(page),
+ MIGRATE_SYNC_NO_COPY, 1);
+ else
+ r = migrate_folio(mapping, page_folio(newpage),
+ page_folio(page), MIGRATE_SYNC_NO_COPY);
+ if (r != MIGRATEPAGE_SUCCESS)
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+ }
+
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback as
+ * the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
+ * did already call it.
+ */
+ if (notified)
+ mmu_notifier_invalidate_range_only_end(&range);
+}
+
+/**
+ * migrate_device_pages() - migrate meta-data from src page to dst page
+ * @src_pfns: src_pfns returned from migrate_device_range()
+ * @dst_pfns: array of pfns allocated by the driver to migrate memory to
+ * @npages: number of pages in the range
+ *
+ * Equivalent to migrate_vma_pages(). This is called to migrate struct page
+ * meta-data from source struct page to destination.
+ */
+void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
+ unsigned long npages)
+{
+ __migrate_device_pages(src_pfns, dst_pfns, npages, NULL);
+}
+EXPORT_SYMBOL(migrate_device_pages);
+
+/**
+ * migrate_vma_pages() - migrate meta-data from src page to dst page
+ * @migrate: migrate struct containing all migration information
+ *
+ * This migrates struct page meta-data from source struct page to destination
+ * struct page. This effectively finishes the migration from source page to the
+ * destination page.
+ */
+void migrate_vma_pages(struct migrate_vma *migrate)
+{
+ __migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate);
+}
+EXPORT_SYMBOL(migrate_vma_pages);
+
+/*
+ * migrate_device_finalize() - complete page migration
+ * @src_pfns: src_pfns returned from migrate_device_range()
+ * @dst_pfns: array of pfns allocated by the driver to migrate memory to
+ * @npages: number of pages in the range
+ *
+ * Completes migration of the page by removing special migration entries.
+ * Drivers must ensure copying of page data is complete and visible to the CPU
+ * before calling this.
+ */
+void migrate_device_finalize(unsigned long *src_pfns,
+ unsigned long *dst_pfns, unsigned long npages)
+{
+ unsigned long i;
+
+ for (i = 0; i < npages; i++) {
+ struct folio *dst, *src;
+ struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
+ struct page *page = migrate_pfn_to_page(src_pfns[i]);
+
+ if (!page) {
+ if (newpage) {
+ unlock_page(newpage);
+ put_page(newpage);
+ }
+ continue;
+ }
+
+ if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
+ if (newpage) {
+ unlock_page(newpage);
+ put_page(newpage);
+ }
+ newpage = page;
+ }
+
+ src = page_folio(page);
+ dst = page_folio(newpage);
+ remove_migration_ptes(src, dst, false);
+ folio_unlock(src);
+
+ if (is_zone_device_page(page))
+ put_page(page);
+ else
+ putback_lru_page(page);
+
+ if (newpage != page) {
+ unlock_page(newpage);
+ if (is_zone_device_page(newpage))
+ put_page(newpage);
+ else
+ putback_lru_page(newpage);
+ }
+ }
+}
+EXPORT_SYMBOL(migrate_device_finalize);
+
+/**
+ * migrate_vma_finalize() - restore CPU page table entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * This replaces the special migration pte entry with either a mapping to the
+ * new page if migration was successful for that page, or to the original page
+ * otherwise.
+ *
+ * This also unlocks the pages and puts them back on the lru, or drops the extra
+ * refcount, for device pages.
+ */
+void migrate_vma_finalize(struct migrate_vma *migrate)
+{
+ migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
+}
+EXPORT_SYMBOL(migrate_vma_finalize);
+
+/**
+ * migrate_device_range() - migrate device private pfns to normal memory.
+ * @src_pfns: array large enough to hold migrating source device private pfns.
+ * @start: starting pfn in the range to migrate.
+ * @npages: number of pages to migrate.
+ *
+ * migrate_device_range() is similar in concept to migrate_vma_setup(), except
+ * that instead of looking up pages based on virtual address mappings, a range
+ * of device pfns that should be migrated to system memory is used.
+ *
+ * This is useful when a driver needs to free device memory but doesn't know the
+ * virtual mappings of every page that may be in device memory. For example this
+ * is often the case when a driver is being unloaded or unbound from a device.
+ *
+ * Like migrate_vma_setup() this function will take a reference and lock any
+ * migrating pages that aren't free before unmapping them. Drivers may then
+ * allocate destination pages and start copying data from the device to CPU
+ * memory before calling migrate_device_pages().
+ */
+int migrate_device_range(unsigned long *src_pfns, unsigned long start,
+ unsigned long npages)
+{
+ unsigned long i, pfn;
+
+ for (pfn = start, i = 0; i < npages; pfn++, i++) {
+ struct page *page = pfn_to_page(pfn);
+
+ if (!get_page_unless_zero(page)) {
+ src_pfns[i] = 0;
+ continue;
+ }
+
+ if (!trylock_page(page)) {
+ src_pfns[i] = 0;
+ put_page(page);
+ continue;
+ }
+
+ src_pfns[i] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+ }
+
+ migrate_device_unmap(src_pfns, npages, NULL);
+
+ return 0;
+}
+EXPORT_SYMBOL(migrate_device_range);
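As a rough illustration of the teardown flow the comment above describes: my_pgmap_first_pfn(), my_pgmap_nr_pages() and my_dev_copy_from_device() are hypothetical driver helpers, error handling is trimmed, and the GFP choice is only one plausible option.

/* Illustrative sketch only; not taken from any in-tree driver. */
static void example_evict_all_device_memory(void)
{
	unsigned long npages = my_pgmap_nr_pages();	/* hypothetical */
	unsigned long *src, *dst;
	unsigned long i;

	src = kvcalloc(npages, sizeof(*src), GFP_KERNEL);
	dst = kvcalloc(npages, sizeof(*dst), GFP_KERNEL);
	if (!src || !dst)
		goto out;

	/* Reference, lock and unmap every device private page still in use */
	migrate_device_range(src, my_pgmap_first_pfn(), npages);	/* hypothetical pfn helper */

	/* Allocate system memory destinations for everything still migrating */
	for (i = 0; i < npages; i++) {
		struct page *dpage;

		if (!(src[i] & MIGRATE_PFN_MIGRATE))
			continue;
		dpage = alloc_page(GFP_HIGHUSER);
		if (!dpage)
			continue;
		lock_page(dpage);
		dst[i] = migrate_pfn(page_to_pfn(dpage));
	}

	migrate_device_pages(src, dst, npages);

	/* Copy data back before the migration entries are removed */
	for (i = 0; i < npages; i++)
		if ((src[i] & MIGRATE_PFN_MIGRATE) && dst[i])
			my_dev_copy_from_device(migrate_pfn_to_page(dst[i]),	/* hypothetical */
						migrate_pfn_to_page(src[i]));

	migrate_device_finalize(src, dst, npages);
out:
	kvfree(src);
	kvfree(dst);
}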
+
+/*
+ * Migrate a device coherent page back to normal memory. The caller should have
+ * a reference on the page, which will be copied to the new page if migration is
+ * successful or dropped on failure.
+ */
+int migrate_device_coherent_page(struct page *page)
+{
+ unsigned long src_pfn, dst_pfn = 0;
+ struct page *dpage;
+
+ WARN_ON_ONCE(PageCompound(page));
+
+ lock_page(page);
+ src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE;
+
+ /*
+ * We don't have a VMA and don't need to walk the page tables to find
+ * the source page. So call migrate_vma_unmap() directly to unmap the
+ * page as migrate_vma_setup() will fail if args.vma == NULL.
+ */
+ migrate_device_unmap(&src_pfn, 1, NULL);
+ if (!(src_pfn & MIGRATE_PFN_MIGRATE))
+ return -EBUSY;
+
+ dpage = alloc_page(GFP_USER | __GFP_NOWARN);
+ if (dpage) {
+ lock_page(dpage);
+ dst_pfn = migrate_pfn(page_to_pfn(dpage));
+ }
+
+ migrate_device_pages(&src_pfn, &dst_pfn, 1);
+ if (src_pfn & MIGRATE_PFN_MIGRATE)
+ copy_highpage(dpage, page);
+ migrate_device_finalize(&src_pfn, &dst_pfn, 1);
+
+ if (src_pfn & MIGRATE_PFN_MIGRATE)
+ return 0;
+ return -EBUSY;
+}
diff --git a/mm/mincore.c b/mm/mincore.c
index 02db1a834021..dad3622cc963 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -20,6 +20,7 @@
#include <linux/pgtable.h>
#include <linux/uaccess.h>
+#include "swap.h"
static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
unsigned long end, struct mm_walk *walk)
@@ -32,7 +33,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
* Hugepages under a user process are always in RAM and never
* swapped out, but theoretically it needs to be checked.
*/
- present = pte && !huge_pte_none(huge_ptep_get(pte));
+ present = pte && !huge_pte_none_mostly(huge_ptep_get(pte));
for (; addr != end; vec++, addr += PAGE_SIZE)
*vec = present;
walk->private = vec;
@@ -51,7 +52,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
static unsigned char mincore_page(struct address_space *mapping, pgoff_t index)
{
unsigned char present = 0;
- struct page *page;
+ struct folio *folio;
/*
* When tmpfs swaps out a page from a file, any process mapping that
@@ -59,10 +60,10 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index)
* any other file mapping (ie. marked !present and faulted in with
* tmpfs's .fault). So swapped out tmpfs mappings are tested here.
*/
- page = find_get_incore_page(mapping, index);
- if (page) {
- present = PageUptodate(page);
- put_page(page);
+ folio = filemap_get_incore_folio(mapping, index);
+ if (!IS_ERR(folio)) {
+ present = folio_test_uptodate(folio);
+ folio_put(folio);
}
return present;
@@ -112,16 +113,16 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
goto out;
}
- if (pmd_trans_unstable(pmd)) {
- __mincore_unmapped_range(addr, end, vma, vec);
- goto out;
- }
-
ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (!ptep) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
for (; addr != end; ptep++, addr += PAGE_SIZE) {
- pte_t pte = *ptep;
+ pte_t pte = ptep_get(ptep);
- if (pte_none(pte))
+ /* We need to do cache lookup too for pte markers */
+ if (pte_none_mostly(pte))
__mincore_unmapped_range(addr, addr + PAGE_SIZE,
vma, vec);
else if (pte_present(pte))
@@ -166,14 +167,16 @@ static inline bool can_do_mincore(struct vm_area_struct *vma)
* for writing; otherwise we'd be including shared non-exclusive
* mappings, which opens a side channel.
*/
- return inode_owner_or_capable(file_inode(vma->vm_file)) ||
- inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+ return inode_owner_or_capable(&nop_mnt_idmap,
+ file_inode(vma->vm_file)) ||
+ file_permission(vma->vm_file, MAY_WRITE) == 0;
}
static const struct mm_walk_ops mincore_walk_ops = {
.pmd_entry = mincore_pte_range,
.pte_hole = mincore_unmapped_range,
.hugetlb_entry = mincore_hugetlb,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
@@ -187,8 +190,8 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
unsigned long end;
int err;
- vma = find_vma(current->mm, addr);
- if (!vma || addr < vma->vm_start)
+ vma = vma_lookup(current->mm, addr);
+ if (!vma)
return -ENOMEM;
end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
if (!can_do_mincore(vma)) {
diff --git a/mm/mlock.c b/mm/mlock.c
index 884b1216da6a..479e09d0994c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -14,6 +14,7 @@
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
+#include <linux/pagewalk.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
@@ -23,9 +24,19 @@
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
+#include <linux/secretmem.h>
#include "internal.h"
+struct mlock_fbatch {
+ local_lock_t lock;
+ struct folio_batch fbatch;
+};
+
+static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
+
bool can_do_mlock(void)
{
if (rlimit(RLIMIT_MEMLOCK) != 0)
@@ -37,478 +48,354 @@ bool can_do_mlock(void)
EXPORT_SYMBOL(can_do_mlock);
/*
- * Mlocked pages are marked with PageMlocked() flag for efficient testing
+ * Mlocked folios are marked with the PG_mlocked flag for efficient testing
* in vmscan and, possibly, the fault path; and to support semi-accurate
* statistics.
*
- * An mlocked page [PageMlocked(page)] is unevictable. As such, it will
- * be placed on the LRU "unevictable" list, rather than the [in]active lists.
- * The unevictable list is an LRU sibling list to the [in]active lists.
- * PageUnevictable is set to indicate the unevictable state.
- *
- * When lazy mlocking via vmscan, it is important to ensure that the
- * vma's VM_LOCKED status is not concurrently being modified, otherwise we
- * may have mlocked a page that is being munlocked. So lazy mlock must take
- * the mmap_lock for read, and verify that the vma really is locked
- * (see mm/rmap.c).
+ * An mlocked folio [folio_test_mlocked(folio)] is unevictable. As such, it
+ * will be ostensibly placed on the LRU "unevictable" list (actually no such
+ * list exists), rather than the [in]active lists. PG_unevictable is set to
+ * indicate the unevictable state.
*/
-/*
- * LRU accounting for clear_page_mlock()
- */
-void clear_page_mlock(struct page *page)
+static struct lruvec *__mlock_folio(struct folio *folio, struct lruvec *lruvec)
{
- int nr_pages;
+ /* There is nothing more we can do while it's off LRU */
+ if (!folio_test_clear_lru(folio))
+ return lruvec;
- if (!TestClearPageMlocked(page))
- return;
+ lruvec = folio_lruvec_relock_irq(folio, lruvec);
- nr_pages = thp_nr_pages(page);
- mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
- count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
- /*
- * The previous TestClearPageMlocked() corresponds to the smp_mb()
- * in __pagevec_lru_add_fn().
- *
- * See __pagevec_lru_add_fn for more explanation.
- */
- if (!isolate_lru_page(page)) {
- putback_lru_page(page);
- } else {
+ if (unlikely(folio_evictable(folio))) {
/*
- * We lost the race. The page already moved to the evictable list.
+ * This is a little surprising, but quite possible: PG_mlocked
+ * must have got cleared already by another CPU. Could this
+ * folio be unevictable? I'm not sure, but move it now if so.
*/
- if (PageUnevictable(page))
- count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
+ if (folio_test_unevictable(folio)) {
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_unevictable(folio);
+ lruvec_add_folio(lruvec, folio);
+
+ __count_vm_events(UNEVICTABLE_PGRESCUED,
+ folio_nr_pages(folio));
+ }
+ goto out;
+ }
+
+ if (folio_test_unevictable(folio)) {
+ if (folio_test_mlocked(folio))
+ folio->mlock_count++;
+ goto out;
}
+
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_active(folio);
+ folio_set_unevictable(folio);
+ folio->mlock_count = !!folio_test_mlocked(folio);
+ lruvec_add_folio(lruvec, folio);
+ __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio));
+out:
+ folio_set_lru(folio);
+ return lruvec;
}
-/*
- * Mark page as mlocked if not already.
- * If page on LRU, isolate and putback to move to unevictable list.
- */
-void mlock_vma_page(struct page *page)
+static struct lruvec *__mlock_new_folio(struct folio *folio, struct lruvec *lruvec)
{
- /* Serialize with page migration */
- BUG_ON(!PageLocked(page));
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
+ lruvec = folio_lruvec_relock_irq(folio, lruvec);
- if (!TestSetPageMlocked(page)) {
- int nr_pages = thp_nr_pages(page);
+ /* As above, this is a little surprising, but possible */
+ if (unlikely(folio_evictable(folio)))
+ goto out;
- mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
- count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
- if (!isolate_lru_page(page))
- putback_lru_page(page);
- }
+ folio_set_unevictable(folio);
+ folio->mlock_count = !!folio_test_mlocked(folio);
+ __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio));
+out:
+ lruvec_add_folio(lruvec, folio);
+ folio_set_lru(folio);
+ return lruvec;
}
-/*
- * Isolate a page from LRU with optional get_page() pin.
- * Assumes lru_lock already held and page already pinned.
- */
-static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
+static struct lruvec *__munlock_folio(struct folio *folio, struct lruvec *lruvec)
{
- if (PageLRU(page)) {
- struct lruvec *lruvec;
-
- lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
- if (getpage)
- get_page(page);
- ClearPageLRU(page);
- del_page_from_lru_list(page, lruvec, page_lru(page));
- return true;
- }
+ int nr_pages = folio_nr_pages(folio);
+ bool isolated = false;
- return false;
-}
+ if (!folio_test_clear_lru(folio))
+ goto munlock;
-/*
- * Finish munlock after successful page isolation
- *
- * Page must be locked. This is a wrapper for try_to_munlock()
- * and putback_lru_page() with munlock accounting.
- */
-static void __munlock_isolated_page(struct page *page)
-{
- /*
- * Optimization: if the page was mapped just once, that's our mapping
- * and we don't need to check all the other vmas.
- */
- if (page_mapcount(page) > 1)
- try_to_munlock(page);
+ isolated = true;
+ lruvec = folio_lruvec_relock_irq(folio, lruvec);
- /* Did try_to_unlock() succeed or punt? */
- if (!PageMlocked(page))
- count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page));
+ if (folio_test_unevictable(folio)) {
+ /* Then mlock_count is maintained, but might undercount */
+ if (folio->mlock_count)
+ folio->mlock_count--;
+ if (folio->mlock_count)
+ goto out;
+ }
+ /* else assume that was the last mlock: reclaim will fix it if not */
+
+munlock:
+ if (folio_test_clear_mlocked(folio)) {
+ __zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
+ if (isolated || !folio_test_unevictable(folio))
+ __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
+ else
+ __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
+ }
- putback_lru_page(page);
+ /* folio_evictable() has to be checked *after* clearing Mlocked */
+ if (isolated && folio_test_unevictable(folio) && folio_evictable(folio)) {
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_unevictable(folio);
+ lruvec_add_folio(lruvec, folio);
+ __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
+ }
+out:
+ if (isolated)
+ folio_set_lru(folio);
+ return lruvec;
}
/*
- * Accounting for page isolation failure during munlock
- *
- * Performs accounting when page isolation fails in munlock. There is nothing
- * else to do because it means some other task has already removed the page
- * from the LRU. putback_lru_page() will take care of removing the page from
- * the unevictable list, if necessary. vmscan [page_referenced()] will move
- * the page back to the unevictable list if some other vma has it mlocked.
+ * Flags held in the low bits of a struct folio pointer on the mlock_fbatch.
*/
-static void __munlock_isolation_failed(struct page *page)
+#define LRU_FOLIO 0x1
+#define NEW_FOLIO 0x2
+static inline struct folio *mlock_lru(struct folio *folio)
{
- int nr_pages = thp_nr_pages(page);
+ return (struct folio *)((unsigned long)folio + LRU_FOLIO);
+}
- if (PageUnevictable(page))
- __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
- else
- __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
+static inline struct folio *mlock_new(struct folio *folio)
+{
+ return (struct folio *)((unsigned long)folio + NEW_FOLIO);
}
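Since struct folio pointers are at least word aligned, the two low bits are guaranteed to be zero, which is what makes this tagging safe. A minimal sketch of the round trip that mlock_folio_batch() below performs when draining the batch; the function is purely illustrative:

/* Illustrative only: encode/decode of the low-bit flags used above. */
static void example_folio_tag_round_trip(struct folio *folio)
{
	struct folio *tagged = mlock_lru(folio);
	unsigned long mlock = (unsigned long)tagged & (LRU_FOLIO | NEW_FOLIO);

	/* Stripping the flag bits recovers the original pointer */
	WARN_ON((struct folio *)((unsigned long)tagged - mlock) != folio);
	WARN_ON(!(mlock & LRU_FOLIO));
}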
-/**
- * munlock_vma_page - munlock a vma page
- * @page: page to be unlocked, either a normal page or THP page head
- *
- * returns the size of the page as a page mask (0 for normal page,
- * HPAGE_PMD_NR - 1 for THP head page)
- *
- * called from munlock()/munmap() path with page supposedly on the LRU.
- * When we munlock a page, because the vma where we found the page is being
- * munlock()ed or munmap()ed, we want to check whether other vmas hold the
- * page mlocked so that we can leave it on the unevictable lru list and not
- * bother vmscan with it. However, to walk the page's rmap list in
- * try_to_munlock() we must isolate the page from the LRU. If some other
- * task has removed the page from the LRU, we won't be able to do that.
- * So we clear the PageMlocked as we might not get another chance. If we
- * can't isolate the page, we leave it for putback_lru_page() and vmscan
- * [page_referenced()/try_to_unmap()] to deal with.
+/*
+ * mlock_folio_batch() is derived from folio_batch_move_lru(): perhaps that can
+ * make use of such folio pointer flags in future, but for now just keep it for
+ * mlock. We could use three separate folio batches instead, but one feels
+ * better (munlocking a full folio batch does not need to drain mlocking folio
+ * batches first).
*/
-unsigned int munlock_vma_page(struct page *page)
+static void mlock_folio_batch(struct folio_batch *fbatch)
{
- int nr_pages;
- pg_data_t *pgdat = page_pgdat(page);
-
- /* For try_to_munlock() and to serialize with page migration */
- BUG_ON(!PageLocked(page));
-
- VM_BUG_ON_PAGE(PageTail(page), page);
-
- /*
- * Serialize with any parallel __split_huge_page_refcount() which
- * might otherwise copy PageMlocked to part of the tail pages before
- * we clear it in the head page. It also stabilizes thp_nr_pages().
- */
- spin_lock_irq(&pgdat->lru_lock);
+ struct lruvec *lruvec = NULL;
+ unsigned long mlock;
+ struct folio *folio;
+ int i;
- if (!TestClearPageMlocked(page)) {
- /* Potentially, PTE-mapped THP: do not skip the rest PTEs */
- nr_pages = 1;
- goto unlock_out;
+ for (i = 0; i < folio_batch_count(fbatch); i++) {
+ folio = fbatch->folios[i];
+ mlock = (unsigned long)folio & (LRU_FOLIO | NEW_FOLIO);
+ folio = (struct folio *)((unsigned long)folio - mlock);
+ fbatch->folios[i] = folio;
+
+ if (mlock & LRU_FOLIO)
+ lruvec = __mlock_folio(folio, lruvec);
+ else if (mlock & NEW_FOLIO)
+ lruvec = __mlock_new_folio(folio, lruvec);
+ else
+ lruvec = __munlock_folio(folio, lruvec);
}
- nr_pages = thp_nr_pages(page);
- __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+ if (lruvec)
+ unlock_page_lruvec_irq(lruvec);
+ folios_put(fbatch->folios, folio_batch_count(fbatch));
+ folio_batch_reinit(fbatch);
+}
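
The LRU_FOLIO/NEW_FOLIO scheme defined above works because struct folio pointers are at least word-aligned, so their two low bits are always zero and are free to record what kind of batch entry this is; mlock_folio_batch() masks the bits back off before dispatching to __mlock_folio(), __mlock_new_folio() or __munlock_folio(). A minimal userspace sketch of the same pointer-tagging round trip (fake_folio and the flag names are illustrative stand-ins, not kernel API):

#include <assert.h>
#include <stdio.h>

/* Stand-in for struct folio: any type whose alignment is >= 4 will do. */
struct fake_folio { long dummy; };

#define LRU_FLAG 0x1UL
#define NEW_FLAG 0x2UL

/* Encode, as mlock_lru()/mlock_new() do: stash a flag in the low bits. */
static struct fake_folio *tag(struct fake_folio *folio, unsigned long flag)
{
	return (struct fake_folio *)((unsigned long)folio + flag);
}

int main(void)
{
	struct fake_folio folio;
	struct fake_folio *entry = tag(&folio, NEW_FLAG);

	/* Decode, as mlock_folio_batch() does: mask the flag bits back out. */
	unsigned long flags = (unsigned long)entry & (LRU_FLAG | NEW_FLAG);
	struct fake_folio *plain = (struct fake_folio *)((unsigned long)entry - flags);

	assert(plain == &folio && flags == NEW_FLAG);
	printf("flags=%#lx, pointer round-trips\n", flags);
	return 0;
}
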
- if (__munlock_isolate_lru_page(page, true)) {
- spin_unlock_irq(&pgdat->lru_lock);
- __munlock_isolated_page(page);
- goto out;
- }
- __munlock_isolation_failed(page);
+void mlock_drain_local(void)
+{
+ struct folio_batch *fbatch;
-unlock_out:
- spin_unlock_irq(&pgdat->lru_lock);
+ local_lock(&mlock_fbatch.lock);
+ fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
+ if (folio_batch_count(fbatch))
+ mlock_folio_batch(fbatch);
+ local_unlock(&mlock_fbatch.lock);
+}
-out:
- return nr_pages - 1;
+void mlock_drain_remote(int cpu)
+{
+ struct folio_batch *fbatch;
+
+ WARN_ON_ONCE(cpu_online(cpu));
+ fbatch = &per_cpu(mlock_fbatch.fbatch, cpu);
+ if (folio_batch_count(fbatch))
+ mlock_folio_batch(fbatch);
}
-/*
- * convert get_user_pages() return value to posix mlock() error
- */
-static int __mlock_posix_error_return(long retval)
+bool need_mlock_drain(int cpu)
{
- if (retval == -EFAULT)
- retval = -ENOMEM;
- else if (retval == -ENOMEM)
- retval = -EAGAIN;
- return retval;
+ return folio_batch_count(&per_cpu(mlock_fbatch.fbatch, cpu));
}
-/*
- * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec()
- *
- * The fast path is available only for evictable pages with single mapping.
- * Then we can bypass the per-cpu pvec and get better performance.
- * when mapcount > 1 we need try_to_munlock() which can fail.
- * when !page_evictable(), we need the full redo logic of putback_lru_page to
- * avoid leaving evictable page in unevictable list.
- *
- * In case of success, @page is added to @pvec and @pgrescued is incremented
- * in case that the page was previously unevictable. @page is also unlocked.
+/**
+ * mlock_folio - mlock a folio already on (or temporarily off) LRU
+ * @folio: folio to be mlocked.
*/
-static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
- int *pgrescued)
+void mlock_folio(struct folio *folio)
{
- VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
-
- if (page_mapcount(page) <= 1 && page_evictable(page)) {
- pagevec_add(pvec, page);
- if (TestClearPageUnevictable(page))
- (*pgrescued)++;
- unlock_page(page);
- return true;
+ struct folio_batch *fbatch;
+
+ local_lock(&mlock_fbatch.lock);
+ fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
+
+ if (!folio_test_set_mlocked(folio)) {
+ int nr_pages = folio_nr_pages(folio);
+
+ zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
+ __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
}
- return false;
+ folio_get(folio);
+ if (!folio_batch_add(fbatch, mlock_lru(folio)) ||
+ folio_test_large(folio) || lru_cache_disabled())
+ mlock_folio_batch(fbatch);
+ local_unlock(&mlock_fbatch.lock);
}
-/*
- * Putback multiple evictable pages to the LRU
- *
- * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of
- * the pages might have meanwhile become unevictable but that is OK.
+/**
+ * mlock_new_folio - mlock a newly allocated folio not yet on LRU
+ * @folio: folio to be mlocked, either normal or a THP head.
*/
-static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
+void mlock_new_folio(struct folio *folio)
{
- count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec));
- /*
- *__pagevec_lru_add() calls release_pages() so we don't call
- * put_page() explicitly
- */
- __pagevec_lru_add(pvec);
- count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
+ struct folio_batch *fbatch;
+ int nr_pages = folio_nr_pages(folio);
+
+ local_lock(&mlock_fbatch.lock);
+ fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
+ folio_set_mlocked(folio);
+
+ zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
+ __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
+
+ folio_get(folio);
+ if (!folio_batch_add(fbatch, mlock_new(folio)) ||
+ folio_test_large(folio) || lru_cache_disabled())
+ mlock_folio_batch(fbatch);
+ local_unlock(&mlock_fbatch.lock);
}
-/*
- * Munlock a batch of pages from the same zone
- *
- * The work is split to two main phases. First phase clears the Mlocked flag
- * and attempts to isolate the pages, all under a single zone lru lock.
- * The second phase finishes the munlock only for pages where isolation
- * succeeded.
- *
- * Note that the pagevec may be modified during the process.
+/**
+ * munlock_folio - munlock a folio
+ * @folio: folio to be munlocked, either normal or a THP head.
*/
-static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
+void munlock_folio(struct folio *folio)
{
- int i;
- int nr = pagevec_count(pvec);
- int delta_munlocked = -nr;
- struct pagevec pvec_putback;
- int pgrescued = 0;
-
- pagevec_init(&pvec_putback);
-
- /* Phase 1: page isolation */
- spin_lock_irq(&zone->zone_pgdat->lru_lock);
- for (i = 0; i < nr; i++) {
- struct page *page = pvec->pages[i];
-
- if (TestClearPageMlocked(page)) {
- /*
- * We already have pin from follow_page_mask()
- * so we can spare the get_page() here.
- */
- if (__munlock_isolate_lru_page(page, false))
- continue;
- else
- __munlock_isolation_failed(page);
- } else {
- delta_munlocked++;
- }
-
- /*
- * We won't be munlocking this page in the next phase
- * but we still need to release the follow_page_mask()
- * pin. We cannot do it under lru_lock however. If it's
- * the last pin, __page_cache_release() would deadlock.
- */
- pagevec_add(&pvec_putback, pvec->pages[i]);
- pvec->pages[i] = NULL;
- }
- __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
- spin_unlock_irq(&zone->zone_pgdat->lru_lock);
-
- /* Now we can release pins of pages that we are not munlocking */
- pagevec_release(&pvec_putback);
-
- /* Phase 2: page munlock */
- for (i = 0; i < nr; i++) {
- struct page *page = pvec->pages[i];
-
- if (page) {
- lock_page(page);
- if (!__putback_lru_fast_prepare(page, &pvec_putback,
- &pgrescued)) {
- /*
- * Slow path. We don't want to lose the last
- * pin before unlock_page()
- */
- get_page(page); /* for putback_lru_page() */
- __munlock_isolated_page(page);
- unlock_page(page);
- put_page(page); /* from follow_page_mask() */
- }
- }
- }
+ struct folio_batch *fbatch;
+ local_lock(&mlock_fbatch.lock);
+ fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
/*
- * Phase 3: page putback for pages that qualified for the fast path
- * This will also call put_page() to return pin from follow_page_mask()
+ * folio_test_clear_mlocked(folio) must be left to __munlock_folio(),
+ * which will check whether the folio is multiply mlocked.
*/
- if (pagevec_count(&pvec_putback))
- __putback_lru_fast(&pvec_putback, pgrescued);
+ folio_get(folio);
+ if (!folio_batch_add(fbatch, folio) ||
+ folio_test_large(folio) || lru_cache_disabled())
+ mlock_folio_batch(fbatch);
+ local_unlock(&mlock_fbatch.lock);
}
-/*
- * Fill up pagevec for __munlock_pagevec using pte walk
- *
- * The function expects that the struct page corresponding to @start address is
- * a non-TPH page already pinned and in the @pvec, and that it belongs to @zone.
- *
- * The rest of @pvec is filled by subsequent pages within the same pmd and same
- * zone, as long as the pte's are present and vm_normal_page() succeeds. These
- * pages also get pinned.
- *
- * Returns the address of the next page that should be scanned. This equals
- * @start + PAGE_SIZE when no page could be added by the pte walk.
- */
-static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
- struct vm_area_struct *vma, struct zone *zone,
- unsigned long start, unsigned long end)
+static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
{
- pte_t *pte;
+ struct vm_area_struct *vma = walk->vma;
spinlock_t *ptl;
+ pte_t *start_pte, *pte;
+ pte_t ptent;
+ struct folio *folio;
- /*
- * Initialize pte walk starting at the already pinned page where we
- * are sure that there is a pte, as it was pinned under the same
- * mmap_lock write op.
- */
- pte = get_locked_pte(vma->vm_mm, start, &ptl);
- /* Make sure we do not cross the page table boundary */
- end = pgd_addr_end(start, end);
- end = p4d_addr_end(start, end);
- end = pud_addr_end(start, end);
- end = pmd_addr_end(start, end);
-
- /* The page next to the pinned page is the first we will try to get */
- start += PAGE_SIZE;
- while (start < end) {
- struct page *page = NULL;
- pte++;
- if (pte_present(*pte))
- page = vm_normal_page(vma, start, *pte);
- /*
- * Break if page could not be obtained or the page's node+zone does not
- * match
- */
- if (!page || page_zone(page) != zone)
- break;
-
- /*
- * Do not use pagevec for PTE-mapped THP,
- * munlock_vma_pages_range() will handle them.
- */
- if (PageTransCompound(page))
- break;
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ if (!pmd_present(*pmd))
+ goto out;
+ if (is_huge_zero_pmd(*pmd))
+ goto out;
+ folio = page_folio(pmd_page(*pmd));
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_folio(folio);
+ else
+ munlock_folio(folio);
+ goto out;
+ }
- get_page(page);
- /*
- * Increase the address that will be returned *before* the
- * eventual break due to pvec becoming full by adding the page
- */
- start += PAGE_SIZE;
- if (pagevec_add(pvec, page) == 0)
- break;
+ start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!start_pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
+ for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) {
+ ptent = ptep_get(pte);
+ if (!pte_present(ptent))
+ continue;
+ folio = vm_normal_folio(vma, addr, ptent);
+ if (!folio || folio_is_zone_device(folio))
+ continue;
+ if (folio_test_large(folio))
+ continue;
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_folio(folio);
+ else
+ munlock_folio(folio);
}
- pte_unmap_unlock(pte, ptl);
- return start;
+ pte_unmap(start_pte);
+out:
+ spin_unlock(ptl);
+ cond_resched();
+ return 0;
}
/*
- * munlock_vma_pages_range() - munlock all pages in the vma range.'
- * @vma - vma containing range to be munlock()ed.
+ * mlock_vma_pages_range() - mlock any pages already in the range,
+ * or munlock all pages in the range.
+ * @vma - vma containing range to be mlock()ed or munlock()ed
* @start - start address in @vma of the range
- * @end - end of range in @vma.
- *
- * For mremap(), munmap() and exit().
- *
- * Called with @vma VM_LOCKED.
+ * @end - end of range in @vma
+ * @newflags - the new set of flags for @vma.
*
- * Returns with VM_LOCKED cleared. Callers must be prepared to
- * deal with this.
- *
- * We don't save and restore VM_LOCKED here because pages are
- * still on lru. In unmap path, pages might be scanned by reclaim
- * and re-mlocked by try_to_{munlock|unmap} before we unmap and
- * free them. This will result in freeing mlocked pages.
+ * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED;
+ * called for munlock() and munlockall(), to clear VM_LOCKED from @vma.
*/
-void munlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+static void mlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, vm_flags_t newflags)
{
- vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ static const struct mm_walk_ops mlock_walk_ops = {
+ .pmd_entry = mlock_pte_range,
+ .walk_lock = PGWALK_WRLOCK_VERIFY,
+ };
- while (start < end) {
- struct page *page;
- unsigned int page_mask = 0;
- unsigned long page_increm;
- struct pagevec pvec;
- struct zone *zone;
+ /*
+ * There is a slight chance that concurrent page migration,
+ * or page reclaim finding a page of this now-VM_LOCKED vma,
+ * will call mlock_vma_folio() and raise page's mlock_count:
+ * double counting, leaving the page unevictable indefinitely.
+ * Communicate this danger to mlock_vma_folio() with VM_IO,
+ * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas.
+ * mmap_lock is held in write mode here, so this weird
+ * combination should not be visible to other mmap_lock users;
+ * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED.
+ */
+ if (newflags & VM_LOCKED)
+ newflags |= VM_IO;
+ vm_flags_reset_once(vma, newflags);
- pagevec_init(&pvec);
- /*
- * Although FOLL_DUMP is intended for get_dump_page(),
- * it just so happens that its special treatment of the
- * ZERO_PAGE (returning an error instead of doing get_page)
- * suits munlock very well (and if somehow an abnormal page
- * has sneaked into the range, we won't oops here: great).
- */
- page = follow_page(vma, start, FOLL_GET | FOLL_DUMP);
-
- if (page && !IS_ERR(page)) {
- if (PageTransTail(page)) {
- VM_BUG_ON_PAGE(PageMlocked(page), page);
- put_page(page); /* follow_page_mask() */
- } else if (PageTransHuge(page)) {
- lock_page(page);
- /*
- * Any THP page found by follow_page_mask() may
- * have gotten split before reaching
- * munlock_vma_page(), so we need to compute
- * the page_mask here instead.
- */
- page_mask = munlock_vma_page(page);
- unlock_page(page);
- put_page(page); /* follow_page_mask() */
- } else {
- /*
- * Non-huge pages are handled in batches via
- * pagevec. The pin from follow_page_mask()
- * prevents them from collapsing by THP.
- */
- pagevec_add(&pvec, page);
- zone = page_zone(page);
-
- /*
- * Try to fill the rest of pagevec using fast
- * pte walk. This will also update start to
- * the next page to process. Then munlock the
- * pagevec.
- */
- start = __munlock_pagevec_fill(&pvec, vma,
- zone, start, end);
- __munlock_pagevec(&pvec, zone);
- goto next;
- }
- }
- page_increm = 1 + page_mask;
- start += page_increm * PAGE_SIZE;
-next:
- cond_resched();
+ lru_add_drain();
+ walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL);
+ lru_add_drain();
+
+ if (newflags & VM_IO) {
+ newflags &= ~VM_IO;
+ vm_flags_reset_once(vma, newflags);
}
}
@@ -521,39 +408,39 @@ next:
*
* For vmas that pass the filters, merge/split as appropriate.
*/
-static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
- unsigned long start, unsigned long end, vm_flags_t newflags)
+static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ struct vm_area_struct **prev, unsigned long start,
+ unsigned long end, vm_flags_t newflags)
{
struct mm_struct *mm = vma->vm_mm;
pgoff_t pgoff;
int nr_pages;
int ret = 0;
- int lock = !!(newflags & VM_LOCKED);
- vm_flags_t old_flags = vma->vm_flags;
+ vm_flags_t oldflags = vma->vm_flags;
- if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
+ if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
- vma_is_dax(vma))
+ vma_is_dax(vma) || vma_is_secretmem(vma))
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
goto out;
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
- vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+ *prev = vma_merge(vmi, mm, *prev, start, end, newflags,
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (*prev) {
vma = *prev;
goto success;
}
if (start != vma->vm_start) {
- ret = split_vma(mm, vma, start, 1);
+ ret = split_vma(vmi, vma, start, 1);
if (ret)
goto out;
}
if (end != vma->vm_end) {
- ret = split_vma(mm, vma, end, 0);
+ ret = split_vma(vmi, vma, end, 0);
if (ret)
goto out;
}
@@ -563,9 +450,9 @@ success:
* Keep track of amount of locked VM.
*/
nr_pages = (end - start) >> PAGE_SHIFT;
- if (!lock)
+ if (!(newflags & VM_LOCKED))
nr_pages = -nr_pages;
- else if (old_flags & VM_LOCKED)
+ else if (oldflags & VM_LOCKED)
nr_pages = 0;
mm->locked_vm += nr_pages;
@@ -575,11 +462,12 @@ success:
* set VM_LOCKED, populate_vma_page_range will bring it back.
*/
- if (lock)
- vma->vm_flags = newflags;
- else
- munlock_vma_pages_range(vma, start, end);
-
+ if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
+ /* No work to do, and mlocking twice would be wrong */
+ vm_flags_reset(vma, newflags);
+ } else {
+ mlock_vma_pages_range(vma, start, end, newflags);
+ }
out:
*prev = vma;
return ret;
@@ -589,8 +477,8 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
vm_flags_t flags)
{
unsigned long nstart, end, tmp;
- struct vm_area_struct * vma, * prev;
- int error;
+ struct vm_area_struct *vma, *prev;
+ VMA_ITERATOR(vmi, current->mm, start);
VM_BUG_ON(offset_in_page(start));
VM_BUG_ON(len != PAGE_ALIGN(len));
@@ -599,39 +487,40 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
return -EINVAL;
if (end == start)
return 0;
- vma = find_vma(current->mm, start);
- if (!vma || vma->vm_start > start)
+ vma = vma_iter_load(&vmi);
+ if (!vma)
return -ENOMEM;
- prev = vma->vm_prev;
+ prev = vma_prev(&vmi);
if (start > vma->vm_start)
prev = vma;
- for (nstart = start ; ; ) {
- vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+ nstart = start;
+ tmp = vma->vm_start;
+ for_each_vma_range(vmi, vma, end) {
+ int error;
+ vm_flags_t newflags;
- newflags |= flags;
+ if (vma->vm_start != tmp)
+ return -ENOMEM;
+ newflags = vma->vm_flags & ~VM_LOCKED_MASK;
+ newflags |= flags;
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
tmp = vma->vm_end;
if (tmp > end)
tmp = end;
- error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
+ error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
if (error)
- break;
+ return error;
+ tmp = vma_iter_end(&vmi);
nstart = tmp;
- if (nstart < prev->vm_end)
- nstart = prev->vm_end;
- if (nstart >= end)
- break;
-
- vma = prev->vm_next;
- if (!vma || vma->vm_start != nstart) {
- error = -ENOMEM;
- break;
- }
}
- return error;
+
+ if (tmp < end)
+ return -ENOMEM;
+
+ return 0;
}
/*
@@ -646,24 +535,21 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
{
struct vm_area_struct *vma;
unsigned long count = 0;
+ unsigned long end;
+ VMA_ITERATOR(vmi, mm, start);
- if (mm == NULL)
- mm = current->mm;
-
- vma = find_vma(mm, start);
- if (vma == NULL)
- vma = mm->mmap;
+ /* Don't overflow past ULONG_MAX */
+ if (unlikely(ULONG_MAX - len < start))
+ end = ULONG_MAX;
+ else
+ end = start + len;
- for (; vma ; vma = vma->vm_next) {
- if (start >= vma->vm_end)
- continue;
- if (start + len <= vma->vm_start)
- break;
+ for_each_vma_range(vmi, vma, end) {
if (vma->vm_flags & VM_LOCKED) {
if (start > vma->vm_start)
count -= (start - vma->vm_start);
- if (start + len < vma->vm_end) {
- count += start + len - vma->vm_start;
+ if (end < vma->vm_end) {
+ count += end - vma->vm_start;
break;
}
count += vma->vm_end - vma->vm_start;
@@ -673,6 +559,18 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
return count >> PAGE_SHIFT;
}
+/*
+ * convert get_user_pages() return value to posix mlock() error
+ */
+static int __mlock_posix_error_return(long retval)
+{
+ if (retval == -EFAULT)
+ retval = -ENOMEM;
+ else if (retval == -ENOMEM)
+ retval = -EAGAIN;
+ return retval;
+}
+
static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
unsigned long locked;
@@ -767,10 +665,11 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
*/
static int apply_mlockall_flags(int flags)
{
- struct vm_area_struct * vma, * prev = NULL;
+ VMA_ITERATOR(vmi, current->mm, 0);
+ struct vm_area_struct *vma, *prev = NULL;
vm_flags_t to_add = 0;
- current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
+ current->mm->def_flags &= ~VM_LOCKED_MASK;
if (flags & MCL_FUTURE) {
current->mm->def_flags |= VM_LOCKED;
@@ -787,14 +686,15 @@ static int apply_mlockall_flags(int flags)
to_add |= VM_LOCKONFAULT;
}
- for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
+ for_each_vma(vmi, vma) {
vm_flags_t newflags;
- newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+ newflags = vma->vm_flags & ~VM_LOCKED_MASK;
newflags |= to_add;
/* Ignore errors */
- mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
+ mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end,
+ newflags);
cond_resched();
}
out:
@@ -847,32 +747,38 @@ SYSCALL_DEFINE0(munlockall)
*/
static DEFINE_SPINLOCK(shmlock_user_lock);
-int user_shm_lock(size_t size, struct user_struct *user)
+int user_shm_lock(size_t size, struct ucounts *ucounts)
{
unsigned long lock_limit, locked;
+ long memlock;
int allowed = 0;
locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
lock_limit = rlimit(RLIMIT_MEMLOCK);
- if (lock_limit == RLIM_INFINITY)
- allowed = 1;
- lock_limit >>= PAGE_SHIFT;
+ if (lock_limit != RLIM_INFINITY)
+ lock_limit >>= PAGE_SHIFT;
spin_lock(&shmlock_user_lock);
- if (!allowed &&
- locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
+ memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
+
+ if ((memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
+ dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
goto out;
- get_uid(user);
- user->locked_shm += locked;
+ }
+ if (!get_ucounts(ucounts)) {
+ dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
+ allowed = 0;
+ goto out;
+ }
allowed = 1;
out:
spin_unlock(&shmlock_user_lock);
return allowed;
}
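
For reference, locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT rounds the segment size up to whole pages: with 4 KiB pages, an 8193-byte SysV shm segment is charged 3 pages against the RLIMIT_MEMLOCK / UCOUNT_RLIMIT_MEMLOCK budget.
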
-void user_shm_unlock(size_t size, struct user_struct *user)
+void user_shm_unlock(size_t size, struct ucounts *ucounts)
{
spin_lock(&shmlock_user_lock);
- user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
spin_unlock(&shmlock_user_lock);
- free_uid(user);
+ put_ucounts(ucounts);
}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index b06a30fbedff..a1963c3322af 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -14,15 +14,27 @@
#include <linux/notifier.h>
#include <linux/sched.h>
#include <linux/mman.h>
+#include <linux/memblock.h>
+#include <linux/page-isolation.h>
+#include <linux/padata.h>
+#include <linux/nmi.h>
+#include <linux/buffer_head.h>
+#include <linux/kmemleak.h>
+#include <linux/kfence.h>
+#include <linux/page_ext.h>
+#include <linux/pti.h>
+#include <linux/pgtable.h>
+#include <linux/swap.h>
+#include <linux/cma.h>
#include "internal.h"
+#include "slab.h"
+#include "shuffle.h"
+
+#include <asm/setup.h>
#ifdef CONFIG_DEBUG_MEMORY_INIT
int __meminitdata mminit_loglevel;
-#ifndef SECTIONS_SHIFT
-#define SECTIONS_SHIFT 0
-#endif
-
/* The zonelists are simply reported, validation is manual. */
void __init mminit_verify_zonelist(void)
{
@@ -69,14 +81,16 @@ void __init mminit_verify_pageflags_layout(void)
shift = 8 * sizeof(unsigned long);
width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
SECTIONS_WIDTH,
NODES_WIDTH,
ZONES_WIDTH,
LAST_CPUPID_WIDTH,
KASAN_TAG_WIDTH,
+ LRU_GEN_WIDTH,
+ LRU_REFS_WIDTH,
NR_PAGEFLAGS);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
@@ -173,22 +187,17 @@ static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
case MEM_ONLINE:
case MEM_OFFLINE:
mm_compute_batch(sysctl_overcommit_memory);
+ break;
default:
break;
}
return NOTIFY_OK;
}
-static struct notifier_block compute_batch_nb __meminitdata = {
- .notifier_call = mm_compute_batch_notifier,
- .priority = IPC_CALLBACK_PRI, /* use lowest priority */
-};
-
static int __init mm_compute_batch_init(void)
{
mm_compute_batch(sysctl_overcommit_memory);
- register_hotmemory_notifier(&compute_batch_nb);
-
+ hotplug_memory_notifier(mm_compute_batch_notifier, MM_COMPUTE_BATCH_PRI);
return 0;
}
@@ -205,3 +214,2594 @@ static int __init mm_sysfs_init(void)
return 0;
}
postcore_initcall(mm_sysfs_init);
+
+static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
+static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
+static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
+
+static unsigned long required_kernelcore __initdata;
+static unsigned long required_kernelcore_percent __initdata;
+static unsigned long required_movablecore __initdata;
+static unsigned long required_movablecore_percent __initdata;
+
+static unsigned long nr_kernel_pages __initdata;
+static unsigned long nr_all_pages __initdata;
+static unsigned long dma_reserve __initdata;
+
+static bool deferred_struct_pages __meminitdata;
+
+static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
+
+static int __init cmdline_parse_core(char *p, unsigned long *core,
+ unsigned long *percent)
+{
+ unsigned long long coremem;
+ char *endptr;
+
+ if (!p)
+ return -EINVAL;
+
+ /* Value may be a percentage of total memory, otherwise bytes */
+ coremem = simple_strtoull(p, &endptr, 0);
+ if (*endptr == '%') {
+ /* Paranoid check for percent values greater than 100 */
+ WARN_ON(coremem > 100);
+
+ *percent = coremem;
+ } else {
+ coremem = memparse(p, &p);
+ /* Paranoid check that UL is enough for the coremem value */
+ WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
+
+ *core = coremem >> PAGE_SHIFT;
+ *percent = 0UL;
+ }
+ return 0;
+}
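
As a rough illustration of the two branches in cmdline_parse_core(): kernelcore=512M goes through memparse() and is stored as 512M >> PAGE_SHIFT pages, while kernelcore=30% stores 30 in *percent and leaves *core alone. A standalone sketch of that branching, with strtoull() and a simplified K/M/G suffix parser standing in for the kernel's simple_strtoull()/memparse(), and 4 KiB pages assumed:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12	/* assumed: 4 KiB pages */

/*
 * Mirrors cmdline_parse_core(): a trailing '%' means a percentage of total
 * memory, anything else is a byte size converted to pages.  The K/M/G suffix
 * handling is a simplified stand-in for the kernel's memparse().
 */
static void parse_core(const char *p, unsigned long *core, unsigned long *percent)
{
	char *end;
	unsigned long long v = strtoull(p, &end, 0);

	if (*end == '%') {
		*percent = v;
		return;
	}
	switch (*end) {
	case 'G': case 'g': v <<= 30; break;
	case 'M': case 'm': v <<= 20; break;
	case 'K': case 'k': v <<= 10; break;
	}
	*core = v >> PAGE_SHIFT;
	*percent = 0;
}

int main(void)
{
	unsigned long core = 0, percent = 0;

	parse_core("512M", &core, &percent);
	printf("512M -> core=%lu pages, percent=%lu\n", core, percent); /* 131072, 0 */
	parse_core("30%", &core, &percent);
	printf("30%%  -> core=%lu pages, percent=%lu\n", core, percent); /* core unchanged, 30 */
	return 0;
}
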
+
+bool mirrored_kernelcore __initdata_memblock;
+
+/*
+ * kernelcore=size sets the amount of memory for use for allocations that
+ * cannot be reclaimed or migrated.
+ */
+static int __init cmdline_parse_kernelcore(char *p)
+{
+ /* parse kernelcore=mirror */
+ if (parse_option_str(p, "mirror")) {
+ mirrored_kernelcore = true;
+ return 0;
+ }
+
+ return cmdline_parse_core(p, &required_kernelcore,
+ &required_kernelcore_percent);
+}
+early_param("kernelcore", cmdline_parse_kernelcore);
+
+/*
+ * movablecore=size sets the amount of memory for use for allocations that
+ * can be reclaimed or migrated.
+ */
+static int __init cmdline_parse_movablecore(char *p)
+{
+ return cmdline_parse_core(p, &required_movablecore,
+ &required_movablecore_percent);
+}
+early_param("movablecore", cmdline_parse_movablecore);
+
+/*
+ * early_calculate_totalpages()
+ * Sum pages in active regions for movable zone.
+ * Populate N_MEMORY for calculating usable_nodes.
+ */
+static unsigned long __init early_calculate_totalpages(void)
+{
+ unsigned long totalpages = 0;
+ unsigned long start_pfn, end_pfn;
+ int i, nid;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ unsigned long pages = end_pfn - start_pfn;
+
+ totalpages += pages;
+ if (pages)
+ node_set_state(nid, N_MEMORY);
+ }
+ return totalpages;
+}
+
+/*
+ * This finds a zone that can be used for ZONE_MOVABLE pages. The
+ * assumption is made that zones within a node are ordered by monotonically
+ * increasing memory addresses, so that the "highest" populated zone is used.
+ */
+static void __init find_usable_zone_for_movable(void)
+{
+ int zone_index;
+ for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
+ if (zone_index == ZONE_MOVABLE)
+ continue;
+
+ if (arch_zone_highest_possible_pfn[zone_index] >
+ arch_zone_lowest_possible_pfn[zone_index])
+ break;
+ }
+
+ VM_BUG_ON(zone_index == -1);
+ movable_zone = zone_index;
+}
+
+/*
+ * Find the PFN at which the Movable zone begins in each node. Kernel memory
+ * is spread evenly between nodes as long as the nodes have enough
+ * memory. When they don't, some nodes will have more kernelcore than
+ * others.
+ */
+static void __init find_zone_movable_pfns_for_nodes(void)
+{
+ int i, nid;
+ unsigned long usable_startpfn;
+ unsigned long kernelcore_node, kernelcore_remaining;
+ /* save the state before borrowing the nodemask */
+ nodemask_t saved_node_state = node_states[N_MEMORY];
+ unsigned long totalpages = early_calculate_totalpages();
+ int usable_nodes = nodes_weight(node_states[N_MEMORY]);
+ struct memblock_region *r;
+
+ /* Need to find movable_zone earlier when movable_node is specified. */
+ find_usable_zone_for_movable();
+
+ /*
+ * If movable_node is specified, ignore kernelcore and movablecore
+ * options.
+ */
+ if (movable_node_is_enabled()) {
+ for_each_mem_region(r) {
+ if (!memblock_is_hotpluggable(r))
+ continue;
+
+ nid = memblock_get_region_node(r);
+
+ usable_startpfn = PFN_DOWN(r->base);
+ zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+ min(usable_startpfn, zone_movable_pfn[nid]) :
+ usable_startpfn;
+ }
+
+ goto out2;
+ }
+
+ /*
+ * If kernelcore=mirror is specified, ignore movablecore option
+ */
+ if (mirrored_kernelcore) {
+ bool mem_below_4gb_not_mirrored = false;
+
+ for_each_mem_region(r) {
+ if (memblock_is_mirror(r))
+ continue;
+
+ nid = memblock_get_region_node(r);
+
+ usable_startpfn = memblock_region_memory_base_pfn(r);
+
+ if (usable_startpfn < PHYS_PFN(SZ_4G)) {
+ mem_below_4gb_not_mirrored = true;
+ continue;
+ }
+
+ zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+ min(usable_startpfn, zone_movable_pfn[nid]) :
+ usable_startpfn;
+ }
+
+ if (mem_below_4gb_not_mirrored)
+ pr_warn("This configuration results in unmirrored kernel memory.\n");
+
+ goto out2;
+ }
+
+ /*
+ * If kernelcore=nn% or movablecore=nn% was specified, calculate the
+ * amount of necessary memory.
+ */
+ if (required_kernelcore_percent)
+ required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
+ 10000UL;
+ if (required_movablecore_percent)
+ required_movablecore = (totalpages * 100 * required_movablecore_percent) /
+ 10000UL;
+
+ /*
+ * If movablecore= was specified, calculate what size of
+ * kernelcore that corresponds so that memory usable for
+ * any allocation type is evenly spread. If both kernelcore
+ * and movablecore are specified, then the value of kernelcore
+ * will be used for required_kernelcore if it's greater than
+ * what movablecore would have allowed.
+ */
+ if (required_movablecore) {
+ unsigned long corepages;
+
+ /*
+ * Round-up so that ZONE_MOVABLE is at least as large as what
+ * was requested by the user
+ */
+ required_movablecore =
+ roundup(required_movablecore, MAX_ORDER_NR_PAGES);
+ required_movablecore = min(totalpages, required_movablecore);
+ corepages = totalpages - required_movablecore;
+
+ required_kernelcore = max(required_kernelcore, corepages);
+ }
+
+ /*
+ * If kernelcore was not specified or kernelcore size is larger
+ * than totalpages, there is no ZONE_MOVABLE.
+ */
+ if (!required_kernelcore || required_kernelcore >= totalpages)
+ goto out;
+
+ /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
+ usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
+
+restart:
+ /* Spread kernelcore memory as evenly as possible throughout nodes */
+ kernelcore_node = required_kernelcore / usable_nodes;
+ for_each_node_state(nid, N_MEMORY) {
+ unsigned long start_pfn, end_pfn;
+
+ /*
+ * Recalculate kernelcore_node if the division per node
+ * now exceeds what is necessary to satisfy the requested
+ * amount of memory for the kernel
+ */
+ if (required_kernelcore < kernelcore_node)
+ kernelcore_node = required_kernelcore / usable_nodes;
+
+ /*
+ * As the map is walked, we track how much memory is usable
+ * by the kernel using kernelcore_remaining. When it is
+ * 0, the rest of the node is usable by ZONE_MOVABLE
+ */
+ kernelcore_remaining = kernelcore_node;
+
+ /* Go through each range of PFNs within this node */
+ for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+ unsigned long size_pages;
+
+ start_pfn = max(start_pfn, zone_movable_pfn[nid]);
+ if (start_pfn >= end_pfn)
+ continue;
+
+ /* Account for what is only usable for kernelcore */
+ if (start_pfn < usable_startpfn) {
+ unsigned long kernel_pages;
+ kernel_pages = min(end_pfn, usable_startpfn)
+ - start_pfn;
+
+ kernelcore_remaining -= min(kernel_pages,
+ kernelcore_remaining);
+ required_kernelcore -= min(kernel_pages,
+ required_kernelcore);
+
+ /* Continue if range is now fully accounted */
+ if (end_pfn <= usable_startpfn) {
+
+ /*
+ * Push zone_movable_pfn to the end so
+ * that if we have to rebalance
+ * kernelcore across nodes, we will
+ * not double account here
+ */
+ zone_movable_pfn[nid] = end_pfn;
+ continue;
+ }
+ start_pfn = usable_startpfn;
+ }
+
+ /*
+ * The usable PFN range for ZONE_MOVABLE is from
+ * start_pfn->end_pfn. Calculate size_pages as the
+ * number of pages used as kernelcore
+ */
+ size_pages = end_pfn - start_pfn;
+ if (size_pages > kernelcore_remaining)
+ size_pages = kernelcore_remaining;
+ zone_movable_pfn[nid] = start_pfn + size_pages;
+
+ /*
+ * Some kernelcore has been met, update counts and
+ * break if the kernelcore for this node has been
+ * satisfied
+ */
+ required_kernelcore -= min(required_kernelcore,
+ size_pages);
+ kernelcore_remaining -= size_pages;
+ if (!kernelcore_remaining)
+ break;
+ }
+ }
+
+ /*
+ * If there is still required_kernelcore, we do another pass with one
+ * less node in the count. This will push zone_movable_pfn[nid] further
+ * along on the nodes that still have memory until kernelcore is
+ * satisfied
+ */
+ usable_nodes--;
+ if (usable_nodes && required_kernelcore > usable_nodes)
+ goto restart;
+
+out2:
+ /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
+ unsigned long start_pfn, end_pfn;
+
+ zone_movable_pfn[nid] =
+ roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
+
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+ if (zone_movable_pfn[nid] >= end_pfn)
+ zone_movable_pfn[nid] = 0;
+ }
+
+out:
+ /* restore the node_state */
+ node_states[N_MEMORY] = saved_node_state;
+}
+
+static void __meminit __init_single_page(struct page *page, unsigned long pfn,
+ unsigned long zone, int nid)
+{
+ mm_zero_struct_page(page);
+ set_page_links(page, zone, nid, pfn);
+ init_page_count(page);
+ page_mapcount_reset(page);
+ page_cpupid_reset_last(page);
+ page_kasan_tag_reset(page);
+
+ INIT_LIST_HEAD(&page->lru);
+#ifdef WANT_PAGE_VIRTUAL
+ /* The shift won't overflow because ZONE_NORMAL is below 4G. */
+ if (!is_highmem_idx(zone))
+ set_page_address(page, __va(pfn << PAGE_SHIFT));
+#endif
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * During memory init, memblock maps pfns to nids. The search is expensive and
+ * this caches recent lookups. The implementation of __early_pfn_to_nid
+ * treats start/end as pfns.
+ */
+struct mminit_pfnnid_cache {
+ unsigned long last_start;
+ unsigned long last_end;
+ int last_nid;
+};
+
+static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
+
+/*
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
+ */
+static int __meminit __early_pfn_to_nid(unsigned long pfn,
+ struct mminit_pfnnid_cache *state)
+{
+ unsigned long start_pfn, end_pfn;
+ int nid;
+
+ if (state->last_start <= pfn && pfn < state->last_end)
+ return state->last_nid;
+
+ nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
+ if (nid != NUMA_NO_NODE) {
+ state->last_start = start_pfn;
+ state->last_end = end_pfn;
+ state->last_nid = nid;
+ }
+
+ return nid;
+}
+
+int __meminit early_pfn_to_nid(unsigned long pfn)
+{
+ static DEFINE_SPINLOCK(early_pfn_lock);
+ int nid;
+
+ spin_lock(&early_pfn_lock);
+ nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
+ if (nid < 0)
+ nid = first_online_node;
+ spin_unlock(&early_pfn_lock);
+
+ return nid;
+}
+
+int hashdist = HASHDIST_DEFAULT;
+
+static int __init set_hashdist(char *str)
+{
+ if (!str)
+ return 0;
+ hashdist = simple_strtoul(str, &str, 0);
+ return 1;
+}
+__setup("hashdist=", set_hashdist);
+
+static inline void fixup_hashdist(void)
+{
+ if (num_node_state(N_MEMORY) == 1)
+ hashdist = 0;
+}
+#else
+static inline void fixup_hashdist(void) {}
+#endif /* CONFIG_NUMA */
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
+{
+ pgdat->first_deferred_pfn = ULONG_MAX;
+}
+
+/* Returns true if the struct page for the pfn is initialised */
+static inline bool __meminit early_page_initialised(unsigned long pfn, int nid)
+{
+ if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
+ return false;
+
+ return true;
+}
+
+/*
+ * Returns true when the remaining initialisation should be deferred until
+ * later in the boot cycle when it can be parallelised.
+ */
+static bool __meminit
+defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
+{
+ static unsigned long prev_end_pfn, nr_initialised;
+
+ if (early_page_ext_enabled())
+ return false;
+ /*
+ * The static prev_end_pfn holds the end of the previous zone.
+ * No need to protect it: this is called very early in boot, before smp_init().
+ */
+ if (prev_end_pfn != end_pfn) {
+ prev_end_pfn = end_pfn;
+ nr_initialised = 0;
+ }
+
+ /* Always populate low zones for address-constrained allocations */
+ if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
+ return false;
+
+ if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
+ return true;
+ /*
+ * We start only with one section of pages, more pages are added as
+ * needed until the rest of deferred pages are initialized.
+ */
+ nr_initialised++;
+ if ((nr_initialised > PAGES_PER_SECTION) &&
+ (pfn & (PAGES_PER_SECTION - 1)) == 0) {
+ NODE_DATA(nid)->first_deferred_pfn = pfn;
+ return true;
+ }
+ return false;
+}
+
+static void __meminit init_reserved_page(unsigned long pfn, int nid)
+{
+ pg_data_t *pgdat;
+ int zid;
+
+ if (early_page_initialised(pfn, nid))
+ return;
+
+ pgdat = NODE_DATA(nid);
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ struct zone *zone = &pgdat->node_zones[zid];
+
+ if (zone_spans_pfn(zone, pfn))
+ break;
+ }
+ __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
+}
+#else
+static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
+
+static inline bool early_page_initialised(unsigned long pfn, int nid)
+{
+ return true;
+}
+
+static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
+{
+ return false;
+}
+
+static inline void init_reserved_page(unsigned long pfn, int nid)
+{
+}
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
+/*
+ * Initialised pages do not have PageReserved set. This function is
+ * called for each range allocated by the bootmem allocator and
+ * marks the pages PageReserved. The remaining valid pages are later
+ * sent to the buddy page allocator.
+ */
+void __meminit reserve_bootmem_region(phys_addr_t start,
+ phys_addr_t end, int nid)
+{
+ unsigned long start_pfn = PFN_DOWN(start);
+ unsigned long end_pfn = PFN_UP(end);
+
+ for (; start_pfn < end_pfn; start_pfn++) {
+ if (pfn_valid(start_pfn)) {
+ struct page *page = pfn_to_page(start_pfn);
+
+ init_reserved_page(start_pfn, nid);
+
+ /* Avoid false-positive PageTail() */
+ INIT_LIST_HEAD(&page->lru);
+
+ /*
+ * no need for atomic set_bit because the struct
+ * page is not visible yet so nobody should
+ * access it yet.
+ */
+ __SetPageReserved(page);
+ }
+ }
+}
+
+/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
+static bool __meminit
+overlap_memmap_init(unsigned long zone, unsigned long *pfn)
+{
+ static struct memblock_region *r;
+
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+ if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
+ for_each_mem_region(r) {
+ if (*pfn < memblock_region_memory_end_pfn(r))
+ break;
+ }
+ }
+ if (*pfn >= memblock_region_memory_base_pfn(r) &&
+ memblock_is_mirror(r)) {
+ *pfn = memblock_region_memory_end_pfn(r);
+ return true;
+ }
+ }
+ return false;
+}
+
+/*
+ * Only struct pages that correspond to ranges defined by memblock.memory
+ * are zeroed and initialized by going through __init_single_page() during
+ * memmap_init_zone_range().
+ *
+ * But, there could be struct pages that correspond to holes in
+ * memblock.memory. This can happen because of the following reasons:
+ * - physical memory bank size is not necessarily an exact multiple of the
+ * arbitrary section size
+ * - early reserved memory may not be listed in memblock.memory
+ * - memory layouts defined with memmap= kernel parameter may not align
+ * nicely with memmap sections
+ *
+ * Explicitly initialize those struct pages so that:
+ * - PG_Reserved is set
+ * - zone and node links point to zone and node that span the page if the
+ * hole is in the middle of a zone
+ * - zone and node links point to adjacent zone/node if the hole falls on
+ * the zone boundary; the pages in such holes will be prepended to the
+ * zone/node above the hole except for the trailing pages in the last
+ * section that will be appended to the zone/node below.
+ */
+static void __init init_unavailable_range(unsigned long spfn,
+ unsigned long epfn,
+ int zone, int node)
+{
+ unsigned long pfn;
+ u64 pgcnt = 0;
+
+ for (pfn = spfn; pfn < epfn; pfn++) {
+ if (!pfn_valid(pageblock_start_pfn(pfn))) {
+ pfn = pageblock_end_pfn(pfn) - 1;
+ continue;
+ }
+ __init_single_page(pfn_to_page(pfn), pfn, zone, node);
+ __SetPageReserved(pfn_to_page(pfn));
+ pgcnt++;
+ }
+
+ if (pgcnt)
+ pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
+ node, zone_names[zone], pgcnt);
+}
+
+/*
+ * Initially all pages are reserved - free ones are freed
+ * up by memblock_free_all() once the early boot process is
+ * done. Non-atomic initialization, single-pass.
+ *
+ * All aligned pageblocks are initialized to the specified migratetype
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
+ */
+void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
+ unsigned long start_pfn, unsigned long zone_end_pfn,
+ enum meminit_context context,
+ struct vmem_altmap *altmap, int migratetype)
+{
+ unsigned long pfn, end_pfn = start_pfn + size;
+ struct page *page;
+
+ if (highest_memmap_pfn < end_pfn - 1)
+ highest_memmap_pfn = end_pfn - 1;
+
+#ifdef CONFIG_ZONE_DEVICE
+ /*
+ * Honor reservation requested by the driver for this ZONE_DEVICE
+ * memory. We limit the total number of pages to initialize to just
+ * those that might contain the memory mapping. We will defer the
+ * ZONE_DEVICE page initialization until after we have released
+ * the hotplug lock.
+ */
+ if (zone == ZONE_DEVICE) {
+ if (!altmap)
+ return;
+
+ if (start_pfn == altmap->base_pfn)
+ start_pfn += altmap->reserve;
+ end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+ }
+#endif
+
+ for (pfn = start_pfn; pfn < end_pfn; ) {
+ /*
+ * There can be holes in boot-time mem_map[]s handed to this
+ * function. They do not exist on hotplugged memory.
+ */
+ if (context == MEMINIT_EARLY) {
+ if (overlap_memmap_init(zone, &pfn))
+ continue;
+ if (defer_init(nid, pfn, zone_end_pfn)) {
+ deferred_struct_pages = true;
+ break;
+ }
+ }
+
+ page = pfn_to_page(pfn);
+ __init_single_page(page, pfn, zone, nid);
+ if (context == MEMINIT_HOTPLUG)
+ __SetPageReserved(page);
+
+ /*
+ * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
+ * such that unmovable allocations won't be scattered all
+ * over the place during system boot.
+ */
+ if (pageblock_aligned(pfn)) {
+ set_pageblock_migratetype(page, migratetype);
+ cond_resched();
+ }
+ pfn++;
+ }
+}
+
+static void __init memmap_init_zone_range(struct zone *zone,
+ unsigned long start_pfn,
+ unsigned long end_pfn,
+ unsigned long *hole_pfn)
+{
+ unsigned long zone_start_pfn = zone->zone_start_pfn;
+ unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
+ int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
+
+ start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
+ end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
+
+ if (start_pfn >= end_pfn)
+ return;
+
+ memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
+ zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+
+ if (*hole_pfn < start_pfn)
+ init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
+
+ *hole_pfn = end_pfn;
+}
+
+static void __init memmap_init(void)
+{
+ unsigned long start_pfn, end_pfn;
+ unsigned long hole_pfn = 0;
+ int i, j, zone_id = 0, nid;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ struct pglist_data *node = NODE_DATA(nid);
+
+ for (j = 0; j < MAX_NR_ZONES; j++) {
+ struct zone *zone = node->node_zones + j;
+
+ if (!populated_zone(zone))
+ continue;
+
+ memmap_init_zone_range(zone, start_pfn, end_pfn,
+ &hole_pfn);
+ zone_id = j;
+ }
+ }
+
+#ifdef CONFIG_SPARSEMEM
+ /*
+ * Initialize the memory map for the hole in the range [memory_end,
+ * section_end].
+ * Append the pages in this hole to the highest zone in the last
+ * node.
+ * The call to init_unavailable_range() is outside the ifdef to
+ * silence the compiler warning about zone_id set but not used;
+ * for FLATMEM it is a no-op anyway.
+ */
+ end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
+ if (hole_pfn < end_pfn)
+#endif
+ init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
+}
+
+#ifdef CONFIG_ZONE_DEVICE
+static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
+ unsigned long zone_idx, int nid,
+ struct dev_pagemap *pgmap)
+{
+
+ __init_single_page(page, pfn, zone_idx, nid);
+
+ /*
+ * Mark page reserved as it will need to wait for onlining
+ * phase for it to be fully associated with a zone.
+ *
+ * We can use the non-atomic __set_bit operation for setting
+ * the flag as we are still initializing the pages.
+ */
+ __SetPageReserved(page);
+
+ /*
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
+ * and zone_device_data. It is a bug if a ZONE_DEVICE page is
+ * ever freed or placed on a driver-private list.
+ */
+ page->pgmap = pgmap;
+ page->zone_device_data = NULL;
+
+ /*
+ * Mark the block movable so that blocks are reserved for
+ * movable at startup. This will force kernel allocations
+ * to reserve their blocks rather than leaking throughout
+ * the address space during boot when many long-lived
+ * kernel allocations are made.
+ *
+ * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
+ * because this is done early in section_activate()
+ */
+ if (pageblock_aligned(pfn)) {
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ cond_resched();
+ }
+
+ /*
+ * ZONE_DEVICE pages are released directly to the driver page allocator
+ * which will set the page count to 1 when allocating the page.
+ */
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
+ pgmap->type == MEMORY_DEVICE_COHERENT)
+ set_page_count(page, 0);
+}
+
+/*
+ * With compound page geometry and when struct pages are stored in RAM, most
+ * tail pages are reused. Consequently, the number of unique struct pages to
+ * initialize is a lot smaller than the total number of struct pages being
+ * mapped. This is a paired / mild layering violation with explicit knowledge
+ * of how the sparse_vmemmap internals handle compound pages in the absence
+ * of an altmap. See vmemmap_populate_compound_pages().
+ */
+static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
+{
+ if (!vmemmap_can_optimize(altmap, pgmap))
+ return pgmap_vmemmap_nr(pgmap);
+
+ return 2 * (PAGE_SIZE / sizeof(struct page));
+}
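
To put numbers on the optimized case: assuming 4 KiB pages and a 64-byte struct page (both illustrative, config-dependent values), PAGE_SIZE / sizeof(struct page) is 64, so compound_nr_pages() returns 128, i.e. the struct pages backed by the two vmemmap pages that stay unique under the optimization, regardless of how large pgmap_vmemmap_nr() is.
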
+
+static void __ref memmap_init_compound(struct page *head,
+ unsigned long head_pfn,
+ unsigned long zone_idx, int nid,
+ struct dev_pagemap *pgmap,
+ unsigned long nr_pages)
+{
+ unsigned long pfn, end_pfn = head_pfn + nr_pages;
+ unsigned int order = pgmap->vmemmap_shift;
+
+ __SetPageHead(head);
+ for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
+ struct page *page = pfn_to_page(pfn);
+
+ __init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
+ prep_compound_tail(head, pfn - head_pfn);
+ set_page_count(page, 0);
+
+ /*
+ * The first tail page stores important compound page info.
+ * Call prep_compound_head() after the first tail page has
+ * been initialized, so that its data is not overwritten.
+ */
+ if (pfn == head_pfn + 1)
+ prep_compound_head(head, order);
+ }
+}
+
+void __ref memmap_init_zone_device(struct zone *zone,
+ unsigned long start_pfn,
+ unsigned long nr_pages,
+ struct dev_pagemap *pgmap)
+{
+ unsigned long pfn, end_pfn = start_pfn + nr_pages;
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ struct vmem_altmap *altmap = pgmap_altmap(pgmap);
+ unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap);
+ unsigned long zone_idx = zone_idx(zone);
+ unsigned long start = jiffies;
+ int nid = pgdat->node_id;
+
+ if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE))
+ return;
+
+ /*
+ * The call to memmap_init should have already taken care
+ * of the pages reserved for the memmap, so we can just jump to
+ * the end of that region and start processing the device pages.
+ */
+ if (altmap) {
+ start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+ nr_pages = end_pfn - start_pfn;
+ }
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
+ struct page *page = pfn_to_page(pfn);
+
+ __init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
+
+ if (pfns_per_compound == 1)
+ continue;
+
+ memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
+ compound_nr_pages(altmap, pgmap));
+ }
+
+ pr_debug("%s initialised %lu pages in %ums\n", __func__,
+ nr_pages, jiffies_to_msecs(jiffies - start));
+}
+#endif
+
+/*
+ * The zone ranges provided by the architecture do not include ZONE_MOVABLE
+ * because it is sized independent of architecture. Unlike the other zones,
+ * the starting point for ZONE_MOVABLE is not fixed. It may be different
+ * in each node depending on the size of each node and how evenly kernelcore
+ * is distributed. This helper function adjusts the zone ranges
+ * provided by the architecture for a given node by using the end of the
+ * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
+ * zones within a node are ordered by monotonically increasing memory addresses.
+ */
+static void __init adjust_zone_range_for_zone_movable(int nid,
+ unsigned long zone_type,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn)
+{
+ /* Only adjust if ZONE_MOVABLE is on this node */
+ if (zone_movable_pfn[nid]) {
+ /* Size ZONE_MOVABLE */
+ if (zone_type == ZONE_MOVABLE) {
+ *zone_start_pfn = zone_movable_pfn[nid];
+ *zone_end_pfn = min(node_end_pfn,
+ arch_zone_highest_possible_pfn[movable_zone]);
+
+ /* Adjust for ZONE_MOVABLE starting within this range */
+ } else if (!mirrored_kernelcore &&
+ *zone_start_pfn < zone_movable_pfn[nid] &&
+ *zone_end_pfn > zone_movable_pfn[nid]) {
+ *zone_end_pfn = zone_movable_pfn[nid];
+
+ /* Check if this whole range is within ZONE_MOVABLE */
+ } else if (*zone_start_pfn >= zone_movable_pfn[nid])
+ *zone_start_pfn = *zone_end_pfn;
+ }
+}
+
+/*
+ * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
+ * then all holes in the requested range will be accounted for.
+ */
+unsigned long __init __absent_pages_in_range(int nid,
+ unsigned long range_start_pfn,
+ unsigned long range_end_pfn)
+{
+ unsigned long nr_absent = range_end_pfn - range_start_pfn;
+ unsigned long start_pfn, end_pfn;
+ int i;
+
+ for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+ start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
+ end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
+ nr_absent -= end_pfn - start_pfn;
+ }
+ return nr_absent;
+}
+
+/**
+ * absent_pages_in_range - Return number of page frames in holes within a range
+ * @start_pfn: The start PFN to start searching for holes
+ * @end_pfn: The end PFN to stop searching for holes
+ *
+ * Return: the number of page frames in memory holes within a range.
+ */
+unsigned long __init absent_pages_in_range(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
+}
+
+/* Return the number of page frames in holes in a zone on a node */
+static unsigned long __init zone_absent_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long zone_start_pfn,
+ unsigned long zone_end_pfn)
+{
+ unsigned long nr_absent;
+
+ /* zone is empty, we don't have any absent pages */
+ if (zone_start_pfn == zone_end_pfn)
+ return 0;
+
+ nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+
+ /*
+ * ZONE_MOVABLE handling.
+ * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
+ * and vice versa.
+ */
+ if (mirrored_kernelcore && zone_movable_pfn[nid]) {
+ unsigned long start_pfn, end_pfn;
+ struct memblock_region *r;
+
+ for_each_mem_region(r) {
+ start_pfn = clamp(memblock_region_memory_base_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+ end_pfn = clamp(memblock_region_memory_end_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+
+ if (zone_type == ZONE_MOVABLE &&
+ memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+
+ if (zone_type == ZONE_NORMAL &&
+ !memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+ }
+ }
+
+ return nr_absent;
+}
+
+/*
+ * Return the number of pages a zone spans in a node, including holes
+ * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
+ */
+static unsigned long __init zone_spanned_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn)
+{
+ unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+ unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
+
+ /* Get the start and end of the zone */
+ *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+ *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
+ adjust_zone_range_for_zone_movable(nid, zone_type,
+ node_start_pfn, node_end_pfn,
+ zone_start_pfn, zone_end_pfn);
+
+ /* Check that this node has pages within the zone's required range */
+ if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
+ return 0;
+
+ /* Move the zone boundaries inside the node if necessary */
+ *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
+ *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
+
+ /* Return the spanned pages */
+ return *zone_end_pfn - *zone_start_pfn;
+}
+
+static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat)
+{
+ struct zone *z;
+
+ for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) {
+ z->zone_start_pfn = 0;
+ z->spanned_pages = 0;
+ z->present_pages = 0;
+#if defined(CONFIG_MEMORY_HOTPLUG)
+ z->present_early_pages = 0;
+#endif
+ }
+
+ pgdat->node_spanned_pages = 0;
+ pgdat->node_present_pages = 0;
+ pr_debug("On node %d totalpages: 0\n", pgdat->node_id);
+}
+
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn)
+{
+ unsigned long realtotalpages = 0, totalpages = 0;
+ enum zone_type i;
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+ unsigned long zone_start_pfn, zone_end_pfn;
+ unsigned long spanned, absent;
+ unsigned long real_size;
+
+ spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
+ node_start_pfn,
+ node_end_pfn,
+ &zone_start_pfn,
+ &zone_end_pfn);
+ absent = zone_absent_pages_in_node(pgdat->node_id, i,
+ zone_start_pfn,
+ zone_end_pfn);
+
+ real_size = spanned - absent;
+
+ if (spanned)
+ zone->zone_start_pfn = zone_start_pfn;
+ else
+ zone->zone_start_pfn = 0;
+ zone->spanned_pages = spanned;
+ zone->present_pages = real_size;
+#if defined(CONFIG_MEMORY_HOTPLUG)
+ zone->present_early_pages = real_size;
+#endif
+
+ totalpages += spanned;
+ realtotalpages += real_size;
+ }
+
+ pgdat->node_spanned_pages = totalpages;
+ pgdat->node_present_pages = realtotalpages;
+ pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
+}
+
+static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
+ unsigned long present_pages)
+{
+ unsigned long pages = spanned_pages;
+
+ /*
+ * Provide a more accurate estimation if there are holes within
+ * the zone and SPARSEMEM is in use. If there are holes within the
+ * zone, each populated memory region may cost us one or two extra
+ * memmap pages due to alignment, because the memmap pages for each
+ * populated region may not be naturally aligned on a page boundary.
+ * So the (present_pages >> 4) heuristic is a tradeoff for that.
+ */
+ if (spanned_pages > present_pages + (present_pages >> 4) &&
+ IS_ENABLED(CONFIG_SPARSEMEM))
+ pages = present_pages;
+
+ return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
+}
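
For a concrete feel for this overhead, assuming 4 KiB pages and a 64-byte struct page (illustrative values only): a fully populated 1 GiB zone spans 262144 pages and needs 262144 * 64 bytes = 16 MiB of memmap, i.e. 4096 pages. The final conversion of calc_memmap_size() as a standalone sketch:

#include <stdio.h>

#define PAGE_SHIFT		12	/* assumed: 4 KiB pages */
#define PAGE_SIZE		(1UL << PAGE_SHIFT)
#define STRUCT_PAGE_SIZE	64UL	/* assumed typical sizeof(struct page) */

/* Same conversion as the last line of calc_memmap_size(): memmap bytes -> pages. */
static unsigned long memmap_pages(unsigned long spanned_pages)
{
	unsigned long bytes = spanned_pages * STRUCT_PAGE_SIZE;

	return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;	/* PAGE_ALIGN(bytes) >> PAGE_SHIFT */
}

int main(void)
{
	/* A 1 GiB zone = 262144 pages of 4 KiB -> 4096 pages (16 MiB) of memmap. */
	printf("%lu\n", memmap_pages(1UL << (30 - PAGE_SHIFT)));
	return 0;
}
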
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void pgdat_init_split_queue(struct pglist_data *pgdat)
+{
+ struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
+
+ spin_lock_init(&ds_queue->split_queue_lock);
+ INIT_LIST_HEAD(&ds_queue->split_queue);
+ ds_queue->split_queue_len = 0;
+}
+#else
+static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
+#endif
+
+#ifdef CONFIG_COMPACTION
+static void pgdat_init_kcompactd(struct pglist_data *pgdat)
+{
+ init_waitqueue_head(&pgdat->kcompactd_wait);
+}
+#else
+static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
+#endif
+
+static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
+{
+ int i;
+
+ pgdat_resize_init(pgdat);
+ pgdat_kswapd_lock_init(pgdat);
+
+ pgdat_init_split_queue(pgdat);
+ pgdat_init_kcompactd(pgdat);
+
+ init_waitqueue_head(&pgdat->kswapd_wait);
+ init_waitqueue_head(&pgdat->pfmemalloc_wait);
+
+ for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
+ init_waitqueue_head(&pgdat->reclaim_wait[i]);
+
+ pgdat_page_ext_init(pgdat);
+ lruvec_init(&pgdat->__lruvec);
+}
+
+static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
+ unsigned long remaining_pages)
+{
+ atomic_long_set(&zone->managed_pages, remaining_pages);
+ zone_set_nid(zone, nid);
+ zone->name = zone_names[idx];
+ zone->zone_pgdat = NODE_DATA(nid);
+ spin_lock_init(&zone->lock);
+ zone_seqlock_init(zone);
+ zone_pcp_init(zone);
+}
+
+static void __meminit zone_init_free_lists(struct zone *zone)
+{
+ unsigned int order, t;
+ for_each_migratetype_order(order, t) {
+ INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
+ zone->free_area[order].nr_free = 0;
+ }
+
+#ifdef CONFIG_UNACCEPTED_MEMORY
+ INIT_LIST_HEAD(&zone->unaccepted_pages);
+#endif
+}
+
+void __meminit init_currently_empty_zone(struct zone *zone,
+ unsigned long zone_start_pfn,
+ unsigned long size)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ int zone_idx = zone_idx(zone) + 1;
+
+ if (zone_idx > pgdat->nr_zones)
+ pgdat->nr_zones = zone_idx;
+
+ zone->zone_start_pfn = zone_start_pfn;
+
+ mminit_dprintk(MMINIT_TRACE, "memmap_init",
+ "Initialising map node %d zone %lu pfns %lu -> %lu\n",
+ pgdat->node_id,
+ (unsigned long)zone_idx(zone),
+ zone_start_pfn, (zone_start_pfn + size));
+
+ zone_init_free_lists(zone);
+ zone->initialized = 1;
+}
+
+#ifndef CONFIG_SPARSEMEM
+/*
+ * Calculate the size of the zone->blockflags rounded to an unsigned long
+ * Start by making sure zonesize is a multiple of pageblock_order by rounding
+ * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
+ * round what is now in bits to nearest long in bits, then return it in
+ * bytes.
+ */
+static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
+{
+ unsigned long usemapsize;
+
+ zonesize += zone_start_pfn & (pageblock_nr_pages-1);
+ usemapsize = roundup(zonesize, pageblock_nr_pages);
+ usemapsize = usemapsize >> pageblock_order;
+ usemapsize *= NR_PAGEBLOCK_BITS;
+ usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
+
+ return usemapsize / 8;
+}
+
+static void __ref setup_usemap(struct zone *zone)
+{
+ unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
+ zone->spanned_pages);
+ zone->pageblock_flags = NULL;
+ if (usemapsize) {
+ zone->pageblock_flags =
+ memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
+ zone_to_nid(zone));
+ if (!zone->pageblock_flags)
+ panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
+ usemapsize, zone->name, zone_to_nid(zone));
+ }
+}
+#else
+static inline void setup_usemap(struct zone *zone) {}
+#endif /* CONFIG_SPARSEMEM */
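For illustration, a standalone sketch of the usemap_size() rounding above; pageblock_order == 9 and NR_PAGEBLOCK_BITS == 4 are assumed example values, not taken from this patch.

#include <stdio.h>

#define SK_PAGEBLOCK_ORDER	9	/* assumed: 2 MiB pageblocks with 4 KiB pages */
#define SK_PAGEBLOCK_NR_PAGES	(1UL << SK_PAGEBLOCK_ORDER)
#define SK_NR_PAGEBLOCK_BITS	4	/* assumed value of NR_PAGEBLOCK_BITS */
#define SK_BITS_PER_LONG	(8 * sizeof(unsigned long))

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
	return ((x + to - 1) / to) * to;
}

/* Mirror of usemap_size(): bytes of pageblock_flags for one zone. */
static unsigned long sketch_usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
{
	unsigned long usemapsize;

	zonesize += zone_start_pfn & (SK_PAGEBLOCK_NR_PAGES - 1);
	usemapsize = roundup_ul(zonesize, SK_PAGEBLOCK_NR_PAGES);
	usemapsize >>= SK_PAGEBLOCK_ORDER;		/* pageblocks */
	usemapsize *= SK_NR_PAGEBLOCK_BITS;		/* bits */
	usemapsize = roundup_ul(usemapsize, SK_BITS_PER_LONG);

	return usemapsize / 8;				/* bytes */
}

int main(void)
{
	/* A 262144-page zone starting mid-pageblock at PFN 0x100100. */
	printf("%lu bytes\n", sketch_usemap_size(0x100100, 262144));
	return 0;
}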
+
+#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
+
+/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
+void __init set_pageblock_order(void)
+{
+ unsigned int order = MAX_ORDER;
+
+ /* Check that pageblock_nr_pages has not already been setup */
+ if (pageblock_order)
+ return;
+
+ /* Don't let pageblocks exceed the maximum allocation granularity. */
+ if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
+ order = HUGETLB_PAGE_ORDER;
+
+ /*
+ * Assume the largest contiguous order of interest is a huge page.
+ * This value may be variable depending on boot parameters on IA64 and
+ * powerpc.
+ */
+ pageblock_order = order;
+}
+#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
+
+/*
+ * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
+ * is unused as pageblock_order is set at compile-time. See
+ * include/linux/pageblock-flags.h for the values of pageblock_order based on
+ * the kernel config
+ */
+void __init set_pageblock_order(void)
+{
+}
+
+#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
+
+/*
+ * Set up the zone data structures
+ * - init pgdat internals
+ * - init all zones belonging to this node
+ *
+ * NOTE: this function is only called during memory hotplug
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
+{
+ int nid = pgdat->node_id;
+ enum zone_type z;
+ int cpu;
+
+ pgdat_init_internals(pgdat);
+
+ if (pgdat->per_cpu_nodestats == &boot_nodestats)
+ pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
+
+ /*
+ * Reset the nr_zones, order and highest_zoneidx before reuse.
+ * Note that kswapd will init kswapd_highest_zoneidx properly
+ * when it starts in the near future.
+ */
+ pgdat->nr_zones = 0;
+ pgdat->kswapd_order = 0;
+ pgdat->kswapd_highest_zoneidx = 0;
+ pgdat->node_start_pfn = 0;
+ pgdat->node_present_pages = 0;
+
+ for_each_online_cpu(cpu) {
+ struct per_cpu_nodestat *p;
+
+ p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+ memset(p, 0, sizeof(*p));
+ }
+
+ /*
+ * When memory is hot-added, all the memory is in offline state. So
+ * clear all zones' present_pages and managed_pages because they will
+ * be updated in online_pages() and offline_pages().
+ */
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ zone->present_pages = 0;
+ zone_init_internals(zone, z, nid, 0);
+ }
+}
+#endif
+
+/*
+ * Set up the zone data structures:
+ * - mark all pages reserved
+ * - mark all memory queues empty
+ * - clear the memory bitmaps
+ *
+ * NOTE: pgdat should get zeroed by caller.
+ * NOTE: this function is only called during early init.
+ */
+static void __init free_area_init_core(struct pglist_data *pgdat)
+{
+ enum zone_type j;
+ int nid = pgdat->node_id;
+
+ pgdat_init_internals(pgdat);
+ pgdat->per_cpu_nodestats = &boot_nodestats;
+
+ for (j = 0; j < MAX_NR_ZONES; j++) {
+ struct zone *zone = pgdat->node_zones + j;
+ unsigned long size, freesize, memmap_pages;
+
+ size = zone->spanned_pages;
+ freesize = zone->present_pages;
+
+ /*
+ * Adjust freesize so that it accounts for how much memory
+ * is used by this zone for memmap. This affects the watermark
+ * and per-cpu initialisations
+ */
+ memmap_pages = calc_memmap_size(size, freesize);
+ if (!is_highmem_idx(j)) {
+ if (freesize >= memmap_pages) {
+ freesize -= memmap_pages;
+ if (memmap_pages)
+ pr_debug(" %s zone: %lu pages used for memmap\n",
+ zone_names[j], memmap_pages);
+ } else
+ pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n",
+ zone_names[j], memmap_pages, freesize);
+ }
+
+ /* Account for reserved pages */
+ if (j == 0 && freesize > dma_reserve) {
+ freesize -= dma_reserve;
+ pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
+ }
+
+ if (!is_highmem_idx(j))
+ nr_kernel_pages += freesize;
+ /* Charge for highmem memmap if there are enough kernel pages */
+ else if (nr_kernel_pages > memmap_pages * 2)
+ nr_kernel_pages -= memmap_pages;
+ nr_all_pages += freesize;
+
+ /*
+ * Set an approximate value for lowmem here; it will be adjusted
+ * when the bootmem allocator frees pages into the buddy system.
+ * And all highmem pages will be managed by the buddy system.
+ */
+ zone_init_internals(zone, j, nid, freesize);
+
+ if (!size)
+ continue;
+
+ setup_usemap(zone);
+ init_currently_empty_zone(zone, zone->zone_start_pfn, size);
+ }
+}
+
+void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, int nid, bool exact_nid)
+{
+ void *ptr;
+
+ if (exact_nid)
+ ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
+ MEMBLOCK_ALLOC_ACCESSIBLE,
+ nid);
+ else
+ ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
+ MEMBLOCK_ALLOC_ACCESSIBLE,
+ nid);
+
+ if (ptr && size > 0)
+ page_init_poison(ptr, size);
+
+ return ptr;
+}
+
+#ifdef CONFIG_FLATMEM
+static void __init alloc_node_mem_map(struct pglist_data *pgdat)
+{
+ unsigned long __maybe_unused start = 0;
+ unsigned long __maybe_unused offset = 0;
+
+ /* Skip empty nodes */
+ if (!pgdat->node_spanned_pages)
+ return;
+
+ start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
+ offset = pgdat->node_start_pfn - start;
+ /* ia64 gets its own node_mem_map, before this, without bootmem */
+ if (!pgdat->node_mem_map) {
+ unsigned long size, end;
+ struct page *map;
+
+ /*
+ * The zone's endpoints aren't required to be MAX_ORDER
+ * aligned, but the node_mem_map endpoints must be, in order
+ * for the buddy allocator to function correctly.
+ */
+ end = pgdat_end_pfn(pgdat);
+ end = ALIGN(end, MAX_ORDER_NR_PAGES);
+ size = (end - start) * sizeof(struct page);
+ map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
+ pgdat->node_id, false);
+ if (!map)
+ panic("Failed to allocate %ld bytes for node %d memory map\n",
+ size, pgdat->node_id);
+ pgdat->node_mem_map = map + offset;
+ }
+ pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
+ __func__, pgdat->node_id, (unsigned long)pgdat,
+ (unsigned long)pgdat->node_mem_map);
+#ifndef CONFIG_NUMA
+ /*
+ * With no DISCONTIG, the global mem_map is just set as node 0's
+ */
+ if (pgdat == NODE_DATA(0)) {
+ mem_map = NODE_DATA(0)->node_mem_map;
+ if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
+ mem_map -= offset;
+ }
+#endif
+}
+#else
+static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
+#endif /* CONFIG_FLATMEM */
+
+/**
+ * get_pfn_range_for_nid - Return the start and end page frames for a node
+ * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
+ * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
+ * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
+ *
+ * It returns the start and end page frame of a node based on information
+ * provided by memblock_set_node(). If called for a node
+ * with no available memory, a warning is printed and the start and end
+ * PFNs will be 0.
+ */
+void __init get_pfn_range_for_nid(unsigned int nid,
+ unsigned long *start_pfn, unsigned long *end_pfn)
+{
+ unsigned long this_start_pfn, this_end_pfn;
+ int i;
+
+ *start_pfn = -1UL;
+ *end_pfn = 0;
+
+ for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
+ *start_pfn = min(*start_pfn, this_start_pfn);
+ *end_pfn = max(*end_pfn, this_end_pfn);
+ }
+
+ if (*start_pfn == -1UL)
+ *start_pfn = 0;
+}
+
+static void __init free_area_init_node(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ unsigned long start_pfn = 0;
+ unsigned long end_pfn = 0;
+
+ /* pg_data_t should be reset to zero when it's allocated */
+ WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
+
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+
+ pgdat->node_id = nid;
+ pgdat->node_start_pfn = start_pfn;
+ pgdat->per_cpu_nodestats = NULL;
+
+ if (start_pfn != end_pfn) {
+ pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
+ (u64)start_pfn << PAGE_SHIFT,
+ end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+
+ calculate_node_totalpages(pgdat, start_pfn, end_pfn);
+ } else {
+ pr_info("Initmem setup node %d as memoryless\n", nid);
+
+ reset_memoryless_node_totalpages(pgdat);
+ }
+
+ alloc_node_mem_map(pgdat);
+ pgdat_set_deferred_range(pgdat);
+
+ free_area_init_core(pgdat);
+ lru_gen_init_pgdat(pgdat);
+}
+
+/* Any regular or high memory on that node? */
+static void check_for_memory(pg_data_t *pgdat)
+{
+ enum zone_type zone_type;
+
+ for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
+ struct zone *zone = &pgdat->node_zones[zone_type];
+ if (populated_zone(zone)) {
+ if (IS_ENABLED(CONFIG_HIGHMEM))
+ node_set_state(pgdat->node_id, N_HIGH_MEMORY);
+ if (zone_type <= ZONE_NORMAL)
+ node_set_state(pgdat->node_id, N_NORMAL_MEMORY);
+ break;
+ }
+ }
+}
+
+#if MAX_NUMNODES > 1
+/*
+ * Figure out the number of possible node ids.
+ */
+void __init setup_nr_node_ids(void)
+{
+ unsigned int highest;
+
+ highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
+ nr_node_ids = highest + 1;
+}
+#endif
+
+/*
+ * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For
+ * such cases we allow max_zone_pfn to be sorted in descending order.
+ */
+static bool arch_has_descending_max_zone_pfns(void)
+{
+ return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40);
+}
+
+/**
+ * free_area_init - Initialise all pg_data_t and zone data
+ * @max_zone_pfn: an array of max PFNs for each zone
+ *
+ * This will call free_area_init_node() for each active node in the system.
+ * Using the page ranges provided by memblock_set_node(), the size of each
+ * zone in each node and their holes are calculated. If the maximum PFNs
+ * of two adjacent zones match, the higher zone is assumed to be empty.
+ * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
+ * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
+ * starts where the previous one ended. For example, ZONE_DMA32 starts
+ * at arch_max_dma_pfn.
+ */
+void __init free_area_init(unsigned long *max_zone_pfn)
+{
+ unsigned long start_pfn, end_pfn;
+ int i, nid, zone;
+ bool descending;
+
+ /* Record where the zone boundaries are */
+ memset(arch_zone_lowest_possible_pfn, 0,
+ sizeof(arch_zone_lowest_possible_pfn));
+ memset(arch_zone_highest_possible_pfn, 0,
+ sizeof(arch_zone_highest_possible_pfn));
+
+ start_pfn = PHYS_PFN(memblock_start_of_DRAM());
+ descending = arch_has_descending_max_zone_pfns();
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ if (descending)
+ zone = MAX_NR_ZONES - i - 1;
+ else
+ zone = i;
+
+ if (zone == ZONE_MOVABLE)
+ continue;
+
+ end_pfn = max(max_zone_pfn[zone], start_pfn);
+ arch_zone_lowest_possible_pfn[zone] = start_pfn;
+ arch_zone_highest_possible_pfn[zone] = end_pfn;
+
+ start_pfn = end_pfn;
+ }
+
+ /* Find the PFNs that ZONE_MOVABLE begins at in each node */
+ memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
+ find_zone_movable_pfns_for_nodes();
+
+ /* Print out the zone ranges */
+ pr_info("Zone ranges:\n");
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ if (i == ZONE_MOVABLE)
+ continue;
+ pr_info(" %-8s ", zone_names[i]);
+ if (arch_zone_lowest_possible_pfn[i] ==
+ arch_zone_highest_possible_pfn[i])
+ pr_cont("empty\n");
+ else
+ pr_cont("[mem %#018Lx-%#018Lx]\n",
+ (u64)arch_zone_lowest_possible_pfn[i]
+ << PAGE_SHIFT,
+ ((u64)arch_zone_highest_possible_pfn[i]
+ << PAGE_SHIFT) - 1);
+ }
+
+ /* Print out the PFNs ZONE_MOVABLE begins at in each node */
+ pr_info("Movable zone start for each node\n");
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ if (zone_movable_pfn[i])
+ pr_info(" Node %d: %#018Lx\n", i,
+ (u64)zone_movable_pfn[i] << PAGE_SHIFT);
+ }
+
+ /*
+ * Print out the early node map, and initialize the
+ * subsection-map relative to active online memory ranges to
+ * enable future "sub-section" extensions of the memory map.
+ */
+ pr_info("Early memory node ranges\n");
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
+ (u64)start_pfn << PAGE_SHIFT,
+ ((u64)end_pfn << PAGE_SHIFT) - 1);
+ subsection_map_init(start_pfn, end_pfn - start_pfn);
+ }
+
+ /* Initialise every node */
+ mminit_verify_pageflags_layout();
+ setup_nr_node_ids();
+ set_pageblock_order();
+
+ for_each_node(nid) {
+ pg_data_t *pgdat;
+
+ if (!node_online(nid)) {
+ pr_info("Initializing node %d as memoryless\n", nid);
+
+ /* Allocator not initialized yet */
+ pgdat = arch_alloc_nodedata(nid);
+ if (!pgdat)
+ panic("Cannot allocate %zuB for node %d.\n",
+ sizeof(*pgdat), nid);
+ arch_refresh_nodedata(nid, pgdat);
+ free_area_init_node(nid);
+
+ /*
+ * We do not want to confuse userspace with sysfs
+ * files/directories for a node without any memory
+ * attached to it, so this node is not marked as
+ * N_MEMORY and not marked online so that no sysfs
+ * hierarchy will be created via register_one_node for
+ * it. The pgdat will get fully initialized by
+ * hotadd_init_pgdat() when memory is hotplugged into
+ * this node.
+ */
+ continue;
+ }
+
+ pgdat = NODE_DATA(nid);
+ free_area_init_node(nid);
+
+ /* Any memory on that node */
+ if (pgdat->node_present_pages)
+ node_set_state(nid, N_MEMORY);
+ check_for_memory(pgdat);
+ }
+
+ memmap_init();
+
+ /* disable hash distribution for systems with a single node */
+ fixup_hashdist();
+}
+
+/**
+ * node_map_pfn_alignment - determine the maximum internode alignment
+ *
+ * This function should be called after node map is populated and sorted.
+ * It calculates the maximum power of two alignment which can distinguish
+ * all the nodes.
+ *
+ * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
+ * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
+ * nodes are shifted by 256MiB, 256MiB is returned. Note that if only the
+ * last node is shifted, 1GiB is enough and this function will indicate so.
+ *
+ * This is used to test whether pfn -> nid mapping of the chosen memory
+ * model has fine enough granularity to avoid incorrect mapping for the
+ * populated node map.
+ *
+ * Return: the determined alignment in PFNs. 0 if there is no alignment
+ * requirement (single node).
+ */
+unsigned long __init node_map_pfn_alignment(void)
+{
+ unsigned long accl_mask = 0, last_end = 0;
+ unsigned long start, end, mask;
+ int last_nid = NUMA_NO_NODE;
+ int i, nid;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
+ if (!start || last_nid < 0 || last_nid == nid) {
+ last_nid = nid;
+ last_end = end;
+ continue;
+ }
+
+ /*
+ * Start with a mask granular enough to pin-point to the
+ * start pfn and tick off bits one-by-one until it becomes
+ * too coarse to separate the current node from the last.
+ */
+ mask = ~((1 << __ffs(start)) - 1);
+ while (mask && last_end <= (start & (mask << 1)))
+ mask <<= 1;
+
+ /* accumulate all internode masks */
+ accl_mask |= mask;
+ }
+
+ /* convert mask to number of pages */
+ return ~accl_mask + 1;
+}
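The mask arithmetic above is easier to follow on a single node boundary. The standalone sketch below mirrors the per-boundary step only; the PFN values are invented, and __ffs() is replaced by a local lowest-bit helper. It reproduces the 1GiB example from the comment.

#include <stdio.h>

/* Lowest set bit of x, standing in for 1 << __ffs(x). */
static unsigned long lowest_bit(unsigned long x)
{
	return x & -x;
}

/*
 * Mirror of the per-boundary step in node_map_pfn_alignment(): given the end
 * of the previous node and the start of the next one, find the coarsest
 * power-of-two granule that still tells them apart.
 */
static unsigned long boundary_mask(unsigned long last_end, unsigned long start)
{
	unsigned long mask = ~(lowest_bit(start) - 1);

	while (mask && last_end <= (start & (mask << 1)))
		mask <<= 1;
	return mask;
}

int main(void)
{
	/* Node 0 ends at PFN 0x40000 (1 GiB with 4 KiB pages); node 1 starts there. */
	unsigned long accl_mask = boundary_mask(0x40000, 0x40000);

	/* Prints 262144 PFNs, i.e. 1 GiB alignment. */
	printf("alignment: %lu pfns\n", ~accl_mask + 1);
	return 0;
}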
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static void __init deferred_free_range(unsigned long pfn,
+ unsigned long nr_pages)
+{
+ struct page *page;
+ unsigned long i;
+
+ if (!nr_pages)
+ return;
+
+ page = pfn_to_page(pfn);
+
+ /* Free a large naturally-aligned chunk if possible */
+ if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
+ for (i = 0; i < nr_pages; i += pageblock_nr_pages)
+ set_pageblock_migratetype(page + i, MIGRATE_MOVABLE);
+ __free_pages_core(page, MAX_ORDER);
+ return;
+ }
+
+ /* Accept chunks smaller than MAX_ORDER upfront */
+ accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages));
+
+ for (i = 0; i < nr_pages; i++, page++, pfn++) {
+ if (pageblock_aligned(pfn))
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ __free_pages_core(page, 0);
+ }
+}
+
+/* Completion tracking for deferred_init_memmap() threads */
+static atomic_t pgdat_init_n_undone __initdata;
+static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
+
+static inline void __init pgdat_init_report_one_done(void)
+{
+ if (atomic_dec_and_test(&pgdat_init_n_undone))
+ complete(&pgdat_init_all_done_comp);
+}
+
+/*
+ * Returns true if the page needs to be initialized or freed to the buddy
+ * allocator.
+ *
+ * We check whether the current MAX_ORDER block is valid by only checking the
+ * validity of the head pfn.
+ */
+static inline bool __init deferred_pfn_valid(unsigned long pfn)
+{
+ if (IS_MAX_ORDER_ALIGNED(pfn) && !pfn_valid(pfn))
+ return false;
+ return true;
+}
+
+/*
+ * Free pages to the buddy allocator. Try to free aligned pages in
+ * MAX_ORDER_NR_PAGES-sized chunks.
+ */
+static void __init deferred_free_pages(unsigned long pfn,
+ unsigned long end_pfn)
+{
+ unsigned long nr_free = 0;
+
+ for (; pfn < end_pfn; pfn++) {
+ if (!deferred_pfn_valid(pfn)) {
+ deferred_free_range(pfn - nr_free, nr_free);
+ nr_free = 0;
+ } else if (IS_MAX_ORDER_ALIGNED(pfn)) {
+ deferred_free_range(pfn - nr_free, nr_free);
+ nr_free = 1;
+ } else {
+ nr_free++;
+ }
+ }
+ /* Free the last block of pages to allocator */
+ deferred_free_range(pfn - nr_free, nr_free);
+}
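A standalone sketch of the run-splitting behaviour above: runs of valid PFNs are flushed at holes and at MAX_ORDER boundaries, so no flushed run straddles a MAX_ORDER block. The MAX_ORDER_NR_PAGES value and the PFN range are assumptions chosen for illustration.

#include <stdio.h>

#define SK_MAX_ORDER_NR_PAGES	1024UL	/* assumed 4 MiB chunks with 4 KiB pages */

static int sk_aligned(unsigned long pfn)
{
	return (pfn & (SK_MAX_ORDER_NR_PAGES - 1)) == 0;
}

/* Stand-in for deferred_free_range(): just report each flushed run. */
static void sk_flush(unsigned long pfn, unsigned long nr)
{
	if (nr)
		printf("free [%#lx, %#lx)\n", pfn, pfn + nr);
}

/* Mirror of the deferred_free_pages() loop. */
static void sk_free_pages(unsigned long pfn, unsigned long end_pfn,
			  int (*valid)(unsigned long))
{
	unsigned long nr_free = 0;

	for (; pfn < end_pfn; pfn++) {
		if (!valid(pfn)) {
			sk_flush(pfn - nr_free, nr_free);
			nr_free = 0;
		} else if (sk_aligned(pfn)) {
			sk_flush(pfn - nr_free, nr_free);
			nr_free = 1;
		} else {
			nr_free++;
		}
	}
	/* Flush the last accumulated run. */
	sk_flush(pfn - nr_free, nr_free);
}

static int always_valid(unsigned long pfn) { (void)pfn; return 1; }

int main(void)
{
	/* Freeing [0x3f0, 0xc10) splits at the 0x400, 0x800 and 0xc00 boundaries. */
	sk_free_pages(0x3f0, 0xc10, always_valid);
	return 0;
}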
+
+/*
+ * Initialize struct pages. We minimize pfn page lookups and scheduler checks
+ * by performing them only once every MAX_ORDER_NR_PAGES.
+ * Returns the number of pages initialized.
+ */
+static unsigned long __init deferred_init_pages(struct zone *zone,
+ unsigned long pfn,
+ unsigned long end_pfn)
+{
+ int nid = zone_to_nid(zone);
+ unsigned long nr_pages = 0;
+ int zid = zone_idx(zone);
+ struct page *page = NULL;
+
+ for (; pfn < end_pfn; pfn++) {
+ if (!deferred_pfn_valid(pfn)) {
+ page = NULL;
+ continue;
+ } else if (!page || IS_MAX_ORDER_ALIGNED(pfn)) {
+ page = pfn_to_page(pfn);
+ } else {
+ page++;
+ }
+ __init_single_page(page, pfn, zid, nid);
+ nr_pages++;
+ }
+ return nr_pages;
+}
+
+/*
+ * This function is meant to pre-load the iterator for the zone init.
+ * Specifically it walks through the ranges until we are caught up to the
+ * first_init_pfn value and exits there. If we never encounter the value we
+ * return false indicating there are no valid ranges left.
+ */
+static bool __init
+deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
+ unsigned long *spfn, unsigned long *epfn,
+ unsigned long first_init_pfn)
+{
+ u64 j;
+
+ /*
+ * Start out by walking through the ranges in this zone that have
+ * already been initialized. We don't need to do anything with them
+ * so we just need to flush them out of the system.
+ */
+ for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
+ if (*epfn <= first_init_pfn)
+ continue;
+ if (*spfn < first_init_pfn)
+ *spfn = first_init_pfn;
+ *i = j;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Initialize and free pages. We do it in two loops: first we initialize
+ * struct page, then free them to the buddy allocator, because while we are
+ * freeing pages we can access pages that are ahead (computing buddy
+ * page in __free_one_page()).
+ *
+ * In order to try and keep some memory in the cache we have the loop
+ * broken along max page order boundaries. This way we will not cause
+ * any issues with the buddy page computation.
+ */
+static unsigned long __init
+deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
+ unsigned long *end_pfn)
+{
+ unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
+ unsigned long spfn = *start_pfn, epfn = *end_pfn;
+ unsigned long nr_pages = 0;
+ u64 j = *i;
+
+ /* First we loop through and initialize the page values */
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
+ unsigned long t;
+
+ if (mo_pfn <= *start_pfn)
+ break;
+
+ t = min(mo_pfn, *end_pfn);
+ nr_pages += deferred_init_pages(zone, *start_pfn, t);
+
+ if (mo_pfn < *end_pfn) {
+ *start_pfn = mo_pfn;
+ break;
+ }
+ }
+
+ /* Reset values and now loop through freeing pages as needed */
+ swap(j, *i);
+
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
+ unsigned long t;
+
+ if (mo_pfn <= spfn)
+ break;
+
+ t = min(mo_pfn, epfn);
+ deferred_free_pages(spfn, t);
+
+ if (mo_pfn <= epfn)
+ break;
+ }
+
+ return nr_pages;
+}
+
+static void __init
+deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
+ void *arg)
+{
+ unsigned long spfn, epfn;
+ struct zone *zone = arg;
+ u64 i;
+
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
+
+ /*
+ * Initialize and free pages in MAX_ORDER sized increments so that we
+ * can avoid introducing any issues with the buddy allocator.
+ */
+ while (spfn < end_pfn) {
+ deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ cond_resched();
+ }
+}
+
+/* An arch may override for more concurrency. */
+__weak int __init
+deferred_page_init_max_threads(const struct cpumask *node_cpumask)
+{
+ return 1;
+}
+
+/* Initialise remaining memory on a node */
+static int __init deferred_init_memmap(void *data)
+{
+ pg_data_t *pgdat = data;
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+ unsigned long spfn = 0, epfn = 0;
+ unsigned long first_init_pfn, flags;
+ unsigned long start = jiffies;
+ struct zone *zone;
+ int zid, max_threads;
+ u64 i;
+
+ /* Bind memory initialisation thread to a local node if possible */
+ if (!cpumask_empty(cpumask))
+ set_cpus_allowed_ptr(current, cpumask);
+
+ pgdat_resize_lock(pgdat, &flags);
+ first_init_pfn = pgdat->first_deferred_pfn;
+ if (first_init_pfn == ULONG_MAX) {
+ pgdat_resize_unlock(pgdat, &flags);
+ pgdat_init_report_one_done();
+ return 0;
+ }
+
+ /* Sanity check boundaries */
+ BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
+ BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
+ pgdat->first_deferred_pfn = ULONG_MAX;
+
+ /*
+ * Once we unlock here, the zone cannot be grown anymore, thus if an
+ * interrupt thread must allocate this early in boot, the zone must be
+ * pre-grown prior to the start of deferred page initialization.
+ */
+ pgdat_resize_unlock(pgdat, &flags);
+
+ /* Only the highest zone is deferred so find it */
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ zone = pgdat->node_zones + zid;
+ if (first_init_pfn < zone_end_pfn(zone))
+ break;
+ }
+
+ /* If the zone is empty somebody else may have cleared out the zone */
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ first_init_pfn))
+ goto zone_empty;
+
+ max_threads = deferred_page_init_max_threads(cpumask);
+
+ while (spfn < epfn) {
+ unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
+ struct padata_mt_job job = {
+ .thread_fn = deferred_init_memmap_chunk,
+ .fn_arg = zone,
+ .start = spfn,
+ .size = epfn_align - spfn,
+ .align = PAGES_PER_SECTION,
+ .min_chunk = PAGES_PER_SECTION,
+ .max_threads = max_threads,
+ };
+
+ padata_do_multithreaded(&job);
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ epfn_align);
+ }
+zone_empty:
+ /* Sanity check that the next zone really is unpopulated */
+ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
+
+ pr_info("node %d deferred pages initialised in %ums\n",
+ pgdat->node_id, jiffies_to_msecs(jiffies - start));
+
+ pgdat_init_report_one_done();
+ return 0;
+}
+
+/*
+ * If this zone has deferred pages, try to grow it by initializing enough
+ * deferred pages to satisfy the allocation specified by order, rounded up to
+ * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
+ * of SECTION_SIZE bytes by initializing struct pages in increments of
+ * PAGES_PER_SECTION * sizeof(struct page) bytes.
+ *
+ * Return true when zone was grown, otherwise return false. We return true even
+ * when we grow less than requested, to let the caller decide if there are
+ * enough pages to satisfy the allocation.
+ *
+ * Note: We use noinline because this function is needed only during boot, and
+ * it is called from a __ref function _deferred_grow_zone. This way we are
+ * making sure that it is not inlined into permanent text section.
+ */
+bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+ unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
+ pg_data_t *pgdat = zone->zone_pgdat;
+ unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
+ unsigned long spfn, epfn, flags;
+ unsigned long nr_pages = 0;
+ u64 i;
+
+ /* Only the last zone may have deferred pages */
+ if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
+ return false;
+
+ pgdat_resize_lock(pgdat, &flags);
+
+ /*
+ * If someone grew this zone while we were waiting for the spinlock, return
+ * true, as there might be enough pages already.
+ */
+ if (first_deferred_pfn != pgdat->first_deferred_pfn) {
+ pgdat_resize_unlock(pgdat, &flags);
+ return true;
+ }
+
+ /* If the zone is empty somebody else may have cleared out the zone */
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ first_deferred_pfn)) {
+ pgdat->first_deferred_pfn = ULONG_MAX;
+ pgdat_resize_unlock(pgdat, &flags);
+ /* Retry only once. */
+ return first_deferred_pfn != ULONG_MAX;
+ }
+
+ /*
+ * Initialize and free pages in MAX_ORDER sized increments so
+ * that we can avoid introducing any issues with the buddy
+ * allocator.
+ */
+ while (spfn < epfn) {
+ /* update our first deferred PFN for this section */
+ first_deferred_pfn = spfn;
+
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ touch_nmi_watchdog();
+
+ /* We should only stop along section boundaries */
+ if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
+ continue;
+
+ /* If our quota has been met we can stop here */
+ if (nr_pages >= nr_pages_needed)
+ break;
+ }
+
+ pgdat->first_deferred_pfn = spfn;
+ pgdat_resize_unlock(pgdat, &flags);
+
+ return nr_pages > 0;
+}
+
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
+#ifdef CONFIG_CMA
+void __init init_cma_reserved_pageblock(struct page *page)
+{
+ unsigned i = pageblock_nr_pages;
+ struct page *p = page;
+
+ do {
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
+ } while (++p, --i);
+
+ set_pageblock_migratetype(page, MIGRATE_CMA);
+ set_page_refcounted(page);
+ __free_pages(page, pageblock_order);
+
+ adjust_managed_page_count(page, pageblock_nr_pages);
+ page_zone(page)->cma_pages += pageblock_nr_pages;
+}
+#endif
+
+void set_zone_contiguous(struct zone *zone)
+{
+ unsigned long block_start_pfn = zone->zone_start_pfn;
+ unsigned long block_end_pfn;
+
+ block_end_pfn = pageblock_end_pfn(block_start_pfn);
+ for (; block_start_pfn < zone_end_pfn(zone);
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
+
+ block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
+
+ if (!__pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, zone))
+ return;
+ cond_resched();
+ }
+
+ /* No hole was found, so the whole zone is contiguous */
+ zone->contiguous = true;
+}
+
+void __init page_alloc_init_late(void)
+{
+ struct zone *zone;
+ int nid;
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+
+ /* There will be num_node_state(N_MEMORY) threads */
+ atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
+ for_each_node_state(nid, N_MEMORY) {
+ kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
+ }
+
+ /* Block until all are initialised */
+ wait_for_completion(&pgdat_init_all_done_comp);
+
+ /*
+ * We initialized the rest of the deferred pages. Permanently disable
+ * on-demand struct page initialization.
+ */
+ static_branch_disable(&deferred_pages);
+
+ /* Reinit limits that are based on free pages after the kernel is up */
+ files_maxfiles_init();
+#endif
+
+ buffer_init();
+
+ /* Discard memblock private memory */
+ memblock_discard();
+
+ for_each_node_state(nid, N_MEMORY)
+ shuffle_free_memory(NODE_DATA(nid));
+
+ for_each_populated_zone(zone)
+ set_zone_contiguous(zone);
+
+ /* Initialize page ext after all struct pages are initialized. */
+ if (deferred_struct_pages)
+ page_ext_init();
+
+ page_alloc_sysctl_init();
+}
+
+#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
+/*
+ * Returns the number of pages that the arch has reserved but
+ * that are not known to alloc_large_system_hash().
+ */
+static unsigned long __init arch_reserved_kernel_pages(void)
+{
+ return 0;
+}
+#endif
+
+/*
+ * Adaptive scale is meant to reduce the sizes of hash tables on large-memory
+ * machines. As the memory size increases, the scale also increases, but at a
+ * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
+ * quadruples the scale is increased by one, which means the size of the hash
+ * table only doubles, instead of quadrupling as well.
+ * Because 32-bit systems cannot have the large physical memory where this
+ * scaling makes sense, it is disabled on such platforms.
+ */
+#if __BITS_PER_LONG > 32
+#define ADAPT_SCALE_BASE (64ul << 30)
+#define ADAPT_SCALE_SHIFT 2
+#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
+#endif
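A standalone sketch of the adaptive-scale loop used by alloc_large_system_hash() below; the 4 KiB page size and the starting scale of 13 are assumed example values, not taken from this patch.

#include <stdio.h>

#define SK_PAGE_SHIFT		12			/* assumed 4 KiB pages */
#define SK_ADAPT_SCALE_BASE	(64UL << 30)		/* 64 GiB */
#define SK_ADAPT_SCALE_SHIFT	2
#define SK_ADAPT_SCALE_NPAGES	(SK_ADAPT_SCALE_BASE >> SK_PAGE_SHIFT)

/*
 * Mirror of the adaptive-scale loop: each time memory quadruples past 64 GiB,
 * scale grows by one, so the table size only doubles instead of quadrupling.
 */
static int sketch_adapted_scale(unsigned long numentries, int scale)
{
	unsigned long adapt;

	for (adapt = SK_ADAPT_SCALE_NPAGES; adapt < numentries;
	     adapt <<= SK_ADAPT_SCALE_SHIFT)
		scale++;
	return scale;
}

int main(void)
{
	/* 64 GiB, 256 GiB and 1 TiB of memory (in pages), starting from scale 13. */
	printf("%d %d %d\n",
	       sketch_adapted_scale(64UL << (30 - SK_PAGE_SHIFT), 13),
	       sketch_adapted_scale(256UL << (30 - SK_PAGE_SHIFT), 13),
	       sketch_adapted_scale(1024UL << (30 - SK_PAGE_SHIFT), 13));
	return 0;
}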
+
+/*
+ * allocate a large system hash table from bootmem
+ * - it is assumed that the hash table must contain an exact power-of-2
+ * quantity of entries
+ * - limit is the number of hash buckets, not the total allocation size
+ */
+void *__init alloc_large_system_hash(const char *tablename,
+ unsigned long bucketsize,
+ unsigned long numentries,
+ int scale,
+ int flags,
+ unsigned int *_hash_shift,
+ unsigned int *_hash_mask,
+ unsigned long low_limit,
+ unsigned long high_limit)
+{
+ unsigned long long max = high_limit;
+ unsigned long log2qty, size;
+ void *table;
+ gfp_t gfp_flags;
+ bool virt;
+ bool huge;
+
+ /* allow the kernel cmdline to have a say */
+ if (!numentries) {
+ /* round applicable memory size up to nearest megabyte */
+ numentries = nr_kernel_pages;
+ numentries -= arch_reserved_kernel_pages();
+
+ /* It isn't necessary when PAGE_SIZE >= 1MB */
+ if (PAGE_SIZE < SZ_1M)
+ numentries = round_up(numentries, SZ_1M / PAGE_SIZE);
+
+#if __BITS_PER_LONG > 32
+ if (!high_limit) {
+ unsigned long adapt;
+
+ for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
+ adapt <<= ADAPT_SCALE_SHIFT)
+ scale++;
+ }
+#endif
+
+ /* limit to 1 bucket per 2^scale bytes of low memory */
+ if (scale > PAGE_SHIFT)
+ numentries >>= (scale - PAGE_SHIFT);
+ else
+ numentries <<= (PAGE_SHIFT - scale);
+
+ /* Make sure we've got at least a 0-order allocation.. */
+ if (unlikely(flags & HASH_SMALL)) {
+ /* Makes no sense without HASH_EARLY */
+ WARN_ON(!(flags & HASH_EARLY));
+ if (!(numentries >> *_hash_shift)) {
+ numentries = 1UL << *_hash_shift;
+ BUG_ON(!numentries);
+ }
+ } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+ numentries = PAGE_SIZE / bucketsize;
+ }
+ numentries = roundup_pow_of_two(numentries);
+
+ /* limit allocation size to 1/16 total memory by default */
+ if (max == 0) {
+ max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
+ do_div(max, bucketsize);
+ }
+ max = min(max, 0x80000000ULL);
+
+ if (numentries < low_limit)
+ numentries = low_limit;
+ if (numentries > max)
+ numentries = max;
+
+ log2qty = ilog2(numentries);
+
+ gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
+ do {
+ virt = false;
+ size = bucketsize << log2qty;
+ if (flags & HASH_EARLY) {
+ if (flags & HASH_ZERO)
+ table = memblock_alloc(size, SMP_CACHE_BYTES);
+ else
+ table = memblock_alloc_raw(size,
+ SMP_CACHE_BYTES);
+ } else if (get_order(size) > MAX_ORDER || hashdist) {
+ table = vmalloc_huge(size, gfp_flags);
+ virt = true;
+ if (table)
+ huge = is_vm_area_hugepages(table);
+ } else {
+ /*
+ * If bucketsize is not a power of two, we may free
+ * some pages at the end of the hash table, which
+ * alloc_pages_exact() does automatically.
+ */
+ table = alloc_pages_exact(size, gfp_flags);
+ kmemleak_alloc(table, size, 1, gfp_flags);
+ }
+ } while (!table && size > PAGE_SIZE && --log2qty);
+
+ if (!table)
+ panic("Failed to allocate %s hash table\n", tablename);
+
+ pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
+ tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
+ virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
+
+ if (_hash_shift)
+ *_hash_shift = log2qty;
+ if (_hash_mask)
+ *_hash_mask = (1 << log2qty) - 1;
+
+ return table;
+}
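For reference, a hedged sketch of a typical boot-time caller of alloc_large_system_hash(); every my_* name and the scale/flags choices are illustrative and not taken from this patch, though real users such as the inode and dentry caches follow the same pattern.

/* Hedged caller sketch: all "my_*" names are illustrative. */
static struct hlist_head *my_hashtable __ro_after_init;
static unsigned int my_hash_shift __ro_after_init;

static void __init my_hash_init(void)
{
	my_hashtable = alloc_large_system_hash("my-cache",
					       sizeof(struct hlist_head),
					       0,		/* numentries: auto-size from memory */
					       14,		/* scale: one bucket per 16 KiB of low memory */
					       HASH_ZERO,	/* zero the buckets at allocation */
					       &my_hash_shift,	/* receives log2 of the entry count */
					       NULL,		/* hash mask not needed */
					       0, 0);		/* no explicit low/high limits */
}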
+
+/**
+ * set_dma_reserve - set the specified number of pages reserved in the first zone
+ * @new_dma_reserve: The number of pages to mark reserved
+ *
+ * The per-cpu batchsize and zone watermarks are determined by managed_pages.
+ * In the DMA zone, a significant percentage may be consumed by kernel image
+ * and other unfreeable allocations which can skew the watermarks badly. This
+ * function may optionally be used to account for unfreeable pages in the
+ * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
+ * smaller per-cpu batchsize.
+ */
+void __init set_dma_reserve(unsigned long new_dma_reserve)
+{
+ dma_reserve = new_dma_reserve;
+}
+
+void __init memblock_free_pages(struct page *page, unsigned long pfn,
+ unsigned int order)
+{
+
+ if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) {
+ int nid = early_pfn_to_nid(pfn);
+
+ if (!early_page_initialised(pfn, nid))
+ return;
+ }
+
+ if (!kmsan_memblock_free_pages(page, order)) {
+ /* KMSAN will take care of these pages. */
+ return;
+ }
+ __free_pages_core(page, order);
+}
+
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
+EXPORT_SYMBOL(init_on_alloc);
+
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
+EXPORT_SYMBOL(init_on_free);
+
+static bool _init_on_alloc_enabled_early __read_mostly
+ = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
+static int __init early_init_on_alloc(char *buf)
+{
+
+ return kstrtobool(buf, &_init_on_alloc_enabled_early);
+}
+early_param("init_on_alloc", early_init_on_alloc);
+
+static bool _init_on_free_enabled_early __read_mostly
+ = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
+static int __init early_init_on_free(char *buf)
+{
+ return kstrtobool(buf, &_init_on_free_enabled_early);
+}
+early_param("init_on_free", early_init_on_free);
+
+DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
+
+/*
+ * Enable static keys related to various memory debugging and hardening options.
+ * Some override others, and depend on early params that are evaluated in the
+ * order of appearance. So we need to first gather the full picture of what was
+ * enabled, and then make decisions.
+ */
+static void __init mem_debugging_and_hardening_init(void)
+{
+ bool page_poisoning_requested = false;
+ bool want_check_pages = false;
+
+#ifdef CONFIG_PAGE_POISONING
+ /*
+ * Page poisoning is debug page alloc for some arches. If
+ * either of those options is enabled, enable poisoning.
+ */
+ if (page_poisoning_enabled() ||
+ (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
+ debug_pagealloc_enabled())) {
+ static_branch_enable(&_page_poisoning_enabled);
+ page_poisoning_requested = true;
+ want_check_pages = true;
+ }
+#endif
+
+ if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
+ page_poisoning_requested) {
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
+ "will take precedence over init_on_alloc and init_on_free\n");
+ _init_on_alloc_enabled_early = false;
+ _init_on_free_enabled_early = false;
+ }
+
+ if (_init_on_alloc_enabled_early) {
+ want_check_pages = true;
+ static_branch_enable(&init_on_alloc);
+ } else {
+ static_branch_disable(&init_on_alloc);
+ }
+
+ if (_init_on_free_enabled_early) {
+ want_check_pages = true;
+ static_branch_enable(&init_on_free);
+ } else {
+ static_branch_disable(&init_on_free);
+ }
+
+ if (IS_ENABLED(CONFIG_KMSAN) &&
+ (_init_on_alloc_enabled_early || _init_on_free_enabled_early))
+ pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ if (debug_pagealloc_enabled()) {
+ want_check_pages = true;
+ static_branch_enable(&_debug_pagealloc_enabled);
+
+ if (debug_guardpage_minorder())
+ static_branch_enable(&_debug_guardpage_enabled);
+ }
+#endif
+
+ /*
+ * Any page debugging or hardening option also enables sanity checking
+ * of struct pages being allocated or freed. With CONFIG_DEBUG_VM it's
+ * enabled already.
+ */
+ if (!IS_ENABLED(CONFIG_DEBUG_VM) && want_check_pages)
+ static_branch_enable(&check_pages_enabled);
+}
+
+/* Report memory auto-initialization states for this boot. */
+static void __init report_meminit(void)
+{
+ const char *stack;
+
+ if (IS_ENABLED(CONFIG_INIT_STACK_ALL_PATTERN))
+ stack = "all(pattern)";
+ else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
+ stack = "all(zero)";
+ else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
+ stack = "byref_all(zero)";
+ else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
+ stack = "byref(zero)";
+ else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
+ stack = "__user(zero)";
+ else
+ stack = "off";
+
+ pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n",
+ stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off",
+ want_init_on_free() ? "on" : "off");
+ if (want_init_on_free())
+ pr_info("mem auto-init: clearing system memory may take some time...\n");
+}
+
+static void __init mem_init_print_info(void)
+{
+ unsigned long physpages, codesize, datasize, rosize, bss_size;
+ unsigned long init_code_size, init_data_size;
+
+ physpages = get_num_physpages();
+ codesize = _etext - _stext;
+ datasize = _edata - _sdata;
+ rosize = __end_rodata - __start_rodata;
+ bss_size = __bss_stop - __bss_start;
+ init_data_size = __init_end - __init_begin;
+ init_code_size = _einittext - _sinittext;
+
+ /*
+ * Detect special cases and adjust section sizes accordingly:
+ * 1) .init.* may be embedded into .data sections
+ * 2) .init.text.* may be out of [__init_begin, __init_end],
+ * please refer to arch/tile/kernel/vmlinux.lds.S.
+ * 3) .rodata.* may be embedded into .text or .data sections.
+ */
+#define adj_init_size(start, end, size, pos, adj) \
+ do { \
+ if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
+ size -= adj; \
+ } while (0)
+
+ adj_init_size(__init_begin, __init_end, init_data_size,
+ _sinittext, init_code_size);
+ adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
+ adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
+ adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
+ adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
+
+#undef adj_init_size
+
+ pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
+#ifdef CONFIG_HIGHMEM
+ ", %luK highmem"
+#endif
+ ")\n",
+ K(nr_free_pages()), K(physpages),
+ codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K,
+ (init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K,
+ K(physpages - totalram_pages() - totalcma_pages),
+ K(totalcma_pages)
+#ifdef CONFIG_HIGHMEM
+ , K(totalhigh_pages())
+#endif
+ );
+}
+
+/*
+ * Set up kernel memory allocators
+ */
+void __init mm_core_init(void)
+{
+ /* Initializations relying on SMP setup */
+ build_all_zonelists(NULL);
+ page_alloc_init_cpuhp();
+
+ /*
+ * page_ext requires contiguous pages
+ * bigger than MAX_ORDER, unless SPARSEMEM is enabled.
+ */
+ page_ext_init_flatmem();
+ mem_debugging_and_hardening_init();
+ kfence_alloc_pool();
+ report_meminit();
+ kmsan_init_shadow();
+ stack_depot_early_init();
+ mem_init();
+ mem_init_print_info();
+ kmem_cache_init();
+ /*
+ * page_owner must be initialized after buddy is ready, and also after
+ * slab is ready so that stack_depot_init() works properly
+ */
+ page_ext_init_flatmem_late();
+ kmemleak_init();
+ ptlock_cache_init();
+ pgtable_cache_init();
+ debug_objects_mem_init();
+ vmalloc_init();
+ /* If struct page init was not deferred, init page_ext now, as vmap is fully initialized */
+ if (!deferred_struct_pages)
+ page_ext_init();
+ /* Should be run before the first non-init thread is created */
+ init_espfix_bsp();
+ /* Should be run after espfix64 is set up. */
+ pti_init();
+ kmsan_init_runtime();
+ mm_cache_init();
+}
diff --git a/mm/mm_slot.h b/mm/mm_slot.h
new file mode 100644
index 000000000000..83f18ed1c4bd
--- /dev/null
+++ b/mm/mm_slot.h
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef _LINUX_MM_SLOT_H
+#define _LINUX_MM_SLOT_H
+
+#include <linux/hashtable.h>
+#include <linux/slab.h>
+
+/*
+ * struct mm_slot - hash lookup from mm to mm_slot
+ * @hash: link to the mm_slots hash list
+ * @mm_node: link into the mm_slots list
+ * @mm: the mm that this information is valid for
+ */
+struct mm_slot {
+ struct hlist_node hash;
+ struct list_head mm_node;
+ struct mm_struct *mm;
+};
+
+#define mm_slot_entry(ptr, type, member) \
+ container_of(ptr, type, member)
+
+static inline void *mm_slot_alloc(struct kmem_cache *cache)
+{
+ if (!cache) /* initialization failed */
+ return NULL;
+ return kmem_cache_zalloc(cache, GFP_KERNEL);
+}
+
+static inline void mm_slot_free(struct kmem_cache *cache, void *objp)
+{
+ kmem_cache_free(cache, objp);
+}
+
+#define mm_slot_lookup(_hashtable, _mm) \
+({ \
+ struct mm_slot *tmp_slot, *mm_slot = NULL; \
+ \
+ hash_for_each_possible(_hashtable, tmp_slot, hash, (unsigned long)_mm) \
+ if (_mm == tmp_slot->mm) { \
+ mm_slot = tmp_slot; \
+ break; \
+ } \
+ \
+ mm_slot; \
+})
+
+#define mm_slot_insert(_hashtable, _mm, _mm_slot) \
+({ \
+ _mm_slot->mm = _mm; \
+ hash_add(_hashtable, &_mm_slot->hash, (unsigned long)_mm); \
+})
+
+#endif /* _LINUX_MM_SLOT_H */
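For reference, a hedged sketch of how a client in the ksm/khugepaged style might use these helpers; every my_* name is invented for illustration, and the usual kernel includes (e.g. linux/mm_types.h) are assumed to be in place in the enclosing file.

#include "mm_slot.h"

#define MY_MM_HASH_BITS 10
static DEFINE_HASHTABLE(my_mm_hash, MY_MM_HASH_BITS);
static struct kmem_cache *my_slot_cache;	/* created with KMEM_CACHE() at init */

struct my_mm_slot {
	struct mm_slot slot;	/* embedded mm_slot, conventionally first */
	unsigned long scanned;	/* illustrative per-mm state */
};

static int my_track_mm(struct mm_struct *mm)
{
	struct my_mm_slot *ms = mm_slot_alloc(my_slot_cache);
	struct mm_slot *slot;

	if (!ms)
		return -ENOMEM;
	slot = &ms->slot;
	mm_slot_insert(my_mm_hash, mm, slot);
	return 0;
}

static struct my_mm_slot *my_find_mm(struct mm_struct *mm)
{
	struct mm_slot *slot = mm_slot_lookup(my_mm_hash, mm);

	return slot ? mm_slot_entry(slot, struct my_mm_slot, slot) : NULL;
}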
diff --git a/mm/mmap.c b/mm/mmap.c
index 67d11ad6df24..3937479d0e07 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -13,7 +13,7 @@
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
-#include <linux/vmacache.h>
+#include <linux/mm_inline.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
@@ -38,7 +38,6 @@
#include <linux/audit.h>
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
-#include <linux/rbtree_augmented.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
@@ -47,6 +46,7 @@
#include <linux/pkeys.h>
#include <linux/oom.h>
#include <linux/sched/mm.h>
+#include <linux/ksm.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -76,45 +76,10 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
static bool ignore_rlimit_data;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
-static void unmap_region(struct mm_struct *mm,
+static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
struct vm_area_struct *vma, struct vm_area_struct *prev,
- unsigned long start, unsigned long end);
-
-/* description of effects of mapping type and prot in current implementation.
- * this is due to the limited x86 page protection hardware. The expected
- * behavior is in parens:
- *
- * map_type prot
- * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC
- * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
- * w: (no) no w: (no) no w: (yes) yes w: (no) no
- * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
- *
- * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
- * w: (no) no w: (no) no w: (copy) copy w: (no) no
- * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
- */
-pgprot_t protection_map[16] __ro_after_init = {
- __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
- __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
-};
-
-#ifndef CONFIG_ARCH_HAS_FILTER_PGPROT
-static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
-{
- return prot;
-}
-#endif
-
-pgprot_t vm_get_page_prot(unsigned long vm_flags)
-{
- pgprot_t ret = __pgprot(pgprot_val(protection_map[vm_flags &
- (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
- pgprot_val(arch_vm_get_page_prot(vm_flags)));
-
- return arch_filter_pgprot(ret);
-}
-EXPORT_SYMBOL(vm_get_page_prot);
+ struct vm_area_struct *next, unsigned long start,
+ unsigned long end, bool mm_wr_locked);
static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
{
@@ -142,8 +107,6 @@ void vma_set_page_prot(struct vm_area_struct *vma)
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
struct file *file, struct address_space *mapping)
{
- if (vma->vm_flags & VM_DENYWRITE)
- allow_write_access(file);
if (vma->vm_flags & VM_SHARED)
mapping_unmap_writable(mapping);
@@ -169,34 +132,70 @@ void unlink_file_vma(struct vm_area_struct *vma)
}
/*
- * Close a vm structure and free it, returning the next.
+ * Close a vm structure and free it.
*/
-static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
+static void remove_vma(struct vm_area_struct *vma, bool unreachable)
{
- struct vm_area_struct *next = vma->vm_next;
-
might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
if (vma->vm_file)
fput(vma->vm_file);
mpol_put(vma_policy(vma));
- vm_area_free(vma);
- return next;
+ if (unreachable)
+ __vm_area_free(vma);
+ else
+ vm_area_free(vma);
}
-static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
- struct list_head *uf);
+static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
+ unsigned long min)
+{
+ return mas_prev(&vmi->mas, min);
+}
+
+static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
+ unsigned long start, unsigned long end, gfp_t gfp)
+{
+ vmi->mas.index = start;
+ vmi->mas.last = end - 1;
+ mas_store_gfp(&vmi->mas, NULL, gfp);
+ if (unlikely(mas_is_err(&vmi->mas)))
+ return -ENOMEM;
+
+ return 0;
+}
+
+/*
+ * check_brk_limits() - Use platform specific check of range & verify mlock
+ * limits.
+ * @addr: The address to check
+ * @len: The size of increase.
+ *
+ * Return: 0 on success.
+ */
+static int check_brk_limits(unsigned long addr, unsigned long len)
+{
+ unsigned long mapped_addr;
+
+ mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
+ if (IS_ERR_VALUE(mapped_addr))
+ return mapped_addr;
+
+ return mlock_future_ok(current->mm, current->mm->def_flags, len)
+ ? 0 : -EAGAIN;
+}
+static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma,
+ unsigned long addr, unsigned long request, unsigned long flags);
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
- unsigned long retval;
unsigned long newbrk, oldbrk, origbrk;
struct mm_struct *mm = current->mm;
- struct vm_area_struct *next;
+ struct vm_area_struct *brkvma, *next = NULL;
unsigned long min_brk;
- bool populate;
- bool downgraded = false;
+ bool populate = false;
LIST_HEAD(uf);
+ struct vma_iterator vmi;
if (mmap_write_lock_killable(mm))
return -EINTR;
@@ -236,261 +235,114 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
goto success;
}
- /*
- * Always allow shrinking brk.
- * __do_munmap() may downgrade mmap_lock to read.
- */
+ /* Always allow shrinking brk. */
if (brk <= mm->brk) {
- int ret;
-
+ /* Search one past newbrk */
+ vma_iter_init(&vmi, mm, newbrk);
+ brkvma = vma_find(&vmi, oldbrk);
+ if (!brkvma || brkvma->vm_start >= oldbrk)
+ goto out; /* mapping intersects with an existing non-brk vma. */
/*
- * mm->brk must to be protected by write mmap_lock so update it
- * before downgrading mmap_lock. When __do_munmap() fails,
- * mm->brk will be restored from origbrk.
+ * mm->brk must be protected by write mmap_lock.
+ * do_vma_munmap() will drop the lock on success, so update it
+ * before calling do_vma_munmap().
*/
mm->brk = brk;
- ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
- if (ret < 0) {
- mm->brk = origbrk;
+ if (do_vma_munmap(&vmi, brkvma, newbrk, oldbrk, &uf, true))
goto out;
- } else if (ret == 1) {
- downgraded = true;
- }
- goto success;
+
+ goto success_unlocked;
}
- /* Check against existing mmap mappings. */
- next = find_vma(mm, oldbrk);
+ if (check_brk_limits(oldbrk, newbrk - oldbrk))
+ goto out;
+
+ /*
+ * Only check if the next VMA is within the stack_guard_gap of the
+ * expansion area
+ */
+ vma_iter_init(&vmi, mm, oldbrk);
+ next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap);
if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
goto out;
+ brkvma = vma_prev_limit(&vmi, mm->start_brk);
/* Ok, looks good - let it rip. */
- if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
+ if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
goto out;
+
mm->brk = brk;
+ if (mm->def_flags & VM_LOCKED)
+ populate = true;
success:
- populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
- if (downgraded)
- mmap_read_unlock(mm);
- else
- mmap_write_unlock(mm);
+ mmap_write_unlock(mm);
+success_unlocked:
userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(oldbrk, newbrk - oldbrk);
return brk;
out:
- retval = origbrk;
+ mm->brk = origbrk;
mmap_write_unlock(mm);
- return retval;
-}
-
-static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
-{
- unsigned long gap, prev_end;
-
- /*
- * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
- * allow two stack_guard_gaps between them here, and when choosing
- * an unmapped area; whereas when expanding we only require one.
- * That's a little inconsistent, but keeps the code here simpler.
- */
- gap = vm_start_gap(vma);
- if (vma->vm_prev) {
- prev_end = vm_end_gap(vma->vm_prev);
- if (gap > prev_end)
- gap -= prev_end;
- else
- gap = 0;
- }
- return gap;
-}
-
-#ifdef CONFIG_DEBUG_VM_RB
-static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
-{
- unsigned long max = vma_compute_gap(vma), subtree_gap;
- if (vma->vm_rb.rb_left) {
- subtree_gap = rb_entry(vma->vm_rb.rb_left,
- struct vm_area_struct, vm_rb)->rb_subtree_gap;
- if (subtree_gap > max)
- max = subtree_gap;
- }
- if (vma->vm_rb.rb_right) {
- subtree_gap = rb_entry(vma->vm_rb.rb_right,
- struct vm_area_struct, vm_rb)->rb_subtree_gap;
- if (subtree_gap > max)
- max = subtree_gap;
- }
- return max;
-}
-
-static int browse_rb(struct mm_struct *mm)
-{
- struct rb_root *root = &mm->mm_rb;
- int i = 0, j, bug = 0;
- struct rb_node *nd, *pn = NULL;
- unsigned long prev = 0, pend = 0;
-
- for (nd = rb_first(root); nd; nd = rb_next(nd)) {
- struct vm_area_struct *vma;
- vma = rb_entry(nd, struct vm_area_struct, vm_rb);
- if (vma->vm_start < prev) {
- pr_emerg("vm_start %lx < prev %lx\n",
- vma->vm_start, prev);
- bug = 1;
- }
- if (vma->vm_start < pend) {
- pr_emerg("vm_start %lx < pend %lx\n",
- vma->vm_start, pend);
- bug = 1;
- }
- if (vma->vm_start > vma->vm_end) {
- pr_emerg("vm_start %lx > vm_end %lx\n",
- vma->vm_start, vma->vm_end);
- bug = 1;
- }
- spin_lock(&mm->page_table_lock);
- if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
- pr_emerg("free gap %lx, correct %lx\n",
- vma->rb_subtree_gap,
- vma_compute_subtree_gap(vma));
- bug = 1;
- }
- spin_unlock(&mm->page_table_lock);
- i++;
- pn = nd;
- prev = vma->vm_start;
- pend = vma->vm_end;
- }
- j = 0;
- for (nd = pn; nd; nd = rb_prev(nd))
- j++;
- if (i != j) {
- pr_emerg("backwards %d, forwards %d\n", j, i);
- bug = 1;
- }
- return bug ? -1 : i;
-}
-
-static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
-{
- struct rb_node *nd;
-
- for (nd = rb_first(root); nd; nd = rb_next(nd)) {
- struct vm_area_struct *vma;
- vma = rb_entry(nd, struct vm_area_struct, vm_rb);
- VM_BUG_ON_VMA(vma != ignore &&
- vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
- vma);
- }
+ return origbrk;
}
+#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
static void validate_mm(struct mm_struct *mm)
{
int bug = 0;
int i = 0;
- unsigned long highest_address = 0;
- struct vm_area_struct *vma = mm->mmap;
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
- while (vma) {
+ mt_validate(&mm->mm_mt);
+ for_each_vma(vmi, vma) {
+#ifdef CONFIG_DEBUG_VM_RB
struct anon_vma *anon_vma = vma->anon_vma;
struct anon_vma_chain *avc;
+#endif
+ unsigned long vmi_start, vmi_end;
+ bool warn = 0;
+
+ vmi_start = vma_iter_addr(&vmi);
+ vmi_end = vma_iter_end(&vmi);
+ if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm))
+ warn = 1;
+
+ if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm))
+ warn = 1;
+
+ if (warn) {
+ pr_emerg("issue in %s\n", current->comm);
+ dump_stack();
+ dump_vma(vma);
+ pr_emerg("tree range: %px start %lx end %lx\n", vma,
+ vmi_start, vmi_end - 1);
+ vma_iter_dump_tree(&vmi);
+ }
+#ifdef CONFIG_DEBUG_VM_RB
if (anon_vma) {
anon_vma_lock_read(anon_vma);
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
anon_vma_interval_tree_verify(avc);
anon_vma_unlock_read(anon_vma);
}
-
- highest_address = vm_end_gap(vma);
- vma = vma->vm_next;
+#endif
i++;
}
if (i != mm->map_count) {
- pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
- bug = 1;
- }
- if (highest_address != mm->highest_vm_end) {
- pr_emerg("mm->highest_vm_end %lx, found %lx\n",
- mm->highest_vm_end, highest_address);
- bug = 1;
- }
- i = browse_rb(mm);
- if (i != mm->map_count) {
- if (i != -1)
- pr_emerg("map_count %d rb %d\n", mm->map_count, i);
+ pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
bug = 1;
}
VM_BUG_ON_MM(bug, mm);
}
-#else
-#define validate_mm_rb(root, ignore) do { } while (0)
-#define validate_mm(mm) do { } while (0)
-#endif
-
-RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
- struct vm_area_struct, vm_rb,
- unsigned long, rb_subtree_gap, vma_compute_gap)
-/*
- * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
- * vma->vm_prev->vm_end values changed, without modifying the vma's position
- * in the rbtree.
- */
-static void vma_gap_update(struct vm_area_struct *vma)
-{
- /*
- * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created
- * a callback function that does exactly what we want.
- */
- vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
-}
-
-static inline void vma_rb_insert(struct vm_area_struct *vma,
- struct rb_root *root)
-{
- /* All rb_subtree_gap values must be consistent prior to insertion */
- validate_mm_rb(root, NULL);
-
- rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
-}
-
-static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
-{
- /*
- * Note rb_erase_augmented is a fairly large inline function,
- * so make sure we instantiate it only once with our desired
- * augmented rbtree callbacks.
- */
- rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
-}
-
-static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
- struct rb_root *root,
- struct vm_area_struct *ignore)
-{
- /*
- * All rb_subtree_gap values must be consistent prior to erase,
- * with the possible exception of
- *
- * a. the "next" vma being erased if next->vm_start was reduced in
- * __vma_adjust() -> __vma_unlink()
- * b. the vma being erased in detach_vmas_to_be_unmapped() ->
- * vma_rb_erase()
- */
- validate_mm_rb(root, ignore);
-
- __vma_rb_erase(vma, root);
-}
-
-static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
- struct rb_root *root)
-{
- vma_rb_erase_ignore(vma, root, vma);
-}
+#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */
+#define validate_mm(mm) do { } while (0)
+#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
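For orientation: the VMA_ITERATOR()/for_each_vma() pair used in the new validate_mm() is the replacement for the old vma->vm_next list walk throughout this patch. A minimal, hypothetical walk over an mm (assuming the caller holds the mmap lock) looks like:

    static void count_vmas_sketch(struct mm_struct *mm)
    {
            struct vm_area_struct *vma;
            unsigned long nr = 0;
            VMA_ITERATOR(vmi, mm, 0);       /* start the maple-tree walk at address 0 */

            mmap_assert_locked(mm);         /* the tree walk requires the mmap lock */
            for_each_vma(vmi, vma)          /* replaces "for (vma = mm->mmap; vma; vma = vma->vm_next)" */
                    nr++;
            pr_info("mm has %lu vmas\n", nr);
    }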
/*
* vma has some anon_vma assigned, and is already inserted on that
@@ -524,467 +376,362 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}
-static int find_vma_links(struct mm_struct *mm, unsigned long addr,
- unsigned long end, struct vm_area_struct **pprev,
- struct rb_node ***rb_link, struct rb_node **rb_parent)
-{
- struct rb_node **__rb_link, *__rb_parent, *rb_prev;
-
- __rb_link = &mm->mm_rb.rb_node;
- rb_prev = __rb_parent = NULL;
-
- while (*__rb_link) {
- struct vm_area_struct *vma_tmp;
-
- __rb_parent = *__rb_link;
- vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
-
- if (vma_tmp->vm_end > addr) {
- /* Fail if an existing vma overlaps the area */
- if (vma_tmp->vm_start < end)
- return -ENOMEM;
- __rb_link = &__rb_parent->rb_left;
- } else {
- rb_prev = __rb_parent;
- __rb_link = &__rb_parent->rb_right;
- }
- }
-
- *pprev = NULL;
- if (rb_prev)
- *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
- *rb_link = __rb_link;
- *rb_parent = __rb_parent;
- return 0;
-}
-
static unsigned long count_vma_pages_range(struct mm_struct *mm,
unsigned long addr, unsigned long end)
{
- unsigned long nr_pages = 0;
+ VMA_ITERATOR(vmi, mm, addr);
struct vm_area_struct *vma;
+ unsigned long nr_pages = 0;
- /* Find first overlaping mapping */
- vma = find_vma_intersection(mm, addr, end);
- if (!vma)
- return 0;
-
- nr_pages = (min(end, vma->vm_end) -
- max(addr, vma->vm_start)) >> PAGE_SHIFT;
-
- /* Iterate over the rest of the overlaps */
- for (vma = vma->vm_next; vma; vma = vma->vm_next) {
- unsigned long overlap_len;
+ for_each_vma_range(vmi, vma, end) {
+ unsigned long vm_start = max(addr, vma->vm_start);
+ unsigned long vm_end = min(end, vma->vm_end);
- if (vma->vm_start > end)
- break;
-
- overlap_len = min(end, vma->vm_end) - vma->vm_start;
- nr_pages += overlap_len >> PAGE_SHIFT;
+ nr_pages += PHYS_PFN(vm_end - vm_start);
}
return nr_pages;
}
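The clamp-and-count arithmetic in count_vma_pages_range() is easiest to check with concrete numbers; the fragment below walks one loop iteration with hypothetical addresses and PAGE_SHIFT == 12:

    /* Hypothetical: the vma spans [0x1000, 0x6000), the request is [0x3000, 0x9000). */
    unsigned long addr = 0x3000, end = 0x9000;
    unsigned long vm_start = max(addr, 0x1000UL);          /* 0x3000 */
    unsigned long vm_end   = min(end,  0x6000UL);          /* 0x6000 */
    unsigned long nr_pages = PHYS_PFN(vm_end - vm_start);  /* 0x3000 >> 12 == 3 pages */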
-void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
- struct rb_node **rb_link, struct rb_node *rb_parent)
-{
- /* Update tracking information for the gap following the new vma. */
- if (vma->vm_next)
- vma_gap_update(vma->vm_next);
- else
- mm->highest_vm_end = vm_end_gap(vma);
-
- /*
- * vma->vm_prev wasn't known when we followed the rbtree to find the
- * correct insertion point for that vma. As a result, we could not
- * update the vma vm_rb parents rb_subtree_gap values on the way down.
- * So, we first insert the vma with a zero rb_subtree_gap value
- * (to be consistent with what we did on the way down), and then
- * immediately update the gap to the correct value. Finally we
- * rebalance the rbtree after all augmented values have been set.
- */
- rb_link_node(&vma->vm_rb, rb_parent, rb_link);
- vma->rb_subtree_gap = 0;
- vma_gap_update(vma);
- vma_rb_insert(vma, &mm->mm_rb);
-}
-
-static void __vma_link_file(struct vm_area_struct *vma)
+static void __vma_link_file(struct vm_area_struct *vma,
+ struct address_space *mapping)
{
- struct file *file;
-
- file = vma->vm_file;
- if (file) {
- struct address_space *mapping = file->f_mapping;
-
- if (vma->vm_flags & VM_DENYWRITE)
- atomic_dec(&file_inode(file)->i_writecount);
- if (vma->vm_flags & VM_SHARED)
- mapping_allow_writable(mapping);
-
- flush_dcache_mmap_lock(mapping);
- vma_interval_tree_insert(vma, &mapping->i_mmap);
- flush_dcache_mmap_unlock(mapping);
- }
-}
+ if (vma->vm_flags & VM_SHARED)
+ mapping_allow_writable(mapping);
-static void
-__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node **rb_link,
- struct rb_node *rb_parent)
-{
- __vma_link_list(mm, vma, prev);
- __vma_link_rb(mm, vma, rb_link, rb_parent);
+ flush_dcache_mmap_lock(mapping);
+ vma_interval_tree_insert(vma, &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
}
-static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node **rb_link,
- struct rb_node *rb_parent)
+static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct address_space *mapping = NULL;
+ if (vma_iter_prealloc(&vmi))
+ return -ENOMEM;
+
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
i_mmap_lock_write(mapping);
}
- __vma_link(mm, vma, prev, rb_link, rb_parent);
- __vma_link_file(vma);
+ vma_iter_store(&vmi, vma);
- if (mapping)
+ if (mapping) {
+ __vma_link_file(vma, mapping);
i_mmap_unlock_write(mapping);
+ }
mm->map_count++;
validate_mm(mm);
+ return 0;
}
/*
- * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
- * mm's list and rbtree. It has already been inserted into the interval tree.
+ * init_multi_vma_prep() - Initializer for struct vma_prepare
+ * @vp: The vma_prepare struct
+ * @vma: The vma that will be altered once locked
+ * @next: The next vma if it is to be adjusted
+ * @remove: The first vma to be removed
+ * @remove2: The second vma to be removed
*/
-static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
+static inline void init_multi_vma_prep(struct vma_prepare *vp,
+ struct vm_area_struct *vma, struct vm_area_struct *next,
+ struct vm_area_struct *remove, struct vm_area_struct *remove2)
{
- struct vm_area_struct *prev;
- struct rb_node **rb_link, *rb_parent;
+ memset(vp, 0, sizeof(struct vma_prepare));
+ vp->vma = vma;
+ vp->anon_vma = vma->anon_vma;
+ vp->remove = remove;
+ vp->remove2 = remove2;
+ vp->adj_next = next;
+ if (!vp->anon_vma && next)
+ vp->anon_vma = next->anon_vma;
- if (find_vma_links(mm, vma->vm_start, vma->vm_end,
- &prev, &rb_link, &rb_parent))
- BUG();
- __vma_link(mm, vma, prev, rb_link, rb_parent);
- mm->map_count++;
-}
+ vp->file = vma->vm_file;
+ if (vp->file)
+ vp->mapping = vma->vm_file->f_mapping;
-static __always_inline void __vma_unlink(struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct vm_area_struct *ignore)
-{
- vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
- __vma_unlink_list(mm, vma);
- /* Kill the cache */
- vmacache_invalidate(mm);
}
/*
- * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
- * is already present in an i_mmap tree without adjusting the tree.
- * The following helper function should be used when such adjustments
- * are necessary. The "insert" vma (if any) is to be inserted
- * before we drop the necessary locks.
+ * init_vma_prep() - Initializer wrapper for vma_prepare struct
+ * @vp: The vma_prepare struct
+ * @vma: The vma that will be altered once locked
*/
-int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
- struct vm_area_struct *expand)
+static inline void init_vma_prep(struct vma_prepare *vp,
+ struct vm_area_struct *vma)
{
- struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
- struct address_space *mapping = NULL;
- struct rb_root_cached *root = NULL;
- struct anon_vma *anon_vma = NULL;
- struct file *file = vma->vm_file;
- bool start_changed = false, end_changed = false;
- long adjust_next = 0;
- int remove_next = 0;
-
- if (next && !insert) {
- struct vm_area_struct *exporter = NULL, *importer = NULL;
-
- if (end >= next->vm_end) {
- /*
- * vma expands, overlapping all the next, and
- * perhaps the one after too (mprotect case 6).
- * The only other cases that gets here are
- * case 1, case 7 and case 8.
- */
- if (next == expand) {
- /*
- * The only case where we don't expand "vma"
- * and we expand "next" instead is case 8.
- */
- VM_WARN_ON(end != next->vm_end);
- /*
- * remove_next == 3 means we're
- * removing "vma" and that to do so we
- * swapped "vma" and "next".
- */
- remove_next = 3;
- VM_WARN_ON(file != next->vm_file);
- swap(vma, next);
- } else {
- VM_WARN_ON(expand != vma);
- /*
- * case 1, 6, 7, remove_next == 2 is case 6,
- * remove_next == 1 is case 1 or 7.
- */
- remove_next = 1 + (end > next->vm_end);
- VM_WARN_ON(remove_next == 2 &&
- end != next->vm_next->vm_end);
- /* trim end to next, for case 6 first pass */
- end = next->vm_end;
- }
-
- exporter = next;
- importer = vma;
-
- /*
- * If next doesn't have anon_vma, import from vma after
- * next, if the vma overlaps with it.
- */
- if (remove_next == 2 && !next->anon_vma)
- exporter = next->vm_next;
-
- } else if (end > next->vm_start) {
- /*
- * vma expands, overlapping part of the next:
- * mprotect case 5 shifting the boundary up.
- */
- adjust_next = (end - next->vm_start);
- exporter = next;
- importer = vma;
- VM_WARN_ON(expand != importer);
- } else if (end < vma->vm_end) {
- /*
- * vma shrinks, and !insert tells it's not
- * split_vma inserting another: so it must be
- * mprotect case 4 shifting the boundary down.
- */
- adjust_next = -(vma->vm_end - end);
- exporter = vma;
- importer = next;
- VM_WARN_ON(expand != importer);
- }
-
- /*
- * Easily overlooked: when mprotect shifts the boundary,
- * make sure the expanding vma has anon_vma set if the
- * shrinking vma had, to cover any anon pages imported.
- */
- if (exporter && exporter->anon_vma && !importer->anon_vma) {
- int error;
-
- importer->anon_vma = exporter->anon_vma;
- error = anon_vma_clone(importer, exporter);
- if (error)
- return error;
- }
- }
-again:
- vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
-
- if (file) {
- mapping = file->f_mapping;
- root = &mapping->i_mmap;
- uprobe_munmap(vma, vma->vm_start, vma->vm_end);
+ init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
+}
- if (adjust_next)
- uprobe_munmap(next, next->vm_start, next->vm_end);
- i_mmap_lock_write(mapping);
- if (insert) {
+/*
+ * vma_prepare() - Helper function for handling locking VMAs prior to altering
+ * @vp: The initialized vma_prepare struct
+ */
+static inline void vma_prepare(struct vma_prepare *vp)
+{
+ vma_start_write(vp->vma);
+ if (vp->adj_next)
+ vma_start_write(vp->adj_next);
+ /* vp->insert is always a newly created VMA, no need for locking */
+ if (vp->remove)
+ vma_start_write(vp->remove);
+ if (vp->remove2)
+ vma_start_write(vp->remove2);
+
+ if (vp->file) {
+ uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
+
+ if (vp->adj_next)
+ uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
+ vp->adj_next->vm_end);
+
+ i_mmap_lock_write(vp->mapping);
+ if (vp->insert && vp->insert->vm_file) {
/*
* Put into interval tree now, so instantiated pages
* are visible to arm/parisc __flush_dcache_page
* throughout; but we cannot insert into address
* space until vma start or end is updated.
*/
- __vma_link_file(insert);
+ __vma_link_file(vp->insert,
+ vp->insert->vm_file->f_mapping);
}
}
- anon_vma = vma->anon_vma;
- if (!anon_vma && adjust_next)
- anon_vma = next->anon_vma;
- if (anon_vma) {
- VM_WARN_ON(adjust_next && next->anon_vma &&
- anon_vma != next->anon_vma);
- anon_vma_lock_write(anon_vma);
- anon_vma_interval_tree_pre_update_vma(vma);
- if (adjust_next)
- anon_vma_interval_tree_pre_update_vma(next);
+ if (vp->anon_vma) {
+ anon_vma_lock_write(vp->anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vp->vma);
+ if (vp->adj_next)
+ anon_vma_interval_tree_pre_update_vma(vp->adj_next);
}
- if (file) {
- flush_dcache_mmap_lock(mapping);
- vma_interval_tree_remove(vma, root);
- if (adjust_next)
- vma_interval_tree_remove(next, root);
- }
-
- if (start != vma->vm_start) {
- vma->vm_start = start;
- start_changed = true;
- }
- if (end != vma->vm_end) {
- vma->vm_end = end;
- end_changed = true;
- }
- vma->vm_pgoff = pgoff;
- if (adjust_next) {
- next->vm_start += adjust_next;
- next->vm_pgoff += adjust_next >> PAGE_SHIFT;
+ if (vp->file) {
+ flush_dcache_mmap_lock(vp->mapping);
+ vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
+ if (vp->adj_next)
+ vma_interval_tree_remove(vp->adj_next,
+ &vp->mapping->i_mmap);
}
- if (file) {
- if (adjust_next)
- vma_interval_tree_insert(next, root);
- vma_interval_tree_insert(vma, root);
- flush_dcache_mmap_unlock(mapping);
- }
+}
- if (remove_next) {
- /*
- * vma_merge has merged next into vma, and needs
- * us to remove next before dropping the locks.
- */
- if (remove_next != 3)
- __vma_unlink(mm, next, next);
- else
- /*
- * vma is not before next if they've been
- * swapped.
- *
- * pre-swap() next->vm_start was reduced so
- * tell validate_mm_rb to ignore pre-swap()
- * "next" (which is stored in post-swap()
- * "vma").
- */
- __vma_unlink(mm, next, vma);
- if (file)
- __remove_shared_vm_struct(next, file, mapping);
- } else if (insert) {
+/*
+ * vma_complete() - Helper function for handling the unlocking after altering
+ * VMAs, or for inserting a VMA.
+ *
+ * @vp: The vma_prepare struct
+ * @vmi: The vma iterator
+ * @mm: The mm_struct
+ */
+static inline void vma_complete(struct vma_prepare *vp,
+ struct vma_iterator *vmi, struct mm_struct *mm)
+{
+ if (vp->file) {
+ if (vp->adj_next)
+ vma_interval_tree_insert(vp->adj_next,
+ &vp->mapping->i_mmap);
+ vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
+ flush_dcache_mmap_unlock(vp->mapping);
+ }
+
+ if (vp->remove && vp->file) {
+ __remove_shared_vm_struct(vp->remove, vp->file, vp->mapping);
+ if (vp->remove2)
+ __remove_shared_vm_struct(vp->remove2, vp->file,
+ vp->mapping);
+ } else if (vp->insert) {
/*
* split_vma has split insert from vma, and needs
* us to insert it before dropping the locks
* (it may either follow vma or precede it).
*/
- __insert_vm_struct(mm, insert);
- } else {
- if (start_changed)
- vma_gap_update(vma);
- if (end_changed) {
- if (!next)
- mm->highest_vm_end = vm_end_gap(vma);
- else if (!adjust_next)
- vma_gap_update(next);
- }
+ vma_iter_store(vmi, vp->insert);
+ mm->map_count++;
}
- if (anon_vma) {
- anon_vma_interval_tree_post_update_vma(vma);
- if (adjust_next)
- anon_vma_interval_tree_post_update_vma(next);
- anon_vma_unlock_write(anon_vma);
+ if (vp->anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vp->vma);
+ if (vp->adj_next)
+ anon_vma_interval_tree_post_update_vma(vp->adj_next);
+ anon_vma_unlock_write(vp->anon_vma);
}
- if (file) {
- i_mmap_unlock_write(mapping);
- uprobe_mmap(vma);
+ if (vp->file) {
+ i_mmap_unlock_write(vp->mapping);
+ uprobe_mmap(vp->vma);
- if (adjust_next)
- uprobe_mmap(next);
+ if (vp->adj_next)
+ uprobe_mmap(vp->adj_next);
}
- if (remove_next) {
- if (file) {
- uprobe_munmap(next, next->vm_start, next->vm_end);
- fput(file);
+ if (vp->remove) {
+again:
+ vma_mark_detached(vp->remove, true);
+ if (vp->file) {
+ uprobe_munmap(vp->remove, vp->remove->vm_start,
+ vp->remove->vm_end);
+ fput(vp->file);
}
- if (next->anon_vma)
- anon_vma_merge(vma, next);
+ if (vp->remove->anon_vma)
+ anon_vma_merge(vp->vma, vp->remove);
mm->map_count--;
- mpol_put(vma_policy(next));
- vm_area_free(next);
+ mpol_put(vma_policy(vp->remove));
+ if (!vp->remove2)
+ WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
+ vm_area_free(vp->remove);
+
/*
* In mprotect's case 6 (see comments on vma_merge),
- * we must remove another next too. It would clutter
- * up the code too much to do both in one go.
+ * we are removing both mid and next vmas
*/
- if (remove_next != 3) {
- /*
- * If "next" was removed and vma->vm_end was
- * expanded (up) over it, in turn
- * "next->vm_prev->vm_end" changed and the
- * "vma->vm_next" gap must be updated.
- */
- next = vma->vm_next;
- } else {
- /*
- * For the scope of the comment "next" and
- * "vma" considered pre-swap(): if "vma" was
- * removed, next->vm_start was expanded (down)
- * over it and the "next" gap must be updated.
- * Because of the swap() the post-swap() "vma"
- * actually points to pre-swap() "next"
- * (post-swap() "next" as opposed is now a
- * dangling pointer).
- */
- next = vma;
- }
- if (remove_next == 2) {
- remove_next = 1;
- end = next->vm_end;
+ if (vp->remove2) {
+ vp->remove = vp->remove2;
+ vp->remove2 = NULL;
goto again;
}
- else if (next)
- vma_gap_update(next);
- else {
- /*
- * If remove_next == 2 we obviously can't
- * reach this path.
- *
- * If remove_next == 3 we can't reach this
- * path because pre-swap() next is always not
- * NULL. pre-swap() "next" is not being
- * removed and its next->vm_end is not altered
- * (and furthermore "end" already matches
- * next->vm_end in remove_next == 3).
- *
- * We reach this only in the remove_next == 1
- * case if the "next" vma that was removed was
- * the highest vma of the mm. However in such
- * case next->vm_end == "end" and the extended
- * "vma" has vma->vm_end == next->vm_end so
- * mm->highest_vm_end doesn't need any update
- * in remove_next == 1 case.
- */
- VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
- }
}
- if (insert && file)
- uprobe_mmap(insert);
+ if (vp->insert && vp->file)
+ uprobe_mmap(vp->insert);
+}
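init_vma_prep()/init_multi_vma_prep(), vma_prepare() and vma_complete() together bracket every maple-tree update in this file. A condensed sketch of that bracket (a fragment assumed to live in this file, with the mmap lock held for writing, error paths and vma_adjust_trans_huge() omitted, and new_end a hypothetical boundary):

    struct vma_prepare vp;
    VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start);

    if (vma_iter_prealloc(&vmi))            /* reserve maple-tree nodes up front */
            return -ENOMEM;

    init_vma_prep(&vp, vma);                /* single-vma variant of the initializer */
    vma_prepare(&vp);                       /* take i_mmap/anon_vma locks, write-lock the vma */

    vma->vm_end = new_end;                  /* modify the vma while everything is locked */
    vma_iter_store(&vmi, vma);              /* write the new range into the tree */

    vma_complete(&vp, &vmi, vma->vm_mm);    /* drop locks and finish the bookkeeping */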
- validate_mm(mm);
+/*
+ * dup_anon_vma() - Helper function to duplicate anon_vma
+ * @dst: The destination VMA
+ * @src: The source VMA
+ *
+ * Returns: 0 on success.
+ */
+static inline int dup_anon_vma(struct vm_area_struct *dst,
+ struct vm_area_struct *src)
+{
+ /*
+ * Easily overlooked: when mprotect shifts the boundary, make sure the
+ * expanding vma has anon_vma set if the shrinking vma had, to cover any
+ * anon pages imported.
+ */
+ if (src->anon_vma && !dst->anon_vma) {
+ vma_start_write(dst);
+ dst->anon_vma = src->anon_vma;
+ return anon_vma_clone(dst, src);
+ }
+
+ return 0;
+}
+
+/*
+ * vma_expand - Expand an existing VMA
+ *
+ * @vmi: The vma iterator
+ * @vma: The vma to expand
+ * @start: The start of the vma
+ * @end: The exclusive end of the vma
+ * @pgoff: The page offset of vma
+ * @next: The vma following @vma, if it is to be expanded over
+ *
+ * Expand @vma to cover @start and @end.  The range may extend beyond the
+ * vma's current start and end.  @next is expanded over (and removed) when it
+ * is different from @vma and @end == @next->vm_end.  Checking whether @vma
+ * can expand and merge with @next must be handled by the caller.
+ *
+ * Returns: 0 on success
+ */
+int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, pgoff_t pgoff,
+ struct vm_area_struct *next)
+{
+ bool remove_next = false;
+ struct vma_prepare vp;
+
+ if (next && (vma != next) && (end == next->vm_end)) {
+ int ret;
+
+ remove_next = true;
+ ret = dup_anon_vma(vma, next);
+ if (ret)
+ return ret;
+ }
+ init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL);
+ /* Not merging but overwriting any part of next is not handled. */
+ VM_WARN_ON(next && !vp.remove &&
+ next != vma && end > next->vm_start);
+ /* Only handles expanding */
+ VM_WARN_ON(vma->vm_start < start || vma->vm_end > end);
+
+ if (vma_iter_prealloc(vmi))
+ goto nomem;
+
+ vma_prepare(&vp);
+ vma_adjust_trans_huge(vma, start, end, 0);
+ /* VMA iterator points to previous, so set to start if necessary */
+ if (vma_iter_addr(vmi) != start)
+ vma_iter_set(vmi, start);
+
+ vma->vm_start = start;
+ vma->vm_end = end;
+ vma->vm_pgoff = pgoff;
+ /* Note: the vma iterator (vmi) must point to the expanding VMA */
+ vma_iter_store(vmi, vma);
+
+ vma_complete(&vp, vmi, vma->vm_mm);
+ validate_mm(vma->vm_mm);
+ return 0;
+
+nomem:
+ return -ENOMEM;
+}
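A caller-side sketch for vma_expand(): a hypothetical helper that grows a mapping by one page at its end, assuming the page after the vma is unmapped, the iterator points at (or before) the vma, and the mmap lock is held for writing:

    static int grow_vma_by_one_page(struct vma_iterator *vmi,
                                    struct vm_area_struct *vma)
    {
            /* Only the end moves, so start and pgoff are passed through unchanged. */
            return vma_expand(vmi, vma, vma->vm_start,
                              vma->vm_end + PAGE_SIZE, vma->vm_pgoff,
                              NULL /* no next vma is being consumed */);
    }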
+
+/*
+ * vma_shrink() - Reduce an existing VMA's memory area
+ * @vmi: The vma iterator
+ * @vma: The VMA to modify
+ * @start: The new start
+ * @end: The new end
+ *
+ * Returns: 0 on success, -ENOMEM otherwise
+ */
+int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, pgoff_t pgoff)
+{
+ struct vma_prepare vp;
+
+ WARN_ON((vma->vm_start != start) && (vma->vm_end != end));
+
+ if (vma_iter_prealloc(vmi))
+ return -ENOMEM;
+
+ init_vma_prep(&vp, vma);
+ vma_prepare(&vp);
+ vma_adjust_trans_huge(vma, start, end, 0);
+
+ if (vma->vm_start < start)
+ vma_iter_clear(vmi, vma->vm_start, start);
+
+ if (vma->vm_end > end)
+ vma_iter_clear(vmi, end, vma->vm_end);
+
+ vma->vm_start = start;
+ vma->vm_end = end;
+ vma->vm_pgoff = pgoff;
+ vma_complete(&vp, vmi, vma->vm_mm);
+ validate_mm(vma->vm_mm);
return 0;
}
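And the converse for vma_shrink(), trimming one page off the end; a hypothetical helper only, since real callers must also tear down page tables for the trimmed range:

    static int trim_vma_by_one_page(struct vma_iterator *vmi,
                                    struct vm_area_struct *vma)
    {
            return vma_shrink(vmi, vma, vma->vm_start,
                              vma->vm_end - PAGE_SIZE, vma->vm_pgoff);
    }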
/*
* If the vma has a ->close operation then the driver probably needs to release
- * per-vma resources, so we don't attempt to merge those.
+ * per-vma resources, so we don't attempt to merge those if the caller indicates
+ * the current vma may be removed as part of the merge.
*/
-static inline int is_mergeable_vma(struct vm_area_struct *vma,
- struct file *file, unsigned long vm_flags,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+static inline bool is_mergeable_vma(struct vm_area_struct *vma,
+ struct file *file, unsigned long vm_flags,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ struct anon_vma_name *anon_name, bool may_remove_vma)
{
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -995,19 +742,20 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
* extended instead.
*/
if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
- return 0;
+ return false;
if (vma->vm_file != file)
- return 0;
- if (vma->vm_ops && vma->vm_ops->close)
- return 0;
+ return false;
+ if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
+ return false;
if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
- return 0;
- return 1;
+ return false;
+ if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
+ return false;
+ return true;
}
-static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
- struct anon_vma *anon_vma2,
- struct vm_area_struct *vma)
+static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
+ struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
/*
* The list_is_singular() test is to avoid merging VMA cloned from
@@ -1015,7 +763,7 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
*/
if ((!anon_vma1 || !anon_vma2) && (!vma ||
list_is_singular(&vma->anon_vma_chain)))
- return 1;
+ return true;
return anon_vma1 == anon_vma2;
}
@@ -1029,19 +777,21 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
* We don't check here for the merged mmap wrapping around the end of pagecache
* indices (16TB on ia32) because do_mmap() does not permit mmap's which
* wrap, nor mmaps which cover the final page at index -1UL.
+ *
+ * We assume the vma may be removed as part of the merge.
*/
-static int
+static bool
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file,
- pgoff_t vm_pgoff,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+ struct anon_vma *anon_vma, struct file *file,
+ pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ struct anon_vma_name *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
if (vma->vm_pgoff == vm_pgoff)
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/*
@@ -1050,27 +800,29 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
*
* We cannot merge two vmas if they have differently assigned (non-NULL)
* anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
+ *
+ * We assume that vma is not removed as part of the merge.
*/
-static int
+static bool
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file,
- pgoff_t vm_pgoff,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+ struct anon_vma *anon_vma, struct file *file,
+ pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ struct anon_vma_name *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
vm_pglen = vma_pages(vma);
if (vma->vm_pgoff + vm_pglen == vm_pgoff)
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/*
- * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
- * whether that can be merged with its predecessor or its successor.
- * Or both (it neatly fills a hole).
+ * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
+ * figure out whether that can be merged with its predecessor or its
+ * successor. Or both (it neatly fills a hole).
*
* In most cases - when called for mmap, brk or mremap - [addr,end) is
* certain not to be mapped by the time vma_merge is called; but when
@@ -1079,48 +831,68 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
* this area are about to be changed to vm_flags - and the no-change
* case has already been eliminated.
*
- * The following mprotect cases have to be considered, where AAAA is
+ * The following mprotect cases have to be considered, where **** is
* the area passed down from mprotect_fixup, never extending beyond one
- * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
+ * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts
+ * at the same address as **** and is of the same or larger span, and
+ * NNNN the next vma after ****:
*
- * AAAA AAAA AAAA
- * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN
+ * **** **** ****
+ * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC
* cannot merge might become might become
- * PPNNNNNNNNNN PPPPPPPPPPNN
+ * PPNNNNNNNNNN PPPPPPPPPPCC
* mmap, brk or case 4 below case 5 below
* mremap move:
- * AAAA AAAA
- * PPPP NNNN PPPPNNNNXXXX
+ * **** ****
+ * PPPP NNNN PPPPCCCCNNNN
* might become might become
* PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or
- * PPPPPPPPNNNN 2 or PPPPPPPPXXXX 7 or
- * PPPPNNNNNNNN 3 PPPPXXXXXXXX 8
+ * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or
+ * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8
*
- * It is important for case 8 that the vma NNNN overlapping the
- * region AAAA is never going to extended over XXXX. Instead XXXX must
- * be extended in region AAAA and NNNN must be removed. This way in
- * all cases where vma_merge succeeds, the moment vma_adjust drops the
+ * It is important for case 8 that the vma CCCC overlapping the
+ * region **** is never going to be extended over NNNN. Instead NNNN must
+ * be extended in region **** and CCCC must be removed. This way in
+ * all cases where vma_merge succeeds, the moment vma_merge drops the
* rmap_locks, the properties of the merged vma will be already
* correct for the whole merged range. Some of those properties like
* vm_page_prot/vm_flags may be accessed by rmap_walks and they must
* be correct for the whole merged range immediately after the
- * rmap_locks are released. Otherwise if XXXX would be removed and
- * NNNN would be extended over the XXXX range, remove_migration_ptes
+ * rmap_locks are released. Otherwise if NNNN would be removed and
+ * CCCC would be extended over the NNNN range, remove_migration_ptes
* or other rmap walkers (if working on addresses beyond the "end"
- * parameter) may establish ptes with the wrong permissions of NNNN
- * instead of the right permissions of XXXX.
+ * parameter) may establish ptes with the wrong permissions of CCCC
+ * instead of the right permissions of NNNN.
+ *
+ * In the code below:
+ * PPPP is represented by *prev
+ * CCCC is represented by *curr or not represented at all (NULL)
+ * NNNN is represented by *next or not represented at all (NULL)
+ * **** is not represented - it will be merged and the vma containing the
+ * area is returned, or the function will return NULL
*/
-struct vm_area_struct *vma_merge(struct mm_struct *mm,
+struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
-{
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ struct anon_vma_name *anon_name)
+{
+ struct vm_area_struct *curr, *next, *res;
+ struct vm_area_struct *vma, *adjust, *remove, *remove2;
+ struct vma_prepare vp;
+ pgoff_t vma_pgoff;
+ int err = 0;
+ bool merge_prev = false;
+ bool merge_next = false;
+ bool vma_expanded = false;
+ unsigned long vma_start = addr;
+ unsigned long vma_end = end;
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
- struct vm_area_struct *area, *next;
- int err;
+ long adj_start = 0;
+ validate_mm(mm);
/*
* We later require that vma->vm_flags == vm_flags,
* so this tests vma->vm_flags & VM_SPECIAL, too.
@@ -1128,79 +900,130 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
if (vm_flags & VM_SPECIAL)
return NULL;
- if (prev)
- next = prev->vm_next;
+ /* Does the input range span an existing VMA? (cases 5 - 8) */
+ curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end);
+
+ if (!curr || /* cases 1 - 4 */
+ end == curr->vm_end) /* cases 6 - 8, adjacent VMA */
+ next = vma_lookup(mm, end);
else
- next = mm->mmap;
- area = next;
- if (area && area->vm_end == end) /* cases 6, 7, 8 */
- next = next->vm_next;
+ next = NULL; /* case 5 */
+
+ if (prev) {
+ vma_start = prev->vm_start;
+ vma_pgoff = prev->vm_pgoff;
+
+ /* Can we merge the predecessor? */
+ if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy)
+ && can_vma_merge_after(prev, vm_flags, anon_vma, file,
+ pgoff, vm_userfaultfd_ctx, anon_name)) {
+ merge_prev = true;
+ vma_prev(vmi);
+ }
+ }
- /* verify some invariant that must be enforced by the caller */
+ /* Can we merge the successor? */
+ if (next && mpol_equal(policy, vma_policy(next)) &&
+ can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen,
+ vm_userfaultfd_ctx, anon_name)) {
+ merge_next = true;
+ }
+
+ /* Verify some invariant that must be enforced by the caller. */
VM_WARN_ON(prev && addr <= prev->vm_start);
- VM_WARN_ON(area && end > area->vm_end);
+ VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end));
VM_WARN_ON(addr >= end);
- /*
- * Can it merge with the predecessor?
- */
- if (prev && prev->vm_end == addr &&
- mpol_equal(vma_policy(prev), policy) &&
- can_vma_merge_after(prev, vm_flags,
- anon_vma, file, pgoff,
- vm_userfaultfd_ctx)) {
- /*
- * OK, it can. Can we now merge in the successor as well?
- */
- if (next && end == next->vm_start &&
- mpol_equal(policy, vma_policy(next)) &&
- can_vma_merge_before(next, vm_flags,
- anon_vma, file,
- pgoff+pglen,
- vm_userfaultfd_ctx) &&
- is_mergeable_anon_vma(prev->anon_vma,
- next->anon_vma, NULL)) {
- /* cases 1, 6 */
- err = __vma_adjust(prev, prev->vm_start,
- next->vm_end, prev->vm_pgoff, NULL,
- prev);
- } else /* cases 2, 5, 7 */
- err = __vma_adjust(prev, prev->vm_start,
- end, prev->vm_pgoff, NULL, prev);
- if (err)
- return NULL;
- khugepaged_enter_vma_merge(prev, vm_flags);
- return prev;
- }
-
- /*
- * Can this new request be merged in front of next?
- */
- if (next && end == next->vm_start &&
- mpol_equal(policy, vma_policy(next)) &&
- can_vma_merge_before(next, vm_flags,
- anon_vma, file, pgoff+pglen,
- vm_userfaultfd_ctx)) {
- if (prev && addr < prev->vm_end) /* case 4 */
- err = __vma_adjust(prev, prev->vm_start,
- addr, prev->vm_pgoff, NULL, next);
- else { /* cases 3, 8 */
- err = __vma_adjust(area, addr, next->vm_end,
- next->vm_pgoff - pglen, NULL, next);
+ if (!merge_prev && !merge_next)
+ return NULL; /* Not mergeable. */
+
+ res = vma = prev;
+ remove = remove2 = adjust = NULL;
+
+ /* Can we merge both the predecessor and the successor? */
+ if (merge_prev && merge_next &&
+ is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) {
+ remove = next; /* case 1 */
+ vma_end = next->vm_end;
+ err = dup_anon_vma(prev, next);
+ if (curr) { /* case 6 */
+ remove = curr;
+ remove2 = next;
+ if (!next->anon_vma)
+ err = dup_anon_vma(prev, curr);
+ }
+ } else if (merge_prev) { /* case 2 */
+ if (curr) {
+ err = dup_anon_vma(prev, curr);
+ if (end == curr->vm_end) { /* case 7 */
+ remove = curr;
+ } else { /* case 5 */
+ adjust = curr;
+ adj_start = (end - curr->vm_start);
+ }
+ }
+ } else { /* merge_next */
+ res = next;
+ if (prev && addr < prev->vm_end) { /* case 4 */
+ vma_end = addr;
+ adjust = next;
+ adj_start = -(prev->vm_end - addr);
+ err = dup_anon_vma(next, prev);
+ } else {
/*
- * In case 3 area is already equal to next and
- * this is a noop, but in case 8 "area" has
- * been removed and next was expanded over it.
+ * Note that cases 3 and 8 are the ONLY ones where prev
+ * is permitted to be (but is not necessarily) NULL.
*/
- area = next;
+ vma = next; /* case 3 */
+ vma_start = addr;
+ vma_end = next->vm_end;
+ vma_pgoff = next->vm_pgoff - pglen;
+ if (curr) { /* case 8 */
+ vma_pgoff = curr->vm_pgoff;
+ remove = curr;
+ err = dup_anon_vma(next, curr);
+ }
}
- if (err)
- return NULL;
- khugepaged_enter_vma_merge(area, vm_flags);
- return area;
}
- return NULL;
+ /* Error in anon_vma clone. */
+ if (err)
+ return NULL;
+
+ if (vma_iter_prealloc(vmi))
+ return NULL;
+
+ init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
+ VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
+ vp.anon_vma != adjust->anon_vma);
+
+ vma_prepare(&vp);
+ vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start);
+ if (vma_start < vma->vm_start || vma_end > vma->vm_end)
+ vma_expanded = true;
+
+ vma->vm_start = vma_start;
+ vma->vm_end = vma_end;
+ vma->vm_pgoff = vma_pgoff;
+
+ if (vma_expanded)
+ vma_iter_store(vmi, vma);
+
+ if (adj_start) {
+ adjust->vm_start += adj_start;
+ adjust->vm_pgoff += adj_start >> PAGE_SHIFT;
+ if (adj_start < 0) {
+ WARN_ON(vma_expanded);
+ vma_iter_store(vmi, next);
+ }
+ }
+
+ vma_complete(&vp, vmi, mm);
+ vma_iter_free(vmi);
+ validate_mm(mm);
+ khugepaged_enter_vma(res, vm_flags);
+
+ return res;
}
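A hedged caller-side sketch of the reworked vma_merge() signature, roughly as an mprotect-style fixup would use it when only vm_flags change over [start, end); vmi, mm, prev, vma, start, end and newflags are hypothetical locals of the caller:

    struct vm_area_struct *merged;
    pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);

    merged = vma_merge(vmi, mm, prev, start, end, newflags,
                       vma->anon_vma, vma->vm_file, pgoff,
                       vma_policy(vma), vma->vm_userfaultfd_ctx,
                       anon_vma_name(vma));
    if (merged)
            vma = merged;   /* the range was folded into prev and/or next */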
/*
@@ -1231,7 +1054,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
* the same as 'old', the other will be the new one that is trying
* to share the anon_vma.
*
- * NOTE! This runs with mm_sem held for reading, so it is possible that
+ * NOTE! This runs with mmap_lock held for reading, so it is possible that
* the anon_vma of 'old' is concurrently in the process of being set up
* by another page fault trying to merge _that_. But that's ok: if it
* is being set up, that automatically means that it will be a singleton
@@ -1245,7 +1068,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
*
* We also make sure that the two vma's are compatible (adjacent,
* and with the same memory policies). That's all stable, even with just
- * a read lock on the mm_sem.
+ * a read lock on the mmap_lock.
*/
static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
{
@@ -1268,18 +1091,24 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_
*/
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
+ MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end);
struct anon_vma *anon_vma = NULL;
+ struct vm_area_struct *prev, *next;
/* Try next first. */
- if (vma->vm_next) {
- anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next);
+ next = mas_walk(&mas);
+ if (next) {
+ anon_vma = reusable_anon_vma(next, vma, next);
if (anon_vma)
return anon_vma;
}
+ prev = mas_prev(&mas, 0);
+ VM_BUG_ON_VMA(prev != vma, vma);
+ prev = mas_prev(&mas, 0);
/* Try prev next. */
- if (vma->vm_prev)
- anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma);
+ if (prev)
+ anon_vma = reusable_anon_vma(prev, prev, vma);
/*
* We might reach here with anon_vma == NULL if we can't find
@@ -1307,22 +1136,21 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
return hint;
}
-static inline int mlock_future_check(struct mm_struct *mm,
- unsigned long flags,
- unsigned long len)
+bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
+ unsigned long bytes)
{
- unsigned long locked, lock_limit;
+ unsigned long locked_pages, limit_pages;
- /* mlock MCL_FUTURE? */
- if (flags & VM_LOCKED) {
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
- return 0;
+ if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
+ return true;
+
+ locked_pages = bytes >> PAGE_SHIFT;
+ locked_pages += mm->locked_vm;
+
+ limit_pages = rlimit(RLIMIT_MEMLOCK);
+ limit_pages >>= PAGE_SHIFT;
+
+ return locked_pages <= limit_pages;
}
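The page arithmetic in mlock_future_ok() with concrete, hypothetical numbers (4 KiB pages, 64 KiB RLIMIT_MEMLOCK, 10 pages already locked, a 32 KiB request without CAP_IPC_LOCK):

    unsigned long locked_pages = (32 * 1024) >> 12;     /* 8 pages requested */
    unsigned long limit_pages  = (64 * 1024) >> 12;     /* 16 pages allowed */

    locked_pages += 10;                                 /* 18 pages in total */
    /* 18 > 16: mlock_future_ok() returns false and do_mmap() below fails with -EAGAIN. */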
static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
@@ -1369,6 +1197,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags_t vm_flags;
int pkey = 0;
+ validate_mm(mm);
*populate = 0;
if (!len)
@@ -1412,9 +1241,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
return addr;
if (flags & MAP_FIXED_NOREPLACE) {
- struct vm_area_struct *vma = find_vma(mm, addr);
-
- if (vma && vma->vm_start < addr + len)
+ if (find_vma_intersection(mm, addr, addr + len))
return -EEXIST;
}
@@ -1435,7 +1262,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (!can_do_mlock())
return -EPERM;
- if (mlock_future_check(mm, vm_flags, len))
+ if (!mlock_future_ok(mm, vm_flags, len))
return -EAGAIN;
if (file) {
@@ -1475,12 +1302,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
return -EACCES;
- /*
- * Make sure there are no mandatory locks on the file.
- */
- if (locks_verify_locked(file))
- return -EAGAIN;
-
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
@@ -1566,7 +1387,6 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
goto out_fput;
}
} else if (flags & MAP_HUGETLB) {
- struct user_struct *user = NULL;
struct hstate *hs;
hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
@@ -1577,19 +1397,15 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
/*
* VM_NORESERVE is used because the reservations will be
* taken when vm_ops->mmap() is called
- * A dummy user value is used because we are not locking
- * memory so no accounting is necessary
*/
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE,
- &user, HUGETLB_ANONHUGE_INODE,
+ HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (IS_ERR(file))
return PTR_ERR(file);
}
- flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
-
retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
out_fput:
if (file)
@@ -1628,6 +1444,48 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */
+static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops)
+{
+ return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite);
+}
+
+static bool vma_is_shared_writable(struct vm_area_struct *vma)
+{
+ return (vma->vm_flags & (VM_WRITE | VM_SHARED)) ==
+ (VM_WRITE | VM_SHARED);
+}
+
+static bool vma_fs_can_writeback(struct vm_area_struct *vma)
+{
+ /* No managed pages to writeback. */
+ if (vma->vm_flags & VM_PFNMAP)
+ return false;
+
+ return vma->vm_file && vma->vm_file->f_mapping &&
+ mapping_can_writeback(vma->vm_file->f_mapping);
+}
+
+/*
+ * Does this VMA require the underlying folios to have their dirty state
+ * tracked?
+ */
+bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
+{
+ /* Only shared, writable VMAs require dirty tracking. */
+ if (!vma_is_shared_writable(vma))
+ return false;
+
+ /* Does the filesystem need to be notified? */
+ if (vm_ops_needs_writenotify(vma->vm_ops))
+ return true;
+
+ /*
+ * Even if the filesystem doesn't indicate a need for writenotify, if it
+ * can writeback, dirty tracking is still required.
+ */
+ return vma_fs_can_writeback(vma);
+}
+
/*
* Some shared mappings will want the pages marked read-only
* to track write events. If so, we'll downgrade vm_page_prot
@@ -1636,34 +1494,33 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
*/
int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
{
- vm_flags_t vm_flags = vma->vm_flags;
- const struct vm_operations_struct *vm_ops = vma->vm_ops;
-
/* If it was private or non-writable, the write bit is already clear */
- if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
+ if (!vma_is_shared_writable(vma))
return 0;
/* The backer wishes to know when pages are first written to? */
- if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite))
+ if (vm_ops_needs_writenotify(vma->vm_ops))
return 1;
/* The open routine did something to the protections that pgprot_modify
* won't preserve? */
if (pgprot_val(vm_page_prot) !=
- pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
+ pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags)))
return 0;
- /* Do we need to track softdirty? */
- if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY))
+ /*
+ * Do we need to track softdirty? hugetlb does not support softdirty
+ * tracking yet.
+ */
+ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
return 1;
- /* Specialty mapping? */
- if (vm_flags & VM_PFNMAP)
- return 0;
+ /* Do we need write faults for uffd-wp tracking? */
+ if (userfaultfd_wp(vma))
+ return 1;
/* Can the mapping track the dirty pages? */
- return vma->vm_file && vma->vm_file->f_mapping &&
- mapping_can_writeback(vma->vm_file->f_mapping);
+ return vma_fs_can_writeback(vma);
}
/*
@@ -1682,405 +1539,108 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
}
-unsigned long mmap_region(struct file *file, unsigned long addr,
- unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
- struct list_head *uf)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev, *merge;
- int error;
- struct rb_node **rb_link, *rb_parent;
- unsigned long charged = 0;
-
- /* Check against address space limit. */
- if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
- unsigned long nr_pages;
-
- /*
- * MAP_FIXED may remove pages of mappings that intersects with
- * requested mapping. Account for the pages it would unmap.
- */
- nr_pages = count_vma_pages_range(mm, addr, addr + len);
-
- if (!may_expand_vm(mm, vm_flags,
- (len >> PAGE_SHIFT) - nr_pages))
- return -ENOMEM;
- }
-
- /* Clear old maps */
- while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
- &rb_parent)) {
- if (do_munmap(mm, addr, len, uf))
- return -ENOMEM;
- }
-
- /*
- * Private writable mapping: check memory availability
- */
- if (accountable_mapping(file, vm_flags)) {
- charged = len >> PAGE_SHIFT;
- if (security_vm_enough_memory_mm(mm, charged))
- return -ENOMEM;
- vm_flags |= VM_ACCOUNT;
- }
-
- /*
- * Can we just expand an old mapping?
- */
- vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
- NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
- if (vma)
- goto out;
-
- /*
- * Determine the object being mapped and call the appropriate
- * specific mapper. the address has already been validated, but
- * not unmapped, but the maps are removed from the list.
- */
- vma = vm_area_alloc(mm);
- if (!vma) {
- error = -ENOMEM;
- goto unacct_error;
- }
-
- vma->vm_start = addr;
- vma->vm_end = addr + len;
- vma->vm_flags = vm_flags;
- vma->vm_page_prot = vm_get_page_prot(vm_flags);
- vma->vm_pgoff = pgoff;
-
- if (file) {
- if (vm_flags & VM_DENYWRITE) {
- error = deny_write_access(file);
- if (error)
- goto free_vma;
- }
- if (vm_flags & VM_SHARED) {
- error = mapping_map_writable(file->f_mapping);
- if (error)
- goto allow_write_and_free_vma;
- }
-
- /* ->mmap() can change vma->vm_file, but must guarantee that
- * vma_link() below can deny write-access if VM_DENYWRITE is set
- * and map writably if VM_SHARED is set. This usually means the
- * new file must not have been exposed to user-space, yet.
- */
- vma->vm_file = get_file(file);
- error = call_mmap(file, vma);
- if (error)
- goto unmap_and_free_vma;
-
- /* If vm_flags changed after call_mmap(), we should try merge vma again
- * as we may succeed this time.
- */
- if (unlikely(vm_flags != vma->vm_flags && prev)) {
- merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
- NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
- if (merge) {
- /* ->mmap() can change vma->vm_file and fput the original file. So
- * fput the vma->vm_file here or we would add an extra fput for file
- * and cause general protection fault ultimately.
- */
- fput(vma->vm_file);
- vm_area_free(vma);
- vma = merge;
- /* Update vm_flags and possible addr to pick up the change. We don't
- * warn here if addr changed as the vma is not linked by vma_link().
- */
- addr = vma->vm_start;
- vm_flags = vma->vm_flags;
- goto unmap_writable;
- }
- }
-
- /* Can addr have changed??
- *
- * Answer: Yes, several device drivers can do it in their
- * f_op->mmap method. -DaveM
- * Bug: If addr is changed, prev, rb_link, rb_parent should
- * be updated for vma_link()
- */
- WARN_ON_ONCE(addr != vma->vm_start);
-
- addr = vma->vm_start;
- vm_flags = vma->vm_flags;
- } else if (vm_flags & VM_SHARED) {
- error = shmem_zero_setup(vma);
- if (error)
- goto free_vma;
- } else {
- vma_set_anonymous(vma);
- }
-
- /* Allow architectures to sanity-check the vm_flags */
- if (!arch_validate_flags(vma->vm_flags)) {
- error = -EINVAL;
- if (file)
- goto unmap_and_free_vma;
- else
- goto free_vma;
- }
-
- vma_link(mm, vma, prev, rb_link, rb_parent);
- /* Once vma denies write, undo our temporary denial count */
- if (file) {
-unmap_writable:
- if (vm_flags & VM_SHARED)
- mapping_unmap_writable(file->f_mapping);
- if (vm_flags & VM_DENYWRITE)
- allow_write_access(file);
- }
- file = vma->vm_file;
-out:
- perf_event_mmap(vma);
-
- vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
- if (vm_flags & VM_LOCKED) {
- if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
- is_vm_hugetlb_page(vma) ||
- vma == get_gate_vma(current->mm))
- vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
- else
- mm->locked_vm += (len >> PAGE_SHIFT);
- }
-
- if (file)
- uprobe_mmap(vma);
-
- /*
- * New (or expanded) vma always get soft dirty status.
- * Otherwise user-space soft-dirty page tracker won't
- * be able to distinguish situation when vma area unmapped,
- * then new mapped in-place (which must be aimed as
- * a completely new data area).
- */
- vma->vm_flags |= VM_SOFTDIRTY;
-
- vma_set_page_prot(vma);
-
- return addr;
-
-unmap_and_free_vma:
- vma->vm_file = NULL;
- fput(file);
-
- /* Undo any partial mapping done by a device driver. */
- unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
- charged = 0;
- if (vm_flags & VM_SHARED)
- mapping_unmap_writable(file->f_mapping);
-allow_write_and_free_vma:
- if (vm_flags & VM_DENYWRITE)
- allow_write_access(file);
-free_vma:
- vm_area_free(vma);
-unacct_error:
- if (charged)
- vm_unacct_memory(charged);
- return error;
-}
-
+/**
+ * unmapped_area() - Find an area between the low_limit and the high_limit with
+ * the correct alignment and offset, all from @info. Note: current->mm is used
+ * for the search.
+ *
+ * @info: The unmapped area information including the range [low_limit -
+ * high_limit), the alignment offset and mask.
+ *
+ * Return: A memory address or -ENOMEM.
+ */
static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
- /*
- * We implement the search by looking for an rbtree node that
- * immediately follows a suitable gap. That is,
- * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
- * - gap_end = vma->vm_start >= info->low_limit + length;
- * - gap_end - gap_start >= length
- */
+ unsigned long length, gap;
+ unsigned long low_limit, high_limit;
+ struct vm_area_struct *tmp;
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long length, low_limit, high_limit, gap_start, gap_end;
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);
/* Adjust search length to account for worst case alignment overhead */
length = info->length + info->align_mask;
if (length < info->length)
return -ENOMEM;
- /* Adjust search limits by the desired length */
- if (info->high_limit < length)
+ low_limit = info->low_limit;
+ if (low_limit < mmap_min_addr)
+ low_limit = mmap_min_addr;
+ high_limit = info->high_limit;
+retry:
+ if (mas_empty_area(&mas, low_limit, high_limit - 1, length))
return -ENOMEM;
- high_limit = info->high_limit - length;
- if (info->low_limit > high_limit)
- return -ENOMEM;
- low_limit = info->low_limit + length;
-
- /* Check if rbtree root looks promising */
- if (RB_EMPTY_ROOT(&mm->mm_rb))
- goto check_highest;
- vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
- if (vma->rb_subtree_gap < length)
- goto check_highest;
-
- while (true) {
- /* Visit left subtree if it looks promising */
- gap_end = vm_start_gap(vma);
- if (gap_end >= low_limit && vma->vm_rb.rb_left) {
- struct vm_area_struct *left =
- rb_entry(vma->vm_rb.rb_left,
- struct vm_area_struct, vm_rb);
- if (left->rb_subtree_gap >= length) {
- vma = left;
- continue;
- }
+ gap = mas.index;
+ gap += (info->align_offset - gap) & info->align_mask;
+ tmp = mas_next(&mas, ULONG_MAX);
+ if (tmp && (tmp->vm_flags & VM_GROWSDOWN)) { /* Avoid prev check if possible */
+ if (vm_start_gap(tmp) < gap + length - 1) {
+ low_limit = tmp->vm_end;
+ mas_reset(&mas);
+ goto retry;
}
-
- gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
-check_current:
- /* Check if current node has a suitable gap */
- if (gap_start > high_limit)
- return -ENOMEM;
- if (gap_end >= low_limit &&
- gap_end > gap_start && gap_end - gap_start >= length)
- goto found;
-
- /* Visit right subtree if it looks promising */
- if (vma->vm_rb.rb_right) {
- struct vm_area_struct *right =
- rb_entry(vma->vm_rb.rb_right,
- struct vm_area_struct, vm_rb);
- if (right->rb_subtree_gap >= length) {
- vma = right;
- continue;
- }
- }
-
- /* Go back up the rbtree to find next candidate node */
- while (true) {
- struct rb_node *prev = &vma->vm_rb;
- if (!rb_parent(prev))
- goto check_highest;
- vma = rb_entry(rb_parent(prev),
- struct vm_area_struct, vm_rb);
- if (prev == vma->vm_rb.rb_left) {
- gap_start = vm_end_gap(vma->vm_prev);
- gap_end = vm_start_gap(vma);
- goto check_current;
- }
+ } else {
+ tmp = mas_prev(&mas, 0);
+ if (tmp && vm_end_gap(tmp) > gap) {
+ low_limit = vm_end_gap(tmp);
+ mas_reset(&mas);
+ goto retry;
}
}
-check_highest:
- /* Check highest gap, which does not precede any rbtree node */
- gap_start = mm->highest_vm_end;
- gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
- if (gap_start > high_limit)
- return -ENOMEM;
-
-found:
- /* We found a suitable gap. Clip it with the original low_limit. */
- if (gap_start < info->low_limit)
- gap_start = info->low_limit;
-
- /* Adjust gap address to the desired alignment */
- gap_start += (info->align_offset - gap_start) & info->align_mask;
-
- VM_BUG_ON(gap_start + info->length > info->high_limit);
- VM_BUG_ON(gap_start + info->length > gap_end);
- return gap_start;
+ return gap;
}
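Both gap searches are driven through struct vm_unmapped_area_info. A minimal bottom-up request, sketched with mmap_min_addr/TASK_SIZE as the limits and no alignment constraint (real callers come in via vm_unmapped_area(), as in the arch helpers below):

    struct vm_unmapped_area_info info = {
            .flags        = 0,              /* bottom-up; VM_UNMAPPED_AREA_TOPDOWN for the reverse walk */
            .length       = len,            /* hypothetical request length */
            .low_limit    = mmap_min_addr,
            .high_limit   = TASK_SIZE,
            .align_mask   = 0,
            .align_offset = 0,
    };
    unsigned long addr = vm_unmapped_area(&info);

    if (IS_ERR_VALUE(addr))                 /* -ENOMEM is encoded in the returned address */
            return addr;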
+/**
+ * unmapped_area_topdown() - Find an area between the low_limit and the
+ * high_limit with the correct alignment and offset at the highest available
+ * address, all from @info. Note: current->mm is used for the search.
+ *
+ * @info: The unmapped area information including the range [low_limit -
+ * high_limit), the alignment offset and mask.
+ *
+ * Return: A memory address or -ENOMEM.
+ */
static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long length, low_limit, high_limit, gap_start, gap_end;
+ unsigned long length, gap, gap_end;
+ unsigned long low_limit, high_limit;
+ struct vm_area_struct *tmp;
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);
/* Adjust search length to account for worst case alignment overhead */
length = info->length + info->align_mask;
if (length < info->length)
return -ENOMEM;
- /*
- * Adjust search limits by the desired length.
- * See implementation comment at top of unmapped_area().
- */
- gap_end = info->high_limit;
- if (gap_end < length)
+ low_limit = info->low_limit;
+ if (low_limit < mmap_min_addr)
+ low_limit = mmap_min_addr;
+ high_limit = info->high_limit;
+retry:
+ if (mas_empty_area_rev(&mas, low_limit, high_limit - 1, length))
return -ENOMEM;
- high_limit = gap_end - length;
- if (info->low_limit > high_limit)
- return -ENOMEM;
- low_limit = info->low_limit + length;
-
- /* Check highest gap, which does not precede any rbtree node */
- gap_start = mm->highest_vm_end;
- if (gap_start <= high_limit)
- goto found_highest;
-
- /* Check if rbtree root looks promising */
- if (RB_EMPTY_ROOT(&mm->mm_rb))
- return -ENOMEM;
- vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
- if (vma->rb_subtree_gap < length)
- return -ENOMEM;
-
- while (true) {
- /* Visit right subtree if it looks promising */
- gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
- if (gap_start <= high_limit && vma->vm_rb.rb_right) {
- struct vm_area_struct *right =
- rb_entry(vma->vm_rb.rb_right,
- struct vm_area_struct, vm_rb);
- if (right->rb_subtree_gap >= length) {
- vma = right;
- continue;
- }
+ gap = mas.last + 1 - info->length;
+ gap -= (gap - info->align_offset) & info->align_mask;
+ gap_end = mas.last;
+ tmp = mas_next(&mas, ULONG_MAX);
+ if (tmp && (tmp->vm_flags & VM_GROWSDOWN)) { /* Avoid prev check if possible */
+ if (vm_start_gap(tmp) <= gap_end) {
+ high_limit = vm_start_gap(tmp);
+ mas_reset(&mas);
+ goto retry;
}
-
-check_current:
- /* Check if current node has a suitable gap */
- gap_end = vm_start_gap(vma);
- if (gap_end < low_limit)
- return -ENOMEM;
- if (gap_start <= high_limit &&
- gap_end > gap_start && gap_end - gap_start >= length)
- goto found;
-
- /* Visit left subtree if it looks promising */
- if (vma->vm_rb.rb_left) {
- struct vm_area_struct *left =
- rb_entry(vma->vm_rb.rb_left,
- struct vm_area_struct, vm_rb);
- if (left->rb_subtree_gap >= length) {
- vma = left;
- continue;
- }
- }
-
- /* Go back up the rbtree to find next candidate node */
- while (true) {
- struct rb_node *prev = &vma->vm_rb;
- if (!rb_parent(prev))
- return -ENOMEM;
- vma = rb_entry(rb_parent(prev),
- struct vm_area_struct, vm_rb);
- if (prev == vma->vm_rb.rb_right) {
- gap_start = vma->vm_prev ?
- vm_end_gap(vma->vm_prev) : 0;
- goto check_current;
- }
+ } else {
+ tmp = mas_prev(&mas, 0);
+ if (tmp && vm_end_gap(tmp) > gap) {
+ high_limit = tmp->vm_start;
+ mas_reset(&mas);
+ goto retry;
}
}
-found:
- /* We found a suitable gap. Clip it with the original high_limit. */
- if (gap_end > info->high_limit)
- gap_end = info->high_limit;
-
-found_highest:
- /* Compute highest gap address at the desired alignment */
- gap_end -= info->length;
- gap_end -= (gap_end - info->align_offset) & info->align_mask;
-
- VM_BUG_ON(gap_end < info->low_limit);
- VM_BUG_ON(gap_end < gap_start);
- return gap_end;
+ return gap;
}
/*
@@ -2105,14 +1665,6 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
return addr;
}
-#ifndef arch_get_mmap_end
-#define arch_get_mmap_end(addr) (TASK_SIZE)
-#endif
-
-#ifndef arch_get_mmap_base
-#define arch_get_mmap_base(addr, base) (base)
-#endif
-
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
*
@@ -2124,15 +1676,15 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
*
* This function "knows" that -ENOMEM has the bits set.
*/
-#ifndef HAVE_ARCH_UNMAPPED_AREA
unsigned long
-arch_get_unmapped_area(struct file *filp, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
+generic_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
struct vm_unmapped_area_info info;
- const unsigned long mmap_end = arch_get_mmap_end(addr);
+ const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
if (len > mmap_end - mmap_min_addr)
return -ENOMEM;
@@ -2157,22 +1709,30 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
info.align_offset = 0;
return vm_unmapped_area(&info);
}
+
+#ifndef HAVE_ARCH_UNMAPPED_AREA
+unsigned long
+arch_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ return generic_get_unmapped_area(filp, addr, len, pgoff, flags);
+}
#endif
/*
* This mmap-allocator allocates new areas top-down from below the
* stack's low limit (the base):
*/
-#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
unsigned long
-arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
- unsigned long len, unsigned long pgoff,
- unsigned long flags)
+generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
struct vm_area_struct *vma, *prev;
struct mm_struct *mm = current->mm;
struct vm_unmapped_area_info info;
- const unsigned long mmap_end = arch_get_mmap_end(addr);
+ const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
/* requested length too big for entire address space */
if (len > mmap_end - mmap_min_addr)
@@ -2193,7 +1753,7 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
- info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+ info.low_limit = PAGE_SIZE;
info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
info.align_mask = 0;
info.align_offset = 0;
@@ -2215,6 +1775,15 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
return addr;
}
+
+#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags);
+}
#endif
unsigned long
@@ -2261,57 +1830,67 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
EXPORT_SYMBOL(get_unmapped_area);
-/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
-struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+/**
+ * find_vma_intersection() - Look up the first VMA which intersects the interval
+ * @mm: The process address space.
+ * @start_addr: The inclusive start user address.
+ * @end_addr: The exclusive end user address.
+ *
+ * Returns: The first VMA within the provided range, %NULL otherwise. Assumes
+ * start_addr < end_addr.
+ */
+struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
+ unsigned long start_addr,
+ unsigned long end_addr)
{
- struct rb_node *rb_node;
- struct vm_area_struct *vma;
-
- /* Check the cache first. */
- vma = vmacache_find(mm, addr);
- if (likely(vma))
- return vma;
+ unsigned long index = start_addr;
- rb_node = mm->mm_rb.rb_node;
-
- while (rb_node) {
- struct vm_area_struct *tmp;
-
- tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+ mmap_assert_locked(mm);
+ return mt_find(&mm->mm_mt, &index, end_addr - 1);
+}
+EXPORT_SYMBOL(find_vma_intersection);
- if (tmp->vm_end > addr) {
- vma = tmp;
- if (tmp->vm_start <= addr)
- break;
- rb_node = rb_node->rb_left;
- } else
- rb_node = rb_node->rb_right;
- }
+/**
+ * find_vma() - Find the VMA for a given address, or the next VMA.
+ * @mm: The mm_struct to check
+ * @addr: The address
+ *
+ * Returns: The VMA associated with addr, or the next VMA.
+ * May return %NULL in the case of no VMA at addr or above.
+ */
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+ unsigned long index = addr;
- if (vma)
- vmacache_update(addr, vma);
- return vma;
+ mmap_assert_locked(mm);
+ return mt_find(&mm->mm_mt, &index, ULONG_MAX);
}
-
EXPORT_SYMBOL(find_vma);
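Both lookups above assert that the mmap lock is held. A minimal caller sketch (not part of the patch) showing the expected locking around the maple-tree search:

static bool range_has_mapping(struct mm_struct *mm, unsigned long start,
			      unsigned long end)
{
	bool ret;

	mmap_read_lock(mm);
	ret = find_vma_intersection(mm, start, end) != NULL;
	mmap_read_unlock(mm);

	return ret;
}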
-/*
- * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
+/**
+ * find_vma_prev() - Find the VMA for a given address, or the next vma and
+ * set %pprev to the previous VMA, if any.
+ * @mm: The mm_struct to check
+ * @addr: The address
+ * @pprev: The pointer to set to the previous VMA
+ *
+ * Note that RCU lock is missing here since the external mmap_lock() is used
+ * instead.
+ *
+ * Returns: The VMA associated with @addr, or the next vma.
+ * May return %NULL in the case of no vma at addr or above.
*/
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)
{
struct vm_area_struct *vma;
+ MA_STATE(mas, &mm->mm_mt, addr, addr);
- vma = find_vma(mm, addr);
- if (vma) {
- *pprev = vma->vm_prev;
- } else {
- struct rb_node *rb_node = rb_last(&mm->mm_rb);
-
- *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL;
- }
+ vma = mas_walk(&mas);
+ *pprev = mas_prev(&mas, 0);
+ if (!vma)
+ vma = mas_next(&mas, ULONG_MAX);
return vma;
}
@@ -2335,15 +1914,8 @@ static int acct_stack_growth(struct vm_area_struct *vma,
return -ENOMEM;
/* mlock limit tests */
- if (vma->vm_flags & VM_LOCKED) {
- unsigned long locked;
- unsigned long limit;
- locked = mm->locked_vm + grow;
- limit = rlimit(RLIMIT_MEMLOCK);
- limit >>= PAGE_SHIFT;
- if (locked > limit && !capable(CAP_IPC_LOCK))
- return -ENOMEM;
- }
+ if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT))
+ return -ENOMEM;
/* Check to ensure the stack will not grow into a hugetlb-only region */
new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
@@ -2366,12 +1938,13 @@ static int acct_stack_growth(struct vm_area_struct *vma,
* PA-RISC uses this for its stack; IA64 for its Register Backing Store.
* vma is the last one with address > vma->vm_end. Have to extend vma.
*/
-int expand_upwards(struct vm_area_struct *vma, unsigned long address)
+static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next;
unsigned long gap_addr;
int error = 0;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
@@ -2389,17 +1962,24 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (gap_addr < address || gap_addr > TASK_SIZE)
gap_addr = TASK_SIZE;
- next = vma->vm_next;
- if (next && next->vm_start < gap_addr && vma_is_accessible(next)) {
+ next = find_vma_intersection(mm, vma->vm_end, gap_addr);
+ if (next && vma_is_accessible(next)) {
if (!(next->vm_flags & VM_GROWSUP))
return -ENOMEM;
/* Check that both stack segments have the same anon_vma? */
}
+ if (mas_preallocate(&mas, GFP_KERNEL))
+ return -ENOMEM;
+
/* We must make sure the anon_vma is allocated. */
- if (unlikely(anon_vma_prepare(vma)))
+ if (unlikely(anon_vma_prepare(vma))) {
+ mas_destroy(&mas);
return -ENOMEM;
+ }
+ /* Lock the VMA before expanding to prevent concurrent page faults */
+ vma_start_write(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_lock in read mode. We need the
@@ -2419,15 +1999,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
error = acct_stack_growth(vma, size, grow);
if (!error) {
/*
- * vma_gap_update() doesn't support concurrent
- * updates, but we only hold a shared mmap_lock
- * lock here, so we need to protect against
- * concurrent vma expansions.
- * anon_vma_lock_write() doesn't help here, as
- * we don't guarantee that all growable vmas
- * in a mm share the same root anon vma.
- * So, we reuse mm->page_table_lock to guard
- * against concurrent vma expansions.
+ * We only hold a shared mmap_lock lock here, so
+ * we need to protect against concurrent vma
+ * expansions. anon_vma_lock_write() doesn't
+ * help here, as we don't guarantee that all
+ * growable vmas in a mm share the same root
+ * anon vma. So, we reuse mm->page_table_lock
+ * to guard against concurrent vma expansions.
*/
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
@@ -2435,11 +2013,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
+ /* Overwrite old entry in mtree. */
+ mas_set_range(&mas, vma->vm_start, address - 1);
+ mas_store_prealloc(&mas, vma);
anon_vma_interval_tree_post_update_vma(vma);
- if (vma->vm_next)
- vma_gap_update(vma->vm_next);
- else
- mm->highest_vm_end = vm_end_gap(vma);
spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
@@ -2447,39 +2024,51 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
}
anon_vma_unlock_write(vma->anon_vma);
- khugepaged_enter_vma_merge(vma, vma->vm_flags);
- validate_mm(mm);
+ khugepaged_enter_vma(vma, vma->vm_flags);
+ mas_destroy(&mas);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
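Both stack-expansion paths above follow the same maple-tree idiom: preallocate nodes while sleeping is still allowed, store under the spinlock where failure is not an option, then release whatever was not used. A condensed sketch of that idiom (error handling trimmed; not verbatim from the patch):

static int store_vma_sketch(struct mm_struct *mm, struct vm_area_struct *vma)
{
	MA_STATE(mas, &mm->mm_mt, 0, 0);

	if (mas_preallocate(&mas, GFP_KERNEL))	/* may sleep */
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
	mas_set_range(&mas, vma->vm_start, vma->vm_end - 1);
	mas_store_prealloc(&mas, vma);		/* cannot fail; consumes the preallocation */
	spin_unlock(&mm->page_table_lock);

	mas_destroy(&mas);			/* frees any unused preallocated nodes */
	return 0;
}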
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
+ * mmap_lock held for writing.
*/
-int expand_downwards(struct vm_area_struct *vma,
- unsigned long address)
+int expand_downwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
struct vm_area_struct *prev;
int error = 0;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ return -EFAULT;
+
address &= PAGE_MASK;
- if (address < mmap_min_addr)
+ if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
return -EPERM;
/* Enforce stack_guard_gap */
- prev = vma->vm_prev;
+ prev = mas_prev(&mas, 0);
/* Check that both stack segments have the same anon_vma? */
- if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
- vma_is_accessible(prev)) {
- if (address - prev->vm_end < stack_guard_gap)
+ if (prev) {
+ if (!(prev->vm_flags & VM_GROWSDOWN) &&
+ vma_is_accessible(prev) &&
+ (address - prev->vm_end < stack_guard_gap))
return -ENOMEM;
}
+ if (mas_preallocate(&mas, GFP_KERNEL))
+ return -ENOMEM;
+
/* We must make sure the anon_vma is allocated. */
- if (unlikely(anon_vma_prepare(vma)))
+ if (unlikely(anon_vma_prepare(vma))) {
+ mas_destroy(&mas);
return -ENOMEM;
+ }
+ /* Lock the VMA before expanding to prevent concurrent page faults */
+ vma_start_write(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_lock in read mode. We need the
@@ -2499,15 +2088,13 @@ int expand_downwards(struct vm_area_struct *vma,
error = acct_stack_growth(vma, size, grow);
if (!error) {
/*
- * vma_gap_update() doesn't support concurrent
- * updates, but we only hold a shared mmap_lock
- * lock here, so we need to protect against
- * concurrent vma expansions.
- * anon_vma_lock_write() doesn't help here, as
- * we don't guarantee that all growable vmas
- * in a mm share the same root anon vma.
- * So, we reuse mm->page_table_lock to guard
- * against concurrent vma expansions.
+ * We only hold a shared mmap_lock lock here, so
+ * we need to protect against concurrent vma
+ * expansions. anon_vma_lock_write() doesn't
+ * help here, as we don't guarantee that all
+ * growable vmas in a mm share the same root
+ * anon vma. So, we reuse mm->page_table_lock
+ * to guard against concurrent vma expansions.
*/
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
@@ -2516,8 +2103,10 @@ int expand_downwards(struct vm_area_struct *vma,
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
vma->vm_pgoff -= grow;
+ /* Overwrite old entry in mtree. */
+ mas_set_range(&mas, address, vma->vm_end - 1);
+ mas_store_prealloc(&mas, vma);
anon_vma_interval_tree_post_update_vma(vma);
- vma_gap_update(vma);
spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
@@ -2525,8 +2114,8 @@ int expand_downwards(struct vm_area_struct *vma,
}
}
anon_vma_unlock_write(vma->anon_vma);
- khugepaged_enter_vma_merge(vma, vma->vm_flags);
- validate_mm(mm);
+ khugepaged_enter_vma(vma, vma->vm_flags);
+ mas_destroy(&mas);
return error;
}
@@ -2542,18 +2131,17 @@ static int __init cmdline_parse_stack_guard_gap(char *p)
if (!*endptr)
stack_guard_gap = val << PAGE_SHIFT;
- return 0;
+ return 1;
}
__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
#ifdef CONFIG_STACK_GROWSUP
-int expand_stack(struct vm_area_struct *vma, unsigned long address)
+int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
{
return expand_upwards(vma, address);
}
-struct vm_area_struct *
-find_extend_vma(struct mm_struct *mm, unsigned long addr)
+struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma, *prev;
@@ -2561,21 +2149,23 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
vma = find_vma_prev(mm, addr, &prev);
if (vma && (vma->vm_start <= addr))
return vma;
- /* don't alter vm_end if the coredump is running */
- if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr))
+ if (!prev)
+ return NULL;
+ if (expand_stack_locked(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED)
populate_vma_page_range(prev, addr, prev->vm_end, NULL);
return prev;
}
#else
-int expand_stack(struct vm_area_struct *vma, unsigned long address)
+int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
{
+ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
+ return -EINVAL;
return expand_downwards(vma, address);
}
-struct vm_area_struct *
-find_extend_vma(struct mm_struct *mm, unsigned long addr)
+struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma;
unsigned long start;
@@ -2586,13 +2176,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
return NULL;
if (vma->vm_start <= addr)
return vma;
- if (!(vma->vm_flags & VM_GROWSDOWN))
- return NULL;
- /* don't alter vm_start if the coredump is running */
- if (!mmget_still_valid(mm))
- return NULL;
start = vma->vm_start;
- if (expand_stack(vma, addr))
+ if (expand_stack_locked(vma, addr))
return NULL;
if (vma->vm_flags & VM_LOCKED)
populate_vma_page_range(vma, addr, start, NULL);
@@ -2600,28 +2185,113 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
}
#endif
-EXPORT_SYMBOL_GPL(find_extend_vma);
+/*
+ * IA64 has some horrid mapping rules: it can expand both up and down,
+ * but with various special rules.
+ *
+ * We'll get rid of this architecture eventually, so the ugliness is
+ * temporary.
+ */
+#ifdef CONFIG_IA64
+static inline bool vma_expand_ok(struct vm_area_struct *vma, unsigned long addr)
+{
+ return REGION_NUMBER(addr) == REGION_NUMBER(vma->vm_start) &&
+ REGION_OFFSET(addr) < RGN_MAP_LIMIT;
+}
+
+/*
+ * IA64 stacks grow down, but there's a special register backing store
+ * that can grow up. Only sequentially, though, so the new address must
+ * match vm_end.
+ */
+static inline int vma_expand_up(struct vm_area_struct *vma, unsigned long addr)
+{
+ if (!vma_expand_ok(vma, addr))
+ return -EFAULT;
+ if (vma->vm_end != (addr & PAGE_MASK))
+ return -EFAULT;
+ return expand_upwards(vma, addr);
+}
+
+static inline bool vma_expand_down(struct vm_area_struct *vma, unsigned long addr)
+{
+ if (!vma_expand_ok(vma, addr))
+ return -EFAULT;
+ return expand_downwards(vma, addr);
+}
+
+#elif defined(CONFIG_STACK_GROWSUP)
+
+#define vma_expand_up(vma,addr) expand_upwards(vma, addr)
+#define vma_expand_down(vma, addr) (-EFAULT)
+
+#else
+
+#define vma_expand_up(vma,addr) (-EFAULT)
+#define vma_expand_down(vma, addr) expand_downwards(vma, addr)
+
+#endif
+
+/*
+ * expand_stack(): legacy interface for page faulting. Don't use unless
+ * you have to.
+ *
+ * This is called with the mm locked for reading, drops the lock, takes
+ * the lock for writing, tries to look up a vma again, expands it if
+ * necessary, and downgrades the lock to reading again.
+ *
+ * If no vma is found or it can't be expanded, it returns NULL and has
+ * dropped the lock.
+ */
+struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma, *prev;
+
+ mmap_read_unlock(mm);
+ if (mmap_write_lock_killable(mm))
+ return NULL;
+
+ vma = find_vma_prev(mm, addr, &prev);
+ if (vma && vma->vm_start <= addr)
+ goto success;
+
+ if (prev && !vma_expand_up(prev, addr)) {
+ vma = prev;
+ goto success;
+ }
+
+ if (vma && !vma_expand_down(vma, addr))
+ goto success;
+
+ mmap_write_unlock(mm);
+ return NULL;
+
+success:
+ mmap_write_downgrade(mm);
+ return vma;
+}
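A caller sketch for the new interface (hypothetical, but mirroring how the fault path is expected to use it): enter with the mmap lock held for reading, and either keep using the returned vma under the read lock or get NULL back with the lock already dropped.

static struct vm_area_struct *find_or_grow_stack(struct mm_struct *mm,
						 unsigned long addr)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = find_vma(mm, addr);
	if (vma && vma->vm_start <= addr)
		return vma;		/* read lock still held */

	/* Drops, re-takes and downgrades the lock internally. */
	return expand_stack(mm, addr);	/* NULL means the lock was dropped */
}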
/*
- * Ok - we have the memory areas we should free on the vma list,
- * so release them, and do the vma updates.
+ * Ok - we have the memory areas we should free on a maple tree, so release them,
+ * and do the vma updates.
*
* Called with the mm semaphore held.
*/
-static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
+static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
{
unsigned long nr_accounted = 0;
+ struct vm_area_struct *vma;
/* Update high watermark before we lower total_vm */
update_hiwater_vm(mm);
- do {
+ mas_for_each(mas, vma, ULONG_MAX) {
long nrpages = vma_pages(vma);
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += nrpages;
vm_stat_account(mm, vma->vm_flags, -nrpages);
- vma = remove_vma(vma);
- } while (vma);
+ remove_vma(vma, false);
+ }
vm_unacct_memory(nr_accounted);
validate_mm(mm);
}
@@ -2631,76 +2301,42 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
*
* Called with the mm semaphore held.
*/
-static void unmap_region(struct mm_struct *mm,
+static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
struct vm_area_struct *vma, struct vm_area_struct *prev,
- unsigned long start, unsigned long end)
+ struct vm_area_struct *next,
+ unsigned long start, unsigned long end, bool mm_wr_locked)
{
- struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
struct mmu_gather tlb;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, start, end);
+ tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
- unmap_vmas(&tlb, vma, start, end);
- free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
- next ? next->vm_start : USER_PGTABLES_CEILING);
- tlb_finish_mmu(&tlb, start, end);
-}
-
-/*
- * Create a list of vma's touched by the unmap, removing them from the mm's
- * vma list as we go..
- */
-static bool
-detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, unsigned long end)
-{
- struct vm_area_struct **insertion_point;
- struct vm_area_struct *tail_vma = NULL;
-
- insertion_point = (prev ? &prev->vm_next : &mm->mmap);
- vma->vm_prev = NULL;
- do {
- vma_rb_erase(vma, &mm->mm_rb);
- mm->map_count--;
- tail_vma = vma;
- vma = vma->vm_next;
- } while (vma && vma->vm_start < end);
- *insertion_point = vma;
- if (vma) {
- vma->vm_prev = prev;
- vma_gap_update(vma);
- } else
- mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
- tail_vma->vm_next = NULL;
-
- /* Kill the cache */
- vmacache_invalidate(mm);
-
- /*
- * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
- * VM_GROWSUP VMA. Such VMAs can change their size under
- * down_read(mmap_lock) and collide with the VMA we are about to unmap.
- */
- if (vma && (vma->vm_flags & VM_GROWSDOWN))
- return false;
- if (prev && (prev->vm_flags & VM_GROWSUP))
- return false;
- return true;
+ unmap_vmas(&tlb, mt, vma, start, end, mm_wr_locked);
+ free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+ next ? next->vm_start : USER_PGTABLES_CEILING,
+ mm_wr_locked);
+ tlb_finish_mmu(&tlb);
}
/*
* __split_vma() bypasses sysctl_max_map_count checking. We use this where it
* has already been checked or doesn't make sense to fail.
+ * VMA Iterator will point to the end VMA.
*/
-int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long addr, int new_below)
{
+ struct vma_prepare vp;
struct vm_area_struct *new;
int err;
- if (vma->vm_ops && vma->vm_ops->split) {
- err = vma->vm_ops->split(vma, addr);
+ validate_mm(vma->vm_mm);
+
+ WARN_ON(vma->vm_start >= addr);
+ WARN_ON(vma->vm_end <= addr);
+
+ if (vma->vm_ops && vma->vm_ops->may_split) {
+ err = vma->vm_ops->may_split(vma, addr);
if (err)
return err;
}
@@ -2709,16 +2345,20 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
if (!new)
return -ENOMEM;
- if (new_below)
+ err = -ENOMEM;
+ if (vma_iter_prealloc(vmi))
+ goto out_free_vma;
+
+ if (new_below) {
new->vm_end = addr;
- else {
+ } else {
new->vm_start = addr;
new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
}
err = vma_dup_policy(vma, new);
if (err)
- goto out_free_vma;
+ goto out_free_vmi;
err = anon_vma_clone(new, vma);
if (err)
@@ -2730,26 +2370,34 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
- if (new_below)
- err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
- ((addr - new->vm_start) >> PAGE_SHIFT), new);
- else
- err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
+ init_vma_prep(&vp, vma);
+ vp.insert = new;
+ vma_prepare(&vp);
+ vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
+
+ if (new_below) {
+ vma->vm_start = addr;
+ vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT;
+ } else {
+ vma->vm_end = addr;
+ }
+
+ /* vma_complete stores the new vma */
+ vma_complete(&vp, vmi, vma->vm_mm);
/* Success. */
- if (!err)
- return 0;
+ if (new_below)
+ vma_next(vmi);
+ validate_mm(vma->vm_mm);
+ return 0;
- /* Clean everything up if vma_adjust failed. */
- if (new->vm_ops && new->vm_ops->close)
- new->vm_ops->close(new);
- if (new->vm_file)
- fput(new->vm_file);
- unlink_anon_vmas(new);
- out_free_mpol:
+out_free_mpol:
mpol_put(vma_policy(new));
- out_free_vma:
+out_free_vmi:
+ vma_iter_free(vmi);
+out_free_vma:
vm_area_free(new);
+ validate_mm(vma->vm_mm);
return err;
}
@@ -2757,51 +2405,42 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
* Split a vma into two pieces at address 'addr', a new vma is allocated
* either for the first part or the tail.
*/
-int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long addr, int new_below)
{
- if (mm->map_count >= sysctl_max_map_count)
+ if (vma->vm_mm->map_count >= sysctl_max_map_count)
return -ENOMEM;
- return __split_vma(mm, vma, addr, new_below);
+ return __split_vma(vmi, vma, addr, new_below);
}
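Typical callers (mprotect-style range operations) use the iterator-based split to clamp a vma to the range they are about to modify. A simplified sketch, not taken verbatim from this patch:

static int clamp_vma_to_range(struct vma_iterator *vmi,
			      struct vm_area_struct *vma,
			      unsigned long start, unsigned long end)
{
	int error;

	if (start > vma->vm_start) {
		/* New vma covers [vm_start, start); @vma keeps [start, vm_end). */
		error = split_vma(vmi, vma, start, 1);
		if (error)
			return error;
	}
	if (end < vma->vm_end) {
		/* New vma covers [end, vm_end); @vma now spans [start, end). */
		error = split_vma(vmi, vma, end, 0);
		if (error)
			return error;
	}
	return 0;
}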
-/* Munmap is split into 2 main parts -- this part which finds
- * what needs doing, and the areas themselves, which do the
- * work. This now handles partial unmappings.
- * Jeremy Fitzhardinge <jeremy@goop.org>
+/*
+ * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
+ * @vmi: The vma iterator
+ * @vma: The starting vm_area_struct
+ * @mm: The mm_struct
+ * @start: The aligned start address to munmap.
+ * @end: The aligned end address to munmap.
+ * @uf: The userfaultfd list_head
+ * @unlock: Set to true to drop the mmap_lock. Unlocking only happens on
+ * success.
+ *
+ * Return: 0 on success and drops the lock if so directed, error and leaves the
+ * lock held otherwise.
*/
-int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
- struct list_head *uf, bool downgrade)
-{
- unsigned long end;
- struct vm_area_struct *vma, *prev, *last;
-
- if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
- return -EINVAL;
-
- len = PAGE_ALIGN(len);
- end = start + len;
- if (len == 0)
- return -EINVAL;
-
- /*
- * arch_unmap() might do unmaps itself. It must be called
- * and finish any rbtree manipulation before this code
- * runs and also starts to manipulate the rbtree.
- */
- arch_unmap(mm, start, end);
-
- /* Find the first overlapping VMA */
- vma = find_vma(mm, start);
- if (!vma)
- return 0;
- prev = vma->vm_prev;
- /* we have start < vma->vm_end */
-
- /* if it doesn't overlap, we have nothing.. */
- if (vma->vm_start >= end)
- return 0;
+static int
+do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct list_head *uf, bool unlock)
+{
+ struct vm_area_struct *prev, *next = NULL;
+ struct maple_tree mt_detach;
+ int count = 0;
+ int error = -ENOMEM;
+ unsigned long locked_vm = 0;
+ MA_STATE(mas_detach, &mt_detach, 0, 0);
+ mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
+ mt_set_external_lock(&mt_detach, &mm->mmap_lock);
/*
* If we need to split any vma, do it now to save pain later.
@@ -2810,8 +2449,9 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
* unmapped vm_area_struct will remain in use: so lower split_vma
* places tmp vma above, and higher split_vma places tmp vma below.
*/
+
+ /* Does it split the first one? */
if (start > vma->vm_start) {
- int error;
/*
* Make sure that map_count on return from munmap() will
@@ -2819,93 +2459,452 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
* its limit temporarily, to help free resources as expected.
*/
if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
- return -ENOMEM;
+ goto map_count_exceeded;
- error = __split_vma(mm, vma, start, 0);
+ error = __split_vma(vmi, vma, start, 0);
if (error)
- return error;
- prev = vma;
- }
+ goto start_split_failed;
- /* Does it split the last one? */
- last = find_vma(mm, end);
- if (last && end > last->vm_start) {
- int error = __split_vma(mm, last, end, 1);
- if (error)
- return error;
+ vma = vma_iter_load(vmi);
}
- vma = prev ? prev->vm_next : mm->mmap;
- if (unlikely(uf)) {
- /*
- * If userfaultfd_unmap_prep returns an error the vmas
- * will remain splitted, but userland will get a
- * highly unexpected error anyway. This is no
- * different than the case where the first of the two
- * __split_vma fails, but we don't undo the first
- * split, despite we could. This is unlikely enough
- * failure that it's not worth optimizing it for.
- */
- int error = userfaultfd_unmap_prep(vma, start, end, uf);
- if (error)
- return error;
- }
+ prev = vma_prev(vmi);
+ if (unlikely((!prev)))
+ vma_iter_set(vmi, start);
/*
- * unlock any mlock()ed ranges before detaching vmas
+ * Detach a range of VMAs from the mm. Using next as a temp variable as
+ * it is always overwritten.
*/
- if (mm->locked_vm) {
- struct vm_area_struct *tmp = vma;
- while (tmp && tmp->vm_start < end) {
- if (tmp->vm_flags & VM_LOCKED) {
- mm->locked_vm -= vma_pages(tmp);
- munlock_vma_pages_all(tmp);
- }
+ for_each_vma_range(*vmi, next, end) {
+ /* Does it split the end? */
+ if (next->vm_end > end) {
+ error = __split_vma(vmi, next, end, 0);
+ if (error)
+ goto end_split_failed;
+ }
+ vma_start_write(next);
+ mas_set_range(&mas_detach, next->vm_start, next->vm_end - 1);
+ error = mas_store_gfp(&mas_detach, next, GFP_KERNEL);
+ if (error)
+ goto munmap_gather_failed;
+ vma_mark_detached(next, true);
+ if (next->vm_flags & VM_LOCKED)
+ locked_vm += vma_pages(next);
- tmp = tmp->vm_next;
+ count++;
+ if (unlikely(uf)) {
+ /*
+ * If userfaultfd_unmap_prep returns an error the vmas
+ * will remain split, but userland will get a
+ * highly unexpected error anyway. This is no
+ * different than the case where the first of the two
+ * __split_vma fails, but we don't undo the first
+ * split, despite we could. This is unlikely enough
+ * failure that it's not worth optimizing it for.
+ */
+ error = userfaultfd_unmap_prep(next, start, end, uf);
+
+ if (error)
+ goto userfaultfd_error;
}
+#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
+ BUG_ON(next->vm_start < start);
+ BUG_ON(next->vm_start > end);
+#endif
}
- /* Detach vmas from rbtree */
- if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
- downgrade = false;
+ if (vma_iter_end(vmi) > end)
+ next = vma_iter_load(vmi);
+
+ if (!next)
+ next = vma_next(vmi);
- if (downgrade)
+#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
+ /* Make sure no VMAs are about to be lost. */
+ {
+ MA_STATE(test, &mt_detach, start, end - 1);
+ struct vm_area_struct *vma_mas, *vma_test;
+ int test_count = 0;
+
+ vma_iter_set(vmi, start);
+ rcu_read_lock();
+ vma_test = mas_find(&test, end - 1);
+ for_each_vma_range(*vmi, vma_mas, end) {
+ BUG_ON(vma_mas != vma_test);
+ test_count++;
+ vma_test = mas_next(&test, end - 1);
+ }
+ rcu_read_unlock();
+ BUG_ON(count != test_count);
+ }
+#endif
+ vma_iter_set(vmi, start);
+ error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
+ if (error)
+ goto clear_tree_failed;
+
+ /* Point of no return */
+ mm->locked_vm -= locked_vm;
+ mm->map_count -= count;
+ if (unlock)
mmap_write_downgrade(mm);
- unmap_region(mm, vma, prev, start, end);
+ /*
+ * We can free page tables without write-locking mmap_lock because VMAs
+ * were isolated before we downgraded mmap_lock.
+ */
+ unmap_region(mm, &mt_detach, vma, prev, next, start, end, !unlock);
+ /* Statistics and freeing VMAs */
+ mas_set(&mas_detach, start);
+ remove_mt(mm, &mas_detach);
+ __mt_destroy(&mt_detach);
+ validate_mm(mm);
+ if (unlock)
+ mmap_read_unlock(mm);
+
+ return 0;
+
+clear_tree_failed:
+userfaultfd_error:
+munmap_gather_failed:
+end_split_failed:
+ mas_set(&mas_detach, 0);
+ mas_for_each(&mas_detach, next, end)
+ vma_mark_detached(next, false);
+
+ __mt_destroy(&mt_detach);
+start_split_failed:
+map_count_exceeded:
+ validate_mm(mm);
+ return error;
+}
+
+/*
+ * do_vmi_munmap() - munmap a given range.
+ * @vmi: The vma iterator
+ * @mm: The mm_struct
+ * @start: The start address to munmap
+ * @len: The length of the range to munmap
+ * @uf: The userfaultfd list_head
+ * @unlock: set to true if the user wants to drop the mmap_lock on success
+ *
+ * This function takes a @vmi that is either pointing to the previous VMA or set
+ * to MA_START and sets it up to remove the mapping(s). The @len will be
+ * aligned and any arch_unmap work will be performed.

+ *
+ * Return: 0 on success and drops the lock if so directed, error and leaves the
+ * lock held otherwise.
+ */
+int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
+ unsigned long start, size_t len, struct list_head *uf,
+ bool unlock)
+{
+ unsigned long end;
+ struct vm_area_struct *vma;
+
+ if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
+ return -EINVAL;
+
+ end = start + PAGE_ALIGN(len);
+ if (end == start)
+ return -EINVAL;
- /* Fix up all other VM information */
- remove_vma_list(mm, vma);
+ /* arch_unmap() might do unmaps itself. */
+ arch_unmap(mm, start, end);
- return downgrade ? 1 : 0;
+ /* Find the first overlapping VMA */
+ vma = vma_find(vmi, end);
+ if (!vma) {
+ if (unlock)
+ mmap_write_unlock(mm);
+ return 0;
+ }
+
+ return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
}
+/* do_munmap() - Wrapper function for callers that are not maple tree aware.
+ * @mm: The mm_struct
+ * @start: The start address to munmap
+ * @len: The length to be munmapped.
+ * @uf: The userfaultfd list_head
+ *
+ * Return: 0 on success, error otherwise.
+ */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
struct list_head *uf)
{
- return __do_munmap(mm, start, len, uf, false);
+ VMA_ITERATOR(vmi, mm, start);
+
+ return do_vmi_munmap(&vmi, mm, start, len, uf, false);
+}
+
+unsigned long mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
+ struct list_head *uf)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = NULL;
+ struct vm_area_struct *next, *prev, *merge;
+ pgoff_t pglen = len >> PAGE_SHIFT;
+ unsigned long charged = 0;
+ unsigned long end = addr + len;
+ unsigned long merge_start = addr, merge_end = end;
+ pgoff_t vm_pgoff;
+ int error;
+ VMA_ITERATOR(vmi, mm, addr);
+
+ /* Check against address space limit. */
+ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
+ unsigned long nr_pages;
+
+ /*
+	 * MAP_FIXED may remove pages of mappings that intersect with the
+	 * requested mapping. Account for the pages it would unmap.
+ */
+ nr_pages = count_vma_pages_range(mm, addr, end);
+
+ if (!may_expand_vm(mm, vm_flags,
+ (len >> PAGE_SHIFT) - nr_pages))
+ return -ENOMEM;
+ }
+
+ /* Unmap any existing mapping in the area */
+ if (do_vmi_munmap(&vmi, mm, addr, len, uf, false))
+ return -ENOMEM;
+
+ /*
+ * Private writable mapping: check memory availability
+ */
+ if (accountable_mapping(file, vm_flags)) {
+ charged = len >> PAGE_SHIFT;
+ if (security_vm_enough_memory_mm(mm, charged))
+ return -ENOMEM;
+ vm_flags |= VM_ACCOUNT;
+ }
+
+ next = vma_next(&vmi);
+ prev = vma_prev(&vmi);
+ if (vm_flags & VM_SPECIAL)
+ goto cannot_expand;
+
+ /* Attempt to expand an old mapping */
+ /* Check next */
+ if (next && next->vm_start == end && !vma_policy(next) &&
+ can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen,
+ NULL_VM_UFFD_CTX, NULL)) {
+ merge_end = next->vm_end;
+ vma = next;
+ vm_pgoff = next->vm_pgoff - pglen;
+ }
+
+ /* Check prev */
+ if (prev && prev->vm_end == addr && !vma_policy(prev) &&
+ (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file,
+ pgoff, vma->vm_userfaultfd_ctx, NULL) :
+ can_vma_merge_after(prev, vm_flags, NULL, file, pgoff,
+ NULL_VM_UFFD_CTX, NULL))) {
+ merge_start = prev->vm_start;
+ vma = prev;
+ vm_pgoff = prev->vm_pgoff;
+ }
+
+ /* Actually expand, if possible */
+ if (vma &&
+ !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) {
+ khugepaged_enter_vma(vma, vm_flags);
+ goto expanded;
+ }
+
+cannot_expand:
+ if (prev)
+ vma_iter_next_range(&vmi);
+
+ /*
+ * Determine the object being mapped and call the appropriate
+	 * specific mapper. The address has already been validated but
+	 * not unmapped; the maps have been removed from the list.
+ */
+ vma = vm_area_alloc(mm);
+ if (!vma) {
+ error = -ENOMEM;
+ goto unacct_error;
+ }
+
+ vma_iter_set(&vmi, addr);
+ vma->vm_start = addr;
+ vma->vm_end = end;
+ vm_flags_init(vma, vm_flags);
+ vma->vm_page_prot = vm_get_page_prot(vm_flags);
+ vma->vm_pgoff = pgoff;
+
+ if (file) {
+ if (vm_flags & VM_SHARED) {
+ error = mapping_map_writable(file->f_mapping);
+ if (error)
+ goto free_vma;
+ }
+
+ vma->vm_file = get_file(file);
+ error = call_mmap(file, vma);
+ if (error)
+ goto unmap_and_free_vma;
+
+ /*
+ * Expansion is handled above, merging is handled below.
+ * Drivers should not alter the address of the VMA.
+ */
+ error = -EINVAL;
+ if (WARN_ON((addr != vma->vm_start)))
+ goto close_and_free_vma;
+
+ vma_iter_set(&vmi, addr);
+ /*
+ * If vm_flags changed after call_mmap(), we should try merge
+ * vma again as we may succeed this time.
+ */
+ if (unlikely(vm_flags != vma->vm_flags && prev)) {
+ merge = vma_merge(&vmi, mm, prev, vma->vm_start,
+ vma->vm_end, vma->vm_flags, NULL,
+ vma->vm_file, vma->vm_pgoff, NULL,
+ NULL_VM_UFFD_CTX, NULL);
+ if (merge) {
+ /*
+ * ->mmap() can change vma->vm_file and fput
+ * the original file. So fput the vma->vm_file
+ * here or we would add an extra fput for file
+ * and cause general protection fault
+ * ultimately.
+ */
+ fput(vma->vm_file);
+ vm_area_free(vma);
+ vma = merge;
+ /* Update vm_flags to pick up the change. */
+ vm_flags = vma->vm_flags;
+ goto unmap_writable;
+ }
+ }
+
+ vm_flags = vma->vm_flags;
+ } else if (vm_flags & VM_SHARED) {
+ error = shmem_zero_setup(vma);
+ if (error)
+ goto free_vma;
+ } else {
+ vma_set_anonymous(vma);
+ }
+
+ if (map_deny_write_exec(vma, vma->vm_flags)) {
+ error = -EACCES;
+ goto close_and_free_vma;
+ }
+
+ /* Allow architectures to sanity-check the vm_flags */
+ error = -EINVAL;
+ if (!arch_validate_flags(vma->vm_flags))
+ goto close_and_free_vma;
+
+ error = -ENOMEM;
+ if (vma_iter_prealloc(&vmi))
+ goto close_and_free_vma;
+
+ /* Lock the VMA since it is modified after insertion into VMA tree */
+ vma_start_write(vma);
+ if (vma->vm_file)
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+
+ vma_iter_store(&vmi, vma);
+ mm->map_count++;
+ if (vma->vm_file) {
+ if (vma->vm_flags & VM_SHARED)
+ mapping_allow_writable(vma->vm_file->f_mapping);
+
+ flush_dcache_mmap_lock(vma->vm_file->f_mapping);
+ vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap);
+ flush_dcache_mmap_unlock(vma->vm_file->f_mapping);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ }
+
+ /*
+	 * vma_merge() calls khugepaged_enter_vma() too; the call below
+	 * covers the non-merge case.
+ */
+ khugepaged_enter_vma(vma, vma->vm_flags);
+
+ /* Once vma denies write, undo our temporary denial count */
+unmap_writable:
+ if (file && vm_flags & VM_SHARED)
+ mapping_unmap_writable(file->f_mapping);
+ file = vma->vm_file;
+ ksm_add_vma(vma);
+expanded:
+ perf_event_mmap(vma);
+
+ vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
+ if (vm_flags & VM_LOCKED) {
+ if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
+ is_vm_hugetlb_page(vma) ||
+ vma == get_gate_vma(current->mm))
+ vm_flags_clear(vma, VM_LOCKED_MASK);
+ else
+ mm->locked_vm += (len >> PAGE_SHIFT);
+ }
+
+ if (file)
+ uprobe_mmap(vma);
+
+ /*
+ * New (or expanded) vma always get soft dirty status.
+ * Otherwise user-space soft-dirty page tracker won't
+ * be able to distinguish situation when vma area unmapped,
+ * then new mapped in-place (which must be aimed as
+ * a completely new data area).
+ */
+ vm_flags_set(vma, VM_SOFTDIRTY);
+
+ vma_set_page_prot(vma);
+
+ validate_mm(mm);
+ return addr;
+
+close_and_free_vma:
+ if (file && vma->vm_ops && vma->vm_ops->close)
+ vma->vm_ops->close(vma);
+
+ if (file || vma->vm_file) {
+unmap_and_free_vma:
+ fput(vma->vm_file);
+ vma->vm_file = NULL;
+
+ /* Undo any partial mapping done by a device driver. */
+ unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start,
+ vma->vm_end, true);
+ }
+ if (file && (vm_flags & VM_SHARED))
+ mapping_unmap_writable(file->f_mapping);
+free_vma:
+ vm_area_free(vma);
+unacct_error:
+ if (charged)
+ vm_unacct_memory(charged);
+ validate_mm(mm);
+ return error;
}
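The WARN_ON(addr != vma->vm_start) above encodes a rule for drivers: a ->mmap() handler must populate the vma it is given, not relocate it. A minimal well-behaved handler is sketched below; example_buf_pfn is made-up driver state, not a real symbol:

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	if (vma->vm_pgoff)		/* only offset 0 supported in this sketch */
		return -EINVAL;

	/* Map the buffer into exactly the range mmap_region() set up. */
	return remap_pfn_range(vma, vma->vm_start, example_buf_pfn,
			       size, vma->vm_page_prot);
}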
-static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
+static int __vm_munmap(unsigned long start, size_t len, bool unlock)
{
int ret;
struct mm_struct *mm = current->mm;
LIST_HEAD(uf);
+ VMA_ITERATOR(vmi, mm, start);
if (mmap_write_lock_killable(mm))
return -EINTR;
- ret = __do_munmap(mm, start, len, &uf, downgrade);
- /*
- * Returning 1 indicates mmap_lock is downgraded.
- * But 1 is not legal return value of vm_munmap() and munmap(), reset
- * it to 0 before return.
- */
- if (ret == 1) {
- mmap_read_unlock(mm);
- ret = 0;
- } else
+ ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock);
+ if (ret || !unlock)
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
@@ -2921,7 +2920,6 @@ EXPORT_SYMBOL(vm_munmap);
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
addr = untagged_addr(addr);
- profile_munmap(addr);
return __vm_munmap(addr, len, true);
}
@@ -2939,7 +2937,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
unsigned long ret = -EINVAL;
struct file *file;
- pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.rst.\n",
+ pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n",
current->comm, current->pid);
if (prot)
@@ -2957,20 +2955,18 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (mmap_write_lock_killable(mm))
return -EINTR;
- vma = find_vma(mm, start);
+ vma = vma_lookup(mm, start);
if (!vma || !(vma->vm_flags & VM_SHARED))
goto out;
- if (start < vma->vm_start)
- goto out;
-
if (start + size > vma->vm_end) {
- struct vm_area_struct *next;
+ VMA_ITERATOR(vmi, mm, vma->vm_end);
+ struct vm_area_struct *next, *prev = vma;
- for (next = vma->vm_next; next; next = next->vm_next) {
+ for_each_vma_range(vmi, next, start + size) {
/* hole between vmas ? */
- if (next->vm_start != next->vm_prev->vm_end)
+ if (next->vm_start != prev->vm_end)
goto out;
if (next->vm_file != vma->vm_file)
@@ -2981,6 +2977,8 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (start + size <= next->vm_end)
break;
+
+ prev = next;
}
if (!next)
@@ -2993,25 +2991,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
flags &= MAP_NONBLOCK;
flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
- if (vma->vm_flags & VM_LOCKED) {
- struct vm_area_struct *tmp;
+ if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
- /* drop PG_Mlocked flag for over-mapped range */
- for (tmp = vma; tmp->vm_start >= start + size;
- tmp = tmp->vm_next) {
- /*
- * Split pmd and munlock page on the border
- * of the range.
- */
- vma_adjust_trans_huge(tmp, start, start + size, 0);
-
- munlock_vma_pages_range(tmp,
- max(tmp->vm_start, start),
- min(tmp->vm_end, start + size));
- }
- }
-
file = get_file(vma->vm_file);
ret = do_mmap(vma->vm_file, start, size,
prot, flags, pgoff, &populate, NULL);
@@ -3026,42 +3008,54 @@ out:
}
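For userspace still relying on the deprecated syscall, the documented replacement is to build the rearranged view out of ordinary MAP_SHARED | MAP_FIXED mappings, one per window. A hedged userspace sketch (offsets and protections are arbitrary):

#include <sys/mman.h>

/* Re-point an already mapped window at a different file offset. */
static int remap_window(void *addr, size_t len, int fd, off_t new_offset)
{
	void *p = mmap(addr, len, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_FIXED, fd, new_offset);

	return p == MAP_FAILED ? -1 : 0;
}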
/*
- * this is really a simplified "do_mmap". it only handles
- * anonymous maps. eventually we may be able to do some
- * brk-specific accounting here.
+ * do_vma_munmap() - Unmap a full or partial vma.
+ * @vmi: The vma iterator pointing at the vma
+ * @vma: The first vma to be munmapped
+ * @start: the start of the address to unmap
+ * @end: The end of the address to unmap
+ * @uf: The userfaultfd list_head
+ * @unlock: Drop the lock on success
+ *
+ * Unmaps a VMA mapping when the vma iterator is already in position.
+ * Does not handle alignment.
+ *
+ * Return: 0 on success and drops the lock if so directed, error on failure and will
+ * still hold the lock.
*/
-static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf)
+int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, struct list_head *uf,
+ bool unlock)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev;
- struct rb_node **rb_link, *rb_parent;
- pgoff_t pgoff = addr >> PAGE_SHIFT;
- int error;
- unsigned long mapped_addr;
-
- /* Until we need other flags, refuse anything except VM_EXEC. */
- if ((flags & (~VM_EXEC)) != 0)
- return -EINVAL;
- flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+ struct mm_struct *mm = vma->vm_mm;
- mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
- if (IS_ERR_VALUE(mapped_addr))
- return mapped_addr;
+ arch_unmap(mm, start, end);
+ return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
+}
- error = mlock_future_check(mm, mm->def_flags, len);
- if (error)
- return error;
+/*
+ * do_brk_flags() - Increase the brk vma if the flags match.
+ * @vmi: The vma iterator
+ * @addr: The start address
+ * @len: The length of the increase
+ * @vma: The vma,
+ * @flags: The VMA Flags
+ *
+ * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
+ * do not match then create a new anonymous VMA. Eventually we may be able to
+ * do some brk-specific accounting here.
+ */
+static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long len, unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct vma_prepare vp;
+ validate_mm(mm);
/*
- * Clear old maps. this also does some error checking for us
+ * Check against address space limits by the changed size
+ * Note: This happens *after* clearing old mappings in some code paths.
*/
- while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
- &rb_parent)) {
- if (do_munmap(mm, addr, len, uf))
- return -ENOMEM;
- }
-
- /* Check against address space limits *after* clearing old maps... */
+ flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
return -ENOMEM;
@@ -3071,45 +3065,70 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
return -ENOMEM;
- /* Can we just expand an old private anonymous mapping? */
- vma = vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
- if (vma)
- goto out;
-
/*
- * create a vma struct for an anonymous mapping
+	 * Expand the existing vma if possible; note that singular lists do not
+ * occur after forking, so the expand will only happen on new VMAs.
*/
- vma = vm_area_alloc(mm);
- if (!vma) {
- vm_unacct_memory(len >> PAGE_SHIFT);
- return -ENOMEM;
+ if (vma && vma->vm_end == addr && !vma_policy(vma) &&
+ can_vma_merge_after(vma, flags, NULL, NULL,
+ addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) {
+ if (vma_iter_prealloc(vmi))
+ goto unacct_fail;
+
+ init_vma_prep(&vp, vma);
+ vma_prepare(&vp);
+ vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
+ vma->vm_end = addr + len;
+ vm_flags_set(vma, VM_SOFTDIRTY);
+ vma_iter_store(vmi, vma);
+
+ vma_complete(&vp, vmi, mm);
+ khugepaged_enter_vma(vma, flags);
+ goto out;
}
+ /* create a vma struct for an anonymous mapping */
+ vma = vm_area_alloc(mm);
+ if (!vma)
+ goto unacct_fail;
+
vma_set_anonymous(vma);
vma->vm_start = addr;
vma->vm_end = addr + len;
- vma->vm_pgoff = pgoff;
- vma->vm_flags = flags;
+ vma->vm_pgoff = addr >> PAGE_SHIFT;
+ vm_flags_init(vma, flags);
vma->vm_page_prot = vm_get_page_prot(flags);
- vma_link(mm, vma, prev, rb_link, rb_parent);
+ if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
+ goto mas_store_fail;
+
+ mm->map_count++;
+ ksm_add_vma(vma);
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
mm->data_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
- vma->vm_flags |= VM_SOFTDIRTY;
+ vm_flags_set(vma, VM_SOFTDIRTY);
+ validate_mm(mm);
return 0;
+
+mas_store_fail:
+ vm_area_free(vma);
+unacct_fail:
+ vm_unacct_memory(len >> PAGE_SHIFT);
+ return -ENOMEM;
}
int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{
struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = NULL;
unsigned long len;
int ret;
bool populate;
LIST_HEAD(uf);
+ VMA_ITERATOR(vmi, mm, addr);
len = PAGE_ALIGN(request);
if (len < request)
@@ -3120,13 +3139,31 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
if (mmap_write_lock_killable(mm))
return -EINTR;
- ret = do_brk_flags(addr, len, flags, &uf);
+ /* Until we need other flags, refuse anything except VM_EXEC. */
+ if ((flags & (~VM_EXEC)) != 0)
+ return -EINVAL;
+
+ ret = check_brk_limits(addr, len);
+ if (ret)
+ goto limits_failed;
+
+ ret = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0);
+ if (ret)
+ goto munmap_failed;
+
+ vma = vma_prev(&vmi);
+ ret = do_brk_flags(&vmi, vma, addr, len, flags);
populate = ((mm->def_flags & VM_LOCKED) != 0);
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate && !ret)
mm_populate(addr, len);
return ret;
+
+munmap_failed:
+limits_failed:
+ mmap_write_unlock(mm);
+ return ret;
}
EXPORT_SYMBOL(vm_brk_flags);
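Callers such as the binfmt loaders use vm_brk_flags() to fault in an anonymous, zero-filled region at a fixed address (for example trailing bss). A hedged caller sketch, consistent with the "refuse anything except VM_EXEC" rule above:

static int map_zero_region(unsigned long addr, unsigned long size,
			   bool executable)
{
	if (!size)
		return 0;

	return vm_brk_flags(addr, size, executable ? VM_EXEC : 0);
}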
@@ -3142,68 +3179,59 @@ void exit_mmap(struct mm_struct *mm)
struct mmu_gather tlb;
struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
+ int count = 0;
/* mm's last user has gone, and it's about to be pulled down */
mmu_notifier_release(mm);
- if (unlikely(mm_is_oom_victim(mm))) {
- /*
- * Manually reap the mm to free as much memory as possible.
- * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
- * this mm from further consideration. Taking mm->mmap_lock for
- * write after setting MMF_OOM_SKIP will guarantee that the oom
- * reaper will not run on this mm again after mmap_lock is
- * dropped.
- *
- * Nothing can be holding mm->mmap_lock here and the above call
- * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
- * __oom_reap_task_mm() will not block.
- *
- * This needs to be done before calling munlock_vma_pages_all(),
- * which clears VM_LOCKED, otherwise the oom reaper cannot
- * reliably test it.
- */
- (void)__oom_reap_task_mm(mm);
-
- set_bit(MMF_OOM_SKIP, &mm->flags);
- mmap_write_lock(mm);
- mmap_write_unlock(mm);
- }
-
- if (mm->locked_vm) {
- vma = mm->mmap;
- while (vma) {
- if (vma->vm_flags & VM_LOCKED)
- munlock_vma_pages_all(vma);
- vma = vma->vm_next;
- }
- }
-
+ mmap_read_lock(mm);
arch_exit_mmap(mm);
- vma = mm->mmap;
- if (!vma) /* Can happen if dup_mmap() received an OOM */
+ vma = mas_find(&mas, ULONG_MAX);
+ if (!vma) {
+ /* Can happen if dup_mmap() received an OOM */
+ mmap_read_unlock(mm);
return;
+ }
lru_add_drain();
flush_cache_mm(mm);
- tlb_gather_mmu(&tlb, mm, 0, -1);
+ tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
- /* Use -1 here to ensure all VMAs in the mm are unmapped */
- unmap_vmas(&tlb, vma, 0, -1);
- free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
- tlb_finish_mmu(&tlb, 0, -1);
+ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
+ unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX, false);
+ mmap_read_unlock(mm);
+
+ /*
+ * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper
+ * because the memory has been already freed.
+ */
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ mmap_write_lock(mm);
+ mt_clear_in_rcu(&mm->mm_mt);
+ free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
+ USER_PGTABLES_CEILING, true);
+ tlb_finish_mmu(&tlb);
/*
- * Walk the list again, actually closing and freeing it,
- * with preemption enabled, without holding any MM locks.
+ * Walk the list again, actually closing and freeing it, with preemption
+ * enabled, without holding any MM locks besides the unreachable
+ * mmap_write_lock.
*/
- while (vma) {
+ do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
- vma = remove_vma(vma);
+ remove_vma(vma, true);
+ count++;
cond_resched();
- }
+ } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
+
+ BUG_ON(count != mm->map_count);
+
+ trace_exit_mmap(mm);
+ __mt_destroy(&mm->mm_mt);
+ mmap_write_unlock(mm);
vm_unacct_memory(nr_accounted);
}
@@ -3213,14 +3241,14 @@ void exit_mmap(struct mm_struct *mm)
*/
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct *prev;
- struct rb_node **rb_link, *rb_parent;
+ unsigned long charged = vma_pages(vma);
+
- if (find_vma_links(mm, vma->vm_start, vma->vm_end,
- &prev, &rb_link, &rb_parent))
+ if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
return -ENOMEM;
+
if ((vma->vm_flags & VM_ACCOUNT) &&
- security_vm_enough_memory_mm(mm, vma_pages(vma)))
+ security_vm_enough_memory_mm(mm, charged))
return -ENOMEM;
/*
@@ -3240,7 +3268,11 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
}
- vma_link(mm, vma, prev, rb_link, rb_parent);
+ if (vma_link(mm, vma)) {
+ vm_unacct_memory(charged);
+ return -ENOMEM;
+ }
+
return 0;
}
@@ -3256,9 +3288,10 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
unsigned long vma_start = vma->vm_start;
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma, *prev;
- struct rb_node **rb_link, *rb_parent;
bool faulted_in_anon_vma = true;
+ VMA_ITERATOR(vmi, mm, addr);
+ validate_mm(mm);
/*
* If anonymous vma has not yet been faulted, update new pgoff
* to match new location, to increase its chance of merging.
@@ -3268,11 +3301,13 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
faulted_in_anon_vma = false;
}
- if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
+ new_vma = find_vma_prev(mm, addr, &prev);
+ if (new_vma && new_vma->vm_start < addr + len)
return NULL; /* should never get here */
- new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
+
+ new_vma = vma_merge(&vmi, mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (new_vma) {
/*
* Source vma may have been merged into new_vma
@@ -3310,16 +3345,28 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
- vma_link(mm, new_vma, prev, rb_link, rb_parent);
+ vma_start_write(new_vma);
+ if (vma_link(mm, new_vma))
+ goto out_vma_link;
*need_rmap_locks = false;
}
+ validate_mm(mm);
return new_vma;
+out_vma_link:
+ if (new_vma->vm_ops && new_vma->vm_ops->close)
+ new_vma->vm_ops->close(new_vma);
+
+ if (new_vma->vm_file)
+ fput(new_vma->vm_file);
+
+ unlink_anon_vmas(new_vma);
out_free_mempol:
mpol_put(vma_policy(new_vma));
out_free_vma:
vm_area_free(new_vma);
out:
+ validate_mm(mm);
return NULL;
}
@@ -3354,7 +3401,7 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
{
- mm->total_vm += npages;
+	WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) + npages);
if (is_exec_mapping(flags))
mm->exec_vm += npages;
@@ -3391,6 +3438,17 @@ static int special_mapping_mremap(struct vm_area_struct *new_vma)
return 0;
}
+static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr)
+{
+ /*
+ * Forbid splitting special mappings - kernel has expectations over
+ * the number of pages in mapping. Together with VM_DONTEXPAND
+ * the size of vma should stay the same over the special mapping's
+ * lifetime.
+ */
+ return -EINVAL;
+}
+
static const struct vm_operations_struct special_mapping_vmops = {
.close = special_mapping_close,
.fault = special_mapping_fault,
@@ -3398,6 +3456,7 @@ static const struct vm_operations_struct special_mapping_vmops = {
.name = special_mapping_name,
/* vDSO code relies that VVAR can't be accessed remotely */
.access = NULL,
+ .may_split = special_mapping_split,
};
static const struct vm_operations_struct legacy_special_mapping_vmops = {
@@ -3444,6 +3503,7 @@ static struct vm_area_struct *__install_special_mapping(
int ret;
struct vm_area_struct *vma;
+ validate_mm(mm);
vma = vm_area_alloc(mm);
if (unlikely(vma == NULL))
return ERR_PTR(-ENOMEM);
@@ -3451,7 +3511,8 @@ static struct vm_area_struct *__install_special_mapping(
vma->vm_start = addr;
vma->vm_end = addr + len;
- vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
+ vm_flags_init(vma, (vm_flags | mm->def_flags |
+ VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK);
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
vma->vm_ops = ops;
@@ -3465,10 +3526,12 @@ static struct vm_area_struct *__install_special_mapping(
perf_event_mmap(vma);
+ validate_mm(mm);
return vma;
out:
vm_area_free(vma);
+ validate_mm(mm);
return ERR_PTR(ret);
}
@@ -3577,6 +3640,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
* of mm/rmap.c:
* - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
* hugetlb mapping);
+ * - all vmas marked locked;
 * - all i_mmap_rwsem locks;
 * - all anon_vma->rwsem
*
@@ -3593,12 +3657,20 @@ int mm_take_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
- BUG_ON(mmap_read_trylock(mm));
+ mmap_assert_write_locked(mm);
mutex_lock(&mm_all_locks_mutex);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
+ if (signal_pending(current))
+ goto out_unlock;
+ vma_start_write(vma);
+ }
+
+ mas_set(&mas, 0);
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (signal_pending(current))
goto out_unlock;
if (vma->vm_file && vma->vm_file->f_mapping &&
@@ -3606,7 +3678,8 @@ int mm_take_all_locks(struct mm_struct *mm)
vm_lock_mapping(mm, vma->vm_file->f_mapping);
}
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_set(&mas, 0);
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (signal_pending(current))
goto out_unlock;
if (vma->vm_file && vma->vm_file->f_mapping &&
@@ -3614,7 +3687,8 @@ int mm_take_all_locks(struct mm_struct *mm)
vm_lock_mapping(mm, vma->vm_file->f_mapping);
}
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_set(&mas, 0);
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (signal_pending(current))
goto out_unlock;
if (vma->anon_vma)
@@ -3673,17 +3747,19 @@ void mm_drop_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
- BUG_ON(mmap_read_trylock(mm));
+ mmap_assert_write_locked(mm);
BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (vma->anon_vma)
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
vm_unlock_anon_vma(avc->anon_vma);
if (vma->vm_file && vma->vm_file->f_mapping)
vm_unlock_mapping(vma->vm_file->f_mapping);
}
+ vma_end_write_all(mm);
mutex_unlock(&mm_all_locks_mutex);
}
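A caller sketch for the pair above (this mirrors how mmu-notifier-registration style code is expected to use it; simplified, with the critical section elided):

static int with_all_mm_locks(struct mm_struct *mm)
{
	int ret;

	mmap_write_lock(mm);
	ret = mm_take_all_locks(mm);	/* -EINTR if a signal is pending */
	if (ret)
		goto out;

	/* ... publish state that rmap walks and page faults must not race with ... */

	mm_drop_all_locks(mm);
out:
	mmap_write_unlock(mm);
	return ret;
}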
@@ -3798,13 +3874,9 @@ static int reserve_mem_notifier(struct notifier_block *nb,
return NOTIFY_OK;
}
-static struct notifier_block reserve_mem_nb = {
- .notifier_call = reserve_mem_notifier,
-};
-
static int __meminit init_reserve_notifier(void)
{
- if (register_hotmemory_notifier(&reserve_mem_nb))
+ if (hotplug_memory_notifier(reserve_mem_notifier, DEFAULT_CALLBACK_PRI))
pr_err("Failed registering memory add/remove notifier for admin reserve\n");
return 0;
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
new file mode 100644
index 000000000000..1854850b4b89
--- /dev/null
+++ b/mm/mmap_lock.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0
+#define CREATE_TRACE_POINTS
+#include <trace/events/mmap_lock.h>
+
+#include <linux/mm.h>
+#include <linux/cgroup.h>
+#include <linux/memcontrol.h>
+#include <linux/mmap_lock.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/smp.h>
+#include <linux/trace_events.h>
+#include <linux/local_lock.h>
+
+EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
+EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
+EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
+
+#ifdef CONFIG_MEMCG
+
+/*
+ * Our various events all share the same buffer (because we don't want or need
+ * to allocate a set of buffers *per event type*), so we need to protect against
+ * concurrent _reg() and _unreg() calls, and count how many _reg() calls have
+ * been made.
+ */
+static DEFINE_MUTEX(reg_lock);
+static int reg_refcount; /* Protected by reg_lock. */
+
+/*
+ * Size of the buffer for memcg path names. Ignoring stack trace support,
+ * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it.
+ */
+#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL
+
+/*
+ * How many contexts our trace events might be called in: normal, softirq, irq,
+ * and NMI.
+ */
+#define CONTEXT_COUNT 4
+
+struct memcg_path {
+ local_lock_t lock;
+ char __rcu *buf;
+ local_t buf_idx;
+};
+static DEFINE_PER_CPU(struct memcg_path, memcg_paths) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+ .buf_idx = LOCAL_INIT(0),
+};
+
+static char **tmp_bufs;
+
+/* Called with reg_lock held. */
+static void free_memcg_path_bufs(void)
+{
+ struct memcg_path *memcg_path;
+ int cpu;
+ char **old = tmp_bufs;
+
+ for_each_possible_cpu(cpu) {
+ memcg_path = per_cpu_ptr(&memcg_paths, cpu);
+ *(old++) = rcu_dereference_protected(memcg_path->buf,
+ lockdep_is_held(&reg_lock));
+ rcu_assign_pointer(memcg_path->buf, NULL);
+ }
+
+ /* Wait for inflight memcg_path_buf users to finish. */
+ synchronize_rcu();
+
+ old = tmp_bufs;
+ for_each_possible_cpu(cpu) {
+ kfree(*(old++));
+ }
+
+ kfree(tmp_bufs);
+ tmp_bufs = NULL;
+}
+
+int trace_mmap_lock_reg(void)
+{
+ int cpu;
+ char *new;
+
+ mutex_lock(&reg_lock);
+
+ /* If the refcount is going 0->1, proceed with allocating buffers. */
+ if (reg_refcount++)
+ goto out;
+
+ tmp_bufs = kmalloc_array(num_possible_cpus(), sizeof(*tmp_bufs),
+ GFP_KERNEL);
+ if (tmp_bufs == NULL)
+ goto out_fail;
+
+ for_each_possible_cpu(cpu) {
+ new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL);
+ if (new == NULL)
+ goto out_fail_free;
+ rcu_assign_pointer(per_cpu_ptr(&memcg_paths, cpu)->buf, new);
+ /* Don't need to wait for inflights, they'd have gotten NULL. */
+ }
+
+out:
+ mutex_unlock(&reg_lock);
+ return 0;
+
+out_fail_free:
+ free_memcg_path_bufs();
+out_fail:
+ /* Since we failed, undo the earlier ref increment. */
+ --reg_refcount;
+
+ mutex_unlock(&reg_lock);
+ return -ENOMEM;
+}
+
+void trace_mmap_lock_unreg(void)
+{
+ mutex_lock(&reg_lock);
+
+ /* If the refcount is going 1->0, proceed with freeing buffers. */
+ if (--reg_refcount)
+ goto out;
+
+ free_memcg_path_bufs();
+
+out:
+ mutex_unlock(&reg_lock);
+}
+
+static inline char *get_memcg_path_buf(void)
+{
+ struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths);
+ char *buf;
+ int idx;
+
+ rcu_read_lock();
+ buf = rcu_dereference(memcg_path->buf);
+ if (buf == NULL) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ idx = local_add_return(MEMCG_PATH_BUF_SIZE, &memcg_path->buf_idx) -
+ MEMCG_PATH_BUF_SIZE;
+ return &buf[idx];
+}
+
+static inline void put_memcg_path_buf(void)
+{
+ local_sub(MEMCG_PATH_BUF_SIZE, &this_cpu_ptr(&memcg_paths)->buf_idx);
+ rcu_read_unlock();
+}
+
+#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
+ do { \
+ const char *memcg_path; \
+ local_lock(&memcg_paths.lock); \
+ memcg_path = get_mm_memcg_path(mm); \
+ trace_mmap_lock_##type(mm, \
+ memcg_path != NULL ? memcg_path : "", \
+ ##__VA_ARGS__); \
+ if (likely(memcg_path != NULL)) \
+ put_memcg_path_buf(); \
+ local_unlock(&memcg_paths.lock); \
+ } while (0)
+
+#else /* !CONFIG_MEMCG */
+
+int trace_mmap_lock_reg(void)
+{
+ return 0;
+}
+
+void trace_mmap_lock_unreg(void)
+{
+}
+
+#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
+ trace_mmap_lock_##type(mm, "", ##__VA_ARGS__)
+
+#endif /* CONFIG_MEMCG */
+
+#ifdef CONFIG_TRACING
+#ifdef CONFIG_MEMCG
+/*
+ * Write the given mm_struct's memcg path to a percpu buffer, and return a
+ * pointer to it. If the path cannot be determined, or no buffer was available
+ * (because the trace event is being unregistered), NULL is returned.
+ *
+ * Note: buffers are allocated per-cpu to avoid locking, so preemption must be
+ * disabled by the caller before calling us, and re-enabled only after the
+ * caller is done with the pointer.
+ *
+ * The caller must call put_memcg_path_buf() once the buffer is no longer
+ * needed. This must be done while preemption is still disabled.
+ */
+static const char *get_mm_memcg_path(struct mm_struct *mm)
+{
+ char *buf = NULL;
+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+
+ if (memcg == NULL)
+ goto out;
+ if (unlikely(memcg->css.cgroup == NULL))
+ goto out_put;
+
+ buf = get_memcg_path_buf();
+ if (buf == NULL)
+ goto out_put;
+
+ cgroup_path(memcg->css.cgroup, buf, MEMCG_PATH_BUF_SIZE);
+
+out_put:
+ css_put(&memcg->css);
+out:
+ return buf;
+}
+
+#endif /* CONFIG_MEMCG */
+
+/*
+ * Trace calls must be in a separate file, as otherwise there's a circular
+ * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
+ */
+
+void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
+{
+ TRACE_MMAP_LOCK_EVENT(start_locking, mm, write);
+}
+EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
+
+void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
+ bool success)
+{
+ TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success);
+}
+EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
+
+void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
+{
+ TRACE_MMAP_LOCK_EVENT(released, mm, write);
+}
+EXPORT_SYMBOL(__mmap_lock_do_trace_released);
+#endif /* CONFIG_TRACING */
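
For readers of the new file above, this is roughly what TRACE_MMAP_LOCK_EVENT(start_locking, mm, write) expands to under CONFIG_MEMCG; an editor's sketch for illustration only, using the helpers defined in the hunk.

static void sketch_trace_start_locking(struct mm_struct *mm, bool write)
{
	const char *memcg_path;

	local_lock(&memcg_paths.lock);		/* pin this CPU's path buffer */
	memcg_path = get_mm_memcg_path(mm);	/* NULL if no buffer (unregistering) */
	trace_mmap_lock_start_locking(mm, memcg_path ? memcg_path : "", write);
	if (likely(memcg_path != NULL))
		put_memcg_path_buf();		/* release this context's slice */
	local_unlock(&memcg_paths.lock);
}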
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 03c33c93a582..ea9683e12936 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -3,10 +3,12 @@
#include <linux/kernel.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
+#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/swap.h>
+#include <linux/rmap.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
@@ -17,6 +19,10 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
{
struct mmu_gather_batch *batch;
+ /* Limit batching if we have delayed rmaps pending */
+ if (tlb->delayed_rmap && tlb->active != &tlb->local)
+ return false;
+
batch = tlb->active;
if (batch->next) {
tlb->active = batch->next;
@@ -26,7 +32,7 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
return false;
- batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
+ batch = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
if (!batch)
return false;
@@ -41,13 +47,59 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
return true;
}
+#ifdef CONFIG_SMP
+static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma)
+{
+ for (int i = 0; i < batch->nr; i++) {
+ struct encoded_page *enc = batch->encoded_pages[i];
+
+ if (encoded_page_flags(enc)) {
+ struct page *page = encoded_page_ptr(enc);
+ page_remove_rmap(page, vma, false);
+ }
+ }
+}
+
+/**
+ * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB
+ * @tlb: the current mmu_gather
+ *
+ * Note that because of how tlb_next_batch() above works, we will
+ * never start multiple new batches with pending delayed rmaps, so
+ * we only need to walk through the current active batch and the
+ * original local one.
+ */
+void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+ if (!tlb->delayed_rmap)
+ return;
+
+ tlb_flush_rmap_batch(&tlb->local, vma);
+ if (tlb->active != &tlb->local)
+ tlb_flush_rmap_batch(tlb->active, vma);
+ tlb->delayed_rmap = 0;
+}
+#endif
+
static void tlb_batch_pages_flush(struct mmu_gather *tlb)
{
struct mmu_gather_batch *batch;
for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
- free_pages_and_swap_cache(batch->pages, batch->nr);
- batch->nr = 0;
+ struct encoded_page **pages = batch->encoded_pages;
+
+ do {
+ /*
+ * limit free batch count when PAGE_SIZE > 4K
+ */
+ unsigned int nr = min(512U, batch->nr);
+
+ free_pages_and_swap_cache(pages, nr);
+ pages += nr;
+ batch->nr -= nr;
+
+ cond_resched();
+ } while (batch->nr);
}
tlb->active = &tlb->local;
}
@@ -63,7 +115,7 @@ static void tlb_batch_list_free(struct mmu_gather *tlb)
tlb->local.next = NULL;
}
-bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
+bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, int page_size)
{
struct mmu_gather_batch *batch;
@@ -78,13 +130,13 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
* Add the page and check if we are full. If so
* force a flush.
*/
- batch->pages[batch->nr++] = page;
+ batch->encoded_pages[batch->nr++] = page;
if (batch->nr == batch->max) {
if (!tlb_next_batch(tlb))
return true;
batch = tlb->active;
}
- VM_BUG_ON_PAGE(batch->nr > batch->max, page);
+ VM_BUG_ON_PAGE(batch->nr > batch->max, encoded_page_ptr(page));
return false;
}
@@ -139,7 +191,7 @@ static void tlb_remove_table_smp_sync(void *arg)
/* Simply deliver the interrupt */
}
-static void tlb_remove_table_sync_one(void)
+void tlb_remove_table_sync_one(void)
{
/*
* This isn't an RCU grace period and hence the page-tables cannot be
@@ -163,8 +215,6 @@ static void tlb_remove_table_free(struct mmu_table_batch *batch)
#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
-static void tlb_remove_table_sync_one(void) { }
-
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
__tlb_remove_table_free(batch);
@@ -249,25 +299,11 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
tlb_flush_mmu_free(tlb);
}
-/**
- * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
- * @tlb: the mmu_gather structure to initialize
- * @mm: the mm_struct of the target address space
- * @start: start of the region that will be removed from the page-table
- * @end: end of the region that will be removed from the page-table
- *
- * Called to initialize an (on-stack) mmu_gather structure for page-table
- * tear-down from @mm. The @start and @end are set to 0 and -1
- * respectively when @mm is without users and we're going to destroy
- * the full address space (exit/execve).
- */
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
- unsigned long start, unsigned long end)
+static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ bool fullmm)
{
tlb->mm = mm;
-
- /* Is it from 0 to ~0? */
- tlb->fullmm = !(start | (end+1));
+ tlb->fullmm = fullmm;
#ifndef CONFIG_MMU_GATHER_NO_GATHER
tlb->need_flush_all = 0;
@@ -277,6 +313,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
tlb->active = &tlb->local;
tlb->batch_count = 0;
#endif
+ tlb->delayed_rmap = 0;
tlb_table_init(tlb);
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
@@ -288,16 +325,42 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
}
/**
+ * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
+ * @tlb: the mmu_gather structure to initialize
+ * @mm: the mm_struct of the target address space
+ *
+ * Called to initialize an (on-stack) mmu_gather structure for page-table
+ * tear-down from @mm.
+ */
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
+{
+ __tlb_gather_mmu(tlb, mm, false);
+}
+
+/**
+ * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
+ * @tlb: the mmu_gather structure to initialize
+ * @mm: the mm_struct of the target address space
+ *
+ * In this case, @mm is without users and we're going to destroy the
+ * full address space (exit/execve).
+ *
+ * Called to initialize an (on-stack) mmu_gather structure for page-table
+ * tear-down from @mm.
+ */
+void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
+{
+ __tlb_gather_mmu(tlb, mm, true);
+}
+
+/**
* tlb_finish_mmu - finish an mmu_gather structure
* @tlb: the mmu_gather structure to finish
- * @start: start of the region that will be removed from the page-table
- * @end: end of the region that will be removed from the page-table
*
* Called at the end of the shootdown operation to free up any resources that
* were required.
*/
-void tlb_finish_mmu(struct mmu_gather *tlb,
- unsigned long start, unsigned long end)
+void tlb_finish_mmu(struct mmu_gather *tlb)
{
/*
 * If there are parallel threads doing PTE changes on the same range
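
The kernel-doc updates above drop the start/end arguments from tlb_gather_mmu() and tlb_finish_mmu() and add a dedicated _fullmm variant; a hedged sketch of the new calling convention follows (the unmap_page_range() body is used purely for illustration).

static void sketch_zap_range(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, vma->vm_mm);	/* was tlb_gather_mmu(&tlb, mm, start, end) */
	unmap_page_range(&tlb, vma, start, end, NULL);
	tlb_finish_mmu(&tlb);			/* was tlb_finish_mmu(&tlb, start, end) */
}
/* exit/execve-style full teardown would use tlb_gather_mmu_fullmm(&tlb, mm) instead. */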
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 4fc918163dd3..50c0dde1354f 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -501,10 +501,33 @@ static int mn_hlist_invalidate_range_start(
"");
WARN_ON(mmu_notifier_range_blockable(range) ||
_ret != -EAGAIN);
+ /*
+ * We call all the notifiers on any EAGAIN,
+ * there is no way for a notifier to know if
+ * its start method failed, thus a start that
+ * does EAGAIN can't also do end.
+ */
+ WARN_ON(ops->invalidate_range_end);
ret = _ret;
}
}
}
+
+ if (ret) {
+ /*
+ * Must be non-blocking to get here. If there are multiple
+ * notifiers and one or more failed start, any that succeeded
+ * start are expecting their end to be called. Do so now.
+ */
+ hlist_for_each_entry_rcu(subscription, &subscriptions->list,
+ hlist, srcu_read_lock_held(&srcu)) {
+ if (!subscription->ops->invalidate_range_end)
+ continue;
+
+ subscription->ops->invalidate_range_end(subscription,
+ range);
+ }
+ }
srcu_read_unlock(&srcu, id);
return ret;
@@ -612,13 +635,6 @@ int __mmu_notifier_register(struct mmu_notifier *subscription,
mmap_assert_write_locked(mm);
BUG_ON(atomic_read(&mm->mm_users) <= 0);
- if (IS_ENABLED(CONFIG_LOCKDEP)) {
- fs_reclaim_acquire(GFP_KERNEL);
- lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
- lock_map_release(&__mmu_notifier_invalidate_range_start_map);
- fs_reclaim_release(GFP_KERNEL);
- }
-
if (!mm->notifier_subscriptions) {
/*
* kmalloc cannot be called under mm_take_all_locks(), but we
@@ -913,7 +929,7 @@ static int __mmu_interval_notifier_insert(
return -EOVERFLOW;
/* Must call with a mmget() held */
- if (WARN_ON(atomic_read(&mm->mm_count) <= 0))
+ if (WARN_ON(atomic_read(&mm->mm_users) <= 0))
return -EINVAL;
/* pairs with mmdrop in mmu_interval_notifier_remove() */
@@ -1020,6 +1036,18 @@ int mmu_interval_notifier_insert_locked(
}
EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);
+static bool
+mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions,
+ unsigned long seq)
+{
+ bool ret;
+
+ spin_lock(&subscriptions->lock);
+ ret = subscriptions->invalidate_seq != seq;
+ spin_unlock(&subscriptions->lock);
+ return ret;
+}
+
/**
* mmu_interval_notifier_remove - Remove a interval notifier
* @interval_sub: Interval subscription to unregister
@@ -1067,7 +1095,7 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub)
lock_map_release(&__mmu_notifier_invalidate_range_start_map);
if (seq)
wait_event(subscriptions->wq,
- READ_ONCE(subscriptions->invalidate_seq) != seq);
+ mmu_interval_seq_released(subscriptions, seq));
/* pairs with mmgrab in mmu_interval_notifier_insert() */
mmdrop(mm);
@@ -1092,13 +1120,3 @@ void mmu_notifier_synchronize(void)
synchronize_srcu(&srcu);
}
EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
-
-bool
-mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range)
-{
- if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA)
- return false;
- /* Return true if the vma still have the read flag set. */
- return range->vma->vm_flags & VM_READ;
-}
-EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only);
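
To make the new WARN_ON(ops->invalidate_range_end) above concrete, here is an illustrative ops table (an assumption for reading, not from the patch): a notifier whose invalidate_range_start() can fail with -EAGAIN on non-blockable ranges must not also supply invalidate_range_end(), because the core cannot tell which start calls succeeded.

static int demo_invalidate_start(struct mmu_notifier *subscription,
				 const struct mmu_notifier_range *range)
{
	if (!mmu_notifier_range_blockable(range))
		return -EAGAIN;		/* caller retries with a blockable range */
	/* ... blockable invalidation work ... */
	return 0;
}

static const struct mmu_notifier_ops demo_ops = {
	.invalidate_range_start	= demo_invalidate_start,
	/* deliberately no .invalidate_range_end */
};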
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 4686fdc23bb9..68e1511be12d 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -72,28 +72,24 @@ struct zoneref *__next_zones_zonelist(struct zoneref *z,
return z;
}
-#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
-bool memmap_valid_within(unsigned long pfn,
- struct page *page, struct zone *zone)
-{
- if (page_to_pfn(page) != pfn)
- return false;
-
- if (page_zone(page) != zone)
- return false;
-
- return true;
-}
-#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-
void lruvec_init(struct lruvec *lruvec)
{
enum lru_list lru;
memset(lruvec, 0, sizeof(struct lruvec));
+ spin_lock_init(&lruvec->lru_lock);
for_each_lru(lru)
INIT_LIST_HEAD(&lruvec->lists[lru]);
+ /*
+ * The "Unevictable LRU" is imaginary: though its size is maintained,
+ * it is never scanned, and unevictable pages are not threaded on it
+ * (so that their lru fields can be reused to hold mlock_count).
+ * Poison its list head, so that any operations on it would crash.
+ */
+ list_del(&lruvec->lists[LRU_UNEVICTABLE]);
+
+ lru_gen_init_lruvec(lruvec);
}
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
@@ -102,13 +98,14 @@ int page_cpupid_xchg_last(struct page *page, int cpupid)
unsigned long old_flags, flags;
int last_cpupid;
+ old_flags = READ_ONCE(page->flags);
do {
- old_flags = flags = page->flags;
- last_cpupid = page_cpupid_last(page);
+ flags = old_flags;
+ last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
- } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
+ } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
return last_cpupid;
}
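
The page_cpupid_xchg_last() hunk above switches from a cmpxchg() retry loop to try_cmpxchg(); the idiom in isolation (editor's sketch) is that on failure try_cmpxchg() writes the current value back into 'old', so the loop never has to re-read the location itself.

static void sketch_set_flag_bits(unsigned long *flags, unsigned long bits)
{
	unsigned long old = READ_ONCE(*flags);
	unsigned long new;

	do {
		new = old | bits;
	} while (unlikely(!try_cmpxchg(flags, &old, new)));
}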
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 56c02beb6041..3aef1340533a 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -29,40 +29,73 @@
#include <linux/uaccess.h>
#include <linux/mm_inline.h>
#include <linux/pgtable.h>
+#include <linux/sched/sysctl.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/memory-tiers.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
+#include <asm/tlb.h>
#include "internal.h"
-static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end, pgprot_t newprot,
- unsigned long cp_flags)
+bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
+{
+ struct page *page;
+
+ if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
+ return false;
+
+ /* Don't touch entries that are not even readable. */
+ if (pte_protnone(pte))
+ return false;
+
+ /* Do we need write faults for softdirty tracking? */
+ if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte))
+ return false;
+
+ /* Do we need write faults for uffd-wp tracking? */
+ if (userfaultfd_pte_wp(vma, pte))
+ return false;
+
+ if (!(vma->vm_flags & VM_SHARED)) {
+ /*
+ * Writable MAP_PRIVATE mapping: We can only special-case on
+ * exclusive anonymous pages, because we know that our
+ * write-fault handler similarly would map them writable without
+ * any additional checks while holding the PT lock.
+ */
+ page = vm_normal_page(vma, addr, pte);
+ return page && PageAnon(page) && PageAnonExclusive(page);
+ }
+
+ /*
+ * Writable MAP_SHARED mapping: "clean" might indicate that the FS still
+ * needs a real write-fault for writenotify
+ * (see vma_wants_writenotify()). If "dirty", the assumption is that the
+ * FS was already notified and we can simply mark the PTE writable
+ * just like the write-fault handler would do.
+ */
+ return pte_dirty(pte);
+}
+
+static long change_pte_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
+ unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
pte_t *pte, oldpte;
spinlock_t *ptl;
- unsigned long pages = 0;
+ long pages = 0;
int target_node = NUMA_NO_NODE;
- bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
- /*
- * Can be called with only the mmap_lock for reading by
- * prot_numa so we must check the pmd isn't constantly
- * changing from under us from pmd_none to pmd_trans_huge
- * and/or the other way around.
- */
- if (pmd_trans_unstable(pmd))
- return 0;
-
- /*
- * The pmd points to a regular pte so the pmd can't change
- * from under us even if the mmap_lock is only hold for
- * reading.
- */
+ tlb_change_page_size(tlb, PAGE_SIZE);
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!pte)
+ return -EAGAIN;
/* Get target node for single threaded private VMAs */
if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
@@ -72,10 +105,9 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
flush_tlb_batched_pending(vma->vm_mm);
arch_enter_lazy_mmu_mode();
do {
- oldpte = *pte;
+ oldpte = ptep_get(pte);
if (pte_present(oldpte)) {
pte_t ptent;
- bool preserve_write = prot_numa && pte_write(oldpte);
/*
* Avoid trapping faults against the zero or KSM
@@ -83,18 +115,20 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
*/
if (prot_numa) {
struct page *page;
+ int nid;
+ bool toptier;
/* Avoid TLB flush if possible */
if (pte_protnone(oldpte))
continue;
page = vm_normal_page(vma, addr, oldpte);
- if (!page || PageKsm(page))
+ if (!page || is_zone_device_page(page) || PageKsm(page))
continue;
/* Also skip shared copy-on-write pages */
if (is_cow_mapping(vma->vm_flags) &&
- page_mapcount(page) != 1)
+ page_count(page) != 1)
continue;
/*
@@ -109,60 +143,108 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
* Don't mess with PTEs if page is already on the node
* a single-threaded process is running on.
*/
- if (target_node == page_to_nid(page))
+ nid = page_to_nid(page);
+ if (target_node == nid)
+ continue;
+ toptier = node_is_toptier(nid);
+
+ /*
+ * Skip scanning top tier node if normal numa
+ * balancing is disabled
+ */
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
+ toptier)
continue;
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+ !toptier)
+ xchg_page_access_time(page,
+ jiffies_to_msecs(jiffies));
}
oldpte = ptep_modify_prot_start(vma, addr, pte);
ptent = pte_modify(oldpte, newprot);
- if (preserve_write)
- ptent = pte_mk_savedwrite(ptent);
- if (uffd_wp) {
- ptent = pte_wrprotect(ptent);
+ if (uffd_wp)
ptent = pte_mkuffd_wp(ptent);
- } else if (uffd_wp_resolve) {
- /*
- * Leave the write bit to be handled
- * by PF interrupt handler, then
- * things like COW could be properly
- * handled.
- */
+ else if (uffd_wp_resolve)
ptent = pte_clear_uffd_wp(ptent);
- }
- /* Avoid taking write faults for known dirty pages */
- if (dirty_accountable && pte_dirty(ptent) &&
- (pte_soft_dirty(ptent) ||
- !(vma->vm_flags & VM_SOFTDIRTY))) {
+ /*
+ * In some writable, shared mappings, we might want
+ * to catch actual write access -- see
+ * vma_wants_writenotify().
+ *
+ * In all writable, private mappings, we have to
+ * properly handle COW.
+ *
+ * In both cases, we can sometimes still change PTEs
+ * writable and avoid the write-fault handler, for
+ * example, if a PTE is already dirty and no other
+ * COW or special handling is required.
+ */
+ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
+ !pte_write(ptent) &&
+ can_change_pte_writable(vma, addr, ptent))
ptent = pte_mkwrite(ptent);
- }
+
ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
+ if (pte_needs_flush(oldpte, ptent))
+ tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
pages++;
} else if (is_swap_pte(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
pte_t newpte;
- if (is_write_migration_entry(entry)) {
+ if (is_writable_migration_entry(entry)) {
+ struct page *page = pfn_swap_entry_to_page(entry);
+
/*
* A protection check is difficult so
* just be safe and disable write
*/
- make_migration_entry_read(&entry);
+ if (PageAnon(page))
+ entry = make_readable_exclusive_migration_entry(
+ swp_offset(entry));
+ else
+ entry = make_readable_migration_entry(swp_offset(entry));
newpte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(oldpte))
newpte = pte_swp_mksoft_dirty(newpte);
- if (pte_swp_uffd_wp(oldpte))
- newpte = pte_swp_mkuffd_wp(newpte);
- } else if (is_write_device_private_entry(entry)) {
+ } else if (is_writable_device_private_entry(entry)) {
/*
* We do not preserve soft-dirtiness. See
* copy_one_pte() for explanation.
*/
- make_device_private_entry_read(&entry);
+ entry = make_readable_device_private_entry(
+ swp_offset(entry));
newpte = swp_entry_to_pte(entry);
if (pte_swp_uffd_wp(oldpte))
newpte = pte_swp_mkuffd_wp(newpte);
+ } else if (is_writable_device_exclusive_entry(entry)) {
+ entry = make_readable_device_exclusive_entry(
+ swp_offset(entry));
+ newpte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(oldpte))
+ newpte = pte_swp_mksoft_dirty(newpte);
+ if (pte_swp_uffd_wp(oldpte))
+ newpte = pte_swp_mkuffd_wp(newpte);
+ } else if (is_pte_marker_entry(entry)) {
+ /*
+ * Ignore swapin errors unconditionally,
+ * because any access should sigbus anyway.
+ */
+ if (is_swapin_error_entry(entry))
+ continue;
+ /*
+ * If this is uffd-wp pte marker and we'd like
+ * to unprotect it, drop it; the next page
+ * fault will trigger without uffd trapping.
+ */
+ if (uffd_wp_resolve) {
+ pte_clear(vma->vm_mm, addr, pte);
+ pages++;
+ }
+ continue;
} else {
newpte = oldpte;
}
@@ -176,6 +258,28 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
set_pte_at(vma->vm_mm, addr, pte, newpte);
pages++;
}
+ } else {
+ /* It must be a none pte, or what else could it be? */
+ WARN_ON_ONCE(!pte_none(oldpte));
+
+ /*
+ * Nobody plays with any none ptes besides
+ * userfaultfd when applying the protections.
+ */
+ if (likely(!uffd_wp))
+ continue;
+
+ if (userfaultfd_wp_use_markers(vma)) {
+ /*
+ * For file-backed mem, we need to be able to
+ * wr-protect a none pte, because even if the
+ * pte is none, the page/swap cache could
+ * exist. Do that by installing a marker.
+ */
+ set_pte_at(vma->vm_mm, addr, pte,
+ make_pte_marker(PTE_MARKER_UFFD_WP));
+ pages++;
+ }
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
@@ -185,37 +289,74 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
}
/*
- * Used when setting automatic NUMA hinting protection where it is
- * critical that a numa hinting PMD is not confused with a bad PMD.
+ * Return true if we want to split THPs into PTE mappings in change
+ * protection procedure, false otherwise.
*/
-static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
+static inline bool
+pgtable_split_needed(struct vm_area_struct *vma, unsigned long cp_flags)
{
- pmd_t pmdval = pmd_read_atomic(pmd);
-
- /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- barrier();
-#endif
+ /*
+ * pte markers only resides in pte level, if we need pte markers,
+ * we need to split. We cannot wr-protect shmem thp because file
+ * thp is handled differently when split by erasing the pmd so far.
+ */
+ return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma);
+}
- if (pmd_none(pmdval))
- return 1;
- if (pmd_trans_huge(pmdval))
- return 0;
- if (unlikely(pmd_bad(pmdval))) {
- pmd_clear_bad(pmd);
- return 1;
- }
+/*
+ * Return true if we want to populate pgtables in change protection
+ * procedure, false otherwise
+ */
+static inline bool
+pgtable_populate_needed(struct vm_area_struct *vma, unsigned long cp_flags)
+{
+ /* If not within ioctl(UFFDIO_WRITEPROTECT), then don't bother */
+ if (!(cp_flags & MM_CP_UFFD_WP))
+ return false;
- return 0;
+ /* Populate if the userfaultfd mode requires pte markers */
+ return userfaultfd_wp_use_markers(vma);
}
-static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
- pud_t *pud, unsigned long addr, unsigned long end,
- pgprot_t newprot, unsigned long cp_flags)
+/*
+ * Populate the pgtable underneath for whatever reason if requested.
+ * When {pte|pmd|...}_alloc() failed we treat it the same way as pgtable
+ * allocation failures during page faults by kicking OOM and returning
+ * error.
+ */
+#define change_pmd_prepare(vma, pmd, cp_flags) \
+ ({ \
+ long err = 0; \
+ if (unlikely(pgtable_populate_needed(vma, cp_flags))) { \
+ if (pte_alloc(vma->vm_mm, pmd)) \
+ err = -ENOMEM; \
+ } \
+ err; \
+ })
+
+/*
+ * This is the general pud/p4d/pgd version of change_pmd_prepare(). We need to
+ * have separate change_pmd_prepare() because pte_alloc() returns 0 on success,
+ * while {pmd|pud|p4d}_alloc() returns the valid pointer on success.
+ */
+#define change_prepare(vma, high, low, addr, cp_flags) \
+ ({ \
+ long err = 0; \
+ if (unlikely(pgtable_populate_needed(vma, cp_flags))) { \
+ low##_t *p = low##_alloc(vma->vm_mm, high, addr); \
+ if (p == NULL) \
+ err = -ENOMEM; \
+ } \
+ err; \
+ })
+
+static inline long change_pmd_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pud_t *pud, unsigned long addr,
+ unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
pmd_t *pmd;
unsigned long next;
- unsigned long pages = 0;
+ long pages = 0;
unsigned long nr_huge_updates = 0;
struct mmu_notifier_range range;
@@ -223,39 +364,48 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pmd = pmd_offset(pud, addr);
do {
- unsigned long this_pages;
-
+ long ret;
+ pmd_t _pmd;
+again:
next = pmd_addr_end(addr, end);
- /*
- * Automatic NUMA balancing walks the tables with mmap_lock
- * held for read. It's possible a parallel update to occur
- * between pmd_trans_huge() and a pmd_none_or_clear_bad()
- * check leading to a false positive and clearing.
- * Hence, it's necessary to atomically read the PMD value
- * for all the checks.
- */
- if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) &&
- pmd_none_or_clear_bad_unless_trans_huge(pmd))
+ ret = change_pmd_prepare(vma, pmd, cp_flags);
+ if (ret) {
+ pages = ret;
+ break;
+ }
+
+ if (pmd_none(*pmd))
goto next;
/* invoke the mmu notifier if the pmd is populated */
if (!range.start) {
mmu_notifier_range_init(&range,
MMU_NOTIFY_PROTECTION_VMA, 0,
- vma, vma->vm_mm, addr, end);
+ vma->vm_mm, addr, end);
mmu_notifier_invalidate_range_start(&range);
}
- if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
- if (next - addr != HPAGE_PMD_SIZE) {
+ _pmd = pmdp_get_lockless(pmd);
+ if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) {
+ if ((next - addr != HPAGE_PMD_SIZE) ||
+ pgtable_split_needed(vma, cp_flags)) {
__split_huge_pmd(vma, pmd, addr, false, NULL);
+ /*
+ * For file-backed, the pmd could have been
+ * cleared; make sure pmd populated if
+ * necessary, then fall-through to pte level.
+ */
+ ret = change_pmd_prepare(vma, pmd, cp_flags);
+ if (ret) {
+ pages = ret;
+ break;
+ }
} else {
- int nr_ptes = change_huge_pmd(vma, pmd, addr,
- newprot, cp_flags);
-
- if (nr_ptes) {
- if (nr_ptes == HPAGE_PMD_NR) {
+ ret = change_huge_pmd(tlb, vma, pmd,
+ addr, newprot, cp_flags);
+ if (ret) {
+ if (ret == HPAGE_PMD_NR) {
pages += HPAGE_PMD_NR;
nr_huge_updates++;
}
@@ -266,9 +416,12 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
}
/* fall through, the trans huge pmd just split */
}
- this_pages = change_pte_range(vma, pmd, addr, next, newprot,
- cp_flags);
- pages += this_pages;
+
+ ret = change_pte_range(tlb, vma, pmd, addr, next, newprot,
+ cp_flags);
+ if (ret < 0)
+ goto again;
+ pages += ret;
next:
cond_resched();
} while (pmd++, addr = next, addr != end);
@@ -281,88 +434,108 @@ next:
return pages;
}
-static inline unsigned long change_pud_range(struct vm_area_struct *vma,
- p4d_t *p4d, unsigned long addr, unsigned long end,
- pgprot_t newprot, unsigned long cp_flags)
+static inline long change_pud_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr,
+ unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
pud_t *pud;
unsigned long next;
- unsigned long pages = 0;
+ long pages = 0, ret;
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
+ ret = change_prepare(vma, pud, pmd, addr, cp_flags);
+ if (ret)
+ return ret;
if (pud_none_or_clear_bad(pud))
continue;
- pages += change_pmd_range(vma, pud, addr, next, newprot,
+ pages += change_pmd_range(tlb, vma, pud, addr, next, newprot,
cp_flags);
} while (pud++, addr = next, addr != end);
return pages;
}
-static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
- pgd_t *pgd, unsigned long addr, unsigned long end,
- pgprot_t newprot, unsigned long cp_flags)
+static inline long change_p4d_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr,
+ unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
p4d_t *p4d;
unsigned long next;
- unsigned long pages = 0;
+ long pages = 0, ret;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
+ ret = change_prepare(vma, p4d, pud, addr, cp_flags);
+ if (ret)
+ return ret;
if (p4d_none_or_clear_bad(p4d))
continue;
- pages += change_pud_range(vma, p4d, addr, next, newprot,
+ pages += change_pud_range(tlb, vma, p4d, addr, next, newprot,
cp_flags);
} while (p4d++, addr = next, addr != end);
return pages;
}
-static unsigned long change_protection_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end, pgprot_t newprot,
- unsigned long cp_flags)
+static long change_protection_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr,
+ unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
unsigned long next;
- unsigned long start = addr;
- unsigned long pages = 0;
+ long pages = 0, ret;
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
- flush_cache_range(vma, addr, end);
- inc_tlb_flush_pending(mm);
+ tlb_start_vma(tlb, vma);
do {
next = pgd_addr_end(addr, end);
+ ret = change_prepare(vma, pgd, p4d, addr, cp_flags);
+ if (ret) {
+ pages = ret;
+ break;
+ }
if (pgd_none_or_clear_bad(pgd))
continue;
- pages += change_p4d_range(vma, pgd, addr, next, newprot,
+ pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot,
cp_flags);
} while (pgd++, addr = next, addr != end);
- /* Only flush the TLB if we actually modified any entries: */
- if (pages)
- flush_tlb_range(vma, start, end);
- dec_tlb_flush_pending(mm);
+ tlb_end_vma(tlb, vma);
return pages;
}
-unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, pgprot_t newprot,
- unsigned long cp_flags)
+long change_protection(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, unsigned long cp_flags)
{
- unsigned long pages;
+ pgprot_t newprot = vma->vm_page_prot;
+ long pages;
BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
+#ifdef CONFIG_NUMA_BALANCING
+ /*
+ * Ordinary protection updates (mprotect, uffd-wp, softdirty tracking)
+ * are expected to reflect their requirements via VMA flags such that
+ * vma_set_page_prot() will adjust vma->vm_page_prot accordingly.
+ */
+ if (cp_flags & MM_CP_PROT_NUMA)
+ newprot = PAGE_NONE;
+#else
+ WARN_ON_ONCE(cp_flags & MM_CP_PROT_NUMA);
+#endif
+
if (is_vm_hugetlb_page(vma))
- pages = hugetlb_change_protection(vma, start, end, newprot);
+ pages = hugetlb_change_protection(vma, start, end, newprot,
+ cp_flags);
else
- pages = change_protection_range(vma, start, end, newprot,
+ pages = change_protection_range(tlb, vma, start, end, newprot,
cp_flags);
return pages;
@@ -371,7 +544,8 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
- return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
+ return pfn_modify_allowed(pte_pfn(ptep_get(pte)),
+ *(pgprot_t *)(walk->private)) ?
0 : -EACCES;
}
@@ -379,7 +553,8 @@ static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long next,
struct mm_walk *walk)
{
- return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
+ return pfn_modify_allowed(pte_pfn(ptep_get(pte)),
+ *(pgprot_t *)(walk->private)) ?
0 : -EACCES;
}
@@ -393,19 +568,21 @@ static const struct mm_walk_ops prot_none_walk_ops = {
.pte_entry = prot_none_pte_entry,
.hugetlb_entry = prot_none_hugetlb_entry,
.test_walk = prot_none_test,
+ .walk_lock = PGWALK_WRLOCK,
};
int
-mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
- unsigned long start, unsigned long end, unsigned long newflags)
+mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
+ struct vm_area_struct *vma, struct vm_area_struct **pprev,
+ unsigned long start, unsigned long end, unsigned long newflags)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long oldflags = vma->vm_flags;
long nrpages = (end - start) >> PAGE_SHIFT;
+ unsigned int mm_cp_flags = 0;
unsigned long charged = 0;
pgoff_t pgoff;
int error;
- int dirty_accountable = 0;
if (newflags == oldflags) {
*pprev = vma;
@@ -452,9 +629,9 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
* First try to merge with previous and/or next vma.
*/
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- *pprev = vma_merge(mm, *pprev, start, end, newflags,
+ *pprev = vma_merge(vmi, mm, *pprev, start, end, newflags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (*pprev) {
vma = *pprev;
VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
@@ -464,13 +641,13 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
*pprev = vma;
if (start != vma->vm_start) {
- error = split_vma(mm, vma, start, 1);
+ error = split_vma(vmi, vma, start, 1);
if (error)
goto fail;
}
if (end != vma->vm_end) {
- error = split_vma(mm, vma, end, 0);
+ error = split_vma(vmi, vma, end, 0);
if (error)
goto fail;
}
@@ -480,12 +657,12 @@ success:
* vm_flags and vm_page_prot are protected by the mmap_lock
* held in write mode.
*/
- vma->vm_flags = newflags;
- dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
+ vm_flags_reset(vma, newflags);
+ if (vma_wants_manual_pte_write_upgrade(vma))
+ mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
vma_set_page_prot(vma);
- change_protection(vma, start, end, vma->vm_page_prot,
- dirty_accountable ? MM_CP_DIRTY_ACCT : 0);
+ change_protection(tlb, vma, start, end, mm_cp_flags);
/*
* Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
@@ -514,10 +691,12 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
{
unsigned long nstart, end, tmp, reqprot;
struct vm_area_struct *vma, *prev;
- int error = -EINVAL;
+ int error;
const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
(prot & PROT_READ);
+ struct mmu_gather tlb;
+ struct vma_iterator vmi;
start = untagged_addr(start);
@@ -549,11 +728,12 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
goto out;
- vma = find_vma(current->mm, start);
+ vma_iter_init(&vmi, current->mm, start);
+ vma = vma_find(&vmi, end);
error = -ENOMEM;
if (!vma)
goto out;
- prev = vma->vm_prev;
+
if (unlikely(grows & PROT_GROWSDOWN)) {
if (vma->vm_start >= end)
goto out;
@@ -571,15 +751,23 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
goto out;
}
}
+
+ prev = vma_prev(&vmi);
if (start > vma->vm_start)
prev = vma;
- for (nstart = start ; ; ) {
+ tlb_gather_mmu(&tlb, current->mm);
+ nstart = start;
+ tmp = vma->vm_start;
+ for_each_vma_range(vmi, vma, end) {
unsigned long mask_off_old_flags;
unsigned long newflags;
int new_vma_pkey;
- /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
+ if (vma->vm_start != tmp) {
+ error = -ENOMEM;
+ break;
+ }
/* Does the application expect PROT_READ to imply PROT_EXEC */
if (rier && (vma->vm_flags & VM_MAYEXEC))
@@ -590,8 +778,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
* If a permission is not passed to mprotect(), it must be
* cleared from the VMA.
*/
- mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
- VM_FLAGS_CLEAR;
+ mask_off_old_flags = VM_ACCESS_FLAGS | VM_FLAGS_CLEAR;
new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
newflags = calc_vm_prot_bits(prot, new_vma_pkey);
@@ -600,39 +787,47 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
/* newflags >> 4 shift VM_MAY% in place of VM_% */
if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) {
error = -EACCES;
- goto out;
+ break;
+ }
+
+ if (map_deny_write_exec(vma, newflags)) {
+ error = -EACCES;
+ break;
}
/* Allow architectures to sanity-check the new flags */
if (!arch_validate_flags(newflags)) {
error = -EINVAL;
- goto out;
+ break;
}
error = security_file_mprotect(vma, reqprot, prot);
if (error)
- goto out;
+ break;
tmp = vma->vm_end;
if (tmp > end)
tmp = end;
- error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
- if (error)
- goto out;
- nstart = tmp;
- if (nstart < prev->vm_end)
- nstart = prev->vm_end;
- if (nstart >= end)
- goto out;
-
- vma = prev->vm_next;
- if (!vma || vma->vm_start != nstart) {
- error = -ENOMEM;
- goto out;
+ if (vma->vm_ops && vma->vm_ops->mprotect) {
+ error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags);
+ if (error)
+ break;
}
+
+ error = mprotect_fixup(&vmi, &tlb, vma, &prev, nstart, tmp, newflags);
+ if (error)
+ break;
+
+ tmp = vma_iter_end(&vmi);
+ nstart = tmp;
prot = reqprot;
}
+ tlb_finish_mmu(&tlb);
+
+ if (!error && tmp < end)
+ error = -ENOMEM;
+
out:
mmap_write_unlock(current->mm);
return error;
@@ -691,7 +886,7 @@ SYSCALL_DEFINE1(pkey_free, int, pkey)
mmap_write_unlock(current->mm);
/*
- * We could provie warnings or errors if any VMA still
+ * We could provide warnings or errors if any VMA still
* has the pkey set here.
*/
return ret;
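
As a reading aid for the change_prepare() macro introduced above (illustration only): at the pud level, change_prepare(vma, p4d, pud, addr, cp_flags) boils down to roughly the following.

static long sketch_change_prepare_pud(struct vm_area_struct *vma, p4d_t *p4d,
				      unsigned long addr, unsigned long cp_flags)
{
	long err = 0;

	if (unlikely(pgtable_populate_needed(vma, cp_flags))) {
		pud_t *p = pud_alloc(vma->vm_mm, p4d, addr);

		if (p == NULL)
			err = -ENOMEM;	/* treated like an OOM during a page fault */
	}
	return err;
}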
diff --git a/mm/mremap.c b/mm/mremap.c
index 138abbae4f75..91f0173d396f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -9,6 +9,7 @@
*/
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
@@ -22,20 +23,20 @@
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
-#include <linux/mm-arch-hooks.h>
#include <linux/userfaultfd_k.h>
+#include <linux/mempolicy.h>
#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/pgalloc.h>
#include "internal.h"
-static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
+static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
- pmd_t *pmd;
pgd = pgd_offset(mm, addr);
if (pgd_none_or_clear_bad(pgd))
@@ -49,6 +50,18 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
if (pud_none_or_clear_bad(pud))
return NULL;
+ return pud;
+}
+
+static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pud = get_old_pud(mm, addr);
+ if (!pud)
+ return NULL;
+
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd))
return NULL;
@@ -56,19 +69,27 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
return pmd;
}
-static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr)
{
pgd_t *pgd;
p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
pgd = pgd_offset(mm, addr);
p4d = p4d_alloc(mm, pgd, addr);
if (!p4d)
return NULL;
- pud = pud_alloc(mm, p4d, addr);
+
+ return pud_alloc(mm, p4d, addr);
+}
+
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pud = alloc_new_pud(mm, vma, addr);
if (!pud)
return NULL;
@@ -112,7 +133,7 @@ static pte_t move_soft_dirty_pte(pte_t pte)
return pte;
}
-static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
+static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
unsigned long old_addr, unsigned long old_end,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
unsigned long new_addr, bool need_rmap_locks)
@@ -122,6 +143,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
spinlock_t *old_ptl, *new_ptl;
bool force_flush = false;
unsigned long len = old_end - old_addr;
+ int err = 0;
/*
* When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
@@ -149,8 +171,16 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
* pte locks because exclusive mmap_lock prevents deadlock.
*/
old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
- new_pte = pte_offset_map(new_pmd, new_addr);
- new_ptl = pte_lockptr(mm, new_pmd);
+ if (!old_pte) {
+ err = -EAGAIN;
+ goto out;
+ }
+ new_pte = pte_offset_map_nolock(mm, new_pmd, new_addr, &new_ptl);
+ if (!new_pte) {
+ pte_unmap_unlock(old_pte, old_ptl);
+ err = -EAGAIN;
+ goto out;
+ }
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
flush_tlb_batched_pending(vma->vm_mm);
@@ -158,7 +188,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
new_pte++, new_addr += PAGE_SIZE) {
- if (pte_none(*old_pte))
+ if (pte_none(ptep_get(old_pte)))
continue;
pte = ptep_get_and_clear(mm, old_addr, old_pte);
@@ -187,10 +217,21 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
spin_unlock(new_ptl);
pte_unmap(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
+out:
if (need_rmap_locks)
drop_rmap_locks(vma);
+ return err;
}
+#ifndef arch_supports_page_table_move
+#define arch_supports_page_table_move arch_supports_page_table_move
+static inline bool arch_supports_page_table_move(void)
+{
+ return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
+ IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
+}
+#endif
+
#ifdef CONFIG_HAVE_MOVE_PMD
static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
@@ -199,6 +240,8 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
struct mm_struct *mm = vma->vm_mm;
pmd_t pmd;
+ if (!arch_supports_page_table_move())
+ return false;
/*
* The destination pmd shouldn't be established, free_pgtables()
* should have released it.
@@ -240,8 +283,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
VM_BUG_ON(!pmd_none(*new_pmd));
- /* Set the new pmd */
- set_pmd_at(mm, new_addr, new_pmd, pmd);
+ pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
@@ -249,79 +291,288 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
return true;
}
+#else
+static inline bool move_normal_pmd(struct vm_area_struct *vma,
+ unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
+ pmd_t *new_pmd)
+{
+ return false;
+}
#endif
+#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
+static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
+ unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+{
+ spinlock_t *old_ptl, *new_ptl;
+ struct mm_struct *mm = vma->vm_mm;
+ pud_t pud;
+
+ if (!arch_supports_page_table_move())
+ return false;
+ /*
+ * The destination pud shouldn't be established, free_pgtables()
+ * should have released it.
+ */
+ if (WARN_ON_ONCE(!pud_none(*new_pud)))
+ return false;
+
+ /*
+ * We don't have to worry about the ordering of src and dst
+ * ptlocks because exclusive mmap_lock prevents deadlock.
+ */
+ old_ptl = pud_lock(vma->vm_mm, old_pud);
+ new_ptl = pud_lockptr(mm, new_pud);
+ if (new_ptl != old_ptl)
+ spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
+ /* Clear the pud */
+ pud = *old_pud;
+ pud_clear(old_pud);
+
+ VM_BUG_ON(!pud_none(*new_pud));
+
+ pud_populate(mm, new_pud, pud_pgtable(pud));
+ flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
+ spin_unlock(old_ptl);
+
+ return true;
+}
+#else
+static inline bool move_normal_pud(struct vm_area_struct *vma,
+ unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
+ pud_t *new_pud)
+{
+ return false;
+}
+#endif
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
+ unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+{
+ spinlock_t *old_ptl, *new_ptl;
+ struct mm_struct *mm = vma->vm_mm;
+ pud_t pud;
+
+ /*
+ * The destination pud shouldn't be established, free_pgtables()
+ * should have released it.
+ */
+ if (WARN_ON_ONCE(!pud_none(*new_pud)))
+ return false;
+
+ /*
+ * We don't have to worry about the ordering of src and dst
+ * ptlocks because exclusive mmap_lock prevents deadlock.
+ */
+ old_ptl = pud_lock(vma->vm_mm, old_pud);
+ new_ptl = pud_lockptr(mm, new_pud);
+ if (new_ptl != old_ptl)
+ spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
+ /* Clear the pud */
+ pud = *old_pud;
+ pud_clear(old_pud);
+
+ VM_BUG_ON(!pud_none(*new_pud));
+
+ /* Set the new pud */
+ /* mark soft_dirty when we add pud level soft dirty support */
+ set_pud_at(mm, new_addr, new_pud, pud);
+ flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
+ spin_unlock(old_ptl);
+
+ return true;
+}
+#else
+static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
+ unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+{
+ WARN_ON_ONCE(1);
+ return false;
+
+}
+#endif
+
+enum pgt_entry {
+ NORMAL_PMD,
+ HPAGE_PMD,
+ NORMAL_PUD,
+ HPAGE_PUD,
+};
+
+/*
+ * Returns an extent of the corresponding size for the pgt_entry specified if
+ * valid. Else returns a smaller extent bounded by the end of the source and
+ * destination pgt_entry.
+ */
+static __always_inline unsigned long get_extent(enum pgt_entry entry,
+ unsigned long old_addr, unsigned long old_end,
+ unsigned long new_addr)
+{
+ unsigned long next, extent, mask, size;
+
+ switch (entry) {
+ case HPAGE_PMD:
+ case NORMAL_PMD:
+ mask = PMD_MASK;
+ size = PMD_SIZE;
+ break;
+ case HPAGE_PUD:
+ case NORMAL_PUD:
+ mask = PUD_MASK;
+ size = PUD_SIZE;
+ break;
+ default:
+ BUILD_BUG();
+ break;
+ }
+
+ next = (old_addr + size) & mask;
+ /* even if next overflowed, extent below will be ok */
+ extent = next - old_addr;
+ if (extent > old_end - old_addr)
+ extent = old_end - old_addr;
+ next = (new_addr + size) & mask;
+ if (extent > next - new_addr)
+ extent = next - new_addr;
+ return extent;
+}
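/*
 * Editor's worked example for get_extent() above (not part of the patch):
 * with entry = NORMAL_PMD (PMD_SIZE = 2MiB), old_addr = 0x1ff000,
 * old_end = 0x600000 and new_addr = 0x3ff000:
 *   next   = (0x1ff000 + 0x200000) & PMD_MASK = 0x200000
 *   extent = 0x200000 - 0x1ff000             = 0x1000
 *   next   = (0x3ff000 + 0x200000) & PMD_MASK = 0x400000, bound = 0x1000
 * so only 4KiB is moved this iteration, after which both source and
 * destination are PMD-aligned and full PMD/PUD-sized steps become possible.
 */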
+
+/*
+ * Attempts to speedup the move by moving entry at the level corresponding to
+ * pgt_entry. Returns true if the move was successful, else false.
+ */
+static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
+ unsigned long old_addr, unsigned long new_addr,
+ void *old_entry, void *new_entry, bool need_rmap_locks)
+{
+ bool moved = false;
+
+ /* See comment in move_ptes() */
+ if (need_rmap_locks)
+ take_rmap_locks(vma);
+
+ switch (entry) {
+ case NORMAL_PMD:
+ moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
+ new_entry);
+ break;
+ case NORMAL_PUD:
+ moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
+ new_entry);
+ break;
+ case HPAGE_PMD:
+ moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ move_huge_pmd(vma, old_addr, new_addr, old_entry,
+ new_entry);
+ break;
+ case HPAGE_PUD:
+ moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ move_huge_pud(vma, old_addr, new_addr, old_entry,
+ new_entry);
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ if (need_rmap_locks)
+ drop_rmap_locks(vma);
+
+ return moved;
+}
+
unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len,
bool need_rmap_locks)
{
- unsigned long extent, next, old_end;
+ unsigned long extent, old_end;
struct mmu_notifier_range range;
pmd_t *old_pmd, *new_pmd;
+ pud_t *old_pud, *new_pud;
+
+ if (!len)
+ return 0;
old_end = old_addr + len;
- flush_cache_range(vma, old_addr, old_end);
- mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+ if (is_vm_hugetlb_page(vma))
+ return move_hugetlb_page_tables(vma, new_vma, old_addr,
+ new_addr, len);
+
+ flush_cache_range(vma, old_addr, old_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
old_addr, old_end);
mmu_notifier_invalidate_range_start(&range);
for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
cond_resched();
- next = (old_addr + PMD_SIZE) & PMD_MASK;
- /* even if next overflowed, extent below will be ok */
- extent = next - old_addr;
- if (extent > old_end - old_addr)
- extent = old_end - old_addr;
- next = (new_addr + PMD_SIZE) & PMD_MASK;
- if (extent > next - new_addr)
- extent = next - new_addr;
+ /*
+ * If extent is PUD-sized try to speed up the move by moving at the
+ * PUD level if possible.
+ */
+ extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
+
+ old_pud = get_old_pud(vma->vm_mm, old_addr);
+ if (!old_pud)
+ continue;
+ new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
+ if (!new_pud)
+ break;
+ if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
+ if (extent == HPAGE_PUD_SIZE) {
+ move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
+ old_pud, new_pud, need_rmap_locks);
+ /* We ignore and continue on error? */
+ continue;
+ }
+ } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
+
+ if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
+ old_pud, new_pud, true))
+ continue;
+ }
+
+ extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
old_pmd = get_old_pmd(vma->vm_mm, old_addr);
if (!old_pmd)
continue;
new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
if (!new_pmd)
break;
- if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || pmd_devmap(*old_pmd)) {
- if (extent == HPAGE_PMD_SIZE) {
- bool moved;
- /* See comment in move_ptes() */
- if (need_rmap_locks)
- take_rmap_locks(vma);
- moved = move_huge_pmd(vma, old_addr, new_addr,
- old_pmd, new_pmd);
- if (need_rmap_locks)
- drop_rmap_locks(vma);
- if (moved)
- continue;
- }
- split_huge_pmd(vma, old_pmd, old_addr);
- if (pmd_trans_unstable(old_pmd))
+again:
+ if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
+ pmd_devmap(*old_pmd)) {
+ if (extent == HPAGE_PMD_SIZE &&
+ move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
+ old_pmd, new_pmd, need_rmap_locks))
continue;
- } else if (extent == PMD_SIZE) {
-#ifdef CONFIG_HAVE_MOVE_PMD
+ split_huge_pmd(vma, old_pmd, old_addr);
+ } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
+ extent == PMD_SIZE) {
/*
* If the extent is PMD-sized, try to speed the move by
* moving at the PMD level if possible.
*/
- bool moved;
-
- if (need_rmap_locks)
- take_rmap_locks(vma);
- moved = move_normal_pmd(vma, old_addr, new_addr,
- old_pmd, new_pmd);
- if (need_rmap_locks)
- drop_rmap_locks(vma);
- if (moved)
+ if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
+ old_pmd, new_pmd, true))
continue;
-#endif
}
-
+ if (pmd_none(*old_pmd))
+ continue;
if (pte_alloc(new_vma->vm_mm, new_pmd))
break;
- move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
- new_pmd, new_addr, need_rmap_locks);
+ if (move_ptes(vma, old_pmd, old_addr, old_addr + extent,
+ new_vma, new_pmd, new_addr, need_rmap_locks) < 0)
+ goto again;
}
mmu_notifier_invalidate_range_end(&range);
@@ -335,16 +586,18 @@ static unsigned long move_vma(struct vm_area_struct *vma,
bool *locked, unsigned long flags,
struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
{
+ long to_account = new_len - old_len;
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
unsigned long vm_flags = vma->vm_flags;
unsigned long new_pgoff;
unsigned long moved_len;
- unsigned long excess = 0;
+ unsigned long account_start = 0;
+ unsigned long account_end = 0;
unsigned long hiwater_vm;
- int split = 0;
- int err;
+ int err = 0;
bool need_rmap_locks;
+ struct vma_iterator vmi;
/*
* We'd prefer to avoid failure later on in do_munmap:
@@ -353,6 +606,18 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (mm->map_count >= sysctl_max_map_count - 3)
return -ENOMEM;
+ if (unlikely(flags & MREMAP_DONTUNMAP))
+ to_account = new_len;
+
+ if (vma->vm_ops && vma->vm_ops->may_split) {
+ if (vma->vm_start != old_addr)
+ err = vma->vm_ops->may_split(vma, old_addr);
+ if (!err && vma->vm_end != old_addr + old_len)
+ err = vma->vm_ops->may_split(vma, old_addr + old_len);
+ if (err)
+ return err;
+ }
+
/*
* Advise KSM to break any KSM pages in the area to be moved:
* it would be confusing if they were to turn up at the new
@@ -365,11 +630,20 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (err)
return err;
+ if (vm_flags & VM_ACCOUNT) {
+ if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
+ return -ENOMEM;
+ }
+
+ vma_start_write(vma);
new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
&need_rmap_locks);
- if (!new_vma)
+ if (!new_vma) {
+ if (vm_flags & VM_ACCOUNT)
+ vm_unacct_memory(to_account >> PAGE_SHIFT);
return -ENOMEM;
+ }
moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
need_rmap_locks);
@@ -393,17 +667,19 @@ static unsigned long move_vma(struct vm_area_struct *vma,
new_addr = err;
} else {
mremap_userfaultfd_prep(new_vma, uf);
- arch_remap(mm, old_addr, old_addr + old_len,
- new_addr, new_addr + new_len);
+ }
+
+ if (is_vm_hugetlb_page(vma)) {
+ clear_vma_resv_huge_pages(vma);
}
/* Conceal VM_ACCOUNT so old reservation is not undone */
- if (vm_flags & VM_ACCOUNT) {
- vma->vm_flags &= ~VM_ACCOUNT;
- excess = vma->vm_end - vma->vm_start - old_len;
- if (old_addr > vma->vm_start &&
- old_addr + old_len < vma->vm_end)
- split = 1;
+ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
+ vm_flags_clear(vma, VM_ACCOUNT);
+ if (vma->vm_start < old_addr)
+ account_start = vma->vm_start;
+ if (vma->vm_end > old_addr + old_len)
+ account_end = vma->vm_end;
}
/*
@@ -420,66 +696,62 @@ static unsigned long move_vma(struct vm_area_struct *vma,
/* Tell pfnmap has moved from this vma */
if (unlikely(vma->vm_flags & VM_PFNMAP))
- untrack_pfn_moved(vma);
+ untrack_pfn_clear(vma);
if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
- if (vm_flags & VM_ACCOUNT) {
- /* Always put back VM_ACCOUNT since we won't unmap */
- vma->vm_flags |= VM_ACCOUNT;
-
- vm_acct_memory(new_len >> PAGE_SHIFT);
- }
+ /* We always clear VM_LOCKED[ONFAULT] on the old vma */
+ vm_flags_clear(vma, VM_LOCKED_MASK);
/*
- * VMAs can actually be merged back together in copy_vma
- * calling merge_vma. This can happen with anonymous vmas
- * which have not yet been faulted, so if we were to consider
- * this VMA split we'll end up adding VM_ACCOUNT on the
- * next VMA, which is completely unrelated if this VMA
- * was re-merged.
+ * anon_vma links of the old vma are no longer needed after its page
+ * table has been moved.
*/
- if (split && new_vma == vma)
- split = 0;
-
- /* We always clear VM_LOCKED[ONFAULT] on the old vma */
- vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ if (new_vma != vma && vma->vm_start == old_addr &&
+ vma->vm_end == (old_addr + old_len))
+ unlink_anon_vmas(vma);
/* Because we won't unmap we don't need to touch locked_vm */
- goto out;
+ return new_addr;
}
- if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
+ vma_iter_init(&vmi, mm, old_addr);
+ if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) {
/* OOM: unable to split vma, just get accounts right */
- vm_unacct_memory(excess >> PAGE_SHIFT);
- excess = 0;
+ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
+ vm_acct_memory(old_len >> PAGE_SHIFT);
+ account_start = account_end = 0;
}
if (vm_flags & VM_LOCKED) {
mm->locked_vm += new_len >> PAGE_SHIFT;
*locked = true;
}
-out:
+
mm->hiwater_vm = hiwater_vm;
/* Restore VM_ACCOUNT if one or two pieces of vma left */
- if (excess) {
- vma->vm_flags |= VM_ACCOUNT;
- if (split)
- vma->vm_next->vm_flags |= VM_ACCOUNT;
+ if (account_start) {
+ vma = vma_prev(&vmi);
+ vm_flags_set(vma, VM_ACCOUNT);
+ }
+
+ if (account_end) {
+ vma = vma_next(&vmi);
+ vm_flags_set(vma, VM_ACCOUNT);
}
return new_addr;
}
static struct vm_area_struct *vma_to_resize(unsigned long addr,
- unsigned long old_len, unsigned long new_len, unsigned long flags,
- unsigned long *p)
+ unsigned long old_len, unsigned long new_len, unsigned long flags)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma = find_vma(mm, addr);
+ struct vm_area_struct *vma;
unsigned long pgoff;
- if (!vma || vma->vm_start > addr)
+ vma = vma_lookup(mm, addr);
+ if (!vma)
return ERR_PTR(-EFAULT);
/*
@@ -495,11 +767,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
return ERR_PTR(-EINVAL);
}
- if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) ||
- vma->vm_flags & VM_SHARED))
- return ERR_PTR(-EINVAL);
-
- if (is_vm_hugetlb_page(vma))
+ if ((flags & MREMAP_DONTUNMAP) &&
+ (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
return ERR_PTR(-EINVAL);
/* We can't remap across vm area boundaries */
@@ -518,26 +787,13 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
return ERR_PTR(-EFAULT);
- if (vma->vm_flags & VM_LOCKED) {
- unsigned long locked, lock_limit;
- locked = mm->locked_vm << PAGE_SHIFT;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- locked += new_len - old_len;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return ERR_PTR(-EAGAIN);
- }
+ if (!mlock_future_ok(mm, vma->vm_flags, new_len - old_len))
+ return ERR_PTR(-EAGAIN);
if (!may_expand_vm(mm, vma->vm_flags,
(new_len - old_len) >> PAGE_SHIFT))
return ERR_PTR(-ENOMEM);
- if (vma->vm_flags & VM_ACCOUNT) {
- unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
- if (security_vm_enough_memory_mm(mm, charged))
- return ERR_PTR(-ENOMEM);
- *p = charged;
- }
-
return vma;
}
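The relaxed vma_to_resize() check above means MREMAP_DONTUNMAP is no longer limited to private anonymous mappings; only VM_DONTEXPAND/VM_PFNMAP mappings are refused. A hedged user-space sketch of the flag's semantics (requires MREMAP_MAYMOVE and new_len == old_len; the fallback define below is the uapi value, for older libc headers):

/* Hypothetical sketch, not part of this patch: the pages move to the
 * new address while the old range stays mapped and refaults as if
 * freshly allocated.
 */
#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>

#ifndef MREMAP_DONTUNMAP
#define MREMAP_DONTUNMAP 4	/* uapi value, assumed for older headers */
#endif

int main(void)
{
	size_t len = 2UL << 20;
	char *src = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (src == MAP_FAILED) { perror("mmap"); return 1; }
	src[0] = 42;

	/* Requires MREMAP_MAYMOVE and new_len == old_len. */
	char *dst = mremap(src, len, len, MREMAP_MAYMOVE | MREMAP_DONTUNMAP);
	if (dst == MAP_FAILED) { perror("mremap"); return 1; }

	/* The data moved; the old range stays mapped but refaults empty. */
	printf("dst[0]=%d src[0]=%d\n", dst[0], src[0]);	/* 42 and 0 */
	return 0;
}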
@@ -550,7 +806,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
- unsigned long charged = 0;
unsigned long map_flags = 0;
if (offset_in_page(new_addr))
@@ -572,7 +827,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 * So, to avoid such a scenario we can pre-compute whether the whole
 * operation has a high chance of succeeding map-wise.
* Worst-scenario case is when both vma's (new_addr and old_addr) get
- * split in 3 before unmaping it.
+ * split in 3 before unmapping it.
* That means 2 more maps (1 for each) to the ones we already hold.
* Check whether current map count plus 2 still leads us to 4 maps below
* the threshold, otherwise return -ENOMEM here to be more safe.
@@ -586,14 +841,14 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
goto out;
}
- if (old_len >= new_len) {
+ if (old_len > new_len) {
ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
- if (ret && old_len != new_len)
+ if (ret)
goto out;
old_len = new_len;
}
- vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
+ vma = vma_to_resize(addr, old_len, new_len, flags);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto out;
@@ -616,7 +871,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
((addr - vma->vm_start) >> PAGE_SHIFT),
map_flags);
if (IS_ERR_VALUE(ret))
- goto out1;
+ goto out;
/* We got a new mapping */
if (!(flags & MREMAP_FIXED))
@@ -625,12 +880,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
uf_unmap);
- if (!(offset_in_page(ret)))
- goto out;
-
-out1:
- vm_unacct_memory(charged);
-
out:
return ret;
}
@@ -638,9 +887,10 @@ out:
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
unsigned long end = vma->vm_end + delta;
+
if (end < vma->vm_end) /* overflow */
return 0;
- if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
+ if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
return 0;
if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
0, MAP_FIXED) & ~PAGE_MASK)
@@ -662,9 +912,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
- unsigned long charged = 0;
bool locked = false;
- bool downgraded = false;
struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
LIST_HEAD(uf_unmap_early);
LIST_HEAD(uf_unmap);
@@ -677,7 +925,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
* mapping address intact. A non-zero tag will cause the subsequent
* range checks to reject the address as invalid.
*
- * See Documentation/arm64/tagged-address-abi.rst for more information.
+ * See Documentation/arch/arm64/tagged-address-abi.rst for more
+ * information.
*/
addr = untagged_addr(addr);
@@ -712,6 +961,31 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (mmap_write_lock_killable(current->mm))
return -EINTR;
+ vma = vma_lookup(mm, addr);
+ if (!vma) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (is_vm_hugetlb_page(vma)) {
+ struct hstate *h __maybe_unused = hstate_vma(vma);
+
+ old_len = ALIGN(old_len, huge_page_size(h));
+ new_len = ALIGN(new_len, huge_page_size(h));
+
+ /* addrs must be huge page aligned */
+ if (addr & ~huge_page_mask(h))
+ goto out;
+ if (new_addr & ~huge_page_mask(h))
+ goto out;
+
+ /*
+ * Don't allow remap expansion, because the underlying hugetlb
+ * reservation is not yet capable of handling split reservations.
+ */
+ if (new_len > old_len)
+ goto out;
+ }
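With the alignment checks above, mremap() can now relocate hugetlb mappings, though only without growing them. A hypothetical user-space sketch (not part of this patch), assuming 2 MiB default huge pages are reserved (e.g. vm.nr_hugepages > 0):

/* Same-size move of a hugetlb mapping; both addresses must be
 * huge-page aligned, and new_len > old_len would be rejected. */
#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	size_t huge = 2UL << 20;	/* assumed default huge page size */
	size_t len = 2 * huge;

	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED) { perror("mmap(MAP_HUGETLB)"); return 1; }
	p[0] = 1;

	/* Reserve a huge-page-aligned destination, then move onto it. */
	char *res = mmap(NULL, len + huge, PROT_NONE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (res == MAP_FAILED) { perror("mmap"); return 1; }
	char *dst = (char *)(((unsigned long)res + huge - 1) & ~(huge - 1));

	char *q = mremap(p, len, len, MREMAP_MAYMOVE | MREMAP_FIXED, dst);
	if (q == MAP_FAILED) { perror("mremap"); return 1; }

	printf("hugetlb mapping moved %p -> %p, q[0]=%d\n",
	       (void *)p, (void *)q, q[0]);
	return 0;
}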
if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
ret = mremap_to(addr, old_len, new_addr, new_len,
@@ -723,28 +997,30 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/*
* Always allow a shrinking remap: that just unmaps
* the unnecessary pages..
- * __do_munmap does all the needed commit accounting, and
- * downgrades mmap_lock to read if so directed.
+ * do_vmi_munmap does all the needed commit accounting, and
+ * unlocks the mmap_lock if so directed.
*/
if (old_len >= new_len) {
- int retval;
+ VMA_ITERATOR(vmi, mm, addr + new_len);
+
+ if (old_len == new_len) {
+ ret = addr;
+ goto out;
+ }
- retval = __do_munmap(mm, addr+new_len, old_len - new_len,
- &uf_unmap, true);
- if (retval < 0 && old_len != new_len) {
- ret = retval;
+ ret = do_vmi_munmap(&vmi, mm, addr + new_len, old_len - new_len,
+ &uf_unmap, true);
+ if (ret)
goto out;
- /* Returning 1 indicates mmap_lock is downgraded to read. */
- } else if (retval == 1)
- downgraded = true;
+
ret = addr;
- goto out;
+ goto out_unlocked;
}
/*
* Ok, we need to grow..
*/
- vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
+ vma = vma_to_resize(addr, old_len, new_len, flags);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto out;
@@ -755,10 +1031,34 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (old_len == vma->vm_end - addr) {
/* can we just expand the current mapping? */
if (vma_expandable(vma, new_len - old_len)) {
- int pages = (new_len - old_len) >> PAGE_SHIFT;
+ long pages = (new_len - old_len) >> PAGE_SHIFT;
+ unsigned long extension_start = addr + old_len;
+ unsigned long extension_end = addr + new_len;
+ pgoff_t extension_pgoff = vma->vm_pgoff +
+ ((extension_start - vma->vm_start) >> PAGE_SHIFT);
+ VMA_ITERATOR(vmi, mm, extension_start);
+
+ if (vma->vm_flags & VM_ACCOUNT) {
+ if (security_vm_enough_memory_mm(mm, pages)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
- if (vma_adjust(vma, vma->vm_start, addr + new_len,
- vma->vm_pgoff, NULL)) {
+ /*
+ * vma_merge() is called on the extension we are adding to the
+ * already existing vma: it merges the extension with that vma
+ * (the expand operation itself) and possibly also with the next
+ * vma, if the next vma becomes adjacent to the expanded vma and
+ * is otherwise compatible.
+ */
+ vma = vma_merge(&vmi, mm, vma, extension_start,
+ extension_end, vma->vm_flags, vma->anon_vma,
+ vma->vm_file, extension_pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+ if (!vma) {
+ vm_unacct_memory(pages);
ret = -ENOMEM;
goto out;
}
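From user space, the vma_merge()-based expansion above is the plain "grow without MREMAP_MAYMOVE" case, which only succeeds when the address range immediately after the mapping is free. A small illustrative sketch (not part of this patch):

/* In-place growth succeeds only into a free hole right after the
 * mapping; otherwise mremap() fails with ENOMEM unless
 * MREMAP_MAYMOVE is given.
 */
#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 4 * pg, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) { perror("mmap"); return 1; }

	/* Keep two pages, free the two after them so growth can happen in place. */
	munmap(p + 2 * pg, 2 * pg);

	if (mremap(p, 2 * pg, 4 * pg, 0) != MAP_FAILED)
		printf("grew in place to four pages\n");
	else
		printf("in-place growth failed: %s\n", strerror(errno));
	return 0;
}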
@@ -797,16 +1097,12 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
&locked, flags, &uf, &uf_unmap);
}
out:
- if (offset_in_page(ret)) {
- vm_unacct_memory(charged);
+ if (offset_in_page(ret))
locked = false;
- }
- if (downgraded)
- mmap_read_unlock(current->mm);
- else
- mmap_write_unlock(current->mm);
+ mmap_write_unlock(current->mm);
if (locked && new_len > old_len)
mm_populate(new_addr + old_len, new_len - old_len);
+out_unlocked:
userfaultfd_unmap_complete(mm, &uf_unmap_early);
mremap_userfaultfd_complete(&uf, addr, ret, old_len);
userfaultfd_unmap_complete(mm, &uf_unmap);
diff --git a/mm/msync.c b/mm/msync.c
index 69c6d2029531..ac4c9bfea2e7 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -55,7 +55,9 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
goto out;
/*
* If the interval [start,end) covers some unmapped address ranges,
- * just ignore them, but return -ENOMEM at the end.
+ * just ignore them, but return -ENOMEM at the end. Besides, if the
+ * flag is MS_ASYNC (w/o MS_INVALIDATE) the result would be -ENOMEM
+ * anyway and there is nothing left to do, so return immediately.
*/
mmap_read_lock(mm);
vma = find_vma(mm, start);
@@ -69,6 +71,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
goto out_unlock;
/* Here start < vma->vm_end. */
if (start < vma->vm_start) {
+ if (flags == MS_ASYNC)
+ goto out_unlock;
start = vma->vm_start;
if (start >= end)
goto out_unlock;
@@ -100,7 +104,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
error = 0;
goto out_unlock;
}
- vma = vma->vm_next;
+ vma = find_vma(mm, vma->vm_end);
}
}
out_unlock:
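The MS_ASYNC early return is not user-visible: msync() over a range with an unmapped hole still fails with ENOMEM, the walk just stops at the first hole. A small user-space sketch for reference (not part of this patch):

#include <sys/mman.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 3 * pg, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) { perror("mmap"); return 1; }
	munmap(p + pg, pg);			/* punch a hole in the middle */

	if (msync(p, 3 * pg, MS_ASYNC) < 0)
		printf("msync: %s (expected ENOMEM over the hole)\n",
		       strerror(errno));
	return 0;
}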
diff --git a/mm/nommu.c b/mm/nommu.c
index 0df7ca321314..c072a660ec2c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -19,7 +19,6 @@
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
-#include <linux/vmacache.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/file.h>
@@ -27,7 +26,6 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
-#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/compiler.h>
#include <linux/mount.h>
@@ -38,6 +36,7 @@
#include <linux/printk.h>
#include <linux/uaccess.h>
+#include <linux/uio.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -175,7 +174,7 @@ static void *__vmalloc_user_flags(unsigned long size, gfp_t flags)
mmap_write_lock(current->mm);
vma = find_vma(current->mm, (unsigned long)ret);
if (vma)
- vma->vm_flags |= VM_USERMAP;
+ vm_flags_set(vma, VM_USERMAP);
mmap_write_unlock(current->mm);
}
@@ -200,24 +199,13 @@ unsigned long vmalloc_to_pfn(const void *addr)
}
EXPORT_SYMBOL(vmalloc_to_pfn);
-long vread(char *buf, char *addr, unsigned long count)
-{
- /* Don't allow overflow */
- if ((unsigned long) buf + count < count)
- count = -(unsigned long) buf;
-
- memcpy(buf, addr, count);
- return count;
-}
-
-long vwrite(char *buf, char *addr, unsigned long count)
+long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
{
/* Don't allow overflow */
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
- memcpy(addr, buf, count);
- return count;
+ return copy_to_iter(addr, count, iter);
}
/*
@@ -233,10 +221,12 @@ long vwrite(char *buf, char *addr, unsigned long count)
*/
void *vmalloc(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM);
+ return __vmalloc(size, GFP_KERNEL);
}
EXPORT_SYMBOL(vmalloc);
+void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc);
+
/*
* vzalloc - allocate virtually contiguous memory with zero fill
*
@@ -251,7 +241,7 @@ EXPORT_SYMBOL(vmalloc);
*/
void *vzalloc(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+ return __vmalloc(size, GFP_KERNEL | __GFP_ZERO);
}
EXPORT_SYMBOL(vzalloc);
@@ -354,13 +344,6 @@ void vm_unmap_aliases(void)
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
-struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
-{
- BUG();
- return NULL;
-}
-EXPORT_SYMBOL_GPL(alloc_vm_area);
-
void free_vm_area(struct vm_struct *area)
{
BUG();
@@ -516,7 +499,7 @@ static void delete_nommu_region(struct vm_region *region)
static void free_page_series(unsigned long from, unsigned long to)
{
for (; from < to; from += PAGE_SIZE) {
- struct page *page = virt_to_page(from);
+ struct page *page = virt_to_page((void *)from);
atomic_long_dec(&mmap_pages_allocated);
put_page(page);
@@ -561,26 +544,13 @@ static void put_nommu_region(struct vm_region *region)
__put_nommu_region(region);
}
-/*
- * add a VMA into a process's mm_struct in the appropriate place in the list
- * and tree and add to the address space's page tree also if not an anonymous
- * page
- * - should be called with mm->mmap_lock held writelocked
- */
-static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
+static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm)
{
- struct vm_area_struct *pvma, *prev;
- struct address_space *mapping;
- struct rb_node **p, *parent, *rb_prev;
-
- BUG_ON(!vma->vm_region);
-
- mm->map_count++;
vma->vm_mm = mm;
/* add the VMA to the mapping */
if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
+ struct address_space *mapping = vma->vm_file->f_mapping;
i_mmap_lock_write(mapping);
flush_dcache_mmap_lock(mapping);
@@ -588,67 +558,14 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
flush_dcache_mmap_unlock(mapping);
i_mmap_unlock_write(mapping);
}
-
- /* add the VMA to the tree */
- parent = rb_prev = NULL;
- p = &mm->mm_rb.rb_node;
- while (*p) {
- parent = *p;
- pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
-
- /* sort by: start addr, end addr, VMA struct addr in that order
- * (the latter is necessary as we may get identical VMAs) */
- if (vma->vm_start < pvma->vm_start)
- p = &(*p)->rb_left;
- else if (vma->vm_start > pvma->vm_start) {
- rb_prev = parent;
- p = &(*p)->rb_right;
- } else if (vma->vm_end < pvma->vm_end)
- p = &(*p)->rb_left;
- else if (vma->vm_end > pvma->vm_end) {
- rb_prev = parent;
- p = &(*p)->rb_right;
- } else if (vma < pvma)
- p = &(*p)->rb_left;
- else if (vma > pvma) {
- rb_prev = parent;
- p = &(*p)->rb_right;
- } else
- BUG();
- }
-
- rb_link_node(&vma->vm_rb, parent, p);
- rb_insert_color(&vma->vm_rb, &mm->mm_rb);
-
- /* add VMA to the VMA list also */
- prev = NULL;
- if (rb_prev)
- prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
-
- __vma_link_list(mm, vma, prev);
}
-/*
- * delete a VMA from its owning mm_struct and address space
- */
-static void delete_vma_from_mm(struct vm_area_struct *vma)
-{
- int i;
- struct address_space *mapping;
- struct mm_struct *mm = vma->vm_mm;
- struct task_struct *curr = current;
-
- mm->map_count--;
- for (i = 0; i < VMACACHE_SIZE; i++) {
- /* if the vma is cached, invalidate the entire cache */
- if (curr->vmacache.vmas[i] == vma) {
- vmacache_invalidate(mm);
- break;
- }
- }
-
+static void cleanup_vma_from_mm(struct vm_area_struct *vma)
+{
+ vma->vm_mm->map_count--;
/* remove the VMA from the mapping */
if (vma->vm_file) {
+ struct address_space *mapping;
mapping = vma->vm_file->f_mapping;
i_mmap_lock_write(mapping);
@@ -657,13 +574,26 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
flush_dcache_mmap_unlock(mapping);
i_mmap_unlock_write(mapping);
}
+}
- /* remove from the MM's tree and list */
- rb_erase(&vma->vm_rb, &mm->mm_rb);
+/*
+ * delete a VMA from its owning mm_struct and address space
+ */
+static int delete_vma_from_mm(struct vm_area_struct *vma)
+{
+ VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start);
- __vma_unlink_list(mm, vma);
-}
+ if (vma_iter_prealloc(&vmi)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
+ return -ENOMEM;
+ }
+ cleanup_vma_from_mm(vma);
+ /* remove from the MM's tree and list */
+ vma_iter_clear(&vmi, vma->vm_start, vma->vm_end);
+ return 0;
+}
/*
* destroy a VMA record
*/
@@ -677,52 +607,60 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
vm_area_free(vma);
}
+struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
+ unsigned long start_addr,
+ unsigned long end_addr)
+{
+ unsigned long index = start_addr;
+
+ mmap_assert_locked(mm);
+ return mt_find(&mm->mm_mt, &index, end_addr - 1);
+}
+EXPORT_SYMBOL(find_vma_intersection);
+
/*
* look up the first VMA in which addr resides, NULL if none
* - should be called with mm->mmap_lock at least held readlocked
*/
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct *vma;
-
- /* check the cache first */
- vma = vmacache_find(mm, addr);
- if (likely(vma))
- return vma;
-
- /* trawl the list (there may be multiple mappings in which addr
- * resides) */
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (vma->vm_start > addr)
- return NULL;
- if (vma->vm_end > addr) {
- vmacache_update(addr, vma);
- return vma;
- }
- }
+ VMA_ITERATOR(vmi, mm, addr);
- return NULL;
+ return vma_iter_load(&vmi);
}
EXPORT_SYMBOL(find_vma);
/*
- * find a VMA
- * - we don't extend stack VMAs under NOMMU conditions
+ * At least xtensa ends up having protection faults even with no
+ * MMU. No stack expansion, at least.
*/
-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+ unsigned long addr, struct pt_regs *regs)
{
- return find_vma(mm, addr);
+ struct vm_area_struct *vma;
+
+ mmap_read_lock(mm);
+ vma = vma_lookup(mm, addr);
+ if (!vma)
+ mmap_read_unlock(mm);
+ return vma;
}
/*
* expand a stack to a given address
* - not supported under NOMMU conditions
*/
-int expand_stack(struct vm_area_struct *vma, unsigned long address)
+int expand_stack_locked(struct vm_area_struct *vma, unsigned long addr)
{
return -ENOMEM;
}
+struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
+{
+ mmap_read_unlock(mm);
+ return NULL;
+}
+
/*
 * look up the first VMA that exactly matches addr
* - should be called with mm->mmap_lock at least held readlocked
@@ -733,26 +671,17 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
{
struct vm_area_struct *vma;
unsigned long end = addr + len;
+ VMA_ITERATOR(vmi, mm, addr);
- /* check the cache first */
- vma = vmacache_find_exact(mm, addr, end);
- if (vma)
- return vma;
-
- /* trawl the list (there may be multiple mappings in which addr
- * resides) */
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (vma->vm_start < addr)
- continue;
- if (vma->vm_start > addr)
- return NULL;
- if (vma->vm_end == end) {
- vmacache_update(addr, vma);
- return vma;
- }
- }
+ vma = vma_iter_load(&vmi);
+ if (!vma)
+ return NULL;
+ if (vma->vm_start != addr)
+ return NULL;
+ if (vma->vm_end != end)
+ return NULL;
- return NULL;
+ return vma;
}
/*
@@ -843,9 +772,6 @@ static int validate_mmap_request(struct file *file,
(file->f_mode & FMODE_WRITE))
return -EACCES;
- if (locks_verify_locked(file))
- return -EAGAIN;
-
if (!(capabilities & NOMMU_MAP_DIRECT))
return -ENODEV;
@@ -928,29 +854,36 @@ static unsigned long determine_vm_flags(struct file *file,
unsigned long vm_flags;
vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
- /* vm_flags |= mm->def_flags; */
- if (!(capabilities & NOMMU_MAP_DIRECT)) {
- /* attempt to share read-only copies of mapped file chunks */
+ if (!file) {
+ /*
+ * MAP_ANONYMOUS. MAP_SHARED is mapped to MAP_PRIVATE, because
+ * there is no fork().
+ */
vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
- if (file && !(prot & PROT_WRITE))
- vm_flags |= VM_MAYSHARE;
+ } else if (flags & MAP_PRIVATE) {
+ /* MAP_PRIVATE file mapping */
+ if (capabilities & NOMMU_MAP_DIRECT)
+ vm_flags |= (capabilities & NOMMU_VMFLAGS);
+ else
+ vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+
+ if (!(prot & PROT_WRITE) && !current->ptrace)
+ /*
+ * R/O private file mapping which cannot be used to
+ * modify memory, especially also not via active ptrace
+ * (e.g., set breakpoints) or later by upgrading
+ * permissions (no mprotect()). We can try overlaying
+ * the file mapping, which will work e.g., on chardevs,
+ * ramfs/tmpfs/shmfs and romfs/cramfs.
+ */
+ vm_flags |= VM_MAYOVERLAY;
} else {
- /* overlay a shareable mapping on the backing device or inode
- * if possible - used for chardevs, ramfs/tmpfs/shmfs and
- * romfs/cramfs */
- vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS);
- if (flags & MAP_SHARED)
- vm_flags |= VM_SHARED;
+ /* MAP_SHARED file mapping: NOMMU_MAP_DIRECT is set. */
+ vm_flags |= VM_SHARED | VM_MAYSHARE |
+ (capabilities & NOMMU_VMFLAGS);
}
- /* refuse to let anyone share private mappings with this process if
- * it's being traced - otherwise breakpoints set in it may interfere
- * with another untraced process
- */
- if ((flags & MAP_PRIVATE) && current->ptrace)
- vm_flags &= ~VM_MAYSHARE;
-
return vm_flags;
}
@@ -988,15 +921,18 @@ static int do_mmap_private(struct vm_area_struct *vma,
void *base;
int ret, order;
- /* invoke the file's mapping function so that it can keep track of
- * shared mappings on devices or memory
- * - VM_MAYSHARE will be set if it may attempt to share
+ /*
+ * Invoke the file's mapping function so that it can keep track of
+ * shared mappings on devices or memory. VM_MAYOVERLAY will be set if
+ * it may attempt to share, which will make is_nommu_shared_mapping()
+ * happy.
*/
if (capabilities & NOMMU_MAP_DIRECT) {
ret = call_mmap(vma->vm_file, vma);
+ /* shouldn't return success if we're not sharing */
+ if (WARN_ON_ONCE(!is_nommu_shared_mapping(vma->vm_flags)))
+ ret = -ENOSYS;
if (ret == 0) {
- /* shouldn't return success if we're not sharing */
- BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
vma->vm_region->vm_top = vma->vm_region->vm_end;
return 0;
}
@@ -1027,7 +963,8 @@ static int do_mmap_private(struct vm_area_struct *vma,
atomic_long_add(total, &mmap_pages_allocated);
- region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
+ vm_flags_set(vma, VM_MAPPED_COPY);
+ region->vm_flags = vma->vm_flags;
region->vm_start = (unsigned long) base;
region->vm_end = region->vm_start + len;
region->vm_top = region->vm_start + (total << PAGE_SHIFT);
@@ -1088,6 +1025,7 @@ unsigned long do_mmap(struct file *file,
vm_flags_t vm_flags;
unsigned long capabilities, result;
int ret;
+ VMA_ITERATOR(vmi, current->mm, 0);
*populate = 0;
@@ -1106,6 +1044,7 @@ unsigned long do_mmap(struct file *file,
* now know into VMA flags */
vm_flags = determine_vm_flags(file, prot, flags, capabilities);
+
/* we're going to need to record the mapping */
region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
if (!region)
@@ -1115,11 +1054,14 @@ unsigned long do_mmap(struct file *file,
if (!vma)
goto error_getting_vma;
+ if (vma_iter_prealloc(&vmi))
+ goto error_vma_iter_prealloc;
+
region->vm_usage = 1;
region->vm_flags = vm_flags;
region->vm_pgoff = pgoff;
- vma->vm_flags = vm_flags;
+ vm_flags_init(vma, vm_flags);
vma->vm_pgoff = pgoff;
if (file) {
@@ -1137,7 +1079,7 @@ unsigned long do_mmap(struct file *file,
* these cases, sharing is handled in the driver or filesystem rather
* than here
*/
- if (vm_flags & VM_MAYSHARE) {
+ if (is_nommu_shared_mapping(vm_flags)) {
struct vm_region *pregion;
unsigned long pglen, rpglen, pgend, rpgend, start;
@@ -1147,7 +1089,7 @@ unsigned long do_mmap(struct file *file,
for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
pregion = rb_entry(rb, struct vm_region, vm_rb);
- if (!(pregion->vm_flags & VM_MAYSHARE))
+ if (!is_nommu_shared_mapping(pregion->vm_flags))
continue;
/* search for overlapping mappings on the same file */
@@ -1183,7 +1125,7 @@ unsigned long do_mmap(struct file *file,
vma->vm_end = start + len;
if (pregion->vm_flags & VM_MAPPED_COPY)
- vma->vm_flags |= VM_MAPPED_COPY;
+ vm_flags_set(vma, VM_MAPPED_COPY);
else {
ret = do_mmap_shared_file(vma);
if (ret < 0) {
@@ -1255,7 +1197,11 @@ unsigned long do_mmap(struct file *file,
current->mm->total_vm += len >> PAGE_SHIFT;
share:
- add_vma_to_mm(current->mm, vma);
+ BUG_ON(!vma->vm_region);
+ setup_vma_to_mm(vma, current->mm);
+ current->mm->map_count++;
+ /* add the VMA to the tree */
+ vma_iter_store(&vmi, vma);
/* we flush the region from the icache only when the first executable
* mapping of it is made */
@@ -1271,6 +1217,7 @@ share:
error_just_free:
up_write(&nommu_region_sem);
error:
+ vma_iter_free(&vmi);
if (region->vm_file)
fput(region->vm_file);
kmem_cache_free(vm_region_jar, region);
@@ -1297,6 +1244,14 @@ error_getting_region:
len, current->pid);
show_free_areas(0, NULL);
return -ENOMEM;
+
+error_vma_iter_prealloc:
+ kmem_cache_free(vm_region_jar, region);
+ vm_area_free(vma);
+ pr_warn("Allocation of vma tree for process %d failed\n", current->pid);
+ show_free_areas(0, NULL);
+ return -ENOMEM;
+
}
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
@@ -1313,8 +1268,6 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
goto out;
}
- flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
-
retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
if (file)
@@ -1358,18 +1311,20 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
* split a vma into two pieces at address 'addr', a new vma is allocated either
* for the first part or the tail.
*/
-int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long addr, int new_below)
{
struct vm_area_struct *new;
struct vm_region *region;
unsigned long npages;
+ struct mm_struct *mm;
/* we're only permitted to split anonymous regions (these should have
* only a single usage on the region) */
if (vma->vm_file)
return -ENOMEM;
+ mm = vma->vm_mm;
if (mm->map_count >= sysctl_max_map_count)
return -ENOMEM;
@@ -1378,9 +1333,13 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
return -ENOMEM;
new = vm_area_dup(vma);
- if (!new) {
- kmem_cache_free(vm_region_jar, region);
- return -ENOMEM;
+ if (!new)
+ goto err_vma_dup;
+
+ if (vma_iter_prealloc(vmi)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
+ goto err_vmi_preallocate;
}
/* most fields are the same, copy all, and then fixup */
@@ -1399,7 +1358,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
- delete_vma_from_mm(vma);
down_write(&nommu_region_sem);
delete_nommu_region(vma->vm_region);
if (new_below) {
@@ -1412,16 +1370,25 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
add_nommu_region(vma->vm_region);
add_nommu_region(new->vm_region);
up_write(&nommu_region_sem);
- add_vma_to_mm(mm, vma);
- add_vma_to_mm(mm, new);
+
+ setup_vma_to_mm(vma, mm);
+ setup_vma_to_mm(new, mm);
+ vma_iter_store(vmi, new);
+ mm->map_count++;
return 0;
+
+err_vmi_preallocate:
+ vm_area_free(new);
+err_vma_dup:
+ kmem_cache_free(vm_region_jar, region);
+ return -ENOMEM;
}
/*
* shrink a VMA by removing the specified chunk from either the beginning or
* the end
*/
-static int shrink_vma(struct mm_struct *mm,
+static int vmi_shrink_vma(struct vma_iterator *vmi,
struct vm_area_struct *vma,
unsigned long from, unsigned long to)
{
@@ -1429,12 +1396,19 @@ static int shrink_vma(struct mm_struct *mm,
/* adjust the VMA's pointers, which may reposition it in the MM's tree
* and list */
- delete_vma_from_mm(vma);
- if (from > vma->vm_start)
+ if (vma_iter_prealloc(vmi)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
+ return -ENOMEM;
+ }
+
+ if (from > vma->vm_start) {
+ vma_iter_clear(vmi, from, vma->vm_end);
vma->vm_end = from;
- else
+ } else {
+ vma_iter_clear(vmi, vma->vm_start, to);
vma->vm_start = to;
- add_vma_to_mm(mm, vma);
+ }
/* cut the backing region down to size */
region = vma->vm_region;
@@ -1462,9 +1436,10 @@ static int shrink_vma(struct mm_struct *mm,
*/
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf)
{
+ VMA_ITERATOR(vmi, mm, start);
struct vm_area_struct *vma;
unsigned long end;
- int ret;
+ int ret = 0;
len = PAGE_ALIGN(len);
if (len == 0)
@@ -1473,7 +1448,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
end = start + len;
/* find the first potentially overlapping VMA */
- vma = find_vma(mm, start);
+ vma = vma_find(&vmi, end);
if (!vma) {
static int limit;
if (limit < 5) {
@@ -1492,7 +1467,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
return -EINVAL;
if (end == vma->vm_end)
goto erase_whole_vma;
- vma = vma->vm_next;
+ vma = vma_find(&vmi, end);
} while (vma);
return -EINVAL;
} else {
@@ -1506,19 +1481,20 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
if (end != vma->vm_end && offset_in_page(end))
return -EINVAL;
if (start != vma->vm_start && end != vma->vm_end) {
- ret = split_vma(mm, vma, start, 1);
+ ret = split_vma(&vmi, vma, start, 1);
if (ret < 0)
return ret;
}
- return shrink_vma(mm, vma, start, end);
+ return vmi_shrink_vma(&vmi, vma, start, end);
}
erase_whole_vma:
- delete_vma_from_mm(vma);
- delete_vma(mm, vma);
- return 0;
+ if (delete_vma_from_mm(vma))
+ ret = -ENOMEM;
+ else
+ delete_vma(mm, vma);
+ return ret;
}
-EXPORT_SYMBOL(do_munmap);
int vm_munmap(unsigned long addr, size_t len)
{
@@ -1542,6 +1518,7 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
*/
void exit_mmap(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
if (!mm)
@@ -1549,12 +1526,18 @@ void exit_mmap(struct mm_struct *mm)
mm->total_vm = 0;
- while ((vma = mm->mmap)) {
- mm->mmap = vma->vm_next;
- delete_vma_from_mm(vma);
+ /*
+ * Lock the mm to avoid the mmap_lock assertions complaining even though
+ * this is the only user of the mm.
+ */
+ mmap_write_lock(mm);
+ for_each_vma(vmi, vma) {
+ cleanup_vma_from_mm(vma);
delete_vma(mm, vma);
cond_resched();
}
+ __mt_destroy(&mm->mm_mt);
+ mmap_write_unlock(mm);
}
int vm_brk(unsigned long addr, unsigned long len)
@@ -1597,7 +1580,7 @@ static unsigned long do_mremap(unsigned long addr,
if (vma->vm_end != vma->vm_start + old_len)
return (unsigned long) -EFAULT;
- if (vma->vm_flags & VM_MAYSHARE)
+ if (is_nommu_shared_mapping(vma->vm_flags))
return (unsigned long) -EPERM;
if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
@@ -1632,7 +1615,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
if (addr != (pfn << PAGE_SHIFT))
return -EINVAL;
- vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
+ vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
return 0;
}
EXPORT_SYMBOL(remap_pfn_range);
@@ -1662,12 +1645,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);
-unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
-{
- return -ENOMEM;
-}
-
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
BUG();
@@ -1675,15 +1652,16 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct vm_fault *vmf,
+vm_fault_t filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
BUG();
+ return 0;
}
EXPORT_SYMBOL(filemap_map_pages);
-int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long addr, void *buf, int len, unsigned int gup_flags)
+int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
+ int len, unsigned int gup_flags)
{
struct vm_area_struct *vma;
int write = gup_flags & FOLL_WRITE;
@@ -1729,7 +1707,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, unsigned int gup_flags)
{
- return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
+ return __access_remote_vm(mm, addr, buf, len, gup_flags);
}
/*
@@ -1748,7 +1726,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
if (!mm)
return 0;
- len = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
+ len = __access_remote_vm(mm, addr, buf, len, gup_flags);
mmput(mm);
return len;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8b84661a6410..612b5597d3af 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -28,6 +28,7 @@
#include <linux/sched/task.h>
#include <linux/sched/debug.h>
#include <linux/swap.h>
+#include <linux/syscalls.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
@@ -51,9 +52,9 @@
#define CREATE_TRACE_POINTS
#include <trace/events/oom.h>
-int sysctl_panic_on_oom;
-int sysctl_oom_kill_allocating_task;
-int sysctl_oom_dump_tasks = 1;
+static int sysctl_panic_on_oom;
+static int sysctl_oom_kill_allocating_task;
+static int sysctl_oom_dump_tasks = 1;
/*
* Serializes oom killer invocations (out_of_memory()) from all contexts to
@@ -74,7 +75,7 @@ static inline bool is_memcg_oom(struct oom_control *oc)
#ifdef CONFIG_NUMA
/**
- * oom_cpuset_eligible() - check task eligiblity for kill
+ * oom_cpuset_eligible() - check task eligibility for kill
* @start: task struct of which task to consider
* @oc: pointer to struct oom_control
*
@@ -92,9 +93,6 @@ static bool oom_cpuset_eligible(struct task_struct *start,
bool ret = false;
const nodemask_t *mask = oc->nodemask;
- if (is_memcg_oom(oc))
- return true;
-
rcu_read_lock();
for_each_thread(start, tsk) {
if (mask) {
@@ -104,7 +102,7 @@ static bool oom_cpuset_eligible(struct task_struct *start,
* mempolicy intersects current, otherwise it may be
* needlessly killed.
*/
- ret = mempolicy_nodemask_intersects(tsk, mask);
+ ret = mempolicy_in_oom_domain(tsk, mask);
} else {
/*
* This is not a mempolicy constrained oom, so only
@@ -171,10 +169,12 @@ static bool oom_unkillable_task(struct task_struct *p)
}
/*
- * Print out unreclaimble slabs info when unreclaimable slabs amount is greater
- * than all user memory (LRU pages)
- */
-static bool is_dump_unreclaim_slabs(void)
+ * Check whether the amount of unreclaimable slab is greater than
+ * all user memory (LRU pages).
+ * dump_unreclaimable_slab() can help when the OOM is caused by too
+ * much unreclaimable slab memory used by the kernel.
+ */
+static bool should_dump_unreclaim_slab(void)
{
unsigned long nr_lru;
@@ -393,9 +393,8 @@ static int dump_task(struct task_struct *p, void *arg)
task = find_lock_task_mm(p);
if (!task) {
/*
- * This is a kthread or all of p's threads have already
- * detached their mm's. There's no need to report
- * them; they can't be oom killed anyway.
+ * All of p's threads have already detached their mm's. There's
+ * no need to report them; they can't be oom killed anyway.
*/
return 0;
}
@@ -462,8 +461,8 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
if (is_memcg_oom(oc))
mem_cgroup_print_oom_meminfo(oc->memcg);
else {
- show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
- if (is_dump_unreclaim_slabs())
+ __show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask, gfp_zone(oc->gfp_mask));
+ if (should_dump_unreclaim_slab())
dump_unreclaimable_slab();
}
if (sysctl_oom_dump_tasks)
@@ -510,10 +509,11 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);
-bool __oom_reap_task_mm(struct mm_struct *mm)
+static bool __oom_reap_task_mm(struct mm_struct *mm)
{
struct vm_area_struct *vma;
bool ret = true;
+ VMA_ITERATOR(vmi, mm, 0);
/*
* Tell all users of get_user/copy_from_user etc... that the content
@@ -523,8 +523,8 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
*/
set_bit(MMF_UNSTABLE, &mm->flags);
- for (vma = mm->mmap ; vma; vma = vma->vm_next) {
- if (!can_madv_lru_vma(vma))
+ for_each_vma(vmi, vma) {
+ if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP))
continue;
/*
@@ -542,17 +542,17 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
struct mmu_gather tlb;
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
- vma, mm, vma->vm_start,
+ mm, vma->vm_start,
vma->vm_end);
- tlb_gather_mmu(&tlb, mm, range.start, range.end);
+ tlb_gather_mmu(&tlb, mm);
if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
- tlb_finish_mmu(&tlb, range.start, range.end);
+ tlb_finish_mmu(&tlb);
ret = false;
continue;
}
unmap_page_range(&tlb, vma, range.start, range.end, NULL);
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb, range.start, range.end);
+ tlb_finish_mmu(&tlb);
}
}
@@ -633,22 +633,24 @@ done:
*/
set_bit(MMF_OOM_SKIP, &mm->flags);
- /* Drop a reference taken by wake_oom_reaper */
+ /* Drop a reference taken by queue_oom_reaper */
put_task_struct(tsk);
}
static int oom_reaper(void *unused)
{
+ set_freezable();
+
while (true) {
struct task_struct *tsk = NULL;
wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
- spin_lock(&oom_reaper_lock);
+ spin_lock_irq(&oom_reaper_lock);
if (oom_reaper_list != NULL) {
tsk = oom_reaper_list;
oom_reaper_list = tsk->oom_reaper_list;
}
- spin_unlock(&oom_reaper_lock);
+ spin_unlock_irq(&oom_reaper_lock);
if (tsk)
oom_reap_task(tsk);
@@ -657,30 +659,88 @@ static int oom_reaper(void *unused)
return 0;
}
-static void wake_oom_reaper(struct task_struct *tsk)
+static void wake_oom_reaper(struct timer_list *timer)
{
- /* mm is already queued? */
- if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
- return;
+ struct task_struct *tsk = container_of(timer, struct task_struct,
+ oom_reaper_timer);
+ struct mm_struct *mm = tsk->signal->oom_mm;
+ unsigned long flags;
- get_task_struct(tsk);
+ /* The victim managed to terminate on its own - see exit_mmap */
+ if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+ put_task_struct(tsk);
+ return;
+ }
- spin_lock(&oom_reaper_lock);
+ spin_lock_irqsave(&oom_reaper_lock, flags);
tsk->oom_reaper_list = oom_reaper_list;
oom_reaper_list = tsk;
- spin_unlock(&oom_reaper_lock);
+ spin_unlock_irqrestore(&oom_reaper_lock, flags);
trace_wake_reaper(tsk->pid);
wake_up(&oom_reaper_wait);
}
+/*
+ * Give the OOM victim time to exit naturally before invoking the oom reaper.
+ * The timer's timeout is arbitrary... the longer it is, the longer the worst
+ * case scenario for the OOM can take. If it is too small, the oom_reaper can
+ * get in the way and release resources needed by the process exit path,
+ * e.g. the futex robust list can sit in Anon|Private memory that gets reaped
+ * before the exit path is able to wake the futex waiters.
+ */
+#define OOM_REAPER_DELAY (2*HZ)
+static void queue_oom_reaper(struct task_struct *tsk)
+{
+ /* mm is already queued? */
+ if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
+ return;
+
+ get_task_struct(tsk);
+ timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
+ tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
+ add_timer(&tsk->oom_reaper_timer);
+}
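queue_oom_reaper() above is ordinary timer_list usage: arm a timer and do the real queueing from its callback. A minimal, hypothetical module-style sketch of the same pattern (the demo_* names are illustrative, not part of this patch):

/* Arm a timer_list so a callback runs ~2s later, cf. OOM_REAPER_DELAY. */
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list demo_timer;

static void demo_timer_fn(struct timer_list *t)
{
	pr_info("demo timer fired at jiffies=%lu\n", jiffies);
}

static int __init demo_init(void)
{
	timer_setup(&demo_timer, demo_timer_fn, 0);
	demo_timer.expires = jiffies + 2 * HZ;
	add_timer(&demo_timer);
	return 0;
}

static void __exit demo_exit(void)
{
	del_timer_sync(&demo_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");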
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table vm_oom_kill_table[] = {
+ {
+ .procname = "panic_on_oom",
+ .data = &sysctl_panic_on_oom,
+ .maxlen = sizeof(sysctl_panic_on_oom),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "oom_kill_allocating_task",
+ .data = &sysctl_oom_kill_allocating_task,
+ .maxlen = sizeof(sysctl_oom_kill_allocating_task),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "oom_dump_tasks",
+ .data = &sysctl_oom_dump_tasks,
+ .maxlen = sizeof(sysctl_oom_dump_tasks),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {}
+};
+#endif
+
static int __init oom_init(void)
{
oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
+#ifdef CONFIG_SYSCTL
+ register_sysctl_init("vm", vm_oom_kill_table);
+#endif
return 0;
}
subsys_initcall(oom_init)
#else
-static inline void wake_oom_reaper(struct task_struct *tsk)
+static inline void queue_oom_reaper(struct task_struct *tsk)
{
}
#endif /* CONFIG_MMU */
@@ -705,10 +765,8 @@ static void mark_oom_victim(struct task_struct *tsk)
return;
/* oom_mm is bound to the signal struct life time. */
- if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
+ if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
mmgrab(tsk->signal->oom_mm);
- set_bit(MMF_OOM_VICTIM, &mm->flags);
- }
/*
* Make sure that the task is woken up from uninterruptible sleep
@@ -785,11 +843,11 @@ static inline bool __task_will_free_mem(struct task_struct *task)
struct signal_struct *sig = task->signal;
/*
- * A coredumping process may sleep for an extended period in exit_mm(),
- * so the oom killer cannot assume that the process will promptly exit
- * and release memory.
+ * A coredumping process may sleep for an extended period in
+ * coredump_task_exit(), so the oom killer cannot assume that
+ * the process will promptly exit and release memory.
*/
- if (sig->flags & SIGNAL_GROUP_COREDUMP)
+ if (sig->core_state)
return false;
if (sig->flags & SIGNAL_GROUP_EXIT)
@@ -921,7 +979,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
continue;
}
/*
- * No kthead_use_mm() user needs to read from the userspace so
+ * No kthread_use_mm() user needs to read from the userspace so
* we are ok to reap it.
*/
if (unlikely(p->flags & PF_KTHREAD))
@@ -931,7 +989,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
rcu_read_unlock();
if (can_oom_reap)
- wake_oom_reaper(victim);
+ queue_oom_reaper(victim);
mmdrop(mm);
put_task_struct(victim);
@@ -967,7 +1025,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
task_lock(victim);
if (task_will_free_mem(victim)) {
mark_oom_victim(victim);
- wake_oom_reaper(victim);
+ queue_oom_reaper(victim);
task_unlock(victim);
put_task_struct(victim);
return;
@@ -990,9 +1048,10 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
* If necessary, kill all tasks in the selected memory cgroup.
*/
if (oom_group) {
+ memcg_memory_event(oom_group, MEMCG_OOM_GROUP_KILL);
mem_cgroup_print_oom_group(oom_group);
mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
- (void*)message);
+ (void *)message);
mem_cgroup_put(oom_group);
}
}
@@ -1053,7 +1112,7 @@ bool out_of_memory(struct oom_control *oc)
if (!is_memcg_oom(oc)) {
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
- if (freed > 0)
+ if (freed > 0 && !is_sysrq_oom(oc))
/* Got some memory back in the last second. */
return true;
}
@@ -1065,18 +1124,16 @@ bool out_of_memory(struct oom_control *oc)
*/
if (task_will_free_mem(current)) {
mark_oom_victim(current);
- wake_oom_reaper(current);
+ queue_oom_reaper(current);
return true;
}
/*
* The OOM killer does not compensate for IO-less reclaim.
- * pagefault_out_of_memory lost its gfp context so we have to
- * make sure exclude 0 mask - all other users should have at least
- * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
- * invoke the OOM killer even if it is a GFP_NOFS allocation.
+ * But mem_cgroup_oom() has to invoke the OOM killer even
+ * if it is a GFP_NOFS allocation.
*/
- if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
+ if (!(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
return true;
/*
@@ -1118,25 +1175,86 @@ bool out_of_memory(struct oom_control *oc)
}
/*
- * The pagefault handler calls here because it is out of memory, so kill a
- * memory-hogging task. If oom_lock is held by somebody else, a parallel oom
- * killing is already in progress so do nothing.
+ * The pagefault handler calls here because some allocation has failed. We have
+ * to take care of the memcg OOM here because this is the only safe context without
+ * any locks held, but let the oom killer triggered from the allocation
+ * context take care of the global OOM.
*/
void pagefault_out_of_memory(void)
{
- struct oom_control oc = {
- .zonelist = NULL,
- .nodemask = NULL,
- .memcg = NULL,
- .gfp_mask = 0,
- .order = 0,
- };
+ static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
if (mem_cgroup_oom_synchronize(true))
return;
- if (!mutex_trylock(&oom_lock))
+ if (fatal_signal_pending(current))
return;
- out_of_memory(&oc);
- mutex_unlock(&oom_lock);
+
+ if (__ratelimit(&pfoom_rs))
+ pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
+}
+
+SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
+{
+#ifdef CONFIG_MMU
+ struct mm_struct *mm = NULL;
+ struct task_struct *task;
+ struct task_struct *p;
+ unsigned int f_flags;
+ bool reap = false;
+ long ret = 0;
+
+ if (flags)
+ return -EINVAL;
+
+ task = pidfd_get_task(pidfd, &f_flags);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ /*
+ * Make sure to choose a thread which still has a reference to mm
+ * during the group exit.
+ */
+ p = find_lock_task_mm(task);
+ if (!p) {
+ ret = -ESRCH;
+ goto put_task;
+ }
+
+ mm = p->mm;
+ mmgrab(mm);
+
+ if (task_will_free_mem(p))
+ reap = true;
+ else {
+ /* Error only if the work has not been done already */
+ if (!test_bit(MMF_OOM_SKIP, &mm->flags))
+ ret = -EINVAL;
+ }
+ task_unlock(p);
+
+ if (!reap)
+ goto drop_mm;
+
+ if (mmap_read_lock_killable(mm)) {
+ ret = -EINTR;
+ goto drop_mm;
+ }
+ /*
+ * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
+ * a possible change in exit_mmap() is seen.
+ */
+ if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
+ ret = -EAGAIN;
+ mmap_read_unlock(mm);
+
+drop_mm:
+ mmdrop(mm);
+put_task:
+ put_task_struct(task);
+ return ret;
+#else
+ return -ENOSYS;
+#endif /* CONFIG_MMU */
}
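A hypothetical user-space sketch of driving the new syscall: obtain a pidfd, kill the task, then ask the kernel to reap its address space. The fallback syscall numbers below are the generic/x86-64 values and are an assumption; recent headers define the __NR_* constants themselves.

#define _GNU_SOURCE
#include <sys/syscall.h>
#include <unistd.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open 434		/* assumed fallback */
#endif
#ifndef __NR_process_mrelease
#define __NR_process_mrelease 448	/* assumed fallback */
#endif

int main(int argc, char **argv)
{
	if (argc < 2) { fprintf(stderr, "usage: %s <pid>\n", argv[0]); return 1; }
	pid_t pid = atoi(argv[1]);

	int pidfd = syscall(__NR_pidfd_open, pid, 0);
	if (pidfd < 0) { perror("pidfd_open"); return 1; }

	kill(pid, SIGKILL);		/* the syscall only reaps an exiting task */

	if (syscall(__NR_process_mrelease, pidfd, 0) < 0)
		perror("process_mrelease");	/* EINVAL if the task is not dying */
	return 0;
}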
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 358d6f28c627..d3f42009bb70 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -13,6 +13,7 @@
*/
#include <linux/kernel.h>
+#include <linux/math64.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
@@ -32,7 +33,6 @@
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
-#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
#include <linux/pagevec.h>
#include <linux/timer.h>
#include <linux/sched/rt.h>
@@ -71,30 +71,30 @@ static long ratelimit_pages = 32;
/*
* Start background writeback (via writeback threads) at this percentage
*/
-int dirty_background_ratio = 10;
+static int dirty_background_ratio = 10;
/*
* dirty_background_bytes starts at 0 (disabled) so that it is a function of
* dirty_background_ratio * the amount of dirtyable memory
*/
-unsigned long dirty_background_bytes;
+static unsigned long dirty_background_bytes;
/*
* free highmem will not be subtracted from the total free memory
* for calculating free ratios if vm_highmem_is_dirtyable is true
*/
-int vm_highmem_is_dirtyable;
+static int vm_highmem_is_dirtyable;
/*
* The generator of dirty data starts writeback at this percentage
*/
-int vm_dirty_ratio = 20;
+static int vm_dirty_ratio = 20;
/*
* vm_dirty_bytes starts at 0 (disabled) so that it is a function of
* vm_dirty_ratio * the amount of dirtyable memory
*/
-unsigned long vm_dirty_bytes;
+static unsigned long vm_dirty_bytes;
/*
* The interval between `kupdate'-style writebacks
@@ -109,11 +109,6 @@ EXPORT_SYMBOL_GPL(dirty_writeback_interval);
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
/*
- * Flag that makes the machine dump writes/reads and block dirtyings.
- */
-int block_dump;
-
-/*
* Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
* a full sync is triggered after this time elapses without any disk activity.
*/
@@ -189,7 +184,7 @@ static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
static void wb_min_max_ratio(struct bdi_writeback *wb,
unsigned long *minp, unsigned long *maxp)
{
- unsigned long this_bw = wb->avg_write_bandwidth;
+ unsigned long this_bw = READ_ONCE(wb->avg_write_bandwidth);
unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
unsigned long long min = wb->bdi->min_ratio;
unsigned long long max = wb->bdi->max_ratio;
@@ -203,7 +198,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb,
min *= this_bw;
min = div64_ul(min, tot_bw);
}
- if (max < 100) {
+ if (max < 100 * BDI_RATIO_SCALE) {
max *= this_bw;
max = div64_ul(max, tot_bw);
}
@@ -330,18 +325,6 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
}
/*
- * Unreclaimable memory (kernel memory or anonymous memory
- * without swap) can bring down the dirtyable pages below
- * the zone's dirty balance reserve and the above calculation
- * will underflow. However we still want to add in nodes
- * which are below threshold (negative values) to get a more
- * accurate calculation but make sure that the total never
- * underflows.
- */
- if ((long)x < 0)
- x = 0;
-
- /*
* Make sure that the number of highmem pages is never larger
* than the number of the total dirtyable memory. This can only
* occur in very strange VM situations but we want to make sure
@@ -509,7 +492,8 @@ bool node_dirty_ok(struct pglist_data *pgdat)
return nr_pages <= limit;
}
-int dirty_background_ratio_handler(struct ctl_table *table, int write,
+#ifdef CONFIG_SYSCTL
+static int dirty_background_ratio_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -520,7 +504,7 @@ int dirty_background_ratio_handler(struct ctl_table *table, int write,
return ret;
}
-int dirty_background_bytes_handler(struct ctl_table *table, int write,
+static int dirty_background_bytes_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -531,7 +515,7 @@ int dirty_background_bytes_handler(struct ctl_table *table, int write,
return ret;
}
-int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
+static int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int old_ratio = vm_dirty_ratio;
@@ -545,7 +529,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
return ret;
}
-int dirty_bytes_handler(struct ctl_table *table, int write,
+static int dirty_bytes_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned long old_bytes = vm_dirty_bytes;
@@ -558,6 +542,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
}
return ret;
}
+#endif
static unsigned long wp_next_time(unsigned long cur_time)
{
@@ -568,12 +553,12 @@ static unsigned long wp_next_time(unsigned long cur_time)
return cur_time;
}
-static void wb_domain_writeout_inc(struct wb_domain *dom,
+static void wb_domain_writeout_add(struct wb_domain *dom,
struct fprop_local_percpu *completions,
- unsigned int max_prop_frac)
+ unsigned int max_prop_frac, long nr)
{
- __fprop_inc_percpu_max(&dom->completions, completions,
- max_prop_frac);
+ __fprop_add_percpu_max(&dom->completions, completions,
+ max_prop_frac, nr);
/* First event after period switching was turned off? */
if (unlikely(!dom->period_time)) {
/*
@@ -589,20 +574,20 @@ static void wb_domain_writeout_inc(struct wb_domain *dom,
/*
* Increment @wb's writeout completion count and the global writeout
- * completion count. Called from test_clear_page_writeback().
+ * completion count. Called from __folio_end_writeback().
*/
-static inline void __wb_writeout_inc(struct bdi_writeback *wb)
+static inline void __wb_writeout_add(struct bdi_writeback *wb, long nr)
{
struct wb_domain *cgdom;
- inc_wb_stat(wb, WB_WRITTEN);
- wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
- wb->bdi->max_prop_frac);
+ wb_stat_mod(wb, WB_WRITTEN, nr);
+ wb_domain_writeout_add(&global_wb_domain, &wb->completions,
+ wb->bdi->max_prop_frac, nr);
cgdom = mem_cgroup_wb_domain(wb);
if (cgdom)
- wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
- wb->bdi->max_prop_frac);
+ wb_domain_writeout_add(cgdom, wb_memcg_completions(wb),
+ wb->bdi->max_prop_frac, nr);
}
void wb_writeout_inc(struct bdi_writeback *wb)
@@ -610,7 +595,7 @@ void wb_writeout_inc(struct bdi_writeback *wb)
unsigned long flags;
local_irq_save(flags);
- __wb_writeout_inc(wb);
+ __wb_writeout_add(wb, 1);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(wb_writeout_inc);
@@ -666,20 +651,65 @@ void wb_domain_exit(struct wb_domain *dom)
*/
static unsigned int bdi_min_ratio;
-int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
+static int bdi_check_pages_limit(unsigned long pages)
+{
+ unsigned long max_dirty_pages = global_dirtyable_memory();
+
+ if (pages > max_dirty_pages)
+ return -EINVAL;
+
+ return 0;
+}
+
+static unsigned long bdi_ratio_from_pages(unsigned long pages)
+{
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ unsigned long ratio;
+
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ ratio = div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_thresh);
+
+ return ratio;
+}
+
+static u64 bdi_get_bytes(unsigned int ratio)
+{
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ u64 bytes;
+
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ bytes = (dirty_thresh * PAGE_SIZE * ratio) / BDI_RATIO_SCALE / 100;
+
+ return bytes;
+}
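The two helpers above convert between pages, the BDI_RATIO_SCALE-scaled ratio, and bytes. A stand-alone arithmetic sketch of the same round-trip (the scale factor and page size below are assumptions for the example; the kernel takes both from its own headers):

#include <stdio.h>
#include <stdint.h>

#define BDI_RATIO_SCALE	10000ULL	/* assumed; see backing-dev.h */
#define PAGE_SIZE	4096ULL		/* assumed 4 KiB pages */

int main(void)
{
	uint64_t dirty_thresh = 1 << 20;	/* pretend: 1M dirtyable pages (4 GiB) */
	uint64_t min_bytes = 256ULL << 20;	/* admin asks for 256 MiB */

	/* bdi_set_min_bytes(): bytes -> pages -> scaled ratio */
	uint64_t pages = min_bytes / PAGE_SIZE;
	uint64_t ratio = pages * 100 * BDI_RATIO_SCALE / dirty_thresh;

	/* bdi_get_min_bytes(): scaled ratio -> bytes */
	uint64_t bytes = dirty_thresh * PAGE_SIZE * ratio / BDI_RATIO_SCALE / 100;

	printf("ratio = %llu (i.e. %.4f%%), round-trip = %llu MiB\n",
	       (unsigned long long)ratio,
	       ratio / (double)BDI_RATIO_SCALE,
	       (unsigned long long)(bytes >> 20));
	return 0;
}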
+
+static int __bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
+ unsigned int delta;
int ret = 0;
+ if (min_ratio > 100 * BDI_RATIO_SCALE)
+ return -EINVAL;
+ min_ratio *= BDI_RATIO_SCALE;
+
spin_lock_bh(&bdi_lock);
if (min_ratio > bdi->max_ratio) {
ret = -EINVAL;
} else {
- min_ratio -= bdi->min_ratio;
- if (bdi_min_ratio + min_ratio < 100) {
- bdi_min_ratio += min_ratio;
- bdi->min_ratio += min_ratio;
+ if (min_ratio < bdi->min_ratio) {
+ delta = bdi->min_ratio - min_ratio;
+ bdi_min_ratio -= delta;
+ bdi->min_ratio = min_ratio;
} else {
- ret = -EINVAL;
+ delta = min_ratio - bdi->min_ratio;
+ if (bdi_min_ratio + delta < 100 * BDI_RATIO_SCALE) {
+ bdi_min_ratio += delta;
+ bdi->min_ratio = min_ratio;
+ } else {
+ ret = -EINVAL;
+ }
}
}
spin_unlock_bh(&bdi_lock);
@@ -687,11 +717,11 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
return ret;
}
-int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
+static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio)
{
int ret = 0;
- if (max_ratio > 100)
+ if (max_ratio > 100 * BDI_RATIO_SCALE)
return -EINVAL;
spin_lock_bh(&bdi_lock);
@@ -705,8 +735,81 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
return ret;
}
+
+int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio)
+{
+ return __bdi_set_min_ratio(bdi, min_ratio);
+}
+
+int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio)
+{
+ return __bdi_set_max_ratio(bdi, max_ratio);
+}
+
+int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
+{
+ return __bdi_set_min_ratio(bdi, min_ratio * BDI_RATIO_SCALE);
+}
+
+int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio)
+{
+ return __bdi_set_max_ratio(bdi, max_ratio * BDI_RATIO_SCALE);
+}
EXPORT_SYMBOL(bdi_set_max_ratio);
+u64 bdi_get_min_bytes(struct backing_dev_info *bdi)
+{
+ return bdi_get_bytes(bdi->min_ratio);
+}
+
+int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes)
+{
+ int ret;
+ unsigned long pages = min_bytes >> PAGE_SHIFT;
+ unsigned long min_ratio;
+
+ ret = bdi_check_pages_limit(pages);
+ if (ret)
+ return ret;
+
+ min_ratio = bdi_ratio_from_pages(pages);
+ return __bdi_set_min_ratio(bdi, min_ratio);
+}
+
+u64 bdi_get_max_bytes(struct backing_dev_info *bdi)
+{
+ return bdi_get_bytes(bdi->max_ratio);
+}
+
+int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes)
+{
+ int ret;
+ unsigned long pages = max_bytes >> PAGE_SHIFT;
+ unsigned long max_ratio;
+
+ ret = bdi_check_pages_limit(pages);
+ if (ret)
+ return ret;
+
+ max_ratio = bdi_ratio_from_pages(pages);
+ return __bdi_set_max_ratio(bdi, max_ratio);
+}
+
+int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit)
+{
+ if (strict_limit > 1)
+ return -EINVAL;
+
+ spin_lock_bh(&bdi_lock);
+ if (strict_limit)
+ bdi->capabilities |= BDI_CAP_STRICTLIMIT;
+ else
+ bdi->capabilities &= ~BDI_CAP_STRICTLIMIT;
+ spin_unlock_bh(&bdi_lock);
+
+ return 0;
+}
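A minimal sketch of how a sysfs store handler might drive the byte-based interface added above; the handler name and the dev_get_drvdata() lookup are illustrative stand-ins, not the actual wiring in mm/backing-dev.c.

/* Illustrative sysfs store handler for a per-bdi max_bytes knob. */
static ssize_t max_bytes_store(struct device *dev, struct device_attribute *attr,
                               const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        u64 bytes;
        int ret;

        ret = kstrtoull(buf, 10, &bytes);
        if (ret < 0)
                return ret;

        /* Converts bytes to a scaled ratio and checks it against dirtyable memory. */
        ret = bdi_set_max_bytes(bdi, bytes);
        return ret ? ret : count;
}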
+
static unsigned long dirty_freerun_ceiling(unsigned long thresh,
unsigned long bg_thresh)
{
@@ -769,15 +872,15 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
&numerator, &denominator);
- wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
+ wb_thresh = (thresh * (100 * BDI_RATIO_SCALE - bdi_min_ratio)) / (100 * BDI_RATIO_SCALE);
wb_thresh *= numerator;
wb_thresh = div64_ul(wb_thresh, denominator);
wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
- wb_thresh += (thresh * wb_min_ratio) / 100;
- if (wb_thresh > (thresh * wb_max_ratio) / 100)
- wb_thresh = thresh * wb_max_ratio / 100;
+ wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE);
+ if (wb_thresh > (thresh * wb_max_ratio) / (100 * BDI_RATIO_SCALE))
+ wb_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE);
return wb_thresh;
}
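For readers untangling the scaled arithmetic in __wb_calc_thresh(), a hedged illustration with invented numbers, again assuming BDI_RATIO_SCALE == 10000:

/*
 * Illustrative clamp: thresh = 1,000,000 pages, this wb's proportional
 * share is 3,000 pages, min_ratio = 1%, max_ratio = 40% (all made up).
 */
unsigned long thresh = 1000000, wb_thresh = 3000;
unsigned long wb_min_ratio = 1 * 10000, wb_max_ratio = 40 * 10000;

wb_thresh += (thresh * wb_min_ratio) / (100 * 10000);           /* floor adds 10,000 */
if (wb_thresh > (thresh * wb_max_ratio) / (100 * 10000))
        wb_thresh = thresh * wb_max_ratio / (100 * 10000);      /* cap at 400,000 */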
@@ -845,7 +948,7 @@ static long long pos_ratio_polynom(unsigned long setpoint,
* ^ pos_ratio
* |
* | |<===== global dirty control scope ======>|
- * 2.0 .............*
+ * 2.0 * * * * * * *
* | .*
* | . *
* | . *
@@ -898,7 +1001,7 @@ static long long pos_ratio_polynom(unsigned long setpoint,
static void wb_position_ratio(struct dirty_throttle_control *dtc)
{
struct bdi_writeback *wb = dtc->wb;
- unsigned long write_bw = wb->avg_write_bandwidth;
+ unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth);
unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
unsigned long wb_thresh = dtc->wb_thresh;
@@ -1090,7 +1193,7 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb,
* write_bandwidth = ---------------------------------------------------
* period
*
- * @written may have decreased due to account_page_redirty().
+ * @written may have decreased due to folio_account_redirty().
* Avoid underflowing @bw calculation.
*/
bw = written - min(written, wb->written_stamp);
@@ -1121,7 +1224,7 @@ out:
&wb->bdi->tot_write_bandwidth) <= 0);
}
wb->write_bandwidth = bw;
- wb->avg_write_bandwidth = avg;
+ WRITE_ONCE(wb->avg_write_bandwidth, avg);
}
static void update_dirty_limit(struct dirty_throttle_control *dtc)
@@ -1153,8 +1256,8 @@ update:
dom->dirty_limit = limit;
}
-static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
- unsigned long now)
+static void domain_update_dirty_limit(struct dirty_throttle_control *dtc,
+ unsigned long now)
{
struct wb_domain *dom = dtc_dom(dtc);
@@ -1330,7 +1433,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
else
dirty_ratelimit -= step;
- wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
+ WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL));
wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
@@ -1338,35 +1441,28 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
struct dirty_throttle_control *mdtc,
- unsigned long start_time,
bool update_ratelimit)
{
struct bdi_writeback *wb = gdtc->wb;
unsigned long now = jiffies;
- unsigned long elapsed = now - wb->bw_time_stamp;
+ unsigned long elapsed;
unsigned long dirtied;
unsigned long written;
- lockdep_assert_held(&wb->list_lock);
+ spin_lock(&wb->list_lock);
/*
- * rate-limit, only update once every 200ms.
+ * Lockless checks for elapsed time are racy and delayed update after
+ * IO completion doesn't do it at all (to make sure written pages are
+ * accounted reasonably quickly). Make sure elapsed >= 1 to avoid
+ * division errors.
*/
- if (elapsed < BANDWIDTH_INTERVAL)
- return;
-
+ elapsed = max(now - wb->bw_time_stamp, 1UL);
dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
- /*
- * Skip quiet periods when disk bandwidth is under-utilized.
- * (at least 1s idle time between two flusher runs)
- */
- if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
- goto snapshot;
-
if (update_ratelimit) {
- domain_update_bandwidth(gdtc, now);
+ domain_update_dirty_limit(gdtc, now);
wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
/*
@@ -1374,23 +1470,41 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
* compiler has no way to figure that out. Help it.
*/
if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
- domain_update_bandwidth(mdtc, now);
+ domain_update_dirty_limit(mdtc, now);
wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
}
}
wb_update_write_bandwidth(wb, elapsed, written);
-snapshot:
wb->dirtied_stamp = dirtied;
wb->written_stamp = written;
- wb->bw_time_stamp = now;
+ WRITE_ONCE(wb->bw_time_stamp, now);
+ spin_unlock(&wb->list_lock);
}
-void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
+void wb_update_bandwidth(struct bdi_writeback *wb)
{
struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
- __wb_update_bandwidth(&gdtc, NULL, start_time, false);
+ __wb_update_bandwidth(&gdtc, NULL, false);
+}
+
+/* Interval after which we consider wb idle and don't estimate bandwidth */
+#define WB_BANDWIDTH_IDLE_JIF (HZ)
+
+static void wb_bandwidth_estimate_start(struct bdi_writeback *wb)
+{
+ unsigned long now = jiffies;
+ unsigned long elapsed = now - READ_ONCE(wb->bw_time_stamp);
+
+ if (elapsed > WB_BANDWIDTH_IDLE_JIF &&
+ !atomic_read(&wb->writeback_inodes)) {
+ spin_lock(&wb->list_lock);
+ wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED);
+ wb->written_stamp = wb_stat(wb, WB_WRITTEN);
+ WRITE_ONCE(wb->bw_time_stamp, now);
+ spin_unlock(&wb->list_lock);
+ }
}
/*
@@ -1413,7 +1527,7 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
static unsigned long wb_max_pause(struct bdi_writeback *wb,
unsigned long wb_dirty)
{
- unsigned long bw = wb->avg_write_bandwidth;
+ unsigned long bw = READ_ONCE(wb->avg_write_bandwidth);
unsigned long t;
/*
@@ -1435,8 +1549,8 @@ static long wb_min_pause(struct bdi_writeback *wb,
unsigned long dirty_ratelimit,
int *nr_dirtied_pause)
{
- long hi = ilog2(wb->avg_write_bandwidth);
- long lo = ilog2(wb->dirty_ratelimit);
+ long hi = ilog2(READ_ONCE(wb->avg_write_bandwidth));
+ long lo = ilog2(READ_ONCE(wb->dirty_ratelimit));
long t; /* target pause */
long pause; /* estimated next pause */
int pages; /* target nr_dirtied_pause */
@@ -1552,8 +1666,8 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
* If we're over `background_thresh' then the writeback threads are woken to
* perform some writeout.
*/
-static void balance_dirty_pages(struct bdi_writeback *wb,
- unsigned long pages_dirtied)
+static int balance_dirty_pages(struct bdi_writeback *wb,
+ unsigned long pages_dirtied, unsigned int flags)
{
struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
@@ -1573,6 +1687,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
struct backing_dev_info *bdi = wb->bdi;
bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
unsigned long start_time = jiffies;
+ int ret = 0;
for (;;) {
unsigned long now = jiffies;
@@ -1626,6 +1741,19 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
}
/*
+ * In laptop mode, we wait until hitting the higher threshold
+ * before starting background writeout, and then write out all
+ * the way down to the lower threshold. So slow writers cause
+ * minimal disk activity.
+ *
+ * In normal mode, we start background writeout at the lower
+ * background_thresh, to keep the amount of dirty memory low.
+ */
+ if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh &&
+ !writeback_in_progress(wb))
+ wb_start_background_writeback(wb);
+
+ /*
* Throttle it only when the background writeback cannot
* catch-up. This avoids (excessively) small writeouts
* when the wb limits are ramping up in case of !strictlimit.
@@ -1655,6 +1783,7 @@ free_running:
break;
}
+ /* Start writeback even when in laptop mode */
if (unlikely(!writeback_in_progress(wb)))
wb_start_background_writeback(wb);
@@ -1713,18 +1842,15 @@ free_running:
sdtc = mdtc;
}
- if (dirty_exceeded && !wb->dirty_exceeded)
- wb->dirty_exceeded = 1;
+ if (dirty_exceeded != wb->dirty_exceeded)
+ wb->dirty_exceeded = dirty_exceeded;
- if (time_is_before_jiffies(wb->bw_time_stamp +
- BANDWIDTH_INTERVAL)) {
- spin_lock(&wb->list_lock);
- __wb_update_bandwidth(gdtc, mdtc, start_time, true);
- spin_unlock(&wb->list_lock);
- }
+ if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
+ BANDWIDTH_INTERVAL))
+ __wb_update_bandwidth(gdtc, mdtc, true);
/* throttle according to the chosen dtc */
- dirty_ratelimit = wb->dirty_ratelimit;
+ dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
RATELIMIT_CALC_SHIFT;
max_pause = wb_max_pause(wb, sdtc->wb_dirty);
@@ -1790,6 +1916,10 @@ pause:
period,
pause,
start_time);
+ if (flags & BDP_ASYNC) {
+ ret = -EAGAIN;
+ break;
+ }
__set_current_state(TASK_KILLABLE);
wb->dirty_sleep = now;
io_schedule_timeout(pause);
@@ -1806,7 +1936,7 @@ pause:
break;
/*
- * In the case of an unresponding NFS server and the NFS dirty
+ * In the case of an unresponsive NFS server and the NFS dirty
* pages exceeds dirty_thresh, give the other good wb's a pipe
* to go through, so that tasks on them still remain responsive.
*
@@ -1821,26 +1951,7 @@ pause:
if (fatal_signal_pending(current))
break;
}
-
- if (!dirty_exceeded && wb->dirty_exceeded)
- wb->dirty_exceeded = 0;
-
- if (writeback_in_progress(wb))
- return;
-
- /*
- * In laptop mode, we wait until hitting the higher threshold before
- * starting background writeout, and then write out all the way down
- * to the lower threshold. So slow writers cause minimal disk activity.
- *
- * In normal mode, we start background writeout at the lower
- * background_thresh, to keep the amount of dirty memory low.
- */
- if (laptop_mode)
- return;
-
- if (nr_reclaimable > gdtc->bg_thresh)
- wb_start_background_writeback(wb);
+ return ret;
}
static DEFINE_PER_CPU(int, bdp_ratelimits);
@@ -1862,28 +1973,34 @@ static DEFINE_PER_CPU(int, bdp_ratelimits);
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
/**
- * balance_dirty_pages_ratelimited - balance dirty memory state
- * @mapping: address_space which was dirtied
+ * balance_dirty_pages_ratelimited_flags - Balance dirty memory state.
+ * @mapping: address_space which was dirtied.
+ * @flags: BDP flags.
*
* Processes which are dirtying memory should call in here once for each page
* which was newly dirtied. The function will periodically check the system's
* dirty state and will initiate writeback if needed.
*
- * On really big machines, get_writeback_state is expensive, so try to avoid
- * calling it too often (ratelimiting). But once we're over the dirty memory
- * limit we decrease the ratelimiting by a lot, to prevent individual processes
- * from overshooting the limit by (ratelimit_pages) each.
+ * See balance_dirty_pages_ratelimited() for details.
+ *
+ * Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to
+ * indicate that memory is out of balance and the caller must wait
+ * for I/O to complete. Otherwise, it will return 0 to indicate
+ * that either memory was already in balance, or it was able to sleep
+ * until the amount of dirty memory returned to balance.
*/
-void balance_dirty_pages_ratelimited(struct address_space *mapping)
+int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
+ unsigned int flags)
{
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
struct bdi_writeback *wb = NULL;
int ratelimit;
+ int ret = 0;
int *p;
if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
- return;
+ return ret;
if (inode_cgwb_enabled(inode))
wb = wb_get_create_current(bdi, GFP_KERNEL);
@@ -1923,9 +2040,28 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
preempt_enable();
if (unlikely(current->nr_dirtied >= ratelimit))
- balance_dirty_pages(wb, current->nr_dirtied);
+ ret = balance_dirty_pages(wb, current->nr_dirtied, flags);
wb_put(wb);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(balance_dirty_pages_ratelimited_flags);
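A hedged sketch of the calling pattern the new return value enables. BDP_ASYNC is the flag this series introduces; the surrounding write-path shape is purely illustrative.

/* Illustrative buffered-write fragment: avoid sleeping when nowait was requested. */
static int example_dirty_and_balance(struct address_space *mapping, bool nowait)
{
        unsigned int bdp_flags = nowait ? BDP_ASYNC : 0;
        int ret;

        /* ... copy data into the page cache and mark the folio dirty ... */

        ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
        if (ret == -EAGAIN)             /* only possible when BDP_ASYNC is set */
                return -EAGAIN;         /* retry later from a context that may sleep */
        return 0;
}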
+
+/**
+ * balance_dirty_pages_ratelimited - balance dirty memory state.
+ * @mapping: address_space which was dirtied.
+ *
+ * Processes which are dirtying memory should call in here once for each page
+ * which was newly dirtied. The function will periodically check the system's
+ * dirty state and will initiate writeback if needed.
+ *
+ * Once we're over the dirty memory limit we decrease the ratelimiting
+ * by a lot, to prevent individual processes from overshooting the limit
+ * by (ratelimit_pages) each.
+ */
+void balance_dirty_pages_ratelimited(struct address_space *mapping)
+{
+ balance_dirty_pages_ratelimited_flags(mapping, 0);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
@@ -1945,6 +2081,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
struct dirty_throttle_control * const gdtc = &gdtc_stor;
struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
&mdtc_stor : NULL;
+ unsigned long reclaimable;
+ unsigned long thresh;
/*
* Similar to balance_dirty_pages() but ignores pages being written
@@ -1957,8 +2095,13 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
if (gdtc->dirty > gdtc->bg_thresh)
return true;
- if (wb_stat(wb, WB_RECLAIMABLE) >
- wb_calc_thresh(gdtc->wb, gdtc->bg_thresh))
+ thresh = wb_calc_thresh(gdtc->wb, gdtc->bg_thresh);
+ if (thresh < 2 * wb_stat_error())
+ reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
+ else
+ reclaimable = wb_stat(wb, WB_RECLAIMABLE);
+
+ if (reclaimable > thresh)
return true;
if (mdtc) {
@@ -1972,18 +2115,24 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
if (mdtc->dirty > mdtc->bg_thresh)
return true;
- if (wb_stat(wb, WB_RECLAIMABLE) >
- wb_calc_thresh(mdtc->wb, mdtc->bg_thresh))
+ thresh = wb_calc_thresh(mdtc->wb, mdtc->bg_thresh);
+ if (thresh < 2 * wb_stat_error())
+ reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
+ else
+ reclaimable = wb_stat(wb, WB_RECLAIMABLE);
+
+ if (reclaimable > thresh)
return true;
}
return false;
}
+#ifdef CONFIG_SYSCTL
/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
-int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
+static int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
unsigned int old_interval = dirty_writeback_interval;
@@ -2004,8 +2153,8 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
return ret;
}
+#endif
-#ifdef CONFIG_BLOCK
void laptop_mode_timer_fn(struct timer_list *t)
{
struct backing_dev_info *backing_dev_info =
@@ -2040,13 +2189,10 @@ void laptop_sync_completion(void)
rcu_read_unlock();
}
-#endif
/*
* If ratelimit_pages is too high then we can get into dirty-data overload
* if a large number of processes all perform writes at the same time.
- * If it is too low then SMP machines will call the (expensive)
- * get_writeback_state too often.
*
* Here we set ratelimit_pages to a level which ensures that when all CPUs are
* dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
@@ -2072,6 +2218,83 @@ static int page_writeback_cpu_online(unsigned int cpu)
return 0;
}
+#ifdef CONFIG_SYSCTL
+
+/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
+static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
+
+static struct ctl_table vm_page_writeback_sysctls[] = {
+ {
+ .procname = "dirty_background_ratio",
+ .data = &dirty_background_ratio,
+ .maxlen = sizeof(dirty_background_ratio),
+ .mode = 0644,
+ .proc_handler = dirty_background_ratio_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
+ {
+ .procname = "dirty_background_bytes",
+ .data = &dirty_background_bytes,
+ .maxlen = sizeof(dirty_background_bytes),
+ .mode = 0644,
+ .proc_handler = dirty_background_bytes_handler,
+ .extra1 = SYSCTL_LONG_ONE,
+ },
+ {
+ .procname = "dirty_ratio",
+ .data = &vm_dirty_ratio,
+ .maxlen = sizeof(vm_dirty_ratio),
+ .mode = 0644,
+ .proc_handler = dirty_ratio_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
+ {
+ .procname = "dirty_bytes",
+ .data = &vm_dirty_bytes,
+ .maxlen = sizeof(vm_dirty_bytes),
+ .mode = 0644,
+ .proc_handler = dirty_bytes_handler,
+ .extra1 = (void *)&dirty_bytes_min,
+ },
+ {
+ .procname = "dirty_writeback_centisecs",
+ .data = &dirty_writeback_interval,
+ .maxlen = sizeof(dirty_writeback_interval),
+ .mode = 0644,
+ .proc_handler = dirty_writeback_centisecs_handler,
+ },
+ {
+ .procname = "dirty_expire_centisecs",
+ .data = &dirty_expire_interval,
+ .maxlen = sizeof(dirty_expire_interval),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+#ifdef CONFIG_HIGHMEM
+ {
+ .procname = "highmem_is_dirtyable",
+ .data = &vm_highmem_is_dirtyable,
+ .maxlen = sizeof(vm_highmem_is_dirtyable),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif
+ {
+ .procname = "laptop_mode",
+ .data = &laptop_mode,
+ .maxlen = sizeof(laptop_mode),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {}
+};
+#endif
+
/*
* Called early on to tune the page writeback dirty limits.
*
@@ -2096,6 +2319,9 @@ void __init page_writeback_init(void)
page_writeback_cpu_online, NULL);
cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL,
page_writeback_cpu_online);
+#ifdef CONFIG_SYSCTL
+ register_sysctl_init("vm", vm_page_writeback_sysctls);
+#endif
}
/**
@@ -2172,15 +2398,15 @@ int write_cache_pages(struct address_space *mapping,
int ret = 0;
int done = 0;
int error;
- struct pagevec pvec;
- int nr_pages;
+ struct folio_batch fbatch;
+ int nr_folios;
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
int range_whole = 0;
xa_mark_t tag;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* prev offset */
end = -1;
@@ -2200,50 +2426,53 @@ int write_cache_pages(struct address_space *mapping,
while (!done && (index <= end)) {
int i;
- nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
- tag);
- if (nr_pages == 0)
+ nr_folios = filemap_get_folios_tag(mapping, &index, end,
+ tag, &fbatch);
+
+ if (nr_folios == 0)
break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < nr_folios; i++) {
+ struct folio *folio = fbatch.folios[i];
+ unsigned long nr;
- done_index = page->index;
+ done_index = folio->index;
- lock_page(page);
+ folio_lock(folio);
/*
* Page truncated or invalidated. We can freely skip it
* then, even for data integrity operations: the page
* has disappeared concurrently, so there could be no
- * real expectation of this data interity operation
+ * real expectation of this data integrity operation
* even if there is now a new, dirty page at the same
* pagecache address.
*/
- if (unlikely(page->mapping != mapping)) {
+ if (unlikely(folio->mapping != mapping)) {
continue_unlock:
- unlock_page(page);
+ folio_unlock(folio);
continue;
}
- if (!PageDirty(page)) {
+ if (!folio_test_dirty(folio)) {
/* someone wrote it for us */
goto continue_unlock;
}
- if (PageWriteback(page)) {
+ if (folio_test_writeback(folio)) {
if (wbc->sync_mode != WB_SYNC_NONE)
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
else
goto continue_unlock;
}
- BUG_ON(PageWriteback(page));
- if (!clear_page_dirty_for_io(page))
+ BUG_ON(folio_test_writeback(folio));
+ if (!folio_clear_dirty_for_io(folio))
goto continue_unlock;
trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
- error = (*writepage)(page, wbc, data);
+ error = writepage(folio, wbc, data);
+ nr = folio_nr_pages(folio);
if (unlikely(error)) {
/*
* Handle errors according to the type of
@@ -2258,11 +2487,11 @@ continue_unlock:
* the first error.
*/
if (error == AOP_WRITEPAGE_ACTIVATE) {
- unlock_page(page);
+ folio_unlock(folio);
error = 0;
} else if (wbc->sync_mode != WB_SYNC_ALL) {
ret = error;
- done_index = page->index + 1;
+ done_index = folio->index + nr;
done = 1;
break;
}
@@ -2276,13 +2505,14 @@ continue_unlock:
* keep going until we have written all the pages
* we tagged for writeback prior to entering this loop.
*/
- if (--wbc->nr_to_write <= 0 &&
+ wbc->nr_to_write -= nr;
+ if (wbc->nr_to_write <= 0 &&
wbc->sync_mode == WB_SYNC_NONE) {
done = 1;
break;
}
}
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
cond_resched();
}
@@ -2300,292 +2530,281 @@ continue_unlock:
}
EXPORT_SYMBOL(write_cache_pages);
-/*
- * Function used by generic_writepages to call the real writepage
- * function and set the mapping flags on error
- */
-static int __writepage(struct page *page, struct writeback_control *wbc,
- void *data)
+static int writepage_cb(struct folio *folio, struct writeback_control *wbc,
+ void *data)
{
struct address_space *mapping = data;
- int ret = mapping->a_ops->writepage(page, wbc);
+ int ret = mapping->a_ops->writepage(&folio->page, wbc);
mapping_set_error(mapping, ret);
return ret;
}
-/**
- * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- *
- * This is a library function, which implements the writepages()
- * address_space_operation.
- *
- * Return: %0 on success, negative error code otherwise
- */
-int generic_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct blk_plug plug;
- int ret;
-
- /* deal with chardevs and other special file */
- if (!mapping->a_ops->writepage)
- return 0;
-
- blk_start_plug(&plug);
- ret = write_cache_pages(mapping, wbc, __writepage, mapping);
- blk_finish_plug(&plug);
- return ret;
-}
-
-EXPORT_SYMBOL(generic_writepages);
-
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
+ struct bdi_writeback *wb;
if (wbc->nr_to_write <= 0)
return 0;
+ wb = inode_to_wb_wbc(mapping->host, wbc);
+ wb_bandwidth_estimate_start(wb);
while (1) {
- if (mapping->a_ops->writepages)
+ if (mapping->a_ops->writepages) {
ret = mapping->a_ops->writepages(mapping, wbc);
- else
- ret = generic_writepages(mapping, wbc);
- if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL))
- break;
- cond_resched();
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- }
- return ret;
-}
-
-/**
- * write_one_page - write out a single page and wait on I/O
- * @page: the page to write
- *
- * The page must be locked by the caller and will be unlocked upon return.
- *
- * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
- * function returns.
- *
- * Return: %0 on success, negative error code otherwise
- */
-int write_one_page(struct page *page)
-{
- struct address_space *mapping = page->mapping;
- int ret = 0;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = 1,
- };
+ } else if (mapping->a_ops->writepage) {
+ struct blk_plug plug;
- BUG_ON(!PageLocked(page));
-
- wait_on_page_writeback(page);
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, writepage_cb,
+ mapping);
+ blk_finish_plug(&plug);
+ } else {
+ /* deal with chardevs and other special files */
+ ret = 0;
+ }
+ if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL)
+ break;
- if (clear_page_dirty_for_io(page)) {
- get_page(page);
- ret = mapping->a_ops->writepage(page, &wbc);
- if (ret == 0)
- wait_on_page_writeback(page);
- put_page(page);
- } else {
- unlock_page(page);
+ /*
+ * Lacking an allocation context or the locality or writeback
+ * state of any of the inode's pages, throttle based on
+ * writeback activity on the local node. It's as good a
+ * guess as any.
+ */
+ reclaim_throttle(NODE_DATA(numa_node_id()),
+ VMSCAN_THROTTLE_WRITEBACK);
}
-
- if (!ret)
- ret = filemap_check_errors(mapping);
+ /*
+ * Usually few pages are written by now from those we've just submitted
+ * but if there's constant writeback being submitted, this makes sure
+ * writeback bandwidth is updated once in a while.
+ */
+ if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
+ BANDWIDTH_INTERVAL))
+ wb_update_bandwidth(wb);
return ret;
}
-EXPORT_SYMBOL(write_one_page);
/*
* For address_spaces which do not use buffers nor write back.
*/
-int __set_page_dirty_no_writeback(struct page *page)
+bool noop_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- if (!PageDirty(page))
- return !TestSetPageDirty(page);
- return 0;
+ if (!folio_test_dirty(folio))
+ return !folio_test_set_dirty(folio);
+ return false;
}
+EXPORT_SYMBOL(noop_dirty_folio);
/*
* Helper function for set_page_dirty family.
*
- * Caller must hold lock_page_memcg().
+ * Caller must hold folio_memcg_lock().
*
* NOTE: This relies on being atomic wrt interrupts.
*/
-void account_page_dirtied(struct page *page, struct address_space *mapping)
+static void folio_account_dirtied(struct folio *folio,
+ struct address_space *mapping)
{
struct inode *inode = mapping->host;
- trace_writeback_dirty_page(page, mapping);
+ trace_writeback_dirty_folio(folio, mapping);
if (mapping_can_writeback(mapping)) {
struct bdi_writeback *wb;
+ long nr = folio_nr_pages(folio);
- inode_attach_wb(inode, page);
+ inode_attach_wb(inode, folio);
wb = inode_to_wb(inode);
- __inc_lruvec_page_state(page, NR_FILE_DIRTY);
- __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
- __inc_node_page_state(page, NR_DIRTIED);
- inc_wb_stat(wb, WB_RECLAIMABLE);
- inc_wb_stat(wb, WB_DIRTIED);
- task_io_account_write(PAGE_SIZE);
- current->nr_dirtied++;
- this_cpu_inc(bdp_ratelimits);
+ __lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr);
+ __zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
+ __node_stat_mod_folio(folio, NR_DIRTIED, nr);
+ wb_stat_mod(wb, WB_RECLAIMABLE, nr);
+ wb_stat_mod(wb, WB_DIRTIED, nr);
+ task_io_account_write(nr * PAGE_SIZE);
+ current->nr_dirtied += nr;
+ __this_cpu_add(bdp_ratelimits, nr);
- mem_cgroup_track_foreign_dirty(page, wb);
+ mem_cgroup_track_foreign_dirty(folio, wb);
}
}
/*
* Helper function for deaccounting dirty page without writeback.
*
- * Caller must hold lock_page_memcg().
+ * Caller must hold folio_memcg_lock().
*/
-void account_page_cleaned(struct page *page, struct address_space *mapping,
- struct bdi_writeback *wb)
+void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
{
- if (mapping_can_writeback(mapping)) {
- dec_lruvec_page_state(page, NR_FILE_DIRTY);
- dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
- dec_wb_stat(wb, WB_RECLAIMABLE);
- task_io_account_cancelled_write(PAGE_SIZE);
- }
+ long nr = folio_nr_pages(folio);
+
+ lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
+ zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
+ wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
+ task_io_account_cancelled_write(nr * PAGE_SIZE);
}
/*
- * For address_spaces which do not use buffers. Just tag the page as dirty in
- * the xarray.
+ * Mark the folio dirty, and set it dirty in the page cache, and mark
+ * the inode dirty.
*
- * This is also used when a single buffer is being dirtied: we want to set the
- * page dirty in that case, but not all the buffers. This is a "bottom-up"
- * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
+ * If warn is true, then emit a warning if the folio is not uptodate and has
+ * not been truncated.
*
- * The caller must ensure this doesn't race with truncation. Most will simply
- * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
- * the pte lock held, which also locks out truncation.
+ * The caller must hold folio_memcg_lock(). Most callers have the folio
+ * locked. A few have the folio blocked from truncation through other
+ * means (eg zap_vma_pages() has it mapped and is holding the page table
+ * lock). This can also be called from mark_buffer_dirty(), which I
+ * cannot prove is always protected against truncate.
*/
-int __set_page_dirty_nobuffers(struct page *page)
+void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
+ int warn)
{
- lock_page_memcg(page);
- if (!TestSetPageDirty(page)) {
- struct address_space *mapping = page_mapping(page);
- unsigned long flags;
+ unsigned long flags;
- if (!mapping) {
- unlock_page_memcg(page);
- return 1;
- }
+ xa_lock_irqsave(&mapping->i_pages, flags);
+ if (folio->mapping) { /* Race with truncate? */
+ WARN_ON_ONCE(warn && !folio_test_uptodate(folio));
+ folio_account_dirtied(folio, mapping);
+ __xa_set_mark(&mapping->i_pages, folio_index(folio),
+ PAGECACHE_TAG_DIRTY);
+ }
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
+}
- xa_lock_irqsave(&mapping->i_pages, flags);
- BUG_ON(page_mapping(page) != mapping);
- WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
- account_page_dirtied(page, mapping);
- __xa_set_mark(&mapping->i_pages, page_index(page),
- PAGECACHE_TAG_DIRTY);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
- unlock_page_memcg(page);
+/**
+ * filemap_dirty_folio - Mark a folio dirty for filesystems which do not use buffer_heads.
+ * @mapping: Address space this folio belongs to.
+ * @folio: Folio to be marked as dirty.
+ *
+ * Filesystems which do not use buffer heads should call this function
+ * from their set_page_dirty address space operation. It ignores the
+ * contents of folio_get_private(), so if the filesystem marks individual
+ * blocks as dirty, the filesystem should handle that itself.
+ *
+ * This is also sometimes used by filesystems which use buffer_heads when
+ * a single buffer is being dirtied: we want to set the folio dirty in
+ * that case, but not all the buffers. This is a "bottom-up" dirtying,
+ * whereas block_dirty_folio() is a "top-down" dirtying.
+ *
+ * The caller must ensure this doesn't race with truncation. Most will
+ * simply hold the folio lock, but e.g. zap_pte_range() calls with the
+ * folio mapped and the pte lock held, which also locks out truncation.
+ */
+bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+ folio_memcg_lock(folio);
+ if (folio_test_set_dirty(folio)) {
+ folio_memcg_unlock(folio);
+ return false;
+ }
- if (mapping->host) {
- /* !PageAnon && !swapper_space */
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- }
- return 1;
+ __folio_mark_dirty(folio, mapping, !folio_test_private(folio));
+ folio_memcg_unlock(folio);
+
+ if (mapping->host) {
+ /* !PageAnon && !swapper_space */
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
- unlock_page_memcg(page);
- return 0;
+ return true;
}
-EXPORT_SYMBOL(__set_page_dirty_nobuffers);
+EXPORT_SYMBOL(filemap_dirty_folio);
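For context, a minimal sketch of the intended consumer: a filesystem without buffer heads pointing its dirty_folio operation at the helper exported above. The ops table and the writepages callback are illustrative, not taken from any particular filesystem.

/* Illustrative address_space_operations for a buffer-head-less filesystem. */
static const struct address_space_operations example_aops = {
        .dirty_folio    = filemap_dirty_folio,  /* set folio dirty + xarray dirty tag */
        .writepages     = example_writepages,   /* assumed to be defined elsewhere */
        /* ... readahead, invalidate_folio, etc. ... */
};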
-/*
- * Call this whenever redirtying a page, to de-account the dirty counters
- * (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written
- * counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to
- * systematic errors in balanced_dirty_ratelimit and the dirty pages position
- * control.
+/**
+ * folio_account_redirty - Manually account for redirtying a page.
+ * @folio: The folio which is being redirtied.
+ *
+ * Most filesystems should call folio_redirty_for_writepage() instead
+ * of this function. If your filesystem is doing writeback outside the
+ * context of a writeback_control(), it can call this when redirtying
+ * a folio, to de-account the dirty counters (NR_DIRTIED, WB_DIRTIED,
+ * tsk->nr_dirtied), so that they match the written counters (NR_WRITTEN,
+ * WB_WRITTEN) in long term. The mismatches will lead to systematic errors
+ * in balanced_dirty_ratelimit and the dirty pages position control.
*/
-void account_page_redirty(struct page *page)
+void folio_account_redirty(struct folio *folio)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
if (mapping && mapping_can_writeback(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
struct wb_lock_cookie cookie = {};
+ long nr = folio_nr_pages(folio);
wb = unlocked_inode_to_wb_begin(inode, &cookie);
- current->nr_dirtied--;
- dec_node_page_state(page, NR_DIRTIED);
- dec_wb_stat(wb, WB_DIRTIED);
+ current->nr_dirtied -= nr;
+ node_stat_mod_folio(folio, NR_DIRTIED, -nr);
+ wb_stat_mod(wb, WB_DIRTIED, -nr);
unlocked_inode_to_wb_end(inode, &cookie);
}
}
-EXPORT_SYMBOL(account_page_redirty);
+EXPORT_SYMBOL(folio_account_redirty);
-/*
- * When a writepage implementation decides that it doesn't want to write this
- * page for some reason, it should redirty the locked page via
- * redirty_page_for_writepage() and it should then unlock the page and return 0
+/**
+ * folio_redirty_for_writepage - Decline to write a dirty folio.
+ * @wbc: The writeback control.
+ * @folio: The folio.
+ *
+ * When a writepage implementation decides that it doesn't want to write
+ * @folio for some reason, it should call this function, unlock @folio and
+ * return 0.
+ *
+ * Return: True if we redirtied the folio. False if someone else dirtied
+ * it first.
*/
-int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
+bool folio_redirty_for_writepage(struct writeback_control *wbc,
+ struct folio *folio)
{
- int ret;
+ bool ret;
+ long nr = folio_nr_pages(folio);
+
+ wbc->pages_skipped += nr;
+ ret = filemap_dirty_folio(folio->mapping, folio);
+ folio_account_redirty(folio);
- wbc->pages_skipped++;
- ret = __set_page_dirty_nobuffers(page);
- account_page_redirty(page);
return ret;
}
-EXPORT_SYMBOL(redirty_page_for_writepage);
+EXPORT_SYMBOL(folio_redirty_for_writepage);
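A hedged sketch of the pattern the kerneldoc above describes: a ->writepage implementation deciding not to write and re-dirtying the folio instead. The congestion predicate is hypothetical.

/* Illustrative ->writepage that declines to write under (made-up) congestion. */
static int example_writepage(struct page *page, struct writeback_control *wbc)
{
        struct folio *folio = page_folio(page);

        if (example_fs_is_congested()) {        /* hypothetical back-off condition */
                folio_redirty_for_writepage(wbc, folio);
                folio_unlock(folio);
                return 0;                       /* as documented: redirty, unlock, return 0 */
        }

        /* ... otherwise submit the folio for I/O ... */
        return 0;
}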
-/*
- * Dirty a page.
+/**
+ * folio_mark_dirty - Mark a folio as being modified.
+ * @folio: The folio.
*
- * For pages with a mapping this should be done under the page lock
- * for the benefit of asynchronous memory errors who prefer a consistent
- * dirty state. This rule can be broken in some special cases,
- * but should be better not to.
+ * The folio may not be truncated while this function is running.
+ * Holding the folio lock is sufficient to prevent truncation, but some
+ * callers cannot acquire a sleeping lock. These callers instead hold
+ * the page table lock for a page table which contains at least one page
+ * in this folio. Truncation will block on the page table lock as it
+ * unmaps pages before removing the folio from its mapping.
*
- * If the mapping doesn't provide a set_page_dirty a_op, then
- * just fall through and assume that it wants buffer_heads.
+ * Return: True if the folio was newly dirtied, false if it was already dirty.
*/
-int set_page_dirty(struct page *page)
+bool folio_mark_dirty(struct folio *folio)
{
- struct address_space *mapping = page_mapping(page);
+ struct address_space *mapping = folio_mapping(folio);
- page = compound_head(page);
if (likely(mapping)) {
- int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
/*
- * readahead/lru_deactivate_page could remain
- * PG_readahead/PG_reclaim due to race with end_page_writeback
- * About readahead, if the page is written, the flags would be
+ * readahead/folio_deactivate could remain
+ * PG_readahead/PG_reclaim due to race with folio_end_writeback
+ * About readahead, if the folio is written, the flags would be
* reset. So no problem.
- * About lru_deactivate_page, if the page is redirty, the flag
- * will be reset. So no problem. but if the page is used by readahead
- * it will confuse readahead and make it restart the size rampup
- * process. But it's a trivial problem.
+ * About folio_deactivate, if the folio is redirtied,
+ * the flag will be reset. So no problem. but if the
+ * folio is used by readahead it will confuse readahead
+ * and make it restart the size rampup process. But it's
+ * a trivial problem.
*/
- if (PageReclaim(page))
- ClearPageReclaim(page);
-#ifdef CONFIG_BLOCK
- if (!spd)
- spd = __set_page_dirty_buffers;
-#endif
- return (*spd)(page);
- }
- if (!PageDirty(page)) {
- if (!TestSetPageDirty(page))
- return 1;
+ if (folio_test_reclaim(folio))
+ folio_clear_reclaim(folio);
+ return mapping->a_ops->dirty_folio(mapping, folio);
}
- return 0;
+
+ return noop_dirty_folio(mapping, folio);
}
-EXPORT_SYMBOL(set_page_dirty);
+EXPORT_SYMBOL(folio_mark_dirty);
/*
* set_page_dirty() is racy if the caller has no reference against
@@ -2621,49 +2840,49 @@ EXPORT_SYMBOL(set_page_dirty_lock);
* page without actually doing it through the VM. Can you say "ext3 is
* horribly ugly"? Thought you could.
*/
-void __cancel_dirty_page(struct page *page)
+void __folio_cancel_dirty(struct folio *folio)
{
- struct address_space *mapping = page_mapping(page);
+ struct address_space *mapping = folio_mapping(folio);
if (mapping_can_writeback(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
struct wb_lock_cookie cookie = {};
- lock_page_memcg(page);
+ folio_memcg_lock(folio);
wb = unlocked_inode_to_wb_begin(inode, &cookie);
- if (TestClearPageDirty(page))
- account_page_cleaned(page, mapping, wb);
+ if (folio_test_clear_dirty(folio))
+ folio_account_cleaned(folio, wb);
unlocked_inode_to_wb_end(inode, &cookie);
- unlock_page_memcg(page);
+ folio_memcg_unlock(folio);
} else {
- ClearPageDirty(page);
+ folio_clear_dirty(folio);
}
}
-EXPORT_SYMBOL(__cancel_dirty_page);
+EXPORT_SYMBOL(__folio_cancel_dirty);
/*
- * Clear a page's dirty flag, while caring for dirty memory accounting.
- * Returns true if the page was previously dirty.
- *
- * This is for preparing to put the page under writeout. We leave the page
- * tagged as dirty in the xarray so that a concurrent write-for-sync
- * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
- * implementation will run either set_page_writeback() or set_page_dirty(),
- * at which stage we bring the page's dirty flag and xarray dirty tag
- * back into sync.
- *
- * This incoherency between the page's dirty flag and xarray tag is
- * unfortunate, but it only exists while the page is locked.
+ * Clear a folio's dirty flag, while caring for dirty memory accounting.
+ * Returns true if the folio was previously dirty.
+ *
+ * This is for preparing to put the folio under writeout. We leave
+ * the folio tagged as dirty in the xarray so that a concurrent
+ * write-for-sync can discover it via a PAGECACHE_TAG_DIRTY walk.
+ * The ->writepage implementation will run either folio_start_writeback()
+ * or folio_mark_dirty(), at which stage we bring the folio's dirty flag
+ * and xarray dirty tag back into sync.
+ *
+ * This incoherency between the folio's dirty flag and xarray tag is
+ * unfortunate, but it only exists while the folio is locked.
*/
-int clear_page_dirty_for_io(struct page *page)
+bool folio_clear_dirty_for_io(struct folio *folio)
{
- struct address_space *mapping = page_mapping(page);
- int ret = 0;
+ struct address_space *mapping = folio_mapping(folio);
+ bool ret = false;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (mapping && mapping_can_writeback(mapping)) {
struct inode *inode = mapping->host;
@@ -2676,73 +2895,97 @@ int clear_page_dirty_for_io(struct page *page)
* We use this sequence to make sure that
* (a) we account for dirty stats properly
* (b) we tell the low-level filesystem to
- * mark the whole page dirty if it was
+ * mark the whole folio dirty if it was
* dirty in a pagetable. Only to then
- * (c) clean the page again and return 1 to
+ * (c) clean the folio again and return 1 to
* cause the writeback.
*
* This way we avoid all nasty races with the
* dirty bit in multiple places and clearing
* them concurrently from different threads.
*
- * Note! Normally the "set_page_dirty(page)"
+ * Note! Normally the "folio_mark_dirty(folio)"
* has no effect on the actual dirty bit - since
* that will already usually be set. But we
* need the side effects, and it can help us
* avoid races.
*
- * We basically use the page "master dirty bit"
+ * We basically use the folio "master dirty bit"
* as a serialization point for all the different
* threads doing their things.
*/
- if (page_mkclean(page))
- set_page_dirty(page);
+ if (folio_mkclean(folio))
+ folio_mark_dirty(folio);
/*
* We carefully synchronise fault handlers against
- * installing a dirty pte and marking the page dirty
+ * installing a dirty pte and marking the folio dirty
* at this point. We do this by having them hold the
- * page lock while dirtying the page, and pages are
+ * page lock while dirtying the folio, and folios are
* always locked coming in here, so we get the desired
* exclusion.
*/
wb = unlocked_inode_to_wb_begin(inode, &cookie);
- if (TestClearPageDirty(page)) {
- dec_lruvec_page_state(page, NR_FILE_DIRTY);
- dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
- dec_wb_stat(wb, WB_RECLAIMABLE);
- ret = 1;
+ if (folio_test_clear_dirty(folio)) {
+ long nr = folio_nr_pages(folio);
+ lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
+ zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
+ wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
+ ret = true;
}
unlocked_inode_to_wb_end(inode, &cookie);
return ret;
}
- return TestClearPageDirty(page);
+ return folio_test_clear_dirty(folio);
}
-EXPORT_SYMBOL(clear_page_dirty_for_io);
+EXPORT_SYMBOL(folio_clear_dirty_for_io);
-int test_clear_page_writeback(struct page *page)
+static void wb_inode_writeback_start(struct bdi_writeback *wb)
{
- struct address_space *mapping = page_mapping(page);
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
- int ret;
+ atomic_inc(&wb->writeback_inodes);
+}
+
+static void wb_inode_writeback_end(struct bdi_writeback *wb)
+{
+ unsigned long flags;
+ atomic_dec(&wb->writeback_inodes);
+ /*
+ * Make sure estimate of writeback throughput gets updated after
+ * writeback completed. We delay the update by BANDWIDTH_INTERVAL
+ * (which is the interval other bandwidth updates use for batching) so
+ * that if multiple inodes end writeback at a similar time, they get
+ * batched into one bandwidth update.
+ */
+ spin_lock_irqsave(&wb->work_lock, flags);
+ if (test_bit(WB_registered, &wb->state))
+ queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
+ spin_unlock_irqrestore(&wb->work_lock, flags);
+}
+
+bool __folio_end_writeback(struct folio *folio)
+{
+ long nr = folio_nr_pages(folio);
+ struct address_space *mapping = folio_mapping(folio);
+ bool ret;
- memcg = lock_page_memcg(page);
- lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ folio_memcg_lock(folio);
if (mapping && mapping_use_writeback_tags(mapping)) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
xa_lock_irqsave(&mapping->i_pages, flags);
- ret = TestClearPageWriteback(page);
+ ret = folio_test_clear_writeback(folio);
if (ret) {
- __xa_clear_mark(&mapping->i_pages, page_index(page),
+ __xa_clear_mark(&mapping->i_pages, folio_index(folio),
PAGECACHE_TAG_WRITEBACK);
if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
struct bdi_writeback *wb = inode_to_wb(inode);
- dec_wb_stat(wb, WB_WRITEBACK);
- __wb_writeout_inc(wb);
+ wb_stat_mod(wb, WB_WRITEBACK, -nr);
+ __wb_writeout_add(wb, nr);
+ if (!mapping_tagged(mapping,
+ PAGECACHE_TAG_WRITEBACK))
+ wb_inode_writeback_end(wb);
}
}
@@ -2752,38 +2995,34 @@ int test_clear_page_writeback(struct page *page)
xa_unlock_irqrestore(&mapping->i_pages, flags);
} else {
- ret = TestClearPageWriteback(page);
+ ret = folio_test_clear_writeback(folio);
}
- /*
- * NOTE: Page might be free now! Writeback doesn't hold a page
- * reference on its own, it relies on truncation to wait for
- * the clearing of PG_writeback. The below can only access
- * page state that is static across allocation cycles.
- */
if (ret) {
- dec_lruvec_state(lruvec, NR_WRITEBACK);
- dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
- inc_node_page_state(page, NR_WRITTEN);
+ lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr);
+ zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
+ node_stat_mod_folio(folio, NR_WRITTEN, nr);
}
- __unlock_page_memcg(memcg);
+ folio_memcg_unlock(folio);
return ret;
}
-int __test_set_page_writeback(struct page *page, bool keep_write)
+bool __folio_start_writeback(struct folio *folio, bool keep_write)
{
- struct address_space *mapping = page_mapping(page);
- int ret, access_ret;
+ long nr = folio_nr_pages(folio);
+ struct address_space *mapping = folio_mapping(folio);
+ bool ret;
+ int access_ret;
- lock_page_memcg(page);
+ folio_memcg_lock(folio);
if (mapping && mapping_use_writeback_tags(mapping)) {
- XA_STATE(xas, &mapping->i_pages, page_index(page));
+ XA_STATE(xas, &mapping->i_pages, folio_index(folio));
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
xas_lock_irqsave(&xas, flags);
xas_load(&xas);
- ret = TestSetPageWriteback(page);
+ ret = folio_test_set_writeback(folio);
if (!ret) {
bool on_wblist;
@@ -2791,65 +3030,108 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
PAGECACHE_TAG_WRITEBACK);
xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
- if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT)
- inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
+ if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
+ struct bdi_writeback *wb = inode_to_wb(inode);
+
+ wb_stat_mod(wb, WB_WRITEBACK, nr);
+ if (!on_wblist)
+ wb_inode_writeback_start(wb);
+ }
/*
- * We can come through here when swapping anonymous
- * pages, so we don't necessarily have an inode to track
- * for sync.
+ * We can come through here when swapping
+ * anonymous folios, so we don't necessarily
+ * have an inode to track for sync.
*/
if (mapping->host && !on_wblist)
sb_mark_inode_writeback(mapping->host);
}
- if (!PageDirty(page))
+ if (!folio_test_dirty(folio))
xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
if (!keep_write)
xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
xas_unlock_irqrestore(&xas, flags);
} else {
- ret = TestSetPageWriteback(page);
+ ret = folio_test_set_writeback(folio);
}
if (!ret) {
- inc_lruvec_page_state(page, NR_WRITEBACK);
- inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
+ lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr);
+ zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
}
- unlock_page_memcg(page);
- access_ret = arch_make_page_accessible(page);
+ folio_memcg_unlock(folio);
+ access_ret = arch_make_folio_accessible(folio);
/*
* If writeback has been triggered on a page that cannot be made
* accessible, it is too late to recover here.
*/
- VM_BUG_ON_PAGE(access_ret != 0, page);
+ VM_BUG_ON_FOLIO(access_ret != 0, folio);
return ret;
+}
+EXPORT_SYMBOL(__folio_start_writeback);
+/**
+ * folio_wait_writeback - Wait for a folio to finish writeback.
+ * @folio: The folio to wait for.
+ *
+ * If the folio is currently being written back to storage, wait for the
+ * I/O to complete.
+ *
+ * Context: Sleeps. Must be called in process context and with
+ * no spinlocks held. Caller should hold a reference on the folio.
+ * If the folio is not locked, writeback may start again after writeback
+ * has finished.
+ */
+void folio_wait_writeback(struct folio *folio)
+{
+ while (folio_test_writeback(folio)) {
+ trace_folio_wait_writeback(folio, folio_mapping(folio));
+ folio_wait_bit(folio, PG_writeback);
+ }
}
-EXPORT_SYMBOL(__test_set_page_writeback);
+EXPORT_SYMBOL_GPL(folio_wait_writeback);
-/*
- * Wait for a page to complete writeback
+/**
+ * folio_wait_writeback_killable - Wait for a folio to finish writeback.
+ * @folio: The folio to wait for.
+ *
+ * If the folio is currently being written back to storage, wait for the
+ * I/O to complete or a fatal signal to arrive.
+ *
+ * Context: Sleeps. Must be called in process context and with
+ * no spinlocks held. Caller should hold a reference on the folio.
+ * If the folio is not locked, writeback may start again after writeback
+ * has finished.
+ * Return: 0 on success, -EINTR if we get a fatal signal while waiting.
*/
-void wait_on_page_writeback(struct page *page)
+int folio_wait_writeback_killable(struct folio *folio)
{
- if (PageWriteback(page)) {
- trace_wait_on_page_writeback(page, page_mapping(page));
- wait_on_page_bit(page, PG_writeback);
+ while (folio_test_writeback(folio)) {
+ trace_folio_wait_writeback(folio, folio_mapping(folio));
+ if (folio_wait_bit_killable(folio, PG_writeback))
+ return -EINTR;
}
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(wait_on_page_writeback);
+EXPORT_SYMBOL_GPL(folio_wait_writeback_killable);
/**
- * wait_for_stable_page() - wait for writeback to finish, if necessary.
- * @page: The page to wait on.
+ * folio_wait_stable() - wait for writeback to finish, if necessary.
+ * @folio: The folio to wait on.
+ *
+ * This function determines if the given folio is related to a backing
+ * device that requires folio contents to be held stable during writeback.
+ * If so, then it will wait for any pending writeback to complete.
*
- * This function determines if the given page is related to a backing device
- * that requires page contents to be held stable during writeback. If so, then
- * it will wait for any pending writeback to complete.
+ * Context: Sleeps. Must be called in process context and with
+ * no spinlocks held. Caller should hold a reference on the folio.
+ * If the folio is not locked, writeback may start again after writeback
+ * has finished.
*/
-void wait_for_stable_page(struct page *page)
+void folio_wait_stable(struct folio *folio)
{
- if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
- wait_on_page_writeback(page);
+ if (folio_inode(folio)->i_sb->s_iflags & SB_I_STABLE_WRITES)
+ folio_wait_writeback(folio);
}
-EXPORT_SYMBOL_GPL(wait_for_stable_page);
+EXPORT_SYMBOL_GPL(folio_wait_stable);
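A minimal sketch of where folio_wait_stable() typically sits in a write path; the helper is illustrative and assumes the caller already holds a reference on the folio.

/* Illustrative write-preparation step for a stable-pages backing device. */
static void example_prepare_folio_for_write(struct folio *folio)
{
        folio_lock(folio);              /* assumed: caller does not hold the lock yet */
        folio_wait_stable(folio);       /* no-op unless the device needs stable pages */
        /* ... the folio contents may now be modified and re-dirtied ... */
}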
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e0ff3a811ec5..7d3460c7a480 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -18,19 +18,14 @@
#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/highmem.h>
-#include <linux/swap.h>
#include <linux/interrupt.h>
-#include <linux/pagemap.h>
#include <linux/jiffies.h>
-#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
+#include <linux/kmsan.h>
#include <linux/module.h>
#include <linux/suspend.h>
-#include <linux/pagevec.h>
-#include <linux/blkdev.h>
-#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/topology.h>
@@ -39,48 +34,131 @@
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
-#include <linux/vmalloc.h>
#include <linux/vmstat.h>
-#include <linux/mempolicy.h>
-#include <linux/memremap.h>
-#include <linux/stop_machine.h>
-#include <linux/random.h>
-#include <linux/sort.h>
-#include <linux/pfn.h>
-#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
-#include <linux/page-isolation.h>
-#include <linux/debugobjects.h>
-#include <linux/kmemleak.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <trace/events/oom.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
+#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
-#include <linux/hugetlb.h>
-#include <linux/sched/rt.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
-#include <linux/kthread.h>
+#include <linux/page_table_check.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/lockdep.h>
-#include <linux/nmi.h>
#include <linux/psi.h>
-#include <linux/padata.h>
#include <linux/khugepaged.h>
-
-#include <asm/sections.h>
-#include <asm/tlbflush.h>
+#include <linux/delayacct.h>
#include <asm/div64.h>
#include "internal.h"
#include "shuffle.h"
#include "page_reporting.h"
+/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
+typedef int __bitwise fpi_t;
+
+/* No special request */
+#define FPI_NONE ((__force fpi_t)0)
+
+/*
+ * Skip free page reporting notification for the (possibly merged) page.
+ * This does not hinder free page reporting from grabbing the page,
+ * reporting it and marking it "reported" - it only skips notifying
+ * the free page reporting infrastructure about a newly freed page. For
+ * example, used when temporarily pulling a page from a freelist and
+ * putting it back unmodified.
+ */
+#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
+
+/*
+ * Place the (possibly merged) page to the tail of the freelist. Will ignore
+ * page shuffling (relevant code - e.g., memory onlining - is expected to
+ * shuffle the whole zone).
+ *
+ * Note: No code should rely on this flag for correctness - it's purely
+ * to allow for optimizations when handing back either fresh pages
+ * (memory onlining) or untouched pages (page isolation, free page
+ * reporting).
+ */
+#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
+
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
-#define MIN_PERCPU_PAGELIST_FRACTION (8)
+#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
+
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+/*
+ * On SMP, spin_trylock is sufficient protection.
+ * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
+ */
+#define pcp_trylock_prepare(flags) do { } while (0)
+#define pcp_trylock_finish(flag) do { } while (0)
+#else
+
+/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
+#define pcp_trylock_prepare(flags) local_irq_save(flags)
+#define pcp_trylock_finish(flags) local_irq_restore(flags)
+#endif
+
+/*
+ * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
+ * a migration causing the wrong PCP to be locked and remote memory being
+ * potentially allocated, pin the task to the CPU for the lookup+lock.
+ * preempt_disable is used on !RT because it is faster than migrate_disable.
+ * migrate_disable is used on RT because otherwise RT spinlock usage is
+ * interfered with and a high priority task cannot preempt the allocator.
+ */
+#ifndef CONFIG_PREEMPT_RT
+#define pcpu_task_pin() preempt_disable()
+#define pcpu_task_unpin() preempt_enable()
+#else
+#define pcpu_task_pin() migrate_disable()
+#define pcpu_task_unpin() migrate_enable()
+#endif
+
+/*
+ * Generic helper to look up and lock a per-cpu variable with an embedded
+ * spinlock. The return value should be used with the equivalent unlock helper.
+ */
+#define pcpu_spin_lock(type, member, ptr) \
+({ \
+ type *_ret; \
+ pcpu_task_pin(); \
+ _ret = this_cpu_ptr(ptr); \
+ spin_lock(&_ret->member); \
+ _ret; \
+})
+
+#define pcpu_spin_trylock(type, member, ptr) \
+({ \
+ type *_ret; \
+ pcpu_task_pin(); \
+ _ret = this_cpu_ptr(ptr); \
+ if (!spin_trylock(&_ret->member)) { \
+ pcpu_task_unpin(); \
+ _ret = NULL; \
+ } \
+ _ret; \
+})
+
+#define pcpu_spin_unlock(member, ptr) \
+({ \
+ spin_unlock(&ptr->member); \
+ pcpu_task_unpin(); \
+})
+
+/* struct per_cpu_pages specific helpers. */
+#define pcp_spin_lock(ptr) \
+ pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
+
+#define pcp_spin_trylock(ptr) \
+ pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)
+
+#define pcp_spin_unlock(ptr) \
+ pcpu_spin_unlock(lock, ptr)
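A hedged sketch of how the helpers above are meant to be combined on the allocation fast path; the list handling is elided and the fallback is simplified relative to the real allocator.

/* Illustrative fast path: take the local pcp lock or bail to the slow path. */
static struct page *example_pcp_alloc(struct zone *zone)
{
        struct per_cpu_pages *pcp;
        unsigned long __maybe_unused UP_flags;

        pcp_trylock_prepare(UP_flags);
        pcp = pcp_spin_trylock(zone->per_cpu_pageset);
        if (!pcp) {
                pcp_trylock_finish(UP_flags);
                return NULL;            /* caller falls back to the buddy path */
        }

        /* ... pop a page off the appropriate pcp list here ... */

        pcp_spin_unlock(pcp);
        pcp_trylock_finish(UP_flags);
        return NULL;                    /* placeholder; a real version returns the page */
}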
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
@@ -100,13 +178,7 @@ DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif
-/* work_structs for global per-cpu drains */
-struct pcpu_drain {
- struct zone *zone;
- struct work_struct work;
-};
static DEFINE_MUTEX(pcpu_drain_mutex);
-static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
@@ -130,62 +202,7 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
};
EXPORT_SYMBOL(node_states);
-atomic_long_t _totalram_pages __read_mostly;
-EXPORT_SYMBOL(_totalram_pages);
-unsigned long totalreserve_pages __read_mostly;
-unsigned long totalcma_pages __read_mostly;
-
-int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
-#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
-DEFINE_STATIC_KEY_TRUE(init_on_alloc);
-#else
-DEFINE_STATIC_KEY_FALSE(init_on_alloc);
-#endif
-EXPORT_SYMBOL(init_on_alloc);
-
-#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
-DEFINE_STATIC_KEY_TRUE(init_on_free);
-#else
-DEFINE_STATIC_KEY_FALSE(init_on_free);
-#endif
-EXPORT_SYMBOL(init_on_free);
-
-static int __init early_init_on_alloc(char *buf)
-{
- int ret;
- bool bool_result;
-
- ret = kstrtobool(buf, &bool_result);
- if (ret)
- return ret;
- if (bool_result && page_poisoning_enabled())
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n");
- if (bool_result)
- static_branch_enable(&init_on_alloc);
- else
- static_branch_disable(&init_on_alloc);
- return 0;
-}
-early_param("init_on_alloc", early_init_on_alloc);
-
-static int __init early_init_on_free(char *buf)
-{
- int ret;
- bool bool_result;
-
- ret = kstrtobool(buf, &bool_result);
- if (ret)
- return ret;
- if (bool_result && page_poisoning_enabled())
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n");
- if (bool_result)
- static_branch_enable(&init_on_free);
- else
- static_branch_disable(&init_on_free);
- return 0;
-}
-early_param("init_on_free", early_init_on_free);
/*
* A cached value of the page's pageblock's migratetype, used when the page is
@@ -205,49 +222,12 @@ static inline void set_pcppage_migratetype(struct page *page, int migratetype)
page->index = migratetype;
}
-#ifdef CONFIG_PM_SLEEP
-/*
- * The following functions are used by the suspend/hibernate code to temporarily
- * change gfp_allowed_mask in order to avoid using I/O during memory allocations
- * while devices are suspended. To avoid races with the suspend/hibernate code,
- * they should always be called with system_transition_mutex held
- * (gfp_allowed_mask also should only be modified with system_transition_mutex
- * held, unless the suspend/hibernate code is guaranteed not to run in parallel
- * with that modification).
- */
-
-static gfp_t saved_gfp_mask;
-
-void pm_restore_gfp_mask(void)
-{
- WARN_ON(!mutex_is_locked(&system_transition_mutex));
- if (saved_gfp_mask) {
- gfp_allowed_mask = saved_gfp_mask;
- saved_gfp_mask = 0;
- }
-}
-
-void pm_restrict_gfp_mask(void)
-{
- WARN_ON(!mutex_is_locked(&system_transition_mutex));
- WARN_ON(saved_gfp_mask);
- saved_gfp_mask = gfp_allowed_mask;
- gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
-}
-
-bool pm_suspended_storage(void)
-{
- if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
- return false;
- return true;
-}
-#endif /* CONFIG_PM_SLEEP */
-
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
unsigned int pageblock_order __read_mostly;
#endif
-static void __free_pages_ok(struct page *page, unsigned int order);
+static void __free_pages_ok(struct page *page, unsigned int order,
+ fpi_t fpi_flags);
/*
* results with 256, 32 in the lowmem_reserve sysctl:
@@ -260,7 +240,7 @@ static void __free_pages_ok(struct page *page, unsigned int order);
* TBD: should special case ZONE_DMA32 machines here - in those we normally
* don't need any ZONE_NORMAL reservation
*/
-int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
+static int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
[ZONE_DMA] = 256,
#endif
@@ -274,7 +254,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
[ZONE_MOVABLE] = 0,
};
-static char * const zone_names[MAX_NR_ZONES] = {
+char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
"DMA",
#endif
@@ -304,7 +284,7 @@ const char * const migratetype_names[MIGRATE_TYPES] = {
#endif
};
-compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
+static compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
[NULL_COMPOUND_DTOR] = NULL,
[COMPOUND_PAGE_DTOR] = free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
@@ -317,34 +297,8 @@ compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
-#ifdef CONFIG_DISCONTIGMEM
-/*
- * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
- * are not on separate NUMA nodes. Functionally this works but with
- * watermark_boost_factor, it can reclaim prematurely as the ranges can be
- * quite small. By default, do not boost watermarks on discontigmem as in
- * many cases very high-order allocations like THP are likely to be
- * unsupported and the premature reclaim offsets the advantage of long-term
- * fragmentation avoidance.
- */
-int watermark_boost_factor __read_mostly;
-#else
-int watermark_boost_factor __read_mostly = 15000;
-#endif
-int watermark_scale_factor = 10;
-
-static unsigned long nr_kernel_pages __initdata;
-static unsigned long nr_all_pages __initdata;
-static unsigned long dma_reserve __initdata;
-
-static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
-static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
-static unsigned long required_kernelcore __initdata;
-static unsigned long required_kernelcore_percent __initdata;
-static unsigned long required_movablecore __initdata;
-static unsigned long required_movablecore_percent __initdata;
-static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
-static bool mirrored_kernelcore __meminitdata;
+static int watermark_boost_factor __read_mostly = 15000;
+static int watermark_scale_factor = 10;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
@@ -357,6 +311,12 @@ EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
+static bool page_contains_unaccepted(struct page *page, unsigned int order);
+static void accept_page(struct page *page, unsigned int order);
+static bool try_to_accept_memory(struct zone *zone, unsigned int order);
+static inline bool has_unaccepted_memory(void);
+static bool __free_unaccepted(struct page *page);
+
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -365,88 +325,33 @@ int page_group_by_mobility_disabled __read_mostly;
* page_alloc_init_late() has finished, the deferred pages are all initialized,
* and we can permanently disable that path.
*/
-static DEFINE_STATIC_KEY_TRUE(deferred_pages);
-
-/*
- * Calling kasan_free_pages() only after deferred memory initialization
- * has completed. Poisoning pages during deferred memory init will greatly
- * lengthen the process and cause problem in large memory systems as the
- * deferred pages initialization is done with interrupt disabled.
- *
- * Assuming that there will be no reference to those newly initialized
- * pages before they are ever allocated, this should have no effect on
- * KASAN memory tracking as the poison will be properly inserted at page
- * allocation time. The only corner case is when pages are allocated by
- * on-demand allocation and then freed again before the deferred pages
- * initialization is done, but this is not likely to happen.
- */
-static inline void kasan_free_nondeferred_pages(struct page *page, int order)
-{
- if (!static_branch_unlikely(&deferred_pages))
- kasan_free_pages(page, order);
-}
+DEFINE_STATIC_KEY_TRUE(deferred_pages);
-/* Returns true if the struct page for the pfn is uninitialised */
-static inline bool __meminit early_page_uninitialised(unsigned long pfn)
+static inline bool deferred_pages_enabled(void)
{
- int nid = early_pfn_to_nid(pfn);
-
- if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
- return true;
-
- return false;
+ return static_branch_unlikely(&deferred_pages);
}
/*
- * Returns true when the remaining initialisation should be deferred until
- * later in the boot cycle when it can be parallelised.
+ * deferred_grow_zone() is __init, but it is called from
+ * get_page_from_freelist() during early boot until deferred_pages permanently
+ * disables this call. This is why we have a refdata wrapper to avoid a
+ * warning, and to ensure that the function body gets unloaded.
*/
-static bool __meminit
-defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
+static bool __ref
+_deferred_grow_zone(struct zone *zone, unsigned int order)
{
- static unsigned long prev_end_pfn, nr_initialised;
-
- /*
- * prev_end_pfn static that contains the end of previous zone
- * No need to protect because called very early in boot before smp_init.
- */
- if (prev_end_pfn != end_pfn) {
- prev_end_pfn = end_pfn;
- nr_initialised = 0;
- }
-
- /* Always populate low zones for address-constrained allocations */
- if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
- return false;
-
- /*
- * We start only with one section of pages, more pages are added as
- * needed until the rest of deferred pages are initialized.
- */
- nr_initialised++;
- if ((nr_initialised > PAGES_PER_SECTION) &&
- (pfn & (PAGES_PER_SECTION - 1)) == 0) {
- NODE_DATA(nid)->first_deferred_pfn = pfn;
- return true;
- }
- return false;
+ return deferred_grow_zone(zone, order);
}
#else
-#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
-
-static inline bool early_page_uninitialised(unsigned long pfn)
+static inline bool deferred_pages_enabled(void)
{
return false;
}
-
-static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
-{
- return false;
-}
-#endif
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
/* Return a pointer to the bitmap storing bits affecting a block of pages */
-static inline unsigned long *get_pageblock_bitmap(struct page *page,
+static inline unsigned long *get_pageblock_bitmap(const struct page *page,
unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
@@ -456,26 +361,18 @@ static inline unsigned long *get_pageblock_bitmap(struct page *page,
#endif /* CONFIG_SPARSEMEM */
}
-static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
+static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
pfn &= (PAGES_PER_SECTION-1);
#else
- pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
+ pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn);
#endif /* CONFIG_SPARSEMEM */
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
-/**
- * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
- * @page: The page within the block of interest
- * @pfn: The target page frame number
- * @mask: mask of bits that the caller is interested in
- *
- * Return: pageblock_bits flags
- */
static __always_inline
-unsigned long __get_pfnblock_flags_mask(struct page *page,
+unsigned long __get_pfnblock_flags_mask(const struct page *page,
unsigned long pfn,
unsigned long mask)
{
@@ -487,18 +384,31 @@ unsigned long __get_pfnblock_flags_mask(struct page *page,
bitidx = pfn_to_bitidx(page, pfn);
word_bitidx = bitidx / BITS_PER_LONG;
bitidx &= (BITS_PER_LONG-1);
-
- word = bitmap[word_bitidx];
+ /*
+ * This races, without locks, with set_pfnblock_flags_mask(). Ensure
+ * a consistent read of the memory array, so that results, even though
+ * racy, are not corrupted.
+ */
+ word = READ_ONCE(bitmap[word_bitidx]);
return (word >> bitidx) & mask;
}
-unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
- unsigned long mask)
+/**
+ * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @mask: mask of bits that the caller is interested in
+ *
+ * Return: pageblock_bits flags
+ */
+unsigned long get_pfnblock_flags_mask(const struct page *page,
+ unsigned long pfn, unsigned long mask)
{
return __get_pfnblock_flags_mask(page, pfn, mask);
}
-static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
+static __always_inline int get_pfnblock_migratetype(const struct page *page,
+ unsigned long pfn)
{
return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
}
@@ -516,7 +426,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
{
unsigned long *bitmap;
unsigned long bitidx, word_bitidx;
- unsigned long old_word, word;
+ unsigned long word;
BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
@@ -532,12 +442,8 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
flags <<= bitidx;
word = READ_ONCE(bitmap[word_bitidx]);
- for (;;) {
- old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
- if (word == old_word)
- break;
- word = old_word;
- }
+ do {
+ } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags));
}
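
The READ_ONCE() plus try_cmpxchg() sequence above is the usual lock-free read-modify-write of a packed bitmap word. A standalone C11 approximation, with the word/bit-index arithmetic simplified and NR_PAGEBLOCK_BITS assumed to be 4, could look like this:

/* Illustrative CAS loop in the spirit of set_pfnblock_flags_mask(). */
#include <stdatomic.h>
#include <stdio.h>

#define PB_BITS		4	/* assumed: NR_PAGEBLOCK_BITS */

static _Atomic unsigned long bitmap_word;	/* one word of the bitmap */

static void set_block_flags(unsigned int block, unsigned long flags,
			    unsigned long mask)
{
	unsigned int bitidx = (block * PB_BITS) % (8 * sizeof(unsigned long));
	unsigned long old, new;

	mask <<= bitidx;
	flags <<= bitidx;

	old = atomic_load_explicit(&bitmap_word, memory_order_relaxed);
	do {
		new = (old & ~mask) | flags;
		/* On failure, 'old' is refreshed with the current value. */
	} while (!atomic_compare_exchange_weak(&bitmap_word, &old, new));
}

int main(void)
{
	set_block_flags(0, 0x2, 0x7);	/* e.g. set a 3-bit migratetype */
	set_block_flags(1, 0x5, 0xf);	/* and 4 bits for the next block */
	printf("bitmap word: %#lx\n",
	       atomic_load_explicit(&bitmap_word, memory_order_relaxed));
	return 0;
}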
void set_pageblock_migratetype(struct page *page, int migratetype)
@@ -574,15 +480,6 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
return ret;
}
-static int page_is_consistent(struct zone *zone, struct page *page)
-{
- if (!pfn_valid_within(page_to_pfn(page)))
- return 0;
- if (zone != page_zone(page))
- return 0;
-
- return 1;
-}
/*
* Temporary debugging check for pages not lying within a given zone.
*/
@@ -590,7 +487,7 @@ static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
if (page_outside_zone_boundaries(zone, page))
return 1;
- if (!page_is_consistent(zone, page))
+ if (zone != page_zone(page))
return 1;
return 0;
@@ -630,8 +527,7 @@ static void bad_page(struct page *page, const char *reason)
pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
- __dump_page(page, reason);
- dump_page_owner(page);
+ dump_page(page, reason);
print_modules();
dump_stack();
@@ -641,6 +537,55 @@ out:
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
+static inline unsigned int order_to_pindex(int migratetype, int order)
+{
+ int base = order;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (order > PAGE_ALLOC_COSTLY_ORDER) {
+ VM_BUG_ON(order != pageblock_order);
+ return NR_LOWORDER_PCP_LISTS;
+ }
+#else
+ VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
+#endif
+
+ return (MIGRATE_PCPTYPES * base) + migratetype;
+}
+
+static inline int pindex_to_order(unsigned int pindex)
+{
+ int order = pindex / MIGRATE_PCPTYPES;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pindex == NR_LOWORDER_PCP_LISTS)
+ order = pageblock_order;
+#else
+ VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
+#endif
+
+ return order;
+}
+
+static inline bool pcp_allowed_order(unsigned int order)
+{
+ if (order <= PAGE_ALLOC_COSTLY_ORDER)
+ return true;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (order == pageblock_order)
+ return true;
+#endif
+ return false;
+}
+
+static inline void free_the_page(struct page *page, unsigned int order)
+{
+ if (pcp_allowed_order(order)) /* Via pcp? */
+ free_unref_page(page, order);
+ else
+ __free_pages_ok(page, order, FPI_NONE);
+}
+
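
A small standalone program makes the order <-> pindex round trip above easy to check; the constants are assumptions for a typical THP-enabled configuration (MIGRATE_PCPTYPES == 3, PAGE_ALLOC_COSTLY_ORDER == 3, pageblock_order == 9) and may differ per config:

/* Demonstrates the order <-> pcp-list-index mapping used above. */
#include <assert.h>
#include <stdio.h>

#define MIGRATE_PCPTYPES	3
#define PAGE_ALLOC_COSTLY_ORDER	3
#define PAGEBLOCK_ORDER		9
#define NR_LOWORDER_PCP_LISTS	(MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))

static unsigned int order_to_pindex(int migratetype, int order)
{
	if (order > PAGE_ALLOC_COSTLY_ORDER)	/* THP-sized pages share one list */
		return NR_LOWORDER_PCP_LISTS;
	return (MIGRATE_PCPTYPES * order) + migratetype;
}

static int pindex_to_order(unsigned int pindex)
{
	int order = pindex / MIGRATE_PCPTYPES;

	if (pindex == NR_LOWORDER_PCP_LISTS)
		order = PAGEBLOCK_ORDER;
	return order;
}

int main(void)
{
	int mt, order;

	for (order = 0; order <= PAGE_ALLOC_COSTLY_ORDER; order++)
		for (mt = 0; mt < MIGRATE_PCPTYPES; mt++)
			assert(pindex_to_order(order_to_pindex(mt, order)) == order);

	printf("order 2, migratetype 1 -> pindex %u\n", order_to_pindex(1, 2));
	printf("pageblock_order pages  -> pindex %u (order %d)\n",
	       order_to_pindex(0, PAGEBLOCK_ORDER),
	       pindex_to_order(NR_LOWORDER_PCP_LISTS));
	return 0;
}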
/*
* Higher-order pages are called "compound pages". They are structured thusly:
*
@@ -658,8 +603,8 @@ out:
void free_compound_page(struct page *page)
{
- mem_cgroup_uncharge(page);
- __free_pages_ok(page, compound_order(page));
+ mem_cgroup_uncharge(page_folio(page));
+ free_the_page(page, compound_order(page));
}
void prep_compound_page(struct page *page, unsigned int order)
@@ -668,141 +613,26 @@ void prep_compound_page(struct page *page, unsigned int order)
int nr_pages = 1 << order;
__SetPageHead(page);
- for (i = 1; i < nr_pages; i++) {
- struct page *p = page + i;
- set_page_count(p, 0);
- p->mapping = TAIL_MAPPING;
- set_compound_head(p, page);
- }
+ for (i = 1; i < nr_pages; i++)
+ prep_compound_tail(page, i);
- set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
- set_compound_order(page, order);
- atomic_set(compound_mapcount_ptr(page), -1);
- if (hpage_pincount_available(page))
- atomic_set(compound_pincount_ptr(page), 0);
+ prep_compound_head(page, order);
}
-#ifdef CONFIG_DEBUG_PAGEALLOC
-unsigned int _debug_guardpage_minorder;
-
-bool _debug_pagealloc_enabled_early __read_mostly
- = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
-EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
-DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
-EXPORT_SYMBOL(_debug_pagealloc_enabled);
-
-DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
-
-static int __init early_debug_pagealloc(char *buf)
+void destroy_large_folio(struct folio *folio)
{
- return kstrtobool(buf, &_debug_pagealloc_enabled_early);
-}
-early_param("debug_pagealloc", early_debug_pagealloc);
-
-void init_debug_pagealloc(void)
-{
- if (!debug_pagealloc_enabled())
- return;
+ enum compound_dtor_id dtor = folio->_folio_dtor;
- static_branch_enable(&_debug_pagealloc_enabled);
-
- if (!debug_guardpage_minorder())
- return;
-
- static_branch_enable(&_debug_guardpage_enabled);
+ VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio);
+ compound_page_dtors[dtor](&folio->page);
}
-static int __init debug_guardpage_minorder_setup(char *buf)
-{
- unsigned long res;
-
- if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
- pr_err("Bad debug_guardpage_minorder value\n");
- return 0;
- }
- _debug_guardpage_minorder = res;
- pr_info("Setting debug_guardpage_minorder to %lu\n", res);
- return 0;
-}
-early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
-
-static inline bool set_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype)
-{
- if (!debug_guardpage_enabled())
- return false;
-
- if (order >= debug_guardpage_minorder())
- return false;
-
- __SetPageGuard(page);
- INIT_LIST_HEAD(&page->lru);
- set_page_private(page, order);
- /* Guard pages are not available for any usage */
- __mod_zone_freepage_state(zone, -(1 << order), migratetype);
-
- return true;
-}
-
-static inline void clear_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype)
-{
- if (!debug_guardpage_enabled())
- return;
-
- __ClearPageGuard(page);
-
- set_page_private(page, 0);
- if (!is_migrate_isolate(migratetype))
- __mod_zone_freepage_state(zone, (1 << order), migratetype);
-}
-#else
-static inline bool set_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype) { return false; }
-static inline void clear_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype) {}
-#endif
-
-static inline void set_page_order(struct page *page, unsigned int order)
+static inline void set_buddy_order(struct page *page, unsigned int order)
{
set_page_private(page, order);
__SetPageBuddy(page);
}
-/*
- * This function checks whether a page is free && is the buddy
- * we can coalesce a page and its buddy if
- * (a) the buddy is not in a hole (check before calling!) &&
- * (b) the buddy is in the buddy system &&
- * (c) a page and its buddy have the same order &&
- * (d) a page and its buddy are in the same zone.
- *
- * For recording whether a page is in the buddy system, we set PageBuddy.
- * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
- *
- * For recording page's order, we use page_private(page).
- */
-static inline bool page_is_buddy(struct page *page, struct page *buddy,
- unsigned int order)
-{
- if (!page_is_guard(buddy) && !PageBuddy(buddy))
- return false;
-
- if (page_order(buddy) != order)
- return false;
-
- /*
- * zone check is done late to avoid uselessly calculating
- * zone/node ids for pages that could never merge.
- */
- if (page_zone_id(page) != page_zone_id(buddy))
- return false;
-
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
-
- return true;
-}
-
#ifdef CONFIG_COMPACTION
static inline struct capture_control *task_capc(struct zone *zone)
{
@@ -827,7 +657,7 @@ compaction_capture(struct capture_control *capc, struct page *page,
return false;
/*
- * Do not let lower order allocations polluate a movable pageblock.
+ * Do not let lower order allocations pollute a movable pageblock.
* This might let an unmovable request use a reclaimable pageblock
* and vice-versa but no more than normal fallback logic which can
* have trouble finding a high-order free page.
@@ -859,7 +689,7 @@ static inline void add_to_free_list(struct page *page, struct zone *zone,
{
struct free_area *area = &zone->free_area[order];
- list_add(&page->lru, &area->free_list[migratetype]);
+ list_add(&page->buddy_list, &area->free_list[migratetype]);
area->nr_free++;
}
@@ -869,17 +699,21 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
{
struct free_area *area = &zone->free_area[order];
- list_add_tail(&page->lru, &area->free_list[migratetype]);
+ list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
area->nr_free++;
}
-/* Used for pages which are on another list */
+/*
+ * Used for pages which are on another list. Move the pages to the tail
+ * of the list - so the moved pages won't immediately be considered for
+ * allocation again (e.g., optimization for memory onlining).
+ */
static inline void move_to_free_list(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
struct free_area *area = &zone->free_area[order];
- list_move(&page->lru, &area->free_list[migratetype]);
+ list_move_tail(&page->buddy_list, &area->free_list[migratetype]);
}
static inline void del_page_from_free_list(struct page *page, struct zone *zone,
@@ -889,12 +723,19 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone,
if (page_reported(page))
__ClearPageReported(page);
- list_del(&page->lru);
+ list_del(&page->buddy_list);
__ClearPageBuddy(page);
set_page_private(page, 0);
zone->free_area[order].nr_free--;
}
+static inline struct page *get_page_from_free_area(struct free_area *area,
+ int migratetype)
+{
+ return list_first_entry_or_null(&area->free_list[migratetype],
+ struct page, buddy_list);
+}
+
/*
* If this is not the largest possible page, check if the buddy
* of the next-highest order is free. If it is, it's possible
@@ -907,22 +748,17 @@ static inline bool
buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
struct page *page, unsigned int order)
{
- struct page *higher_page, *higher_buddy;
- unsigned long combined_pfn;
-
- if (order >= MAX_ORDER - 2)
- return false;
+ unsigned long higher_page_pfn;
+ struct page *higher_page;
- if (!pfn_valid_within(buddy_pfn))
+ if (order >= MAX_ORDER - 1)
return false;
- combined_pfn = buddy_pfn & pfn;
- higher_page = page + (combined_pfn - pfn);
- buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
- higher_buddy = higher_page + (buddy_pfn - combined_pfn);
+ higher_page_pfn = buddy_pfn & pfn;
+ higher_page = page + (higher_page_pfn - pfn);
- return pfn_valid_within(buddy_pfn) &&
- page_is_buddy(higher_page, higher_buddy, order + 1);
+ return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1,
+ NULL) != NULL;
}
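
find_buddy_page_pfn() builds on the classic buddy arithmetic: at a given order the buddy PFN differs only in bit 'order', and the merged higher-order page starts at the lower of the two PFNs. A minimal sketch of just that arithmetic (the example PFN is arbitrary):

/* Buddy-PFN arithmetic behind the merge logic above. */
#include <stdio.h>

static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
{
	return pfn ^ (1UL << order);
}

int main(void)
{
	unsigned long pfn = 0x1234;	/* order-2 aligned example PFN */
	unsigned int order = 2;

	unsigned long buddy_pfn = find_buddy_pfn(pfn, order);
	unsigned long merged_pfn = pfn & buddy_pfn;

	printf("pfn %#lx, order %u: buddy %#lx, merged order-%u page at %#lx\n",
	       pfn, order, buddy_pfn, order + 1, merged_pfn);

	/* buddy_merge_likely() peeks one level up: it takes the page that
	 * would result from merging (merged_pfn) and checks whether *its*
	 * buddy is free. */
	printf("next-level buddy of merged page: %#lx\n",
	       find_buddy_pfn(merged_pfn, order + 1));
	return 0;
}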
/*
@@ -952,17 +788,14 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
static inline void __free_one_page(struct page *page,
unsigned long pfn,
struct zone *zone, unsigned int order,
- int migratetype, bool report)
+ int migratetype, fpi_t fpi_flags)
{
struct capture_control *capc = task_capc(zone);
- unsigned long buddy_pfn;
+ unsigned long buddy_pfn = 0;
unsigned long combined_pfn;
- unsigned int max_order;
struct page *buddy;
bool to_tail;
- max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
-
VM_BUG_ON(!zone_is_initialized(zone));
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
@@ -973,20 +806,32 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
-continue_merging:
- while (order < max_order - 1) {
+ while (order < MAX_ORDER) {
if (compaction_capture(capc, page, order, migratetype)) {
__mod_zone_freepage_state(zone, -(1 << order),
migratetype);
return;
}
- buddy_pfn = __find_buddy_pfn(pfn, order);
- buddy = page + (buddy_pfn - pfn);
- if (!pfn_valid_within(buddy_pfn))
- goto done_merging;
- if (!page_is_buddy(page, buddy, order))
+ buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
+ if (!buddy)
goto done_merging;
+
+ if (unlikely(order >= pageblock_order)) {
+ /*
+			 * We want to prevent merging between free pages on a
+			 * pageblock without fallbacks and a normal pageblock.
+			 * Without this, pageblock isolation could cause incorrect
+			 * freepage, CMA, or HIGHATOMIC accounting.
+ */
+ int buddy_mt = get_pageblock_migratetype(buddy);
+
+ if (migratetype != buddy_mt
+ && (!migratetype_is_mergeable(migratetype) ||
+ !migratetype_is_mergeable(buddy_mt)))
+ goto done_merging;
+ }
+
/*
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
@@ -1000,35 +845,13 @@ continue_merging:
pfn = combined_pfn;
order++;
}
- if (max_order < MAX_ORDER) {
- /* If we are here, it means order is >= pageblock_order.
- * We want to prevent merge between freepages on isolate
- * pageblock and normal pageblock. Without this, pageblock
- * isolation could cause incorrect freepage or CMA accounting.
- *
- * We don't want to hit this code for the more frequent
- * low-order merging.
- */
- if (unlikely(has_isolate_pageblock(zone))) {
- int buddy_mt;
-
- buddy_pfn = __find_buddy_pfn(pfn, order);
- buddy = page + (buddy_pfn - pfn);
- buddy_mt = get_pageblock_migratetype(buddy);
-
- if (migratetype != buddy_mt
- && (is_migrate_isolate(migratetype) ||
- is_migrate_isolate(buddy_mt)))
- goto done_merging;
- }
- max_order++;
- goto continue_merging;
- }
done_merging:
- set_page_order(page, order);
+ set_buddy_order(page, order);
- if (is_shuffle_order(order))
+ if (fpi_flags & FPI_TO_TAIL)
+ to_tail = true;
+ else if (is_shuffle_order(order))
to_tail = shuffle_pick_tail();
else
to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
@@ -1039,10 +862,68 @@ done_merging:
add_to_free_list(page, zone, order, migratetype);
/* Notify page reporting subsystem of freed page */
- if (report)
+ if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
page_reporting_notify_free(order);
}
+/**
+ * split_free_page() -- split a free page at split_pfn_offset
+ * @free_page: the original free page
+ * @order: the order of the page
+ * @split_pfn_offset: split offset within the page
+ *
+ * Return -ENOENT if the free page is changed, otherwise 0
+ *
+ * It is used when the free page crosses two pageblocks with different migratetypes
+ * at split_pfn_offset within the page. The split free page will be put into
+ * separate migratetype lists afterwards. Otherwise, the function achieves
+ * nothing.
+ */
+int split_free_page(struct page *free_page,
+ unsigned int order, unsigned long split_pfn_offset)
+{
+ struct zone *zone = page_zone(free_page);
+ unsigned long free_page_pfn = page_to_pfn(free_page);
+ unsigned long pfn;
+ unsigned long flags;
+ int free_page_order;
+ int mt;
+ int ret = 0;
+
+ if (split_pfn_offset == 0)
+ return ret;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ mt = get_pageblock_migratetype(free_page);
+ if (likely(!is_migrate_isolate(mt)))
+ __mod_zone_freepage_state(zone, -(1UL << order), mt);
+
+ del_page_from_free_list(free_page, zone, order);
+ for (pfn = free_page_pfn;
+ pfn < free_page_pfn + (1UL << order);) {
+ int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
+
+ free_page_order = min_t(unsigned int,
+ pfn ? __ffs(pfn) : order,
+ __fls(split_pfn_offset));
+ __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
+ mt, FPI_NONE);
+ pfn += 1UL << free_page_order;
+ split_pfn_offset -= (1UL << free_page_order);
+ /* we have done the first part, now switch to second part */
+ if (split_pfn_offset == 0)
+ split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
+ }
+out:
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return ret;
+}
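
The order selection in the loop above, the largest power of two that is both aligned at the current pfn and no larger than the distance left to the split point, can be traced with a small standalone sketch (the base PFN, order and offset are made-up example values; the bit helpers approximate the kernel's __ffs()/__fls()):

/* Walks the same order-picking loop as split_free_page(). */
#include <stdio.h>

static unsigned int lowbit_order(unsigned long x)	/* like __ffs() */
{
	unsigned int r = 0;

	while (!(x & 1)) {
		x >>= 1;
		r++;
	}
	return r;
}

static unsigned int highbit_order(unsigned long x)	/* like __fls() */
{
	unsigned int r = 0;

	while (x >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned int order = 9;			/* an order-9 free page */
	unsigned long base_pfn = 0x2600;	/* order-9 aligned */
	unsigned long split_pfn_offset = 0x30;	/* split 48 pages in */
	unsigned long pfn;

	for (pfn = base_pfn; pfn < base_pfn + (1UL << order); ) {
		unsigned int align_order = pfn ? lowbit_order(pfn) : order;
		unsigned int chunk_order = highbit_order(split_pfn_offset);

		if (align_order < chunk_order)
			chunk_order = align_order;

		printf("free pfn %#lx as order %u\n", pfn, chunk_order);
		pfn += 1UL << chunk_order;
		split_pfn_offset -= 1UL << chunk_order;

		/* First part done: the rest of the range is the second part. */
		if (split_pfn_offset == 0)
			split_pfn_offset = (1UL << order) - (pfn - base_pfn);
	}
	return 0;
}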
/*
* A bad page could be due to a number of fields. Instead of multiple branches,
* try and check multiple fields with one check. The caller must do a detailed
@@ -1057,7 +938,7 @@ static inline bool page_expected_state(struct page *page,
if (unlikely((unsigned long)page->mapping |
page_ref_count(page) |
#ifdef CONFIG_MEMCG
- (unsigned long)page->mem_cgroup |
+ page->memcg_data |
#endif
(page->flags & check_flags)))
return false;
@@ -1082,30 +963,36 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
}
#ifdef CONFIG_MEMCG
- if (unlikely(page->mem_cgroup))
+ if (unlikely(page->memcg_data))
bad_reason = "page still charged to cgroup";
#endif
return bad_reason;
}
-static void check_free_page_bad(struct page *page)
+static void free_page_is_bad_report(struct page *page)
{
bad_page(page,
page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
}
-static inline int check_free_page(struct page *page)
+static inline bool free_page_is_bad(struct page *page)
{
if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
- return 0;
+ return false;
/* Something has gone sideways, find it */
- check_free_page_bad(page);
- return 1;
+ free_page_is_bad_report(page);
+ return true;
+}
+
+static inline bool is_check_pages_enabled(void)
+{
+ return static_branch_unlikely(&check_pages_enabled);
}
-static int free_tail_pages_check(struct page *head_page, struct page *page)
+static int free_tail_page_prepare(struct page *head_page, struct page *page)
{
+ struct folio *folio = (struct folio *)head_page;
int ret = 1;
/*
@@ -1114,15 +1001,23 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
*/
BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
- if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
+ if (!is_check_pages_enabled()) {
ret = 0;
goto out;
}
switch (page - head_page) {
case 1:
- /* the first tail page: ->mapping may be compound_mapcount() */
- if (unlikely(compound_mapcount(page))) {
- bad_page(page, "nonzero compound_mapcount");
+ /* the first tail page: these may be in place of ->mapping */
+ if (unlikely(folio_entire_mapcount(folio))) {
+ bad_page(page, "nonzero entire_mapcount");
+ goto out;
+ }
+ if (unlikely(atomic_read(&folio->_nr_pages_mapped))) {
+ bad_page(page, "nonzero nr_pages_mapped");
+ goto out;
+ }
+ if (unlikely(atomic_read(&folio->_pincount))) {
+ bad_page(page, "nonzero pincount");
goto out;
}
break;
@@ -1154,25 +1049,77 @@ out:
return ret;
}
-static void kernel_init_free_pages(struct page *page, int numpages)
+/*
+ * Skip KASAN memory poisoning when either:
+ *
+ * 1. For generic KASAN: deferred memory initialization has not yet completed.
+ * Tag-based KASAN modes skip pages freed via deferred memory initialization
+ * using page tags instead (see below).
+ * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating
+ * that error detection is disabled for accesses via the page address.
+ *
+ * Pages will have match-all tags in the following circumstances:
+ *
+ * 1. Pages are being initialized for the first time, including during deferred
+ * memory init; see the call to page_kasan_tag_reset in __init_single_page.
+ * 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the
+ * exception of pages unpoisoned by kasan_unpoison_vmalloc.
+ * 3. The allocation was excluded from being checked due to sampling,
+ * see the call to kasan_unpoison_pages.
+ *
+ * Poisoning pages during deferred memory init will greatly lengthen the
+ * process and cause problems in large memory systems, as the deferred pages
+ * initialization is done with interrupts disabled.
+ *
+ * Assuming that there will be no reference to those newly initialized
+ * pages before they are ever allocated, this should have no effect on
+ * KASAN memory tracking as the poison will be properly inserted at page
+ * allocation time. The only corner case is when pages are allocated by
+ * on-demand allocation and then freed again before the deferred pages
+ * initialization is done, but this is not likely to happen.
+ */
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
+{
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ return deferred_pages_enabled();
+
+ return page_kasan_tag(page) == 0xff;
+}
+
+static void kernel_init_pages(struct page *page, int numpages)
{
int i;
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
for (i = 0; i < numpages; i++)
- clear_highpage(page + i);
+ clear_highpage_kasan_tagged(page + i);
kasan_enable_current();
}
static __always_inline bool free_pages_prepare(struct page *page,
- unsigned int order, bool check_free)
+ unsigned int order, fpi_t fpi_flags)
{
int bad = 0;
+ bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
+ bool init = want_init_on_free();
VM_BUG_ON_PAGE(PageTail(page), page);
trace_mm_page_free(page, order);
+ kmsan_free_page(page, order);
+
+ if (unlikely(PageHWPoison(page)) && !order) {
+ /*
+ * Do not let hwpoison pages hit pcplists/buddy
+ * Untie memcg state and reset page's owner
+ */
+ if (memcg_kmem_online() && PageMemcgKmem(page))
+ __memcg_kmem_uncharge_page(page, order);
+ reset_page_owner(page, order);
+ page_table_check_free(page, order);
+ return false;
+ }
/*
* Check tail pages before head page information is cleared to
@@ -1185,29 +1132,34 @@ static __always_inline bool free_pages_prepare(struct page *page,
VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
if (compound)
- ClearPageDoubleMap(page);
+ ClearPageHasHWPoisoned(page);
for (i = 1; i < (1 << order); i++) {
if (compound)
- bad += free_tail_pages_check(page, page + i);
- if (unlikely(check_free_page(page + i))) {
- bad++;
- continue;
+ bad += free_tail_page_prepare(page, page + i);
+ if (is_check_pages_enabled()) {
+ if (free_page_is_bad(page + i)) {
+ bad++;
+ continue;
+ }
}
(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
}
}
if (PageMappingFlags(page))
page->mapping = NULL;
- if (memcg_kmem_enabled() && PageKmemcg(page))
+ if (memcg_kmem_online() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
- if (check_free)
- bad += check_free_page(page);
- if (bad)
- return false;
+ if (is_check_pages_enabled()) {
+ if (free_page_is_bad(page))
+ bad++;
+ if (bad)
+ return false;
+ }
page_cpupid_reset_last(page);
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
reset_page_owner(page, order);
+ page_table_check_free(page, order);
if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page),
@@ -1215,10 +1167,27 @@ static __always_inline bool free_pages_prepare(struct page *page,
debug_check_no_obj_freed(page_address(page),
PAGE_SIZE << order);
}
- if (want_init_on_free())
- kernel_init_free_pages(page, 1 << order);
- kernel_poison_pages(page, 1 << order, 0);
+ kernel_poison_pages(page, 1 << order);
+
+ /*
+ * As memory initialization might be integrated into KASAN,
+ * KASAN poisoning and memory initialization code must be
+ * kept together to avoid discrepancies in behavior.
+ *
+ * With hardware tag-based KASAN, memory tags must be set before the
+ * page becomes unavailable via debug_pagealloc or arch_free_page.
+ */
+ if (!skip_kasan_poison) {
+ kasan_poison_pages(page, order, init);
+
+ /* Memory is already initialized if KASAN did it internally. */
+ if (kasan_has_integrated_init())
+ init = false;
+ }
+ if (init)
+ kernel_init_pages(page, 1 << order);
+
/*
* arch_free_page() can make the page's contents inaccessible. s390
* does this. So nothing which can access the page's contents should
@@ -1226,257 +1195,127 @@ static __always_inline bool free_pages_prepare(struct page *page,
*/
arch_free_page(page, order);
- if (debug_pagealloc_enabled_static())
- kernel_map_pages(page, 1 << order, 0);
-
- kasan_free_nondeferred_pages(page, order);
+ debug_pagealloc_unmap_pages(page, 1 << order);
return true;
}
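
The interplay between want_init_on_free(), KASAN poisoning and integrated init near the end of free_pages_prepare() boils down to a small decision table; a toy sketch of just that ordering (all predicates are stubbed as plain booleans, nothing here is the real KASAN API):

/* Toy model of the init/poison ordering in free_pages_prepare(). */
#include <stdbool.h>
#include <stdio.h>

static void toy_free_path(bool want_init_on_free, bool skip_kasan_poison,
			  bool kasan_integrated_init)
{
	bool init = want_init_on_free;

	if (!skip_kasan_poison) {
		printf("  kasan_poison_pages(init=%d)\n", init);
		/* If KASAN already initialized the memory, don't do it twice. */
		if (kasan_integrated_init)
			init = false;
	}
	if (init)
		printf("  kernel_init_pages()\n");
}

int main(void)
{
	printf("init_on_free, poison, no integrated init:\n");
	toy_free_path(true, false, false);
	printf("init_on_free, poison, integrated init:\n");
	toy_free_path(true, false, true);
	printf("init_on_free, skip poison:\n");
	toy_free_path(true, true, false);
	return 0;
}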
-#ifdef CONFIG_DEBUG_VM
-/*
- * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
- * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
- * moved from pcp lists to free lists.
- */
-static bool free_pcp_prepare(struct page *page)
-{
- return free_pages_prepare(page, 0, true);
-}
-
-static bool bulkfree_pcp_prepare(struct page *page)
-{
- if (debug_pagealloc_enabled_static())
- return check_free_page(page);
- else
- return false;
-}
-#else
-/*
- * With DEBUG_VM disabled, order-0 pages being freed are checked only when
- * moving from pcp lists to free list in order to reduce overhead. With
- * debug_pagealloc enabled, they are checked also immediately when being freed
- * to the pcp lists.
- */
-static bool free_pcp_prepare(struct page *page)
-{
- if (debug_pagealloc_enabled_static())
- return free_pages_prepare(page, 0, true);
- else
- return free_pages_prepare(page, 0, false);
-}
-
-static bool bulkfree_pcp_prepare(struct page *page)
-{
- return check_free_page(page);
-}
-#endif /* CONFIG_DEBUG_VM */
-
-static inline void prefetch_buddy(struct page *page)
-{
- unsigned long pfn = page_to_pfn(page);
- unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
- struct page *buddy = page + (buddy_pfn - pfn);
-
- prefetch(buddy);
-}
-
/*
* Frees a number of pages from the PCP lists
- * Assumes all pages on list are in same zone, and of same order.
+ * Assumes all pages on list are in same zone.
* count is the number of pages to free.
- *
- * If the zone was previously in an "all pages pinned" state then look to
- * see if this freeing clears that state.
- *
- * And clear the zone's pages_scanned counter, to hold off the "all pages are
- * pinned" detection logic.
*/
static void free_pcppages_bulk(struct zone *zone, int count,
- struct per_cpu_pages *pcp)
+ struct per_cpu_pages *pcp,
+ int pindex)
{
- int migratetype = 0;
- int batch_free = 0;
- int prefetch_nr = 0;
+ unsigned long flags;
+ int min_pindex = 0;
+ int max_pindex = NR_PCP_LISTS - 1;
+ unsigned int order;
bool isolated_pageblocks;
- struct page *page, *tmp;
- LIST_HEAD(head);
+ struct page *page;
/*
	 * Ensure a proper count is passed, which otherwise would get stuck in
	 * the while (list_empty(list)) loop below.
*/
count = min(pcp->count, count);
- while (count) {
- struct list_head *list;
- /*
- * Remove pages from lists in a round-robin fashion. A
- * batch_free count is maintained that is incremented when an
- * empty list is encountered. This is so more pages are freed
- * off fuller lists instead of spinning excessively around empty
- * lists
- */
- do {
- batch_free++;
- if (++migratetype == MIGRATE_PCPTYPES)
- migratetype = 0;
- list = &pcp->lists[migratetype];
- } while (list_empty(list));
+ /* Ensure requested pindex is drained first. */
+ pindex = pindex - 1;
- /* This is the only non-empty list. Free them all. */
- if (batch_free == MIGRATE_PCPTYPES)
- batch_free = count;
+ spin_lock_irqsave(&zone->lock, flags);
+ isolated_pageblocks = has_isolate_pageblock(zone);
+ while (count > 0) {
+ struct list_head *list;
+ int nr_pages;
+
+ /* Remove pages from lists in a round-robin fashion. */
do {
- page = list_last_entry(list, struct page, lru);
- /* must delete to avoid corrupting pcp list */
- list_del(&page->lru);
- pcp->count--;
+ if (++pindex > max_pindex)
+ pindex = min_pindex;
+ list = &pcp->lists[pindex];
+ if (!list_empty(list))
+ break;
- if (bulkfree_pcp_prepare(page))
- continue;
+ if (pindex == max_pindex)
+ max_pindex--;
+ if (pindex == min_pindex)
+ min_pindex++;
+ } while (1);
- list_add_tail(&page->lru, &head);
+ order = pindex_to_order(pindex);
+ nr_pages = 1 << order;
+ do {
+ int mt;
- /*
- * We are going to put the page back to the global
- * pool, prefetch its buddy to speed up later access
- * under zone->lock. It is believed the overhead of
- * an additional test and calculating buddy_pfn here
- * can be offset by reduced memory latency later. To
- * avoid excessive prefetching due to large count, only
- * prefetch buddy for the first pcp->batch nr of pages.
- */
- if (prefetch_nr++ < pcp->batch)
- prefetch_buddy(page);
- } while (--count && --batch_free && !list_empty(list));
- }
+ page = list_last_entry(list, struct page, pcp_list);
+ mt = get_pcppage_migratetype(page);
- spin_lock(&zone->lock);
- isolated_pageblocks = has_isolate_pageblock(zone);
+ /* must delete to avoid corrupting pcp list */
+ list_del(&page->pcp_list);
+ count -= nr_pages;
+ pcp->count -= nr_pages;
- /*
- * Use safe version since after __free_one_page(),
- * page->lru.next will not point to original list.
- */
- list_for_each_entry_safe(page, tmp, &head, lru) {
- int mt = get_pcppage_migratetype(page);
- /* MIGRATE_ISOLATE page should not go to pcplists */
- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
- /* Pageblock could have been isolated meanwhile */
- if (unlikely(isolated_pageblocks))
- mt = get_pageblock_migratetype(page);
+ /* MIGRATE_ISOLATE page should not go to pcplists */
+ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
+ /* Pageblock could have been isolated meanwhile */
+ if (unlikely(isolated_pageblocks))
+ mt = get_pageblock_migratetype(page);
- __free_one_page(page, page_to_pfn(page), zone, 0, mt, true);
- trace_mm_page_pcpu_drain(page, 0, mt);
+ __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
+ trace_mm_page_pcpu_drain(page, order, mt);
+ } while (count > 0 && !list_empty(list));
}
- spin_unlock(&zone->lock);
+
+ spin_unlock_irqrestore(&zone->lock, flags);
}
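
The list selection in free_pcppages_bulk() is easier to see in isolation: start at the requested pindex, walk the remaining non-empty lists in a cycle, and shrink the [min_pindex, max_pindex] window as lists are found empty. The sketch below drains one page per step instead of a whole batch, keeps list contents as plain counters, and assumes NR_PCP_LISTS == 13:

/* Sketch of the round-robin pindex walk in free_pcppages_bulk(). */
#include <stdio.h>

#define NR_PCP_LISTS	13

int main(void)
{
	int list_pages[NR_PCP_LISTS] = { 0 };
	int min_pindex = 0, max_pindex = NR_PCP_LISTS - 1;
	int pindex = 4;			/* drain the requested list first */
	int count;

	list_pages[4] = 3;
	list_pages[7] = 2;
	list_pages[11] = 1;
	count = 6;			/* exactly the pages available */

	pindex = pindex - 1;		/* the selection loop pre-increments */
	while (count > 0) {
		/* Find the next non-empty list, narrowing the window. */
		do {
			if (++pindex > max_pindex)
				pindex = min_pindex;
			if (list_pages[pindex])
				break;
			if (pindex == max_pindex)
				max_pindex--;
			if (pindex == min_pindex)
				min_pindex++;
		} while (1);

		printf("draining one page from pindex %d\n", pindex);
		list_pages[pindex]--;
		count--;
	}
	return 0;
}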
static void free_one_page(struct zone *zone,
struct page *page, unsigned long pfn,
unsigned int order,
- int migratetype)
+ int migratetype, fpi_t fpi_flags)
{
- spin_lock(&zone->lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&zone->lock, flags);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
- __free_one_page(page, pfn, zone, order, migratetype, true);
- spin_unlock(&zone->lock);
-}
-
-static void __meminit __init_single_page(struct page *page, unsigned long pfn,
- unsigned long zone, int nid)
-{
- mm_zero_struct_page(page);
- set_page_links(page, zone, nid, pfn);
- init_page_count(page);
- page_mapcount_reset(page);
- page_cpupid_reset_last(page);
- page_kasan_tag_reset(page);
-
- INIT_LIST_HEAD(&page->lru);
-#ifdef WANT_PAGE_VIRTUAL
- /* The shift won't overflow because ZONE_NORMAL is below 4G. */
- if (!is_highmem_idx(zone))
- set_page_address(page, __va(pfn << PAGE_SHIFT));
-#endif
-}
-
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void __meminit init_reserved_page(unsigned long pfn)
-{
- pg_data_t *pgdat;
- int nid, zid;
-
- if (!early_page_uninitialised(pfn))
- return;
-
- nid = early_pfn_to_nid(pfn);
- pgdat = NODE_DATA(nid);
-
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- struct zone *zone = &pgdat->node_zones[zid];
-
- if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
- break;
- }
- __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
-}
-#else
-static inline void init_reserved_page(unsigned long pfn)
-{
-}
-#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
-
-/*
- * Initialised pages do not have PageReserved set. This function is
- * called for each range allocated by the bootmem allocator and
- * marks the pages PageReserved. The remaining valid pages are later
- * sent to the buddy page allocator.
- */
-void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
-{
- unsigned long start_pfn = PFN_DOWN(start);
- unsigned long end_pfn = PFN_UP(end);
-
- for (; start_pfn < end_pfn; start_pfn++) {
- if (pfn_valid(start_pfn)) {
- struct page *page = pfn_to_page(start_pfn);
-
- init_reserved_page(start_pfn);
-
- /* Avoid false-positive PageTail() */
- INIT_LIST_HEAD(&page->lru);
-
- /*
- * no need for atomic set_bit because the struct
- * page is not visible yet so nobody should
- * access it yet.
- */
- __SetPageReserved(page);
- }
- }
+ __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
+ spin_unlock_irqrestore(&zone->lock, flags);
}
-static void __free_pages_ok(struct page *page, unsigned int order)
+static void __free_pages_ok(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
{
unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);
+ struct zone *zone = page_zone(page);
- if (!free_pages_prepare(page, order, true))
+ if (!free_pages_prepare(page, order, fpi_flags))
return;
+ /*
+	 * get_pfnblock_migratetype() is called here without spin_lock_irqsave()
+	 * so that it does not have to run under the zone lock, which shortens
+	 * the lock hold time.
+ */
migratetype = get_pfnblock_migratetype(page, pfn);
- local_irq_save(flags);
+
+ spin_lock_irqsave(&zone->lock, flags);
+ if (unlikely(has_isolate_pageblock(zone) ||
+ is_migrate_isolate(migratetype))) {
+ migratetype = get_pfnblock_migratetype(page, pfn);
+ }
+ __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
+ spin_unlock_irqrestore(&zone->lock, flags);
+
__count_vm_events(PGFREE, 1 << order);
- free_one_page(page_zone(page), page, pfn, order, migratetype);
- local_irq_restore(flags);
}
void __free_pages_core(struct page *page, unsigned int order)
@@ -1485,6 +1324,11 @@ void __free_pages_core(struct page *page, unsigned int order)
struct page *p = page;
unsigned int loop;
+ /*
+ * When initializing the memmap, __init_single_page() sets the refcount
+ * of all pages to 1 ("allocated"/"not free"). We have to set the
+ * refcount of all involved pages to 0.
+ */
prefetchw(p);
for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
prefetchw(p + 1);
@@ -1495,68 +1339,25 @@ void __free_pages_core(struct page *page, unsigned int order)
set_page_count(p, 0);
atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
- set_page_refcounted(page);
- __free_pages(page, order);
-}
-
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-
-static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
-
-/*
- * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
- */
-int __meminit __early_pfn_to_nid(unsigned long pfn,
- struct mminit_pfnnid_cache *state)
-{
- unsigned long start_pfn, end_pfn;
- int nid;
-
- if (state->last_start <= pfn && pfn < state->last_end)
- return state->last_nid;
+ if (page_contains_unaccepted(page, order)) {
+ if (order == MAX_ORDER && __free_unaccepted(page))
+ return;
- nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
- if (nid != NUMA_NO_NODE) {
- state->last_start = start_pfn;
- state->last_end = end_pfn;
- state->last_nid = nid;
+ accept_page(page, order);
}
- return nid;
-}
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
-
-int __meminit early_pfn_to_nid(unsigned long pfn)
-{
- static DEFINE_SPINLOCK(early_pfn_lock);
- int nid;
-
- spin_lock(&early_pfn_lock);
- nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
- if (nid < 0)
- nid = first_online_node;
- spin_unlock(&early_pfn_lock);
-
- return nid;
-}
-#endif /* CONFIG_NEED_MULTIPLE_NODES */
-
-void __init memblock_free_pages(struct page *page, unsigned long pfn,
- unsigned int order)
-{
- if (early_page_uninitialised(pfn))
- return;
- __free_pages_core(page, order);
+ /*
+ * Bypass PCP and place fresh pages right to the tail, primarily
+ * relevant for memory onlining.
+ */
+ __free_pages_ok(page, order, FPI_TO_TAIL);
}
/*
* Check that the whole (or subset of) a pageblock given by the interval of
* [start_pfn, end_pfn) is valid and within the same zone, before scanning it
- * with the migration of free compaction scanner. The scanners then need to
- * use only pfn_valid_within() check for arches that allow holes within
- * pageblocks.
+ * with the migration of free compaction scanner.
*
* Return struct page pointer of start_pfn, or NULL if checks were not passed.
*
@@ -1567,6 +1368,15 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
* interleaving within a single pageblock. It is therefore sufficient to check
* the first and last page of a pageblock and avoid checking each individual
* page in a pageblock.
+ *
+ * Note: the function may return non-NULL struct page even for a page block
+ * which contains a memory hole (i.e. there is no physical memory for a subset
+ * of the pfn range). For example, if the pageblock order is MAX_ORDER, the
+ * pageblock will fall into 2 sub-sections, and the end pfn of the pageblock
+ * may be a hole even though the start pfn is online and valid. This should be
+ * safe most of
+ * the time because struct pages are still initialized via init_unavailable_range()
+ * and pfn walkers shouldn't touch any physical memory range for which they do
+ * not recognize any specific metadata in struct pages.
*/
struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
unsigned long end_pfn, struct zone *zone)
@@ -1577,7 +1387,7 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
/* end_pfn is one past the range we are checking */
end_pfn--;
- if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
+ if (!pfn_valid(end_pfn))
return NULL;
start_page = pfn_to_online_page(start_pfn);
@@ -1596,497 +1406,6 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
return start_page;
}
-void set_zone_contiguous(struct zone *zone)
-{
- unsigned long block_start_pfn = zone->zone_start_pfn;
- unsigned long block_end_pfn;
-
- block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
- for (; block_start_pfn < zone_end_pfn(zone);
- block_start_pfn = block_end_pfn,
- block_end_pfn += pageblock_nr_pages) {
-
- block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
-
- if (!__pageblock_pfn_to_page(block_start_pfn,
- block_end_pfn, zone))
- return;
- cond_resched();
- }
-
- /* We confirm that there is no hole */
- zone->contiguous = true;
-}
-
-void clear_zone_contiguous(struct zone *zone)
-{
- zone->contiguous = false;
-}
-
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void __init deferred_free_range(unsigned long pfn,
- unsigned long nr_pages)
-{
- struct page *page;
- unsigned long i;
-
- if (!nr_pages)
- return;
-
- page = pfn_to_page(pfn);
-
- /* Free a large naturally-aligned chunk if possible */
- if (nr_pages == pageblock_nr_pages &&
- (pfn & (pageblock_nr_pages - 1)) == 0) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_core(page, pageblock_order);
- return;
- }
-
- for (i = 0; i < nr_pages; i++, page++, pfn++) {
- if ((pfn & (pageblock_nr_pages - 1)) == 0)
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_core(page, 0);
- }
-}
-
-/* Completion tracking for deferred_init_memmap() threads */
-static atomic_t pgdat_init_n_undone __initdata;
-static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
-
-static inline void __init pgdat_init_report_one_done(void)
-{
- if (atomic_dec_and_test(&pgdat_init_n_undone))
- complete(&pgdat_init_all_done_comp);
-}
-
-/*
- * Returns true if page needs to be initialized or freed to buddy allocator.
- *
- * First we check if pfn is valid on architectures where it is possible to have
- * holes within pageblock_nr_pages. On systems where it is not possible, this
- * function is optimized out.
- *
- * Then, we check if a current large page is valid by only checking the validity
- * of the head pfn.
- */
-static inline bool __init deferred_pfn_valid(unsigned long pfn)
-{
- if (!pfn_valid_within(pfn))
- return false;
- if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
- return false;
- return true;
-}
-
-/*
- * Free pages to buddy allocator. Try to free aligned pages in
- * pageblock_nr_pages sizes.
- */
-static void __init deferred_free_pages(unsigned long pfn,
- unsigned long end_pfn)
-{
- unsigned long nr_pgmask = pageblock_nr_pages - 1;
- unsigned long nr_free = 0;
-
- for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(pfn)) {
- deferred_free_range(pfn - nr_free, nr_free);
- nr_free = 0;
- } else if (!(pfn & nr_pgmask)) {
- deferred_free_range(pfn - nr_free, nr_free);
- nr_free = 1;
- } else {
- nr_free++;
- }
- }
- /* Free the last block of pages to allocator */
- deferred_free_range(pfn - nr_free, nr_free);
-}
-
-/*
- * Initialize struct pages. We minimize pfn page lookups and scheduler checks
- * by performing it only once every pageblock_nr_pages.
- * Return number of pages initialized.
- */
-static unsigned long __init deferred_init_pages(struct zone *zone,
- unsigned long pfn,
- unsigned long end_pfn)
-{
- unsigned long nr_pgmask = pageblock_nr_pages - 1;
- int nid = zone_to_nid(zone);
- unsigned long nr_pages = 0;
- int zid = zone_idx(zone);
- struct page *page = NULL;
-
- for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(pfn)) {
- page = NULL;
- continue;
- } else if (!page || !(pfn & nr_pgmask)) {
- page = pfn_to_page(pfn);
- } else {
- page++;
- }
- __init_single_page(page, pfn, zid, nid);
- nr_pages++;
- }
- return (nr_pages);
-}
-
-/*
- * This function is meant to pre-load the iterator for the zone init.
- * Specifically it walks through the ranges until we are caught up to the
- * first_init_pfn value and exits there. If we never encounter the value we
- * return false indicating there are no valid ranges left.
- */
-static bool __init
-deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
- unsigned long *spfn, unsigned long *epfn,
- unsigned long first_init_pfn)
-{
- u64 j;
-
- /*
- * Start out by walking through the ranges in this zone that have
- * already been initialized. We don't need to do anything with them
- * so we just need to flush them out of the system.
- */
- for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
- if (*epfn <= first_init_pfn)
- continue;
- if (*spfn < first_init_pfn)
- *spfn = first_init_pfn;
- *i = j;
- return true;
- }
-
- return false;
-}
-
-/*
- * Initialize and free pages. We do it in two loops: first we initialize
- * struct page, then free to buddy allocator, because while we are
- * freeing pages we can access pages that are ahead (computing buddy
- * page in __free_one_page()).
- *
- * In order to try and keep some memory in the cache we have the loop
- * broken along max page order boundaries. This way we will not cause
- * any issues with the buddy page computation.
- */
-static unsigned long __init
-deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
- unsigned long *end_pfn)
-{
- unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
- unsigned long spfn = *start_pfn, epfn = *end_pfn;
- unsigned long nr_pages = 0;
- u64 j = *i;
-
- /* First we loop through and initialize the page values */
- for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
- unsigned long t;
-
- if (mo_pfn <= *start_pfn)
- break;
-
- t = min(mo_pfn, *end_pfn);
- nr_pages += deferred_init_pages(zone, *start_pfn, t);
-
- if (mo_pfn < *end_pfn) {
- *start_pfn = mo_pfn;
- break;
- }
- }
-
- /* Reset values and now loop through freeing pages as needed */
- swap(j, *i);
-
- for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
- unsigned long t;
-
- if (mo_pfn <= spfn)
- break;
-
- t = min(mo_pfn, epfn);
- deferred_free_pages(spfn, t);
-
- if (mo_pfn <= epfn)
- break;
- }
-
- return nr_pages;
-}
-
-static void __init
-deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
- void *arg)
-{
- unsigned long spfn, epfn;
- struct zone *zone = arg;
- u64 i;
-
- deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
-
- /*
- * Initialize and free pages in MAX_ORDER sized increments so that we
- * can avoid introducing any issues with the buddy allocator.
- */
- while (spfn < end_pfn) {
- deferred_init_maxorder(&i, zone, &spfn, &epfn);
- cond_resched();
- }
-}
-
-/* An arch may override for more concurrency. */
-__weak int __init
-deferred_page_init_max_threads(const struct cpumask *node_cpumask)
-{
- return 1;
-}
-
-/* Initialise remaining memory on a node */
-static int __init deferred_init_memmap(void *data)
-{
- pg_data_t *pgdat = data;
- const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
- unsigned long spfn = 0, epfn = 0;
- unsigned long first_init_pfn, flags;
- unsigned long start = jiffies;
- struct zone *zone;
- int zid, max_threads;
- u64 i;
-
- /* Bind memory initialisation thread to a local node if possible */
- if (!cpumask_empty(cpumask))
- set_cpus_allowed_ptr(current, cpumask);
-
- pgdat_resize_lock(pgdat, &flags);
- first_init_pfn = pgdat->first_deferred_pfn;
- if (first_init_pfn == ULONG_MAX) {
- pgdat_resize_unlock(pgdat, &flags);
- pgdat_init_report_one_done();
- return 0;
- }
-
- /* Sanity check boundaries */
- BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
- BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
- pgdat->first_deferred_pfn = ULONG_MAX;
-
- /*
- * Once we unlock here, the zone cannot be grown anymore, thus if an
- * interrupt thread must allocate this early in boot, zone must be
- * pre-grown prior to start of deferred page initialization.
- */
- pgdat_resize_unlock(pgdat, &flags);
-
- /* Only the highest zone is deferred so find it */
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- zone = pgdat->node_zones + zid;
- if (first_init_pfn < zone_end_pfn(zone))
- break;
- }
-
- /* If the zone is empty somebody else may have cleared out the zone */
- if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
- first_init_pfn))
- goto zone_empty;
-
- max_threads = deferred_page_init_max_threads(cpumask);
-
- while (spfn < epfn) {
- unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
- struct padata_mt_job job = {
- .thread_fn = deferred_init_memmap_chunk,
- .fn_arg = zone,
- .start = spfn,
- .size = epfn_align - spfn,
- .align = PAGES_PER_SECTION,
- .min_chunk = PAGES_PER_SECTION,
- .max_threads = max_threads,
- };
-
- padata_do_multithreaded(&job);
- deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
- epfn_align);
- }
-zone_empty:
- /* Sanity check that the next zone really is unpopulated */
- WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
-
- pr_info("node %d deferred pages initialised in %ums\n",
- pgdat->node_id, jiffies_to_msecs(jiffies - start));
-
- pgdat_init_report_one_done();
- return 0;
-}
-
-/*
- * If this zone has deferred pages, try to grow it by initializing enough
- * deferred pages to satisfy the allocation specified by order, rounded up to
- * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
- * of SECTION_SIZE bytes by initializing struct pages in increments of
- * PAGES_PER_SECTION * sizeof(struct page) bytes.
- *
- * Return true when zone was grown, otherwise return false. We return true even
- * when we grow less than requested, to let the caller decide if there are
- * enough pages to satisfy the allocation.
- *
- * Note: We use noinline because this function is needed only during boot, and
- * it is called from a __ref function _deferred_grow_zone. This way we are
- * making sure that it is not inlined into permanent text section.
- */
-static noinline bool __init
-deferred_grow_zone(struct zone *zone, unsigned int order)
-{
- unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
- pg_data_t *pgdat = zone->zone_pgdat;
- unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
- unsigned long spfn, epfn, flags;
- unsigned long nr_pages = 0;
- u64 i;
-
- /* Only the last zone may have deferred pages */
- if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
- return false;
-
- pgdat_resize_lock(pgdat, &flags);
-
- /*
- * If someone grew this zone while we were waiting for spinlock, return
- * true, as there might be enough pages already.
- */
- if (first_deferred_pfn != pgdat->first_deferred_pfn) {
- pgdat_resize_unlock(pgdat, &flags);
- return true;
- }
-
- /* If the zone is empty somebody else may have cleared out the zone */
- if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
- first_deferred_pfn)) {
- pgdat->first_deferred_pfn = ULONG_MAX;
- pgdat_resize_unlock(pgdat, &flags);
- /* Retry only once. */
- return first_deferred_pfn != ULONG_MAX;
- }
-
- /*
- * Initialize and free pages in MAX_ORDER sized increments so
- * that we can avoid introducing any issues with the buddy
- * allocator.
- */
- while (spfn < epfn) {
- /* update our first deferred PFN for this section */
- first_deferred_pfn = spfn;
-
- nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
- touch_nmi_watchdog();
-
- /* We should only stop along section boundaries */
- if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
- continue;
-
- /* If our quota has been met we can stop here */
- if (nr_pages >= nr_pages_needed)
- break;
- }
-
- pgdat->first_deferred_pfn = spfn;
- pgdat_resize_unlock(pgdat, &flags);
-
- return nr_pages > 0;
-}
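
The section rounding in deferred_grow_zone() is worth seeing with numbers: even an order-0 request grows the zone by a whole section. A rough userspace model follows; PAGES_PER_SECTION is architecture dependent, and the 32768 below (128MB of 4K pages) is only an example.

#include <stdio.h>

#define PAGES_PER_SECTION 32768UL       /* example only; varies by architecture */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned int order;

        for (order = 0; order <= 10; order++) {
                unsigned long needed = ALIGN(1UL << order, PAGES_PER_SECTION);

                printf("order-%u request -> initialise at least %lu pages\n",
                       order, needed);
        }
        return 0;
}

Every order shown rounds up to the same single section, which is also why the quota check in the loop above only fires once spfn has crossed a section boundary.
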
-
-/*
- * deferred_grow_zone() is __init, but it is called from
- * get_page_from_freelist() during early boot until deferred_pages permanently
- * disables this call. This is why we have refdata wrapper to avoid warning,
- * and to ensure that the function body gets unloaded.
- */
-static bool __ref
-_deferred_grow_zone(struct zone *zone, unsigned int order)
-{
- return deferred_grow_zone(zone, order);
-}
-
-#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
-
-void __init page_alloc_init_late(void)
-{
- struct zone *zone;
- int nid;
-
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-
- /* There will be num_node_state(N_MEMORY) threads */
- atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
- for_each_node_state(nid, N_MEMORY) {
- kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
- }
-
- /* Block until all are initialised */
- wait_for_completion(&pgdat_init_all_done_comp);
-
- /*
- * The number of managed pages has changed due to the initialisation
- * so the pcpu batch and high limits needs to be updated or the limits
- * will be artificially small.
- */
- for_each_populated_zone(zone)
- zone_pcp_update(zone);
-
- /*
- * We initialized the rest of the deferred pages. Permanently disable
- * on-demand struct page initialization.
- */
- static_branch_disable(&deferred_pages);
-
- /* Reinit limits that are based on free pages after the kernel is up */
- files_maxfiles_init();
-#endif
-
- /* Discard memblock private memory */
- memblock_discard();
-
- for_each_node_state(nid, N_MEMORY)
- shuffle_free_memory(NODE_DATA(nid));
-
- for_each_populated_zone(zone)
- set_zone_contiguous(zone);
-}
-
-#ifdef CONFIG_CMA
-/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
-void __init init_cma_reserved_pageblock(struct page *page)
-{
- unsigned i = pageblock_nr_pages;
- struct page *p = page;
-
- do {
- __ClearPageReserved(p);
- set_page_count(p, 0);
- } while (++p, --i);
-
- set_pageblock_migratetype(page, MIGRATE_CMA);
-
- if (pageblock_order >= MAX_ORDER) {
- i = pageblock_nr_pages;
- p = page;
- do {
- set_page_refcounted(p);
- __free_pages(p, MAX_ORDER - 1);
- p += MAX_ORDER_NR_PAGES;
- } while (i -= MAX_ORDER_NR_PAGES);
- } else {
- set_page_refcounted(page);
- __free_pages(page, pageblock_order);
- }
-
- adjust_managed_page_count(page, pageblock_nr_pages);
-}
-#endif
-
/*
* The order of subdivision here is critical for the IO subsystem.
* Please do not alter this order without good reasons and regression
@@ -2121,7 +1440,7 @@ static inline void expand(struct zone *zone, struct page *page,
continue;
add_to_free_list(&page[size], zone, high, migratetype);
- set_page_order(&page[size], high);
+ set_buddy_order(&page[size], high);
}
}
@@ -2140,7 +1459,7 @@ static void check_new_page_bad(struct page *page)
/*
* This page is about to be returned from the page allocator
*/
-static inline int check_new_page(struct page *page)
+static int check_new_page(struct page *page)
{
if (likely(page_expected_state(page,
PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
@@ -2150,74 +1469,106 @@ static inline int check_new_page(struct page *page)
return 1;
}
-static inline bool free_pages_prezeroed(void)
+static inline bool check_new_pages(struct page *page, unsigned int order)
{
- return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
- page_poisoning_enabled()) || want_init_on_free();
-}
+ if (is_check_pages_enabled()) {
+ for (int i = 0; i < (1 << order); i++) {
+ struct page *p = page + i;
-#ifdef CONFIG_DEBUG_VM
-/*
- * With DEBUG_VM enabled, order-0 pages are checked for expected state when
- * being allocated from pcp lists. With debug_pagealloc also enabled, they are
- * also checked when pcp lists are refilled from the free lists.
- */
-static inline bool check_pcp_refill(struct page *page)
-{
- if (debug_pagealloc_enabled_static())
- return check_new_page(page);
- else
- return false;
-}
+ if (check_new_page(p))
+ return true;
+ }
+ }
-static inline bool check_new_pcp(struct page *page)
-{
- return check_new_page(page);
-}
-#else
-/*
- * With DEBUG_VM disabled, free order-0 pages are checked for expected state
- * when pcp lists are being refilled from the free lists. With debug_pagealloc
- * enabled, they are also checked when being allocated from the pcp lists.
- */
-static inline bool check_pcp_refill(struct page *page)
-{
- return check_new_page(page);
+ return false;
}
-static inline bool check_new_pcp(struct page *page)
+
+static inline bool should_skip_kasan_unpoison(gfp_t flags)
{
- if (debug_pagealloc_enabled_static())
- return check_new_page(page);
- else
+ /* Don't skip if a software KASAN mode is enabled. */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
+ IS_ENABLED(CONFIG_KASAN_SW_TAGS))
return false;
+
+ /* Skip, if hardware tag-based KASAN is not enabled. */
+ if (!kasan_hw_tags_enabled())
+ return true;
+
+ /*
+ * With hardware tag-based KASAN enabled, skip if this has been
+ * requested via __GFP_SKIP_KASAN.
+ */
+ return flags & __GFP_SKIP_KASAN;
}
-#endif /* CONFIG_DEBUG_VM */
-static bool check_new_pages(struct page *page, unsigned int order)
+static inline bool should_skip_init(gfp_t flags)
{
- int i;
- for (i = 0; i < (1 << order); i++) {
- struct page *p = page + i;
-
- if (unlikely(check_new_page(p)))
- return true;
- }
+ /* Don't skip, if hardware tag-based KASAN is not enabled. */
+ if (!kasan_hw_tags_enabled())
+ return false;
- return false;
+ /* For hardware tag-based KASAN, skip if requested. */
+ return (flags & __GFP_SKIP_ZERO);
}
inline void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags)
{
+ bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
+ !should_skip_init(gfp_flags);
+ bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
+ int i;
+
set_page_private(page, 0);
set_page_refcounted(page);
arch_alloc_page(page, order);
- if (debug_pagealloc_enabled_static())
- kernel_map_pages(page, 1 << order, 1);
- kasan_alloc_pages(page, order);
- kernel_poison_pages(page, 1 << order, 1);
+ debug_pagealloc_map_pages(page, 1 << order);
+
+ /*
+ * Page unpoisoning must happen before memory initialization.
+ * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
+ * allocations and the page unpoisoning code will complain.
+ */
+ kernel_unpoison_pages(page, 1 << order);
+
+ /*
+ * As memory initialization might be integrated into KASAN,
+ * KASAN unpoisoning and memory initialization code must be
+ * kept together to avoid discrepancies in behavior.
+ */
+
+ /*
+ * If memory tags should be zeroed
+ * (which happens only when memory should be initialized as well).
+ */
+ if (zero_tags) {
+ /* Initialize both memory and memory tags. */
+ for (i = 0; i != 1 << order; ++i)
+ tag_clear_highpage(page + i);
+
+ /* Take note that memory was initialized by the loop above. */
+ init = false;
+ }
+ if (!should_skip_kasan_unpoison(gfp_flags) &&
+ kasan_unpoison_pages(page, order, init)) {
+ /* Take note that memory was initialized by KASAN. */
+ if (kasan_has_integrated_init())
+ init = false;
+ } else {
+ /*
+ * If memory tags have not been set by KASAN, reset the page
+ * tags to ensure page_address() dereferencing does not fault.
+ */
+ for (i = 0; i != 1 << order; ++i)
+ page_kasan_tag_reset(page + i);
+ }
+ /* If memory is still not initialized, initialize it now. */
+ if (init)
+ kernel_init_pages(page, 1 << order);
+
set_page_owner(page, order, gfp_flags);
+ page_table_check_alloc(page, order);
}
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
@@ -2225,9 +1576,6 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
{
post_alloc_hook(page, order, gfp_flags);
- if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags))
- kernel_init_free_pages(page, 1 << order);
-
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
@@ -2256,7 +1604,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
struct page *page;
/* Find a page of the appropriate size in the preferred list */
- for (current_order = order; current_order < MAX_ORDER; ++current_order) {
+ for (current_order = order; current_order <= MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
page = get_page_from_free_area(area, migratetype);
if (!page)
@@ -2264,6 +1612,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
del_page_from_free_list(page, zone, current_order);
expand(zone, page, order, current_order, migratetype);
set_pcppage_migratetype(page, migratetype);
+ trace_mm_page_alloc_zone_locked(page, order, migratetype,
+ pcp_allowed_order(order) &&
+ migratetype < MIGRATE_PCPTYPES);
return page;
}
@@ -2274,17 +1625,13 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
/*
* This array describes the order lists are fallen back to when
* the free lists for the desirable migrate type are depleted
+ *
+ * The other migratetypes do not have fallbacks.
*/
-static int fallbacks[MIGRATE_TYPES][3] = {
- [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
- [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
- [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
-#ifdef CONFIG_CMA
- [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
-#endif
-#ifdef CONFIG_MEMORY_ISOLATION
- [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */
-#endif
+static int fallbacks[MIGRATE_TYPES][MIGRATE_PCPTYPES - 1] = {
+ [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE },
+ [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE },
+ [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE },
};
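
The trimmed table reads as "who a depleted free list borrows from, in order of preference". The sketch below mirrors the lookup with local stand-in enums; these are simplified assumptions, not the kernel's MIGRATE_* definitions.

#include <stdio.h>

enum { UNMOVABLE, MOVABLE, RECLAIMABLE, NR_PCPTYPES };  /* stand-ins only */

static const int fallbacks[NR_PCPTYPES][NR_PCPTYPES - 1] = {
        [UNMOVABLE]   = { RECLAIMABLE, MOVABLE },
        [MOVABLE]     = { RECLAIMABLE, UNMOVABLE },
        [RECLAIMABLE] = { UNMOVABLE, MOVABLE },
};

static const char * const name[] = { "unmovable", "movable", "reclaimable" };

int main(void)
{
        int mt, i;

        for (mt = 0; mt < NR_PCPTYPES; mt++)
                for (i = 0; i < NR_PCPTYPES - 1; i++)
                        printf("%s falls back to %s (preference %d)\n",
                               name[mt], name[fallbacks[mt][i]], i);
        return 0;
}

find_suitable_fallback() further down walks the row for the requested migratetype in this order until it finds a non-empty free area.
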
#ifdef CONFIG_CMA
@@ -2299,24 +1646,21 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
#endif
/*
- * Move the free pages in a range to the free lists of the requested type.
+ * Move the free pages in a range to the freelist tail of the requested type.
 * Note that start_pfn and end_pfn are not aligned on a pageblock
* boundary. If alignment is required, use move_freepages_block()
*/
static int move_freepages(struct zone *zone,
- struct page *start_page, struct page *end_page,
+ unsigned long start_pfn, unsigned long end_pfn,
int migratetype, int *num_movable)
{
struct page *page;
+ unsigned long pfn;
unsigned int order;
int pages_moved = 0;
- for (page = start_page; page <= end_page;) {
- if (!pfn_valid_within(page_to_pfn(page))) {
- page++;
- continue;
- }
-
+ for (pfn = start_pfn; pfn <= end_pfn;) {
+ page = pfn_to_page(pfn);
if (!PageBuddy(page)) {
/*
* We assume that pages that could be isolated for
@@ -2326,8 +1670,7 @@ static int move_freepages(struct zone *zone,
if (num_movable &&
(PageLRU(page) || __PageMovable(page)))
(*num_movable)++;
-
- page++;
+ pfn++;
continue;
}
@@ -2335,9 +1678,9 @@ static int move_freepages(struct zone *zone,
VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
VM_BUG_ON_PAGE(page_zone(page) != zone, page);
- order = page_order(page);
+ order = buddy_order(page);
move_to_free_list(page, zone, order, migratetype);
- page += 1 << order;
+ pfn += 1 << order;
pages_moved += 1 << order;
}
@@ -2347,25 +1690,22 @@ static int move_freepages(struct zone *zone,
int move_freepages_block(struct zone *zone, struct page *page,
int migratetype, int *num_movable)
{
- unsigned long start_pfn, end_pfn;
- struct page *start_page, *end_page;
+ unsigned long start_pfn, end_pfn, pfn;
if (num_movable)
*num_movable = 0;
- start_pfn = page_to_pfn(page);
- start_pfn = start_pfn & ~(pageblock_nr_pages-1);
- start_page = pfn_to_page(start_pfn);
- end_page = start_page + pageblock_nr_pages - 1;
- end_pfn = start_pfn + pageblock_nr_pages - 1;
+ pfn = page_to_pfn(page);
+ start_pfn = pageblock_start_pfn(pfn);
+ end_pfn = pageblock_end_pfn(pfn) - 1;
/* Do not cross zone boundaries */
if (!zone_spans_pfn(zone, start_pfn))
- start_page = page;
+ start_pfn = pfn;
if (!zone_spans_pfn(zone, end_pfn))
return 0;
- return move_freepages(zone, start_page, end_page, migratetype,
+ return move_freepages(zone, start_pfn, end_pfn, migratetype,
num_movable);
}
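
The new pageblock clamping is plain mask arithmetic on PFNs. A standalone illustration; the helpers are simplified local definitions and 512 is an arbitrary pageblock size, not the configured kernel value.

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL        /* example; the real value is config dependent */

static unsigned long pageblock_start_pfn(unsigned long pfn)
{
        return pfn & ~(PAGEBLOCK_NR_PAGES - 1);
}

static unsigned long pageblock_end_pfn(unsigned long pfn)
{
        return pageblock_start_pfn(pfn) + PAGEBLOCK_NR_PAGES;
}

int main(void)
{
        unsigned long pfn = 100300;     /* arbitrary page somewhere in a block */
        unsigned long start = pageblock_start_pfn(pfn);
        unsigned long end = pageblock_end_pfn(pfn) - 1;  /* inclusive, as in the patch */

        printf("pfn %lu lives in pageblock [%lu, %lu]\n", pfn, start, end);
        return 0;
}
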
@@ -2413,12 +1753,12 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
return false;
}
-static inline void boost_watermark(struct zone *zone)
+static inline bool boost_watermark(struct zone *zone)
{
unsigned long max_boost;
if (!watermark_boost_factor)
- return;
+ return false;
/*
* Don't bother in zones that are unlikely to produce results.
* On small machines, including kdump capture kernels running
@@ -2426,7 +1766,7 @@ static inline void boost_watermark(struct zone *zone)
* memory situation immediately.
*/
if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
- return;
+ return false;
max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
watermark_boost_factor, 10000);
@@ -2440,12 +1780,14 @@ static inline void boost_watermark(struct zone *zone)
* boosted watermark resulting in a hang.
*/
if (!max_boost)
- return;
+ return false;
max_boost = max(pageblock_nr_pages, max_boost);
zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
max_boost);
+
+ return true;
}
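
The boost cap above is a per-ten-thousand calculation on the high watermark. A worked example in standalone C: the watermark and pageblock numbers are made up, 15000 matches the default watermark_boost_factor at the time of writing but is just an input here, and mult_frac is reimplemented naively (the kernel version guards against overflow).

#include <stdio.h>

/* Naive stand-in for the kernel's mult_frac(); fine for these small inputs. */
static unsigned long mult_frac(unsigned long x, unsigned long num, unsigned long den)
{
        return x * num / den;
}

int main(void)
{
        unsigned long wmark_high = 12800;       /* hypothetical high watermark, pages */
        unsigned long boost_factor = 15000;     /* default watermark_boost_factor */
        unsigned long pageblock_nr_pages = 512; /* example value */
        unsigned long watermark_boost = 0;
        unsigned long max_boost;

        max_boost = mult_frac(wmark_high, boost_factor, 10000);
        if (max_boost < pageblock_nr_pages)
                max_boost = pageblock_nr_pages;

        /* One fallback event boosts by one pageblock, capped at max_boost. */
        watermark_boost += pageblock_nr_pages;
        if (watermark_boost > max_boost)
                watermark_boost = max_boost;

        printf("max_boost=%lu pages, boost after one steal=%lu pages\n",
               max_boost, watermark_boost);
        return 0;
}

With these inputs the boost can accumulate up to 19200 pages. The function now also returns whether any boost was applied, so the caller can skip the ZONE_BOOSTED_WATERMARK/kswapd handling when nothing changed.
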
/*
@@ -2459,7 +1801,7 @@ static inline void boost_watermark(struct zone *zone)
static void steal_suitable_fallback(struct zone *zone, struct page *page,
unsigned int alloc_flags, int start_type, bool whole_block)
{
- unsigned int current_order = page_order(page);
+ unsigned int current_order = buddy_order(page);
int free_pages, movable_pages, alike_pages;
int old_block_type;
@@ -2483,8 +1825,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
* likelihood of future fallbacks. Wake kswapd now as the node
* may be balanced overall and kswapd will not wake naturally.
*/
- boost_watermark(zone);
- if (alloc_flags & ALLOC_KSWAPD)
+ if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
/* We are not allowed to try stealing from the whole block */
@@ -2549,11 +1890,8 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
return -1;
*can_steal = false;
- for (i = 0;; i++) {
+ for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
fallback_mt = fallbacks[migratetype][i];
- if (fallback_mt == MIGRATE_TYPES)
- break;
-
if (free_area_empty(area, fallback_mt))
continue;
@@ -2596,8 +1934,8 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
/* Yoink! */
mt = get_pageblock_migratetype(page);
- if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
- && !is_migrate_cma(mt)) {
+ /* Only reserve normal pageblocks (i.e., they can merge with others) */
+ if (migratetype_is_mergeable(mt)) {
zone->nr_reserved_highatomic += pageblock_nr_pages;
set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
@@ -2638,7 +1976,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
continue;
spin_lock_irqsave(&zone->lock, flags);
- for (order = 0; order < MAX_ORDER; order++) {
+ for (order = 0; order <= MAX_ORDER; order++) {
struct free_area *area = &(zone->free_area[order]);
page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
@@ -2648,7 +1986,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
/*
* In page freeing path, migratetype change is racy so
* we can counter several free pages in a pageblock
- * in this loop althoug we changed the pageblock type
+ * in this loop although we changed the pageblock type
* from highatomic to ac->migratetype. So we should
* adjust the count once.
*/
@@ -2714,7 +2052,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
* i.e. orders < pageblock_order. If there are no local zones free,
* the zonelists will be reiterated without ALLOC_NOFRAGMENT.
*/
- if (alloc_flags & ALLOC_NOFRAGMENT)
+ if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT)
min_order = pageblock_order;
/*
@@ -2722,7 +2060,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
* approximates finding the pageblock with the most free pages, which
* would be too costly to do exactly.
*/
- for (current_order = MAX_ORDER - 1; current_order >= min_order;
+ for (current_order = MAX_ORDER; current_order >= min_order;
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
@@ -2748,7 +2086,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
return false;
find_smallest:
- for (current_order = order; current_order < MAX_ORDER;
+ for (current_order = order; current_order <= MAX_ORDER;
current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
@@ -2761,7 +2099,7 @@ find_smallest:
* This should not happen - we already found a suitable fallback
* when looking for the largest page.
*/
- VM_BUG_ON(current_order == MAX_ORDER);
+ VM_BUG_ON(current_order > MAX_ORDER);
do_steal:
page = get_page_from_free_area(area, fallback_mt);
@@ -2786,20 +2124,20 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
{
struct page *page;
-#ifdef CONFIG_CMA
- /*
- * Balance movable allocations between regular and CMA areas by
- * allocating from CMA when over half of the zone's free memory
- * is in the CMA area.
- */
- if (alloc_flags & ALLOC_CMA &&
- zone_page_state(zone, NR_FREE_CMA_PAGES) >
- zone_page_state(zone, NR_FREE_PAGES) / 2) {
- page = __rmqueue_cma_fallback(zone, order);
- if (page)
- return page;
+ if (IS_ENABLED(CONFIG_CMA)) {
+ /*
+ * Balance movable allocations between regular and CMA areas by
+ * allocating from CMA when over half of the zone's free memory
+ * is in the CMA area.
+ */
+ if (alloc_flags & ALLOC_CMA &&
+ zone_page_state(zone, NR_FREE_CMA_PAGES) >
+ zone_page_state(zone, NR_FREE_PAGES) / 2) {
+ page = __rmqueue_cma_fallback(zone, order);
+ if (page)
+ return page;
+ }
}
-#endif
retry:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page)) {
@@ -2810,8 +2148,6 @@ retry:
alloc_flags))
goto retry;
}
-
- trace_mm_page_alloc_zone_locked(page, order, migratetype);
return page;
}
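
The CMA balancing rule above compares two free-page counters. A minimal model of the decision, with invented numbers:

#include <stdio.h>
#include <stdbool.h>

int main(void)
{
        /* Hypothetical zone state, in pages. */
        unsigned long nr_free_pages = 200000;
        unsigned long nr_free_cma = 120000;
        bool alloc_cma_allowed = true;  /* ALLOC_CMA set for this request */

        bool prefer_cma = alloc_cma_allowed && nr_free_cma > nr_free_pages / 2;

        printf("free=%lu cma=%lu -> %s\n", nr_free_pages, nr_free_cma,
               prefer_cma ? "try CMA first" : "try regular lists first");
        return 0;
}

Steering movable allocations into CMA while it holds more than half of the zone's free memory keeps the regular areas available for requests that can never use the CMA region.
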
@@ -2824,18 +2160,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
- int i, alloced = 0;
+ unsigned long flags;
+ int i;
- spin_lock(&zone->lock);
+ spin_lock_irqsave(&zone->lock, flags);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
alloc_flags);
if (unlikely(page == NULL))
break;
- if (unlikely(check_pcp_refill(page)))
- continue;
-
/*
* Split buddy pages returned by expand() are received here in
* physical page order. The page is added to the tail of
@@ -2846,22 +2180,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* for IO devices that can merge IO requests if the physical
* pages are ordered properly.
*/
- list_add_tail(&page->lru, list);
- alloced++;
+ list_add_tail(&page->pcp_list, list);
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
- /*
- * i pages were removed from the buddy list even if some leak due
- * to check_pcp_refill failing so adjust NR_FREE_PAGES based
- * on i. Do not confuse with 'alloced' which is the number of
- * pages added to the pcp list.
- */
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
- spin_unlock(&zone->lock);
- return alloced;
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ return i;
}
#ifdef CONFIG_NUMA
@@ -2869,52 +2197,38 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* Called from the vmstat counter updater to drain pagesets of this
* currently executing processor on remote nodes after they have
* expired.
- *
- * Note that this function must be called with the thread pinned to
- * a single processor.
*/
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
- unsigned long flags;
int to_drain, batch;
- local_irq_save(flags);
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
- if (to_drain > 0)
- free_pcppages_bulk(zone, to_drain, pcp);
- local_irq_restore(flags);
+ if (to_drain > 0) {
+ spin_lock(&pcp->lock);
+ free_pcppages_bulk(zone, to_drain, pcp, 0);
+ spin_unlock(&pcp->lock);
+ }
}
#endif
/*
* Drain pcplists of the indicated processor and zone.
- *
- * The processor must either be the current processor and the
- * thread pinned to the current processor or a processor that
- * is not online.
*/
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
- unsigned long flags;
- struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
- local_irq_save(flags);
- pset = per_cpu_ptr(zone->pageset, cpu);
-
- pcp = &pset->pcp;
- if (pcp->count)
- free_pcppages_bulk(zone, pcp->count, pcp);
- local_irq_restore(flags);
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ if (pcp->count) {
+ spin_lock(&pcp->lock);
+ free_pcppages_bulk(zone, pcp->count, pcp, 0);
+ spin_unlock(&pcp->lock);
+ }
}
/*
* Drain pcplists of all zones on the indicated processor.
- *
- * The processor must either be the current processor and the
- * thread pinned to the current processor or a processor that
- * is not online.
*/
static void drain_pages(unsigned int cpu)
{
@@ -2927,9 +2241,6 @@ static void drain_pages(unsigned int cpu)
/*
* Spill all of this CPU's per-cpu pages back into the buddy allocator.
- *
- * The CPU has to be pinned. When zone parameter is non-NULL, spill just
- * the single zone's pages.
*/
void drain_local_pages(struct zone *zone)
{
@@ -2941,49 +2252,27 @@ void drain_local_pages(struct zone *zone)
drain_pages(cpu);
}
-static void drain_local_pages_wq(struct work_struct *work)
-{
- struct pcpu_drain *drain;
-
- drain = container_of(work, struct pcpu_drain, work);
-
- /*
- * drain_all_pages doesn't use proper cpu hotplug protection so
- * we can race with cpu offline when the WQ can move this from
- * a cpu pinned worker to an unbound one. We can operate on a different
- * cpu which is allright but we also have to make sure to not move to
- * a different one.
- */
- preempt_disable();
- drain_local_pages(drain->zone);
- preempt_enable();
-}
-
/*
- * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
- *
- * When zone parameter is non-NULL, spill just the single zone's pages.
+ * The implementation of drain_all_pages(), exposing an extra parameter to
+ * drain on all cpus.
*
- * Note that this can be extremely slow as the draining happens in a workqueue.
+ * drain_all_pages() is optimized to only execute on cpus where pcplists are
+ * not empty. The check for non-emptiness can however race with a free to
+ * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
+ * that need the guarantee that every CPU has drained can disable the
+ * optimizing racy check.
*/
-void drain_all_pages(struct zone *zone)
+static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
{
int cpu;
/*
- * Allocate in the BSS so we wont require allocation in
+ * Allocate in the BSS so we won't require allocation in
* direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
*/
static cpumask_t cpus_with_pcps;
/*
- * Make sure nobody triggers this path before mm_percpu_wq is fully
- * initialized.
- */
- if (WARN_ON_ONCE(!mm_percpu_wq))
- return;
-
- /*
* Do not drain if one is already in progress unless it's specific to
* a zone. Such callers are primarily CMA and memory hotplug and need
* the drain to be complete when the call returns.
@@ -3001,18 +2290,24 @@ void drain_all_pages(struct zone *zone)
* disables preemption as part of its processing
*/
for_each_online_cpu(cpu) {
- struct per_cpu_pageset *pcp;
+ struct per_cpu_pages *pcp;
struct zone *z;
bool has_pcps = false;
- if (zone) {
- pcp = per_cpu_ptr(zone->pageset, cpu);
- if (pcp->pcp.count)
+ if (force_all_cpus) {
+ /*
+ * The pcp.count check is racy; some callers need a
+ * guarantee that no cpu is missed.
+ */
+ has_pcps = true;
+ } else if (zone) {
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ if (pcp->count)
has_pcps = true;
} else {
for_each_populated_zone(z) {
- pcp = per_cpu_ptr(z->pageset, cpu);
- if (pcp->pcp.count) {
+ pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
+ if (pcp->count) {
has_pcps = true;
break;
}
@@ -3026,132 +2321,154 @@ void drain_all_pages(struct zone *zone)
}
for_each_cpu(cpu, &cpus_with_pcps) {
- struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
-
- drain->zone = zone;
- INIT_WORK(&drain->work, drain_local_pages_wq);
- queue_work_on(cpu, mm_percpu_wq, &drain->work);
+ if (zone)
+ drain_pages_zone(cpu, zone);
+ else
+ drain_pages(cpu);
}
- for_each_cpu(cpu, &cpus_with_pcps)
- flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
mutex_unlock(&pcpu_drain_mutex);
}
-#ifdef CONFIG_HIBERNATION
-
/*
- * Touch the watchdog for every WD_PAGE_COUNT pages.
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
+ *
+ * When zone parameter is non-NULL, spill just the single zone's pages.
*/
-#define WD_PAGE_COUNT (128*1024)
+void drain_all_pages(struct zone *zone)
+{
+ __drain_all_pages(zone, false);
+}
-void mark_free_pages(struct zone *zone)
+static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
+ unsigned int order)
{
- unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
- unsigned long flags;
- unsigned int order, t;
- struct page *page;
+ int migratetype;
- if (zone_is_empty(zone))
- return;
+ if (!free_pages_prepare(page, order, FPI_NONE))
+ return false;
- spin_lock_irqsave(&zone->lock, flags);
+ migratetype = get_pfnblock_migratetype(page, pfn);
+ set_pcppage_migratetype(page, migratetype);
+ return true;
+}
- max_zone_pfn = zone_end_pfn(zone);
- for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
- if (pfn_valid(pfn)) {
- page = pfn_to_page(pfn);
+static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch,
+ bool free_high)
+{
+ int min_nr_free, max_nr_free;
- if (!--page_count) {
- touch_nmi_watchdog();
- page_count = WD_PAGE_COUNT;
- }
+ /* Free everything if batch freeing high-order pages. */
+ if (unlikely(free_high))
+ return pcp->count;
- if (page_zone(page) != zone)
- continue;
+ /* Check for PCP disabled or boot pageset */
+ if (unlikely(high < batch))
+ return 1;
- if (!swsusp_page_is_forbidden(page))
- swsusp_unset_page_free(page);
- }
+ /* Leave at least pcp->batch pages on the list */
+ min_nr_free = batch;
+ max_nr_free = high - batch;
- for_each_migratetype_order(order, t) {
- list_for_each_entry(page,
- &zone->free_area[order].free_list[t], lru) {
- unsigned long i;
+ /*
+ * Double the number of pages freed each time there is subsequent
+ * freeing of pages without any allocation.
+ */
+ batch <<= pcp->free_factor;
+ if (batch < max_nr_free)
+ pcp->free_factor++;
+ batch = clamp(batch, min_nr_free, max_nr_free);
- pfn = page_to_pfn(page);
- for (i = 0; i < (1UL << order); i++) {
- if (!--page_count) {
- touch_nmi_watchdog();
- page_count = WD_PAGE_COUNT;
- }
- swsusp_set_page_free(pfn_to_page(pfn + i));
- }
- }
- }
- spin_unlock_irqrestore(&zone->lock, flags);
+ return batch;
}
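
The free_factor doubling in nr_pcp_free() is easiest to follow with numbers. The standalone trace below assumes an example high/batch pair rather than tuned kernel defaults.

#include <stdio.h>

static int clamp_int(int v, int lo, int hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        int high = 512, batch = 64;     /* example pcp limits, in pages */
        int free_factor = 0;
        int round;

        for (round = 1; round <= 4; round++) {
                int min_nr_free = batch;
                int max_nr_free = high - batch;
                int scaled = batch << free_factor;

                if (scaled < max_nr_free)
                        free_factor++;

                scaled = clamp_int(scaled, min_nr_free, max_nr_free);
                printf("round %d: free %d pages (free_factor now %d)\n",
                       round, scaled, free_factor);
        }
        return 0;
}

Successive frees with no intervening allocations ramp from one batch (64 here) up to high - batch (448) pages per trim, while the allocation side in rmqueue_pcplist() halves free_factor again and pulls the behaviour back.
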
-#endif /* CONFIG_PM */
-static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
+static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
+ bool free_high)
{
- int migratetype;
+ int high = READ_ONCE(pcp->high);
- if (!free_pcp_prepare(page))
- return false;
+ if (unlikely(!high || free_high))
+ return 0;
- migratetype = get_pfnblock_migratetype(page, pfn);
- set_pcppage_migratetype(page, migratetype);
- return true;
+ if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
+ return high;
+
+ /*
+ * If reclaim is active, limit the number of pages that can be
+ * stored on pcp lists
+ */
+ return min(READ_ONCE(pcp->batch) << 2, high);
}
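
nr_pcp_high() can be read as: while reclaim is active on the zone, never let the pcplist hold more than four batches. With the same example numbers, min(64 << 2, 512) caps the list at 256 pages instead of 512; a tiny check of that arithmetic:

#include <stdio.h>

int main(void)
{
        int high = 512, batch = 64;     /* example pcp limits */
        int capped = (batch << 2) < high ? (batch << 2) : high;

        printf("effective pcp high while reclaim is active: %d pages\n", capped);
        return 0;
}
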
-static void free_unref_page_commit(struct page *page, unsigned long pfn)
+static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
+ struct page *page, int migratetype,
+ unsigned int order)
{
- struct zone *zone = page_zone(page);
+ int high;
+ int pindex;
+ bool free_high;
+
+ __count_vm_events(PGFREE, 1 << order);
+ pindex = order_to_pindex(migratetype, order);
+ list_add(&page->pcp_list, &pcp->lists[pindex]);
+ pcp->count += 1 << order;
+
+ /*
+ * As high-order pages other than THP's stored on PCP can contribute
+ * to fragmentation, limit the number stored when PCP is heavily
+ * freeing without allocation. The remainder after bulk freeing
+ * stops will be drained from vmstat refresh context.
+ */
+ free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER);
+
+ high = nr_pcp_high(pcp, zone, free_high);
+ if (pcp->count >= high) {
+ int batch = READ_ONCE(pcp->batch);
+
+ free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex);
+ }
+}
+
+/*
+ * Free a pcp page
+ */
+void free_unref_page(struct page *page, unsigned int order)
+{
+ unsigned long __maybe_unused UP_flags;
struct per_cpu_pages *pcp;
+ struct zone *zone;
+ unsigned long pfn = page_to_pfn(page);
int migratetype;
- migratetype = get_pcppage_migratetype(page);
- __count_vm_event(PGFREE);
+ if (!free_unref_page_prepare(page, pfn, order))
+ return;
/*
* We only track unmovable, reclaimable and movable on pcp lists.
- * Free ISOLATE pages back to the allocator because they are being
+ * Place ISOLATE pages on the isolated list because they are being
* offlined but treat HIGHATOMIC as movable pages so we can get those
* areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
- if (migratetype >= MIGRATE_PCPTYPES) {
+ migratetype = get_pcppage_migratetype(page);
+ if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(zone, page, pfn, 0, migratetype);
+ free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
return;
}
migratetype = MIGRATE_MOVABLE;
}
- pcp = &this_cpu_ptr(zone->pageset)->pcp;
- list_add(&page->lru, &pcp->lists[migratetype]);
- pcp->count++;
- if (pcp->count >= pcp->high) {
- unsigned long batch = READ_ONCE(pcp->batch);
- free_pcppages_bulk(zone, batch, pcp);
+ zone = page_zone(page);
+ pcp_trylock_prepare(UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
+ if (pcp) {
+ free_unref_page_commit(zone, pcp, page, migratetype, order);
+ pcp_spin_unlock(pcp);
+ } else {
+ free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
}
-}
-
-/*
- * Free a 0-order page
- */
-void free_unref_page(struct page *page)
-{
- unsigned long flags;
- unsigned long pfn = page_to_pfn(page);
-
- if (!free_unref_page_prepare(page, pfn))
- return;
-
- local_irq_save(flags);
- free_unref_page_commit(page, pfn);
- local_irq_restore(flags);
+ pcp_trylock_finish(UP_flags);
}
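
The trylock-or-fall-back shape of free_unref_page() is a pattern in its own right: the free path never spins on the pcp lock, it simply hands the page to the buddy list when the lock is unavailable, for example when the free arrives from IRQ or SoftIRQ context as the comment in free_unref_page_list() below notes. A simplified userspace sketch of that pattern follows; every name in it is local to the sketch, not a kernel API. Build with: cc sketch.c -lpthread.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pcp_lock = PTHREAD_MUTEX_INITIALIZER;

static void free_to_buddy(int page)
{
        printf("page %d -> buddy list (fallback path)\n", page);
}

static void free_to_pcp(int page)
{
        printf("page %d -> per-cpu list (fast path)\n", page);
}

static void free_one(int page)
{
        /* Never block: either take the lock now or use the slower path. */
        if (pthread_mutex_trylock(&pcp_lock) == 0) {
                free_to_pcp(page);
                pthread_mutex_unlock(&pcp_lock);
        } else {
                free_to_buddy(page);
        }
}

int main(void)
{
        free_one(1);

        /* Simulate contention: holding the lock forces the fallback path. */
        pthread_mutex_lock(&pcp_lock);
        free_one(2);
        pthread_mutex_unlock(&pcp_lock);
        return 0;
}
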
/*
@@ -3159,37 +2476,84 @@ void free_unref_page(struct page *page)
*/
void free_unref_page_list(struct list_head *list)
{
+ unsigned long __maybe_unused UP_flags;
struct page *page, *next;
- unsigned long flags, pfn;
+ struct per_cpu_pages *pcp = NULL;
+ struct zone *locked_zone = NULL;
int batch_count = 0;
+ int migratetype;
/* Prepare pages for freeing */
list_for_each_entry_safe(page, next, list, lru) {
- pfn = page_to_pfn(page);
- if (!free_unref_page_prepare(page, pfn))
+ unsigned long pfn = page_to_pfn(page);
+ if (!free_unref_page_prepare(page, pfn, 0)) {
+ list_del(&page->lru);
+ continue;
+ }
+
+ /*
+ * Free isolated pages directly to the allocator, see
+ * comment in free_unref_page.
+ */
+ migratetype = get_pcppage_migratetype(page);
+ if (unlikely(is_migrate_isolate(migratetype))) {
list_del(&page->lru);
- set_page_private(page, pfn);
+ free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE);
+ continue;
+ }
}
- local_irq_save(flags);
list_for_each_entry_safe(page, next, list, lru) {
- unsigned long pfn = page_private(page);
+ struct zone *zone = page_zone(page);
- set_page_private(page, 0);
- trace_mm_page_free_batched(page);
- free_unref_page_commit(page, pfn);
+ list_del(&page->lru);
+ migratetype = get_pcppage_migratetype(page);
/*
- * Guard against excessive IRQ disabled times when we get
- * a large list of pages to free.
+ * Either different zone requiring a different pcp lock or
+ * excessive lock hold times when freeing a large list of
+ * pages.
*/
- if (++batch_count == SWAP_CLUSTER_MAX) {
- local_irq_restore(flags);
+ if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) {
+ if (pcp) {
+ pcp_spin_unlock(pcp);
+ pcp_trylock_finish(UP_flags);
+ }
+
batch_count = 0;
- local_irq_save(flags);
+
+ /*
+ * trylock is necessary as pages may be getting freed
+ * from IRQ or SoftIRQ context after an IO completion.
+ */
+ pcp_trylock_prepare(UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
+ if (unlikely(!pcp)) {
+ pcp_trylock_finish(UP_flags);
+ free_one_page(zone, page, page_to_pfn(page),
+ 0, migratetype, FPI_NONE);
+ locked_zone = NULL;
+ continue;
+ }
+ locked_zone = zone;
}
+
+ /*
+ * Non-isolated types over MIGRATE_PCPTYPES get added
+ * to the MIGRATE_MOVABLE pcp list.
+ */
+ if (unlikely(migratetype >= MIGRATE_PCPTYPES))
+ migratetype = MIGRATE_MOVABLE;
+
+ trace_mm_page_free_batched(page);
+ free_unref_page_commit(zone, pcp, page, migratetype, 0);
+ batch_count++;
+ }
+
+ if (pcp) {
+ pcp_spin_unlock(pcp);
+ pcp_trylock_finish(UP_flags);
}
- local_irq_restore(flags);
}
/*
@@ -3209,22 +2573,18 @@ void split_page(struct page *page, unsigned int order)
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
- split_page_owner(page, order);
+ split_page_owner(page, 1 << order);
+ split_page_memcg(page, 1 << order);
}
EXPORT_SYMBOL_GPL(split_page);
int __isolate_free_page(struct page *page, unsigned int order)
{
- unsigned long watermark;
- struct zone *zone;
- int mt;
-
- BUG_ON(!PageBuddy(page));
-
- zone = page_zone(page);
- mt = get_pageblock_migratetype(page);
+ struct zone *zone = page_zone(page);
+ int mt = get_pageblock_migratetype(page);
if (!is_migrate_isolate(mt)) {
+ unsigned long watermark;
/*
* Obey watermarks as if the page was being allocated. We can
* emulate a high-order watermark check with a raised order-0
@@ -3238,8 +2598,6 @@ int __isolate_free_page(struct page *page, unsigned int order)
__mod_zone_freepage_state(zone, -(1UL << order), mt);
}
- /* Remove page from free list */
-
del_page_from_free_list(page, zone, order);
/*
@@ -3250,14 +2608,16 @@ int __isolate_free_page(struct page *page, unsigned int order)
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
int mt = get_pageblock_migratetype(page);
- if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
- && !is_migrate_highatomic(mt))
+ /*
+ * Only change normal pageblocks (i.e., they can merge
+ * with others)
+ */
+ if (migratetype_is_mergeable(mt))
set_pageblock_migratetype(page,
MIGRATE_MOVABLE);
}
}
-
return 1UL << order;
}
@@ -3278,15 +2638,15 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt)
lockdep_assert_held(&zone->lock);
/* Return isolated page to tail of freelist. */
- __free_one_page(page, page_to_pfn(page), zone, order, mt, false);
+ __free_one_page(page, page_to_pfn(page), zone, order, mt,
+ FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
}
/*
* Update NUMA hit/miss statistics
- *
- * Must be called with interrupts disabled.
*/
-static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
+ long nr_account)
{
#ifdef CONFIG_NUMA
enum numa_stat_item local_stat = NUMA_LOCAL;
@@ -3299,17 +2659,66 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
local_stat = NUMA_OTHER;
if (zone_to_nid(z) == zone_to_nid(preferred_zone))
- __inc_numa_state(z, NUMA_HIT);
+ __count_numa_events(z, NUMA_HIT, nr_account);
else {
- __inc_numa_state(z, NUMA_MISS);
- __inc_numa_state(preferred_zone, NUMA_FOREIGN);
+ __count_numa_events(z, NUMA_MISS, nr_account);
+ __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account);
}
- __inc_numa_state(z, local_stat);
+ __count_numa_events(z, local_stat, nr_account);
#endif
}
+static __always_inline
+struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
+ unsigned int order, unsigned int alloc_flags,
+ int migratetype)
+{
+ struct page *page;
+ unsigned long flags;
+
+ do {
+ page = NULL;
+ spin_lock_irqsave(&zone->lock, flags);
+ /*
+ * order-0 request can reach here when the pcplist is skipped
+ * due to non-CMA allocation context. HIGHATOMIC area is
+ * reserved for high-order atomic allocation, so order-0
+ * request should skip it.
+ */
+ if (alloc_flags & ALLOC_HIGHATOMIC)
+ page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+ if (!page) {
+ page = __rmqueue(zone, order, migratetype, alloc_flags);
+
+ /*
+ * If the allocation fails, allow OOM handling access
+ * to HIGHATOMIC reserves as failing now is worse than
+ * failing a high-order atomic allocation in the
+ * future.
+ */
+ if (!page && (alloc_flags & ALLOC_OOM))
+ page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+
+ if (!page) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return NULL;
+ }
+ }
+ __mod_zone_freepage_state(zone, -(1 << order),
+ get_pcppage_migratetype(page));
+ spin_unlock_irqrestore(&zone->lock, flags);
+ } while (check_new_pages(page, order));
+
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+ zone_statistics(preferred_zone, zone, 1);
+
+ return page;
+}
+
/* Remove page from the per-cpu list, caller must protect the list */
-static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+static inline
+struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
+ int migratetype,
unsigned int alloc_flags,
struct per_cpu_pages *pcp,
struct list_head *list)
@@ -3318,184 +2727,125 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
do {
if (list_empty(list)) {
- pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, list,
+ int batch = READ_ONCE(pcp->batch);
+ int alloced;
+
+ /*
+ * Scale batch relative to order if batch implies
+ * free pages can be stored on the PCP. Batch can
+ * be 1 for small zones or for boot pagesets which
+ * should never store free pages as the pages may
+ * belong to arbitrary zones.
+ */
+ if (batch > 1)
+ batch = max(batch >> order, 2);
+ alloced = rmqueue_bulk(zone, order,
+ batch, list,
migratetype, alloc_flags);
+
+ pcp->count += alloced << order;
if (unlikely(list_empty(list)))
return NULL;
}
- page = list_first_entry(list, struct page, lru);
- list_del(&page->lru);
- pcp->count--;
- } while (check_new_pcp(page));
+ page = list_first_entry(list, struct page, pcp_list);
+ list_del(&page->pcp_list);
+ pcp->count -= 1 << order;
+ } while (check_new_pages(page, order));
return page;
}
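
The batch scaling comment above ("Scale batch relative to order...") boils down to a shift with a floor of two. A quick table generated by standalone C, using an example batch value:

#include <stdio.h>

int main(void)
{
        int batch = 64; /* example pcp->batch */
        unsigned int order;

        for (order = 0; order <= 9; order += 3) {
                int scaled = batch;

                if (scaled > 1) {
                        scaled >>= order;
                        if (scaled < 2)
                                scaled = 2;
                }
                printf("order %u: refill %d blocks (%d base pages)\n",
                       order, scaled, scaled << order);
        }
        return 0;
}

The refill therefore stays around one batch worth of base pages for the orders the PCP actually handles, with the floor of two only mattering once the shift would otherwise drop below a single block.
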
/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
- struct zone *zone, gfp_t gfp_flags,
+ struct zone *zone, unsigned int order,
int migratetype, unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
struct list_head *list;
struct page *page;
- unsigned long flags;
+ unsigned long __maybe_unused UP_flags;
- local_irq_save(flags);
- pcp = &this_cpu_ptr(zone->pageset)->pcp;
- list = &pcp->lists[migratetype];
- page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
+ /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
+ pcp_trylock_prepare(UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
+ if (!pcp) {
+ pcp_trylock_finish(UP_flags);
+ return NULL;
+ }
+
+ /*
+ * On allocation, reduce the number of pages that are batch freed.
+ * See nr_pcp_free() where free_factor is increased for subsequent
+ * frees.
+ */
+ pcp->free_factor >>= 1;
+ list = &pcp->lists[order_to_pindex(migratetype, order)];
+ page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
+ pcp_spin_unlock(pcp);
+ pcp_trylock_finish(UP_flags);
if (page) {
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
- zone_statistics(preferred_zone, zone);
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+ zone_statistics(preferred_zone, zone, 1);
}
- local_irq_restore(flags);
return page;
}
/*
- * Allocate a page from the given zone. Use pcplists for order-0 allocations.
+ * Allocate a page from the given zone.
+ * Use pcplists for THP or "cheap" high-order allocations.
*/
+
+/*
+ * Do not instrument rmqueue() with KMSAN. This function may call
+ * __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
+ * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
+ * may call rmqueue() again, which will result in a deadlock.
+ */
+__no_sanitize_memory
static inline
struct page *rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
gfp_t gfp_flags, unsigned int alloc_flags,
int migratetype)
{
- unsigned long flags;
struct page *page;
- if (likely(order == 0)) {
- /*
- * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
- * we need to skip it when CMA area isn't allowed.
- */
- if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
- migratetype != MIGRATE_MOVABLE) {
- page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
- migratetype, alloc_flags);
- goto out;
- }
- }
-
/*
* We most definitely don't want callers attempting to
* allocate greater than order-1 page units with __GFP_NOFAIL.
*/
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
- spin_lock_irqsave(&zone->lock, flags);
- do {
- page = NULL;
+ if (likely(pcp_allowed_order(order))) {
/*
- * order-0 request can reach here when the pcplist is skipped
- * due to non-CMA allocation context. HIGHATOMIC area is
- * reserved for high-order atomic allocation, so order-0
- * request should skip it.
+ * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
+ * we need to skip it when CMA area isn't allowed.
*/
- if (order > 0 && alloc_flags & ALLOC_HARDER) {
- page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
- if (page)
- trace_mm_page_alloc_zone_locked(page, order, migratetype);
+ if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
+ migratetype != MIGRATE_MOVABLE) {
+ page = rmqueue_pcplist(preferred_zone, zone, order,
+ migratetype, alloc_flags);
+ if (likely(page))
+ goto out;
}
- if (!page)
- page = __rmqueue(zone, order, migratetype, alloc_flags);
- } while (page && check_new_pages(page, order));
- spin_unlock(&zone->lock);
- if (!page)
- goto failed;
- __mod_zone_freepage_state(zone, -(1 << order),
- get_pcppage_migratetype(page));
+ }
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
- zone_statistics(preferred_zone, zone);
- local_irq_restore(flags);
+ page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
+ migratetype);
out:
/* Separate test+clear to avoid unnecessary atomics */
- if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
+ if ((alloc_flags & ALLOC_KSWAPD) &&
+ unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
wakeup_kswapd(zone, 0, 0, zone_idx(zone));
}
VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
return page;
-
-failed:
- local_irq_restore(flags);
- return NULL;
-}
-
-#ifdef CONFIG_FAIL_PAGE_ALLOC
-
-static struct {
- struct fault_attr attr;
-
- bool ignore_gfp_highmem;
- bool ignore_gfp_reclaim;
- u32 min_order;
-} fail_page_alloc = {
- .attr = FAULT_ATTR_INITIALIZER,
- .ignore_gfp_reclaim = true,
- .ignore_gfp_highmem = true,
- .min_order = 1,
-};
-
-static int __init setup_fail_page_alloc(char *str)
-{
- return setup_fault_attr(&fail_page_alloc.attr, str);
-}
-__setup("fail_page_alloc=", setup_fail_page_alloc);
-
-static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
-{
- if (order < fail_page_alloc.min_order)
- return false;
- if (gfp_mask & __GFP_NOFAIL)
- return false;
- if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
- return false;
- if (fail_page_alloc.ignore_gfp_reclaim &&
- (gfp_mask & __GFP_DIRECT_RECLAIM))
- return false;
-
- return should_fail(&fail_page_alloc.attr, 1 << order);
-}
-
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-
-static int __init fail_page_alloc_debugfs(void)
-{
- umode_t mode = S_IFREG | 0600;
- struct dentry *dir;
-
- dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
- &fail_page_alloc.attr);
-
- debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &fail_page_alloc.ignore_gfp_reclaim);
- debugfs_create_bool("ignore-gfp-highmem", mode, dir,
- &fail_page_alloc.ignore_gfp_highmem);
- debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
-
- return 0;
-}
-
-late_initcall(fail_page_alloc_debugfs);
-
-#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
-
-#else /* CONFIG_FAIL_PAGE_ALLOC */
-
-static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
-{
- return false;
}
-#endif /* CONFIG_FAIL_PAGE_ALLOC */
-
noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
return __should_fail_alloc_page(gfp_mask, order);
@@ -3505,15 +2855,14 @@ ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
static inline long __zone_watermark_unusable_free(struct zone *z,
unsigned int order, unsigned int alloc_flags)
{
- const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
long unusable_free = (1 << order) - 1;
/*
- * If the caller does not have rights to ALLOC_HARDER then subtract
- * the high-atomic reserves. This will over-estimate the size of the
- * atomic reserve but it avoids a search.
+ * If the caller does not have rights to reserves below the min
+ * watermark then subtract the high-atomic reserves. This will
+ * over-estimate the size of the atomic reserve but it avoids a search.
*/
- if (likely(!alloc_harder))
+ if (likely(!(alloc_flags & ALLOC_RESERVES)))
unusable_free += z->nr_reserved_highatomic;
#ifdef CONFIG_CMA
@@ -3521,6 +2870,9 @@ static inline long __zone_watermark_unusable_free(struct zone *z,
if (!(alloc_flags & ALLOC_CMA))
unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
+#ifdef CONFIG_UNACCEPTED_MEMORY
+ unusable_free += zone_page_state(z, NR_UNACCEPTED);
+#endif
return unusable_free;
}
@@ -3537,25 +2889,37 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
{
long min = mark;
int o;
- const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
/* free_pages may go negative - that's OK */
free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
- if (alloc_flags & ALLOC_HIGH)
- min -= min / 2;
+ if (unlikely(alloc_flags & ALLOC_RESERVES)) {
+ /*
+ * __GFP_HIGH allows access to 50% of the min reserve as well
+ * as OOM.
+ */
+ if (alloc_flags & ALLOC_MIN_RESERVE) {
+ min -= min / 2;
+
+ /*
+ * Non-blocking allocations (e.g. GFP_ATOMIC) can
+ * access more reserves than just __GFP_HIGH. Other
+ * non-blocking allocations requests such as GFP_NOWAIT
+ * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get
+ * access to the min reserve.
+ */
+ if (alloc_flags & ALLOC_NON_BLOCK)
+ min -= min / 4;
+ }
- if (unlikely(alloc_harder)) {
/*
- * OOM victims can try even harder than normal ALLOC_HARDER
+ * OOM victims can try even harder than the normal reserve
* users on the grounds that it's definitely going to be in
* the exit path shortly and free memory. Any allocation it
* makes during the free path will be small and short-lived.
*/
if (alloc_flags & ALLOC_OOM)
min -= min / 2;
- else
- min -= min / 4;
}
/*
@@ -3571,7 +2935,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
return true;
/* For a high-order request, check at least one suitable page is free */
- for (o = order; o < MAX_ORDER; o++) {
+ for (o = order; o <= MAX_ORDER; o++) {
struct free_area *area = &z->free_area[o];
int mt;
@@ -3589,8 +2953,10 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
return true;
}
#endif
- if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
+ if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) &&
+ !free_area_empty(area, MIGRATE_HIGHATOMIC)) {
return true;
+ }
}
return false;
}
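
The reworked reserve handling above compounds simple fractions of the min watermark. A worked example assuming a min watermark of 400 pages; the helper only mirrors the ALLOC_MIN_RESERVE / ALLOC_NON_BLOCK / ALLOC_OOM branches in the hunk, and the absolute numbers are illustrative.

#include <stdio.h>
#include <stdbool.h>

static long effective_min(long min, bool min_reserve, bool non_block, bool oom)
{
        if (min_reserve) {
                min -= min / 2;         /* __GFP_HIGH: may use 50% of the reserve */
                if (non_block)
                        min -= min / 4; /* non-blocking: another quarter of that */
        }
        if (oom)
                min -= min / 2;         /* OOM victims: halve whatever remains */
        return min;
}

int main(void)
{
        long min = 400; /* hypothetical WMARK_MIN, in pages */

        printf("no reserve flags       -> threshold %ld\n", effective_min(min, false, false, false));
        printf("__GFP_HIGH             -> threshold %ld\n", effective_min(min, true, false, false));
        printf("GFP_ATOMIC (non-block) -> threshold %ld\n", effective_min(min, true, true, false));
        printf("OOM victim             -> threshold %ld\n", effective_min(min, false, false, true));
        return 0;
}

So a non-blocking __GFP_HIGH allocation may dip to 3/8 of the min watermark (150 of 400 here), a plain __GFP_HIGH one stops at half, and the exact depth depends on which ALLOC_* flags the caller ends up with.
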
@@ -3615,24 +2981,29 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
* need to be calculated.
*/
if (!order) {
- long fast_free;
+ long usable_free;
+ long reserved;
+
+ usable_free = free_pages;
+ reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
- fast_free = free_pages;
- fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags);
- if (fast_free > mark + z->lowmem_reserve[highest_zoneidx])
+ /* reserved may overestimate high-atomic reserves. */
+ usable_free -= min(usable_free, reserved);
+ if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
return true;
}
if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
free_pages))
return true;
+
/*
- * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
+ * Ignore watermark boosting for __GFP_HIGH order-0 allocations
* when checking the min watermark. The min watermark is the
* point where boosting is ignored so that kswapd is woken up
* when below the low watermark.
*/
- if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
+ if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost
&& ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
mark = z->_watermark[WMARK_MIN];
return __zone_watermark_ok(z, order, mark, highest_zoneidx,
@@ -3655,6 +3026,8 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
}
#ifdef CONFIG_NUMA
+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
+
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
@@ -3707,16 +3080,13 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
return alloc_flags;
}
-static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
- unsigned int alloc_flags)
+/* Must be called after current_gfp_context() which can change gfp_mask */
+static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask,
+ unsigned int alloc_flags)
{
#ifdef CONFIG_CMA
- unsigned int pflags = current->flags;
-
- if (!(pflags & PF_MEMALLOC_NOCMA) &&
- gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
-
#endif
return alloc_flags;
}
@@ -3731,13 +3101,14 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
{
struct zoneref *z;
struct zone *zone;
- struct pglist_data *last_pgdat_dirty_limit = NULL;
+ struct pglist_data *last_pgdat = NULL;
+ bool last_pgdat_dirty_ok = false;
bool no_fallback;
retry:
/*
* Scan zonelist, looking for a zone with enough free.
- * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
+ * See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
@@ -3770,13 +3141,13 @@ retry:
* dirty-throttling and the flusher threads.
*/
if (ac->spread_dirty_pages) {
- if (last_pgdat_dirty_limit == zone->zone_pgdat)
- continue;
+ if (last_pgdat != zone->zone_pgdat) {
+ last_pgdat = zone->zone_pgdat;
+ last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat);
+ }
- if (!node_dirty_ok(zone->zone_pgdat)) {
- last_pgdat_dirty_limit = zone->zone_pgdat;
+ if (!last_pgdat_dirty_ok)
continue;
- }
}
if (no_fallback && nr_online_nodes > 1 &&
@@ -3801,12 +3172,17 @@ retry:
gfp_mask)) {
int ret;
+ if (has_unaccepted_memory()) {
+ if (try_to_accept_memory(zone, order))
+ goto try_this_zone;
+ }
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* Watermark failed for this zone, but see if we can
* grow this zone if it contains deferred pages.
*/
- if (static_branch_unlikely(&deferred_pages)) {
+ if (deferred_pages_enabled()) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
@@ -3816,7 +3192,7 @@ retry:
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
- if (node_reclaim_mode == 0 ||
+ if (!node_reclaim_enabled() ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;
@@ -3848,14 +3224,19 @@ try_this_zone:
* If this is a high-order atomic allocation then check
* if the pageblock should be reserved for the future
*/
- if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
+ if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
reserve_highatomic_pageblock(page, zone, order);
return page;
} else {
+ if (has_unaccepted_memory()) {
+ if (try_to_accept_memory(zone, order))
+ goto try_this_zone;
+ }
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Try again if zone has deferred pages */
- if (static_branch_unlikely(&deferred_pages)) {
+ if (deferred_pages_enabled()) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
@@ -3888,10 +3269,10 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
if (tsk_is_oom_victim(current) ||
(current->flags & (PF_MEMALLOC | PF_EXITING)))
filter &= ~SHOW_MEM_FILTER_NODES;
- if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
+ if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
filter &= ~SHOW_MEM_FILTER_NODES;
- show_mem(filter, nodemask);
+ __show_mem(filter, nodemask, gfp_zone(gfp_mask));
}
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
@@ -3900,7 +3281,9 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
va_list args;
static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
- if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
+ if ((gfp_mask & __GFP_NOWARN) ||
+ !__ratelimit(&nopage_rs) ||
+ ((gfp_mask & __GFP_DMA) && !has_managed_dma()))
return;
va_start(args, fmt);
@@ -4007,7 +3390,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
*/
/* Exhausted what can be done so it's blame time */
- if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
+ if (out_of_memory(&oc) ||
+ WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) {
*did_some_progress = 1;
/*
@@ -4024,7 +3408,7 @@ out:
}
/*
- * Maximum number of compaction retries wit a progress before OOM
+ * Maximum number of compaction retries with a progress before OOM
 * killer is considered the only way to move forward.
*/
#define MAX_COMPACT_RETRIES 16
@@ -4044,6 +3428,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
return NULL;
psi_memstall_enter(&pflags);
+ delayacct_compact_start();
noreclaim_flag = memalloc_noreclaim_save();
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
@@ -4051,7 +3436,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
memalloc_noreclaim_restore(noreclaim_flag);
psi_memstall_leave(&pflags);
+ delayacct_compact_end();
+ if (*compact_result == COMPACT_SKIPPED)
+ return NULL;
/*
* At least in one zone compaction wasn't deferred or skipped, so let's
* count a compaction stall
@@ -4101,56 +3489,44 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
if (!order)
return false;
- if (compaction_made_progress(compact_result))
- (*compaction_retries)++;
-
- /*
- * compaction considers all the zone as desperately out of memory
- * so it doesn't really make much sense to retry except when the
- * failure could be caused by insufficient priority
- */
- if (compaction_failed(compact_result))
- goto check_priority;
+ if (fatal_signal_pending(current))
+ return false;
/*
- * compaction was skipped because there are not enough order-0 pages
- * to work with, so we retry only if it looks like reclaim can help.
+ * Compaction was skipped due to a lack of free order-0
+ * migration targets. Continue if reclaim can help.
*/
- if (compaction_needs_reclaim(compact_result)) {
+ if (compact_result == COMPACT_SKIPPED) {
ret = compaction_zonelist_suitable(ac, order, alloc_flags);
goto out;
}
/*
- * make sure the compaction wasn't deferred or didn't bail out early
- * due to locks contention before we declare that we should give up.
- * But the next retry should use a higher priority if allowed, so
- * we don't just keep bailing out endlessly.
+ * Compaction managed to coalesce some page blocks, but the
+ * allocation failed presumably due to a race. Retry some.
*/
- if (compaction_withdrawn(compact_result)) {
- goto check_priority;
- }
+ if (compact_result == COMPACT_SUCCESS) {
+ /*
+ * !costly requests are much more important than
+ * __GFP_RETRY_MAYFAIL costly ones because they are de
+ * facto nofail and invoke OOM killer to move on while
+ * costly can fail and users are ready to cope with
+ * that. 1/4 retries is rather arbitrary but we would
+ * need much more detailed feedback from compaction to
+ * make a better decision.
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ max_retries /= 4;
- /*
- * !costly requests are much more important than __GFP_RETRY_MAYFAIL
- * costly ones because they are de facto nofail and invoke OOM
- * killer to move on while costly can fail and users are ready
- * to cope with that. 1/4 retries is rather arbitrary but we
- * would need much more detailed feedback from compaction to
- * make a better decision.
- */
- if (order > PAGE_ALLOC_COSTLY_ORDER)
- max_retries /= 4;
- if (*compaction_retries <= max_retries) {
- ret = true;
- goto out;
+ if (++(*compaction_retries) <= max_retries) {
+ ret = true;
+ goto out;
+ }
}
/*
- * Make sure there are attempts at the highest priority if we exhausted
- * all retries or failed at the lower priorities.
+ * Compaction failed. Retry with increasing priority.
*/
-check_priority:
min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
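For a concrete feel of the retry budget in the rewritten logic above: with MAX_COMPACT_RETRIES defined as 16 earlier in this file and PAGE_ALLOC_COSTLY_ORDER being 3, a non-costly request may retry after a raced COMPACT_SUCCESS up to 16 times, while a costly one gets only 16 / 4 = 4 attempts before falling through to the priority handling. An illustrative trace (order 9 chosen as an example of a THP-sized request):

	int max_retries = MAX_COMPACT_RETRIES;	/* 16 */

	if (order > PAGE_ALLOC_COSTLY_ORDER)	/* e.g. order 9: 9 > 3 */
		max_retries /= 4;		/* budget drops to 4 retries */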
@@ -4205,10 +3581,8 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
static struct lockdep_map __fs_reclaim_map =
STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
-static bool __need_fs_reclaim(gfp_t gfp_mask)
+static bool __need_reclaim(gfp_t gfp_mask)
{
- gfp_mask = current_gfp_context(gfp_mask);
-
/* no reclaim without waiting on it */
if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
return false;
@@ -4217,54 +3591,87 @@ static bool __need_fs_reclaim(gfp_t gfp_mask)
if (current->flags & PF_MEMALLOC)
return false;
- /* We're only interested __GFP_FS allocations for now */
- if (!(gfp_mask & __GFP_FS))
- return false;
-
if (gfp_mask & __GFP_NOLOCKDEP)
return false;
return true;
}
-void __fs_reclaim_acquire(void)
+void __fs_reclaim_acquire(unsigned long ip)
{
- lock_map_acquire(&__fs_reclaim_map);
+ lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip);
}
-void __fs_reclaim_release(void)
+void __fs_reclaim_release(unsigned long ip)
{
- lock_map_release(&__fs_reclaim_map);
+ lock_release(&__fs_reclaim_map, ip);
}
void fs_reclaim_acquire(gfp_t gfp_mask)
{
- if (__need_fs_reclaim(gfp_mask))
- __fs_reclaim_acquire();
+ gfp_mask = current_gfp_context(gfp_mask);
+
+ if (__need_reclaim(gfp_mask)) {
+ if (gfp_mask & __GFP_FS)
+ __fs_reclaim_acquire(_RET_IP_);
+
+#ifdef CONFIG_MMU_NOTIFIER
+ lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
+ lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+#endif
+
+ }
}
EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
void fs_reclaim_release(gfp_t gfp_mask)
{
- if (__need_fs_reclaim(gfp_mask))
- __fs_reclaim_release();
+ gfp_mask = current_gfp_context(gfp_mask);
+
+ if (__need_reclaim(gfp_mask)) {
+ if (gfp_mask & __GFP_FS)
+ __fs_reclaim_release(_RET_IP_);
+ }
}
EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif
+/*
+ * Zonelists may change due to hotplug during allocation. Detect when zonelists
+ * have been rebuilt so allocations can be retried. Reader side does not lock and
+ * retries the allocation if zonelist changes. Writer side is protected by the
+ * embedded spin_lock.
+ */
+static DEFINE_SEQLOCK(zonelist_update_seq);
+
+static unsigned int zonelist_iter_begin(void)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return read_seqbegin(&zonelist_update_seq);
+
+ return 0;
+}
+
+static unsigned int check_retry_zonelist(unsigned int seq)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return read_seqretry(&zonelist_update_seq, seq);
+
+ return seq;
+}
+
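The zonelist_iter_begin()/check_retry_zonelist() pair above is a plain seqcount read-side retry. Below is a minimal user-space analogue of that pattern using C11 atomics; every name (zonelist_seq, first_fallback_nid, iter_begin, iter_retry) is illustrative and the memory ordering is simplified compared to the kernel's seqlock primitives.

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint zonelist_seq;       /* even = stable, odd = rebuild in progress */
static atomic_int  first_fallback_nid; /* stand-in for the zonelist data */

static unsigned int iter_begin(void)
{
	unsigned int seq;

	do {	/* wait out an in-flight writer */
		seq = atomic_load_explicit(&zonelist_seq, memory_order_acquire);
	} while (seq & 1);
	return seq;
}

static int iter_retry(unsigned int seq)
{
	atomic_thread_fence(memory_order_acquire);
	return atomic_load_explicit(&zonelist_seq, memory_order_relaxed) != seq;
}

static void rebuild_zonelists(int new_nid)	/* writer side, serialized elsewhere */
{
	atomic_fetch_add_explicit(&zonelist_seq, 1, memory_order_release);
	atomic_store_explicit(&first_fallback_nid, new_nid, memory_order_relaxed);
	atomic_fetch_add_explicit(&zonelist_seq, 1, memory_order_release);
}

int main(void)
{
	unsigned int seq;
	int nid;

	rebuild_zonelists(1);
	do {
		seq = iter_begin();
		nid = atomic_load_explicit(&first_fallback_nid, memory_order_relaxed);
	} while (iter_retry(seq));	/* restart the walk if a rebuild raced */

	printf("first fallback node: %d\n", nid);
	return 0;
}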
/* Perform direct synchronous page reclaim */
static unsigned long
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac)
{
unsigned int noreclaim_flag;
- unsigned long pflags, progress;
+ unsigned long progress;
cond_resched();
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
- psi_memstall_enter(&pflags);
fs_reclaim_acquire(gfp_mask);
noreclaim_flag = memalloc_noreclaim_save();
@@ -4273,7 +3680,6 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(gfp_mask);
- psi_memstall_leave(&pflags);
cond_resched();
@@ -4287,11 +3693,13 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
unsigned long *did_some_progress)
{
struct page *page = NULL;
+ unsigned long pflags;
bool drained = false;
+ psi_memstall_enter(&pflags);
*did_some_progress = __perform_reclaim(gfp_mask, order, ac);
if (unlikely(!(*did_some_progress)))
- return NULL;
+ goto out;
retry:
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
@@ -4307,6 +3715,8 @@ retry:
drained = true;
goto retry;
}
+out:
+ psi_memstall_leave(&pflags);
return page;
}
@@ -4321,50 +3731,60 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
ac->nodemask) {
- if (last_pgdat != zone->zone_pgdat)
+ if (!managed_zone(zone))
+ continue;
+ if (last_pgdat != zone->zone_pgdat) {
wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
- last_pgdat = zone->zone_pgdat;
+ last_pgdat = zone->zone_pgdat;
+ }
}
}
static inline unsigned int
-gfp_to_alloc_flags(gfp_t gfp_mask)
+gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
{
unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
/*
- * __GFP_HIGH is assumed to be the same as ALLOC_HIGH
+ * __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE
* and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
* to save two branches.
*/
- BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
+ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE);
BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
/*
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
- * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
+ * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH).
*/
alloc_flags |= (__force int)
(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
- if (gfp_mask & __GFP_ATOMIC) {
+ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
/*
* Not worth trying to allocate harder for __GFP_NOMEMALLOC even
* if it can't schedule.
*/
- if (!(gfp_mask & __GFP_NOMEMALLOC))
- alloc_flags |= ALLOC_HARDER;
+ if (!(gfp_mask & __GFP_NOMEMALLOC)) {
+ alloc_flags |= ALLOC_NON_BLOCK;
+
+ if (order > 0)
+ alloc_flags |= ALLOC_HIGHATOMIC;
+ }
+
/*
- * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
- * comment for __cpuset_node_allowed().
+ * Ignore cpuset mems for non-blocking __GFP_HIGH (probably
+ * GFP_ATOMIC) rather than fail, see the comment for
+ * cpuset_node_allowed().
*/
- alloc_flags &= ~ALLOC_CPUSET;
- } else if (unlikely(rt_task(current)) && !in_interrupt())
- alloc_flags |= ALLOC_HARDER;
+ if (alloc_flags & ALLOC_MIN_RESERVE)
+ alloc_flags &= ~ALLOC_CPUSET;
+ } else if (unlikely(rt_task(current)) && in_task())
+ alloc_flags |= ALLOC_MIN_RESERVE;
- alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);
+ alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
return alloc_flags;
}
@@ -4474,30 +3894,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
trace_reclaim_retry_zone(z, order, reclaimable,
available, min_wmark, *no_progress_loops, wmark);
if (wmark) {
- /*
- * If we didn't make any progress and have a lot of
- * dirty + writeback pages then we should wait for
- * an IO to complete to slow down the reclaim and
- * prevent from pre mature OOM
- */
- if (!did_some_progress) {
- unsigned long write_pending;
-
- write_pending = zone_page_state_snapshot(zone,
- NR_ZONE_WRITE_PENDING);
-
- if (2 * write_pending > reclaimable) {
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- return true;
- }
- }
-
ret = true;
- goto out;
+ break;
}
}
-out:
/*
* Memory allocation/reclaim might be called from a WQ context and the
* current implementation of the WQ concurrency control doesn't
@@ -4559,28 +3960,22 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
int compaction_retries;
int no_progress_loops;
unsigned int cpuset_mems_cookie;
+ unsigned int zonelist_iter_cookie;
int reserve_flags;
- /*
- * We also sanity check to catch abuse of atomic reserves being used by
- * callers that are not in atomic context.
- */
- if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
- (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
- gfp_mask &= ~__GFP_ATOMIC;
-
-retry_cpuset:
+restart:
compaction_retries = 0;
no_progress_loops = 0;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist_iter_cookie = zonelist_iter_begin();
/*
* The fast path uses conservative alloc_flags to succeed only until
* kswapd needs to be woken up, and to avoid the cost of setting up
* alloc_flags precisely. So we do that now.
*/
- alloc_flags = gfp_to_alloc_flags(gfp_mask);
+ alloc_flags = gfp_to_alloc_flags(gfp_mask, order);
/*
* We need to recalculate the starting point for the zonelist iterator
@@ -4593,6 +3988,19 @@ retry_cpuset:
if (!ac->preferred_zoneref->zone)
goto nopage;
+ /*
+ * Check for insane configurations where the cpuset doesn't contain
+ * any suitable zone to satisfy the request - e.g. non-movable
+ * GFP_HIGHUSER allocations from MOVABLE nodes only.
+ */
+ if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) {
+ struct zoneref *z = first_zones_zonelist(ac->zonelist,
+ ac->highest_zoneidx,
+ &cpuset_current_mems_allowed);
+ if (!z->zone)
+ goto nopage;
+ }
+
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
@@ -4666,7 +4074,8 @@ retry:
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
if (reserve_flags)
- alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
+ alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) |
+ (alloc_flags & ALLOC_KSWAPD);
/*
* Reset the nodemask and zonelist iterators if memory policies can be
@@ -4732,9 +4141,13 @@ retry:
goto retry;
- /* Deal with possible cpuset update races before we start OOM killing */
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
- goto retry_cpuset;
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * an unnecessary OOM kill.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
@@ -4754,9 +4167,13 @@ retry:
}
nopage:
- /* Deal with possible cpuset update races before we fail */
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
- goto retry_cpuset;
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * an unnecessary OOM kill.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
/*
* Make sure that __GFP_NOFAIL request doesn't leak out and make sure
@@ -4767,7 +4184,7 @@ nopage:
* All existing users of the __GFP_NOFAIL are blockable, so warn
* of any new users that actually require GFP_NOWAIT
*/
- if (WARN_ON_ONCE(!can_direct_reclaim))
+ if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask))
goto fail;
/*
@@ -4775,7 +4192,7 @@ nopage:
* because we cannot reclaim anything and only can loop waiting
* for somebody to do a work for us
*/
- WARN_ON_ONCE(current->flags & PF_MEMALLOC);
+ WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask);
/*
* non failing costly orders are a hard requirement which we
@@ -4783,15 +4200,16 @@ nopage:
* so that we can identify them and convert them to something
* else.
*/
- WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
+ WARN_ON_ONCE_GFP(costly_order, gfp_mask);
/*
- * Help non-failing allocations by giving them access to memory
- * reserves but do not use ALLOC_NO_WATERMARKS because this
+ * Help non-failing allocations by giving some access to memory
+ * reserves normally used for high priority non-blocking
+ * allocations but do not use ALLOC_NO_WATERMARKS because this
* could deplete whole memory reserves which would just make
- * the situation worse
+ * the situation worse.
*/
- page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
+ page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac);
if (page)
goto got_pg;
@@ -4807,7 +4225,7 @@ got_pg:
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
int preferred_nid, nodemask_t *nodemask,
- struct alloc_context *ac, gfp_t *alloc_mask,
+ struct alloc_context *ac, gfp_t *alloc_gfp,
unsigned int *alloc_flags)
{
ac->highest_zoneidx = gfp_zone(gfp_mask);
@@ -4816,26 +4234,23 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
ac->migratetype = gfp_migratetype(gfp_mask);
if (cpusets_enabled()) {
- *alloc_mask |= __GFP_HARDWALL;
+ *alloc_gfp |= __GFP_HARDWALL;
/*
* When we are in the interrupt context, it is irrelevant
* to the current task context. It means that any node is ok.
*/
- if (!in_interrupt() && !ac->nodemask)
+ if (in_task() && !ac->nodemask)
ac->nodemask = &cpuset_current_mems_allowed;
else
*alloc_flags |= ALLOC_CPUSET;
}
- fs_reclaim_acquire(gfp_mask);
- fs_reclaim_release(gfp_mask);
-
- might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
+ might_alloc(gfp_mask);
if (should_fail_alloc_page(gfp_mask, order))
return false;
- *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
+ *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
/* Dirty zone balancing only done in the fast path */
ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
@@ -4852,49 +4267,218 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
}
/*
+ * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
+ * @gfp: GFP flags for the allocation
+ * @preferred_nid: The preferred NUMA node ID to allocate from
+ * @nodemask: Set of nodes to allocate from, may be NULL
+ * @nr_pages: The number of pages desired on the list or array
+ * @page_list: Optional list to store the allocated pages
+ * @page_array: Optional array to store the pages
+ *
+ * This is a batched version of the page allocator that attempts to
+ * allocate nr_pages quickly. Pages are added to page_list if page_list
+ * is not NULL, otherwise it is assumed that the page_array is valid.
+ *
+ * For lists, nr_pages is the number of pages that should be allocated.
+ *
+ * For arrays, only NULL elements are populated with pages and nr_pages
+ * is the maximum number of pages that will be stored in the array.
+ *
+ * Returns the number of pages on the list or array.
+ */
+unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
+ nodemask_t *nodemask, int nr_pages,
+ struct list_head *page_list,
+ struct page **page_array)
+{
+ struct page *page;
+ unsigned long __maybe_unused UP_flags;
+ struct zone *zone;
+ struct zoneref *z;
+ struct per_cpu_pages *pcp;
+ struct list_head *pcp_list;
+ struct alloc_context ac;
+ gfp_t alloc_gfp;
+ unsigned int alloc_flags = ALLOC_WMARK_LOW;
+ int nr_populated = 0, nr_account = 0;
+
+ /*
+ * Skip populated array elements to determine if any pages need
+ * to be allocated before disabling IRQs.
+ */
+ while (page_array && nr_populated < nr_pages && page_array[nr_populated])
+ nr_populated++;
+
+ /* No pages requested? */
+ if (unlikely(nr_pages <= 0))
+ goto out;
+
+ /* Already populated array? */
+ if (unlikely(page_array && nr_pages - nr_populated == 0))
+ goto out;
+
+ /* Bulk allocator does not support memcg accounting. */
+ if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT))
+ goto failed;
+
+ /* Use the single page allocator for one page. */
+ if (nr_pages - nr_populated == 1)
+ goto failed;
+
+#ifdef CONFIG_PAGE_OWNER
+ /*
+ * PAGE_OWNER may recurse into the allocator to allocate space to
+ * save the stack with pagesets.lock held. Releasing/reacquiring
+ * removes much of the performance benefit of bulk allocation so
+ * force the caller to allocate one page at a time, as that performs
+ * similarly without adding complexity to the bulk allocator.
+ */
+ if (static_branch_unlikely(&page_owner_inited))
+ goto failed;
+#endif
+
+ /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
+ gfp &= gfp_allowed_mask;
+ alloc_gfp = gfp;
+ if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
+ goto out;
+ gfp = alloc_gfp;
+
+ /* Find an allowed local zone that meets the low watermark. */
+ for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
+ unsigned long mark;
+
+ if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
+ !__cpuset_zone_allowed(zone, gfp)) {
+ continue;
+ }
+
+ if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
+ zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) {
+ goto failed;
+ }
+
+ mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
+ if (zone_watermark_fast(zone, 0, mark,
+ zonelist_zone_idx(ac.preferred_zoneref),
+ alloc_flags, gfp)) {
+ break;
+ }
+ }
+
+ /*
+ * If there are no allowed local zones that meet the watermarks then
+ * try to allocate a single page and reclaim if necessary.
+ */
+ if (unlikely(!zone))
+ goto failed;
+
+ /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
+ pcp_trylock_prepare(UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
+ if (!pcp)
+ goto failed_irq;
+
+ /* Attempt the batch allocation */
+ pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
+ while (nr_populated < nr_pages) {
+
+ /* Skip existing pages */
+ if (page_array && page_array[nr_populated]) {
+ nr_populated++;
+ continue;
+ }
+
+ page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
+ pcp, pcp_list);
+ if (unlikely(!page)) {
+ /* Try and allocate at least one page */
+ if (!nr_account) {
+ pcp_spin_unlock(pcp);
+ goto failed_irq;
+ }
+ break;
+ }
+ nr_account++;
+
+ prep_new_page(page, 0, gfp, 0);
+ if (page_list)
+ list_add(&page->lru, page_list);
+ else
+ page_array[nr_populated] = page;
+ nr_populated++;
+ }
+
+ pcp_spin_unlock(pcp);
+ pcp_trylock_finish(UP_flags);
+
+ __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
+ zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
+
+out:
+ return nr_populated;
+
+failed_irq:
+ pcp_trylock_finish(UP_flags);
+
+failed:
+ page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
+ if (page) {
+ if (page_list)
+ list_add(&page->lru, page_list);
+ else
+ page_array[nr_populated] = page;
+ nr_populated++;
+ }
+
+ goto out;
+}
+EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
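A hedged usage sketch for the bulk interface added above (not part of this patch): a hypothetical caller fills a small array of order-0 pages, checks how many slots were populated, and releases them again.

static void bulk_fill_example(void)
{
	struct page *pages[16] = { NULL };
	unsigned long filled;

	/* Only NULL slots are filled; the return value is the total populated. */
	filled = __alloc_pages_bulk(GFP_KERNEL, numa_mem_id(), NULL,
				    ARRAY_SIZE(pages), NULL, pages);
	if (filled < ARRAY_SIZE(pages))
		pr_debug("bulk alloc returned %lu of %zu pages\n",
			 filled, ARRAY_SIZE(pages));

	while (filled)
		__free_pages(pages[--filled], 0);
}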
+
+/*
* This is the 'heart' of the zoned buddy allocator.
*/
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
+struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
- gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
+ gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };
/*
* There are several places where we assume that the order value is sane
* so bail out early if the request is out of bound.
*/
- if (unlikely(order >= MAX_ORDER)) {
- WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
+ if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp))
return NULL;
- }
- gfp_mask &= gfp_allowed_mask;
- alloc_mask = gfp_mask;
- if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
+ gfp &= gfp_allowed_mask;
+ /*
+ * Apply scoped allocation constraints. This is mainly about GFP_NOFS
+ * resp. GFP_NOIO which has to be inherited for all allocation requests
+ * from a particular context which has been marked by
+ * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures
+ * movable zones are not used during allocation.
+ */
+ gfp = current_gfp_context(gfp);
+ alloc_gfp = gfp;
+ if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
+ &alloc_gfp, &alloc_flags))
return NULL;
/*
* Forbid the first pass from falling back to types that fragment
* memory until all local zones are considered.
*/
- alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
+ alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp);
/* First allocation attempt */
- page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
+ page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
if (likely(page))
goto out;
- /*
- * Apply scoped allocation constraints. This is mainly about GFP_NOFS
- * resp. GFP_NOIO which has to be inherited for all allocation requests
- * from a particular context which has been marked by
- * memalloc_no{fs,io}_{save,restore}.
- */
- alloc_mask = current_gfp_context(gfp_mask);
+ alloc_gfp = gfp;
ac.spread_dirty_pages = false;
/*
@@ -4903,20 +4487,33 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
*/
ac.nodemask = nodemask;
- page = __alloc_pages_slowpath(alloc_mask, order, &ac);
+ page = __alloc_pages_slowpath(alloc_gfp, order, &ac);
out:
- if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
- unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) {
+ if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page &&
+ unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
__free_pages(page, order);
page = NULL;
}
- trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
+ trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
+ kmsan_alloc_page(page, order, alloc_gfp);
return page;
}
-EXPORT_SYMBOL(__alloc_pages_nodemask);
+EXPORT_SYMBOL(__alloc_pages);
+
+struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
+ nodemask_t *nodemask)
+{
+ struct page *page = __alloc_pages(gfp | __GFP_COMP, order,
+ preferred_nid, nodemask);
+
+ if (page && order > 1)
+ prep_transhuge_page(page);
+ return (struct folio *)page;
+}
+EXPORT_SYMBOL(__folio_alloc);
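A minimal, illustrative call of the new __folio_alloc() (most code would go through the folio_alloc() wrapper instead); numa_node_id() and the NULL nodemask are just example arguments.

static void folio_alloc_example(void)
{
	struct folio *folio = __folio_alloc(GFP_KERNEL | __GFP_ZERO, 2,
					    numa_node_id(), NULL);

	if (folio)
		folio_put(folio);	/* drops the only reference, freeing all 4 pages */
}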
/*
* Common helper functions. Never use with __GFP_HIGHMEM because the returned
@@ -4936,23 +4533,38 @@ EXPORT_SYMBOL(__get_free_pages);
unsigned long get_zeroed_page(gfp_t gfp_mask)
{
- return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
+ return __get_free_page(gfp_mask | __GFP_ZERO);
}
EXPORT_SYMBOL(get_zeroed_page);
-static inline void free_the_page(struct page *page, unsigned int order)
-{
- if (order == 0) /* Via pcp? */
- free_unref_page(page);
- else
- __free_pages_ok(page, order);
-}
-
+/**
+ * __free_pages - Free pages allocated with alloc_pages().
+ * @page: The page pointer returned from alloc_pages().
+ * @order: The order of the allocation.
+ *
+ * This function can free multi-page allocations that are not compound
+ * pages. It does not check that the @order passed in matches that of
+ * the allocation, so it is easy to leak memory. Freeing more memory
+ * than was allocated will probably emit a warning.
+ *
+ * If the last reference to this page is speculative, it will be released
+ * by put_page() which only frees the first page of a non-compound
+ * allocation. To prevent the remaining pages from being leaked, we free
+ * the subsequent pages here. If you want to use the page's reference
+ * count to decide when to free the allocation, you should allocate a
+ * compound page, and use put_page() instead of __free_pages().
+ *
+ * Context: May be called in interrupt context or while holding a normal
+ * spinlock, but not in NMI context or while holding a raw spinlock.
+ */
void __free_pages(struct page *page, unsigned int order)
{
+ /* get PageHead before we drop reference */
+ int head = PageHead(page);
+
if (put_page_testzero(page))
free_the_page(page, order);
- else if (!PageHead(page))
+ else if (!head)
while (order-- > 0)
free_the_page(page + (1 << order), order);
}
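To illustrate the kernel-doc above, a hedged sketch contrasting a non-compound multi-page allocation, which must be freed with __free_pages() and a matching order, with a compound allocation that can be dropped through its reference count:

static void free_pages_example(void)
{
	struct page *plain = alloc_pages(GFP_KERNEL, 2);
	struct page *comp  = alloc_pages(GFP_KERNEL | __GFP_COMP, 2);

	if (plain)
		__free_pages(plain, 2);	/* order must match the allocation */
	if (comp)
		put_page(comp);		/* compound: the head reference frees all four pages */
}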
@@ -5009,8 +4621,9 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
}
EXPORT_SYMBOL(__page_frag_cache_drain);
-void *page_frag_alloc(struct page_frag_cache *nc,
- unsigned int fragsz, gfp_t gfp_mask)
+void *page_frag_alloc_align(struct page_frag_cache *nc,
+ unsigned int fragsz, gfp_t gfp_mask,
+ unsigned int align_mask)
{
unsigned int size = PAGE_SIZE;
struct page *page;
@@ -5044,6 +4657,11 @@ refill:
if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
goto refill;
+ if (unlikely(nc->pfmemalloc)) {
+ free_the_page(page, compound_order(page));
+ goto refill;
+ }
+
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
/* if size can vary use size else just use PAGE_SIZE */
size = nc->size;
@@ -5054,14 +4672,27 @@ refill:
/* reset page count bias and offset to start of new frag */
nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
offset = size - fragsz;
+ if (unlikely(offset < 0)) {
+ /*
+ * The caller is trying to allocate a fragment
+ * with fragsz > PAGE_SIZE but the cache isn't big
+ * enough to satisfy the request; this may
+ * happen in low memory conditions.
+ * We don't release the cache page because
+ * it could make memory pressure worse,
+ * so we simply return NULL here.
+ */
+ return NULL;
+ }
}
nc->pagecnt_bias--;
+ offset &= align_mask;
nc->offset = offset;
return nc->va + offset;
}
-EXPORT_SYMBOL(page_frag_alloc);
+EXPORT_SYMBOL(page_frag_alloc_align);
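An illustrative use of the new aligned entry point: the cache is assumed to be a struct page_frag_cache owned by the caller (per-CPU or per-device), and the mask ~(64u - 1) rounds the returned offset down to a 64-byte boundary.

static void frag_align_example(struct page_frag_cache *nc)
{
	void *frag;

	frag = page_frag_alloc_align(nc, 256, GFP_ATOMIC, ~(64u - 1));
	if (frag) {
		/* ... fill the 256-byte, 64-byte aligned fragment ... */
		page_frag_free(frag);
	}
}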
/*
* Frees a page fragment allocated out of either a compound or order 0 page.
@@ -5079,14 +4710,18 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
size_t size)
{
if (addr) {
- unsigned long alloc_end = addr + (PAGE_SIZE << order);
- unsigned long used = addr + PAGE_ALIGN(size);
+ unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE);
+ struct page *page = virt_to_page((void *)addr);
+ struct page *last = page + nr;
- split_page(virt_to_page((void *)addr), order);
- while (used < alloc_end) {
- free_page(used);
- used += PAGE_SIZE;
- }
+ split_page_owner(page, 1 << order);
+ split_page_memcg(page, 1 << order);
+ while (page < --last)
+ set_page_refcounted(last);
+
+ last = page + (1UL << order);
+ for (page += nr; page < last; page++)
+ __free_pages_ok(page, 0, FPI_TO_TAIL);
}
return (void *)addr;
}
@@ -5111,8 +4746,8 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
unsigned int order = get_order(size);
unsigned long addr;
- if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
- gfp_mask &= ~__GFP_COMP;
+ if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
+ gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
addr = __get_free_pages(gfp_mask, order);
return make_alloc_exact(addr, order, size);
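A small usage sketch for alloc_pages_exact() with 4 KiB pages: a 10000-byte request rounds up to an order-2 (16 KiB) block internally, keeps the first three pages, and returns the unused fourth page to the buddy allocator.

static void alloc_exact_example(void)
{
	void *buf = alloc_pages_exact(10000, GFP_KERNEL);

	if (buf) {
		/* 12 KiB (3 pages) usable; pair with the exact-free helper. */
		free_pages_exact(buf, 10000);
	}
}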
@@ -5136,8 +4771,8 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
unsigned int order = get_order(size);
struct page *p;
- if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
- gfp_mask &= ~__GFP_COMP;
+ if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
+ gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
p = alloc_pages_node(nid, gfp_mask, order);
if (!p)
@@ -5211,357 +4846,6 @@ unsigned long nr_free_buffer_pages(void)
}
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
-static inline void show_node(struct zone *zone)
-{
- if (IS_ENABLED(CONFIG_NUMA))
- printk("Node %d ", zone_to_nid(zone));
-}
-
-long si_mem_available(void)
-{
- long available;
- unsigned long pagecache;
- unsigned long wmark_low = 0;
- unsigned long pages[NR_LRU_LISTS];
- unsigned long reclaimable;
- struct zone *zone;
- int lru;
-
- for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
- pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
-
- for_each_zone(zone)
- wmark_low += low_wmark_pages(zone);
-
- /*
- * Estimate the amount of memory available for userspace allocations,
- * without causing swapping.
- */
- available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
-
- /*
- * Not all the page cache can be freed, otherwise the system will
- * start swapping. Assume at least half of the page cache, or the
- * low watermark worth of cache, needs to stay.
- */
- pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
- pagecache -= min(pagecache / 2, wmark_low);
- available += pagecache;
-
- /*
- * Part of the reclaimable slab and other kernel memory consists of
- * items that are in use, and cannot be freed. Cap this estimate at the
- * low watermark.
- */
- reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
- global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
- available += reclaimable - min(reclaimable / 2, wmark_low);
-
- if (available < 0)
- available = 0;
- return available;
-}
-EXPORT_SYMBOL_GPL(si_mem_available);
-
-void si_meminfo(struct sysinfo *val)
-{
- val->totalram = totalram_pages();
- val->sharedram = global_node_page_state(NR_SHMEM);
- val->freeram = global_zone_page_state(NR_FREE_PAGES);
- val->bufferram = nr_blockdev_pages();
- val->totalhigh = totalhigh_pages();
- val->freehigh = nr_free_highpages();
- val->mem_unit = PAGE_SIZE;
-}
-
-EXPORT_SYMBOL(si_meminfo);
-
-#ifdef CONFIG_NUMA
-void si_meminfo_node(struct sysinfo *val, int nid)
-{
- int zone_type; /* needs to be signed */
- unsigned long managed_pages = 0;
- unsigned long managed_highpages = 0;
- unsigned long free_highpages = 0;
- pg_data_t *pgdat = NODE_DATA(nid);
-
- for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
- managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
- val->totalram = managed_pages;
- val->sharedram = node_page_state(pgdat, NR_SHMEM);
- val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
-#ifdef CONFIG_HIGHMEM
- for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
- struct zone *zone = &pgdat->node_zones[zone_type];
-
- if (is_highmem(zone)) {
- managed_highpages += zone_managed_pages(zone);
- free_highpages += zone_page_state(zone, NR_FREE_PAGES);
- }
- }
- val->totalhigh = managed_highpages;
- val->freehigh = free_highpages;
-#else
- val->totalhigh = managed_highpages;
- val->freehigh = free_highpages;
-#endif
- val->mem_unit = PAGE_SIZE;
-}
-#endif
-
-/*
- * Determine whether the node should be displayed or not, depending on whether
- * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
- */
-static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
-{
- if (!(flags & SHOW_MEM_FILTER_NODES))
- return false;
-
- /*
- * no node mask - aka implicit memory numa policy. Do not bother with
- * the synchronization - read_mems_allowed_begin - because we do not
- * have to be precise here.
- */
- if (!nodemask)
- nodemask = &cpuset_current_mems_allowed;
-
- return !node_isset(nid, *nodemask);
-}
-
-#define K(x) ((x) << (PAGE_SHIFT-10))
-
-static void show_migration_types(unsigned char type)
-{
- static const char types[MIGRATE_TYPES] = {
- [MIGRATE_UNMOVABLE] = 'U',
- [MIGRATE_MOVABLE] = 'M',
- [MIGRATE_RECLAIMABLE] = 'E',
- [MIGRATE_HIGHATOMIC] = 'H',
-#ifdef CONFIG_CMA
- [MIGRATE_CMA] = 'C',
-#endif
-#ifdef CONFIG_MEMORY_ISOLATION
- [MIGRATE_ISOLATE] = 'I',
-#endif
- };
- char tmp[MIGRATE_TYPES + 1];
- char *p = tmp;
- int i;
-
- for (i = 0; i < MIGRATE_TYPES; i++) {
- if (type & (1 << i))
- *p++ = types[i];
- }
-
- *p = '\0';
- printk(KERN_CONT "(%s) ", tmp);
-}
-
-/*
- * Show free area list (used inside shift_scroll-lock stuff)
- * We also calculate the percentage fragmentation. We do this by counting the
- * memory on each free list with the exception of the first item on the list.
- *
- * Bits in @filter:
- * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
- * cpuset.
- */
-void show_free_areas(unsigned int filter, nodemask_t *nodemask)
-{
- unsigned long free_pcp = 0;
- int cpu;
- struct zone *zone;
- pg_data_t *pgdat;
-
- for_each_populated_zone(zone) {
- if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
- continue;
-
- for_each_online_cpu(cpu)
- free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
- }
-
- printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
- " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
- " unevictable:%lu dirty:%lu writeback:%lu\n"
- " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
- " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
- " free:%lu free_pcp:%lu free_cma:%lu\n",
- global_node_page_state(NR_ACTIVE_ANON),
- global_node_page_state(NR_INACTIVE_ANON),
- global_node_page_state(NR_ISOLATED_ANON),
- global_node_page_state(NR_ACTIVE_FILE),
- global_node_page_state(NR_INACTIVE_FILE),
- global_node_page_state(NR_ISOLATED_FILE),
- global_node_page_state(NR_UNEVICTABLE),
- global_node_page_state(NR_FILE_DIRTY),
- global_node_page_state(NR_WRITEBACK),
- global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
- global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
- global_node_page_state(NR_FILE_MAPPED),
- global_node_page_state(NR_SHMEM),
- global_zone_page_state(NR_PAGETABLE),
- global_zone_page_state(NR_BOUNCE),
- global_zone_page_state(NR_FREE_PAGES),
- free_pcp,
- global_zone_page_state(NR_FREE_CMA_PAGES));
-
- for_each_online_pgdat(pgdat) {
- if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
- continue;
-
- printk("Node %d"
- " active_anon:%lukB"
- " inactive_anon:%lukB"
- " active_file:%lukB"
- " inactive_file:%lukB"
- " unevictable:%lukB"
- " isolated(anon):%lukB"
- " isolated(file):%lukB"
- " mapped:%lukB"
- " dirty:%lukB"
- " writeback:%lukB"
- " shmem:%lukB"
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- " shmem_thp: %lukB"
- " shmem_pmdmapped: %lukB"
- " anon_thp: %lukB"
-#endif
- " writeback_tmp:%lukB"
- " kernel_stack:%lukB"
-#ifdef CONFIG_SHADOW_CALL_STACK
- " shadow_call_stack:%lukB"
-#endif
- " all_unreclaimable? %s"
- "\n",
- pgdat->node_id,
- K(node_page_state(pgdat, NR_ACTIVE_ANON)),
- K(node_page_state(pgdat, NR_INACTIVE_ANON)),
- K(node_page_state(pgdat, NR_ACTIVE_FILE)),
- K(node_page_state(pgdat, NR_INACTIVE_FILE)),
- K(node_page_state(pgdat, NR_UNEVICTABLE)),
- K(node_page_state(pgdat, NR_ISOLATED_ANON)),
- K(node_page_state(pgdat, NR_ISOLATED_FILE)),
- K(node_page_state(pgdat, NR_FILE_MAPPED)),
- K(node_page_state(pgdat, NR_FILE_DIRTY)),
- K(node_page_state(pgdat, NR_WRITEBACK)),
- K(node_page_state(pgdat, NR_SHMEM)),
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
- K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
- * HPAGE_PMD_NR),
- K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
-#endif
- K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
- node_page_state(pgdat, NR_KERNEL_STACK_KB),
-#ifdef CONFIG_SHADOW_CALL_STACK
- node_page_state(pgdat, NR_KERNEL_SCS_KB),
-#endif
- pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
- "yes" : "no");
- }
-
- for_each_populated_zone(zone) {
- int i;
-
- if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
- continue;
-
- free_pcp = 0;
- for_each_online_cpu(cpu)
- free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
-
- show_node(zone);
- printk(KERN_CONT
- "%s"
- " free:%lukB"
- " min:%lukB"
- " low:%lukB"
- " high:%lukB"
- " reserved_highatomic:%luKB"
- " active_anon:%lukB"
- " inactive_anon:%lukB"
- " active_file:%lukB"
- " inactive_file:%lukB"
- " unevictable:%lukB"
- " writepending:%lukB"
- " present:%lukB"
- " managed:%lukB"
- " mlocked:%lukB"
- " pagetables:%lukB"
- " bounce:%lukB"
- " free_pcp:%lukB"
- " local_pcp:%ukB"
- " free_cma:%lukB"
- "\n",
- zone->name,
- K(zone_page_state(zone, NR_FREE_PAGES)),
- K(min_wmark_pages(zone)),
- K(low_wmark_pages(zone)),
- K(high_wmark_pages(zone)),
- K(zone->nr_reserved_highatomic),
- K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
- K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
- K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
- K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
- K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
- K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
- K(zone->present_pages),
- K(zone_managed_pages(zone)),
- K(zone_page_state(zone, NR_MLOCK)),
- K(zone_page_state(zone, NR_PAGETABLE)),
- K(zone_page_state(zone, NR_BOUNCE)),
- K(free_pcp),
- K(this_cpu_read(zone->pageset->pcp.count)),
- K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
- printk("lowmem_reserve[]:");
- for (i = 0; i < MAX_NR_ZONES; i++)
- printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
- printk(KERN_CONT "\n");
- }
-
- for_each_populated_zone(zone) {
- unsigned int order;
- unsigned long nr[MAX_ORDER], flags, total = 0;
- unsigned char types[MAX_ORDER];
-
- if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
- continue;
- show_node(zone);
- printk(KERN_CONT "%s: ", zone->name);
-
- spin_lock_irqsave(&zone->lock, flags);
- for (order = 0; order < MAX_ORDER; order++) {
- struct free_area *area = &zone->free_area[order];
- int type;
-
- nr[order] = area->nr_free;
- total += nr[order] << order;
-
- types[order] = 0;
- for (type = 0; type < MIGRATE_TYPES; type++) {
- if (!free_area_empty(area, type))
- types[order] |= 1 << type;
- }
- }
- spin_unlock_irqrestore(&zone->lock, flags);
- for (order = 0; order < MAX_ORDER; order++) {
- printk(KERN_CONT "%lu*%lukB ",
- nr[order], K(1UL) << order);
- if (nr[order])
- show_migration_types(types[order]);
- }
- printk(KERN_CONT "= %lukB\n", K(total));
- }
-
- hugetlb_show_meminfo();
-
- printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
-
- show_swap_cache_info();
-}
-
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
zoneref->zone = zone;
@@ -5582,7 +4866,7 @@ static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
do {
zone_type--;
zone = pgdat->node_zones + zone_type;
- if (managed_zone(zone)) {
+ if (populated_zone(zone)) {
zoneref_set_zone(zone, &zonerefs[nr_zones++]);
check_highest_zone(zone_type);
}
@@ -5596,7 +4880,7 @@ static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
static int __parse_numa_zonelist_order(char *s)
{
/*
- * We used to support different zonlists modes but they turned
+ * We used to support different zonelists modes but they turned
* out to be just not useful. Let's keep the warning in place
* if somebody still uses the cmd line parameter so that we do
* not fail it silently
@@ -5608,12 +4892,12 @@ static int __parse_numa_zonelist_order(char *s)
return 0;
}
-char numa_zonelist_order[] = "Node";
-
+static char numa_zonelist_order[] = "Node";
+#define NUMA_ZONELIST_ORDER_LEN 16
/*
* sysctl handler for numa_zonelist_order
*/
-int numa_zonelist_order_handler(struct ctl_table *table, int write,
+static int numa_zonelist_order_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
if (write)
@@ -5621,8 +4905,6 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
return proc_dostring(table, write, buffer, length, ppos);
}
-
-#define MAX_NODE_LOAD (nr_online_nodes)
static int node_load[MAX_NUMNODES];
/**
@@ -5640,7 +4922,7 @@ static int node_load[MAX_NUMNODES];
*
* Return: node id of the found node or %NUMA_NO_NODE if no node is found.
*/
-static int find_next_best_node(int node, nodemask_t *used_node_mask)
+int find_next_best_node(int node, nodemask_t *used_node_mask)
{
int n, val;
int min_val = INT_MAX;
@@ -5669,7 +4951,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
val += PENALTY_FOR_NODE_WITH_CPUS;
/* Slight preference for less loaded node */
- val *= (MAX_NODE_LOAD*MAX_NUMNODES);
+ val *= MAX_NUMNODES;
val += node_load[n];
if (val < min_val) {
@@ -5735,13 +5017,12 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
static void build_zonelists(pg_data_t *pgdat)
{
static int node_order[MAX_NUMNODES];
- int node, load, nr_nodes = 0;
+ int node, nr_nodes = 0;
nodemask_t used_mask = NODE_MASK_NONE;
int local_node, prev_node;
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
- load = nr_online_nodes;
prev_node = local_node;
memset(node_order, 0, sizeof(node_order));
@@ -5753,15 +5034,18 @@ static void build_zonelists(pg_data_t *pgdat)
*/
if (node_distance(local_node, node) !=
node_distance(local_node, prev_node))
- node_load[node] = load;
+ node_load[node] += 1;
node_order[nr_nodes++] = node;
prev_node = node;
- load--;
}
build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
build_thisnode_zonelists(pgdat);
+ pr_info("Fallback order for Node %d: ", local_node);
+ for (node = 0; node < nr_nodes; node++)
+ pr_cont("%d ", node_order[node]);
+ pr_cont("\n");
}
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
@@ -5840,18 +5124,34 @@ static void build_zonelists(pg_data_t *pgdat)
* not check if the processor is online before following the pageset pointer.
* Other parts of the kernel may not check if the zone is available.
*/
-static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
-static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
-static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
+static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats);
+/* These effectively disable the pcplists in the boot pageset completely */
+#define BOOT_PAGESET_HIGH 0
+#define BOOT_PAGESET_BATCH 1
+static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
+static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
static void __build_all_zonelists(void *data)
{
int nid;
int __maybe_unused cpu;
pg_data_t *self = data;
- static DEFINE_SPINLOCK(lock);
+ unsigned long flags;
- spin_lock(&lock);
+ /*
+ * Explicitly disable this CPU's interrupts before taking seqlock
+ * to prevent any IRQ handler from calling into the page allocator
+ * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock.
+ */
+ local_irq_save(flags);
+ /*
+ * Explicitly disable this CPU's synchronous printk() before taking
+ * seqlock to prevent any printk() from trying to hold port->lock, for
+ * tty_insert_flip_string_and_push_buffer() on other CPU might be
+ * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
+ */
+ printk_deferred_enter();
+ write_seqlock(&zonelist_update_seq);
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
@@ -5864,7 +5164,11 @@ static void __build_all_zonelists(void *data)
if (self && !node_online(self->node_id)) {
build_zonelists(self);
} else {
- for_each_online_node(nid) {
+ /*
+ * All possible nodes have pgdat preallocated
+ * in free_area_init
+ */
+ for_each_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
build_zonelists(pgdat);
@@ -5884,7 +5188,9 @@ static void __build_all_zonelists(void *data)
#endif
}
- spin_unlock(&lock);
+ write_sequnlock(&zonelist_update_seq);
+ printk_deferred_exit();
+ local_irq_restore(flags);
}
static noinline void __init
@@ -5908,7 +5214,7 @@ build_all_zonelists_init(void)
* (a chicken-egg dilemma).
*/
for_each_possible_cpu(cpu)
- setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+ per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu));
mminit_verify_zonelist();
cpuset_init_current_mems_allowed();
@@ -5953,214 +5259,18 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
#endif
}
-/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
-static bool __meminit
-overlap_memmap_init(unsigned long zone, unsigned long *pfn)
-{
- static struct memblock_region *r;
-
- if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
- if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
- for_each_mem_region(r) {
- if (*pfn < memblock_region_memory_end_pfn(r))
- break;
- }
- }
- if (*pfn >= memblock_region_memory_base_pfn(r) &&
- memblock_is_mirror(r)) {
- *pfn = memblock_region_memory_end_pfn(r);
- return true;
- }
- }
- return false;
-}
-
-/*
- * Initially all pages are reserved - free ones are freed
- * up by memblock_free_all() once the early boot process is
- * done. Non-atomic initialization, single-pass.
- */
-void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
- unsigned long start_pfn, enum meminit_context context,
- struct vmem_altmap *altmap)
-{
- unsigned long pfn, end_pfn = start_pfn + size;
- struct page *page;
-
- if (highest_memmap_pfn < end_pfn - 1)
- highest_memmap_pfn = end_pfn - 1;
-
-#ifdef CONFIG_ZONE_DEVICE
- /*
- * Honor reservation requested by the driver for this ZONE_DEVICE
- * memory. We limit the total number of pages to initialize to just
- * those that might contain the memory mapping. We will defer the
- * ZONE_DEVICE page initialization until after we have released
- * the hotplug lock.
- */
- if (zone == ZONE_DEVICE) {
- if (!altmap)
- return;
-
- if (start_pfn == altmap->base_pfn)
- start_pfn += altmap->reserve;
- end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
- }
-#endif
-
- for (pfn = start_pfn; pfn < end_pfn; ) {
- /*
- * There can be holes in boot-time mem_map[]s handed to this
- * function. They do not exist on hotplugged memory.
- */
- if (context == MEMINIT_EARLY) {
- if (overlap_memmap_init(zone, &pfn))
- continue;
- if (defer_init(nid, pfn, end_pfn))
- break;
- }
-
- page = pfn_to_page(pfn);
- __init_single_page(page, pfn, zone, nid);
- if (context == MEMINIT_HOTPLUG)
- __SetPageReserved(page);
-
- /*
- * Mark the block movable so that blocks are reserved for
- * movable at startup. This will force kernel allocations
- * to reserve their blocks rather than leaking throughout
- * the address space during boot when many long-lived
- * kernel allocations are made.
- *
- * bitmap is created for zone's valid pfn range. but memmap
- * can be created for invalid pages (for alignment)
- * check here not to call set_pageblock_migratetype() against
- * pfn out of zone.
- */
- if (!(pfn & (pageblock_nr_pages - 1))) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- cond_resched();
- }
- pfn++;
- }
-}
-
-#ifdef CONFIG_ZONE_DEVICE
-void __ref memmap_init_zone_device(struct zone *zone,
- unsigned long start_pfn,
- unsigned long nr_pages,
- struct dev_pagemap *pgmap)
-{
- unsigned long pfn, end_pfn = start_pfn + nr_pages;
- struct pglist_data *pgdat = zone->zone_pgdat;
- struct vmem_altmap *altmap = pgmap_altmap(pgmap);
- unsigned long zone_idx = zone_idx(zone);
- unsigned long start = jiffies;
- int nid = pgdat->node_id;
-
- if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
- return;
-
- /*
- * The call to memmap_init_zone should have already taken care
- * of the pages reserved for the memmap, so we can just jump to
- * the end of that region and start processing the device pages.
- */
- if (altmap) {
- start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
- nr_pages = end_pfn - start_pfn;
- }
-
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- struct page *page = pfn_to_page(pfn);
-
- __init_single_page(page, pfn, zone_idx, nid);
-
- /*
- * Mark page reserved as it will need to wait for onlining
- * phase for it to be fully associated with a zone.
- *
- * We can use the non-atomic __set_bit operation for setting
- * the flag as we are still initializing the pages.
- */
- __SetPageReserved(page);
-
- /*
- * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
- * and zone_device_data. It is a bug if a ZONE_DEVICE page is
- * ever freed or placed on a driver-private list.
- */
- page->pgmap = pgmap;
- page->zone_device_data = NULL;
-
- /*
- * Mark the block movable so that blocks are reserved for
- * movable at startup. This will force kernel allocations
- * to reserve their blocks rather than leaking throughout
- * the address space during boot when many long-lived
- * kernel allocations are made.
- *
- * bitmap is created for zone's valid pfn range. but memmap
- * can be created for invalid pages (for alignment)
- * check here not to call set_pageblock_migratetype() against
- * pfn out of zone.
- *
- * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
- * because this is done early in section_activate()
- */
- if (!(pfn & (pageblock_nr_pages - 1))) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- cond_resched();
- }
- }
-
- pr_info("%s initialised %lu pages in %ums\n", __func__,
- nr_pages, jiffies_to_msecs(jiffies - start));
-}
-
-#endif
-static void __meminit zone_init_free_lists(struct zone *zone)
-{
- unsigned int order, t;
- for_each_migratetype_order(order, t) {
- INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
- zone->free_area[order].nr_free = 0;
- }
-}
-
-void __meminit __weak memmap_init(unsigned long size, int nid,
- unsigned long zone,
- unsigned long range_start_pfn)
-{
- unsigned long start_pfn, end_pfn;
- unsigned long range_end_pfn = range_start_pfn + size;
- int i;
-
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
- start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
- end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
-
- if (end_pfn > start_pfn) {
- size = end_pfn - start_pfn;
- memmap_init_zone(size, nid, zone, start_pfn,
- MEMINIT_EARLY, NULL);
- }
- }
-}
-
static int zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
int batch;
/*
- * The per-cpu-pages pools are set to around 1000th of the
- * size of the zone.
+ * The number of pages to batch allocate is either ~0.1%
+ * of the zone or 1MB, whichever is smaller. The batch
+ * size strikes a balance between allocation latency
+ * and zone lock contention.
*/
- batch = zone_managed_pages(zone) / 1024;
- /* But no more than a meg. */
- if (batch * PAGE_SIZE > 1024 * 1024)
- batch = (1024 * 1024) / PAGE_SIZE;
+ batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
batch /= 4; /* We effectively *= 4 below */
if (batch < 1)
batch = 1;
@@ -6197,14 +5307,66 @@ static int zone_batchsize(struct zone *zone)
#endif
}
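A worked example of the heuristic above, as a user-space sketch. It assumes 4 KiB pages and a zone with 1,000,000 managed pages; any further rounding done by zone_batchsize() outside this hunk is not shown.

#include <stdio.h>

int main(void)
{
	unsigned long managed = 1000000;		/* ~3.8 GiB of 4 KiB pages */
	unsigned long cap = (1024 * 1024) / 4096;	/* 1 MiB worth = 256 pages */
	unsigned long batch = managed >> 10;		/* ~0.1% of the zone = 976 */

	if (batch > cap)
		batch = cap;				/* capped at 256 */
	batch /= 4;					/* effectively *= 4 later */
	if (batch < 1)
		batch = 1;
	printf("pcp batch = %lu pages\n", batch);	/* prints 64 */
	return 0;
}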
+static int percpu_pagelist_high_fraction;
+static int zone_highsize(struct zone *zone, int batch, int cpu_online)
+{
+#ifdef CONFIG_MMU
+ int high;
+ int nr_split_cpus;
+ unsigned long total_pages;
+
+ if (!percpu_pagelist_high_fraction) {
+ /*
+ * By default, the high value of the pcp is based on the zone
+ * low watermark so that if they are full then background
+ * reclaim will not be started prematurely.
+ */
+ total_pages = low_wmark_pages(zone);
+ } else {
+ /*
+ * If percpu_pagelist_high_fraction is configured, the high
+ * value is based on a fraction of the managed pages in the
+ * zone.
+ */
+ total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
+ }
+
+ /*
+ * Split the high value across all online CPUs local to the zone. Note
+ * that early in boot that CPUs may not be online yet and that during
+ * CPU hotplug that the cpumask is not yet updated when a CPU is being
+ * onlined. For memory nodes that have no CPUs, split pcp->high across
+ * all online CPUs to mitigate the risk that reclaim is triggered
+ * prematurely due to pages stored on pcp lists.
+ */
+ nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
+ if (!nr_split_cpus)
+ nr_split_cpus = num_online_cpus();
+ high = total_pages / nr_split_cpus;
+
+ /*
+ * Ensure high is at least batch*4. The multiple is based on the
+ * historical relationship between high and batch.
+ */
+ high = max(high, batch << 2);
+
+ return high;
+#else
+ return 0;
+#endif
+}
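A worked example of the default sizing above, with illustrative numbers only: percpu_pagelist_high_fraction unset, a zone low watermark of 16384 pages, 8 CPUs local to the node, and a batch of 64.

#include <stdio.h>

int main(void)
{
	unsigned long low_wmark = 16384, nr_split_cpus = 8, batch = 64;
	unsigned long high = low_wmark / nr_split_cpus;	/* 2048 pages per CPU */

	if (high < batch * 4)				/* keep the historic high:batch ratio */
		high = batch * 4;
	printf("pcp high = %lu pages\n", high);		/* prints 2048 */
	return 0;
}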
+
/*
- * pcp->high and pcp->batch values are related and dependent on one another:
- * ->batch must never be higher then ->high.
- * The following function updates them in a safe manner without read side
- * locking.
+ * pcp->high and pcp->batch values are related and generally batch is lower
+ * than high. They are also related to pcp->count such that count is lower
+ * than high, and as soon as it reaches high, the pcplist is flushed.
*
- * Any new users of pcp->batch and pcp->high should ensure they can cope with
- * those fields changing asynchronously (acording to the above rule).
+ * However, guaranteeing these relations at all times would require e.g. write
+ * barriers here but also careful usage of read barriers at the read side, and
+ * thus be prone to error and bad for performance. Thus the update only prevents
+ * store tearing. Any new users of pcp->batch and pcp->high should ensure they
+ * can cope with those fields changing asynchronously, and fully trust only the
+ * pcp->count field on the local CPU with interrupts disabled.
*
* mutex_is_locked(&pcp_batch_high_lock) required when calling this function
* outside of boot time (or some other assurance that no concurrent updaters
@@ -6213,80 +5375,95 @@ static int zone_batchsize(struct zone *zone)
static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
unsigned long batch)
{
- /* start with a fail safe value for batch */
- pcp->batch = 1;
- smp_wmb();
-
- /* Update high, then batch, in order */
- pcp->high = high;
- smp_wmb();
-
- pcp->batch = batch;
+ WRITE_ONCE(pcp->batch, batch);
+ WRITE_ONCE(pcp->high, high);
}
-/* a companion to pageset_set_high() */
-static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
+static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
{
- pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
-}
+ int pindex;
-static void pageset_init(struct per_cpu_pageset *p)
-{
- struct per_cpu_pages *pcp;
- int migratetype;
+ memset(pcp, 0, sizeof(*pcp));
+ memset(pzstats, 0, sizeof(*pzstats));
- memset(p, 0, sizeof(*p));
+ spin_lock_init(&pcp->lock);
+ for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
+ INIT_LIST_HEAD(&pcp->lists[pindex]);
- pcp = &p->pcp;
- for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
- INIT_LIST_HEAD(&pcp->lists[migratetype]);
+ /*
+ * Set batch and high values safe for a boot pageset. A true percpu
+ * pageset's initialization will update them subsequently. Here we don't
+ * need to be as careful as pageset_update() as nobody can access the
+ * pageset yet.
+ */
+ pcp->high = BOOT_PAGESET_HIGH;
+ pcp->batch = BOOT_PAGESET_BATCH;
+ pcp->free_factor = 0;
}
-static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
+ unsigned long batch)
{
- pageset_init(p);
- pageset_set_batch(p, batch);
+ struct per_cpu_pages *pcp;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ pageset_update(pcp, high, batch);
+ }
}
/*
- * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
- * to the value high for the pageset p.
+ * Calculate and set new high and batch values for all per-cpu pagesets of a
+ * zone based on the zone's size.
*/
-static void pageset_set_high(struct per_cpu_pageset *p,
- unsigned long high)
+static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
{
- unsigned long batch = max(1UL, high / 4);
- if ((high / 4) > (PAGE_SHIFT * 8))
- batch = PAGE_SHIFT * 8;
+ int new_high, new_batch;
- pageset_update(&p->pcp, high, batch);
-}
+ new_batch = max(1, zone_batchsize(zone));
+ new_high = zone_highsize(zone, new_batch, cpu_online);
-static void pageset_set_high_and_batch(struct zone *zone,
- struct per_cpu_pageset *pcp)
-{
- if (percpu_pagelist_fraction)
- pageset_set_high(pcp,
- (zone_managed_pages(zone) /
- percpu_pagelist_fraction));
- else
- pageset_set_batch(pcp, zone_batchsize(zone));
-}
+ if (zone->pageset_high == new_high &&
+ zone->pageset_batch == new_batch)
+ return;
-static void __meminit zone_pageset_init(struct zone *zone, int cpu)
-{
- struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
+ zone->pageset_high = new_high;
+ zone->pageset_batch = new_batch;
- pageset_init(pcp);
- pageset_set_high_and_batch(zone, pcp);
+ __zone_set_pageset_high_and_batch(zone, new_high, new_batch);
}
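
A sketch of the shape of the update path above: recompute from the zone size, bail out early when the cached values already match, otherwise fan the new values out to every CPU. The sizing rule and the array standing in for per-CPU data are made up for illustration.

#include <stdio.h>

#define NCPUS 4

struct fake_pcp { unsigned long high, batch; };

struct fake_zone {
        unsigned long cached_high, cached_batch;
        struct fake_pcp pcp[NCPUS];
};

/* Stand-in for __zone_set_pageset_high_and_batch(): touch every CPU. */
static void fan_out(struct fake_zone *z, unsigned long high, unsigned long batch)
{
        for (int cpu = 0; cpu < NCPUS; cpu++) {
                z->pcp[cpu].batch = batch;
                z->pcp[cpu].high = high;
        }
}

/* Hypothetical sizing rule standing in for zone_batchsize()/zone_highsize(). */
static void set_high_and_batch(struct fake_zone *z, unsigned long managed_pages)
{
        unsigned long batch = managed_pages / 1024 > 1 ? managed_pages / 1024 : 1;
        unsigned long high = 6 * batch;

        /* Skip the per-CPU fan-out when nothing would change. */
        if (z->cached_high == high && z->cached_batch == batch)
                return;

        z->cached_high = high;
        z->cached_batch = batch;
        fan_out(z, high, batch);
}

int main(void)
{
        struct fake_zone z = { 0 };

        set_high_and_batch(&z, 1 << 18);        /* 1 GiB of 4 KiB pages */
        set_high_and_batch(&z, 1 << 18);        /* no-op: cached values match */
        printf("high=%lu batch=%lu\n", z.pcp[0].high, z.pcp[0].batch);
        return 0;
}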
void __meminit setup_zone_pageset(struct zone *zone)
{
int cpu;
- zone->pageset = alloc_percpu(struct per_cpu_pageset);
- for_each_possible_cpu(cpu)
- zone_pageset_init(zone, cpu);
+
+ /* Size may be 0 on !SMP && !NUMA */
+ if (sizeof(struct per_cpu_zonestat) > 0)
+ zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);
+
+ zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages);
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_pages *pcp;
+ struct per_cpu_zonestat *pzstats;
+
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+ per_cpu_pages_init(pcp, pzstats);
+ }
+
+ zone_set_pageset_high_and_batch(zone, 0);
+}
+
+/*
+ * The zone indicated has a new number of managed_pages; batch sizes and percpu
+ * page high values need to be recalculated.
+ */
+static void zone_pcp_update(struct zone *zone, int cpu_online)
+{
+ mutex_lock(&pcp_batch_high_lock);
+ zone_set_pageset_high_and_batch(zone, cpu_online);
+ mutex_unlock(&pcp_batch_high_lock);
}
/*
@@ -6310,9 +5487,9 @@ void __init setup_per_cpu_pageset(void)
* the nodes these zones are associated with.
*/
for_each_possible_cpu(cpu) {
- struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu);
- memset(pcp->vm_numa_stat_diff, 0,
- sizeof(pcp->vm_numa_stat_diff));
+ struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
+ memset(pzstats->vm_numa_event, 0,
+ sizeof(pzstats->vm_numa_event));
}
#endif
@@ -6321,1196 +5498,23 @@ void __init setup_per_cpu_pageset(void)
alloc_percpu(struct per_cpu_nodestat);
}
-static __meminit void zone_pcp_init(struct zone *zone)
+__meminit void zone_pcp_init(struct zone *zone)
{
/*
* per cpu subsystem is not up at this point. The following code
* relies on the ability of the linker to provide the
* offset of a (static) per cpu variable into the per cpu area.
*/
- zone->pageset = &boot_pageset;
+ zone->per_cpu_pageset = &boot_pageset;
+ zone->per_cpu_zonestats = &boot_zonestats;
+ zone->pageset_high = BOOT_PAGESET_HIGH;
+ zone->pageset_batch = BOOT_PAGESET_BATCH;
if (populated_zone(zone))
- printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
- zone->name, zone->present_pages,
- zone_batchsize(zone));
-}
-
-void __meminit init_currently_empty_zone(struct zone *zone,
- unsigned long zone_start_pfn,
- unsigned long size)
-{
- struct pglist_data *pgdat = zone->zone_pgdat;
- int zone_idx = zone_idx(zone) + 1;
-
- if (zone_idx > pgdat->nr_zones)
- pgdat->nr_zones = zone_idx;
-
- zone->zone_start_pfn = zone_start_pfn;
-
- mminit_dprintk(MMINIT_TRACE, "memmap_init",
- "Initialising map node %d zone %lu pfns %lu -> %lu\n",
- pgdat->node_id,
- (unsigned long)zone_idx(zone),
- zone_start_pfn, (zone_start_pfn + size));
-
- zone_init_free_lists(zone);
- zone->initialized = 1;
-}
-
-/**
- * get_pfn_range_for_nid - Return the start and end page frames for a node
- * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
- * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
- * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
- *
- * It returns the start and end page frame of a node based on information
- * provided by memblock_set_node(). If called for a node
- * with no available memory, a warning is printed and the start and end
- * PFNs will be 0.
- */
-void __init get_pfn_range_for_nid(unsigned int nid,
- unsigned long *start_pfn, unsigned long *end_pfn)
-{
- unsigned long this_start_pfn, this_end_pfn;
- int i;
-
- *start_pfn = -1UL;
- *end_pfn = 0;
-
- for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
- *start_pfn = min(*start_pfn, this_start_pfn);
- *end_pfn = max(*end_pfn, this_end_pfn);
- }
-
- if (*start_pfn == -1UL)
- *start_pfn = 0;
-}
-
-/*
- * This finds a zone that can be used for ZONE_MOVABLE pages. The
- * assumption is made that zones within a node are ordered in monotonic
- * increasing memory addresses so that the "highest" populated zone is used
- */
-static void __init find_usable_zone_for_movable(void)
-{
- int zone_index;
- for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
- if (zone_index == ZONE_MOVABLE)
- continue;
-
- if (arch_zone_highest_possible_pfn[zone_index] >
- arch_zone_lowest_possible_pfn[zone_index])
- break;
- }
-
- VM_BUG_ON(zone_index == -1);
- movable_zone = zone_index;
-}
-
-/*
- * The zone ranges provided by the architecture do not include ZONE_MOVABLE
- * because it is sized independent of architecture. Unlike the other zones,
- * the starting point for ZONE_MOVABLE is not fixed. It may be different
- * in each node depending on the size of each node and how evenly kernelcore
- * is distributed. This helper function adjusts the zone ranges
- * provided by the architecture for a given node by using the end of the
- * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
- * zones within a node are in order of monotonic increases memory addresses
- */
-static void __init adjust_zone_range_for_zone_movable(int nid,
- unsigned long zone_type,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn,
- unsigned long *zone_start_pfn,
- unsigned long *zone_end_pfn)
-{
- /* Only adjust if ZONE_MOVABLE is on this node */
- if (zone_movable_pfn[nid]) {
- /* Size ZONE_MOVABLE */
- if (zone_type == ZONE_MOVABLE) {
- *zone_start_pfn = zone_movable_pfn[nid];
- *zone_end_pfn = min(node_end_pfn,
- arch_zone_highest_possible_pfn[movable_zone]);
-
- /* Adjust for ZONE_MOVABLE starting within this range */
- } else if (!mirrored_kernelcore &&
- *zone_start_pfn < zone_movable_pfn[nid] &&
- *zone_end_pfn > zone_movable_pfn[nid]) {
- *zone_end_pfn = zone_movable_pfn[nid];
-
- /* Check if this whole range is within ZONE_MOVABLE */
- } else if (*zone_start_pfn >= zone_movable_pfn[nid])
- *zone_start_pfn = *zone_end_pfn;
- }
-}
-
-/*
- * Return the number of pages a zone spans in a node, including holes
- * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
- */
-static unsigned long __init zone_spanned_pages_in_node(int nid,
- unsigned long zone_type,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn,
- unsigned long *zone_start_pfn,
- unsigned long *zone_end_pfn)
-{
- unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
- unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
- /* When hotadd a new node from cpu_up(), the node should be empty */
- if (!node_start_pfn && !node_end_pfn)
- return 0;
-
- /* Get the start and end of the zone */
- *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
- *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
- adjust_zone_range_for_zone_movable(nid, zone_type,
- node_start_pfn, node_end_pfn,
- zone_start_pfn, zone_end_pfn);
-
- /* Check that this node has pages within the zone's required range */
- if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
- return 0;
-
- /* Move the zone boundaries inside the node if necessary */
- *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
- *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
-
- /* Return the spanned pages */
- return *zone_end_pfn - *zone_start_pfn;
-}
-
-/*
- * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
- * then all holes in the requested range will be accounted for.
- */
-unsigned long __init __absent_pages_in_range(int nid,
- unsigned long range_start_pfn,
- unsigned long range_end_pfn)
-{
- unsigned long nr_absent = range_end_pfn - range_start_pfn;
- unsigned long start_pfn, end_pfn;
- int i;
-
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
- start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
- end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
- nr_absent -= end_pfn - start_pfn;
- }
- return nr_absent;
-}
-
-/**
- * absent_pages_in_range - Return number of page frames in holes within a range
- * @start_pfn: The start PFN to start searching for holes
- * @end_pfn: The end PFN to stop searching for holes
- *
- * Return: the number of pages frames in memory holes within a range.
- */
-unsigned long __init absent_pages_in_range(unsigned long start_pfn,
- unsigned long end_pfn)
-{
- return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
-}
-
-/* Return the number of page frames in holes in a zone on a node */
-static unsigned long __init zone_absent_pages_in_node(int nid,
- unsigned long zone_type,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn)
-{
- unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
- unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
- unsigned long zone_start_pfn, zone_end_pfn;
- unsigned long nr_absent;
-
- /* When hotadd a new node from cpu_up(), the node should be empty */
- if (!node_start_pfn && !node_end_pfn)
- return 0;
-
- zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
- zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
-
- adjust_zone_range_for_zone_movable(nid, zone_type,
- node_start_pfn, node_end_pfn,
- &zone_start_pfn, &zone_end_pfn);
- nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
-
- /*
- * ZONE_MOVABLE handling.
- * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
- * and vice versa.
- */
- if (mirrored_kernelcore && zone_movable_pfn[nid]) {
- unsigned long start_pfn, end_pfn;
- struct memblock_region *r;
-
- for_each_mem_region(r) {
- start_pfn = clamp(memblock_region_memory_base_pfn(r),
- zone_start_pfn, zone_end_pfn);
- end_pfn = clamp(memblock_region_memory_end_pfn(r),
- zone_start_pfn, zone_end_pfn);
-
- if (zone_type == ZONE_MOVABLE &&
- memblock_is_mirror(r))
- nr_absent += end_pfn - start_pfn;
-
- if (zone_type == ZONE_NORMAL &&
- !memblock_is_mirror(r))
- nr_absent += end_pfn - start_pfn;
- }
- }
-
- return nr_absent;
-}
-
-static void __init calculate_node_totalpages(struct pglist_data *pgdat,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn)
-{
- unsigned long realtotalpages = 0, totalpages = 0;
- enum zone_type i;
-
- for (i = 0; i < MAX_NR_ZONES; i++) {
- struct zone *zone = pgdat->node_zones + i;
- unsigned long zone_start_pfn, zone_end_pfn;
- unsigned long spanned, absent;
- unsigned long size, real_size;
-
- spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
- node_start_pfn,
- node_end_pfn,
- &zone_start_pfn,
- &zone_end_pfn);
- absent = zone_absent_pages_in_node(pgdat->node_id, i,
- node_start_pfn,
- node_end_pfn);
-
- size = spanned;
- real_size = size - absent;
-
- if (size)
- zone->zone_start_pfn = zone_start_pfn;
- else
- zone->zone_start_pfn = 0;
- zone->spanned_pages = size;
- zone->present_pages = real_size;
-
- totalpages += size;
- realtotalpages += real_size;
- }
-
- pgdat->node_spanned_pages = totalpages;
- pgdat->node_present_pages = realtotalpages;
- printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
- realtotalpages);
-}
-
-#ifndef CONFIG_SPARSEMEM
-/*
- * Calculate the size of the zone->blockflags rounded to an unsigned long
- * Start by making sure zonesize is a multiple of pageblock_order by rounding
- * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
- * round what is now in bits to nearest long in bits, then return it in
- * bytes.
- */
-static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
-{
- unsigned long usemapsize;
-
- zonesize += zone_start_pfn & (pageblock_nr_pages-1);
- usemapsize = roundup(zonesize, pageblock_nr_pages);
- usemapsize = usemapsize >> pageblock_order;
- usemapsize *= NR_PAGEBLOCK_BITS;
- usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
-
- return usemapsize / 8;
+ pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name,
+ zone->present_pages, zone_batchsize(zone));
}
-static void __ref setup_usemap(struct pglist_data *pgdat,
- struct zone *zone,
- unsigned long zone_start_pfn,
- unsigned long zonesize)
-{
- unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
- zone->pageblock_flags = NULL;
- if (usemapsize) {
- zone->pageblock_flags =
- memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
- pgdat->node_id);
- if (!zone->pageblock_flags)
- panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
- usemapsize, zone->name, pgdat->node_id);
- }
-}
-#else
-static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
- unsigned long zone_start_pfn, unsigned long zonesize) {}
-#endif /* CONFIG_SPARSEMEM */
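
The usemap sizing being removed here is plain arithmetic; a small userspace rendition follows, assuming order-9 pageblocks, 4 flag bits per pageblock and 64-bit longs (the kernel derives all of these from its own macros).

#include <stdio.h>

#define PAGEBLOCK_ORDER         9
#define PAGEBLOCK_NR_PAGES      (1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS       4UL     /* assumed: matches common configs */

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
        return ((x + to - 1) / to) * to;
}

/* Bytes of pageblock flags needed for a zone starting at start_pfn. */
static unsigned long usemap_bytes(unsigned long start_pfn, unsigned long zonesize)
{
        unsigned long bits;

        /* Account for the zone not starting on a pageblock boundary. */
        zonesize += start_pfn & (PAGEBLOCK_NR_PAGES - 1);
        bits = roundup_ul(zonesize, PAGEBLOCK_NR_PAGES) >> PAGEBLOCK_ORDER;
        bits *= NR_PAGEBLOCK_BITS;
        bits = roundup_ul(bits, 8 * sizeof(unsigned long));

        return bits / 8;
}

int main(void)
{
        /* 1 GiB zone of 4 KiB pages: 512 pageblocks, 2048 flag bits, 256 bytes. */
        printf("%lu bytes\n", usemap_bytes(0, 262144));
        return 0;
}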
-
-#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
-
-/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-void __init set_pageblock_order(void)
-{
- unsigned int order;
-
- /* Check that pageblock_nr_pages has not already been setup */
- if (pageblock_order)
- return;
-
- if (HPAGE_SHIFT > PAGE_SHIFT)
- order = HUGETLB_PAGE_ORDER;
- else
- order = MAX_ORDER - 1;
-
- /*
- * Assume the largest contiguous order of interest is a huge page.
- * This value may be variable depending on boot parameters on IA64 and
- * powerpc.
- */
- pageblock_order = order;
-}
-#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
-
-/*
- * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
- * is unused as pageblock_order is set at compile-time. See
- * include/linux/pageblock-flags.h for the values of pageblock_order based on
- * the kernel config
- */
-void __init set_pageblock_order(void)
-{
-}
-
-#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
-
-static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
- unsigned long present_pages)
-{
- unsigned long pages = spanned_pages;
-
- /*
- * Provide a more accurate estimation if there are holes within
- * the zone and SPARSEMEM is in use. If there are holes within the
- * zone, each populated memory region may cost us one or two extra
- * memmap pages due to alignment because memmap pages for each
- * populated regions may not be naturally aligned on page boundary.
- * So the (present_pages >> 4) heuristic is a tradeoff for that.
- */
- if (spanned_pages > present_pages + (present_pages >> 4) &&
- IS_ENABLED(CONFIG_SPARSEMEM))
- pages = present_pages;
-
- return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
-}
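
The memmap estimate above reduces to PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT with a present-versus-spanned heuristic. A worked sketch, assuming 4 KiB pages and a 64-byte struct page.

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define STRUCT_PAGE_SZ  64UL    /* assumed size of struct page */

static unsigned long memmap_pages(unsigned long spanned, unsigned long present)
{
        unsigned long pages = spanned;

        /* With SPARSEMEM and a "holey" zone, present pages are the better base. */
        if (spanned > present + (present >> 4))
                pages = present;

        /* PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT */
        return (pages * STRUCT_PAGE_SZ + PAGE_SIZE - 1) / PAGE_SIZE;
}

int main(void)
{
        /* 1 GiB present in a 2 GiB span: 262144 * 64 bytes of memmap = 4096 pages. */
        printf("%lu memmap pages\n", memmap_pages(524288, 262144));
        return 0;
}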
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void pgdat_init_split_queue(struct pglist_data *pgdat)
-{
- struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
-
- spin_lock_init(&ds_queue->split_queue_lock);
- INIT_LIST_HEAD(&ds_queue->split_queue);
- ds_queue->split_queue_len = 0;
-}
-#else
-static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
-#endif
-
-#ifdef CONFIG_COMPACTION
-static void pgdat_init_kcompactd(struct pglist_data *pgdat)
-{
- init_waitqueue_head(&pgdat->kcompactd_wait);
-}
-#else
-static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
-#endif
-
-static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
-{
- pgdat_resize_init(pgdat);
-
- pgdat_init_split_queue(pgdat);
- pgdat_init_kcompactd(pgdat);
-
- init_waitqueue_head(&pgdat->kswapd_wait);
- init_waitqueue_head(&pgdat->pfmemalloc_wait);
-
- pgdat_page_ext_init(pgdat);
- spin_lock_init(&pgdat->lru_lock);
- lruvec_init(&pgdat->__lruvec);
-}
-
-static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
- unsigned long remaining_pages)
-{
- atomic_long_set(&zone->managed_pages, remaining_pages);
- zone_set_nid(zone, nid);
- zone->name = zone_names[idx];
- zone->zone_pgdat = NODE_DATA(nid);
- spin_lock_init(&zone->lock);
- zone_seqlock_init(zone);
- zone_pcp_init(zone);
-}
-
-/*
- * Set up the zone data structures
- * - init pgdat internals
- * - init all zones belonging to this node
- *
- * NOTE: this function is only called during memory hotplug
- */
-#ifdef CONFIG_MEMORY_HOTPLUG
-void __ref free_area_init_core_hotplug(int nid)
-{
- enum zone_type z;
- pg_data_t *pgdat = NODE_DATA(nid);
-
- pgdat_init_internals(pgdat);
- for (z = 0; z < MAX_NR_ZONES; z++)
- zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
-}
-#endif
-
-/*
- * Set up the zone data structures:
- * - mark all pages reserved
- * - mark all memory queues empty
- * - clear the memory bitmaps
- *
- * NOTE: pgdat should get zeroed by caller.
- * NOTE: this function is only called during early init.
- */
-static void __init free_area_init_core(struct pglist_data *pgdat)
-{
- enum zone_type j;
- int nid = pgdat->node_id;
-
- pgdat_init_internals(pgdat);
- pgdat->per_cpu_nodestats = &boot_nodestats;
-
- for (j = 0; j < MAX_NR_ZONES; j++) {
- struct zone *zone = pgdat->node_zones + j;
- unsigned long size, freesize, memmap_pages;
- unsigned long zone_start_pfn = zone->zone_start_pfn;
-
- size = zone->spanned_pages;
- freesize = zone->present_pages;
-
- /*
- * Adjust freesize so that it accounts for how much memory
- * is used by this zone for memmap. This affects the watermark
- * and per-cpu initialisations
- */
- memmap_pages = calc_memmap_size(size, freesize);
- if (!is_highmem_idx(j)) {
- if (freesize >= memmap_pages) {
- freesize -= memmap_pages;
- if (memmap_pages)
- printk(KERN_DEBUG
- " %s zone: %lu pages used for memmap\n",
- zone_names[j], memmap_pages);
- } else
- pr_warn(" %s zone: %lu pages exceeds freesize %lu\n",
- zone_names[j], memmap_pages, freesize);
- }
-
- /* Account for reserved pages */
- if (j == 0 && freesize > dma_reserve) {
- freesize -= dma_reserve;
- printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
- zone_names[0], dma_reserve);
- }
-
- if (!is_highmem_idx(j))
- nr_kernel_pages += freesize;
- /* Charge for highmem memmap if there are enough kernel pages */
- else if (nr_kernel_pages > memmap_pages * 2)
- nr_kernel_pages -= memmap_pages;
- nr_all_pages += freesize;
-
- /*
- * Set an approximate value for lowmem here, it will be adjusted
- * when the bootmem allocator frees pages into the buddy system.
- * And all highmem pages will be managed by the buddy system.
- */
- zone_init_internals(zone, j, nid, freesize);
-
- if (!size)
- continue;
-
- set_pageblock_order();
- setup_usemap(pgdat, zone, zone_start_pfn, size);
- init_currently_empty_zone(zone, zone_start_pfn, size);
- memmap_init(size, nid, j, zone_start_pfn);
- }
-}
-
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
-static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
-{
- unsigned long __maybe_unused start = 0;
- unsigned long __maybe_unused offset = 0;
-
- /* Skip empty nodes */
- if (!pgdat->node_spanned_pages)
- return;
-
- start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
- offset = pgdat->node_start_pfn - start;
- /* ia64 gets its own node_mem_map, before this, without bootmem */
- if (!pgdat->node_mem_map) {
- unsigned long size, end;
- struct page *map;
-
- /*
- * The zone's endpoints aren't required to be MAX_ORDER
- * aligned but the node_mem_map endpoints must be in order
- * for the buddy allocator to function correctly.
- */
- end = pgdat_end_pfn(pgdat);
- end = ALIGN(end, MAX_ORDER_NR_PAGES);
- size = (end - start) * sizeof(struct page);
- map = memblock_alloc_node(size, SMP_CACHE_BYTES,
- pgdat->node_id);
- if (!map)
- panic("Failed to allocate %ld bytes for node %d memory map\n",
- size, pgdat->node_id);
- pgdat->node_mem_map = map + offset;
- }
- pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
- __func__, pgdat->node_id, (unsigned long)pgdat,
- (unsigned long)pgdat->node_mem_map);
-#ifndef CONFIG_NEED_MULTIPLE_NODES
- /*
- * With no DISCONTIG, the global mem_map is just set as node 0's
- */
- if (pgdat == NODE_DATA(0)) {
- mem_map = NODE_DATA(0)->node_mem_map;
- if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
- mem_map -= offset;
- }
-#endif
-}
-#else
-static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
-#endif /* CONFIG_FLAT_NODE_MEM_MAP */
-
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
-{
- pgdat->first_deferred_pfn = ULONG_MAX;
-}
-#else
-static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
-#endif
-
-static void __init free_area_init_node(int nid)
-{
- pg_data_t *pgdat = NODE_DATA(nid);
- unsigned long start_pfn = 0;
- unsigned long end_pfn = 0;
-
- /* pg_data_t should be reset to zero when it's allocated */
- WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
-
- get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
-
- pgdat->node_id = nid;
- pgdat->node_start_pfn = start_pfn;
- pgdat->per_cpu_nodestats = NULL;
-
- pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
- (u64)start_pfn << PAGE_SHIFT,
- end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
- calculate_node_totalpages(pgdat, start_pfn, end_pfn);
-
- alloc_node_mem_map(pgdat);
- pgdat_set_deferred_range(pgdat);
-
- free_area_init_core(pgdat);
-}
-
-void __init free_area_init_memoryless_node(int nid)
-{
- free_area_init_node(nid);
-}
-
-#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
-/*
- * Initialize all valid struct pages in the range [spfn, epfn) and mark them
- * PageReserved(). Return the number of struct pages that were initialized.
- */
-static u64 __init init_unavailable_range(unsigned long spfn, unsigned long epfn)
-{
- unsigned long pfn;
- u64 pgcnt = 0;
-
- for (pfn = spfn; pfn < epfn; pfn++) {
- if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
- pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
- + pageblock_nr_pages - 1;
- continue;
- }
- /*
- * Use a fake node/zone (0) for now. Some of these pages
- * (in memblock.reserved but not in memblock.memory) will
- * get re-initialized via reserve_bootmem_region() later.
- */
- __init_single_page(pfn_to_page(pfn), pfn, 0, 0);
- __SetPageReserved(pfn_to_page(pfn));
- pgcnt++;
- }
-
- return pgcnt;
-}
-
-/*
- * Only struct pages that are backed by physical memory are zeroed and
- * initialized by going through __init_single_page(). But, there are some
- * struct pages which are reserved in memblock allocator and their fields
- * may be accessed (for example page_to_pfn() on some configuration accesses
- * flags). We must explicitly initialize those struct pages.
- *
- * This function also addresses a similar issue where struct pages are left
- * uninitialized because the physical address range is not covered by
- * memblock.memory or memblock.reserved. That could happen when memblock
- * layout is manually configured via memmap=, or when the highest physical
- * address (max_pfn) does not end on a section boundary.
- */
-static void __init init_unavailable_mem(void)
-{
- phys_addr_t start, end;
- u64 i, pgcnt;
- phys_addr_t next = 0;
-
- /*
- * Loop through unavailable ranges not covered by memblock.memory.
- */
- pgcnt = 0;
- for_each_mem_range(i, &start, &end) {
- if (next < start)
- pgcnt += init_unavailable_range(PFN_DOWN(next),
- PFN_UP(start));
- next = end;
- }
-
- /*
- * Early sections always have a fully populated memmap for the whole
- * section - see pfn_valid(). If the last section has holes at the
- * end and that section is marked "online", the memmap will be
- * considered initialized. Make sure that memmap has a well defined
- * state.
- */
- pgcnt += init_unavailable_range(PFN_DOWN(next),
- round_up(max_pfn, PAGES_PER_SECTION));
-
- /*
- * Struct pages that do not have backing memory. This could be because
- * firmware is using some of this memory, or for some other reasons.
- */
- if (pgcnt)
- pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
-}
-#else
-static inline void __init init_unavailable_mem(void)
-{
-}
-#endif /* !CONFIG_FLAT_NODE_MEM_MAP */
-
-#if MAX_NUMNODES > 1
-/*
- * Figure out the number of possible node ids.
- */
-void __init setup_nr_node_ids(void)
-{
- unsigned int highest;
-
- highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
- nr_node_ids = highest + 1;
-}
-#endif
-
-/**
- * node_map_pfn_alignment - determine the maximum internode alignment
- *
- * This function should be called after node map is populated and sorted.
- * It calculates the maximum power of two alignment which can distinguish
- * all the nodes.
- *
- * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
- * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
- * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
- * shifted, 1GiB is enough and this function will indicate so.
- *
- * This is used to test whether pfn -> nid mapping of the chosen memory
- * model has fine enough granularity to avoid incorrect mapping for the
- * populated node map.
- *
- * Return: the determined alignment in pfn's. 0 if there is no alignment
- * requirement (single node).
- */
-unsigned long __init node_map_pfn_alignment(void)
-{
- unsigned long accl_mask = 0, last_end = 0;
- unsigned long start, end, mask;
- int last_nid = NUMA_NO_NODE;
- int i, nid;
-
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
- if (!start || last_nid < 0 || last_nid == nid) {
- last_nid = nid;
- last_end = end;
- continue;
- }
-
- /*
- * Start with a mask granular enough to pin-point to the
- * start pfn and tick off bits one-by-one until it becomes
- * too coarse to separate the current node from the last.
- */
- mask = ~((1 << __ffs(start)) - 1);
- while (mask && last_end <= (start & (mask << 1)))
- mask <<= 1;
-
- /* accumulate all internode masks */
- accl_mask |= mask;
- }
-
- /* convert mask to number of pages */
- return ~accl_mask + 1;
-}
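
A toy rendition of the mask-accumulation loop above, using GCC's __builtin_ctzl() in place of __ffs() and a hard-coded two-node map; with two 1 GiB nodes back to back it reports a 262144-pfn (1 GiB) alignment.

#include <stdio.h>

struct region { unsigned long start, end; int nid; };

static unsigned long pfn_alignment(const struct region *map, int nr)
{
        unsigned long accl_mask = 0, last_end = 0;
        int last_nid = -1;

        for (int i = 0; i < nr; i++) {
                unsigned long start = map[i].start, mask;

                if (!start || last_nid < 0 || last_nid == map[i].nid) {
                        last_nid = map[i].nid;
                        last_end = map[i].end;
                        continue;
                }

                /*
                 * Start with the finest mask that isolates 'start', then
                 * coarsen it until it can no longer separate this node
                 * from the previous one.
                 */
                mask = ~((1UL << __builtin_ctzl(start)) - 1);
                while (mask && last_end <= (start & (mask << 1)))
                        mask <<= 1;

                accl_mask |= mask;
        }

        return ~accl_mask + 1;  /* alignment in pfns; 0 for a single node */
}

int main(void)
{
        /* Two 1 GiB nodes of 4 KiB pages, back to back. */
        struct region map[] = {
                { 0x00000, 0x40000, 0 },
                { 0x40000, 0x80000, 1 },
        };

        printf("alignment: %lu pfns\n", pfn_alignment(map, 2));
        return 0;
}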
-
-/**
- * find_min_pfn_with_active_regions - Find the minimum PFN registered
- *
- * Return: the minimum PFN based on information provided via
- * memblock_set_node().
- */
-unsigned long __init find_min_pfn_with_active_regions(void)
-{
- return PHYS_PFN(memblock_start_of_DRAM());
-}
-
-/*
- * early_calculate_totalpages()
- * Sum pages in active regions for movable zone.
- * Populate N_MEMORY for calculating usable_nodes.
- */
-static unsigned long __init early_calculate_totalpages(void)
-{
- unsigned long totalpages = 0;
- unsigned long start_pfn, end_pfn;
- int i, nid;
-
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
- unsigned long pages = end_pfn - start_pfn;
-
- totalpages += pages;
- if (pages)
- node_set_state(nid, N_MEMORY);
- }
- return totalpages;
-}
-
-/*
- * Find the PFN the Movable zone begins in each node. Kernel memory
- * is spread evenly between nodes as long as the nodes have enough
- * memory. When they don't, some nodes will have more kernelcore than
- * others
- */
-static void __init find_zone_movable_pfns_for_nodes(void)
-{
- int i, nid;
- unsigned long usable_startpfn;
- unsigned long kernelcore_node, kernelcore_remaining;
- /* save the state before borrow the nodemask */
- nodemask_t saved_node_state = node_states[N_MEMORY];
- unsigned long totalpages = early_calculate_totalpages();
- int usable_nodes = nodes_weight(node_states[N_MEMORY]);
- struct memblock_region *r;
-
- /* Need to find movable_zone earlier when movable_node is specified. */
- find_usable_zone_for_movable();
-
- /*
- * If movable_node is specified, ignore kernelcore and movablecore
- * options.
- */
- if (movable_node_is_enabled()) {
- for_each_mem_region(r) {
- if (!memblock_is_hotpluggable(r))
- continue;
-
- nid = memblock_get_region_node(r);
-
- usable_startpfn = PFN_DOWN(r->base);
- zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
- min(usable_startpfn, zone_movable_pfn[nid]) :
- usable_startpfn;
- }
-
- goto out2;
- }
-
- /*
- * If kernelcore=mirror is specified, ignore movablecore option
- */
- if (mirrored_kernelcore) {
- bool mem_below_4gb_not_mirrored = false;
-
- for_each_mem_region(r) {
- if (memblock_is_mirror(r))
- continue;
-
- nid = memblock_get_region_node(r);
-
- usable_startpfn = memblock_region_memory_base_pfn(r);
-
- if (usable_startpfn < 0x100000) {
- mem_below_4gb_not_mirrored = true;
- continue;
- }
-
- zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
- min(usable_startpfn, zone_movable_pfn[nid]) :
- usable_startpfn;
- }
-
- if (mem_below_4gb_not_mirrored)
- pr_warn("This configuration results in unmirrored kernel memory.\n");
-
- goto out2;
- }
-
- /*
- * If kernelcore=nn% or movablecore=nn% was specified, calculate the
- * amount of necessary memory.
- */
- if (required_kernelcore_percent)
- required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
- 10000UL;
- if (required_movablecore_percent)
- required_movablecore = (totalpages * 100 * required_movablecore_percent) /
- 10000UL;
-
- /*
- * If movablecore= was specified, calculate what size of
- * kernelcore that corresponds so that memory usable for
- * any allocation type is evenly spread. If both kernelcore
- * and movablecore are specified, then the value of kernelcore
- * will be used for required_kernelcore if it's greater than
- * what movablecore would have allowed.
- */
- if (required_movablecore) {
- unsigned long corepages;
-
- /*
- * Round-up so that ZONE_MOVABLE is at least as large as what
- * was requested by the user
- */
- required_movablecore =
- roundup(required_movablecore, MAX_ORDER_NR_PAGES);
- required_movablecore = min(totalpages, required_movablecore);
- corepages = totalpages - required_movablecore;
-
- required_kernelcore = max(required_kernelcore, corepages);
- }
-
- /*
- * If kernelcore was not specified or kernelcore size is larger
- * than totalpages, there is no ZONE_MOVABLE.
- */
- if (!required_kernelcore || required_kernelcore >= totalpages)
- goto out;
-
- /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
- usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
-
-restart:
- /* Spread kernelcore memory as evenly as possible throughout nodes */
- kernelcore_node = required_kernelcore / usable_nodes;
- for_each_node_state(nid, N_MEMORY) {
- unsigned long start_pfn, end_pfn;
-
- /*
- * Recalculate kernelcore_node if the division per node
- * now exceeds what is necessary to satisfy the requested
- * amount of memory for the kernel
- */
- if (required_kernelcore < kernelcore_node)
- kernelcore_node = required_kernelcore / usable_nodes;
-
- /*
- * As the map is walked, we track how much memory is usable
- * by the kernel using kernelcore_remaining. When it is
- * 0, the rest of the node is usable by ZONE_MOVABLE
- */
- kernelcore_remaining = kernelcore_node;
-
- /* Go through each range of PFNs within this node */
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
- unsigned long size_pages;
-
- start_pfn = max(start_pfn, zone_movable_pfn[nid]);
- if (start_pfn >= end_pfn)
- continue;
-
- /* Account for what is only usable for kernelcore */
- if (start_pfn < usable_startpfn) {
- unsigned long kernel_pages;
- kernel_pages = min(end_pfn, usable_startpfn)
- - start_pfn;
-
- kernelcore_remaining -= min(kernel_pages,
- kernelcore_remaining);
- required_kernelcore -= min(kernel_pages,
- required_kernelcore);
-
- /* Continue if range is now fully accounted */
- if (end_pfn <= usable_startpfn) {
-
- /*
- * Push zone_movable_pfn to the end so
- * that if we have to rebalance
- * kernelcore across nodes, we will
- * not double account here
- */
- zone_movable_pfn[nid] = end_pfn;
- continue;
- }
- start_pfn = usable_startpfn;
- }
-
- /*
- * The usable PFN range for ZONE_MOVABLE is from
- * start_pfn->end_pfn. Calculate size_pages as the
- * number of pages used as kernelcore
- */
- size_pages = end_pfn - start_pfn;
- if (size_pages > kernelcore_remaining)
- size_pages = kernelcore_remaining;
- zone_movable_pfn[nid] = start_pfn + size_pages;
-
- /*
- * Some kernelcore has been met, update counts and
- * break if the kernelcore for this node has been
- * satisfied
- */
- required_kernelcore -= min(required_kernelcore,
- size_pages);
- kernelcore_remaining -= size_pages;
- if (!kernelcore_remaining)
- break;
- }
- }
-
- /*
- * If there is still required_kernelcore, we do another pass with one
- * less node in the count. This will push zone_movable_pfn[nid] further
- * along on the nodes that still have memory until kernelcore is
- * satisfied
- */
- usable_nodes--;
- if (usable_nodes && required_kernelcore > usable_nodes)
- goto restart;
-
-out2:
- /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
- for (nid = 0; nid < MAX_NUMNODES; nid++)
- zone_movable_pfn[nid] =
- roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
-
-out:
- /* restore the node_state */
- node_states[N_MEMORY] = saved_node_state;
-}
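
The kernelcore/movablecore percentage handling above is simple page arithmetic. A sketch of the movablecore=nn% case, assuming 64-bit arithmetic (so totalpages * 100 * percent cannot overflow) and an order-10 MAX_ORDER block size.

#include <stdio.h>

#define MAX_ORDER_NR_PAGES      1024UL  /* assumed: pages per maximum-order block */

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
        return ((x + to - 1) / to) * to;
}

/*
 * How a movablecore percentage turns into a kernelcore target, mirroring
 * the percentage and rounding steps of the removed code above.
 */
static unsigned long kernelcore_from_movablecore(unsigned long totalpages,
                                                 unsigned long movable_percent)
{
        unsigned long movable, corepages;

        movable = (totalpages * 100 * movable_percent) / 10000UL;
        movable = roundup_ul(movable, MAX_ORDER_NR_PAGES);
        if (movable > totalpages)
                movable = totalpages;
        corepages = totalpages - movable;

        return corepages;       /* becomes required_kernelcore if larger */
}

int main(void)
{
        /* 4 GiB of 4 KiB pages with movablecore=50%: half stays kernelcore. */
        printf("kernelcore pages: %lu\n",
               kernelcore_from_movablecore(1048576, 50));
        return 0;
}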
-
-/* Any regular or high memory on that node ? */
-static void check_for_memory(pg_data_t *pgdat, int nid)
-{
- enum zone_type zone_type;
-
- for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
- struct zone *zone = &pgdat->node_zones[zone_type];
- if (populated_zone(zone)) {
- if (IS_ENABLED(CONFIG_HIGHMEM))
- node_set_state(nid, N_HIGH_MEMORY);
- if (zone_type <= ZONE_NORMAL)
- node_set_state(nid, N_NORMAL_MEMORY);
- break;
- }
- }
-}
-
-/*
- * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
- * such cases we allow max_zone_pfn sorted in the descending order
- */
-bool __weak arch_has_descending_max_zone_pfns(void)
-{
- return false;
-}
-
-/**
- * free_area_init - Initialise all pg_data_t and zone data
- * @max_zone_pfn: an array of max PFNs for each zone
- *
- * This will call free_area_init_node() for each active node in the system.
- * Using the page ranges provided by memblock_set_node(), the size of each
- * zone in each node and their holes is calculated. If the maximum PFN
- * between two adjacent zones match, it is assumed that the zone is empty.
- * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
- * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
- * starts where the previous one ended. For example, ZONE_DMA32 starts
- * at arch_max_dma_pfn.
- */
-void __init free_area_init(unsigned long *max_zone_pfn)
-{
- unsigned long start_pfn, end_pfn;
- int i, nid, zone;
- bool descending;
-
- /* Record where the zone boundaries are */
- memset(arch_zone_lowest_possible_pfn, 0,
- sizeof(arch_zone_lowest_possible_pfn));
- memset(arch_zone_highest_possible_pfn, 0,
- sizeof(arch_zone_highest_possible_pfn));
-
- start_pfn = find_min_pfn_with_active_regions();
- descending = arch_has_descending_max_zone_pfns();
-
- for (i = 0; i < MAX_NR_ZONES; i++) {
- if (descending)
- zone = MAX_NR_ZONES - i - 1;
- else
- zone = i;
-
- if (zone == ZONE_MOVABLE)
- continue;
-
- end_pfn = max(max_zone_pfn[zone], start_pfn);
- arch_zone_lowest_possible_pfn[zone] = start_pfn;
- arch_zone_highest_possible_pfn[zone] = end_pfn;
-
- start_pfn = end_pfn;
- }
-
- /* Find the PFNs that ZONE_MOVABLE begins at in each node */
- memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
- find_zone_movable_pfns_for_nodes();
-
- /* Print out the zone ranges */
- pr_info("Zone ranges:\n");
- for (i = 0; i < MAX_NR_ZONES; i++) {
- if (i == ZONE_MOVABLE)
- continue;
- pr_info(" %-8s ", zone_names[i]);
- if (arch_zone_lowest_possible_pfn[i] ==
- arch_zone_highest_possible_pfn[i])
- pr_cont("empty\n");
- else
- pr_cont("[mem %#018Lx-%#018Lx]\n",
- (u64)arch_zone_lowest_possible_pfn[i]
- << PAGE_SHIFT,
- ((u64)arch_zone_highest_possible_pfn[i]
- << PAGE_SHIFT) - 1);
- }
-
- /* Print out the PFNs ZONE_MOVABLE begins at in each node */
- pr_info("Movable zone start for each node\n");
- for (i = 0; i < MAX_NUMNODES; i++) {
- if (zone_movable_pfn[i])
- pr_info(" Node %d: %#018Lx\n", i,
- (u64)zone_movable_pfn[i] << PAGE_SHIFT);
- }
-
- /*
- * Print out the early node map, and initialize the
- * subsection-map relative to active online memory ranges to
- * enable future "sub-section" extensions of the memory map.
- */
- pr_info("Early memory node ranges\n");
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
- pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
- (u64)start_pfn << PAGE_SHIFT,
- ((u64)end_pfn << PAGE_SHIFT) - 1);
- subsection_map_init(start_pfn, end_pfn - start_pfn);
- }
-
- /* Initialise every node */
- mminit_verify_pageflags_layout();
- setup_nr_node_ids();
- init_unavailable_mem();
- for_each_online_node(nid) {
- pg_data_t *pgdat = NODE_DATA(nid);
- free_area_init_node(nid);
-
- /* Any memory on that node */
- if (pgdat->node_present_pages)
- node_set_state(nid, N_MEMORY);
- check_for_memory(pgdat, nid);
- }
-}
-
-static int __init cmdline_parse_core(char *p, unsigned long *core,
- unsigned long *percent)
-{
- unsigned long long coremem;
- char *endptr;
-
- if (!p)
- return -EINVAL;
-
- /* Value may be a percentage of total memory, otherwise bytes */
- coremem = simple_strtoull(p, &endptr, 0);
- if (*endptr == '%') {
- /* Paranoid check for percent values greater than 100 */
- WARN_ON(coremem > 100);
-
- *percent = coremem;
- } else {
- coremem = memparse(p, &p);
- /* Paranoid check that UL is enough for the coremem value */
- WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
-
- *core = coremem >> PAGE_SHIFT;
- *percent = 0UL;
- }
- return 0;
-}
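
A userspace approximation of the removed parser: percentages end in '%', anything else is a byte count converted to pages. The K/M/G suffix handling merely stands in for memparse() and is not the kernel's implementation.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12

static int parse_core(const char *p, unsigned long *core_pages,
                      unsigned long *percent)
{
        char *end;
        unsigned long long v;

        if (!p)
                return -1;

        v = strtoull(p, &end, 0);
        if (*end == '%') {
                *percent = v;           /* e.g. kernelcore=30% */
                return 0;
        }

        switch (*end) {
        case 'G': case 'g': v <<= 10;   /* fall through */
        case 'M': case 'm': v <<= 10;   /* fall through */
        case 'K': case 'k': v <<= 10;
        }

        *core_pages = v >> PAGE_SHIFT;  /* bytes -> pages */
        *percent = 0;
        return 0;
}

int main(void)
{
        unsigned long pages = 0, pct = 0;

        parse_core("512M", &pages, &pct);
        printf("512M -> %lu pages\n", pages);
        parse_core("30%", &pages, &pct);
        printf("30%% -> percent %lu\n", pct);
        return 0;
}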
-
-/*
- * kernelcore=size sets the amount of memory for use for allocations that
- * cannot be reclaimed or migrated.
- */
-static int __init cmdline_parse_kernelcore(char *p)
-{
- /* parse kernelcore=mirror */
- if (parse_option_str(p, "mirror")) {
- mirrored_kernelcore = true;
- return 0;
- }
-
- return cmdline_parse_core(p, &required_kernelcore,
- &required_kernelcore_percent);
-}
-
-/*
- * movablecore=size sets the amount of memory for use for allocations that
- * can be reclaimed or migrated.
- */
-static int __init cmdline_parse_movablecore(char *p)
-{
- return cmdline_parse_core(p, &required_movablecore,
- &required_movablecore_percent);
-}
-
-early_param("kernelcore", cmdline_parse_kernelcore);
-early_param("movablecore", cmdline_parse_movablecore);
-
void adjust_managed_page_count(struct page *page, long count)
{
atomic_long_add(count, &page_zone(page)->managed_pages);
@@ -7541,6 +5545,11 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
* alias for the memset().
*/
direct_map_addr = page_address(page);
+ /*
+ * Perform a kasan-unchecked memset() since this memory
+ * has not been initialized.
+ */
+ direct_map_addr = kasan_reset_tag(direct_map_addr);
if ((unsigned int)poison <= 0xFF)
memset(direct_map_addr, poison, PAGE_SIZE);
@@ -7548,95 +5557,17 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
}
if (pages && s)
- pr_info("Freeing %s memory: %ldK\n",
- s, pages << (PAGE_SHIFT - 10));
+ pr_info("Freeing %s memory: %ldK\n", s, K(pages));
return pages;
}
-#ifdef CONFIG_HIGHMEM
-void free_highmem_page(struct page *page)
-{
- __free_reserved_page(page);
- totalram_pages_inc();
- atomic_long_inc(&page_zone(page)->managed_pages);
- totalhigh_pages_inc();
-}
-#endif
-
-
-void __init mem_init_print_info(const char *str)
-{
- unsigned long physpages, codesize, datasize, rosize, bss_size;
- unsigned long init_code_size, init_data_size;
-
- physpages = get_num_physpages();
- codesize = _etext - _stext;
- datasize = _edata - _sdata;
- rosize = __end_rodata - __start_rodata;
- bss_size = __bss_stop - __bss_start;
- init_data_size = __init_end - __init_begin;
- init_code_size = _einittext - _sinittext;
-
- /*
- * Detect special cases and adjust section sizes accordingly:
- * 1) .init.* may be embedded into .data sections
- * 2) .init.text.* may be out of [__init_begin, __init_end],
- * please refer to arch/tile/kernel/vmlinux.lds.S.
- * 3) .rodata.* may be embedded into .text or .data sections.
- */
-#define adj_init_size(start, end, size, pos, adj) \
- do { \
- if (start <= pos && pos < end && size > adj) \
- size -= adj; \
- } while (0)
-
- adj_init_size(__init_begin, __init_end, init_data_size,
- _sinittext, init_code_size);
- adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
- adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
- adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
- adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
-
-#undef adj_init_size
-
- pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
-#ifdef CONFIG_HIGHMEM
- ", %luK highmem"
-#endif
- "%s%s)\n",
- nr_free_pages() << (PAGE_SHIFT - 10),
- physpages << (PAGE_SHIFT - 10),
- codesize >> 10, datasize >> 10, rosize >> 10,
- (init_data_size + init_code_size) >> 10, bss_size >> 10,
- (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
- totalcma_pages << (PAGE_SHIFT - 10),
-#ifdef CONFIG_HIGHMEM
- totalhigh_pages() << (PAGE_SHIFT - 10),
-#endif
- str ? ", " : "", str ? str : "");
-}
-
-/**
- * set_dma_reserve - set the specified number of pages reserved in the first zone
- * @new_dma_reserve: The number of pages to mark reserved
- *
- * The per-cpu batchsize and zone watermarks are determined by managed_pages.
- * In the DMA zone, a significant percentage may be consumed by kernel image
- * and other unfreeable allocations which can skew the watermarks badly. This
- * function may optionally be used to account for unfreeable pages in the
- * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
- * smaller per-cpu batchsize.
- */
-void __init set_dma_reserve(unsigned long new_dma_reserve)
-{
- dma_reserve = new_dma_reserve;
-}
-
static int page_alloc_cpu_dead(unsigned int cpu)
{
+ struct zone *zone;
lru_add_drain_cpu(cpu);
+ mlock_drain_remote(cpu);
drain_pages(cpu);
/*
@@ -7655,33 +5586,29 @@ static int page_alloc_cpu_dead(unsigned int cpu)
* race with what we are doing.
*/
cpu_vm_stats_fold(cpu);
+
+ for_each_populated_zone(zone)
+ zone_pcp_update(zone, 0);
+
return 0;
}
-#ifdef CONFIG_NUMA
-int hashdist = HASHDIST_DEFAULT;
-
-static int __init set_hashdist(char *str)
+static int page_alloc_cpu_online(unsigned int cpu)
{
- if (!str)
- return 0;
- hashdist = simple_strtoul(str, &str, 0);
- return 1;
+ struct zone *zone;
+
+ for_each_populated_zone(zone)
+ zone_pcp_update(zone, 1);
+ return 0;
}
-__setup("hashdist=", set_hashdist);
-#endif
-void __init page_alloc_init(void)
+void __init page_alloc_init_cpuhp(void)
{
int ret;
-#ifdef CONFIG_NUMA
- if (num_node_state(N_MEMORY) == 1)
- hashdist = 0;
-#endif
-
- ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
- "mm/page_alloc:dead", NULL,
+ ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC,
+ "mm/page_alloc:pcp",
+ page_alloc_cpu_online,
page_alloc_cpu_dead);
WARN_ON(ret < 0);
}
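
The two callbacks registered above mirror each other: whichever direction a CPU moves, the per-CPU page limits are re-derived for the new CPU count. A trivial stand-alone sketch of that pairing, with a print statement in place of zone_pcp_update().

#include <stdio.h>

static int online_cpus = 4;

/* Stand-in for zone_pcp_update(): resize pcp limits for the new CPU count. */
static void recompute_pcp(const char *why)
{
        printf("recomputing pcp high/batch for %d CPUs (%s)\n",
               online_cpus, why);
}

static int cpu_online_cb(void) { online_cpus++; recompute_pcp("online"); return 0; }
static int cpu_dead_cb(void)   { online_cpus--; recompute_pcp("dead");   return 0; }

int main(void)
{
        cpu_online_cb();        /* a CPU came up */
        cpu_dead_cb();          /* a CPU went away */
        return 0;
}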
@@ -7734,31 +5661,24 @@ static void calculate_totalreserve_pages(void)
static void setup_per_zone_lowmem_reserve(void)
{
struct pglist_data *pgdat;
- enum zone_type j, idx;
+ enum zone_type i, j;
for_each_online_pgdat(pgdat) {
- for (j = 0; j < MAX_NR_ZONES; j++) {
- struct zone *zone = pgdat->node_zones + j;
- unsigned long managed_pages = zone_managed_pages(zone);
-
- zone->lowmem_reserve[j] = 0;
+ for (i = 0; i < MAX_NR_ZONES - 1; i++) {
+ struct zone *zone = &pgdat->node_zones[i];
+ int ratio = sysctl_lowmem_reserve_ratio[i];
+ bool clear = !ratio || !zone_managed_pages(zone);
+ unsigned long managed_pages = 0;
- idx = j;
- while (idx) {
- struct zone *lower_zone;
+ for (j = i + 1; j < MAX_NR_ZONES; j++) {
+ struct zone *upper_zone = &pgdat->node_zones[j];
- idx--;
- lower_zone = pgdat->node_zones + idx;
+ managed_pages += zone_managed_pages(upper_zone);
- if (!sysctl_lowmem_reserve_ratio[idx] ||
- !zone_managed_pages(lower_zone)) {
- lower_zone->lowmem_reserve[j] = 0;
- continue;
- } else {
- lower_zone->lowmem_reserve[j] =
- managed_pages / sysctl_lowmem_reserve_ratio[idx];
- }
- managed_pages += zone_managed_pages(lower_zone);
+ if (clear)
+ zone->lowmem_reserve[j] = 0;
+ else
+ zone->lowmem_reserve[j] = managed_pages / ratio;
}
}
}
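
The rewritten loop above accumulates the managed pages of all higher zones and divides by the lower zone's ratio. A worked example with made-up zone sizes and the typical 256/256/32 ratios; real values come from zone_managed_pages() and sysctl_lowmem_reserve_ratio.

#include <stdio.h>

#define NR_ZONES 3
static const char *names[NR_ZONES] = { "DMA", "DMA32", "Normal" };

static unsigned long managed[NR_ZONES] = { 4000, 1000000, 3000000 };
static int ratio[NR_ZONES] = { 256, 256, 32 };

int main(void)
{
        unsigned long reserve[NR_ZONES][NR_ZONES] = { 0 };

        for (int i = 0; i < NR_ZONES - 1; i++) {
                int clear = !ratio[i] || !managed[i];
                unsigned long upper = 0;

                /* Walk the zones above i, accumulating their managed pages. */
                for (int j = i + 1; j < NR_ZONES; j++) {
                        upper += managed[j];
                        reserve[i][j] = clear ? 0 : upper / ratio[i];
                }
        }

        for (int i = 0; i < NR_ZONES - 1; i++)
                for (int j = i + 1; j < NR_ZONES; j++)
                        printf("%s reserve against %s allocations: %lu pages\n",
                               names[i], names[j], reserve[i][j]);
        return 0;
}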
@@ -7820,7 +5740,8 @@ static void __setup_per_zone_wmarks(void)
zone->watermark_boost = 0;
zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
- zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+ zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
+ zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
spin_unlock_irqrestore(&zone->lock, flags);
}
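
With this change the low, high and new promo watermarks each sit the same 'tmp' distance apart, instead of high being min + 2 * tmp. A throwaway illustration with invented numbers.

#include <stdio.h>

int main(void)
{
        unsigned long min = 1000, tmp = 400;
        unsigned long low   = min + tmp;
        unsigned long high  = low + tmp;
        unsigned long promo = high + tmp;       /* the added WMARK_PROMO */

        printf("min=%lu low=%lu high=%lu promo=%lu\n", min, low, high, promo);
        return 0;
}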
@@ -7838,11 +5759,19 @@ static void __setup_per_zone_wmarks(void)
*/
void setup_per_zone_wmarks(void)
{
+ struct zone *zone;
static DEFINE_SPINLOCK(lock);
spin_lock(&lock);
__setup_per_zone_wmarks();
spin_unlock(&lock);
+
+ /*
+	 * The watermark size has changed, so update the pcpu batch
+ * and high limits or the limits may be inappropriate.
+ */
+ for_each_zone(zone)
+ zone_pcp_update(zone, 0);
}
/*
@@ -7869,7 +5798,7 @@ void setup_per_zone_wmarks(void)
* 8192MB: 11584k
* 16384MB: 16384k
*/
-int __meminit init_per_zone_wmark_min(void)
+void calculate_min_free_kbytes(void)
{
unsigned long lowmem_kbytes;
int new_min_free_kbytes;
@@ -7877,16 +5806,17 @@ int __meminit init_per_zone_wmark_min(void)
lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
- if (new_min_free_kbytes > user_min_free_kbytes) {
- min_free_kbytes = new_min_free_kbytes;
- if (min_free_kbytes < 128)
- min_free_kbytes = 128;
- if (min_free_kbytes > 262144)
- min_free_kbytes = 262144;
- } else {
+ if (new_min_free_kbytes > user_min_free_kbytes)
+ min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144);
+ else
pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
new_min_free_kbytes, user_min_free_kbytes);
- }
+
+}
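
calculate_min_free_kbytes() above is sqrt(lowmem_kbytes * 16) clamped to [128, 262144], which is where the table in the comment comes from. A small check of a few sizes, using a naive integer square root in place of int_sqrt(); the 16384MB row reproduces the 16384k shown above, smaller sizes scale as the square root.

#include <stdio.h>

/* Naive stand-in for int_sqrt(). */
static unsigned long int_sqrt_ul(unsigned long x)
{
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
                r++;
        return r;
}

static unsigned long min_free_kbytes_for(unsigned long lowmem_kbytes)
{
        unsigned long v = int_sqrt_ul(lowmem_kbytes * 16);

        if (v < 128)
                v = 128;
        if (v > 262144)
                v = 262144;
        return v;
}

int main(void)
{
        unsigned long mb[] = { 16, 1024, 16384 };

        for (int i = 0; i < 3; i++)
                printf("%6luMB lowmem -> %luk\n", mb[i],
                       min_free_kbytes_for(mb[i] * 1024));
        return 0;
}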
+
+int __meminit init_per_zone_wmark_min(void)
+{
+ calculate_min_free_kbytes();
setup_per_zone_wmarks();
refresh_zone_stat_thresholds();
setup_per_zone_lowmem_reserve();
@@ -7907,7 +5837,7 @@ postcore_initcall(init_per_zone_wmark_min)
* that we can call two helper functions whenever min_free_kbytes
* changes.
*/
-int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
+static int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -7923,7 +5853,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
-int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
+static int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -7953,7 +5883,7 @@ static void setup_min_unmapped_ratio(void)
}
-int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
+static int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -7980,7 +5910,7 @@ static void setup_min_slab_ratio(void)
sysctl_min_slab_ratio) / 100;
}
-int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
+static int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -8004,8 +5934,8 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
* minimum watermarks. The lowmem reserve ratio can only make sense
 * as a function of the boot time zone sizes.
*/
-int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
- void *buffer, size_t *length, loff_t *ppos)
+static int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table,
+ int write, void *buffer, size_t *length, loff_t *ppos)
{
int i;
@@ -8020,330 +5950,137 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
-static void __zone_pcp_update(struct zone *zone)
-{
- unsigned int cpu;
-
- for_each_possible_cpu(cpu)
- pageset_set_high_and_batch(zone,
- per_cpu_ptr(zone->pageset, cpu));
-}
-
/*
- * percpu_pagelist_fraction - changes the pcp->high for each zone on each
- * cpu. It is the fraction of total pages in each zone that a hot per cpu
+ * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each
+ * cpu. It is the fraction of total pages in each zone that a hot per cpu
* pagelist can have before it gets flushed back to buddy allocator.
*/
-int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
- void *buffer, size_t *length, loff_t *ppos)
+static int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
+ int write, void *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
- int old_percpu_pagelist_fraction;
+ int old_percpu_pagelist_high_fraction;
int ret;
mutex_lock(&pcp_batch_high_lock);
- old_percpu_pagelist_fraction = percpu_pagelist_fraction;
+ old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (!write || ret < 0)
goto out;
/* Sanity checking to avoid pcp imbalance */
- if (percpu_pagelist_fraction &&
- percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
- percpu_pagelist_fraction = old_percpu_pagelist_fraction;
+ if (percpu_pagelist_high_fraction &&
+ percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) {
+ percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction;
ret = -EINVAL;
goto out;
}
/* No change? */
- if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
+ if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction)
goto out;
for_each_populated_zone(zone)
- __zone_pcp_update(zone);
+ zone_set_pageset_high_and_batch(zone, 0);
out:
mutex_unlock(&pcp_batch_high_lock);
return ret;
}
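
A sketch of the handler's control flow above: accept the write, restore the old value if it would cause pcp imbalance, and only recompute when something actually changed. The floor of 8 is an assumption standing in for MIN_PERCPU_PAGELIST_HIGH_FRACTION.

#include <stdio.h>

#define MIN_FRACTION 8  /* assumed floor */

static int fraction;    /* the tunable */

static int set_fraction(int new_value)
{
        int old = fraction;

        fraction = new_value;                   /* the proc_dointvec step */

        if (fraction && fraction < MIN_FRACTION) {
                fraction = old;                 /* reject: would imbalance pcp */
                return -1;
        }
        if (fraction == old)
                return 0;                       /* no change, nothing to redo */

        printf("recomputing pcp->high for all zones (fraction=%d)\n", fraction);
        return 0;
}

int main(void)
{
        set_fraction(4);        /* rejected, below the floor */
        set_fraction(64);       /* accepted, triggers a recompute */
        set_fraction(64);       /* accepted, but a no-op */
        return 0;
}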
-#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
-/*
- * Returns the number of pages that arch has reserved but
- * is not known to alloc_large_system_hash().
- */
-static unsigned long __init arch_reserved_kernel_pages(void)
-{
- return 0;
-}
-#endif
-
-/*
- * Adaptive scale is meant to reduce sizes of hash tables on large memory
- * machines. As memory size is increased the scale is also increased but at
- * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
- * quadruples the scale is increased by one, which means the size of hash table
- * only doubles, instead of quadrupling as well.
- * Because 32-bit systems cannot have large physical memory, where this scaling
- * makes sense, it is disabled on such platforms.
- */
-#if __BITS_PER_LONG > 32
-#define ADAPT_SCALE_BASE (64ul << 30)
-#define ADAPT_SCALE_SHIFT 2
-#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
-#endif
-
-/*
- * allocate a large system hash table from bootmem
- * - it is assumed that the hash table must contain an exact power-of-2
- * quantity of entries
- * - limit is the number of hash buckets, not the total allocation size
- */
-void *__init alloc_large_system_hash(const char *tablename,
- unsigned long bucketsize,
- unsigned long numentries,
- int scale,
- int flags,
- unsigned int *_hash_shift,
- unsigned int *_hash_mask,
- unsigned long low_limit,
- unsigned long high_limit)
-{
- unsigned long long max = high_limit;
- unsigned long log2qty, size;
- void *table = NULL;
- gfp_t gfp_flags;
- bool virt;
-
- /* allow the kernel cmdline to have a say */
- if (!numentries) {
- /* round applicable memory size up to nearest megabyte */
- numentries = nr_kernel_pages;
- numentries -= arch_reserved_kernel_pages();
-
- /* It isn't necessary when PAGE_SIZE >= 1MB */
- if (PAGE_SHIFT < 20)
- numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
-
-#if __BITS_PER_LONG > 32
- if (!high_limit) {
- unsigned long adapt;
-
- for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
- adapt <<= ADAPT_SCALE_SHIFT)
- scale++;
- }
+static struct ctl_table page_alloc_sysctl_table[] = {
+ {
+ .procname = "min_free_kbytes",
+ .data = &min_free_kbytes,
+ .maxlen = sizeof(min_free_kbytes),
+ .mode = 0644,
+ .proc_handler = min_free_kbytes_sysctl_handler,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "watermark_boost_factor",
+ .data = &watermark_boost_factor,
+ .maxlen = sizeof(watermark_boost_factor),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "watermark_scale_factor",
+ .data = &watermark_scale_factor,
+ .maxlen = sizeof(watermark_scale_factor),
+ .mode = 0644,
+ .proc_handler = watermark_scale_factor_sysctl_handler,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_THREE_THOUSAND,
+ },
+ {
+ .procname = "percpu_pagelist_high_fraction",
+ .data = &percpu_pagelist_high_fraction,
+ .maxlen = sizeof(percpu_pagelist_high_fraction),
+ .mode = 0644,
+ .proc_handler = percpu_pagelist_high_fraction_sysctl_handler,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "lowmem_reserve_ratio",
+ .data = &sysctl_lowmem_reserve_ratio,
+ .maxlen = sizeof(sysctl_lowmem_reserve_ratio),
+ .mode = 0644,
+ .proc_handler = lowmem_reserve_ratio_sysctl_handler,
+ },
+#ifdef CONFIG_NUMA
+ {
+ .procname = "numa_zonelist_order",
+ .data = &numa_zonelist_order,
+ .maxlen = NUMA_ZONELIST_ORDER_LEN,
+ .mode = 0644,
+ .proc_handler = numa_zonelist_order_handler,
+ },
+ {
+ .procname = "min_unmapped_ratio",
+ .data = &sysctl_min_unmapped_ratio,
+ .maxlen = sizeof(sysctl_min_unmapped_ratio),
+ .mode = 0644,
+ .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
+ {
+ .procname = "min_slab_ratio",
+ .data = &sysctl_min_slab_ratio,
+ .maxlen = sizeof(sysctl_min_slab_ratio),
+ .mode = 0644,
+ .proc_handler = sysctl_min_slab_ratio_sysctl_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
#endif
+ {}
+};
- /* limit to 1 bucket per 2^scale bytes of low memory */
- if (scale > PAGE_SHIFT)
- numentries >>= (scale - PAGE_SHIFT);
- else
- numentries <<= (PAGE_SHIFT - scale);
-
- /* Make sure we've got at least a 0-order allocation.. */
- if (unlikely(flags & HASH_SMALL)) {
- /* Makes no sense without HASH_EARLY */
- WARN_ON(!(flags & HASH_EARLY));
- if (!(numentries >> *_hash_shift)) {
- numentries = 1UL << *_hash_shift;
- BUG_ON(!numentries);
- }
- } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
- numentries = PAGE_SIZE / bucketsize;
- }
- numentries = roundup_pow_of_two(numentries);
-
- /* limit allocation size to 1/16 total memory by default */
- if (max == 0) {
- max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
- do_div(max, bucketsize);
- }
- max = min(max, 0x80000000ULL);
-
- if (numentries < low_limit)
- numentries = low_limit;
- if (numentries > max)
- numentries = max;
-
- log2qty = ilog2(numentries);
-
- gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
- do {
- virt = false;
- size = bucketsize << log2qty;
- if (flags & HASH_EARLY) {
- if (flags & HASH_ZERO)
- table = memblock_alloc(size, SMP_CACHE_BYTES);
- else
- table = memblock_alloc_raw(size,
- SMP_CACHE_BYTES);
- } else if (get_order(size) >= MAX_ORDER || hashdist) {
- table = __vmalloc(size, gfp_flags);
- virt = true;
- } else {
- /*
- * If bucketsize is not a power-of-two, we may free
- * some pages at the end of hash table which
- * alloc_pages_exact() automatically does
- */
- table = alloc_pages_exact(size, gfp_flags);
- kmemleak_alloc(table, size, 1, gfp_flags);
- }
- } while (!table && size > PAGE_SIZE && --log2qty);
-
- if (!table)
- panic("Failed to allocate %s hash table\n", tablename);
-
- pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
- tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
- virt ? "vmalloc" : "linear");
-
- if (_hash_shift)
- *_hash_shift = log2qty;
- if (_hash_mask)
- *_hash_mask = (1 << log2qty) - 1;
-
- return table;
-}
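
A rough reconstruction of the sizing arithmetic in the removed alloc_large_system_hash(): the adaptive scale bump past 64G, the one-bucket-per-2^scale-bytes shift and the final power-of-two rounding. The low/high limits, HASH_* flags and the actual allocation are left out, and the adaptive bump is applied unconditionally here.

#include <stdio.h>

#define PAGE_SHIFT              12
#define ADAPT_SCALE_BASE        (64UL << 30)
#define ADAPT_SCALE_SHIFT       2
#define ADAPT_SCALE_NPAGES      (ADAPT_SCALE_BASE >> PAGE_SHIFT)

static unsigned long roundup_pow_of_two_ul(unsigned long x)
{
        unsigned long r = 1;

        while (r < x)
                r <<= 1;
        return r;
}

static unsigned long hash_entries(unsigned long nr_pages, int scale)
{
        unsigned long numentries = nr_pages;

        /* Every quadrupling of memory past 64G only doubles the table. */
        for (unsigned long adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
             adapt <<= ADAPT_SCALE_SHIFT)
                scale++;

        /* One bucket per 2^scale bytes of memory. */
        if (scale > PAGE_SHIFT)
                numentries >>= (scale - PAGE_SHIFT);
        else
                numentries <<= (PAGE_SHIFT - scale);

        return roundup_pow_of_two_ul(numentries);
}

int main(void)
{
        /* 16 GiB of 4 KiB pages with one bucket per 2^14 bytes of memory. */
        printf("%lu entries\n", hash_entries(4UL << 20, 14));
        return 0;
}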
-
-/*
- * This function checks whether pageblock includes unmovable pages or not.
- *
- * PageLRU check without isolation or lru_lock could race so that
- * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
- * check without lock_page also may miss some movable non-lru pages at
- * race condition. So you can't expect this function should be exact.
- *
- * Returns a page without holding a reference. If the caller wants to
- * dereference that page (e.g., dumping), it has to make sure that it
- * cannot get removed (e.g., via memory unplug) concurrently.
- *
- */
-struct page *has_unmovable_pages(struct zone *zone, struct page *page,
- int migratetype, int flags)
+void __init page_alloc_sysctl_init(void)
{
- unsigned long iter = 0;
- unsigned long pfn = page_to_pfn(page);
- unsigned long offset = pfn % pageblock_nr_pages;
-
- if (is_migrate_cma_page(page)) {
- /*
- * CMA allocations (alloc_contig_range) really need to mark
- * isolate CMA pageblocks even when they are not movable in fact
- * so consider them movable here.
- */
- if (is_migrate_cma(migratetype))
- return NULL;
-
- return page;
- }
-
- for (; iter < pageblock_nr_pages - offset; iter++) {
- if (!pfn_valid_within(pfn + iter))
- continue;
-
- page = pfn_to_page(pfn + iter);
-
- /*
- * Both, bootmem allocations and memory holes are marked
- * PG_reserved and are unmovable. We can even have unmovable
- * allocations inside ZONE_MOVABLE, for example when
- * specifying "movablecore".
- */
- if (PageReserved(page))
- return page;
-
- /*
- * If the zone is movable and we have ruled out all reserved
- * pages then it should be reasonably safe to assume the rest
- * is movable.
- */
- if (zone_idx(zone) == ZONE_MOVABLE)
- continue;
-
- /*
- * Hugepages are not in LRU lists, but they're movable.
- * THPs are on the LRU, but need to be counted as #small pages.
- * We need not scan over tail pages because we don't
- * handle each tail page individually in migration.
- */
- if (PageHuge(page) || PageTransCompound(page)) {
- struct page *head = compound_head(page);
- unsigned int skip_pages;
-
- if (PageHuge(page)) {
- if (!hugepage_migration_supported(page_hstate(head)))
- return page;
- } else if (!PageLRU(head) && !__PageMovable(head)) {
- return page;
- }
-
- skip_pages = compound_nr(head) - (page - head);
- iter += skip_pages - 1;
- continue;
- }
-
- /*
- * We can't use page_count without pin a page
- * because another CPU can free compound page.
- * This check already skips compound tails of THP
- * because their page->_refcount is zero at all time.
- */
- if (!page_ref_count(page)) {
- if (PageBuddy(page))
- iter += (1 << page_order(page)) - 1;
- continue;
- }
-
- /*
- * The HWPoisoned page may be not in buddy system, and
- * page_count() is not 0.
- */
- if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
- continue;
-
- /*
- * We treat all PageOffline() pages as movable when offlining
- * to give drivers a chance to decrement their reference count
- * in MEM_GOING_OFFLINE in order to indicate that these pages
- * can be offlined as there are no direct references anymore.
- * For actually unmovable PageOffline() where the driver does
- * not support this, we will fail later when trying to actually
- * move these pages that still have a reference count > 0.
- * (false negatives in this function only)
- */
- if ((flags & MEMORY_OFFLINE) && PageOffline(page))
- continue;
-
- if (__PageMovable(page) || PageLRU(page))
- continue;
-
- /*
- * If there are RECLAIMABLE pages, we need to check
- * it. But now, memory offline itself doesn't call
- * shrink_node_slabs() and it still to be fixed.
- */
- return page;
- }
- return NULL;
+ register_sysctl_init("vm", page_alloc_sysctl_table);
}
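
For reference, register_sysctl_init("vm", ...) exposes the entries in the table above under /proc/sys/vm. A minimal userspace sketch (not part of the patch) that reads one of the NUMA-only knobs; it assumes a kernel built with CONFIG_NUMA so that the file actually exists:

    #include <stdio.h>

    /* Illustrative only: read vm.min_slab_ratio from procfs. */
    int main(void)
    {
        FILE *f = fopen("/proc/sys/vm/min_slab_ratio", "r");
        int ratio;

        if (f && fscanf(f, "%d", &ratio) == 1)
            printf("vm.min_slab_ratio = %d\n", ratio);
        if (f)
            fclose(f);
        return 0;
    }
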
#ifdef CONFIG_CONTIG_ALLOC
-static unsigned long pfn_max_align_down(unsigned long pfn)
+/* Usage: See admin-guide/dynamic-debug-howto.rst */
+static void alloc_contig_dump_pages(struct list_head *page_list)
{
- return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
- pageblock_nr_pages) - 1);
-}
+ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
-static unsigned long pfn_max_align_up(unsigned long pfn)
-{
- return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
- pageblock_nr_pages));
+ if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
+ struct page *page;
+
+ dump_stack();
+ list_for_each_entry(page, page_list, lru)
+ dump_page(page, "migration failure");
+ }
}
/* [start, end) must belong to a single zone. */
-static int __alloc_contig_migrate_range(struct compact_control *cc,
+int __alloc_contig_migrate_range(struct compact_control *cc,
unsigned long start, unsigned long end)
{
/* This function is based on compact_zone() from compaction.c. */
@@ -8356,7 +6093,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
};
- migrate_prep();
+ lru_cache_disable();
while (pfn < end || !list_empty(&cc->migratepages)) {
if (fatal_signal_pending(current)) {
@@ -8366,14 +6103,13 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
if (list_empty(&cc->migratepages)) {
cc->nr_migratepages = 0;
- pfn = isolate_migratepages_range(cc, pfn, end);
- if (!pfn) {
- ret = -EINTR;
+ ret = isolate_migratepages_range(cc, pfn, end);
+ if (ret && ret != -EAGAIN)
break;
- }
+ pfn = cc->migrate_pfn;
tries = 0;
} else if (++tries == 5) {
- ret = ret < 0 ? ret : -EBUSY;
+ ret = -EBUSY;
break;
}
@@ -8382,9 +6118,20 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
cc->nr_migratepages -= nr_reclaimed;
ret = migrate_pages(&cc->migratepages, alloc_migration_target,
- NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
+ NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);
+
+ /*
+ * On -ENOMEM, migrate_pages() bails out right away. It is pointless
+ * to retry again over this error, so do the same here.
+ */
+ if (ret == -ENOMEM)
+ break;
}
+
+ lru_cache_enable();
if (ret < 0) {
+ if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
+ alloc_contig_dump_pages(&cc->migratepages);
putback_movable_pages(&cc->migratepages);
return ret;
}
@@ -8395,14 +6142,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
* alloc_contig_range() -- tries to allocate given range of pages
* @start: start PFN to allocate
* @end: one-past-the-last PFN to allocate
- * @migratetype: migratetype of the underlaying pageblocks (either
+ * @migratetype: migratetype of the underlying pageblocks (either
* #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
* in range must have the same migratetype and it must
* be either of the two.
* @gfp_mask: GFP mask to use during compaction
*
- * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
- * aligned. The PFN range must belong to a single zone.
+ * The PFN range does not have to be pageblock aligned. The PFN range must
+ * belong to a single zone.
*
* The first thing this routine does is attempt to MIGRATE_ISOLATE all
* pageblocks in the range. Once isolated, the pageblocks should not
@@ -8416,7 +6163,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype, gfp_t gfp_mask)
{
unsigned long outer_start, outer_end;
- unsigned int order;
+ int order;
int ret = 0;
struct compact_control cc = {
@@ -8435,14 +6182,11 @@ int alloc_contig_range(unsigned long start, unsigned long end,
* What we do here is we mark all pageblocks in range as
* MIGRATE_ISOLATE. Because pageblock and max order pages may
* have different sizes, and due to the way page allocator
- * work, we align the range to biggest of the two pages so
- * that page allocator won't try to merge buddies from
- * different pageblocks and change MIGRATE_ISOLATE to some
- * other migration type.
+ * work, start_isolate_page_range() has special handlings for this.
*
* Once the pageblocks are marked as MIGRATE_ISOLATE, we
* migrate the pages from an unaligned range (ie. pages that
- * we are interested in). This will put all the pages in
+ * we are interested in). This will put all the pages in
* range back to page allocator as MIGRATE_ISOLATE.
*
* When this is done, we take the pages in range from page
@@ -8455,10 +6199,11 @@ int alloc_contig_range(unsigned long start, unsigned long end,
* put back to page allocator so that buddy can use them.
*/
- ret = start_isolate_page_range(pfn_max_align_down(start),
- pfn_max_align_up(end), migratetype, 0);
- if (ret < 0)
- return ret;
+ ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
+ if (ret)
+ goto done;
+
+ drain_all_pages(cc.zone);
/*
* In case of -EBUSY, we'd like to know which page causes problem.
@@ -8473,10 +6218,10 @@ int alloc_contig_range(unsigned long start, unsigned long end,
ret = __alloc_contig_migrate_range(&cc, start, end);
if (ret && ret != -EBUSY)
goto done;
- ret =0;
+ ret = 0;
/*
- * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
+ * Pages from [start, end) are within a pageblock_nr_pages
* aligned blocks that are marked as MIGRATE_ISOLATE. What's
* more, all pages in [start, end) are free in page allocator.
* What we are going to do is to allocate all pages from
@@ -8492,12 +6237,10 @@ int alloc_contig_range(unsigned long start, unsigned long end,
* isolated thus they won't get removed from buddy.
*/
- lru_add_drain_all();
-
order = 0;
outer_start = start;
while (!PageBuddy(pfn_to_page(outer_start))) {
- if (++order >= MAX_ORDER) {
+ if (++order > MAX_ORDER) {
outer_start = start;
break;
}
@@ -8505,7 +6248,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
}
if (outer_start != start) {
- order = page_order(pfn_to_page(outer_start));
+ order = buddy_order(pfn_to_page(outer_start));
/*
* outer_start page could be small order buddy page and
@@ -8519,8 +6262,6 @@ int alloc_contig_range(unsigned long start, unsigned long end,
/* Make sure the range is really isolated. */
if (test_pages_isolated(outer_start, end, 0)) {
- pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
- __func__, outer_start, end);
ret = -EBUSY;
goto done;
}
@@ -8539,8 +6280,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
free_contig_range(end, outer_end - end);
done:
- undo_isolate_page_range(pfn_max_align_down(start),
- pfn_max_align_up(end), migratetype);
+ undo_isolate_page_range(start, end, migratetype);
return ret;
}
EXPORT_SYMBOL(alloc_contig_range);
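
As a rough illustration of how this pair is meant to be used (a sketch, not the CMA code itself; start_pfn/end_pfn are assumed to come from a suitably reserved MIGRATE_CMA region, and error handling is trimmed):

    int ret;

    ret = alloc_contig_range(start_pfn, end_pfn, MIGRATE_CMA, GFP_KERNEL);
    if (!ret) {
        /* [start_pfn, end_pfn) now belongs to the caller */
        use_the_range(start_pfn, end_pfn);      /* hypothetical consumer */
        free_contig_range(start_pfn, end_pfn - start_pfn);
    }
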
@@ -8571,9 +6311,6 @@ static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
if (PageReserved(page))
return false;
- if (page_count(page) > 0)
- return false;
-
if (PageHuge(page))
return false;
}
@@ -8601,8 +6338,8 @@ static bool zone_spans_last_pfn(const struct zone *zone,
* for allocation requests which can not be fulfilled with the buddy allocator.
*
* The allocated memory is always aligned to a page boundary. If nr_pages is a
- * power of two then the alignment is guaranteed to be to the given nr_pages
- * (e.g. 1GB request would be aligned to 1GB).
+ * power of two, then the allocated range is also guaranteed to be aligned to
+ * the same nr_pages (e.g. a 1GB request would be aligned to 1GB).
*
* Allocated pages can be freed with free_contig_range() or by manually calling
* __free_page() on each allocated page.
@@ -8647,9 +6384,9 @@ struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
}
#endif /* CONFIG_CONTIG_ALLOC */
-void free_contig_range(unsigned long pfn, unsigned int nr_pages)
+void free_contig_range(unsigned long pfn, unsigned long nr_pages)
{
- unsigned int count = 0;
+ unsigned long count = 0;
for (; nr_pages--; pfn++) {
struct page *page = pfn_to_page(pfn);
@@ -8657,71 +6394,67 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages)
count += page_count(page) != 1;
__free_page(page);
}
- WARN(count != 0, "%d pages are still in use!\n", count);
+ WARN(count != 0, "%lu pages are still in use!\n", count);
}
EXPORT_SYMBOL(free_contig_range);
/*
- * The zone indicated has a new number of managed_pages; batch sizes and percpu
- * page high values need to be recalulated.
+ * Effectively disable pcplists for the zone by setting the high limit to 0
+ * and draining all cpus. A concurrent page freeing on another CPU that's about
+ * to put the page on pcplist will either finish before the drain and the page
+ * will be drained, or observe the new high limit and skip the pcplist.
+ *
+ * Must be paired with a call to zone_pcp_enable().
*/
-void __meminit zone_pcp_update(struct zone *zone)
+void zone_pcp_disable(struct zone *zone)
{
mutex_lock(&pcp_batch_high_lock);
- __zone_pcp_update(zone);
+ __zone_set_pageset_high_and_batch(zone, 0, 1);
+ __drain_all_pages(zone, true);
+}
+
+void zone_pcp_enable(struct zone *zone)
+{
+ __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
mutex_unlock(&pcp_batch_high_lock);
}
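
The comment above spells out the contract; a caller is expected to bracket its work on the zone's free lists like this (a minimal sketch, assuming a struct zone *zone obtained elsewhere):

    zone_pcp_disable(zone);     /* cap pcplists at 0 and drain them */

    /* ... isolate, offline or test pages while pcplists stay empty ... */

    zone_pcp_enable(zone);      /* restore the saved high/batch values */
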
void zone_pcp_reset(struct zone *zone)
{
- unsigned long flags;
int cpu;
- struct per_cpu_pageset *pset;
+ struct per_cpu_zonestat *pzstats;
- /* avoid races with drain_pages() */
- local_irq_save(flags);
- if (zone->pageset != &boot_pageset) {
+ if (zone->per_cpu_pageset != &boot_pageset) {
for_each_online_cpu(cpu) {
- pset = per_cpu_ptr(zone->pageset, cpu);
- drain_zonestat(zone, pset);
+ pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+ drain_zonestat(zone, pzstats);
+ }
+ free_percpu(zone->per_cpu_pageset);
+ zone->per_cpu_pageset = &boot_pageset;
+ if (zone->per_cpu_zonestats != &boot_zonestats) {
+ free_percpu(zone->per_cpu_zonestats);
+ zone->per_cpu_zonestats = &boot_zonestats;
}
- free_percpu(zone->pageset);
- zone->pageset = &boot_pageset;
}
- local_irq_restore(flags);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
- * All pages in the range must be in a single zone and isolated
- * before calling this.
+ * All pages in the range must be in a single zone, must not contain holes,
+ * must span full sections, and must be isolated before calling this function.
*/
-unsigned long
-__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
+void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
+ unsigned long pfn = start_pfn;
struct page *page;
struct zone *zone;
unsigned int order;
- unsigned long pfn;
unsigned long flags;
- unsigned long offlined_pages = 0;
-
- /* find the first valid pfn */
- for (pfn = start_pfn; pfn < end_pfn; pfn++)
- if (pfn_valid(pfn))
- break;
- if (pfn == end_pfn)
- return offlined_pages;
offline_mem_sections(pfn, end_pfn);
zone = page_zone(pfn_to_page(pfn));
spin_lock_irqsave(&zone->lock, flags);
- pfn = start_pfn;
while (pfn < end_pfn) {
- if (!pfn_valid(pfn)) {
- pfn++;
- continue;
- }
page = pfn_to_page(pfn);
/*
* The HWPoisoned page may be not in buddy system, and
@@ -8729,7 +6462,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
*/
if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
pfn++;
- offlined_pages++;
continue;
}
/*
@@ -8740,68 +6472,294 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(page_count(page));
BUG_ON(PageBuddy(page));
pfn++;
- offlined_pages++;
continue;
}
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
- order = page_order(page);
- offlined_pages += 1 << order;
+ order = buddy_order(page);
del_page_from_free_list(page, zone, order);
pfn += (1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
-
- return offlined_pages;
}
#endif
+/*
+ * This function returns a stable result only if called under zone lock.
+ */
bool is_free_buddy_page(struct page *page)
{
- struct zone *zone = page_zone(page);
unsigned long pfn = page_to_pfn(page);
- unsigned long flags;
unsigned int order;
- spin_lock_irqsave(&zone->lock, flags);
- for (order = 0; order < MAX_ORDER; order++) {
+ for (order = 0; order <= MAX_ORDER; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
- if (PageBuddy(page_head) && page_order(page_head) >= order)
+ if (PageBuddy(page_head) &&
+ buddy_order_unsafe(page_head) >= order)
break;
}
- spin_unlock_irqrestore(&zone->lock, flags);
- return order < MAX_ORDER;
+ return order <= MAX_ORDER;
}
+EXPORT_SYMBOL(is_free_buddy_page);
#ifdef CONFIG_MEMORY_FAILURE
/*
- * Set PG_hwpoison flag if a given page is confirmed to be a free page. This
- * test is performed under the zone lock to prevent a race against page
- * allocation.
+ * Break down a higher-order page into sub-pages, and keep our target out of
+ * the buddy allocator.
*/
-bool set_hwpoison_free_buddy_page(struct page *page)
+static void break_down_buddy_pages(struct zone *zone, struct page *page,
+ struct page *target, int low, int high,
+ int migratetype)
+{
+ unsigned long size = 1 << high;
+ struct page *current_buddy, *next_page;
+
+ while (high > low) {
+ high--;
+ size >>= 1;
+
+ if (target >= &page[size]) {
+ next_page = page + size;
+ current_buddy = page;
+ } else {
+ next_page = page;
+ current_buddy = page + size;
+ }
+
+ if (set_page_guard(zone, current_buddy, high, migratetype))
+ continue;
+
+ if (current_buddy != target) {
+ add_to_free_list(current_buddy, zone, high, migratetype);
+ set_buddy_order(current_buddy, high);
+ page = next_page;
+ }
+ }
+}
+
+/*
+ * Take a page that will be marked as poisoned off the buddy allocator.
+ */
+bool take_page_off_buddy(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long pfn = page_to_pfn(page);
unsigned long flags;
unsigned int order;
- bool hwpoisoned = false;
+ bool ret = false;
spin_lock_irqsave(&zone->lock, flags);
- for (order = 0; order < MAX_ORDER; order++) {
+ for (order = 0; order <= MAX_ORDER; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
-
- if (PageBuddy(page_head) && page_order(page_head) >= order) {
- if (!TestSetPageHWPoison(page))
- hwpoisoned = true;
+ int page_order = buddy_order(page_head);
+
+ if (PageBuddy(page_head) && page_order >= order) {
+ unsigned long pfn_head = page_to_pfn(page_head);
+ int migratetype = get_pfnblock_migratetype(page_head,
+ pfn_head);
+
+ del_page_from_free_list(page_head, zone, page_order);
+ break_down_buddy_pages(zone, page_head, page, 0,
+ page_order, migratetype);
+ SetPageHWPoisonTakenOff(page);
+ if (!is_migrate_isolate(migratetype))
+ __mod_zone_freepage_state(zone, -1, migratetype);
+ ret = true;
break;
}
+ if (page_count(page_head) > 0)
+ break;
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return ret;
+}
+
+/*
+ * Cancel takeoff done by take_page_off_buddy().
+ */
+bool put_page_back_buddy(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long flags;
+ int migratetype = get_pfnblock_migratetype(page, pfn);
+ bool ret = false;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ if (put_page_testzero(page)) {
+ ClearPageHWPoisonTakenOff(page);
+ __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
+ if (TestClearPageHWPoison(page)) {
+ ret = true;
+ }
}
spin_unlock_irqrestore(&zone->lock, flags);
- return hwpoisoned;
+ return ret;
}
#endif
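
A hedged sketch of how the two helpers above pair up in the hwpoison path (the real callers live in mm/memory-failure.c; recovery_aborted is a hypothetical condition standing in for the unpoison/abort case):

    if (take_page_off_buddy(page)) {
        /* page is off the free lists and marked HWPoisonTakenOff */
        if (recovery_aborted)
            put_page_back_buddy(page);  /* undo the takeoff */
    }
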
+
+#ifdef CONFIG_ZONE_DMA
+bool has_managed_dma(void)
+{
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat) {
+ struct zone *zone = &pgdat->node_zones[ZONE_DMA];
+
+ if (managed_zone(zone))
+ return true;
+ }
+ return false;
+}
+#endif /* CONFIG_ZONE_DMA */
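
has_managed_dma() lets callers avoid GFP_DMA when no usable ZONE_DMA exists; a minimal sketch of that pattern (illustrative, not a specific call site):

    gfp_t gfp = GFP_KERNEL;

    if (has_managed_dma())
        gfp |= GFP_DMA;     /* only fall back to ZONE_DMA if it is managed */
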
+
+#ifdef CONFIG_UNACCEPTED_MEMORY
+
+/* Counts number of zones with unaccepted pages. */
+static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);
+
+static bool lazy_accept = true;
+
+static int __init accept_memory_parse(char *p)
+{
+ if (!strcmp(p, "lazy")) {
+ lazy_accept = true;
+ return 0;
+ } else if (!strcmp(p, "eager")) {
+ lazy_accept = false;
+ return 0;
+ } else {
+ return -EINVAL;
+ }
+}
+early_param("accept_memory", accept_memory_parse);
+
+static bool page_contains_unaccepted(struct page *page, unsigned int order)
+{
+ phys_addr_t start = page_to_phys(page);
+ phys_addr_t end = start + (PAGE_SIZE << order);
+
+ return range_contains_unaccepted_memory(start, end);
+}
+
+static void accept_page(struct page *page, unsigned int order)
+{
+ phys_addr_t start = page_to_phys(page);
+
+ accept_memory(start, start + (PAGE_SIZE << order));
+}
+
+static bool try_to_accept_memory_one(struct zone *zone)
+{
+ unsigned long flags;
+ struct page *page;
+ bool last;
+
+ if (list_empty(&zone->unaccepted_pages))
+ return false;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ page = list_first_entry_or_null(&zone->unaccepted_pages,
+ struct page, lru);
+ if (!page) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return false;
+ }
+
+ list_del(&page->lru);
+ last = list_empty(&zone->unaccepted_pages);
+
+ __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
+ __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ accept_page(page, MAX_ORDER);
+
+ __free_pages_ok(page, MAX_ORDER, FPI_TO_TAIL);
+
+ if (last)
+ static_branch_dec(&zones_with_unaccepted_pages);
+
+ return true;
+}
+
+static bool try_to_accept_memory(struct zone *zone, unsigned int order)
+{
+ long to_accept;
+ int ret = false;
+
+ /* How much to accept to get to high watermark? */
+ to_accept = high_wmark_pages(zone) -
+ (zone_page_state(zone, NR_FREE_PAGES) -
+ __zone_watermark_unusable_free(zone, order, 0));
+
+ /* Accept at least one page */
+ do {
+ if (!try_to_accept_memory_one(zone))
+ break;
+ ret = true;
+ to_accept -= MAX_ORDER_NR_PAGES;
+ } while (to_accept > 0);
+
+ return ret;
+}
+
+static inline bool has_unaccepted_memory(void)
+{
+ return static_branch_unlikely(&zones_with_unaccepted_pages);
+}
+
+static bool __free_unaccepted(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+ unsigned long flags;
+ bool first = false;
+
+ if (!lazy_accept)
+ return false;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ first = list_empty(&zone->unaccepted_pages);
+ list_add_tail(&page->lru, &zone->unaccepted_pages);
+ __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
+ __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ if (first)
+ static_branch_inc(&zones_with_unaccepted_pages);
+
+ return true;
+}
+
+#else
+
+static bool page_contains_unaccepted(struct page *page, unsigned int order)
+{
+ return false;
+}
+
+static void accept_page(struct page *page, unsigned int order)
+{
+}
+
+static bool try_to_accept_memory(struct zone *zone, unsigned int order)
+{
+ return false;
+}
+
+static inline bool has_unaccepted_memory(void)
+{
+ return false;
+}
+
+static bool __free_unaccepted(struct page *page)
+{
+ BUILD_BUG();
+ return false;
+}
+
+#endif /* CONFIG_UNACCEPTED_MEMORY */
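
Taken together, __free_unaccepted() parks MAX_ORDER chunks on zone->unaccepted_pages at free time, and the allocator's zone scan is expected to pull them back in roughly like the sketch below (hedged; the retry label and the surrounding slow-path logic are assumptions, not shown in this hunk):

    if (has_unaccepted_memory() && try_to_accept_memory(zone, order))
        goto retry;     /* freshly accepted pages may now satisfy the request */
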
diff --git a/mm/page_counter.c b/mm/page_counter.c
index b24a60b28bb0..db20d6452b71 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -17,24 +17,23 @@ static void propagate_protected_usage(struct page_counter *c,
unsigned long usage)
{
unsigned long protected, old_protected;
- unsigned long low, min;
long delta;
if (!c->parent)
return;
- min = READ_ONCE(c->min);
- if (min || atomic_long_read(&c->min_usage)) {
- protected = min(usage, min);
+ protected = min(usage, READ_ONCE(c->min));
+ old_protected = atomic_long_read(&c->min_usage);
+ if (protected != old_protected) {
old_protected = atomic_long_xchg(&c->min_usage, protected);
delta = protected - old_protected;
if (delta)
atomic_long_add(delta, &c->parent->children_min_usage);
}
- low = READ_ONCE(c->low);
- if (low || atomic_long_read(&c->low_usage)) {
- protected = min(usage, low);
+ protected = min(usage, READ_ONCE(c->low));
+ old_protected = atomic_long_read(&c->low_usage);
+ if (protected != old_protected) {
old_protected = atomic_long_xchg(&c->low_usage, protected);
delta = protected - old_protected;
if (delta)
@@ -52,9 +51,13 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
long new;
new = atomic_long_sub_return(nr_pages, &counter->usage);
- propagate_protected_usage(counter, new);
/* More uncharges than charges? */
- WARN_ON_ONCE(new < 0);
+ if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n",
+ new, nr_pages)) {
+ new = 0;
+ atomic_long_set(&counter->usage, new);
+ }
+ propagate_protected_usage(counter, new);
}
/**
@@ -116,7 +119,6 @@ bool page_counter_try_charge(struct page_counter *counter,
new = atomic_long_add_return(nr_pages, &c->usage);
if (new > c->max) {
atomic_long_sub(nr_pages, &c->usage);
- propagate_protected_usage(c, new);
/*
* This is racy, but we can live with some
* inaccuracy in the failcnt which is only used
@@ -183,14 +185,14 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
* the limit, so if it sees the old limit, we see the
* modified counter and retry.
*/
- usage = atomic_long_read(&counter->usage);
+ usage = page_counter_read(counter);
if (usage > nr_pages)
return -EBUSY;
old = xchg(&counter->max, nr_pages);
- if (atomic_long_read(&counter->usage) <= usage)
+ if (page_counter_read(counter) <= usage || nr_pages >= old)
return 0;
counter->max = old;
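
To make the propagation rule above concrete, here is a hedged sketch using the usual page_counter API (page_counter_init(), page_counter_set_min(), page_counter_charge()); real users embed the counters in zeroed memcg structures rather than on-stack objects:

    struct page_counter parent = {}, child = {};

    page_counter_init(&parent, NULL);
    page_counter_init(&child, &parent);

    page_counter_set_min(&child, 100);  /* protect up to 100 pages */
    page_counter_charge(&child, 40);    /* usage = 40 */
    /*
     * propagate_protected_usage() stores min(40, 100) = 40 in
     * child.min_usage and adds the delta to parent.children_min_usage.
     */
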
diff --git a/mm/page_ext.c b/mm/page_ext.c
index a3616f7a0e9e..dc1626be458b 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -8,6 +8,8 @@
#include <linux/kmemleak.h>
#include <linux/page_owner.h>
#include <linux/page_idle.h>
+#include <linux/page_table_check.h>
+#include <linux/rcupdate.h>
/*
* struct page extension
@@ -34,7 +36,7 @@
*
* The need callback is used to decide whether extended memory allocation is
* needed or not. Sometimes users want to deactivate some features in this
- * boot and extra memory would be unneccessary. In this case, to avoid
+ * boot and extra memory would be unnecessary. In this case, to avoid
* allocating huge chunk of memory, each clients represent their need of
* extra memory through the need callback. If one of the need callbacks
* returns true, it means that someone needs extra memory so that
@@ -58,18 +60,45 @@
* can utilize this callback to initialize the state of it correctly.
*/
-static struct page_ext_operations *page_ext_ops[] = {
+#ifdef CONFIG_SPARSEMEM
+#define PAGE_EXT_INVALID (0x1)
+#endif
+
+#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
+static bool need_page_idle(void)
+{
+ return true;
+}
+static struct page_ext_operations page_idle_ops __initdata = {
+ .need = need_page_idle,
+ .need_shared_flags = true,
+};
+#endif
+
+static struct page_ext_operations *page_ext_ops[] __initdata = {
#ifdef CONFIG_PAGE_OWNER
&page_owner_ops,
#endif
-#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
+#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
&page_idle_ops,
#endif
+#ifdef CONFIG_PAGE_TABLE_CHECK
+ &page_table_check_ops,
+#endif
};
-unsigned long page_ext_size = sizeof(struct page_ext);
+unsigned long page_ext_size;
static unsigned long total_usage;
+static struct page_ext *lookup_page_ext(const struct page *page);
+
+bool early_page_ext __meminitdata;
+static int __init setup_early_page_ext(char *str)
+{
+ early_page_ext = true;
+ return 0;
+}
+early_param("early_page_ext", setup_early_page_ext);
static bool __init invoke_need_callbacks(void)
{
@@ -78,7 +107,16 @@ static bool __init invoke_need_callbacks(void)
bool need = false;
for (i = 0; i < entries; i++) {
- if (page_ext_ops[i]->need && page_ext_ops[i]->need()) {
+ if (page_ext_ops[i]->need()) {
+ if (page_ext_ops[i]->need_shared_flags) {
+ page_ext_size = sizeof(struct page_ext);
+ break;
+ }
+ }
+ }
+
+ for (i = 0; i < entries; i++) {
+ if (page_ext_ops[i]->need()) {
page_ext_ops[i]->offset = page_ext_size;
page_ext_size += page_ext_ops[i]->size;
need = true;
@@ -99,12 +137,61 @@ static void __init invoke_init_callbacks(void)
}
}
+#ifndef CONFIG_SPARSEMEM
+void __init page_ext_init_flatmem_late(void)
+{
+ invoke_init_callbacks();
+}
+#endif
+
static inline struct page_ext *get_entry(void *base, unsigned long index)
{
return base + page_ext_size * index;
}
-#if !defined(CONFIG_SPARSEMEM)
+/**
+ * page_ext_get() - Get the extended information for a page.
+ * @page: The page we're interested in.
+ *
+ * Ensures that the page_ext will remain valid until page_ext_put()
+ * is called.
+ *
+ * Return: NULL if no page_ext exists for this page.
+ * Context: Any context. Caller may not sleep until they have called
+ * page_ext_put().
+ */
+struct page_ext *page_ext_get(struct page *page)
+{
+ struct page_ext *page_ext;
+
+ rcu_read_lock();
+ page_ext = lookup_page_ext(page);
+ if (!page_ext) {
+ rcu_read_unlock();
+ return NULL;
+ }
+
+ return page_ext;
+}
+
+/**
+ * page_ext_put() - Working with page extended information is done.
+ * @page_ext: Page extended information received from page_ext_get().
+ *
+ * The page extended information of the page may not be valid after this
+ * function is called.
+ *
+ * Return: None.
+ * Context: Any context in which the corresponding page_ext_get() was called.
+ */
+void page_ext_put(struct page_ext *page_ext)
+{
+ if (unlikely(!page_ext))
+ return;
+
+ rcu_read_unlock();
+}
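
The intended calling pattern for the two helpers above, as a short sketch:

    struct page_ext *page_ext = page_ext_get(page);

    if (page_ext) {
        /* ... read or update the extension while RCU keeps it alive ... */
        page_ext_put(page_ext);
    }
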
+#ifndef CONFIG_SPARSEMEM
void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
@@ -112,12 +199,13 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
pgdat->node_page_ext = NULL;
}
-struct page_ext *lookup_page_ext(const struct page *page)
+static struct page_ext *lookup_page_ext(const struct page *page)
{
unsigned long pfn = page_to_pfn(page);
unsigned long index;
struct page_ext *base;
+ WARN_ON_ONCE(!rcu_read_lock_held());
base = NODE_DATA(page_to_nid(page))->node_page_ext;
/*
* The sanity checks the page allocator does upon freeing a
@@ -177,7 +265,6 @@ void __init page_ext_init_flatmem(void)
goto fail;
}
pr_info("allocated %ld bytes of page_ext\n", total_usage);
- invoke_init_callbacks();
return;
fail:
@@ -185,21 +272,28 @@ fail:
panic("Out of memory");
}
-#else /* CONFIG_FLAT_NODE_MEM_MAP */
+#else /* CONFIG_SPARSEMEM */
+static bool page_ext_invalid(struct page_ext *page_ext)
+{
+ return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID);
+}
-struct page_ext *lookup_page_ext(const struct page *page)
+static struct page_ext *lookup_page_ext(const struct page *page)
{
unsigned long pfn = page_to_pfn(page);
struct mem_section *section = __pfn_to_section(pfn);
+ struct page_ext *page_ext = READ_ONCE(section->page_ext);
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
* allocated when feeding a range of pages to the allocator
* for the first time during bootup or memory hotplug.
*/
- if (!section->page_ext)
+ if (page_ext_invalid(page_ext))
return NULL;
- return get_entry(section->page_ext, pfn);
+ return get_entry(page_ext, pfn);
}
static void *__meminit alloc_page_ext(size_t size, int nid)
@@ -253,7 +347,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid)
total_usage += table_size;
return 0;
}
-#ifdef CONFIG_MEMORY_HOTPLUG
+
static void free_page_ext(void *addr)
{
if (is_vmalloc_addr(addr)) {
@@ -278,9 +372,30 @@ static void __free_page_ext(unsigned long pfn)
ms = __pfn_to_section(pfn);
if (!ms || !ms->page_ext)
return;
- base = get_entry(ms->page_ext, pfn);
+
+ base = READ_ONCE(ms->page_ext);
+ /*
+ * page_ext here can be valid while doing the roll back
+ * operation in online_page_ext().
+ */
+ if (page_ext_invalid(base))
+ base = (void *)base - PAGE_EXT_INVALID;
+ WRITE_ONCE(ms->page_ext, NULL);
+
+ base = get_entry(base, pfn);
free_page_ext(base);
- ms->page_ext = NULL;
+}
+
+static void __invalidate_page_ext(unsigned long pfn)
+{
+ struct mem_section *ms;
+ void *val;
+
+ ms = __pfn_to_section(pfn);
+ if (!ms || !ms->page_ext)
+ return;
+ val = (void *)ms->page_ext + PAGE_EXT_INVALID;
+ WRITE_ONCE(ms->page_ext, val);
}
static int __meminit online_page_ext(unsigned long start_pfn,
@@ -300,7 +415,7 @@ static int __meminit online_page_ext(unsigned long start_pfn,
* online__pages(), and start_pfn should exist.
*/
nid = pfn_to_nid(start_pfn);
- VM_BUG_ON(!node_state(nid, N_ONLINE));
+ VM_BUG_ON(!node_online(nid));
}
for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
@@ -316,13 +431,27 @@ static int __meminit online_page_ext(unsigned long start_pfn,
}
static int __meminit offline_page_ext(unsigned long start_pfn,
- unsigned long nr_pages, int nid)
+ unsigned long nr_pages)
{
unsigned long start, end, pfn;
start = SECTION_ALIGN_DOWN(start_pfn);
end = SECTION_ALIGN_UP(start_pfn + nr_pages);
+ /*
+ * Freeing of page_ext is done in 3 steps to avoid
+ * use-after-free of it:
+ * 1) Traverse all the sections and mark their page_ext
+ * as invalid.
+ * 2) Wait for all the existing users of page_ext who
+ * started before invalidation to finish.
+ * 3) Free the page_ext.
+ */
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+ __invalidate_page_ext(pfn);
+
+ synchronize_rcu();
+
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
__free_page_ext(pfn);
return 0;
@@ -342,11 +471,11 @@ static int __meminit page_ext_callback(struct notifier_block *self,
break;
case MEM_OFFLINE:
offline_page_ext(mn->start_pfn,
- mn->nr_pages, mn->status_change_nid);
+ mn->nr_pages);
break;
case MEM_CANCEL_ONLINE:
offline_page_ext(mn->start_pfn,
- mn->nr_pages, mn->status_change_nid);
+ mn->nr_pages);
break;
case MEM_GOING_OFFLINE:
break;
@@ -358,8 +487,6 @@ static int __meminit page_ext_callback(struct notifier_block *self,
return notifier_from_errno(ret);
}
-#endif
-
void __init page_ext_init(void)
{
unsigned long pfn;
@@ -396,7 +523,7 @@ void __init page_ext_init(void)
cond_resched();
}
}
- hotplug_memory_notifier(page_ext_callback, 0);
+ hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI);
pr_info("allocated %ld bytes of page_ext\n", total_usage);
invoke_init_callbacks();
return;
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 057c61df12db..41ea77f22011 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -13,6 +13,8 @@
#include <linux/page_ext.h>
#include <linux/page_idle.h>
+#include "internal.h"
+
#define BITMAP_CHUNK_SIZE sizeof(u64)
#define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
@@ -29,34 +31,29 @@
*
* This function tries to get a user memory page by pfn as described above.
*/
-static struct page *page_idle_get_page(unsigned long pfn)
+static struct folio *page_idle_get_folio(unsigned long pfn)
{
struct page *page = pfn_to_online_page(pfn);
- pg_data_t *pgdat;
+ struct folio *folio;
- if (!page || !PageLRU(page) ||
- !get_page_unless_zero(page))
+ if (!page || PageTail(page))
return NULL;
- pgdat = page_pgdat(page);
- spin_lock_irq(&pgdat->lru_lock);
- if (unlikely(!PageLRU(page))) {
- put_page(page);
- page = NULL;
+ folio = page_folio(page);
+ if (!folio_test_lru(folio) || !folio_try_get(folio))
+ return NULL;
+ if (unlikely(page_folio(page) != folio || !folio_test_lru(folio))) {
+ folio_put(folio);
+ folio = NULL;
}
- spin_unlock_irq(&pgdat->lru_lock);
- return page;
+ return folio;
}
-static bool page_idle_clear_pte_refs_one(struct page *page,
+static bool page_idle_clear_pte_refs_one(struct folio *folio,
struct vm_area_struct *vma,
unsigned long addr, void *arg)
{
- struct page_vma_mapped_walk pvmw = {
- .page = page,
- .vma = vma,
- .address = addr,
- };
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
bool referenced = false;
while (page_vma_mapped_walk(&pvmw)) {
@@ -78,41 +75,40 @@ static bool page_idle_clear_pte_refs_one(struct page *page,
}
if (referenced) {
- clear_page_idle(page);
+ folio_clear_idle(folio);
/*
* We cleared the referenced bit in a mapping to this page. To
* avoid interference with page reclaim, mark it young so that
- * page_referenced() will return > 0.
+ * folio_referenced() will return > 0.
*/
- set_page_young(page);
+ folio_set_young(folio);
}
return true;
}
-static void page_idle_clear_pte_refs(struct page *page)
+static void page_idle_clear_pte_refs(struct folio *folio)
{
/*
- * Since rwc.arg is unused, rwc is effectively immutable, so we
- * can make it static const to save some cycles and stack.
+ * Since rwc.try_lock is unused, rwc is effectively immutable, so we
+ * can make it static to save some cycles and stack.
*/
- static const struct rmap_walk_control rwc = {
+ static struct rmap_walk_control rwc = {
.rmap_one = page_idle_clear_pte_refs_one,
- .anon_lock = page_lock_anon_vma_read,
+ .anon_lock = folio_lock_anon_vma_read,
};
bool need_lock;
- if (!page_mapped(page) ||
- !page_rmapping(page))
+ if (!folio_mapped(folio) || !folio_raw_mapping(folio))
return;
- need_lock = !PageAnon(page) || PageKsm(page);
- if (need_lock && !trylock_page(page))
+ need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
+ if (need_lock && !folio_trylock(folio))
return;
- rmap_walk(page, (struct rmap_walk_control *)&rwc);
+ rmap_walk(folio, &rwc);
if (need_lock)
- unlock_page(page);
+ folio_unlock(folio);
}
static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
@@ -120,7 +116,7 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
loff_t pos, size_t count)
{
u64 *out = (u64 *)buf;
- struct page *page;
+ struct folio *folio;
unsigned long pfn, end_pfn;
int bit;
@@ -139,19 +135,19 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
bit = pfn % BITMAP_CHUNK_BITS;
if (!bit)
*out = 0ULL;
- page = page_idle_get_page(pfn);
- if (page) {
- if (page_is_idle(page)) {
+ folio = page_idle_get_folio(pfn);
+ if (folio) {
+ if (folio_test_idle(folio)) {
/*
* The page might have been referenced via a
* pte, in which case it is not idle. Clear
* refs and recheck.
*/
- page_idle_clear_pte_refs(page);
- if (page_is_idle(page))
+ page_idle_clear_pte_refs(folio);
+ if (folio_test_idle(folio))
*out |= 1ULL << bit;
}
- put_page(page);
+ folio_put(folio);
}
if (bit == BITMAP_CHUNK_BITS - 1)
out++;
@@ -165,7 +161,7 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
loff_t pos, size_t count)
{
const u64 *in = (u64 *)buf;
- struct page *page;
+ struct folio *folio;
unsigned long pfn, end_pfn;
int bit;
@@ -183,11 +179,11 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
for (; pfn < end_pfn; pfn++) {
bit = pfn % BITMAP_CHUNK_BITS;
if ((*in >> bit) & 1) {
- page = page_idle_get_page(pfn);
- if (page) {
- page_idle_clear_pte_refs(page);
- set_page_idle(page);
- put_page(page);
+ folio = page_idle_get_folio(pfn);
+ if (folio) {
+ page_idle_clear_pte_refs(folio);
+ folio_set_idle(folio);
+ folio_put(folio);
}
}
if (bit == BITMAP_CHUNK_BITS - 1)
@@ -211,16 +207,6 @@ static const struct attribute_group page_idle_attr_group = {
.name = "page_idle",
};
-#ifndef CONFIG_64BIT
-static bool need_page_idle(void)
-{
- return true;
-}
-struct page_ext_operations page_idle_ops = {
- .need = need_page_idle,
-};
-#endif
-
static int __init page_idle_init(void)
{
int err;
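
The handlers above back /sys/kernel/mm/page_idle/bitmap, and the folio conversion leaves the userspace ABI unchanged. A hedged userspace sketch that reads the first 64-bit chunk (semantics as documented in admin-guide/mm/idle_page_tracking.rst; each u64 covers 64 consecutive PFNs):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        uint64_t chunk;
        int fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDONLY);

        if (fd >= 0 && pread(fd, &chunk, sizeof(chunk), 0) == sizeof(chunk))
            printf("idle bits for PFNs 0-63: %016llx\n",
                   (unsigned long long)chunk);
        if (fd >= 0)
            close(fd);
        return 0;
    }
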
diff --git a/mm/page_io.c b/mm/page_io.c
index 433df1263349..684cd3c7b59b 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -18,34 +18,16 @@
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/swapops.h>
-#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/frontswap.h>
#include <linux/blkdev.h>
#include <linux/psi.h>
#include <linux/uio.h>
#include <linux/sched/task.h>
+#include <linux/delayacct.h>
+#include "swap.h"
-static struct bio *get_swap_bio(gfp_t gfp_flags,
- struct page *page, bio_end_io_t end_io)
-{
- struct bio *bio;
-
- bio = bio_alloc(gfp_flags, 1);
- if (bio) {
- struct block_device *bdev;
-
- bio->bi_iter.bi_sector = map_swap_page(page, &bdev);
- bio_set_dev(bio, bdev);
- bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
- bio->bi_end_io = end_io;
-
- bio_add_page(bio, page, thp_size(page), 0);
- }
- return bio;
-}
-
-void end_swap_bio_write(struct bio *bio)
+static void __end_swap_bio_write(struct bio *bio)
{
struct page *page = bio_first_page_all(bio);
@@ -57,90 +39,43 @@ void end_swap_bio_write(struct bio *bio)
* Also print a dire warning that things will go BAD (tm)
* very quickly.
*
- * Also clear PG_reclaim to avoid rotate_reclaimable_page()
+ * Also clear PG_reclaim to avoid folio_rotate_reclaimable()
*/
set_page_dirty(page);
- pr_alert("Write-error on swap-device (%u:%u:%llu)\n",
- MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
- (unsigned long long)bio->bi_iter.bi_sector);
+ pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
+ MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
+ (unsigned long long)bio->bi_iter.bi_sector);
ClearPageReclaim(page);
}
end_page_writeback(page);
- bio_put(bio);
}
-static void swap_slot_free_notify(struct page *page)
+static void end_swap_bio_write(struct bio *bio)
{
- struct swap_info_struct *sis;
- struct gendisk *disk;
- swp_entry_t entry;
-
- /*
- * There is no guarantee that the page is in swap cache - the software
- * suspend code (at least) uses end_swap_bio_read() against a non-
- * swapcache page. So we must check PG_swapcache before proceeding with
- * this optimization.
- */
- if (unlikely(!PageSwapCache(page)))
- return;
-
- sis = page_swap_info(page);
- if (data_race(!(sis->flags & SWP_BLKDEV)))
- return;
-
- /*
- * The swap subsystem performs lazy swap slot freeing,
- * expecting that the page will be swapped out again.
- * So we can avoid an unnecessary write if the page
- * isn't redirtied.
- * This is good for real swap storage because we can
- * reduce unnecessary I/O and enhance wear-leveling
- * if an SSD is used as the as swap device.
- * But if in-memory swap device (eg zram) is used,
- * this causes a duplicated copy between uncompressed
- * data in VM-owned memory and compressed data in
- * zram-owned memory. So let's free zram-owned memory
- * and make the VM-owned decompressed page *dirty*,
- * so the page should be swapped out somewhere again if
- * we again wish to reclaim it.
- */
- disk = sis->bdev->bd_disk;
- entry.val = page_private(page);
- if (disk->fops->swap_slot_free_notify && __swap_count(entry) == 1) {
- unsigned long offset;
-
- offset = swp_offset(entry);
-
- SetPageDirty(page);
- disk->fops->swap_slot_free_notify(sis->bdev,
- offset);
- }
+ __end_swap_bio_write(bio);
+ bio_put(bio);
}
-static void end_swap_bio_read(struct bio *bio)
+static void __end_swap_bio_read(struct bio *bio)
{
struct page *page = bio_first_page_all(bio);
- struct task_struct *waiter = bio->bi_private;
if (bio->bi_status) {
SetPageError(page);
ClearPageUptodate(page);
- pr_alert("Read-error on swap-device (%u:%u:%llu)\n",
- MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
- (unsigned long long)bio->bi_iter.bi_sector);
- goto out;
+ pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n",
+ MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
+ (unsigned long long)bio->bi_iter.bi_sector);
+ } else {
+ SetPageUptodate(page);
}
-
- SetPageUptodate(page);
- swap_slot_free_notify(page);
-out:
unlock_page(page);
- WRITE_ONCE(bio->bi_private, NULL);
+}
+
+static void end_swap_bio_read(struct bio *bio)
+{
+ __end_swap_bio_read(bio);
bio_put(bio);
- if (waiter) {
- blk_wake_io_task(waiter);
- put_task_struct(waiter);
- }
}
int generic_swapfile_activate(struct swap_info_struct *sis,
@@ -246,36 +181,31 @@ bad_bmap:
*/
int swap_writepage(struct page *page, struct writeback_control *wbc)
{
- int ret = 0;
+ struct folio *folio = page_folio(page);
+ int ret;
- if (try_to_free_swap(page)) {
- unlock_page(page);
- goto out;
+ if (folio_free_swap(folio)) {
+ folio_unlock(folio);
+ return 0;
}
/*
* Arch code may have to preserve more data than just the page
* contents, e.g. memory tags.
*/
- ret = arch_prepare_to_swap(page);
+ ret = arch_prepare_to_swap(&folio->page);
if (ret) {
- set_page_dirty(page);
- unlock_page(page);
- goto out;
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ return ret;
}
- if (frontswap_store(page) == 0) {
- set_page_writeback(page);
- unlock_page(page);
- end_page_writeback(page);
- goto out;
+ if (frontswap_store(&folio->page) == 0) {
+ folio_start_writeback(folio);
+ folio_unlock(folio);
+ folio_end_writeback(folio);
+ return 0;
}
- ret = __swap_writepage(page, wbc, end_swap_bio_write);
-out:
- return ret;
-}
-
-static sector_t swap_page_sector(struct page *page)
-{
- return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
+ __swap_writepage(&folio->page, wbc);
+ return 0;
}
static inline void count_swpout_vm_event(struct page *page)
@@ -291,12 +221,14 @@ static inline void count_swpout_vm_event(struct page *page)
static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
{
struct cgroup_subsys_state *css;
+ struct mem_cgroup *memcg;
- if (!page->mem_cgroup)
+ memcg = page_memcg(page);
+ if (!memcg)
return;
rcu_read_lock();
- css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
+ css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
bio_associate_blkg_from_css(bio, css);
rcu_read_unlock();
}
@@ -304,173 +236,311 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
#define bio_associate_blkg_from_page(bio, page) do { } while (0)
#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */
-int __swap_writepage(struct page *page, struct writeback_control *wbc,
- bio_end_io_t end_write_func)
+struct swap_iocb {
+ struct kiocb iocb;
+ struct bio_vec bvec[SWAP_CLUSTER_MAX];
+ int pages;
+ int len;
+};
+static mempool_t *sio_pool;
+
+int sio_pool_init(void)
{
- struct bio *bio;
- int ret;
- struct swap_info_struct *sis = page_swap_info(page);
+ if (!sio_pool) {
+ mempool_t *pool = mempool_create_kmalloc_pool(
+ SWAP_CLUSTER_MAX, sizeof(struct swap_iocb));
+ if (cmpxchg(&sio_pool, NULL, pool))
+ mempool_destroy(pool);
+ }
+ if (!sio_pool)
+ return -ENOMEM;
+ return 0;
+}
- VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- if (data_race(sis->flags & SWP_FS_OPS)) {
- struct kiocb kiocb;
- struct file *swap_file = sis->swap_file;
- struct address_space *mapping = swap_file->f_mapping;
- struct bio_vec bv = {
- .bv_page = page,
- .bv_len = PAGE_SIZE,
- .bv_offset = 0
- };
- struct iov_iter from;
-
- iov_iter_bvec(&from, WRITE, &bv, 1, PAGE_SIZE);
- init_sync_kiocb(&kiocb, swap_file);
- kiocb.ki_pos = page_file_offset(page);
-
- set_page_writeback(page);
- unlock_page(page);
- ret = mapping->a_ops->direct_IO(&kiocb, &from);
- if (ret == PAGE_SIZE) {
- count_vm_event(PSWPOUT);
- ret = 0;
- } else {
- /*
- * In the case of swap-over-nfs, this can be a
- * temporary failure if the system has limited
- * memory for allocating transmit buffers.
- * Mark the page dirty and avoid
- * rotate_reclaimable_page but rate-limit the
- * messages but do not flag PageError like
- * the normal direct-to-bio case as it could
- * be temporary.
- */
+static void sio_write_complete(struct kiocb *iocb, long ret)
+{
+ struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
+ struct page *page = sio->bvec[0].bv_page;
+ int p;
+
+ if (ret != sio->len) {
+ /*
+ * In the case of swap-over-nfs, this can be a
+ * temporary failure if the system has limited
+ * memory for allocating transmit buffers.
+ * Mark the page dirty and avoid
+ * folio_rotate_reclaimable(), but rate-limit the
+ * messages; do not flag PageError like the normal
+ * direct-to-bio case, as the failure could be
+ * temporary.
+ */
+ pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
+ ret, page_file_offset(page));
+ for (p = 0; p < sio->pages; p++) {
+ page = sio->bvec[p].bv_page;
set_page_dirty(page);
ClearPageReclaim(page);
- pr_err_ratelimited("Write error on dio swapfile (%llu)\n",
- page_file_offset(page));
}
- end_page_writeback(page);
- return ret;
+ } else {
+ for (p = 0; p < sio->pages; p++)
+ count_swpout_vm_event(sio->bvec[p].bv_page);
}
- ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
- if (!ret) {
- count_swpout_vm_event(page);
- return 0;
- }
+ for (p = 0; p < sio->pages; p++)
+ end_page_writeback(sio->bvec[p].bv_page);
- bio = get_swap_bio(GFP_NOIO, page, end_write_func);
- if (bio == NULL) {
- set_page_dirty(page);
- unlock_page(page);
- return -ENOMEM;
+ mempool_free(sio, sio_pool);
+}
+
+static void swap_writepage_fs(struct page *page, struct writeback_control *wbc)
+{
+ struct swap_iocb *sio = NULL;
+ struct swap_info_struct *sis = page_swap_info(page);
+ struct file *swap_file = sis->swap_file;
+ loff_t pos = page_file_offset(page);
+
+ set_page_writeback(page);
+ unlock_page(page);
+ if (wbc->swap_plug)
+ sio = *wbc->swap_plug;
+ if (sio) {
+ if (sio->iocb.ki_filp != swap_file ||
+ sio->iocb.ki_pos + sio->len != pos) {
+ swap_write_unplug(sio);
+ sio = NULL;
+ }
}
- bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
- bio_associate_blkg_from_page(bio, page);
+ if (!sio) {
+ sio = mempool_alloc(sio_pool, GFP_NOIO);
+ init_sync_kiocb(&sio->iocb, swap_file);
+ sio->iocb.ki_complete = sio_write_complete;
+ sio->iocb.ki_pos = pos;
+ sio->pages = 0;
+ sio->len = 0;
+ }
+ bvec_set_page(&sio->bvec[sio->pages], page, thp_size(page), 0);
+ sio->len += thp_size(page);
+ sio->pages += 1;
+ if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) {
+ swap_write_unplug(sio);
+ sio = NULL;
+ }
+ if (wbc->swap_plug)
+ *wbc->swap_plug = sio;
+}
+
+static void swap_writepage_bdev_sync(struct page *page,
+ struct writeback_control *wbc, struct swap_info_struct *sis)
+{
+ struct bio_vec bv;
+ struct bio bio;
+
+ bio_init(&bio, sis->bdev, &bv, 1,
+ REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc));
+ bio.bi_iter.bi_sector = swap_page_sector(page);
+ __bio_add_page(&bio, page, thp_size(page), 0);
+
+ bio_associate_blkg_from_page(&bio, page);
count_swpout_vm_event(page);
+
set_page_writeback(page);
unlock_page(page);
- submit_bio(bio);
- return 0;
+ submit_bio_wait(&bio);
+ __end_swap_bio_write(&bio);
}
-int swap_readpage(struct page *page, bool synchronous)
+static void swap_writepage_bdev_async(struct page *page,
+ struct writeback_control *wbc, struct swap_info_struct *sis)
{
struct bio *bio;
- int ret = 0;
- struct swap_info_struct *sis = page_swap_info(page);
- blk_qc_t qc;
- struct gendisk *disk;
- unsigned long pflags;
- VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(PageUptodate(page), page);
+ bio = bio_alloc(sis->bdev, 1,
+ REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
+ GFP_NOIO);
+ bio->bi_iter.bi_sector = swap_page_sector(page);
+ bio->bi_end_io = end_swap_bio_write;
+ __bio_add_page(bio, page, thp_size(page), 0);
+
+ bio_associate_blkg_from_page(bio, page);
+ count_swpout_vm_event(page);
+ set_page_writeback(page);
+ unlock_page(page);
+ submit_bio(bio);
+}
+void __swap_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct swap_info_struct *sis = page_swap_info(page);
+
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
/*
- * Count submission time as memory stall. When the device is congested,
- * or the submitting cgroup IO-throttled, submission can be a
- * significant part of overall IO time.
+ * ->flags can be updated non-atomically (scan_swap_map_slots),
+ * but that will never affect SWP_FS_OPS, so the data_race
+ * is safe.
*/
- psi_memstall_enter(&pflags);
+ if (data_race(sis->flags & SWP_FS_OPS))
+ swap_writepage_fs(page, wbc);
+ else if (sis->flags & SWP_SYNCHRONOUS_IO)
+ swap_writepage_bdev_sync(page, wbc, sis);
+ else
+ swap_writepage_bdev_async(page, wbc, sis);
+}
- if (frontswap_load(page) == 0) {
- SetPageUptodate(page);
- unlock_page(page);
- goto out;
- }
+void swap_write_unplug(struct swap_iocb *sio)
+{
+ struct iov_iter from;
+ struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
+ int ret;
- if (data_race(sis->flags & SWP_FS_OPS)) {
- struct file *swap_file = sis->swap_file;
- struct address_space *mapping = swap_file->f_mapping;
+ iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len);
+ ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
+ if (ret != -EIOCBQUEUED)
+ sio_write_complete(&sio->iocb, ret);
+}
- ret = mapping->a_ops->readpage(swap_file, page);
- if (!ret)
- count_vm_event(PSWPIN);
- goto out;
- }
+static void sio_read_complete(struct kiocb *iocb, long ret)
+{
+ struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
+ int p;
- if (sis->flags & SWP_SYNCHRONOUS_IO) {
- ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
- if (!ret) {
- if (trylock_page(page)) {
- swap_slot_free_notify(page);
- unlock_page(page);
- }
+ if (ret == sio->len) {
+ for (p = 0; p < sio->pages; p++) {
+ struct page *page = sio->bvec[p].bv_page;
- count_vm_event(PSWPIN);
- goto out;
+ SetPageUptodate(page);
+ unlock_page(page);
}
+ count_vm_events(PSWPIN, sio->pages);
+ } else {
+ for (p = 0; p < sio->pages; p++) {
+ struct page *page = sio->bvec[p].bv_page;
+
+ SetPageError(page);
+ ClearPageUptodate(page);
+ unlock_page(page);
+ }
+ pr_alert_ratelimited("Read-error on swap-device\n");
}
+ mempool_free(sio, sio_pool);
+}
- ret = 0;
- bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
- if (bio == NULL) {
- unlock_page(page);
- ret = -ENOMEM;
- goto out;
+static void swap_readpage_fs(struct page *page,
+ struct swap_iocb **plug)
+{
+ struct swap_info_struct *sis = page_swap_info(page);
+ struct swap_iocb *sio = NULL;
+ loff_t pos = page_file_offset(page);
+
+ if (plug)
+ sio = *plug;
+ if (sio) {
+ if (sio->iocb.ki_filp != sis->swap_file ||
+ sio->iocb.ki_pos + sio->len != pos) {
+ swap_read_unplug(sio);
+ sio = NULL;
+ }
+ }
+ if (!sio) {
+ sio = mempool_alloc(sio_pool, GFP_KERNEL);
+ init_sync_kiocb(&sio->iocb, sis->swap_file);
+ sio->iocb.ki_pos = pos;
+ sio->iocb.ki_complete = sio_read_complete;
+ sio->pages = 0;
+ sio->len = 0;
}
- disk = bio->bi_disk;
+ bvec_set_page(&sio->bvec[sio->pages], page, thp_size(page), 0);
+ sio->len += thp_size(page);
+ sio->pages += 1;
+ if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
+ swap_read_unplug(sio);
+ sio = NULL;
+ }
+ if (plug)
+ *plug = sio;
+}
+
+static void swap_readpage_bdev_sync(struct page *page,
+ struct swap_info_struct *sis)
+{
+ struct bio_vec bv;
+ struct bio bio;
+
+ bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ);
+ bio.bi_iter.bi_sector = swap_page_sector(page);
+ __bio_add_page(&bio, page, thp_size(page), 0);
/*
* Keep this task valid during swap readpage because the oom killer may
* attempt to access it in the page fault retry time check.
*/
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
- if (synchronous) {
- bio->bi_opf |= REQ_HIPRI;
- get_task_struct(current);
- bio->bi_private = current;
- }
+ get_task_struct(current);
count_vm_event(PSWPIN);
- bio_get(bio);
- qc = submit_bio(bio);
- while (synchronous) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (!READ_ONCE(bio->bi_private))
- break;
-
- if (!blk_poll(disk->queue, qc, true))
- blk_io_schedule();
- }
- __set_current_state(TASK_RUNNING);
- bio_put(bio);
+ submit_bio_wait(&bio);
+ __end_swap_bio_read(&bio);
+ put_task_struct(current);
+}
-out:
- psi_memstall_leave(&pflags);
- return ret;
+static void swap_readpage_bdev_async(struct page *page,
+ struct swap_info_struct *sis)
+{
+ struct bio *bio;
+
+ bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL);
+ bio->bi_iter.bi_sector = swap_page_sector(page);
+ bio->bi_end_io = end_swap_bio_read;
+ __bio_add_page(bio, page, thp_size(page), 0);
+ count_vm_event(PSWPIN);
+ submit_bio(bio);
}
-int swap_set_page_dirty(struct page *page)
+void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug)
{
struct swap_info_struct *sis = page_swap_info(page);
+ bool workingset = PageWorkingset(page);
+ unsigned long pflags;
+ bool in_thrashing;
+
+ VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageUptodate(page), page);
- if (data_race(sis->flags & SWP_FS_OPS)) {
- struct address_space *mapping = sis->swap_file->f_mapping;
+ /*
+ * Count submission time as memory stall and delay. When the device
+ * is congested, or the submitting cgroup IO-throttled, submission
+ * can be a significant part of overall IO time.
+ */
+ if (workingset) {
+ delayacct_thrashing_start(&in_thrashing);
+ psi_memstall_enter(&pflags);
+ }
+ delayacct_swapin_start();
- VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- return mapping->a_ops->set_page_dirty(page);
+ if (frontswap_load(page) == 0) {
+ SetPageUptodate(page);
+ unlock_page(page);
+ } else if (data_race(sis->flags & SWP_FS_OPS)) {
+ swap_readpage_fs(page, plug);
+ } else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) {
+ swap_readpage_bdev_sync(page, sis);
} else {
- return __set_page_dirty_no_writeback(page);
+ swap_readpage_bdev_async(page, sis);
}
+
+ if (workingset) {
+ delayacct_thrashing_end(&in_thrashing);
+ psi_memstall_leave(&pflags);
+ }
+ delayacct_swapin_end();
+}
+
+void __swap_read_unplug(struct swap_iocb *sio)
+{
+ struct iov_iter from;
+ struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
+ int ret;
+
+ iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len);
+ ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
+ if (ret != -EIOCBQUEUED)
+ sio_read_complete(&sio->iocb, ret);
}
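
The swap_iocb plug threaded through swap_readpage()/swap_writepage_fs() batches adjacent pages into a single swap_rw call. A hedged sketch of the read-side plugging pattern (the real users are the swap readahead paths; pages and nr are hypothetical locals, and swap_read_unplug() is the small wrapper around __swap_read_unplug() shown above):

    struct swap_iocb *splug = NULL;
    int i;

    for (i = 0; i < nr; i++)
        swap_readpage(pages[i], false, &splug);

    swap_read_unplug(splug);    /* submit whatever is still batched */
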
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index aa94afb63823..6599cc965e21 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -15,11 +15,142 @@
#define CREATE_TRACE_POINTS
#include <trace/events/page_isolation.h>
-static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
+/*
+ * This function checks whether the range [start_pfn, end_pfn) includes
+ * unmovable pages or not. The range must fall into a single pageblock and
+ * consequently belong to a single zone.
+ *
+ * A PageLRU check without isolation or the lru_lock could race, so a
+ * MIGRATE_MOVABLE block might include unmovable pages. Likewise, a
+ * __PageMovable check without lock_page may miss some movable non-LRU
+ * pages in a race. So this function cannot be expected to be exact.
+ *
+ * Returns a page without holding a reference. If the caller wants to
+ * dereference that page (e.g., dumping), it has to make sure that it
+ * cannot get removed (e.g., via memory unplug) concurrently.
+ *
+ */
+static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long end_pfn,
+ int migratetype, int flags)
+{
+ struct page *page = pfn_to_page(start_pfn);
+ struct zone *zone = page_zone(page);
+ unsigned long pfn;
+
+ VM_BUG_ON(pageblock_start_pfn(start_pfn) !=
+ pageblock_start_pfn(end_pfn - 1));
+
+ if (is_migrate_cma_page(page)) {
+ /*
+ * CMA allocations (alloc_contig_range) really need to isolate
+ * CMA pageblocks even when they are not movable in fact, so
+ * consider them movable here.
+ */
+ if (is_migrate_cma(migratetype))
+ return NULL;
+
+ return page;
+ }
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ page = pfn_to_page(pfn);
+
+ /*
+ * Both, bootmem allocations and memory holes are marked
+ * PG_reserved and are unmovable. We can even have unmovable
+ * allocations inside ZONE_MOVABLE, for example when
+ * specifying "movablecore".
+ */
+ if (PageReserved(page))
+ return page;
+
+ /*
+ * If the zone is movable and we have ruled out all reserved
+ * pages then it should be reasonably safe to assume the rest
+ * is movable.
+ */
+ if (zone_idx(zone) == ZONE_MOVABLE)
+ continue;
+
+ /*
+ * Hugepages are not in LRU lists, but they're movable.
+ * THPs are on the LRU, but need to be counted as #small pages.
+ * We need not scan over tail pages because we don't
+ * handle each tail page individually in migration.
+ */
+ if (PageHuge(page) || PageTransCompound(page)) {
+ struct page *head = compound_head(page);
+ unsigned int skip_pages;
+
+ if (PageHuge(page)) {
+ if (!hugepage_migration_supported(page_hstate(head)))
+ return page;
+ } else if (!PageLRU(head) && !__PageMovable(head)) {
+ return page;
+ }
+
+ skip_pages = compound_nr(head) - (page - head);
+ pfn += skip_pages - 1;
+ continue;
+ }
+
+ /*
+ * We can't use page_count without pinning the page
+ * because another CPU can free the compound page.
+ * This check already skips compound tails of THPs
+ * because their page->_refcount is zero at all times.
+ */
+ if (!page_ref_count(page)) {
+ if (PageBuddy(page))
+ pfn += (1 << buddy_order(page)) - 1;
+ continue;
+ }
+
+ /*
+ * A HWPoisoned page may not be in the buddy system, and its
+ * page_count() is not 0.
+ */
+ if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
+ continue;
+
+ /*
+ * We treat all PageOffline() pages as movable when offlining
+ * to give drivers a chance to decrement their reference count
+ * in MEM_GOING_OFFLINE in order to indicate that these pages
+ * can be offlined as there are no direct references anymore.
+ * For actually unmovable PageOffline() where the driver does
+ * not support this, we will fail later when trying to actually
+ * move these pages that still have a reference count > 0.
+ * (false negatives in this function only)
+ */
+ if ((flags & MEMORY_OFFLINE) && PageOffline(page))
+ continue;
+
+ if (__PageMovable(page) || PageLRU(page))
+ continue;
+
+ /*
+ * If there are RECLAIMABLE pages, we need to check
+ * them. But for now, memory offlining itself doesn't call
+ * shrink_node_slabs(), and this still needs to be fixed.
+ */
+ return page;
+ }
+ return NULL;
+}
+
+/*
+ * This function sets the pageblock's migratetype to MIGRATE_ISOLATE if no
+ * unmovable page is present in [start_pfn, end_pfn). The pageblock must
+ * intersect with [start_pfn, end_pfn).
+ */
+static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags,
+ unsigned long start_pfn, unsigned long end_pfn)
{
struct zone *zone = page_zone(page);
struct page *unmovable;
unsigned long flags;
+ unsigned long check_unmovable_start, check_unmovable_end;
spin_lock_irqsave(&zone->lock, flags);
@@ -36,8 +167,16 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
/*
* FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
* We just check MOVABLE pages.
+ *
+ * Pass the intersection of [start_pfn, end_pfn) and the page's pageblock
+ * to avoid redundant checks.
*/
- unmovable = has_unmovable_pages(zone, page, migratetype, isol_flags);
+ check_unmovable_start = max(page_to_pfn(page), start_pfn);
+ check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)),
+ end_pfn);
+
+ unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end,
+ migratetype, isol_flags);
if (!unmovable) {
unsigned long nr_pages;
int mt = get_pageblock_migratetype(page);
@@ -49,7 +188,6 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
__mod_zone_freepage_state(zone, -nr_pages, mt);
spin_unlock_irqrestore(&zone->lock, flags);
- drain_all_pages(zone);
return 0;
}
@@ -65,13 +203,12 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
return -EBUSY;
}
-static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
+static void unset_migratetype_isolate(struct page *page, int migratetype)
{
struct zone *zone;
unsigned long flags, nr_pages;
bool isolated_page = false;
unsigned int order;
- unsigned long pfn, buddy_pfn;
struct page *buddy;
zone = page_zone(page);
@@ -88,16 +225,18 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
* these pages to be merged.
*/
if (PageBuddy(page)) {
- order = page_order(page);
- if (order >= pageblock_order) {
- pfn = page_to_pfn(page);
- buddy_pfn = __find_buddy_pfn(pfn, order);
- buddy = page + (buddy_pfn - pfn);
-
- if (pfn_valid_within(buddy_pfn) &&
- !is_migrate_isolate_page(buddy)) {
- __isolate_free_page(page, order);
- isolated_page = true;
+ order = buddy_order(page);
+ if (order >= pageblock_order && order < MAX_ORDER) {
+ buddy = find_buddy_page_pfn(page, page_to_pfn(page),
+ order, NULL);
+ if (buddy && !is_migrate_isolate_page(buddy)) {
+ isolated_page = !!__isolate_free_page(page, order);
+ /*
+ * Isolating a free page in an isolated pageblock
+ * is expected to always work as watermarks don't
+ * apply here.
+ */
+ VM_WARN_ON(!isolated_page);
}
}
}
@@ -106,6 +245,11 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
* If we isolate freepage with more than pageblock_order, there
* should be no freepage in the range, so we could avoid costly
* pageblock scanning for freepage moving.
+ *
+ * We didn't actually touch any of the isolated pages, so place them
+ * to the tail of the freelist. This is an optimization for memory
+ * onlining - just onlined memory won't immediately be considered for
+ * allocation.
*/
if (!isolated_page) {
nr_pages = move_freepages_block(zone, page, migratetype, NULL);
@@ -136,11 +280,210 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
}
/**
- * start_isolate_page_range() - make page-allocation-type of range of pages to
- * be MIGRATE_ISOLATE.
- * @start_pfn: The lower PFN of the range to be isolated.
- * @end_pfn: The upper PFN of the range to be isolated.
- * start_pfn/end_pfn must be aligned to pageblock_order.
+ * isolate_single_pageblock() -- tries to isolate a pageblock that might be
+ * within a free or in-use page.
+ * @boundary_pfn: pageblock-aligned pfn that a page might cross
+ * @flags: isolation flags
+ * @gfp_flags: GFP flags used for migrating pages
+ * @isolate_before: isolate the pageblock before the boundary_pfn
+ * @skip_isolation: the flag to skip the pageblock isolation in second
+ * isolate_single_pageblock()
+ * @migratetype: migrate type to set in error recovery.
+ *
+ * Free and in-use pages can be as big as MAX_ORDER and contain more than one
+ * pageblock. When not all pageblocks within a page are isolated at the same
+ * time, free page accounting can go wrong. For example, in the case of
+ * MAX_ORDER = pageblock_order + 1, a MAX_ORDER page has two pageblocks.
+ * [ MAX_ORDER ]
+ * [ pageblock0 | pageblock1 ]
+ * When either pageblock is isolated: if it is a free page, the page is not
+ * split into separate migratetype lists as it should be; if it is an in-use
+ * page that is freed later, __free_one_page() does not split the free page
+ * either. The function handles this by splitting the free page, or by
+ * migrating the in-use page and then splitting the resulting free page.
+ */
+static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
+ gfp_t gfp_flags, bool isolate_before, bool skip_isolation,
+ int migratetype)
+{
+ unsigned long start_pfn;
+ unsigned long isolate_pageblock;
+ unsigned long pfn;
+ struct zone *zone;
+ int ret;
+
+ VM_BUG_ON(!pageblock_aligned(boundary_pfn));
+
+ if (isolate_before)
+ isolate_pageblock = boundary_pfn - pageblock_nr_pages;
+ else
+ isolate_pageblock = boundary_pfn;
+
+ /*
+ * Scan from the beginning of the MAX_ORDER_NR_PAGES-aligned range to
+ * avoid isolating only a subset of the pageblocks of a larger-than-
+ * pageblock free or in-use page. Also make sure all to-be-isolated
+ * pageblocks are within the same zone.
+ */
+ zone = page_zone(pfn_to_page(isolate_pageblock));
+ start_pfn = max(ALIGN_DOWN(isolate_pageblock, MAX_ORDER_NR_PAGES),
+ zone->zone_start_pfn);
+
+ if (skip_isolation) {
+ int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
+
+ VM_BUG_ON(!is_migrate_isolate(mt));
+ } else {
+ ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype,
+ flags, isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
+
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Bail out early when the to-be-isolated pageblock does not form
+ * a free or in-use page across boundary_pfn:
+ *
+ * 1. isolate before boundary_pfn: the page after is not online
+ * 2. isolate after boundary_pfn: the page before is not online
+ *
+ * This also ensures correctness. Without it, when isolating after
+ * boundary_pfn while [start_pfn, boundary_pfn) is not online,
+ * __first_valid_page() would unexpectedly return NULL in the for
+ * loop below.
+ */
+ if (isolate_before) {
+ if (!pfn_to_online_page(boundary_pfn))
+ return 0;
+ } else {
+ if (!pfn_to_online_page(boundary_pfn - 1))
+ return 0;
+ }
+
+ for (pfn = start_pfn; pfn < boundary_pfn;) {
+ struct page *page = __first_valid_page(pfn, boundary_pfn - pfn);
+
+ VM_BUG_ON(!page);
+ pfn = page_to_pfn(page);
+ /*
+ * start_pfn is MAX_ORDER_NR_PAGES aligned; if there are any
+ * free pages in [start_pfn, boundary_pfn), their head pages
+ * will always be in the range.
+ */
+ if (PageBuddy(page)) {
+ int order = buddy_order(page);
+
+ if (pfn + (1UL << order) > boundary_pfn) {
+ /* free page changed before split, check it again */
+ if (split_free_page(page, order, boundary_pfn - pfn))
+ continue;
+ }
+
+ pfn += 1UL << order;
+ continue;
+ }
+ /*
+ * migrate compound pages then let the free page handling code
+ * above do the rest. If migration is not possible, just fail.
+ */
+ if (PageCompound(page)) {
+ struct page *head = compound_head(page);
+ unsigned long head_pfn = page_to_pfn(head);
+ unsigned long nr_pages = compound_nr(head);
+
+ if (head_pfn + nr_pages <= boundary_pfn) {
+ pfn = head_pfn + nr_pages;
+ continue;
+ }
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+ /*
+ * hugetlb, lru compound (THP), and movable compound pages
+ * can be migrated. Otherwise, fail the isolation.
+ */
+ if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) {
+ int order;
+ unsigned long outer_pfn;
+ int page_mt = get_pageblock_migratetype(page);
+ bool isolate_page = !is_migrate_isolate_page(page);
+ struct compact_control cc = {
+ .nr_migratepages = 0,
+ .order = -1,
+ .zone = page_zone(pfn_to_page(head_pfn)),
+ .mode = MIGRATE_SYNC,
+ .ignore_skip_hint = true,
+ .no_set_skip_hint = true,
+ .gfp_mask = gfp_flags,
+ .alloc_contig = true,
+ };
+ INIT_LIST_HEAD(&cc.migratepages);
+
+ /*
+ * XXX: mark the page as MIGRATE_ISOLATE so that
+ * no one else can grab the freed page after migration.
+ * Ideally, the page should be freed as two separate
+ * pages to be added into separate migratetype free
+ * lists.
+ */
+ if (isolate_page) {
+ ret = set_migratetype_isolate(page, page_mt,
+ flags, head_pfn, head_pfn + nr_pages);
+ if (ret)
+ goto failed;
+ }
+
+ ret = __alloc_contig_migrate_range(&cc, head_pfn,
+ head_pfn + nr_pages);
+
+ /*
+ * restore the page's migratetype so that it can
+ * be split into separate migratetype free lists
+ * later.
+ */
+ if (isolate_page)
+ unset_migratetype_isolate(page, page_mt);
+
+ if (ret)
+ goto failed;
+ /*
+ * reset pfn to the head of the free page, so
+ * that the free page handling code above can split
+ * the free page to the right migratetype list.
+ *
+ * head_pfn is not used here as a hugetlb page order
+ * can be bigger than MAX_ORDER, but after it is
+ * freed, the free page order is not. Use pfn within
+ * the range to find the head of the free page.
+ */
+ order = 0;
+ outer_pfn = pfn;
+ while (!PageBuddy(pfn_to_page(outer_pfn))) {
+ /* stop if we cannot find the free page */
+ if (++order > MAX_ORDER)
+ goto failed;
+ outer_pfn &= ~0UL << order;
+ }
+ pfn = outer_pfn;
+ continue;
+ } else
+#endif
+ goto failed;
+ }
+
+ pfn++;
+ }
+ return 0;
+failed:
+ /* restore the original migratetype */
+ if (!skip_isolation)
+ unset_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype);
+ return -EBUSY;
+}
+
+/**
+ * start_isolate_page_range() - mark page range MIGRATE_ISOLATE
+ * @start_pfn: The first PFN of the range to be isolated.
+ * @end_pfn: The first PFN *after* the range to be isolated.
* @migratetype: Migrate type to set in error recovery.
* @flags: The following flags are allowed (they can be combined in
* a bit mask)
@@ -149,6 +492,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* and PageOffline() pages.
* REPORT_FAILURE - report details about the failure to
* isolate the range
+ * @gfp_flags: GFP flags used for migrating pages that sit across the
+ * range boundaries.
*
* Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
* the range will never be allocated. Any free pages and pages freed in the
@@ -157,6 +502,10 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* pages in the range finally, the caller has to free all pages in the range.
* test_pages_isolated() can be used to test it.
*
+ * The function first tries to isolate the pageblocks at the beginning and end
+ * of the range, since there might be pages across the range boundaries.
+ * Afterwards, it isolates the rest of the range.
+ *
* There is no high level synchronization mechanism that prevents two threads
* from trying to isolate overlapping ranges. If this happens, one thread
* will notice pageblocks in the overlapping range already set to isolate.
@@ -167,66 +516,79 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
*
* Please note that there is no strong synchronization with the page allocator
* either. Pages might be freed while their page blocks are marked ISOLATED.
- * In some cases pages might still end up on pcp lists and that would allow
+ * A call to drain_all_pages() after isolation can flush most of them. However
+ * in some cases pages might still end up on pcp lists and that would allow
* for their allocation even when they are in fact isolated already. Depending
- * on how strong of a guarantee the caller needs drain_all_pages might be needed
- * (e.g. __offline_pages will need to call it after check for isolated range for
- * a next retry).
+ * on how strong of a guarantee the caller needs, zone_pcp_disable/enable()
+ * might be used to flush and disable pcplist before isolation and enable after
+ * unisolation.
*
- * Return: the number of isolated pageblocks on success and -EBUSY if any part
- * of range cannot be isolated.
+ * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
*/
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
- unsigned migratetype, int flags)
+ int migratetype, int flags, gfp_t gfp_flags)
{
unsigned long pfn;
- unsigned long undo_pfn;
struct page *page;
- int nr_isolate_pageblock = 0;
+ /* isolation is done at page block granularity */
+ unsigned long isolate_start = pageblock_start_pfn(start_pfn);
+ unsigned long isolate_end = pageblock_align(end_pfn);
+ int ret;
+ bool skip_isolation = false;
- BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
- BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
+ /* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
+ ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false,
+ skip_isolation, migratetype);
+ if (ret)
+ return ret;
- for (pfn = start_pfn;
- pfn < end_pfn;
+ if (isolate_start == isolate_end - pageblock_nr_pages)
+ skip_isolation = true;
+
+ /* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
+ ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true,
+ skip_isolation, migratetype);
+ if (ret) {
+ unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
+ return ret;
+ }
+
+ /* skip isolated pageblocks at the beginning and end */
+ for (pfn = isolate_start + pageblock_nr_pages;
+ pfn < isolate_end - pageblock_nr_pages;
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
- if (page) {
- if (set_migratetype_isolate(page, migratetype, flags)) {
- undo_pfn = pfn;
- goto undo;
- }
- nr_isolate_pageblock++;
+ if (page && set_migratetype_isolate(page, migratetype, flags,
+ start_pfn, end_pfn)) {
+ undo_isolate_page_range(isolate_start, pfn, migratetype);
+ unset_migratetype_isolate(
+ pfn_to_page(isolate_end - pageblock_nr_pages),
+ migratetype);
+ return -EBUSY;
}
}
- return nr_isolate_pageblock;
-undo:
- for (pfn = start_pfn;
- pfn < undo_pfn;
- pfn += pageblock_nr_pages) {
- struct page *page = pfn_to_online_page(pfn);
- if (!page)
- continue;
- unset_migratetype_isolate(page, migratetype);
- }
-
- return -EBUSY;
+ return 0;
}
-/*
- * Make isolated pages available again.
+/**
+ * undo_isolate_page_range - undo effects of start_isolate_page_range()
+ * @start_pfn: The first PFN of the isolated range
+ * @end_pfn: The first PFN *after* the isolated range
+ * @migratetype: New migrate type to set on the range
+ *
+ * This finds every MIGRATE_ISOLATE page block in the given range
+ * and switches it to @migratetype.
*/
void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
- unsigned migratetype)
+ int migratetype)
{
unsigned long pfn;
struct page *page;
+ unsigned long isolate_start = pageblock_start_pfn(start_pfn);
+ unsigned long isolate_end = pageblock_align(end_pfn);
- BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
- BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
-
- for (pfn = start_pfn;
- pfn < end_pfn;
+ for (pfn = isolate_start;
+ pfn < isolate_end;
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
if (!page || !is_migrate_isolate_page(page))
@@ -248,10 +610,6 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
struct page *page;
while (pfn < end_pfn) {
- if (!pfn_valid_within(pfn)) {
- pfn++;
- continue;
- }
page = pfn_to_page(pfn);
if (PageBuddy(page))
/*
@@ -259,7 +617,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
* the correct MIGRATE_ISOLATE freelist. There is no
* simple way to verify that as VM_BUG_ON(), though.
*/
- pfn += 1 << page_order(page);
+ pfn += 1 << buddy_order(page);
else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
/* A HWPoisoned page cannot be also PageBuddy */
pfn++;
@@ -278,13 +636,28 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
return pfn;
}
-/* Caller should ensure that requested range is in a single zone */
+/**
+ * test_pages_isolated - check if pageblocks in range are isolated
+ * @start_pfn: The first PFN of the isolated range
+ * @end_pfn: The first PFN *after* the isolated range
+ * @isol_flags: Testing mode flags
+ *
+ * This tests whether all pages in the specified range are free.
+ *
+ * If %MEMORY_OFFLINE is specified in @isol_flags, it will consider
+ * poisoned and offlined pages free as well.
+ *
+ * Caller must ensure the requested range doesn't span zones.
+ *
+ * Return: 0 if all pages in the range are free, -EBUSY if one or more pages are in use.
+ */
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
int isol_flags)
{
unsigned long pfn, flags;
struct page *page;
struct zone *zone;
+ int ret;
/*
* Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages
@@ -297,15 +670,21 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
break;
}
page = __first_valid_page(start_pfn, end_pfn - start_pfn);
- if ((pfn < end_pfn) || !page)
- return -EBUSY;
+ if ((pfn < end_pfn) || !page) {
+ ret = -EBUSY;
+ goto out;
+ }
+
/* Check all pages are free or marked as ISOLATED */
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, isol_flags);
spin_unlock_irqrestore(&zone->lock, flags);
+ ret = pfn < end_pfn ? -EBUSY : 0;
+
+out:
trace_test_pages_isolated(start_pfn, end_pfn, pfn);
- return pfn < end_pfn ? -EBUSY : 0;
+ return ret;
}
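Taken together, the reworked page isolation API is consumed as an isolate / check / undo sequence, with start_isolate_page_range() now handling pageblock-unaligned range ends itself. A condensed, alloc_contig_range()-style sketch of a caller (migration of in-use pages is elided; the range and migratetype are illustrative):

	static int claim_range(unsigned long start_pfn, unsigned long end_pfn)
	{
		int ret;

		/* Mark every pageblock intersecting [start_pfn, end_pfn) isolated. */
		ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
					       0, GFP_KERNEL);
		if (ret)
			return ret;

		/* ... migrate any still in-use pages out of the range here ... */

		/* Everything left must be free or isolated before we claim it. */
		ret = test_pages_isolated(start_pfn, end_pfn, 0);

		/* Drop the isolation again, whether or not the check succeeded. */
		undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
		return ret;
	}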
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 360461509423..c93baef0148f 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -10,6 +10,8 @@
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>
+#include <linux/memcontrol.h>
+#include <linux/sched/clock.h>
#include "internal.h"
@@ -25,9 +27,14 @@ struct page_owner {
gfp_t gfp_mask;
depot_stack_handle_t handle;
depot_stack_handle_t free_handle;
+ u64 ts_nsec;
+ u64 free_ts_nsec;
+ char comm[TASK_COMM_LEN];
+ pid_t pid;
+ pid_t tgid;
};
-static bool page_owner_enabled = false;
+static bool page_owner_enabled __initdata;
DEFINE_STATIC_KEY_FALSE(page_owner_inited);
static depot_stack_handle_t dummy_handle;
@@ -38,17 +45,16 @@ static void init_early_allocated_pages(void);
static int __init early_page_owner_param(char *buf)
{
- if (!buf)
- return -EINVAL;
+ int ret = kstrtobool(buf, &page_owner_enabled);
- if (strcmp(buf, "on") == 0)
- page_owner_enabled = true;
+ if (page_owner_enabled)
+ stack_depot_request_early_init();
- return 0;
+ return ret;
}
early_param("page_owner", early_page_owner_param);
-static bool need_page_owner(void)
+static __init bool need_page_owner(void)
{
return page_owner_enabled;
}
@@ -77,7 +83,7 @@ static noinline void register_early_stack(void)
early_handle = create_dummy_stack();
}
-static void init_page_owner(void)
+static __init void init_page_owner(void)
{
if (!page_owner_enabled)
return;
@@ -93,6 +99,7 @@ struct page_ext_operations page_owner_ops = {
.size = sizeof(struct page_owner),
.need = need_page_owner,
.init = init_page_owner,
+ .need_shared_flags = true,
};
static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
@@ -100,71 +107,63 @@ static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
return (void *)page_ext + page_owner_ops.offset;
}
-static inline bool check_recursive_alloc(unsigned long *entries,
- unsigned int nr_entries,
- unsigned long ip)
-{
- unsigned int i;
-
- for (i = 0; i < nr_entries; i++) {
- if (entries[i] == ip)
- return true;
- }
- return false;
-}
-
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
unsigned long entries[PAGE_OWNER_STACK_DEPTH];
depot_stack_handle_t handle;
unsigned int nr_entries;
- nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
-
/*
- * We need to check recursion here because our request to
- * stackdepot could trigger memory allocation to save new
- * entry. New memory allocation would reach here and call
- * stack_depot_save_entries() again if we don't catch it. There is
- * still not enough memory in stackdepot so it would try to
- * allocate memory again and loop forever.
+ * Avoid recursion.
+ *
+ * Sometimes page metadata allocation tracking requires more
+ * memory to be allocated:
+ * - when a new stack trace is saved to the stack depot
+ * - when the backtrace itself is calculated (ia64)
*/
- if (check_recursive_alloc(entries, nr_entries, _RET_IP_))
+ if (current->in_page_owner)
return dummy_handle;
+ current->in_page_owner = 1;
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
handle = stack_depot_save(entries, nr_entries, flags);
if (!handle)
handle = failure_handle;
+ current->in_page_owner = 0;
return handle;
}
-void __reset_page_owner(struct page *page, unsigned int order)
+void __reset_page_owner(struct page *page, unsigned short order)
{
int i;
struct page_ext *page_ext;
- depot_stack_handle_t handle = 0;
+ depot_stack_handle_t handle;
struct page_owner *page_owner;
+ u64 free_ts_nsec = local_clock();
- handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
-
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
if (unlikely(!page_ext))
return;
+
+ handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
for (i = 0; i < (1 << order); i++) {
__clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
page_owner = get_page_owner(page_ext);
page_owner->free_handle = handle;
+ page_owner->free_ts_nsec = free_ts_nsec;
page_ext = page_ext_next(page_ext);
}
+ page_ext_put(page_ext);
}
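The recurring change in this file is the switch from lookup_page_ext() to a page_ext_get()/page_ext_put() pair, which pins the page_ext (under the RCU read lock) for as long as the owner data is touched; every early exit therefore needs a matching put. The pattern, reduced to a sketch around a hypothetical helper:

	static void touch_page_owner_data(struct page *page)
	{
		struct page_ext *page_ext = page_ext_get(page);

		if (unlikely(!page_ext))
			return;			/* no page_ext for this page */

		/* ... read or update the per-page owner record here ... */

		page_ext_put(page_ext);		/* pairs with page_ext_get() */
	}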
-static inline void __set_page_owner_handle(struct page *page,
- struct page_ext *page_ext, depot_stack_handle_t handle,
- unsigned int order, gfp_t gfp_mask)
+static inline void __set_page_owner_handle(struct page_ext *page_ext,
+ depot_stack_handle_t handle,
+ unsigned short order, gfp_t gfp_mask)
{
struct page_owner *page_owner;
int i;
+ u64 ts_nsec = local_clock();
for (i = 0; i < (1 << order); i++) {
page_owner = get_page_owner(page_ext);
@@ -172,6 +171,11 @@ static inline void __set_page_owner_handle(struct page *page,
page_owner->order = order;
page_owner->gfp_mask = gfp_mask;
page_owner->last_migrate_reason = -1;
+ page_owner->pid = current->pid;
+ page_owner->tgid = current->tgid;
+ page_owner->ts_nsec = ts_nsec;
+ strscpy(page_owner->comm, current->comm,
+ sizeof(page_owner->comm));
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
__set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
@@ -179,22 +183,24 @@ static inline void __set_page_owner_handle(struct page *page,
}
}
-noinline void __set_page_owner(struct page *page, unsigned int order,
+noinline void __set_page_owner(struct page *page, unsigned short order,
gfp_t gfp_mask)
{
- struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext;
depot_stack_handle_t handle;
+ handle = save_stack(gfp_mask);
+
+ page_ext = page_ext_get(page);
if (unlikely(!page_ext))
return;
-
- handle = save_stack(gfp_mask);
- __set_page_owner_handle(page, page_ext, handle, order, gfp_mask);
+ __set_page_owner_handle(page_ext, handle, order, gfp_mask);
+ page_ext_put(page_ext);
}
void __set_page_owner_migrate_reason(struct page *page, int reason)
{
- struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext = page_ext_get(page);
struct page_owner *page_owner;
if (unlikely(!page_ext))
@@ -202,33 +208,42 @@ void __set_page_owner_migrate_reason(struct page *page, int reason)
page_owner = get_page_owner(page_ext);
page_owner->last_migrate_reason = reason;
+ page_ext_put(page_ext);
}
-void __split_page_owner(struct page *page, unsigned int order)
+void __split_page_owner(struct page *page, unsigned int nr)
{
int i;
- struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext = page_ext_get(page);
struct page_owner *page_owner;
if (unlikely(!page_ext))
return;
- for (i = 0; i < (1 << order); i++) {
+ for (i = 0; i < nr; i++) {
page_owner = get_page_owner(page_ext);
page_owner->order = 0;
page_ext = page_ext_next(page_ext);
}
+ page_ext_put(page_ext);
}
-void __copy_page_owner(struct page *oldpage, struct page *newpage)
+void __folio_copy_owner(struct folio *newfolio, struct folio *old)
{
- struct page_ext *old_ext = lookup_page_ext(oldpage);
- struct page_ext *new_ext = lookup_page_ext(newpage);
+ struct page_ext *old_ext;
+ struct page_ext *new_ext;
struct page_owner *old_page_owner, *new_page_owner;
- if (unlikely(!old_ext || !new_ext))
+ old_ext = page_ext_get(&old->page);
+ if (unlikely(!old_ext))
return;
+ new_ext = page_ext_get(&newfolio->page);
+ if (unlikely(!new_ext)) {
+ page_ext_put(old_ext);
+ return;
+ }
+
old_page_owner = get_page_owner(old_ext);
new_page_owner = get_page_owner(new_ext);
new_page_owner->order = old_page_owner->order;
@@ -236,18 +251,25 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
new_page_owner->last_migrate_reason =
old_page_owner->last_migrate_reason;
new_page_owner->handle = old_page_owner->handle;
+ new_page_owner->pid = old_page_owner->pid;
+ new_page_owner->tgid = old_page_owner->tgid;
+ new_page_owner->ts_nsec = old_page_owner->ts_nsec;
+ new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
+ strcpy(new_page_owner->comm, old_page_owner->comm);
/*
- * We don't clear the bit on the oldpage as it's going to be freed
+ * We don't clear the bit on the old folio as it's going to be freed
* after migration. Until then, the info can be useful in case of
- * a bug, and the overal stats will be off a bit only temporarily.
+ * a bug, and the overall stats will be off a bit only temporarily.
* Also, migrate_misplaced_transhuge_page() can still fail the
- * migration and then we want the oldpage to retain the info. But
+ * migration and then we want the old folio to retain the info. But
* in that case we also don't need to explicitly clear the info from
* the new page, which will be freed.
*/
__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
__set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags);
+ page_ext_put(new_ext);
+ page_ext_put(old_ext);
}
void pagetypeinfo_showmixedcount_print(struct seq_file *m,
@@ -256,8 +278,8 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
struct page *page;
struct page_ext *page_ext;
struct page_owner *page_owner;
- unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
- unsigned long end_pfn = pfn + zone->spanned_pages;
+ unsigned long pfn, block_end_pfn;
+ unsigned long end_pfn = zone_end_pfn(zone);
unsigned long count[MIGRATE_TYPES] = { 0, };
int pageblock_mt, page_mt;
int i;
@@ -277,15 +299,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
continue;
}
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = pageblock_end_pfn(pfn);
block_end_pfn = min(block_end_pfn, end_pfn);
pageblock_mt = get_pageblock_migratetype(page);
for (; pfn < block_end_pfn; pfn++) {
- if (!pfn_valid_within(pfn))
- continue;
-
/* The pageblock is online, no need to recheck. */
page = pfn_to_page(pfn);
@@ -295,8 +314,8 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
if (PageBuddy(page)) {
unsigned long freepage_order;
- freepage_order = page_order_unsafe(page);
- if (freepage_order < MAX_ORDER)
+ freepage_order = buddy_order_unsafe(page);
+ if (freepage_order <= MAX_ORDER)
pfn += (1UL << freepage_order) - 1;
continue;
}
@@ -304,12 +323,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
if (PageReserved(page))
continue;
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
if (unlikely(!page_ext))
continue;
if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
- continue;
+ goto ext_put_continue;
page_owner = get_page_owner(page_ext);
page_mt = gfp_migratetype(page_owner->gfp_mask);
@@ -320,9 +339,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
count[pageblock_mt]++;
pfn = block_end_pfn;
+ page_ext_put(page_ext);
break;
}
pfn += (1UL << page_owner->order) - 1;
+ext_put_continue:
+ page_ext_put(page_ext);
}
}
@@ -333,14 +355,51 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
seq_putc(m, '\n');
}
+/*
+ * Look up the memcg information and print it out.
+ */
+static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret,
+ struct page *page)
+{
+#ifdef CONFIG_MEMCG
+ unsigned long memcg_data;
+ struct mem_cgroup *memcg;
+ bool online;
+ char name[80];
+
+ rcu_read_lock();
+ memcg_data = READ_ONCE(page->memcg_data);
+ if (!memcg_data)
+ goto out_unlock;
+
+ if (memcg_data & MEMCG_DATA_OBJCGS)
+ ret += scnprintf(kbuf + ret, count - ret,
+ "Slab cache page\n");
+
+ memcg = page_memcg_check(page);
+ if (!memcg)
+ goto out_unlock;
+
+ online = (memcg->css.flags & CSS_ONLINE);
+ cgroup_name(memcg->css.cgroup, name, sizeof(name));
+ ret += scnprintf(kbuf + ret, count - ret,
+ "Charged %sto %smemcg %s\n",
+ PageMemcgKmem(page) ? "(via objcg) " : "",
+ online ? "" : "offline ",
+ name);
+out_unlock:
+ rcu_read_unlock();
+#endif /* CONFIG_MEMCG */
+
+ return ret;
+}
+
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
struct page *page, struct page_owner *page_owner,
depot_stack_handle_t handle)
{
int ret, pageblock_mt, page_mt;
- unsigned long *entries;
- unsigned int nr_entries;
char *kbuf;
count = min_t(size_t, count, PAGE_SIZE);
@@ -348,41 +407,36 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
if (!kbuf)
return -ENOMEM;
- ret = snprintf(kbuf, count,
- "Page allocated via order %u, mask %#x(%pGg)\n",
+ ret = scnprintf(kbuf, count,
+ "Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns, free_ts %llu ns\n",
page_owner->order, page_owner->gfp_mask,
- &page_owner->gfp_mask);
-
- if (ret >= count)
- goto err;
+ &page_owner->gfp_mask, page_owner->pid,
+ page_owner->tgid, page_owner->comm,
+ page_owner->ts_nsec, page_owner->free_ts_nsec);
/* Print information relevant to grouping pages by mobility */
pageblock_mt = get_pageblock_migratetype(page);
page_mt = gfp_migratetype(page_owner->gfp_mask);
- ret += snprintf(kbuf + ret, count - ret,
- "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
+ ret += scnprintf(kbuf + ret, count - ret,
+ "PFN 0x%lx type %s Block %lu type %s Flags %pGp\n",
pfn,
migratetype_names[page_mt],
pfn >> pageblock_order,
migratetype_names[pageblock_mt],
- page->flags, &page->flags);
+ &page->flags);
- if (ret >= count)
- goto err;
-
- nr_entries = stack_depot_fetch(handle, &entries);
- ret += stack_trace_snprint(kbuf + ret, count - ret, entries, nr_entries, 0);
+ ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
if (ret >= count)
goto err;
if (page_owner->last_migrate_reason != -1) {
- ret += snprintf(kbuf + ret, count - ret,
+ ret += scnprintf(kbuf + ret, count - ret,
"Page has been migrated, last migrate reason: %s\n",
migrate_reason_names[page_owner->last_migrate_reason]);
- if (ret >= count)
- goto err;
}
+ ret = print_page_owner_memcg(kbuf, count, ret, page);
+
ret += snprintf(kbuf + ret, count - ret, "\n");
if (ret >= count)
goto err;
@@ -398,13 +452,11 @@ err:
return -ENOMEM;
}
-void __dump_page_owner(struct page *page)
+void __dump_page_owner(const struct page *page)
{
- struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext = page_ext_get((void *)page);
struct page_owner *page_owner;
depot_stack_handle_t handle;
- unsigned long *entries;
- unsigned int nr_entries;
gfp_t gfp_mask;
int mt;
@@ -419,6 +471,7 @@ void __dump_page_owner(struct page *page)
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
pr_alert("page_owner info is not present (never set?)\n");
+ page_ext_put(page_ext);
return;
}
@@ -427,29 +480,29 @@ void __dump_page_owner(struct page *page)
else
pr_alert("page_owner tracks the page as freed\n");
- pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
- page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
+ pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu, free_ts %llu\n",
+ page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask,
+ page_owner->pid, page_owner->tgid, page_owner->comm,
+ page_owner->ts_nsec, page_owner->free_ts_nsec);
handle = READ_ONCE(page_owner->handle);
- if (!handle) {
+ if (!handle)
pr_alert("page_owner allocation stack trace missing\n");
- } else {
- nr_entries = stack_depot_fetch(handle, &entries);
- stack_trace_print(entries, nr_entries, 0);
- }
+ else
+ stack_depot_print(handle);
handle = READ_ONCE(page_owner->free_handle);
if (!handle) {
pr_alert("page_owner free stack trace missing\n");
} else {
- nr_entries = stack_depot_fetch(handle, &entries);
pr_alert("page last free stack trace:\n");
- stack_trace_print(entries, nr_entries, 0);
+ stack_depot_print(handle);
}
if (page_owner->last_migrate_reason != -1)
pr_alert("page has been migrated, last migrate reason: %s\n",
migrate_reason_names[page_owner->last_migrate_reason]);
+ page_ext_put(page_ext);
}
static ssize_t
@@ -465,17 +518,25 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
return -EINVAL;
page = NULL;
- pfn = min_low_pfn + *ppos;
-
+ if (*ppos == 0)
+ pfn = min_low_pfn;
+ else
+ pfn = *ppos;
/* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
pfn++;
- drain_all_pages(NULL);
-
/* Find an allocated page */
for (; pfn < max_pfn; pfn++) {
/*
+ * A temporary copy of page_owner is required here so that we
+ * don't risk context switches (from copy_to_user() or GFP_KERNEL
+ * allocations) while still holding the RCU lock taken by
+ * page_ext_get().
+ */
+ struct page_owner page_owner_tmp;
+
+ /*
* If the new page is in a new MAX_ORDER_NR_PAGES area,
* validate the area as existing, skip it if not
*/
@@ -484,20 +545,16 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
continue;
}
- /* Check for holes within a MAX_ORDER area */
- if (!pfn_valid_within(pfn))
- continue;
-
page = pfn_to_page(pfn);
if (PageBuddy(page)) {
- unsigned long freepage_order = page_order_unsafe(page);
+ unsigned long freepage_order = buddy_order_unsafe(page);
- if (freepage_order < MAX_ORDER)
+ if (freepage_order <= MAX_ORDER)
pfn += (1UL << freepage_order) - 1;
continue;
}
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
if (unlikely(!page_ext))
continue;
@@ -506,14 +563,14 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
* because we don't hold the zone lock.
*/
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
- continue;
+ goto ext_put_continue;
/*
* Although we do have the info about past allocation of free
* pages, it's not relevant for current memory usage.
*/
if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
- continue;
+ goto ext_put_continue;
page_owner = get_page_owner(page_ext);
@@ -522,7 +579,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
* would inflate the stats.
*/
if (!IS_ALIGNED(pfn, 1 << page_owner->order))
- continue;
+ goto ext_put_continue;
/*
* Access to page_ext->handle isn't synchronous so we should
@@ -530,18 +587,37 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
*/
handle = READ_ONCE(page_owner->handle);
if (!handle)
- continue;
+ goto ext_put_continue;
/* Record the next PFN to read in the file offset */
- *ppos = (pfn - min_low_pfn) + 1;
+ *ppos = pfn + 1;
+ page_owner_tmp = *page_owner;
+ page_ext_put(page_ext);
return print_page_owner(buf, count, pfn, page,
- page_owner, handle);
+ &page_owner_tmp, handle);
+ext_put_continue:
+ page_ext_put(page_ext);
}
return 0;
}
+static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig)
+{
+ switch (orig) {
+ case SEEK_SET:
+ file->f_pos = offset;
+ break;
+ case SEEK_CUR:
+ file->f_pos += offset;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return file->f_pos;
+}
+
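Since *ppos now carries a raw PFN and the file gained an llseek handler, userspace can seek straight to a PFN of interest instead of reading from the start. A small userspace sketch (assuming debugfs is mounted at /sys/kernel/debug; the PFN argument is illustrative):

	/* Userspace: dump page_owner records starting at a given PFN. */
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <fcntl.h>

	int main(int argc, char **argv)
	{
		char buf[4096];
		ssize_t n;
		off_t pfn = argc > 1 ? strtoull(argv[1], NULL, 0) : 0;
		int fd = open("/sys/kernel/debug/page_owner", O_RDONLY);

		if (fd < 0)
			return 1;
		/* The file offset is interpreted as the PFN to resume from. */
		if (lseek(fd, pfn, SEEK_SET) < 0)
			return 1;
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, (size_t)n, stdout);
		close(fd);
		return 0;
	}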
static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
{
unsigned long pfn = zone->zone_start_pfn;
@@ -561,18 +637,13 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
continue;
}
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = pageblock_end_pfn(pfn);
block_end_pfn = min(block_end_pfn, end_pfn);
for (; pfn < block_end_pfn; pfn++) {
- struct page *page;
+ struct page *page = pfn_to_page(pfn);
struct page_ext *page_ext;
- if (!pfn_valid_within(pfn))
- continue;
-
- page = pfn_to_page(pfn);
-
if (page_zone(page) != zone)
continue;
@@ -584,9 +655,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
* heavy lock contention.
*/
if (PageBuddy(page)) {
- unsigned long order = page_order_unsafe(page);
+ unsigned long order = buddy_order_unsafe(page);
- if (order > 0 && order < MAX_ORDER)
+ if (order > 0 && order <= MAX_ORDER)
pfn += (1UL << order) - 1;
continue;
}
@@ -594,18 +665,20 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
if (PageReserved(page))
continue;
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
if (unlikely(!page_ext))
continue;
/* Maybe overlapping zone */
if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
- continue;
+ goto ext_put_continue;
/* Found early allocated page */
- __set_page_owner_handle(page, page_ext, early_handle,
+ __set_page_owner_handle(page_ext, early_handle,
0, 0);
count++;
+ext_put_continue:
+ page_ext_put(page_ext);
}
cond_resched();
}
@@ -637,6 +710,7 @@ static void init_early_allocated_pages(void)
static const struct file_operations proc_page_owner_operations = {
.read = read_page_owner,
+ .llseek = lseek_page_owner,
};
static int __init pageowner_init(void)
diff --git a/mm/page_poison.c b/mm/page_poison.c
index 34b9181ee5d1..98438985e1ed 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -2,53 +2,36 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/mm.h>
+#include <linux/mmdebug.h>
#include <linux/highmem.h>
#include <linux/page_ext.h>
#include <linux/poison.h>
#include <linux/ratelimit.h>
#include <linux/kasan.h>
-static bool want_page_poisoning __read_mostly;
+bool _page_poisoning_enabled_early;
+EXPORT_SYMBOL(_page_poisoning_enabled_early);
+DEFINE_STATIC_KEY_FALSE(_page_poisoning_enabled);
+EXPORT_SYMBOL(_page_poisoning_enabled);
static int __init early_page_poison_param(char *buf)
{
- if (!buf)
- return -EINVAL;
- return strtobool(buf, &want_page_poisoning);
+ return kstrtobool(buf, &_page_poisoning_enabled_early);
}
early_param("page_poison", early_page_poison_param);
-/**
- * page_poisoning_enabled - check if page poisoning is enabled
- *
- * Return true if page poisoning is enabled, or false if not.
- */
-bool page_poisoning_enabled(void)
-{
- /*
- * Assumes that debug_pagealloc_enabled is set before
- * memblock_free_all.
- * Page poisoning is debug page alloc for some arches. If
- * either of those options are enabled, enable poisoning.
- */
- return (want_page_poisoning ||
- (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
- debug_pagealloc_enabled()));
-}
-EXPORT_SYMBOL_GPL(page_poisoning_enabled);
-
static void poison_page(struct page *page)
{
void *addr = kmap_atomic(page);
/* KASAN still think the page is in-use, so skip it. */
kasan_disable_current();
- memset(addr, PAGE_POISON, PAGE_SIZE);
+ memset(kasan_reset_tag(addr), PAGE_POISON, PAGE_SIZE);
kasan_enable_current();
kunmap_atomic(addr);
}
-static void poison_pages(struct page *page, int n)
+void __kernel_poison_pages(struct page *page, int n)
{
int i;
@@ -63,15 +46,12 @@ static bool single_bit_flip(unsigned char a, unsigned char b)
return error && !(error & (error - 1));
}
-static void check_poison_mem(unsigned char *mem, size_t bytes)
+static void check_poison_mem(struct page *page, unsigned char *mem, size_t bytes)
{
static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10);
unsigned char *start;
unsigned char *end;
- if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY))
- return;
-
start = memchr_inv(mem, PAGE_POISON, bytes);
if (!start)
return;
@@ -91,6 +71,7 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
end - start + 1, 1);
dump_stack();
+ dump_page(page, "pagealloc: corrupted page details");
}
static void unpoison_page(struct page *page)
@@ -98,16 +79,18 @@ static void unpoison_page(struct page *page)
void *addr;
addr = kmap_atomic(page);
+ kasan_disable_current();
/*
* Page poisoning when enabled poisons each and every page
* that is freed to buddy. Thus no extra check is done to
* see if a page was poisoned.
*/
- check_poison_mem(addr, PAGE_SIZE);
+ check_poison_mem(page, kasan_reset_tag(addr), PAGE_SIZE);
+ kasan_enable_current();
kunmap_atomic(addr);
}
-static void unpoison_pages(struct page *page, int n)
+void __kernel_unpoison_pages(struct page *page, int n)
{
int i;
@@ -115,17 +98,6 @@ static void unpoison_pages(struct page *page, int n)
unpoison_page(page + i);
}
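With the kernel_poison_pages() wrapper below removed from this file, enabling is driven by the _page_poisoning_enabled static key declared above; callers are expected to test it before calling the __kernel_*_poison_pages() helpers. A rough sketch of that gating (the wrapper name is hypothetical; the real inline lives in a header outside this diff):

	static inline void kernel_poison_pages_sketch(struct page *page, int n)
	{
		/*
		 * No-op unless page poisoning was enabled at boot; the static
		 * key is flipped elsewhere from _page_poisoning_enabled_early.
		 */
		if (static_branch_unlikely(&_page_poisoning_enabled))
			__kernel_poison_pages(page, n);
	}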
-void kernel_poison_pages(struct page *page, int numpages, int enable)
-{
- if (!page_poisoning_enabled())
- return;
-
- if (enable)
- unpoison_pages(page, numpages);
- else
- poison_pages(page, numpages);
-}
-
#ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index 3bbd471cfc81..b021f482a4cb 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -4,12 +4,49 @@
#include <linux/page_reporting.h>
#include <linux/gfp.h>
#include <linux/export.h>
+#include <linux/module.h>
#include <linux/delay.h>
#include <linux/scatterlist.h>
#include "page_reporting.h"
#include "internal.h"
+/* Initialize to an unsupported value */
+unsigned int page_reporting_order = -1;
+
+static int page_order_update_notify(const char *val, const struct kernel_param *kp)
+{
+ /*
+ * If param is set beyond this limit, order is set to default
+ * pageblock_order value
+ */
+ return param_set_uint_minmax(val, kp, 0, MAX_ORDER);
+}
+
+static const struct kernel_param_ops page_reporting_param_ops = {
+ .set = &page_order_update_notify,
+ /*
+ * For the get op, use param_get_int instead of param_get_uint.
+ * This is to make sure that, when unset, the initialized value
+ * of -1 is shown correctly.
+ */
+ .get = &param_get_int,
+};
+
+module_param_cb(page_reporting_order, &page_reporting_param_ops,
+ &page_reporting_order, 0644);
+MODULE_PARM_DESC(page_reporting_order, "Set page reporting order");
+
+/*
+ * This symbol is also a kernel parameter. Export the page_reporting_order
+ * symbol so that other drivers can access it to control order values without
+ * having to introduce another configurable parameter. Only one driver can
+ * register with the page_reporting driver for the service, so we have just
+ * one control parameter for the use case (which can be accessed in both
+ * drivers)
+ */
+EXPORT_SYMBOL_GPL(page_reporting_order);
+
#define PAGE_REPORTING_DELAY (2 * HZ)
static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
@@ -31,8 +68,8 @@ __page_reporting_request(struct page_reporting_dev_info *prdev)
return;
/*
- * If reporting is already active there is nothing we need to do.
- * Test against 0 as that represents PAGE_REPORTING_IDLE.
+ * If reporting is already active there is nothing we need to do.
+ * Test against 0 as that represents PAGE_REPORTING_IDLE.
*/
state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
if (state != PAGE_REPORTING_IDLE)
@@ -92,7 +129,7 @@ page_reporting_drain(struct page_reporting_dev_info *prdev,
* report on the new larger page when we make our way
* up to that higher order.
*/
- if (PageBuddy(page) && page_order(page) == order)
+ if (PageBuddy(page) && buddy_order(page) == order)
__SetPageReported(page);
} while ((sg = sg_next(sg)));
@@ -178,7 +215,7 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
* the new head of the free list before we release the
* zone lock.
*/
- if (&page->lru != list && !list_is_first(&page->lru, list))
+ if (!list_is_first(&page->lru, list))
list_rotate_to_front(&page->lru, list);
/* release lock before waiting on report processing */
@@ -211,7 +248,7 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
}
/* Rotate any leftover pages to the head of the freelist */
- if (&next->lru != list && !list_is_first(&next->lru, list))
+ if (!list_entry_is_head(next, list, lru) && !list_is_first(&next->lru, list))
list_rotate_to_front(&next->lru, list);
spin_unlock_irq(&zone->lock);
@@ -229,7 +266,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev,
/* Generate minimum watermark to be able to guarantee progress */
watermark = low_wmark_pages(zone) +
- (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER);
+ (PAGE_REPORTING_CAPACITY << page_reporting_order);
/*
* Cancel request if insufficient free memory or if we failed
@@ -239,7 +276,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev,
return err;
/* Process each free list starting from lowest order/mt */
- for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
+ for (order = page_reporting_order; order <= MAX_ORDER; order++) {
for (mt = 0; mt < MIGRATE_TYPES; mt++) {
/* We do not pull pages from the isolate free list */
if (is_migrate_isolate(mt))
@@ -319,11 +356,26 @@ int page_reporting_register(struct page_reporting_dev_info *prdev)
mutex_lock(&page_reporting_mutex);
/* nothing to do if already in use */
- if (rcu_access_pointer(pr_dev_info)) {
+ if (rcu_dereference_protected(pr_dev_info,
+ lockdep_is_held(&page_reporting_mutex))) {
err = -EBUSY;
goto err_out;
}
+ /*
+ * If the page_reporting_order value is not set, we check if
+ * an order is provided from the driver that is performing the
+ * registration. If that is not provided either, we default to
+ * pageblock_order.
+ */
+
+ if (page_reporting_order == -1) {
+ if (prdev->order > 0 && prdev->order <= MAX_ORDER)
+ page_reporting_order = prdev->order;
+ else
+ page_reporting_order = pageblock_order;
+ }
+
/* initialize state and work structures */
atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);
@@ -350,7 +402,8 @@ void page_reporting_unregister(struct page_reporting_dev_info *prdev)
{
mutex_lock(&page_reporting_mutex);
- if (rcu_access_pointer(pr_dev_info) == prdev) {
+ if (prdev == rcu_dereference_protected(pr_dev_info,
+ lockdep_is_held(&page_reporting_mutex))) {
/* Disable page reporting notification */
RCU_INIT_POINTER(pr_dev_info, NULL);
synchronize_rcu();
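For reference, a reporting backend hooks into this file by filling a page_reporting_dev_info and registering it; the report() callback then receives batches of free pages as a scatterlist. A sketch of such a consumer (the callback signature is assumed from page_reporting.h, and the backend handoff is left as a comment):

	static int demo_report(struct page_reporting_dev_info *prdev,
			       struct scatterlist *sgl, unsigned int nents)
	{
		struct scatterlist *sg;
		unsigned int i;

		/* Hand each free range to the hypervisor/backend here. */
		for_each_sg(sgl, sg, nents, i)
			pr_debug("reporting pfn %lx, len %u\n",
				 page_to_pfn(sg_page(sg)), sg->length);
		return 0;
	}

	static struct page_reporting_dev_info demo_prdev = {
		.report	= demo_report,
		.order	= 9,	/* optional hint; seeds page_reporting_order if unset */
	};

	/* page_reporting_register(&demo_prdev); ... page_reporting_unregister(&demo_prdev); */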
diff --git a/mm/page_reporting.h b/mm/page_reporting.h
index 2c385dd4ddbd..c51dbc228b94 100644
--- a/mm/page_reporting.h
+++ b/mm/page_reporting.h
@@ -10,10 +10,9 @@
#include <linux/pgtable.h>
#include <linux/scatterlist.h>
-#define PAGE_REPORTING_MIN_ORDER pageblock_order
-
#ifdef CONFIG_PAGE_REPORTING
DECLARE_STATIC_KEY_FALSE(page_reporting_enabled);
+extern unsigned int page_reporting_order;
void __page_reporting_notify(void);
static inline bool page_reported(struct page *page)
@@ -38,7 +37,7 @@ static inline void page_reporting_notify_free(unsigned int order)
return;
/* Determine if we have crossed reporting threshold */
- if (order < PAGE_REPORTING_MIN_ORDER)
+ if (order < page_reporting_order)
return;
/* This will add a few cycles, but should be called infrequently */
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
new file mode 100644
index 000000000000..93ec7690a0d8
--- /dev/null
+++ b/mm/page_table_check.c
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2021, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+#include <linux/kstrtox.h>
+#include <linux/mm.h>
+#include <linux/page_table_check.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "page_table_check: " fmt
+
+struct page_table_check {
+ atomic_t anon_map_count;
+ atomic_t file_map_count;
+};
+
+static bool __page_table_check_enabled __initdata =
+ IS_ENABLED(CONFIG_PAGE_TABLE_CHECK_ENFORCED);
+
+DEFINE_STATIC_KEY_TRUE(page_table_check_disabled);
+EXPORT_SYMBOL(page_table_check_disabled);
+
+static int __init early_page_table_check_param(char *buf)
+{
+ return kstrtobool(buf, &__page_table_check_enabled);
+}
+
+early_param("page_table_check", early_page_table_check_param);
+
+static bool __init need_page_table_check(void)
+{
+ return __page_table_check_enabled;
+}
+
+static void __init init_page_table_check(void)
+{
+ if (!__page_table_check_enabled)
+ return;
+ static_branch_disable(&page_table_check_disabled);
+}
+
+struct page_ext_operations page_table_check_ops = {
+ .size = sizeof(struct page_table_check),
+ .need = need_page_table_check,
+ .init = init_page_table_check,
+ .need_shared_flags = false,
+};
+
+static struct page_table_check *get_page_table_check(struct page_ext *page_ext)
+{
+ BUG_ON(!page_ext);
+ return (void *)(page_ext) + page_table_check_ops.offset;
+}
+
+/*
+ * An entry is removed from the page table: decrement the counters for that
+ * page, and verify that it is of the correct type and that the counters do
+ * not become negative.
+ */
+static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
+ unsigned long pfn, unsigned long pgcnt)
+{
+ struct page_ext *page_ext;
+ struct page *page;
+ unsigned long i;
+ bool anon;
+
+ if (!pfn_valid(pfn))
+ return;
+
+ page = pfn_to_page(pfn);
+ page_ext = page_ext_get(page);
+
+ BUG_ON(PageSlab(page));
+ anon = PageAnon(page);
+
+ for (i = 0; i < pgcnt; i++) {
+ struct page_table_check *ptc = get_page_table_check(page_ext);
+
+ if (anon) {
+ BUG_ON(atomic_read(&ptc->file_map_count));
+ BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0);
+ } else {
+ BUG_ON(atomic_read(&ptc->anon_map_count));
+ BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0);
+ }
+ page_ext = page_ext_next(page_ext);
+ }
+ page_ext_put(page_ext);
+}
+
+/*
+ * A new entry is added to the page table: increment the counters for that
+ * page, and verify that it is of the correct type and is not already mapped
+ * with a different type by a different process.
+ */
+static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
+ unsigned long pfn, unsigned long pgcnt,
+ bool rw)
+{
+ struct page_ext *page_ext;
+ struct page *page;
+ unsigned long i;
+ bool anon;
+
+ if (!pfn_valid(pfn))
+ return;
+
+ page = pfn_to_page(pfn);
+ page_ext = page_ext_get(page);
+
+ BUG_ON(PageSlab(page));
+ anon = PageAnon(page);
+
+ for (i = 0; i < pgcnt; i++) {
+ struct page_table_check *ptc = get_page_table_check(page_ext);
+
+ if (anon) {
+ BUG_ON(atomic_read(&ptc->file_map_count));
+ BUG_ON(atomic_inc_return(&ptc->anon_map_count) > 1 && rw);
+ } else {
+ BUG_ON(atomic_read(&ptc->anon_map_count));
+ BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0);
+ }
+ page_ext = page_ext_next(page_ext);
+ }
+ page_ext_put(page_ext);
+}
+
+/*
+ * The page is on the free list, or is being allocated: verify that the
+ * counters are zero, and crash if they are not.
+ */
+void __page_table_check_zero(struct page *page, unsigned int order)
+{
+ struct page_ext *page_ext;
+ unsigned long i;
+
+ BUG_ON(PageSlab(page));
+
+ page_ext = page_ext_get(page);
+ BUG_ON(!page_ext);
+ for (i = 0; i < (1ul << order); i++) {
+ struct page_table_check *ptc = get_page_table_check(page_ext);
+
+ BUG_ON(atomic_read(&ptc->anon_map_count));
+ BUG_ON(atomic_read(&ptc->file_map_count));
+ page_ext = page_ext_next(page_ext);
+ }
+ page_ext_put(page_ext);
+}
+
+void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t pte)
+{
+ if (&init_mm == mm)
+ return;
+
+ if (pte_user_accessible_page(pte)) {
+ page_table_check_clear(mm, addr, pte_pfn(pte),
+ PAGE_SIZE >> PAGE_SHIFT);
+ }
+}
+EXPORT_SYMBOL(__page_table_check_pte_clear);
+
+void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr,
+ pmd_t pmd)
+{
+ if (&init_mm == mm)
+ return;
+
+ if (pmd_user_accessible_page(pmd)) {
+ page_table_check_clear(mm, addr, pmd_pfn(pmd),
+ PMD_SIZE >> PAGE_SHIFT);
+ }
+}
+EXPORT_SYMBOL(__page_table_check_pmd_clear);
+
+void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr,
+ pud_t pud)
+{
+ if (&init_mm == mm)
+ return;
+
+ if (pud_user_accessible_page(pud)) {
+ page_table_check_clear(mm, addr, pud_pfn(pud),
+ PUD_SIZE >> PAGE_SHIFT);
+ }
+}
+EXPORT_SYMBOL(__page_table_check_pud_clear);
+
+void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ if (&init_mm == mm)
+ return;
+
+ __page_table_check_pte_clear(mm, addr, ptep_get(ptep));
+ if (pte_user_accessible_page(pte)) {
+ page_table_check_set(mm, addr, pte_pfn(pte),
+ PAGE_SIZE >> PAGE_SHIFT,
+ pte_write(pte));
+ }
+}
+EXPORT_SYMBOL(__page_table_check_pte_set);
+
+void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd)
+{
+ if (&init_mm == mm)
+ return;
+
+ __page_table_check_pmd_clear(mm, addr, *pmdp);
+ if (pmd_user_accessible_page(pmd)) {
+ page_table_check_set(mm, addr, pmd_pfn(pmd),
+ PMD_SIZE >> PAGE_SHIFT,
+ pmd_write(pmd));
+ }
+}
+EXPORT_SYMBOL(__page_table_check_pmd_set);
+
+void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, pud_t pud)
+{
+ if (&init_mm == mm)
+ return;
+
+ __page_table_check_pud_clear(mm, addr, *pudp);
+ if (pud_user_accessible_page(pud)) {
+ page_table_check_set(mm, addr, pud_pfn(pud),
+ PUD_SIZE >> PAGE_SHIFT,
+ pud_write(pud));
+ }
+}
+EXPORT_SYMBOL(__page_table_check_pud_set);
+
+void __page_table_check_pte_clear_range(struct mm_struct *mm,
+ unsigned long addr,
+ pmd_t pmd)
+{
+ if (&init_mm == mm)
+ return;
+
+ if (!pmd_bad(pmd) && !pmd_leaf(pmd)) {
+ pte_t *ptep = pte_offset_map(&pmd, addr);
+ unsigned long i;
+
+ if (WARN_ON(!ptep))
+ return;
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ __page_table_check_pte_clear(mm, addr, ptep_get(ptep));
+ addr += PAGE_SIZE;
+ ptep++;
+ }
+ pte_unmap(ptep - PTRS_PER_PTE);
+ }
+}
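These hooks only fire on architectures that call them from their page table setters, and only when booted with page_table_check=on (or built with CONFIG_PAGE_TABLE_CHECK_ENFORCED). A sketch of how an architecture might wire the PTE-set hook, assuming the page_table_check_pte_set() wrapper from <linux/page_table_check.h> that no-ops while the static key is disabled:

	static inline void demo_set_pte_at(struct mm_struct *mm, unsigned long addr,
					   pte_t *ptep, pte_t pte)
	{
		/* Update the per-page map counters for the old and new entries. */
		page_table_check_pte_set(mm, addr, ptep, pte);
		set_pte(ptep, pte);
	}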
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 5e77b269c330..49e0d28f0379 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -13,71 +13,80 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw)
return false;
}
-static bool map_pte(struct page_vma_mapped_walk *pvmw)
+static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
{
- pvmw->pte = pte_offset_map(pvmw->pmd, pvmw->address);
- if (!(pvmw->flags & PVMW_SYNC)) {
- if (pvmw->flags & PVMW_MIGRATION) {
- if (!is_swap_pte(*pvmw->pte))
- return false;
- } else {
- /*
- * We get here when we are trying to unmap a private
- * device page from the process address space. Such
- * page is not CPU accessible and thus is mapped as
- * a special swap entry, nonetheless it still does
- * count as a valid regular mapping for the page (and
- * is accounted as such in page maps count).
- *
- * So handle this special case as if it was a normal
- * page mapping ie lock CPU page table and returns
- * true.
- *
- * For more details on device private memory see HMM
- * (include/linux/hmm.h or mm/hmm.c).
- */
- if (is_swap_pte(*pvmw->pte)) {
- swp_entry_t entry;
+ pte_t ptent;
- /* Handle un-addressable ZONE_DEVICE memory */
- entry = pte_to_swp_entry(*pvmw->pte);
- if (!is_device_private_entry(entry))
- return false;
- } else if (!pte_present(*pvmw->pte))
- return false;
- }
+ if (pvmw->flags & PVMW_SYNC) {
+ /* Use the stricter lookup */
+ pvmw->pte = pte_offset_map_lock(pvmw->vma->vm_mm, pvmw->pmd,
+ pvmw->address, &pvmw->ptl);
+ *ptlp = pvmw->ptl;
+ return !!pvmw->pte;
}
- pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
- spin_lock(pvmw->ptl);
- return true;
-}
-static inline bool pfn_is_match(struct page *page, unsigned long pfn)
-{
- unsigned long page_pfn = page_to_pfn(page);
+ /*
+ * It is important to return the ptl corresponding to pte,
+ * in case *pvmw->pmd changes underneath us; so we need to
+ * return it even when choosing not to lock, in case caller
+ * proceeds to loop over next ptes, and finds a match later.
+ * Though, in most cases, page lock already protects this.
+ */
+ pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd,
+ pvmw->address, ptlp);
+ if (!pvmw->pte)
+ return false;
- /* normal page and hugetlbfs page */
- if (!PageTransCompound(page) || PageHuge(page))
- return page_pfn == pfn;
+ ptent = ptep_get(pvmw->pte);
- /* THP can be referenced by any subpage */
- return pfn >= page_pfn && pfn - page_pfn < thp_nr_pages(page);
+ if (pvmw->flags & PVMW_MIGRATION) {
+ if (!is_swap_pte(ptent))
+ return false;
+ } else if (is_swap_pte(ptent)) {
+ swp_entry_t entry;
+ /*
+ * Handle un-addressable ZONE_DEVICE memory.
+ *
+ * We get here when we are trying to unmap a private
+ * device page from the process address space. Such
+ * page is not CPU accessible and thus is mapped as
+ * a special swap entry, nonetheless it still does
+ * count as a valid regular mapping for the page
+ * (and is accounted as such in page maps count).
+ *
+ * So handle this special case as if it was a normal
+ * page mapping ie lock CPU page table and return true.
+ *
+ * For more details on device private memory see HMM
+ * (include/linux/hmm.h or mm/hmm.c).
+ */
+ entry = pte_to_swp_entry(ptent);
+ if (!is_device_private_entry(entry) &&
+ !is_device_exclusive_entry(entry))
+ return false;
+ } else if (!pte_present(ptent)) {
+ return false;
+ }
+ pvmw->ptl = *ptlp;
+ spin_lock(pvmw->ptl);
+ return true;
}
/**
* check_pte - check if @pvmw->page is mapped at the @pvmw->pte
+ * @pvmw: page_vma_mapped_walk struct, including the pte and the pfn range to check
*
* page_vma_mapped_walk() found a place where @pvmw->page is *potentially*
* mapped. check_pte() has to validate this.
*
- * @pvmw->pte may point to empty PTE, swap PTE or PTE pointing to arbitrary
- * page.
+ * pvmw->pte may point to an empty PTE, a swap PTE or a PTE pointing to
+ * an arbitrary page.
*
* If PVMW_MIGRATION flag is set, returns true if @pvmw->pte contains migration
* entry that points to @pvmw->page or any subpage in case of THP.
*
- * If PVMW_MIGRATION flag is not set, returns true if @pvmw->pte points to
- * @pvmw->page or any subpage in case of THP.
+ * If PVMW_MIGRATION flag is not set, returns true if pvmw->pte points to
+ * pvmw->page or any subpage in case of THP.
*
* Otherwise, return false.
*
@@ -85,38 +94,58 @@ static inline bool pfn_is_match(struct page *page, unsigned long pfn)
static bool check_pte(struct page_vma_mapped_walk *pvmw)
{
unsigned long pfn;
+ pte_t ptent = ptep_get(pvmw->pte);
if (pvmw->flags & PVMW_MIGRATION) {
swp_entry_t entry;
- if (!is_swap_pte(*pvmw->pte))
+ if (!is_swap_pte(ptent))
return false;
- entry = pte_to_swp_entry(*pvmw->pte);
+ entry = pte_to_swp_entry(ptent);
- if (!is_migration_entry(entry))
+ if (!is_migration_entry(entry) &&
+ !is_device_exclusive_entry(entry))
return false;
- pfn = migration_entry_to_pfn(entry);
- } else if (is_swap_pte(*pvmw->pte)) {
+ pfn = swp_offset_pfn(entry);
+ } else if (is_swap_pte(ptent)) {
swp_entry_t entry;
/* Handle un-addressable ZONE_DEVICE memory */
- entry = pte_to_swp_entry(*pvmw->pte);
- if (!is_device_private_entry(entry))
+ entry = pte_to_swp_entry(ptent);
+ if (!is_device_private_entry(entry) &&
+ !is_device_exclusive_entry(entry))
return false;
- pfn = device_private_entry_to_pfn(entry);
+ pfn = swp_offset_pfn(entry);
} else {
- if (!pte_present(*pvmw->pte))
+ if (!pte_present(ptent))
return false;
- pfn = pte_pfn(*pvmw->pte);
+ pfn = pte_pfn(ptent);
}
- return pfn_is_match(pvmw->page, pfn);
+ return (pfn - pvmw->pfn) < pvmw->nr_pages;
+}
+
+/* Returns true if the two ranges overlap. Careful to not overflow. */
+static bool check_pmd(unsigned long pfn, struct page_vma_mapped_walk *pvmw)
+{
+ if ((pfn + HPAGE_PMD_NR - 1) < pvmw->pfn)
+ return false;
+ if (pfn > pvmw->pfn + pvmw->nr_pages - 1)
+ return false;
+ return true;
+}
+
+static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size)
+{
+ pvmw->address = (pvmw->address + size) & ~(size - 1);
+ if (!pvmw->address)
+ pvmw->address = ULONG_MAX;
}
/**
- * page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at
+ * page_vma_mapped_walk - check if @pvmw->pfn is mapped in @pvmw->vma at
* @pvmw->address
* @pvmw: pointer to struct page_vma_mapped_walk. page, vma, address and flags
* must be set. pmd, pte and ptl must be NULL.
@@ -133,7 +162,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
* regardless of which page table level the page is mapped at. @pvmw->pmd is
* NULL.
*
- * Retruns false if there are no more page table entries for the page in
+ * Returns false if there are no more page table entries for the page in
* the vma. @pvmw->ptl is unlocked and @pvmw->pte is unmapped.
*
* If you need to stop the walk before page_vma_mapped_walk() returned false,
@@ -141,8 +170,10 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
*/
bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
{
- struct mm_struct *mm = pvmw->vma->vm_mm;
- struct page *page = pvmw->page;
+ struct vm_area_struct *vma = pvmw->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long end;
+ spinlock_t *ptl;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
@@ -152,101 +183,132 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (pvmw->pmd && !pvmw->pte)
return not_found(pvmw);
- if (pvmw->pte)
- goto next_pte;
-
- if (unlikely(PageHuge(pvmw->page))) {
- /* when pud is not present, pte will be NULL */
- pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page));
+ if (unlikely(is_vm_hugetlb_page(vma))) {
+ struct hstate *hstate = hstate_vma(vma);
+ unsigned long size = huge_page_size(hstate);
+ /* The only possible mapping was handled on last iteration */
+ if (pvmw->pte)
+ return not_found(pvmw);
+ /*
+ * All callers that get here will already hold the
+ * i_mmap_rwsem. Therefore, no additional locks need to be
+ * taken before calling hugetlb_walk().
+ */
+ pvmw->pte = hugetlb_walk(vma, pvmw->address, size);
if (!pvmw->pte)
return false;
- pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
- spin_lock(pvmw->ptl);
+ pvmw->ptl = huge_pte_lock(hstate, mm, pvmw->pte);
if (!check_pte(pvmw))
return not_found(pvmw);
return true;
}
+
+ end = vma_address_end(pvmw);
+ if (pvmw->pte)
+ goto next_pte;
restart:
- pgd = pgd_offset(mm, pvmw->address);
- if (!pgd_present(*pgd))
- return false;
- p4d = p4d_offset(pgd, pvmw->address);
- if (!p4d_present(*p4d))
- return false;
- pud = pud_offset(p4d, pvmw->address);
- if (!pud_present(*pud))
- return false;
- pvmw->pmd = pmd_offset(pud, pvmw->address);
- /*
- * Make sure the pmd value isn't cached in a register by the
- * compiler and used as a stale value after we've observed a
- * subsequent update.
- */
- pmde = READ_ONCE(*pvmw->pmd);
- if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
- pvmw->ptl = pmd_lock(mm, pvmw->pmd);
- if (likely(pmd_trans_huge(*pvmw->pmd))) {
- if (pvmw->flags & PVMW_MIGRATION)
- return not_found(pvmw);
- if (pmd_page(*pvmw->pmd) != page)
- return not_found(pvmw);
- return true;
- } else if (!pmd_present(*pvmw->pmd)) {
- if (thp_migration_supported()) {
- if (!(pvmw->flags & PVMW_MIGRATION))
- return not_found(pvmw);
- if (is_migration_entry(pmd_to_swp_entry(*pvmw->pmd))) {
- swp_entry_t entry = pmd_to_swp_entry(*pvmw->pmd);
+ do {
+ pgd = pgd_offset(mm, pvmw->address);
+ if (!pgd_present(*pgd)) {
+ step_forward(pvmw, PGDIR_SIZE);
+ continue;
+ }
+ p4d = p4d_offset(pgd, pvmw->address);
+ if (!p4d_present(*p4d)) {
+ step_forward(pvmw, P4D_SIZE);
+ continue;
+ }
+ pud = pud_offset(p4d, pvmw->address);
+ if (!pud_present(*pud)) {
+ step_forward(pvmw, PUD_SIZE);
+ continue;
+ }
- if (migration_entry_to_page(entry) != page)
- return not_found(pvmw);
- return true;
- }
+ pvmw->pmd = pmd_offset(pud, pvmw->address);
+ /*
+ * Make sure the pmd value isn't cached in a register by the
+ * compiler and used as a stale value after we've observed a
+ * subsequent update.
+ */
+ pmde = pmdp_get_lockless(pvmw->pmd);
+
+ if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde) ||
+ (pmd_present(pmde) && pmd_devmap(pmde))) {
+ pvmw->ptl = pmd_lock(mm, pvmw->pmd);
+ pmde = *pvmw->pmd;
+ if (!pmd_present(pmde)) {
+ swp_entry_t entry;
+
+ if (!thp_migration_supported() ||
+ !(pvmw->flags & PVMW_MIGRATION))
+ return not_found(pvmw);
+ entry = pmd_to_swp_entry(pmde);
+ if (!is_migration_entry(entry) ||
+ !check_pmd(swp_offset_pfn(entry), pvmw))
+ return not_found(pvmw);
+ return true;
+ }
+ if (likely(pmd_trans_huge(pmde) || pmd_devmap(pmde))) {
+ if (pvmw->flags & PVMW_MIGRATION)
+ return not_found(pvmw);
+ if (!check_pmd(pmd_pfn(pmde), pvmw))
+ return not_found(pvmw);
+ return true;
}
- return not_found(pvmw);
- } else {
/* THP pmd was split under us: handle on pte level */
spin_unlock(pvmw->ptl);
pvmw->ptl = NULL;
+ } else if (!pmd_present(pmde)) {
+ /*
+ * If PVMW_SYNC, take and drop THP pmd lock so that we
+ * cannot return prematurely, while zap_huge_pmd() has
+ * cleared *pmd but not decremented compound_mapcount().
+ */
+ if ((pvmw->flags & PVMW_SYNC) &&
+ transhuge_vma_suitable(vma, pvmw->address) &&
+ (pvmw->nr_pages >= HPAGE_PMD_NR)) {
+ spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
+
+ spin_unlock(ptl);
+ }
+ step_forward(pvmw, PMD_SIZE);
+ continue;
}
- } else if (!pmd_present(pmde)) {
- return false;
- }
- if (!map_pte(pvmw))
- goto next_pte;
- while (1) {
+ if (!map_pte(pvmw, &ptl)) {
+ if (!pvmw->pte)
+ goto restart;
+ goto next_pte;
+ }
+this_pte:
if (check_pte(pvmw))
return true;
next_pte:
- /* Seek to next pte only makes sense for THP */
- if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page))
- return not_found(pvmw);
do {
pvmw->address += PAGE_SIZE;
- if (pvmw->address >= pvmw->vma->vm_end ||
- pvmw->address >=
- __vma_address(pvmw->page, pvmw->vma) +
- thp_size(pvmw->page))
+ if (pvmw->address >= end)
return not_found(pvmw);
/* Did we cross page table boundary? */
- if (pvmw->address % PMD_SIZE == 0) {
- pte_unmap(pvmw->pte);
+ if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) {
if (pvmw->ptl) {
spin_unlock(pvmw->ptl);
pvmw->ptl = NULL;
}
+ pte_unmap(pvmw->pte);
+ pvmw->pte = NULL;
goto restart;
- } else {
- pvmw->pte++;
}
- } while (pte_none(*pvmw->pte));
+ pvmw->pte++;
+ } while (pte_none(ptep_get(pvmw->pte)));
if (!pvmw->ptl) {
- pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
+ pvmw->ptl = ptl;
spin_lock(pvmw->ptl);
}
- }
+ goto this_pte;
+ } while (pvmw->address < end);
+
+ return false;
}
/**
@@ -261,18 +323,15 @@ next_pte:
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
{
struct page_vma_mapped_walk pvmw = {
- .page = page,
+ .pfn = page_to_pfn(page),
+ .nr_pages = 1,
.vma = vma,
.flags = PVMW_SYNC,
};
- unsigned long start, end;
-
- start = __vma_address(page, vma);
- end = start + thp_size(page) - PAGE_SIZE;
- if (unlikely(end < vma->vm_start || start >= vma->vm_end))
+ pvmw.address = vma_address(page, vma);
+ if (pvmw.address == -EFAULT)
return 0;
- pvmw.address = max(start, vma->vm_start);
if (!page_vma_mapped_walk(&pvmw))
return 0;
page_vma_mapped_walk_done(&pvmw);
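
Two small idioms carry a lot of the weight in the new walk: step_forward() rounds the address up to the next boundary of the missing table level (saturating at ULONG_MAX on wrap), and check_pmd() tests range overlap by comparing last elements so that neither end computation can overflow. A stand-alone sketch of both, with illustrative values rather than real pfns:

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Round addr up to the next size-aligned boundary (size is a power of two);
 * a wrap past the top of the address space is reported as ULONG_MAX,
 * mirroring the step_forward() idiom above.
 */
static unsigned long step_forward(unsigned long addr, unsigned long size)
{
	addr = (addr + size) & ~(size - 1);
	return addr ? addr : ULONG_MAX;
}

/*
 * Overflow-safe overlap test for [a, a + a_len) and [b, b + b_len):
 * compare last elements rather than one-past-the-end values.
 */
static bool ranges_overlap(unsigned long a, unsigned long a_len,
			   unsigned long b, unsigned long b_len)
{
	return !(a + a_len - 1 < b || a > b + b_len - 1);
}

int main(void)
{
	printf("%#lx\n", step_forward(0x1234, 0x1000));		/* 0x2000 */
	printf("%d\n", ranges_overlap(1024, 512, 1530, 16));	/* 1: overlaps */
	printf("%d\n", ranges_overlap(1024, 512, 1536, 16));	/* 0: adjacent */
	return 0;
}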
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e81640d9f177..b7d7e4fcfad7 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -46,17 +46,71 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
spinlock_t *ptl;
if (walk->no_vma) {
- pte = pte_offset_map(pmd, addr);
- err = walk_pte_range_inner(pte, addr, end, walk);
- pte_unmap(pte);
+ /*
+ * pte_offset_map() might apply user-specific validation.
+ * Indeed, on x86_64 the pmd entries set up by init_espfix_ap()
+ * fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear),
+ * and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them.
+ */
+ if (walk->mm == &init_mm || addr >= TASK_SIZE)
+ pte = pte_offset_kernel(pmd, addr);
+ else
+ pte = pte_offset_map(pmd, addr);
+ if (pte) {
+ err = walk_pte_range_inner(pte, addr, end, walk);
+ if (walk->mm != &init_mm && addr < TASK_SIZE)
+ pte_unmap(pte);
+ }
} else {
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- err = walk_pte_range_inner(pte, addr, end, walk);
- pte_unmap_unlock(pte, ptl);
+ if (pte) {
+ err = walk_pte_range_inner(pte, addr, end, walk);
+ pte_unmap_unlock(pte, ptl);
+ }
}
+ if (!pte)
+ walk->action = ACTION_AGAIN;
+ return err;
+}
+
+#ifdef CONFIG_ARCH_HAS_HUGEPD
+static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk, int pdshift)
+{
+ int err = 0;
+ const struct mm_walk_ops *ops = walk->ops;
+ int shift = hugepd_shift(*phpd);
+ int page_size = 1 << shift;
+
+ if (!ops->pte_entry)
+ return 0;
+
+ if (addr & (page_size - 1))
+ return 0;
+
+ for (;;) {
+ pte_t *pte;
+
+ spin_lock(&walk->mm->page_table_lock);
+ pte = hugepte_offset(*phpd, addr, pdshift);
+ err = ops->pte_entry(pte, addr, addr + page_size, walk);
+ spin_unlock(&walk->mm->page_table_lock);
+ if (err)
+ break;
+ if (addr >= end - page_size)
+ break;
+ addr += page_size;
+ }
return err;
}
+#else
+static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk, int pdshift)
+{
+ return 0;
+}
+#endif
static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
struct mm_walk *walk)
@@ -71,7 +125,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
do {
again:
next = pmd_addr_end(addr, end);
- if (pmd_none(*pmd) || (!walk->vma && !walk->no_vma)) {
+ if (pmd_none(*pmd)) {
if (ops->pte_hole)
err = ops->pte_hole(addr, next, depth, walk);
if (err)
@@ -102,15 +156,19 @@ again:
!(ops->pte_entry))
continue;
- if (walk->vma) {
+ if (walk->vma)
split_huge_pmd(walk->vma, pmd, addr);
- if (pmd_trans_unstable(pmd))
- goto again;
- }
- err = walk_pte_range(pmd, addr, next, walk);
+ if (is_hugepd(__hugepd(pmd_val(*pmd))))
+ err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
+ else
+ err = walk_pte_range(pmd, addr, next, walk);
if (err)
break;
+
+ if (walk->action == ACTION_AGAIN)
+ goto again;
+
} while (pmd++, addr = next, addr != end);
return err;
@@ -129,7 +187,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
do {
again:
next = pud_addr_end(addr, end);
- if (pud_none(*pud) || (!walk->vma && !walk->no_vma)) {
+ if (pud_none(*pud)) {
if (ops->pte_hole)
err = ops->pte_hole(addr, next, depth, walk);
if (err)
@@ -157,7 +215,10 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
if (pud_none(*pud))
goto again;
- err = walk_pmd_range(pud, addr, next, walk);
+ if (is_hugepd(__hugepd(pud_val(*pud))))
+ err = walk_hugepd_range((hugepd_t *)pud, addr, next, walk, PUD_SHIFT);
+ else
+ err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
} while (pud++, addr = next, addr != end);
@@ -189,7 +250,9 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
if (err)
break;
}
- if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
+ if (is_hugepd(__hugepd(p4d_val(*p4d))))
+ err = walk_hugepd_range((hugepd_t *)p4d, addr, next, walk, P4D_SHIFT);
+ else if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
err = walk_pud_range(p4d, addr, next, walk);
if (err)
break;
@@ -224,8 +287,9 @@ static int walk_pgd_range(unsigned long addr, unsigned long end,
if (err)
break;
}
- if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
- ops->pte_entry)
+ if (is_hugepd(__hugepd(pgd_val(*pgd))))
+ err = walk_hugepd_range((hugepd_t *)pgd, addr, next, walk, PGDIR_SHIFT);
+ else if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry)
err = walk_p4d_range(pgd, addr, next, walk);
if (err)
break;
@@ -254,18 +318,18 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
const struct mm_walk_ops *ops = walk->ops;
int err = 0;
+ hugetlb_vma_lock_read(vma);
do {
next = hugetlb_entry_end(h, addr, end);
- pte = huge_pte_offset(walk->mm, addr & hmask, sz);
-
+ pte = hugetlb_walk(vma, addr & hmask, sz);
if (pte)
err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
else if (ops->pte_hole)
err = ops->pte_hole(addr, next, -1, walk);
-
if (err)
break;
} while (addr = next, addr != end);
+ hugetlb_vma_unlock_read(vma);
return err;
}
@@ -318,24 +382,51 @@ static int __walk_page_range(unsigned long start, unsigned long end,
struct vm_area_struct *vma = walk->vma;
const struct mm_walk_ops *ops = walk->ops;
- if (vma && ops->pre_vma) {
+ if (ops->pre_vma) {
err = ops->pre_vma(start, end, walk);
if (err)
return err;
}
- if (vma && is_vm_hugetlb_page(vma)) {
+ if (is_vm_hugetlb_page(vma)) {
if (ops->hugetlb_entry)
err = walk_hugetlb_range(start, end, walk);
} else
err = walk_pgd_range(start, end, walk);
- if (vma && ops->post_vma)
+ if (ops->post_vma)
ops->post_vma(walk);
return err;
}
+static inline void process_mm_walk_lock(struct mm_struct *mm,
+ enum page_walk_lock walk_lock)
+{
+ if (walk_lock == PGWALK_RDLOCK)
+ mmap_assert_locked(mm);
+ else
+ mmap_assert_write_locked(mm);
+}
+
+static inline void process_vma_walk_lock(struct vm_area_struct *vma,
+ enum page_walk_lock walk_lock)
+{
+#ifdef CONFIG_PER_VMA_LOCK
+ switch (walk_lock) {
+ case PGWALK_WRLOCK:
+ vma_start_write(vma);
+ break;
+ case PGWALK_WRLOCK_VERIFY:
+ vma_assert_write_locked(vma);
+ break;
+ case PGWALK_RDLOCK:
+ /* PGWALK_RDLOCK is handled by process_mm_walk_lock */
+ break;
+ }
+#endif
+}
+
/**
* walk_page_range - walk page table with caller specific callbacks
* @mm: mm_struct representing the target process of page table walk
@@ -395,20 +486,25 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
if (!walk.mm)
return -EINVAL;
- mmap_assert_locked(walk.mm);
+ process_mm_walk_lock(walk.mm, ops->walk_lock);
vma = find_vma(walk.mm, start);
do {
if (!vma) { /* after the last vma */
walk.vma = NULL;
next = end;
+ if (ops->pte_hole)
+ err = ops->pte_hole(start, next, -1, &walk);
} else if (start < vma->vm_start) { /* outside vma */
walk.vma = NULL;
next = min(end, vma->vm_start);
+ if (ops->pte_hole)
+ err = ops->pte_hole(start, next, -1, &walk);
} else { /* inside vma */
+ process_vma_walk_lock(vma, ops->walk_lock);
walk.vma = vma;
next = min(end, vma->vm_end);
- vma = vma->vm_next;
+ vma = find_vma(mm, vma->vm_end);
err = walk_page_test(start, next, &walk);
if (err > 0) {
@@ -422,16 +518,23 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
}
if (err < 0)
break;
- }
- if (walk.vma || walk.ops->pte_hole)
err = __walk_page_range(start, next, &walk);
+ }
if (err)
break;
} while (start = next, start < end);
return err;
}
-/*
+/**
+ * walk_page_range_novma - walk a range of pagetables not backed by a vma
+ * @mm: mm_struct representing the target process of page table walk
+ * @start: start address of the virtual address range
+ * @end: end address of the virtual address range
+ * @ops: operation to call during the walk
+ * @pgd: pgd to walk if different from mm->pgd
+ * @private: private data for callbacks' usage
+ *
* Similar to walk_page_range() but can walk any page tables even if they are
* not backed by VMAs. Because 'unusual' entries may be walked this function
* will also not lock the PTEs for the pte_entry() callback. This is useful for
@@ -453,8 +556,29 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
if (start >= end || !walk.mm)
return -EINVAL;
- mmap_assert_locked(walk.mm);
+ mmap_assert_write_locked(walk.mm);
+
+ return walk_pgd_range(start, end, &walk);
+}
+int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private)
+{
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = vma->vm_mm,
+ .vma = vma,
+ .private = private,
+ };
+
+ if (start >= end || !walk.mm)
+ return -EINVAL;
+ if (start < vma->vm_start || end > vma->vm_end)
+ return -EINVAL;
+
+ process_mm_walk_lock(walk.mm, ops->walk_lock);
+ process_vma_walk_lock(vma, ops->walk_lock);
return __walk_page_range(start, end, &walk);
}
@@ -467,18 +591,12 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
.vma = vma,
.private = private,
};
- int err;
if (!walk.mm)
return -EINVAL;
- mmap_assert_locked(walk.mm);
-
- err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
- if (err > 0)
- return 0;
- if (err < 0)
- return err;
+ process_mm_walk_lock(walk.mm, ops->walk_lock);
+ process_vma_walk_lock(vma, ops->walk_lock);
return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}
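
The pagewalk changes introduce a retry convention: when the PTE page cannot be mapped (for instance because the page table was freed or the THP was split underneath the walker), walk_pte_range() sets walk->action to ACTION_AGAIN and walk_pmd_range() re-examines the same entry instead of advancing. A minimal sketch of that control flow, with invented types standing in for the walk state:

#include <stdio.h>

enum walk_action { ACTION_CONTINUE, ACTION_AGAIN };

struct walk_state {
	enum walk_action action;
	int retries_left;	/* stand-in for "the table stopped changing" */
};

/* Pretend lower level: fails a couple of times, asking the caller to retry. */
static int visit_level(struct walk_state *w, unsigned long idx)
{
	if (w->retries_left > 0) {
		w->retries_left--;
		w->action = ACTION_AGAIN;	/* entry changed under us */
		return 0;
	}
	printf("visited entry %lu\n", idx);
	return 0;
}

static int walk_range(struct walk_state *w, unsigned long start, unsigned long end)
{
	unsigned long idx;
	int err = 0;

	for (idx = start; idx != end; idx++) {
again:
		w->action = ACTION_CONTINUE;
		err = visit_level(w, idx);
		if (err)
			break;
		if (w->action == ACTION_AGAIN)
			goto again;	/* re-read the same entry */
	}
	return err;
}

int main(void)
{
	struct walk_state w = { .action = ACTION_CONTINUE, .retries_left = 2 };

	return walk_range(&w, 0, 3);
}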
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 18b768ac7dca..cdd0aa597a81 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -4,25 +4,7 @@
#include <linux/types.h>
#include <linux/percpu.h>
-
-/*
- * There are two chunk types: root and memcg-aware.
- * Chunks of each type have separate slots list.
- *
- * Memcg-aware chunks have an attached vector of obj_cgroup pointers, which is
- * used to store memcg membership data of a percpu object. Obj_cgroups are
- * ref-counted pointers to a memory cgroup with an ability to switch dynamically
- * to the parent memory cgroup. This allows to reclaim a deleted memory cgroup
- * without reclaiming of all outstanding objects, which hold a reference at it.
- */
-enum pcpu_chunk_type {
- PCPU_CHUNK_ROOT,
-#ifdef CONFIG_MEMCG_KMEM
- PCPU_CHUNK_MEMCG,
-#endif
- PCPU_NR_CHUNK_TYPES,
- PCPU_FAIL_ALLOC = PCPU_NR_CHUNK_TYPES
-};
+#include <linux/memcontrol.h>
/*
* pcpu_block_md is the metadata block struct.
@@ -59,14 +41,23 @@ struct pcpu_chunk {
struct list_head list; /* linked to pcpu_slot lists */
int free_bytes; /* free bytes in the chunk */
struct pcpu_block_md chunk_md;
- void *base_addr; /* base address of this chunk */
+ unsigned long *bound_map; /* boundary map */
+
+ /*
+ * base_addr is the base address of this chunk.
+ * To reduce false sharing, the current layout makes sure that
+ * base_addr sits in a different cacheline from free_bytes and
+ * chunk_md.
+ */
+ void *base_addr ____cacheline_aligned_in_smp;
unsigned long *alloc_map; /* allocation map */
- unsigned long *bound_map; /* boundary map */
struct pcpu_block_md *md_blocks; /* metadata blocks */
void *data; /* chunk data */
bool immutable; /* no [de]population allowed */
+ bool isolated; /* isolated from active chunk
+ slots */
int start_offset; /* the overlap with the previous
region to have a page aligned
base_addr */
@@ -87,6 +78,8 @@ extern spinlock_t pcpu_lock;
extern struct list_head *pcpu_chunk_lists;
extern int pcpu_nr_slots;
+extern int pcpu_sidelined_slot;
+extern int pcpu_to_depopulate_slot;
extern int pcpu_nr_empty_pop_pages;
extern struct pcpu_chunk *pcpu_first_chunk;
@@ -128,35 +121,23 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
}
-#ifdef CONFIG_MEMCG_KMEM
-static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
-{
- if (chunk->obj_cgroups)
- return PCPU_CHUNK_MEMCG;
- return PCPU_CHUNK_ROOT;
-}
-
-static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
-{
- return chunk_type == PCPU_CHUNK_MEMCG;
-}
-
-#else
-static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
+/**
+ * pcpu_obj_full_size - helper to calculate size of each accounted object
+ * @size: size of area to allocate in bytes
+ *
+ * For each accounted object there is extra space used to store its
+ * obj_cgroup membership if kmemcg is not disabled. Charge it too.
+ */
+static inline size_t pcpu_obj_full_size(size_t size)
{
- return PCPU_CHUNK_ROOT;
-}
+ size_t extra_size = 0;
-static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
-{
- return false;
-}
+#ifdef CONFIG_MEMCG_KMEM
+ if (!mem_cgroup_kmem_disabled())
+ extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *);
#endif
-static inline struct list_head *pcpu_chunk_list(enum pcpu_chunk_type chunk_type)
-{
- return &pcpu_chunk_lists[pcpu_nr_slots *
- pcpu_is_memcg_chunk(chunk_type)];
+ return size * num_possible_cpus() + extra_size;
}
#ifdef CONFIG_PERCPU_STATS
@@ -170,7 +151,7 @@ struct percpu_stats {
u64 nr_max_alloc; /* max # of live allocations */
u32 nr_chunks; /* current # of live chunks */
u32 nr_max_chunks; /* max # of live chunks */
- size_t min_alloc_size; /* min allocaiton size */
+ size_t min_alloc_size; /* min allocation size */
size_t max_alloc_size; /* max allocation size */
};
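
pcpu_obj_full_size() charges an accounted allocation for its per-CPU copies plus one obj_cgroup pointer per PCPU_MIN_ALLOC_SIZE granule. A small worked example with illustrative constants (the real values come from the kernel configuration, not from here):

#include <stdio.h>

/* Illustrative constants; the real values come from the kernel headers. */
#define NUM_POSSIBLE_CPUS	16
#define PCPU_MIN_ALLOC_SIZE	4	/* bytes per allocation-map bit */
#define OBJ_CGROUP_PTR_SIZE	8

/* Mirror of pcpu_obj_full_size(): per-CPU copies + memcg bookkeeping. */
static size_t pcpu_obj_full_size(size_t size, int kmemcg_enabled)
{
	size_t extra = 0;

	if (kmemcg_enabled)
		extra = size / PCPU_MIN_ALLOC_SIZE * OBJ_CGROUP_PTR_SIZE;
	return size * NUM_POSSIBLE_CPUS + extra;
}

int main(void)
{
	printf("%zu\n", pcpu_obj_full_size(64, 1));	/* 64*16 + 16*8 = 1152 */
	printf("%zu\n", pcpu_obj_full_size(64, 0));	/* 1024 */
	return 0;
}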
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 35c9941077ee..fe31aa19db81 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -32,6 +32,12 @@
#include <linux/log2.h>
+static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ /* nothing */
+}
+
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
int page_start, int page_end, gfp_t gfp)
{
@@ -44,8 +50,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
/* nada */
}
-static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
- gfp_t gfp)
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
{
const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
struct pcpu_chunk *chunk;
@@ -53,7 +58,7 @@ static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
unsigned long flags;
int i;
- chunk = pcpu_alloc_chunk(type, gfp);
+ chunk = pcpu_alloc_chunk(gfp);
if (!chunk)
return NULL;
@@ -118,3 +123,8 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
return 0;
}
+
+static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk)
+{
+ return false;
+}
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index c8400a2adbc2..dd3590dfc23d 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -34,15 +34,11 @@ static int find_max_nr_alloc(void)
{
struct pcpu_chunk *chunk;
int slot, max_nr_alloc;
- enum pcpu_chunk_type type;
max_nr_alloc = 0;
- for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
- for (slot = 0; slot < pcpu_nr_slots; slot++)
- list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot],
- list)
- max_nr_alloc = max(max_nr_alloc,
- chunk->nr_alloc);
+ for (slot = 0; slot < pcpu_nr_slots; slot++)
+ list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list)
+ max_nr_alloc = max(max_nr_alloc, chunk->nr_alloc);
return max_nr_alloc;
}
@@ -133,9 +129,6 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
P("cur_min_alloc", cur_min_alloc);
P("cur_med_alloc", cur_med_alloc);
P("cur_max_alloc", cur_max_alloc);
-#ifdef CONFIG_MEMCG_KMEM
- P("memcg_aware", pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)));
-#endif
seq_putc(m, '\n');
}
@@ -144,7 +137,6 @@ static int percpu_stats_show(struct seq_file *m, void *v)
struct pcpu_chunk *chunk;
int slot, max_nr_alloc;
int *buffer;
- enum pcpu_chunk_type type;
alloc_buffer:
spin_lock_irq(&pcpu_lock);
@@ -152,7 +144,7 @@ alloc_buffer:
spin_unlock_irq(&pcpu_lock);
/* there can be at most this many free and allocated fragments */
- buffer = vmalloc(array_size(sizeof(int), (2 * max_nr_alloc + 1)));
+ buffer = vmalloc_array(2 * max_nr_alloc + 1, sizeof(int));
if (!buffer)
return -ENOMEM;
@@ -165,7 +157,7 @@ alloc_buffer:
goto alloc_buffer;
}
-#define PL(X) \
+#define PL(X) \
seq_printf(m, " %-20s: %12lld\n", #X, (long long int)pcpu_stats_ai.X)
seq_printf(m,
@@ -210,18 +202,17 @@ alloc_buffer:
chunk_map_stats(m, pcpu_reserved_chunk, buffer);
}
- for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) {
- for (slot = 0; slot < pcpu_nr_slots; slot++) {
- list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot],
- list) {
- if (chunk == pcpu_first_chunk) {
- seq_puts(m, "Chunk: <- First Chunk\n");
- chunk_map_stats(m, chunk, buffer);
- } else {
- seq_puts(m, "Chunk:\n");
- chunk_map_stats(m, chunk, buffer);
- }
- }
+ for (slot = 0; slot < pcpu_nr_slots; slot++) {
+ list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) {
+ if (chunk == pcpu_first_chunk)
+ seq_puts(m, "Chunk: <- First Chunk\n");
+ else if (slot == pcpu_to_depopulate_slot)
+ seq_puts(m, "Chunk (to_depopulate)\n");
+ else if (slot == pcpu_sidelined_slot)
+ seq_puts(m, "Chunk (sidelined):\n");
+ else
+ seq_puts(m, "Chunk:\n");
+ chunk_map_stats(m, chunk, buffer);
}
}
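
The vmalloc(array_size(...)) to vmalloc_array() switch above is about overflow-checked sizing: the element-count multiplication is validated inside the helper instead of being open-coded at the call site. A minimal user-space sketch of such a checked array allocation (a hypothetical helper, not the kernel implementation):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Multiply n * size, failing (returning NULL) on overflow instead of
 * silently wrapping and under-allocating, which is the point of using
 * an array-aware allocator.
 */
static void *alloc_array_checked(size_t n, size_t size)
{
	if (size && n > SIZE_MAX / size)
		return NULL;		/* n * size would overflow */
	return malloc(n * size);
}

int main(void)
{
	int *buf = alloc_array_checked(2 * 100 + 1, sizeof(int));

	if (!buf)
		return 1;
	free(buf);

	/* An absurd request overflows and is rejected instead of wrapping. */
	return alloc_array_checked(SIZE_MAX, 8) == NULL ? 0 : 1;
}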
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index e46f7a6917f9..2054c9213c43 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -8,6 +8,7 @@
* Chunks are mapped into vmalloc areas and populated page by page.
* This is the default chunk allocator.
*/
+#include "internal.h"
static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx)
@@ -133,7 +134,7 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
{
- unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
+ vunmap_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT));
}
/**
@@ -192,8 +193,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
static int __pcpu_map_pages(unsigned long addr, struct page **pages,
int nr_pages)
{
- return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
- PAGE_KERNEL, pages);
+ return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT),
+ PAGE_KERNEL, pages, PAGE_SHIFT);
}
/**
@@ -302,6 +303,9 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
* For each cpu, depopulate and unmap pages [@page_start,@page_end)
* from @chunk.
*
+ * The caller is required to call pcpu_post_unmap_tlb_flush() if the region is
+ * not returned to vmalloc(), which would otherwise flush the tlb lazily.
+ *
* CONTEXT:
* pcpu_alloc_mutex.
*/
@@ -323,18 +327,15 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
pcpu_unmap_pages(chunk, pages, page_start, page_end);
- /* no need to flush tlb, vmalloc will handle it lazily */
-
pcpu_free_pages(chunk, pages, page_start, page_end);
}
-static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
- gfp_t gfp)
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
{
struct pcpu_chunk *chunk;
struct vm_struct **vms;
- chunk = pcpu_alloc_chunk(type, gfp);
+ chunk = pcpu_alloc_chunk(gfp);
if (!chunk)
return NULL;
@@ -377,3 +378,33 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
/* no extra restriction */
return 0;
}
+
+/**
+ * pcpu_should_reclaim_chunk - determine if a chunk should go into reclaim
+ * @chunk: chunk of interest
+ *
+ * This is the entry point for percpu reclaim. If a chunk qualifies, it is then
+ * isolated and managed in separate lists at the back of pcpu_slot: sidelined
+ * and to_depopulate respectively. The to_depopulate list holds chunks slated
+ * for depopulation. They no longer contribute to pcpu_nr_empty_pop_pages once
+ * they are on this list. Once depopulated, they are moved onto the sidelined
+ * list which enables them to be pulled back in for allocation if no other chunk
+ * can satisfy the allocation.
+ */
+static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk)
+{
+ /* do not reclaim either the first chunk or reserved chunk */
+ if (chunk == pcpu_first_chunk || chunk == pcpu_reserved_chunk)
+ return false;
+
+ /*
+ * If it is isolated, it may be on the sidelined list so move it back to
+ * the to_depopulate list. If at least 1/4 of its pages are empty AND
+ * there is no system-wide shortage of empty pages aside from this
+ * chunk, move it to the to_depopulate list.
+ */
+ return ((chunk->isolated && chunk->nr_empty_pop_pages) ||
+ (pcpu_nr_empty_pop_pages >
+ (PCPU_EMPTY_POP_PAGES_HIGH + chunk->nr_empty_pop_pages) &&
+ chunk->nr_empty_pop_pages >= chunk->nr_pages / 4));
+}
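
The reclaim predicate described above combines two cases: an already-isolated chunk with any empty populated pages keeps being depopulated, while an active chunk is only pulled in when at least a quarter of its pages are empty and the rest of the system still has enough empty populated pages without it. A stand-alone evaluation of that predicate with illustrative numbers:

#include <stdbool.h>
#include <stdio.h>

#define PCPU_EMPTY_POP_PAGES_HIGH	4	/* illustrative threshold */

struct chunk {
	bool isolated;
	int nr_empty_pop_pages;
	int nr_pages;
};

/* Mirror of the pcpu_should_reclaim_chunk() condition above. */
static bool should_reclaim(const struct chunk *c, int global_empty_pop_pages)
{
	if (c->isolated && c->nr_empty_pop_pages)
		return true;
	return global_empty_pop_pages >
		       (PCPU_EMPTY_POP_PAGES_HIGH + c->nr_empty_pop_pages) &&
	       c->nr_empty_pop_pages >= c->nr_pages / 4;
}

int main(void)
{
	struct chunk busy  = { .isolated = false, .nr_empty_pop_pages = 1,  .nr_pages = 64 };
	struct chunk empty = { .isolated = false, .nr_empty_pop_pages = 32, .nr_pages = 64 };

	printf("%d\n", should_reclaim(&busy, 40));	/* 0: not empty enough */
	printf("%d\n", should_reclaim(&empty, 40));	/* 1: >= 1/4 empty, system has slack */
	printf("%d\n", should_reclaim(&empty, 30));	/* 0: reclaiming would starve atomic allocs */
	return 0;
}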
diff --git a/mm/percpu.c b/mm/percpu.c
index 1ed1a349eab8..28e07ede46f6 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -69,9 +69,9 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/bitmap.h>
+#include <linux/cpumask.h>
#include <linux/memblock.h>
#include <linux/err.h>
-#include <linux/lcm.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
@@ -98,7 +98,10 @@
#include "percpu-internal.h"
-/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
+/*
+ * The slots are sorted by the size of the biggest contiguous free area.
+ * 1-31 bytes share the same slot.
+ */
#define PCPU_SLOT_BASE_SHIFT 5
/* chunks in slots below this are subject to being sidelined on failed alloc */
#define PCPU_SLOT_FAIL_THRESHOLD 3
@@ -131,6 +134,9 @@ static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
+static int pcpu_free_slot __ro_after_init;
+int pcpu_sidelined_slot __ro_after_init;
+int pcpu_to_depopulate_slot __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;
/* cpus with the lowest and highest unit addresses */
@@ -139,7 +145,6 @@ static unsigned int pcpu_high_unit_cpu __ro_after_init;
/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __ro_after_init;
-EXPORT_SYMBOL_GPL(pcpu_base_addr);
static const int *pcpu_unit_map __ro_after_init; /* cpu -> unit */
const unsigned long *pcpu_unit_offsets __ro_after_init; /* cpu -> unit offset */
@@ -168,12 +173,9 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext
struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */
-/* chunks which need their map areas extended, protected by pcpu_lock */
-static LIST_HEAD(pcpu_map_extend_chunks);
-
/*
- * The number of empty populated pages, protected by pcpu_lock. The
- * reserved chunk doesn't contribute to the count.
+ * The number of empty populated pages, protected by pcpu_lock.
+ * The reserved chunk doesn't contribute to the count.
*/
int pcpu_nr_empty_pop_pages;
@@ -233,7 +235,7 @@ static int __pcpu_size_to_slot(int size)
static int pcpu_size_to_slot(int size)
{
if (size == pcpu_unit_size)
- return pcpu_nr_slots - 1;
+ return pcpu_free_slot;
return __pcpu_size_to_slot(size);
}
@@ -302,6 +304,25 @@ static unsigned long pcpu_block_off_to_off(int index, int off)
return index * PCPU_BITMAP_BLOCK_BITS + off;
}
+/**
+ * pcpu_check_block_hint - check against the contig hint
+ * @block: block of interest
+ * @bits: size of allocation
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Check to see if the allocation can fit in the block's contig hint.
+ * Note, a chunk uses the same hints as a block so this can also check against
+ * the chunk's contig hint.
+ */
+static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits,
+ size_t align)
+{
+ int bit_off = ALIGN(block->contig_hint_start, align) -
+ block->contig_hint_start;
+
+ return bit_off + bits <= block->contig_hint;
+}
+
/*
* pcpu_next_hint - determine which hint to use
* @block: block of interest
@@ -506,13 +527,10 @@ static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
bool move_front)
{
if (chunk != pcpu_reserved_chunk) {
- struct list_head *pcpu_slot;
-
- pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
if (move_front)
- list_move(&chunk->list, &pcpu_slot[slot]);
+ list_move(&chunk->list, &pcpu_chunk_lists[slot]);
else
- list_move_tail(&chunk->list, &pcpu_slot[slot]);
+ list_move_tail(&chunk->list, &pcpu_chunk_lists[slot]);
}
}
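
pcpu_check_block_hint(), added a couple of hunks above, avoids a full scan by charging the cost of aligning the hint's start against the hint's length. A small sketch of the same arithmetic over made-up hint values (bits here are allocation-map bits, each covering PCPU_MIN_ALLOC_SIZE bytes):

#include <stdbool.h>
#include <stdio.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))	/* a is a power of two */

/*
 * Mirror of the pcpu_check_block_hint() logic: can an allocation of @bits
 * bits at alignment @align possibly fit inside the contiguous free hint?
 */
static bool check_hint(unsigned int contig_hint_start, unsigned int contig_hint,
		       unsigned int bits, unsigned int align)
{
	unsigned int bit_off = ALIGN_UP(contig_hint_start, align) - contig_hint_start;

	return bit_off + bits <= contig_hint;
}

int main(void)
{
	/* Hint: 40 free bits starting at bit 13. */
	printf("%d\n", check_hint(13, 40, 32, 1));	/* 1: fits unaligned */
	printf("%d\n", check_hint(13, 40, 32, 8));	/* 1: 3 bits lost to alignment, 35 left */
	printf("%d\n", check_hint(13, 40, 38, 8));	/* 0: alignment cost makes it miss */
	return 0;
}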
@@ -538,10 +556,36 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
int nslot = pcpu_chunk_slot(chunk);
+ /* leave isolated chunks in-place */
+ if (chunk->isolated)
+ return;
+
if (oslot != nslot)
__pcpu_chunk_move(chunk, nslot, oslot < nslot);
}
+static void pcpu_isolate_chunk(struct pcpu_chunk *chunk)
+{
+ lockdep_assert_held(&pcpu_lock);
+
+ if (!chunk->isolated) {
+ chunk->isolated = true;
+ pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages;
+ }
+ list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]);
+}
+
+static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk)
+{
+ lockdep_assert_held(&pcpu_lock);
+
+ if (chunk->isolated) {
+ chunk->isolated = false;
+ pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages;
+ pcpu_chunk_relocate(chunk, -1);
+ }
+}
+
/*
* pcpu_update_empty_pages - update empty page counters
* @chunk: chunk of interest
@@ -554,7 +598,7 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
chunk->nr_empty_pop_pages += nr;
- if (chunk != pcpu_reserved_chunk)
+ if (chunk != pcpu_reserved_chunk && !chunk->isolated)
pcpu_nr_empty_pop_pages += nr;
}
@@ -731,7 +775,7 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{
struct pcpu_block_md *block = chunk->md_blocks + index;
unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
- unsigned int rs, re, start; /* region start, region end */
+ unsigned int start, end; /* region start, region end */
/* promote scan_hint to contig_hint */
if (block->scan_hint) {
@@ -747,9 +791,8 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
block->right_free = 0;
/* iterate over free areas and update the contig hints */
- bitmap_for_each_clear_region(alloc_map, rs, re, start,
- PCPU_BITMAP_BLOCK_BITS)
- pcpu_block_update(block, rs, re);
+ for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS)
+ pcpu_block_update(block, start, end);
}
/**
@@ -787,13 +830,15 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
/*
* Update s_block.
- * block->first_free must be updated if the allocation takes its place.
- * If the allocation breaks the contig_hint, a scan is required to
- * restore this hint.
*/
if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
nr_empty_pages++;
+ /*
+ * block->first_free must be updated if the allocation takes its place.
+ * If the allocation breaks the contig_hint, a scan is required to
+ * restore this hint.
+ */
if (s_off == s_block->first_free)
s_block->first_free = find_next_zero_bit(
pcpu_index_alloc_map(chunk, s_index),
@@ -868,6 +913,12 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
}
}
+ /*
+ * If the allocation is not atomic, some blocks may not be
+ * populated with pages, yet we account them here. The number
+ * of pages will be added back with pcpu_chunk_populated()
+ * when populating pages.
+ */
if (nr_empty_pages)
pcpu_update_empty_pages(chunk, -nr_empty_pages);
@@ -1022,17 +1073,18 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
int *next_off)
{
- unsigned int page_start, page_end, rs, re;
+ unsigned int start, end;
- page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
- page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
+ start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
+ end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
- rs = page_start;
- bitmap_next_clear_region(chunk->populated, &rs, &re, page_end);
- if (rs >= page_end)
+ start = find_next_zero_bit(chunk->populated, end, start);
+ if (start >= end)
return true;
- *next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
+ end = find_next_bit(chunk->populated, end, start + 1);
+
+ *next_off = end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
return false;
}
@@ -1062,14 +1114,11 @@ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
int bit_off, bits, next_off;
/*
- * Check to see if the allocation can fit in the chunk's contig hint.
- * This is an optimization to prevent scanning by assuming if it
- * cannot fit in the global hint, there is memory pressure and creating
- * a new chunk would happen soon.
+ * This is an optimization to prevent scanning by assuming if the
+ * allocation cannot fit in the global hint, there is memory pressure
+ * and creating a new chunk would happen soon.
*/
- bit_off = ALIGN(chunk_md->contig_hint_start, align) -
- chunk_md->contig_hint_start;
- if (bit_off + alloc_bits > chunk_md->contig_hint)
+ if (!pcpu_check_block_hint(chunk_md, alloc_bits, align))
return -1;
bit_off = pcpu_next_hint(chunk_md, alloc_bits);
@@ -1297,7 +1346,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
int map_size)
{
struct pcpu_chunk *chunk;
- unsigned long aligned_addr, lcm_align;
+ unsigned long aligned_addr;
int start_offset, offset_bits, region_size, region_bits;
size_t alloc_size;
@@ -1305,18 +1354,11 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
aligned_addr = tmp_addr & PAGE_MASK;
start_offset = tmp_addr - aligned_addr;
-
- /*
- * Align the end of the region with the LCM of PAGE_SIZE and
- * PCPU_BITMAP_BLOCK_SIZE. One of these constants is a multiple of
- * the other.
- */
- lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
- region_size = ALIGN(start_offset + map_size, lcm_align);
+ region_size = ALIGN(start_offset + map_size, PAGE_SIZE);
/* allocate chunk */
- alloc_size = sizeof(struct pcpu_chunk) +
- BITS_TO_LONGS(region_size >> PAGE_SHIFT) * sizeof(unsigned long);
+ alloc_size = struct_size(chunk, populated,
+ BITS_TO_LONGS(region_size >> PAGE_SHIFT));
chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
if (!chunk)
panic("%s: Failed to allocate %zu bytes\n", __func__,
@@ -1351,7 +1393,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
alloc_size);
#ifdef CONFIG_MEMCG_KMEM
- /* first chunk isn't memcg-aware */
+ /* first chunk is free to use */
chunk->obj_cgroups = NULL;
#endif
pcpu_init_md_blocks(chunk);
@@ -1393,7 +1435,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
return chunk;
}
-static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
+static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
{
struct pcpu_chunk *chunk;
int region_bits;
@@ -1422,7 +1464,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
goto md_blocks_fail;
#ifdef CONFIG_MEMCG_KMEM
- if (pcpu_is_memcg_chunk(type)) {
+ if (!mem_cgroup_kmem_disabled()) {
chunk->obj_cgroups =
pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
sizeof(struct obj_cgroup *), gfp);
@@ -1474,9 +1516,6 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
* Pages in [@page_start,@page_end) have been populated to @chunk. Update
* the bookkeeping information accordingly. Must be called after each
* successful population.
- *
- * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it
- * is to serve an allocation in that area.
*/
static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
int page_end)
@@ -1526,6 +1565,7 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
*
* pcpu_populate_chunk - populate the specified range of a chunk
* pcpu_depopulate_chunk - depopulate the specified range of a chunk
+ * pcpu_post_unmap_tlb_flush - flush tlb for the specified range of a chunk
* pcpu_create_chunk - create a new chunk
* pcpu_destroy_chunk - destroy a chunk, always preceded by full depop
* pcpu_addr_to_page - translate address to physical address
@@ -1535,8 +1575,9 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
int page_start, int page_end, gfp_t gfp);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
int page_start, int page_end);
-static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
- gfp_t gfp);
+static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end);
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
@@ -1579,26 +1620,25 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
}
#ifdef CONFIG_MEMCG_KMEM
-static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
- struct obj_cgroup **objcgp)
+static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
+ struct obj_cgroup **objcgp)
{
struct obj_cgroup *objcg;
- if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT) ||
- memcg_kmem_bypass())
- return PCPU_CHUNK_ROOT;
+ if (!memcg_kmem_online() || !(gfp & __GFP_ACCOUNT))
+ return true;
objcg = get_obj_cgroup_from_current();
if (!objcg)
- return PCPU_CHUNK_ROOT;
+ return true;
- if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) {
+ if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) {
obj_cgroup_put(objcg);
- return PCPU_FAIL_ALLOC;
+ return false;
}
*objcgp = objcg;
- return PCPU_CHUNK_MEMCG;
+ return true;
}
static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
@@ -1608,15 +1648,15 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
if (!objcg)
return;
- if (chunk) {
+ if (likely(chunk && chunk->obj_cgroups)) {
chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
rcu_read_lock();
mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
- size * num_possible_cpus());
+ pcpu_obj_full_size(size));
rcu_read_unlock();
} else {
- obj_cgroup_uncharge(objcg, size * num_possible_cpus());
+ obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
obj_cgroup_put(objcg);
}
}
@@ -1625,27 +1665,29 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
struct obj_cgroup *objcg;
- if (!pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)))
+ if (unlikely(!chunk->obj_cgroups))
return;
objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT];
+ if (!objcg)
+ return;
chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
- obj_cgroup_uncharge(objcg, size * num_possible_cpus());
+ obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
rcu_read_lock();
mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
- -(size * num_possible_cpus()));
+ -pcpu_obj_full_size(size));
rcu_read_unlock();
obj_cgroup_put(objcg);
}
#else /* CONFIG_MEMCG_KMEM */
-static enum pcpu_chunk_type
+static bool
pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
{
- return PCPU_CHUNK_ROOT;
+ return true;
}
static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
@@ -1680,8 +1722,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
gfp_t pcpu_gfp;
bool is_atomic;
bool do_warn;
- enum pcpu_chunk_type type;
- struct list_head *pcpu_slot;
struct obj_cgroup *objcg = NULL;
static int warn_limit = 10;
struct pcpu_chunk *chunk, *next;
@@ -1717,10 +1757,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
return NULL;
}
- type = pcpu_memcg_pre_alloc_hook(size, gfp, &objcg);
- if (unlikely(type == PCPU_FAIL_ALLOC))
+ if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg)))
return NULL;
- pcpu_slot = pcpu_chunk_list(type);
if (!is_atomic) {
/*
@@ -1758,8 +1796,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
restart:
/* search through normal chunks */
- for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
- list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) {
+ for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) {
+ list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot],
+ list) {
off = pcpu_find_block_fit(chunk, bits, bit_align,
is_atomic);
if (off < 0) {
@@ -1769,26 +1808,23 @@ restart:
}
off = pcpu_alloc_area(chunk, bits, bit_align, off);
- if (off >= 0)
+ if (off >= 0) {
+ pcpu_reintegrate_chunk(chunk);
goto area_found;
-
+ }
}
}
spin_unlock_irqrestore(&pcpu_lock, flags);
- /*
- * No space left. Create a new chunk. We don't want multiple
- * tasks to create chunks simultaneously. Serialize and create iff
- * there's still no empty chunk after grabbing the mutex.
- */
if (is_atomic) {
err = "atomic alloc failed, no space left";
goto fail;
}
- if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
- chunk = pcpu_create_chunk(type, pcpu_gfp);
+ /* No space left. Create a new chunk. */
+ if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) {
+ chunk = pcpu_create_chunk(pcpu_gfp);
if (!chunk) {
err = "failed to allocate new chunk";
goto fail;
@@ -1808,13 +1844,12 @@ area_found:
/* populate if not all pages are already there */
if (!is_atomic) {
- unsigned int page_start, page_end, rs, re;
+ unsigned int page_end, rs, re;
- page_start = PFN_DOWN(off);
+ rs = PFN_DOWN(off);
page_end = PFN_UP(off + size);
- bitmap_for_each_clear_region(chunk->populated, rs, re,
- page_start, page_end) {
+ for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) {
WARN_ON(chunk->immutable);
ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
@@ -1842,8 +1877,9 @@ area_found:
ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
kmemleak_alloc_percpu(ptr, size, gfp);
- trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
- chunk->base_addr, off, ptr);
+ trace_percpu_alloc_percpu(_RET_IP_, reserved, is_atomic, size, align,
+ chunk->base_addr, off, ptr,
+ pcpu_obj_full_size(size), gfp);
pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);
@@ -1862,7 +1898,7 @@ fail:
pr_info("limit reached, disable warning\n");
}
if (is_atomic) {
- /* see the flag handling in pcpu_blance_workfn() */
+ /* see the flag handling in pcpu_balance_workfn() */
pcpu_atomic_alloc_failed = true;
pcpu_schedule_balance_work();
} else {
@@ -1930,33 +1966,28 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
}
/**
- * __pcpu_balance_workfn - manage the amount of free chunks and populated pages
- * @type: chunk type
+ * pcpu_balance_free - manage the amount of free chunks
+ * @empty_only: free chunks only if there are no populated pages
*
- * Reclaim all fully free chunks except for the first one. This is also
- * responsible for maintaining the pool of empty populated pages. However,
- * it is possible that this is called when physical memory is scarce causing
- * OOM killer to be triggered. We should avoid doing so until an actual
- * allocation causes the failure as it is possible that requests can be
- * serviced from already backed regions.
+ * If empty_only is %false, reclaim all fully free chunks regardless of the
+ * number of populated pages. Otherwise, only reclaim chunks that have no
+ * populated pages.
+ *
+ * CONTEXT:
+ * pcpu_lock (can be dropped temporarily)
*/
-static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
+static void pcpu_balance_free(bool empty_only)
{
- /* gfp flags passed to underlying allocators */
- const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
LIST_HEAD(to_free);
- struct list_head *pcpu_slot = pcpu_chunk_list(type);
- struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
+ struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot];
struct pcpu_chunk *chunk, *next;
- int slot, nr_to_pop, ret;
+
+ lockdep_assert_held(&pcpu_lock);
/*
* There's no reason to keep around multiple unused chunks and VM
* areas can be scarce. Destroy all free chunks except for one.
*/
- mutex_lock(&pcpu_alloc_mutex);
- spin_lock_irq(&pcpu_lock);
-
list_for_each_entry_safe(chunk, next, free_head, list) {
WARN_ON(chunk->immutable);
@@ -1964,16 +1995,18 @@ static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
continue;
- list_move(&chunk->list, &to_free);
+ if (!empty_only || chunk->nr_empty_pop_pages == 0)
+ list_move(&chunk->list, &to_free);
}
- spin_unlock_irq(&pcpu_lock);
+ if (list_empty(&to_free))
+ return;
+ spin_unlock_irq(&pcpu_lock);
list_for_each_entry_safe(chunk, next, &to_free, list) {
unsigned int rs, re;
- bitmap_for_each_set_region(chunk->populated, rs, re, 0,
- chunk->nr_pages) {
+ for_each_set_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
pcpu_depopulate_chunk(chunk, rs, re);
spin_lock_irq(&pcpu_lock);
pcpu_chunk_depopulated(chunk, rs, re);
@@ -1982,6 +2015,29 @@ static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
pcpu_destroy_chunk(chunk);
cond_resched();
}
+ spin_lock_irq(&pcpu_lock);
+}
+
+/**
+ * pcpu_balance_populated - manage the amount of populated pages
+ *
+ * Maintain a certain amount of populated pages to satisfy atomic allocations.
+ * It is possible that this is called when physical memory is scarce causing
+ * OOM killer to be triggered. We should avoid doing so until an actual
+ * allocation causes the failure as it is possible that requests can be
+ * serviced from already backed regions.
+ *
+ * CONTEXT:
+ * pcpu_lock (can be dropped temporarily)
+ */
+static void pcpu_balance_populated(void)
+{
+ /* gfp flags passed to underlying allocators */
+ const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
+ struct pcpu_chunk *chunk;
+ int slot, nr_to_pop, ret;
+
+ lockdep_assert_held(&pcpu_lock);
/*
* Ensure there are certain number of free populated pages for
@@ -2004,34 +2060,32 @@ retry_pop:
0, PCPU_EMPTY_POP_PAGES_HIGH);
}
- for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
+ for (slot = pcpu_size_to_slot(PAGE_SIZE); slot <= pcpu_free_slot; slot++) {
unsigned int nr_unpop = 0, rs, re;
if (!nr_to_pop)
break;
- spin_lock_irq(&pcpu_lock);
- list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+ list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) {
nr_unpop = chunk->nr_pages - chunk->nr_populated;
if (nr_unpop)
break;
}
- spin_unlock_irq(&pcpu_lock);
if (!nr_unpop)
continue;
/* @chunk can't go away while pcpu_alloc_mutex is held */
- bitmap_for_each_clear_region(chunk->populated, rs, re, 0,
- chunk->nr_pages) {
+ for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
int nr = min_t(int, re - rs, nr_to_pop);
+ spin_unlock_irq(&pcpu_lock);
ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
+ cond_resched();
+ spin_lock_irq(&pcpu_lock);
if (!ret) {
nr_to_pop -= nr;
- spin_lock_irq(&pcpu_lock);
pcpu_chunk_populated(chunk, rs, rs + nr);
- spin_unlock_irq(&pcpu_lock);
} else {
nr_to_pop = 0;
}
@@ -2043,30 +2097,149 @@ retry_pop:
if (nr_to_pop) {
/* ran out of chunks to populate, create a new one and retry */
- chunk = pcpu_create_chunk(type, gfp);
+ spin_unlock_irq(&pcpu_lock);
+ chunk = pcpu_create_chunk(gfp);
+ cond_resched();
+ spin_lock_irq(&pcpu_lock);
if (chunk) {
- spin_lock_irq(&pcpu_lock);
pcpu_chunk_relocate(chunk, -1);
- spin_unlock_irq(&pcpu_lock);
goto retry_pop;
}
}
+}
- mutex_unlock(&pcpu_alloc_mutex);
+/**
+ * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages
+ *
+ * Scan over chunks in the depopulate list and try to release unused populated
+ * pages back to the system. Depopulated chunks are sidelined to prevent
+ * repopulating these pages unless required. Fully free chunks are reintegrated
+ * and freed accordingly (1 is kept around). If we drop below the empty
+ * populated pages threshold, reintegrate the chunk if it has empty free pages.
+ * Each chunk is scanned in the reverse order to keep populated pages close to
+ * the beginning of the chunk.
+ *
+ * CONTEXT:
+ * pcpu_lock (can be dropped temporarily)
+ */
+static void pcpu_reclaim_populated(void)
+{
+ struct pcpu_chunk *chunk;
+ struct pcpu_block_md *block;
+ int freed_page_start, freed_page_end;
+ int i, end;
+ bool reintegrate;
+
+ lockdep_assert_held(&pcpu_lock);
+
+ /*
+ * Once a chunk is isolated to the to_depopulate list, the chunk is no
+ * longer discoverable to allocations which may populate pages. The only
+ * other accessor is the free path, which only returns the area back to the
+ * allocator without touching the populated bitmap.
+ */
+ while ((chunk = list_first_entry_or_null(
+ &pcpu_chunk_lists[pcpu_to_depopulate_slot],
+ struct pcpu_chunk, list))) {
+ WARN_ON(chunk->immutable);
+
+ /*
+ * Scan chunk's pages in the reverse order to keep populated
+ * pages close to the beginning of the chunk.
+ */
+ freed_page_start = chunk->nr_pages;
+ freed_page_end = 0;
+ reintegrate = false;
+ for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
+ /* no more work to do */
+ if (chunk->nr_empty_pop_pages == 0)
+ break;
+
+ /* reintegrate chunk to prevent atomic alloc failures */
+ if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
+ reintegrate = true;
+ break;
+ }
+
+ /*
+ * If the page is empty and populated, start or
+ * extend the (i, end) range. If i == 0, decrease
+ * i and perform the depopulation to cover the last
+ * (first) page in the chunk.
+ */
+ block = chunk->md_blocks + i;
+ if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
+ test_bit(i, chunk->populated)) {
+ if (end == -1)
+ end = i;
+ if (i > 0)
+ continue;
+ i--;
+ }
+
+ /* depopulate if there is an active range */
+ if (end == -1)
+ continue;
+
+ spin_unlock_irq(&pcpu_lock);
+ pcpu_depopulate_chunk(chunk, i + 1, end + 1);
+ cond_resched();
+ spin_lock_irq(&pcpu_lock);
+
+ pcpu_chunk_depopulated(chunk, i + 1, end + 1);
+ freed_page_start = min(freed_page_start, i + 1);
+ freed_page_end = max(freed_page_end, end + 1);
+
+ /* reset the range and continue */
+ end = -1;
+ }
+
+ /* batch tlb flush per chunk to amortize cost */
+ if (freed_page_start < freed_page_end) {
+ spin_unlock_irq(&pcpu_lock);
+ pcpu_post_unmap_tlb_flush(chunk,
+ freed_page_start,
+ freed_page_end);
+ cond_resched();
+ spin_lock_irq(&pcpu_lock);
+ }
+
+ if (reintegrate || chunk->free_bytes == pcpu_unit_size)
+ pcpu_reintegrate_chunk(chunk);
+ else
+ list_move_tail(&chunk->list,
+ &pcpu_chunk_lists[pcpu_sidelined_slot]);
+ }
}
/**
* pcpu_balance_workfn - manage the amount of free chunks and populated pages
* @work: unused
*
- * Call __pcpu_balance_workfn() for each chunk type.
+ * Manage the number of fully free chunks and the number of populated pages.
+ * An important thing to consider is when pages are freed and how they
+ * contribute to the global counts.
*/
static void pcpu_balance_workfn(struct work_struct *work)
{
- enum pcpu_chunk_type type;
+ /*
+ * pcpu_balance_free() is called twice because the first time we may
+ * trim pages in the active pcpu_nr_empty_pop_pages which may cause us
+ * to grow other chunks. This then gives pcpu_reclaim_populated() time
+ * to move fully free chunks to the active list to be freed if
+ * appropriate.
+ */
+ mutex_lock(&pcpu_alloc_mutex);
+ spin_lock_irq(&pcpu_lock);
- for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
- __pcpu_balance_workfn(type);
+ pcpu_balance_free(false);
+ pcpu_reclaim_populated();
+ pcpu_balance_populated();
+ pcpu_balance_free(true);
+
+ spin_unlock_irq(&pcpu_lock);
+ mutex_unlock(&pcpu_alloc_mutex);
}
/**
@@ -2085,7 +2258,6 @@ void free_percpu(void __percpu *ptr)
unsigned long flags;
int size, off;
bool need_balance = false;
- struct list_head *pcpu_slot;
if (!ptr)
return;
@@ -2101,19 +2273,24 @@ void free_percpu(void __percpu *ptr)
size = pcpu_free_area(chunk, off);
- pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
-
pcpu_memcg_free_hook(chunk, off, size);
- /* if there are more than one fully free chunks, wake up grim reaper */
- if (chunk->free_bytes == pcpu_unit_size) {
+ /*
+ * If there is more than one fully free chunk, wake up the grim reaper.
+ * If the chunk is isolated, it may be in the process of being
+ * reclaimed. Let reclaim manage cleaning up of that chunk.
+ */
+ if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) {
struct pcpu_chunk *pos;
- list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
+ list_for_each_entry(pos, &pcpu_chunk_lists[pcpu_free_slot], list)
if (pos != chunk) {
need_balance = true;
break;
}
+ } else if (pcpu_should_reclaim_chunk(chunk)) {
+ pcpu_isolate_chunk(chunk);
+ need_balance = true;
}
trace_percpu_free_percpu(chunk->base_addr, off, ptr);
@@ -2285,7 +2462,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
*/
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
- memblock_free_early(__pa(ai), ai->__ai_size);
+ memblock_free(ai, ai->__ai_size);
}
/**
@@ -2414,7 +2591,6 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
int map_size;
unsigned long tmp_addr;
size_t alloc_size;
- enum pcpu_chunk_type type;
#define PCPU_SETUP_BUG_ON(cond) do { \
if (unlikely(cond)) { \
@@ -2522,28 +2698,30 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
pcpu_atom_size = ai->atom_size;
- pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
- BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
+ pcpu_chunk_struct_size = struct_size(chunk, populated,
+ BITS_TO_LONGS(pcpu_unit_pages));
pcpu_stats_save_ai(ai);
/*
- * Allocate chunk slots. The additional last slot is for
- * empty chunks.
+ * Allocate chunk slots. The slots after the active slots are:
+ * sidelined_slot - isolated, depopulated chunks
+ * free_slot - fully free chunks
+ * to_depopulate_slot - isolated, chunks to depopulate
*/
- pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
+ pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
+ pcpu_free_slot = pcpu_sidelined_slot + 1;
+ pcpu_to_depopulate_slot = pcpu_free_slot + 1;
+ pcpu_nr_slots = pcpu_to_depopulate_slot + 1;
pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
- sizeof(pcpu_chunk_lists[0]) *
- PCPU_NR_CHUNK_TYPES,
+ sizeof(pcpu_chunk_lists[0]),
SMP_CACHE_BYTES);
if (!pcpu_chunk_lists)
panic("%s: Failed to allocate %zu bytes\n", __func__,
- pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]) *
- PCPU_NR_CHUNK_TYPES);
+ pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]));
- for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
- for (i = 0; i < pcpu_nr_slots; i++)
- INIT_LIST_HEAD(&pcpu_chunk_list(type)[i]);
+ for (i = 0; i < pcpu_nr_slots; i++)
+ INIT_LIST_HEAD(&pcpu_chunk_lists[i]);
/*
* The end of the static region needs to be aligned with the
@@ -2663,13 +2841,14 @@ early_param("percpu_alloc", percpu_alloc_setup);
* On success, pointer to the new allocation_info is returned. On
* failure, ERR_PTR value is returned.
*/
-static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
+static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
size_t reserved_size, size_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
static int group_map[NR_CPUS] __initdata;
static int group_cnt[NR_CPUS] __initdata;
+ static struct cpumask mask __initdata;
const size_t static_size = __per_cpu_end - __per_cpu_start;
int nr_groups = 1, nr_units = 0;
size_t size_sum, min_unit_size, alloc_size;
@@ -2682,6 +2861,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
/* this function may be called multiple times */
memset(group_map, 0, sizeof(group_map));
memset(group_cnt, 0, sizeof(group_cnt));
+ cpumask_clear(&mask);
/* calculate size_sum and ensure dyn_size is enough for early alloc */
size_sum = PFN_ALIGN(static_size + reserved_size +
@@ -2703,24 +2883,27 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
upa--;
max_upa = upa;
+ cpumask_copy(&mask, cpu_possible_mask);
+
/* group cpus according to their proximity */
- for_each_possible_cpu(cpu) {
- group = 0;
- next_group:
- for_each_possible_cpu(tcpu) {
- if (cpu == tcpu)
- break;
- if (group_map[tcpu] == group && cpu_distance_fn &&
- (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
- cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
- group++;
- nr_groups = max(nr_groups, group + 1);
- goto next_group;
- }
- }
+ for (group = 0; !cpumask_empty(&mask); group++) {
+ /* pop the group's first cpu */
+ cpu = cpumask_first(&mask);
group_map[cpu] = group;
group_cnt[group]++;
+ cpumask_clear_cpu(cpu, &mask);
+
+ for_each_cpu(tcpu, &mask) {
+ if (!cpu_distance_fn ||
+ (cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE &&
+ cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) {
+ group_map[tcpu] = group;
+ group_cnt[group]++;
+ cpumask_clear_cpu(tcpu, &mask);
+ }
+ }
}
+ nr_groups = group;
/*
* Wasted space is caused by a ratio imbalance of upa to group_cnt.
@@ -2728,6 +2911,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
* Related to atom_size, which could be much larger than the unit_size.
*/
last_allocs = INT_MAX;
+ best_upa = 0;
for (upa = max_upa; upa; upa--) {
int allocs = 0, wasted = 0;
@@ -2754,6 +2938,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
last_allocs = allocs;
best_upa = upa;
}
+ BUG_ON(!best_upa);
upa = best_upa;
/* allocate and fill alloc_info */
@@ -2797,6 +2982,42 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
return ai;
}
+
+static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align,
+ pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
+{
+ const unsigned long goal = __pa(MAX_DMA_ADDRESS);
+#ifdef CONFIG_NUMA
+ int node = NUMA_NO_NODE;
+ void *ptr;
+
+ if (cpu_to_nd_fn)
+ node = cpu_to_nd_fn(cpu);
+
+ if (node == NUMA_NO_NODE || !node_online(node) || !NODE_DATA(node)) {
+ ptr = memblock_alloc_from(size, align, goal);
+ pr_info("cpu %d has no node %d or node-local memory\n",
+ cpu, node);
+ pr_debug("per cpu data for cpu%d %zu bytes at 0x%llx\n",
+ cpu, size, (u64)__pa(ptr));
+ } else {
+ ptr = memblock_alloc_try_nid(size, align, goal,
+ MEMBLOCK_ALLOC_ACCESSIBLE,
+ node);
+
+ pr_debug("per cpu data for cpu%d %zu bytes on node%d at 0x%llx\n",
+ cpu, size, node, (u64)__pa(ptr));
+ }
+ return ptr;
+#else
+ return memblock_alloc_from(size, align, goal);
+#endif
+}
+
+static void __init pcpu_fc_free(void *ptr, size_t size)
+{
+ memblock_free(ptr, size);
+}
#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
#if defined(BUILD_EMBED_FIRST_CHUNK)
@@ -2806,14 +3027,13 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
* @dyn_size: minimum free size for dynamic allocation in bytes
* @atom_size: allocation atom size
* @cpu_distance_fn: callback to determine distance between cpus, optional
- * @alloc_fn: function to allocate percpu page
- * @free_fn: function to free percpu page
+ * @cpu_to_nd_fn: callback to convert cpu to its node, optional
*
* This is a helper to ease setting up embedded first percpu chunk and
* can be called where pcpu_setup_first_chunk() is expected.
*
* If this function is used to setup the first chunk, it is allocated
- * by calling @alloc_fn and used as-is without being mapped into
+ * by calling pcpu_fc_alloc and used as-is without being mapped into
* vmalloc area. Allocations are always whole multiples of @atom_size
* aligned to @atom_size.
*
@@ -2827,7 +3047,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
* @dyn_size specifies the minimum dynamic area size.
*
* If the needed size is smaller than the minimum or specified unit
- * size, the leftover is returned using @free_fn.
+ * size, the leftover is returned using pcpu_fc_free.
*
* RETURNS:
* 0 on success, -errno on failure.
@@ -2835,8 +3055,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
- pcpu_fc_alloc_fn_t alloc_fn,
- pcpu_fc_free_fn_t free_fn)
+ pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
void *base = (void *)ULONG_MAX;
void **areas = NULL;
@@ -2871,13 +3090,13 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
BUG_ON(cpu == NR_CPUS);
/* allocate space for the whole group */
- ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
+ ptr = pcpu_fc_alloc(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn);
if (!ptr) {
rc = -ENOMEM;
goto out_free_areas;
}
/* kmemleak tracks the percpu allocations separately */
- kmemleak_free(ptr);
+ kmemleak_ignore_phys(__pa(ptr));
areas[group] = ptr;
base = min(ptr, base);
@@ -2910,12 +3129,12 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
if (gi->cpu_map[i] == NR_CPUS) {
/* unused unit, free whole */
- free_fn(ptr, ai->unit_size);
+ pcpu_fc_free(ptr, ai->unit_size);
continue;
}
/* copy and return the unused part */
memcpy(ptr, __per_cpu_load, ai->static_size);
- free_fn(ptr + size_sum, ai->unit_size - size_sum);
+ pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum);
}
}
@@ -2934,23 +3153,90 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
out_free_areas:
for (group = 0; group < ai->nr_groups; group++)
if (areas[group])
- free_fn(areas[group],
+ pcpu_fc_free(areas[group],
ai->groups[group].nr_units * ai->unit_size);
out_free:
pcpu_free_alloc_info(ai);
if (areas)
- memblock_free_early(__pa(areas), areas_size);
+ memblock_free(areas, areas_size);
return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */
#ifdef BUILD_PAGE_FIRST_CHUNK
+#include <asm/pgalloc.h>
+
+#ifndef P4D_TABLE_SIZE
+#define P4D_TABLE_SIZE PAGE_SIZE
+#endif
+
+#ifndef PUD_TABLE_SIZE
+#define PUD_TABLE_SIZE PAGE_SIZE
+#endif
+
+#ifndef PMD_TABLE_SIZE
+#define PMD_TABLE_SIZE PAGE_SIZE
+#endif
+
+#ifndef PTE_TABLE_SIZE
+#define PTE_TABLE_SIZE PAGE_SIZE
+#endif
+void __init __weak pcpu_populate_pte(unsigned long addr)
+{
+ pgd_t *pgd = pgd_offset_k(addr);
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ if (pgd_none(*pgd)) {
+ p4d_t *new;
+
+ new = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
+ if (!new)
+ goto err_alloc;
+ pgd_populate(&init_mm, pgd, new);
+ }
+
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d)) {
+ pud_t *new;
+
+ new = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
+ if (!new)
+ goto err_alloc;
+ p4d_populate(&init_mm, p4d, new);
+ }
+
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud)) {
+ pmd_t *new;
+
+ new = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
+ if (!new)
+ goto err_alloc;
+ pud_populate(&init_mm, pud, new);
+ }
+
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd)) {
+ pte_t *new;
+
+ new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
+ if (!new)
+ goto err_alloc;
+ pmd_populate_kernel(&init_mm, pmd, new);
+ }
+
+ return;
+
+err_alloc:
+ panic("%s: Failed to allocate memory\n", __func__);
+}
+
/**
* pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
* @reserved_size: the size of reserved percpu area in bytes
- * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
- * @free_fn: function to free percpu page, always called with PAGE_SIZE
- * @populate_pte_fn: function to populate pte
+ * @cpu_to_nd_fn: callback to convert cpu to its node, optional
*
* This is a helper to ease setting up page-remapped first percpu
* chunk and can be called where pcpu_setup_first_chunk() is expected.
@@ -2961,10 +3247,7 @@ out_free:
* RETURNS:
* 0 on success, -errno on failure.
*/
-int __init pcpu_page_first_chunk(size_t reserved_size,
- pcpu_fc_alloc_fn_t alloc_fn,
- pcpu_fc_free_fn_t free_fn,
- pcpu_fc_populate_pte_fn_t populate_pte_fn)
+int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
static struct vm_struct vm;
struct pcpu_alloc_info *ai;
@@ -3006,14 +3289,14 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
for (i = 0; i < unit_pages; i++) {
void *ptr;
- ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
+ ptr = pcpu_fc_alloc(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn);
if (!ptr) {
pr_warn("failed to allocate %s page for cpu%u\n",
psize_str, cpu);
goto enomem;
}
/* kmemleak tracks the percpu allocations separately */
- kmemleak_free(ptr);
+ kmemleak_ignore_phys(__pa(ptr));
pages[j++] = virt_to_page(ptr);
}
}
@@ -3028,7 +3311,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
(unsigned long)vm.addr + unit * ai->unit_size;
for (i = 0; i < unit_pages; i++)
- populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
+ pcpu_populate_pte(unit_addr + (i << PAGE_SHIFT));
/* pte already populated, the following shouldn't fail */
rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
@@ -3058,10 +3341,10 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
enomem:
while (--j >= 0)
- free_fn(page_address(pages[j]), PAGE_SIZE);
+ pcpu_fc_free(page_address(pages[j]), PAGE_SIZE);
rc = -ENOMEM;
out_free_ar:
- memblock_free_early(__pa(pages), pages_size);
+ memblock_free(pages, pages_size);
pcpu_free_alloc_info(ai);
return rc;
}
@@ -3083,17 +3366,6 @@ out_free_ar:
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
-static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
- size_t align)
-{
- return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS));
-}
-
-static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
-{
- memblock_free_early(__pa(ptr), size);
-}
-
void __init setup_per_cpu_areas(void)
{
unsigned long delta;
@@ -3104,9 +3376,8 @@ void __init setup_per_cpu_areas(void)
* Always reserve area for module percpu variables. That's
* what the legacy allocator did.
*/
- rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
- PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
- pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
+ rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE,
+ PAGE_SIZE, NULL, NULL);
if (rc < 0)
panic("Failed to initialize percpu areas.");
@@ -3138,7 +3409,7 @@ void __init setup_per_cpu_areas(void)
if (!ai || !fc)
panic("Failed to allocate memory for percpu areas.");
/* kmemleak tracks the percpu allocations separately */
- kmemleak_free(fc);
+ kmemleak_ignore_phys(__pa(fc));
ai->dyn_size = unit_size;
ai->unit_size = unit_size;
diff --git a/mm/pgalloc-track.h b/mm/pgalloc-track.h
index 1dcc865029a2..e9e879de8649 100644
--- a/mm/pgalloc-track.h
+++ b/mm/pgalloc-track.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_PGALLLC_TRACK_H
-#define _LINUX_PGALLLC_TRACK_H
+#ifndef _LINUX_PGALLOC_TRACK_H
+#define _LINUX_PGALLOC_TRACK_H
#if defined(CONFIG_MMU)
static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd,
@@ -48,4 +48,4 @@ static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud,
(__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
NULL: pte_offset_kernel(pmd, address))
-#endif /* _LINUX_PGALLLC_TRACK_H */
+#endif /* _LINUX_PGALLOC_TRACK_H */
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 9578db83e312..4d454953046f 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -10,6 +10,9 @@
#include <linux/pagemap.h>
#include <linux/hugetlb.h>
#include <linux/pgtable.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/mm_inline.h>
#include <asm/tlb.h>
/*
@@ -65,10 +68,10 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty)
{
- int changed = !pte_same(*ptep, entry);
+ int changed = !pte_same(ptep_get(ptep), entry);
if (changed) {
set_pte_at(vma->vm_mm, address, ptep, entry);
- flush_tlb_fix_spurious_fault(vma, address);
+ flush_tlb_fix_spurious_fault(vma, address, ptep);
}
return changed;
}
@@ -135,8 +138,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
{
pmd_t pmd;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
- !pmd_devmap(*pmdp)) || !pmd_present(*pmdp));
+ VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
+ !pmd_devmap(*pmdp));
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
@@ -200,6 +203,14 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
}
#endif
+#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD
+pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ return pmdp_invalidate(vma, address, pmdp);
+}
+#endif
+
#ifndef pmdp_collapse_flush
pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
@@ -220,3 +231,57 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
+{
+ pmd_t pmdval;
+
+ /* rcu_read_lock() to be added later */
+ pmdval = pmdp_get_lockless(pmd);
+ if (pmdvalp)
+ *pmdvalp = pmdval;
+ if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
+ goto nomap;
+ if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval)))
+ goto nomap;
+ if (unlikely(pmd_bad(pmdval))) {
+ pmd_clear_bad(pmd);
+ goto nomap;
+ }
+ return __pte_map(&pmdval, addr);
+nomap:
+ /* rcu_read_unlock() to be added later */
+ return NULL;
+}
+
+pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, spinlock_t **ptlp)
+{
+ pmd_t pmdval;
+ pte_t *pte;
+
+ pte = __pte_offset_map(pmd, addr, &pmdval);
+ if (likely(pte))
+ *ptlp = pte_lockptr(mm, &pmdval);
+ return pte;
+}
+
+pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, spinlock_t **ptlp)
+{
+ spinlock_t *ptl;
+ pmd_t pmdval;
+ pte_t *pte;
+again:
+ pte = __pte_offset_map(pmd, addr, &pmdval);
+ if (unlikely(!pte))
+ return pte;
+ ptl = pte_lockptr(mm, &pmdval);
+ spin_lock(ptl);
+ if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
+ *ptlp = ptl;
+ return pte;
+ }
+ pte_unmap_unlock(pte, ptl);
+ goto again;
+}
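
The helpers above let pte_offset_map_lock() callers map and lock a page table while transparently retrying if the PMD changed underneath them; a NULL return means there is no page table to walk. A minimal caller-side sketch of the resulting pattern follows (count_present_ptes() is a hypothetical walker, assuming addr and end are page-aligned and within a single PMD; only pte_offset_map_lock(), ptep_get() and pte_unmap_unlock() are the real API being shown):

static unsigned long count_present_ptes(struct mm_struct *mm, pmd_t *pmd,
					unsigned long addr, unsigned long end)
{
	unsigned long present = 0;
	pte_t *start_pte, *pte;
	spinlock_t *ptl;

	/* Maps the PTE page and takes its lock; NULL if no page table here. */
	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	for (; addr < end; addr += PAGE_SIZE, pte++)
		if (!pte_none(ptep_get(pte)))
			present++;
	pte_unmap_unlock(start_pte, ptl);
	return present;
}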
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index fd12da80b6f2..0523edab03a6 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -5,6 +5,7 @@
* Copyright (C) 2010-2011 Christopher Yeoh <cyeoh@au1.ibm.com>, IBM Corp.
*/
+#include <linux/compat.h>
#include <linux/mm.h>
#include <linux/uio.h>
#include <linux/sched.h>
@@ -103,7 +104,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
mmap_read_lock(mm);
pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages,
flags, process_pages,
- NULL, &locked);
+ &locked);
if (locked)
mmap_read_unlock(mm);
if (pinned_pages <= 0)
@@ -259,10 +260,10 @@ static ssize_t process_vm_rw(pid_t pid,
struct iovec iovstack_l[UIO_FASTIOV];
struct iovec iovstack_r[UIO_FASTIOV];
struct iovec *iov_l = iovstack_l;
- struct iovec *iov_r = iovstack_r;
+ struct iovec *iov_r;
struct iov_iter iter;
ssize_t rc;
- int dir = vm_write ? WRITE : READ;
+ int dir = vm_write ? ITER_SOURCE : ITER_DEST;
if (flags != 0)
return -EINVAL;
@@ -273,7 +274,8 @@ static ssize_t process_vm_rw(pid_t pid,
return rc;
if (!iov_iter_count(&iter))
goto free_iov_l;
- iov_r = iovec_from_user(rvec, riovcnt, UIO_FASTIOV, iovstack_r, false);
+ iov_r = iovec_from_user(rvec, riovcnt, UIO_FASTIOV, iovstack_r,
+ in_compat_syscall());
if (IS_ERR(iov_r)) {
rc = PTR_ERR(iov_r);
goto free_iov_l;
diff --git a/mm/ptdump.c b/mm/ptdump.c
index ba88ec43ff21..03c1bdae4a43 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -4,7 +4,7 @@
#include <linux/ptdump.h>
#include <linux/kasan.h>
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
/*
* This is an optimization for KASAN=y case. Since all kasan page tables
* eventually point to the kasan_early_shadow_page we could call note_page()
@@ -31,7 +31,8 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
struct ptdump_state *st = walk->private;
pgd_t val = READ_ONCE(*pgd);
-#if CONFIG_PGTABLE_LEVELS > 4 && defined(CONFIG_KASAN)
+#if CONFIG_PGTABLE_LEVELS > 4 && \
+ (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
if (pgd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_p4d)))
return note_kasan_page_table(walk, addr);
#endif
@@ -39,8 +40,10 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
if (st->effective_prot)
st->effective_prot(st, 0, pgd_val(val));
- if (pgd_leaf(val))
+ if (pgd_leaf(val)) {
st->note_page(st, addr, 0, pgd_val(val));
+ walk->action = ACTION_CONTINUE;
+ }
return 0;
}
@@ -51,7 +54,8 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
struct ptdump_state *st = walk->private;
p4d_t val = READ_ONCE(*p4d);
-#if CONFIG_PGTABLE_LEVELS > 3 && defined(CONFIG_KASAN)
+#if CONFIG_PGTABLE_LEVELS > 3 && \
+ (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
if (p4d_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pud)))
return note_kasan_page_table(walk, addr);
#endif
@@ -59,8 +63,10 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
if (st->effective_prot)
st->effective_prot(st, 1, p4d_val(val));
- if (p4d_leaf(val))
+ if (p4d_leaf(val)) {
st->note_page(st, addr, 1, p4d_val(val));
+ walk->action = ACTION_CONTINUE;
+ }
return 0;
}
@@ -71,7 +77,8 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
struct ptdump_state *st = walk->private;
pud_t val = READ_ONCE(*pud);
-#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_KASAN)
+#if CONFIG_PGTABLE_LEVELS > 2 && \
+ (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
if (pud_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pmd)))
return note_kasan_page_table(walk, addr);
#endif
@@ -79,8 +86,10 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
if (st->effective_prot)
st->effective_prot(st, 2, pud_val(val));
- if (pud_leaf(val))
+ if (pud_leaf(val)) {
st->note_page(st, addr, 2, pud_val(val));
+ walk->action = ACTION_CONTINUE;
+ }
return 0;
}
@@ -91,15 +100,17 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
struct ptdump_state *st = walk->private;
pmd_t val = READ_ONCE(*pmd);
-#if defined(CONFIG_KASAN)
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
if (pmd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pte)))
return note_kasan_page_table(walk, addr);
#endif
if (st->effective_prot)
st->effective_prot(st, 3, pmd_val(val));
- if (pmd_leaf(val))
+ if (pmd_leaf(val)) {
st->note_page(st, addr, 3, pmd_val(val));
+ walk->action = ACTION_CONTINUE;
+ }
return 0;
}
@@ -108,7 +119,7 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
- pte_t val = READ_ONCE(*pte);
+ pte_t val = ptep_get_lockless(pte);
if (st->effective_prot)
st->effective_prot(st, 4, pte_val(val));
@@ -141,13 +152,13 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
{
const struct ptdump_range *range = st->range;
- mmap_read_lock(mm);
+ mmap_write_lock(mm);
while (range->start != range->end) {
walk_page_range_novma(mm, range->start, range->end,
&ptdump_ops, pgd, st);
range++;
}
- mmap_read_unlock(mm);
+ mmap_write_unlock(mm);
/* Flush out the last page */
st->note_page(st, 0, -1, 0);
diff --git a/mm/readahead.c b/mm/readahead.c
index 3c9a8dd7c56c..a9c999aa19af 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -8,15 +8,120 @@
* Initial version.
*/
+/**
+ * DOC: Readahead Overview
+ *
+ * Readahead is used to read content into the page cache before it is
+ * explicitly requested by the application. Readahead only ever
+ * attempts to read folios that are not yet in the page cache. If a
+ * folio is present but not up-to-date, readahead will not try to read
+ * it. In that case a simple ->read_folio() will be requested.
+ *
+ * Readahead is triggered when an application read request (whether a
+ * system call or a page fault) finds that the requested folio is not in
+ * the page cache, or that it is in the page cache and has the
+ * readahead flag set. This flag indicates that the folio was read
+ * as part of a previous readahead request and now that it has been
+ * accessed, it is time for the next readahead.
+ *
+ * Each readahead request is partly a synchronous read and partly async
+ * readahead. This is reflected in the struct file_ra_state which
+ * contains ->size being the total number of pages, and ->async_size
+ * which is the number of pages in the async section. The readahead
+ * flag will be set on the first folio in this async section to trigger
+ * a subsequent readahead. Once a series of sequential reads has been
+ * established, there should be no need for a synchronous component and
+ * all readahead requests will be fully asynchronous.
+ *
+ * When either of the triggers causes a readahead, three numbers need
+ * to be determined: the start of the region to read, the size of the
+ * region, and the size of the async tail.
+ *
+ * The start of the region is simply the first page address at or after
+ * the accessed address, which is not currently populated in the page
+ * cache. This is found with a simple search in the page cache.
+ *
+ * The size of the async tail is determined by subtracting the size that
+ * was explicitly requested from the determined request size, unless
+ * this would be less than zero - then zero is used. NOTE THIS
+ * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED
+ * PAGE. ALSO THIS CALCULATION IS NOT USED CONSISTENTLY.
+ *
+ * The size of the region is normally determined from the size of the
+ * previous readahead which loaded the preceding pages. This may be
+ * discovered from the struct file_ra_state for simple sequential reads,
+ * or from examining the state of the page cache when multiple
+ * sequential reads are interleaved. Specifically: where the readahead
+ * was triggered by the readahead flag, the size of the previous
+ * readahead is assumed to be the number of pages from the triggering
+ * page to the start of the new readahead. In these cases, the size of
+ * the previous readahead is scaled, often doubled, for the new
+ * readahead, though see get_next_ra_size() for details.
+ *
+ * If the size of the previous read cannot be determined, the number of
+ * preceding pages in the page cache is used to estimate the size of
+ * a previous read. This estimate could easily be misled by random
+ * reads being coincidentally adjacent, so it is ignored unless it is
+ * larger than the current request, and it is not scaled up, unless it
+ * is at the start of the file.
+ *
+ * In general readahead is accelerated at the start of the file, as
+ * reads from there are often sequential. There are other minor
+ * adjustments to the readahead size in various special cases and these
+ * are best discovered by reading the code.
+ *
+ * The above calculation, based on the previous readahead size,
+ * determines the size of the readahead, to which any requested read
+ * size may be added.
+ *
+ * Readahead requests are sent to the filesystem using the ->readahead()
+ * address space operation, for which mpage_readahead() is a canonical
+ * implementation. ->readahead() should normally initiate reads on all
+ * folios, but may fail to read any or all folios without causing an I/O
+ * error. The page cache reading code will issue a ->read_folio() request
+ * for any folio which ->readahead() did not read, and only an error
+ * from this will be final.
+ *
+ * ->readahead() will generally call readahead_folio() repeatedly to get
+ * each folio from those prepared for readahead. It may fail to read a
+ * folio by:
+ *
+ * * not calling readahead_folio() sufficiently many times, effectively
+ * ignoring some folios, as might be appropriate if the path to
+ * storage is congested.
+ *
+ * * failing to actually submit a read request for a given folio,
+ * possibly due to insufficient resources, or
+ *
+ * * getting an error during subsequent processing of a request.
+ *
+ * In the last two cases, the folio should be unlocked by the filesystem
+ * to indicate that the read attempt has failed. In the first case the
+ * folio will be unlocked by the VFS.
+ *
+ * Those folios not in the final ``async_size`` of the request should be
+ * considered to be important and ->readahead() should not fail them due
+ * to congestion or temporary resource unavailability, but should wait
+ * for necessary resources (e.g. memory or indexing information) to
+ * become available. Folios in the final ``async_size`` may be
+ * considered less urgent and failure to read them is more acceptable.
+ * In this case it is best to use filemap_remove_folio() to remove the
+ * folios from the page cache as is automatically done for folios that
+ * were not fetched with readahead_folio(). This will allow a
+ * subsequent synchronous readahead request to try them again. If they
+ * are left in the page cache, then they will be read individually using
+ * ->read_folio() which may be less efficient.
+ */
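
To make the ->readahead()/readahead_folio() contract described above concrete, here is a minimal sketch of a filesystem implementation. myfs_read_folio_async() is a hypothetical helper that queues the I/O and unlocks the folio from its completion path; only the iteration and the unlock-on-failure rule are the point.

static void myfs_readahead(struct readahead_control *ractl)
{
	struct folio *folio;

	/* Each folio returned is locked and already in the page cache. */
	while ((folio = readahead_folio(ractl)) != NULL) {
		if (myfs_read_folio_async(ractl->file, folio)) {
			/*
			 * Could not submit the read: unlock so the VFS can
			 * fall back to ->read_folio() for this folio.
			 */
			folio_unlock(folio);
		}
	}
}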
+
+#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/export.h>
-#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
-#include <linux/pagevec.h>
#include <linux/pagemap.h>
+#include <linux/psi.h>
#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/mm_inline.h>
@@ -38,130 +143,54 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
}
EXPORT_SYMBOL_GPL(file_ra_state_init);
-/*
- * see if a page needs releasing upon read_cache_pages() failure
- * - the caller of read_cache_pages() may have set PG_private or PG_fscache
- * before calling, such as the NFS fs marking pages that are cached locally
- * on disk, thus we need to give the fs a chance to clean up in the event of
- * an error
- */
-static void read_cache_pages_invalidate_page(struct address_space *mapping,
- struct page *page)
-{
- if (page_has_private(page)) {
- if (!trylock_page(page))
- BUG();
- page->mapping = mapping;
- do_invalidatepage(page, 0, PAGE_SIZE);
- page->mapping = NULL;
- unlock_page(page);
- }
- put_page(page);
-}
-
-/*
- * release a list of pages, invalidating them first if need be
- */
-static void read_cache_pages_invalidate_pages(struct address_space *mapping,
- struct list_head *pages)
-{
- struct page *victim;
-
- while (!list_empty(pages)) {
- victim = lru_to_page(pages);
- list_del(&victim->lru);
- read_cache_pages_invalidate_page(mapping, victim);
- }
-}
-
-/**
- * read_cache_pages - populate an address space with some pages & start reads against them
- * @mapping: the address_space
- * @pages: The address of a list_head which contains the target pages. These
- * pages have their ->index populated and are otherwise uninitialised.
- * @filler: callback routine for filling a single page.
- * @data: private data for the callback routine.
- *
- * Hides the details of the LRU cache etc from the filesystems.
- *
- * Returns: %0 on success, error return by @filler otherwise
- */
-int read_cache_pages(struct address_space *mapping, struct list_head *pages,
- int (*filler)(void *, struct page *), void *data)
-{
- struct page *page;
- int ret = 0;
-
- while (!list_empty(pages)) {
- page = lru_to_page(pages);
- list_del(&page->lru);
- if (add_to_page_cache_lru(page, mapping, page->index,
- readahead_gfp_mask(mapping))) {
- read_cache_pages_invalidate_page(mapping, page);
- continue;
- }
- put_page(page);
-
- ret = filler(data, page);
- if (unlikely(ret)) {
- read_cache_pages_invalidate_pages(mapping, pages);
- break;
- }
- task_io_account_read(PAGE_SIZE);
- }
- return ret;
-}
-
-EXPORT_SYMBOL(read_cache_pages);
-
-static void read_pages(struct readahead_control *rac, struct list_head *pages,
- bool skip_page)
+static void read_pages(struct readahead_control *rac)
{
const struct address_space_operations *aops = rac->mapping->a_ops;
- struct page *page;
+ struct folio *folio;
struct blk_plug plug;
if (!readahead_count(rac))
- goto out;
+ return;
+ if (unlikely(rac->_workingset))
+ psi_memstall_enter(&rac->_pflags);
blk_start_plug(&plug);
if (aops->readahead) {
aops->readahead(rac);
- /* Clean up the remaining pages */
- while ((page = readahead_page(rac))) {
- unlock_page(page);
- put_page(page);
+ /*
+ * Clean up the remaining folios. The sizes in ->ra
+ * may be used to size the next readahead, so make sure
+ * they accurately reflect what happened.
+ */
+ while ((folio = readahead_folio(rac)) != NULL) {
+ unsigned long nr = folio_nr_pages(folio);
+
+ folio_get(folio);
+ rac->ra->size -= nr;
+ if (rac->ra->async_size >= nr) {
+ rac->ra->async_size -= nr;
+ filemap_remove_folio(folio);
+ }
+ folio_unlock(folio);
+ folio_put(folio);
}
- } else if (aops->readpages) {
- aops->readpages(rac->file, rac->mapping, pages,
- readahead_count(rac));
- /* Clean up the remaining pages */
- put_pages_list(pages);
- rac->_index += rac->_nr_pages;
- rac->_nr_pages = 0;
} else {
- while ((page = readahead_page(rac))) {
- aops->readpage(rac->file, page);
- put_page(page);
- }
+ while ((folio = readahead_folio(rac)) != NULL)
+ aops->read_folio(rac->file, folio);
}
blk_finish_plug(&plug);
+ if (unlikely(rac->_workingset))
+ psi_memstall_leave(&rac->_pflags);
+ rac->_workingset = false;
- BUG_ON(!list_empty(pages));
BUG_ON(readahead_count(rac));
-
-out:
- if (skip_page)
- rac->_index++;
}
/**
- * page_cache_readahead_unbounded - Start unchecked readahead.
- * @mapping: File address space.
- * @file: This instance of the open file; used for authentication.
- * @index: First page index to read.
+ * page_cache_ra_unbounded - Start unchecked readahead.
+ * @ractl: Readahead control.
* @nr_to_read: The number of pages to read.
* @lookahead_size: Where to start the next readahead.
*
@@ -173,17 +202,12 @@ out:
* Context: File is referenced by caller. Mutexes may be held by caller.
* May sleep, but will not reenter filesystem to reclaim memory.
*/
-void page_cache_readahead_unbounded(struct address_space *mapping,
- struct file *file, pgoff_t index, unsigned long nr_to_read,
- unsigned long lookahead_size)
+void page_cache_ra_unbounded(struct readahead_control *ractl,
+ unsigned long nr_to_read, unsigned long lookahead_size)
{
- LIST_HEAD(page_pool);
+ struct address_space *mapping = ractl->mapping;
+ unsigned long index = readahead_index(ractl);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
- struct readahead_control rac = {
- .mapping = mapping,
- .file = file,
- ._index = index,
- };
unsigned long i;
/*
@@ -198,15 +222,14 @@ void page_cache_readahead_unbounded(struct address_space *mapping,
*/
unsigned int nofs = memalloc_nofs_save();
+ filemap_invalidate_lock_shared(mapping);
/*
* Preallocate as many pages as we will need.
*/
for (i = 0; i < nr_to_read; i++) {
- struct page *page = xa_load(&mapping->i_pages, index + i);
+ struct folio *folio = xa_load(&mapping->i_pages, index + i);
- BUG_ON(index + i != rac._index + rac._nr_pages);
-
- if (page && !xa_is_value(page)) {
+ if (folio && !xa_is_value(folio)) {
/*
* Page already present? Kick off the current batch
* of contiguous pages before continuing with the
@@ -215,48 +238,51 @@ void page_cache_readahead_unbounded(struct address_space *mapping,
* have a stable reference to this page, and it's
* not worth getting one just for that.
*/
- read_pages(&rac, &page_pool, true);
+ read_pages(ractl);
+ ractl->_index++;
+ i = ractl->_index + ractl->_nr_pages - index - 1;
continue;
}
- page = __page_cache_alloc(gfp_mask);
- if (!page)
+ folio = filemap_alloc_folio(gfp_mask, 0);
+ if (!folio)
break;
- if (mapping->a_ops->readpages) {
- page->index = index + i;
- list_add(&page->lru, &page_pool);
- } else if (add_to_page_cache_lru(page, mapping, index + i,
+ if (filemap_add_folio(mapping, folio, index + i,
gfp_mask) < 0) {
- put_page(page);
- read_pages(&rac, &page_pool, true);
+ folio_put(folio);
+ read_pages(ractl);
+ ractl->_index++;
+ i = ractl->_index + ractl->_nr_pages - index - 1;
continue;
}
if (i == nr_to_read - lookahead_size)
- SetPageReadahead(page);
- rac._nr_pages++;
+ folio_set_readahead(folio);
+ ractl->_workingset |= folio_test_workingset(folio);
+ ractl->_nr_pages++;
}
/*
- * Now start the IO. We ignore I/O errors - if the page is not
- * uptodate then the caller will launch readpage again, and
+ * Now start the IO. We ignore I/O errors - if the folio is not
+ * uptodate then the caller will launch read_folio again, and
* will then handle the error.
*/
- read_pages(&rac, &page_pool, false);
+ read_pages(ractl);
+ filemap_invalidate_unlock_shared(mapping);
memalloc_nofs_restore(nofs);
}
-EXPORT_SYMBOL_GPL(page_cache_readahead_unbounded);
+EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
/*
- * __do_page_cache_readahead() actually reads a chunk of disk. It allocates
+ * do_page_cache_ra() actually reads a chunk of disk. It allocates
* the pages first, then submits them for I/O. This avoids the very bad
* behaviour which would occur if page allocations are causing VM writeback.
* We really don't want to intermingle reads and writes like that.
*/
-void __do_page_cache_readahead(struct address_space *mapping,
- struct file *file, pgoff_t index, unsigned long nr_to_read,
- unsigned long lookahead_size)
+static void do_page_cache_ra(struct readahead_control *ractl,
+ unsigned long nr_to_read, unsigned long lookahead_size)
{
- struct inode *inode = mapping->host;
+ struct inode *inode = ractl->mapping->host;
+ unsigned long index = readahead_index(ractl);
loff_t isize = i_size_read(inode);
pgoff_t end_index; /* The last page we want to read */
@@ -270,37 +296,38 @@ void __do_page_cache_readahead(struct address_space *mapping,
if (nr_to_read > end_index - index)
nr_to_read = end_index - index + 1;
- page_cache_readahead_unbounded(mapping, file, index, nr_to_read,
- lookahead_size);
+ page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}
/*
* Chunk the readahead into 2 megabyte units, so that we don't pin too much
* memory at once.
*/
-void force_page_cache_readahead(struct address_space *mapping,
- struct file *filp, pgoff_t index, unsigned long nr_to_read)
+void force_page_cache_ra(struct readahead_control *ractl,
+ unsigned long nr_to_read)
{
+ struct address_space *mapping = ractl->mapping;
+ struct file_ra_state *ra = ractl->ra;
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
- struct file_ra_state *ra = &filp->f_ra;
- unsigned long max_pages;
+ unsigned long max_pages, index;
- if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
- !mapping->a_ops->readahead))
+ if (unlikely(!mapping->a_ops->read_folio && !mapping->a_ops->readahead))
return;
/*
* If the request exceeds the readahead window, allow the read to
* be up to the optimal hardware IO size
*/
+ index = readahead_index(ractl);
max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
- nr_to_read = min(nr_to_read, max_pages);
+ nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
while (nr_to_read) {
unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
if (this_chunk > nr_to_read)
this_chunk = nr_to_read;
- __do_page_cache_readahead(mapping, filp, index, this_chunk, 0);
+ ractl->_index = index;
+ do_page_cache_ra(ractl, this_chunk, 0);
index += this_chunk;
nr_to_read -= this_chunk;
@@ -311,7 +338,7 @@ void force_page_cache_readahead(struct address_space *mapping,
* Set the initial window size, round to next power of 2 and square
* for small size, x 4 for medium, and x 2 for large
* for 128k (32 page) max ra
- * 1-8 page = 32k initial, > 8 page = 128k initial
+ * 1-2 page = 16k, 3-4 page = 32k, 5-8 page = 64k, > 8 page = 128k initial
*/
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
@@ -401,7 +428,7 @@ static pgoff_t count_history_pages(struct address_space *mapping,
}
/*
- * page cache context based read-ahead
+ * page cache context based readahead
*/
static int try_context_readahead(struct address_space *mapping,
struct file_ra_state *ra,
@@ -435,17 +462,116 @@ static int try_context_readahead(struct address_space *mapping,
}
/*
+ * There are some parts of the kernel which assume that PMD entries
+ * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then,
+ * limit the maximum allocation order to PMD size. I'm not aware of any
+ * assumptions about maximum order if THP are disabled, but 8 seems like
+ * a good order (that's 1MB if you're using 4kB pages)
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER
+#else
+#define MAX_PAGECACHE_ORDER 8
+#endif
+
+static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
+ pgoff_t mark, unsigned int order, gfp_t gfp)
+{
+ int err;
+ struct folio *folio = filemap_alloc_folio(gfp, order);
+
+ if (!folio)
+ return -ENOMEM;
+ mark = round_up(mark, 1UL << order);
+ if (index == mark)
+ folio_set_readahead(folio);
+ err = filemap_add_folio(ractl->mapping, folio, index, gfp);
+ if (err) {
+ folio_put(folio);
+ return err;
+ }
+
+ ractl->_nr_pages += 1UL << order;
+ ractl->_workingset |= folio_test_workingset(folio);
+ return 0;
+}
+
+void page_cache_ra_order(struct readahead_control *ractl,
+ struct file_ra_state *ra, unsigned int new_order)
+{
+ struct address_space *mapping = ractl->mapping;
+ pgoff_t index = readahead_index(ractl);
+ pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
+ pgoff_t mark = index + ra->size - ra->async_size;
+ int err = 0;
+ gfp_t gfp = readahead_gfp_mask(mapping);
+
+ if (!mapping_large_folio_support(mapping) || ra->size < 4)
+ goto fallback;
+
+ limit = min(limit, index + ra->size - 1);
+
+ if (new_order < MAX_PAGECACHE_ORDER) {
+ new_order += 2;
+ if (new_order > MAX_PAGECACHE_ORDER)
+ new_order = MAX_PAGECACHE_ORDER;
+ while ((1 << new_order) > ra->size)
+ new_order--;
+ }
+
+ filemap_invalidate_lock_shared(mapping);
+ while (index <= limit) {
+ unsigned int order = new_order;
+
+ /* Align with smaller pages if needed */
+ if (index & ((1UL << order) - 1)) {
+ order = __ffs(index);
+ if (order == 1)
+ order = 0;
+ }
+ /* Don't allocate pages past EOF */
+ while (index + (1UL << order) - 1 > limit) {
+ if (--order == 1)
+ order = 0;
+ }
+ err = ra_alloc_folio(ractl, index, mark, order, gfp);
+ if (err)
+ break;
+ index += 1UL << order;
+ }
+
+ if (index > limit) {
+ ra->size += index - limit - 1;
+ ra->async_size += index - limit - 1;
+ }
+
+ read_pages(ractl);
+ filemap_invalidate_unlock_shared(mapping);
+
+ /*
+ * If there were already pages in the page cache, then we may have
+ * left some gaps. Let the regular readahead code take care of this
+ * situation.
+ */
+ if (!err)
+ return;
+fallback:
+ do_page_cache_ra(ractl, ra->size, ra->async_size);
+}
+
+/*
* A minimal readahead algorithm for trivial sequential/random reads.
*/
-static void ondemand_readahead(struct address_space *mapping,
- struct file_ra_state *ra, struct file *filp,
- bool hit_readahead_marker, pgoff_t index,
- unsigned long req_size)
+static void ondemand_readahead(struct readahead_control *ractl,
+ struct folio *folio, unsigned long req_size)
{
- struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
+ struct file_ra_state *ra = ractl->ra;
unsigned long max_pages = ra->ra_pages;
unsigned long add_pages;
- pgoff_t prev_index;
+ pgoff_t index = readahead_index(ractl);
+ pgoff_t expected, prev_index;
+ unsigned int order = folio ? folio_order(folio) : 0;
/*
* If the request exceeds the readahead window, allow the read to
@@ -464,8 +590,9 @@ static void ondemand_readahead(struct address_space *mapping,
* It's the expected callback index, assume sequential access.
* Ramp up sizes, and push forward the readahead window.
*/
- if ((index == (ra->start + ra->size - ra->async_size) ||
- index == (ra->start + ra->size))) {
+ expected = round_up(ra->start + ra->size - ra->async_size,
+ 1UL << order);
+ if (index == expected || index == (ra->start + ra->size)) {
ra->start += ra->size;
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
@@ -473,16 +600,17 @@ static void ondemand_readahead(struct address_space *mapping,
}
/*
- * Hit a marked page without valid readahead state.
+ * Hit a marked folio without valid readahead state.
* E.g. interleaved reads.
* Query the pagecache for async_size, which normally equals to
* readahead size. Ramp it up and use it as the new readahead size.
*/
- if (hit_readahead_marker) {
+ if (folio) {
pgoff_t start;
rcu_read_lock();
- start = page_cache_next_miss(mapping, index + 1, max_pages);
+ start = page_cache_next_miss(ractl->mapping, index + 1,
+ max_pages);
rcu_read_unlock();
if (!start || start - index > max_pages)
@@ -515,14 +643,15 @@ static void ondemand_readahead(struct address_space *mapping,
* Query the page cache and look for the traces(cached history pages)
* that a sequential stream would leave behind.
*/
- if (try_context_readahead(mapping, ra, index, req_size, max_pages))
+ if (try_context_readahead(ractl->mapping, ra, index, req_size,
+ max_pages))
goto readit;
/*
* standalone, small random read
* Read as is, and do not pollute the readahead state.
*/
- __do_page_cache_readahead(mapping, filp, index, req_size, 0);
+ do_page_cache_ra(ractl, req_size, 0);
return;
initial_readahead:
@@ -548,89 +677,59 @@ readit:
}
}
- ra_submit(ra, mapping, filp);
+ ractl->_index = ra->start;
+ page_cache_ra_order(ractl, ra, order);
}
-/**
- * page_cache_sync_readahead - generic file readahead
- * @mapping: address_space which holds the pagecache and I/O vectors
- * @ra: file_ra_state which holds the readahead state
- * @filp: passed on to ->readpage() and ->readpages()
- * @index: Index of first page to be read.
- * @req_count: Total number of pages being read by the caller.
- *
- * page_cache_sync_readahead() should be called when a cache miss happened:
- * it will submit the read. The readahead logic may decide to piggyback more
- * pages onto the read request if access patterns suggest it will improve
- * performance.
- */
-void page_cache_sync_readahead(struct address_space *mapping,
- struct file_ra_state *ra, struct file *filp,
- pgoff_t index, unsigned long req_count)
+void page_cache_sync_ra(struct readahead_control *ractl,
+ unsigned long req_count)
{
- /* no read-ahead */
- if (!ra->ra_pages)
- return;
+ bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);
- if (blk_cgroup_congested())
- return;
+ /*
+ * Even if readahead is disabled, issue this request as readahead
+ * as we'll need it to satisfy the requested range. The forced
+ * readahead will do the right thing and limit the read to just the
+ * requested range, which we'll set to 1 page for this case.
+ */
+ if (!ractl->ra->ra_pages || blk_cgroup_congested()) {
+ if (!ractl->file)
+ return;
+ req_count = 1;
+ do_forced_ra = true;
+ }
/* be dumb */
- if (filp && (filp->f_mode & FMODE_RANDOM)) {
- force_page_cache_readahead(mapping, filp, index, req_count);
+ if (do_forced_ra) {
+ force_page_cache_ra(ractl, req_count);
return;
}
- /* do read-ahead */
- ondemand_readahead(mapping, ra, filp, false, index, req_count);
+ ondemand_readahead(ractl, NULL, req_count);
}
-EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
+EXPORT_SYMBOL_GPL(page_cache_sync_ra);
-/**
- * page_cache_async_readahead - file readahead for marked pages
- * @mapping: address_space which holds the pagecache and I/O vectors
- * @ra: file_ra_state which holds the readahead state
- * @filp: passed on to ->readpage() and ->readpages()
- * @page: The page at @index which triggered the readahead call.
- * @index: Index of first page to be read.
- * @req_count: Total number of pages being read by the caller.
- *
- * page_cache_async_readahead() should be called when a page is used which
- * is marked as PageReadahead; this is a marker to suggest that the application
- * has used up enough of the readahead window that we should start pulling in
- * more pages.
- */
-void
-page_cache_async_readahead(struct address_space *mapping,
- struct file_ra_state *ra, struct file *filp,
- struct page *page, pgoff_t index,
- unsigned long req_count)
+void page_cache_async_ra(struct readahead_control *ractl,
+ struct folio *folio, unsigned long req_count)
{
- /* no read-ahead */
- if (!ra->ra_pages)
+ /* no readahead */
+ if (!ractl->ra->ra_pages)
return;
/*
* Same bit is used for PG_readahead and PG_reclaim.
*/
- if (PageWriteback(page))
+ if (folio_test_writeback(folio))
return;
- ClearPageReadahead(page);
-
- /*
- * Defer asynchronous read-ahead on IO congestion.
- */
- if (inode_read_congested(mapping->host))
- return;
+ folio_clear_readahead(folio);
if (blk_cgroup_congested())
return;
- /* do read-ahead */
- ondemand_readahead(mapping, ra, filp, true, index, req_count);
+ ondemand_readahead(ractl, folio, req_count);
}
-EXPORT_SYMBOL_GPL(page_cache_async_readahead);
+EXPORT_SYMBOL_GPL(page_cache_async_ra);
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
@@ -662,3 +761,94 @@ SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
{
return ksys_readahead(fd, offset, count);
}
+
+#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_READAHEAD)
+COMPAT_SYSCALL_DEFINE4(readahead, int, fd, compat_arg_u64_dual(offset), size_t, count)
+{
+ return ksys_readahead(fd, compat_arg_u64_glue(offset), count);
+}
+#endif
+
+/**
+ * readahead_expand - Expand a readahead request
+ * @ractl: The request to be expanded
+ * @new_start: The revised start
+ * @new_len: The revised size of the request
+ *
+ * Attempt to expand a readahead request outwards from the current size to the
+ * specified size by inserting locked pages before and after the current window
+ * to increase the size to the new window. This may involve the insertion of
+ * THPs, in which case the window may get expanded even beyond what was
+ * requested.
+ *
+ * The algorithm will stop if it encounters a conflicting page already in the
+ * pagecache, leaving a smaller expansion than requested.
+ *
+ * The caller must check for this by examining the revised @ractl object for a
+ * different expansion than was requested.
+ */
+void readahead_expand(struct readahead_control *ractl,
+ loff_t new_start, size_t new_len)
+{
+ struct address_space *mapping = ractl->mapping;
+ struct file_ra_state *ra = ractl->ra;
+ pgoff_t new_index, new_nr_pages;
+ gfp_t gfp_mask = readahead_gfp_mask(mapping);
+
+ new_index = new_start / PAGE_SIZE;
+
+ /* Expand the leading edge downwards */
+ while (ractl->_index > new_index) {
+ unsigned long index = ractl->_index - 1;
+ struct folio *folio = xa_load(&mapping->i_pages, index);
+
+ if (folio && !xa_is_value(folio))
+ return; /* Folio apparently present */
+
+ folio = filemap_alloc_folio(gfp_mask, 0);
+ if (!folio)
+ return;
+ if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) {
+ folio_put(folio);
+ return;
+ }
+ if (unlikely(folio_test_workingset(folio)) &&
+ !ractl->_workingset) {
+ ractl->_workingset = true;
+ psi_memstall_enter(&ractl->_pflags);
+ }
+ ractl->_nr_pages++;
+ ractl->_index = folio->index;
+ }
+
+ new_len += new_start - readahead_pos(ractl);
+ new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE);
+
+ /* Expand the trailing edge upwards */
+ while (ractl->_nr_pages < new_nr_pages) {
+ unsigned long index = ractl->_index + ractl->_nr_pages;
+ struct folio *folio = xa_load(&mapping->i_pages, index);
+
+ if (folio && !xa_is_value(folio))
+ return; /* Folio apparently present */
+
+ folio = filemap_alloc_folio(gfp_mask, 0);
+ if (!folio)
+ return;
+ if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) {
+ folio_put(folio);
+ return;
+ }
+ if (unlikely(folio_test_workingset(folio)) &&
+ !ractl->_workingset) {
+ ractl->_workingset = true;
+ psi_memstall_enter(&ractl->_pflags);
+ }
+ ractl->_nr_pages++;
+ if (ra) {
+ ra->size++;
+ ra->async_size++;
+ }
+ }
+}
+EXPORT_SYMBOL(readahead_expand);
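
As a usage illustration (not taken from this patch), a filesystem that reads in multi-page blocks might expand the window to block boundaries before iterating it. MYFS_BLOCK_SIZE and myfs_start_block_read() are hypothetical; readahead_pos(), readahead_length(), readahead_expand() and readahead_folio() are the real helpers being shown:

static void myfs_block_readahead(struct readahead_control *ractl)
{
	loff_t start = round_down(readahead_pos(ractl), MYFS_BLOCK_SIZE);
	loff_t len = round_up(readahead_pos(ractl) + readahead_length(ractl),
			      MYFS_BLOCK_SIZE) - start;
	struct folio *folio;

	/* May expand by less than requested if neighbouring folios exist. */
	readahead_expand(ractl, start, len);

	while ((folio = readahead_folio(ractl)) != NULL)
		myfs_start_block_read(ractl->file, folio);
}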
diff --git a/mm/rmap.c b/mm/rmap.c
index 9425260774a1..0c0d8857dfce 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -20,35 +20,37 @@
/*
* Lock ordering in mm:
*
- * inode->i_mutex (while writing or truncating, not reading or faulting)
+ * inode->i_rwsem (while writing or truncating, not reading or faulting)
* mm->mmap_lock
- * page->flags PG_locked (lock_page) * (see huegtlbfs below)
- * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
- * mapping->i_mmap_rwsem
- * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
- * anon_vma->rwsem
- * mm->page_table_lock or pte_lock
- * pgdat->lru_lock (in mark_page_accessed, isolate_lru_page)
- * swap_lock (in swap_duplicate, swap_info_get)
- * mmlist_lock (in mmput, drain_mmlist and others)
- * mapping->private_lock (in __set_page_dirty_buffers)
- * mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
- * i_pages lock (widely used)
- * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
- * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
- * sb_lock (within inode_lock in fs/fs-writeback.c)
- * i_pages lock (widely used, in set_page_dirty,
- * in arch-dependent flush_dcache_mmap_lock,
- * within bdi.wb->list_lock in __sync_single_inode)
+ * mapping->invalidate_lock (in filemap_fault)
+ * page->flags PG_locked (lock_page)
+ * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
+ * vma_start_write
+ * mapping->i_mmap_rwsem
+ * anon_vma->rwsem
+ * mm->page_table_lock or pte_lock
+ * swap_lock (in swap_duplicate, swap_info_get)
+ * mmlist_lock (in mmput, drain_mmlist and others)
+ * mapping->private_lock (in block_dirty_folio)
+ * folio_lock_memcg move_lock (in block_dirty_folio)
+ * i_pages lock (widely used)
+ * lruvec->lru_lock (in folio_lruvec_lock_irq)
+ * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
+ * sb_lock (within inode_lock in fs/fs-writeback.c)
+ * i_pages lock (widely used, in set_page_dirty,
+ * in arch-dependent flush_dcache_mmap_lock,
+ * within bdi.wb->list_lock in __sync_single_inode)
*
- * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
+ * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon)
* ->tasklist_lock
* pte map lock
*
- * * hugetlbfs PageHuge() pages take locks in this order:
- * mapping->i_mmap_rwsem
- * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
- * page->flags PG_locked (lock_page)
+ * hugetlbfs PageHuge() pages take locks in this order:
+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
+ * vma_lock (hugetlb specific lock for pmd_sharing)
+ * mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
+ * page->flags PG_locked (lock_page)
*/
#include <linux/mm.h>
@@ -72,10 +74,13 @@
#include <linux/page_idle.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
+#include <linux/mm_inline.h>
#include <asm/tlbflush.h>
+#define CREATE_TRACE_POINTS
#include <trace/events/tlb.h>
+#include <trace/events/migrate.h>
#include "internal.h"
@@ -89,7 +94,8 @@ static inline struct anon_vma *anon_vma_alloc(void)
anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
if (anon_vma) {
atomic_set(&anon_vma->refcount, 1);
- anon_vma->degree = 1; /* Reference for first vma */
+ anon_vma->num_children = 0;
+ anon_vma->num_active_vmas = 0;
anon_vma->parent = anon_vma;
/*
* Initialise the anon_vma root to point to itself. If called
@@ -106,15 +112,15 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
VM_BUG_ON(atomic_read(&anon_vma->refcount));
/*
- * Synchronize against page_lock_anon_vma_read() such that
+ * Synchronize against folio_lock_anon_vma_read() such that
* we can safely hold the lock without the anon_vma getting
* freed.
*
* Relies on the full mb implied by the atomic_dec_and_test() from
* put_anon_vma() against the acquire barrier implied by
- * down_read_trylock() from page_lock_anon_vma_read(). This orders:
+ * down_read_trylock() from folio_lock_anon_vma_read(). This orders:
*
- * page_lock_anon_vma_read() VS put_anon_vma()
+ * folio_lock_anon_vma_read() VS put_anon_vma()
* down_read_trylock() atomic_dec_and_test()
* LOCK MB
* atomic_read() rwsem_is_locked()
@@ -167,8 +173,8 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
* allocate a new one.
*
* Anon-vma allocations are very subtle, because we may have
- * optimistically looked up an anon_vma in page_lock_anon_vma_read()
- * and that may actually touch the spinlock even in the newly
+ * optimistically looked up an anon_vma in folio_lock_anon_vma_read()
+ * and that may actually touch the rwsem even in the newly
* allocated vma (it depends on RCU to make sure that the
* anon_vma isn't actually destroyed).
*
@@ -197,6 +203,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
anon_vma = anon_vma_alloc();
if (unlikely(!anon_vma))
goto out_enomem_free_avc;
+ anon_vma->num_children++; /* self-parent link for new root */
allocated = anon_vma;
}
@@ -206,8 +213,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
anon_vma_chain_link(vma, avc, anon_vma);
- /* vma reference or self-parent link for new root */
- anon_vma->degree++;
+ anon_vma->num_active_vmas++;
allocated = NULL;
avc = NULL;
}
@@ -257,11 +263,12 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
* Attach the anon_vmas from src to dst.
* Returns 0 on success, -ENOMEM on failure.
*
- * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and
- * anon_vma_fork(). The first three want an exact copy of src, while the last
- * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
- * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
- * we can identify this case by checking (!dst->anon_vma && src->anon_vma).
+ * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
+ * copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
+ * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
+ * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before
+ * call, we can identify this case by checking (!dst->anon_vma &&
+ * src->anon_vma).
*
* If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
* and reuse existing anon_vma which has no vmas and only one child anon_vma.
@@ -292,26 +299,26 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
anon_vma_chain_link(dst, avc, anon_vma);
/*
- * Reuse existing anon_vma if its degree lower than two,
- * that means it has no vma and only one anon_vma child.
+ * Reuse existing anon_vma if it has no vma and only one
+ * anon_vma child.
*
- * Do not chose parent anon_vma, otherwise first child
- * will always reuse it. Root anon_vma is never reused:
+ * Root anon_vma is never reused:
* it has self-parent reference and at least one child.
*/
if (!dst->anon_vma && src->anon_vma &&
- anon_vma != src->anon_vma && anon_vma->degree < 2)
+ anon_vma->num_children < 2 &&
+ anon_vma->num_active_vmas == 0)
dst->anon_vma = anon_vma;
}
if (dst->anon_vma)
- dst->anon_vma->degree++;
+ dst->anon_vma->num_active_vmas++;
unlock_anon_vma_root(root);
return 0;
enomem_failure:
/*
- * dst->anon_vma is dropped here otherwise its degree can be incorrectly
- * decremented in unlink_anon_vmas().
+ * dst->anon_vma is dropped here otherwise its num_active_vmas can
+ * be incorrectly decremented in unlink_anon_vmas().
* We can safely do this because callers of anon_vma_clone() don't care
* about dst->anon_vma if anon_vma_clone() failed.
*/
@@ -354,12 +361,13 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
anon_vma = anon_vma_alloc();
if (!anon_vma)
goto out_error;
+ anon_vma->num_active_vmas++;
avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto out_error_free_anon_vma;
/*
- * The root anon_vma's spinlock is the lock actually used when we
+ * The root anon_vma's rwsem is the lock actually used when we
* lock any of the anon_vmas in this anon_vma tree.
*/
anon_vma->root = pvma->anon_vma->root;
@@ -374,7 +382,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
vma->anon_vma = anon_vma;
anon_vma_lock_write(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma);
- anon_vma->parent->degree++;
+ anon_vma->parent->num_children++;
anon_vma_unlock_write(anon_vma);
return 0;
@@ -406,15 +414,22 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
* to free them outside the lock.
*/
if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
- anon_vma->parent->degree--;
+ anon_vma->parent->num_children--;
continue;
}
list_del(&avc->same_vma);
anon_vma_chain_free(avc);
}
- if (vma->anon_vma)
- vma->anon_vma->degree--;
+ if (vma->anon_vma) {
+ vma->anon_vma->num_active_vmas--;
+
+ /*
+ * vma would still be needed after unlink, and anon_vma will be prepared
+ * when handling a fault.
+ */
+ vma->anon_vma = NULL;
+ }
unlock_anon_vma_root(root);
/*
@@ -425,7 +440,8 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
- VM_WARN_ON(anon_vma->degree);
+ VM_WARN_ON(anon_vma->num_children);
+ VM_WARN_ON(anon_vma->num_active_vmas);
put_anon_vma(anon_vma);
list_del(&avc->same_vma);
@@ -455,8 +471,8 @@ void __init anon_vma_init(void)
* Getting a lock on a stable anon_vma from a page off the LRU is tricky!
*
* Since there is no serialization what so ever against page_remove_rmap()
- * the best this function can do is return a locked anon_vma that might
- * have been relevant to this page.
+ * the best this function can do is return an anon_vma with an elevated
+ * refcount that might have been relevant to this page.
*
* The page might have been remapped to a different anon_vma or the anon_vma
* returned may already be freed (and even reused).
@@ -475,16 +491,16 @@ void __init anon_vma_init(void)
* if there is a mapcount, we can dereference the anon_vma after observing
* those.
*/
-struct anon_vma *page_get_anon_vma(struct page *page)
+struct anon_vma *folio_get_anon_vma(struct folio *folio)
{
struct anon_vma *anon_vma = NULL;
unsigned long anon_mapping;
rcu_read_lock();
- anon_mapping = (unsigned long)READ_ONCE(page->mapping);
+ anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
goto out;
- if (!page_mapped(page))
+ if (!folio_mapped(folio))
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
@@ -494,13 +510,13 @@ struct anon_vma *page_get_anon_vma(struct page *page)
}
/*
- * If this page is still mapped, then its anon_vma cannot have been
+ * If this folio is still mapped, then its anon_vma cannot have been
* freed. But if it has been unmapped, we have no security against the
* anon_vma structure being freed and reused (for another anon_vma:
* SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
* above cannot corrupt).
*/
- if (!page_mapped(page)) {
+ if (!folio_mapped(folio)) {
rcu_read_unlock();
put_anon_vma(anon_vma);
return NULL;
@@ -512,47 +528,55 @@ out:
}
/*
- * Similar to page_get_anon_vma() except it locks the anon_vma.
+ * Similar to folio_get_anon_vma() except it locks the anon_vma.
*
* Its a little more complex as it tries to keep the fast path to a single
* atomic op -- the trylock. If we fail the trylock, we fall back to getting a
- * reference like with page_get_anon_vma() and then block on the mutex.
+ * reference like with folio_get_anon_vma() and then block on the rwsem
+ * in the !rwc->try_lock case.
*/
-struct anon_vma *page_lock_anon_vma_read(struct page *page)
+struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
+ struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma = NULL;
struct anon_vma *root_anon_vma;
unsigned long anon_mapping;
rcu_read_lock();
- anon_mapping = (unsigned long)READ_ONCE(page->mapping);
+ anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
goto out;
- if (!page_mapped(page))
+ if (!folio_mapped(folio))
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
root_anon_vma = READ_ONCE(anon_vma->root);
if (down_read_trylock(&root_anon_vma->rwsem)) {
/*
- * If the page is still mapped, then this anon_vma is still
+ * If the folio is still mapped, then this anon_vma is still
* its anon_vma, and holding the mutex ensures that it will
* not go away, see anon_vma_free().
*/
- if (!page_mapped(page)) {
+ if (!folio_mapped(folio)) {
up_read(&root_anon_vma->rwsem);
anon_vma = NULL;
}
goto out;
}
+ if (rwc && rwc->try_lock) {
+ anon_vma = NULL;
+ rwc->contended = true;
+ goto out;
+ }
+
/* trylock failed, we got to sleep */
if (!atomic_inc_not_zero(&anon_vma->refcount)) {
anon_vma = NULL;
goto out;
}
- if (!page_mapped(page)) {
+ if (!folio_mapped(folio)) {
rcu_read_unlock();
put_anon_vma(anon_vma);
return NULL;
@@ -580,11 +604,6 @@ out:
return anon_vma;
}
-void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
-{
- anon_vma_unlock_read(anon_vma);
-}
-
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
* Flush TLB entries for recently unmapped pages from remote CPUs. It is
@@ -613,9 +632,24 @@ void try_to_unmap_flush_dirty(void)
try_to_unmap_flush();
}
-static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
+/*
+ * Bits 0-14 of mm->tlb_flush_batched record pending generations.
+ * Bits 16-30 of mm->tlb_flush_batched record flushed generations.
+ */
+#define TLB_FLUSH_BATCH_FLUSHED_SHIFT 16
+#define TLB_FLUSH_BATCH_PENDING_MASK \
+ ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1)
+#define TLB_FLUSH_BATCH_PENDING_LARGE \
+ (TLB_FLUSH_BATCH_PENDING_MASK / 2)
+
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+ int batch;
+ bool writable = pte_dirty(pteval);
+
+ if (!pte_accessible(mm, pteval))
+ return;
arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
tlb_ubc->flush_required = true;
@@ -625,7 +659,19 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
* before the PTE is cleared.
*/
barrier();
- mm->tlb_flush_batched = true;
+ batch = atomic_read(&mm->tlb_flush_batched);
+retry:
+ if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) {
+ /*
+ * Prevent `pending' from catching up with `flushed' because of
+ * overflow. Reset `pending' and `flushed' to be 1 and 0 if
+ * `pending' becomes large.
+ */
+ if (!atomic_try_cmpxchg(&mm->tlb_flush_batched, &batch, 1))
+ goto retry;
+ } else {
+ atomic_inc(&mm->tlb_flush_batched);
+ }
/*
* If the PTE was dirty then it's best to assume it's writable. The
@@ -672,19 +718,22 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
*/
void flush_tlb_batched_pending(struct mm_struct *mm)
{
- if (data_race(mm->tlb_flush_batched)) {
- flush_tlb_mm(mm);
+ int batch = atomic_read(&mm->tlb_flush_batched);
+ int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK;
+ int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
+ if (pending != flushed) {
+ flush_tlb_mm(mm);
/*
- * Do not allow the compiler to re-order the clearing of
- * tlb_flush_batched before the tlb is flushed.
+ * If new TLB flushes became pending while we were flushing, leave
+ * mm->tlb_flush_batched as is so that they are not lost.
*/
- barrier();
- mm->tlb_flush_batched = false;
+ atomic_cmpxchg(&mm->tlb_flush_batched, batch,
+ pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT));
}
}
#else
-static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval)
{
}
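
The pending/flushed split above turns mm->tlb_flush_batched into two 15-bit generation counters packed into one atomic. The standalone program below (plain C with stdatomic, not kernel code) models the packing and the pending != flushed test; the overflow reset and the real flush_tlb_mm() call are reduced to comments.

        #include <assert.h>
        #include <stdatomic.h>

        #define FLUSHED_SHIFT   16
        #define PENDING_MASK    ((1 << (FLUSHED_SHIFT - 1)) - 1)

        static atomic_int tlb_flush_batched;

        static void model_set_pending(void)
        {
                /* The kernel also resets the counters when the 15-bit
                 * pending field would overflow; omitted in this model. */
                atomic_fetch_add(&tlb_flush_batched, 1);   /* new pending generation */
        }

        static void model_flush_pending(void)
        {
                int batch = atomic_load(&tlb_flush_batched);
                int pending = batch & PENDING_MASK;
                int flushed = batch >> FLUSHED_SHIFT;

                if (pending != flushed) {
                        /* flush_tlb_mm(mm) would go here */
                        /* mark the generations seen so far as flushed; if new
                         * pending work raced in, the cmpxchg fails and the
                         * batch is left for the next flush. */
                        atomic_compare_exchange_strong(&tlb_flush_batched, &batch,
                                        pending | (pending << FLUSHED_SHIFT));
                }
        }

        int main(void)
        {
                model_set_pending();
                model_set_pending();
                model_flush_pending();
                assert((atomic_load(&tlb_flush_batched) & PENDING_MASK) ==
                       (atomic_load(&tlb_flush_batched) >> FLUSHED_SHIFT));
                return 0;
        }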
@@ -700,9 +749,9 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
*/
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
- unsigned long address;
- if (PageAnon(page)) {
- struct anon_vma *page__anon_vma = page_anon_vma(page);
+ struct folio *folio = page_folio(page);
+ if (folio_test_anon(folio)) {
+ struct anon_vma *page__anon_vma = folio_anon_vma(folio);
/*
* Note: swapoff's unuse_vma() is more efficient with this
* check, and needs it to match anon_vma when KSM is active.
@@ -710,24 +759,26 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
if (!vma->anon_vma || !page__anon_vma ||
vma->anon_vma->root != page__anon_vma->root)
return -EFAULT;
- } else if (page->mapping) {
- if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
- return -EFAULT;
- } else
+ } else if (!vma->vm_file) {
return -EFAULT;
- address = __vma_address(page, vma);
- if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ } else if (vma->vm_file->f_mapping != folio->mapping) {
return -EFAULT;
- return address;
+ }
+
+ return vma_address(page, vma);
}
+/*
+ * Returns the actual pmd_t* where we expect 'address' to be mapped from, or
+ * NULL if it doesn't exist. No guarantees / checks on what the pmd_t*
+ * represents.
+ */
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd = NULL;
- pmd_t pmde;
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -742,68 +793,54 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
goto out;
pmd = pmd_offset(pud, address);
- /*
- * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
- * without holding anon_vma lock for write. So when looking for a
- * genuine pmde (in which to find pte), test present and !THP together.
- */
- pmde = *pmd;
- barrier();
- if (!pmd_present(pmde) || pmd_trans_huge(pmde))
- pmd = NULL;
out:
return pmd;
}
-struct page_referenced_arg {
+struct folio_referenced_arg {
int mapcount;
int referenced;
unsigned long vm_flags;
struct mem_cgroup *memcg;
};
/*
- * arg: page_referenced_arg will be passed
+ * arg: folio_referenced_arg will be passed
*/
-static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
- unsigned long address, void *arg)
+static bool folio_referenced_one(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long address, void *arg)
{
- struct page_referenced_arg *pra = arg;
- struct page_vma_mapped_walk pvmw = {
- .page = page,
- .vma = vma,
- .address = address,
- };
+ struct folio_referenced_arg *pra = arg;
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
int referenced = 0;
while (page_vma_mapped_walk(&pvmw)) {
address = pvmw.address;
- if (vma->vm_flags & VM_LOCKED) {
+ if ((vma->vm_flags & VM_LOCKED) &&
+ (!folio_test_large(folio) || !pvmw.pte)) {
+ /* Restore the mlock which got missed */
+ mlock_vma_folio(folio, vma, !pvmw.pte);
page_vma_mapped_walk_done(&pvmw);
pra->vm_flags |= VM_LOCKED;
return false; /* To break the loop */
}
if (pvmw.pte) {
- if (ptep_clear_flush_young_notify(vma, address,
- pvmw.pte)) {
- /*
- * Don't treat a reference through
- * a sequentially read mapping as such.
- * If the page has been used in another mapping,
- * we will catch it; if this other mapping is
- * already gone, the unmap path will have set
- * PG_referenced or activated the page.
- */
- if (likely(!(vma->vm_flags & VM_SEQ_READ)))
- referenced++;
+ if (lru_gen_enabled() &&
+ pte_young(ptep_get(pvmw.pte))) {
+ lru_gen_look_around(&pvmw);
+ referenced++;
}
+
+ if (ptep_clear_flush_young_notify(vma, address,
+ pvmw.pte))
+ referenced++;
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
if (pmdp_clear_flush_young_notify(vma, address,
pvmw.pmd))
referenced++;
} else {
- /* unexpected pmd-mapped page? */
+ /* unexpected pmd-mapped folio? */
WARN_ON_ONCE(1);
}
@@ -811,13 +848,13 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
}
if (referenced)
- clear_page_idle(page);
- if (test_and_clear_page_young(page))
+ folio_clear_idle(folio);
+ if (folio_test_clear_young(folio))
referenced++;
if (referenced) {
pra->referenced++;
- pra->vm_flags |= vma->vm_flags;
+ pra->vm_flags |= vma->vm_flags & ~VM_LOCKED;
}
if (!pra->mapcount)
@@ -826,107 +863,107 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
return true;
}
-static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
+static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
{
- struct page_referenced_arg *pra = arg;
+ struct folio_referenced_arg *pra = arg;
struct mem_cgroup *memcg = pra->memcg;
- if (!mm_match_cgroup(vma->vm_mm, memcg))
+ /*
+ * Ignore references from this mapping if it has no recency. If the
+ * folio has been used in another mapping, we will catch it; if this
+ * other mapping is already gone, the unmap path will have set the
+ * referenced flag or activated the folio in zap_pte_range().
+ */
+ if (!vma_has_recency(vma))
+ return true;
+
+ /*
+ * If we are reclaiming on behalf of a cgroup, skip counting on behalf
+ * of references from different cgroups.
+ */
+ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
return true;
return false;
}
/**
- * page_referenced - test if the page was referenced
- * @page: the page to test
- * @is_locked: caller holds lock on the page
+ * folio_referenced() - Test if the folio was referenced.
+ * @folio: The folio to test.
+ * @is_locked: Caller holds lock on the folio.
* @memcg: target memory cgroup
- * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
+ * @vm_flags: A combination of all the vma->vm_flags which referenced the folio.
+ *
+ * Quick test_and_clear_referenced for all mappings of a folio.
*
- * Quick test_and_clear_referenced for all mappings to a page,
- * returns the number of ptes which referenced the page.
+ * Return: The number of mappings which referenced the folio. Return -1 if
+ * the function bailed out due to rmap lock contention.
*/
-int page_referenced(struct page *page,
- int is_locked,
- struct mem_cgroup *memcg,
- unsigned long *vm_flags)
+int folio_referenced(struct folio *folio, int is_locked,
+ struct mem_cgroup *memcg, unsigned long *vm_flags)
{
int we_locked = 0;
- struct page_referenced_arg pra = {
- .mapcount = total_mapcount(page),
+ struct folio_referenced_arg pra = {
+ .mapcount = folio_mapcount(folio),
.memcg = memcg,
};
struct rmap_walk_control rwc = {
- .rmap_one = page_referenced_one,
+ .rmap_one = folio_referenced_one,
.arg = (void *)&pra,
- .anon_lock = page_lock_anon_vma_read,
+ .anon_lock = folio_lock_anon_vma_read,
+ .try_lock = true,
+ .invalid_vma = invalid_folio_referenced_vma,
};
*vm_flags = 0;
if (!pra.mapcount)
return 0;
- if (!page_rmapping(page))
+ if (!folio_raw_mapping(folio))
return 0;
- if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
- we_locked = trylock_page(page);
+ if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) {
+ we_locked = folio_trylock(folio);
if (!we_locked)
return 1;
}
- /*
- * If we are reclaiming on behalf of a cgroup, skip
- * counting on behalf of references from different
- * cgroups
- */
- if (memcg) {
- rwc.invalid_vma = invalid_page_referenced_vma;
- }
-
- rmap_walk(page, &rwc);
+ rmap_walk(folio, &rwc);
*vm_flags = pra.vm_flags;
if (we_locked)
- unlock_page(page);
+ folio_unlock(folio);
- return pra.referenced;
+ return rwc.contended ? -1 : pra.referenced;
}
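
folio_referenced() now sets try_lock and can return -1 when the rmap lock was contended instead of blocking. Below is a hedged sketch of how a reclaim-style caller might consume that convention; the helper name model_folio_was_referenced() is invented for illustration, and the policy shown (treat contention as "referenced") follows the documented return value rather than any specific in-tree caller.

        /* Illustrative only: interpreting the new return convention. */
        static bool model_folio_was_referenced(struct folio *folio,
                                               struct mem_cgroup *memcg)
        {
                unsigned long vm_flags;
                int refs = folio_referenced(folio, 0, memcg, &vm_flags);

                if (refs == -1)
                        return true;    /* contended: err on the side of keeping it */
                if (vm_flags & VM_LOCKED)
                        return true;    /* mlocked somewhere: do not reclaim */
                return refs > 0;
        }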
-static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
- unsigned long address, void *arg)
+static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
{
- struct page_vma_mapped_walk pvmw = {
- .page = page,
- .vma = vma,
- .address = address,
- .flags = PVMW_SYNC,
- };
+ int cleaned = 0;
+ struct vm_area_struct *vma = pvmw->vma;
struct mmu_notifier_range range;
- int *cleaned = arg;
+ unsigned long address = pvmw->address;
/*
* We have to assume the worse case ie pmd for invalidation. Note that
- * the page can not be free from this function.
+ * the folio can not be freed from this function.
*/
- mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
- 0, vma, vma->vm_mm, address,
- min(vma->vm_end, address + page_size(page)));
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0,
+ vma->vm_mm, address, vma_address_end(pvmw));
mmu_notifier_invalidate_range_start(&range);
- while (page_vma_mapped_walk(&pvmw)) {
+ while (page_vma_mapped_walk(pvmw)) {
int ret = 0;
- address = pvmw.address;
- if (pvmw.pte) {
- pte_t entry;
- pte_t *pte = pvmw.pte;
+ address = pvmw->address;
+ if (pvmw->pte) {
+ pte_t *pte = pvmw->pte;
+ pte_t entry = ptep_get(pte);
- if (!pte_dirty(*pte) && !pte_write(*pte))
+ if (!pte_dirty(entry) && !pte_write(entry))
continue;
- flush_cache_page(vma, address, pte_pfn(*pte));
+ flush_cache_page(vma, address, pte_pfn(entry));
entry = ptep_clear_flush(vma, address, pte);
entry = pte_wrprotect(entry);
entry = pte_mkclean(entry);
@@ -934,20 +971,21 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
ret = 1;
} else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- pmd_t *pmd = pvmw.pmd;
+ pmd_t *pmd = pvmw->pmd;
pmd_t entry;
if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
continue;
- flush_cache_page(vma, address, page_to_pfn(page));
+ flush_cache_range(vma, address,
+ address + HPAGE_PMD_SIZE);
entry = pmdp_invalidate(vma, address, pmd);
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
set_pmd_at(vma->vm_mm, address, pmd, entry);
ret = 1;
#else
- /* unexpected pmd-mapped page? */
+ /* unexpected pmd-mapped folio? */
WARN_ON_ONCE(1);
#endif
}
@@ -957,14 +995,25 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
* downgrading page table protection not changing it to point
* to a new page.
*
- * See Documentation/vm/mmu_notifier.rst
+ * See Documentation/mm/mmu_notifier.rst
*/
if (ret)
- (*cleaned)++;
+ cleaned++;
}
mmu_notifier_invalidate_range_end(&range);
+ return cleaned;
+}
+
+static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
+ unsigned long address, void *arg)
+{
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
+ int *cleaned = arg;
+
+ *cleaned += page_vma_mkclean_one(&pvmw);
+
return true;
}
@@ -976,7 +1025,7 @@ static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
return true;
}
-int page_mkclean(struct page *page)
+int folio_mkclean(struct folio *folio)
{
int cleaned = 0;
struct address_space *mapping;
@@ -986,20 +1035,75 @@ int page_mkclean(struct page *page)
.invalid_vma = invalid_mkclean_vma,
};
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
- if (!page_mapped(page))
+ if (!folio_mapped(folio))
return 0;
- mapping = page_mapping(page);
+ mapping = folio_mapping(folio);
if (!mapping)
return 0;
- rmap_walk(page, &rwc);
+ rmap_walk(folio, &rwc);
return cleaned;
}
-EXPORT_SYMBOL_GPL(page_mkclean);
+EXPORT_SYMBOL_GPL(folio_mkclean);
+
+/**
+ * pfn_mkclean_range - Cleans the PTEs (including PMDs) that map the range
+ *                     [@pfn, @pfn + @nr_pages) at the given offset (@pgoff)
+ *                     within the @vma of shared mappings. Since clean PTEs
+ *                     should also be read-only, they are write-protected too.
+ * @pfn: start pfn.
+ * @nr_pages: number of physically contiguous pages starting with @pfn.
+ * @pgoff: page offset at which @pfn is mapped.
+ * @vma: vma within which @pfn is mapped.
+ *
+ * Returns the number of cleaned PTEs (including PMDs).
+ */
+int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
+ struct vm_area_struct *vma)
+{
+ struct page_vma_mapped_walk pvmw = {
+ .pfn = pfn,
+ .nr_pages = nr_pages,
+ .pgoff = pgoff,
+ .vma = vma,
+ .flags = PVMW_SYNC,
+ };
+
+ if (invalid_mkclean_vma(vma, NULL))
+ return 0;
+
+ pvmw.address = vma_pgoff_address(pgoff, nr_pages, vma);
+ VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma);
+
+ return page_vma_mkclean_one(&pvmw);
+}
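
pfn_mkclean_range() lets a caller that tracks physical ranges rather than folios write-protect and clean every shared mapping of those pages. The sketch below is loosely modelled on how a filesystem-DAX style writeback path walks mapping->i_mmap; the function name model_mkclean_all() and the simplified locking are assumptions for illustration only.

        /* Illustrative only: clean all shared mappings of a contiguous pfn range. */
        static void model_mkclean_all(struct address_space *mapping,
                                      unsigned long pfn, unsigned long nr_pages,
                                      pgoff_t pgoff)
        {
                struct vm_area_struct *vma;

                i_mmap_lock_read(mapping);
                vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
                                          pgoff + nr_pages - 1) {
                        if (vma->vm_flags & VM_SHARED)
                                pfn_mkclean_range(pfn, nr_pages, pgoff, vma);
                }
                i_mmap_unlock_read(mapping);
        }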
+
+int folio_total_mapcount(struct folio *folio)
+{
+ int mapcount = folio_entire_mapcount(folio);
+ int nr_pages;
+ int i;
+
+ /* In the common case, avoid the loop when no pages mapped by PTE */
+ if (folio_nr_pages_mapped(folio) == 0)
+ return mapcount;
+ /*
+ * Add all the PTE mappings of those pages mapped by PTE.
+ * Limit the loop to folio_nr_pages_mapped()?
+ * Perhaps: given all the raciness, that may be a good or a bad idea.
+ */
+ nr_pages = folio_nr_pages(folio);
+ for (i = 0; i < nr_pages; i++)
+ mapcount += atomic_read(&folio_page(folio, i)->_mapcount);
+
+ /* But each of those _mapcounts was based on -1 */
+ mapcount += nr_pages;
+ return mapcount;
+}
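
Because each per-page _mapcount is stored biased by -1, folio_total_mapcount() can sum the raw values and correct for the bias once at the end. A standalone worked example (plain C, with illustrative numbers only):

        #include <assert.h>

        int main(void)
        {
                int nr_pages = 4;                       /* order-2 folio */
                int entire_mapcount = 1;                /* folio_entire_mapcount(), already unbiased */
                int raw_pte_mapcount[4] = { 0, 0, -1, -1 }; /* pages 0,1 also PTE-mapped once */
                int mapcount = entire_mapcount;

                for (int i = 0; i < nr_pages; i++)
                        mapcount += raw_pte_mapcount[i];  /* each still biased by -1 */
                mapcount += nr_pages;                     /* remove the per-page bias */

                assert(mapcount == 3);  /* 1 PMD map + 2 PTE maps */
                return 0;
        }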
/**
* page_move_anon_rmap - move a page to our anon_vma
@@ -1013,38 +1117,39 @@ EXPORT_SYMBOL_GPL(page_mkclean);
*/
void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
{
- struct anon_vma *anon_vma = vma->anon_vma;
-
- page = compound_head(page);
+ void *anon_vma = vma->anon_vma;
+ struct folio *folio = page_folio(page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_VMA(!anon_vma, vma);
- anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+ anon_vma += PAGE_MAPPING_ANON;
/*
* Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
- * simultaneously, so a concurrent reader (eg page_referenced()'s
- * PageAnon()) will not see one without the other.
+ * simultaneously, so a concurrent reader (eg folio_referenced()'s
+ * folio_test_anon()) will not see one without the other.
*/
- WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
+ WRITE_ONCE(folio->mapping, anon_vma);
+ SetPageAnonExclusive(page);
}
/**
* __page_set_anon_rmap - set up new anonymous rmap
- * @page: Page or Hugepage to add to rmap
+ * @folio: Folio which contains page.
+ * @page: Page to add to rmap.
* @vma: VM area to add page to.
- * @address: User virtual address of the mapping
+ * @address: User virtual address of the mapping
* @exclusive: the page is exclusively owned by the current process
*/
-static void __page_set_anon_rmap(struct page *page,
+static void __page_set_anon_rmap(struct folio *folio, struct page *page,
struct vm_area_struct *vma, unsigned long address, int exclusive)
{
struct anon_vma *anon_vma = vma->anon_vma;
BUG_ON(!anon_vma);
- if (PageAnon(page))
- return;
+ if (folio_test_anon(folio))
+ goto out;
/*
* If the page isn't exclusively mapped into this vma,
@@ -1054,9 +1159,18 @@ static void __page_set_anon_rmap(struct page *page,
if (!exclusive)
anon_vma = anon_vma->root;
+ /*
+ * page_idle does a lockless/optimistic rmap scan on folio->mapping.
+ * Make sure the compiler doesn't split the stores of anon_vma and
+ * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code
+ * could mistake the mapping for a struct address_space and crash.
+ */
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
- page->mapping = (struct address_space *) anon_vma;
- page->index = linear_page_index(vma, address);
+ WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma);
+ folio->index = linear_page_index(vma, address);
+out:
+ if (exclusive)
+ SetPageAnonExclusive(page);
}
/**
@@ -1068,19 +1182,20 @@ static void __page_set_anon_rmap(struct page *page,
static void __page_check_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
+ struct folio *folio = page_folio(page);
/*
* The page's anon-rmap details (mapping and index) are guaranteed to
* be set up correctly at this point.
*
* We have exclusion against page_add_anon_rmap because the caller
- * always holds the page locked, except if called from page_dup_rmap,
- * in which case the page is already known to be setup.
+ * always holds the page locked.
*
* We have exclusion against page_add_new_anon_rmap because those pages
* are initially only visible via the pagetables, and the pte is locked
* over the call to page_add_new_anon_rmap.
*/
- VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page);
+ VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
+ folio);
VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
page);
}
@@ -1090,332 +1205,289 @@ static void __page_check_anon_rmap(struct page *page,
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
- * @compound: charge the page as compound or small page
+ * @flags: the rmap flags
*
* The caller needs to hold the pte lock, and the page must be locked in
* the anon_vma case: to serialize mapping,index checking after setting,
* and to ensure that PageAnon is not being upgraded racily to PageKsm
* (but PageKsm is never downgraded to PageAnon).
*/
-void page_add_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address, bool compound)
-{
- do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
-}
-
-/*
- * Special version of the above for do_swap_page, which often runs
- * into pages that are exclusively owned by the current process.
- * Everybody else should continue to use page_add_anon_rmap above.
- */
-void do_page_add_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address, int flags)
+void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, rmap_t flags)
{
+ struct folio *folio = page_folio(page);
+ atomic_t *mapped = &folio->_nr_pages_mapped;
+ int nr = 0, nr_pmdmapped = 0;
bool compound = flags & RMAP_COMPOUND;
- bool first;
+ bool first = true;
- if (unlikely(PageKsm(page)))
- lock_page_memcg(page);
- else
- VM_BUG_ON_PAGE(!PageLocked(page), page);
-
- if (compound) {
- atomic_t *mapcount;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- mapcount = compound_mapcount_ptr(page);
- first = atomic_inc_and_test(mapcount);
- } else {
+ /* Is page being mapped by PTE? Is this its first map to be added? */
+ if (likely(!compound)) {
first = atomic_inc_and_test(&page->_mapcount);
+ nr = first;
+ if (first && folio_test_large(folio)) {
+ nr = atomic_inc_return_relaxed(mapped);
+ nr = (nr < COMPOUND_MAPPED);
+ }
+ } else if (folio_test_pmd_mappable(folio)) {
+ /* That test is redundant: it's for safety or to optimize out */
+
+ first = atomic_inc_and_test(&folio->_entire_mapcount);
+ if (first) {
+ nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped);
+ if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) {
+ nr_pmdmapped = folio_nr_pages(folio);
+ nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
+ /* Raced ahead of a remove and another add? */
+ if (unlikely(nr < 0))
+ nr = 0;
+ } else {
+ /* Raced ahead of a remove of COMPOUND_MAPPED */
+ nr = 0;
+ }
+ }
}
- if (first) {
- int nr = compound ? thp_nr_pages(page) : 1;
- /*
- * We use the irq-unsafe __{inc|mod}_zone_page_stat because
- * these counters are not modified in interrupt context, and
- * pte lock(a spinlock) is held, which implies preemption
- * disabled.
- */
- if (compound)
- __inc_lruvec_page_state(page, NR_ANON_THPS);
- __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
- }
+ VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
+ VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
- if (unlikely(PageKsm(page))) {
- unlock_page_memcg(page);
- return;
+ if (nr_pmdmapped)
+ __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped);
+ if (nr)
+ __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr);
+
+ if (likely(!folio_test_ksm(folio))) {
+ /* address might be in next vma when migration races vma_merge */
+ if (first)
+ __page_set_anon_rmap(folio, page, vma, address,
+ !!(flags & RMAP_EXCLUSIVE));
+ else
+ __page_check_anon_rmap(page, vma, address);
}
- /* address might be in next vma when migration races vma_adjust */
- if (first)
- __page_set_anon_rmap(page, vma, address,
- flags & RMAP_EXCLUSIVE);
- else
- __page_check_anon_rmap(page, vma, address);
+ mlock_vma_folio(folio, vma, compound);
}
/**
- * page_add_new_anon_rmap - add pte mapping to a new anonymous page
- * @page: the page to add the mapping to
+ * folio_add_new_anon_rmap - Add mapping to a new anonymous folio.
+ * @folio: The folio to add the mapping to.
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
- * @compound: charge the page as compound or small page
*
- * Same as page_add_anon_rmap but must only be called on *new* pages.
+ * Like page_add_anon_rmap() but must only be called on *new* folios.
* This means the inc-and-test can be bypassed.
- * Page does not have to be locked.
+ * The folio does not have to be locked.
+ *
+ * If the folio is large, it is accounted as a THP. As the folio
+ * is new, it's assumed to be mapped exclusively by a single process.
*/
-void page_add_new_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address, bool compound)
+void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
+ unsigned long address)
{
- int nr = compound ? thp_nr_pages(page) : 1;
+ int nr;
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
- __SetPageSwapBacked(page);
- if (compound) {
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- /* increment count (starts at -1) */
- atomic_set(compound_mapcount_ptr(page), 0);
- if (hpage_pincount_available(page))
- atomic_set(compound_pincount_ptr(page), 0);
+ __folio_set_swapbacked(folio);
- __inc_lruvec_page_state(page, NR_ANON_THPS);
+ if (likely(!folio_test_pmd_mappable(folio))) {
+ /* increment count (starts at -1) */
+ atomic_set(&folio->_mapcount, 0);
+ nr = 1;
} else {
- /* Anon THP always mapped first with PMD */
- VM_BUG_ON_PAGE(PageTransCompound(page), page);
/* increment count (starts at -1) */
- atomic_set(&page->_mapcount, 0);
+ atomic_set(&folio->_entire_mapcount, 0);
+ atomic_set(&folio->_nr_pages_mapped, COMPOUND_MAPPED);
+ nr = folio_nr_pages(folio);
+ __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr);
}
- __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
- __page_set_anon_rmap(page, vma, address, 1);
+
+ __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr);
+ __page_set_anon_rmap(folio, &folio->page, vma, address, 1);
}
/**
* page_add_file_rmap - add pte mapping to a file page
- * @page: the page to add the mapping to
- * @compound: charge the page as compound or small page
+ * @page: the page to add the mapping to
+ * @vma: the vm area in which the mapping is added
+ * @compound: charge the page as compound or small page
*
* The caller needs to hold the pte lock.
*/
-void page_add_file_rmap(struct page *page, bool compound)
+void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
+ bool compound)
{
- int i, nr = 1;
+ struct folio *folio = page_folio(page);
+ atomic_t *mapped = &folio->_nr_pages_mapped;
+ int nr = 0, nr_pmdmapped = 0;
+ bool first;
VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
- lock_page_memcg(page);
- if (compound && PageTransHuge(page)) {
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
- if (atomic_inc_and_test(&page[i]._mapcount))
- nr++;
- }
- if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
- goto out;
- if (PageSwapBacked(page))
- __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
- else
- __inc_node_page_state(page, NR_FILE_PMDMAPPED);
- } else {
- if (PageTransCompound(page) && page_mapping(page)) {
- VM_WARN_ON_ONCE(!PageLocked(page));
-
- SetPageDoubleMap(compound_head(page));
- if (PageMlocked(page))
- clear_page_mlock(compound_head(page));
- }
- if (!atomic_inc_and_test(&page->_mapcount))
- goto out;
- }
- __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
-out:
- unlock_page_memcg(page);
-}
-
-static void page_remove_file_rmap(struct page *page, bool compound)
-{
- int i, nr = 1;
-
- VM_BUG_ON_PAGE(compound && !PageHead(page), page);
- /* Hugepages are not counted in NR_FILE_MAPPED for now. */
- if (unlikely(PageHuge(page))) {
- /* hugetlb pages are always mapped with pmds */
- atomic_dec(compound_mapcount_ptr(page));
- return;
- }
-
- /* page still mapped by someone else? */
- if (compound && PageTransHuge(page)) {
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
- if (atomic_add_negative(-1, &page[i]._mapcount))
- nr++;
+ /* Is page being mapped by PTE? Is this its first map to be added? */
+ if (likely(!compound)) {
+ first = atomic_inc_and_test(&page->_mapcount);
+ nr = first;
+ if (first && folio_test_large(folio)) {
+ nr = atomic_inc_return_relaxed(mapped);
+ nr = (nr < COMPOUND_MAPPED);
}
- if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
- return;
- if (PageSwapBacked(page))
- __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
- else
- __dec_node_page_state(page, NR_FILE_PMDMAPPED);
- } else {
- if (!atomic_add_negative(-1, &page->_mapcount))
- return;
- }
-
- /*
- * We use the irq-unsafe __{inc|mod}_lruvec_page_state because
- * these counters are not modified in interrupt context, and
- * pte lock(a spinlock) is held, which implies preemption disabled.
- */
- __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
-
- if (unlikely(PageMlocked(page)))
- clear_page_mlock(page);
-}
-
-static void page_remove_anon_compound_rmap(struct page *page)
-{
- int i, nr;
-
- if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
- return;
-
- /* Hugepages are not counted in NR_ANON_PAGES for now. */
- if (unlikely(PageHuge(page)))
- return;
-
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
- return;
-
- __dec_lruvec_page_state(page, NR_ANON_THPS);
-
- if (TestClearPageDoubleMap(page)) {
- /*
- * Subpages can be mapped with PTEs too. Check how many of
- * them are still mapped.
- */
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
- if (atomic_add_negative(-1, &page[i]._mapcount))
- nr++;
+ } else if (folio_test_pmd_mappable(folio)) {
+ /* That test is redundant: it's for safety or to optimize out */
+
+ first = atomic_inc_and_test(&folio->_entire_mapcount);
+ if (first) {
+ nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped);
+ if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) {
+ nr_pmdmapped = folio_nr_pages(folio);
+ nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
+ /* Raced ahead of a remove and another add? */
+ if (unlikely(nr < 0))
+ nr = 0;
+ } else {
+ /* Raced ahead of a remove of COMPOUND_MAPPED */
+ nr = 0;
+ }
}
-
- /*
- * Queue the page for deferred split if at least one small
- * page of the compound page is unmapped, but at least one
- * small page is still mapped.
- */
- if (nr && nr < HPAGE_PMD_NR)
- deferred_split_huge_page(page);
- } else {
- nr = HPAGE_PMD_NR;
}
- if (unlikely(PageMlocked(page)))
- clear_page_mlock(page);
-
+ if (nr_pmdmapped)
+ __lruvec_stat_mod_folio(folio, folio_test_swapbacked(folio) ?
+ NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped);
if (nr)
- __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
+ __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr);
+
+ mlock_vma_folio(folio, vma, compound);
}
/**
* page_remove_rmap - take down pte mapping from a page
* @page: page to remove mapping from
+ * @vma: the vm area from which the mapping is removed
* @compound: uncharge the page as compound or small page
*
* The caller needs to hold the pte lock.
*/
-void page_remove_rmap(struct page *page, bool compound)
+void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
+ bool compound)
{
- lock_page_memcg(page);
+ struct folio *folio = page_folio(page);
+ atomic_t *mapped = &folio->_nr_pages_mapped;
+ int nr = 0, nr_pmdmapped = 0;
+ bool last;
+ enum node_stat_item idx;
- if (!PageAnon(page)) {
- page_remove_file_rmap(page, compound);
- goto out;
- }
+ VM_BUG_ON_PAGE(compound && !PageHead(page), page);
- if (compound) {
- page_remove_anon_compound_rmap(page);
- goto out;
+ /* Hugetlb pages are not counted in NR_*MAPPED */
+ if (unlikely(folio_test_hugetlb(folio))) {
+ /* hugetlb pages are always mapped with pmds */
+ atomic_dec(&folio->_entire_mapcount);
+ return;
}
- /* page still mapped by someone else? */
- if (!atomic_add_negative(-1, &page->_mapcount))
- goto out;
-
- /*
- * We use the irq-unsafe __{inc|mod}_zone_page_stat because
- * these counters are not modified in interrupt context, and
- * pte lock(a spinlock) is held, which implies preemption disabled.
- */
- __dec_lruvec_page_state(page, NR_ANON_MAPPED);
+ /* Is page being unmapped by PTE? Is this its last map to be removed? */
+ if (likely(!compound)) {
+ last = atomic_add_negative(-1, &page->_mapcount);
+ nr = last;
+ if (last && folio_test_large(folio)) {
+ nr = atomic_dec_return_relaxed(mapped);
+ nr = (nr < COMPOUND_MAPPED);
+ }
+ } else if (folio_test_pmd_mappable(folio)) {
+ /* That test is redundant: it's for safety or to optimize out */
+
+ last = atomic_add_negative(-1, &folio->_entire_mapcount);
+ if (last) {
+ nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped);
+ if (likely(nr < COMPOUND_MAPPED)) {
+ nr_pmdmapped = folio_nr_pages(folio);
+ nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
+ /* Raced ahead of another remove and an add? */
+ if (unlikely(nr < 0))
+ nr = 0;
+ } else {
+ /* An add of COMPOUND_MAPPED raced ahead */
+ nr = 0;
+ }
+ }
+ }
- if (unlikely(PageMlocked(page)))
- clear_page_mlock(page);
+ if (nr_pmdmapped) {
+ if (folio_test_anon(folio))
+ idx = NR_ANON_THPS;
+ else if (folio_test_swapbacked(folio))
+ idx = NR_SHMEM_PMDMAPPED;
+ else
+ idx = NR_FILE_PMDMAPPED;
+ __lruvec_stat_mod_folio(folio, idx, -nr_pmdmapped);
+ }
+ if (nr) {
+ idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
+ __lruvec_stat_mod_folio(folio, idx, -nr);
- if (PageTransCompound(page))
- deferred_split_huge_page(compound_head(page));
+ /*
+ * Queue anon THP for deferred split if at least one
+ * page of the folio is unmapped and at least one page
+ * is still mapped.
+ */
+ if (folio_test_pmd_mappable(folio) && folio_test_anon(folio))
+ if (!compound || nr < nr_pmdmapped)
+ deferred_split_folio(folio);
+ }
/*
- * It would be tidy to reset the PageAnon mapping here,
- * but that might overwrite a racing page_add_anon_rmap
- * which increments mapcount after us but sets mapping
- * before us: so leave the reset to free_unref_page,
- * and remember that it's only reliable while mapped.
- * Leaving it set also helps swapoff to reinstate ptes
- * faster for those pages still in swapcache.
+ * It would be tidy to reset folio_test_anon mapping when fully
+ * unmapped, but that might overwrite a racing page_add_anon_rmap
+ * which increments mapcount after us but sets mapping before us:
+ * so leave the reset to free_pages_prepare, and remember that
+ * it's only reliable while mapped.
*/
-out:
- unlock_page_memcg(page);
+
+ munlock_vma_folio(folio, vma, compound);
}
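
page_add_anon_rmap(), page_add_file_rmap() and page_remove_rmap() above all share the same _nr_pages_mapped encoding: the low bits count pages of a large folio mapped by PTE, and COMPOUND_MAPPED is added once per PMD mapping. The standalone model below (plain C with stdatomic, not kernel code) shows only the statistics arithmetic; it assumes every call maps a distinct page for the first time, so the per-page _mapcount inc-and-test that gates the real code is omitted, and the COMPOUND_MAPPED value is copied here purely for illustration.

        #include <assert.h>
        #include <stdatomic.h>

        #define COMPOUND_MAPPED         0x800000
        #define FOLIO_PAGES_MAPPED      (COMPOUND_MAPPED - 1)

        static atomic_int nr_pages_mapped;      /* folio->_nr_pages_mapped */

        /* Returns how many pages newly count towards NR_ANON_MAPPED / NR_FILE_MAPPED. */
        static int model_add_pte_map(void)
        {
                int nr = atomic_fetch_add(&nr_pages_mapped, 1) + 1;

                /* Only counts for the stats if no PMD mapping exists yet. */
                return nr < COMPOUND_MAPPED;
        }

        static int model_add_pmd_map(int folio_nr_pages)
        {
                int nr = atomic_fetch_add(&nr_pages_mapped, COMPOUND_MAPPED) + COMPOUND_MAPPED;

                if (nr < COMPOUND_MAPPED + COMPOUND_MAPPED)     /* first PMD mapping */
                        return folio_nr_pages - (nr & FOLIO_PAGES_MAPPED);
                return 0;
        }

        int main(void)
        {
                /* Two pages PTE-mapped first, then the whole 512-page folio by PMD. */
                assert(model_add_pte_map() == 1);
                assert(model_add_pte_map() == 1);
                assert(model_add_pmd_map(512) == 510);  /* only the not-yet-counted pages */
                assert(model_add_pte_map() == 0);       /* PMD map already accounts for it */
                return 0;
        }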
/*
* @arg: enum ttu_flags will be passed to this argument
*/
-static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
- struct page_vma_mapped_walk pvmw = {
- .page = page,
- .vma = vma,
- .address = address,
- };
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
pte_t pteval;
struct page *subpage;
- bool ret = true;
+ bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
+ unsigned long pfn;
- /* munlock has nothing to gain from examining un-locked vmas */
- if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
- return true;
-
- if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
- is_zone_device_page(page) && !is_device_private_page(page))
- return true;
+ /*
+ * When racing against e.g. zap_pte_range() on another cpu,
+ * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+ * try_to_unmap() may return before page_mapped() has become false,
+ * if page table locking is skipped: use TTU_SYNC to wait for that.
+ */
+ if (flags & TTU_SYNC)
+ pvmw.flags = PVMW_SYNC;
- if (flags & TTU_SPLIT_HUGE_PMD) {
- split_huge_pmd_address(vma, address,
- flags & TTU_SPLIT_FREEZE, page);
- }
+ if (flags & TTU_SPLIT_HUGE_PMD)
+ split_huge_pmd_address(vma, address, false, folio);
/*
* For THP, we have to assume the worse case ie pmd for invalidation.
* For hugetlb, it could be much worse if we need to do pud
* invalidation in the case of pmd sharing.
*
- * Note that the page can not be free in this function as call of
- * try_to_unmap() must hold a reference on the page.
+ * Note that the folio can not be freed in this function as call of
+ * try_to_unmap() must hold a reference on the folio.
*/
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- address,
- min(vma->vm_end, address + page_size(page)));
- if (PageHuge(page)) {
+ range.end = vma_address_end(&pvmw);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
+ address, range.end);
+ if (folio_test_hugetlb(folio)) {
/*
* If sharing is possible, start and end will be adjusted
* accordingly.
- *
- * If called for a huge page, caller must hold i_mmap_rwsem
- * in write mode as it is possible to call huge_pmd_unshare.
*/
adjust_range_if_pmd_sharing_possible(vma, &range.start,
&range.end);
@@ -1423,162 +1495,124 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
- /* PMD-mapped THP migration entry */
- if (!pvmw.pte && (flags & TTU_MIGRATION)) {
- VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
-
- set_pmd_migration_entry(&pvmw, page);
- continue;
- }
-#endif
+ /* Unexpected PMD-mapped THP? */
+ VM_BUG_ON_FOLIO(!pvmw.pte, folio);
/*
- * If the page is mlock()d, we cannot swap it out.
- * If it's recently referenced (perhaps page_referenced
- * skipped over this mm) then we should reactivate it.
+ * If the folio is in an mlock()d vma, we must not swap it out.
*/
- if (!(flags & TTU_IGNORE_MLOCK)) {
- if (vma->vm_flags & VM_LOCKED) {
- /* PTE-mapped THP are never mlocked */
- if (!PageTransCompound(page)) {
- /*
- * Holding pte lock, we do *not* need
- * mmap_lock here
- */
- mlock_vma_page(page);
- }
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
- if (flags & TTU_MUNLOCK)
- continue;
+ if (!(flags & TTU_IGNORE_MLOCK) &&
+ (vma->vm_flags & VM_LOCKED)) {
+ /* Restore the mlock which got missed */
+ mlock_vma_folio(folio, vma, false);
+ page_vma_mapped_walk_done(&pvmw);
+ ret = false;
+ break;
}
- /* Unexpected PMD-mapped THP? */
- VM_BUG_ON_PAGE(!pvmw.pte, page);
-
- subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+ pfn = pte_pfn(ptep_get(pvmw.pte));
+ subpage = folio_page(folio, pfn - folio_pfn(folio));
address = pvmw.address;
+ anon_exclusive = folio_test_anon(folio) &&
+ PageAnonExclusive(subpage);
- if (PageHuge(page)) {
- /*
- * To call huge_pmd_unshare, i_mmap_rwsem must be
- * held in write mode. Caller needs to explicitly
- * do this outside rmap routines.
- */
- VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
- if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
- /*
- * huge_pmd_unshare unmapped an entire PMD
- * page. There is no way of knowing exactly
- * which PMDs may be cached for this mm, so
- * we must flush them all. start/end were
- * already adjusted above to cover this range.
- */
- flush_cache_range(vma, range.start, range.end);
- flush_tlb_range(vma, range.start, range.end);
- mmu_notifier_invalidate_range(mm, range.start,
- range.end);
-
- /*
- * The ref count of the PMD page was dropped
- * which is part of the way map counting
- * is done for shared PMDs. Return 'true'
- * here. When there is no other sharing,
- * huge_pmd_unshare returns false and we will
- * unmap the actual page and drop map count
- * to zero.
- */
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
- }
-
- if (IS_ENABLED(CONFIG_MIGRATION) &&
- (flags & TTU_MIGRATION) &&
- is_zone_device_page(page)) {
- swp_entry_t entry;
- pte_t swp_pte;
-
- pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);
+ if (folio_test_hugetlb(folio)) {
+ bool anon = folio_test_anon(folio);
/*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
+ * try_to_unmap() is only passed a hugetlb page in the
+ * case where the hugetlb page is poisoned.
*/
- entry = make_migration_entry(page, 0);
- swp_pte = swp_entry_to_pte(entry);
-
+ VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
/*
- * pteval maps a zone device page and is therefore
- * a swap pte.
+ * huge_pmd_unshare may unmap an entire PMD page.
+ * There is no way of knowing exactly which PMDs may
+ * be cached for this mm, so we must flush them all.
+ * start/end were already adjusted above to cover this
+ * range.
*/
- if (pte_swp_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_swp_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
- set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+ flush_cache_range(vma, range.start, range.end);
+
/*
- * No need to invalidate here it will synchronize on
- * against the special swap migration pte.
+ * To call huge_pmd_unshare, i_mmap_rwsem must be
+ * held in write mode. Caller needs to explicitly
+ * do this outside rmap routines.
*
- * The assignment to subpage above was computed from a
- * swap PTE which results in an invalid pointer.
- * Since only PAGE_SIZE pages can currently be
- * migrated, just set it to page. This will need to be
- * changed when hugepage migrations to device private
- * memory are supported.
+ * We also must hold hugetlb vma_lock in write mode.
+ * Lock order dictates acquiring vma_lock BEFORE
+ * i_mmap_rwsem. We can only try lock here and fail
+ * if unsuccessful.
*/
- subpage = page;
- goto discard;
- }
+ if (!anon) {
+ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+ if (!hugetlb_vma_trylock_write(vma)) {
+ page_vma_mapped_walk_done(&pvmw);
+ ret = false;
+ break;
+ }
+ if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+ hugetlb_vma_unlock_write(vma);
+ flush_tlb_range(vma,
+ range.start, range.end);
+ mmu_notifier_invalidate_range(mm,
+ range.start, range.end);
+ /*
+ * The ref count of the PMD page was
+ * dropped which is part of the way map
+ * counting is done for shared PMDs.
+ * Return 'true' here. When there is
+ * no other sharing, huge_pmd_unshare
+ * returns false and we will unmap the
+ * actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ hugetlb_vma_unlock_write(vma);
+ }
+ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
+ } else {
+ flush_cache_page(vma, address, pfn);
+ /* Nuke the page table entry. */
+ if (should_defer_flush(mm, flags)) {
+ /*
+ * We clear the PTE but do not flush so potentially
+ * a remote CPU could still be writing to the folio.
+ * If the entry was previously clean then the
+ * architecture must guarantee that a clear->dirty
+ * transition on a cached TLB entry is written through
+ * and traps if the PTE is unmapped.
+ */
+ pteval = ptep_get_and_clear(mm, address, pvmw.pte);
- if (!(flags & TTU_IGNORE_ACCESS)) {
- if (ptep_clear_flush_young_notify(vma, address,
- pvmw.pte)) {
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
+ set_tlb_ubc_flush_pending(mm, pteval);
+ } else {
+ pteval = ptep_clear_flush(vma, address, pvmw.pte);
}
}
- /* Nuke the page table entry. */
- flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
- if (should_defer_flush(mm, flags)) {
- /*
- * We clear the PTE but do not flush so potentially
- * a remote CPU could still be writing to the page.
- * If the entry was previously clean then the
- * architecture must guarantee that a clear->dirty
- * transition on a cached TLB entry is written through
- * and traps if the PTE is unmapped.
- */
- pteval = ptep_get_and_clear(mm, address, pvmw.pte);
-
- set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
- } else {
- pteval = ptep_clear_flush(vma, address, pvmw.pte);
- }
+ /*
+ * Now the pte is cleared. If this pte was uffd-wp armed,
+ * we may want to replace a none pte with a marker pte if
+ * it's file-backed, so we don't lose the tracking info.
+ */
+ pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
- /* Move the dirty bit to the page. Now the pte is gone. */
+ /* Set the dirty flag on the folio now the pte is gone. */
if (pte_dirty(pteval))
- set_page_dirty(page);
+ folio_mark_dirty(folio);
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
- if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
+ if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
- if (PageHuge(page)) {
- hugetlb_count_sub(compound_nr(page), mm);
- set_huge_swap_pte_at(mm, address,
- pvmw.pte, pteval,
- vma_mmu_pagesize(vma));
+ if (folio_test_hugetlb(folio)) {
+ hugetlb_count_sub(folio_nr_pages(folio), mm);
+ set_huge_pte_at(mm, address, pvmw.pte, pteval);
} else {
- dec_mm_counter(mm, mm_counter(page));
+ dec_mm_counter(mm, mm_counter(&folio->page));
set_pte_at(mm, address, pvmw.pte, pteval);
}
@@ -1593,47 +1627,19 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* migration) will not expect userfaults on already
* copied pages.
*/
- dec_mm_counter(mm, mm_counter(page));
+ dec_mm_counter(mm, mm_counter(&folio->page));
/* We have to invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm, address,
address + PAGE_SIZE);
- } else if (IS_ENABLED(CONFIG_MIGRATION) &&
- (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
- swp_entry_t entry;
- pte_t swp_pte;
-
- if (arch_unmap_one(mm, vma, address, pteval) < 0) {
- set_pte_at(mm, address, pvmw.pte, pteval);
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
-
- /*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
- */
- entry = make_migration_entry(subpage,
- pte_write(pteval));
- swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
- set_pte_at(mm, address, pvmw.pte, swp_pte);
- /*
- * No need to invalidate here it will synchronize on
- * against the special swap migration pte.
- */
- } else if (PageAnon(page)) {
+ } else if (folio_test_anon(folio)) {
swp_entry_t entry = { .val = page_private(subpage) };
pte_t swp_pte;
/*
* Store the swap location in the pte.
* See handle_pte_fault() ...
*/
- if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
+ if (unlikely(folio_test_swapbacked(folio) !=
+ folio_test_swapcache(folio))) {
WARN_ON_ONCE(1);
ret = false;
/* We have to invalidate as we cleared the pte */
@@ -1644,8 +1650,31 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
/* MADV_FREE page check */
- if (!PageSwapBacked(page)) {
- if (!PageDirty(page)) {
+ if (!folio_test_swapbacked(folio)) {
+ int ref_count, map_count;
+
+ /*
+ * Synchronize with gup_pte_range():
+ * - clear PTE; barrier; read refcount
+ * - inc refcount; barrier; read PTE
+ */
+ smp_mb();
+
+ ref_count = folio_ref_count(folio);
+ map_count = folio_mapcount(folio);
+
+ /*
+ * Order reads for page refcount and dirty flag
+ * (see comments in __remove_mapping()).
+ */
+ smp_rmb();
+
+ /*
+ * The only page refs must be one from isolation
+ * plus the rmap(s) (dropped by discard:).
+ */
+ if (ref_count == 1 + map_count &&
+ !folio_test_dirty(folio)) {
/* Invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm,
address, address + PAGE_SIZE);
@@ -1654,11 +1683,11 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
/*
- * If the page was redirtied, it cannot be
+ * If the folio was redirtied, it cannot be
* discarded. Remap the page to page table.
*/
set_pte_at(mm, address, pvmw.pte, pteval);
- SetPageSwapBacked(page);
+ folio_set_swapbacked(folio);
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
@@ -1671,6 +1700,17 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
break;
}
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
+ swap_free(entry);
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+
+ /* See page_try_share_anon_rmap(): clear PTE first. */
+ if (anon_exclusive &&
+ page_try_share_anon_rmap(subpage)) {
+ swap_free(entry);
set_pte_at(mm, address, pvmw.pte, pteval);
ret = false;
page_vma_mapped_walk_done(&pvmw);
@@ -1685,6 +1725,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
dec_mm_counter(mm, MM_ANONPAGES);
inc_mm_counter(mm, MM_SWAPENTS);
swp_pte = swp_entry_to_pte(entry);
+ if (anon_exclusive)
+ swp_pte = pte_swp_mkexclusive(swp_pte);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
if (pte_uffd_wp(pteval))
@@ -1695,16 +1737,17 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
address + PAGE_SIZE);
} else {
/*
- * This is a locked file-backed page, thus it cannot
- * be removed from the page cache and replaced by a new
- * page before mmu_notifier_invalidate_range_end, so no
- * concurrent thread might update its page table to
- * point at new page while a device still is using this
- * page.
+					 * This is a locked file-backed folio:
+					 * it cannot be removed from the page
+					 * cache and replaced by a new folio before
+					 * mmu_notifier_invalidate_range_end, so no
+					 * concurrent thread can update its page table
+					 * to point at a new folio while a device is
+					 * still using this one.
*
- * See Documentation/vm/mmu_notifier.rst
+ * See Documentation/mm/mmu_notifier.rst
*/
- dec_mm_counter(mm, mm_counter_file(page));
+ dec_mm_counter(mm, mm_counter_file(&folio->page));
}
discard:
/*
@@ -1712,10 +1755,12 @@ discard:
* done above for all cases requiring it to happen under page
* table lock before mmu_notifier_invalidate_range_end()
*
- * See Documentation/vm/mmu_notifier.rst
+ * See Documentation/mm/mmu_notifier.rst
*/
- page_remove_rmap(subpage, PageHuge(page));
- put_page(page);
+ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_drain_local();
+ folio_put(folio);
}
mmu_notifier_invalidate_range_end(&range);
@@ -1728,30 +1773,387 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
return vma_is_temporary_stack(vma);
}
-static int page_mapcount_is_zero(struct page *page)
+static int folio_not_mapped(struct folio *folio)
{
- return !total_mapcount(page);
+ return !folio_mapped(folio);
}
/**
- * try_to_unmap - try to remove all page table mappings to a page
- * @page: the page to get unmapped
+ * try_to_unmap - Try to remove all page table mappings to a folio.
+ * @folio: The folio to unmap.
* @flags: action and flags
*
* Tries to remove all the page table entries which are mapping this
- * page, used in the pageout path. Caller must hold the page lock.
+ * folio. It is the caller's responsibility to check if the folio is
+ * still mapped if needed (use TTU_SYNC to prevent accounting races).
*
- * If unmap is successful, return true. Otherwise, false.
+ * Context: Caller must hold the folio lock.
*/
-bool try_to_unmap(struct page *page, enum ttu_flags flags)
+void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
.arg = (void *)flags,
- .done = page_mapcount_is_zero,
- .anon_lock = page_lock_anon_vma_read,
+ .done = folio_not_mapped,
+ .anon_lock = folio_lock_anon_vma_read,
};
+ if (flags & TTU_RMAP_LOCKED)
+ rmap_walk_locked(folio, &rwc);
+ else
+ rmap_walk(folio, &rwc);
+}
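
For context, a minimal sketch (not part of this patch; helper name invented) of how a reclaim-side caller is expected to drive the interface above now that try_to_unmap() returns void — the real user is the shrink_folio_list() path:

/* Hedged sketch: lock the folio, unmap it, then re-check the result. */
static bool example_try_unmap_locked(struct folio *folio)
{
	if (!folio_trylock(folio))
		return false;

	/* TTU_SYNC waits out racing page-table walkers so the
	 * folio_mapped() check below is not spuriously true. */
	try_to_unmap(folio, TTU_SYNC);

	if (folio_mapped(folio)) {
		folio_unlock(folio);
		return false;
	}
	folio_unlock(folio);
	return true;
}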
+
+/*
+ * @arg: an enum ttu_flags value is passed through this argument.
+ *
+ * If TTU_SPLIT_HUGE_PMD is specified, any PMD mappings will be split into PTEs
+ * containing migration entries.
+ */
+static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
+ unsigned long address, void *arg)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+ pte_t pteval;
+ struct page *subpage;
+ bool anon_exclusive, ret = true;
+ struct mmu_notifier_range range;
+ enum ttu_flags flags = (enum ttu_flags)(long)arg;
+ unsigned long pfn;
+
+ /*
+ * When racing against e.g. zap_pte_range() on another cpu,
+ * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+ * try_to_migrate() may return before page_mapped() has become false,
+ * if page table locking is skipped: use TTU_SYNC to wait for that.
+ */
+ if (flags & TTU_SYNC)
+ pvmw.flags = PVMW_SYNC;
+
+ /*
+ * unmap_page() in mm/huge_memory.c is the only user of migration with
+ * TTU_SPLIT_HUGE_PMD and it wants to freeze.
+ */
+ if (flags & TTU_SPLIT_HUGE_PMD)
+ split_huge_pmd_address(vma, address, true, folio);
+
+ /*
+	 * For THP, we have to assume the worst case, i.e. pmd, for invalidation.
+ * For hugetlb, it could be much worse if we need to do pud
+ * invalidation in the case of pmd sharing.
+ *
+	 * Note that the folio cannot be freed in this function, as the
+	 * caller must hold a reference on it.
+ */
+ range.end = vma_address_end(&pvmw);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
+ address, range.end);
+ if (folio_test_hugetlb(folio)) {
+ /*
+ * If sharing is possible, start and end will be adjusted
+ * accordingly.
+ */
+ adjust_range_if_pmd_sharing_possible(vma, &range.start,
+ &range.end);
+ }
+ mmu_notifier_invalidate_range_start(&range);
+
+ while (page_vma_mapped_walk(&pvmw)) {
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ /* PMD-mapped THP migration entry */
+ if (!pvmw.pte) {
+ subpage = folio_page(folio,
+ pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
+ VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
+ !folio_test_pmd_mappable(folio), folio);
+
+ if (set_pmd_migration_entry(&pvmw, subpage)) {
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ continue;
+ }
+#endif
+
+ /* Unexpected PMD-mapped THP? */
+ VM_BUG_ON_FOLIO(!pvmw.pte, folio);
+
+ pfn = pte_pfn(ptep_get(pvmw.pte));
+
+ if (folio_is_zone_device(folio)) {
+ /*
+ * Our PTE is a non-present device exclusive entry and
+ * calculating the subpage as for the common case would
+ * result in an invalid pointer.
+ *
+ * Since only PAGE_SIZE pages can currently be
+ * migrated, just set it to page. This will need to be
+ * changed when hugepage migrations to device private
+ * memory are supported.
+ */
+ VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio);
+ subpage = &folio->page;
+ } else {
+ subpage = folio_page(folio, pfn - folio_pfn(folio));
+ }
+ address = pvmw.address;
+ anon_exclusive = folio_test_anon(folio) &&
+ PageAnonExclusive(subpage);
+
+ if (folio_test_hugetlb(folio)) {
+ bool anon = folio_test_anon(folio);
+
+ /*
+ * huge_pmd_unshare may unmap an entire PMD page.
+ * There is no way of knowing exactly which PMDs may
+ * be cached for this mm, so we must flush them all.
+ * start/end were already adjusted above to cover this
+ * range.
+ */
+ flush_cache_range(vma, range.start, range.end);
+
+ /*
+ * To call huge_pmd_unshare, i_mmap_rwsem must be
+ * held in write mode. Caller needs to explicitly
+ * do this outside rmap routines.
+ *
+ * We also must hold hugetlb vma_lock in write mode.
+ * Lock order dictates acquiring vma_lock BEFORE
+ * i_mmap_rwsem. We can only try lock here and
+ * fail if unsuccessful.
+ */
+ if (!anon) {
+ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+ if (!hugetlb_vma_trylock_write(vma)) {
+ page_vma_mapped_walk_done(&pvmw);
+ ret = false;
+ break;
+ }
+ if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+ hugetlb_vma_unlock_write(vma);
+ flush_tlb_range(vma,
+ range.start, range.end);
+ mmu_notifier_invalidate_range(mm,
+ range.start, range.end);
+
+ /*
+ * The ref count of the PMD page was
+ * dropped which is part of the way map
+ * counting is done for shared PMDs.
+ * Return 'true' here. When there is
+ * no other sharing, huge_pmd_unshare
+ * returns false and we will unmap the
+ * actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ hugetlb_vma_unlock_write(vma);
+ }
+ /* Nuke the hugetlb page table entry */
+ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
+ } else {
+ flush_cache_page(vma, address, pfn);
+ /* Nuke the page table entry. */
+ if (should_defer_flush(mm, flags)) {
+ /*
+ * We clear the PTE but do not flush so potentially
+ * a remote CPU could still be writing to the folio.
+ * If the entry was previously clean then the
+ * architecture must guarantee that a clear->dirty
+ * transition on a cached TLB entry is written through
+ * and traps if the PTE is unmapped.
+ */
+ pteval = ptep_get_and_clear(mm, address, pvmw.pte);
+
+ set_tlb_ubc_flush_pending(mm, pteval);
+ } else {
+ pteval = ptep_clear_flush(vma, address, pvmw.pte);
+ }
+ }
+
+ /* Set the dirty flag on the folio now the pte is gone. */
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+
+ /* Update high watermark before we lower rss */
+ update_hiwater_rss(mm);
+
+ if (folio_is_device_private(folio)) {
+ unsigned long pfn = folio_pfn(folio);
+ swp_entry_t entry;
+ pte_t swp_pte;
+
+ if (anon_exclusive)
+ BUG_ON(page_try_share_anon_rmap(subpage));
+
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ entry = pte_to_swp_entry(pteval);
+ if (is_writable_device_private_entry(entry))
+ entry = make_writable_migration_entry(pfn);
+ else if (anon_exclusive)
+ entry = make_readable_exclusive_migration_entry(pfn);
+ else
+ entry = make_readable_migration_entry(pfn);
+ swp_pte = swp_entry_to_pte(entry);
+
+ /*
+ * pteval maps a zone device page and is therefore
+ * a swap pte.
+ */
+ if (pte_swp_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+ trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
+ compound_order(&folio->page));
+ /*
+			 * No need to invalidate here; it will synchronize
+			 * against the special swap migration pte.
+ */
+ } else if (PageHWPoison(subpage)) {
+ pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
+ if (folio_test_hugetlb(folio)) {
+ hugetlb_count_sub(folio_nr_pages(folio), mm);
+ set_huge_pte_at(mm, address, pvmw.pte, pteval);
+ } else {
+ dec_mm_counter(mm, mm_counter(&folio->page));
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ }
+
+ } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
+ /*
+ * The guest indicated that the page content is of no
+			 * interest anymore. Simply discard the pte; vmscan
+			 * will take care of the rest.
+ * A future reference will then fault in a new zero
+ * page. When userfaultfd is active, we must not drop
+ * this page though, as its main user (postcopy
+ * migration) will not expect userfaults on already
+ * copied pages.
+ */
+ dec_mm_counter(mm, mm_counter(&folio->page));
+ /* We have to invalidate as we cleared the pte */
+ mmu_notifier_invalidate_range(mm, address,
+ address + PAGE_SIZE);
+ } else {
+ swp_entry_t entry;
+ pte_t swp_pte;
+
+ if (arch_unmap_one(mm, vma, address, pteval) < 0) {
+ if (folio_test_hugetlb(folio))
+ set_huge_pte_at(mm, address, pvmw.pte, pteval);
+ else
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
+ !anon_exclusive, subpage);
+
+ /* See page_try_share_anon_rmap(): clear PTE first. */
+ if (anon_exclusive &&
+ page_try_share_anon_rmap(subpage)) {
+ if (folio_test_hugetlb(folio))
+ set_huge_pte_at(mm, address, pvmw.pte, pteval);
+ else
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ if (pte_write(pteval))
+ entry = make_writable_migration_entry(
+ page_to_pfn(subpage));
+ else if (anon_exclusive)
+ entry = make_readable_exclusive_migration_entry(
+ page_to_pfn(subpage));
+ else
+ entry = make_readable_migration_entry(
+ page_to_pfn(subpage));
+ if (pte_young(pteval))
+ entry = make_migration_entry_young(entry);
+ if (pte_dirty(pteval))
+ entry = make_migration_entry_dirty(entry);
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ if (folio_test_hugetlb(folio))
+ set_huge_pte_at(mm, address, pvmw.pte, swp_pte);
+ else
+ set_pte_at(mm, address, pvmw.pte, swp_pte);
+ trace_set_migration_pte(address, pte_val(swp_pte),
+ compound_order(&folio->page));
+ /*
+			 * No need to invalidate here; it will synchronize
+			 * against the special swap migration pte.
+ */
+ }
+
+ /*
+		 * No need to call mmu_notifier_invalidate_range(); it has been
+		 * done above for all cases requiring it to happen under the
+		 * page table lock, before mmu_notifier_invalidate_range_end().
+ *
+ * See Documentation/mm/mmu_notifier.rst
+ */
+ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_drain_local();
+ folio_put(folio);
+ }
+
+ mmu_notifier_invalidate_range_end(&range);
+
+ return ret;
+}
+
+/**
+ * try_to_migrate - try to replace all page table mappings with swap entries
+ * @folio: the folio to replace page table entries for
+ * @flags: action and flags
+ *
+ * Tries to remove all the page table entries which are mapping this folio and
+ * replace them with special swap entries. Caller must hold the folio lock.
+ */
+void try_to_migrate(struct folio *folio, enum ttu_flags flags)
+{
+ struct rmap_walk_control rwc = {
+ .rmap_one = try_to_migrate_one,
+ .arg = (void *)flags,
+ .done = folio_not_mapped,
+ .anon_lock = folio_lock_anon_vma_read,
+ };
+
+ /*
+	 * Migration always ignores mlock and only supports the TTU_RMAP_LOCKED,
+	 * TTU_SPLIT_HUGE_PMD, TTU_SYNC, and TTU_BATCH_FLUSH flags.
+ */
+ if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
+ TTU_SYNC | TTU_BATCH_FLUSH)))
+ return;
+
+ if (folio_is_zone_device(folio) &&
+ (!folio_is_device_private(folio) && !folio_is_device_coherent(folio)))
+ return;
+
/*
* During exec, a temporary VMA is setup and later moved.
* The VMA is moved under the anon_vma lock but not the
@@ -1760,47 +2162,201 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
* locking requirements of exec(), migration skips
* temporary VMAs until after exec() completes.
*/
- if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))
- && !PageKsm(page) && PageAnon(page))
+ if (!folio_test_ksm(folio) && folio_test_anon(folio))
rwc.invalid_vma = invalid_migration_vma;
if (flags & TTU_RMAP_LOCKED)
- rmap_walk_locked(page, &rwc);
+ rmap_walk_locked(folio, &rwc);
else
- rmap_walk(page, &rwc);
-
- return !page_mapcount(page) ? true : false;
+ rmap_walk(folio, &rwc);
}
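
A hedged sketch (not part of this patch; helper name invented) of the calling convention for try_to_migrate(): the caller freezes the mappings, does its work while the folio is unmapped, and then removes the migration entries again with remove_migration_ptes(), roughly as the migration core does:

/* Hedged sketch: 'folio' must be locked and hold an extra reference. */
static void example_freeze_and_restore(struct folio *folio)
{
	try_to_migrate(folio, 0);	/* install migration entries */

	if (!folio_mapped(folio)) {
		/* ... copy the data to a destination folio here ... */
	}

	/* Restore the mappings; threads that faulted on the migration
	 * entries in the meantime can now make progress. */
	remove_migration_ptes(folio, folio, false);
}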
-static int page_not_mapped(struct page *page)
-{
- return !page_mapped(page);
+#ifdef CONFIG_DEVICE_PRIVATE
+struct make_exclusive_args {
+ struct mm_struct *mm;
+ unsigned long address;
+ void *owner;
+ bool valid;
};
+static bool page_make_device_exclusive_one(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long address, void *priv)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+ struct make_exclusive_args *args = priv;
+ pte_t pteval;
+ struct page *subpage;
+ bool ret = true;
+ struct mmu_notifier_range range;
+ swp_entry_t entry;
+ pte_t swp_pte;
+ pte_t ptent;
+
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
+ vma->vm_mm, address, min(vma->vm_end,
+ address + folio_size(folio)),
+ args->owner);
+ mmu_notifier_invalidate_range_start(&range);
+
+ while (page_vma_mapped_walk(&pvmw)) {
+ /* Unexpected PMD-mapped THP? */
+ VM_BUG_ON_FOLIO(!pvmw.pte, folio);
+
+ ptent = ptep_get(pvmw.pte);
+ if (!pte_present(ptent)) {
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+
+ subpage = folio_page(folio,
+ pte_pfn(ptent) - folio_pfn(folio));
+ address = pvmw.address;
+
+ /* Nuke the page table entry. */
+ flush_cache_page(vma, address, pte_pfn(ptent));
+ pteval = ptep_clear_flush(vma, address, pvmw.pte);
+
+ /* Set the dirty flag on the folio now the pte is gone. */
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+
+ /*
+ * Check that our target page is still mapped at the expected
+ * address.
+ */
+ if (args->mm == mm && args->address == address &&
+ pte_write(pteval))
+ args->valid = true;
+
+ /*
+		 * Store the pfn of the page in a special device-exclusive
+		 * swap pte. do_swap_page() will restore the original mapping
+		 * on the next CPU fault to this address.
+ */
+ if (pte_write(pteval))
+ entry = make_writable_device_exclusive_entry(
+ page_to_pfn(subpage));
+ else
+ entry = make_readable_device_exclusive_entry(
+ page_to_pfn(subpage));
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+
+ set_pte_at(mm, address, pvmw.pte, swp_pte);
+
+ /*
+		 * There is a reference on the page for the swap entry which has
+		 * been removed, so we shouldn't take another.
+ */
+ page_remove_rmap(subpage, vma, false);
+ }
+
+ mmu_notifier_invalidate_range_end(&range);
+
+ return ret;
+}
+
/**
- * try_to_munlock - try to munlock a page
- * @page: the page to be munlocked
+ * folio_make_device_exclusive - Mark the folio exclusively owned by a device.
+ * @folio: The folio to replace page table entries for.
+ * @mm: The mm_struct where the folio is expected to be mapped.
+ * @address: Address where the folio is expected to be mapped.
+ * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
*
- * Called from munlock code. Checks all of the VMAs mapping the page
- * to make sure nobody else has this page mlocked. The page will be
- * returned with PG_mlocked cleared if no other vmas have it mlocked.
+ * Tries to remove all the page table entries which are mapping this
+ * folio and replace them with special device exclusive swap entries to
+ * grant a device exclusive access to the folio.
+ *
+ * Context: Caller must hold the folio lock.
+ * Return: false if the page is still mapped, or if it could not be unmapped
+ * from the expected address. Otherwise returns true (success).
*/
-
-void try_to_munlock(struct page *page)
+static bool folio_make_device_exclusive(struct folio *folio,
+ struct mm_struct *mm, unsigned long address, void *owner)
{
+ struct make_exclusive_args args = {
+ .mm = mm,
+ .address = address,
+ .owner = owner,
+ .valid = false,
+ };
struct rmap_walk_control rwc = {
- .rmap_one = try_to_unmap_one,
- .arg = (void *)TTU_MUNLOCK,
- .done = page_not_mapped,
- .anon_lock = page_lock_anon_vma_read,
-
+ .rmap_one = page_make_device_exclusive_one,
+ .done = folio_not_mapped,
+ .anon_lock = folio_lock_anon_vma_read,
+ .arg = &args,
};
- VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
- VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
+ /*
+ * Restrict to anonymous folios for now to avoid potential writeback
+ * issues.
+ */
+ if (!folio_test_anon(folio))
+ return false;
+
+ rmap_walk(folio, &rwc);
+
+ return args.valid && !folio_mapcount(folio);
+}
+
+/**
+ * make_device_exclusive_range() - Mark a range for exclusive use by a device
+ * @mm: mm_struct of associated target process
+ * @start: start of the region to mark for exclusive device access
+ * @end: end address of region
+ * @pages: returns the pages which were successfully marked for exclusive access
+ * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
+ *
+ * Returns: number of pages found in the range by GUP. A page is marked for
+ * exclusive access only if the page pointer is non-NULL.
+ *
+ * This function finds ptes mapping page(s) to the given address range, locks
+ * them and replaces mappings with special swap entries preventing userspace CPU
+ * access. On fault these entries are replaced with the original mapping after
+ * calling MMU notifiers.
+ *
+ * A driver using this to program access from a device must use an mmu
+ * notifier critical section to hold a device-specific lock during programming.
+ * Once programming is complete it should drop the page lock and reference,
+ * after which point CPU access to the page will revoke the exclusive access.
+ */
+int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct page **pages,
+ void *owner)
+{
+ long npages = (end - start) >> PAGE_SHIFT;
+ long i;
+
+ npages = get_user_pages_remote(mm, start, npages,
+ FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
+ pages, NULL);
+ if (npages < 0)
+ return npages;
+
+ for (i = 0; i < npages; i++, start += PAGE_SIZE) {
+ struct folio *folio = page_folio(pages[i]);
+ if (PageTail(pages[i]) || !folio_trylock(folio)) {
+ folio_put(folio);
+ pages[i] = NULL;
+ continue;
+ }
+
+ if (!folio_make_device_exclusive(folio, mm, start, owner)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ pages[i] = NULL;
+ }
+ }
- rmap_walk(page, &rwc);
+ return npages;
}
+EXPORT_SYMBOL_GPL(make_device_exclusive_range);
+#endif
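
A hedged sketch of how a driver might consume the export above (hypothetical code; my_dev_program_access() and the error handling are invented, loosely following the pattern of the HMM test driver):

/* Hedged sketch: mark one page for exclusive device access. */
static int example_make_exclusive(struct mm_struct *mm, unsigned long addr,
				  void *driver_owner)
{
	struct page *page = NULL;
	long ret;

	mmap_read_lock(mm);
	ret = make_device_exclusive_range(mm, addr, addr + PAGE_SIZE,
					  &page, driver_owner);
	mmap_read_unlock(mm);
	if (ret < 0)
		return ret;
	if (!page)
		return -EBUSY;		/* the pte could not be replaced */

	/* my_dev_program_access(page);	-- driver-specific, hypothetical */

	/* Drop the lock and reference; a later CPU fault on this address
	 * revokes the exclusive access. */
	unlock_page(page);
	put_page(page);
	return 0;
}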
void __put_anon_vma(struct anon_vma *anon_vma)
{
@@ -1811,25 +2367,35 @@ void __put_anon_vma(struct anon_vma *anon_vma)
anon_vma_free(root);
}
-static struct anon_vma *rmap_walk_anon_lock(struct page *page,
- struct rmap_walk_control *rwc)
+static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
+ struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma;
if (rwc->anon_lock)
- return rwc->anon_lock(page);
+ return rwc->anon_lock(folio, rwc);
/*
- * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
+ * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read()
* because that depends on page_mapped(); but not all its usages
* are holding mmap_lock. Users without mmap_lock are required to
* take a reference count to prevent the anon_vma disappearing
*/
- anon_vma = page_anon_vma(page);
+ anon_vma = folio_anon_vma(folio);
if (!anon_vma)
return NULL;
+ if (anon_vma_trylock_read(anon_vma))
+ goto out;
+
+ if (rwc->try_lock) {
+ anon_vma = NULL;
+ rwc->contended = true;
+ goto out;
+ }
+
anon_vma_lock_read(anon_vma);
+out:
return anon_vma;
}
@@ -1841,44 +2407,40 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
*
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the anon_vma struct it points to.
- *
- * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
- * where the page was found will be held for write. So, we won't recheck
- * vm_flags for that VMA. That should be OK, because that vma shouldn't be
- * LOCKED.
*/
-static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
- bool locked)
+static void rmap_walk_anon(struct folio *folio,
+ struct rmap_walk_control *rwc, bool locked)
{
struct anon_vma *anon_vma;
pgoff_t pgoff_start, pgoff_end;
struct anon_vma_chain *avc;
if (locked) {
- anon_vma = page_anon_vma(page);
+ anon_vma = folio_anon_vma(folio);
/* anon_vma disappear under us? */
- VM_BUG_ON_PAGE(!anon_vma, page);
+ VM_BUG_ON_FOLIO(!anon_vma, folio);
} else {
- anon_vma = rmap_walk_anon_lock(page, rwc);
+ anon_vma = rmap_walk_anon_lock(folio, rwc);
}
if (!anon_vma)
return;
- pgoff_start = page_to_pgoff(page);
- pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
+ pgoff_start = folio_pgoff(folio);
+ pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
pgoff_start, pgoff_end) {
struct vm_area_struct *vma = avc->vma;
- unsigned long address = vma_address(page, vma);
+ unsigned long address = vma_address(&folio->page, vma);
+ VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
- if (!rwc->rmap_one(page, vma, address, rwc->arg))
+ if (!rwc->rmap_one(folio, vma, address, rwc->arg))
break;
- if (rwc->done && rwc->done(page))
+ if (rwc->done && rwc->done(folio))
break;
}
@@ -1893,16 +2455,11 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
*
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the address_space struct it points to.
- *
- * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
- * where the page was found will be held for write. So, we won't recheck
- * vm_flags for that VMA. That should be OK, because that vma shouldn't be
- * LOCKED.
*/
-static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
- bool locked)
+static void rmap_walk_file(struct folio *folio,
+ struct rmap_walk_control *rwc, bool locked)
{
- struct address_space *mapping = page_mapping(page);
+ struct address_space *mapping = folio_mapping(folio);
pgoff_t pgoff_start, pgoff_end;
struct vm_area_struct *vma;
@@ -1912,27 +2469,38 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
* structure at mapping cannot be freed and reused yet,
* so we can safely take mapping->i_mmap_rwsem.
*/
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (!mapping)
return;
- pgoff_start = page_to_pgoff(page);
- pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
- if (!locked)
+ pgoff_start = folio_pgoff(folio);
+ pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
+ if (!locked) {
+ if (i_mmap_trylock_read(mapping))
+ goto lookup;
+
+ if (rwc->try_lock) {
+ rwc->contended = true;
+ return;
+ }
+
i_mmap_lock_read(mapping);
+ }
+lookup:
vma_interval_tree_foreach(vma, &mapping->i_mmap,
pgoff_start, pgoff_end) {
- unsigned long address = vma_address(page, vma);
+ unsigned long address = vma_address(&folio->page, vma);
+ VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
- if (!rwc->rmap_one(page, vma, address, rwc->arg))
+ if (!rwc->rmap_one(folio, vma, address, rwc->arg))
goto done;
- if (rwc->done && rwc->done(page))
+ if (rwc->done && rwc->done(folio))
goto done;
}
@@ -1941,25 +2509,25 @@ done:
i_mmap_unlock_read(mapping);
}
-void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
+void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc)
{
- if (unlikely(PageKsm(page)))
- rmap_walk_ksm(page, rwc);
- else if (PageAnon(page))
- rmap_walk_anon(page, rwc, false);
+ if (unlikely(folio_test_ksm(folio)))
+ rmap_walk_ksm(folio, rwc);
+ else if (folio_test_anon(folio))
+ rmap_walk_anon(folio, rwc, false);
else
- rmap_walk_file(page, rwc, false);
+ rmap_walk_file(folio, rwc, false);
}
/* Like rmap_walk, but caller holds relevant rmap lock */
-void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
+void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc)
{
/* no ksm support for now */
- VM_BUG_ON_PAGE(PageKsm(page), page);
- if (PageAnon(page))
- rmap_walk_anon(page, rwc, true);
+ VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio);
+ if (folio_test_anon(folio))
+ rmap_walk_anon(folio, rwc, true);
else
- rmap_walk_file(page, rwc, true);
+ rmap_walk_file(folio, rwc, true);
}
#ifdef CONFIG_HUGETLB_PAGE
@@ -1967,29 +2535,34 @@ void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
* The following two functions are for anonymous (private mapped) hugepages.
* Unlike common anonymous pages, anonymous hugepages have no accounting code
* and no lru code, because we handle hugepages differently from common pages.
+ *
+ * RMAP_COMPOUND is ignored.
*/
-void hugepage_add_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address)
+void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, rmap_t flags)
{
+ struct folio *folio = page_folio(page);
struct anon_vma *anon_vma = vma->anon_vma;
int first;
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
BUG_ON(!anon_vma);
- /* address might be in next vma when migration races vma_adjust */
- first = atomic_inc_and_test(compound_mapcount_ptr(page));
+ /* address might be in next vma when migration races vma_merge */
+ first = atomic_inc_and_test(&folio->_entire_mapcount);
+ VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
+ VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
if (first)
- __page_set_anon_rmap(page, vma, address, 0);
+ __page_set_anon_rmap(folio, page, vma, address,
+ !!(flags & RMAP_EXCLUSIVE));
}
-void hugepage_add_new_anon_rmap(struct page *page,
+void hugepage_add_new_anon_rmap(struct folio *folio,
struct vm_area_struct *vma, unsigned long address)
{
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- atomic_set(compound_mapcount_ptr(page), 0);
- if (hpage_pincount_available(page))
- atomic_set(compound_pincount_ptr(page), 0);
-
- __page_set_anon_rmap(page, vma, address, 1);
+ /* increment count (starts at -1) */
+ atomic_set(&folio->_entire_mapcount, 0);
+ folio_clear_hugetlb_restore_reserve(folio);
+ __page_set_anon_rmap(folio, &folio->page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/mm/rodata_test.c b/mm/rodata_test.c
index 2613371945b7..6d783436951f 100644
--- a/mm/rodata_test.c
+++ b/mm/rodata_test.c
@@ -9,13 +9,13 @@
#include <linux/rodata_test.h>
#include <linux/uaccess.h>
+#include <linux/mm.h>
#include <asm/sections.h>
static const int rodata_test_data = 0xC3;
void rodata_test(void)
{
- unsigned long start, end;
int zero = 0;
/* test 1: read the value */
@@ -39,13 +39,11 @@ void rodata_test(void)
}
/* test 4: check if the rodata section is PAGE_SIZE aligned */
- start = (unsigned long)__start_rodata;
- end = (unsigned long)__end_rodata;
- if (start & (PAGE_SIZE - 1)) {
+ if (!PAGE_ALIGNED(__start_rodata)) {
pr_err("start of .rodata is not page size aligned\n");
return;
}
- if (end & (PAGE_SIZE - 1)) {
+ if (!PAGE_ALIGNED(__end_rodata)) {
pr_err("end of .rodata is not page size aligned\n");
return;
}
diff --git a/mm/secretmem.c b/mm/secretmem.c
new file mode 100644
index 000000000000..86442a15d12f
--- /dev/null
+++ b/mm/secretmem.c
@@ -0,0 +1,293 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corporation, 2021
+ *
+ * Author: Mike Rapoport <rppt@linux.ibm.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/swap.h>
+#include <linux/mount.h>
+#include <linux/memfd.h>
+#include <linux/bitops.h>
+#include <linux/printk.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/pseudo_fs.h>
+#include <linux/secretmem.h>
+#include <linux/set_memory.h>
+#include <linux/sched/signal.h>
+
+#include <uapi/linux/magic.h>
+
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) "secretmem: " fmt
+
+/*
+ * Define mode and flag masks to allow validation of the system call
+ * parameters.
+ */
+#define SECRETMEM_MODE_MASK (0x0)
+#define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK
+
+static bool secretmem_enable __ro_after_init = 1;
+module_param_named(enable, secretmem_enable, bool, 0400);
+MODULE_PARM_DESC(secretmem_enable,
+ "Enable secretmem and memfd_secret(2) system call");
+
+static atomic_t secretmem_users;
+
+bool secretmem_active(void)
+{
+ return !!atomic_read(&secretmem_users);
+}
+
+static vm_fault_t secretmem_fault(struct vm_fault *vmf)
+{
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ pgoff_t offset = vmf->pgoff;
+ gfp_t gfp = vmf->gfp_mask;
+ unsigned long addr;
+ struct page *page;
+ vm_fault_t ret;
+ int err;
+
+ if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
+ return vmf_error(-EINVAL);
+
+ filemap_invalidate_lock_shared(mapping);
+
+retry:
+ page = find_lock_page(mapping, offset);
+ if (!page) {
+ page = alloc_page(gfp | __GFP_ZERO);
+ if (!page) {
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
+
+ err = set_direct_map_invalid_noflush(page);
+ if (err) {
+ put_page(page);
+ ret = vmf_error(err);
+ goto out;
+ }
+
+ __SetPageUptodate(page);
+ err = add_to_page_cache_lru(page, mapping, offset, gfp);
+ if (unlikely(err)) {
+ put_page(page);
+ /*
+			 * If a split of a large page was required, it
+			 * already happened when we marked the page invalid,
+			 * which guarantees that this call won't fail.
+ */
+ set_direct_map_default_noflush(page);
+ if (err == -EEXIST)
+ goto retry;
+
+ ret = vmf_error(err);
+ goto out;
+ }
+
+ addr = (unsigned long)page_address(page);
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+ }
+
+ vmf->page = page;
+ ret = VM_FAULT_LOCKED;
+
+out:
+ filemap_invalidate_unlock_shared(mapping);
+ return ret;
+}
+
+static const struct vm_operations_struct secretmem_vm_ops = {
+ .fault = secretmem_fault,
+};
+
+static int secretmem_release(struct inode *inode, struct file *file)
+{
+ atomic_dec(&secretmem_users);
+ return 0;
+}
+
+static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ unsigned long len = vma->vm_end - vma->vm_start;
+
+ if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
+ return -EINVAL;
+
+ if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
+ return -EAGAIN;
+
+ vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP);
+ vma->vm_ops = &secretmem_vm_ops;
+
+ return 0;
+}
+
+bool vma_is_secretmem(struct vm_area_struct *vma)
+{
+ return vma->vm_ops == &secretmem_vm_ops;
+}
+
+static const struct file_operations secretmem_fops = {
+ .release = secretmem_release,
+ .mmap = secretmem_mmap,
+};
+
+static int secretmem_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src, enum migrate_mode mode)
+{
+ return -EBUSY;
+}
+
+static void secretmem_free_folio(struct folio *folio)
+{
+ set_direct_map_default_noflush(&folio->page);
+ folio_zero_segment(folio, 0, folio_size(folio));
+}
+
+const struct address_space_operations secretmem_aops = {
+ .dirty_folio = noop_dirty_folio,
+ .free_folio = secretmem_free_folio,
+ .migrate_folio = secretmem_migrate_folio,
+};
+
+static int secretmem_setattr(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct iattr *iattr)
+{
+ struct inode *inode = d_inode(dentry);
+ struct address_space *mapping = inode->i_mapping;
+ unsigned int ia_valid = iattr->ia_valid;
+ int ret;
+
+ filemap_invalidate_lock(mapping);
+
+ if ((ia_valid & ATTR_SIZE) && inode->i_size)
+ ret = -EINVAL;
+ else
+ ret = simple_setattr(idmap, dentry, iattr);
+
+ filemap_invalidate_unlock(mapping);
+
+ return ret;
+}
+
+static const struct inode_operations secretmem_iops = {
+ .setattr = secretmem_setattr,
+};
+
+static struct vfsmount *secretmem_mnt;
+
+static struct file *secretmem_file_create(unsigned long flags)
+{
+ struct file *file;
+ struct inode *inode;
+ const char *anon_name = "[secretmem]";
+ const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name));
+ int err;
+
+ inode = alloc_anon_inode(secretmem_mnt->mnt_sb);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ err = security_inode_init_security_anon(inode, &qname, NULL);
+ if (err) {
+ file = ERR_PTR(err);
+ goto err_free_inode;
+ }
+
+ file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem",
+ O_RDWR, &secretmem_fops);
+ if (IS_ERR(file))
+ goto err_free_inode;
+
+ mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+ mapping_set_unevictable(inode->i_mapping);
+
+ inode->i_op = &secretmem_iops;
+ inode->i_mapping->a_ops = &secretmem_aops;
+
+ /* pretend we are a normal file with zero size */
+ inode->i_mode |= S_IFREG;
+ inode->i_size = 0;
+
+ return file;
+
+err_free_inode:
+ iput(inode);
+ return file;
+}
+
+SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
+{
+ struct file *file;
+ int fd, err;
+
+	/* make sure local flags do not conflict with global fcntl.h */
+ BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC);
+
+ if (!secretmem_enable)
+ return -ENOSYS;
+
+ if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC))
+ return -EINVAL;
+ if (atomic_read(&secretmem_users) < 0)
+ return -ENFILE;
+
+ fd = get_unused_fd_flags(flags & O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ file = secretmem_file_create(flags);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto err_put_fd;
+ }
+
+ file->f_flags |= O_LARGEFILE;
+
+ atomic_inc(&secretmem_users);
+ fd_install(fd, file);
+ return fd;
+
+err_put_fd:
+ put_unused_fd(fd);
+ return err;
+}
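
For reference, a hedged userspace sketch of the new system call (illustrative only; SYS_memfd_secret requires headers that define the syscall number for the target architecture):

/* Hedged sketch: map one page that is dropped from the kernel direct map. */
#include <sys/syscall.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = syscall(SYS_memfd_secret, 0);	/* only O_CLOEXEC is accepted in flags */
	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;

	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	p[0] = 42;	/* faults in a page invisible through the direct map */
	munmap(p, 4096);
	close(fd);
	return 0;
}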
+
+static int secretmem_init_fs_context(struct fs_context *fc)
+{
+ return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type secretmem_fs = {
+ .name = "secretmem",
+ .init_fs_context = secretmem_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+static int __init secretmem_init(void)
+{
+ if (!secretmem_enable)
+ return 0;
+
+ secretmem_mnt = kern_mount(&secretmem_fs);
+ if (IS_ERR(secretmem_mnt))
+ return PTR_ERR(secretmem_mnt);
+
+ /* prevent secretmem mappings from ever getting PROT_EXEC */
+ secretmem_mnt->mnt_flags |= MNT_NOEXEC;
+
+ return 0;
+}
+fs_initcall(secretmem_init);
diff --git a/mm/shmem.c b/mm/shmem.c
index 6d4ddef4a24f..79a998b38ac8 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -28,18 +28,19 @@
#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
+#include <linux/fileattr.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
+#include <linux/shmem_fs.h>
#include <linux/swap.h>
#include <linux/uio.h>
-#include <linux/khugepaged.h>
#include <linux/hugetlb.h>
-#include <linux/frontswap.h>
#include <linux/fs_parser.h>
-
-#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
+#include <linux/swapfile.h>
+#include <linux/iversion.h>
+#include "swap.h"
static struct vfsmount *shm_mnt;
@@ -58,9 +59,7 @@ static struct vfsmount *shm_mnt;
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
-#include <linux/shmem_fs.h>
#include <linux/writeback.h>
-#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
@@ -77,7 +76,6 @@ static struct vfsmount *shm_mnt;
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
-#include <linux/userfaultfd_k.h>
#include <linux/rmap.h>
#include <linux/uuid.h>
@@ -96,7 +94,7 @@ static struct vfsmount *shm_mnt;
/*
* shmem_fallocate communicates with shmem_fault or shmem_writepage via
- * inode->i_private (with i_mutex making sure that it has only one user at
+ * inode->i_private (with i_rwsem making sure that it has only one user at
* a time): we would prefer not to enlarge the shmem inode just for that.
*/
struct shmem_falloc {
@@ -117,10 +115,12 @@ struct shmem_options {
bool full_inums;
int huge;
int seen;
+ bool noswap;
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
+#define SHMEM_SEEN_NOSWAP 16
};
#ifdef CONFIG_TMPFS
@@ -137,24 +137,10 @@ static unsigned long shmem_default_max_inodes(void)
}
#endif
-static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
-static int shmem_replace_page(struct page **pagep, gfp_t gfp,
- struct shmem_inode_info *info, pgoff_t index);
-static int shmem_swapin_page(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp,
+static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
+ struct folio **foliop, enum sgp_type sgp,
gfp_t gfp, struct vm_area_struct *vma,
vm_fault_t *fault_type);
-static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp,
- gfp_t gfp, struct vm_area_struct *vma,
- struct vm_fault *vmf, vm_fault_t *fault_type);
-
-int shmem_getpage(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp)
-{
- return shmem_getpage_gfp(inode, index, pagep, sgp,
- mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
-}
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
@@ -195,7 +181,7 @@ static inline int shmem_reacct_size(unsigned long flags,
/*
* ... whereas tmpfs objects are accounted incrementally as
* pages are allocated, in order to allow large sparse files.
- * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
+ * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
* so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
*/
static inline int shmem_acct_block(unsigned long flags, long pages)
@@ -246,17 +232,23 @@ static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
}
static const struct super_operations shmem_ops;
-static const struct address_space_operations shmem_aops;
+const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
+static const struct vm_operations_struct shmem_anon_vm_ops;
static struct file_system_type shmem_fs_type;
+bool vma_is_anon_shmem(struct vm_area_struct *vma)
+{
+ return vma->vm_ops == &shmem_anon_vm_ops;
+}
+
bool vma_is_shmem(struct vm_area_struct *vma)
{
- return vma->vm_ops == &shmem_vm_ops;
+ return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
}
static LIST_HEAD(shmem_swaplist);
@@ -278,10 +270,10 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
ino_t ino;
if (!(sb->s_flags & SB_KERNMOUNT)) {
- spin_lock(&sbinfo->stat_lock);
+ raw_spin_lock(&sbinfo->stat_lock);
if (sbinfo->max_inodes) {
if (!sbinfo->free_inodes) {
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
return -ENOSPC;
}
sbinfo->free_inodes--;
@@ -304,7 +296,7 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
}
*inop = ino;
}
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
} else if (inop) {
/*
* __shmem_file_setup, one of our callers, is lock-free: it
@@ -319,13 +311,14 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
* to worry about things like glibc compatibility.
*/
ino_t *next_ino;
+
next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
ino = *next_ino;
if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
- spin_lock(&sbinfo->stat_lock);
+ raw_spin_lock(&sbinfo->stat_lock);
ino = sbinfo->next_ino;
sbinfo->next_ino += SHMEM_INO_BATCH;
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
if (unlikely(is_zero_ino(ino)))
ino++;
}
@@ -341,9 +334,9 @@ static void shmem_free_inode(struct super_block *sb)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
if (sbinfo->max_inodes) {
- spin_lock(&sbinfo->stat_lock);
+ raw_spin_lock(&sbinfo->stat_lock);
sbinfo->free_inodes++;
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
}
}
@@ -397,7 +390,7 @@ void shmem_uncharge(struct inode *inode, long pages)
struct shmem_inode_info *info = SHMEM_I(inode);
unsigned long flags;
- /* nrpages adjustment done by __delete_from_page_cache() or caller */
+ /* nrpages adjustment done by __filemap_remove_folio() or caller */
spin_lock_irqsave(&info->lock, flags);
info->alloced -= pages;
@@ -474,7 +467,39 @@ static bool shmem_confirm_swap(struct address_space *mapping,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* ifdef here to avoid bloating shmem.o when not necessary */
-static int shmem_huge __read_mostly;
+static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
+
+bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
+ struct mm_struct *mm, unsigned long vm_flags)
+{
+ loff_t i_size;
+
+ if (!S_ISREG(inode->i_mode))
+ return false;
+ if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags)))
+ return false;
+ if (shmem_huge == SHMEM_HUGE_DENY)
+ return false;
+ if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
+ return true;
+
+ switch (SHMEM_SB(inode->i_sb)->huge) {
+ case SHMEM_HUGE_ALWAYS:
+ return true;
+ case SHMEM_HUGE_WITHIN_SIZE:
+ index = round_up(index + 1, HPAGE_PMD_NR);
+ i_size = round_up(i_size_read(inode), PAGE_SIZE);
+ if (i_size >> PAGE_SHIFT >= index)
+ return true;
+ fallthrough;
+ case SHMEM_HUGE_ADVISE:
+ if (mm && (vm_flags & VM_HUGEPAGE))
+ return true;
+ fallthrough;
+ default:
+ return false;
+ }
+}
#if defined(CONFIG_SYSFS)
static int shmem_parse_huge(const char *str)
@@ -525,9 +550,9 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
LIST_HEAD(to_remove);
struct inode *inode;
struct shmem_inode_info *info;
- struct page *page;
+ struct folio *folio;
unsigned long batch = sc ? sc->nr_to_scan : 128;
- int removed = 0, split = 0;
+ int split = 0;
if (list_empty(&sbinfo->shrinklist))
return SHRINK_STOP;
@@ -542,7 +567,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
/* inode is about to be evicted */
if (!inode) {
list_del_init(&info->shrinklist);
- removed++;
goto next;
}
@@ -550,12 +574,12 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
if (round_up(inode->i_size, PAGE_SIZE) ==
round_up(inode->i_size, HPAGE_PMD_SIZE)) {
list_move(&info->shrinklist, &to_remove);
- removed++;
goto next;
}
list_move(&info->shrinklist, &list);
next:
+ sbinfo->shrinklist_len--;
if (!--batch)
break;
}
@@ -570,57 +594,64 @@ next:
list_for_each_safe(pos, next, &list) {
int ret;
+ pgoff_t index;
info = list_entry(pos, struct shmem_inode_info, shrinklist);
inode = &info->vfs_inode;
if (nr_to_split && split >= nr_to_split)
- goto leave;
+ goto move_back;
- page = find_get_page(inode->i_mapping,
- (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
- if (!page)
+ index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT;
+ folio = filemap_get_folio(inode->i_mapping, index);
+ if (IS_ERR(folio))
goto drop;
/* No huge page at the end of the file: nothing to split */
- if (!PageTransHuge(page)) {
- put_page(page);
+ if (!folio_test_large(folio)) {
+ folio_put(folio);
goto drop;
}
/*
- * Leave the inode on the list if we failed to lock
- * the page at this time.
+		 * Move the inode back to the shrinklist if we failed
+		 * to lock the folio at this time.
*
* Waiting for the lock may lead to deadlock in the
* reclaim path.
*/
- if (!trylock_page(page)) {
- put_page(page);
- goto leave;
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
+ goto move_back;
}
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
+ ret = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
- /* If split failed leave the inode on the list */
+		/* If the split failed, move the inode back to the shrinklist */
if (ret)
- goto leave;
+ goto move_back;
split++;
drop:
list_del_init(&info->shrinklist);
- removed++;
-leave:
+ goto put;
+move_back:
+ /*
+ * Make sure the inode is either on the global list or deleted
+ * from any local list before iput() since it could be deleted
+ * in another thread once we put the inode (then the local list
+ * is corrupted).
+ */
+ spin_lock(&sbinfo->shrinklist_lock);
+ list_move(&info->shrinklist, &sbinfo->shrinklist);
+ sbinfo->shrinklist_len++;
+ spin_unlock(&sbinfo->shrinklist_lock);
+put:
iput(inode);
}
- spin_lock(&sbinfo->shrinklist_lock);
- list_splice_tail(&list, &sbinfo->shrinklist);
- sbinfo->shrinklist_len -= removed;
- spin_unlock(&sbinfo->shrinklist_lock);
-
return split;
}
@@ -645,6 +676,12 @@ static long shmem_unused_huge_count(struct super_block *sb,
#define shmem_huge SHMEM_HUGE_DENY
+bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
+ struct mm_struct *mm, unsigned long vm_flags)
+{
+ return false;
+}
+
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
struct shrink_control *sc, unsigned long nr_to_split)
{
@@ -652,72 +689,59 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
-{
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
- shmem_huge != SHMEM_HUGE_DENY)
- return true;
- return false;
-}
-
/*
- * Like add_to_page_cache_locked, but error if expected item has gone.
+ * Like filemap_add_folio, but error if expected item has gone.
*/
-static int shmem_add_to_page_cache(struct page *page,
+static int shmem_add_to_page_cache(struct folio *folio,
struct address_space *mapping,
pgoff_t index, void *expected, gfp_t gfp,
struct mm_struct *charge_mm)
{
- XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
- unsigned long i = 0;
- unsigned long nr = compound_nr(page);
+ XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
+ long nr = folio_nr_pages(folio);
int error;
- VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(index != round_down(index, nr), page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- VM_BUG_ON(expected && PageTransHuge(page));
+ VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
+ VM_BUG_ON(expected && folio_test_large(folio));
- page_ref_add(page, nr);
- page->mapping = mapping;
- page->index = index;
+ folio_ref_add(folio, nr);
+ folio->mapping = mapping;
+ folio->index = index;
- if (!PageSwapCache(page)) {
- error = mem_cgroup_charge(page, charge_mm, gfp);
+ if (!folio_test_swapcache(folio)) {
+ error = mem_cgroup_charge(folio, charge_mm, gfp);
if (error) {
- if (PageTransHuge(page)) {
+ if (folio_test_pmd_mappable(folio)) {
count_vm_event(THP_FILE_FALLBACK);
count_vm_event(THP_FILE_FALLBACK_CHARGE);
}
goto error;
}
}
- cgroup_throttle_swaprate(page, gfp);
+ folio_throttle_swaprate(folio, gfp);
do {
- void *entry;
xas_lock_irq(&xas);
- entry = xas_find_conflict(&xas);
- if (entry != expected)
+ if (expected != xas_find_conflict(&xas)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ if (expected && xas_find_conflict(&xas)) {
xas_set_err(&xas, -EEXIST);
- xas_create_range(&xas);
- if (xas_error(&xas))
goto unlock;
-next:
- xas_store(&xas, page);
- if (++i < nr) {
- xas_next(&xas);
- goto next;
}
- if (PageTransHuge(page)) {
+ xas_store(&xas, folio);
+ if (xas_error(&xas))
+ goto unlock;
+ if (folio_test_pmd_mappable(folio)) {
count_vm_event(THP_FILE_ALLOC);
- __inc_node_page_state(page, NR_SHMEM_THPS);
+ __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
}
mapping->nrpages += nr;
- __mod_lruvec_page_state(page, NR_FILE_PAGES, nr);
- __mod_lruvec_page_state(page, NR_SHMEM, nr);
+ __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
+ __lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));
@@ -729,29 +753,28 @@ unlock:
return 0;
error:
- page->mapping = NULL;
- page_ref_sub(page, nr);
+ folio->mapping = NULL;
+ folio_ref_sub(folio, nr);
return error;
}
/*
- * Like delete_from_page_cache, but substitutes swap for page.
+ * Like delete_from_page_cache, but substitutes swap for @folio.
*/
-static void shmem_delete_from_page_cache(struct page *page, void *radswap)
+static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
+ long nr = folio_nr_pages(folio);
int error;
- VM_BUG_ON_PAGE(PageCompound(page), page);
-
xa_lock_irq(&mapping->i_pages);
- error = shmem_replace_entry(mapping, page->index, page, radswap);
- page->mapping = NULL;
- mapping->nrpages--;
- __dec_lruvec_page_state(page, NR_FILE_PAGES);
- __dec_lruvec_page_state(page, NR_SHMEM);
+ error = shmem_replace_entry(mapping, folio->index, folio, radswap);
+ folio->mapping = NULL;
+ mapping->nrpages -= nr;
+ __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
+ __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
xa_unlock_irq(&mapping->i_pages);
- put_page(page);
+ folio_put(folio);
BUG_ON(error);
}
@@ -774,7 +797,7 @@ static int shmem_free_swap(struct address_space *mapping,
* Determine (in bytes) how many of the shmem object's pages mapped by the
* given offsets are swapped out.
*
- * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
+ * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
* as long as the inode doesn't go away and racy results are not a problem.
*/
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
@@ -783,14 +806,16 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
XA_STATE(xas, &mapping->i_pages, start);
struct page *page;
unsigned long swapped = 0;
+ unsigned long max = end - 1;
rcu_read_lock();
- xas_for_each(&xas, page, end - 1) {
+ xas_for_each(&xas, page, max) {
if (xas_retry(&xas, page))
continue;
if (xa_is_value(page))
swapped++;
-
+ if (xas.xa_index == max)
+ break;
if (need_resched()) {
xas_pause(&xas);
cond_resched_rcu();
@@ -806,7 +831,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
* Determine (in bytes) how many of the shmem object's pages mapped by the
* given vma is swapped out.
*
- * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
+ * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
* as long as the inode doesn't go away and racy results are not a problem.
*/
unsigned long shmem_swap_usage(struct vm_area_struct *vma)
@@ -831,9 +856,8 @@ unsigned long shmem_swap_usage(struct vm_area_struct *vma)
return swapped << PAGE_SHIFT;
/* Here comes the more involved part */
- return shmem_partial_swap_usage(mapping,
- linear_page_index(vma, vma->vm_start),
- linear_page_index(vma, vma->vm_end));
+ return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
+ vma->vm_pgoff + vma_pages(vma));
}
/*
@@ -841,55 +865,47 @@ unsigned long shmem_swap_usage(struct vm_area_struct *vma)
*/
void shmem_unlock_mapping(struct address_space *mapping)
{
- struct pagevec pvec;
- pgoff_t indices[PAGEVEC_SIZE];
+ struct folio_batch fbatch;
pgoff_t index = 0;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
/*
* Minor point, but we might as well stop if someone else SHM_LOCKs it.
*/
- while (!mapping_unevictable(mapping)) {
- /*
- * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
- * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
- */
- pvec.nr = find_get_entries(mapping, index,
- PAGEVEC_SIZE, pvec.pages, indices);
- if (!pvec.nr)
- break;
- index = indices[pvec.nr - 1] + 1;
- pagevec_remove_exceptionals(&pvec);
- check_move_unevictable_pages(&pvec);
- pagevec_release(&pvec);
+ while (!mapping_unevictable(mapping) &&
+ filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
+ check_move_unevictable_folios(&fbatch);
+ folio_batch_release(&fbatch);
cond_resched();
}
}
-/*
- * Check whether a hole-punch or truncation needs to split a huge page,
- * returning true if no split was required, or the split has been successful.
- *
- * Eviction (or truncation to 0 size) should never need to split a huge page;
- * but in rare cases might do so, if shmem_undo_range() failed to trylock on
- * head, and then succeeded to trylock on tail.
- *
- * A split can only succeed when there are no additional references on the
- * huge page: so the split below relies upon find_get_entries() having stopped
- * when it found a subpage of the huge page, without getting further references.
- */
-static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end)
+static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
{
- if (!PageTransCompound(page))
- return true;
+ struct folio *folio;
- /* Just proceed to delete a huge page wholly within the range punched */
- if (PageHead(page) &&
- page->index >= start && page->index + HPAGE_PMD_NR <= end)
- return true;
-
- /* Try to split huge page, so we can truly punch the hole or truncate */
- return split_huge_page(page) >= 0;
+ /*
+ * At first avoid shmem_get_folio(,,,SGP_READ): that fails
+ * beyond i_size, and reports fallocated folios as holes.
+ */
+ folio = filemap_get_entry(inode->i_mapping, index);
+ if (!folio)
+ return folio;
+ if (!xa_is_value(folio)) {
+ folio_lock(folio);
+ if (folio->mapping == inode->i_mapping)
+ return folio;
+ /* The folio has been swapped out */
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+ /*
+ * But read a folio back from swap if any of it is within i_size
+ * (although in some cases this is just a waste of time).
+ */
+ folio = NULL;
+ shmem_get_folio(inode, index, &folio, SGP_READ);
+ return folio;
}
/*
@@ -903,10 +919,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
pgoff_t end = (lend + 1) >> PAGE_SHIFT;
- unsigned int partial_start = lstart & (PAGE_SIZE - 1);
- unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t indices[PAGEVEC_SIZE];
+ struct folio *folio;
+ bool same_folio;
long nr_swaps_freed = 0;
pgoff_t index;
int i;
@@ -914,84 +930,75 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (lend == -1)
end = -1; /* unsigned, so actually very big */
- pagevec_init(&pvec);
- index = start;
- while (index < end) {
- pvec.nr = find_get_entries(mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
- pvec.pages, indices);
- if (!pvec.nr)
- break;
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
+ if (info->fallocend > start && info->fallocend <= end && !unfalloc)
+ info->fallocend = start;
- index = indices[i];
- if (index >= end)
- break;
+ folio_batch_init(&fbatch);
+ index = start;
+ while (index < end && find_lock_entries(mapping, &index, end - 1,
+ &fbatch, indices)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ folio = fbatch.folios[i];
- if (xa_is_value(page)) {
+ if (xa_is_value(folio)) {
if (unfalloc)
continue;
nr_swaps_freed += !shmem_free_swap(mapping,
- index, page);
+ indices[i], folio);
continue;
}
- VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page);
-
- if (!trylock_page(page))
- continue;
-
- if ((!unfalloc || !PageUptodate(page)) &&
- page_mapping(page) == mapping) {
- VM_BUG_ON_PAGE(PageWriteback(page), page);
- if (shmem_punch_compound(page, start, end))
- truncate_inode_page(mapping, page);
- }
- unlock_page(page);
+ if (!unfalloc || !folio_test_uptodate(folio))
+ truncate_inode_folio(mapping, folio);
+ folio_unlock(folio);
}
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
+ folio_batch_remove_exceptionals(&fbatch);
+ folio_batch_release(&fbatch);
cond_resched();
- index++;
}
- if (partial_start) {
- struct page *page = NULL;
- shmem_getpage(inode, start - 1, &page, SGP_READ);
- if (page) {
- unsigned int top = PAGE_SIZE;
- if (start > end) {
- top = partial_end;
- partial_end = 0;
- }
- zero_user_segment(page, partial_start, top);
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ /*
+ * When undoing a failed fallocate, we want none of the partial folio
+ * zeroing and splitting below, but shall want to truncate the whole
+ * folio when !uptodate indicates that it was added by this fallocate,
+ * even when [lstart, lend] covers only a part of the folio.
+ */
+ if (unfalloc)
+ goto whole_folios;
+
+ same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
+ folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
+ if (folio) {
+ same_folio = lend < folio_pos(folio) + folio_size(folio);
+ folio_mark_dirty(folio);
+ if (!truncate_inode_partial_folio(folio, lstart, lend)) {
+ start = folio->index + folio_nr_pages(folio);
+ if (same_folio)
+ end = folio->index;
}
+ folio_unlock(folio);
+ folio_put(folio);
+ folio = NULL;
}
- if (partial_end) {
- struct page *page = NULL;
- shmem_getpage(inode, end, &page, SGP_READ);
- if (page) {
- zero_user_segment(page, 0, partial_end);
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
- }
+
+ if (!same_folio)
+ folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
+ if (folio) {
+ folio_mark_dirty(folio);
+ if (!truncate_inode_partial_folio(folio, lstart, lend))
+ end = folio->index;
+ folio_unlock(folio);
+ folio_put(folio);
}
- if (start >= end)
- return;
+
+whole_folios:
index = start;
while (index < end) {
cond_resched();
- pvec.nr = find_get_entries(mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
- pvec.pages, indices);
- if (!pvec.nr) {
+ if (!find_get_entries(mapping, &index, end - 1, &fbatch,
+ indices)) {
/* If all gone or hole-punch or unfalloc, we're done */
if (index == start || end != -1)
break;
@@ -999,52 +1006,38 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
index = start;
continue;
}
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
-
- index = indices[i];
- if (index >= end)
- break;
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ folio = fbatch.folios[i];
- if (xa_is_value(page)) {
+ if (xa_is_value(folio)) {
if (unfalloc)
continue;
- if (shmem_free_swap(mapping, index, page)) {
+ if (shmem_free_swap(mapping, indices[i], folio)) {
/* Swap was replaced by page: retry */
- index--;
+ index = indices[i];
break;
}
nr_swaps_freed++;
continue;
}
- lock_page(page);
+ folio_lock(folio);
- if (!unfalloc || !PageUptodate(page)) {
- if (page_mapping(page) != mapping) {
+ if (!unfalloc || !folio_test_uptodate(folio)) {
+ if (folio_mapping(folio) != mapping) {
/* Page was replaced by swap: retry */
- unlock_page(page);
- index--;
+ folio_unlock(folio);
+ index = indices[i];
break;
}
- VM_BUG_ON_PAGE(PageWriteback(page), page);
- if (shmem_punch_compound(page, start, end))
- truncate_inode_page(mapping, page);
- else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
- /* Wipe the page and don't get stuck */
- clear_highpage(page);
- flush_dcache_page(page);
- set_page_dirty(page);
- if (index <
- round_up(start, HPAGE_PMD_NR))
- start = index + 1;
- }
+ VM_BUG_ON_FOLIO(folio_test_writeback(folio),
+ folio);
+ truncate_inode_folio(mapping, folio);
}
- unlock_page(page);
+ folio_unlock(folio);
}
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
- index++;
+ folio_batch_remove_exceptionals(&fbatch);
+ folio_batch_release(&fbatch);
}
spin_lock_irq(&info->lock);
@@ -1057,45 +1050,69 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
shmem_undo_range(inode, lstart, lend, false);
inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode_inc_iversion(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
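
Editor's note: shmem_truncate_range() is also what backs FALLOC_FL_PUNCH_HOLE on tmpfs, so the partial-folio zeroing above is directly user-visible. An illustrative userspace check (the /dev/shm path and sizes are arbitrary, error handling trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int fd = open("/dev/shm/punch-demo", O_CREAT | O_RDWR | O_TRUNC, 0600);

	if (fd < 0)
		return 1;
	memset(buf, 'x', sizeof(buf));
	for (int i = 0; i < 3; i++)		/* three data pages */
		write(fd, buf, sizeof(buf));

	/* Punch out the middle page; KEEP_SIZE leaves i_size untouched. */
	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 4096, 4096);

	pread(fd, buf, sizeof(buf), 4096);
	printf("middle page now %s\n", buf[0] ? "data" : "zeroed");
	unlink("/dev/shm/punch-demo");
	return 0;
}
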
-static int shmem_getattr(const struct path *path, struct kstat *stat,
+static int shmem_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = path->dentry->d_inode;
struct shmem_inode_info *info = SHMEM_I(inode);
- struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb);
if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
spin_lock_irq(&info->lock);
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
}
- generic_fillattr(inode, stat);
-
- if (is_huge_enabled(sb_info))
+ if (info->fsflags & FS_APPEND_FL)
+ stat->attributes |= STATX_ATTR_APPEND;
+ if (info->fsflags & FS_IMMUTABLE_FL)
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ if (info->fsflags & FS_NODUMP_FL)
+ stat->attributes |= STATX_ATTR_NODUMP;
+ stat->attributes_mask |= (STATX_ATTR_APPEND |
+ STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_NODUMP);
+ generic_fillattr(idmap, inode, stat);
+
+ if (shmem_is_huge(inode, 0, false, NULL, 0))
stat->blksize = HPAGE_PMD_SIZE;
+ if (request_mask & STATX_BTIME) {
+ stat->result_mask |= STATX_BTIME;
+ stat->btime.tv_sec = info->i_crtime.tv_sec;
+ stat->btime.tv_nsec = info->i_crtime.tv_nsec;
+ }
+
return 0;
}
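
Editor's note: with the additions above, tmpfs reports the inode creation time and the append/immutable/nodump attributes through statx(). A minimal query (illustrative; assumes the glibc statx() wrapper, available since 2.28):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc < 2 || statx(AT_FDCWD, argv[1], 0,
			      STATX_BASIC_STATS | STATX_BTIME, &stx) != 0)
		return 1;

	if (stx.stx_mask & STATX_BTIME)
		printf("btime: %lld\n", (long long)stx.stx_btime.tv_sec);
	printf("append: %d immutable: %d nodump: %d\n",
	       !!(stx.stx_attributes & STATX_ATTR_APPEND),
	       !!(stx.stx_attributes & STATX_ATTR_IMMUTABLE),
	       !!(stx.stx_attributes & STATX_ATTR_NODUMP));
	return 0;
}
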
-static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
+static int shmem_setattr(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
struct shmem_inode_info *info = SHMEM_I(inode);
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
int error;
+ bool update_mtime = false;
+ bool update_ctime = true;
- error = setattr_prepare(dentry, attr);
+ error = setattr_prepare(idmap, dentry, attr);
if (error)
return error;
+ if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) {
+ if ((inode->i_mode ^ attr->ia_mode) & 0111) {
+ return -EPERM;
+ }
+ }
+
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
loff_t oldsize = inode->i_size;
loff_t newsize = attr->ia_size;
- /* protected by i_mutex */
+ /* protected by i_rwsem */
if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
(newsize > oldsize && (info->seals & F_SEAL_GROW)))
return -EPERM;
@@ -1106,7 +1123,9 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
i_size_write(inode, newsize);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ update_mtime = true;
+ } else {
+ update_ctime = false;
}
if (newsize <= oldsize) {
loff_t holebegin = round_up(newsize, PAGE_SIZE);
@@ -1120,30 +1139,18 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
if (oldsize > holebegin)
unmap_mapping_range(inode->i_mapping,
holebegin, 0, 1);
-
- /*
- * Part of the huge page can be beyond i_size: subject
- * to shrink under memory pressure.
- */
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
- spin_lock(&sbinfo->shrinklist_lock);
- /*
- * _careful to defend against unlocked access to
- * ->shrink_list in shmem_unused_huge_shrink()
- */
- if (list_empty_careful(&info->shrinklist)) {
- list_add_tail(&info->shrinklist,
- &sbinfo->shrinklist);
- sbinfo->shrinklist_len++;
- }
- spin_unlock(&sbinfo->shrinklist_lock);
- }
}
}
- setattr_copy(inode, attr);
+ setattr_copy(idmap, inode, attr);
if (attr->ia_valid & ATTR_MODE)
- error = posix_acl_chmod(inode, inode->i_mode);
+ error = posix_acl_chmod(idmap, dentry, inode->i_mode);
+ if (!error && update_ctime) {
+ inode->i_ctime = current_time(inode);
+ if (update_mtime)
+ inode->i_mtime = inode->i_ctime;
+ inode_inc_iversion(inode);
+ }
return error;
}
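
Editor's note: the F_SEAL_EXEC check added above means a sealed memfd refuses any chmod that toggles an execute bit, while other mode changes still go through. A hedged userspace sketch (F_SEAL_EXEC needs a kernel and headers that carry this support, hence the fallback define):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>

#ifndef F_SEAL_EXEC
#define F_SEAL_EXEC	0x0020	/* value from include/uapi/linux/fcntl.h */
#endif

int main(void)
{
	struct stat st;
	mode_t perm;
	int fd = memfd_create("exec-seal-demo", MFD_ALLOW_SEALING);

	if (fd < 0 || fstat(fd, &st) != 0)
		return 1;
	perm = st.st_mode & 07777;

	if (fcntl(fd, F_ADD_SEALS, F_SEAL_EXEC) != 0)
		perror("F_ADD_SEALS");

	/* Toggling an execute bit is now rejected with EPERM ... */
	if (fchmod(fd, perm ^ 0100) != 0)
		perror("fchmod (exec bit)");
	/* ... while a change that leaves the 0111 bits alone still works. */
	if (fchmod(fd, perm ^ 0200) == 0)
		printf("non-exec mode change ok\n");
	return 0;
}
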
@@ -1152,9 +1159,10 @@ static void shmem_evict_inode(struct inode *inode)
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- if (inode->i_mapping->a_ops == &shmem_aops) {
+ if (shmem_mapping(inode->i_mapping)) {
shmem_unacct_size(info->flags, inode->i_size);
inode->i_size = 0;
+ mapping_set_exiting(inode->i_mapping);
shmem_truncate_range(inode, 0, (loff_t)-1);
if (!list_empty(&info->shrinklist)) {
spin_lock(&sbinfo->shrinklist_lock);
@@ -1182,75 +1190,68 @@ static void shmem_evict_inode(struct inode *inode)
clear_inode(inode);
}
-extern struct swap_info_struct *swap_info[];
-
static int shmem_find_swap_entries(struct address_space *mapping,
- pgoff_t start, unsigned int nr_entries,
- struct page **entries, pgoff_t *indices,
- unsigned int type, bool frontswap)
+ pgoff_t start, struct folio_batch *fbatch,
+ pgoff_t *indices, unsigned int type)
{
XA_STATE(xas, &mapping->i_pages, start);
- struct page *page;
+ struct folio *folio;
swp_entry_t entry;
- unsigned int ret = 0;
-
- if (!nr_entries)
- return 0;
rcu_read_lock();
- xas_for_each(&xas, page, ULONG_MAX) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, folio, ULONG_MAX) {
+ if (xas_retry(&xas, folio))
continue;
- if (!xa_is_value(page))
+ if (!xa_is_value(folio))
continue;
- entry = radix_to_swp_entry(page);
+ entry = radix_to_swp_entry(folio);
+ /*
+ * swapin error entries can be found in the mapping. But they're
+ * deliberately ignored here as we've done everything we can do.
+ */
if (swp_type(entry) != type)
continue;
- if (frontswap &&
- !frontswap_test(swap_info[type], swp_offset(entry)))
- continue;
- indices[ret] = xas.xa_index;
- entries[ret] = page;
+ indices[folio_batch_count(fbatch)] = xas.xa_index;
+ if (!folio_batch_add(fbatch, folio))
+ break;
if (need_resched()) {
xas_pause(&xas);
cond_resched_rcu();
}
- if (++ret == nr_entries)
- break;
}
rcu_read_unlock();
- return ret;
+ return xas.xa_index;
}
/*
* Move the swapped pages for an inode to page cache. Returns the count
* of pages swapped in, or the error in case of failure.
*/
-static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
- pgoff_t *indices)
+static int shmem_unuse_swap_entries(struct inode *inode,
+ struct folio_batch *fbatch, pgoff_t *indices)
{
int i = 0;
int ret = 0;
int error = 0;
struct address_space *mapping = inode->i_mapping;
- for (i = 0; i < pvec.nr; i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < folio_batch_count(fbatch); i++) {
+ struct folio *folio = fbatch->folios[i];
- if (!xa_is_value(page))
+ if (!xa_is_value(folio))
continue;
- error = shmem_swapin_page(inode, indices[i],
- &page, SGP_CACHE,
+ error = shmem_swapin_folio(inode, indices[i],
+ &folio, SGP_CACHE,
mapping_gfp_mask(mapping),
NULL, NULL);
if (error == 0) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
ret++;
}
if (error == -ENOMEM)
@@ -1263,44 +1264,27 @@ static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
/*
* If swap found in inode, free it and move page from swapcache to filecache.
*/
-static int shmem_unuse_inode(struct inode *inode, unsigned int type,
- bool frontswap, unsigned long *fs_pages_to_unuse)
+static int shmem_unuse_inode(struct inode *inode, unsigned int type)
{
struct address_space *mapping = inode->i_mapping;
pgoff_t start = 0;
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t indices[PAGEVEC_SIZE];
- bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
int ret = 0;
- pagevec_init(&pvec);
do {
- unsigned int nr_entries = PAGEVEC_SIZE;
-
- if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
- nr_entries = *fs_pages_to_unuse;
-
- pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
- pvec.pages, indices,
- type, frontswap);
- if (pvec.nr == 0) {
+ folio_batch_init(&fbatch);
+ shmem_find_swap_entries(mapping, start, &fbatch, indices, type);
+ if (folio_batch_count(&fbatch) == 0) {
ret = 0;
break;
}
- ret = shmem_unuse_swap_entries(inode, pvec, indices);
+ ret = shmem_unuse_swap_entries(inode, &fbatch, indices);
if (ret < 0)
break;
- if (frontswap_partial) {
- *fs_pages_to_unuse -= ret;
- if (*fs_pages_to_unuse == 0) {
- ret = FRONTSWAP_PAGES_UNUSED;
- break;
- }
- }
-
- start = indices[pvec.nr - 1];
+ start = indices[folio_batch_count(&fbatch) - 1];
} while (true);
return ret;
@@ -1311,8 +1295,7 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type,
* device 'type' back into memory, so the swap device can be
* unused.
*/
-int shmem_unuse(unsigned int type, bool frontswap,
- unsigned long *fs_pages_to_unuse)
+int shmem_unuse(unsigned int type)
{
struct shmem_inode_info *info, *next;
int error = 0;
@@ -1335,8 +1318,7 @@ int shmem_unuse(unsigned int type, bool frontswap,
atomic_inc(&info->stop_eviction);
mutex_unlock(&shmem_swaplist_mutex);
- error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
- fs_pages_to_unuse);
+ error = shmem_unuse_inode(&info->vfs_inode, type);
cond_resched();
mutex_lock(&shmem_swaplist_mutex);
@@ -1358,23 +1340,14 @@ int shmem_unuse(unsigned int type, bool frontswap,
*/
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
- struct shmem_inode_info *info;
- struct address_space *mapping;
- struct inode *inode;
+ struct folio *folio = page_folio(page);
+ struct address_space *mapping = folio->mapping;
+ struct inode *inode = mapping->host;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
swp_entry_t swap;
pgoff_t index;
- VM_BUG_ON_PAGE(PageCompound(page), page);
- BUG_ON(!PageLocked(page));
- mapping = page->mapping;
- index = page->index;
- inode = mapping->host;
- info = SHMEM_I(inode);
- if (info->flags & VM_LOCKED)
- goto redirty;
- if (!total_swap_pages)
- goto redirty;
-
/*
* Our capabilities prevent regular writeback or sync from ever calling
* shmem_writepage; but a stacking filesystem might use ->writepage of
@@ -1382,23 +1355,43 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
* swap only in response to memory pressure, and not for the writeback
* threads or sync.
*/
- if (!wbc->for_reclaim) {
- WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
+ if (WARN_ON_ONCE(!wbc->for_reclaim))
+ goto redirty;
+
+ if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
goto redirty;
+
+ if (!total_swap_pages)
+ goto redirty;
+
+ /*
+ * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or
+ * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
+ * and its shmem_writeback() needs them to be split when swapping.
+ */
+ if (folio_test_large(folio)) {
+ /* Ensure the subpages are still dirty */
+ folio_test_set_dirty(folio);
+ if (split_huge_page(page) < 0)
+ goto redirty;
+ folio = page_folio(page);
+ folio_clear_dirty(folio);
}
+ index = folio->index;
+
/*
* This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
* value into swapfile.c, the only way we can correctly account for a
- * fallocated page arriving here is now to initialize it and write it.
+ * fallocated folio arriving here is now to initialize it and write it.
*
- * That's okay for a page already fallocated earlier, but if we have
+ * That's okay for a folio already fallocated earlier, but if we have
* not yet completed the fallocation, then (a) we want to keep track
- * of this page in case we have to undo it, and (b) it may not be a
+ * of this folio in case we have to undo it, and (b) it may not be a
* good idea to continue anyway, once we're pushing into swap. So
- * reactivate the page, and let shmem_fallocate() quit when too many.
+ * reactivate the folio, and let shmem_fallocate() quit when too many.
*/
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
if (inode->i_private) {
struct shmem_falloc *shmem_falloc;
spin_lock(&inode->i_lock);
@@ -1414,18 +1407,18 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (shmem_falloc)
goto redirty;
}
- clear_highpage(page);
- flush_dcache_page(page);
- SetPageUptodate(page);
+ folio_zero_range(folio, 0, folio_size(folio));
+ flush_dcache_folio(folio);
+ folio_mark_uptodate(folio);
}
- swap = get_swap_page(page);
+ swap = folio_alloc_swap(folio);
if (!swap.val)
goto redirty;
/*
* Add inode to shmem_unuse()'s list of swapped-out inodes,
- * if it's not already there. Do it now before the page is
+ * if it's not already there. Do it now before the folio is
* moved to swap cache, when its pagelock no longer protects
* the inode from eviction. But don't unlock the mutex until
* we've incremented swapped, because shmem_unuse_inode() will
@@ -1435,7 +1428,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (list_empty(&info->swaplist))
list_add(&info->swaplist, &shmem_swaplist);
- if (add_to_swap_cache(page, swap,
+ if (add_to_swap_cache(folio, swap,
__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
NULL) == 0) {
spin_lock_irq(&info->lock);
@@ -1444,21 +1437,21 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
spin_unlock_irq(&info->lock);
swap_shmem_alloc(swap);
- shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
+ shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
mutex_unlock(&shmem_swaplist_mutex);
- BUG_ON(page_mapped(page));
- swap_writepage(page, wbc);
+ BUG_ON(folio_mapped(folio));
+ swap_writepage(&folio->page, wbc);
return 0;
}
mutex_unlock(&shmem_swaplist_mutex);
- put_swap_page(page, swap);
+ put_swap_folio(folio, swap);
redirty:
- set_page_dirty(page);
+ folio_mark_dirty(folio);
if (wbc->for_reclaim)
- return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */
- unlock_page(page);
+ return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
+ folio_unlock(folio);
return 0;
}
@@ -1479,10 +1472,10 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
struct mempolicy *mpol = NULL;
if (sbinfo->mpol) {
- spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
+ raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
mpol = sbinfo->mpol;
mpol_get(mpol);
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
}
return mpol;
}
@@ -1515,29 +1508,55 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
mpol_cond_put(vma->vm_policy);
}
-static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
struct page *page;
- struct vm_fault vmf;
+ struct vm_fault vmf = {
+ .vma = &pvma,
+ };
shmem_pseudo_vma_init(&pvma, info, index);
- vmf.vma = &pvma;
- vmf.address = 0;
page = swap_cluster_readahead(swap, gfp, &vmf);
shmem_pseudo_vma_destroy(&pvma);
- return page;
+ if (!page)
+ return NULL;
+ return page_folio(page);
}
-static struct page *shmem_alloc_hugepage(gfp_t gfp,
+/*
+ * Make sure huge_gfp is always more limited than limit_gfp.
+ * Some of the flags set permissions, while others set limitations.
+ */
+static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
+{
+ gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
+ gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
+ gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
+ gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
+
+ /* Allow allocations only from the originally specified zones. */
+ result |= zoneflags;
+
+ /*
+ * Minimize the result gfp by taking the union with the deny flags,
+ * and the intersection of the allow flags.
+ */
+ result |= (limit_gfp & denyflags);
+ result |= (huge_gfp & limit_gfp) & allowflags;
+
+ return result;
+}
+
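
Editor's note: spelled out as one expression, the result computed above is (a restatement of the code, nothing new):

/*
 *   result = (huge_gfp  & ~(allowflags | GFP_ZONEMASK))   - everything else
 *	    | (limit_gfp &   GFP_ZONEMASK)		    - zones from limit
 *	    | (limit_gfp &   denyflags)			    - deny: union
 *	    | (huge_gfp  &   limit_gfp & allowflags)	    - allow: intersection
 *
 * so __GFP_IO, __GFP_FS and __GFP_RECLAIM survive only when both masks carry
 * them, __GFP_NOWARN and __GFP_NORETRY are kept when either mask carries
 * them, and the zone modifiers always come from the limiting gfp.
 */
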
+static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
struct address_space *mapping = info->vfs_inode.i_mapping;
pgoff_t hindex;
- struct page *page;
+ struct folio *folio;
hindex = round_down(index, HPAGE_PMD_NR);
if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
@@ -1545,35 +1564,31 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
return NULL;
shmem_pseudo_vma_init(&pvma, info, hindex);
- page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
- HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+ folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
shmem_pseudo_vma_destroy(&pvma);
- if (page)
- prep_transhuge_page(page);
- else
+ if (!folio)
count_vm_event(THP_FILE_FALLBACK);
- return page;
+ return folio;
}
-static struct page *shmem_alloc_page(gfp_t gfp,
+static struct folio *shmem_alloc_folio(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
- struct page *page;
+ struct folio *folio;
shmem_pseudo_vma_init(&pvma, info, index);
- page = alloc_page_vma(gfp, &pvma, 0);
+ folio = vma_alloc_folio(gfp, 0, &pvma, 0, false);
shmem_pseudo_vma_destroy(&pvma);
- return page;
+ return folio;
}
-static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
- struct inode *inode,
+static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
pgoff_t index, bool huge)
{
struct shmem_inode_info *info = SHMEM_I(inode);
- struct page *page;
+ struct folio *folio;
int nr;
int err = -ENOSPC;
@@ -1585,13 +1600,13 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
goto failed;
if (huge)
- page = shmem_alloc_hugepage(gfp, info, index);
+ folio = shmem_alloc_hugefolio(gfp, info, index);
else
- page = shmem_alloc_page(gfp, info, index);
- if (page) {
- __SetPageLocked(page);
- __SetPageSwapBacked(page);
- return page;
+ folio = shmem_alloc_folio(gfp, info, index);
+ if (folio) {
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
+ return folio;
}
err = -ENOMEM;
@@ -1602,7 +1617,7 @@ failed:
/*
* When a page is moved from swapcache to shmem filecache (either by the
- * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
+ * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
* shmem_unuse_inode()), it may have been read in earlier from swap, in
* ignorance of the mapping it belongs to. If that mapping has special
* constraints (like the gma500 GEM driver, which requires RAM below 4GB),
@@ -1612,54 +1627,57 @@ failed:
* NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
* but for now it is a simple matter of zone.
*/
-static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
+static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
{
- return page_zonenum(page) > gfp_zone(gfp);
+ return folio_zonenum(folio) > gfp_zone(gfp);
}
-static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
- struct page *oldpage, *newpage;
+ struct folio *old, *new;
struct address_space *swap_mapping;
swp_entry_t entry;
pgoff_t swap_index;
int error;
- oldpage = *pagep;
- entry.val = page_private(oldpage);
+ old = *foliop;
+ entry = folio_swap_entry(old);
swap_index = swp_offset(entry);
- swap_mapping = page_mapping(oldpage);
+ swap_mapping = swap_address_space(entry);
/*
* We have arrived here because our zones are constrained, so don't
* limit chance of success by further cpuset and node constraints.
*/
gfp &= ~GFP_CONSTRAINT_MASK;
- newpage = shmem_alloc_page(gfp, info, index);
- if (!newpage)
+ VM_BUG_ON_FOLIO(folio_test_large(old), old);
+ new = shmem_alloc_folio(gfp, info, index);
+ if (!new)
return -ENOMEM;
- get_page(newpage);
- copy_highpage(newpage, oldpage);
- flush_dcache_page(newpage);
+ folio_get(new);
+ folio_copy(new, old);
+ flush_dcache_folio(new);
- __SetPageLocked(newpage);
- __SetPageSwapBacked(newpage);
- SetPageUptodate(newpage);
- set_page_private(newpage, entry.val);
- SetPageSwapCache(newpage);
+ __folio_set_locked(new);
+ __folio_set_swapbacked(new);
+ folio_mark_uptodate(new);
+ folio_set_swap_entry(new, entry);
+ folio_set_swapcache(new);
/*
* Our caller will very soon move newpage out of swapcache, but it's
* a nice clean interface for us to replace oldpage by newpage there.
*/
xa_lock_irq(&swap_mapping->i_pages);
- error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
+ error = shmem_replace_entry(swap_mapping, swap_index, old, new);
if (!error) {
- mem_cgroup_migrate(oldpage, newpage);
- __inc_lruvec_page_state(newpage, NR_FILE_PAGES);
- __dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
+ mem_cgroup_migrate(old, new);
+ __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1);
+ __lruvec_stat_mod_folio(new, NR_SHMEM, 1);
+ __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1);
+ __lruvec_stat_mod_folio(old, NR_SHMEM, -1);
}
xa_unlock_irq(&swap_mapping->i_pages);
@@ -1669,46 +1687,87 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
* both PageSwapCache and page_private after getting page lock;
* but be defensive. Reverse old to newpage for clear and free.
*/
- oldpage = newpage;
+ old = new;
} else {
- lru_cache_add(newpage);
- *pagep = newpage;
+ folio_add_lru(new);
+ *foliop = new;
}
- ClearPageSwapCache(oldpage);
- set_page_private(oldpage, 0);
+ folio_clear_swapcache(old);
+ old->private = NULL;
- unlock_page(oldpage);
- put_page(oldpage);
- put_page(oldpage);
+ folio_unlock(old);
+ folio_put_refs(old, 2);
return error;
}
+static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
+ struct folio *folio, swp_entry_t swap)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ swp_entry_t swapin_error;
+ void *old;
+
+ swapin_error = make_swapin_error_entry();
+ old = xa_cmpxchg_irq(&mapping->i_pages, index,
+ swp_to_radix_entry(swap),
+ swp_to_radix_entry(swapin_error), 0);
+ if (old != swp_to_radix_entry(swap))
+ return;
+
+ folio_wait_writeback(folio);
+ delete_from_swap_cache(folio);
+ spin_lock_irq(&info->lock);
+ /*
+ * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
+ * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in
+ * shmem_evict_inode.
+ */
+ info->alloced--;
+ info->swapped--;
+ shmem_recalc_inode(inode);
+ spin_unlock_irq(&info->lock);
+ swap_free(swap);
+}
+
/*
- * Swap in the page pointed to by *pagep.
- * Caller has to make sure that *pagep contains a valid swapped page.
- * Returns 0 and the page in pagep if success. On failure, returns the
- * error code and NULL in *pagep.
+ * Swap in the folio pointed to by *foliop.
+ * Caller has to make sure that *foliop contains a valid swapped folio.
+ * Returns 0 and the folio in foliop if success. On failure, returns the
+ * error code and NULL in *foliop.
*/
-static int shmem_swapin_page(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp,
+static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
+ struct folio **foliop, enum sgp_type sgp,
gfp_t gfp, struct vm_area_struct *vma,
vm_fault_t *fault_type)
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
- struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
- struct page *page;
+ struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
+ struct swap_info_struct *si;
+ struct folio *folio = NULL;
swp_entry_t swap;
int error;
- VM_BUG_ON(!*pagep || !xa_is_value(*pagep));
- swap = radix_to_swp_entry(*pagep);
- *pagep = NULL;
+ VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
+ swap = radix_to_swp_entry(*foliop);
+ *foliop = NULL;
+
+ if (is_swapin_error_entry(swap))
+ return -EIO;
+
+ si = get_swap_device(swap);
+ if (!si) {
+ if (!shmem_confirm_swap(mapping, index, swap))
+ return -EEXIST;
+ else
+ return -EINVAL;
+ }
/* Look it up and read it in.. */
- page = lookup_swap_cache(swap, NULL, 0);
- if (!page) {
+ folio = swap_cache_get_folio(swap, NULL, 0);
+ if (!folio) {
/* Or update major stats only when swapin succeeds?? */
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
@@ -1716,39 +1775,40 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
count_memcg_event_mm(charge_mm, PGMAJFAULT);
}
/* Here we actually start the io */
- page = shmem_swapin(swap, gfp, info, index);
- if (!page) {
+ folio = shmem_swapin(swap, gfp, info, index);
+ if (!folio) {
error = -ENOMEM;
goto failed;
}
}
- /* We have to do this with page locked to prevent races */
- lock_page(page);
- if (!PageSwapCache(page) || page_private(page) != swap.val ||
+ /* We have to do this with folio locked to prevent races */
+ folio_lock(folio);
+ if (!folio_test_swapcache(folio) ||
+ folio_swap_entry(folio).val != swap.val ||
!shmem_confirm_swap(mapping, index, swap)) {
error = -EEXIST;
goto unlock;
}
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
error = -EIO;
goto failed;
}
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
/*
* Some architectures may have to restore extra metadata to the
- * physical page after reading from swap.
+ * folio after reading from swap.
*/
- arch_swap_restore(swap, page);
+ arch_swap_restore(swap, folio);
- if (shmem_should_replace_page(page, gfp)) {
- error = shmem_replace_page(&page, gfp, info, index);
+ if (shmem_should_replace_folio(folio, gfp)) {
+ error = shmem_replace_folio(&folio, gfp, info, index);
if (error)
goto failed;
}
- error = shmem_add_to_page_cache(page, mapping, index,
+ error = shmem_add_to_page_cache(folio, mapping, index,
swp_to_radix_entry(swap), gfp,
charge_mm);
if (error)
@@ -1760,56 +1820,58 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
spin_unlock_irq(&info->lock);
if (sgp == SGP_WRITE)
- mark_page_accessed(page);
+ folio_mark_accessed(folio);
- delete_from_swap_cache(page);
- set_page_dirty(page);
+ delete_from_swap_cache(folio);
+ folio_mark_dirty(folio);
swap_free(swap);
+ put_swap_device(si);
- *pagep = page;
+ *foliop = folio;
return 0;
failed:
if (!shmem_confirm_swap(mapping, index, swap))
error = -EEXIST;
+ if (error == -EIO)
+ shmem_set_folio_swapin_error(inode, index, folio, swap);
unlock:
- if (page) {
- unlock_page(page);
- put_page(page);
+ if (folio) {
+ folio_unlock(folio);
+ folio_put(folio);
}
+ put_swap_device(si);
return error;
}
/*
- * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
+ * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate
*
* If we allocate a new one we do not mark it dirty. That's up to the
* vm. If we swap it in we mark it dirty since we also free the swap
* entry since a page cannot live in both the swap and page cache.
*
- * vmf and fault_type are only supplied by shmem_fault:
+ * vma, vmf, and fault_type are only supplied by shmem_fault:
* otherwise they are NULL.
*/
-static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp, gfp_t gfp,
- struct vm_area_struct *vma, struct vm_fault *vmf,
- vm_fault_t *fault_type)
+static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
+ struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
+ struct vm_area_struct *vma, struct vm_fault *vmf,
+ vm_fault_t *fault_type)
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo;
struct mm_struct *charge_mm;
- struct page *page;
- enum sgp_type sgp_huge = sgp;
- pgoff_t hindex = index;
+ struct folio *folio;
+ pgoff_t hindex;
+ gfp_t huge_gfp;
int error;
int once = 0;
int alloced = 0;
if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
return -EFBIG;
- if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
- sgp = SGP_CACHE;
repeat:
if (sgp <= SGP_CACHE &&
((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
@@ -1817,39 +1879,58 @@ repeat:
}
sbinfo = SHMEM_SB(inode->i_sb);
- charge_mm = vma ? vma->vm_mm : current->mm;
+ charge_mm = vma ? vma->vm_mm : NULL;
+
+ folio = filemap_get_entry(mapping, index);
+ if (folio && vma && userfaultfd_minor(vma)) {
+ if (!xa_is_value(folio))
+ folio_put(folio);
+ *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
+ return 0;
+ }
- page = find_lock_entry(mapping, index);
- if (xa_is_value(page)) {
- error = shmem_swapin_page(inode, index, &page,
+ if (xa_is_value(folio)) {
+ error = shmem_swapin_folio(inode, index, &folio,
sgp, gfp, vma, fault_type);
if (error == -EEXIST)
goto repeat;
- *pagep = page;
+ *foliop = folio;
return error;
}
- if (page)
- hindex = page->index;
- if (page && sgp == SGP_WRITE)
- mark_page_accessed(page);
+ if (folio) {
+ folio_lock(folio);
- /* fallocated page? */
- if (page && !PageUptodate(page)) {
+ /* Has the folio been truncated or swapped out? */
+ if (unlikely(folio->mapping != mapping)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto repeat;
+ }
+ if (sgp == SGP_WRITE)
+ folio_mark_accessed(folio);
+ if (folio_test_uptodate(folio))
+ goto out;
+ /* fallocated folio */
if (sgp != SGP_READ)
goto clear;
- unlock_page(page);
- put_page(page);
- page = NULL;
- hindex = index;
+ folio_unlock(folio);
+ folio_put(folio);
}
- if (page || sgp == SGP_READ)
- goto out;
/*
- * Fast cache lookup did not find it:
- * bring it back from swap or allocate.
+ * SGP_READ: succeed on hole, with NULL folio, letting caller zero.
+ * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
+ */
+ *foliop = NULL;
+ if (sgp == SGP_READ)
+ return 0;
+ if (sgp == SGP_NOALLOC)
+ return -ENOENT;
+
+ /*
+ * Fast cache lookup and swap lookup did not find it: allocate.
*/
if (vma && userfaultfd_missing(vma)) {
@@ -1857,51 +1938,26 @@ repeat:
return 0;
}
- /* shmem_symlink() */
- if (mapping->a_ops != &shmem_aops)
- goto alloc_nohuge;
- if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
+ if (!shmem_is_huge(inode, index, false,
+ vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0))
goto alloc_nohuge;
- if (shmem_huge == SHMEM_HUGE_FORCE)
- goto alloc_huge;
- switch (sbinfo->huge) {
- case SHMEM_HUGE_NEVER:
- goto alloc_nohuge;
- case SHMEM_HUGE_WITHIN_SIZE: {
- loff_t i_size;
- pgoff_t off;
- off = round_up(index, HPAGE_PMD_NR);
- i_size = round_up(i_size_read(inode), PAGE_SIZE);
- if (i_size >= HPAGE_PMD_SIZE &&
- i_size >> PAGE_SHIFT >= off)
- goto alloc_huge;
-
- fallthrough;
- }
- case SHMEM_HUGE_ADVISE:
- if (sgp_huge == SGP_HUGE)
- goto alloc_huge;
- /* TODO: implement fadvise() hints */
- goto alloc_nohuge;
- }
-
-alloc_huge:
- page = shmem_alloc_and_acct_page(gfp, inode, index, true);
- if (IS_ERR(page)) {
+ huge_gfp = vma_thp_gfp_mask(vma);
+ huge_gfp = limit_gfp_mask(huge_gfp, gfp);
+ folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true);
+ if (IS_ERR(folio)) {
alloc_nohuge:
- page = shmem_alloc_and_acct_page(gfp, inode,
- index, false);
+ folio = shmem_alloc_and_acct_folio(gfp, inode, index, false);
}
- if (IS_ERR(page)) {
+ if (IS_ERR(folio)) {
int retry = 5;
- error = PTR_ERR(page);
- page = NULL;
+ error = PTR_ERR(folio);
+ folio = NULL;
if (error != -ENOSPC)
goto unlock;
/*
- * Try to reclaim some space by splitting a huge page
+ * Try to reclaim some space by splitting a large folio
* beyond i_size on the filesystem.
*/
while (retry--) {
@@ -1916,33 +1972,30 @@ alloc_nohuge:
goto unlock;
}
- if (PageTransHuge(page))
- hindex = round_down(index, HPAGE_PMD_NR);
- else
- hindex = index;
+ hindex = round_down(index, folio_nr_pages(folio));
if (sgp == SGP_WRITE)
- __SetPageReferenced(page);
+ __folio_set_referenced(folio);
- error = shmem_add_to_page_cache(page, mapping, hindex,
+ error = shmem_add_to_page_cache(folio, mapping, hindex,
NULL, gfp & GFP_RECLAIM_MASK,
charge_mm);
if (error)
goto unacct;
- lru_cache_add(page);
+ folio_add_lru(folio);
spin_lock_irq(&info->lock);
- info->alloced += compound_nr(page);
- inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
+ info->alloced += folio_nr_pages(folio);
+ inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio);
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
alloced = true;
- if (PageTransHuge(page) &&
+ if (folio_test_pmd_mappable(folio) &&
DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
- hindex + HPAGE_PMD_NR - 1) {
+ folio_next_index(folio) - 1) {
/*
- * Part of the huge page is beyond i_size: subject
+ * Part of the large folio is beyond i_size: subject
* to shrink under memory pressure.
*/
spin_lock(&sbinfo->shrinklist_lock);
@@ -1959,32 +2012,31 @@ alloc_nohuge:
}
/*
- * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+ * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
*/
if (sgp == SGP_FALLOC)
sgp = SGP_WRITE;
clear:
/*
- * Let SGP_WRITE caller clear ends if write does not fill page;
- * but SGP_FALLOC on a page fallocated earlier must initialize
+ * Let SGP_WRITE caller clear ends if write does not fill folio;
+ * but SGP_FALLOC on a folio fallocated earlier must initialize
* it now, lest undo on failure cancel our earlier guarantee.
*/
- if (sgp != SGP_WRITE && !PageUptodate(page)) {
- int i;
+ if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
+ long i, n = folio_nr_pages(folio);
- for (i = 0; i < compound_nr(page); i++) {
- clear_highpage(page + i);
- flush_dcache_page(page + i);
- }
- SetPageUptodate(page);
+ for (i = 0; i < n; i++)
+ clear_highpage(folio_page(folio, i));
+ flush_dcache_folio(folio);
+ folio_mark_uptodate(folio);
}
/* Perhaps the file has been truncated since we checked */
if (sgp <= SGP_CACHE &&
((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
if (alloced) {
- ClearPageDirty(page);
- delete_from_page_cache(page);
+ folio_clear_dirty(folio);
+ filemap_remove_folio(folio);
spin_lock_irq(&info->lock);
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
@@ -1993,24 +2045,24 @@ clear:
goto unlock;
}
out:
- *pagep = page + index - hindex;
+ *foliop = folio;
return 0;
/*
* Error recovery.
*/
unacct:
- shmem_inode_unacct_blocks(inode, compound_nr(page));
+ shmem_inode_unacct_blocks(inode, folio_nr_pages(folio));
- if (PageTransHuge(page)) {
- unlock_page(page);
- put_page(page);
+ if (folio_test_large(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
goto alloc_nohuge;
}
unlock:
- if (page) {
- unlock_page(page);
- put_page(page);
+ if (folio) {
+ folio_unlock(folio);
+ folio_put(folio);
}
if (error == -ENOSPC && !once++) {
spin_lock_irq(&info->lock);
@@ -2023,6 +2075,13 @@ unlock:
return error;
}
+int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
+ enum sgp_type sgp)
+{
+ return shmem_get_folio_gfp(inode, index, foliop, sgp,
+ mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
+}
+
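
Editor's note: a minimal sketch of how an in-kernel user of the new helper handles the SGP_READ "NULL folio means hole" convention documented above (the function below is hypothetical, not part of the patch):

static int shmem_peek_byte(struct inode *inode, pgoff_t index, u8 *val)
{
	struct folio *folio = NULL;
	u8 *kaddr;
	int err;

	err = shmem_get_folio(inode, index, &folio, SGP_READ);
	if (err)
		return err;
	if (!folio) {			/* hole: SGP_READ never allocates */
		*val = 0;
		return 0;
	}
	/* shmem_get_folio() returns the folio locked and with a reference */
	kaddr = kmap_local_folio(folio, 0);
	*val = *kaddr;
	kunmap_local(kaddr);
	folio_unlock(folio);
	folio_put(folio);
	return 0;
}
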
/*
* This is like autoremove_wake_function, but it removes the wait queue
* entry unconditionally - even if something else had already woken the
@@ -2040,14 +2099,14 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct inode *inode = file_inode(vma->vm_file);
gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
- enum sgp_type sgp;
+ struct folio *folio = NULL;
int err;
vm_fault_t ret = VM_FAULT_LOCKED;
/*
* Trinity finds that probing a hole which tmpfs is punching can
* prevent the hole-punch from ever completing: which in turn
- * locks writers out with its hold on i_mutex. So refrain from
+ * locks writers out with its hold on i_rwsem. So refrain from
* faulting pages into the hole while it's being punched. Although
* shmem_undo_range() does remove the additions, it may be unable to
* keep up, as each new page needs its own unmap_mapping_range() call,
@@ -2058,7 +2117,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
* we just need to make racing faults a rare case.
*
* The implementation below would be much simpler if we just used a
- * standard mutex or completion: but we cannot take i_mutex in fault,
+ * standard mutex or completion: but we cannot take i_rwsem in fault,
* and bloating every shmem inode for this unlikely case would be sad.
*/
if (unlikely(inode->i_private)) {
@@ -2103,18 +2162,12 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
spin_unlock(&inode->i_lock);
}
- sgp = SGP_CACHE;
-
- if ((vma->vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
- sgp = SGP_NOHUGE;
- else if (vma->vm_flags & VM_HUGEPAGE)
- sgp = SGP_HUGE;
-
- err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
+ err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
gfp, vma, vmf, &ret);
if (err)
return vmf_error(err);
+ if (folio)
+ vmf->page = folio_file_page(folio, vmf->pgoff);
return ret;
}
@@ -2225,7 +2278,7 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
}
#endif
-int shmem_lock(struct file *file, int lock, struct user_struct *user)
+int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
struct inode *inode = file_inode(file);
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -2237,13 +2290,13 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
* no serialization needed when called from shm_destroy().
*/
if (lock && !(info->flags & VM_LOCKED)) {
- if (!user_shm_lock(inode->i_size, user))
+ if (!user_shm_lock(inode->i_size, ucounts))
goto out_nomem;
info->flags |= VM_LOCKED;
mapping_set_unevictable(file->f_mapping);
}
- if (!lock && (info->flags & VM_LOCKED) && user) {
- user_shm_unlock(inode->i_size, user);
+ if (!lock && (info->flags & VM_LOCKED) && ucounts) {
+ user_shm_unlock(inode->i_size, ucounts);
info->flags &= ~VM_LOCKED;
mapping_clear_unevictable(file->f_mapping);
}
@@ -2255,42 +2308,58 @@ out_nomem:
static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
- struct shmem_inode_info *info = SHMEM_I(file_inode(file));
-
- if (info->seals & F_SEAL_FUTURE_WRITE) {
- /*
- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
- * "future write" seal active.
- */
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
- return -EPERM;
+ struct inode *inode = file_inode(file);
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ int ret;
- /*
- * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
- * MAP_SHARED and read-only, take care to not allow mprotect to
- * revert protections on such mappings. Do this only for shared
- * mappings. For private mappings, don't need to mask
- * VM_MAYWRITE as we still want them to be COW-writable.
- */
- if (vma->vm_flags & VM_SHARED)
- vma->vm_flags &= ~(VM_MAYWRITE);
- }
+ ret = seal_check_future_write(info->seals, vma);
+ if (ret)
+ return ret;
/* arm64 - allow memory tagging on RAM-based files */
- vma->vm_flags |= VM_MTE_ALLOWED;
+ vm_flags_set(vma, VM_MTE_ALLOWED);
file_accessed(file);
- vma->vm_ops = &shmem_vm_ops;
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
- (vma->vm_end & HPAGE_PMD_MASK)) {
- khugepaged_enter(vma, vma->vm_flags);
- }
+ /* This is anonymous shared memory if it is unlinked at the time of mmap */
+ if (inode->i_nlink)
+ vma->vm_ops = &shmem_vm_ops;
+ else
+ vma->vm_ops = &shmem_anon_vm_ops;
return 0;
}
-static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
- umode_t mode, dev_t dev, unsigned long flags)
+#ifdef CONFIG_TMPFS_XATTR
+static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
+
+/*
+ * chattr's fsflags are unrelated to extended attributes,
+ * but tmpfs has chosen to enable them under the same config option.
+ */
+static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
+{
+ unsigned int i_flags = 0;
+
+ if (fsflags & FS_NOATIME_FL)
+ i_flags |= S_NOATIME;
+ if (fsflags & FS_APPEND_FL)
+ i_flags |= S_APPEND;
+ if (fsflags & FS_IMMUTABLE_FL)
+ i_flags |= S_IMMUTABLE;
+ /*
+ * But FS_NODUMP_FL does not require any action in i_flags.
+ */
+ inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE);
+}
+#else
+static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
+{
+}
+#define shmem_initxattrs NULL
+#endif
+
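
Editor's note: these fsflags are the ones a chattr-style FS_IOC_GETFLAGS/SETFLAGS ioctl manipulates; kernels that carry this fsflags support also wire tmpfs up to those ioctls via the fileattr helpers. An illustrative userspace sketch:

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>

int main(int argc, char **argv)
{
	int flags;
	int fd = argc > 1 ? open(argv[1], O_RDONLY) : -1;

	if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) != 0)
		return 1;
	flags |= FS_NOATIME_FL;			/* like chattr +A */
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags) != 0)
		perror("FS_IOC_SETFLAGS");
	else
		printf("fsflags now %#x\n", flags);
	return 0;
}
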
+static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb,
+ struct inode *dir, umode_t mode, dev_t dev,
+ unsigned long flags)
{
struct inode *inode;
struct shmem_inode_info *info;
@@ -2303,20 +2372,28 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
inode = new_inode(sb);
if (inode) {
inode->i_ino = ino;
- inode_init_owner(inode, dir, mode);
+ inode_init_owner(idmap, inode, dir, mode);
inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
- inode->i_generation = prandom_u32();
+ inode->i_generation = get_random_u32();
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
spin_lock_init(&info->lock);
atomic_set(&info->stop_eviction, 0);
info->seals = F_SEAL_SEAL;
info->flags = flags & VM_NORESERVE;
+ info->i_crtime = inode->i_mtime;
+ info->fsflags = (dir == NULL) ? 0 :
+ SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
+ if (info->fsflags)
+ shmem_set_inode_flags(inode, info->fsflags);
INIT_LIST_HEAD(&info->shrinklist);
INIT_LIST_HEAD(&info->swaplist);
+ if (sbinfo->noswap)
+ mapping_set_unevictable(inode->i_mapping);
simple_xattrs_init(&info->xattrs);
cache_no_acl(inode);
+ mapping_set_large_folios(inode->i_mapping);
switch (mode & S_IFMT) {
default:
@@ -2352,104 +2429,105 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
return inode;
}
-bool shmem_mapping(struct address_space *mapping)
-{
- return mapping->a_ops == &shmem_aops;
-}
-
-static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- bool zeropage,
- struct page **pagep)
+#ifdef CONFIG_USERFAULTFD
+int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ uffd_flags_t flags,
+ struct folio **foliop)
{
struct inode *inode = file_inode(dst_vma->vm_file);
struct shmem_inode_info *info = SHMEM_I(inode);
struct address_space *mapping = inode->i_mapping;
gfp_t gfp = mapping_gfp_mask(mapping);
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
- spinlock_t *ptl;
void *page_kaddr;
- struct page *page;
- pte_t _dst_pte, *dst_pte;
+ struct folio *folio;
int ret;
- pgoff_t offset, max_off;
+ pgoff_t max_off;
- ret = -ENOMEM;
- if (!shmem_inode_acct_block(inode, 1))
- goto out;
+ if (!shmem_inode_acct_block(inode, 1)) {
+ /*
+ * We may have got a page, returned -ENOENT triggering a retry,
+ * and now we find ourselves with -ENOMEM. Release the page, to
+ * avoid a BUG_ON in our caller.
+ */
+ if (unlikely(*foliop)) {
+ folio_put(*foliop);
+ *foliop = NULL;
+ }
+ return -ENOMEM;
+ }
- if (!*pagep) {
- page = shmem_alloc_page(gfp, info, pgoff);
- if (!page)
+ if (!*foliop) {
+ ret = -ENOMEM;
+ folio = shmem_alloc_folio(gfp, info, pgoff);
+ if (!folio)
goto out_unacct_blocks;
- if (!zeropage) { /* mcopy_atomic */
- page_kaddr = kmap_atomic(page);
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
+ page_kaddr = kmap_local_folio(folio, 0);
+ /*
+ * The read mmap_lock is held here. Despite the
+ * mmap_lock being read recursive a deadlock is still
+ * possible if a writer has taken a lock. For example:
+ *
+ * process A thread 1 takes read lock on own mmap_lock
+ * process A thread 2 calls mmap, blocks taking write lock
+ * process B thread 1 takes page fault, read lock on own mmap lock
+ * process B thread 2 calls mmap, blocks taking write lock
+ * process A thread 1 blocks taking read lock on process B
+ * process B thread 1 blocks taking read lock on process A
+ *
+ * Disable page faults to prevent potential deadlock
+ * and retry the copy outside the mmap_lock.
+ */
+ pagefault_disable();
ret = copy_from_user(page_kaddr,
(const void __user *)src_addr,
PAGE_SIZE);
- kunmap_atomic(page_kaddr);
+ pagefault_enable();
+ kunmap_local(page_kaddr);
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
- *pagep = page;
- shmem_inode_unacct_blocks(inode, 1);
+ *foliop = folio;
+ ret = -ENOENT;
/* don't free the page */
- return -ENOENT;
+ goto out_unacct_blocks;
}
- } else { /* mfill_zeropage_atomic */
- clear_highpage(page);
+
+ flush_dcache_folio(folio);
+ } else { /* ZEROPAGE */
+ clear_user_highpage(&folio->page, dst_addr);
}
} else {
- page = *pagep;
- *pagep = NULL;
+ folio = *foliop;
+ VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
+ *foliop = NULL;
}
- VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
- __SetPageLocked(page);
- __SetPageSwapBacked(page);
- __SetPageUptodate(page);
+ VM_BUG_ON(folio_test_locked(folio));
+ VM_BUG_ON(folio_test_swapbacked(folio));
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
+ __folio_mark_uptodate(folio);
ret = -EFAULT;
- offset = linear_page_index(dst_vma, dst_addr);
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= max_off))
+ if (unlikely(pgoff >= max_off))
goto out_release;
- ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
- gfp & GFP_RECLAIM_MASK, dst_mm);
+ ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL,
+ gfp & GFP_RECLAIM_MASK, dst_vma->vm_mm);
if (ret)
goto out_release;
- _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
- if (dst_vma->vm_flags & VM_WRITE)
- _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
- else {
- /*
- * We don't set the pte dirty if the vma has no
- * VM_WRITE permission, so mark the page dirty or it
- * could be freed from under us. We could do it
- * unconditionally before unlock_page(), but doing it
- * only if VM_WRITE is not set is faster.
- */
- set_page_dirty(page);
- }
-
- dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
-
- ret = -EFAULT;
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= max_off))
- goto out_release_unlock;
-
- ret = -EEXIST;
- if (!pte_none(*dst_pte))
- goto out_release_unlock;
-
- lru_cache_add(page);
+ ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
+ &folio->page, true, flags);
+ if (ret)
+ goto out_delete_from_cache;
spin_lock_irq(&info->lock);
info->alloced++;
@@ -2457,71 +2535,35 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
- inc_mm_counter(dst_mm, mm_counter_file(page));
- page_add_file_rmap(page, false);
- set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
-
- /* No need to invalidate - it was non-present before */
- update_mmu_cache(dst_vma, dst_addr, dst_pte);
- pte_unmap_unlock(dst_pte, ptl);
- unlock_page(page);
- ret = 0;
-out:
- return ret;
-out_release_unlock:
- pte_unmap_unlock(dst_pte, ptl);
- ClearPageDirty(page);
- delete_from_page_cache(page);
+ folio_unlock(folio);
+ return 0;
+out_delete_from_cache:
+ filemap_remove_folio(folio);
out_release:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
out_unacct_blocks:
shmem_inode_unacct_blocks(inode, 1);
- goto out;
-}
-
-int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- struct page **pagep)
-{
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, src_addr, false, pagep);
-}
-
-int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr)
-{
- struct page *page = NULL;
-
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, 0, true, &page);
+ return ret;
}
+#endif /* CONFIG_USERFAULTFD */
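
Editor's note: the -ENOENT return with the folio handed back in *foliop assumes a caller that redoes the user copy outside the mmap_lock and then retries; roughly the shape of what the mfill_atomic() loop in mm/userfaultfd.c does. Simplified sketch only, not the literal code (dst_mm is the target mm the caller holds read-locked; the other variables are the caller's):

	err = shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
				     src_addr, flags, &folio);
	if (err == -ENOENT) {		/* folio came back in *foliop */
		void *kaddr;

		mmap_read_unlock(dst_mm);	/* faults allowed again */
		kaddr = kmap_local_folio(folio, 0);
		err = copy_from_user(kaddr, (const void __user *)src_addr,
				     PAGE_SIZE) ? -EFAULT : 0;
		kunmap_local(kaddr);
		if (!err) {
			flush_dcache_folio(folio);
			/* retake mmap_lock, revalidate dst_vma, then retry */
		}
	}
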
#ifdef CONFIG_TMPFS
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;
-#ifdef CONFIG_TMPFS_XATTR
-static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
-#else
-#define shmem_initxattrs NULL
-#endif
-
static int
shmem_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t index = pos >> PAGE_SHIFT;
+ struct folio *folio;
+ int ret = 0;
- /* i_mutex is held by caller */
+ /* i_rwsem is held by caller */
if (unlikely(info->seals & (F_SEAL_GROW |
F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
@@ -2530,7 +2572,20 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
return -EPERM;
}
- return shmem_getpage(inode, index, pagep, SGP_WRITE);
+ ret = shmem_get_folio(inode, index, &folio, SGP_WRITE);
+
+ if (ret)
+ return ret;
+
+ *pagep = folio_file_page(folio, index);
+ if (PageHWPoison(*pagep)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ *pagep = NULL;
+ return -EIO;
+ }
+
+ return 0;
}
static int
@@ -2538,33 +2593,23 @@ shmem_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
+ struct folio *folio = page_folio(page);
struct inode *inode = mapping->host;
if (pos + copied > inode->i_size)
i_size_write(inode, pos + copied);
- if (!PageUptodate(page)) {
- struct page *head = compound_head(page);
- if (PageTransCompound(page)) {
- int i;
-
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- if (head + i == page)
- continue;
- clear_highpage(head + i);
- flush_dcache_page(head + i);
- }
- }
- if (copied < PAGE_SIZE) {
- unsigned from = pos & (PAGE_SIZE - 1);
- zero_user_segments(page, 0, from,
- from + copied, PAGE_SIZE);
+ if (!folio_test_uptodate(folio)) {
+ if (copied < folio_size(folio)) {
+ size_t from = offset_in_folio(folio, pos);
+ folio_zero_segments(folio, 0, from,
+ from + copied, folio_size(folio));
}
- SetPageUptodate(head);
+ folio_mark_uptodate(folio);
}
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
return copied;
}
@@ -2576,23 +2621,15 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct address_space *mapping = inode->i_mapping;
pgoff_t index;
unsigned long offset;
- enum sgp_type sgp = SGP_READ;
int error = 0;
ssize_t retval = 0;
loff_t *ppos = &iocb->ki_pos;
- /*
- * Might this read be for a stacking filesystem? Then when reading
- * holes of a sparse file, we actually need to allocate those pages,
- * and even mark them dirty, so it cannot exceed the max_blocks limit.
- */
- if (!iter_is_iovec(to))
- sgp = SGP_CACHE;
-
index = *ppos >> PAGE_SHIFT;
offset = *ppos & ~PAGE_MASK;
for (;;) {
+ struct folio *folio = NULL;
struct page *page = NULL;
pgoff_t end_index;
unsigned long nr, ret;
@@ -2607,21 +2644,26 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
break;
}
- error = shmem_getpage(inode, index, &page, sgp);
+ error = shmem_get_folio(inode, index, &folio, SGP_READ);
if (error) {
if (error == -EINVAL)
error = 0;
break;
}
- if (page) {
- if (sgp == SGP_CACHE)
- set_page_dirty(page);
- unlock_page(page);
+ if (folio) {
+ folio_unlock(folio);
+
+ page = folio_file_page(folio, index);
+ if (PageHWPoison(page)) {
+ folio_put(folio);
+ error = -EIO;
+ break;
+ }
}
/*
* We must evaluate after, since reads (unlike writes)
- * are called without i_mutex protection against truncate
+ * are called without i_rwsem protection against truncate
*/
nr = PAGE_SIZE;
i_size = i_size_read(inode);
@@ -2629,14 +2671,14 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (index == end_index) {
nr = i_size & ~PAGE_MASK;
if (nr <= offset) {
- if (page)
- put_page(page);
+ if (folio)
+ folio_put(folio);
break;
}
}
nr -= offset;
- if (page) {
+ if (folio) {
/*
* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
@@ -2648,23 +2690,35 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
* Mark the page accessed if we read the beginning.
*/
if (!offset)
- mark_page_accessed(page);
+ folio_mark_accessed(folio);
+ /*
+ * Ok, we have the page, and it's up-to-date, so
+ * now we can copy it to user space...
+ */
+ ret = copy_page_to_iter(page, offset, nr, to);
+ folio_put(folio);
+
+ } else if (user_backed_iter(to)) {
+ /*
+ * Copy to user tends to be so well optimized, but
+ * clear_user() not so much, that it is noticeably
+ * faster to copy the zero page instead of clearing.
+ */
+ ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
} else {
- page = ZERO_PAGE(0);
- get_page(page);
+ /*
+ * But submitting the same page twice in a row to
+ * splice() - or others? - can result in confusion:
+ * so don't attempt that optimization on pipes etc.
+ */
+ ret = iov_iter_zero(nr, to);
}
- /*
- * Ok, we have the page, and it's up-to-date, so
- * now we can copy it to user space...
- */
- ret = copy_page_to_iter(page, offset, nr, to);
retval += ret;
offset += ret;
index += offset >> PAGE_SHIFT;
offset &= ~PAGE_MASK;
- put_page(page);
if (!iov_iter_count(to))
break;
if (ret < nr) {
@@ -2679,86 +2733,155 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
return retval ? retval : error;
}
-/*
- * llseek SEEK_DATA or SEEK_HOLE through the page cache.
- */
-static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
- pgoff_t index, pgoff_t end, int whence)
+static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
{
- struct page *page;
- struct pagevec pvec;
- pgoff_t indices[PAGEVEC_SIZE];
- bool done = false;
- int i;
+ return true;
+}
+
+static void zero_pipe_buf_release(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+}
- pagevec_init(&pvec);
- pvec.nr = 1; /* start small: we may be there already */
- while (!done) {
- pvec.nr = find_get_entries(mapping, index,
- pvec.nr, pvec.pages, indices);
- if (!pvec.nr) {
- if (whence == SEEK_DATA)
- index = end;
+static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ return false;
+}
+
+static const struct pipe_buf_operations zero_pipe_buf_ops = {
+ .release = zero_pipe_buf_release,
+ .try_steal = zero_pipe_buf_try_steal,
+ .get = zero_pipe_buf_get,
+};
+
+static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
+ loff_t fpos, size_t size)
+{
+ size_t offset = fpos & ~PAGE_MASK;
+
+ size = min_t(size_t, size, PAGE_SIZE - offset);
+
+ if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ struct pipe_buffer *buf = pipe_head_buf(pipe);
+
+ *buf = (struct pipe_buffer) {
+ .ops = &zero_pipe_buf_ops,
+ .page = ZERO_PAGE(0),
+ .offset = offset,
+ .len = size,
+ };
+ pipe->head++;
+ }
+
+ return size;
+}
+
+static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ struct inode *inode = file_inode(in);
+ struct address_space *mapping = inode->i_mapping;
+ struct folio *folio = NULL;
+ size_t total_spliced = 0, used, npages, n, part;
+ loff_t isize;
+ int error = 0;
+
+ /* Work out how much data we can actually add into the pipe */
+ used = pipe_occupancy(pipe->head, pipe->tail);
+ npages = max_t(ssize_t, pipe->max_usage - used, 0);
+ len = min_t(size_t, len, npages * PAGE_SIZE);
+
+ do {
+ if (*ppos >= i_size_read(inode))
+ break;
+
+ error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio,
+ SGP_READ);
+ if (error) {
+ if (error == -EINVAL)
+ error = 0;
break;
}
- for (i = 0; i < pvec.nr; i++, index++) {
- if (index < indices[i]) {
- if (whence == SEEK_HOLE) {
- done = true;
- break;
- }
- index = indices[i];
- }
- page = pvec.pages[i];
- if (page && !xa_is_value(page)) {
- if (!PageUptodate(page))
- page = NULL;
- }
- if (index >= end ||
- (page && whence == SEEK_DATA) ||
- (!page && whence == SEEK_HOLE)) {
- done = true;
+ if (folio) {
+ folio_unlock(folio);
+
+ if (folio_test_hwpoison(folio) ||
+ (folio_test_large(folio) &&
+ folio_test_has_hwpoisoned(folio))) {
+ error = -EIO;
break;
}
}
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
- pvec.nr = PAGEVEC_SIZE;
+
+ /*
+ * i_size must be checked after we know the pages are Uptodate.
+ *
+ * Checking i_size after the check allows us to calculate
+ * the correct value for "nr", which means the zero-filled
+ * part of the page is not copied back to userspace (unless
+ * another truncate extends the file - this is desired though).
+ */
+ isize = i_size_read(inode);
+ if (unlikely(*ppos >= isize))
+ break;
+ part = min_t(loff_t, isize - *ppos, len);
+
+ if (folio) {
+ /*
+ * If users can be writing to this page using arbitrary
+ * virtual addresses, take care about potential aliasing
+ * before reading the page on the kernel side.
+ */
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_folio(folio);
+ folio_mark_accessed(folio);
+ /*
+ * Ok, we have the page, and it's up-to-date, so we can
+ * now splice it into the pipe.
+ */
+ n = splice_folio_into_pipe(pipe, folio, *ppos, part);
+ folio_put(folio);
+ folio = NULL;
+ } else {
+ n = splice_zeropage_into_pipe(pipe, *ppos, part);
+ }
+
+ if (!n)
+ break;
+ len -= n;
+ total_spliced += n;
+ *ppos += n;
+ in->f_ra.prev_pos = *ppos;
+ if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ break;
+
cond_resched();
- }
- return index;
+ } while (len);
+
+ if (folio)
+ folio_put(folio);
+
+ file_accessed(in);
+ return total_spliced ? total_spliced : error;
}
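/*
 * Illustrative user-space sketch of the path that reaches the new
 * shmem_file_splice_read(): splice(2) from a tmpfs file into a pipe.
 * The file path and length below are assumptions for the example; holes
 * in a sparse file come back as spliced zero pages, per the code above.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int pfd[2];
        int fd = open("/dev/shm/example", O_RDONLY);    /* any tmpfs file */
        ssize_t n;

        if (fd < 0 || pipe(pfd) < 0)
                return 1;
        n = splice(fd, NULL, pfd[1], NULL, 65536, 0);
        printf("spliced %zd bytes\n", n);
        return n < 0;
}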
static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
{
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- pgoff_t start, end;
- loff_t new_offset;
if (whence != SEEK_DATA && whence != SEEK_HOLE)
return generic_file_llseek_size(file, offset, whence,
MAX_LFS_FILESIZE, i_size_read(inode));
- inode_lock(inode);
- /* We're holding i_mutex so we can access i_size directly */
-
- if (offset < 0 || offset >= inode->i_size)
- offset = -ENXIO;
- else {
- start = offset >> PAGE_SHIFT;
- end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- new_offset = shmem_seek_hole_data(mapping, start, end, whence);
- new_offset <<= PAGE_SHIFT;
- if (new_offset > offset) {
- if (new_offset < inode->i_size)
- offset = new_offset;
- else if (whence == SEEK_DATA)
- offset = -ENXIO;
- else
- offset = inode->i_size;
- }
- }
+ if (offset < 0)
+ return -ENXIO;
+ inode_lock(inode);
+ /* We're holding i_rwsem so we can access i_size directly */
+ offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
if (offset >= 0)
offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
inode_unlock(inode);
@@ -2772,7 +2895,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_falloc shmem_falloc;
- pgoff_t start, index, end;
+ pgoff_t start, index, end, undo_fallocend;
int error;
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
@@ -2786,7 +2909,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
- /* protected by i_mutex */
+ /* protected by i_rwsem */
if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
error = -EPERM;
goto out;
@@ -2841,8 +2964,17 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
inode->i_private = &shmem_falloc;
spin_unlock(&inode->i_lock);
- for (index = start; index < end; index++) {
- struct page *page;
+ /*
+ * info->fallocend is only relevant when huge pages might be
+ * involved: to prevent split_huge_page() freeing fallocated
+ * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size.
+ */
+ undo_fallocend = info->fallocend;
+ if (info->fallocend < end)
+ info->fallocend = end;
+
+ for (index = start; index < end; ) {
+ struct folio *folio;
/*
* Good, the fallocate(2) manpage permits EINTR: we may have
@@ -2853,9 +2985,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
error = -ENOMEM;
else
- error = shmem_getpage(inode, index, &page, SGP_FALLOC);
+ error = shmem_get_folio(inode, index, &folio,
+ SGP_FALLOC);
if (error) {
- /* Remove the !PageUptodate pages we added */
+ info->fallocend = undo_fallocend;
+ /* Remove the !uptodate folios we added */
if (index > start) {
shmem_undo_range(inode,
(loff_t)start << PAGE_SHIFT,
@@ -2865,34 +2999,45 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
}
/*
+ * Here is a more important optimization than it appears:
+ * a second SGP_FALLOC on the same large folio will clear it,
+ * making it uptodate and un-undoable if we fail later.
+ */
+ index = folio_next_index(folio);
+ /* Beware 32-bit wraparound */
+ if (!index)
+ index--;
+
+ /*
* Inform shmem_writepage() how far we have reached.
* No need for lock or barrier: we have the page lock.
*/
- shmem_falloc.next++;
- if (!PageUptodate(page))
- shmem_falloc.nr_falloced++;
+ if (!folio_test_uptodate(folio))
+ shmem_falloc.nr_falloced += index - shmem_falloc.next;
+ shmem_falloc.next = index;
/*
- * If !PageUptodate, leave it that way so that freeable pages
+ * If !uptodate, leave it that way so that freeable folios
* can be recognized if we need to rollback on error later.
- * But set_page_dirty so that memory pressure will swap rather
- * than free the pages we are allocating (and SGP_CACHE pages
+ * But mark it dirty so that memory pressure will swap rather
+ * than free the folios we are allocating (and SGP_CACHE folios
* might still be clean: we now need to mark those dirty too).
*/
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
cond_resched();
}
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
i_size_write(inode, offset + len);
- inode->i_ctime = current_time(inode);
undone:
spin_lock(&inode->i_lock);
inode->i_private = NULL;
spin_unlock(&inode->i_lock);
out:
+ if (!error)
+ file_modified(file);
inode_unlock(inode);
return error;
}
@@ -2915,6 +3060,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_ffree = sbinfo->free_inodes;
}
/* else leave those fields 0 like simple_statfs */
+
+ buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
+
return 0;
}
@@ -2922,12 +3070,13 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
* File creation. Allocate an inode, and we're done..
*/
static int
-shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode *inode;
int error = -ENOSPC;
- inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
+ inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
if (inode) {
error = simple_acl_create(dir, inode);
if (error)
@@ -2941,6 +3090,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
error = 0;
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = current_time(dir);
+ inode_inc_iversion(dir);
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
}
@@ -2951,12 +3101,13 @@ out_iput:
}
static int
-shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
+ struct file *file, umode_t mode)
{
struct inode *inode;
int error = -ENOSPC;
- inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
+ inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
if (inode) {
error = security_inode_init_security(inode, dir,
NULL,
@@ -2966,28 +3117,30 @@ shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
error = simple_acl_create(dir, inode);
if (error)
goto out_iput;
- d_tmpfile(dentry, inode);
+ d_tmpfile(file, inode);
}
- return error;
+ return finish_open_simple(file, error);
out_iput:
iput(inode);
return error;
}
-static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int error;
- if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
+ error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
+ if (error)
return error;
inc_nlink(dir);
return 0;
}
-static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool excl)
+static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
{
- return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
+ return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
}
/*
@@ -3013,6 +3166,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
dir->i_size += BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ inode_inc_iversion(dir);
inc_nlink(inode);
ihold(inode); /* New dentry reference */
dget(dentry); /* Extra pinning count for the created dentry */
@@ -3030,6 +3184,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
dir->i_size -= BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ inode_inc_iversion(dir);
drop_nlink(inode);
dput(dentry); /* Undo the count from "create" - this does all the work */
return 0;
@@ -3045,29 +3200,8 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
return shmem_unlink(dir, dentry);
}
-static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
-{
- bool old_is_dir = d_is_dir(old_dentry);
- bool new_is_dir = d_is_dir(new_dentry);
-
- if (old_dir != new_dir && old_is_dir != new_is_dir) {
- if (old_is_dir) {
- drop_nlink(old_dir);
- inc_nlink(new_dir);
- } else {
- drop_nlink(new_dir);
- inc_nlink(old_dir);
- }
- }
- old_dir->i_ctime = old_dir->i_mtime =
- new_dir->i_ctime = new_dir->i_mtime =
- d_inode(old_dentry)->i_ctime =
- d_inode(new_dentry)->i_ctime = current_time(old_dir);
-
- return 0;
-}
-
-static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
+static int shmem_whiteout(struct mnt_idmap *idmap,
+ struct inode *old_dir, struct dentry *old_dentry)
{
struct dentry *whiteout;
int error;
@@ -3076,7 +3210,7 @@ static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
if (!whiteout)
return -ENOMEM;
- error = shmem_mknod(old_dir, whiteout,
+ error = shmem_mknod(idmap, old_dir, whiteout,
S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
dput(whiteout);
if (error)
@@ -3099,7 +3233,10 @@ static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
 * it exists so that the VFS layer correctly frees it when it
* gets overwritten.
*/
-static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
+static int shmem_rename2(struct mnt_idmap *idmap,
+ struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct inode *inode = d_inode(old_dentry);
int they_are_dirs = S_ISDIR(inode->i_mode);
@@ -3108,7 +3245,7 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
return -EINVAL;
if (flags & RENAME_EXCHANGE)
- return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
+ return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
if (!simple_empty(new_dentry))
return -ENOTEMPTY;
@@ -3116,7 +3253,7 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
if (flags & RENAME_WHITEOUT) {
int error;
- error = shmem_whiteout(old_dir, old_dentry);
+ error = shmem_whiteout(idmap, old_dir, old_dentry);
if (error)
return error;
}
@@ -3137,21 +3274,24 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
old_dir->i_ctime = old_dir->i_mtime =
new_dir->i_ctime = new_dir->i_mtime =
inode->i_ctime = current_time(old_dir);
+ inode_inc_iversion(old_dir);
+ inode_inc_iversion(new_dir);
return 0;
}
-static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, const char *symname)
{
int error;
int len;
struct inode *inode;
- struct page *page;
+ struct folio *folio;
len = strlen(symname) + 1;
if (len > PAGE_SIZE)
return -ENAMETOOLONG;
- inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0,
+ inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
VM_NORESERVE);
if (!inode)
return -ENOSPC;
@@ -3173,21 +3313,22 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
inode->i_op = &shmem_short_symlink_operations;
} else {
inode_nohighmem(inode);
- error = shmem_getpage(inode, 0, &page, SGP_WRITE);
+ error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
if (error) {
iput(inode);
return error;
}
inode->i_mapping->a_ops = &shmem_aops;
inode->i_op = &shmem_symlink_inode_operations;
- memcpy(page_address(page), symname, len);
- SetPageUptodate(page);
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ memcpy(folio_address(folio), symname, len);
+ folio_mark_uptodate(folio);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
}
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = current_time(dir);
+ inode_inc_iversion(dir);
d_instantiate(dentry, inode);
dget(dentry);
return 0;
@@ -3195,35 +3336,74 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
static void shmem_put_link(void *arg)
{
- mark_page_accessed(arg);
- put_page(arg);
+ folio_mark_accessed(arg);
+ folio_put(arg);
}
static const char *shmem_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- struct page *page = NULL;
+ struct folio *folio = NULL;
int error;
+
if (!dentry) {
- page = find_get_page(inode->i_mapping, 0);
- if (!page)
+ folio = filemap_get_folio(inode->i_mapping, 0);
+ if (IS_ERR(folio))
return ERR_PTR(-ECHILD);
- if (!PageUptodate(page)) {
- put_page(page);
+ if (PageHWPoison(folio_page(folio, 0)) ||
+ !folio_test_uptodate(folio)) {
+ folio_put(folio);
return ERR_PTR(-ECHILD);
}
} else {
- error = shmem_getpage(inode, 0, &page, SGP_READ);
+ error = shmem_get_folio(inode, 0, &folio, SGP_READ);
if (error)
return ERR_PTR(error);
- unlock_page(page);
+ if (!folio)
+ return ERR_PTR(-ECHILD);
+ if (PageHWPoison(folio_page(folio, 0))) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return ERR_PTR(-ECHILD);
+ }
+ folio_unlock(folio);
}
- set_delayed_call(done, shmem_put_link, page);
- return page_address(page);
+ set_delayed_call(done, shmem_put_link, folio);
+ return folio_address(folio);
}
#ifdef CONFIG_TMPFS_XATTR
+
+static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
+
+ fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE);
+
+ return 0;
+}
+
+static int shmem_fileattr_set(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct shmem_inode_info *info = SHMEM_I(inode);
+
+ if (fileattr_has_fsx(fa))
+ return -EOPNOTSUPP;
+ if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE)
+ return -EOPNOTSUPP;
+
+ info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
+ (fa->flags & SHMEM_FL_USER_MODIFIABLE);
+
+ shmem_set_inode_flags(inode, info->fsflags);
+ inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
+ return 0;
+}
+
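+/*
+ * Illustrative user-space sketch: the fileattr hooks above are what the
+ * generic FS_IOC_GETFLAGS/FS_IOC_SETFLAGS ioctls end up calling on tmpfs.
+ * The path and the choice of FS_NODUMP_FL are assumptions for the example.
+ */
#include <fcntl.h>
#include <linux/fs.h>
#include <sys/ioctl.h>

int main(void)
{
        int flags;
        int fd = open("/dev/shm/example", O_RDWR);

        if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags))
                return 1;
        flags |= FS_NODUMP_FL;          /* must stay within SHMEM_FL_USER_MODIFIABLE */
        return ioctl(fd, FS_IOC_SETFLAGS, &flags) != 0;
}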
/*
* Superblocks without xattr inode operations may get some security.* xattr
* support from the LSM "for free". As soon as we have any other xattrs
@@ -3261,7 +3441,7 @@ static int shmem_initxattrs(struct inode *inode,
memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
xattr->name, len);
- simple_xattr_list_add(&info->xattrs, new_xattr);
+ simple_xattr_add(&info->xattrs, new_xattr);
}
return 0;
@@ -3278,14 +3458,21 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler,
}
static int shmem_xattr_handler_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
struct dentry *unused, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
{
struct shmem_inode_info *info = SHMEM_I(inode);
+ int err;
name = xattr_full_name(handler, name);
- return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
+ err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
+ if (!err) {
+ inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
+ }
+ return err;
}
static const struct xattr_handler shmem_security_xattr_handler = {
@@ -3301,10 +3488,6 @@ static const struct xattr_handler shmem_trusted_xattr_handler = {
};
static const struct xattr_handler *shmem_xattr_handlers[] = {
-#ifdef CONFIG_TMPFS_POSIX_ACL
- &posix_acl_access_xattr_handler,
- &posix_acl_default_xattr_handler,
-#endif
&shmem_security_xattr_handler,
&shmem_trusted_xattr_handler,
NULL
@@ -3318,6 +3501,7 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
#endif /* CONFIG_TMPFS_XATTR */
static const struct inode_operations shmem_short_symlink_operations = {
+ .getattr = shmem_getattr,
.get_link = simple_get_link,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
@@ -3325,6 +3509,7 @@ static const struct inode_operations shmem_short_symlink_operations = {
};
static const struct inode_operations shmem_symlink_inode_operations = {
+ .getattr = shmem_getattr,
.get_link = shmem_get_link,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
@@ -3423,6 +3608,7 @@ enum shmem_param {
Opt_uid,
Opt_inode32,
Opt_inode64,
+ Opt_noswap,
};
static const struct constant_table shmem_param_enums_huge[] = {
@@ -3444,6 +3630,7 @@ const struct fs_parameter_spec shmem_fs_parameters[] = {
fsparam_u32 ("uid", Opt_uid),
fsparam_flag ("inode32", Opt_inode32),
fsparam_flag ("inode64", Opt_inode64),
+ fsparam_flag ("noswap", Opt_noswap),
{}
};
@@ -3454,6 +3641,8 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
unsigned long long size;
char *rest;
int opt;
+ kuid_t kuid;
+ kgid_t kgid;
opt = fs_parse(fc, shmem_fs_parameters, param, &result);
if (opt < 0)
@@ -3475,7 +3664,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_nr_blocks:
ctx->blocks = memparse(param->string, &rest);
- if (*rest)
+ if (*rest || ctx->blocks > S64_MAX)
goto bad_value;
ctx->seen |= SHMEM_SEEN_BLOCKS;
break;
@@ -3489,14 +3678,32 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
ctx->mode = result.uint_32 & 07777;
break;
case Opt_uid:
- ctx->uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(ctx->uid))
+ kuid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(kuid))
+ goto bad_value;
+
+ /*
+ * The requested uid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kuid_has_mapping(fc->user_ns, kuid))
goto bad_value;
+
+ ctx->uid = kuid;
break;
case Opt_gid:
- ctx->gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(ctx->gid))
+ kgid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(kgid))
goto bad_value;
+
+ /*
+ * The requested gid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kgid_has_mapping(fc->user_ns, kgid))
+ goto bad_value;
+
+ ctx->gid = kgid;
break;
case Opt_huge:
ctx->huge = result.uint_32;
@@ -3527,6 +3734,14 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
ctx->full_inums = true;
ctx->seen |= SHMEM_SEEN_INUMS;
break;
+ case Opt_noswap:
+ if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) {
+ return invalfc(fc,
+ "Turning off swap in unprivileged tmpfs mounts unsupported");
+ }
+ ctx->noswap = true;
+ ctx->seen |= SHMEM_SEEN_NOSWAP;
+ break;
}
return 0;
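/*
 * Illustrative sketch of using the new "noswap" option from user space.
 * Per the capability check above this needs CAP_SYS_ADMIN in the initial
 * user namespace; the mount point and size are assumptions for the example.
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        if (mount("tmpfs", "/mnt/noswap-tmp", "tmpfs", 0, "size=64m,noswap")) {
                perror("mount");
                return 1;
        }
        return 0;
}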
@@ -3564,7 +3779,7 @@ static int shmem_parse_options(struct fs_context *fc, void *data)
}
}
if (*this_char) {
- char *value = strchr(this_char,'=');
+ char *value = strchr(this_char, '=');
size_t len = 0;
int err;
@@ -3592,10 +3807,12 @@ static int shmem_reconfigure(struct fs_context *fc)
struct shmem_options *ctx = fc->fs_private;
struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
unsigned long inodes;
+ struct mempolicy *mpol = NULL;
const char *err;
- spin_lock(&sbinfo->stat_lock);
+ raw_spin_lock(&sbinfo->stat_lock);
inodes = sbinfo->max_inodes - sbinfo->free_inodes;
+
if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
if (!sbinfo->max_blocks) {
err = "Cannot retroactively limit size";
@@ -3623,6 +3840,14 @@ static int shmem_reconfigure(struct fs_context *fc)
err = "Current inum too high to switch to 32-bit inums";
goto out;
}
+ if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) {
+ err = "Cannot disable swap on remount";
+ goto out;
+ }
+ if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) {
+ err = "Cannot enable swap on remount if it was disabled on first mount";
+ goto out;
+ }
if (ctx->seen & SHMEM_SEEN_HUGE)
sbinfo->huge = ctx->huge;
@@ -3639,20 +3864,26 @@ static int shmem_reconfigure(struct fs_context *fc)
* Preserve previous mempolicy unless mpol remount option was specified.
*/
if (ctx->mpol) {
- mpol_put(sbinfo->mpol);
+ mpol = sbinfo->mpol;
sbinfo->mpol = ctx->mpol; /* transfers initial ref */
ctx->mpol = NULL;
}
- spin_unlock(&sbinfo->stat_lock);
+
+ if (ctx->noswap)
+ sbinfo->noswap = true;
+
+ raw_spin_unlock(&sbinfo->stat_lock);
+ mpol_put(mpol);
return 0;
out:
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
return invalfc(fc, "%s", err);
}
static int shmem_show_options(struct seq_file *seq, struct dentry *root)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
+ struct mempolicy *mpol;
if (sbinfo->max_blocks != shmem_default_max_blocks())
seq_printf(seq, ",size=%luk",
@@ -3695,7 +3926,11 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
if (sbinfo->huge)
seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
#endif
- shmem_show_mpol(seq, sbinfo->mpol);
+ mpol = shmem_get_sbmpol(sbinfo);
+ shmem_show_mpol(seq, mpol);
+ mpol_put(mpol);
+ if (sbinfo->noswap)
+ seq_printf(seq, ",noswap");
return 0;
}
@@ -3717,7 +3952,6 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
struct shmem_options *ctx = fc->fs_private;
struct inode *inode;
struct shmem_sb_info *sbinfo;
- int err = -ENOMEM;
/* Round up to L1_CACHE_BYTES to resist false sharing */
sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
@@ -3740,11 +3974,12 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
ctx->inodes = shmem_default_max_inodes();
if (!(ctx->seen & SHMEM_SEEN_INUMS))
ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
+ sbinfo->noswap = ctx->noswap;
} else {
sb->s_flags |= SB_NOUSER;
}
sb->s_export_op = &shmem_export_ops;
- sb->s_flags |= SB_NOSEC;
+ sb->s_flags |= SB_NOSEC | SB_I_VERSION;
#else
sb->s_flags |= SB_NOUSER;
#endif
@@ -3763,7 +3998,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
sbinfo->mpol = ctx->mpol;
ctx->mpol = NULL;
- spin_lock_init(&sbinfo->stat_lock);
+ raw_spin_lock_init(&sbinfo->stat_lock);
if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
goto failed;
spin_lock_init(&sbinfo->shrinklist_lock);
@@ -3783,7 +4018,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
#endif
uuid_gen(&sb->s_uuid);
- inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
+ inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0,
+ VM_NORESERVE);
if (!inode)
goto failed;
inode->i_uid = sbinfo->uid;
@@ -3795,7 +4031,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
failed:
shmem_put_super(sb);
- return err;
+ return -ENOMEM;
}
static int shmem_get_tree(struct fs_context *fc)
@@ -3828,7 +4064,7 @@ static struct kmem_cache *shmem_inode_cachep;
static struct inode *shmem_alloc_inode(struct super_block *sb)
{
struct shmem_inode_info *info;
- info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
+ info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL);
if (!info)
return NULL;
return &info->vfs_inode;
@@ -3865,28 +4101,37 @@ static void shmem_destroy_inodecache(void)
kmem_cache_destroy(shmem_inode_cachep);
}
-static const struct address_space_operations shmem_aops = {
+/* Keep the page in page cache instead of truncating it */
+static int shmem_error_remove_page(struct address_space *mapping,
+ struct page *page)
+{
+ return 0;
+}
+
+const struct address_space_operations shmem_aops = {
.writepage = shmem_writepage,
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .dirty_folio = noop_dirty_folio,
#ifdef CONFIG_TMPFS
.write_begin = shmem_write_begin,
.write_end = shmem_write_end,
#endif
#ifdef CONFIG_MIGRATION
- .migratepage = migrate_page,
+ .migrate_folio = migrate_folio,
#endif
- .error_remove_page = generic_error_remove_page,
+ .error_remove_page = shmem_error_remove_page,
};
+EXPORT_SYMBOL(shmem_aops);
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
+ .open = generic_file_open,
.get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
.llseek = shmem_file_llseek,
.read_iter = shmem_file_read_iter,
.write_iter = generic_file_write_iter,
.fsync = noop_fsync,
- .splice_read = generic_file_splice_read,
+ .splice_read = shmem_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = shmem_fallocate,
#endif
@@ -3898,11 +4143,14 @@ static const struct inode_operations shmem_inode_operations = {
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
.set_acl = simple_set_acl,
+ .fileattr_get = shmem_fileattr_get,
+ .fileattr_set = shmem_fileattr_set,
#endif
};
static const struct inode_operations shmem_dir_inode_operations = {
#ifdef CONFIG_TMPFS
+ .getattr = shmem_getattr,
.create = shmem_create,
.lookup = simple_lookup,
.link = shmem_link,
@@ -3916,6 +4164,8 @@ static const struct inode_operations shmem_dir_inode_operations = {
#endif
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
+ .fileattr_get = shmem_fileattr_get,
+ .fileattr_set = shmem_fileattr_set,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
.setattr = shmem_setattr,
@@ -3924,6 +4174,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
};
static const struct inode_operations shmem_special_inode_operations = {
+ .getattr = shmem_getattr,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
#endif
@@ -3959,6 +4210,15 @@ static const struct vm_operations_struct shmem_vm_ops = {
#endif
};
+static const struct vm_operations_struct shmem_anon_vm_ops = {
+ .fault = shmem_fault,
+ .map_pages = filemap_map_pages,
+#ifdef CONFIG_NUMA
+ .set_policy = shmem_set_policy,
+ .get_policy = shmem_get_policy,
+#endif
+};
+
int shmem_init_fs_context(struct fs_context *fc)
{
struct shmem_options *ctx;
@@ -3984,10 +4244,14 @@ static struct file_system_type shmem_fs_type = {
.parameters = shmem_fs_parameters,
#endif
.kill_sb = kill_litter_super,
+#ifdef CONFIG_SHMEM
+ .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
+#else
.fs_flags = FS_USERNS_MOUNT,
+#endif
};
-int __init shmem_init(void)
+void __init shmem_init(void)
{
int error;
@@ -4010,21 +4274,20 @@ int __init shmem_init(void)
if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
else
- shmem_huge = 0; /* just in case it was patched */
+ shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
#endif
- return 0;
+ return;
out1:
unregister_filesystem(&shmem_fs_type);
out2:
shmem_destroy_inodecache();
shm_mnt = ERR_PTR(error);
- return error;
}
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
static ssize_t shmem_enabled_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+ struct kobj_attribute *attr, char *buf)
{
static const int values[] = {
SHMEM_HUGE_ALWAYS,
@@ -4034,16 +4297,19 @@ static ssize_t shmem_enabled_show(struct kobject *kobj,
SHMEM_HUGE_DENY,
SHMEM_HUGE_FORCE,
};
- int i, count;
-
- for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) {
- const char *fmt = shmem_huge == values[i] ? "[%s] " : "%s ";
+ int len = 0;
+ int i;
- count += sprintf(buf + count, fmt,
- shmem_format_huge(values[i]));
+ for (i = 0; i < ARRAY_SIZE(values); i++) {
+ len += sysfs_emit_at(buf, len,
+ shmem_huge == values[i] ? "%s[%s]" : "%s%s",
+ i ? " " : "",
+ shmem_format_huge(values[i]));
}
- buf[count - 1] = '\n';
- return count;
+
+ len += sysfs_emit_at(buf, len, "\n");
+
+ return len;
}
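/*
 * Illustrative sketch: reading the sysfs file emitted by the rewritten
 * shmem_enabled_show() above, which now prints something like
 * "always within_size advise [never] deny force" with the active mode
 * in brackets.
 */
#include <stdio.h>

int main(void)
{
        char buf[128];
        FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/shmem_enabled", "r");

        if (!f)
                return 1;
        if (fgets(buf, sizeof(buf), f))
                fputs(buf, stdout);
        fclose(f);
        return 0;
}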
static ssize_t shmem_enabled_store(struct kobject *kobj,
@@ -4072,47 +4338,9 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
return count;
}
-struct kobj_attribute shmem_enabled_attr =
- __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
+struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-bool shmem_huge_enabled(struct vm_area_struct *vma)
-{
- struct inode *inode = file_inode(vma->vm_file);
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- loff_t i_size;
- pgoff_t off;
-
- if ((vma->vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
- return false;
- if (shmem_huge == SHMEM_HUGE_FORCE)
- return true;
- if (shmem_huge == SHMEM_HUGE_DENY)
- return false;
- switch (sbinfo->huge) {
- case SHMEM_HUGE_NEVER:
- return false;
- case SHMEM_HUGE_ALWAYS:
- return true;
- case SHMEM_HUGE_WITHIN_SIZE:
- off = round_up(vma->vm_pgoff, HPAGE_PMD_NR);
- i_size = round_up(i_size_read(inode), PAGE_SIZE);
- if (i_size >= HPAGE_PMD_SIZE &&
- i_size >> PAGE_SHIFT >= off)
- return true;
- fallthrough;
- case SHMEM_HUGE_ADVISE:
- /* TODO: implement fadvise() hints */
- return (vma->vm_flags & VM_HUGEPAGE);
- default:
- VM_BUG_ON(1);
- return false;
- }
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
#else /* !CONFIG_SHMEM */
/*
@@ -4128,27 +4356,24 @@ static struct file_system_type shmem_fs_type = {
.name = "tmpfs",
.init_fs_context = ramfs_init_fs_context,
.parameters = ramfs_fs_parameters,
- .kill_sb = kill_litter_super,
+ .kill_sb = ramfs_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
};
-int __init shmem_init(void)
+void __init shmem_init(void)
{
BUG_ON(register_filesystem(&shmem_fs_type) != 0);
shm_mnt = kern_mount(&shmem_fs_type);
BUG_ON(IS_ERR(shm_mnt));
-
- return 0;
}
-int shmem_unuse(unsigned int type, bool frontswap,
- unsigned long *fs_pages_to_unuse)
+int shmem_unuse(unsigned int type)
{
return 0;
}
-int shmem_lock(struct file *file, int lock, struct user_struct *user)
+int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
return 0;
}
@@ -4173,8 +4398,9 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
EXPORT_SYMBOL_GPL(shmem_truncate_range);
#define shmem_vm_ops generic_file_vm_ops
+#define shmem_anon_vm_ops generic_file_vm_ops
#define shmem_file_operations ramfs_file_operations
-#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
+#define shmem_get_inode(idmap, sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
#define shmem_acct_size(flags, size) 0
#define shmem_unacct_size(flags, size) do {} while (0)
@@ -4197,8 +4423,11 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l
if (shmem_acct_size(flags, size))
return ERR_PTR(-ENOMEM);
- inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0,
- flags);
+ if (is_idmapped_mnt(mnt))
+ return ERR_PTR(-EINVAL);
+
+ inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
+ S_IFREG | S_IRWXUGO, 0, flags);
if (unlikely(!inode)) {
shmem_unacct_size(flags, size);
return ERR_PTR(-ENOSPC);
@@ -4278,53 +4507,66 @@ int shmem_zero_setup(struct vm_area_struct *vma)
if (vma->vm_file)
fput(vma->vm_file);
vma->vm_file = file;
- vma->vm_ops = &shmem_vm_ops;
-
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
- (vma->vm_end & HPAGE_PMD_MASK)) {
- khugepaged_enter(vma, vma->vm_flags);
- }
+ vma->vm_ops = &shmem_anon_vm_ops;
return 0;
}
/**
- * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
- * @mapping: the page's address_space
- * @index: the page index
+ * shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
+ * @mapping: the folio's address_space
+ * @index: the folio index
* @gfp: the page allocator flags to use if allocating
*
* This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
* with any new page allocations done using the specified allocation flags.
- * But read_cache_page_gfp() uses the ->readpage() method: which does not
+ * But read_cache_page_gfp() uses the ->read_folio() method: which does not
* suit tmpfs, since it may have pages in swapcache, and needs to find those
* for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
*
* i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
* with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
*/
-struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
- pgoff_t index, gfp_t gfp)
+struct folio *shmem_read_folio_gfp(struct address_space *mapping,
+ pgoff_t index, gfp_t gfp)
{
#ifdef CONFIG_SHMEM
struct inode *inode = mapping->host;
- struct page *page;
+ struct folio *folio;
int error;
- BUG_ON(mapping->a_ops != &shmem_aops);
- error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
+ BUG_ON(!shmem_mapping(mapping));
+ error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
gfp, NULL, NULL, NULL);
if (error)
- page = ERR_PTR(error);
- else
- unlock_page(page);
- return page;
+ return ERR_PTR(error);
+
+ folio_unlock(folio);
+ return folio;
#else
/*
* The tiny !SHMEM case uses ramfs without swap
*/
- return read_cache_page_gfp(mapping, index, gfp);
+ return mapping_read_folio_gfp(mapping, index, gfp);
#endif
}
+EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);
+
+struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
+ pgoff_t index, gfp_t gfp)
+{
+ struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
+ struct page *page;
+
+ if (IS_ERR(folio))
+ return &folio->page;
+
+ page = folio_file_page(folio, index);
+ if (PageHWPoison(page)) {
+ folio_put(folio);
+ return ERR_PTR(-EIO);
+ }
+
+ return page;
+}
EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
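/*
 * Minimal in-kernel sketch of the new folio-returning helper, in the style
 * of the drm users mentioned in the kernel-doc above. The gfp mix and the
 * helper name are assumptions for illustration only.
 */
static struct folio *example_get_shmem_folio(struct address_space *mapping,
                                             pgoff_t index)
{
        gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;
        struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);

        /* On success the folio comes back unlocked with a reference held. */
        return folio;   /* callers check IS_ERR() */
}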
diff --git a/mm/show_mem.c b/mm/show_mem.c
new file mode 100644
index 000000000000..01f8e9905817
--- /dev/null
+++ b/mm/show_mem.c
@@ -0,0 +1,429 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Generic show_mem() implementation
+ *
+ * Copyright (C) 2008 Johannes Weiner <hannes@saeurebad.de>
+ */
+
+#include <linux/blkdev.h>
+#include <linux/cma.h>
+#include <linux/cpuset.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/swap.h>
+#include <linux/vmstat.h>
+
+#include "internal.h"
+#include "swap.h"
+
+atomic_long_t _totalram_pages __read_mostly;
+EXPORT_SYMBOL(_totalram_pages);
+unsigned long totalreserve_pages __read_mostly;
+unsigned long totalcma_pages __read_mostly;
+
+static inline void show_node(struct zone *zone)
+{
+ if (IS_ENABLED(CONFIG_NUMA))
+ printk("Node %d ", zone_to_nid(zone));
+}
+
+long si_mem_available(void)
+{
+ long available;
+ unsigned long pagecache;
+ unsigned long wmark_low = 0;
+ unsigned long pages[NR_LRU_LISTS];
+ unsigned long reclaimable;
+ struct zone *zone;
+ int lru;
+
+ for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+ pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
+
+ for_each_zone(zone)
+ wmark_low += low_wmark_pages(zone);
+
+ /*
+ * Estimate the amount of memory available for userspace allocations,
+ * without causing swapping or OOM.
+ */
+ available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
+
+ /*
+ * Not all the page cache can be freed, otherwise the system will
+ * start swapping or thrashing. Assume at least half of the page
+ * cache, or the low watermark worth of cache, needs to stay.
+ */
+ pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+ pagecache -= min(pagecache / 2, wmark_low);
+ available += pagecache;
+
+ /*
+ * Part of the reclaimable slab and other kernel memory consists of
+ * items that are in use, and cannot be freed. Cap this estimate at the
+ * low watermark.
+ */
+ reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
+ available += reclaimable - min(reclaimable / 2, wmark_low);
+
+ if (available < 0)
+ available = 0;
+ return available;
+}
+EXPORT_SYMBOL_GPL(si_mem_available);
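+/*
+ * Worked example with illustrative numbers (all in pages): free = 100000,
+ * totalreserve = 5000, file LRU (active + inactive) = 200000,
+ * wmark_low = 10000, reclaimable slab + misc = 30000.
+ *
+ *   available  = 100000 - 5000                 =  95000
+ *   available += 200000 - min(100000, 10000)   = 285000
+ *   available += 30000 - min(15000, 10000)     = 305000
+ *
+ * so the resulting MemAvailable estimate would be roughly 305000 pages.
+ */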
+
+void si_meminfo(struct sysinfo *val)
+{
+ val->totalram = totalram_pages();
+ val->sharedram = global_node_page_state(NR_SHMEM);
+ val->freeram = global_zone_page_state(NR_FREE_PAGES);
+ val->bufferram = nr_blockdev_pages();
+ val->totalhigh = totalhigh_pages();
+ val->freehigh = nr_free_highpages();
+ val->mem_unit = PAGE_SIZE;
+}
+
+EXPORT_SYMBOL(si_meminfo);
+
+#ifdef CONFIG_NUMA
+void si_meminfo_node(struct sysinfo *val, int nid)
+{
+ int zone_type; /* needs to be signed */
+ unsigned long managed_pages = 0;
+ unsigned long managed_highpages = 0;
+ unsigned long free_highpages = 0;
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
+ managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
+ val->totalram = managed_pages;
+ val->sharedram = node_page_state(pgdat, NR_SHMEM);
+ val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
+#ifdef CONFIG_HIGHMEM
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+ struct zone *zone = &pgdat->node_zones[zone_type];
+
+ if (is_highmem(zone)) {
+ managed_highpages += zone_managed_pages(zone);
+ free_highpages += zone_page_state(zone, NR_FREE_PAGES);
+ }
+ }
+ val->totalhigh = managed_highpages;
+ val->freehigh = free_highpages;
+#else
+ val->totalhigh = managed_highpages;
+ val->freehigh = free_highpages;
+#endif
+ val->mem_unit = PAGE_SIZE;
+}
+#endif
+
+/*
+ * Determine whether the node should be displayed or not, depending on whether
+ * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
+ */
+static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
+{
+ if (!(flags & SHOW_MEM_FILTER_NODES))
+ return false;
+
+ /*
+ * no node mask - aka implicit memory numa policy. Do not bother with
+ * the synchronization - read_mems_allowed_begin - because we do not
+ * have to be precise here.
+ */
+ if (!nodemask)
+ nodemask = &cpuset_current_mems_allowed;
+
+ return !node_isset(nid, *nodemask);
+}
+
+static void show_migration_types(unsigned char type)
+{
+ static const char types[MIGRATE_TYPES] = {
+ [MIGRATE_UNMOVABLE] = 'U',
+ [MIGRATE_MOVABLE] = 'M',
+ [MIGRATE_RECLAIMABLE] = 'E',
+ [MIGRATE_HIGHATOMIC] = 'H',
+#ifdef CONFIG_CMA
+ [MIGRATE_CMA] = 'C',
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+ [MIGRATE_ISOLATE] = 'I',
+#endif
+ };
+ char tmp[MIGRATE_TYPES + 1];
+ char *p = tmp;
+ int i;
+
+ for (i = 0; i < MIGRATE_TYPES; i++) {
+ if (type & (1 << i))
+ *p++ = types[i];
+ }
+
+ *p = '\0';
+ printk(KERN_CONT "(%s) ", tmp);
+}
+
+static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx)
+{
+ int zone_idx;
+ for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++)
+ if (zone_managed_pages(pgdat->node_zones + zone_idx))
+ return true;
+ return false;
+}
+
+/*
+ * Show free area list (used inside shift_scroll-lock stuff)
+ * We also calculate the percentage fragmentation. We do this by counting the
+ * memory on each free list with the exception of the first item on the list.
+ *
+ * Bits in @filter:
+ * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
+ * cpuset.
+ */
+void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
+{
+ unsigned long free_pcp = 0;
+ int cpu, nid;
+ struct zone *zone;
+ pg_data_t *pgdat;
+
+ for_each_populated_zone(zone) {
+ if (zone_idx(zone) > max_zone_idx)
+ continue;
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
+ continue;
+
+ for_each_online_cpu(cpu)
+ free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
+ }
+
+ printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
+ " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
+ " unevictable:%lu dirty:%lu writeback:%lu\n"
+ " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
+ " mapped:%lu shmem:%lu pagetables:%lu\n"
+ " sec_pagetables:%lu bounce:%lu\n"
+ " kernel_misc_reclaimable:%lu\n"
+ " free:%lu free_pcp:%lu free_cma:%lu\n",
+ global_node_page_state(NR_ACTIVE_ANON),
+ global_node_page_state(NR_INACTIVE_ANON),
+ global_node_page_state(NR_ISOLATED_ANON),
+ global_node_page_state(NR_ACTIVE_FILE),
+ global_node_page_state(NR_INACTIVE_FILE),
+ global_node_page_state(NR_ISOLATED_FILE),
+ global_node_page_state(NR_UNEVICTABLE),
+ global_node_page_state(NR_FILE_DIRTY),
+ global_node_page_state(NR_WRITEBACK),
+ global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
+ global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
+ global_node_page_state(NR_FILE_MAPPED),
+ global_node_page_state(NR_SHMEM),
+ global_node_page_state(NR_PAGETABLE),
+ global_node_page_state(NR_SECONDARY_PAGETABLE),
+ global_zone_page_state(NR_BOUNCE),
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
+ global_zone_page_state(NR_FREE_PAGES),
+ free_pcp,
+ global_zone_page_state(NR_FREE_CMA_PAGES));
+
+ for_each_online_pgdat(pgdat) {
+ if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
+ continue;
+ if (!node_has_managed_zones(pgdat, max_zone_idx))
+ continue;
+
+ printk("Node %d"
+ " active_anon:%lukB"
+ " inactive_anon:%lukB"
+ " active_file:%lukB"
+ " inactive_file:%lukB"
+ " unevictable:%lukB"
+ " isolated(anon):%lukB"
+ " isolated(file):%lukB"
+ " mapped:%lukB"
+ " dirty:%lukB"
+ " writeback:%lukB"
+ " shmem:%lukB"
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ " shmem_thp: %lukB"
+ " shmem_pmdmapped: %lukB"
+ " anon_thp: %lukB"
+#endif
+ " writeback_tmp:%lukB"
+ " kernel_stack:%lukB"
+#ifdef CONFIG_SHADOW_CALL_STACK
+ " shadow_call_stack:%lukB"
+#endif
+ " pagetables:%lukB"
+ " sec_pagetables:%lukB"
+ " all_unreclaimable? %s"
+ "\n",
+ pgdat->node_id,
+ K(node_page_state(pgdat, NR_ACTIVE_ANON)),
+ K(node_page_state(pgdat, NR_INACTIVE_ANON)),
+ K(node_page_state(pgdat, NR_ACTIVE_FILE)),
+ K(node_page_state(pgdat, NR_INACTIVE_FILE)),
+ K(node_page_state(pgdat, NR_UNEVICTABLE)),
+ K(node_page_state(pgdat, NR_ISOLATED_ANON)),
+ K(node_page_state(pgdat, NR_ISOLATED_FILE)),
+ K(node_page_state(pgdat, NR_FILE_MAPPED)),
+ K(node_page_state(pgdat, NR_FILE_DIRTY)),
+ K(node_page_state(pgdat, NR_WRITEBACK)),
+ K(node_page_state(pgdat, NR_SHMEM)),
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ K(node_page_state(pgdat, NR_SHMEM_THPS)),
+ K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
+ K(node_page_state(pgdat, NR_ANON_THPS)),
+#endif
+ K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
+ node_page_state(pgdat, NR_KERNEL_STACK_KB),
+#ifdef CONFIG_SHADOW_CALL_STACK
+ node_page_state(pgdat, NR_KERNEL_SCS_KB),
+#endif
+ K(node_page_state(pgdat, NR_PAGETABLE)),
+ K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
+ pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
+ "yes" : "no");
+ }
+
+ for_each_populated_zone(zone) {
+ int i;
+
+ if (zone_idx(zone) > max_zone_idx)
+ continue;
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
+ continue;
+
+ free_pcp = 0;
+ for_each_online_cpu(cpu)
+ free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
+
+ show_node(zone);
+ printk(KERN_CONT
+ "%s"
+ " free:%lukB"
+ " boost:%lukB"
+ " min:%lukB"
+ " low:%lukB"
+ " high:%lukB"
+ " reserved_highatomic:%luKB"
+ " active_anon:%lukB"
+ " inactive_anon:%lukB"
+ " active_file:%lukB"
+ " inactive_file:%lukB"
+ " unevictable:%lukB"
+ " writepending:%lukB"
+ " present:%lukB"
+ " managed:%lukB"
+ " mlocked:%lukB"
+ " bounce:%lukB"
+ " free_pcp:%lukB"
+ " local_pcp:%ukB"
+ " free_cma:%lukB"
+ "\n",
+ zone->name,
+ K(zone_page_state(zone, NR_FREE_PAGES)),
+ K(zone->watermark_boost),
+ K(min_wmark_pages(zone)),
+ K(low_wmark_pages(zone)),
+ K(high_wmark_pages(zone)),
+ K(zone->nr_reserved_highatomic),
+ K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
+ K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
+ K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
+ K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
+ K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
+ K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
+ K(zone->present_pages),
+ K(zone_managed_pages(zone)),
+ K(zone_page_state(zone, NR_MLOCK)),
+ K(zone_page_state(zone, NR_BOUNCE)),
+ K(free_pcp),
+ K(this_cpu_read(zone->per_cpu_pageset->count)),
+ K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
+ printk("lowmem_reserve[]:");
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
+ printk(KERN_CONT "\n");
+ }
+
+ for_each_populated_zone(zone) {
+ unsigned int order;
+ unsigned long nr[MAX_ORDER + 1], flags, total = 0;
+ unsigned char types[MAX_ORDER + 1];
+
+ if (zone_idx(zone) > max_zone_idx)
+ continue;
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
+ continue;
+ show_node(zone);
+ printk(KERN_CONT "%s: ", zone->name);
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order <= MAX_ORDER; order++) {
+ struct free_area *area = &zone->free_area[order];
+ int type;
+
+ nr[order] = area->nr_free;
+ total += nr[order] << order;
+
+ types[order] = 0;
+ for (type = 0; type < MIGRATE_TYPES; type++) {
+ if (!free_area_empty(area, type))
+ types[order] |= 1 << type;
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ for (order = 0; order <= MAX_ORDER; order++) {
+ printk(KERN_CONT "%lu*%lukB ",
+ nr[order], K(1UL) << order);
+ if (nr[order])
+ show_migration_types(types[order]);
+ }
+ printk(KERN_CONT "= %lukB\n", K(total));
+ }
+
+ for_each_online_node(nid) {
+ if (show_mem_node_skip(filter, nid, nodemask))
+ continue;
+ hugetlb_show_meminfo_node(nid);
+ }
+
+ printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
+
+ show_swap_cache_info();
+}
+
+void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
+{
+ unsigned long total = 0, reserved = 0, highmem = 0;
+ struct zone *zone;
+
+ printk("Mem-Info:\n");
+ __show_free_areas(filter, nodemask, max_zone_idx);
+
+ for_each_populated_zone(zone) {
+
+ total += zone->present_pages;
+ reserved += zone->present_pages - zone_managed_pages(zone);
+
+ if (is_highmem(zone))
+ highmem += zone->present_pages;
+ }
+
+ printk("%lu pages RAM\n", total);
+ printk("%lu pages HighMem/MovableOnly\n", highmem);
+ printk("%lu pages reserved\n", reserved);
+#ifdef CONFIG_CMA
+ printk("%lu pages cma reserved\n", totalcma_pages);
+#endif
+#ifdef CONFIG_MEMORY_FAILURE
+ printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
+#endif
+}
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
new file mode 100644
index 000000000000..3ab53fad8876
--- /dev/null
+++ b/mm/shrinker_debug.c
@@ -0,0 +1,294 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/shrinker.h>
+#include <linux/memcontrol.h>
+
+/* defined in vmscan.c */
+extern struct rw_semaphore shrinker_rwsem;
+extern struct list_head shrinker_list;
+
+static DEFINE_IDA(shrinker_debugfs_ida);
+static struct dentry *shrinker_debugfs_root;
+
+static unsigned long shrinker_count_objects(struct shrinker *shrinker,
+ struct mem_cgroup *memcg,
+ unsigned long *count_per_node)
+{
+ unsigned long nr, total = 0;
+ int nid;
+
+ for_each_node(nid) {
+ if (nid == 0 || (shrinker->flags & SHRINKER_NUMA_AWARE)) {
+ struct shrink_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .nid = nid,
+ .memcg = memcg,
+ };
+
+ nr = shrinker->count_objects(shrinker, &sc);
+ if (nr == SHRINK_EMPTY)
+ nr = 0;
+ } else {
+ nr = 0;
+ }
+
+ count_per_node[nid] = nr;
+ total += nr;
+ }
+
+ return total;
+}
+
+static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
+{
+ struct shrinker *shrinker = m->private;
+ unsigned long *count_per_node;
+ struct mem_cgroup *memcg;
+ unsigned long total;
+ bool memcg_aware;
+ int ret, nid;
+
+ count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
+ if (!count_per_node)
+ return -ENOMEM;
+
+ ret = down_read_killable(&shrinker_rwsem);
+ if (ret) {
+ kfree(count_per_node);
+ return ret;
+ }
+ rcu_read_lock();
+
+ memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE;
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ if (memcg && !mem_cgroup_online(memcg))
+ continue;
+
+ total = shrinker_count_objects(shrinker,
+ memcg_aware ? memcg : NULL,
+ count_per_node);
+ if (total) {
+ seq_printf(m, "%lu", mem_cgroup_ino(memcg));
+ for_each_node(nid)
+ seq_printf(m, " %lu", count_per_node[nid]);
+ seq_putc(m, '\n');
+ }
+
+ if (!memcg_aware) {
+ mem_cgroup_iter_break(NULL, memcg);
+ break;
+ }
+
+ if (signal_pending(current)) {
+ mem_cgroup_iter_break(NULL, memcg);
+ ret = -EINTR;
+ break;
+ }
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+
+ rcu_read_unlock();
+ up_read(&shrinker_rwsem);
+
+ kfree(count_per_node);
+ return ret;
+}
+DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count);
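+/*
+ * Illustrative sketch of consuming the "count" file backed by
+ * shrinker_debugfs_count_show() above: one line per memcg, formatted
+ * "<memcg inode> <count on node 0> <count on node 1> ...".
+ * The shrinker directory name is an assumption for the example.
+ */
#include <stdio.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/sys/kernel/debug/shrinker/sb-tmpfs-42/count", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}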
+
+static int shrinker_debugfs_scan_open(struct inode *inode, struct file *file)
+{
+ file->private_data = inode->i_private;
+ return nonseekable_open(inode, file);
+}
+
+static ssize_t shrinker_debugfs_scan_write(struct file *file,
+ const char __user *buf,
+ size_t size, loff_t *pos)
+{
+ struct shrinker *shrinker = file->private_data;
+ unsigned long nr_to_scan = 0, ino, read_len;
+ struct shrink_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ };
+ struct mem_cgroup *memcg = NULL;
+ int nid;
+ char kbuf[72];
+ ssize_t ret;
+
+ read_len = size < (sizeof(kbuf) - 1) ? size : (sizeof(kbuf) - 1);
+ if (copy_from_user(kbuf, buf, read_len))
+ return -EFAULT;
+ kbuf[read_len] = '\0';
+
+ if (sscanf(kbuf, "%lu %d %lu", &ino, &nid, &nr_to_scan) != 3)
+ return -EINVAL;
+
+ if (nid < 0 || nid >= nr_node_ids)
+ return -EINVAL;
+
+ if (nr_to_scan == 0)
+ return size;
+
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+ memcg = mem_cgroup_get_from_ino(ino);
+ if (!memcg || IS_ERR(memcg))
+ return -ENOENT;
+
+ if (!mem_cgroup_online(memcg)) {
+ mem_cgroup_put(memcg);
+ return -ENOENT;
+ }
+ } else if (ino != 0) {
+ return -EINVAL;
+ }
+
+ ret = down_read_killable(&shrinker_rwsem);
+ if (ret) {
+ mem_cgroup_put(memcg);
+ return ret;
+ }
+
+ sc.nid = nid;
+ sc.memcg = memcg;
+ sc.nr_to_scan = nr_to_scan;
+ sc.nr_scanned = nr_to_scan;
+
+ shrinker->scan_objects(shrinker, &sc);
+
+ up_read(&shrinker_rwsem);
+ mem_cgroup_put(memcg);
+
+ return size;
+}
+
+static const struct file_operations shrinker_debugfs_scan_fops = {
+ .owner = THIS_MODULE,
+ .open = shrinker_debugfs_scan_open,
+ .write = shrinker_debugfs_scan_write,
+};
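+/*
+ * Illustrative sketch of driving the "scan" file above: the write format is
+ * "<memcg inode> <nid> <nr_to_scan>", as parsed by the sscanf() in
+ * shrinker_debugfs_scan_write(). The directory name is an assumption; inode 0
+ * is only valid for shrinkers that are not SHRINKER_MEMCG_AWARE.
+ */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/sys/kernel/debug/shrinker/sb-tmpfs-42/scan", O_WRONLY);

        if (fd < 0)
                return 1;
        dprintf(fd, "0 0 128\n");       /* node 0, try to scan 128 objects */
        close(fd);
        return 0;
}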
+
+int shrinker_debugfs_add(struct shrinker *shrinker)
+{
+ struct dentry *entry;
+ char buf[128];
+ int id;
+
+ lockdep_assert_held(&shrinker_rwsem);
+
+ /* debugfs isn't initialized yet, add debugfs entries later. */
+ if (!shrinker_debugfs_root)
+ return 0;
+
+ id = ida_alloc(&shrinker_debugfs_ida, GFP_KERNEL);
+ if (id < 0)
+ return id;
+ shrinker->debugfs_id = id;
+
+ snprintf(buf, sizeof(buf), "%s-%d", shrinker->name, id);
+
+ /* create debugfs entry */
+ entry = debugfs_create_dir(buf, shrinker_debugfs_root);
+ if (IS_ERR(entry)) {
+ ida_free(&shrinker_debugfs_ida, id);
+ return PTR_ERR(entry);
+ }
+ shrinker->debugfs_entry = entry;
+
+ debugfs_create_file("count", 0440, entry, shrinker,
+ &shrinker_debugfs_count_fops);
+ debugfs_create_file("scan", 0220, entry, shrinker,
+ &shrinker_debugfs_scan_fops);
+ return 0;
+}
+
+int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
+{
+ struct dentry *entry;
+ char buf[128];
+ const char *new, *old;
+ va_list ap;
+ int ret = 0;
+
+ va_start(ap, fmt);
+ new = kvasprintf_const(GFP_KERNEL, fmt, ap);
+ va_end(ap);
+
+ if (!new)
+ return -ENOMEM;
+
+ down_write(&shrinker_rwsem);
+
+ old = shrinker->name;
+ shrinker->name = new;
+
+ if (shrinker->debugfs_entry) {
+ snprintf(buf, sizeof(buf), "%s-%d", shrinker->name,
+ shrinker->debugfs_id);
+
+ entry = debugfs_rename(shrinker_debugfs_root,
+ shrinker->debugfs_entry,
+ shrinker_debugfs_root, buf);
+ if (IS_ERR(entry))
+ ret = PTR_ERR(entry);
+ else
+ shrinker->debugfs_entry = entry;
+ }
+
+ up_write(&shrinker_rwsem);
+
+ kfree_const(old);
+
+ return ret;
+}
+EXPORT_SYMBOL(shrinker_debugfs_rename);
+
+struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
+ int *debugfs_id)
+{
+ struct dentry *entry = shrinker->debugfs_entry;
+
+ lockdep_assert_held(&shrinker_rwsem);
+
+ kfree_const(shrinker->name);
+ shrinker->name = NULL;
+
+ *debugfs_id = entry ? shrinker->debugfs_id : -1;
+ shrinker->debugfs_entry = NULL;
+
+ return entry;
+}
+
+void shrinker_debugfs_remove(struct dentry *debugfs_entry, int debugfs_id)
+{
+ debugfs_remove_recursive(debugfs_entry);
+ ida_free(&shrinker_debugfs_ida, debugfs_id);
+}
+
+static int __init shrinker_debugfs_init(void)
+{
+ struct shrinker *shrinker;
+ struct dentry *dentry;
+ int ret = 0;
+
+ dentry = debugfs_create_dir("shrinker", NULL);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ shrinker_debugfs_root = dentry;
+
+ /* Create debugfs entries for shrinkers registered at boot */
+ down_write(&shrinker_rwsem);
+ list_for_each_entry(shrinker, &shrinker_list, list)
+ if (!shrinker->debugfs_entry) {
+ ret = shrinker_debugfs_add(shrinker);
+ if (ret)
+ break;
+ }
+ up_write(&shrinker_rwsem);
+
+ return ret;
+}
+late_initcall(shrinker_debugfs_init);
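
As a rough illustration of the interface added above (not part of the patch), here is a minimal userspace sketch that triggers a scan, assuming debugfs is mounted at /sys/kernel/debug. The per-shrinker directory name follows the "<name>-<id>" scheme from shrinker_debugfs_add(), and the written triplet matches the "<memcg inode> <nid> <nr_to_scan>" format parsed by shrinker_debugfs_scan_write() (pass 0 for the inode with shrinkers that are not SHRINKER_MEMCG_AWARE). All file and directory names in the example are assumptions for illustration.

/* shrinker_scan.c - hypothetical helper, not part of the kernel tree. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[256], buf[64];
	int fd, len;

	if (argc != 5) {
		fprintf(stderr, "usage: %s <shrinker-dir> <memcg-ino> <nid> <nr_to_scan>\n",
			argv[0]);
		return 1;
	}

	/* e.g. <shrinker-dir> = "super_cache_scan-42" (name and id are hypothetical) */
	snprintf(path, sizeof(path), "/sys/kernel/debug/shrinker/%s/scan", argv[1]);

	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror(path);
		return 1;
	}

	/* shrinker_debugfs_scan_write() parses "<ino> <nid> <nr_to_scan>" with sscanf(). */
	len = snprintf(buf, sizeof(buf), "%s %s %s\n", argv[2], argv[3], argv[4]);
	if (write(fd, buf, len) != len) {
		perror("write");
		close(fd);
		return 1;
	}

	close(fd);
	return 0;
}

The sibling "count" file created above can be read the same way beforehand to inspect per-node (and, for memcg-aware shrinkers, per-cgroup) object counts when choosing nr_to_scan.
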
diff --git a/mm/shuffle.c b/mm/shuffle.c
index 9b5cd4b004b0..fb1393b8b3a9 100644
--- a/mm/shuffle.c
+++ b/mm/shuffle.c
@@ -12,23 +12,22 @@
DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
static bool shuffle_param;
-static int shuffle_show(char *buffer, const struct kernel_param *kp)
-{
- return sprintf(buffer, "%c\n", shuffle_param ? 'Y' : 'N');
-}
-static __meminit int shuffle_store(const char *val,
+static __meminit int shuffle_param_set(const char *val,
const struct kernel_param *kp)
{
- int rc = param_set_bool(val, kp);
-
- if (rc < 0)
- return rc;
- if (shuffle_param)
+ if (param_set_bool(val, kp))
+ return -EINVAL;
+ if (*(bool *)kp->arg)
static_branch_enable(&page_alloc_shuffle_key);
return 0;
}
-module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400);
+
+static const struct kernel_param_ops shuffle_param_ops = {
+ .set = shuffle_param_set,
+ .get = param_get_bool,
+};
+module_param_cb(shuffle, &shuffle_param_ops, &shuffle_param, 0400);
/*
* For two pages to be swapped in the shuffle, they must be free (on a
@@ -60,7 +59,7 @@ static struct page * __meminit shuffle_valid_page(struct zone *zone,
* ...is the page on the same list as the page we will
* shuffle it with?
*/
- if (page_order(page) != order)
+ if (buddy_order(page) != order)
return NULL;
return page;
@@ -147,8 +146,8 @@ void __meminit __shuffle_zone(struct zone *z)
spin_unlock_irqrestore(&z->lock, flags);
}
-/**
- * shuffle_free_memory - reduce the predictability of the page allocator
+/*
+ * __shuffle_free_memory - reduce the predictability of the page allocator
* @pgdat: node page data
*/
void __meminit __shuffle_free_memory(pg_data_t *pgdat)
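
The conversion above from module_param_call() to module_param_cb() follows the standard kernel_param_ops pattern. Below is a minimal sketch of that pattern in a hypothetical module; the "demo" names are illustrative only and do not come from the patch.

#include <linux/module.h>
#include <linux/moduleparam.h>

static bool demo_param;

static int demo_param_set(const char *val, const struct kernel_param *kp)
{
	/* param_set_bool() parses Y/N/1/0 and stores the result into *(bool *)kp->arg. */
	if (param_set_bool(val, kp))
		return -EINVAL;

	if (*(bool *)kp->arg)
		pr_info("demo: parameter enabled\n");
	return 0;
}

static const struct kernel_param_ops demo_param_ops = {
	.set = demo_param_set,
	.get = param_get_bool,	/* reports "Y"/"N", replacing a hand-rolled show helper */
};
module_param_cb(demo, &demo_param_ops, &demo_param, 0400);

MODULE_LICENSE("GPL");

Using the stock param_get_bool getter is what lets the patch drop shuffle_show() entirely while keeping the same "Y"/"N" output.
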
diff --git a/mm/shuffle.h b/mm/shuffle.h
index 71b784f0b7c3..a6bdf54f96f1 100644
--- a/mm/shuffle.h
+++ b/mm/shuffle.h
@@ -4,13 +4,13 @@
#define _MM_SHUFFLE_H
#include <linux/jump_label.h>
-#define SHUFFLE_ORDER (MAX_ORDER-1)
+#define SHUFFLE_ORDER MAX_ORDER
#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
extern void __shuffle_free_memory(pg_data_t *pgdat);
extern bool shuffle_pick_tail(void);
-static inline void shuffle_free_memory(pg_data_t *pgdat)
+static inline void __meminit shuffle_free_memory(pg_data_t *pgdat)
{
if (!static_branch_unlikely(&page_alloc_shuffle_key))
return;
@@ -18,7 +18,7 @@ static inline void shuffle_free_memory(pg_data_t *pgdat)
}
extern void __shuffle_zone(struct zone *z);
-static inline void shuffle_zone(struct zone *z)
+static inline void __meminit shuffle_zone(struct zone *z)
{
if (!static_branch_unlikely(&page_alloc_shuffle_key))
return;
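
Both shuffle_free_memory() and shuffle_zone() above stay no-ops until page_alloc_shuffle_key is switched on from shuffle_param_set(). A brief sketch of that static-key gating pattern follows, with hypothetical names; the real key and enable path live in mm/shuffle.c above.

#include <linux/jump_label.h>

/* Hypothetical key; the real one is page_alloc_shuffle_key. */
DEFINE_STATIC_KEY_FALSE(demo_feature_key);

static inline void demo_hot_path(void)
{
	/* Compiles to a fall-through branch while the key is off. */
	if (!static_branch_unlikely(&demo_feature_key))
		return;

	/* Feature work runs only after demo_feature_enable() flips the key. */
}

static void demo_feature_enable(void)
{
	static_branch_enable(&demo_feature_key);
}
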
diff --git a/mm/slab.c b/mm/slab.c
index 399a9d185b0f..88194391d553 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -100,6 +100,7 @@
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/kallsyms.h>
+#include <linux/kfence.h>
#include <linux/cpu.h>
#include <linux/sysctl.h>
#include <linux/module.h>
@@ -217,9 +218,8 @@ static void cache_reap(struct work_struct *unused);
static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
void **list);
static inline void fixup_slab_list(struct kmem_cache *cachep,
- struct kmem_cache_node *n, struct page *page,
+ struct kmem_cache_node *n, struct slab *slab,
void **list);
-static int slab_early_init = 1;
#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
@@ -233,7 +233,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
parent->shared = NULL;
parent->alien = NULL;
parent->colour_next = 0;
- spin_lock_init(&parent->list_lock);
+ raw_spin_lock_init(&parent->list_lock);
parent->free_objects = 0;
parent->free_touched = 0;
}
@@ -258,7 +258,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
#define BATCHREFILL_LIMIT 16
/*
- * Optimization question: fewer reaps means less probability for unnessary
+ * Optimization question: fewer reaps means less probability for unnecessary
* cpucache drain/refill cycles.
*
* OTOH the cpuarrays can contain lots of objects,
@@ -272,7 +272,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
#define STATS_DEC_ACTIVE(x) ((x)->num_active--)
#define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
#define STATS_INC_GROWN(x) ((x)->grown++)
-#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y))
+#define STATS_ADD_REAPED(x, y) ((x)->reaped += (y))
#define STATS_SET_HIGH(x) \
do { \
if ((x)->num_active > (x)->high_mark) \
@@ -296,7 +296,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
#define STATS_DEC_ACTIVE(x) do { } while (0)
#define STATS_INC_ALLOCED(x) do { } while (0)
#define STATS_INC_GROWN(x) do { } while (0)
-#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0)
+#define STATS_ADD_REAPED(x, y) do { (void)(y); } while (0)
#define STATS_SET_HIGH(x) do { } while (0)
#define STATS_INC_ERR(x) do { } while (0)
#define STATS_INC_NODEALLOCS(x) do { } while (0)
@@ -332,7 +332,7 @@ static int obj_offset(struct kmem_cache *cachep)
static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
{
BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
- return (unsigned long long*) (objp + obj_offset(cachep) -
+ return (unsigned long long *) (objp + obj_offset(cachep) -
sizeof(unsigned long long));
}
@@ -371,10 +371,10 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
static int slab_max_order = SLAB_MAX_ORDER_LO;
static bool slab_max_order_set __initdata;
-static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
- unsigned int idx)
+static inline void *index_to_obj(struct kmem_cache *cache,
+ const struct slab *slab, unsigned int idx)
{
- return page->s_mem + cache->size * idx;
+ return slab->s_mem + cache->size * idx;
}
#define BOOT_CPUCACHE_ENTRIES 1
@@ -465,7 +465,7 @@ static int __init slab_max_order_setup(char *str)
{
get_option(&str, &slab_max_order);
slab_max_order = slab_max_order < 0 ? 0 :
- min(slab_max_order, MAX_ORDER - 1);
+ min(slab_max_order, MAX_ORDER);
slab_max_order_set = true;
return 1;
@@ -549,18 +549,18 @@ static struct array_cache *alloc_arraycache(int node, int entries,
}
static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
- struct page *page, void *objp)
+ struct slab *slab, void *objp)
{
struct kmem_cache_node *n;
- int page_node;
+ int slab_node;
LIST_HEAD(list);
- page_node = page_to_nid(page);
- n = get_node(cachep, page_node);
+ slab_node = slab_nid(slab);
+ n = get_node(cachep, slab_node);
- spin_lock(&n->list_lock);
- free_block(cachep, &objp, 1, page_node, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
+ free_block(cachep, &objp, 1, slab_node, &list);
+ raw_spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
}
@@ -580,7 +580,7 @@ static int transfer_objects(struct array_cache *to,
if (!nr)
return 0;
- memcpy(to->entry + to->avail, from->entry + from->avail -nr,
+ memcpy(to->entry + to->avail, from->entry + from->avail - nr,
sizeof(void *) *nr);
from->avail -= nr;
@@ -618,18 +618,6 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
return 0;
}
-static inline void *alternate_node_alloc(struct kmem_cache *cachep,
- gfp_t flags)
-{
- return NULL;
-}
-
-static inline void *____cache_alloc_node(struct kmem_cache *cachep,
- gfp_t flags, int nodeid)
-{
- return NULL;
-}
-
static inline gfp_t gfp_exact_node(gfp_t flags)
{
return flags & ~__GFP_NOFAIL;
@@ -637,9 +625,6 @@ static inline gfp_t gfp_exact_node(gfp_t flags)
#else /* CONFIG_NUMA */
-static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
-static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
-
static struct alien_cache *__alloc_alien_cache(int node, int entries,
int batch, gfp_t gfp)
{
@@ -698,7 +683,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
struct kmem_cache_node *n = get_node(cachep, node);
if (ac->avail) {
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
/*
* Stuff objects into the remote nodes shared array first.
* That way we could avoid the overhead of putting the objects
@@ -709,7 +694,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
free_block(cachep, ac->entry, ac->avail, node, list);
ac->avail = 0;
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
}
}
@@ -760,7 +745,7 @@ static void drain_alien_cache(struct kmem_cache *cachep,
}
static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
- int node, int page_node)
+ int node, int slab_node)
{
struct kmem_cache_node *n;
struct alien_cache *alien = NULL;
@@ -769,22 +754,22 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
n = get_node(cachep, node);
STATS_INC_NODEFREES(cachep);
- if (n->alien && n->alien[page_node]) {
- alien = n->alien[page_node];
+ if (n->alien && n->alien[slab_node]) {
+ alien = n->alien[slab_node];
ac = &alien->ac;
spin_lock(&alien->lock);
if (unlikely(ac->avail == ac->limit)) {
STATS_INC_ACOVERFLOW(cachep);
- __drain_alien_cache(cachep, ac, page_node, &list);
+ __drain_alien_cache(cachep, ac, slab_node, &list);
}
__free_one(ac, objp);
spin_unlock(&alien->lock);
slabs_destroy(cachep, &list);
} else {
- n = get_node(cachep, page_node);
- spin_lock(&n->list_lock);
- free_block(cachep, &objp, 1, page_node, &list);
- spin_unlock(&n->list_lock);
+ n = get_node(cachep, slab_node);
+ raw_spin_lock(&n->list_lock);
+ free_block(cachep, &objp, 1, slab_node, &list);
+ raw_spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
}
return 1;
@@ -792,16 +777,16 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
{
- int page_node = page_to_nid(virt_to_page(objp));
+ int slab_node = slab_nid(virt_to_slab(objp));
int node = numa_mem_id();
/*
- * Make sure we are not freeing a object from another node to the array
+ * Make sure we are not freeing an object from another node to the array
* cache on this cpu.
*/
- if (likely(node == page_node))
+ if (likely(node == slab_node))
return 0;
- return __cache_free_alien(cachep, objp, node, page_node);
+ return __cache_free_alien(cachep, objp, node, slab_node);
}
/*
@@ -825,10 +810,10 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
*/
n = get_node(cachep, node);
if (n) {
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount +
cachep->num;
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
return 0;
}
@@ -846,7 +831,7 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
/*
* The kmem_cache_nodes don't come and go as CPUs
- * come and go. slab_mutex is sufficient
+ * come and go. slab_mutex provides sufficient
* protection here.
*/
cachep->node[node] = n;
@@ -854,12 +839,12 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
return 0;
}
-#if (defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)) || defined(CONFIG_SMP)
+#if defined(CONFIG_NUMA) || defined(CONFIG_SMP)
/*
* Allocates and initializes node for a node on each slab cache, used for
* either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node
* will be allocated off-node since memory is not yet online for the new node.
- * When hotplugging memory or a cpu, existing node are not replaced if
+ * When hotplugging memory or a cpu, existing nodes are not replaced if
* already in use.
*
* Must hold slab_mutex.
@@ -907,7 +892,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep,
goto fail;
n = get_node(cachep, node);
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
if (n->shared && force_change) {
free_block(cachep, n->shared->entry,
n->shared->avail, node, &list);
@@ -925,7 +910,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep,
new_alien = NULL;
}
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
/*
@@ -964,7 +949,7 @@ static void cpuup_canceled(long cpu)
if (!n)
continue;
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
/* Free limit for this kmem_cache_node */
n->free_limit -= cachep->batchcount;
@@ -975,7 +960,7 @@ static void cpuup_canceled(long cpu)
nc->avail = 0;
if (!cpumask_empty(mask)) {
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
goto free_slab;
}
@@ -989,7 +974,7 @@ static void cpuup_canceled(long cpu)
alien = n->alien;
n->alien = NULL;
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
kfree(shared);
if (alien) {
@@ -1060,9 +1045,9 @@ int slab_prepare_cpu(unsigned int cpu)
* offline.
*
* Even if all the cpus of a node are down, we don't free the
- * kmem_cache_node of any cache. This to avoid a race between cpu_down, and
+ * kmem_cache_node of any cache. This is to avoid a race between cpu_down, and
* a kmalloc allocation from another cpu for memory from the node of
- * the cpu going down. The list3 structure is usually allocated from
+ * the cpu going down. The kmem_cache_node structure is usually allocated from
* kmem_cache_create() and gets destroyed at kmem_cache_destroy().
*/
int slab_dead_cpu(unsigned int cpu)
@@ -1094,7 +1079,7 @@ static int slab_offline_cpu(unsigned int cpu)
return 0;
}
-#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+#if defined(CONFIG_NUMA)
/*
* Drains freelist for a node on each slab cache, used for memory hot-remove.
* Returns -EBUSY if all objects cannot be drained so that the node is not
@@ -1156,7 +1141,7 @@ static int __meminit slab_memory_callback(struct notifier_block *self,
out:
return notifier_from_errno(ret);
}
-#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
+#endif /* CONFIG_NUMA */
/*
* swap the static kmem_cache_node with kmalloced memory
@@ -1173,7 +1158,7 @@ static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *
/*
* Do not assume that spinlocks can be initialized via memcpy:
*/
- spin_lock_init(&ptr->list_lock);
+ raw_spin_lock_init(&ptr->list_lock);
MAKE_ALL_LISTS(cachep, ptr, nodeid);
cachep->node[nodeid] = ptr;
@@ -1255,16 +1240,10 @@ void __init kmem_cache_init(void)
* Initialize the caches that provide memory for the kmem_cache_node
* structures first. Without this, further allocations will bug.
*/
- kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache(
- kmalloc_info[INDEX_NODE].name[KMALLOC_NORMAL],
- kmalloc_info[INDEX_NODE].size,
- ARCH_KMALLOC_FLAGS, 0,
- kmalloc_info[INDEX_NODE].size);
+ new_kmalloc_cache(INDEX_NODE, KMALLOC_NORMAL, ARCH_KMALLOC_FLAGS);
slab_state = PARTIAL_NODE;
setup_kmalloc_cache_index_table();
- slab_early_init = 0;
-
/* 5) Replace the bootstrap kmem_cache_node */
{
int nid;
@@ -1344,11 +1323,11 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
for_each_kmem_cache_node(cachep, node, n) {
unsigned long total_slabs, free_slabs, free_objs;
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
total_slabs = n->total_slabs;
free_slabs = n->free_slabs;
free_objs = n->free_objects;
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
node, total_slabs - free_slabs, total_slabs,
@@ -1366,66 +1345,70 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
* did not request dmaable memory, we might get it, but that
* would be relatively rare and ignorable.
*/
-static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
+static struct slab *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
int nodeid)
{
- struct page *page;
+ struct folio *folio;
+ struct slab *slab;
flags |= cachep->allocflags;
- page = __alloc_pages_node(nodeid, flags, cachep->gfporder);
- if (!page) {
+ folio = (struct folio *) __alloc_pages_node(nodeid, flags, cachep->gfporder);
+ if (!folio) {
slab_out_of_memory(cachep, flags, nodeid);
return NULL;
}
- account_slab_page(page, cachep->gfporder, cachep);
- __SetPageSlab(page);
+ slab = folio_slab(folio);
+
+ account_slab(slab, cachep->gfporder, cachep, flags);
+ __folio_set_slab(folio);
+ /* Make the flag visible before any changes to folio->mapping */
+ smp_wmb();
/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
- if (sk_memalloc_socks() && page_is_pfmemalloc(page))
- SetPageSlabPfmemalloc(page);
+ if (sk_memalloc_socks() && folio_is_pfmemalloc(folio))
+ slab_set_pfmemalloc(slab);
- return page;
+ return slab;
}
/*
* Interface to system's page release.
*/
-static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
+static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab)
{
int order = cachep->gfporder;
+ struct folio *folio = slab_folio(slab);
- BUG_ON(!PageSlab(page));
- __ClearPageSlabPfmemalloc(page);
- __ClearPageSlab(page);
- page_mapcount_reset(page);
- page->mapping = NULL;
+ BUG_ON(!folio_test_slab(folio));
+ __slab_clear_pfmemalloc(slab);
+ page_mapcount_reset(&folio->page);
+ folio->mapping = NULL;
+ /* Make the mapping reset visible before clearing the flag */
+ smp_wmb();
+ __folio_clear_slab(folio);
- if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab += 1 << order;
- unaccount_slab_page(page, order, cachep);
- __free_pages(page, order);
+ mm_account_reclaimed_pages(1 << order);
+ unaccount_slab(slab, order, cachep);
+ __free_pages(&folio->page, order);
}
static void kmem_rcu_free(struct rcu_head *head)
{
struct kmem_cache *cachep;
- struct page *page;
+ struct slab *slab;
- page = container_of(head, struct page, rcu_head);
- cachep = page->slab_cache;
+ slab = container_of(head, struct slab, rcu_head);
+ cachep = slab->slab_cache;
- kmem_freepages(cachep, page);
+ kmem_freepages(cachep, slab);
}
#if DEBUG
-static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
+static inline bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
{
- if (debug_pagealloc_enabled_static() && OFF_SLAB(cachep) &&
- (cachep->size % PAGE_SIZE) == 0)
- return true;
-
- return false;
+ return debug_pagealloc_enabled_static() && OFF_SLAB(cachep) &&
+ ((cachep->size % PAGE_SIZE) == 0);
}
#ifdef CONFIG_DEBUG_PAGEALLOC
@@ -1434,7 +1417,7 @@ static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map)
if (!is_debug_pagealloc_cache(cachep))
return;
- kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
+ __kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
}
#else
@@ -1551,18 +1534,18 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
/* Print some data about the neighboring objects, if they
* exist:
*/
- struct page *page = virt_to_head_page(objp);
+ struct slab *slab = virt_to_slab(objp);
unsigned int objnr;
- objnr = obj_to_index(cachep, page, objp);
+ objnr = obj_to_index(cachep, slab, objp);
if (objnr) {
- objp = index_to_obj(cachep, page, objnr - 1);
+ objp = index_to_obj(cachep, slab, objnr - 1);
realobj = (char *)objp + obj_offset(cachep);
pr_err("Prev obj: start=%px, len=%d\n", realobj, size);
print_objinfo(cachep, objp, 2);
}
if (objnr + 1 < cachep->num) {
- objp = index_to_obj(cachep, page, objnr + 1);
+ objp = index_to_obj(cachep, slab, objnr + 1);
realobj = (char *)objp + obj_offset(cachep);
pr_err("Next obj: start=%px, len=%d\n", realobj, size);
print_objinfo(cachep, objp, 2);
@@ -1573,17 +1556,17 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
#if DEBUG
static void slab_destroy_debugcheck(struct kmem_cache *cachep,
- struct page *page)
+ struct slab *slab)
{
int i;
if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) {
- poison_obj(cachep, page->freelist - obj_offset(cachep),
+ poison_obj(cachep, slab->freelist - obj_offset(cachep),
POISON_FREE);
}
for (i = 0; i < cachep->num; i++) {
- void *objp = index_to_obj(cachep, page, i);
+ void *objp = index_to_obj(cachep, slab, i);
if (cachep->flags & SLAB_POISON) {
check_poison_obj(cachep, objp);
@@ -1599,7 +1582,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
}
#else
static void slab_destroy_debugcheck(struct kmem_cache *cachep,
- struct page *page)
+ struct slab *slab)
{
}
#endif
@@ -1607,29 +1590,29 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
/**
* slab_destroy - destroy and release all objects in a slab
* @cachep: cache pointer being destroyed
- * @page: page pointer being destroyed
+ * @slab: slab being destroyed
*
- * Destroy all the objs in a slab page, and release the mem back to the system.
- * Before calling the slab page must have been unlinked from the cache. The
+ * Destroy all the objs in a slab, and release the mem back to the system.
+ * Before calling the slab must have been unlinked from the cache. The
* kmem_cache_node ->list_lock is not held/needed.
*/
-static void slab_destroy(struct kmem_cache *cachep, struct page *page)
+static void slab_destroy(struct kmem_cache *cachep, struct slab *slab)
{
void *freelist;
- freelist = page->freelist;
- slab_destroy_debugcheck(cachep, page);
+ freelist = slab->freelist;
+ slab_destroy_debugcheck(cachep, slab);
if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU))
- call_rcu(&page->rcu_head, kmem_rcu_free);
+ call_rcu(&slab->rcu_head, kmem_rcu_free);
else
- kmem_freepages(cachep, page);
+ kmem_freepages(cachep, slab);
/*
* From now on, we don't use freelist
* although actual page can be freed in rcu context
*/
if (OFF_SLAB(cachep))
- kmem_cache_free(cachep->freelist_cache, freelist);
+ kfree(freelist);
}
/*
@@ -1638,11 +1621,11 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page)
*/
static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
{
- struct page *page, *n;
+ struct slab *slab, *n;
- list_for_each_entry_safe(page, n, list, slab_list) {
- list_del(&page->slab_list);
- slab_destroy(cachep, page);
+ list_for_each_entry_safe(slab, n, list, slab_list) {
+ list_del(&slab->slab_list);
+ slab_destroy(cachep, slab);
}
}
@@ -1681,21 +1664,27 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
if (flags & CFLGS_OFF_SLAB) {
struct kmem_cache *freelist_cache;
size_t freelist_size;
+ size_t freelist_cache_size;
freelist_size = num * sizeof(freelist_idx_t);
- freelist_cache = kmalloc_slab(freelist_size, 0u);
- if (!freelist_cache)
- continue;
-
- /*
- * Needed to avoid possible looping condition
- * in cache_grow_begin()
- */
- if (OFF_SLAB(freelist_cache))
- continue;
+ if (freelist_size > KMALLOC_MAX_CACHE_SIZE) {
+ freelist_cache_size = PAGE_SIZE << get_order(freelist_size);
+ } else {
+ freelist_cache = kmalloc_slab(freelist_size, 0u);
+ if (!freelist_cache)
+ continue;
+ freelist_cache_size = freelist_cache->size;
+
+ /*
+ * Needed to avoid possible looping condition
+ * in cache_grow_begin()
+ */
+ if (OFF_SLAB(freelist_cache))
+ continue;
+ }
/* check if off slab has enough benefit */
- if (freelist_cache->size > cachep->size / 2)
+ if (freelist_cache_size > cachep->size / 2)
continue;
}
@@ -1789,8 +1778,7 @@ static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
}
slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name,
- void (*ctor)(void *))
+ slab_flags_t flags, const char *name)
{
return flags;
}
@@ -1895,14 +1883,12 @@ static bool set_on_slab_cache(struct kmem_cache *cachep,
return true;
}
-/**
+/*
* __kmem_cache_create - Create a cache.
* @cachep: cache management descriptor
* @flags: SLAB flags
*
- * Returns a ptr to the cache on success, NULL on failure.
- * Cannot be called within a int, but can be interrupted.
- * The @ctor is run when new pages are allocated by the cache.
+ * Returns zero on success, nonzero on failure.
*
* The flags are
*
@@ -1915,8 +1901,6 @@ static bool set_on_slab_cache(struct kmem_cache *cachep,
* %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
* cacheline. This can be beneficial if you're counting cycles as closely
* as davem.
- *
- * Return: a pointer to the created cache or %NULL in case of error
*/
int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
{
@@ -2072,11 +2056,6 @@ done:
cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
- if (OFF_SLAB(cachep)) {
- cachep->freelist_cache =
- kmalloc_slab(cachep->freelist_size, 0u);
- }
-
err = setup_cpu_cache(cachep, gfp);
if (err) {
__kmem_cache_release(cachep);
@@ -2106,7 +2085,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep)
{
#ifdef CONFIG_SMP
check_irq_off();
- assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
+ assert_raw_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
#endif
}
@@ -2114,7 +2093,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
{
#ifdef CONFIG_SMP
check_irq_off();
- assert_spin_locked(&get_node(cachep, node)->list_lock);
+ assert_raw_spin_locked(&get_node(cachep, node)->list_lock);
#endif
}
@@ -2154,9 +2133,9 @@ static void do_drain(void *arg)
check_irq_off();
ac = cpu_cache_get(cachep);
n = get_node(cachep, node);
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
free_block(cachep, ac->entry, ac->avail, node, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
ac->avail = 0;
slabs_destroy(cachep, &list);
}
@@ -2174,9 +2153,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
drain_alien_cache(cachep, n->alien);
for_each_kmem_cache_node(cachep, node, n) {
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
drain_array_locked(cachep, n->shared, node, true, &list);
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
}
@@ -2193,20 +2172,20 @@ static int drain_freelist(struct kmem_cache *cache,
{
struct list_head *p;
int nr_freed;
- struct page *page;
+ struct slab *slab;
nr_freed = 0;
while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
p = n->slabs_free.prev;
if (p == &n->slabs_free) {
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
goto out;
}
- page = list_entry(p, struct page, slab_list);
- list_del(&page->slab_list);
+ slab = list_entry(p, struct slab, slab_list);
+ list_del(&slab->slab_list);
n->free_slabs--;
n->total_slabs--;
/*
@@ -2214,9 +2193,11 @@ static int drain_freelist(struct kmem_cache *cache,
* to the cache.
*/
n->free_objects -= cache->num;
- spin_unlock_irq(&n->list_lock);
- slab_destroy(cache, page);
+ raw_spin_unlock_irq(&n->list_lock);
+ slab_destroy(cache, slab);
nr_freed++;
+
+ cond_resched();
}
out:
return nr_freed;
@@ -2283,27 +2264,27 @@ void __kmem_cache_release(struct kmem_cache *cachep)
* Because if it is the case, that means we defer the creation of
* the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
* And we eventually call down to __kmem_cache_create(), which
- * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one.
+ * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one.
* This is a "chicken-and-egg" problem.
*
* So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
* which are all initialized during kmem_cache_init().
*/
static void *alloc_slabmgmt(struct kmem_cache *cachep,
- struct page *page, int colour_off,
+ struct slab *slab, int colour_off,
gfp_t local_flags, int nodeid)
{
void *freelist;
- void *addr = page_address(page);
+ void *addr = slab_address(slab);
- page->s_mem = addr + colour_off;
- page->active = 0;
+ slab->s_mem = addr + colour_off;
+ slab->active = 0;
if (OBJFREELIST_SLAB(cachep))
freelist = NULL;
else if (OFF_SLAB(cachep)) {
/* Slab management obj is off-slab. */
- freelist = kmem_cache_alloc_node(cachep->freelist_cache,
+ freelist = kmalloc_node(cachep->freelist_size,
local_flags, nodeid);
} else {
/* We will use last bytes at the slab for freelist */
@@ -2314,24 +2295,24 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
return freelist;
}
-static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx)
+static inline freelist_idx_t get_free_obj(struct slab *slab, unsigned int idx)
{
- return ((freelist_idx_t *)page->freelist)[idx];
+ return ((freelist_idx_t *) slab->freelist)[idx];
}
-static inline void set_free_obj(struct page *page,
+static inline void set_free_obj(struct slab *slab,
unsigned int idx, freelist_idx_t val)
{
- ((freelist_idx_t *)(page->freelist))[idx] = val;
+ ((freelist_idx_t *)(slab->freelist))[idx] = val;
}
-static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
+static void cache_init_objs_debug(struct kmem_cache *cachep, struct slab *slab)
{
#if DEBUG
int i;
for (i = 0; i < cachep->num; i++) {
- void *objp = index_to_obj(cachep, page, i);
+ void *objp = index_to_obj(cachep, slab, i);
if (cachep->flags & SLAB_STORE_USER)
*dbg_userword(cachep, objp) = NULL;
@@ -2370,44 +2351,34 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
#ifdef CONFIG_SLAB_FREELIST_RANDOM
/* Hold information during a freelist initialization */
-union freelist_init_state {
- struct {
- unsigned int pos;
- unsigned int *list;
- unsigned int count;
- };
- struct rnd_state rnd_state;
+struct freelist_init_state {
+ unsigned int pos;
+ unsigned int *list;
+ unsigned int count;
};
/*
- * Initialize the state based on the randomization methode available.
- * return true if the pre-computed list is available, false otherwize.
+ * Initialize the state based on the randomization method available.
+ * return true if the pre-computed list is available, false otherwise.
*/
-static bool freelist_state_initialize(union freelist_init_state *state,
+static bool freelist_state_initialize(struct freelist_init_state *state,
struct kmem_cache *cachep,
unsigned int count)
{
bool ret;
- unsigned int rand;
-
- /* Use best entropy available to define a random shift */
- rand = get_random_int();
-
- /* Use a random state if the pre-computed list is not available */
if (!cachep->random_seq) {
- prandom_seed_state(&state->rnd_state, rand);
ret = false;
} else {
state->list = cachep->random_seq;
state->count = count;
- state->pos = rand % count;
+ state->pos = get_random_u32_below(count);
ret = true;
}
return ret;
}
/* Get the next entry on the list and randomize it using a random shift */
-static freelist_idx_t next_random_slot(union freelist_init_state *state)
+static freelist_idx_t next_random_slot(struct freelist_init_state *state)
{
if (state->pos >= state->count)
state->pos = 0;
@@ -2415,20 +2386,20 @@ static freelist_idx_t next_random_slot(union freelist_init_state *state)
}
/* Swap two freelist entries */
-static void swap_free_obj(struct page *page, unsigned int a, unsigned int b)
+static void swap_free_obj(struct slab *slab, unsigned int a, unsigned int b)
{
- swap(((freelist_idx_t *)page->freelist)[a],
- ((freelist_idx_t *)page->freelist)[b]);
+ swap(((freelist_idx_t *) slab->freelist)[a],
+ ((freelist_idx_t *) slab->freelist)[b]);
}
/*
* Shuffle the freelist initialization state based on pre-computed lists.
* return true if the list was successfully shuffled, false otherwise.
*/
-static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page)
+static bool shuffle_freelist(struct kmem_cache *cachep, struct slab *slab)
{
unsigned int objfreelist = 0, i, rand, count = cachep->num;
- union freelist_init_state state;
+ struct freelist_init_state state;
bool precomputed;
if (count < 2)
@@ -2442,7 +2413,7 @@ static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page)
objfreelist = count - 1;
else
objfreelist = next_random_slot(&state);
- page->freelist = index_to_obj(cachep, page, objfreelist) +
+ slab->freelist = index_to_obj(cachep, slab, objfreelist) +
obj_offset(cachep);
count--;
}
@@ -2453,51 +2424,50 @@ static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page)
*/
if (!precomputed) {
for (i = 0; i < count; i++)
- set_free_obj(page, i, i);
+ set_free_obj(slab, i, i);
/* Fisher-Yates shuffle */
for (i = count - 1; i > 0; i--) {
- rand = prandom_u32_state(&state.rnd_state);
- rand %= (i + 1);
- swap_free_obj(page, i, rand);
+ rand = get_random_u32_below(i + 1);
+ swap_free_obj(slab, i, rand);
}
} else {
for (i = 0; i < count; i++)
- set_free_obj(page, i, next_random_slot(&state));
+ set_free_obj(slab, i, next_random_slot(&state));
}
if (OBJFREELIST_SLAB(cachep))
- set_free_obj(page, cachep->num - 1, objfreelist);
+ set_free_obj(slab, cachep->num - 1, objfreelist);
return true;
}
#else
static inline bool shuffle_freelist(struct kmem_cache *cachep,
- struct page *page)
+ struct slab *slab)
{
return false;
}
#endif /* CONFIG_SLAB_FREELIST_RANDOM */
static void cache_init_objs(struct kmem_cache *cachep,
- struct page *page)
+ struct slab *slab)
{
int i;
void *objp;
bool shuffled;
- cache_init_objs_debug(cachep, page);
+ cache_init_objs_debug(cachep, slab);
/* Try to randomize the freelist if enabled */
- shuffled = shuffle_freelist(cachep, page);
+ shuffled = shuffle_freelist(cachep, slab);
if (!shuffled && OBJFREELIST_SLAB(cachep)) {
- page->freelist = index_to_obj(cachep, page, cachep->num - 1) +
+ slab->freelist = index_to_obj(cachep, slab, cachep->num - 1) +
obj_offset(cachep);
}
for (i = 0; i < cachep->num; i++) {
- objp = index_to_obj(cachep, page, i);
+ objp = index_to_obj(cachep, slab, i);
objp = kasan_init_slab_obj(cachep, objp);
/* constructor could break poison info */
@@ -2508,68 +2478,56 @@ static void cache_init_objs(struct kmem_cache *cachep,
}
if (!shuffled)
- set_free_obj(page, i, i);
+ set_free_obj(slab, i, i);
}
}
-static void *slab_get_obj(struct kmem_cache *cachep, struct page *page)
+static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slab)
{
void *objp;
- objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
- page->active++;
+ objp = index_to_obj(cachep, slab, get_free_obj(slab, slab->active));
+ slab->active++;
return objp;
}
static void slab_put_obj(struct kmem_cache *cachep,
- struct page *page, void *objp)
+ struct slab *slab, void *objp)
{
- unsigned int objnr = obj_to_index(cachep, page, objp);
+ unsigned int objnr = obj_to_index(cachep, slab, objp);
#if DEBUG
unsigned int i;
/* Verify double free bug */
- for (i = page->active; i < cachep->num; i++) {
- if (get_free_obj(page, i) == objnr) {
+ for (i = slab->active; i < cachep->num; i++) {
+ if (get_free_obj(slab, i) == objnr) {
pr_err("slab: double free detected in cache '%s', objp %px\n",
cachep->name, objp);
BUG();
}
}
#endif
- page->active--;
- if (!page->freelist)
- page->freelist = objp + obj_offset(cachep);
+ slab->active--;
+ if (!slab->freelist)
+ slab->freelist = objp + obj_offset(cachep);
- set_free_obj(page, page->active, objnr);
-}
-
-/*
- * Map pages beginning at addr to the given cache and slab. This is required
- * for the slab allocator to be able to lookup the cache and slab of a
- * virtual address for kfree, ksize, and slab debugging.
- */
-static void slab_map_pages(struct kmem_cache *cache, struct page *page,
- void *freelist)
-{
- page->slab_cache = cache;
- page->freelist = freelist;
+ set_free_obj(slab, slab->active, objnr);
}
/*
* Grow (by 1) the number of slabs within a cache. This is called by
* kmem_cache_alloc() when there are no active objs left in a cache.
*/
-static struct page *cache_grow_begin(struct kmem_cache *cachep,
+static struct slab *cache_grow_begin(struct kmem_cache *cachep,
gfp_t flags, int nodeid)
{
void *freelist;
size_t offset;
gfp_t local_flags;
- int page_node;
+ int slab_node;
struct kmem_cache_node *n;
- struct page *page;
+ struct slab *slab;
/*
* Be lazy and only check for valid flags here, keeping it out of the
@@ -2589,12 +2547,12 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
* Get mem for the objs. Attempt to allocate a physical page from
* 'nodeid'.
*/
- page = kmem_getpages(cachep, local_flags, nodeid);
- if (!page)
+ slab = kmem_getpages(cachep, local_flags, nodeid);
+ if (!slab)
goto failed;
- page_node = page_to_nid(page);
- n = get_node(cachep, page_node);
+ slab_node = slab_nid(slab);
+ n = get_node(cachep, slab_node);
/* Get colour for the slab, and cal the next value. */
n->colour_next++;
@@ -2612,55 +2570,56 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
* page_address() in the latter returns a non-tagged pointer,
* as it should be for slab pages.
*/
- kasan_poison_slab(page);
+ kasan_poison_slab(slab);
/* Get slab management. */
- freelist = alloc_slabmgmt(cachep, page, offset,
- local_flags & ~GFP_CONSTRAINT_MASK, page_node);
+ freelist = alloc_slabmgmt(cachep, slab, offset,
+ local_flags & ~GFP_CONSTRAINT_MASK, slab_node);
if (OFF_SLAB(cachep) && !freelist)
goto opps1;
- slab_map_pages(cachep, page, freelist);
+ slab->slab_cache = cachep;
+ slab->freelist = freelist;
- cache_init_objs(cachep, page);
+ cache_init_objs(cachep, slab);
if (gfpflags_allow_blocking(local_flags))
local_irq_disable();
- return page;
+ return slab;
opps1:
- kmem_freepages(cachep, page);
+ kmem_freepages(cachep, slab);
failed:
if (gfpflags_allow_blocking(local_flags))
local_irq_disable();
return NULL;
}
-static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
+static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab)
{
struct kmem_cache_node *n;
void *list = NULL;
check_irq_off();
- if (!page)
+ if (!slab)
return;
- INIT_LIST_HEAD(&page->slab_list);
- n = get_node(cachep, page_to_nid(page));
+ INIT_LIST_HEAD(&slab->slab_list);
+ n = get_node(cachep, slab_nid(slab));
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
n->total_slabs++;
- if (!page->active) {
- list_add_tail(&page->slab_list, &n->slabs_free);
+ if (!slab->active) {
+ list_add_tail(&slab->slab_list, &n->slabs_free);
n->free_slabs++;
} else
- fixup_slab_list(cachep, n, page, &list);
+ fixup_slab_list(cachep, n, slab, &list);
STATS_INC_GROWN(cachep);
- n->free_objects += cachep->num - page->active;
- spin_unlock(&n->list_lock);
+ n->free_objects += cachep->num - slab->active;
+ raw_spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
}
@@ -2707,13 +2666,13 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
unsigned long caller)
{
unsigned int objnr;
- struct page *page;
+ struct slab *slab;
BUG_ON(virt_to_cache(objp) != cachep);
objp -= obj_offset(cachep);
kfree_debugcheck(objp);
- page = virt_to_head_page(objp);
+ slab = virt_to_slab(objp);
if (cachep->flags & SLAB_RED_ZONE) {
verify_redzone_free(cachep, objp);
@@ -2723,10 +2682,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
if (cachep->flags & SLAB_STORE_USER)
*dbg_userword(cachep, objp) = (void *)caller;
- objnr = obj_to_index(cachep, page, objp);
+ objnr = obj_to_index(cachep, slab, objp);
BUG_ON(objnr >= cachep->num);
- BUG_ON(objp != index_to_obj(cachep, page, objnr));
+ BUG_ON(objp != index_to_obj(cachep, slab, objnr));
if (cachep->flags & SLAB_POISON) {
poison_obj(cachep, objp, POISON_FREE);
@@ -2737,7 +2696,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
#else
#define kfree_debugcheck(x) do { } while(0)
-#define cache_free_debugcheck(x,objp,z) (objp)
+#define cache_free_debugcheck(x, objp, z) (objp)
#endif
static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
@@ -2756,116 +2715,116 @@ static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
}
static inline void fixup_slab_list(struct kmem_cache *cachep,
- struct kmem_cache_node *n, struct page *page,
+ struct kmem_cache_node *n, struct slab *slab,
void **list)
{
/* move slabp to correct slabp list: */
- list_del(&page->slab_list);
- if (page->active == cachep->num) {
- list_add(&page->slab_list, &n->slabs_full);
+ list_del(&slab->slab_list);
+ if (slab->active == cachep->num) {
+ list_add(&slab->slab_list, &n->slabs_full);
if (OBJFREELIST_SLAB(cachep)) {
#if DEBUG
/* Poisoning will be done without holding the lock */
if (cachep->flags & SLAB_POISON) {
- void **objp = page->freelist;
+ void **objp = slab->freelist;
*objp = *list;
*list = objp;
}
#endif
- page->freelist = NULL;
+ slab->freelist = NULL;
}
} else
- list_add(&page->slab_list, &n->slabs_partial);
+ list_add(&slab->slab_list, &n->slabs_partial);
}
/* Try to find non-pfmemalloc slab if needed */
-static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
- struct page *page, bool pfmemalloc)
+static noinline struct slab *get_valid_first_slab(struct kmem_cache_node *n,
+ struct slab *slab, bool pfmemalloc)
{
- if (!page)
+ if (!slab)
return NULL;
if (pfmemalloc)
- return page;
+ return slab;
- if (!PageSlabPfmemalloc(page))
- return page;
+ if (!slab_test_pfmemalloc(slab))
+ return slab;
/* No need to keep pfmemalloc slab if we have enough free objects */
if (n->free_objects > n->free_limit) {
- ClearPageSlabPfmemalloc(page);
- return page;
+ slab_clear_pfmemalloc(slab);
+ return slab;
}
/* Move pfmemalloc slab to the end of list to speed up next search */
- list_del(&page->slab_list);
- if (!page->active) {
- list_add_tail(&page->slab_list, &n->slabs_free);
+ list_del(&slab->slab_list);
+ if (!slab->active) {
+ list_add_tail(&slab->slab_list, &n->slabs_free);
n->free_slabs++;
} else
- list_add_tail(&page->slab_list, &n->slabs_partial);
+ list_add_tail(&slab->slab_list, &n->slabs_partial);
- list_for_each_entry(page, &n->slabs_partial, slab_list) {
- if (!PageSlabPfmemalloc(page))
- return page;
+ list_for_each_entry(slab, &n->slabs_partial, slab_list) {
+ if (!slab_test_pfmemalloc(slab))
+ return slab;
}
n->free_touched = 1;
- list_for_each_entry(page, &n->slabs_free, slab_list) {
- if (!PageSlabPfmemalloc(page)) {
+ list_for_each_entry(slab, &n->slabs_free, slab_list) {
+ if (!slab_test_pfmemalloc(slab)) {
n->free_slabs--;
- return page;
+ return slab;
}
}
return NULL;
}
-static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
+static struct slab *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
{
- struct page *page;
+ struct slab *slab;
- assert_spin_locked(&n->list_lock);
- page = list_first_entry_or_null(&n->slabs_partial, struct page,
+ assert_raw_spin_locked(&n->list_lock);
+ slab = list_first_entry_or_null(&n->slabs_partial, struct slab,
slab_list);
- if (!page) {
+ if (!slab) {
n->free_touched = 1;
- page = list_first_entry_or_null(&n->slabs_free, struct page,
+ slab = list_first_entry_or_null(&n->slabs_free, struct slab,
slab_list);
- if (page)
+ if (slab)
n->free_slabs--;
}
if (sk_memalloc_socks())
- page = get_valid_first_slab(n, page, pfmemalloc);
+ slab = get_valid_first_slab(n, slab, pfmemalloc);
- return page;
+ return slab;
}
static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
struct kmem_cache_node *n, gfp_t flags)
{
- struct page *page;
+ struct slab *slab;
void *obj;
void *list = NULL;
if (!gfp_pfmemalloc_allowed(flags))
return NULL;
- spin_lock(&n->list_lock);
- page = get_first_slab(n, true);
- if (!page) {
- spin_unlock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
+ slab = get_first_slab(n, true);
+ if (!slab) {
+ raw_spin_unlock(&n->list_lock);
return NULL;
}
- obj = slab_get_obj(cachep, page);
+ obj = slab_get_obj(cachep, slab);
n->free_objects--;
- fixup_slab_list(cachep, n, page, &list);
+ fixup_slab_list(cachep, n, slab, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
return obj;
@@ -2876,20 +2835,20 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
* or cache_grow_end() for new slab
*/
static __always_inline int alloc_block(struct kmem_cache *cachep,
- struct array_cache *ac, struct page *page, int batchcount)
+ struct array_cache *ac, struct slab *slab, int batchcount)
{
/*
* There must be at least one object available for
* allocation.
*/
- BUG_ON(page->active >= cachep->num);
+ BUG_ON(slab->active >= cachep->num);
- while (page->active < cachep->num && batchcount--) {
+ while (slab->active < cachep->num && batchcount--) {
STATS_INC_ALLOCED(cachep);
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
- ac->entry[ac->avail++] = slab_get_obj(cachep, page);
+ ac->entry[ac->avail++] = slab_get_obj(cachep, slab);
}
return batchcount;
@@ -2902,7 +2861,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
struct array_cache *ac, *shared;
int node;
void *list = NULL;
- struct page *page;
+ struct slab *slab;
check_irq_off();
node = numa_mem_id();
@@ -2924,7 +2883,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
if (!n->free_objects && (!shared || !shared->avail))
goto direct_grow;
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
shared = READ_ONCE(n->shared);
/* See if we can refill from the shared array */
@@ -2935,20 +2894,20 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
while (batchcount > 0) {
/* Get slab alloc is to come from. */
- page = get_first_slab(n, false);
- if (!page)
+ slab = get_first_slab(n, false);
+ if (!slab)
goto must_grow;
check_spinlock_acquired(cachep);
- batchcount = alloc_block(cachep, ac, page, batchcount);
- fixup_slab_list(cachep, n, page, &list);
+ batchcount = alloc_block(cachep, ac, slab, batchcount);
+ fixup_slab_list(cachep, n, slab, &list);
}
must_grow:
n->free_objects -= ac->avail;
alloc_done:
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
direct_grow:
@@ -2961,16 +2920,16 @@ direct_grow:
return obj;
}
- page = cache_grow_begin(cachep, gfp_exact_node(flags), node);
+ slab = cache_grow_begin(cachep, gfp_exact_node(flags), node);
/*
* cache_grow_begin() can reenable interrupts,
* then ac could change.
*/
ac = cpu_cache_get(cachep);
- if (!ac->avail && page)
- alloc_block(cachep, ac, page, batchcount);
- cache_grow_end(cachep, page);
+ if (!ac->avail && slab)
+ alloc_block(cachep, ac, slab, batchcount);
+ cache_grow_end(cachep, slab);
if (!ac->avail)
return NULL;
@@ -2980,18 +2939,12 @@ direct_grow:
return ac->entry[--ac->avail];
}
-static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
- gfp_t flags)
-{
- might_sleep_if(gfpflags_allow_blocking(flags));
-}
-
#if DEBUG
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
gfp_t flags, void *objp, unsigned long caller)
{
WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
- if (!objp)
+ if (!objp || is_kfence_address(objp))
return objp;
if (cachep->flags & SLAB_POISON) {
check_poison_obj(cachep, objp);
@@ -3016,15 +2969,14 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
objp += obj_offset(cachep);
if (cachep->ctor && cachep->flags & SLAB_POISON)
cachep->ctor(objp);
- if (ARCH_SLAB_MINALIGN &&
- ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
- pr_err("0x%px: not aligned to ARCH_SLAB_MINALIGN=%d\n",
- objp, (int)ARCH_SLAB_MINALIGN);
+ if ((unsigned long)objp & (arch_slab_minalign() - 1)) {
+ pr_err("0x%px: not aligned to arch_slab_minalign()=%u\n", objp,
+ arch_slab_minalign());
}
return objp;
}
#else
-#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
+#define cache_alloc_debugcheck_after(a, b, objp, d) (objp)
#endif
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
@@ -3063,6 +3015,8 @@ out:
}
#ifdef CONFIG_NUMA
+static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
+
/*
* Try allocating on another node if PFA_SPREAD_SLAB is a mempolicy is set.
*
@@ -3100,7 +3054,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
struct zone *zone;
enum zone_type highest_zoneidx = gfp_zone(flags);
void *obj = NULL;
- struct page *page;
+ struct slab *slab;
int nid;
unsigned int cpuset_mems_cookie;
@@ -3136,10 +3090,10 @@ retry:
* We may trigger various forms of reclaim on the allowed
* set and go into memory reserves if necessary.
*/
- page = cache_grow_begin(cache, flags, numa_mem_id());
- cache_grow_end(cache, page);
- if (page) {
- nid = page_to_nid(page);
+ slab = cache_grow_begin(cache, flags, numa_mem_id());
+ cache_grow_end(cache, slab);
+ if (slab) {
+ nid = slab_nid(slab);
obj = ____cache_alloc_node(cache,
gfp_exact_node(flags), nid);
@@ -3158,12 +3112,12 @@ retry:
}
/*
- * A interface to enable slab creation on nodeid
+ * An interface to enable slab creation on nodeid
*/
static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
int nodeid)
{
- struct page *page;
+ struct slab *slab;
struct kmem_cache_node *n;
void *obj = NULL;
void *list = NULL;
@@ -3173,9 +3127,9 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
BUG_ON(!n);
check_irq_off();
- spin_lock(&n->list_lock);
- page = get_first_slab(n, false);
- if (!page)
+ raw_spin_lock(&n->list_lock);
+ slab = get_first_slab(n, false);
+ if (!slab)
goto must_grow;
check_spinlock_acquired_node(cachep, nodeid);
@@ -3184,105 +3138,70 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
- BUG_ON(page->active == cachep->num);
+ BUG_ON(slab->active == cachep->num);
- obj = slab_get_obj(cachep, page);
+ obj = slab_get_obj(cachep, slab);
n->free_objects--;
- fixup_slab_list(cachep, n, page, &list);
+ fixup_slab_list(cachep, n, slab, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
return obj;
must_grow:
- spin_unlock(&n->list_lock);
- page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
- if (page) {
+ raw_spin_unlock(&n->list_lock);
+ slab = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
+ if (slab) {
/* This slab isn't counted yet so don't update free_objects */
- obj = slab_get_obj(cachep, page);
+ obj = slab_get_obj(cachep, slab);
}
- cache_grow_end(cachep, page);
+ cache_grow_end(cachep, slab);
return obj ? obj : fallback_alloc(cachep, flags);
}
static __always_inline void *
-slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
- unsigned long caller)
+__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
- unsigned long save_flags;
- void *ptr;
+ void *objp = NULL;
int slab_node = numa_mem_id();
- struct obj_cgroup *objcg = NULL;
-
- flags &= gfp_allowed_mask;
- cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
- if (unlikely(!cachep))
- return NULL;
-
- cache_alloc_debugcheck_before(cachep, flags);
- local_irq_save(save_flags);
-
- if (nodeid == NUMA_NO_NODE)
- nodeid = slab_node;
-
- if (unlikely(!get_node(cachep, nodeid))) {
- /* Node not bootstrapped yet */
- ptr = fallback_alloc(cachep, flags);
- goto out;
- }
- if (nodeid == slab_node) {
+ if (nodeid == NUMA_NO_NODE) {
+ if (current->mempolicy || cpuset_do_slab_mem_spread()) {
+ objp = alternate_node_alloc(cachep, flags);
+ if (objp)
+ goto out;
+ }
/*
* Use the locally cached objects if possible.
* However ____cache_alloc does not allow fallback
* to other nodes. It may fail while we still have
* objects on other nodes available.
*/
- ptr = ____cache_alloc(cachep, flags);
- if (ptr)
- goto out;
- }
- /* ___cache_alloc_node can fall back to other nodes */
- ptr = ____cache_alloc_node(cachep, flags, nodeid);
- out:
- local_irq_restore(save_flags);
- ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
-
- if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr)
- memset(ptr, 0, cachep->object_size);
-
- slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr);
- return ptr;
-}
-
-static __always_inline void *
-__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
-{
- void *objp;
-
- if (current->mempolicy || cpuset_do_slab_mem_spread()) {
- objp = alternate_node_alloc(cache, flags);
- if (objp)
- goto out;
+ objp = ____cache_alloc(cachep, flags);
+ nodeid = slab_node;
+ } else if (nodeid == slab_node) {
+ objp = ____cache_alloc(cachep, flags);
+ } else if (!get_node(cachep, nodeid)) {
+ /* Node not bootstrapped yet */
+ objp = fallback_alloc(cachep, flags);
+ goto out;
}
- objp = ____cache_alloc(cache, flags);
/*
* We may just have run out of memory on the local node.
* ____cache_alloc_node() knows how to locate memory on other nodes
*/
if (!objp)
- objp = ____cache_alloc_node(cache, flags, numa_mem_id());
-
- out:
+ objp = ____cache_alloc_node(cachep, flags, nodeid);
+out:
return objp;
}
#else
static __always_inline void *
-__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid __maybe_unused)
{
return ____cache_alloc(cachep, flags);
}
@@ -3290,31 +3209,44 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
#endif /* CONFIG_NUMA */
static __always_inline void *
-slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
+slab_alloc_node(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
+ int nodeid, size_t orig_size, unsigned long caller)
{
unsigned long save_flags;
void *objp;
struct obj_cgroup *objcg = NULL;
+ bool init = false;
flags &= gfp_allowed_mask;
- cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
+ cachep = slab_pre_alloc_hook(cachep, lru, &objcg, 1, flags);
if (unlikely(!cachep))
return NULL;
- cache_alloc_debugcheck_before(cachep, flags);
+ objp = kfence_alloc(cachep, orig_size, flags);
+ if (unlikely(objp))
+ goto out;
+
local_irq_save(save_flags);
- objp = __do_cache_alloc(cachep, flags);
+ objp = __do_cache_alloc(cachep, flags, nodeid);
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
prefetchw(objp);
+ init = slab_want_init_on_alloc(flags, cachep);
- if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
- memset(objp, 0, cachep->object_size);
-
- slab_post_alloc_hook(cachep, objcg, flags, 1, &objp);
+out:
+ slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init,
+ cachep->object_size);
return objp;
}
+static __always_inline void *
+slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
+ size_t orig_size, unsigned long caller)
+{
+ return slab_alloc_node(cachep, lru, flags, NUMA_NO_NODE, orig_size,
+ caller);
+}
+
/*
* Caller needs to acquire correct kmem_cache_node's list_lock
* @list: List of detached free slabs should be freed by caller
@@ -3324,40 +3256,40 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
{
int i;
struct kmem_cache_node *n = get_node(cachep, node);
- struct page *page;
+ struct slab *slab;
n->free_objects += nr_objects;
for (i = 0; i < nr_objects; i++) {
void *objp;
- struct page *page;
+ struct slab *slab;
objp = objpp[i];
- page = virt_to_head_page(objp);
- list_del(&page->slab_list);
+ slab = virt_to_slab(objp);
+ list_del(&slab->slab_list);
check_spinlock_acquired_node(cachep, node);
- slab_put_obj(cachep, page, objp);
+ slab_put_obj(cachep, slab, objp);
STATS_DEC_ACTIVE(cachep);
/* fixup slab chains */
- if (page->active == 0) {
- list_add(&page->slab_list, &n->slabs_free);
+ if (slab->active == 0) {
+ list_add(&slab->slab_list, &n->slabs_free);
n->free_slabs++;
} else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
*/
- list_add_tail(&page->slab_list, &n->slabs_partial);
+ list_add_tail(&slab->slab_list, &n->slabs_partial);
}
}
while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
n->free_objects -= cachep->num;
- page = list_last_entry(&n->slabs_free, struct page, slab_list);
- list_move(&page->slab_list, list);
+ slab = list_last_entry(&n->slabs_free, struct slab, slab_list);
+ list_move(&slab->slab_list, list);
n->free_slabs--;
n->total_slabs--;
}
@@ -3374,7 +3306,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
check_irq_off();
n = get_node(cachep, node);
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
if (n->shared) {
struct array_cache *shared_array = n->shared;
int max = shared_array->limit - shared_array->avail;
@@ -3393,17 +3325,17 @@ free_done:
#if STATS
{
int i = 0;
- struct page *page;
+ struct slab *slab;
- list_for_each_entry(page, &n->slabs_free, slab_list) {
- BUG_ON(page->active);
+ list_for_each_entry(slab, &n->slabs_free, slab_list) {
+ BUG_ON(slab->active);
i++;
}
STATS_SET_FREEABLE(cachep, i);
}
#endif
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
ac->avail -= batchcount;
memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
slabs_destroy(cachep, &list);
@@ -3416,8 +3348,26 @@ free_done:
static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
unsigned long caller)
{
- /* Put the object into the quarantine, don't touch it for now. */
- if (kasan_slab_free(cachep, objp, _RET_IP_))
+ bool init;
+
+ memcg_slab_free_hook(cachep, virt_to_slab(objp), &objp, 1);
+
+ if (is_kfence_address(objp)) {
+ kmemleak_free_recursive(objp, cachep->flags);
+ __kfence_free(objp);
+ return;
+ }
+
+ /*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_slab_free and initialization memset must be
+ * kept together to avoid discrepancies in behavior.
+ */
+ init = slab_want_init_on_free(cachep);
+ if (init && !kasan_has_integrated_init())
+ memset(objp, 0, cachep->object_size);
+ /* KASAN might put objp into memory quarantine, delaying its reuse. */
+ if (kasan_slab_free(cachep, objp, init))
return;
/* Use KCSAN to help debug racy use-after-free. */
@@ -3434,11 +3384,8 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
struct array_cache *ac = cpu_cache_get(cachep);
check_irq_off();
- if (unlikely(slab_want_init_on_free(cachep)))
- memset(objp, 0, cachep->object_size);
kmemleak_free_recursive(objp, cachep->flags);
objp = cache_free_debugcheck(cachep, objp, caller);
- memcg_slab_free_hook(cachep, &objp, 1);
/*
* Skip calling cache_free_alien() when the platform is not numa.
@@ -3458,10 +3405,10 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
}
if (sk_memalloc_socks()) {
- struct page *page = virt_to_head_page(objp);
+ struct slab *slab = virt_to_slab(objp);
- if (unlikely(PageSlabPfmemalloc(page))) {
- cache_free_pfmemalloc(cachep, page, objp);
+ if (unlikely(slab_test_pfmemalloc(slab))) {
+ cache_free_pfmemalloc(cachep, slab, objp);
return;
}
}
@@ -3469,27 +3416,30 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
__free_one(ac, objp);
}
-/**
- * kmem_cache_alloc - Allocate an object
- * @cachep: The cache to allocate from.
- * @flags: See kmalloc().
- *
- * Allocate an object from this cache. The flags are only relevant
- * if the cache has no available objects.
- *
- * Return: pointer to the new object or %NULL in case of error
- */
-void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+static __always_inline
+void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru,
+ gfp_t flags)
{
- void *ret = slab_alloc(cachep, flags, _RET_IP_);
+ void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_);
- trace_kmem_cache_alloc(_RET_IP_, ret,
- cachep->object_size, cachep->size, flags);
+ trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, NUMA_NO_NODE);
return ret;
}
+
+void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+ return __kmem_cache_alloc_lru(cachep, NULL, flags);
+}
EXPORT_SYMBOL(kmem_cache_alloc);
+void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru,
+ gfp_t flags)
+{
+ return __kmem_cache_alloc_lru(cachep, lru, flags);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_lru);
+
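Editorial illustration, not part of the patch: kmem_cache_alloc_lru() lets a caller that will later put the object on a memcg-aware list_lru pass that lru at allocation time, so the per-memcg lru data is set up (via memcg_list_lru_alloc() in the pre-alloc hook) before the object can be added to it. A minimal sketch; the struct, cache and lru names are hypothetical:

	struct foo {
		struct list_head lru_entry;
	};

	static struct kmem_cache *foo_cache;	/* hypothetical, created with SLAB_ACCOUNT */
	static struct list_lru foo_lru;		/* hypothetical, initialised with list_lru_init_memcg() */

	static struct foo *foo_alloc(gfp_t gfp)
	{
		/* charges the current memcg and pre-allocates foo_lru's per-memcg parts */
		return kmem_cache_alloc_lru(foo_cache, &foo_lru, gfp);
	}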
static __always_inline void
cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
size_t size, void **p, unsigned long caller)
@@ -3503,61 +3453,44 @@ cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
void **p)
{
- size_t i;
struct obj_cgroup *objcg = NULL;
+ unsigned long irqflags;
+ size_t i;
- s = slab_pre_alloc_hook(s, &objcg, size, flags);
+ s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags);
if (!s)
return 0;
- cache_alloc_debugcheck_before(s, flags);
-
- local_irq_disable();
+ local_irq_save(irqflags);
for (i = 0; i < size; i++) {
- void *objp = __do_cache_alloc(s, flags);
+ void *objp = kfence_alloc(s, s->object_size, flags) ?:
+ __do_cache_alloc(s, flags, NUMA_NO_NODE);
if (unlikely(!objp))
goto error;
p[i] = objp;
}
- local_irq_enable();
+ local_irq_restore(irqflags);
cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
- /* Clear memory outside IRQ disabled section */
- if (unlikely(slab_want_init_on_alloc(flags, s)))
- for (i = 0; i < size; i++)
- memset(p[i], 0, s->object_size);
-
- slab_post_alloc_hook(s, objcg, flags, size, p);
+ /*
+ * memcg and kmem_cache debug support and memory initialization.
+ * Done outside of the IRQ disabled section.
+ */
+ slab_post_alloc_hook(s, objcg, flags, size, p,
+ slab_want_init_on_alloc(flags, s), s->object_size);
/* FIXME: Trace call missing. Christoph would like a bulk variant */
return size;
error:
- local_irq_enable();
+ local_irq_restore(irqflags);
cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
- slab_post_alloc_hook(s, objcg, flags, i, p);
- __kmem_cache_free_bulk(s, i, p);
+ slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size);
+ kmem_cache_free_bulk(s, i, p);
return 0;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
-#ifdef CONFIG_TRACING
-void *
-kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
-{
- void *ret;
-
- ret = slab_alloc(cachep, flags, _RET_IP_);
-
- ret = kasan_kmalloc(cachep, ret, size, flags);
- trace_kmalloc(_RET_IP_, ret,
- size, cachep->size, flags);
- return ret;
-}
-EXPORT_SYMBOL(kmem_cache_alloc_trace);
-#endif
-
-#ifdef CONFIG_NUMA
/**
* kmem_cache_alloc_node - Allocate an object on the specified node
* @cachep: The cache to allocate from.
@@ -3573,105 +3506,63 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
*/
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
- void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
+ void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_);
- trace_kmem_cache_alloc_node(_RET_IP_, ret,
- cachep->object_size, cachep->size,
- flags, nodeid);
+ trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, nodeid);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
- gfp_t flags,
- int nodeid,
- size_t size)
+void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+ int nodeid, size_t orig_size,
+ unsigned long caller)
{
- void *ret;
-
- ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
-
- ret = kasan_kmalloc(cachep, ret, size, flags);
- trace_kmalloc_node(_RET_IP_, ret,
- size, cachep->size,
- flags, nodeid);
- return ret;
+ return slab_alloc_node(cachep, NULL, flags, nodeid,
+ orig_size, caller);
}
-EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
-#endif
-static __always_inline void *
-__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
+#ifdef CONFIG_PRINTK
+void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
{
struct kmem_cache *cachep;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return NULL;
- cachep = kmalloc_slab(size, flags);
- if (unlikely(ZERO_OR_NULL_PTR(cachep)))
- return cachep;
- ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
- ret = kasan_kmalloc(cachep, ret, size, flags);
-
- return ret;
-}
-
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
-{
- return __do_kmalloc_node(size, flags, node, _RET_IP_);
-}
-EXPORT_SYMBOL(__kmalloc_node);
+ unsigned int objnr;
+ void *objp;
-void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
- int node, unsigned long caller)
-{
- return __do_kmalloc_node(size, flags, node, caller);
+ kpp->kp_ptr = object;
+ kpp->kp_slab = slab;
+ cachep = slab->slab_cache;
+ kpp->kp_slab_cache = cachep;
+ objp = object - obj_offset(cachep);
+ kpp->kp_data_offset = obj_offset(cachep);
+ slab = virt_to_slab(objp);
+ objnr = obj_to_index(cachep, slab, objp);
+ objp = index_to_obj(cachep, slab, objnr);
+ kpp->kp_objp = objp;
+ if (DEBUG && cachep->flags & SLAB_STORE_USER)
+ kpp->kp_ret = *dbg_userword(cachep, objp);
}
-EXPORT_SYMBOL(__kmalloc_node_track_caller);
-#endif /* CONFIG_NUMA */
+#endif
-/**
- * __do_kmalloc - allocate memory
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate (see kmalloc).
- * @caller: function caller for debug tracking of the caller
- *
- * Return: pointer to the allocated memory or %NULL in case of error
- */
-static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
- unsigned long caller)
+static __always_inline
+void __do_kmem_cache_free(struct kmem_cache *cachep, void *objp,
+ unsigned long caller)
{
- struct kmem_cache *cachep;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return NULL;
- cachep = kmalloc_slab(size, flags);
- if (unlikely(ZERO_OR_NULL_PTR(cachep)))
- return cachep;
- ret = slab_alloc(cachep, flags, caller);
-
- ret = kasan_kmalloc(cachep, ret, size, flags);
- trace_kmalloc(caller, ret,
- size, cachep->size, flags);
-
- return ret;
-}
+ unsigned long flags;
-void *__kmalloc(size_t size, gfp_t flags)
-{
- return __do_kmalloc(size, flags, _RET_IP_);
+ local_irq_save(flags);
+ debug_check_no_locks_freed(objp, cachep->object_size);
+ if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(objp, cachep->object_size);
+ __cache_free(cachep, objp, caller);
+ local_irq_restore(flags);
}
-EXPORT_SYMBOL(__kmalloc);
-void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
+void __kmem_cache_free(struct kmem_cache *cachep, void *objp,
+ unsigned long caller)
{
- return __do_kmalloc(size, flags, caller);
+ __do_kmem_cache_free(cachep, objp, caller);
}
-EXPORT_SYMBOL(__kmalloc_track_caller);
/**
* kmem_cache_free - Deallocate an object
@@ -3683,35 +3574,39 @@ EXPORT_SYMBOL(__kmalloc_track_caller);
*/
void kmem_cache_free(struct kmem_cache *cachep, void *objp)
{
- unsigned long flags;
cachep = cache_from_obj(cachep, objp);
if (!cachep)
return;
- local_irq_save(flags);
- debug_check_no_locks_freed(objp, cachep->object_size);
- if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
- debug_check_no_obj_freed(objp, cachep->object_size);
- __cache_free(cachep, objp, _RET_IP_);
- local_irq_restore(flags);
-
- trace_kmem_cache_free(_RET_IP_, objp);
+ trace_kmem_cache_free(_RET_IP_, objp, cachep);
+ __do_kmem_cache_free(cachep, objp, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_free);
void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
{
- struct kmem_cache *s;
- size_t i;
+ unsigned long flags;
- local_irq_disable();
- for (i = 0; i < size; i++) {
+ local_irq_save(flags);
+ for (int i = 0; i < size; i++) {
void *objp = p[i];
+ struct kmem_cache *s;
- if (!orig_s) /* called via kfree_bulk */
- s = virt_to_cache(objp);
- else
+ if (!orig_s) {
+ struct folio *folio = virt_to_folio(objp);
+
+ /* called via kfree_bulk */
+ if (!folio_test_slab(folio)) {
+ local_irq_restore(flags);
+ free_large_kmalloc(folio, objp);
+ local_irq_save(flags);
+ continue;
+ }
+ s = folio_slab(folio)->slab_cache;
+ } else {
s = cache_from_obj(orig_s, objp);
+ }
+
if (!s)
continue;
@@ -3721,45 +3616,12 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
__cache_free(s, objp, _RET_IP_);
}
- local_irq_enable();
+ local_irq_restore(flags);
/* FIXME: add tracing */
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
-/**
- * kfree - free previously allocated memory
- * @objp: pointer returned by kmalloc.
- *
- * If @objp is NULL, no operation is performed.
- *
- * Don't free memory not originally allocated by kmalloc()
- * or you will run into trouble.
- */
-void kfree(const void *objp)
-{
- struct kmem_cache *c;
- unsigned long flags;
-
- trace_kfree(_RET_IP_, objp);
-
- if (unlikely(ZERO_OR_NULL_PTR(objp)))
- return;
- local_irq_save(flags);
- kfree_debugcheck(objp);
- c = virt_to_cache(objp);
- if (!c) {
- local_irq_restore(flags);
- return;
- }
- debug_check_no_locks_freed(objp, c->object_size);
-
- debug_check_no_obj_freed(objp, c->object_size);
- __cache_free(c, (void *)objp, _RET_IP_);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(kfree);
-
/*
* This initializes kmem_cache_node or resizes various caches for all nodes.
*/
@@ -3832,9 +3694,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
node = cpu_to_mem(cpu);
n = get_node(cachep, node);
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
free_block(cachep, ac->entry, ac->avail, node, &list);
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
}
free_percpu(prev);
@@ -3855,8 +3717,6 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
if (err)
goto end;
- if (limit && shared && batchcount)
- goto skip_setup;
/*
* The head array serves three purposes:
* - create a LIFO ordering, i.e. return objects that are cache-warm
@@ -3899,7 +3759,6 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
limit = 32;
#endif
batchcount = (limit + 1) / 2;
-skip_setup:
err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
end:
if (err)
@@ -3929,9 +3788,9 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
return;
}
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
drain_array_locked(cachep, ac, node, false, &list);
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
}
@@ -4015,7 +3874,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
for_each_kmem_cache_node(cachep, node, n) {
check_irq_on();
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
total_slabs += n->total_slabs;
free_slabs += n->free_slabs;
@@ -4024,7 +3883,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
if (n->shared)
shared_avail += n->shared->avail;
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
}
num_objs = total_slabs * cachep->num;
active_slabs = total_slabs - free_slabs;
@@ -4136,8 +3995,8 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
* Returns NULL if check passes, otherwise const char * to name of cache
* to indicate an error.
*/
-void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
- bool to_user)
+void __check_heap_object(const void *ptr, unsigned long n,
+ const struct slab *slab, bool to_user)
{
struct kmem_cache *cachep;
unsigned int objnr;
@@ -4146,12 +4005,15 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
ptr = kasan_reset_tag(ptr);
/* Find and validate object. */
- cachep = page->slab_cache;
- objnr = obj_to_index(cachep, page, (void *)ptr);
+ cachep = slab->slab_cache;
+ objnr = obj_to_index(cachep, slab, (void *)ptr);
BUG_ON(objnr >= cachep->num);
/* Find offset within object. */
- offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep);
+ if (is_kfence_address(ptr))
+ offset = ptr - kfence_object_start(ptr);
+ else
+ offset = ptr - index_to_obj(cachep, slab, objnr) - obj_offset(cachep);
/* Allow address range falling entirely within usercopy region. */
if (offset >= cachep->useroffset &&
@@ -4159,44 +4021,6 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
n <= cachep->useroffset - offset + cachep->usersize)
return;
- /*
- * If the copy is still within the allocated object, produce
- * a warning instead of rejecting the copy. This is intended
- * to be a temporary method to find any missing usercopy
- * whitelists.
- */
- if (usercopy_fallback &&
- offset <= cachep->object_size &&
- n <= cachep->object_size - offset) {
- usercopy_warn("SLAB object", cachep->name, to_user, offset, n);
- return;
- }
-
usercopy_abort("SLAB object", cachep->name, to_user, offset, n);
}
#endif /* CONFIG_HARDENED_USERCOPY */
-
-/**
- * __ksize -- Uninstrumented ksize.
- * @objp: pointer to the object
- *
- * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same
- * safety checks as ksize() with KASAN instrumentation enabled.
- *
- * Return: size of the actual memory used by @objp in bytes
- */
-size_t __ksize(const void *objp)
-{
- struct kmem_cache *c;
- size_t size;
-
- BUG_ON(!objp);
- if (unlikely(objp == ZERO_SIZE_PTR))
- return 0;
-
- c = virt_to_cache(objp);
- size = c ? c->object_size : 0;
-
- return size;
-}
-EXPORT_SYMBOL(__ksize);
diff --git a/mm/slab.h b/mm/slab.h
index 6dd4b702888a..9c0e09d0f81f 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -4,33 +4,229 @@
/*
* Internal slab definitions
*/
+void __init kmem_cache_init(void);
+
+#ifdef CONFIG_64BIT
+# ifdef system_has_cmpxchg128
+# define system_has_freelist_aba() system_has_cmpxchg128()
+# define try_cmpxchg_freelist try_cmpxchg128
+# endif
+#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg128
+typedef u128 freelist_full_t;
+#else /* CONFIG_64BIT */
+# ifdef system_has_cmpxchg64
+# define system_has_freelist_aba() system_has_cmpxchg64()
+# define try_cmpxchg_freelist try_cmpxchg64
+# endif
+#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg64
+typedef u64 freelist_full_t;
+#endif /* CONFIG_64BIT */
+
+#if defined(system_has_freelist_aba) && !defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+#undef system_has_freelist_aba
+#endif
-#ifdef CONFIG_SLOB
/*
- * Common fields provided in kmem_cache by all slab allocators
- * This struct is either used directly by the allocator (SLOB)
- * or the allocator must include definitions for all fields
- * provided in kmem_cache_common in their definition of kmem_cache.
- *
- * Once we can do anonymous structs (C11 standard) we could put a
- * anonymous struct definition in these allocators so that the
- * separate allocations in the kmem_cache structure of SLAB and
- * SLUB is no longer needed.
+ * Freelist pointer and counter to cmpxchg together, avoids the typical ABA
+ * problems with cmpxchg of just a pointer.
*/
-struct kmem_cache {
- unsigned int object_size;/* The original size of the object */
- unsigned int size; /* The aligned/padded/added on size */
- unsigned int align; /* Alignment as calculated */
- slab_flags_t flags; /* Active flags on the slab */
- unsigned int useroffset;/* Usercopy region offset */
- unsigned int usersize; /* Usercopy region size */
- const char *name; /* Slab name for sysfs */
- int refcount; /* Use counter */
- void (*ctor)(void *); /* Called on object slot creation */
- struct list_head list; /* List of all slab caches on the system */
+typedef union {
+ struct {
+ void *freelist;
+ unsigned long counter;
+ };
+ freelist_full_t full;
+} freelist_aba_t;
+
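Editorial sketch, not part of the patch: on configurations where system_has_freelist_aba() is true, the SLUB side can update the freelist pointer and counter as a single cmpxchg on the .full member, so a freelist head that was freed and reused in between (the classic ABA case) is rejected because the counter no longer matches. Roughly, assuming the SLUB layout of struct slab below:

	static inline bool update_freelist(struct slab *slab, void *cur, void *next,
					   unsigned long tid)
	{
		freelist_aba_t old = { .freelist = cur, .counter = tid };
		freelist_aba_t new = { .freelist = next, .counter = tid + 1 };

		/* fails if either half changed since "old" was read */
		return try_cmpxchg_freelist(&slab->freelist_counter.full,
					    &old.full, new.full);
	}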
+/* Reuses the bits in struct page */
+struct slab {
+ unsigned long __page_flags;
+
+#if defined(CONFIG_SLAB)
+
+ struct kmem_cache *slab_cache;
+ union {
+ struct {
+ struct list_head slab_list;
+ void *freelist; /* array of free object indexes */
+ void *s_mem; /* first object */
+ };
+ struct rcu_head rcu_head;
+ };
+ unsigned int active;
+
+#elif defined(CONFIG_SLUB)
+
+ struct kmem_cache *slab_cache;
+ union {
+ struct {
+ union {
+ struct list_head slab_list;
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+ struct {
+ struct slab *next;
+ int slabs; /* Nr of slabs left */
+ };
+#endif
+ };
+ /* Double-word boundary */
+ union {
+ struct {
+ void *freelist; /* first free object */
+ union {
+ unsigned long counters;
+ struct {
+ unsigned inuse:16;
+ unsigned objects:15;
+ unsigned frozen:1;
+ };
+ };
+ };
+#ifdef system_has_freelist_aba
+ freelist_aba_t freelist_counter;
+#endif
+ };
+ };
+ struct rcu_head rcu_head;
+ };
+ unsigned int __unused;
+
+#else
+#error "Unexpected slab allocator configured"
+#endif
+
+ atomic_t __page_refcount;
+#ifdef CONFIG_MEMCG
+ unsigned long memcg_data;
+#endif
};
-#endif /* CONFIG_SLOB */
+#define SLAB_MATCH(pg, sl) \
+ static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl))
+SLAB_MATCH(flags, __page_flags);
+SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */
+SLAB_MATCH(_refcount, __page_refcount);
+#ifdef CONFIG_MEMCG
+SLAB_MATCH(memcg_data, memcg_data);
+#endif
+#undef SLAB_MATCH
+static_assert(sizeof(struct slab) <= sizeof(struct page));
+#if defined(system_has_freelist_aba) && defined(CONFIG_SLUB)
+static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)));
+#endif
+
+/**
+ * folio_slab - Converts from folio to slab.
+ * @folio: The folio.
+ *
+ * Currently struct slab is a different representation of a folio where
+ * folio_test_slab() is true.
+ *
+ * Return: The slab which contains this folio.
+ */
+#define folio_slab(folio) (_Generic((folio), \
+ const struct folio *: (const struct slab *)(folio), \
+ struct folio *: (struct slab *)(folio)))
+
+/**
+ * slab_folio - The folio allocated for a slab
+ * @slab: The slab.
+ *
+ * Slabs are allocated as folios that contain the individual objects and use
+ * some fields in the first struct page of the folio - those fields are
+ * now accessed by struct slab. It is occasionally necessary to convert back to
+ * a folio in order to communicate with the rest of the mm. Please use this
+ * helper function instead of casting yourself, as the implementation may change
+ * in the future.
+ */
+#define slab_folio(s) (_Generic((s), \
+ const struct slab *: (const struct folio *)s, \
+ struct slab *: (struct folio *)s))
+
+/**
+ * page_slab - Converts from first struct page to slab.
+ * @p: The first (either head of compound or single) page of slab.
+ *
+ * A temporary wrapper to convert struct page to struct slab in situations where
+ * we know the page is the compound head, or a single order-0 page.
+ *
+ * Long-term ideally everything would work with struct slab directly or go
+ * through folio to struct slab.
+ *
+ * Return: The slab which contains this page
+ */
+#define page_slab(p) (_Generic((p), \
+ const struct page *: (const struct slab *)(p), \
+ struct page *: (struct slab *)(p)))
+
+/**
+ * slab_page - The first struct page allocated for a slab
+ * @slab: The slab.
+ *
+ * A convenience wrapper for converting slab to the first struct page of the
+ * underlying folio, to communicate with code not yet converted to folio or
+ * struct slab.
+ */
+#define slab_page(s) folio_page(slab_folio(s), 0)
+
+/*
+ * If network-based swap is enabled, sl*b must keep track of whether pages
+ * were allocated from pfmemalloc reserves.
+ */
+static inline bool slab_test_pfmemalloc(const struct slab *slab)
+{
+ return folio_test_active((struct folio *)slab_folio(slab));
+}
+
+static inline void slab_set_pfmemalloc(struct slab *slab)
+{
+ folio_set_active(slab_folio(slab));
+}
+
+static inline void slab_clear_pfmemalloc(struct slab *slab)
+{
+ folio_clear_active(slab_folio(slab));
+}
+
+static inline void __slab_clear_pfmemalloc(struct slab *slab)
+{
+ __folio_clear_active(slab_folio(slab));
+}
+
+static inline void *slab_address(const struct slab *slab)
+{
+ return folio_address(slab_folio(slab));
+}
+
+static inline int slab_nid(const struct slab *slab)
+{
+ return folio_nid(slab_folio(slab));
+}
+
+static inline pg_data_t *slab_pgdat(const struct slab *slab)
+{
+ return folio_pgdat(slab_folio(slab));
+}
+
+static inline struct slab *virt_to_slab(const void *addr)
+{
+ struct folio *folio = virt_to_folio(addr);
+
+ if (!folio_test_slab(folio))
+ return NULL;
+
+ return folio_slab(folio);
+}
+
+static inline int slab_order(const struct slab *slab)
+{
+ return folio_order((struct folio *)slab_folio(slab));
+}
+
+static inline size_t slab_size(const struct slab *slab)
+{
+ return PAGE_SIZE << slab_order(slab);
+}
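Editorial illustration, not part of the patch: with the helpers above, slab-internal code can stay entirely in struct slab terms and only drop to folio/page at the boundary. A made-up example helper built purely from these accessors:

	static inline bool obj_in_slab(const struct slab *slab, const void *obj)
	{
		void *start = slab_address(slab);

		return obj >= start && obj < start + slab_size(slab);
	}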
#ifdef CONFIG_SLAB
#include <linux/slab_def.h>
@@ -46,7 +242,7 @@ struct kmem_cache {
#include <linux/kmemleak.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
-#include <linux/kmemleak.h>
+#include <linux/list_lru.h>
/*
* State of the slab allocator.
@@ -81,23 +277,25 @@ extern const struct kmalloc_info_struct {
unsigned int size;
} kmalloc_info[];
-#ifndef CONFIG_SLOB
/* Kmalloc array related functions */
void setup_kmalloc_cache_index_table(void);
void create_kmalloc_caches(slab_flags_t);
/* Find the kmalloc slab corresponding for a certain size */
struct kmem_cache *kmalloc_slab(size_t, gfp_t);
-#endif
+
+void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags,
+ int node, size_t orig_size,
+ unsigned long caller);
+void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller);
gfp_t kmalloc_fix_flags(gfp_t flags);
/* Functions provided by the slab allocators */
int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);
-struct kmem_cache *create_kmalloc_cache(const char *name, unsigned int size,
- slab_flags_t flags, unsigned int useroffset,
- unsigned int usersize);
+void __init new_kmalloc_cache(int idx, enum kmalloc_cache_type type,
+ slab_flags_t flags);
extern void create_boot_cache(struct kmem_cache *, const char *name,
unsigned int size, slab_flags_t flags,
unsigned int useroffset, unsigned int usersize);
@@ -105,28 +303,17 @@ extern void create_boot_cache(struct kmem_cache *, const char *name,
int slab_unmergeable(struct kmem_cache *s);
struct kmem_cache *find_mergeable(unsigned size, unsigned align,
slab_flags_t flags, const char *name, void (*ctor)(void *));
-#ifndef CONFIG_SLOB
struct kmem_cache *
__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
slab_flags_t flags, void (*ctor)(void *));
slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name,
- void (*ctor)(void *));
-#else
-static inline struct kmem_cache *
-__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
- slab_flags_t flags, void (*ctor)(void *))
-{ return NULL; }
+ slab_flags_t flags, const char *name);
-static inline slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name,
- void (*ctor)(void *))
+static inline bool is_kmalloc_cache(struct kmem_cache *s)
{
- return flags;
+ return (s->flags & SLAB_KMALLOC);
}
-#endif
-
/* Legal flag mask for kmem_cache_create(), for various configurations */
#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
@@ -145,12 +332,13 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size,
#if defined(CONFIG_SLAB)
#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
- SLAB_ACCOUNT)
+ SLAB_ACCOUNT | SLAB_NO_MERGE)
#elif defined(CONFIG_SLUB)
#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | SLAB_ACCOUNT)
+ SLAB_TEMPORARY | SLAB_ACCOUNT | \
+ SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE)
#else
-#define SLAB_CACHE_FLAGS (0)
+#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE)
#endif
/* Common flags available with current configuration */
@@ -167,7 +355,10 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size,
SLAB_NOLEAKTRACE | \
SLAB_RECLAIM_ACCOUNT | \
SLAB_TEMPORARY | \
- SLAB_ACCOUNT)
+ SLAB_ACCOUNT | \
+ SLAB_KMALLOC | \
+ SLAB_NO_MERGE | \
+ SLAB_NO_USER_FLAGS)
bool __kmem_cache_empty(struct kmem_cache *);
int __kmem_cache_shutdown(struct kmem_cache *);
@@ -196,16 +387,7 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos);
-/*
- * Generic implementation of bulk operations
- * These are useful for situations in which the allocator cannot
- * perform optimizations. In that case segments of the object listed
- * may be allocated or freed using these operations.
- */
-void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
-int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
-
-static inline int cache_vmstat_idx(struct kmem_cache *s)
+static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s)
{
return (s->flags & SLAB_RECLAIM_ACCOUNT) ?
NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B;
@@ -218,10 +400,19 @@ DECLARE_STATIC_KEY_TRUE(slub_debug_enabled);
DECLARE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
extern void print_tracking(struct kmem_cache *s, void *object);
+long validate_slab_cache(struct kmem_cache *s);
+static inline bool __slub_debug_enabled(void)
+{
+ return static_branch_unlikely(&slub_debug_enabled);
+}
#else
static inline void print_tracking(struct kmem_cache *s, void *object)
{
}
+static inline bool __slub_debug_enabled(void)
+{
+ return false;
+}
#endif
/*
@@ -231,39 +422,41 @@ static inline void print_tracking(struct kmem_cache *s, void *object)
*/
static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags)
{
-#ifdef CONFIG_SLUB_DEBUG
- VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS));
- if (static_branch_unlikely(&slub_debug_enabled))
+ if (IS_ENABLED(CONFIG_SLUB_DEBUG))
+ VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS));
+ if (__slub_debug_enabled())
return s->flags & flags;
-#endif
return false;
}
#ifdef CONFIG_MEMCG_KMEM
-static inline struct obj_cgroup **page_obj_cgroups(struct page *page)
+/*
+ * slab_objcgs - get the object cgroups vector associated with a slab
+ * @slab: a pointer to the slab struct
+ *
+ * Returns a pointer to the object cgroups vector associated with the slab,
+ * or NULL if no such vector has been associated yet.
+ */
+static inline struct obj_cgroup **slab_objcgs(struct slab *slab)
{
- /*
- * page->mem_cgroup and page->obj_cgroups are sharing the same
- * space. To distinguish between them in case we don't know for sure
- * that the page is a slab page (e.g. page_cgroup_ino()), let's
- * always set the lowest bit of obj_cgroups.
- */
- return (struct obj_cgroup **)
- ((unsigned long)page->obj_cgroups & ~0x1UL);
-}
+ unsigned long memcg_data = READ_ONCE(slab->memcg_data);
-static inline bool page_has_obj_cgroups(struct page *page)
-{
- return ((unsigned long)page->obj_cgroups & 0x1UL);
+ VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS),
+ slab_page(slab));
+ VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab));
+
+ return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}
-int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
- gfp_t gfp);
+int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
+ gfp_t gfp, bool new_slab);
+void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr);
-static inline void memcg_free_page_obj_cgroups(struct page *page)
+static inline void memcg_free_slab_cgroups(struct slab *slab)
{
- kfree(page_obj_cgroups(page));
- page->obj_cgroups = NULL;
+ kfree(slab_objcgs(slab));
+ slab->memcg_data = 0;
}
static inline size_t obj_full_size(struct kmem_cache *s)
@@ -275,39 +468,46 @@ static inline size_t obj_full_size(struct kmem_cache *s)
return s->size + sizeof(struct obj_cgroup *);
}
-static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
- size_t objects,
- gfp_t flags)
+/*
+ * Returns false if the allocation should fail.
+ */
+static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+ struct list_lru *lru,
+ struct obj_cgroup **objcgp,
+ size_t objects, gfp_t flags)
{
struct obj_cgroup *objcg;
- if (memcg_kmem_bypass())
- return NULL;
+ if (!memcg_kmem_online())
+ return true;
+
+ if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))
+ return true;
objcg = get_obj_cgroup_from_current();
if (!objcg)
- return NULL;
+ return true;
- if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) {
- obj_cgroup_put(objcg);
- return NULL;
- }
+ if (lru) {
+ int ret;
+ struct mem_cgroup *memcg;
- return objcg;
-}
+ memcg = get_mem_cgroup_from_objcg(objcg);
+ ret = memcg_list_lru_alloc(memcg, lru, flags);
+ css_put(&memcg->css);
-static inline void mod_objcg_state(struct obj_cgroup *objcg,
- struct pglist_data *pgdat,
- int idx, int nr)
-{
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
+ if (ret)
+ goto out;
+ }
+
+ if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s)))
+ goto out;
- rcu_read_lock();
- memcg = obj_cgroup_memcg(objcg);
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
- mod_memcg_lruvec_state(lruvec, idx, nr);
- rcu_read_unlock();
+ *objcgp = objcg;
+ return true;
+out:
+ obj_cgroup_put(objcg);
+ return false;
}
static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
@@ -315,28 +515,28 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
gfp_t flags, size_t size,
void **p)
{
- struct page *page;
+ struct slab *slab;
unsigned long off;
size_t i;
- if (!objcg)
+ if (!memcg_kmem_online() || !objcg)
return;
- flags &= ~__GFP_ACCOUNT;
for (i = 0; i < size; i++) {
if (likely(p[i])) {
- page = virt_to_head_page(p[i]);
+ slab = virt_to_slab(p[i]);
- if (!page_has_obj_cgroups(page) &&
- memcg_alloc_page_obj_cgroups(page, s, flags)) {
+ if (!slab_objcgs(slab) &&
+ memcg_alloc_slab_cgroups(slab, s, flags,
+ false)) {
obj_cgroup_uncharge(objcg, obj_full_size(s));
continue;
}
- off = obj_to_index(s, page, p[i]);
+ off = obj_to_index(s, slab, p[i]);
obj_cgroup_get(objcg);
- page_obj_cgroups(page)[off] = objcg;
- mod_objcg_state(objcg, page_pgdat(page),
+ slab_objcgs(slab)[off] = objcg;
+ mod_objcg_state(objcg, slab_pgdat(slab),
cache_vmstat_idx(s), obj_full_size(s));
} else {
obj_cgroup_uncharge(objcg, obj_full_size(s));
@@ -345,48 +545,40 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
obj_cgroup_put(objcg);
}
-static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
+static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects)
{
- struct kmem_cache *s;
- struct obj_cgroup *objcg;
- struct page *page;
- unsigned int off;
+ struct obj_cgroup **objcgs;
int i;
- if (!memcg_kmem_enabled())
+ if (!memcg_kmem_online())
return;
- for (i = 0; i < objects; i++) {
- if (unlikely(!p[i]))
- continue;
-
- page = virt_to_head_page(p[i]);
- if (!page_has_obj_cgroups(page))
- continue;
+ objcgs = slab_objcgs(slab);
+ if (!objcgs)
+ return;
- if (!s_orig)
- s = page->slab_cache;
- else
- s = s_orig;
+ for (i = 0; i < objects; i++) {
+ struct obj_cgroup *objcg;
+ unsigned int off;
- off = obj_to_index(s, page, p[i]);
- objcg = page_obj_cgroups(page)[off];
+ off = obj_to_index(s, slab, p[i]);
+ objcg = objcgs[off];
if (!objcg)
continue;
- page_obj_cgroups(page)[off] = NULL;
+ objcgs[off] = NULL;
obj_cgroup_uncharge(objcg, obj_full_size(s));
- mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
+ mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s),
-obj_full_size(s));
obj_cgroup_put(objcg);
}
}
#else /* CONFIG_MEMCG_KMEM */
-static inline bool page_has_obj_cgroups(struct page *page)
+static inline struct obj_cgroup **slab_objcgs(struct slab *slab)
{
- return false;
+ return NULL;
}
static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
@@ -394,21 +586,23 @@ static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
return NULL;
}
-static inline int memcg_alloc_page_obj_cgroups(struct page *page,
- struct kmem_cache *s, gfp_t gfp)
+static inline int memcg_alloc_slab_cgroups(struct slab *slab,
+ struct kmem_cache *s, gfp_t gfp,
+ bool new_slab)
{
return 0;
}
-static inline void memcg_free_page_obj_cgroups(struct page *page)
+static inline void memcg_free_slab_cgroups(struct slab *slab)
{
}
-static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
- size_t objects,
- gfp_t flags)
+static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+ struct list_lru *lru,
+ struct obj_cgroup **objcgp,
+ size_t objects, gfp_t flags)
{
- return NULL;
+ return true;
}
static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
@@ -418,7 +612,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
{
}
-static inline void memcg_slab_free_hook(struct kmem_cache *s,
+static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects)
{
}
@@ -426,29 +620,32 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s,
static inline struct kmem_cache *virt_to_cache(const void *obj)
{
- struct page *page;
+ struct slab *slab;
- page = virt_to_head_page(obj);
- if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n",
+ slab = virt_to_slab(obj);
+ if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n",
__func__))
return NULL;
- return page->slab_cache;
+ return slab->slab_cache;
}
-static __always_inline void account_slab_page(struct page *page, int order,
- struct kmem_cache *s)
+static __always_inline void account_slab(struct slab *slab, int order,
+ struct kmem_cache *s, gfp_t gfp)
{
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
+ memcg_alloc_slab_cgroups(slab, s, gfp, true);
+
+ mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
PAGE_SIZE << order);
}
-static __always_inline void unaccount_slab_page(struct page *page, int order,
- struct kmem_cache *s)
+static __always_inline void unaccount_slab(struct slab *slab, int order,
+ struct kmem_cache *s)
{
- if (memcg_kmem_enabled())
- memcg_free_page_obj_cgroups(page);
+ if (memcg_kmem_online())
+ memcg_free_slab_cgroups(slab);
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
-(PAGE_SIZE << order));
}
@@ -468,6 +665,10 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
return cachep;
}
+void free_large_kmalloc(struct folio *folio, void *object);
+
+size_t __ksize(const void *objp);
+
static inline size_t slab_ksize(const struct kmem_cache *s)
{
#ifndef CONFIG_SLUB
@@ -499,52 +700,82 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
}
static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
+ struct list_lru *lru,
struct obj_cgroup **objcgp,
size_t size, gfp_t flags)
{
flags &= gfp_allowed_mask;
- fs_reclaim_acquire(flags);
- fs_reclaim_release(flags);
-
- might_sleep_if(gfpflags_allow_blocking(flags));
+ might_alloc(flags);
if (should_failslab(s, flags))
return NULL;
- if (memcg_kmem_enabled() &&
- ((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT)))
- *objcgp = memcg_slab_pre_alloc_hook(s, size, flags);
+ if (!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags))
+ return NULL;
return s;
}
static inline void slab_post_alloc_hook(struct kmem_cache *s,
- struct obj_cgroup *objcg,
- gfp_t flags, size_t size, void **p)
+ struct obj_cgroup *objcg, gfp_t flags,
+ size_t size, void **p, bool init,
+ unsigned int orig_size)
{
+ unsigned int zero_size = s->object_size;
+ bool kasan_init = init;
size_t i;
flags &= gfp_allowed_mask;
+
+ /*
+ * For a kmalloc object, the allocated memory size (object_size) is likely
+ * larger than the requested size (orig_size). If redzone checking is
+ * enabled for the extra space, don't zero it, as it will be redzoned
+ * soon. The redzone operation for this extra space can be seen as a
+ * replacement for the current poisoning under certain debug options, and
+ * won't break other sanity checks.
+ */
+ if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) &&
+ (s->flags & SLAB_KMALLOC))
+ zero_size = orig_size;
+
+ /*
+ * When slub_debug is enabled, avoid memory initialization integrated
+ * into KASAN and instead zero out the memory via the memset below with
+ * the proper size. Otherwise, KASAN might overwrite SLUB redzones and
+ * cause false-positive reports. This does not lead to a performance
+ * penalty on production builds, as slub_debug is not intended to be
+ * enabled there.
+ */
+ if (__slub_debug_enabled())
+ kasan_init = false;
+
+ /*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_slab_alloc and initialization memset must be
+ * kept together to avoid discrepancies in behavior.
+ *
+ * As p[i] might get tagged, memset and kmemleak hook come after KASAN.
+ */
for (i = 0; i < size; i++) {
- p[i] = kasan_slab_alloc(s, p[i], flags);
- /* As p[i] might get tagged, call kmemleak hook after KASAN. */
+ p[i] = kasan_slab_alloc(s, p[i], flags, kasan_init);
+ if (p[i] && init && (!kasan_init || !kasan_has_integrated_init()))
+ memset(p[i], 0, zero_size);
kmemleak_alloc_recursive(p[i], s->object_size, 1,
s->flags, flags);
+ kmsan_slab_alloc(s, p[i], flags);
}
- if (memcg_kmem_enabled())
- memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
+ memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
}
-#ifndef CONFIG_SLOB
/*
* The slab lists for all objects.
*/
struct kmem_cache_node {
- spinlock_t list_lock;
-
#ifdef CONFIG_SLAB
+ raw_spinlock_t list_lock;
struct list_head slabs_partial; /* partial list first, better asm code */
struct list_head slabs_full;
struct list_head slabs_free;
@@ -560,6 +791,7 @@ struct kmem_cache_node {
#endif
#ifdef CONFIG_SLUB
+ spinlock_t list_lock;
unsigned long nr_partial;
struct list_head partial;
#ifdef CONFIG_SLUB_DEBUG
@@ -584,12 +816,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
for (__node = 0; __node < nr_node_ids; __node++) \
if ((__n = get_node(__s, __node)))
-#endif
-
-void *slab_start(struct seq_file *m, loff_t *pos);
-void *slab_next(struct seq_file *m, void *p, loff_t *pos);
-void slab_stop(struct seq_file *m, void *p);
-int memcg_slab_show(struct seq_file *m, void *p);
#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
void dump_unreclaimable_slab(void);
@@ -616,7 +842,8 @@ static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c)
{
- if (static_branch_unlikely(&init_on_alloc)) {
+ if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
+ &init_on_alloc)) {
if (c->ctor)
return false;
if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))
@@ -628,10 +855,39 @@ static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c)
static inline bool slab_want_init_on_free(struct kmem_cache *c)
{
- if (static_branch_unlikely(&init_on_free))
+ if (static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON,
+ &init_on_free))
return !(c->ctor ||
(c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)));
return false;
}
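Editorial aside, not part of the patch: static_branch_maybe() picks the branch hint from the Kconfig default, roughly

	#define static_branch_maybe(config, x)				\
		(IS_ENABLED(config) ? static_branch_likely(x)		\
				    : static_branch_unlikely(x))

so a kernel built with CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y (or CONFIG_INIT_ON_FREE_DEFAULT_ON=y) treats the init path as the expected one rather than the unlikely one.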
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
+void debugfs_slab_release(struct kmem_cache *);
+#else
+static inline void debugfs_slab_release(struct kmem_cache *s) { }
+#endif
+
+#ifdef CONFIG_PRINTK
+#define KS_ADDRS_COUNT 16
+struct kmem_obj_info {
+ void *kp_ptr;
+ struct slab *kp_slab;
+ void *kp_objp;
+ unsigned long kp_data_offset;
+ struct kmem_cache *kp_slab_cache;
+ void *kp_ret;
+ void *kp_stack[KS_ADDRS_COUNT];
+ void *kp_free_stack[KS_ADDRS_COUNT];
+};
+void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab);
+#endif
+
+void __check_heap_object(const void *ptr, unsigned long n,
+ const struct slab *slab, bool to_user);
+
+#ifdef CONFIG_SLUB_DEBUG
+void skip_orig_size_check(struct kmem_cache *s, const void *object);
+#endif
+
#endif /* MM_SLAB_H */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index f9ccd5dc13f3..d1555ea2981a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -12,37 +12,33 @@
#include <linux/memory.h>
#include <linux/cache.h>
#include <linux/compiler.h>
+#include <linux/kfence.h>
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/uaccess.h>
#include <linux/seq_file.h>
+#include <linux/dma-mapping.h>
+#include <linux/swiotlb.h>
#include <linux/proc_fs.h>
#include <linux/debugfs.h>
+#include <linux/kasan.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
#include <linux/memcontrol.h>
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/kmem.h>
+#include <linux/stackdepot.h>
#include "internal.h"
-
#include "slab.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/kmem.h>
+
enum slab_state slab_state;
LIST_HEAD(slab_caches);
DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;
-#ifdef CONFIG_HARDENED_USERCOPY
-bool usercopy_fallback __ro_after_init =
- IS_ENABLED(CONFIG_HARDENED_USERCOPY_FALLBACK);
-module_param(usercopy_fallback, bool, 0400);
-MODULE_PARM_DESC(usercopy_fallback,
- "WARN instead of reject usercopy whitelist violations");
-#endif
-
static LIST_HEAD(slab_caches_to_rcu_destroy);
static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
@@ -53,7 +49,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
*/
#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
- SLAB_FAILSLAB | SLAB_KASAN)
+ SLAB_FAILSLAB | SLAB_NO_MERGE | kasan_never_merge())
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
@@ -69,11 +65,19 @@ static int __init setup_slab_nomerge(char *str)
return 1;
}
+static int __init setup_slab_merge(char *str)
+{
+ slab_nomerge = false;
+ return 1;
+}
+
#ifdef CONFIG_SLUB
__setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
+__setup_param("slub_merge", slub_merge, setup_slab_merge, 0);
#endif
__setup("slab_nomerge", setup_slab_nomerge);
+__setup("slab_merge", setup_slab_merge);
/*
* Determine the size of a slab object
@@ -87,8 +91,7 @@ EXPORT_SYMBOL(kmem_cache_size);
#ifdef CONFIG_DEBUG_VM
static int kmem_cache_sanity_check(const char *name, unsigned int size)
{
- if (!name || in_interrupt() || size < sizeof(void *) ||
- size > KMALLOC_MAX_SIZE) {
+ if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) {
pr_err("kmem_cache_create(%s) integrity check failed\n", name);
return -EINVAL;
}
@@ -103,33 +106,6 @@ static inline int kmem_cache_sanity_check(const char *name, unsigned int size)
}
#endif
-void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
-{
- size_t i;
-
- for (i = 0; i < nr; i++) {
- if (s)
- kmem_cache_free(s, p[i]);
- else
- kfree(p[i]);
- }
-}
-
-int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
- void **p)
-{
- size_t i;
-
- for (i = 0; i < nr; i++) {
- void *x = p[i] = kmem_cache_alloc(s, flags);
- if (!x) {
- __kmem_cache_free_bulk(s, i, p);
- return 0;
- }
- }
- return i;
-}
-
/*
* Figure out what the alignment of the objects will be given a set of
* flags, a user specified alignment and the size of the objects.
@@ -153,8 +129,7 @@ static unsigned int calculate_alignment(slab_flags_t flags,
align = max(align, ralign);
}
- if (align < ARCH_SLAB_MINALIGN)
- align = ARCH_SLAB_MINALIGN;
+ align = max(align, arch_slab_minalign());
return ALIGN(align, sizeof(void *));
}
@@ -170,8 +145,10 @@ int slab_unmergeable(struct kmem_cache *s)
if (s->ctor)
return 1;
+#ifdef CONFIG_HARDENED_USERCOPY
if (s->usersize)
return 1;
+#endif
/*
* We may have set a slab to be unmergeable during bootstrap.
@@ -196,7 +173,7 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
size = ALIGN(size, sizeof(void *));
align = calculate_alignment(flags, align, size);
size = ALIGN(size, align);
- flags = kmem_cache_flags(size, flags, name, NULL);
+ flags = kmem_cache_flags(size, flags, name);
if (flags & SLAB_NEVER_MERGE)
return NULL;
@@ -250,8 +227,10 @@ static struct kmem_cache *create_cache(const char *name,
s->size = s->object_size = object_size;
s->align = align;
s->ctor = ctor;
+#ifdef CONFIG_HARDENED_USERCOPY
s->useroffset = useroffset;
s->usersize = usersize;
+#endif
err = __kmem_cache_create(s, flags);
if (err)
@@ -259,14 +238,12 @@ static struct kmem_cache *create_cache(const char *name,
s->refcount = 1;
list_add(&s->list, &slab_caches);
-out:
- if (err)
- return ERR_PTR(err);
return s;
out_free_cache:
kmem_cache_free(kmem_cache, s);
- goto out;
+out:
+ return ERR_PTR(err);
}
/**
@@ -308,8 +285,19 @@ kmem_cache_create_usercopy(const char *name,
const char *cache_name;
int err;
- get_online_cpus();
- get_online_mems();
+#ifdef CONFIG_SLUB_DEBUG
+ /*
+ * If no slub_debug was enabled globally, the static key is not yet
+ * enabled by setup_slub_debug(). Enable it if the cache is being
+ * created with any of the debugging flags passed explicitly.
+ * It's also possible that this is the first cache created with
+ * SLAB_STORE_USER and we should init stack_depot for it.
+ */
+ if (flags & SLAB_DEBUG_FLAGS)
+ static_branch_enable(&slub_debug_enabled);
+ if (flags & SLAB_STORE_USER)
+ stack_depot_init();
+#endif
mutex_lock(&slab_mutex);
@@ -333,7 +321,8 @@ kmem_cache_create_usercopy(const char *name,
flags &= CACHE_CREATE_MASK;
/* Fail closed on bad usersize of useroffset values. */
- if (WARN_ON(!usersize && useroffset) ||
+ if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) ||
+ WARN_ON(!usersize && useroffset) ||
WARN_ON(size < usersize || size - usersize < useroffset))
usersize = useroffset = 0;
@@ -359,16 +348,13 @@ kmem_cache_create_usercopy(const char *name,
out_unlock:
mutex_unlock(&slab_mutex);
- put_online_mems();
- put_online_cpus();
-
if (err) {
if (flags & SLAB_PANIC)
- panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
- name, err);
+ panic("%s: Failed to create slab '%s'. Error %d\n",
+ __func__, name, err);
else {
- pr_warn("kmem_cache_create(%s) failed with error %d\n",
- name, err);
+ pr_warn("%s(%s) failed with error %d\n",
+ __func__, name, err);
dump_stack();
}
return NULL;
@@ -411,6 +397,28 @@ kmem_cache_create(const char *name, unsigned int size, unsigned int align,
}
EXPORT_SYMBOL(kmem_cache_create);
+#ifdef SLAB_SUPPORTS_SYSFS
+/*
+ * For a given kmem_cache, kmem_cache_destroy() should only be called
+ * once or there will be a use-after-free problem. The actual deletion
+ * and release of the kobject do not need slab_mutex or cpu_hotplug_lock
+ * protection, so they are now done without holding those locks.
+ *
+ * Note that there will be a slight delay in the deletion of sysfs files
+ * if kmem_cache_release() is called indirectly from a work function.
+ */
+static void kmem_cache_release(struct kmem_cache *s)
+{
+ sysfs_slab_unlink(s);
+ sysfs_slab_release(s);
+}
+#else
+static void kmem_cache_release(struct kmem_cache *s)
+{
+ slab_kmem_cache_release(s);
+}
+#endif
+
static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
{
LIST_HEAD(to_destroy);
@@ -435,11 +443,9 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
rcu_barrier();
list_for_each_entry_safe(s, s2, &to_destroy, list) {
-#ifdef SLAB_SUPPORTS_SYSFS
- sysfs_slab_release(s);
-#else
- slab_kmem_cache_release(s);
-#endif
+ debugfs_slab_release(s);
+ kfence_shutdown_cache(s);
+ kmem_cache_release(s);
}
}
@@ -454,18 +460,11 @@ static int shutdown_cache(struct kmem_cache *s)
list_del(&s->list);
if (s->flags & SLAB_TYPESAFE_BY_RCU) {
-#ifdef SLAB_SUPPORTS_SYSFS
- sysfs_slab_unlink(s);
-#endif
list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
schedule_work(&slab_caches_to_rcu_destroy_work);
} else {
-#ifdef SLAB_SUPPORTS_SYSFS
- sysfs_slab_unlink(s);
- sysfs_slab_release(s);
-#else
- slab_kmem_cache_release(s);
-#endif
+ kfence_shutdown_cache(s);
+ debugfs_slab_release(s);
}
return 0;
@@ -480,31 +479,29 @@ void slab_kmem_cache_release(struct kmem_cache *s)
void kmem_cache_destroy(struct kmem_cache *s)
{
- int err;
+ int refcnt;
+ bool rcu_set;
- if (unlikely(!s))
+ if (unlikely(!s) || !kasan_check_byte(s))
return;
- get_online_cpus();
- get_online_mems();
-
+ cpus_read_lock();
mutex_lock(&slab_mutex);
- s->refcount--;
- if (s->refcount)
+ rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;
+
+ refcnt = --s->refcount;
+ if (refcnt)
goto out_unlock;
- err = shutdown_cache(s);
- if (err) {
- pr_err("kmem_cache_destroy %s: Slab cache still has objects\n",
- s->name);
- dump_stack();
- }
+ WARN(shutdown_cache(s),
+ "%s %s: Slab cache still has objects when called from %pS",
+ __func__, s->name, (void *)_RET_IP_);
out_unlock:
mutex_unlock(&slab_mutex);
-
- put_online_mems();
- put_online_cpus();
+ cpus_read_unlock();
+ if (!refcnt && !rcu_set)
+ kmem_cache_release(s);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -519,15 +516,9 @@ EXPORT_SYMBOL(kmem_cache_destroy);
*/
int kmem_cache_shrink(struct kmem_cache *cachep)
{
- int ret;
-
- get_online_cpus();
- get_online_mems();
kasan_cache_shrink(cachep);
- ret = __kmem_cache_shrink(cachep);
- put_online_mems();
- put_online_cpus();
- return ret;
+
+ return __kmem_cache_shrink(cachep);
}
EXPORT_SYMBOL(kmem_cache_shrink);
@@ -536,7 +527,104 @@ bool slab_is_available(void)
return slab_state >= UP;
}
-#ifndef CONFIG_SLOB
+#ifdef CONFIG_PRINTK
+/**
+ * kmem_valid_obj - does the pointer reference a valid slab object?
+ * @object: pointer to query.
+ *
+ * Return: %true if the pointer is to a not-yet-freed object from
+ * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer
+ * is to an already-freed object, and %false otherwise.
+ */
+bool kmem_valid_obj(void *object)
+{
+ struct folio *folio;
+
+ /* Some arches consider ZERO_SIZE_PTR to be a valid address. */
+ if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
+ return false;
+ folio = virt_to_folio(object);
+ return folio_test_slab(folio);
+}
+EXPORT_SYMBOL_GPL(kmem_valid_obj);
+
+static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
+{
+ if (__kfence_obj_info(kpp, object, slab))
+ return;
+ __kmem_obj_info(kpp, object, slab);
+}
+
+/**
+ * kmem_dump_obj - Print available slab provenance information
+ * @object: slab object for which to find provenance information.
+ *
+ * This function uses pr_cont(), so that the caller is expected to have
+ * printed out whatever preamble is appropriate. The provenance information
+ * depends on the type of object and on how much debugging is enabled.
+ * For a slab-cache object, the fact that it is a slab object is printed,
+ * and, if available, the slab name, return address, and stack trace from
+ * the allocation and last free path of that object.
+ *
+ * This function will splat if passed a pointer to a non-slab object.
+ * If you are not sure what type of object you have, you should instead
+ * use mem_dump_obj().
+ */
+void kmem_dump_obj(void *object)
+{
+ char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc";
+ int i;
+ struct slab *slab;
+ unsigned long ptroffset;
+ struct kmem_obj_info kp = { };
+
+ if (WARN_ON_ONCE(!virt_addr_valid(object)))
+ return;
+ slab = virt_to_slab(object);
+ if (WARN_ON_ONCE(!slab)) {
+ pr_cont(" non-slab memory.\n");
+ return;
+ }
+ kmem_obj_info(&kp, object, slab);
+ if (kp.kp_slab_cache)
+ pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name);
+ else
+ pr_cont(" slab%s", cp);
+ if (is_kfence_address(object))
+ pr_cont(" (kfence)");
+ if (kp.kp_objp)
+ pr_cont(" start %px", kp.kp_objp);
+ if (kp.kp_data_offset)
+ pr_cont(" data offset %lu", kp.kp_data_offset);
+ if (kp.kp_objp) {
+ ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset;
+ pr_cont(" pointer offset %lu", ptroffset);
+ }
+ if (kp.kp_slab_cache && kp.kp_slab_cache->object_size)
+ pr_cont(" size %u", kp.kp_slab_cache->object_size);
+ if (kp.kp_ret)
+ pr_cont(" allocated at %pS\n", kp.kp_ret);
+ else
+ pr_cont("\n");
+ for (i = 0; i < ARRAY_SIZE(kp.kp_stack); i++) {
+ if (!kp.kp_stack[i])
+ break;
+ pr_info(" %pS\n", kp.kp_stack[i]);
+ }
+
+ if (kp.kp_free_stack[0])
+ pr_cont(" Free path:\n");
+
+ for (i = 0; i < ARRAY_SIZE(kp.kp_free_stack); i++) {
+ if (!kp.kp_free_stack[i])
+ break;
+ pr_info(" %pS\n", kp.kp_free_stack[i]);
+ }
+
+}
+EXPORT_SYMBOL_GPL(kmem_dump_obj);
+#endif
+
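Editorial illustration, not part of the patch: a typical debugging caller checks the pointer with kmem_valid_obj() first and then lets kmem_dump_obj() append the slab details to its own message, since the function prints with pr_cont(). Hypothetical helper:

	static void report_slab_object(void *ptr)
	{
		if (!kmem_valid_obj(ptr)) {
			pr_info("%px: not a slab object\n", ptr);
			return;
		}
		pr_info("%px:", ptr);
		kmem_dump_obj(ptr);	/* appends cache name, offsets, alloc/free stacks */
	}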
/* Create a cache during boot when no slab services are available yet */
void __init create_boot_cache(struct kmem_cache *s, const char *name,
unsigned int size, slab_flags_t flags,
@@ -556,8 +644,10 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name,
align = max(align, size);
s->align = calculate_alignment(flags, align, size);
+#ifdef CONFIG_HARDENED_USERCOPY
s->useroffset = useroffset;
s->usersize = usersize;
+#endif
err = __kmem_cache_create(s, flags);
@@ -568,16 +658,16 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name,
s->refcount = -1; /* Exempt from merging for now */
}
-struct kmem_cache *__init create_kmalloc_cache(const char *name,
- unsigned int size, slab_flags_t flags,
- unsigned int useroffset, unsigned int usersize)
+static struct kmem_cache *__init create_kmalloc_cache(const char *name,
+ unsigned int size,
+ slab_flags_t flags)
{
struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
if (!s)
panic("Out of memory when creating slab %s\n", name);
- create_boot_cache(s, name, size, flags, useroffset, usersize);
+ create_boot_cache(s, name, size, flags | SLAB_KMALLOC, 0, size);
list_add(&s->list, &slab_caches);
s->refcount = 1;
return s;
@@ -648,27 +738,57 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
return kmalloc_caches[kmalloc_type(flags)][index];
}
-#ifdef CONFIG_ZONE_DMA
-#define INIT_KMALLOC_INFO(__size, __short_size) \
-{ \
- .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
- .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \
- .name[KMALLOC_DMA] = "dma-kmalloc-" #__short_size, \
- .size = __size, \
+size_t kmalloc_size_roundup(size_t size)
+{
+ struct kmem_cache *c;
+
+ /* Short-circuit the 0 size case. */
+ if (unlikely(size == 0))
+ return 0;
+ /* Short-circuit saturated "too-large" case. */
+ if (unlikely(size == SIZE_MAX))
+ return SIZE_MAX;
+ /* Above the smaller buckets, size is a multiple of page size. */
+ if (size > KMALLOC_MAX_CACHE_SIZE)
+ return PAGE_SIZE << get_order(size);
+
+ /* The flags don't matter since size_index is common to all. */
+ c = kmalloc_slab(size, GFP_KERNEL);
+ return c ? c->object_size : 0;
}
+EXPORT_SYMBOL(kmalloc_size_roundup);
+
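A hedged usage sketch (the helper below is invented for illustration): rounding the request up front lets a caller track and use the full size kmalloc() will actually hand back, instead of querying it after the fact.

/* Hypothetical helper: allocate and report the real usable size. */
static u8 *alloc_full(size_t want, size_t *full, gfp_t gfp)
{
	*full = kmalloc_size_roundup(want);	/* e.g. 1000 -> 1024 with default caches */
	if (!*full)
		return NULL;
	return kmalloc(*full, gfp);
}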
+#ifdef CONFIG_ZONE_DMA
+#define KMALLOC_DMA_NAME(sz) .name[KMALLOC_DMA] = "dma-kmalloc-" #sz,
+#else
+#define KMALLOC_DMA_NAME(sz)
+#endif
+
+#ifdef CONFIG_MEMCG_KMEM
+#define KMALLOC_CGROUP_NAME(sz) .name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz,
+#else
+#define KMALLOC_CGROUP_NAME(sz)
+#endif
+
+#ifndef CONFIG_SLUB_TINY
+#define KMALLOC_RCL_NAME(sz) .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #sz,
#else
+#define KMALLOC_RCL_NAME(sz)
+#endif
+
#define INIT_KMALLOC_INFO(__size, __short_size) \
{ \
.name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
- .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \
+ KMALLOC_RCL_NAME(__short_size) \
+ KMALLOC_CGROUP_NAME(__short_size) \
+ KMALLOC_DMA_NAME(__short_size) \
.size = __size, \
}
-#endif
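For illustration, assuming CONFIG_ZONE_DMA and CONFIG_MEMCG_KMEM are enabled and CONFIG_SLUB_TINY is not, INIT_KMALLOC_INFO(64, 64) now expands to roughly:

{
	.name[KMALLOC_NORMAL]	= "kmalloc-64",
	.name[KMALLOC_RECLAIM]	= "kmalloc-rcl-64",
	.name[KMALLOC_CGROUP]	= "kmalloc-cg-64",
	.name[KMALLOC_DMA]	= "dma-kmalloc-64",
	.size = 64,
},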
/*
* kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
- * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
- * kmalloc-67108864.
+ * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is
+ * kmalloc-2M.
*/
const struct kmalloc_info_struct kmalloc_info[] __initconst = {
INIT_KMALLOC_INFO(0, 0),
@@ -692,12 +812,7 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = {
INIT_KMALLOC_INFO(262144, 256k),
INIT_KMALLOC_INFO(524288, 512k),
INIT_KMALLOC_INFO(1048576, 1M),
- INIT_KMALLOC_INFO(2097152, 2M),
- INIT_KMALLOC_INFO(4194304, 4M),
- INIT_KMALLOC_INFO(8388608, 8M),
- INIT_KMALLOC_INFO(16777216, 16M),
- INIT_KMALLOC_INFO(33554432, 32M),
- INIT_KMALLOC_INFO(67108864, 64M)
+ INIT_KMALLOC_INFO(2097152, 2M)
};
/*
@@ -716,7 +831,7 @@ void __init setup_kmalloc_cache_index_table(void)
unsigned int i;
BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
- (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
+ !is_power_of_2(KMALLOC_MIN_SIZE));
for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
unsigned int elem = size_index_elem(i);
@@ -728,7 +843,7 @@ void __init setup_kmalloc_cache_index_table(void)
if (KMALLOC_MIN_SIZE >= 64) {
/*
- * The 96 byte size cache is not used if the alignment
+ * The 96 byte sized cache is not used if the alignment
* is 64 byte.
*/
for (i = 64 + 8; i <= 96; i += 8)
@@ -747,16 +862,52 @@ void __init setup_kmalloc_cache_index_table(void)
}
}
-static void __init
+static unsigned int __kmalloc_minalign(void)
+{
+#ifdef CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC
+ if (io_tlb_default_mem.nslabs)
+ return ARCH_KMALLOC_MINALIGN;
+#endif
+ return dma_get_cache_alignment();
+}
+
+void __init
new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
{
- if (type == KMALLOC_RECLAIM)
+ unsigned int minalign = __kmalloc_minalign();
+ unsigned int aligned_size = kmalloc_info[idx].size;
+ int aligned_idx = idx;
+
+ if ((KMALLOC_RECLAIM != KMALLOC_NORMAL) && (type == KMALLOC_RECLAIM)) {
flags |= SLAB_RECLAIM_ACCOUNT;
+ } else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) {
+ if (mem_cgroup_kmem_disabled()) {
+ kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx];
+ return;
+ }
+ flags |= SLAB_ACCOUNT;
+ } else if (IS_ENABLED(CONFIG_ZONE_DMA) && (type == KMALLOC_DMA)) {
+ flags |= SLAB_CACHE_DMA;
+ }
+
+ /*
+ * If CONFIG_MEMCG_KMEM is enabled, disable cache merging for
+ * KMALLOC_NORMAL caches.
+ */
+ if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_NORMAL))
+ flags |= SLAB_NO_MERGE;
+
+ if (minalign > ARCH_KMALLOC_MINALIGN) {
+ aligned_size = ALIGN(aligned_size, minalign);
+ aligned_idx = __kmalloc_index(aligned_size, false);
+ }
- kmalloc_caches[type][idx] = create_kmalloc_cache(
- kmalloc_info[idx].name[type],
- kmalloc_info[idx].size, flags, 0,
- kmalloc_info[idx].size);
+ if (!kmalloc_caches[type][aligned_idx])
+ kmalloc_caches[type][aligned_idx] = create_kmalloc_cache(
+ kmalloc_info[aligned_idx].name[type],
+ aligned_size, flags);
+ if (idx != aligned_idx)
+ kmalloc_caches[type][idx] = kmalloc_caches[type][aligned_idx];
}
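A worked example of the alignment fallback above (assumptions: no SWIOTLB bounce buffer is set up and dma_get_cache_alignment() returns 128, so minalign exceeds ARCH_KMALLOC_MINALIGN):

/*
 * For the kmalloc-96 slot: aligned_size = ALIGN(96, 128) = 128 and
 * aligned_idx = __kmalloc_index(128) = 7, so no under-aligned 96-byte
 * cache is created; the 96-byte entry simply reuses kmalloc-128.
 */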
/*
@@ -769,7 +920,10 @@ void __init create_kmalloc_caches(slab_flags_t flags)
int i;
enum kmalloc_cache_type type;
- for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
+ /*
+ * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM is defined
+ */
+ for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) {
for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
if (!kmalloc_caches[type][i])
new_kmalloc_cache(i, type, flags);
@@ -790,22 +944,156 @@ void __init create_kmalloc_caches(slab_flags_t flags)
/* Kmalloc array is now usable */
slab_state = UP;
+}
-#ifdef CONFIG_ZONE_DMA
- for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
- struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i];
-
- if (s) {
- kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
- kmalloc_info[i].name[KMALLOC_DMA],
- kmalloc_info[i].size,
- SLAB_CACHE_DMA | flags, 0,
- kmalloc_info[i].size);
- }
+void free_large_kmalloc(struct folio *folio, void *object)
+{
+ unsigned int order = folio_order(folio);
+
+ if (WARN_ON_ONCE(order == 0))
+ pr_warn_once("object pointer: 0x%p\n", object);
+
+ kmemleak_free(object);
+ kasan_kfree_large(object);
+ kmsan_kfree_large(object);
+
+ mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
+ __free_pages(folio_page(folio, 0), order);
+}
+
+static void *__kmalloc_large_node(size_t size, gfp_t flags, int node);
+static __always_inline
+void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
+{
+ struct kmem_cache *s;
+ void *ret;
+
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
+ ret = __kmalloc_large_node(size, flags, node);
+ trace_kmalloc(caller, ret, size,
+ PAGE_SIZE << get_order(size), flags, node);
+ return ret;
+ }
+
+ s = kmalloc_slab(size, flags);
+
+ if (unlikely(ZERO_OR_NULL_PTR(s)))
+ return s;
+
+ ret = __kmem_cache_alloc_node(s, flags, node, size, caller);
+ ret = kasan_kmalloc(s, ret, size, flags);
+ trace_kmalloc(caller, ret, size, s->size, flags, node);
+ return ret;
+}
+
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+ return __do_kmalloc_node(size, flags, node, _RET_IP_);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+
+void *__kmalloc(size_t size, gfp_t flags)
+{
+ return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_);
+}
+EXPORT_SYMBOL(__kmalloc);
+
+void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
+ int node, unsigned long caller)
+{
+ return __do_kmalloc_node(size, flags, node, caller);
+}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
+
+/**
+ * kfree - free previously allocated memory
+ * @object: pointer returned by kmalloc() or kmem_cache_alloc()
+ *
+ * If @object is NULL, no operation is performed.
+ */
+void kfree(const void *object)
+{
+ struct folio *folio;
+ struct slab *slab;
+ struct kmem_cache *s;
+
+ trace_kfree(_RET_IP_, object);
+
+ if (unlikely(ZERO_OR_NULL_PTR(object)))
+ return;
+
+ folio = virt_to_folio(object);
+ if (unlikely(!folio_test_slab(folio))) {
+ free_large_kmalloc(folio, (void *)object);
+ return;
+ }
+
+ slab = folio_slab(folio);
+ s = slab->slab_cache;
+ __kmem_cache_free(s, (void *)object, _RET_IP_);
+}
+EXPORT_SYMBOL(kfree);
+
+/**
+ * __ksize -- Report full size of underlying allocation
+ * @object: pointer to the object
+ *
+ * This should only be used internally to query the true size of allocations.
+ * It is not meant to be a way to discover the usable size of an allocation
+ * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond
+ * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS,
+ * and/or FORTIFY_SOURCE.
+ *
+ * Return: size of the actual memory used by @object in bytes
+ */
+size_t __ksize(const void *object)
+{
+ struct folio *folio;
+
+ if (unlikely(object == ZERO_SIZE_PTR))
+ return 0;
+
+ folio = virt_to_folio(object);
+
+ if (unlikely(!folio_test_slab(folio))) {
+ if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE))
+ return 0;
+ if (WARN_ON(object != folio_address(folio)))
+ return 0;
+ return folio_size(folio);
}
+
+#ifdef CONFIG_SLUB_DEBUG
+ skip_orig_size_check(folio_slab(folio)->slab_cache, object);
#endif
+
+ return slab_ksize(folio_slab(folio)->slab_cache);
+}
+
+void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
+{
+ void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE,
+ size, _RET_IP_);
+
+ trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE);
+
+ ret = kasan_kmalloc(s, ret, size, gfpflags);
+ return ret;
}
-#endif /* !CONFIG_SLOB */
+EXPORT_SYMBOL(kmalloc_trace);
+
+void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
+ int node, size_t size)
+{
+ void *ret = __kmem_cache_alloc_node(s, gfpflags, node, size, _RET_IP_);
+
+ trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node);
+
+ ret = kasan_kmalloc(s, ret, size, gfpflags);
+ return ret;
+}
+EXPORT_SYMBOL(kmalloc_node_trace);
gfp_t kmalloc_fix_flags(gfp_t flags)
{
@@ -824,41 +1112,55 @@ gfp_t kmalloc_fix_flags(gfp_t flags)
* directly to the page allocator. We use __GFP_COMP, because we will need to
* know the allocation order to free the pages properly in kfree.
*/
-void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
+
+static void *__kmalloc_large_node(size_t size, gfp_t flags, int node)
{
- void *ret = NULL;
struct page *page;
+ void *ptr = NULL;
+ unsigned int order = get_order(size);
if (unlikely(flags & GFP_SLAB_BUG_MASK))
flags = kmalloc_fix_flags(flags);
flags |= __GFP_COMP;
- page = alloc_pages(flags, order);
- if (likely(page)) {
- ret = page_address(page);
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
- PAGE_SIZE << order);
+ page = alloc_pages_node(node, flags, order);
+ if (page) {
+ ptr = page_address(page);
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
+ PAGE_SIZE << order);
}
- ret = kasan_kmalloc_large(ret, size, flags);
- /* As ret might get tagged, call kmemleak hook after KASAN. */
- kmemleak_alloc(ret, size, 1, flags);
+
+ ptr = kasan_kmalloc_large(ptr, size, flags);
+ /* As ptr might get tagged, call kmemleak hook after KASAN. */
+ kmemleak_alloc(ptr, size, 1, flags);
+ kmsan_kmalloc_large(ptr, size, flags);
+
+ return ptr;
+}
+
+void *kmalloc_large(size_t size, gfp_t flags)
+{
+ void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE);
+
+ trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
+ flags, NUMA_NO_NODE);
return ret;
}
-EXPORT_SYMBOL(kmalloc_order);
+EXPORT_SYMBOL(kmalloc_large);
-#ifdef CONFIG_TRACING
-void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+void *kmalloc_large_node(size_t size, gfp_t flags, int node)
{
- void *ret = kmalloc_order(size, flags, order);
- trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
+ void *ret = __kmalloc_large_node(size, flags, node);
+
+ trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
+ flags, node);
return ret;
}
-EXPORT_SYMBOL(kmalloc_order_trace);
-#endif
+EXPORT_SYMBOL(kmalloc_large_node);
#ifdef CONFIG_SLAB_FREELIST_RANDOM
/* Randomize a generic freelist */
-static void freelist_randomize(struct rnd_state *state, unsigned int *list,
+static void freelist_randomize(unsigned int *list,
unsigned int count)
{
unsigned int rand;
@@ -869,8 +1171,7 @@ static void freelist_randomize(struct rnd_state *state, unsigned int *list,
/* Fisher-Yates shuffle */
for (i = count - 1; i > 0; i--) {
- rand = prandom_u32_state(state);
- rand %= (i + 1);
+ rand = get_random_u32_below(i + 1);
swap(list[i], list[rand]);
}
}
@@ -879,7 +1180,6 @@ static void freelist_randomize(struct rnd_state *state, unsigned int *list,
int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
gfp_t gfp)
{
- struct rnd_state state;
if (count < 2 || cachep->random_seq)
return 0;
@@ -888,10 +1188,7 @@ int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
if (!cachep->random_seq)
return -ENOMEM;
- /* Get best entropy at this stage of boot */
- prandom_seed_state(&state, get_random_long());
-
- freelist_randomize(&state, cachep->random_seq, count);
+ freelist_randomize(cachep->random_seq, count);
return 0;
}
@@ -931,18 +1228,18 @@ static void print_slabinfo_header(struct seq_file *m)
seq_putc(m, '\n');
}
-void *slab_start(struct seq_file *m, loff_t *pos)
+static void *slab_start(struct seq_file *m, loff_t *pos)
{
mutex_lock(&slab_mutex);
return seq_list_start(&slab_caches, *pos);
}
-void *slab_next(struct seq_file *m, void *p, loff_t *pos)
+static void *slab_next(struct seq_file *m, void *p, loff_t *pos)
{
return seq_list_next(p, &slab_caches, pos);
}
-void slab_stop(struct seq_file *m, void *p)
+static void slab_stop(struct seq_file *m, void *p)
{
mutex_unlock(&slab_mutex);
}
@@ -978,7 +1275,7 @@ static int slab_show(struct seq_file *m, void *p)
void dump_unreclaimable_slab(void)
{
- struct kmem_cache *s, *s2;
+ struct kmem_cache *s;
struct slabinfo sinfo;
/*
@@ -996,7 +1293,7 @@ void dump_unreclaimable_slab(void)
pr_info("Unreclaimable slab info:\n");
pr_info("Name Used Total\n");
- list_for_each_entry_safe(s, s2, &slab_caches, list) {
+ list_for_each_entry(s, &slab_caches, list) {
if (s->flags & SLAB_RECLAIM_ACCOUNT)
continue;
@@ -1010,17 +1307,6 @@ void dump_unreclaimable_slab(void)
mutex_unlock(&slab_mutex);
}
-#if defined(CONFIG_MEMCG_KMEM)
-int memcg_slab_show(struct seq_file *m, void *p)
-{
- /*
- * Deprecated.
- * Please, take a look at tools/cgroup/slabinfo.py .
- */
- return 0;
-}
-#endif
-
/*
* slabinfo_op - iterator that generates /proc/slabinfo
*
@@ -1064,22 +1350,33 @@ module_init(slab_proc_init);
#endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */
-static __always_inline void *__do_krealloc(const void *p, size_t new_size,
- gfp_t flags)
+static __always_inline __realloc_size(2) void *
+__do_krealloc(const void *p, size_t new_size, gfp_t flags)
{
void *ret;
size_t ks;
- ks = ksize(p);
+ /* Check for double-free before calling ksize. */
+ if (likely(!ZERO_OR_NULL_PTR(p))) {
+ if (!kasan_check_byte(p))
+ return NULL;
+ ks = ksize(p);
+ } else
+ ks = 0;
+ /* If the object still fits, repoison it precisely. */
if (ks >= new_size) {
p = kasan_krealloc((void *)p, new_size, flags);
return (void *)p;
}
ret = kmalloc_track_caller(new_size, flags);
- if (ret && p)
- memcpy(ret, p, ks);
+ if (ret && p) {
+ /* Disable KASAN checks as the object's redzone is accessed. */
+ kasan_disable_current();
+ memcpy(ret, kasan_reset_tag(p), ks);
+ kasan_enable_current();
+ }
return ret;
}
@@ -1091,9 +1388,9 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
* @flags: the type of memory to allocate.
*
* The contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes. If @p is %NULL, krealloc()
- * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
- * %NULL pointer, the object pointed to is freed.
+ * lesser of the new and old sizes (__GFP_ZERO flag is effectively ignored).
+ * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size
+ * is 0 and @p is not a %NULL pointer, the object pointed to is freed.
*
* Return: pointer to the allocated memory or %NULL in case of error
*/
@@ -1131,61 +1428,41 @@ void kfree_sensitive(const void *p)
void *mem = (void *)p;
ks = ksize(mem);
- if (ks)
+ if (ks) {
+ kasan_unpoison_range(mem, ks);
memzero_explicit(mem, ks);
+ }
kfree(mem);
}
EXPORT_SYMBOL(kfree_sensitive);
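A short usage sketch (the surrounding code is hypothetical): kfree_sensitive() is meant for buffers holding secrets, since the whole underlying allocation is zeroed before it is returned to the allocator.

static void handle_secret(const u8 *src, size_t len)
{
	u8 *key = kmemdup(src, len, GFP_KERNEL);

	if (!key)
		return;
	/* ... use the key ... */
	kfree_sensitive(key);	/* memzero_explicit() the full allocation, then free */
}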
-/**
- * ksize - get the actual amount of memory allocated for a given object
- * @objp: Pointer to the object
- *
- * kmalloc may internally round up allocations and return more memory
- * than requested. ksize() can be used to determine the actual amount of
- * memory allocated. The caller may use this additional memory, even though
- * a smaller amount of memory was initially specified with the kmalloc call.
- * The caller must guarantee that objp points to a valid object previously
- * allocated with either kmalloc() or kmem_cache_alloc(). The object
- * must not be freed during the duration of the call.
- *
- * Return: size of the actual memory used by @objp in bytes
- */
size_t ksize(const void *objp)
{
- size_t size;
-
/*
- * We need to check that the pointed to object is valid, and only then
- * unpoison the shadow memory below. We use __kasan_check_read(), to
- * generate a more useful report at the time ksize() is called (rather
- * than later where behaviour is undefined due to potential
- * use-after-free or double-free).
+ * We need to first check that the pointer to the object is valid.
+ * The KASAN report printed from ksize() is more useful than one printed
+ * later, when the behaviour could be undefined due to a potential
+ * use-after-free or double-free.
*
- * If the pointed to memory is invalid we return 0, to avoid users of
+ * We use kasan_check_byte(), which is supported for the hardware
+ * tag-based KASAN mode, unlike kasan_check_read/write().
+ *
+ * If the pointed to memory is invalid, we return 0 to avoid users of
* ksize() writing to and potentially corrupting the memory region.
*
* We want to perform the check before __ksize(), to avoid potentially
* crashing in __ksize() due to accessing invalid metadata.
*/
- if (unlikely(ZERO_OR_NULL_PTR(objp)) || !__kasan_check_read(objp, 1))
+ if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp))
return 0;
- size = __ksize(objp);
- /*
- * We assume that ksize callers could use whole allocated area,
- * so we need to unpoison this area.
- */
- kasan_unpoison_shadow(objp, size);
- return size;
+ return kfence_ksize(objp) ?: __ksize(objp);
}
EXPORT_SYMBOL(ksize);
/* Tracepoints definitions. */
EXPORT_TRACEPOINT_SYMBOL(kmalloc);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
-EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
-EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
EXPORT_TRACEPOINT_SYMBOL(kfree);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
diff --git a/mm/slob.c b/mm/slob.c
deleted file mode 100644
index 7cc9805c8091..000000000000
--- a/mm/slob.c
+++ /dev/null
@@ -1,720 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * SLOB Allocator: Simple List Of Blocks
- *
- * Matt Mackall <mpm@selenic.com> 12/30/03
- *
- * NUMA support by Paul Mundt, 2007.
- *
- * How SLOB works:
- *
- * The core of SLOB is a traditional K&R style heap allocator, with
- * support for returning aligned objects. The granularity of this
- * allocator is as little as 2 bytes, however typically most architectures
- * will require 4 bytes on 32-bit and 8 bytes on 64-bit.
- *
- * The slob heap is a set of linked list of pages from alloc_pages(),
- * and within each page, there is a singly-linked list of free blocks
- * (slob_t). The heap is grown on demand. To reduce fragmentation,
- * heap pages are segregated into three lists, with objects less than
- * 256 bytes, objects less than 1024 bytes, and all other objects.
- *
- * Allocation from heap involves first searching for a page with
- * sufficient free blocks (using a next-fit-like approach) followed by
- * a first-fit scan of the page. Deallocation inserts objects back
- * into the free list in address order, so this is effectively an
- * address-ordered first fit.
- *
- * Above this is an implementation of kmalloc/kfree. Blocks returned
- * from kmalloc are prepended with a 4-byte header with the kmalloc size.
- * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
- * alloc_pages() directly, allocating compound pages so the page order
- * does not have to be separately tracked.
- * These objects are detected in kfree() because PageSlab()
- * is false for them.
- *
- * SLAB is emulated on top of SLOB by simply calling constructors and
- * destructors for every SLAB allocation. Objects are returned with the
- * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which
- * case the low-level allocator will fragment blocks to create the proper
- * alignment. Again, objects of page-size or greater are allocated by
- * calling alloc_pages(). As SLAB objects know their size, no separate
- * size bookkeeping is necessary and there is essentially no allocation
- * space overhead, and compound pages aren't needed for multi-page
- * allocations.
- *
- * NUMA support in SLOB is fairly simplistic, pushing most of the real
- * logic down to the page allocator, and simply doing the node accounting
- * on the upper levels. In the event that a node id is explicitly
- * provided, __alloc_pages_node() with the specified node id is used
- * instead. The common case (or when the node id isn't explicitly provided)
- * will default to the current node, as per numa_node_id().
- *
- * Node aware pages are still inserted in to the global freelist, and
- * these are scanned for by matching against the node id encoded in the
- * page flags. As a result, block allocations that can be satisfied from
- * the freelist will only be done so on pages residing on the same node,
- * in order to prevent random node placement.
- */
-
-#include <linux/kernel.h>
-#include <linux/slab.h>
-
-#include <linux/mm.h>
-#include <linux/swap.h> /* struct reclaim_state */
-#include <linux/cache.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/rcupdate.h>
-#include <linux/list.h>
-#include <linux/kmemleak.h>
-
-#include <trace/events/kmem.h>
-
-#include <linux/atomic.h>
-
-#include "slab.h"
-/*
- * slob_block has a field 'units', which indicates size of block if +ve,
- * or offset of next block if -ve (in SLOB_UNITs).
- *
- * Free blocks of size 1 unit simply contain the offset of the next block.
- * Those with larger size contain their size in the first SLOB_UNIT of
- * memory, and the offset of the next free block in the second SLOB_UNIT.
- */
-#if PAGE_SIZE <= (32767 * 2)
-typedef s16 slobidx_t;
-#else
-typedef s32 slobidx_t;
-#endif
-
-struct slob_block {
- slobidx_t units;
-};
-typedef struct slob_block slob_t;
-
-/*
- * All partially free slob pages go on these lists.
- */
-#define SLOB_BREAK1 256
-#define SLOB_BREAK2 1024
-static LIST_HEAD(free_slob_small);
-static LIST_HEAD(free_slob_medium);
-static LIST_HEAD(free_slob_large);
-
-/*
- * slob_page_free: true for pages on free_slob_pages list.
- */
-static inline int slob_page_free(struct page *sp)
-{
- return PageSlobFree(sp);
-}
-
-static void set_slob_page_free(struct page *sp, struct list_head *list)
-{
- list_add(&sp->slab_list, list);
- __SetPageSlobFree(sp);
-}
-
-static inline void clear_slob_page_free(struct page *sp)
-{
- list_del(&sp->slab_list);
- __ClearPageSlobFree(sp);
-}
-
-#define SLOB_UNIT sizeof(slob_t)
-#define SLOB_UNITS(size) DIV_ROUND_UP(size, SLOB_UNIT)
-
-/*
- * struct slob_rcu is inserted at the tail of allocated slob blocks, which
- * were created with a SLAB_TYPESAFE_BY_RCU slab. slob_rcu is used to free
- * the block using call_rcu.
- */
-struct slob_rcu {
- struct rcu_head head;
- int size;
-};
-
-/*
- * slob_lock protects all slob allocator structures.
- */
-static DEFINE_SPINLOCK(slob_lock);
-
-/*
- * Encode the given size and next info into a free slob block s.
- */
-static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
-{
- slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
- slobidx_t offset = next - base;
-
- if (size > 1) {
- s[0].units = size;
- s[1].units = offset;
- } else
- s[0].units = -offset;
-}
-
-/*
- * Return the size of a slob block.
- */
-static slobidx_t slob_units(slob_t *s)
-{
- if (s->units > 0)
- return s->units;
- return 1;
-}
-
-/*
- * Return the next free slob block pointer after this one.
- */
-static slob_t *slob_next(slob_t *s)
-{
- slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
- slobidx_t next;
-
- if (s[0].units < 0)
- next = -s[0].units;
- else
- next = s[1].units;
- return base+next;
-}
-
-/*
- * Returns true if s is the last free block in its page.
- */
-static int slob_last(slob_t *s)
-{
- return !((unsigned long)slob_next(s) & ~PAGE_MASK);
-}
-
-static void *slob_new_pages(gfp_t gfp, int order, int node)
-{
- struct page *page;
-
-#ifdef CONFIG_NUMA
- if (node != NUMA_NO_NODE)
- page = __alloc_pages_node(node, gfp, order);
- else
-#endif
- page = alloc_pages(gfp, order);
-
- if (!page)
- return NULL;
-
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
- PAGE_SIZE << order);
- return page_address(page);
-}
-
-static void slob_free_pages(void *b, int order)
-{
- struct page *sp = virt_to_page(b);
-
- if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab += 1 << order;
-
- mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
- -(PAGE_SIZE << order));
- __free_pages(sp, order);
-}
-
-/*
- * slob_page_alloc() - Allocate a slob block within a given slob_page sp.
- * @sp: Page to look in.
- * @size: Size of the allocation.
- * @align: Allocation alignment.
- * @align_offset: Offset in the allocated block that will be aligned.
- * @page_removed_from_list: Return parameter.
- *
- * Tries to find a chunk of memory at least @size bytes big within @page.
- *
- * Return: Pointer to memory if allocated, %NULL otherwise. If the
- * allocation fills up @page then the page is removed from the
- * freelist, in this case @page_removed_from_list will be set to
- * true (set to false otherwise).
- */
-static void *slob_page_alloc(struct page *sp, size_t size, int align,
- int align_offset, bool *page_removed_from_list)
-{
- slob_t *prev, *cur, *aligned = NULL;
- int delta = 0, units = SLOB_UNITS(size);
-
- *page_removed_from_list = false;
- for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
- slobidx_t avail = slob_units(cur);
-
- /*
- * 'aligned' will hold the address of the slob block so that the
- * address 'aligned'+'align_offset' is aligned according to the
- * 'align' parameter. This is for kmalloc() which prepends the
- * allocated block with its size, so that the block itself is
- * aligned when needed.
- */
- if (align) {
- aligned = (slob_t *)
- (ALIGN((unsigned long)cur + align_offset, align)
- - align_offset);
- delta = aligned - cur;
- }
- if (avail >= units + delta) { /* room enough? */
- slob_t *next;
-
- if (delta) { /* need to fragment head to align? */
- next = slob_next(cur);
- set_slob(aligned, avail - delta, next);
- set_slob(cur, delta, aligned);
- prev = cur;
- cur = aligned;
- avail = slob_units(cur);
- }
-
- next = slob_next(cur);
- if (avail == units) { /* exact fit? unlink. */
- if (prev)
- set_slob(prev, slob_units(prev), next);
- else
- sp->freelist = next;
- } else { /* fragment */
- if (prev)
- set_slob(prev, slob_units(prev), cur + units);
- else
- sp->freelist = cur + units;
- set_slob(cur + units, avail - units, next);
- }
-
- sp->units -= units;
- if (!sp->units) {
- clear_slob_page_free(sp);
- *page_removed_from_list = true;
- }
- return cur;
- }
- if (slob_last(cur))
- return NULL;
- }
-}
-
-/*
- * slob_alloc: entry point into the slob allocator.
- */
-static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
- int align_offset)
-{
- struct page *sp;
- struct list_head *slob_list;
- slob_t *b = NULL;
- unsigned long flags;
- bool _unused;
-
- if (size < SLOB_BREAK1)
- slob_list = &free_slob_small;
- else if (size < SLOB_BREAK2)
- slob_list = &free_slob_medium;
- else
- slob_list = &free_slob_large;
-
- spin_lock_irqsave(&slob_lock, flags);
- /* Iterate through each partially free page, try to find room */
- list_for_each_entry(sp, slob_list, slab_list) {
- bool page_removed_from_list = false;
-#ifdef CONFIG_NUMA
- /*
- * If there's a node specification, search for a partial
- * page with a matching node id in the freelist.
- */
- if (node != NUMA_NO_NODE && page_to_nid(sp) != node)
- continue;
-#endif
- /* Enough room on this page? */
- if (sp->units < SLOB_UNITS(size))
- continue;
-
- b = slob_page_alloc(sp, size, align, align_offset, &page_removed_from_list);
- if (!b)
- continue;
-
- /*
- * If slob_page_alloc() removed sp from the list then we
- * cannot call list functions on sp. If so allocation
- * did not fragment the page anyway so optimisation is
- * unnecessary.
- */
- if (!page_removed_from_list) {
- /*
- * Improve fragment distribution and reduce our average
- * search time by starting our next search here. (see
- * Knuth vol 1, sec 2.5, pg 449)
- */
- if (!list_is_first(&sp->slab_list, slob_list))
- list_rotate_to_front(&sp->slab_list, slob_list);
- }
- break;
- }
- spin_unlock_irqrestore(&slob_lock, flags);
-
- /* Not enough space: must allocate a new page */
- if (!b) {
- b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
- if (!b)
- return NULL;
- sp = virt_to_page(b);
- __SetPageSlab(sp);
-
- spin_lock_irqsave(&slob_lock, flags);
- sp->units = SLOB_UNITS(PAGE_SIZE);
- sp->freelist = b;
- INIT_LIST_HEAD(&sp->slab_list);
- set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
- set_slob_page_free(sp, slob_list);
- b = slob_page_alloc(sp, size, align, align_offset, &_unused);
- BUG_ON(!b);
- spin_unlock_irqrestore(&slob_lock, flags);
- }
- if (unlikely(gfp & __GFP_ZERO))
- memset(b, 0, size);
- return b;
-}
-
-/*
- * slob_free: entry point into the slob allocator.
- */
-static void slob_free(void *block, int size)
-{
- struct page *sp;
- slob_t *prev, *next, *b = (slob_t *)block;
- slobidx_t units;
- unsigned long flags;
- struct list_head *slob_list;
-
- if (unlikely(ZERO_OR_NULL_PTR(block)))
- return;
- BUG_ON(!size);
-
- sp = virt_to_page(block);
- units = SLOB_UNITS(size);
-
- spin_lock_irqsave(&slob_lock, flags);
-
- if (sp->units + units == SLOB_UNITS(PAGE_SIZE)) {
- /* Go directly to page allocator. Do not pass slob allocator */
- if (slob_page_free(sp))
- clear_slob_page_free(sp);
- spin_unlock_irqrestore(&slob_lock, flags);
- __ClearPageSlab(sp);
- page_mapcount_reset(sp);
- slob_free_pages(b, 0);
- return;
- }
-
- if (!slob_page_free(sp)) {
- /* This slob page is about to become partially free. Easy! */
- sp->units = units;
- sp->freelist = b;
- set_slob(b, units,
- (void *)((unsigned long)(b +
- SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
- if (size < SLOB_BREAK1)
- slob_list = &free_slob_small;
- else if (size < SLOB_BREAK2)
- slob_list = &free_slob_medium;
- else
- slob_list = &free_slob_large;
- set_slob_page_free(sp, slob_list);
- goto out;
- }
-
- /*
- * Otherwise the page is already partially free, so find reinsertion
- * point.
- */
- sp->units += units;
-
- if (b < (slob_t *)sp->freelist) {
- if (b + units == sp->freelist) {
- units += slob_units(sp->freelist);
- sp->freelist = slob_next(sp->freelist);
- }
- set_slob(b, units, sp->freelist);
- sp->freelist = b;
- } else {
- prev = sp->freelist;
- next = slob_next(prev);
- while (b > next) {
- prev = next;
- next = slob_next(prev);
- }
-
- if (!slob_last(prev) && b + units == next) {
- units += slob_units(next);
- set_slob(b, units, slob_next(next));
- } else
- set_slob(b, units, next);
-
- if (prev + slob_units(prev) == b) {
- units = slob_units(b) + slob_units(prev);
- set_slob(prev, units, slob_next(b));
- } else
- set_slob(prev, slob_units(prev), b);
- }
-out:
- spin_unlock_irqrestore(&slob_lock, flags);
-}
-
-/*
- * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
- */
-
-static __always_inline void *
-__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
-{
- unsigned int *m;
- int minalign = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
- void *ret;
-
- gfp &= gfp_allowed_mask;
-
- fs_reclaim_acquire(gfp);
- fs_reclaim_release(gfp);
-
- if (size < PAGE_SIZE - minalign) {
- int align = minalign;
-
- /*
- * For power of two sizes, guarantee natural alignment for
- * kmalloc()'d objects.
- */
- if (is_power_of_2(size))
- align = max(minalign, (int) size);
-
- if (!size)
- return ZERO_SIZE_PTR;
-
- m = slob_alloc(size + minalign, gfp, align, node, minalign);
-
- if (!m)
- return NULL;
- *m = size;
- ret = (void *)m + minalign;
-
- trace_kmalloc_node(caller, ret,
- size, size + minalign, gfp, node);
- } else {
- unsigned int order = get_order(size);
-
- if (likely(order))
- gfp |= __GFP_COMP;
- ret = slob_new_pages(gfp, order, node);
-
- trace_kmalloc_node(caller, ret,
- size, PAGE_SIZE << order, gfp, node);
- }
-
- kmemleak_alloc(ret, size, 1, gfp);
- return ret;
-}
-
-void *__kmalloc(size_t size, gfp_t gfp)
-{
- return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, _RET_IP_);
-}
-EXPORT_SYMBOL(__kmalloc);
-
-void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
-{
- return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
-}
-EXPORT_SYMBOL(__kmalloc_track_caller);
-
-#ifdef CONFIG_NUMA
-void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
- int node, unsigned long caller)
-{
- return __do_kmalloc_node(size, gfp, node, caller);
-}
-EXPORT_SYMBOL(__kmalloc_node_track_caller);
-#endif
-
-void kfree(const void *block)
-{
- struct page *sp;
-
- trace_kfree(_RET_IP_, block);
-
- if (unlikely(ZERO_OR_NULL_PTR(block)))
- return;
- kmemleak_free(block);
-
- sp = virt_to_page(block);
- if (PageSlab(sp)) {
- int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
- unsigned int *m = (unsigned int *)(block - align);
- slob_free(m, *m + align);
- } else {
- unsigned int order = compound_order(sp);
- mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
- -(PAGE_SIZE << order));
- __free_pages(sp, order);
-
- }
-}
-EXPORT_SYMBOL(kfree);
-
-/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
-size_t __ksize(const void *block)
-{
- struct page *sp;
- int align;
- unsigned int *m;
-
- BUG_ON(!block);
- if (unlikely(block == ZERO_SIZE_PTR))
- return 0;
-
- sp = virt_to_page(block);
- if (unlikely(!PageSlab(sp)))
- return page_size(sp);
-
- align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
- m = (unsigned int *)(block - align);
- return SLOB_UNITS(*m) * SLOB_UNIT;
-}
-EXPORT_SYMBOL(__ksize);
-
-int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags)
-{
- if (flags & SLAB_TYPESAFE_BY_RCU) {
- /* leave room for rcu footer at the end of object */
- c->size += sizeof(struct slob_rcu);
- }
- c->flags = flags;
- return 0;
-}
-
-static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
-{
- void *b;
-
- flags &= gfp_allowed_mask;
-
- fs_reclaim_acquire(flags);
- fs_reclaim_release(flags);
-
- if (c->size < PAGE_SIZE) {
- b = slob_alloc(c->size, flags, c->align, node, 0);
- trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
- SLOB_UNITS(c->size) * SLOB_UNIT,
- flags, node);
- } else {
- b = slob_new_pages(flags, get_order(c->size), node);
- trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
- PAGE_SIZE << get_order(c->size),
- flags, node);
- }
-
- if (b && c->ctor) {
- WARN_ON_ONCE(flags & __GFP_ZERO);
- c->ctor(b);
- }
-
- kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
- return b;
-}
-
-void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
-{
- return slob_alloc_node(cachep, flags, NUMA_NO_NODE);
-}
-EXPORT_SYMBOL(kmem_cache_alloc);
-
-#ifdef CONFIG_NUMA
-void *__kmalloc_node(size_t size, gfp_t gfp, int node)
-{
- return __do_kmalloc_node(size, gfp, node, _RET_IP_);
-}
-EXPORT_SYMBOL(__kmalloc_node);
-
-void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node)
-{
- return slob_alloc_node(cachep, gfp, node);
-}
-EXPORT_SYMBOL(kmem_cache_alloc_node);
-#endif
-
-static void __kmem_cache_free(void *b, int size)
-{
- if (size < PAGE_SIZE)
- slob_free(b, size);
- else
- slob_free_pages(b, get_order(size));
-}
-
-static void kmem_rcu_free(struct rcu_head *head)
-{
- struct slob_rcu *slob_rcu = (struct slob_rcu *)head;
- void *b = (void *)slob_rcu - (slob_rcu->size - sizeof(struct slob_rcu));
-
- __kmem_cache_free(b, slob_rcu->size);
-}
-
-void kmem_cache_free(struct kmem_cache *c, void *b)
-{
- kmemleak_free_recursive(b, c->flags);
- if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) {
- struct slob_rcu *slob_rcu;
- slob_rcu = b + (c->size - sizeof(struct slob_rcu));
- slob_rcu->size = c->size;
- call_rcu(&slob_rcu->head, kmem_rcu_free);
- } else {
- __kmem_cache_free(b, c->size);
- }
-
- trace_kmem_cache_free(_RET_IP_, b);
-}
-EXPORT_SYMBOL(kmem_cache_free);
-
-void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
-{
- __kmem_cache_free_bulk(s, size, p);
-}
-EXPORT_SYMBOL(kmem_cache_free_bulk);
-
-int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
- void **p)
-{
- return __kmem_cache_alloc_bulk(s, flags, size, p);
-}
-EXPORT_SYMBOL(kmem_cache_alloc_bulk);
-
-int __kmem_cache_shutdown(struct kmem_cache *c)
-{
- /* No way to check for remaining objects */
- return 0;
-}
-
-void __kmem_cache_release(struct kmem_cache *c)
-{
-}
-
-int __kmem_cache_shrink(struct kmem_cache *d)
-{
- return 0;
-}
-
-struct kmem_cache kmem_cache_boot = {
- .name = "kmem_cache",
- .size = sizeof(struct kmem_cache),
- .flags = SLAB_PANIC,
- .align = ARCH_KMALLOC_MINALIGN,
-};
-
-void __init kmem_cache_init(void)
-{
- kmem_cache = &kmem_cache_boot;
- slab_state = UP;
-}
-
-void __init kmem_cache_init_late(void)
-{
- slab_state = FULL;
-}
diff --git a/mm/slub.c b/mm/slub.c
index 61d0d2968413..e3b5d5c0eb3a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3,7 +3,7 @@
* SLUB: A slab allocator that limits cache line use instead of queuing
* objects in per cpu and per node lists.
*
- * The allocator synchronizes using per slab locks or atomic operatios
+ * The allocator synchronizes using per slab locks or atomic operations
* and only uses a centralized lock to manage a pool of partial slabs.
*
* (C) 2007 SGI, Christoph Lameter
@@ -11,22 +11,26 @@
*/
#include <linux/mm.h>
-#include <linux/swap.h> /* struct reclaim_state */
+#include <linux/swap.h> /* mm_account_reclaimed_pages() */
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
+#include <linux/swab.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kasan.h>
+#include <linux/kmsan.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
+#include <linux/stackdepot.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
+#include <linux/kfence.h>
#include <linux/memory.h>
#include <linux/math64.h>
#include <linux/fault-inject.h>
@@ -34,7 +38,11 @@
#include <linux/prefetch.h>
#include <linux/memcontrol.h>
#include <linux/random.h>
+#include <kunit/test.h>
+#include <kunit/test-bug.h>
+#include <linux/sort.h>
+#include <linux/debugfs.h>
#include <trace/events/kmem.h>
#include "internal.h"
@@ -42,27 +50,40 @@
/*
* Lock order:
* 1. slab_mutex (Global Mutex)
- * 2. node->list_lock
- * 3. slab_lock(page) (Only on some arches and for debugging)
+ * 2. node->list_lock (Spinlock)
+ * 3. kmem_cache->cpu_slab->lock (Local lock)
+ * 4. slab_lock(slab) (Only on some arches)
+ * 5. object_map_lock (Only for debugging)
*
* slab_mutex
*
* The role of the slab_mutex is to protect the list of all the slabs
* and to synchronize major metadata changes to slab cache structures.
+ * Also synchronizes memory hotplug callbacks.
*
- * The slab_lock is only used for debugging and on arches that do not
- * have the ability to do a cmpxchg_double. It only protects:
- * A. page->freelist -> List of object free in a page
- * B. page->inuse -> Number of objects in use
- * C. page->objects -> Number of objects in page
- * D. page->frozen -> frozen state
+ * slab_lock
+ *
+ * The slab_lock is a wrapper around the page lock, thus it is a bit
+ * spinlock.
+ *
+ * The slab_lock is only used on arches that do not have the ability
+ * to do a cmpxchg_double. It only protects:
+ *
+ * A. slab->freelist -> List of free objects in a slab
+ * B. slab->inuse -> Number of objects in use
+ * C. slab->objects -> Number of objects in slab
+ * D. slab->frozen -> frozen state
+ *
+ * Frozen slabs
*
* If a slab is frozen then it is exempt from list management. It is not
* on any list except per cpu partial list. The processor that froze the
- * slab is the one who can perform list operations on the page. Other
+ * slab is the one who can perform list operations on the slab. Other
* processors may put objects onto the freelist but the processor that
* froze the slab is the only one that can retrieve the objects from the
- * page's freelist.
+ * slab's freelist.
+ *
+ * list_lock
*
* The list_lock protects the partial and full list on each node and
* the partial slab counter. If taken then no new slabs may be added or
@@ -75,10 +96,41 @@
* slabs, operations can continue without any centralized lock. F.e.
* allocating a long series of objects that fill up slabs does not require
* the list lock.
- * Interrupts are disabled during allocation and deallocation in order to
- * make the slab allocator safe to use in the context of an irq. In addition
- * interrupts are disabled to ensure that the processor does not change
- * while handling per_cpu slabs, due to kernel preemption.
+ *
+ * For debug caches, all allocations are forced to go through a list_lock
+ * protected region to serialize against concurrent validation.
+ *
+ * cpu_slab->lock local lock
+ *
+ * This lock protects slowpath manipulation of all kmem_cache_cpu fields
+ * except the stat counters. This is a percpu structure manipulated only by
+ * the local cpu, so the lock protects against being preempted or interrupted
+ * by an irq. Fast path operations rely on lockless operations instead.
+ *
+ * On PREEMPT_RT, the local lock neither disables interrupts nor preemption
+ * which means the lockless fastpath cannot be used as it might interfere with
+ * an in-progress slow path operation. In this case the local lock is always
+ * taken but it still utilizes the freelist for the common operations.
+ *
+ * lockless fastpaths
+ *
+ * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
+ * are fully lockless when satisfied from the percpu slab (and when
+ * cmpxchg_double is possible to use, otherwise slab_lock is taken).
+ * They also don't disable preemption or migration or irqs. They rely on
+ * the transaction id (tid) field to detect being preempted or moved to
+ * another cpu.
+ *
+ * irq, preemption, migration considerations
+ *
+ * Interrupts are disabled as part of list_lock or local_lock operations, or
+ * around the slab_lock operation, in order to make the slab allocator safe
+ * to use in the context of an irq.
+ *
+ * In addition, preemption (or migration on PREEMPT_RT) is disabled in the
+ * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
+ * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
+ * doesn't have to be revalidated in each section protected by the local lock.
*
* SLUB assigns one slab for allocation to each processor.
* Allocations only occur from these slabs called cpu slabs.
@@ -93,7 +145,7 @@
* minimal so we rely on the page allocators per cpu caches for
* fast frees and allocs.
*
- * page->frozen The slab is frozen and exempt from list processing.
+ * slab->frozen The slab is frozen and exempt from list processing.
* This means that the slab is dedicated to a purpose
* such as satisfying allocations for a specific
* processor. Objects may be freed in the slab while
@@ -114,19 +166,60 @@
* the fast path and disables lockless freelists.
*/
+/*
+ * We could simply use migrate_disable()/enable() but as long as it's a
+ * function call even on !PREEMPT_RT, use inline preempt_disable() there.
+ */
+#ifndef CONFIG_PREEMPT_RT
+#define slub_get_cpu_ptr(var) get_cpu_ptr(var)
+#define slub_put_cpu_ptr(var) put_cpu_ptr(var)
+#define USE_LOCKLESS_FAST_PATH() (true)
+#else
+#define slub_get_cpu_ptr(var) \
+({ \
+ migrate_disable(); \
+ this_cpu_ptr(var); \
+})
+#define slub_put_cpu_ptr(var) \
+do { \
+ (void)(var); \
+ migrate_enable(); \
+} while (0)
+#define USE_LOCKLESS_FAST_PATH() (false)
+#endif
+
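A sketch of the pattern these wrappers enable (simplified, not a quote of the slow path; 's' is a struct kmem_cache *):

static void slow_path_sketch(struct kmem_cache *s)
{
	struct kmem_cache_cpu *c;

	c = slub_get_cpu_ptr(s->cpu_slab);	/* !RT: preempt off, RT: migration off */
	/*
	 * Slow-path work that must keep operating on this CPU's
	 * kmem_cache_cpu, e.g. refilling c->freelist from a partial slab.
	 */
	(void)c;
	slub_put_cpu_ptr(s->cpu_slab);
}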
+#ifndef CONFIG_SLUB_TINY
+#define __fastpath_inline __always_inline
+#else
+#define __fastpath_inline
+#endif
+
#ifdef CONFIG_SLUB_DEBUG
#ifdef CONFIG_SLUB_DEBUG_ON
DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
#else
DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
-#endif
+#endif /* CONFIG_SLUB_DEBUG */
+
+/* Structure holding parameters for get_partial() call chain */
+struct partial_context {
+ struct slab **slab;
+ gfp_t flags;
+ unsigned int orig_size;
+};
static inline bool kmem_cache_debug(struct kmem_cache *s)
{
return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
}
+static inline bool slub_debug_orig_size(struct kmem_cache *s)
+{
+ return (kmem_cache_debug_flags(s, SLAB_STORE_USER) &&
+ (s->flags & SLAB_KMALLOC));
+}
+
void *fixup_red_left(struct kmem_cache *s, void *p)
{
if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
@@ -152,14 +245,12 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
* - Variable sizing of the per node arrays
*/
-/* Enable to test recovery from slab corruption on boot */
-#undef SLUB_RESILIENCY_TEST
-
/* Enable to log cmpxchg failures */
#undef SLUB_DEBUG_CMPXCHG
+#ifndef CONFIG_SLUB_TINY
/*
- * Mininum number of partial slabs. These will be left on the partial
+ * Minimum number of partial slabs. These will be left on the partial
* lists even if they are empty. kmem_cache_shrink may reclaim them.
*/
#define MIN_PARTIAL 5
@@ -170,6 +261,10 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
* sort the partial list by the number of objects in use.
*/
#define MAX_PARTIAL 10
+#else
+#define MIN_PARTIAL 0
+#define MAX_PARTIAL 0
+#endif
#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
SLAB_POISON | SLAB_STORE_USER)
@@ -191,13 +286,18 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
#define OO_SHIFT 16
#define OO_MASK ((1 << OO_SHIFT) - 1)
-#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
+#define MAX_OBJS_PER_PAGE 32767 /* since slab.objects is u15 */
/* Internal SLUB flags */
/* Poison object */
#define __OBJECT_POISON ((slab_flags_t __force)0x80000000U)
/* Use cmpxchg_double */
+
+#ifdef system_has_freelist_aba
#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U)
+#else
+#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0U)
+#endif
/*
* Tracking user of a slab.
@@ -205,8 +305,8 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
#define TRACK_ADDRS_COUNT 16
struct track {
unsigned long addr; /* Called from address */
-#ifdef CONFIG_STACKTRACE
- unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */
+#ifdef CONFIG_STACKDEPOT
+ depot_stack_handle_t handle;
#endif
int cpu; /* Was running on cpu */
int pid; /* Pid context */
@@ -215,7 +315,7 @@ struct track {
enum track_item { TRACK_ALLOC, TRACK_FREE };
-#ifdef CONFIG_SYSFS
+#ifdef SLAB_SUPPORTS_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
#else
@@ -224,6 +324,12 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
{ return 0; }
#endif
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
+static void debugfs_slab_add(struct kmem_cache *);
+#else
+static inline void debugfs_slab_add(struct kmem_cache *s) { }
+#endif
+
static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
@@ -235,6 +341,21 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
#endif
}
+/*
+ * Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
+ * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily
+ * differ during memory hotplug/hotremove operations.
+ * Protected by slab_mutex.
+ */
+static nodemask_t slab_nodes;
+
+#ifndef CONFIG_SLUB_TINY
+/*
+ * Workqueue used for flush_cpu_slab().
+ */
+static struct workqueue_struct *flushwq;
+#endif
+
/********************************************************************
* Core slab cache functions
*******************************************************************/
@@ -249,7 +370,7 @@ static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
{
#ifdef CONFIG_SLAB_FREELIST_HARDENED
/*
- * When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged.
+ * When CONFIG_KASAN_SW/HW_TAGS is enabled, ptr_addr might be tagged.
* Normally, this doesn't cause any issues, as both set_freepointer()
* and get_freepointer() are called with a pointer with the same tag.
* However, there are some issues with CONFIG_SLUB_DEBUG code. For
@@ -275,14 +396,28 @@ static inline void *freelist_dereference(const struct kmem_cache *s,
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
+ object = kasan_reset_tag(object);
return freelist_dereference(s, object + s->offset);
}
+#ifndef CONFIG_SLUB_TINY
static void prefetch_freepointer(const struct kmem_cache *s, void *object)
{
- prefetch(object + s->offset);
+ prefetchw(object + s->offset);
}
+#endif
+/*
+ * When running under KMSAN, get_freepointer_safe() may return an uninitialized
+ * pointer value in the case the current thread loses the race for the next
+ * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in
+ * slab_alloc_node() will fail, so the uninitialized value won't be used, but
+ * KMSAN will still check all arguments of cmpxchg because of imperfect
+ * handling of inline assembly.
+ * To work around this problem, we apply __no_kmsan_checks to ensure that
+ * get_freepointer_safe() returns initialized memory.
+ */
+__no_kmsan_checks
static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
unsigned long freepointer_addr;
@@ -291,6 +426,7 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
if (!debug_pagealloc_enabled_static())
return get_freepointer(s, object);
+ object = kasan_reset_tag(object);
freepointer_addr = (unsigned long)object + s->offset;
copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p));
return freelist_ptr(s, p, freepointer_addr);
@@ -304,6 +440,7 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
BUG_ON(object == fp); /* naive detection of double free or corruption */
#endif
+ freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
*(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
}
@@ -338,48 +475,108 @@ static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
return x.x & OO_MASK;
}
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
+{
+ unsigned int nr_slabs;
+
+ s->cpu_partial = nr_objects;
+
+ /*
+ * We take the number of objects but actually limit the number of
+ * slabs on the per cpu partial list, in order to limit excessive
+ * growth of the list. For simplicity we assume that the slabs will
+ * be half-full.
+ */
+ nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
+ s->cpu_partial_slabs = nr_slabs;
+}
+#else
+static inline void
+slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
+{
+}
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
+
/*
* Per slab locking using the pagelock
*/
-static __always_inline void slab_lock(struct page *page)
+static __always_inline void slab_lock(struct slab *slab)
{
+ struct page *page = slab_page(slab);
+
VM_BUG_ON_PAGE(PageTail(page), page);
bit_spin_lock(PG_locked, &page->flags);
}
-static __always_inline void slab_unlock(struct page *page)
+static __always_inline void slab_unlock(struct slab *slab)
{
+ struct page *page = slab_page(slab);
+
VM_BUG_ON_PAGE(PageTail(page), page);
__bit_spin_unlock(PG_locked, &page->flags);
}
-/* Interrupts must be disabled (for the fallback code to work right) */
-static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
+static inline bool
+__update_freelist_fast(struct slab *slab,
+ void *freelist_old, unsigned long counters_old,
+ void *freelist_new, unsigned long counters_new)
+{
+#ifdef system_has_freelist_aba
+ freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old };
+ freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new };
+
+ return try_cmpxchg_freelist(&slab->freelist_counter.full, &old.full, new.full);
+#else
+ return false;
+#endif
+}
+
+static inline bool
+__update_freelist_slow(struct slab *slab,
+ void *freelist_old, unsigned long counters_old,
+ void *freelist_new, unsigned long counters_new)
+{
+ bool ret = false;
+
+ slab_lock(slab);
+ if (slab->freelist == freelist_old &&
+ slab->counters == counters_old) {
+ slab->freelist = freelist_new;
+ slab->counters = counters_new;
+ ret = true;
+ }
+ slab_unlock(slab);
+
+ return ret;
+}
+
+/*
+ * Interrupts must be disabled (for the fallback code to work right), typically
+ * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is
+ * part of bit_spin_lock(), is sufficient because the policy is not to allow any
+ * allocation/free operation in hardirq context. Therefore nothing can
+ * interrupt the operation.
+ */
+static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
const char *n)
{
- VM_BUG_ON(!irqs_disabled());
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
- defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+ bool ret;
+
+ if (USE_LOCKLESS_FAST_PATH())
+ lockdep_assert_irqs_disabled();
+
if (s->flags & __CMPXCHG_DOUBLE) {
- if (cmpxchg_double(&page->freelist, &page->counters,
- freelist_old, counters_old,
- freelist_new, counters_new))
- return true;
- } else
-#endif
- {
- slab_lock(page);
- if (page->freelist == freelist_old &&
- page->counters == counters_old) {
- page->freelist = freelist_new;
- page->counters = counters_new;
- slab_unlock(page);
- return true;
- }
- slab_unlock(page);
+ ret = __update_freelist_fast(slab, freelist_old, counters_old,
+ freelist_new, counters_new);
+ } else {
+ ret = __update_freelist_slow(slab, freelist_old, counters_old,
+ freelist_new, counters_new);
}
+ if (likely(ret))
+ return true;
cpu_relax();
stat(s, CMPXCHG_DOUBLE_FAIL);
@@ -391,36 +588,26 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
return false;
}
-static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
+static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
const char *n)
{
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
- defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+ bool ret;
+
if (s->flags & __CMPXCHG_DOUBLE) {
- if (cmpxchg_double(&page->freelist, &page->counters,
- freelist_old, counters_old,
- freelist_new, counters_new))
- return true;
- } else
-#endif
- {
+ ret = __update_freelist_fast(slab, freelist_old, counters_old,
+ freelist_new, counters_new);
+ } else {
unsigned long flags;
local_irq_save(flags);
- slab_lock(page);
- if (page->freelist == freelist_old &&
- page->counters == counters_old) {
- page->freelist = freelist_new;
- page->counters = counters_new;
- slab_unlock(page);
- local_irq_restore(flags);
- return true;
- }
- slab_unlock(page);
+ ret = __update_freelist_slow(slab, freelist_old, counters_old,
+ freelist_new, counters_new);
local_irq_restore(flags);
}
+ if (likely(ret))
+ return true;
cpu_relax();
stat(s, CMPXCHG_DOUBLE_FAIL);
@@ -436,35 +623,37 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
static DEFINE_SPINLOCK(object_map_lock);
-/*
- * Determine a map of object in use on a page.
- *
- * Node listlock must be held to guarantee that the page does
- * not vanish from under us.
- */
-static unsigned long *get_map(struct kmem_cache *s, struct page *page)
- __acquires(&object_map_lock)
+static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
+ struct slab *slab)
{
+ void *addr = slab_address(slab);
void *p;
- void *addr = page_address(page);
- VM_BUG_ON(!irqs_disabled());
+ bitmap_zero(obj_map, slab->objects);
- spin_lock(&object_map_lock);
+ for (p = slab->freelist; p; p = get_freepointer(s, p))
+ set_bit(__obj_to_index(s, addr, p), obj_map);
+}
- bitmap_zero(object_map, page->objects);
+#if IS_ENABLED(CONFIG_KUNIT)
+static bool slab_add_kunit_errors(void)
+{
+ struct kunit_resource *resource;
- for (p = page->freelist; p; p = get_freepointer(s, p))
- set_bit(__obj_to_index(s, addr, p), object_map);
+ if (!kunit_get_current_test())
+ return false;
- return object_map;
-}
+ resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
+ if (!resource)
+ return false;
-static void put_map(unsigned long *map) __releases(&object_map_lock)
-{
- VM_BUG_ON(map != object_map);
- spin_unlock(&object_map_lock);
+ (*(int *)resource->data)++;
+ kunit_put_resource(resource);
+ return true;
}
+#else
+static inline bool slab_add_kunit_errors(void) { return false; }
+#endif
static inline unsigned int size_from_object(struct kmem_cache *s)
{
@@ -516,17 +705,17 @@ static inline void metadata_access_disable(void)
/* Verify that a pointer has an address that is valid within a slab page */
static inline int check_valid_pointer(struct kmem_cache *s,
- struct page *page, void *object)
+ struct slab *slab, void *object)
{
void *base;
if (!object)
return 1;
- base = page_address(page);
+ base = slab_address(slab);
object = kasan_reset_tag(object);
object = restore_red_left(s, object);
- if (object < base || object >= base + page->objects * s->size ||
+ if (object < base || object >= base + slab->objects * s->size ||
(object - base) % s->size) {
return 0;
}
@@ -538,8 +727,8 @@ static void print_section(char *level, char *text, u8 *addr,
unsigned int length)
{
metadata_access_enable();
- print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
- length, 1);
+ print_hex_dump(level, text, DUMP_PREFIX_ADDRESS,
+ 16, 1, kasan_reset_tag((void *)addr), length, 1);
metadata_access_disable();
}
@@ -570,59 +759,77 @@ static struct track *get_track(struct kmem_cache *s, void *object,
p = object + get_info_end(s);
- return p + alloc;
+ return kasan_reset_tag(p + alloc);
}
-static void set_track(struct kmem_cache *s, void *object,
- enum track_item alloc, unsigned long addr)
+#ifdef CONFIG_STACKDEPOT
+static noinline depot_stack_handle_t set_track_prepare(void)
{
- struct track *p = get_track(s, object, alloc);
+ depot_stack_handle_t handle;
+ unsigned long entries[TRACK_ADDRS_COUNT];
+ unsigned int nr_entries;
- if (addr) {
-#ifdef CONFIG_STACKTRACE
- unsigned int nr_entries;
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
+ handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
+
+ return handle;
+}
+#else
+static inline depot_stack_handle_t set_track_prepare(void)
+{
+ return 0;
+}
+#endif
- metadata_access_enable();
- nr_entries = stack_trace_save(p->addrs, TRACK_ADDRS_COUNT, 3);
- metadata_access_disable();
+static void set_track_update(struct kmem_cache *s, void *object,
+ enum track_item alloc, unsigned long addr,
+ depot_stack_handle_t handle)
+{
+ struct track *p = get_track(s, object, alloc);
- if (nr_entries < TRACK_ADDRS_COUNT)
- p->addrs[nr_entries] = 0;
+#ifdef CONFIG_STACKDEPOT
+ p->handle = handle;
#endif
- p->addr = addr;
- p->cpu = smp_processor_id();
- p->pid = current->pid;
- p->when = jiffies;
- } else {
- memset(p, 0, sizeof(struct track));
- }
+ p->addr = addr;
+ p->cpu = smp_processor_id();
+ p->pid = current->pid;
+ p->when = jiffies;
+}
+
+static __always_inline void set_track(struct kmem_cache *s, void *object,
+ enum track_item alloc, unsigned long addr)
+{
+ depot_stack_handle_t handle = set_track_prepare();
+
+ set_track_update(s, object, alloc, addr, handle);
}
static void init_tracking(struct kmem_cache *s, void *object)
{
+ struct track *p;
+
if (!(s->flags & SLAB_STORE_USER))
return;
- set_track(s, object, TRACK_FREE, 0UL);
- set_track(s, object, TRACK_ALLOC, 0UL);
+ p = get_track(s, object, TRACK_ALLOC);
+ memset(p, 0, 2*sizeof(struct track));
}
static void print_track(const char *s, struct track *t, unsigned long pr_time)
{
+ depot_stack_handle_t handle __maybe_unused;
+
if (!t->addr)
return;
- pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
+ pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
-#ifdef CONFIG_STACKTRACE
- {
- int i;
- for (i = 0; i < TRACK_ADDRS_COUNT; i++)
- if (t->addrs[i])
- pr_err("\t%pS\n", (void *)t->addrs[i]);
- else
- break;
- }
+#ifdef CONFIG_STACKDEPOT
+ handle = READ_ONCE(t->handle);
+ if (handle)
+ stack_depot_print(handle);
+ else
+ pr_err("object allocation/free stack trace missing\n");
#endif
}
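/*
 * Illustrative userspace sketch (not from this patch) of the kind of record
 * set_track()/print_track() above maintain per object: who touched it, when,
 * and from which call stack. glibc's backtrace() stands in for the kernel's
 * stack depot; the toy_* names are hypothetical.
 */
#include <execinfo.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

#define TOY_TRACK_DEPTH 8

struct toy_track {
	void *addrs[TOY_TRACK_DEPTH];	/* captured call stack */
	int nr;				/* valid entries in addrs[] */
	pid_t pid;
	time_t when;
};

static void toy_set_track(struct toy_track *t)
{
	t->nr = backtrace(t->addrs, TOY_TRACK_DEPTH);
	t->pid = getpid();
	t->when = time(NULL);
}

static void toy_print_track(const struct toy_track *t)
{
	printf("pid=%d age=%lds\n", (int)t->pid, (long)(time(NULL) - t->when));
	backtrace_symbols_fd(t->addrs, t->nr, STDOUT_FILENO);
}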
@@ -636,11 +843,62 @@ void print_tracking(struct kmem_cache *s, void *object)
print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
}
-static void print_page_info(struct page *page)
+static void print_slab_info(const struct slab *slab)
{
- pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
- page, page->objects, page->inuse, page->freelist, page->flags);
+ struct folio *folio = (struct folio *)slab_folio(slab);
+
+ pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
+ slab, slab->objects, slab->inuse, slab->freelist,
+ folio_flags(folio, 0));
+}
+/*
+ * kmalloc caches have fixed sizes (mostly powers of 2), and the kmalloc() API
+ * family rounds the real request size up to one of them, so the object may
+ * contain more space than was requested. Save the original request size in
+ * the metadata area, for better debugging and sanity checks.
+ */
+static inline void set_orig_size(struct kmem_cache *s,
+ void *object, unsigned int orig_size)
+{
+ void *p = kasan_reset_tag(object);
+
+ if (!slub_debug_orig_size(s))
+ return;
+
+#ifdef CONFIG_KASAN_GENERIC
+ /*
+ * KASAN can save its free metadata in the object's data area at
+ * offset 0. If that size is larger than 'orig_size', it would
+ * overlap the data redzone in [orig_size+1, object_size], so the
+ * check should be skipped.
+ */
+ if (kasan_metadata_size(s, true) > orig_size)
+ orig_size = s->object_size;
+#endif
+
+ p += get_info_end(s);
+ p += sizeof(struct track) * 2;
+
+ *(unsigned int *)p = orig_size;
+}
+
+static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
+{
+ void *p = kasan_reset_tag(object);
+
+ if (!slub_debug_orig_size(s))
+ return s->object_size;
+
+ p += get_info_end(s);
+ p += sizeof(struct track) * 2;
+
+ return *(unsigned int *)p;
+}
+
+void skip_orig_size_check(struct kmem_cache *s, const void *object)
+{
+ set_orig_size(s, (void *)object, s->object_size);
}
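/*
 * Illustrative sketch (not from this patch) of the metadata layout that
 * set_orig_size()/get_orig_size() above rely on: the original request size is
 * stored as an unsigned int right after the two tracking records, starting at
 * the object's debug-info offset. The offsets and toy_* names below are
 * hypothetical, just to show the pointer arithmetic.
 */
#include <string.h>
#include <stddef.h>

struct toy_track { unsigned long addr, when; int cpu, pid; };

/* byte offset of the stored request size inside one object's metadata */
static size_t toy_orig_size_off(size_t info_end)
{
	return info_end + 2 * sizeof(struct toy_track);
}

static void toy_set_orig_size(void *object, size_t info_end, unsigned int orig)
{
	memcpy((char *)object + toy_orig_size_off(info_end), &orig, sizeof(orig));
}

static unsigned int toy_get_orig_size(const void *object, size_t info_end)
{
	unsigned int orig;

	memcpy(&orig, (const char *)object + toy_orig_size_off(info_end), sizeof(orig));
	return orig;
}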
static void slab_bug(struct kmem_cache *s, char *fmt, ...)
@@ -654,16 +912,18 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...)
pr_err("=============================================================================\n");
pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
pr_err("-----------------------------------------------------------------------------\n\n");
-
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
va_end(args);
}
+__printf(2, 3)
static void slab_fix(struct kmem_cache *s, char *fmt, ...)
{
struct va_format vaf;
va_list args;
+ if (slab_add_kunit_errors())
+ return;
+
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
@@ -671,42 +931,28 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...)
va_end(args);
}
-static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
- void **freelist, void *nextfree)
-{
- if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
- !check_valid_pointer(s, page, nextfree) && freelist) {
- object_err(s, page, *freelist, "Freechain corrupt");
- *freelist = NULL;
- slab_fix(s, "Isolate corrupted freechain");
- return true;
- }
-
- return false;
-}
-
-static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
+static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
{
unsigned int off; /* Offset of last byte */
- u8 *addr = page_address(page);
+ u8 *addr = slab_address(slab);
print_tracking(s, p);
- print_page_info(page);
+ print_slab_info(slab);
- pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
+ pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n",
p, p - addr, get_freepointer(s, p));
if (s->flags & SLAB_RED_ZONE)
- print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
+ print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
s->red_left_pad);
else if (p > addr + 16)
print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
- print_section(KERN_ERR, "Object ", p,
+ print_section(KERN_ERR, "Object ", p,
min_t(unsigned int, s->object_size, PAGE_SIZE));
if (s->flags & SLAB_RED_ZONE)
- print_section(KERN_ERR, "Redzone ", p + s->object_size,
+ print_section(KERN_ERR, "Redzone ", p + s->object_size,
s->inuse - s->object_size);
off = get_info_end(s);
@@ -714,70 +960,106 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
if (s->flags & SLAB_STORE_USER)
off += 2 * sizeof(struct track);
- off += kasan_metadata_size(s);
+ if (slub_debug_orig_size(s))
+ off += sizeof(unsigned int);
+
+ off += kasan_metadata_size(s, false);
if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */
- print_section(KERN_ERR, "Padding ", p + off,
+ print_section(KERN_ERR, "Padding ", p + off,
size_from_object(s) - off);
dump_stack();
}
-void object_err(struct kmem_cache *s, struct page *page,
+static void object_err(struct kmem_cache *s, struct slab *slab,
u8 *object, char *reason)
{
+ if (slab_add_kunit_errors())
+ return;
+
slab_bug(s, "%s", reason);
- print_trailer(s, page, object);
+ print_trailer(s, slab, object);
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
-static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
+static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
+ void **freelist, void *nextfree)
+{
+ if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
+ !check_valid_pointer(s, slab, nextfree) && freelist) {
+ object_err(s, slab, *freelist, "Freechain corrupt");
+ *freelist = NULL;
+ slab_fix(s, "Isolate corrupted freechain");
+ return true;
+ }
+
+ return false;
+}
+
+static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
const char *fmt, ...)
{
va_list args;
char buf[100];
+ if (slab_add_kunit_errors())
+ return;
+
va_start(args, fmt);
vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
slab_bug(s, "%s", buf);
- print_page_info(page);
+ print_slab_info(slab);
dump_stack();
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
static void init_object(struct kmem_cache *s, void *object, u8 val)
{
- u8 *p = object;
+ u8 *p = kasan_reset_tag(object);
+ unsigned int poison_size = s->object_size;
- if (s->flags & SLAB_RED_ZONE)
+ if (s->flags & SLAB_RED_ZONE) {
memset(p - s->red_left_pad, val, s->red_left_pad);
+ if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
+ /*
+ * Redzone the space kmalloc allocated beyond the
+ * requested size, and limit the poisoned area to the
+ * original request size accordingly.
+ */
+ poison_size = get_orig_size(s, object);
+ }
+ }
+
if (s->flags & __OBJECT_POISON) {
- memset(p, POISON_FREE, s->object_size - 1);
- p[s->object_size - 1] = POISON_END;
+ memset(p, POISON_FREE, poison_size - 1);
+ p[poison_size - 1] = POISON_END;
}
if (s->flags & SLAB_RED_ZONE)
- memset(p + s->object_size, val, s->inuse - s->object_size);
+ memset(p + poison_size, val, s->inuse - poison_size);
}
static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
void *from, void *to)
{
- slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
+ slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
memset(from, data, to - from);
}
-static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
+static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
u8 *object, char *what,
u8 *start, unsigned int value, unsigned int bytes)
{
u8 *fault;
u8 *end;
- u8 *addr = page_address(page);
+ u8 *addr = slab_address(slab);
metadata_access_enable();
- fault = memchr_inv(start, value, bytes);
+ fault = memchr_inv(kasan_reset_tag(start), value, bytes);
metadata_access_disable();
if (!fault)
return 1;
@@ -786,12 +1068,17 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
while (end > fault && end[-1] == value)
end--;
+ if (slab_add_kunit_errors())
+ goto skip_bug_print;
+
slab_bug(s, "%s overwritten", what);
- pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
+ pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
fault, end - 1, fault - addr,
fault[0], value);
- print_trailer(s, page, object);
+ print_trailer(s, slab, object);
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+skip_bug_print:
restore_bytes(s, what, value, fault, end);
return 0;
}
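/*
 * Illustrative userspace sketch (not from this patch) of the check-and-restore
 * done by check_bytes_and_report()/restore_bytes() above: find the first byte
 * that does not match the expected fill value (what memchr_inv() does in the
 * kernel), report it, then rewrite the range with the fill value so later
 * checks do not trip over the same corruption. Names are hypothetical.
 */
#include <stdio.h>
#include <string.h>

static const unsigned char *toy_memchr_inv(const unsigned char *p,
					   unsigned char val, size_t len)
{
	for (size_t i = 0; i < len; i++)
		if (p[i] != val)
			return p + i;	/* first mismatching byte */
	return NULL;
}

static int toy_check_bytes(unsigned char *start, unsigned char val, size_t len,
			   const char *what)
{
	const unsigned char *fault = toy_memchr_inv(start, val, len);

	if (!fault)
		return 1;
	fprintf(stderr, "%s overwritten at +%zu: 0x%02x instead of 0x%02x\n",
		what, (size_t)(fault - start), *fault, val);
	memset(start, val, len);	/* restore the expected pattern */
	return 0;
}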
@@ -820,7 +1107,8 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
*
* A. Free pointer (if we cannot overwrite object on free)
* B. Tracking data for SLAB_STORE_USER
- * C. Padding to reach required alignment boundary or at mininum
+ * C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
+ * D. Padding to reach required alignment boundary or at minimum
* one word if debugging is on to be able to detect writes
* before the word boundary.
*
@@ -834,25 +1122,29 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
* may be used with merged slabcaches.
*/
-static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
+static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
{
unsigned long off = get_info_end(s); /* The end of info */
- if (s->flags & SLAB_STORE_USER)
+ if (s->flags & SLAB_STORE_USER) {
/* We also have user information there */
off += 2 * sizeof(struct track);
- off += kasan_metadata_size(s);
+ if (s->flags & SLAB_KMALLOC)
+ off += sizeof(unsigned int);
+ }
+
+ off += kasan_metadata_size(s, false);
if (size_from_object(s) == off)
return 1;
- return check_bytes_and_report(s, page, p, "Object padding",
+ return check_bytes_and_report(s, slab, p, "Object padding",
p + off, POISON_INUSE, size_from_object(s) - off);
}
/* Check the pad bytes at the end of a slab page */
-static int slab_pad_check(struct kmem_cache *s, struct page *page)
+static void slab_pad_check(struct kmem_cache *s, struct slab *slab)
{
u8 *start;
u8 *fault;
@@ -862,49 +1154,60 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
int remainder;
if (!(s->flags & SLAB_POISON))
- return 1;
+ return;
- start = page_address(page);
- length = page_size(page);
+ start = slab_address(slab);
+ length = slab_size(slab);
end = start + length;
remainder = length % s->size;
if (!remainder)
- return 1;
+ return;
pad = end - remainder;
metadata_access_enable();
- fault = memchr_inv(pad, POISON_INUSE, remainder);
+ fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder);
metadata_access_disable();
if (!fault)
- return 1;
+ return;
while (end > fault && end[-1] == POISON_INUSE)
end--;
- slab_err(s, page, "Padding overwritten. 0x%p-0x%p @offset=%tu",
+ slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu",
fault, end - 1, fault - start);
print_section(KERN_ERR, "Padding ", pad, remainder);
restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
- return 0;
}
-static int check_object(struct kmem_cache *s, struct page *page,
+static int check_object(struct kmem_cache *s, struct slab *slab,
void *object, u8 val)
{
u8 *p = object;
u8 *endobject = object + s->object_size;
+ unsigned int orig_size;
if (s->flags & SLAB_RED_ZONE) {
- if (!check_bytes_and_report(s, page, object, "Redzone",
+ if (!check_bytes_and_report(s, slab, object, "Left Redzone",
object - s->red_left_pad, val, s->red_left_pad))
return 0;
- if (!check_bytes_and_report(s, page, object, "Redzone",
+ if (!check_bytes_and_report(s, slab, object, "Right Redzone",
endobject, val, s->inuse - s->object_size))
return 0;
+
+ if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
+ orig_size = get_orig_size(s, object);
+
+ if (s->object_size > orig_size &&
+ !check_bytes_and_report(s, slab, object,
+ "kmalloc Redzone", p + orig_size,
+ val, s->object_size - orig_size)) {
+ return 0;
+ }
+ }
} else {
if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
- check_bytes_and_report(s, page, p, "Alignment padding",
+ check_bytes_and_report(s, slab, p, "Alignment padding",
endobject, POISON_INUSE,
s->inuse - s->object_size);
}
@@ -912,15 +1215,15 @@ static int check_object(struct kmem_cache *s, struct page *page,
if (s->flags & SLAB_POISON) {
if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
- (!check_bytes_and_report(s, page, p, "Poison", p,
+ (!check_bytes_and_report(s, slab, p, "Poison", p,
POISON_FREE, s->object_size - 1) ||
- !check_bytes_and_report(s, page, p, "Poison",
+ !check_bytes_and_report(s, slab, p, "End Poison",
p + s->object_size - 1, POISON_END, 1)))
return 0;
/*
* check_pad_bytes cleans up on its own.
*/
- check_pad_bytes(s, page, p);
+ check_pad_bytes(s, slab, p);
}
if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE)
@@ -931,8 +1234,8 @@ static int check_object(struct kmem_cache *s, struct page *page,
return 1;
/* Check free pointer validity */
- if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
- object_err(s, page, p, "Freepointer corrupt");
+ if (!check_valid_pointer(s, slab, get_freepointer(s, p))) {
+ object_err(s, slab, p, "Freepointer corrupt");
/*
* No choice but to zap it and thus lose the remainder
* of the free objects in this slab. May cause
@@ -944,57 +1247,55 @@ static int check_object(struct kmem_cache *s, struct page *page,
return 1;
}
-static int check_slab(struct kmem_cache *s, struct page *page)
+static int check_slab(struct kmem_cache *s, struct slab *slab)
{
int maxobj;
- VM_BUG_ON(!irqs_disabled());
-
- if (!PageSlab(page)) {
- slab_err(s, page, "Not a valid slab page");
+ if (!folio_test_slab(slab_folio(slab))) {
+ slab_err(s, slab, "Not a valid slab page");
return 0;
}
- maxobj = order_objects(compound_order(page), s->size);
- if (page->objects > maxobj) {
- slab_err(s, page, "objects %u > max %u",
- page->objects, maxobj);
+ maxobj = order_objects(slab_order(slab), s->size);
+ if (slab->objects > maxobj) {
+ slab_err(s, slab, "objects %u > max %u",
+ slab->objects, maxobj);
return 0;
}
- if (page->inuse > page->objects) {
- slab_err(s, page, "inuse %u > max %u",
- page->inuse, page->objects);
+ if (slab->inuse > slab->objects) {
+ slab_err(s, slab, "inuse %u > max %u",
+ slab->inuse, slab->objects);
return 0;
}
/* Slab_pad_check fixes things up after itself */
- slab_pad_check(s, page);
+ slab_pad_check(s, slab);
return 1;
}
/*
- * Determine if a certain object on a page is on the freelist. Must hold the
+ * Determine if a certain object in a slab is on the freelist. Must hold the
* slab lock to guarantee that the chains are in a consistent state.
*/
-static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
+static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
{
int nr = 0;
void *fp;
void *object = NULL;
int max_objects;
- fp = page->freelist;
- while (fp && nr <= page->objects) {
+ fp = slab->freelist;
+ while (fp && nr <= slab->objects) {
if (fp == search)
return 1;
- if (!check_valid_pointer(s, page, fp)) {
+ if (!check_valid_pointer(s, slab, fp)) {
if (object) {
- object_err(s, page, object,
+ object_err(s, slab, object,
"Freechain corrupt");
set_freepointer(s, object, NULL);
} else {
- slab_err(s, page, "Freepointer corrupt");
- page->freelist = NULL;
- page->inuse = page->objects;
+ slab_err(s, slab, "Freepointer corrupt");
+ slab->freelist = NULL;
+ slab->inuse = slab->objects;
slab_fix(s, "Freelist cleared");
return 0;
}
@@ -1005,34 +1306,34 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
nr++;
}
- max_objects = order_objects(compound_order(page), s->size);
+ max_objects = order_objects(slab_order(slab), s->size);
if (max_objects > MAX_OBJS_PER_PAGE)
max_objects = MAX_OBJS_PER_PAGE;
- if (page->objects != max_objects) {
- slab_err(s, page, "Wrong number of objects. Found %d but should be %d",
- page->objects, max_objects);
- page->objects = max_objects;
- slab_fix(s, "Number of objects adjusted.");
+ if (slab->objects != max_objects) {
+ slab_err(s, slab, "Wrong number of objects. Found %d but should be %d",
+ slab->objects, max_objects);
+ slab->objects = max_objects;
+ slab_fix(s, "Number of objects adjusted");
}
- if (page->inuse != page->objects - nr) {
- slab_err(s, page, "Wrong object count. Counter is %d but counted were %d",
- page->inuse, page->objects - nr);
- page->inuse = page->objects - nr;
- slab_fix(s, "Object count adjusted.");
+ if (slab->inuse != slab->objects - nr) {
+ slab_err(s, slab, "Wrong object count. Counter is %d but counted were %d",
+ slab->inuse, slab->objects - nr);
+ slab->inuse = slab->objects - nr;
+ slab_fix(s, "Object count adjusted");
}
return search == NULL;
}
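/*
 * Illustrative sketch (not from this patch) of the consistency check done by
 * on_freelist() above: walk the freelist with an upper bound so a corrupted
 * (cyclic) chain cannot loop forever, count the free objects, and verify that
 * inuse matches objects minus the free count. The toy_* names are hypothetical.
 */
#include <stdio.h>
#include <stdbool.h>

struct toy_obj { struct toy_obj *next; };

static bool toy_counters_consistent(const struct toy_obj *freelist,
				    unsigned int objects, unsigned int inuse)
{
	unsigned int nr = 0;

	for (const struct toy_obj *p = freelist; p && nr < objects; p = p->next)
		nr++;

	if (inuse != objects - nr) {
		fprintf(stderr, "counter is %u but counted %u in use\n",
			inuse, objects - nr);
		return false;	/* the kernel would also repair inuse here */
	}
	return true;
}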
-static void trace(struct kmem_cache *s, struct page *page, void *object,
+static void trace(struct kmem_cache *s, struct slab *slab, void *object,
int alloc)
{
if (s->flags & SLAB_TRACE) {
pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
s->name,
alloc ? "alloc" : "free",
- object, page->inuse,
- page->freelist);
+ object, slab->inuse,
+ slab->freelist);
if (!alloc)
print_section(KERN_INFO, "Object ", (void *)object,
@@ -1046,30 +1347,22 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
* Tracking of fully allocated slabs for debugging purposes.
*/
static void add_full(struct kmem_cache *s,
- struct kmem_cache_node *n, struct page *page)
+ struct kmem_cache_node *n, struct slab *slab)
{
if (!(s->flags & SLAB_STORE_USER))
return;
lockdep_assert_held(&n->list_lock);
- list_add(&page->slab_list, &n->full);
+ list_add(&slab->slab_list, &n->full);
}
-static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
+static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab)
{
if (!(s->flags & SLAB_STORE_USER))
return;
lockdep_assert_held(&n->list_lock);
- list_del(&page->slab_list);
-}
-
-/* Tracking of the number of slabs for debugging purposes */
-static inline unsigned long slabs_node(struct kmem_cache *s, int node)
-{
- struct kmem_cache_node *n = get_node(s, node);
-
- return atomic_long_read(&n->nr_slabs);
+ list_del(&slab->slab_list);
}
static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
@@ -1101,8 +1394,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
}
/* Object debug checks for alloc/free paths */
-static void setup_object_debug(struct kmem_cache *s, struct page *page,
- void *object)
+static void setup_object_debug(struct kmem_cache *s, void *object)
{
if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
return;
@@ -1112,148 +1404,93 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
}
static
-void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr)
+void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr)
{
if (!kmem_cache_debug_flags(s, SLAB_POISON))
return;
metadata_access_enable();
- memset(addr, POISON_INUSE, page_size(page));
+ memset(kasan_reset_tag(addr), POISON_INUSE, slab_size(slab));
metadata_access_disable();
}
static inline int alloc_consistency_checks(struct kmem_cache *s,
- struct page *page, void *object)
+ struct slab *slab, void *object)
{
- if (!check_slab(s, page))
+ if (!check_slab(s, slab))
return 0;
- if (!check_valid_pointer(s, page, object)) {
- object_err(s, page, object, "Freelist Pointer check fails");
+ if (!check_valid_pointer(s, slab, object)) {
+ object_err(s, slab, object, "Freelist Pointer check fails");
return 0;
}
- if (!check_object(s, page, object, SLUB_RED_INACTIVE))
+ if (!check_object(s, slab, object, SLUB_RED_INACTIVE))
return 0;
return 1;
}
-static noinline int alloc_debug_processing(struct kmem_cache *s,
- struct page *page,
- void *object, unsigned long addr)
+static noinline bool alloc_debug_processing(struct kmem_cache *s,
+ struct slab *slab, void *object, int orig_size)
{
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
- if (!alloc_consistency_checks(s, page, object))
+ if (!alloc_consistency_checks(s, slab, object))
goto bad;
}
- /* Success perform special debug activities for allocs */
- if (s->flags & SLAB_STORE_USER)
- set_track(s, object, TRACK_ALLOC, addr);
- trace(s, page, object, 1);
+ /* Success. Perform special debug activities for allocs */
+ trace(s, slab, object, 1);
+ set_orig_size(s, object, orig_size);
init_object(s, object, SLUB_RED_ACTIVE);
- return 1;
+ return true;
bad:
- if (PageSlab(page)) {
+ if (folio_test_slab(slab_folio(slab))) {
/*
* If this is a slab page then let's do the best we can
* to avoid issues in the future. Marking all objects
* as used avoids touching the remaining objects.
*/
slab_fix(s, "Marking all objects used");
- page->inuse = page->objects;
- page->freelist = NULL;
+ slab->inuse = slab->objects;
+ slab->freelist = NULL;
}
- return 0;
+ return false;
}
static inline int free_consistency_checks(struct kmem_cache *s,
- struct page *page, void *object, unsigned long addr)
+ struct slab *slab, void *object, unsigned long addr)
{
- if (!check_valid_pointer(s, page, object)) {
- slab_err(s, page, "Invalid object pointer 0x%p", object);
+ if (!check_valid_pointer(s, slab, object)) {
+ slab_err(s, slab, "Invalid object pointer 0x%p", object);
return 0;
}
- if (on_freelist(s, page, object)) {
- object_err(s, page, object, "Object already free");
+ if (on_freelist(s, slab, object)) {
+ object_err(s, slab, object, "Object already free");
return 0;
}
- if (!check_object(s, page, object, SLUB_RED_ACTIVE))
+ if (!check_object(s, slab, object, SLUB_RED_ACTIVE))
return 0;
- if (unlikely(s != page->slab_cache)) {
- if (!PageSlab(page)) {
- slab_err(s, page, "Attempt to free object(0x%p) outside of slab",
+ if (unlikely(s != slab->slab_cache)) {
+ if (!folio_test_slab(slab_folio(slab))) {
+ slab_err(s, slab, "Attempt to free object(0x%p) outside of slab",
object);
- } else if (!page->slab_cache) {
+ } else if (!slab->slab_cache) {
pr_err("SLUB <none>: no slab for object 0x%p.\n",
object);
dump_stack();
} else
- object_err(s, page, object,
+ object_err(s, slab, object,
"page slab pointer corrupt.");
return 0;
}
return 1;
}
-/* Supports checking bulk free of a constructed freelist */
-static noinline int free_debug_processing(
- struct kmem_cache *s, struct page *page,
- void *head, void *tail, int bulk_cnt,
- unsigned long addr)
-{
- struct kmem_cache_node *n = get_node(s, page_to_nid(page));
- void *object = head;
- int cnt = 0;
- unsigned long flags;
- int ret = 0;
-
- spin_lock_irqsave(&n->list_lock, flags);
- slab_lock(page);
-
- if (s->flags & SLAB_CONSISTENCY_CHECKS) {
- if (!check_slab(s, page))
- goto out;
- }
-
-next_object:
- cnt++;
-
- if (s->flags & SLAB_CONSISTENCY_CHECKS) {
- if (!free_consistency_checks(s, page, object, addr))
- goto out;
- }
-
- if (s->flags & SLAB_STORE_USER)
- set_track(s, object, TRACK_FREE, addr);
- trace(s, page, object, 0);
- /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
- init_object(s, object, SLUB_RED_INACTIVE);
-
- /* Reached end of constructed freelist yet? */
- if (object != tail) {
- object = get_freepointer(s, object);
- goto next_object;
- }
- ret = 1;
-
-out:
- if (cnt != bulk_cnt)
- slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
- bulk_cnt, cnt);
-
- slab_unlock(page);
- spin_unlock_irqrestore(&n->list_lock, flags);
- if (!ret)
- slab_fix(s, "Object at 0x%p not freed", object);
- return ret;
-}
-
/*
* Parse a block of slub_debug options. Blocks are delimited by ';'
*
@@ -1345,12 +1582,13 @@ check_slabs:
static int __init setup_slub_debug(char *str)
{
slab_flags_t flags;
+ slab_flags_t global_flags;
char *saved_str;
char *slab_list;
bool global_slub_debug_changed = false;
bool slab_list_specified = false;
- slub_debug = DEBUG_DEFAULT_FLAGS;
+ global_flags = DEBUG_DEFAULT_FLAGS;
if (*str++ != '=' || !*str)
/*
* No options specified. Switch on full debugging.
@@ -1362,27 +1600,35 @@ static int __init setup_slub_debug(char *str)
str = parse_slub_debug_flags(str, &flags, &slab_list, true);
if (!slab_list) {
- slub_debug = flags;
+ global_flags = flags;
global_slub_debug_changed = true;
} else {
slab_list_specified = true;
+ if (flags & SLAB_STORE_USER)
+ stack_depot_request_early_init();
}
}
/*
* For backwards compatibility, a single list of flags with list of
- * slabs means debugging is only enabled for those slabs, so the global
- * slub_debug should be 0. We can extended that to multiple lists as
+ * slabs means debugging is only changed for those slabs, so the global
+ * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
+ * on CONFIG_SLUB_DEBUG_ON). We can extend that to multiple lists as
* long as there is no option specifying flags without a slab list.
*/
if (slab_list_specified) {
if (!global_slub_debug_changed)
- slub_debug = 0;
+ global_flags = slub_debug;
slub_debug_string = saved_str;
}
out:
+ slub_debug = global_flags;
+ if (slub_debug & SLAB_STORE_USER)
+ stack_depot_request_early_init();
if (slub_debug != 0 || slub_debug_string)
static_branch_enable(&slub_debug_enabled);
+ else
+ static_branch_disable(&slub_debug_enabled);
if ((static_branch_unlikely(&init_on_alloc) ||
static_branch_unlikely(&init_on_free)) &&
(slub_debug & SLAB_POISON))
@@ -1397,7 +1643,6 @@ __setup("slub_debug", setup_slub_debug);
* @object_size: the size of an object without meta data
* @flags: flags to set
* @name: name of the cache
- * @ctor: constructor function
*
* Debug option(s) are applied to @flags. In addition to the debug
* option(s), if a slab name (or multiple) is specified i.e.
@@ -1405,13 +1650,24 @@ __setup("slub_debug", setup_slub_debug);
* then only the select slabs will receive the debug option(s).
*/
slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name,
- void (*ctor)(void *))
+ slab_flags_t flags, const char *name)
{
char *iter;
size_t len;
char *next_block;
slab_flags_t block_flags;
+ slab_flags_t slub_debug_local = slub_debug;
+
+ if (flags & SLAB_NO_USER_FLAGS)
+ return flags;
+
+ /*
+ * If the slab cache is for debugging (e.g. kmemleak) then
+ * don't store user (stack trace) information by default,
+ * but let the user enable it via the command line below.
+ */
+ if (flags & SLAB_NOLEAKTRACE)
+ slub_debug_local &= ~SLAB_STORE_USER;
len = strlen(name);
next_block = slub_debug_string;
@@ -1446,33 +1702,32 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
}
}
- return flags | slub_debug;
+ return flags | slub_debug_local;
}
#else /* !CONFIG_SLUB_DEBUG */
-static inline void setup_object_debug(struct kmem_cache *s,
- struct page *page, void *object) {}
+static inline void setup_object_debug(struct kmem_cache *s, void *object) {}
static inline
-void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {}
+void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
-static inline int alloc_debug_processing(struct kmem_cache *s,
- struct page *page, void *object, unsigned long addr) { return 0; }
+static inline bool alloc_debug_processing(struct kmem_cache *s,
+ struct slab *slab, void *object, int orig_size) { return true; }
-static inline int free_debug_processing(
- struct kmem_cache *s, struct page *page,
- void *head, void *tail, int bulk_cnt,
- unsigned long addr) { return 0; }
+static inline bool free_debug_processing(struct kmem_cache *s,
+ struct slab *slab, void *head, void *tail, int *bulk_cnt,
+ unsigned long addr, depot_stack_handle_t handle) { return true; }
-static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
- { return 1; }
-static inline int check_object(struct kmem_cache *s, struct page *page,
+static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
+static inline int check_object(struct kmem_cache *s, struct slab *slab,
void *object, u8 val) { return 1; }
+static inline depot_stack_handle_t set_track_prepare(void) { return 0; }
+static inline void set_track(struct kmem_cache *s, void *object,
+ enum track_item alloc, unsigned long addr) {}
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
- struct page *page) {}
+ struct slab *slab) {}
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
- struct page *page) {}
+ struct slab *slab) {}
slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name,
- void (*ctor)(void *))
+ slab_flags_t flags, const char *name)
{
return flags;
}
@@ -1480,8 +1735,6 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
#define disable_higher_order_debug 0
-static inline unsigned long slabs_node(struct kmem_cache *s, int node)
- { return 0; }
static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
{ return 0; }
static inline void inc_slabs_node(struct kmem_cache *s, int node,
@@ -1489,49 +1742,27 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
static inline void dec_slabs_node(struct kmem_cache *s, int node,
int objects) {}
-static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
+#ifndef CONFIG_SLUB_TINY
+static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
void **freelist, void *nextfree)
{
return false;
}
+#endif
#endif /* CONFIG_SLUB_DEBUG */
/*
* Hooks for other subsystems that check memory allocations. In a typical
* production configuration these hooks all should produce no code at all.
*/
-static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
-{
- ptr = kasan_kmalloc_large(ptr, size, flags);
- /* As ptr might get tagged, call kmemleak hook after KASAN. */
- kmemleak_alloc(ptr, size, 1, flags);
- return ptr;
-}
-
-static __always_inline void kfree_hook(void *x)
-{
- kmemleak_free(x);
- kasan_kfree_large(x, _RET_IP_);
-}
-
-static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
+static __always_inline bool slab_free_hook(struct kmem_cache *s,
+ void *x, bool init)
{
kmemleak_free_recursive(x, s->flags);
+ kmsan_slab_free(s, x);
- /*
- * Trouble is that we may no longer disable interrupts in the fast path
- * So in order to make the debug calls that expect irqs to be
- * disabled we need to disable interrupts temporarily.
- */
-#ifdef CONFIG_LOCKDEP
- {
- unsigned long flags;
+ debug_check_no_locks_freed(x, s->object_size);
- local_irq_save(flags);
- debug_check_no_locks_freed(x, s->object_size);
- local_irq_restore(flags);
- }
-#endif
if (!(s->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(x, s->object_size);
@@ -1540,18 +1771,40 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
__kcsan_check_access(x, s->object_size,
KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
- /* KASAN might put x into memory quarantine, delaying its reuse */
- return kasan_slab_free(s, x, _RET_IP_);
+ /*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_slab_free and the initialization memsets must be
+ * kept together to avoid discrepancies in behavior.
+ *
+ * The initialization memsets clear the object and the metadata,
+ * but don't touch the SLAB redzone.
+ */
+ if (init) {
+ int rsize;
+
+ if (!kasan_has_integrated_init())
+ memset(kasan_reset_tag(x), 0, s->object_size);
+ rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0;
+ memset((char *)kasan_reset_tag(x) + s->inuse, 0,
+ s->size - s->inuse - rsize);
+ }
+ /* KASAN might put x into memory quarantine, delaying its reuse. */
+ return kasan_slab_free(s, x, init);
}
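/*
 * Illustrative sketch (not from this patch) of the init-on-free clearing in
 * slab_free_hook() above: zero the object payload and the per-object metadata,
 * while leaving the redzone bytes between them (and the trailing red_left_pad
 * of the next object) untouched so corruption checks still work. The toy_*
 * names and layout fields are hypothetical.
 */
#include <string.h>
#include <stddef.h>

struct toy_layout {
	size_t object_size;	/* usable payload */
	size_t inuse;		/* payload + right redzone */
	size_t size;		/* full per-object stride */
	size_t red_left_pad;	/* left redzone width (0 if no redzoning) */
};

static void toy_init_on_free(void *obj, const struct toy_layout *l)
{
	memset(obj, 0, l->object_size);			/* clear the payload */
	memset((char *)obj + l->inuse, 0,		/* clear the metadata ... */
	       l->size - l->inuse - l->red_left_pad);	/* ... but not the redzone */
}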
static inline bool slab_free_freelist_hook(struct kmem_cache *s,
- void **head, void **tail)
+ void **head, void **tail,
+ int *cnt)
{
void *object;
void *next = *head;
void *old_tail = *tail ? *tail : *head;
- int rsize;
+
+ if (is_kfence_address(next)) {
+ slab_free_hook(s, next, false);
+ return true;
+ }
/* Head and tail of the reconstructed freelist */
*head = NULL;
@@ -1561,25 +1814,19 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
object = next;
next = get_freepointer(s, object);
- if (slab_want_init_on_free(s)) {
- /*
- * Clear the object and the metadata, but don't touch
- * the redzone.
- */
- memset(object, 0, s->object_size);
- rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad
- : 0;
- memset((char *)object + s->inuse, 0,
- s->size - s->inuse - rsize);
-
- }
/* If object's reuse doesn't have to be delayed */
- if (!slab_free_hook(s, object)) {
+ if (!slab_free_hook(s, object, slab_want_init_on_free(s))) {
/* Move object to the new freelist */
set_freepointer(s, object, *head);
*head = object;
if (!*tail)
*tail = object;
+ } else {
+ /*
+ * Adjust the reconstructed freelist depth
+ * accordingly if object's reuse is delayed.
+ */
+ --(*cnt);
}
} while (object != old_tail);
@@ -1589,10 +1836,9 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
return *head != NULL;
}
-static void *setup_object(struct kmem_cache *s, struct page *page,
- void *object)
+static void *setup_object(struct kmem_cache *s, void *object)
{
- setup_object_debug(s, page, object);
+ setup_object_debug(s, object);
object = kasan_init_slab_obj(s, object);
if (unlikely(s->ctor)) {
kasan_unpoison_object_data(s, object);
@@ -1605,21 +1851,29 @@ static void *setup_object(struct kmem_cache *s, struct page *page,
/*
* Slab allocation and freeing
*/
-static inline struct page *alloc_slab_page(struct kmem_cache *s,
- gfp_t flags, int node, struct kmem_cache_order_objects oo)
+static inline struct slab *alloc_slab_page(gfp_t flags, int node,
+ struct kmem_cache_order_objects oo)
{
- struct page *page;
+ struct folio *folio;
+ struct slab *slab;
unsigned int order = oo_order(oo);
if (node == NUMA_NO_NODE)
- page = alloc_pages(flags, order);
+ folio = (struct folio *)alloc_pages(flags, order);
else
- page = __alloc_pages_node(node, flags, order);
+ folio = (struct folio *)__alloc_pages_node(node, flags, order);
- if (page)
- account_slab_page(page, order, s);
+ if (!folio)
+ return NULL;
- return page;
+ slab = folio_slab(folio);
+ __folio_set_slab(folio);
+ /* Make the flag visible before any changes to folio->mapping */
+ smp_wmb();
+ if (folio_is_pfmemalloc(folio))
+ slab_set_pfmemalloc(slab);
+
+ return slab;
}
#ifdef CONFIG_SLAB_FREELIST_RANDOM
@@ -1664,7 +1918,7 @@ static void __init init_freelist_randomization(void)
}
/* Get the next entry on the pre-computed freelist randomized */
-static void *next_freelist_entry(struct kmem_cache *s, struct page *page,
+static void *next_freelist_entry(struct kmem_cache *s, struct slab *slab,
unsigned long *pos, void *start,
unsigned long page_limit,
unsigned long freelist_count)
@@ -1686,32 +1940,32 @@ static void *next_freelist_entry(struct kmem_cache *s, struct page *page,
}
/* Shuffle the single linked freelist based on a random pre-computed sequence */
-static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
+static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
{
void *start;
void *cur;
void *next;
unsigned long idx, pos, page_limit, freelist_count;
- if (page->objects < 2 || !s->random_seq)
+ if (slab->objects < 2 || !s->random_seq)
return false;
freelist_count = oo_objects(s->oo);
- pos = get_random_int() % freelist_count;
+ pos = get_random_u32_below(freelist_count);
- page_limit = page->objects * s->size;
- start = fixup_red_left(s, page_address(page));
+ page_limit = slab->objects * s->size;
+ start = fixup_red_left(s, slab_address(slab));
/* First entry is used as the base of the freelist */
- cur = next_freelist_entry(s, page, &pos, start, page_limit,
+ cur = next_freelist_entry(s, slab, &pos, start, page_limit,
freelist_count);
- cur = setup_object(s, page, cur);
- page->freelist = cur;
+ cur = setup_object(s, cur);
+ slab->freelist = cur;
- for (idx = 1; idx < page->objects; idx++) {
- next = next_freelist_entry(s, page, &pos, start, page_limit,
+ for (idx = 1; idx < slab->objects; idx++) {
+ next = next_freelist_entry(s, slab, &pos, start, page_limit,
freelist_count);
- next = setup_object(s, page, next);
+ next = setup_object(s, next);
set_freepointer(s, cur, next);
cur = next;
}
@@ -1725,15 +1979,15 @@ static inline int init_cache_random_seq(struct kmem_cache *s)
return 0;
}
static inline void init_freelist_randomization(void) { }
-static inline bool shuffle_freelist(struct kmem_cache *s, struct page *page)
+static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
{
return false;
}
#endif /* CONFIG_SLAB_FREELIST_RANDOM */
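/*
 * Illustrative userspace sketch (not from this patch) of freelist
 * randomization as done by shuffle_freelist() above: pick a random permutation
 * of the object indices and chain the objects in that order instead of address
 * order. rand() stands in for the kernel's pre-computed random sequence, the
 * toy_* names are hypothetical, and obj_size must be >= sizeof(struct toy_obj).
 */
#include <stdlib.h>
#include <stddef.h>

struct toy_obj { struct toy_obj *next; };

static struct toy_obj *toy_shuffle_freelist(char *base, size_t obj_size,
					    unsigned int nr_objects)
{
	if (nr_objects == 0)
		return NULL;

	unsigned int idx[nr_objects];

	for (unsigned int i = 0; i < nr_objects; i++)
		idx[i] = i;

	/* Fisher-Yates shuffle of the index array */
	for (unsigned int i = nr_objects - 1; i > 0; i--) {
		unsigned int j = (unsigned int)rand() % (i + 1);
		unsigned int tmp = idx[i];

		idx[i] = idx[j];
		idx[j] = tmp;
	}

	/* link the objects in shuffled order; idx[0] becomes the head */
	struct toy_obj *head = NULL;

	for (unsigned int i = nr_objects; i-- > 0; ) {
		struct toy_obj *obj =
			(struct toy_obj *)(base + (size_t)idx[i] * obj_size);

		obj->next = head;
		head = obj;
	}
	return head;
}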
-static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
+static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
- struct page *page;
+ struct slab *slab;
struct kmem_cache_order_objects oo = s->oo;
gfp_t alloc_gfp;
void *start, *p, *next;
@@ -1742,9 +1996,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
flags &= gfp_allowed_mask;
- if (gfpflags_allow_blocking(flags))
- local_irq_enable();
-
flags |= s->allocflags;
/*
@@ -1753,159 +2004,220 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
*/
alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
- alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
+ alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM;
- page = alloc_slab_page(s, alloc_gfp, node, oo);
- if (unlikely(!page)) {
+ slab = alloc_slab_page(alloc_gfp, node, oo);
+ if (unlikely(!slab)) {
oo = s->min;
alloc_gfp = flags;
/*
* Allocation may have failed due to fragmentation.
* Try a lower order alloc if possible
*/
- page = alloc_slab_page(s, alloc_gfp, node, oo);
- if (unlikely(!page))
- goto out;
+ slab = alloc_slab_page(alloc_gfp, node, oo);
+ if (unlikely(!slab))
+ return NULL;
stat(s, ORDER_FALLBACK);
}
- page->objects = oo_objects(oo);
+ slab->objects = oo_objects(oo);
+ slab->inuse = 0;
+ slab->frozen = 0;
- page->slab_cache = s;
- __SetPageSlab(page);
- if (page_is_pfmemalloc(page))
- SetPageSlabPfmemalloc(page);
+ account_slab(slab, oo_order(oo), s, flags);
- kasan_poison_slab(page);
+ slab->slab_cache = s;
- start = page_address(page);
+ kasan_poison_slab(slab);
- setup_page_debug(s, page, start);
+ start = slab_address(slab);
- shuffle = shuffle_freelist(s, page);
+ setup_slab_debug(s, slab, start);
+
+ shuffle = shuffle_freelist(s, slab);
if (!shuffle) {
start = fixup_red_left(s, start);
- start = setup_object(s, page, start);
- page->freelist = start;
- for (idx = 0, p = start; idx < page->objects - 1; idx++) {
+ start = setup_object(s, start);
+ slab->freelist = start;
+ for (idx = 0, p = start; idx < slab->objects - 1; idx++) {
next = p + s->size;
- next = setup_object(s, page, next);
+ next = setup_object(s, next);
set_freepointer(s, p, next);
p = next;
}
set_freepointer(s, p, NULL);
}
- page->inuse = page->objects;
- page->frozen = 1;
-
-out:
- if (gfpflags_allow_blocking(flags))
- local_irq_disable();
- if (!page)
- return NULL;
-
- inc_slabs_node(s, page_to_nid(page), page->objects);
-
- return page;
+ return slab;
}
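/*
 * Illustrative sketch (not from this patch) of the default, non-shuffled
 * freelist construction in allocate_slab() above: objects are laid out back to
 * back every obj_size bytes and each one's embedded free pointer is set to the
 * next object, with the last one terminated by NULL. Assumes nr_objects >= 1
 * and obj_size >= sizeof(struct toy_obj); names are hypothetical.
 */
#include <stddef.h>

struct toy_obj { struct toy_obj *next; };

static struct toy_obj *toy_init_freelist(char *start, size_t obj_size,
					 unsigned int nr_objects)
{
	struct toy_obj *head = (struct toy_obj *)start;
	struct toy_obj *p = head;

	for (unsigned int i = 0; i + 1 < nr_objects; i++) {
		struct toy_obj *next = (struct toy_obj *)((char *)p + obj_size);

		p->next = next;		/* like set_freepointer(s, p, next) */
		p = next;
	}
	p->next = NULL;			/* terminate the freelist */
	return head;
}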
-static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
if (unlikely(flags & GFP_SLAB_BUG_MASK))
flags = kmalloc_fix_flags(flags);
+ WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
+
return allocate_slab(s,
flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
}
-static void __free_slab(struct kmem_cache *s, struct page *page)
+static void __free_slab(struct kmem_cache *s, struct slab *slab)
{
- int order = compound_order(page);
+ struct folio *folio = slab_folio(slab);
+ int order = folio_order(folio);
int pages = 1 << order;
- if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
- void *p;
-
- slab_pad_check(s, page);
- for_each_object(p, s, page_address(page),
- page->objects)
- check_object(s, page, p, SLUB_RED_INACTIVE);
- }
-
- __ClearPageSlabPfmemalloc(page);
- __ClearPageSlab(page);
-
- page->mapping = NULL;
- if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab += pages;
- unaccount_slab_page(page, order, s);
- __free_pages(page, order);
+ __slab_clear_pfmemalloc(slab);
+ folio->mapping = NULL;
+ /* Make the mapping reset visible before clearing the flag */
+ smp_wmb();
+ __folio_clear_slab(folio);
+ mm_account_reclaimed_pages(pages);
+ unaccount_slab(slab, order, s);
+ __free_pages(&folio->page, order);
}
static void rcu_free_slab(struct rcu_head *h)
{
- struct page *page = container_of(h, struct page, rcu_head);
+ struct slab *slab = container_of(h, struct slab, rcu_head);
- __free_slab(page->slab_cache, page);
+ __free_slab(slab->slab_cache, slab);
}
-static void free_slab(struct kmem_cache *s, struct page *page)
+static void free_slab(struct kmem_cache *s, struct slab *slab)
{
- if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
- call_rcu(&page->rcu_head, rcu_free_slab);
- } else
- __free_slab(s, page);
+ if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
+ void *p;
+
+ slab_pad_check(s, slab);
+ for_each_object(p, s, slab_address(slab), slab->objects)
+ check_object(s, slab, p, SLUB_RED_INACTIVE);
+ }
+
+ if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU))
+ call_rcu(&slab->rcu_head, rcu_free_slab);
+ else
+ __free_slab(s, slab);
}
-static void discard_slab(struct kmem_cache *s, struct page *page)
+static void discard_slab(struct kmem_cache *s, struct slab *slab)
{
- dec_slabs_node(s, page_to_nid(page), page->objects);
- free_slab(s, page);
+ dec_slabs_node(s, slab_nid(slab), slab->objects);
+ free_slab(s, slab);
}
/*
* Management of partially allocated slabs.
*/
static inline void
-__add_partial(struct kmem_cache_node *n, struct page *page, int tail)
+__add_partial(struct kmem_cache_node *n, struct slab *slab, int tail)
{
n->nr_partial++;
if (tail == DEACTIVATE_TO_TAIL)
- list_add_tail(&page->slab_list, &n->partial);
+ list_add_tail(&slab->slab_list, &n->partial);
else
- list_add(&page->slab_list, &n->partial);
+ list_add(&slab->slab_list, &n->partial);
}
static inline void add_partial(struct kmem_cache_node *n,
- struct page *page, int tail)
+ struct slab *slab, int tail)
{
lockdep_assert_held(&n->list_lock);
- __add_partial(n, page, tail);
+ __add_partial(n, slab, tail);
}
static inline void remove_partial(struct kmem_cache_node *n,
- struct page *page)
+ struct slab *slab)
{
lockdep_assert_held(&n->list_lock);
- list_del(&page->slab_list);
+ list_del(&slab->slab_list);
n->nr_partial--;
}
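/*
 * Illustrative sketch (not from this patch) of the partial-list bookkeeping in
 * __add_partial()/remove_partial() above: slabs with free objects sit on a
 * per-node list, get added to the head or tail depending on the deactivation
 * hint, and nr_partial tracks the list length. A plain circular doubly linked
 * list stands in for the kernel's list_head; the toy_* names are hypothetical.
 */
#include <stdbool.h>

struct toy_slab {
	struct toy_slab *prev, *next;
};

struct toy_node {
	struct toy_slab partial;	/* list head (sentinel) */
	unsigned long nr_partial;
};

static void toy_node_init(struct toy_node *n)
{
	n->partial.prev = n->partial.next = &n->partial;
	n->nr_partial = 0;
}

static void toy_add_partial(struct toy_node *n, struct toy_slab *s, bool to_tail)
{
	struct toy_slab *pos = to_tail ? n->partial.prev : &n->partial;

	/* insert s right after pos */
	s->prev = pos;
	s->next = pos->next;
	pos->next->prev = s;
	pos->next = s;
	n->nr_partial++;
}

static void toy_remove_partial(struct toy_node *n, struct toy_slab *s)
{
	s->prev->next = s->next;
	s->next->prev = s->prev;
	n->nr_partial--;
}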
/*
+ * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a
+ * slab from the n->partial list. Remove only a single object from the slab, do
+ * the alloc_debug_processing() checks and leave the slab on the list, or move
+ * it to the full list if it was the last free object.
+ */
+static void *alloc_single_from_partial(struct kmem_cache *s,
+ struct kmem_cache_node *n, struct slab *slab, int orig_size)
+{
+ void *object;
+
+ lockdep_assert_held(&n->list_lock);
+
+ object = slab->freelist;
+ slab->freelist = get_freepointer(s, object);
+ slab->inuse++;
+
+ if (!alloc_debug_processing(s, slab, object, orig_size)) {
+ remove_partial(n, slab);
+ return NULL;
+ }
+
+ if (slab->inuse == slab->objects) {
+ remove_partial(n, slab);
+ add_full(s, n, slab);
+ }
+
+ return object;
+}
+
+/*
+ * Called only for kmem_cache_debug() caches to allocate from a freshly
+ * allocated slab. Allocate a single object instead of the whole freelist
+ * and put the slab on the partial (or full) list.
+ */
+static void *alloc_single_from_new_slab(struct kmem_cache *s,
+ struct slab *slab, int orig_size)
+{
+ int nid = slab_nid(slab);
+ struct kmem_cache_node *n = get_node(s, nid);
+ unsigned long flags;
+ void *object;
+
+
+ object = slab->freelist;
+ slab->freelist = get_freepointer(s, object);
+ slab->inuse = 1;
+
+ if (!alloc_debug_processing(s, slab, object, orig_size))
+ /*
+ * It's not really expected that this would fail on a
+ * freshly allocated slab, but a concurrent memory
+ * corruption in theory could cause that.
+ */
+ return NULL;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ if (slab->inuse == slab->objects)
+ add_full(s, n, slab);
+ else
+ add_partial(n, slab, DEACTIVATE_TO_HEAD);
+
+ inc_slabs_node(s, nid, slab->objects);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ return object;
+}
+
+/*
* Remove slab from the partial list, freeze it and
* return the pointer to the freelist.
*
* Returns a list of objects or NULL if it fails.
*/
static inline void *acquire_slab(struct kmem_cache *s,
- struct kmem_cache_node *n, struct page *page,
- int mode, int *objects)
+ struct kmem_cache_node *n, struct slab *slab,
+ int mode)
{
void *freelist;
unsigned long counters;
- struct page new;
+ struct slab new;
lockdep_assert_held(&n->list_lock);
@@ -1914,12 +2226,11 @@ static inline void *acquire_slab(struct kmem_cache *s,
* The old freelist is the list of objects for the
* per cpu allocation list.
*/
- freelist = page->freelist;
- counters = page->counters;
+ freelist = slab->freelist;
+ counters = slab->counters;
new.counters = counters;
- *objects = new.objects - new.inuse;
if (mode) {
- new.inuse = page->objects;
+ new.inuse = slab->objects;
new.freelist = NULL;
} else {
new.freelist = freelist;
@@ -1928,80 +2239,96 @@ static inline void *acquire_slab(struct kmem_cache *s,
VM_BUG_ON(new.frozen);
new.frozen = 1;
- if (!__cmpxchg_double_slab(s, page,
+ if (!__slab_update_freelist(s, slab,
freelist, counters,
new.freelist, new.counters,
"acquire_slab"))
return NULL;
- remove_partial(n, page);
+ remove_partial(n, slab);
WARN_ON(!freelist);
return freelist;
}
-static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
-static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain);
+#else
+static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
+ int drain) { }
+#endif
+static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
/*
* Try to allocate a partial slab from a specific node.
*/
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
- struct kmem_cache_cpu *c, gfp_t flags)
+ struct partial_context *pc)
{
- struct page *page, *page2;
+ struct slab *slab, *slab2;
void *object = NULL;
- unsigned int available = 0;
- int objects;
+ unsigned long flags;
+ unsigned int partial_slabs = 0;
/*
* Racy check. If we mistakenly see no partial slabs then we
* just allocate an empty slab. If we mistakenly try to get a
- * partial slab and there is none available then get_partials()
+ * partial slab and there is none available then get_partial()
* will return NULL.
*/
if (!n || !n->nr_partial)
return NULL;
- spin_lock(&n->list_lock);
- list_for_each_entry_safe(page, page2, &n->partial, slab_list) {
+ spin_lock_irqsave(&n->list_lock, flags);
+ list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
void *t;
- if (!pfmemalloc_match(page, flags))
+ if (!pfmemalloc_match(slab, pc->flags))
continue;
- t = acquire_slab(s, n, page, object == NULL, &objects);
+ if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
+ object = alloc_single_from_partial(s, n, slab,
+ pc->orig_size);
+ if (object)
+ break;
+ continue;
+ }
+
+ t = acquire_slab(s, n, slab, object == NULL);
if (!t)
break;
- available += objects;
if (!object) {
- c->page = page;
+ *pc->slab = slab;
stat(s, ALLOC_FROM_PARTIAL);
object = t;
} else {
- put_cpu_partial(s, page, 0);
+ put_cpu_partial(s, slab, 0);
stat(s, CPU_PARTIAL_NODE);
+ partial_slabs++;
}
+#ifdef CONFIG_SLUB_CPU_PARTIAL
if (!kmem_cache_has_cpu_partial(s)
- || available > slub_cpu_partial(s) / 2)
+ || partial_slabs > s->cpu_partial_slabs / 2)
break;
+#else
+ break;
+#endif
}
- spin_unlock(&n->list_lock);
+ spin_unlock_irqrestore(&n->list_lock, flags);
return object;
}
/*
- * Get a page from somewhere. Search in increasing NUMA distances.
+ * Get a slab from somewhere. Search in increasing NUMA distances.
*/
-static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
- struct kmem_cache_cpu *c)
+static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc)
{
#ifdef CONFIG_NUMA
struct zonelist *zonelist;
struct zoneref *z;
struct zone *zone;
- enum zone_type highest_zoneidx = gfp_zone(flags);
+ enum zone_type highest_zoneidx = gfp_zone(pc->flags);
void *object;
unsigned int cpuset_mems_cookie;
@@ -2029,15 +2356,15 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
do {
cpuset_mems_cookie = read_mems_allowed_begin();
- zonelist = node_zonelist(mempolicy_slab_node(), flags);
+ zonelist = node_zonelist(mempolicy_slab_node(), pc->flags);
for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
struct kmem_cache_node *n;
n = get_node(s, zone_to_nid(zone));
- if (n && cpuset_zone_allowed(zone, flags) &&
+ if (n && cpuset_zone_allowed(zone, pc->flags) &&
n->nr_partial > s->min_partial) {
- object = get_partial_node(s, n, c, flags);
+ object = get_partial_node(s, n, pc);
if (object) {
/*
* Don't check read_mems_allowed_retry()
@@ -2056,10 +2383,9 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
}
/*
- * Get a partial page, lock it and return it.
+ * Get a partial slab, lock it and return it.
*/
-static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
- struct kmem_cache_cpu *c)
+static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc)
{
void *object;
int searchnode = node;
@@ -2067,13 +2393,15 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
if (node == NUMA_NO_NODE)
searchnode = numa_mem_id();
- object = get_partial_node(s, get_node(s, searchnode), c, flags);
+ object = get_partial_node(s, get_node(s, searchnode), pc);
if (object || node != NUMA_NO_NODE)
return object;
- return get_any_partial(s, flags, c);
+ return get_any_partial(s, pc);
}
+#ifndef CONFIG_SLUB_TINY
+
#ifdef CONFIG_PREEMPTION
/*
* Calculate the next globally unique transaction for disambiguation
@@ -2087,7 +2415,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
* different cpus.
*/
#define TID_STEP 1
-#endif
+#endif /* CONFIG_PREEMPTION */
static inline unsigned long next_tid(unsigned long tid)
{
@@ -2138,195 +2466,155 @@ static inline void note_cmpxchg_failure(const char *n,
static void init_kmem_cache_cpus(struct kmem_cache *s)
{
int cpu;
+ struct kmem_cache_cpu *c;
- for_each_possible_cpu(cpu)
- per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(s->cpu_slab, cpu);
+ local_lock_init(&c->lock);
+ c->tid = init_tid(cpu);
+ }
}
/*
- * Remove the cpu slab
+ * Finishes removing the cpu slab. Merges the cpu's freelist with the slab's
+ * freelist, unfreezes the slab and puts it on the proper list.
+ * Assumes the slab has already been safely taken away from kmem_cache_cpu
+ * by the caller.
*/
-static void deactivate_slab(struct kmem_cache *s, struct page *page,
- void *freelist, struct kmem_cache_cpu *c)
-{
- enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
- struct kmem_cache_node *n = get_node(s, page_to_nid(page));
- int lock = 0;
- enum slab_modes l = M_NONE, m = M_NONE;
- void *nextfree;
+static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
+ void *freelist)
+{
+ enum slab_modes { M_NONE, M_PARTIAL, M_FREE, M_FULL_NOLIST };
+ struct kmem_cache_node *n = get_node(s, slab_nid(slab));
+ int free_delta = 0;
+ enum slab_modes mode = M_NONE;
+ void *nextfree, *freelist_iter, *freelist_tail;
int tail = DEACTIVATE_TO_HEAD;
- struct page new;
- struct page old;
+ unsigned long flags = 0;
+ struct slab new;
+ struct slab old;
- if (page->freelist) {
+ if (slab->freelist) {
stat(s, DEACTIVATE_REMOTE_FREES);
tail = DEACTIVATE_TO_TAIL;
}
/*
- * Stage one: Free all available per cpu objects back
- * to the page freelist while it is still frozen. Leave the
- * last one.
- *
- * There is no need to take the list->lock because the page
- * is still frozen.
+ * Stage one: Count the objects on cpu's freelist as free_delta and
+ * remember the last object in freelist_tail for later splicing.
*/
- while (freelist && (nextfree = get_freepointer(s, freelist))) {
- void *prior;
- unsigned long counters;
+ freelist_tail = NULL;
+ freelist_iter = freelist;
+ while (freelist_iter) {
+ nextfree = get_freepointer(s, freelist_iter);
/*
* If 'nextfree' is invalid, it is possible that the object at
- * 'freelist' is already corrupted. So isolate all objects
- * starting at 'freelist'.
+ * 'freelist_iter' is already corrupted. So isolate all objects
+ * starting at 'freelist_iter' by skipping them.
*/
- if (freelist_corrupted(s, page, &freelist, nextfree))
+ if (freelist_corrupted(s, slab, &freelist_iter, nextfree))
break;
- do {
- prior = page->freelist;
- counters = page->counters;
- set_freepointer(s, freelist, prior);
- new.counters = counters;
- new.inuse--;
- VM_BUG_ON(!new.frozen);
+ freelist_tail = freelist_iter;
+ free_delta++;
- } while (!__cmpxchg_double_slab(s, page,
- prior, counters,
- freelist, new.counters,
- "drain percpu freelist"));
-
- freelist = nextfree;
+ freelist_iter = nextfree;
}
/*
- * Stage two: Ensure that the page is unfrozen while the
- * list presence reflects the actual number of objects
- * during unfreeze.
+ * Stage two: Unfreeze the slab while splicing the per-cpu
+ * freelist to the head of slab's freelist.
*
- * We setup the list membership and then perform a cmpxchg
- * with the count. If there is a mismatch then the page
- * is not unfrozen but the page is on the wrong list.
+ * Ensure that the slab is unfrozen while the list presence
+ * reflects the actual number of objects during unfreeze.
*
- * Then we restart the process which may have to remove
- * the page from the list that we just put it on again
- * because the number of objects in the slab may have
- * changed.
+ * We first perform the cmpxchg holding the lock and insert to the list
+ * when it succeeds. If there is a mismatch then the slab is not
+ * unfrozen and the number of objects in the slab may have changed.
+ * Then release the lock and retry the cmpxchg again.
*/
redo:
- old.freelist = page->freelist;
- old.counters = page->counters;
+ old.freelist = READ_ONCE(slab->freelist);
+ old.counters = READ_ONCE(slab->counters);
VM_BUG_ON(!old.frozen);
/* Determine target state of the slab */
new.counters = old.counters;
- if (freelist) {
- new.inuse--;
- set_freepointer(s, freelist, old.freelist);
+ if (freelist_tail) {
+ new.inuse -= free_delta;
+ set_freepointer(s, freelist_tail, old.freelist);
new.freelist = freelist;
} else
new.freelist = old.freelist;
new.frozen = 0;
- if (!new.inuse && n->nr_partial >= s->min_partial)
- m = M_FREE;
- else if (new.freelist) {
- m = M_PARTIAL;
- if (!lock) {
- lock = 1;
- /*
- * Taking the spinlock removes the possibility
- * that acquire_slab() will see a slab page that
- * is frozen
- */
- spin_lock(&n->list_lock);
- }
+ if (!new.inuse && n->nr_partial >= s->min_partial) {
+ mode = M_FREE;
+ } else if (new.freelist) {
+ mode = M_PARTIAL;
+ /*
+ * Taking the spinlock removes the possibility that
+ * acquire_slab() will see a slab that is frozen
+ */
+ spin_lock_irqsave(&n->list_lock, flags);
} else {
- m = M_FULL;
-#ifdef CONFIG_SLUB_DEBUG
- if ((s->flags & SLAB_STORE_USER) && !lock) {
- lock = 1;
- /*
- * This also ensures that the scanning of full
- * slabs from diagnostic functions will not see
- * any frozen slabs.
- */
- spin_lock(&n->list_lock);
- }
-#endif
+ mode = M_FULL_NOLIST;
}
- if (l != m) {
- if (l == M_PARTIAL)
- remove_partial(n, page);
- else if (l == M_FULL)
- remove_full(s, n, page);
-
- if (m == M_PARTIAL)
- add_partial(n, page, tail);
- else if (m == M_FULL)
- add_full(s, n, page);
- }
- l = m;
- if (!__cmpxchg_double_slab(s, page,
+ if (!slab_update_freelist(s, slab,
old.freelist, old.counters,
new.freelist, new.counters,
- "unfreezing slab"))
+ "unfreezing slab")) {
+ if (mode == M_PARTIAL)
+ spin_unlock_irqrestore(&n->list_lock, flags);
goto redo;
+ }
- if (lock)
- spin_unlock(&n->list_lock);
- if (m == M_PARTIAL)
+ if (mode == M_PARTIAL) {
+ add_partial(n, slab, tail);
+ spin_unlock_irqrestore(&n->list_lock, flags);
stat(s, tail);
- else if (m == M_FULL)
- stat(s, DEACTIVATE_FULL);
- else if (m == M_FREE) {
+ } else if (mode == M_FREE) {
stat(s, DEACTIVATE_EMPTY);
- discard_slab(s, page);
+ discard_slab(s, slab);
stat(s, FREE_SLAB);
+ } else if (mode == M_FULL_NOLIST) {
+ stat(s, DEACTIVATE_FULL);
}
-
- c->page = NULL;
- c->freelist = NULL;
}
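/*
 * Illustrative sketch, not from this patch: the decision deactivate_slab()
 * above makes for the outgoing cpu slab. The enum and helper below are
 * hypothetical stand-ins that only restate the M_FREE / M_PARTIAL /
 * M_FULL_NOLIST choice.
 */
enum deactivate_target { TARGET_FREE, TARGET_PARTIAL, TARGET_FULL_NOLIST };

static enum deactivate_target
pick_deactivate_target(bool slab_empty, unsigned long nr_partial,
		       unsigned long min_partial, bool has_free_objects)
{
	if (slab_empty && nr_partial >= min_partial)
		return TARGET_FREE;		/* enough partial slabs cached: discard it */
	if (has_free_objects)
		return TARGET_PARTIAL;		/* free objects remain: add to the node partial list */
	return TARGET_FULL_NOLIST;		/* fully allocated: only a statistic, no list */
}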
-/*
- * Unfreeze all the cpu partial slabs.
- *
- * This function must be called with interrupts disabled
- * for the cpu using c (or some other guarantee must be there
- * to guarantee no concurrent accesses).
- */
-static void unfreeze_partials(struct kmem_cache *s,
- struct kmem_cache_cpu *c)
-{
#ifdef CONFIG_SLUB_CPU_PARTIAL
+static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab)
+{
struct kmem_cache_node *n = NULL, *n2 = NULL;
- struct page *page, *discard_page = NULL;
+ struct slab *slab, *slab_to_discard = NULL;
+ unsigned long flags = 0;
- while ((page = slub_percpu_partial(c))) {
- struct page new;
- struct page old;
+ while (partial_slab) {
+ struct slab new;
+ struct slab old;
- slub_set_percpu_partial(c, page);
+ slab = partial_slab;
+ partial_slab = slab->next;
- n2 = get_node(s, page_to_nid(page));
+ n2 = get_node(s, slab_nid(slab));
if (n != n2) {
if (n)
- spin_unlock(&n->list_lock);
+ spin_unlock_irqrestore(&n->list_lock, flags);
n = n2;
- spin_lock(&n->list_lock);
+ spin_lock_irqsave(&n->list_lock, flags);
}
do {
- old.freelist = page->freelist;
- old.counters = page->counters;
+ old.freelist = slab->freelist;
+ old.counters = slab->counters;
VM_BUG_ON(!old.frozen);
new.counters = old.counters;
@@ -2334,134 +2622,231 @@ static void unfreeze_partials(struct kmem_cache *s,
new.frozen = 0;
- } while (!__cmpxchg_double_slab(s, page,
+ } while (!__slab_update_freelist(s, slab,
old.freelist, old.counters,
new.freelist, new.counters,
"unfreezing slab"));
if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
- page->next = discard_page;
- discard_page = page;
+ slab->next = slab_to_discard;
+ slab_to_discard = slab;
} else {
- add_partial(n, page, DEACTIVATE_TO_TAIL);
+ add_partial(n, slab, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
}
if (n)
- spin_unlock(&n->list_lock);
+ spin_unlock_irqrestore(&n->list_lock, flags);
- while (discard_page) {
- page = discard_page;
- discard_page = discard_page->next;
+ while (slab_to_discard) {
+ slab = slab_to_discard;
+ slab_to_discard = slab_to_discard->next;
stat(s, DEACTIVATE_EMPTY);
- discard_slab(s, page);
+ discard_slab(s, slab);
stat(s, FREE_SLAB);
}
-#endif /* CONFIG_SLUB_CPU_PARTIAL */
}
/*
- * Put a page that was just frozen (in __slab_free|get_partial_node) into a
- * partial page slot if available.
+ * Unfreeze all the cpu partial slabs.
+ */
+static void unfreeze_partials(struct kmem_cache *s)
+{
+ struct slab *partial_slab;
+ unsigned long flags;
+
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
+ partial_slab = this_cpu_read(s->cpu_slab->partial);
+ this_cpu_write(s->cpu_slab->partial, NULL);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+
+ if (partial_slab)
+ __unfreeze_partials(s, partial_slab);
+}
+
+static void unfreeze_partials_cpu(struct kmem_cache *s,
+ struct kmem_cache_cpu *c)
+{
+ struct slab *partial_slab;
+
+ partial_slab = slub_percpu_partial(c);
+ c->partial = NULL;
+
+ if (partial_slab)
+ __unfreeze_partials(s, partial_slab);
+}
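/*
 * Illustrative sketch, not from this patch: the "detach under the local
 * lock, process after dropping it" shape that unfreeze_partials() above
 * follows. struct my_pcpu, the my_data per-cpu variable and my_process()
 * are hypothetical stand-ins.
 */
static void my_process(struct slab *list);	/* hypothetical consumer */

struct my_pcpu {
	local_lock_t lock;
	struct slab *partial;
};
static DEFINE_PER_CPU(struct my_pcpu, my_data) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static void my_drain(void)
{
	struct slab *list;
	unsigned long flags;

	local_lock_irqsave(&my_data.lock, flags);
	list = this_cpu_read(my_data.partial);		/* detach the whole list */
	this_cpu_write(my_data.partial, NULL);
	local_unlock_irqrestore(&my_data.lock, flags);

	if (list)
		my_process(list);	/* may now take n->list_lock, free pages, etc. */
}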
+
+/*
+ * Put a slab that was just frozen (in __slab_free|get_partial_node) into a
+ * partial slab slot if available.
*
* If we did not find a slot then simply move all the partials to the
* per node partial list.
*/
-static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
+static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain)
{
-#ifdef CONFIG_SLUB_CPU_PARTIAL
- struct page *oldpage;
- int pages;
- int pobjects;
+ struct slab *oldslab;
+ struct slab *slab_to_unfreeze = NULL;
+ unsigned long flags;
+ int slabs = 0;
- preempt_disable();
- do {
- pages = 0;
- pobjects = 0;
- oldpage = this_cpu_read(s->cpu_slab->partial);
-
- if (oldpage) {
- pobjects = oldpage->pobjects;
- pages = oldpage->pages;
- if (drain && pobjects > slub_cpu_partial(s)) {
- unsigned long flags;
- /*
- * partial array is full. Move the existing
- * set to the per node partial list.
- */
- local_irq_save(flags);
- unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
- local_irq_restore(flags);
- oldpage = NULL;
- pobjects = 0;
- pages = 0;
- stat(s, CPU_PARTIAL_DRAIN);
- }
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
+
+ oldslab = this_cpu_read(s->cpu_slab->partial);
+
+ if (oldslab) {
+ if (drain && oldslab->slabs >= s->cpu_partial_slabs) {
+ /*
+ * Partial array is full. Move the existing set to the
+ * per node partial list. Postpone the actual unfreezing
+ * until after the critical section.
+ */
+ slab_to_unfreeze = oldslab;
+ oldslab = NULL;
+ } else {
+ slabs = oldslab->slabs;
}
+ }
- pages++;
- pobjects += page->objects - page->inuse;
+ slabs++;
- page->pages = pages;
- page->pobjects = pobjects;
- page->next = oldpage;
+ slab->slabs = slabs;
+ slab->next = oldslab;
- } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
- != oldpage);
- if (unlikely(!slub_cpu_partial(s))) {
- unsigned long flags;
+ this_cpu_write(s->cpu_slab->partial, slab);
- local_irq_save(flags);
- unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
- local_irq_restore(flags);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+
+ if (slab_to_unfreeze) {
+ __unfreeze_partials(s, slab_to_unfreeze);
+ stat(s, CPU_PARTIAL_DRAIN);
}
- preempt_enable();
-#endif /* CONFIG_SLUB_CPU_PARTIAL */
}
+#else /* CONFIG_SLUB_CPU_PARTIAL */
+
+static inline void unfreeze_partials(struct kmem_cache *s) { }
+static inline void unfreeze_partials_cpu(struct kmem_cache *s,
+ struct kmem_cache_cpu *c) { }
+
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
+
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
- stat(s, CPUSLAB_FLUSH);
- deactivate_slab(s, c->page, c->freelist, c);
+ unsigned long flags;
+ struct slab *slab;
+ void *freelist;
+
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
+
+ slab = c->slab;
+ freelist = c->freelist;
+ c->slab = NULL;
+ c->freelist = NULL;
c->tid = next_tid(c->tid);
+
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+
+ if (slab) {
+ deactivate_slab(s, slab, freelist);
+ stat(s, CPUSLAB_FLUSH);
+ }
+}
+
+static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
+{
+ struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+ void *freelist = c->freelist;
+ struct slab *slab = c->slab;
+
+ c->slab = NULL;
+ c->freelist = NULL;
+ c->tid = next_tid(c->tid);
+
+ if (slab) {
+ deactivate_slab(s, slab, freelist);
+ stat(s, CPUSLAB_FLUSH);
+ }
+
+ unfreeze_partials_cpu(s, c);
}
+struct slub_flush_work {
+ struct work_struct work;
+ struct kmem_cache *s;
+ bool skip;
+};
+
/*
* Flush cpu slab.
*
- * Called from IPI handler with interrupts disabled.
+ * Called from CPU work handler with migration disabled.
*/
-static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
+static void flush_cpu_slab(struct work_struct *w)
{
- struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+ struct kmem_cache *s;
+ struct kmem_cache_cpu *c;
+ struct slub_flush_work *sfw;
+
+ sfw = container_of(w, struct slub_flush_work, work);
+
+ s = sfw->s;
+ c = this_cpu_ptr(s->cpu_slab);
- if (c->page)
+ if (c->slab)
flush_slab(s, c);
- unfreeze_partials(s, c);
+ unfreeze_partials(s);
}
-static void flush_cpu_slab(void *d)
+static bool has_cpu_slab(int cpu, struct kmem_cache *s)
{
- struct kmem_cache *s = d;
+ struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
- __flush_cpu_slab(s, smp_processor_id());
+ return c->slab || slub_percpu_partial(c);
}
-static bool has_cpu_slab(int cpu, void *info)
+static DEFINE_MUTEX(flush_lock);
+static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
+
+static void flush_all_cpus_locked(struct kmem_cache *s)
{
- struct kmem_cache *s = info;
- struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+ struct slub_flush_work *sfw;
+ unsigned int cpu;
- return c->page || slub_percpu_partial(c);
+ lockdep_assert_cpus_held();
+ mutex_lock(&flush_lock);
+
+ for_each_online_cpu(cpu) {
+ sfw = &per_cpu(slub_flush, cpu);
+ if (!has_cpu_slab(cpu, s)) {
+ sfw->skip = true;
+ continue;
+ }
+ INIT_WORK(&sfw->work, flush_cpu_slab);
+ sfw->skip = false;
+ sfw->s = s;
+ queue_work_on(cpu, flushwq, &sfw->work);
+ }
+
+ for_each_online_cpu(cpu) {
+ sfw = &per_cpu(slub_flush, cpu);
+ if (sfw->skip)
+ continue;
+ flush_work(&sfw->work);
+ }
+
+ mutex_unlock(&flush_lock);
}
static void flush_all(struct kmem_cache *s)
{
- on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
+ cpus_read_lock();
+ flush_all_cpus_locked(s);
+ cpus_read_unlock();
}
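/*
 * Illustrative sketch, not from this patch: the generic "queue a work item
 * on every CPU that needs it, then wait for all of them" pattern that
 * flush_all_cpus_locked() above uses instead of IPIs. The struct, the
 * per-cpu variable and cpu_needs_flush() are hypothetical; the workqueue
 * calls are the regular kernel API.
 */
static bool cpu_needs_flush(unsigned int cpu);	/* hypothetical predicate */

struct my_flush_work {
	struct work_struct work;
	bool skip;
};
static DEFINE_PER_CPU(struct my_flush_work, my_flush_works);

static void my_flush_fn(struct work_struct *w)
{
	/* per-cpu cleanup runs here, on the CPU the work was queued on */
}

static void my_flush_all(void)
{
	unsigned int cpu;

	for_each_online_cpu(cpu) {
		struct my_flush_work *fw = &per_cpu(my_flush_works, cpu);

		fw->skip = !cpu_needs_flush(cpu);
		if (fw->skip)
			continue;
		INIT_WORK(&fw->work, my_flush_fn);
		queue_work_on(cpu, system_wq, &fw->work);
	}

	for_each_online_cpu(cpu) {
		struct my_flush_work *fw = &per_cpu(my_flush_works, cpu);

		if (!fw->skip)
			flush_work(&fw->work);
	}
}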
/*
@@ -2471,63 +2856,124 @@ static void flush_all(struct kmem_cache *s)
static int slub_cpu_dead(unsigned int cpu)
{
struct kmem_cache *s;
- unsigned long flags;
mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_caches, list) {
- local_irq_save(flags);
+ list_for_each_entry(s, &slab_caches, list)
__flush_cpu_slab(s, cpu);
- local_irq_restore(flags);
- }
mutex_unlock(&slab_mutex);
return 0;
}
+#else /* CONFIG_SLUB_TINY */
+static inline void flush_all_cpus_locked(struct kmem_cache *s) { }
+static inline void flush_all(struct kmem_cache *s) { }
+static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { }
+static inline int slub_cpu_dead(unsigned int cpu) { return 0; }
+#endif /* CONFIG_SLUB_TINY */
+
/*
* Check if the objects in a per cpu structure fit numa
* locality expectations.
*/
-static inline int node_match(struct page *page, int node)
+static inline int node_match(struct slab *slab, int node)
{
#ifdef CONFIG_NUMA
- if (node != NUMA_NO_NODE && page_to_nid(page) != node)
+ if (node != NUMA_NO_NODE && slab_nid(slab) != node)
return 0;
#endif
return 1;
}
#ifdef CONFIG_SLUB_DEBUG
-static int count_free(struct page *page)
+static int count_free(struct slab *slab)
{
- return page->objects - page->inuse;
+ return slab->objects - slab->inuse;
}
static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
{
return atomic_long_read(&n->total_objects);
}
+
+/* Supports checking bulk free of a constructed freelist */
+static inline bool free_debug_processing(struct kmem_cache *s,
+ struct slab *slab, void *head, void *tail, int *bulk_cnt,
+ unsigned long addr, depot_stack_handle_t handle)
+{
+ bool checks_ok = false;
+ void *object = head;
+ int cnt = 0;
+
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!check_slab(s, slab))
+ goto out;
+ }
+
+ if (slab->inuse < *bulk_cnt) {
+ slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n",
+ slab->inuse, *bulk_cnt);
+ goto out;
+ }
+
+next_object:
+
+ if (++cnt > *bulk_cnt)
+ goto out_cnt;
+
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!free_consistency_checks(s, slab, object, addr))
+ goto out;
+ }
+
+ if (s->flags & SLAB_STORE_USER)
+ set_track_update(s, object, TRACK_FREE, addr, handle);
+ trace(s, slab, object, 0);
+ /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
+ init_object(s, object, SLUB_RED_INACTIVE);
+
+ /* Reached end of constructed freelist yet? */
+ if (object != tail) {
+ object = get_freepointer(s, object);
+ goto next_object;
+ }
+ checks_ok = true;
+
+out_cnt:
+ if (cnt != *bulk_cnt) {
+ slab_err(s, slab, "Bulk free expected %d objects but found %d\n",
+ *bulk_cnt, cnt);
+ *bulk_cnt = cnt;
+ }
+
+out:
+
+ if (!checks_ok)
+ slab_fix(s, "Object at 0x%p not freed", object);
+
+ return checks_ok;
+}
#endif /* CONFIG_SLUB_DEBUG */
-#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
+#if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS)
static unsigned long count_partial(struct kmem_cache_node *n,
- int (*get_count)(struct page *))
+ int (*get_count)(struct slab *))
{
unsigned long flags;
unsigned long x = 0;
- struct page *page;
+ struct slab *slab;
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, slab_list)
- x += get_count(page);
+ list_for_each_entry(slab, &n->partial, slab_list)
+ x += get_count(slab);
spin_unlock_irqrestore(&n->list_lock, flags);
return x;
}
-#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
+#endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */
+#ifdef CONFIG_SLUB_DEBUG
static noinline void
slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
{
-#ifdef CONFIG_SLUB_DEBUG
static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
int node;
@@ -2558,79 +3004,60 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n",
node, nr_slabs, nr_objs, nr_free);
}
-#endif
}
+#else /* CONFIG_SLUB_DEBUG */
+static inline void
+slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { }
+#endif
-static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
- int node, struct kmem_cache_cpu **pc)
+static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
{
- void *freelist;
- struct kmem_cache_cpu *c = *pc;
- struct page *page;
-
- WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
-
- freelist = get_partial(s, flags, node, c);
-
- if (freelist)
- return freelist;
-
- page = new_slab(s, flags, node);
- if (page) {
- c = raw_cpu_ptr(s->cpu_slab);
- if (c->page)
- flush_slab(s, c);
-
- /*
- * No other reference to the page yet so we can
- * muck around with it freely without cmpxchg
- */
- freelist = page->freelist;
- page->freelist = NULL;
-
- stat(s, ALLOC_SLAB);
- c->page = page;
- *pc = c;
- }
+ if (unlikely(slab_test_pfmemalloc(slab)))
+ return gfp_pfmemalloc_allowed(gfpflags);
- return freelist;
+ return true;
}
-static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
+#ifndef CONFIG_SLUB_TINY
+static inline bool
+__update_cpu_freelist_fast(struct kmem_cache *s,
+ void *freelist_old, void *freelist_new,
+ unsigned long tid)
{
- if (unlikely(PageSlabPfmemalloc(page)))
- return gfp_pfmemalloc_allowed(gfpflags);
+ freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
+ freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };
- return true;
+ return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid.full,
+ &old.full, new.full);
}
/*
- * Check the page->freelist of a page and either transfer the freelist to the
- * per cpu freelist or deactivate the page.
- *
- * The page is still frozen if the return value is not NULL.
+ * Check the slab->freelist and either transfer the freelist to the
+ * per cpu freelist or deactivate the slab.
*
- * If this function returns NULL then the page has been unfrozen.
+ * The slab is still frozen if the return value is not NULL.
*
- * This function must be called with interrupt disabled.
+ * If this function returns NULL then the slab has been unfrozen.
*/
-static inline void *get_freelist(struct kmem_cache *s, struct page *page)
+static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
{
- struct page new;
+ struct slab new;
unsigned long counters;
void *freelist;
+ lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
+
do {
- freelist = page->freelist;
- counters = page->counters;
+ freelist = slab->freelist;
+ counters = slab->counters;
new.counters = counters;
VM_BUG_ON(!new.frozen);
- new.inuse = page->objects;
+ new.inuse = slab->objects;
new.frozen = freelist != NULL;
- } while (!__cmpxchg_double_slab(s, page,
+ } while (!__slab_update_freelist(s, slab,
freelist, counters,
NULL, new.counters,
"get_freelist"));
@@ -2654,42 +3081,44 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
* we need to allocate a new slab. This is the slowest path since it involves
* a call to the page allocator and the setup of a new slab.
*
- * Version of __slab_alloc to use when we know that interrupts are
+ * Version of __slab_alloc to use when we know that preemption is
* already disabled (which is the case for bulk allocation).
*/
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
- unsigned long addr, struct kmem_cache_cpu *c)
+ unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
void *freelist;
- struct page *page;
+ struct slab *slab;
+ unsigned long flags;
+ struct partial_context pc;
stat(s, ALLOC_SLOWPATH);
- page = c->page;
- if (!page) {
+reread_slab:
+
+ slab = READ_ONCE(c->slab);
+ if (!slab) {
/*
* if the node is not online or has no normal memory, just
* ignore the node constraint
*/
if (unlikely(node != NUMA_NO_NODE &&
- !node_state(node, N_NORMAL_MEMORY)))
+ !node_isset(node, slab_nodes)))
node = NUMA_NO_NODE;
goto new_slab;
}
redo:
- if (unlikely(!node_match(page, node))) {
+ if (unlikely(!node_match(slab, node))) {
/*
* same as above but node_match() being false already
* implies node != NUMA_NO_NODE
*/
- if (!node_state(node, N_NORMAL_MEMORY)) {
+ if (!node_isset(node, slab_nodes)) {
node = NUMA_NO_NODE;
- goto redo;
} else {
stat(s, ALLOC_NODE_MISMATCH);
- deactivate_slab(s, page, c->freelist, c);
- goto new_slab;
+ goto deactivate_slab;
}
}
@@ -2698,20 +3127,25 @@ redo:
* PFMEMALLOC but right now, we are losing the pfmemalloc
* information when the page leaves the per-cpu allocator
*/
- if (unlikely(!pfmemalloc_match(page, gfpflags))) {
- deactivate_slab(s, page, c->freelist, c);
- goto new_slab;
+ if (unlikely(!pfmemalloc_match(slab, gfpflags)))
+ goto deactivate_slab;
+
+ /* must check again c->slab in case we got preempted and it changed */
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
+ if (unlikely(slab != c->slab)) {
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ goto reread_slab;
}
-
- /* must check again c->freelist in case of cpu migration or IRQ */
freelist = c->freelist;
if (freelist)
goto load_freelist;
- freelist = get_freelist(s, page);
+ freelist = get_freelist(s, slab);
if (!freelist) {
- c->page = NULL;
+ c->slab = NULL;
+ c->tid = next_tid(c->tid);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
}
@@ -2719,103 +3153,179 @@ redo:
stat(s, ALLOC_REFILL);
load_freelist:
+
+ lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
+
/*
* freelist is pointing to the list of objects to be used.
- * page is pointing to the page from which the objects are obtained.
- * That page must be frozen for per cpu allocations to work.
+ * slab is pointing to the slab from which the objects are obtained.
+ * That slab must be frozen for per cpu allocations to work.
*/
- VM_BUG_ON(!c->page->frozen);
+ VM_BUG_ON(!c->slab->frozen);
c->freelist = get_freepointer(s, freelist);
c->tid = next_tid(c->tid);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
return freelist;
+deactivate_slab:
+
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
+ if (slab != c->slab) {
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ goto reread_slab;
+ }
+ freelist = c->freelist;
+ c->slab = NULL;
+ c->freelist = NULL;
+ c->tid = next_tid(c->tid);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ deactivate_slab(s, slab, freelist);
+
new_slab:
if (slub_percpu_partial(c)) {
- page = c->page = slub_percpu_partial(c);
- slub_set_percpu_partial(c, page);
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
+ if (unlikely(c->slab)) {
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ goto reread_slab;
+ }
+ if (unlikely(!slub_percpu_partial(c))) {
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ /* we were preempted and partial list got empty */
+ goto new_objects;
+ }
+
+ slab = c->slab = slub_percpu_partial(c);
+ slub_set_percpu_partial(c, slab);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
stat(s, CPU_PARTIAL_ALLOC);
goto redo;
}
- freelist = new_slab_objects(s, gfpflags, node, &c);
+new_objects:
+
+ pc.flags = gfpflags;
+ pc.slab = &slab;
+ pc.orig_size = orig_size;
+ freelist = get_partial(s, node, &pc);
+ if (freelist)
+ goto check_new_slab;
+
+ slub_put_cpu_ptr(s->cpu_slab);
+ slab = new_slab(s, gfpflags, node);
+ c = slub_get_cpu_ptr(s->cpu_slab);
- if (unlikely(!freelist)) {
+ if (unlikely(!slab)) {
slab_out_of_memory(s, gfpflags, node);
return NULL;
}
- page = c->page;
- if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
- goto load_freelist;
+ stat(s, ALLOC_SLAB);
- /* Only entered in the debug case */
- if (kmem_cache_debug(s) &&
- !alloc_debug_processing(s, page, freelist, addr))
- goto new_slab; /* Slab failed checks. Next slab needed */
+ if (kmem_cache_debug(s)) {
+ freelist = alloc_single_from_new_slab(s, slab, orig_size);
- deactivate_slab(s, page, get_freepointer(s, freelist), c);
- return freelist;
+ if (unlikely(!freelist))
+ goto new_objects;
+
+ if (s->flags & SLAB_STORE_USER)
+ set_track(s, freelist, TRACK_ALLOC, addr);
+
+ return freelist;
+ }
+
+ /*
+ * No other reference to the slab yet so we can
+ * muck around with it freely without cmpxchg
+ */
+ freelist = slab->freelist;
+ slab->freelist = NULL;
+ slab->inuse = slab->objects;
+ slab->frozen = 1;
+
+ inc_slabs_node(s, slab_nid(slab), slab->objects);
+
+check_new_slab:
+
+ if (kmem_cache_debug(s)) {
+ /*
+ * For debug caches here we had to go through
+ * alloc_single_from_partial() so just store the tracking info
+ * and return the object
+ */
+ if (s->flags & SLAB_STORE_USER)
+ set_track(s, freelist, TRACK_ALLOC, addr);
+
+ return freelist;
+ }
+
+ if (unlikely(!pfmemalloc_match(slab, gfpflags))) {
+ /*
+ * For !pfmemalloc_match() case we don't load freelist so that
+ * we don't make further mismatched allocations easier.
+ */
+ deactivate_slab(s, slab, get_freepointer(s, freelist));
+ return freelist;
+ }
+
+retry_load_slab:
+
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
+ if (unlikely(c->slab)) {
+ void *flush_freelist = c->freelist;
+ struct slab *flush_slab = c->slab;
+
+ c->slab = NULL;
+ c->freelist = NULL;
+ c->tid = next_tid(c->tid);
+
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+
+ deactivate_slab(s, flush_slab, flush_freelist);
+
+ stat(s, CPUSLAB_FLUSH);
+
+ goto retry_load_slab;
+ }
+ c->slab = slab;
+
+ goto load_freelist;
}
/*
- * Another one that disabled interrupt and compensates for possible
- * cpu changes by refetching the per cpu area pointer.
+ * A wrapper for ___slab_alloc() for contexts where preemption is not yet
+ * disabled. Compensates for possible cpu changes by refetching the per cpu area
+ * pointer.
*/
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
- unsigned long addr, struct kmem_cache_cpu *c)
+ unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
void *p;
- unsigned long flags;
- local_irq_save(flags);
-#ifdef CONFIG_PREEMPTION
+#ifdef CONFIG_PREEMPT_COUNT
/*
* We may have been preempted and rescheduled on a different
- * cpu before disabling interrupts. Need to reload cpu area
+ * cpu before disabling preemption. Need to reload cpu area
* pointer.
*/
- c = this_cpu_ptr(s->cpu_slab);
+ c = slub_get_cpu_ptr(s->cpu_slab);
#endif
- p = ___slab_alloc(s, gfpflags, node, addr, c);
- local_irq_restore(flags);
+ p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
+#ifdef CONFIG_PREEMPT_COUNT
+ slub_put_cpu_ptr(s->cpu_slab);
+#endif
return p;
}
-/*
- * If the object has been wiped upon free, make sure it's fully initialized by
- * zeroing out freelist pointer.
- */
-static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
- void *obj)
-{
- if (unlikely(slab_want_init_on_free(s)) && obj)
- memset((void *)((char *)obj + s->offset), 0, sizeof(void *));
-}
-
-/*
- * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
- * have the fastpath folded into their functions. So no function call
- * overhead for requests that can be satisfied on the fastpath.
- *
- * The fastpath works by first checking if the lockless freelist can be used.
- * If not then __slab_alloc is called for slow processing.
- *
- * Otherwise we can simply pick the next object from the lockless free list.
- */
-static __always_inline void *slab_alloc_node(struct kmem_cache *s,
- gfp_t gfpflags, int node, unsigned long addr)
+static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
+ gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
- void *object;
struct kmem_cache_cpu *c;
- struct page *page;
+ struct slab *slab;
unsigned long tid;
- struct obj_cgroup *objcg = NULL;
+ void *object;
- s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
- if (!s)
- return NULL;
redo:
/*
* Must read kmem_cache cpu data via this cpu ptr. Preemption is
@@ -2823,22 +3333,21 @@ redo:
* reading from one cpu area. That does not matter as long
* as we end up on the original cpu again when doing the cmpxchg.
*
- * We should guarantee that tid and kmem_cache are retrieved on
- * the same cpu. It could be different if CONFIG_PREEMPTION so we need
- * to check if it is matched or not.
+ * We must guarantee that tid and kmem_cache_cpu are retrieved on the
+ * same cpu. We first read the kmem_cache_cpu pointer and use it to read
+ * the tid. If we are preempted and switched to another cpu between the
+ * two reads, it's OK as the two are still associated with the same cpu
+ * and cmpxchg later will validate the cpu.
*/
- do {
- tid = this_cpu_read(s->cpu_slab->tid);
- c = raw_cpu_ptr(s->cpu_slab);
- } while (IS_ENABLED(CONFIG_PREEMPTION) &&
- unlikely(tid != READ_ONCE(c->tid)));
+ c = raw_cpu_ptr(s->cpu_slab);
+ tid = READ_ONCE(c->tid);
/*
* Irqless object alloc/free algorithm used here depends on sequence
* of fetching cpu_slab's data. tid should be fetched before anything
- * on c to guarantee that object and page associated with previous tid
+ * on c to guarantee that object and slab associated with previous tid
* won't be used with current tid. If we fetch tid first, object and
- * page could be one associated with next tid and our alloc/free
+ * slab could be one associated with next tid and our alloc/free
* request will be failed. In this case, we will retry. So, no problem.
*/
barrier();
@@ -2851,9 +3360,11 @@ redo:
*/
object = c->freelist;
- page = c->page;
- if (unlikely(!object || !node_match(page, node))) {
- object = __slab_alloc(s, gfpflags, node, addr, c);
+ slab = c->slab;
+
+ if (!USE_LOCKLESS_FAST_PATH() ||
+ unlikely(!object || !slab || !node_match(slab, node))) {
+ object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
} else {
void *next_object = get_freepointer_safe(s, object);
@@ -2871,11 +3382,7 @@ redo:
* against code executing on this cpu *not* from access by
* other cpus.
*/
- if (unlikely(!this_cpu_cmpxchg_double(
- s->cpu_slab->freelist, s->cpu_slab->tid,
- object, tid,
- next_object, next_tid(tid)))) {
-
+ if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
note_cmpxchg_failure("slab_alloc", s, tid);
goto redo;
}
@@ -2883,106 +3390,234 @@ redo:
stat(s, ALLOC_FASTPATH);
}
- maybe_wipe_obj_freeptr(s, object);
+ return object;
+}
+#else /* CONFIG_SLUB_TINY */
+static void *__slab_alloc_node(struct kmem_cache *s,
+ gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
+{
+ struct partial_context pc;
+ struct slab *slab;
+ void *object;
+
+ pc.flags = gfpflags;
+ pc.slab = &slab;
+ pc.orig_size = orig_size;
+ object = get_partial(s, node, &pc);
+
+ if (object)
+ return object;
+
+ slab = new_slab(s, gfpflags, node);
+ if (unlikely(!slab)) {
+ slab_out_of_memory(s, gfpflags, node);
+ return NULL;
+ }
+
+ object = alloc_single_from_new_slab(s, slab, orig_size);
+
+ return object;
+}
+#endif /* CONFIG_SLUB_TINY */
+
+/*
+ * If the object has been wiped upon free, make sure it's fully initialized by
+ * zeroing out freelist pointer.
+ */
+static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
+ void *obj)
+{
+ if (unlikely(slab_want_init_on_free(s)) && obj)
+ memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
+ 0, sizeof(void *));
+}
- if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
- memset(object, 0, s->object_size);
+/*
+ * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
+ * have the fastpath folded into their functions. So no function call
+ * overhead for requests that can be satisfied on the fastpath.
+ *
+ * The fastpath works by first checking if the lockless freelist can be used.
+ * If not then __slab_alloc is called for slow processing.
+ *
+ * Otherwise we can simply pick the next object from the lockless free list.
+ */
+static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru,
+ gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
+{
+ void *object;
+ struct obj_cgroup *objcg = NULL;
+ bool init = false;
+
+ s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags);
+ if (!s)
+ return NULL;
+
+ object = kfence_alloc(s, orig_size, gfpflags);
+ if (unlikely(object))
+ goto out;
+
+ object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
- slab_post_alloc_hook(s, objcg, gfpflags, 1, &object);
+ maybe_wipe_obj_freeptr(s, object);
+ init = slab_want_init_on_alloc(gfpflags, s);
+
+out:
+ /*
+ * When init equals 'true', as for the kzalloc() family, only
+ * @orig_size bytes might be zeroed instead of s->object_size
+ */
+ slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init, orig_size);
return object;
}
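/*
 * Illustrative sketch, not from this patch: a standalone C11 analogue of
 * the lockless fastpath above. A 32-bit object index and a 32-bit
 * transaction id share one atomic word, so the compare-exchange fails if
 * another thread popped and pushed in between (the ABA case), which is
 * exactly what the tid check guards against. All names are made up, and
 * unlike the kernel this toy ignores concurrent reuse of the object that
 * holds the next pointer.
 */
#include <stdatomic.h>
#include <stdint.h>

#define NIL 0xffffffffu

static uint32_t next_idx[64];			/* per-object "freepointer" */
static _Atomic uint64_t list_head = NIL;	/* high 32 bits: tid, low 32 bits: first free index */

static uint32_t pop_object(void)
{
	uint64_t old = atomic_load(&list_head);

	for (;;) {
		uint32_t idx = (uint32_t)old;
		uint64_t new;

		if (idx == NIL)
			return NIL;	/* empty: a real allocator falls back to the slow path */

		/* bump the tid and advance the head in one atomic step */
		new = ((old >> 32) + 1) << 32 | next_idx[idx];
		if (atomic_compare_exchange_weak(&list_head, &old, new))
			return idx;	/* on failure 'old' is reloaded and we retry */
	}
}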
-static __always_inline void *slab_alloc(struct kmem_cache *s,
- gfp_t gfpflags, unsigned long addr)
+static __fastpath_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru,
+ gfp_t gfpflags, unsigned long addr, size_t orig_size)
{
- return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
+ return slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, addr, orig_size);
}
-void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
+static __fastpath_inline
+void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
+ gfp_t gfpflags)
{
- void *ret = slab_alloc(s, gfpflags, _RET_IP_);
+ void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size);
- trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
- s->size, gfpflags);
+ trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE);
return ret;
}
+
+void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
+{
+ return __kmem_cache_alloc_lru(s, NULL, gfpflags);
+}
EXPORT_SYMBOL(kmem_cache_alloc);
-#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
+void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
+ gfp_t gfpflags)
{
- void *ret = slab_alloc(s, gfpflags, _RET_IP_);
- trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
- ret = kasan_kmalloc(s, ret, size, gfpflags);
- return ret;
+ return __kmem_cache_alloc_lru(s, lru, gfpflags);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_lru);
+
+void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags,
+ int node, size_t orig_size,
+ unsigned long caller)
+{
+ return slab_alloc_node(s, NULL, gfpflags, node,
+ caller, orig_size);
}
-EXPORT_SYMBOL(kmem_cache_alloc_trace);
-#endif
-#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
- void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
+ void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size);
- trace_kmem_cache_alloc_node(_RET_IP_, ret,
- s->object_size, s->size, gfpflags, node);
+ trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
- gfp_t gfpflags,
- int node, size_t size)
+static noinline void free_to_partial_list(
+ struct kmem_cache *s, struct slab *slab,
+ void *head, void *tail, int bulk_cnt,
+ unsigned long addr)
{
- void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
+ struct kmem_cache_node *n = get_node(s, slab_nid(slab));
+ struct slab *slab_free = NULL;
+ int cnt = bulk_cnt;
+ unsigned long flags;
+ depot_stack_handle_t handle = 0;
- trace_kmalloc_node(_RET_IP_, ret,
- size, s->size, gfpflags, node);
+ if (s->flags & SLAB_STORE_USER)
+ handle = set_track_prepare();
- ret = kasan_kmalloc(s, ret, size, gfpflags);
- return ret;
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ if (free_debug_processing(s, slab, head, tail, &cnt, addr, handle)) {
+ void *prior = slab->freelist;
+
+ /* Perform the actual freeing while we still hold the locks */
+ slab->inuse -= cnt;
+ set_freepointer(s, tail, prior);
+ slab->freelist = head;
+
+ /*
+ * If the slab is empty and the node's partial list is full,
+ * it should be discarded anyway, no matter whether it's on the
+ * full or partial list.
+ */
+ if (slab->inuse == 0 && n->nr_partial >= s->min_partial)
+ slab_free = slab;
+
+ if (!prior) {
+ /* was on full list */
+ remove_full(s, n, slab);
+ if (!slab_free) {
+ add_partial(n, slab, DEACTIVATE_TO_TAIL);
+ stat(s, FREE_ADD_PARTIAL);
+ }
+ } else if (slab_free) {
+ remove_partial(n, slab);
+ stat(s, FREE_REMOVE_PARTIAL);
+ }
+ }
+
+ if (slab_free) {
+ /*
+ * Update the counters while still holding n->list_lock to
+ * prevent spurious validation warnings
+ */
+ dec_slabs_node(s, slab_nid(slab_free), slab_free->objects);
+ }
+
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ if (slab_free) {
+ stat(s, FREE_SLAB);
+ free_slab(s, slab_free);
+ }
}
-EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
-#endif
-#endif /* CONFIG_NUMA */
/*
* Slow path handling. This may still be called frequently since objects
* have a longer lifetime than the cpu slabs in most processing loads.
*
* So we still attempt to reduce cache line usage. Just take the slab
- * lock and free the item. If there is no additional partial page
+ * lock and free the item. If there is no additional partial slab
* handling required then we can return immediately.
*/
-static void __slab_free(struct kmem_cache *s, struct page *page,
+static void __slab_free(struct kmem_cache *s, struct slab *slab,
void *head, void *tail, int cnt,
unsigned long addr)
{
void *prior;
int was_frozen;
- struct page new;
+ struct slab new;
unsigned long counters;
struct kmem_cache_node *n = NULL;
unsigned long flags;
stat(s, FREE_SLOWPATH);
- if (kmem_cache_debug(s) &&
- !free_debug_processing(s, page, head, tail, cnt, addr))
+ if (kfence_free(head))
return;
+ if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
+ free_to_partial_list(s, slab, head, tail, cnt, addr);
+ return;
+ }
+
do {
if (unlikely(n)) {
spin_unlock_irqrestore(&n->list_lock, flags);
n = NULL;
}
- prior = page->freelist;
- counters = page->counters;
+ prior = slab->freelist;
+ counters = slab->counters;
set_freepointer(s, tail, prior);
new.counters = counters;
was_frozen = new.frozen;
@@ -3001,7 +3636,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
} else { /* Needs to be taken off a list */
- n = get_node(s, page_to_nid(page));
+ n = get_node(s, slab_nid(slab));
/*
* Speculatively acquire the list_lock.
* If the cmpxchg does not succeed then we may
@@ -3015,7 +3650,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
}
}
- } while (!cmpxchg_double_slab(s, page,
+ } while (!slab_update_freelist(s, slab,
prior, counters,
head, new.counters,
"__slab_free"));
@@ -3030,10 +3665,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
stat(s, FREE_FROZEN);
} else if (new.frozen) {
/*
- * If we just froze the page then put it onto the
+ * If we just froze the slab then put it onto the
* per cpu partial list.
*/
- put_cpu_partial(s, page, 1);
+ put_cpu_partial(s, slab, 1);
stat(s, CPU_PARTIAL_FREE);
}
@@ -3048,8 +3683,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
* then add it.
*/
if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
- remove_full(s, n, page);
- add_partial(n, page, DEACTIVATE_TO_TAIL);
+ remove_full(s, n, slab);
+ add_partial(n, slab, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
spin_unlock_irqrestore(&n->list_lock, flags);
@@ -3060,18 +3695,19 @@ slab_empty:
/*
* Slab on the partial list.
*/
- remove_partial(n, page);
+ remove_partial(n, slab);
stat(s, FREE_REMOVE_PARTIAL);
} else {
/* Slab must be on the full list */
- remove_full(s, n, page);
+ remove_full(s, n, slab);
}
spin_unlock_irqrestore(&n->list_lock, flags);
stat(s, FREE_SLAB);
- discard_slab(s, page);
+ discard_slab(s, slab);
}
+#ifndef CONFIG_SLUB_TINY
/*
* Fastpath with forced inlining to produce a kfree and kmem_cache_free that
* can perform fastpath freeing without additional function calls.
@@ -3084,18 +3720,18 @@ slab_empty:
* with all sorts of special processing.
*
* Bulk free of a freelist with several objects (all pointing to the
- * same page) possible by specifying head and tail ptr, plus objects
+ * same slab) possible by specifying head and tail ptr, plus objects
* count (cnt). Bulk free indicated by tail pointer being set.
*/
static __always_inline void do_slab_free(struct kmem_cache *s,
- struct page *page, void *head, void *tail,
+ struct slab *slab, void *head, void *tail,
int cnt, unsigned long addr)
{
void *tail_obj = tail ? : head;
struct kmem_cache_cpu *c;
unsigned long tid;
+ void **freelist;
- memcg_slab_free_hook(s, &head, 1);
redo:
/*
* Determine the currently cpus per cpu slab.
@@ -3103,65 +3739,93 @@ redo:
* data is retrieved via this pointer. If we are on the same cpu
* during the cmpxchg then the free will succeed.
*/
- do {
- tid = this_cpu_read(s->cpu_slab->tid);
- c = raw_cpu_ptr(s->cpu_slab);
- } while (IS_ENABLED(CONFIG_PREEMPTION) &&
- unlikely(tid != READ_ONCE(c->tid)));
+ c = raw_cpu_ptr(s->cpu_slab);
+ tid = READ_ONCE(c->tid);
/* Same with comment on barrier() in slab_alloc_node() */
barrier();
- if (likely(page == c->page)) {
- void **freelist = READ_ONCE(c->freelist);
+ if (unlikely(slab != c->slab)) {
+ __slab_free(s, slab, head, tail_obj, cnt, addr);
+ return;
+ }
- set_freepointer(s, tail_obj, freelist);
+ if (USE_LOCKLESS_FAST_PATH()) {
+ freelist = READ_ONCE(c->freelist);
- if (unlikely(!this_cpu_cmpxchg_double(
- s->cpu_slab->freelist, s->cpu_slab->tid,
- freelist, tid,
- head, next_tid(tid)))) {
+ set_freepointer(s, tail_obj, freelist);
+ if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
note_cmpxchg_failure("slab_free", s, tid);
goto redo;
}
- stat(s, FREE_FASTPATH);
- } else
- __slab_free(s, page, head, tail_obj, cnt, addr);
+ } else {
+ /* Update the free list under the local lock */
+ local_lock(&s->cpu_slab->lock);
+ c = this_cpu_ptr(s->cpu_slab);
+ if (unlikely(slab != c->slab)) {
+ local_unlock(&s->cpu_slab->lock);
+ goto redo;
+ }
+ tid = c->tid;
+ freelist = c->freelist;
+
+ set_freepointer(s, tail_obj, freelist);
+ c->freelist = head;
+ c->tid = next_tid(tid);
+
+ local_unlock(&s->cpu_slab->lock);
+ }
+ stat(s, FREE_FASTPATH);
+}
+#else /* CONFIG_SLUB_TINY */
+static void do_slab_free(struct kmem_cache *s,
+ struct slab *slab, void *head, void *tail,
+ int cnt, unsigned long addr)
+{
+ void *tail_obj = tail ? : head;
+ __slab_free(s, slab, head, tail_obj, cnt, addr);
}
+#endif /* CONFIG_SLUB_TINY */
-static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
- void *head, void *tail, int cnt,
+static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab,
+ void *head, void *tail, void **p, int cnt,
unsigned long addr)
{
+ memcg_slab_free_hook(s, slab, p, cnt);
/*
* With KASAN enabled slab_free_freelist_hook modifies the freelist
* to remove objects, whose reuse must be delayed.
*/
- if (slab_free_freelist_hook(s, &head, &tail))
- do_slab_free(s, page, head, tail, cnt, addr);
+ if (slab_free_freelist_hook(s, &head, &tail, &cnt))
+ do_slab_free(s, slab, head, tail, cnt, addr);
}
#ifdef CONFIG_KASAN_GENERIC
void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
{
- do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr);
+ do_slab_free(cache, virt_to_slab(x), x, NULL, 1, addr);
}
#endif
+void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller)
+{
+ slab_free(s, virt_to_slab(x), x, NULL, &x, 1, caller);
+}
+
void kmem_cache_free(struct kmem_cache *s, void *x)
{
s = cache_from_obj(s, x);
if (!s)
return;
- slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_);
- trace_kmem_cache_free(_RET_IP_, x);
+ trace_kmem_cache_free(_RET_IP_, x, s);
+ slab_free(s, virt_to_slab(x), x, NULL, &x, 1, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_free);
struct detached_freelist {
- struct page *page;
+ struct slab *slab;
void *tail;
void *freelist;
int cnt;
@@ -3171,8 +3835,8 @@ struct detached_freelist {
/*
* This function progressively scans the array with free objects (with
* a limited look ahead) and extract objects belonging to the same
- * page. It builds a detached freelist directly within the given
- * page/objects. This can happen without any need for
+ * slab. It builds a detached freelist directly within the given
+ * slab/objects. This can happen without any need for
 * synchronization, because the objects are owned by the running process.
 * The freelist is built up as a single linked list in the objects.
 * The idea is that this detached freelist can then be bulk
@@ -3184,115 +3848,105 @@ static inline
int build_detached_freelist(struct kmem_cache *s, size_t size,
void **p, struct detached_freelist *df)
{
- size_t first_skipped_index = 0;
int lookahead = 3;
void *object;
- struct page *page;
+ struct folio *folio;
+ size_t same;
- /* Always re-init detached_freelist */
- df->page = NULL;
-
- do {
- object = p[--size];
- /* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */
- } while (!object && size);
-
- if (!object)
- return 0;
-
- page = virt_to_head_page(object);
+ object = p[--size];
+ folio = virt_to_folio(object);
if (!s) {
/* Handle kalloc'ed objects */
- if (unlikely(!PageSlab(page))) {
- BUG_ON(!PageCompound(page));
- kfree_hook(object);
- __free_pages(page, compound_order(page));
- p[size] = NULL; /* mark object processed */
+ if (unlikely(!folio_test_slab(folio))) {
+ free_large_kmalloc(folio, object);
+ df->slab = NULL;
return size;
}
/* Derive kmem_cache from object */
- df->s = page->slab_cache;
+ df->slab = folio_slab(folio);
+ df->s = df->slab->slab_cache;
} else {
+ df->slab = folio_slab(folio);
df->s = cache_from_obj(s, object); /* Support for memcg */
}
/* Start new detached freelist */
- df->page = page;
- set_freepointer(df->s, object, NULL);
df->tail = object;
df->freelist = object;
- p[size] = NULL; /* mark object processed */
df->cnt = 1;
+ if (is_kfence_address(object))
+ return size;
+
+ set_freepointer(df->s, object, NULL);
+
+ same = size;
while (size) {
object = p[--size];
- if (!object)
- continue; /* Skip processed objects */
-
- /* df->page is always set at this point */
- if (df->page == virt_to_head_page(object)) {
+ /* df->slab is always set at this point */
+ if (df->slab == virt_to_slab(object)) {
/* Opportunity build freelist */
set_freepointer(df->s, object, df->freelist);
df->freelist = object;
df->cnt++;
- p[size] = NULL; /* mark object processed */
-
+ same--;
+ if (size != same)
+ swap(p[size], p[same]);
continue;
}
/* Limit look ahead search */
if (!--lookahead)
break;
-
- if (!first_skipped_index)
- first_skipped_index = size + 1;
}
- return first_skipped_index;
+ return same;
}
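/*
 * Illustrative walk-through, not from this patch, of the rewritten
 * build_detached_freelist() with p[] = { A1, B1, A2 }, where A1 and A2 sit
 * on slab A and B1 on slab B:
 *
 *   1. p[2] = A2 starts the detached freelist for slab A (cnt = 1) and
 *      'same' starts at 2.
 *   2. p[1] = B1 is on a different slab, so one unit of lookahead is spent.
 *   3. p[0] = A1 matches slab A: it is linked in (cnt = 2), 'same' drops to
 *      1 and A1 is swapped with p[1], so the not-yet-freed B1 ends up in the
 *      leading part of the array.
 *
 * The function returns 1: kmem_cache_free_bulk() frees slab A's two objects
 * with one slab_free() call and then loops again over the remaining prefix
 * { B1 }.
 */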
/* Note that interrupts must be enabled when calling this function. */
void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
{
- if (WARN_ON(!size))
+ if (!size)
return;
- memcg_slab_free_hook(s, p, size);
do {
struct detached_freelist df;
size = build_detached_freelist(s, size, p, &df);
- if (!df.page)
+ if (!df.slab)
continue;
- slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_);
+ slab_free(df.s, df.slab, df.freelist, df.tail, &p[size], df.cnt,
+ _RET_IP_);
} while (likely(size));
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
-/* Note that interrupts must be enabled when calling this function. */
-int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
- void **p)
+#ifndef CONFIG_SLUB_TINY
+static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p, struct obj_cgroup *objcg)
{
struct kmem_cache_cpu *c;
+ unsigned long irqflags;
int i;
- struct obj_cgroup *objcg = NULL;
- /* memcg and kmem_cache debug support */
- s = slab_pre_alloc_hook(s, &objcg, size, flags);
- if (unlikely(!s))
- return false;
/*
* Drain objects in the per cpu slab, while disabling local
* IRQs, which protects against PREEMPT and interrupts
* handlers invoking normal fastpath.
*/
- local_irq_disable();
- c = this_cpu_ptr(s->cpu_slab);
+ c = slub_get_cpu_ptr(s->cpu_slab);
+ local_lock_irqsave(&s->cpu_slab->lock, irqflags);
for (i = 0; i < size; i++) {
- void *object = c->freelist;
+ void *object = kfence_alloc(s, s->object_size, flags);
+ if (unlikely(object)) {
+ p[i] = object;
+ continue;
+ }
+
+ object = c->freelist;
if (unlikely(!object)) {
/*
* We may have removed an object from c->freelist using
@@ -3303,18 +3957,22 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
*/
c->tid = next_tid(c->tid);
+ local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
+
/*
* Invoking slow path likely have side-effect
* of re-populating per CPU c->freelist
*/
p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
- _RET_IP_, c);
+ _RET_IP_, c, s->object_size);
if (unlikely(!p[i]))
goto error;
c = this_cpu_ptr(s->cpu_slab);
maybe_wipe_obj_freeptr(s, p[i]);
+ local_lock_irqsave(&s->cpu_slab->lock, irqflags);
+
continue; /* goto for-loop */
}
c->freelist = get_freepointer(s, object);
@@ -3322,25 +3980,75 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
maybe_wipe_obj_freeptr(s, p[i]);
}
c->tid = next_tid(c->tid);
- local_irq_enable();
+ local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
+ slub_put_cpu_ptr(s->cpu_slab);
+
+ return i;
+
+error:
+ slub_put_cpu_ptr(s->cpu_slab);
+ slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size);
+ kmem_cache_free_bulk(s, i, p);
+ return 0;
+
+}
+#else /* CONFIG_SLUB_TINY */
+static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p, struct obj_cgroup *objcg)
+{
+ int i;
+
+ for (i = 0; i < size; i++) {
+ void *object = kfence_alloc(s, s->object_size, flags);
+
+ if (unlikely(object)) {
+ p[i] = object;
+ continue;
+ }
- /* Clear memory outside IRQ disabled fastpath loop */
- if (unlikely(slab_want_init_on_alloc(flags, s))) {
- int j;
+ p[i] = __slab_alloc_node(s, flags, NUMA_NO_NODE,
+ _RET_IP_, s->object_size);
+ if (unlikely(!p[i]))
+ goto error;
- for (j = 0; j < i; j++)
- memset(p[j], 0, s->object_size);
+ maybe_wipe_obj_freeptr(s, p[i]);
}
- /* memcg and kmem_cache debug support */
- slab_post_alloc_hook(s, objcg, flags, size, p);
return i;
+
error:
- local_irq_enable();
- slab_post_alloc_hook(s, objcg, flags, i, p);
- __kmem_cache_free_bulk(s, i, p);
+ slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size);
+ kmem_cache_free_bulk(s, i, p);
return 0;
}
+#endif /* CONFIG_SLUB_TINY */
+
+/* Note that interrupts must be enabled when calling this function. */
+int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+ void **p)
+{
+ int i;
+ struct obj_cgroup *objcg = NULL;
+
+ if (!size)
+ return 0;
+
+ /* memcg and kmem_cache debug support */
+ s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags);
+ if (unlikely(!s))
+ return 0;
+
+ i = __kmem_cache_alloc_bulk(s, flags, size, p, objcg);
+
+ /*
+ * memcg and kmem_cache debug support and memory initialization.
+ * Done outside of the IRQ disabled fastpath loop.
+ */
+ if (i != 0)
+ slab_post_alloc_hook(s, objcg, flags, size, p,
+ slab_want_init_on_alloc(flags, s), s->object_size);
+ return i;
+}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
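/*
 * Illustrative usage sketch, not from this patch: a caller exercising the
 * bulk interfaces above. The cache name and object size are arbitrary.
 */
static void bulk_demo(void)
{
	struct kmem_cache *s;
	void *objs[16];
	int got;

	s = kmem_cache_create("bulk-demo", 128, 0, 0, NULL);
	if (!s)
		return;

	/* Returns how many objects were actually allocated, 0 on failure. */
	got = kmem_cache_alloc_bulk(s, GFP_KERNEL, ARRAY_SIZE(objs), objs);
	if (got)
		kmem_cache_free_bulk(s, got, objs);

	kmem_cache_destroy(s);
}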
@@ -3358,13 +4066,14 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk);
*/
/*
- * Mininum / Maximum order of slab pages. This influences locking overhead
+ * Minimum / Maximum order of slab pages. This influences locking overhead
* and slab fragmentation. A higher order reduces the number of partial slabs
* and increases the number of allocations possible without having to
* take the list_lock.
*/
static unsigned int slub_min_order;
-static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
+static unsigned int slub_max_order =
+ IS_ENABLED(CONFIG_SLUB_TINY) ? 1 : PAGE_ALLOC_COSTLY_ORDER;
static unsigned int slub_min_objects;
/*
@@ -3389,10 +4098,10 @@ static unsigned int slub_min_objects;
*
* Higher order allocations also allow the placement of more objects in a
* slab and thereby reduce object handling overhead. If the user has
- * requested a higher mininum order then we start with that one instead of
+ * requested a higher minimum order then we start with that one instead of
* the smallest order which will fit the object.
*/
-static inline unsigned int slab_order(unsigned int size,
+static inline unsigned int calc_slab_order(unsigned int size,
unsigned int min_objects, unsigned int max_order,
unsigned int fract_leftover)
{
@@ -3422,6 +4131,7 @@ static inline int calculate_order(unsigned int size)
unsigned int order;
unsigned int min_objects;
unsigned int max_objects;
+ unsigned int nr_cpus;
/*
* Attempt to find best configuration for a slab. This
@@ -3432,8 +4142,21 @@ static inline int calculate_order(unsigned int size)
* we reduce the minimum objects required in a slab.
*/
min_objects = slub_min_objects;
- if (!min_objects)
- min_objects = 4 * (fls(nr_cpu_ids) + 1);
+ if (!min_objects) {
+ /*
+ * Some architectures will only update present cpus when
+ * onlining them, so don't trust the number if it's just 1. But
+ * we also don't want to use nr_cpu_ids always, as on some other
+ * architectures, there can be many possible cpus, but never
+ * onlined. Here we compromise between trying to avoid too high
+ * order on systems that appear larger than they are, and too
+ * low order on systems that appear smaller than they are.
+ */
+ nr_cpus = num_present_cpus();
+ if (nr_cpus <= 1)
+ nr_cpus = nr_cpu_ids;
+ min_objects = 4 * (fls(nr_cpus) + 1);
+ }
max_objects = order_objects(slub_max_order, size);
min_objects = min(min_objects, max_objects);
@@ -3442,7 +4165,7 @@ static inline int calculate_order(unsigned int size)
fraction = 16;
while (fraction >= 4) {
- order = slab_order(size, min_objects,
+ order = calc_slab_order(size, min_objects,
slub_max_order, fraction);
if (order <= slub_max_order)
return order;
@@ -3455,15 +4178,15 @@ static inline int calculate_order(unsigned int size)
* We were unable to place multiple objects in a slab. Now
* lets see if we can place a single object there.
*/
- order = slab_order(size, 1, slub_max_order, 1);
+ order = calc_slab_order(size, 1, slub_max_order, 1);
if (order <= slub_max_order)
return order;
/*
* Doh this slab cannot be placed using slub_max_order.
*/
- order = slab_order(size, 1, MAX_ORDER, 1);
- if (order < MAX_ORDER)
+ order = calc_slab_order(size, 1, MAX_ORDER, 1);
+ if (order <= MAX_ORDER)
return order;
return -ENOSYS;
}
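/*
 * Illustrative sketch, not from this patch: the waste arithmetic that
 * calc_slab_order() trades off, as a standalone user-space approximation
 * assuming 4 KiB pages. For a 700-byte object it prints roughly 15%, 6%,
 * 1.7% and 1.7% waste for orders 0 to 3, which is why a moderately higher
 * order can be worth the extra page-allocator pressure.
 */
#include <stdio.h>

int main(void)
{
	const unsigned int size = 700;

	for (unsigned int order = 0; order <= 3; order++) {
		unsigned int slab_bytes = 4096u << order;
		unsigned int objects = slab_bytes / size;
		unsigned int wasted = slab_bytes % size;

		printf("order %u: %2u objects, %3u bytes (%.1f%%) wasted\n",
		       order, objects, wasted, 100.0 * wasted / slab_bytes);
	}
	return 0;
}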
@@ -3481,10 +4204,12 @@ init_kmem_cache_node(struct kmem_cache_node *n)
#endif
}
+#ifndef CONFIG_SLUB_TINY
static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
{
BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
- KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu));
+ NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH *
+ sizeof(struct kmem_cache_cpu));
/*
* Must align to double word boundary for the double cmpxchg
@@ -3500,6 +4225,12 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
return 1;
}
+#else
+static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
+{
+ return 1;
+}
+#endif /* CONFIG_SLUB_TINY */
static struct kmem_cache *kmem_cache_node;
@@ -3514,39 +4245,38 @@ static struct kmem_cache *kmem_cache_node;
*/
static void early_kmem_cache_node_alloc(int node)
{
- struct page *page;
+ struct slab *slab;
struct kmem_cache_node *n;
BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
- page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
+ slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);
- BUG_ON(!page);
- if (page_to_nid(page) != node) {
+ BUG_ON(!slab);
+ inc_slabs_node(kmem_cache_node, slab_nid(slab), slab->objects);
+ if (slab_nid(slab) != node) {
pr_err("SLUB: Unable to allocate memory from node %d\n", node);
pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
}
- n = page->freelist;
+ n = slab->freelist;
BUG_ON(!n);
#ifdef CONFIG_SLUB_DEBUG
init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
init_tracking(kmem_cache_node, n);
#endif
- n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
- GFP_KERNEL);
- page->freelist = get_freepointer(kmem_cache_node, n);
- page->inuse = 1;
- page->frozen = 0;
+ n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
+ slab->freelist = get_freepointer(kmem_cache_node, n);
+ slab->inuse = 1;
kmem_cache_node->node[node] = n;
init_kmem_cache_node(n);
- inc_slabs_node(kmem_cache_node, node, page->objects);
+ inc_slabs_node(kmem_cache_node, node, slab->objects);
/*
* No locks need to be taken here as it has just been
* initialized and there is no concurrent access.
*/
- __add_partial(n, page, DEACTIVATE_TO_HEAD);
+ __add_partial(n, slab, DEACTIVATE_TO_HEAD);
}
static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -3563,7 +4293,9 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
void __kmem_cache_release(struct kmem_cache *s)
{
cache_random_seq_destroy(s);
+#ifndef CONFIG_SLUB_TINY
free_percpu(s->cpu_slab);
+#endif
free_kmem_cache_nodes(s);
}
@@ -3571,7 +4303,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
{
int node;
- for_each_node_state(node, N_NORMAL_MEMORY) {
+ for_each_node_mask(node, slab_nodes) {
struct kmem_cache_node *n;
if (slab_state == DOWN) {
@@ -3592,18 +4324,11 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
return 1;
}
-static void set_min_partial(struct kmem_cache *s, unsigned long min)
-{
- if (min < MIN_PARTIAL)
- min = MIN_PARTIAL;
- else if (min > MAX_PARTIAL)
- min = MAX_PARTIAL;
- s->min_partial = min;
-}
-
static void set_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
+ unsigned int nr_objects;
+
/*
* cpu_partial determined the maximum number of objects kept in the
* per cpu partial lists of a processor.
@@ -3613,24 +4338,22 @@ static void set_cpu_partial(struct kmem_cache *s)
* filled up again with minimal effort. The slab will never hit the
* per node partial lists and therefore no locking will be required.
*
- * This setting also determines
- *
- * A) The number of objects from per cpu partial slabs dumped to the
- * per node list when we reach the limit.
- * B) The number of objects in cpu partial slabs to extract from the
- * per node list when we run out of per cpu objects. We only fetch
- * 50% to keep some capacity around for frees.
+ * For backwards compatibility reasons, this is determined as a number
+ * of objects, even though we now limit the maximum number of pages; see
+ * slub_set_cpu_partial().
*/
if (!kmem_cache_has_cpu_partial(s))
- slub_set_cpu_partial(s, 0);
+ nr_objects = 0;
else if (s->size >= PAGE_SIZE)
- slub_set_cpu_partial(s, 2);
+ nr_objects = 6;
else if (s->size >= 1024)
- slub_set_cpu_partial(s, 6);
+ nr_objects = 24;
else if (s->size >= 256)
- slub_set_cpu_partial(s, 13);
+ nr_objects = 52;
else
- slub_set_cpu_partial(s, 30);
+ nr_objects = 120;
+
+ slub_set_cpu_partial(s, nr_objects);
#endif
}
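
A minimal user-space sketch of the objects-to-slabs conversion that the comment above defers to slub_set_cpu_partial() for, assuming partial slabs are roughly half full; the DIV_ROUND_UP helper and the concrete numbers below are illustrative assumptions, not code from this patch:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int nr_objects = 52;	/* e.g. the 256-byte size bucket above */
	unsigned int objs_per_slab = 32;	/* assumed oo_objects(s->oo) value */
	/* assume each per-cpu partial slab is about half full */
	unsigned int nr_slabs = DIV_ROUND_UP(nr_objects * 2, objs_per_slab);

	printf("cpu_partial=%u objects -> about %u partial slabs per CPU\n",
	       nr_objects, nr_slabs);
	return 0;
}

With these assumed numbers the per-CPU limit works out to 4 partial slabs.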
@@ -3638,11 +4361,10 @@ static void set_cpu_partial(struct kmem_cache *s)
* calculate_sizes() determines the order and the distribution of data within
* a slab object.
*/
-static int calculate_sizes(struct kmem_cache *s, int forced_order)
+static int calculate_sizes(struct kmem_cache *s)
{
slab_flags_t flags = s->flags;
unsigned int size = s->object_size;
- unsigned int freepointer_area;
unsigned int order;
/*
@@ -3651,13 +4373,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
* the possible location of the free pointer.
*/
size = ALIGN(size, sizeof(void *));
- /*
- * This is the area of the object where a freepointer can be
- * safely written. If redzoning adds more to the inuse size, we
- * can't use that portion for writing the freepointer, so
- * s->offset must be limited within this for the general case.
- */
- freepointer_area = size;
#ifdef CONFIG_SLUB_DEBUG
/*
@@ -3683,19 +4398,22 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
/*
* With that we have determined the number of bytes in actual use
- * by the object. This is the potential offset to the free pointer.
+ * by the object and redzoning.
*/
s->inuse = size;
- if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
- s->ctor)) {
+ if (slub_debug_orig_size(s) ||
+ (flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
+ ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||
+ s->ctor) {
/*
* Relocate free pointer after the object if it is not
* permitted to overwrite the first word of the object on
* kmem_cache_free.
*
* This is the case if we do RCU, have a constructor or
- * destructor or are poisoning the objects.
+ * destructor, are poisoning the objects, or are
+ * redzoning an object smaller than sizeof(void *).
*
* The assumption that s->offset >= s->inuse means free
* pointer is outside of the object is used in the
@@ -3704,22 +4422,27 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
*/
s->offset = size;
size += sizeof(void *);
- } else if (freepointer_area > sizeof(void *)) {
+ } else {
/*
* Store freelist pointer near middle of object to keep
* it away from the edges of the object to avoid small
* sized over/underflows from neighboring allocations.
*/
- s->offset = ALIGN(freepointer_area / 2, sizeof(void *));
+ s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
}
#ifdef CONFIG_SLUB_DEBUG
- if (flags & SLAB_STORE_USER)
+ if (flags & SLAB_STORE_USER) {
/*
* Need to store information about allocs and frees after
* the object.
*/
size += 2 * sizeof(struct track);
+
+ /* Save the original kmalloc request size */
+ if (flags & SLAB_KMALLOC)
+ size += sizeof(unsigned int);
+ }
#endif
kasan_cache_create(s, &size, &s->flags);
@@ -3748,10 +4471,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
size = ALIGN(size, s->align);
s->size = size;
s->reciprocal_size = reciprocal_value(size);
- if (forced_order >= 0)
- order = forced_order;
- else
- order = calculate_order(size);
+ order = calculate_order(size);
if ((int)order < 0)
return 0;
@@ -3774,20 +4494,18 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
*/
s->oo = oo_make(order, size);
s->min = oo_make(get_order(size), size);
- if (oo_objects(s->oo) > oo_objects(s->max))
- s->max = s->oo;
return !!oo_objects(s->oo);
}
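
To make the mid-object free-pointer placement above concrete, here is a small stand-alone sketch of the ALIGN_DOWN(s->object_size / 2, sizeof(void *)) calculation; the ALIGN_DOWN definition and the example sizes are assumptions for illustration and presume 8-byte pointers:

#include <stdio.h>

/* round down to a multiple of 'a'; matches the kernel macro for power-of-two 'a' */
#define ALIGN_DOWN(x, a)	(((x) / (a)) * (a))

int main(void)
{
	unsigned int sizes[] = { 24, 96, 200 };	/* example object sizes */
	unsigned int i;

	for (i = 0; i < 3; i++)
		printf("object_size=%3u -> free pointer stored at offset %zu\n",
		       sizes[i], ALIGN_DOWN((size_t)sizes[i] / 2, sizeof(void *)));
	return 0;
}

On a 64-bit build this prints offsets 8, 48 and 96, i.e. the pointer ends up near the middle of the object, away from both edges.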
static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
{
- s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
+ s->flags = kmem_cache_flags(s->size, flags, s->name);
#ifdef CONFIG_SLAB_FREELIST_HARDENED
s->random = get_random_long();
#endif
- if (!calculate_sizes(s, -1))
+ if (!calculate_sizes(s))
goto error;
if (disable_higher_order_debug) {
/*
@@ -3797,23 +4515,24 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
if (get_order(s->size) > get_order(s->object_size)) {
s->flags &= ~DEBUG_METADATA_FLAGS;
s->offset = 0;
- if (!calculate_sizes(s, -1))
+ if (!calculate_sizes(s))
goto error;
}
}
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
- defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
- if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
+#ifdef system_has_freelist_aba
+ if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) {
/* Enable fast mode */
s->flags |= __CMPXCHG_DOUBLE;
+ }
#endif
/*
- * The larger the object size is, the more pages we want on the partial
+ * The larger the object size is, the more slabs we want on the partial
* list to avoid pounding the page allocator excessively.
*/
- set_min_partial(s, ilog2(s->size) / 2);
+ s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
+ s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);
set_cpu_partial(s);
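
As a worked example of the clamp above, assuming the usual MIN_PARTIAL = 5 and MAX_PARTIAL = 10 definitions earlier in slub.c: a cache with s->size = 4096 gives ilog2(4096) / 2 = 6, which is kept as-is, while a 32-byte cache gives ilog2(32) / 2 = 2, which the max_t() raises to 5; only an implausibly large object size would hit the MAX_PARTIAL cap.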
@@ -3833,32 +4552,31 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
if (alloc_kmem_cache_cpus(s))
return 0;
- free_kmem_cache_nodes(s);
error:
+ __kmem_cache_release(s);
return -EINVAL;
}
-static void list_slab_objects(struct kmem_cache *s, struct page *page,
+static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
const char *text)
{
#ifdef CONFIG_SLUB_DEBUG
- void *addr = page_address(page);
- unsigned long *map;
+ void *addr = slab_address(slab);
void *p;
- slab_err(s, page, text, s->name);
- slab_lock(page);
+ slab_err(s, slab, text, s->name);
- map = get_map(s, page);
- for_each_object(p, s, addr, page->objects) {
+ spin_lock(&object_map_lock);
+ __fill_map(object_map, s, slab);
+
+ for_each_object(p, s, addr, slab->objects) {
- if (!test_bit(__obj_to_index(s, addr, p), map)) {
- pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
+ if (!test_bit(__obj_to_index(s, addr, p), object_map)) {
+ pr_err("Object 0x%p @offset=%tu\n", p, p - addr);
print_tracking(s, p);
}
}
- put_map(map);
- slab_unlock(page);
+ spin_unlock(&object_map_lock);
#endif
}
@@ -3870,23 +4588,23 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
LIST_HEAD(discard);
- struct page *page, *h;
+ struct slab *slab, *h;
BUG_ON(irqs_disabled());
spin_lock_irq(&n->list_lock);
- list_for_each_entry_safe(page, h, &n->partial, slab_list) {
- if (!page->inuse) {
- remove_partial(n, page);
- list_add(&page->slab_list, &discard);
+ list_for_each_entry_safe(slab, h, &n->partial, slab_list) {
+ if (!slab->inuse) {
+ remove_partial(n, slab);
+ list_add(&slab->slab_list, &discard);
} else {
- list_slab_objects(s, page,
+ list_slab_objects(s, slab,
"Objects remaining in %s on __kmem_cache_shutdown()");
}
}
spin_unlock_irq(&n->list_lock);
- list_for_each_entry_safe(page, h, &discard, slab_list)
- discard_slab(s, page);
+ list_for_each_entry_safe(slab, h, &discard, slab_list)
+ discard_slab(s, slab);
}
bool __kmem_cache_empty(struct kmem_cache *s)
@@ -3895,7 +4613,7 @@ bool __kmem_cache_empty(struct kmem_cache *s)
struct kmem_cache_node *n;
for_each_kmem_cache_node(s, node, n)
- if (n->nr_partial || slabs_node(s, node))
+ if (n->nr_partial || node_nr_slabs(n))
return false;
return true;
}
@@ -3908,16 +4626,75 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
int node;
struct kmem_cache_node *n;
- flush_all(s);
+ flush_all_cpus_locked(s);
/* Attempt to free all objects */
for_each_kmem_cache_node(s, node, n) {
free_partial(s, n);
- if (n->nr_partial || slabs_node(s, node))
+ if (n->nr_partial || node_nr_slabs(n))
return 1;
}
return 0;
}
+#ifdef CONFIG_PRINTK
+void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
+{
+ void *base;
+ int __maybe_unused i;
+ unsigned int objnr;
+ void *objp;
+ void *objp0;
+ struct kmem_cache *s = slab->slab_cache;
+ struct track __maybe_unused *trackp;
+
+ kpp->kp_ptr = object;
+ kpp->kp_slab = slab;
+ kpp->kp_slab_cache = s;
+ base = slab_address(slab);
+ objp0 = kasan_reset_tag(object);
+#ifdef CONFIG_SLUB_DEBUG
+ objp = restore_red_left(s, objp0);
+#else
+ objp = objp0;
+#endif
+ objnr = obj_to_index(s, slab, objp);
+ kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp);
+ objp = base + s->size * objnr;
+ kpp->kp_objp = objp;
+ if (WARN_ON_ONCE(objp < base || objp >= base + slab->objects * s->size
+ || (objp - base) % s->size) ||
+ !(s->flags & SLAB_STORE_USER))
+ return;
+#ifdef CONFIG_SLUB_DEBUG
+ objp = fixup_red_left(s, objp);
+ trackp = get_track(s, objp, TRACK_ALLOC);
+ kpp->kp_ret = (void *)trackp->addr;
+#ifdef CONFIG_STACKDEPOT
+ {
+ depot_stack_handle_t handle;
+ unsigned long *entries;
+ unsigned int nr_entries;
+
+ handle = READ_ONCE(trackp->handle);
+ if (handle) {
+ nr_entries = stack_depot_fetch(handle, &entries);
+ for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
+ kpp->kp_stack[i] = (void *)entries[i];
+ }
+
+ trackp = get_track(s, objp, TRACK_FREE);
+ handle = READ_ONCE(trackp->handle);
+ if (handle) {
+ nr_entries = stack_depot_fetch(handle, &entries);
+ for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
+ kpp->kp_free_stack[i] = (void *)entries[i];
+ }
+ }
+#endif
+#endif
+}
+#endif
+
/********************************************************************
* Kmalloc subsystem
*******************************************************************/
@@ -3934,7 +4711,7 @@ __setup("slub_min_order=", setup_slub_min_order);
static int __init setup_slub_max_order(char *str)
{
get_option(&str, (int *)&slub_max_order);
- slub_max_order = min(slub_max_order, (unsigned int)MAX_ORDER - 1);
+ slub_max_order = min_t(unsigned int, slub_max_order, MAX_ORDER);
return 1;
}
@@ -3950,78 +4727,6 @@ static int __init setup_slub_min_objects(char *str)
__setup("slub_min_objects=", setup_slub_min_objects);
-void *__kmalloc(size_t size, gfp_t flags)
-{
- struct kmem_cache *s;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return kmalloc_large(size, flags);
-
- s = kmalloc_slab(size, flags);
-
- if (unlikely(ZERO_OR_NULL_PTR(s)))
- return s;
-
- ret = slab_alloc(s, flags, _RET_IP_);
-
- trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
-
- ret = kasan_kmalloc(s, ret, size, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(__kmalloc);
-
-#ifdef CONFIG_NUMA
-static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
-{
- struct page *page;
- void *ptr = NULL;
- unsigned int order = get_order(size);
-
- flags |= __GFP_COMP;
- page = alloc_pages_node(node, flags, order);
- if (page) {
- ptr = page_address(page);
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
- PAGE_SIZE << order);
- }
-
- return kmalloc_large_node_hook(ptr, size, flags);
-}
-
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
-{
- struct kmem_cache *s;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
- ret = kmalloc_large_node(size, flags, node);
-
- trace_kmalloc_node(_RET_IP_, ret,
- size, PAGE_SIZE << get_order(size),
- flags, node);
-
- return ret;
- }
-
- s = kmalloc_slab(size, flags);
-
- if (unlikely(ZERO_OR_NULL_PTR(s)))
- return s;
-
- ret = slab_alloc_node(s, flags, node, _RET_IP_);
-
- trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
-
- ret = kasan_kmalloc(s, ret, size, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(__kmalloc_node);
-#endif /* CONFIG_NUMA */
-
#ifdef CONFIG_HARDENED_USERCOPY
/*
* Rejects incorrectly sized objects and objects that are to be copied
@@ -4031,28 +4736,31 @@ EXPORT_SYMBOL(__kmalloc_node);
* Returns NULL if check passes, otherwise const char * to name of cache
* to indicate an error.
*/
-void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
- bool to_user)
+void __check_heap_object(const void *ptr, unsigned long n,
+ const struct slab *slab, bool to_user)
{
struct kmem_cache *s;
unsigned int offset;
- size_t object_size;
+ bool is_kfence = is_kfence_address(ptr);
ptr = kasan_reset_tag(ptr);
/* Find object and usable object size. */
- s = page->slab_cache;
+ s = slab->slab_cache;
/* Reject impossible pointers. */
- if (ptr < page_address(page))
+ if (ptr < slab_address(slab))
usercopy_abort("SLUB object not in SLUB page?!", NULL,
to_user, 0, n);
/* Find offset within object. */
- offset = (ptr - page_address(page)) % s->size;
+ if (is_kfence)
+ offset = ptr - kfence_object_start(ptr);
+ else
+ offset = (ptr - slab_address(slab)) % s->size;
/* Adjust for redzone and reject if within the redzone. */
- if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
+ if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
if (offset < s->red_left_pad)
usercopy_abort("SLUB object in left red zone",
s->name, to_user, offset, n);
@@ -4065,66 +4773,10 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
n <= s->useroffset - offset + s->usersize)
return;
- /*
- * If the copy is still within the allocated object, produce
- * a warning instead of rejecting the copy. This is intended
- * to be a temporary method to find any missing usercopy
- * whitelists.
- */
- object_size = slab_ksize(s);
- if (usercopy_fallback &&
- offset <= object_size && n <= object_size - offset) {
- usercopy_warn("SLUB object", s->name, to_user, offset, n);
- return;
- }
-
usercopy_abort("SLUB object", s->name, to_user, offset, n);
}
#endif /* CONFIG_HARDENED_USERCOPY */
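
A quick worked example of the non-KFENCE offset math above, with hypothetical numbers: for a cache with s->size = 256, a pointer at slab base + 0x350 gives offset = 0x350 % 256 = 0x50, i.e. 80 bytes into its object; with SLAB_RED_ZONE set and a hypothetical 16-byte red_left_pad, only offsets below 16 would trigger the "left red zone" abort, so this copy proceeds to the useroffset/usersize whitelist check that follows.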
-size_t __ksize(const void *object)
-{
- struct page *page;
-
- if (unlikely(object == ZERO_SIZE_PTR))
- return 0;
-
- page = virt_to_head_page(object);
-
- if (unlikely(!PageSlab(page))) {
- WARN_ON(!PageCompound(page));
- return page_size(page);
- }
-
- return slab_ksize(page->slab_cache);
-}
-EXPORT_SYMBOL(__ksize);
-
-void kfree(const void *x)
-{
- struct page *page;
- void *object = (void *)x;
-
- trace_kfree(_RET_IP_, x);
-
- if (unlikely(ZERO_OR_NULL_PTR(x)))
- return;
-
- page = virt_to_head_page(x);
- if (unlikely(!PageSlab(page))) {
- unsigned int order = compound_order(page);
-
- BUG_ON(!PageCompound(page));
- kfree_hook(object);
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
- -(PAGE_SIZE << order));
- __free_pages(page, order);
- return;
- }
- slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
-}
-EXPORT_SYMBOL(kfree);
-
#define SHRINK_PROMOTE_MAX 32
/*
@@ -4136,19 +4788,18 @@ EXPORT_SYMBOL(kfree);
* being allocated from last increasing the chance that the last objects
* are freed in them.
*/
-int __kmem_cache_shrink(struct kmem_cache *s)
+static int __kmem_cache_do_shrink(struct kmem_cache *s)
{
int node;
int i;
struct kmem_cache_node *n;
- struct page *page;
- struct page *t;
+ struct slab *slab;
+ struct slab *t;
struct list_head discard;
struct list_head promote[SHRINK_PROMOTE_MAX];
unsigned long flags;
int ret = 0;
- flush_all(s);
for_each_kmem_cache_node(s, node, n) {
INIT_LIST_HEAD(&discard);
for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
@@ -4160,22 +4811,23 @@ int __kmem_cache_shrink(struct kmem_cache *s)
* Build lists of slabs to discard or promote.
*
* Note that concurrent frees may occur while we hold the
- * list_lock. page->inuse here is the upper limit.
+ * list_lock. slab->inuse here is the upper limit.
*/
- list_for_each_entry_safe(page, t, &n->partial, slab_list) {
- int free = page->objects - page->inuse;
+ list_for_each_entry_safe(slab, t, &n->partial, slab_list) {
+ int free = slab->objects - slab->inuse;
- /* Do not reread page->inuse */
+ /* Do not reread slab->inuse */
barrier();
/* We do not keep full slabs on the list */
BUG_ON(free <= 0);
- if (free == page->objects) {
- list_move(&page->slab_list, &discard);
+ if (free == slab->objects) {
+ list_move(&slab->slab_list, &discard);
n->nr_partial--;
+ dec_slabs_node(s, node, slab->objects);
} else if (free <= SHRINK_PROMOTE_MAX)
- list_move(&page->slab_list, promote + free - 1);
+ list_move(&slab->slab_list, promote + free - 1);
}
/*
@@ -4188,23 +4840,31 @@ int __kmem_cache_shrink(struct kmem_cache *s)
spin_unlock_irqrestore(&n->list_lock, flags);
/* Release empty slabs */
- list_for_each_entry_safe(page, t, &discard, slab_list)
- discard_slab(s, page);
+ list_for_each_entry_safe(slab, t, &discard, slab_list)
+ free_slab(s, slab);
- if (slabs_node(s, node))
+ if (node_nr_slabs(n))
ret = 1;
}
return ret;
}
+int __kmem_cache_shrink(struct kmem_cache *s)
+{
+ flush_all(s);
+ return __kmem_cache_do_shrink(s);
+}
+
static int slab_mem_going_offline_callback(void *arg)
{
struct kmem_cache *s;
mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_caches, list)
- __kmem_cache_shrink(s);
+ list_for_each_entry(s, &slab_caches, list) {
+ flush_all_cpus_locked(s);
+ __kmem_cache_do_shrink(s);
+ }
mutex_unlock(&slab_mutex);
return 0;
@@ -4212,8 +4872,6 @@ static int slab_mem_going_offline_callback(void *arg)
static void slab_mem_offline_callback(void *arg)
{
- struct kmem_cache_node *n;
- struct kmem_cache *s;
struct memory_notify *marg = arg;
int offline_node;
@@ -4227,21 +4885,12 @@ static void slab_mem_offline_callback(void *arg)
return;
mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_caches, list) {
- n = get_node(s, offline_node);
- if (n) {
- /*
- * if n->nr_slabs > 0, slabs still exist on the node
- * that is going down. We were unable to free them,
- * and offline_pages() function shouldn't call this
- * callback. So, we must fail.
- */
- BUG_ON(slabs_node(s, offline_node));
-
- s->node[offline_node] = NULL;
- kmem_cache_free(kmem_cache_node, n);
- }
- }
+ node_clear(offline_node, slab_nodes);
+ /*
+ * We no longer free kmem_cache_node structures here, as it would be
+ * racy with all get_node() users, and infeasible to protect them with
+ * slab_mutex.
+ */
mutex_unlock(&slab_mutex);
}
@@ -4268,6 +4917,12 @@ static int slab_mem_going_online_callback(void *arg)
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list) {
/*
+ * The structure may already exist if the node was previously
+ * onlined and offlined.
+ */
+ if (get_node(s, nid))
+ continue;
+ /*
* XXX: kmem_cache_alloc_node will fallback to other nodes
* since memory is not yet available from the node that
* is brought up.
@@ -4280,6 +4935,11 @@ static int slab_mem_going_online_callback(void *arg)
init_kmem_cache_node(n);
s->node[nid] = n;
}
+ /*
+ * Any cache created after this point will also have kmem_cache_node
+ * initialized for the new node.
+ */
+ node_set(nid, slab_nodes);
out:
mutex_unlock(&slab_mutex);
return ret;
@@ -4312,11 +4972,6 @@ static int slab_memory_callback(struct notifier_block *self,
return ret;
}
-static struct notifier_block slab_memory_callback_nb = {
- .notifier_call = slab_memory_callback,
- .priority = SLAB_CALLBACK_PRI,
-};
-
/********************************************************************
* Basic setup of slabs
*******************************************************************/
@@ -4342,7 +4997,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
*/
__flush_cpu_slab(s, smp_processor_id());
for_each_kmem_cache_node(s, node, n) {
- struct page *p;
+ struct slab *p;
list_for_each_entry(p, &n->partial, slab_list)
p->slab_cache = s;
@@ -4360,17 +5015,29 @@ void __init kmem_cache_init(void)
{
static __initdata struct kmem_cache boot_kmem_cache,
boot_kmem_cache_node;
+ int node;
if (debug_guardpage_minorder())
slub_max_order = 0;
+ /* Print slub debugging pointers without hashing */
+ if (__slub_debug_enabled())
+ no_hash_pointers_enable(NULL);
+
kmem_cache_node = &boot_kmem_cache_node;
kmem_cache = &boot_kmem_cache;
+ /*
+ * Initialize the nodemask for which we will allocate per node
+ * structures. Here we don't need taking slab_mutex yet.
+ */
+ for_each_node_state(node, N_NORMAL_MEMORY)
+ node_set(node, slab_nodes);
+
create_boot_cache(kmem_cache_node, "kmem_cache_node",
sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);
- register_hotmemory_notifier(&slab_memory_callback_nb);
+ hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
/* Able to allocate the per node structures */
slab_state = PARTIAL;
@@ -4401,6 +5068,10 @@ void __init kmem_cache_init(void)
void __init kmem_cache_init_late(void)
{
+#ifndef CONFIG_SLUB_TINY
+ flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0);
+ WARN_ON(!flushwq);
+#endif
}
struct kmem_cache *
@@ -4411,6 +5082,9 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
s = find_mergeable(size, align, flags, name, ctor);
if (s) {
+ if (sysfs_slab_alias(s, name))
+ return NULL;
+
s->refcount++;
/*
@@ -4419,11 +5093,6 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
*/
s->object_size = max(s->object_size, size);
s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
-
- if (sysfs_slab_alias(s, name)) {
- s->refcount--;
- s = NULL;
- }
}
return s;
@@ -4442,157 +5111,119 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
return 0;
err = sysfs_slab_add(s);
- if (err)
+ if (err) {
__kmem_cache_release(s);
-
- return err;
-}
-
-void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
-{
- struct kmem_cache *s;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return kmalloc_large(size, gfpflags);
-
- s = kmalloc_slab(size, gfpflags);
-
- if (unlikely(ZERO_OR_NULL_PTR(s)))
- return s;
-
- ret = slab_alloc(s, gfpflags, caller);
-
- /* Honor the call site pointer we received. */
- trace_kmalloc(caller, ret, size, s->size, gfpflags);
-
- return ret;
-}
-EXPORT_SYMBOL(__kmalloc_track_caller);
-
-#ifdef CONFIG_NUMA
-void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
- int node, unsigned long caller)
-{
- struct kmem_cache *s;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
- ret = kmalloc_large_node(size, gfpflags, node);
-
- trace_kmalloc_node(caller, ret,
- size, PAGE_SIZE << get_order(size),
- gfpflags, node);
-
- return ret;
+ return err;
}
- s = kmalloc_slab(size, gfpflags);
-
- if (unlikely(ZERO_OR_NULL_PTR(s)))
- return s;
-
- ret = slab_alloc_node(s, gfpflags, node, caller);
-
- /* Honor the call site pointer we received. */
- trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
+ if (s->flags & SLAB_STORE_USER)
+ debugfs_slab_add(s);
- return ret;
+ return 0;
}
-EXPORT_SYMBOL(__kmalloc_node_track_caller);
-#endif
-#ifdef CONFIG_SYSFS
-static int count_inuse(struct page *page)
+#ifdef SLAB_SUPPORTS_SYSFS
+static int count_inuse(struct slab *slab)
{
- return page->inuse;
+ return slab->inuse;
}
-static int count_total(struct page *page)
+static int count_total(struct slab *slab)
{
- return page->objects;
+ return slab->objects;
}
#endif
#ifdef CONFIG_SLUB_DEBUG
-static void validate_slab(struct kmem_cache *s, struct page *page)
+static void validate_slab(struct kmem_cache *s, struct slab *slab,
+ unsigned long *obj_map)
{
void *p;
- void *addr = page_address(page);
- unsigned long *map;
+ void *addr = slab_address(slab);
- slab_lock(page);
-
- if (!check_slab(s, page) || !on_freelist(s, page, NULL))
- goto unlock;
+ if (!check_slab(s, slab) || !on_freelist(s, slab, NULL))
+ return;
/* Now we know that a valid freelist exists */
- map = get_map(s, page);
- for_each_object(p, s, addr, page->objects) {
- u8 val = test_bit(__obj_to_index(s, addr, p), map) ?
+ __fill_map(obj_map, s, slab);
+ for_each_object(p, s, addr, slab->objects) {
+ u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ?
SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
- if (!check_object(s, page, p, val))
+ if (!check_object(s, slab, p, val))
break;
}
- put_map(map);
-unlock:
- slab_unlock(page);
}
static int validate_slab_node(struct kmem_cache *s,
- struct kmem_cache_node *n)
+ struct kmem_cache_node *n, unsigned long *obj_map)
{
unsigned long count = 0;
- struct page *page;
+ struct slab *slab;
unsigned long flags;
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, slab_list) {
- validate_slab(s, page);
+ list_for_each_entry(slab, &n->partial, slab_list) {
+ validate_slab(s, slab, obj_map);
count++;
}
- if (count != n->nr_partial)
+ if (count != n->nr_partial) {
pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
s->name, count, n->nr_partial);
+ slab_add_kunit_errors();
+ }
if (!(s->flags & SLAB_STORE_USER))
goto out;
- list_for_each_entry(page, &n->full, slab_list) {
- validate_slab(s, page);
+ list_for_each_entry(slab, &n->full, slab_list) {
+ validate_slab(s, slab, obj_map);
count++;
}
- if (count != atomic_long_read(&n->nr_slabs))
+ if (count != node_nr_slabs(n)) {
pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
- s->name, count, atomic_long_read(&n->nr_slabs));
+ s->name, count, node_nr_slabs(n));
+ slab_add_kunit_errors();
+ }
out:
spin_unlock_irqrestore(&n->list_lock, flags);
return count;
}
-static long validate_slab_cache(struct kmem_cache *s)
+long validate_slab_cache(struct kmem_cache *s)
{
int node;
unsigned long count = 0;
struct kmem_cache_node *n;
+ unsigned long *obj_map;
+
+ obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
+ if (!obj_map)
+ return -ENOMEM;
flush_all(s);
for_each_kmem_cache_node(s, node, n)
- count += validate_slab_node(s, n);
+ count += validate_slab_node(s, n, obj_map);
+
+ bitmap_free(obj_map);
return count;
}
+EXPORT_SYMBOL(validate_slab_cache);
+
+#ifdef CONFIG_DEBUG_FS
/*
* Generate lists of code addresses where slabcache objects are allocated
* and freed.
*/
struct location {
+ depot_stack_handle_t handle;
unsigned long count;
unsigned long addr;
+ unsigned long waste;
long long sum_time;
long min_time;
long max_time;
@@ -4606,8 +5237,11 @@ struct loc_track {
unsigned long max;
unsigned long count;
struct location *loc;
+ loff_t idx;
};
+static struct dentry *slab_debugfs_root;
+
static void free_loc_track(struct loc_track *t)
{
if (t->max)
@@ -4636,13 +5270,19 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
}
static int add_location(struct loc_track *t, struct kmem_cache *s,
- const struct track *track)
+ const struct track *track,
+ unsigned int orig_size)
{
long start, end, pos;
struct location *l;
- unsigned long caddr;
+ unsigned long caddr, chandle, cwaste;
unsigned long age = jiffies - track->when;
+ depot_stack_handle_t handle = 0;
+ unsigned int waste = s->object_size - orig_size;
+#ifdef CONFIG_STACKDEPOT
+ handle = READ_ONCE(track->handle);
+#endif
start = -1;
end = t->count;
@@ -4656,10 +5296,13 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
if (pos == end)
break;
- caddr = t->loc[pos].addr;
- if (track->addr == caddr) {
+ l = &t->loc[pos];
+ caddr = l->addr;
+ chandle = l->handle;
+ cwaste = l->waste;
+ if ((track->addr == caddr) && (handle == chandle) &&
+ (waste == cwaste)) {
- l = &t->loc[pos];
l->count++;
if (track->when) {
l->sum_time += age;
@@ -4682,6 +5325,11 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
if (track->addr < caddr)
end = pos;
+ else if (track->addr == caddr && handle < chandle)
+ end = pos;
+ else if (track->addr == caddr && handle == chandle &&
+ waste < cwaste)
+ end = pos;
else
start = pos;
}
@@ -4704,6 +5352,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
l->max_time = age;
l->min_pid = track->pid;
l->max_pid = track->pid;
+ l->handle = handle;
+ l->waste = waste;
cpumask_clear(to_cpumask(l->cpus));
cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
nodes_clear(l->nodes);
@@ -4712,162 +5362,25 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
}
static void process_slab(struct loc_track *t, struct kmem_cache *s,
- struct page *page, enum track_item alloc)
+ struct slab *slab, enum track_item alloc,
+ unsigned long *obj_map)
{
- void *addr = page_address(page);
+ void *addr = slab_address(slab);
+ bool is_alloc = (alloc == TRACK_ALLOC);
void *p;
- unsigned long *map;
- map = get_map(s, page);
- for_each_object(p, s, addr, page->objects)
- if (!test_bit(__obj_to_index(s, addr, p), map))
- add_location(t, s, get_track(s, p, alloc));
- put_map(map);
-}
+ __fill_map(obj_map, s, slab);
-static int list_locations(struct kmem_cache *s, char *buf,
- enum track_item alloc)
-{
- int len = 0;
- unsigned long i;
- struct loc_track t = { 0, 0, NULL };
- int node;
- struct kmem_cache_node *n;
-
- if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
- GFP_KERNEL)) {
- return sprintf(buf, "Out of memory\n");
- }
- /* Push back cpu slabs */
- flush_all(s);
-
- for_each_kmem_cache_node(s, node, n) {
- unsigned long flags;
- struct page *page;
-
- if (!atomic_long_read(&n->nr_slabs))
- continue;
-
- spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, slab_list)
- process_slab(&t, s, page, alloc);
- list_for_each_entry(page, &n->full, slab_list)
- process_slab(&t, s, page, alloc);
- spin_unlock_irqrestore(&n->list_lock, flags);
- }
-
- for (i = 0; i < t.count; i++) {
- struct location *l = &t.loc[i];
-
- if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
- break;
- len += sprintf(buf + len, "%7ld ", l->count);
-
- if (l->addr)
- len += sprintf(buf + len, "%pS", (void *)l->addr);
- else
- len += sprintf(buf + len, "<not-available>");
-
- if (l->sum_time != l->min_time) {
- len += sprintf(buf + len, " age=%ld/%ld/%ld",
- l->min_time,
- (long)div_u64(l->sum_time, l->count),
- l->max_time);
- } else
- len += sprintf(buf + len, " age=%ld",
- l->min_time);
-
- if (l->min_pid != l->max_pid)
- len += sprintf(buf + len, " pid=%ld-%ld",
- l->min_pid, l->max_pid);
- else
- len += sprintf(buf + len, " pid=%ld",
- l->min_pid);
-
- if (num_online_cpus() > 1 &&
- !cpumask_empty(to_cpumask(l->cpus)) &&
- len < PAGE_SIZE - 60)
- len += scnprintf(buf + len, PAGE_SIZE - len - 50,
- " cpus=%*pbl",
- cpumask_pr_args(to_cpumask(l->cpus)));
-
- if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
- len < PAGE_SIZE - 60)
- len += scnprintf(buf + len, PAGE_SIZE - len - 50,
- " nodes=%*pbl",
- nodemask_pr_args(&l->nodes));
-
- len += sprintf(buf + len, "\n");
- }
-
- free_loc_track(&t);
- if (!t.count)
- len += sprintf(buf, "No data\n");
- return len;
+ for_each_object(p, s, addr, slab->objects)
+ if (!test_bit(__obj_to_index(s, addr, p), obj_map))
+ add_location(t, s, get_track(s, p, alloc),
+ is_alloc ? get_orig_size(s, p) :
+ s->object_size);
}
+#endif /* CONFIG_DEBUG_FS */
#endif /* CONFIG_SLUB_DEBUG */
-#ifdef SLUB_RESILIENCY_TEST
-static void __init resiliency_test(void)
-{
- u8 *p;
- int type = KMALLOC_NORMAL;
-
- BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
-
- pr_err("SLUB resiliency testing\n");
- pr_err("-----------------------\n");
- pr_err("A. Corruption after allocation\n");
-
- p = kzalloc(16, GFP_KERNEL);
- p[16] = 0x12;
- pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
- p + 16);
-
- validate_slab_cache(kmalloc_caches[type][4]);
-
- /* Hmmm... The next two are dangerous */
- p = kzalloc(32, GFP_KERNEL);
- p[32 + sizeof(void *)] = 0x34;
- pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n",
- p);
- pr_err("If allocated object is overwritten then not detectable\n\n");
-
- validate_slab_cache(kmalloc_caches[type][5]);
- p = kzalloc(64, GFP_KERNEL);
- p += 64 + (get_cycles() & 0xff) * sizeof(void *);
- *p = 0x56;
- pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
- p);
- pr_err("If allocated object is overwritten then not detectable\n\n");
- validate_slab_cache(kmalloc_caches[type][6]);
-
- pr_err("\nB. Corruption after free\n");
- p = kzalloc(128, GFP_KERNEL);
- kfree(p);
- *p = 0x78;
- pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches[type][7]);
-
- p = kzalloc(256, GFP_KERNEL);
- kfree(p);
- p[50] = 0x9a;
- pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches[type][8]);
-
- p = kzalloc(512, GFP_KERNEL);
- kfree(p);
- p[512] = 0xab;
- pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches[type][9]);
-}
-#else
-#ifdef CONFIG_SYSFS
-static void resiliency_test(void) {};
-#endif
-#endif /* SLUB_RESILIENCY_TEST */
-
-#ifdef CONFIG_SYSFS
+#ifdef SLAB_SUPPORTS_SYSFS
enum slab_stat_type {
SL_ALL, /* All slabs */
SL_PARTIAL, /* Only partially allocated slabs */
@@ -4882,29 +5395,14 @@ enum slab_stat_type {
#define SO_OBJECTS (1 << SL_OBJECTS)
#define SO_TOTAL (1 << SL_TOTAL)
-#ifdef CONFIG_MEMCG
-static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON);
-
-static int __init setup_slub_memcg_sysfs(char *str)
-{
- int v;
-
- if (get_option(&str, &v) > 0)
- memcg_sysfs_enabled = v;
-
- return 1;
-}
-
-__setup("slub_memcg_sysfs=", setup_slub_memcg_sysfs);
-#endif
-
static ssize_t show_slab_objects(struct kmem_cache *s,
- char *buf, unsigned long flags)
+ char *buf, unsigned long flags)
{
unsigned long total = 0;
int node;
int x;
unsigned long *nodes;
+ int len = 0;
nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
if (!nodes)
@@ -4917,35 +5415,37 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
cpu);
int node;
- struct page *page;
+ struct slab *slab;
- page = READ_ONCE(c->page);
- if (!page)
+ slab = READ_ONCE(c->slab);
+ if (!slab)
continue;
- node = page_to_nid(page);
+ node = slab_nid(slab);
if (flags & SO_TOTAL)
- x = page->objects;
+ x = slab->objects;
else if (flags & SO_OBJECTS)
- x = page->inuse;
+ x = slab->inuse;
else
x = 1;
total += x;
nodes[node] += x;
- page = slub_percpu_partial_read_once(c);
- if (page) {
- node = page_to_nid(page);
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+ slab = slub_percpu_partial_read_once(c);
+ if (slab) {
+ node = slab_nid(slab);
if (flags & SO_TOTAL)
WARN_ON_ONCE(1);
else if (flags & SO_OBJECTS)
WARN_ON_ONCE(1);
else
- x = page->pages;
+ x = slab->slabs;
total += x;
nodes[node] += x;
}
+#endif
}
}
@@ -4967,12 +5467,11 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
for_each_kmem_cache_node(s, node, n) {
if (flags & SO_TOTAL)
- x = atomic_long_read(&n->total_objects);
+ x = node_nr_objs(n);
else if (flags & SO_OBJECTS)
- x = atomic_long_read(&n->total_objects) -
- count_partial(n, count_free);
+ x = node_nr_objs(n) - count_partial(n, count_free);
else
- x = atomic_long_read(&n->nr_slabs);
+ x = node_nr_slabs(n);
total += x;
nodes[node] += x;
}
@@ -4993,15 +5492,19 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
nodes[node] += x;
}
}
- x = sprintf(buf, "%lu", total);
+
+ len += sysfs_emit_at(buf, len, "%lu", total);
#ifdef CONFIG_NUMA
- for (node = 0; node < nr_node_ids; node++)
+ for (node = 0; node < nr_node_ids; node++) {
if (nodes[node])
- x += sprintf(buf + x, " N%d=%lu",
- node, nodes[node]);
+ len += sysfs_emit_at(buf, len, " N%d=%lu",
+ node, nodes[node]);
+ }
#endif
+ len += sysfs_emit_at(buf, len, "\n");
kfree(nodes);
- return x + sprintf(buf + x, "\n");
+
+ return len;
}
#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
@@ -5014,46 +5517,44 @@ struct slab_attribute {
};
#define SLAB_ATTR_RO(_name) \
- static struct slab_attribute _name##_attr = \
- __ATTR(_name, 0400, _name##_show, NULL)
+ static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400)
#define SLAB_ATTR(_name) \
- static struct slab_attribute _name##_attr = \
- __ATTR(_name, 0600, _name##_show, _name##_store)
+ static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600)
static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%u\n", s->size);
+ return sysfs_emit(buf, "%u\n", s->size);
}
SLAB_ATTR_RO(slab_size);
static ssize_t align_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%u\n", s->align);
+ return sysfs_emit(buf, "%u\n", s->align);
}
SLAB_ATTR_RO(align);
static ssize_t object_size_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%u\n", s->object_size);
+ return sysfs_emit(buf, "%u\n", s->object_size);
}
SLAB_ATTR_RO(object_size);
static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%u\n", oo_objects(s->oo));
+ return sysfs_emit(buf, "%u\n", oo_objects(s->oo));
}
SLAB_ATTR_RO(objs_per_slab);
static ssize_t order_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%u\n", oo_order(s->oo));
+ return sysfs_emit(buf, "%u\n", oo_order(s->oo));
}
SLAB_ATTR_RO(order);
static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%lu\n", s->min_partial);
+ return sysfs_emit(buf, "%lu\n", s->min_partial);
}
static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
@@ -5066,14 +5567,19 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
if (err)
return err;
- set_min_partial(s, min);
+ s->min_partial = min;
return length;
}
SLAB_ATTR(min_partial);
static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%u\n", slub_cpu_partial(s));
+ unsigned int nr_partial = 0;
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+ nr_partial = s->cpu_partial;
+#endif
+
+ return sysfs_emit(buf, "%u\n", nr_partial);
}
static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
@@ -5098,13 +5604,13 @@ static ssize_t ctor_show(struct kmem_cache *s, char *buf)
{
if (!s->ctor)
return 0;
- return sprintf(buf, "%pS\n", s->ctor);
+ return sysfs_emit(buf, "%pS\n", s->ctor);
}
SLAB_ATTR_RO(ctor);
static ssize_t aliases_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
+ return sysfs_emit(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
}
SLAB_ATTR_RO(aliases);
@@ -5120,12 +5626,6 @@ static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(cpu_slabs);
-static ssize_t objects_show(struct kmem_cache *s, char *buf)
-{
- return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
-}
-SLAB_ATTR_RO(objects);
-
static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
{
return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
@@ -5135,67 +5635,75 @@ SLAB_ATTR_RO(objects_partial);
static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
{
int objects = 0;
- int pages = 0;
- int cpu;
- int len;
+ int slabs = 0;
+ int cpu __maybe_unused;
+ int len = 0;
+#ifdef CONFIG_SLUB_CPU_PARTIAL
for_each_online_cpu(cpu) {
- struct page *page;
+ struct slab *slab;
- page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
+ slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
- if (page) {
- pages += page->pages;
- objects += page->pobjects;
- }
+ if (slab)
+ slabs += slab->slabs;
}
+#endif
- len = sprintf(buf, "%d(%d)", objects, pages);
+ /* Approximate half-full slabs, see slub_set_cpu_partial() */
+ objects = (slabs * oo_objects(s->oo)) / 2;
+ len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs);
-#ifdef CONFIG_SMP
+#ifdef CONFIG_SLUB_CPU_PARTIAL
for_each_online_cpu(cpu) {
- struct page *page;
-
- page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
-
- if (page && len < PAGE_SIZE - 20)
- len += sprintf(buf + len, " C%d=%d(%d)", cpu,
- page->pobjects, page->pages);
+ struct slab *slab;
+
+ slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
+ if (slab) {
+ slabs = READ_ONCE(slab->slabs);
+ objects = (slabs * oo_objects(s->oo)) / 2;
+ len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
+ cpu, objects, slabs);
+ }
}
#endif
- return len + sprintf(buf + len, "\n");
+ len += sysfs_emit_at(buf, len, "\n");
+
+ return len;
}
SLAB_ATTR_RO(slabs_cpu_partial);
static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
}
SLAB_ATTR_RO(reclaim_account);
static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
}
SLAB_ATTR_RO(hwcache_align);
#ifdef CONFIG_ZONE_DMA
static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
}
SLAB_ATTR_RO(cache_dma);
#endif
+#ifdef CONFIG_HARDENED_USERCOPY
static ssize_t usersize_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%u\n", s->usersize);
+ return sysfs_emit(buf, "%u\n", s->usersize);
}
SLAB_ATTR_RO(usersize);
+#endif
static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
}
SLAB_ATTR_RO(destroy_by_rcu);
@@ -5212,35 +5720,41 @@ static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(total_objects);
+static ssize_t objects_show(struct kmem_cache *s, char *buf)
+{
+ return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
+}
+SLAB_ATTR_RO(objects);
+
static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
}
SLAB_ATTR_RO(sanity_checks);
static ssize_t trace_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TRACE));
}
SLAB_ATTR_RO(trace);
static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
}
SLAB_ATTR_RO(red_zone);
static ssize_t poison_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_POISON));
}
SLAB_ATTR_RO(poison);
static ssize_t store_user_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
}
SLAB_ATTR_RO(store_user);
@@ -5255,7 +5769,7 @@ static ssize_t validate_store(struct kmem_cache *s,
{
int ret = -EINVAL;
- if (buf[0] == '1') {
+ if (buf[0] == '1' && kmem_cache_debug(s)) {
ret = validate_slab_cache(s);
if (ret >= 0)
ret = length;
@@ -5264,29 +5778,28 @@ static ssize_t validate_store(struct kmem_cache *s,
}
SLAB_ATTR(validate);
-static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
-{
- if (!(s->flags & SLAB_STORE_USER))
- return -ENOSYS;
- return list_locations(s, buf, TRACK_ALLOC);
-}
-SLAB_ATTR_RO(alloc_calls);
-
-static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
-{
- if (!(s->flags & SLAB_STORE_USER))
- return -ENOSYS;
- return list_locations(s, buf, TRACK_FREE);
-}
-SLAB_ATTR_RO(free_calls);
#endif /* CONFIG_SLUB_DEBUG */
#ifdef CONFIG_FAILSLAB
static ssize_t failslab_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
}
-SLAB_ATTR_RO(failslab);
+
+static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
+ size_t length)
+{
+ if (s->refcount > 1)
+ return -EINVAL;
+
+ if (buf[0] == '1')
+ WRITE_ONCE(s->flags, s->flags | SLAB_FAILSLAB);
+ else
+ WRITE_ONCE(s->flags, s->flags & ~SLAB_FAILSLAB);
+
+ return length;
+}
+SLAB_ATTR(failslab);
#endif
static ssize_t shrink_show(struct kmem_cache *s, char *buf)
@@ -5308,7 +5821,7 @@ SLAB_ATTR(shrink);
#ifdef CONFIG_NUMA
static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%u\n", s->remote_node_defrag_ratio / 10);
+ return sysfs_emit(buf, "%u\n", s->remote_node_defrag_ratio / 10);
}
static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
@@ -5335,7 +5848,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
{
unsigned long sum = 0;
int cpu;
- int len;
+ int len = 0;
int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL);
if (!data)
@@ -5348,16 +5861,19 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
sum += x;
}
- len = sprintf(buf, "%lu", sum);
+ len += sysfs_emit_at(buf, len, "%lu", sum);
#ifdef CONFIG_SMP
for_each_online_cpu(cpu) {
- if (data[cpu] && len < PAGE_SIZE - 20)
- len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
+ if (data[cpu])
+ len += sysfs_emit_at(buf, len, " C%d=%u",
+ cpu, data[cpu]);
}
#endif
kfree(data);
- return len + sprintf(buf + len, "\n");
+ len += sysfs_emit_at(buf, len, "\n");
+
+ return len;
}
static void clear_stat(struct kmem_cache *s, enum stat_item si)
@@ -5411,6 +5927,29 @@ STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
#endif /* CONFIG_SLUB_STATS */
+#ifdef CONFIG_KFENCE
+static ssize_t skip_kfence_show(struct kmem_cache *s, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_SKIP_KFENCE));
+}
+
+static ssize_t skip_kfence_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ int ret = length;
+
+ if (buf[0] == '0')
+ s->flags &= ~SLAB_SKIP_KFENCE;
+ else if (buf[0] == '1')
+ s->flags |= SLAB_SKIP_KFENCE;
+ else
+ ret = -EINVAL;
+
+ return ret;
+}
+SLAB_ATTR(skip_kfence);
+#endif
+
static struct attribute *slab_attrs[] = {
&slab_size_attr.attr,
&object_size_attr.attr,
@@ -5418,7 +5957,6 @@ static struct attribute *slab_attrs[] = {
&order_attr.attr,
&min_partial_attr.attr,
&cpu_partial_attr.attr,
- &objects_attr.attr,
&objects_partial_attr.attr,
&partial_attr.attr,
&cpu_slabs_attr.attr,
@@ -5432,6 +5970,7 @@ static struct attribute *slab_attrs[] = {
&slabs_cpu_partial_attr.attr,
#ifdef CONFIG_SLUB_DEBUG
&total_objects_attr.attr,
+ &objects_attr.attr,
&slabs_attr.attr,
&sanity_checks_attr.attr,
&trace_attr.attr,
@@ -5439,8 +5978,6 @@ static struct attribute *slab_attrs[] = {
&poison_attr.attr,
&store_user_attr.attr,
&validate_attr.attr,
- &alloc_calls_attr.attr,
- &free_calls_attr.attr,
#endif
#ifdef CONFIG_ZONE_DMA
&cache_dma_attr.attr,
@@ -5479,7 +6016,12 @@ static struct attribute *slab_attrs[] = {
#ifdef CONFIG_FAILSLAB
&failslab_attr.attr,
#endif
+#ifdef CONFIG_HARDENED_USERCOPY
&usersize_attr.attr,
+#endif
+#ifdef CONFIG_KFENCE
+ &skip_kfence_attr.attr,
+#endif
NULL
};
@@ -5494,7 +6036,6 @@ static ssize_t slab_attr_show(struct kobject *kobj,
{
struct slab_attribute *attribute;
struct kmem_cache *s;
- int err;
attribute = to_slab_attr(attr);
s = to_slab(kobj);
@@ -5502,9 +6043,7 @@ static ssize_t slab_attr_show(struct kobject *kobj,
if (!attribute->show)
return -EIO;
- err = attribute->show(s, buf);
-
- return err;
+ return attribute->show(s, buf);
}
static ssize_t slab_attr_store(struct kobject *kobj,
@@ -5513,7 +6052,6 @@ static ssize_t slab_attr_store(struct kobject *kobj,
{
struct slab_attribute *attribute;
struct kmem_cache *s;
- int err;
attribute = to_slab_attr(attr);
s = to_slab(kobj);
@@ -5521,8 +6059,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
if (!attribute->store)
return -EIO;
- err = attribute->store(s, buf, len);
- return err;
+ return attribute->store(s, buf, len);
}
static void kmem_cache_release(struct kobject *k)
@@ -5535,7 +6072,7 @@ static const struct sysfs_ops slab_sysfs_ops = {
.store = slab_attr_store,
};
-static struct kobj_type slab_ktype = {
+static const struct kobj_type slab_ktype = {
.sysfs_ops = &slab_sysfs_ops,
.release = kmem_cache_release,
};
@@ -5547,7 +6084,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s)
return slab_kset;
}
-#define ID_STR_LENGTH 64
+#define ID_STR_LENGTH 32
/* Create a unique string id for a slab cache:
*
@@ -5558,7 +6095,8 @@ static char *create_unique_id(struct kmem_cache *s)
char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
char *p = name;
- BUG_ON(!name);
+ if (!name)
+ return ERR_PTR(-ENOMEM);
*p++ = ':';
/*
@@ -5580,9 +6118,13 @@ static char *create_unique_id(struct kmem_cache *s)
*p++ = 'A';
if (p != name + 1)
*p++ = '-';
- p += sprintf(p, "%07u", s->size);
+ p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size);
- BUG_ON(p > name + ID_STR_LENGTH - 1);
+ if (WARN_ON(p > name + ID_STR_LENGTH - 1)) {
+ kfree(name);
+ return ERR_PTR(-EINVAL);
+ }
+ kmsan_unpoison_memory(name, p - name);
return name;
}
@@ -5593,11 +6135,6 @@ static int sysfs_slab_add(struct kmem_cache *s)
struct kset *kset = cache_kset(s);
int unmergeable = slab_unmergeable(s);
- if (!kset) {
- kobject_init(&s->kobj, &slab_ktype);
- return 0;
- }
-
if (!unmergeable && disable_higher_order_debug &&
(slub_debug & DEBUG_METADATA_FLAGS))
unmergeable = 1;
@@ -5616,14 +6153,14 @@ static int sysfs_slab_add(struct kmem_cache *s)
* for the symlinks.
*/
name = create_unique_id(s);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
}
s->kobj.kset = kset;
err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
- if (err) {
- kobject_put(&s->kobj);
+ if (err)
goto out;
- }
err = sysfs_create_group(&s->kobj, &slab_attr_group);
if (err)
@@ -5686,6 +6223,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
al->name = name;
al->next = alias_list;
alias_list = al;
+ kmsan_unpoison_memory(al, sizeof(*al));
return 0;
}
@@ -5700,7 +6238,7 @@ static int __init slab_sysfs_init(void)
if (!slab_kset) {
mutex_unlock(&slab_mutex);
pr_err("Cannot register slab subsystem.\n");
- return -ENOSYS;
+ return -ENOMEM;
}
slab_state = FULL;
@@ -5724,13 +6262,225 @@ static int __init slab_sysfs_init(void)
}
mutex_unlock(&slab_mutex);
- resiliency_test();
return 0;
}
+late_initcall(slab_sysfs_init);
+#endif /* SLAB_SUPPORTS_SYSFS */
-__initcall(slab_sysfs_init);
-#endif /* CONFIG_SYSFS */
+#if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS)
+static int slab_debugfs_show(struct seq_file *seq, void *v)
+{
+ struct loc_track *t = seq->private;
+ struct location *l;
+ unsigned long idx;
+
+ idx = (unsigned long) t->idx;
+ if (idx < t->count) {
+ l = &t->loc[idx];
+ seq_printf(seq, "%7ld ", l->count);
+
+ if (l->addr)
+ seq_printf(seq, "%pS", (void *)l->addr);
+ else
+ seq_puts(seq, "<not-available>");
+
+ if (l->waste)
+ seq_printf(seq, " waste=%lu/%lu",
+ l->count * l->waste, l->waste);
+
+ if (l->sum_time != l->min_time) {
+ seq_printf(seq, " age=%ld/%llu/%ld",
+ l->min_time, div_u64(l->sum_time, l->count),
+ l->max_time);
+ } else
+ seq_printf(seq, " age=%ld", l->min_time);
+
+ if (l->min_pid != l->max_pid)
+ seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid);
+ else
+ seq_printf(seq, " pid=%ld",
+ l->min_pid);
+
+ if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)))
+ seq_printf(seq, " cpus=%*pbl",
+ cpumask_pr_args(to_cpumask(l->cpus)));
+
+ if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
+ seq_printf(seq, " nodes=%*pbl",
+ nodemask_pr_args(&l->nodes));
+
+#ifdef CONFIG_STACKDEPOT
+ {
+ depot_stack_handle_t handle;
+ unsigned long *entries;
+ unsigned int nr_entries, j;
+
+ handle = READ_ONCE(l->handle);
+ if (handle) {
+ nr_entries = stack_depot_fetch(handle, &entries);
+ seq_puts(seq, "\n");
+ for (j = 0; j < nr_entries; j++)
+ seq_printf(seq, " %pS\n", (void *)entries[j]);
+ }
+ }
+#endif
+ seq_puts(seq, "\n");
+ }
+
+ if (!idx && !t->count)
+ seq_puts(seq, "No data\n");
+
+ return 0;
+}
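
Putting the seq_printf() calls above together, each record in alloc_traces/free_traces comes out in roughly this shape (the field names below are descriptive placeholders, not literal output):

    <count> <call site> [waste=<total>/<per object>] age=<min>/<avg>/<max> pid=<min>[-<max>] [cpus=<mask>] [nodes=<mask>]
            <stack frame>
            <stack frame>

where the waste field appears only when orig_size tracking recorded wasted bytes, and the indented stack frames appear only when CONFIG_STACKDEPOT saved a call stack for the location.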
+
+static void slab_debugfs_stop(struct seq_file *seq, void *v)
+{
+}
+
+static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos)
+{
+ struct loc_track *t = seq->private;
+
+ t->idx = ++(*ppos);
+ if (*ppos <= t->count)
+ return ppos;
+
+ return NULL;
+}
+
+static int cmp_loc_by_count(const void *a, const void *b, const void *data)
+{
+ struct location *loc1 = (struct location *)a;
+ struct location *loc2 = (struct location *)b;
+
+ if (loc1->count > loc2->count)
+ return -1;
+ else
+ return 1;
+}
+
+static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
+{
+ struct loc_track *t = seq->private;
+
+ t->idx = *ppos;
+ return ppos;
+}
+
+static const struct seq_operations slab_debugfs_sops = {
+ .start = slab_debugfs_start,
+ .next = slab_debugfs_next,
+ .stop = slab_debugfs_stop,
+ .show = slab_debugfs_show,
+};
+
+static int slab_debug_trace_open(struct inode *inode, struct file *filep)
+{
+
+ struct kmem_cache_node *n;
+ enum track_item alloc;
+ int node;
+ struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops,
+ sizeof(struct loc_track));
+ struct kmem_cache *s = file_inode(filep)->i_private;
+ unsigned long *obj_map;
+
+ if (!t)
+ return -ENOMEM;
+
+ obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
+ if (!obj_map) {
+ seq_release_private(inode, filep);
+ return -ENOMEM;
+ }
+
+ if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0)
+ alloc = TRACK_ALLOC;
+ else
+ alloc = TRACK_FREE;
+
+ if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
+ bitmap_free(obj_map);
+ seq_release_private(inode, filep);
+ return -ENOMEM;
+ }
+
+ for_each_kmem_cache_node(s, node, n) {
+ unsigned long flags;
+ struct slab *slab;
+
+ if (!node_nr_slabs(n))
+ continue;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ list_for_each_entry(slab, &n->partial, slab_list)
+ process_slab(t, s, slab, alloc, obj_map);
+ list_for_each_entry(slab, &n->full, slab_list)
+ process_slab(t, s, slab, alloc, obj_map);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ }
+
+ /* Sort locations by count */
+ sort_r(t->loc, t->count, sizeof(struct location),
+ cmp_loc_by_count, NULL, NULL);
+
+ bitmap_free(obj_map);
+ return 0;
+}
+
+static int slab_debug_trace_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct loc_track *t = seq->private;
+
+ free_loc_track(t);
+ return seq_release_private(inode, file);
+}
+
+static const struct file_operations slab_debugfs_fops = {
+ .open = slab_debug_trace_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = slab_debug_trace_release,
+};
+
+static void debugfs_slab_add(struct kmem_cache *s)
+{
+ struct dentry *slab_cache_dir;
+
+ if (unlikely(!slab_debugfs_root))
+ return;
+
+ slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);
+
+ debugfs_create_file("alloc_traces", 0400,
+ slab_cache_dir, s, &slab_debugfs_fops);
+
+ debugfs_create_file("free_traces", 0400,
+ slab_cache_dir, s, &slab_debugfs_fops);
+}
+
+void debugfs_slab_release(struct kmem_cache *s)
+{
+ debugfs_lookup_and_remove(s->name, slab_debugfs_root);
+}
+
+static int __init slab_debugfs_init(void)
+{
+ struct kmem_cache *s;
+
+ slab_debugfs_root = debugfs_create_dir("slab", NULL);
+
+ list_for_each_entry(s, &slab_caches, list)
+ if (s->flags & SLAB_STORE_USER)
+ debugfs_slab_add(s);
+
+ return 0;
+
+}
+__initcall(slab_debugfs_init);
+#endif
/*
* The /proc/slabinfo ABI
*/
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 16183d85a7d5..a044a130405b 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -27,6 +27,7 @@
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
+
#include <asm/dma.h>
#include <asm/pgalloc.h>
@@ -132,25 +133,40 @@ static void * __meminit altmap_alloc_block_buf(unsigned long size,
void __meminit vmemmap_verify(pte_t *pte, int node,
unsigned long start, unsigned long end)
{
- unsigned long pfn = pte_pfn(*pte);
+ unsigned long pfn = pte_pfn(ptep_get(pte));
int actual_node = early_pfn_to_nid(pfn);
if (node_distance(actual_node, node) > LOCAL_DISTANCE)
- pr_warn("[%lx-%lx] potential offnode page_structs\n",
+ pr_warn_once("[%lx-%lx] potential offnode page_structs\n",
start, end - 1);
}
pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
- struct vmem_altmap *altmap)
+ struct vmem_altmap *altmap,
+ struct page *reuse)
{
pte_t *pte = pte_offset_kernel(pmd, addr);
- if (pte_none(*pte)) {
+ if (pte_none(ptep_get(pte))) {
pte_t entry;
void *p;
- p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
- if (!p)
- return NULL;
+ if (!reuse) {
+ p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
+ if (!p)
+ return NULL;
+ } else {
+ /*
+ * When a PTE/PMD entry is freed from the init_mm
+ * there's a free_pages() call to this page allocated
+ * above. Thus this get_page() is paired with the
+ * put_page_testzero() on the freeing path.
+ * This can only be called by certain ZONE_DEVICE paths,
+ * and through vmemmap_populate_compound_pages() when
+ * slab is available.
+ */
+ get_page(reuse);
+ p = page_to_virt(reuse);
+ }
entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
set_pte_at(&init_mm, addr, pte, entry);
}
@@ -180,6 +196,10 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
return pmd;
}
+void __weak __meminit pmd_init(void *addr)
+{
+}
+
pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
{
pud_t *pud = pud_offset(p4d, addr);
@@ -187,11 +207,16 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
+ pmd_init(p);
pud_populate(&init_mm, pud, p);
}
return pud;
}
+void __weak __meminit pud_init(void *addr)
+{
+}
+
p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
{
p4d_t *p4d = p4d_offset(pgd, addr);
@@ -199,6 +224,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
+ pud_init(p);
p4d_populate(&init_mm, p4d, p);
}
return p4d;
@@ -216,49 +242,228 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
return pgd;
}
-int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
- int node, struct vmem_altmap *altmap)
+static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
+ struct vmem_altmap *altmap,
+ struct page *reuse)
{
- unsigned long addr = start;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ pgd = vmemmap_pgd_populate(addr, node);
+ if (!pgd)
+ return NULL;
+ p4d = vmemmap_p4d_populate(pgd, addr, node);
+ if (!p4d)
+ return NULL;
+ pud = vmemmap_pud_populate(p4d, addr, node);
+ if (!pud)
+ return NULL;
+ pmd = vmemmap_pmd_populate(pud, addr, node);
+ if (!pmd)
+ return NULL;
+ pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
+ if (!pte)
+ return NULL;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ return pte;
+}
+
+static int __meminit vmemmap_populate_range(unsigned long start,
+ unsigned long end, int node,
+ struct vmem_altmap *altmap,
+ struct page *reuse)
+{
+ unsigned long addr = start;
+ pte_t *pte;
+
for (; addr < end; addr += PAGE_SIZE) {
+ pte = vmemmap_populate_address(addr, node, altmap, reuse);
+ if (!pte)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
+ int node, struct vmem_altmap *altmap)
+{
+ return vmemmap_populate_range(start, end, node, altmap, NULL);
+}
+
+void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
+ unsigned long addr, unsigned long next)
+{
+}
+
+int __weak __meminit vmemmap_check_pmd(pmd_t *pmd, int node,
+ unsigned long addr, unsigned long next)
+{
+ return 0;
+}
+
+int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
+ int node, struct vmem_altmap *altmap)
+{
+ unsigned long addr;
+ unsigned long next;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ for (addr = start; addr < end; addr = next) {
+ next = pmd_addr_end(addr, end);
+
pgd = vmemmap_pgd_populate(addr, node);
if (!pgd)
return -ENOMEM;
+
p4d = vmemmap_p4d_populate(pgd, addr, node);
if (!p4d)
return -ENOMEM;
+
pud = vmemmap_pud_populate(p4d, addr, node);
if (!pud)
return -ENOMEM;
- pmd = vmemmap_pmd_populate(pud, addr, node);
- if (!pmd)
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(READ_ONCE(*pmd))) {
+ void *p;
+
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
+ if (p) {
+ vmemmap_set_pmd(pmd, p, node, addr, next);
+ continue;
+ } else if (altmap) {
+ /*
+ * No fallback: In any case we care about, the
+ * altmap should be reasonably sized and aligned
+ * such that vmemmap_alloc_block_buf() will always
+ * succeed. For consistency with the PTE case,
+ * return an error here as failure could indicate
+ * a configuration issue with the size of the altmap.
+ */
+ return -ENOMEM;
+ }
+ } else if (vmemmap_check_pmd(pmd, node, addr, next))
+ continue;
+ if (vmemmap_populate_basepages(addr, next, node, altmap))
return -ENOMEM;
- pte = vmemmap_pte_populate(pmd, addr, node, altmap);
+ }
+ return 0;
+}
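To illustrate the loop structure of vmemmap_populate_hugepages() above, here is a small standalone sketch (not kernel code) of how pmd_addr_end() steps through [start, end) in at most PMD-sized strides; the 2M PMD_SIZE is an illustrative x86-64 value and the helper below only models the boundary arithmetic, not the page-table population.

/* Standalone model of the PMD-granular walk used above.
 * PMD_SIZE/PMD_MASK values are illustrative (typical x86-64).
 */
#include <stdio.h>

#define PMD_SIZE  (2UL << 20)		/* 2M */
#define PMD_MASK  (~(PMD_SIZE - 1))

static unsigned long pmd_addr_end(unsigned long addr, unsigned long end)
{
	/* Next PMD boundary, clamped to the end of the range */
	unsigned long boundary = (addr + PMD_SIZE) & PMD_MASK;

	return boundary < end ? boundary : end;
}

int main(void)
{
	unsigned long start = 0x100000, end = 0x500000, next;

	for (unsigned long addr = start; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);
		printf("[%#lx, %#lx)\n", addr, next);
	}
	return 0;
}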
+
+/*
+ * For compound pages bigger than section size (e.g. x86 1G compound
+ * pages with 2M subsection size) fill the rest of sections as tail
+ * pages.
+ *
+ * Note that memremap_pages() resets the @nr_range value and will increment
+ * it after each successful range onlining. Thus the value of @nr_range
+ * at section memmap populate time corresponds to the in-progress range
+ * being onlined here.
+ */
+static bool __meminit reuse_compound_section(unsigned long start_pfn,
+ struct dev_pagemap *pgmap)
+{
+ unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
+ unsigned long offset = start_pfn -
+ PHYS_PFN(pgmap->ranges[pgmap->nr_range].start);
+
+ return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
+}
+
+static pte_t * __meminit compound_section_tail_page(unsigned long addr)
+{
+ pte_t *pte;
+
+ addr -= PAGE_SIZE;
+
+ /*
+ * Assuming sections are populated sequentially, the previous section's
+ * page data can be reused.
+ */
+ pte = pte_offset_kernel(pmd_off_k(addr), addr);
+ if (!pte)
+ return NULL;
+
+ return pte;
+}
+
+static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
+ unsigned long start,
+ unsigned long end, int node,
+ struct dev_pagemap *pgmap)
+{
+ unsigned long size, addr;
+ pte_t *pte;
+ int rc;
+
+ if (reuse_compound_section(start_pfn, pgmap)) {
+ pte = compound_section_tail_page(start);
+ if (!pte)
+ return -ENOMEM;
+
+ /*
+ * Reuse the page that was populated in the prior iteration
+ * with just tail struct pages.
+ */
+ return vmemmap_populate_range(start, end, node, NULL,
+ pte_page(ptep_get(pte)));
+ }
+
+ size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
+ for (addr = start; addr < end; addr += size) {
+ unsigned long next, last = addr + size;
+
+ /* Populate the head page vmemmap page */
+ pte = vmemmap_populate_address(addr, node, NULL, NULL);
if (!pte)
return -ENOMEM;
- vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ /* Populate the tail pages vmemmap page */
+ next = addr + PAGE_SIZE;
+ pte = vmemmap_populate_address(next, node, NULL, NULL);
+ if (!pte)
+ return -ENOMEM;
+
+ /*
+ * Reuse the previous page for the rest of tail pages
+ * See layout diagram in Documentation/mm/vmemmap_dedup.rst
+ */
+ next += PAGE_SIZE;
+ rc = vmemmap_populate_range(next, last, node, NULL,
+ pte_page(ptep_get(pte)));
+ if (rc)
+ return -ENOMEM;
}
return 0;
}
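As a rough model of the reuse decision above (purely illustrative, not part of the patch), the sketch below reproduces the arithmetic of reuse_compound_section(): only a section that is not the first one of a compound page, and only when the compound page spans more than one subsection, can reuse the tail page populated by the previous section. The PAGES_PER_SUBSECTION value and the pfn numbers are hypothetical stand-ins.

/* Standalone model of the reuse decision in reuse_compound_section().
 * Constants below are illustrative, not the kernel's actual values.
 */
#include <stdbool.h>
#include <stdio.h>

#define PAGES_PER_SUBSECTION 2048UL	/* hypothetical */

static bool reuse_tail_pages(unsigned long start_pfn,
			     unsigned long range_start_pfn,
			     unsigned long compound_nr_pages)
{
	unsigned long offset = start_pfn - range_start_pfn;

	/* Sections that are not the first of a compound page, and only
	 * when the compound page spans more than one subsection, can
	 * reuse the previously populated tail page. */
	return (offset % compound_nr_pages) != 0 &&
	       compound_nr_pages > PAGES_PER_SUBSECTION;
}

int main(void)
{
	/* 1G compound pages (262144 4K pages), range starting at pfn 0 */
	printf("%d\n", reuse_tail_pages(0,     0, 262144));	/* 0: head section */
	printf("%d\n", reuse_tail_pages(32768, 0, 262144));	/* 1: tail section */
	return 0;
}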
struct page * __meminit __populate_section_memmap(unsigned long pfn,
- unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
unsigned long start = (unsigned long) pfn_to_page(pfn);
unsigned long end = start + nr_pages * sizeof(struct page);
+ int r;
if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
!IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
return NULL;
- if (vmemmap_populate(start, end, nid, altmap))
+ if (vmemmap_can_optimize(altmap, pgmap))
+ r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
+ else
+ r = vmemmap_populate(start, end, nid, altmap);
+
+ if (r < 0)
return NULL;
return pfn_to_page(pfn);
diff --git a/mm/sparse.c b/mm/sparse.c
index b25ad8e64839..297a8b772e8d 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -13,6 +13,7 @@
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/bootmem_info.h>
#include "internal.h"
#include <asm/dma.h>
@@ -108,32 +109,6 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
}
#endif
-#ifdef CONFIG_SPARSEMEM_EXTREME
-unsigned long __section_nr(struct mem_section *ms)
-{
- unsigned long root_nr;
- struct mem_section *root = NULL;
-
- for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
- root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
- if (!root)
- continue;
-
- if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
- break;
- }
-
- VM_BUG_ON(!root);
-
- return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
-}
-#else
-unsigned long __section_nr(struct mem_section *ms)
-{
- return (unsigned long)(ms - mem_section[0]);
-}
-#endif
-
/*
* During early boot, before section_mem_map is used for an actual
* mem_map, we use section_mem_map to store the section's NUMA
@@ -142,7 +117,7 @@ unsigned long __section_nr(struct mem_section *ms)
*/
static inline unsigned long sparse_encode_early_nid(int nid)
{
- return (nid << SECTION_NID_SHIFT);
+ return ((unsigned long)nid << SECTION_NID_SHIFT);
}
static inline int sparse_early_nid(struct mem_section *section)
@@ -151,7 +126,7 @@ static inline int sparse_early_nid(struct mem_section *section)
}
/* Validate the physical addressing limitations of the model */
-void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
+static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
unsigned long *end_pfn)
{
unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
@@ -186,10 +161,9 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
* those loops early.
*/
unsigned long __highest_present_section_nr;
-static void section_mark_present(struct mem_section *ms)
+static void __section_mark_present(struct mem_section *ms,
+ unsigned long section_nr)
{
- unsigned long section_nr = __section_nr(ms);
-
if (section_nr > __highest_present_section_nr)
__highest_present_section_nr = section_nr;
@@ -257,7 +231,7 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en
if (unlikely(!mem_section)) {
unsigned long size, align;
- size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
+ size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
align = 1 << (INTERNODE_CACHE_SHIFT);
mem_section = memblock_alloc(size, align);
if (!mem_section)
@@ -279,7 +253,7 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en
if (!ms->section_mem_map) {
ms->section_mem_map = sparse_encode_early_nid(nid) |
SECTION_IS_ONLINE;
- section_mark_present(ms);
+ __section_mark_present(ms, section);
}
}
}
@@ -307,11 +281,12 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p
{
unsigned long coded_mem_map =
(unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
- BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
+ BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT);
BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
return coded_mem_map;
}
+#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Decode mem_map from the coded memmap
*/
@@ -321,6 +296,7 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
coded_mem_map &= SECTION_MAP_MASK;
return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}
+#endif /* CONFIG_MEMORY_HOTPLUG */
static void __meminit sparse_init_one_section(struct mem_section *ms,
unsigned long pnum, struct page *mem_map,
@@ -343,6 +319,16 @@ size_t mem_section_usage_size(void)
}
#ifdef CONFIG_MEMORY_HOTREMOVE
+static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
+{
+#ifndef CONFIG_NUMA
+ VM_BUG_ON(pgdat != &contig_page_data);
+ return __pa_symbol(&contig_page_data);
+#else
+ return __pa(pgdat);
+#endif
+}
+
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
unsigned long size)
@@ -360,7 +346,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
* from the same section as the pgdat where possible to avoid
* this problem.
*/
- goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
+ goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
limit = goal + (1UL << PA_SECTION_SHIFT);
nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
@@ -388,7 +374,7 @@ static void __init check_usemap_section_nr(int nid,
}
usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
- pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
+ pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
if (usemap_snr == pgdat_snr)
return;
@@ -441,7 +427,8 @@ static unsigned long __init section_map_size(void)
}
struct page __init *__populate_section_memmap(unsigned long pfn,
- unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
unsigned long size = section_map_size();
struct page *map = sparse_buffer_alloc(size);
@@ -450,8 +437,7 @@ struct page __init *__populate_section_memmap(unsigned long pfn,
if (map)
return map;
- map = memblock_alloc_try_nid_raw(size, size, addr,
- MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ map = memmap_alloc(size, size, addr, nid, false);
if (!map)
panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
__func__, size, PAGE_SIZE, nid, &addr);
@@ -466,7 +452,7 @@ static void *sparsemap_buf_end __meminitdata;
static inline void __meminit sparse_buffer_free(unsigned long size)
{
WARN_ON(!sparsemap_buf || size == 0);
- memblock_free_early(__pa(sparsemap_buf), size);
+ memblock_free(sparsemap_buf, size);
}
static void __init sparse_buffer_init(unsigned long size, int nid)
@@ -478,8 +464,7 @@ static void __init sparse_buffer_init(unsigned long size, int nid)
* and we want it to be properly aligned to the section size - this is
* especially the case for VMEMMAP which maps memmap to PMDs
*/
- sparsemap_buf = memblock_alloc_exact_nid_raw(size, section_map_size(),
- addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
sparsemap_buf_end = sparsemap_buf + size;
}
@@ -540,11 +525,12 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
break;
map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
- nid, NULL);
+ nid, NULL, NULL);
if (!map) {
pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
__func__, nid);
pnum_begin = pnum;
+ sparse_buffer_fini();
goto failed;
}
check_usemap_section_nr(nid, usage);
@@ -621,7 +607,6 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
}
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
/* Mark all memory sections within the pfn range as offline */
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
@@ -642,13 +627,13 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
ms->section_mem_map &= ~SECTION_IS_ONLINE;
}
}
-#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP
static struct page * __meminit populate_section_memmap(unsigned long pfn,
- unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
- return __populate_section_memmap(pfn, nr_pages, nid, altmap);
+ return __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap);
}
static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
@@ -716,8 +701,9 @@ static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
return rc;
}
#else
-struct page * __meminit populate_section_memmap(unsigned long pfn,
- unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
+static struct page * __meminit populate_section_memmap(unsigned long pfn,
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
return kvmalloc_node(array_size(sizeof(struct page),
PAGES_PER_SECTION), GFP_KERNEL, nid);
@@ -739,7 +725,7 @@ static void free_map_bootmem(struct page *memmap)
>> PAGE_SHIFT;
for (i = 0; i < nr_pages; i++, page++) {
- magic = (unsigned long) page->freelist;
+ magic = page->index;
BUG_ON(magic == NODE_INFO);
@@ -840,12 +826,13 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
}
static struct page * __meminit section_activate(int nid, unsigned long pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap)
+ unsigned long nr_pages, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
struct mem_section *ms = __pfn_to_section(pfn);
struct mem_section_usage *usage = NULL;
struct page *memmap;
- int rc = 0;
+ int rc;
if (!ms->usage) {
usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
@@ -872,7 +859,7 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
if (nr_pages < PAGES_PER_SECTION && early_section(ms))
return pfn_to_page(pfn);
- memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
+ memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap);
if (!memmap) {
section_deactivate(pfn, nr_pages, altmap);
return ERR_PTR(-ENOMEM);
@@ -886,7 +873,8 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
* @nid: The node to add section on
* @start_pfn: start pfn of the memory range
* @nr_pages: number of pfns to add in the section
- * @altmap: device page map
+ * @altmap: alternate pfns to allocate the memmap backing store
+ * @pgmap: alternate compound page geometry for devmap mappings
*
* This is only intended for hotplug.
*
@@ -900,7 +888,8 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
* * -ENOMEM - Out of memory.
*/
int __meminit sparse_add_section(int nid, unsigned long start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap)
+ unsigned long nr_pages, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
unsigned long section_nr = pfn_to_section_nr(start_pfn);
struct mem_section *ms;
@@ -911,7 +900,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
if (ret < 0)
return ret;
- memmap = section_activate(nid, start_pfn, nr_pages, altmap);
+ memmap = section_activate(nid, start_pfn, nr_pages, altmap, pgmap);
if (IS_ERR(memmap))
return PTR_ERR(memmap);
@@ -923,7 +912,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
ms = __nr_to_section(section_nr);
set_section_nid(section_nr, nid);
- section_mark_present(ms);
+ __section_mark_present(ms, section_nr);
/* Align memmap to section boundary in the subsection case */
if (section_nr_to_pfn(section_nr) != start_pfn)
@@ -933,39 +922,14 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
return 0;
}
-#ifdef CONFIG_MEMORY_FAILURE
-static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
+void sparse_remove_section(unsigned long pfn, unsigned long nr_pages,
+ struct vmem_altmap *altmap)
{
- int i;
+ struct mem_section *ms = __pfn_to_section(pfn);
- /*
- * A further optimization is to have per section refcounted
- * num_poisoned_pages. But that would need more space per memmap, so
- * for now just do a quick global check to speed up this routine in the
- * absence of bad pages.
- */
- if (atomic_long_read(&num_poisoned_pages) == 0)
+ if (WARN_ON_ONCE(!valid_section(ms)))
return;
- for (i = 0; i < nr_pages; i++) {
- if (PageHWPoison(&memmap[i])) {
- num_poisoned_pages_dec();
- ClearPageHWPoison(&memmap[i]);
- }
- }
-}
-#else
-static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
-{
-}
-#endif
-
-void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
- unsigned long nr_pages, unsigned long map_offset,
- struct vmem_altmap *altmap)
-{
- clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
- nr_pages - map_offset);
section_deactivate(pfn, nr_pages, altmap);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/swap.c b/mm/swap.c
index 47a47681c86b..cd8f0150ba3a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -36,71 +36,77 @@
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
#include <linux/local_lock.h>
+#include <linux/buffer_head.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>
-/* How many pages do we try to swap or page in/out together? */
+/* How many pages do we try to swap or page in/out together? As a power of 2 */
int page_cluster;
+const int page_cluster_max = 31;
-/* Protecting only lru_rotate.pvec which requires disabling interrupts */
+/* Protecting only lru_rotate.fbatch which requires disabling interrupts */
struct lru_rotate {
local_lock_t lock;
- struct pagevec pvec;
+ struct folio_batch fbatch;
};
static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
.lock = INIT_LOCAL_LOCK(lock),
};
/*
- * The following struct pagevec are grouped together because they are protected
+ * The following folio batches are grouped together because they are protected
* by disabling preemption (and interrupts remain enabled).
*/
-struct lru_pvecs {
+struct cpu_fbatches {
local_lock_t lock;
- struct pagevec lru_add;
- struct pagevec lru_deactivate_file;
- struct pagevec lru_deactivate;
- struct pagevec lru_lazyfree;
+ struct folio_batch lru_add;
+ struct folio_batch lru_deactivate_file;
+ struct folio_batch lru_deactivate;
+ struct folio_batch lru_lazyfree;
#ifdef CONFIG_SMP
- struct pagevec activate_page;
+ struct folio_batch activate;
#endif
};
-static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
+static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = {
.lock = INIT_LOCAL_LOCK(lock),
};
/*
- * This path almost never happens for VM activity - pages are normally
- * freed via pagevecs. But it gets used by networking.
+ * This path almost never happens for VM activity - pages are normally freed
+ * in batches. But it gets used by networking - and for compound pages.
*/
-static void __page_cache_release(struct page *page)
+static void __page_cache_release(struct folio *folio)
{
- if (PageLRU(page)) {
- pg_data_t *pgdat = page_pgdat(page);
+ if (folio_test_lru(folio)) {
struct lruvec *lruvec;
unsigned long flags;
- spin_lock_irqsave(&pgdat->lru_lock, flags);
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
- VM_BUG_ON_PAGE(!PageLRU(page), page);
- __ClearPageLRU(page);
- del_page_from_lru_list(page, lruvec, page_off_lru(page));
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ lruvec = folio_lruvec_lock_irqsave(folio, &flags);
+ lruvec_del_folio(lruvec, folio);
+ __folio_clear_lru_flags(folio);
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ }
+ /* See comment on folio_test_mlocked in release_pages() */
+ if (unlikely(folio_test_mlocked(folio))) {
+ long nr_pages = folio_nr_pages(folio);
+
+ __folio_clear_mlocked(folio);
+ zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
+ count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
}
- __ClearPageWaiters(page);
}
-static void __put_single_page(struct page *page)
+static void __folio_put_small(struct folio *folio)
{
- __page_cache_release(page);
- mem_cgroup_uncharge(page);
- free_unref_page(page);
+ __page_cache_release(folio);
+ mem_cgroup_uncharge(folio);
+ free_unref_page(&folio->page, 0);
}
-static void __put_compound_page(struct page *page)
+static void __folio_put_large(struct folio *folio)
{
/*
* __page_cache_release() is supposed to be called for thp, not for
@@ -108,186 +114,188 @@ static void __put_compound_page(struct page *page)
* (it's never listed to any LRU lists) and no memcg routines should
* be called for hugetlb (it has a separate hugetlb_cgroup.)
*/
- if (!PageHuge(page))
- __page_cache_release(page);
- destroy_compound_page(page);
+ if (!folio_test_hugetlb(folio))
+ __page_cache_release(folio);
+ destroy_large_folio(folio);
}
-void __put_page(struct page *page)
+void __folio_put(struct folio *folio)
{
- if (is_zone_device_page(page)) {
- put_dev_pagemap(page->pgmap);
-
- /*
- * The page belongs to the device that created pgmap. Do
- * not return it to page allocator.
- */
- return;
- }
-
- if (unlikely(PageCompound(page)))
- __put_compound_page(page);
+ if (unlikely(folio_is_zone_device(folio)))
+ free_zone_device_page(&folio->page);
+ else if (unlikely(folio_test_large(folio)))
+ __folio_put_large(folio);
else
- __put_single_page(page);
+ __folio_put_small(folio);
}
-EXPORT_SYMBOL(__put_page);
+EXPORT_SYMBOL(__folio_put);
/**
* put_pages_list() - release a list of pages
* @pages: list of pages threaded on page->lru
*
- * Release a list of pages which are strung together on page.lru. Currently
- * used by read_cache_pages() and related error recovery code.
+ * Release a list of pages which are strung together on page.lru.
*/
void put_pages_list(struct list_head *pages)
{
- while (!list_empty(pages)) {
- struct page *victim;
+ struct folio *folio, *next;
- victim = lru_to_page(pages);
- list_del(&victim->lru);
- put_page(victim);
+ list_for_each_entry_safe(folio, next, pages, lru) {
+ if (!folio_put_testzero(folio)) {
+ list_del(&folio->lru);
+ continue;
+ }
+ if (folio_test_large(folio)) {
+ list_del(&folio->lru);
+ __folio_put_large(folio);
+ continue;
+ }
+ /* LRU flag must be clear because it's passed using the lru */
}
+
+ free_unref_page_list(pages);
+ INIT_LIST_HEAD(pages);
}
EXPORT_SYMBOL(put_pages_list);
-/*
- * get_kernel_pages() - pin kernel pages in memory
- * @kiov: An array of struct kvec structures
- * @nr_segs: number of segments to pin
- * @write: pinning for read/write, currently ignored
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_segs long.
- *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno. Each page returned must be released
- * with a put_page() call when it is finished with.
- */
-int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
- struct page **pages)
+typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio);
+
+static void lru_add_fn(struct lruvec *lruvec, struct folio *folio)
{
- int seg;
+ int was_unevictable = folio_test_clear_unevictable(folio);
+ long nr_pages = folio_nr_pages(folio);
- for (seg = 0; seg < nr_segs; seg++) {
- if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
- return seg;
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- pages[seg] = kmap_to_page(kiov[seg].iov_base);
- get_page(pages[seg]);
+ /*
+ * Is an smp_mb__after_atomic() still required here, before
+ * folio_evictable() tests the mlocked flag, to rule out the possibility
+ * of stranding an evictable folio on an unevictable LRU? I think
+ * not, because __munlock_folio() only clears the mlocked flag
+ * while the LRU lock is held.
+ *
+ * (That is not true of __page_cache_release(), and not necessarily
+ * true of release_pages(): but those only clear the mlocked flag after
+ * folio_put_testzero() has excluded any other users of the folio.)
+ */
+ if (folio_evictable(folio)) {
+ if (was_unevictable)
+ __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
+ } else {
+ folio_clear_active(folio);
+ folio_set_unevictable(folio);
+ /*
+ * folio->mlock_count = !!folio_test_mlocked(folio)?
+ * But that leaves __mlock_folio() in doubt whether another
+ * actor has already counted the mlock or not. Err on the
+ * safe side, underestimate, let page reclaim fix it, rather
+ * than leaving a page on the unevictable LRU indefinitely.
+ */
+ folio->mlock_count = 0;
+ if (!was_unevictable)
+ __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
}
- return seg;
+ lruvec_add_folio(lruvec, folio);
+ trace_mm_lru_insertion(folio);
}
-EXPORT_SYMBOL_GPL(get_kernel_pages);
-/*
- * get_kernel_page() - pin a kernel page in memory
- * @start: starting kernel address
- * @write: pinning for read/write, currently ignored
- * @pages: array that receives pointer to the page pinned.
- * Must be at least nr_segs long.
- *
- * Returns 1 if page is pinned. If the page was not pinned, returns
- * -errno. The page returned must be released with a put_page() call
- * when it is finished with.
- */
-int get_kernel_page(unsigned long start, int write, struct page **pages)
-{
- const struct kvec kiov = {
- .iov_base = (void *)start,
- .iov_len = PAGE_SIZE
- };
-
- return get_kernel_pages(&kiov, 1, write, pages);
-}
-EXPORT_SYMBOL_GPL(get_kernel_page);
-
-static void pagevec_lru_move_fn(struct pagevec *pvec,
- void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
- void *arg)
+static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
{
int i;
- struct pglist_data *pgdat = NULL;
- struct lruvec *lruvec;
+ struct lruvec *lruvec = NULL;
unsigned long flags = 0;
- for (i = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
- struct pglist_data *pagepgdat = page_pgdat(page);
+ for (i = 0; i < folio_batch_count(fbatch); i++) {
+ struct folio *folio = fbatch->folios[i];
- if (pagepgdat != pgdat) {
- if (pgdat)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- pgdat = pagepgdat;
- spin_lock_irqsave(&pgdat->lru_lock, flags);
- }
+ /* block memcg migration while the folio moves between lru */
+ if (move_fn != lru_add_fn && !folio_test_clear_lru(folio))
+ continue;
+
+ lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
+ move_fn(lruvec, folio);
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
- (*move_fn)(page, lruvec, arg);
+ folio_set_lru(folio);
}
- if (pgdat)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- release_pages(pvec->pages, pvec->nr);
- pagevec_reinit(pvec);
+
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ folios_put(fbatch->folios, folio_batch_count(fbatch));
+ folio_batch_reinit(fbatch);
}
-static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
- void *arg)
+static void folio_batch_add_and_move(struct folio_batch *fbatch,
+ struct folio *folio, move_fn_t move_fn)
{
- int *pgmoved = arg;
-
- if (PageLRU(page) && !PageUnevictable(page)) {
- del_page_from_lru_list(page, lruvec, page_lru(page));
- ClearPageActive(page);
- add_page_to_lru_list_tail(page, lruvec, page_lru(page));
- (*pgmoved) += thp_nr_pages(page);
- }
+ if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) &&
+ !lru_cache_disabled())
+ return;
+ folio_batch_move_lru(fbatch, move_fn);
}
-/*
- * pagevec_move_tail() must be called with IRQ disabled.
- * Otherwise this may cause nasty races.
- */
-static void pagevec_move_tail(struct pagevec *pvec)
+static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio)
{
- int pgmoved = 0;
-
- pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
- __count_vm_events(PGROTATED, pgmoved);
+ if (!folio_test_unevictable(folio)) {
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_active(folio);
+ lruvec_add_folio_tail(lruvec, folio);
+ __count_vm_events(PGROTATED, folio_nr_pages(folio));
+ }
}
/*
- * Writeback is about to end against a page which has been marked for immediate
- * reclaim. If it still appears to be reclaimable, move it to the tail of the
- * inactive list.
+ * Writeback is about to end against a folio which has been marked for
+ * immediate reclaim. If it still appears to be reclaimable, move it
+ * to the tail of the inactive list.
+ *
+ * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races.
*/
-void rotate_reclaimable_page(struct page *page)
+void folio_rotate_reclaimable(struct folio *folio)
{
- if (!PageLocked(page) && !PageDirty(page) &&
- !PageUnevictable(page) && PageLRU(page)) {
- struct pagevec *pvec;
+ if (!folio_test_locked(folio) && !folio_test_dirty(folio) &&
+ !folio_test_unevictable(folio) && folio_test_lru(folio)) {
+ struct folio_batch *fbatch;
unsigned long flags;
- get_page(page);
+ folio_get(folio);
local_lock_irqsave(&lru_rotate.lock, flags);
- pvec = this_cpu_ptr(&lru_rotate.pvec);
- if (!pagevec_add(pvec, page) || PageCompound(page))
- pagevec_move_tail(pvec);
+ fbatch = this_cpu_ptr(&lru_rotate.fbatch);
+ folio_batch_add_and_move(fbatch, folio, lru_move_tail_fn);
local_unlock_irqrestore(&lru_rotate.lock, flags);
}
}
-void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
+void lru_note_cost(struct lruvec *lruvec, bool file,
+ unsigned int nr_io, unsigned int nr_rotated)
{
+ unsigned long cost;
+
+ /*
+ * Reflect the relative cost of incurring IO and spending CPU
+ * time on rotations. This doesn't attempt to make a precise
+ * comparison, it just says: if reloads are about comparable
+ * between the LRU lists, or rotations are overwhelmingly
+ * different between them, adjust scan balance for CPU work.
+ */
+ cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated;
+
do {
unsigned long lrusize;
+ /*
+ * Holding lruvec->lru_lock is safe here, since we are called either
+ * 1) with the pinned lruvec in reclaim, or
+ * 2) from a pre-LRU page during refault (which also holds the
+ * rcu lock, so it would be safe even if the page was on the LRU
+ * and could move simultaneously to a new lruvec).
+ */
+ spin_lock_irq(&lruvec->lru_lock);
/* Record cost event */
if (file)
- lruvec->file_cost += nr_pages;
+ lruvec->file_cost += cost;
else
- lruvec->anon_cost += nr_pages;
+ lruvec->anon_cost += cost;
/*
* Decay previous events
@@ -306,27 +314,25 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
lruvec->file_cost /= 2;
lruvec->anon_cost /= 2;
}
+ spin_unlock_irq(&lruvec->lru_lock);
} while ((lruvec = parent_lruvec(lruvec)));
}
-void lru_note_cost_page(struct page *page)
+void lru_note_cost_refault(struct folio *folio)
{
- lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)),
- page_is_file_lru(page), thp_nr_pages(page));
+ lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
+ folio_nr_pages(folio), 0);
}
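The new cost accounting above weights refault I/O against LRU rotations. A single worked example (illustrative only, using the mainline SWAP_CLUSTER_MAX value of 32) is sketched below to show that one refaulted page is counted as heavily as 32 rotated pages when the scan balance is adjusted.

/* Illustrative model of the cost weighting in lru_note_cost():
 * I/O events are weighted by SWAP_CLUSTER_MAX relative to rotations.
 */
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

static unsigned long lru_cost(unsigned long nr_io, unsigned long nr_rotated)
{
	return nr_io * SWAP_CLUSTER_MAX + nr_rotated;
}

int main(void)
{
	printf("%lu\n", lru_cost(1, 0));	/* one refaulted page -> 32 */
	printf("%lu\n", lru_cost(0, 32));	/* 32 rotated pages   -> 32 */
	return 0;
}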
-static void __activate_page(struct page *page, struct lruvec *lruvec,
- void *arg)
+static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio)
{
- if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- int lru = page_lru_base_type(page);
- int nr_pages = thp_nr_pages(page);
+ if (!folio_test_active(folio) && !folio_test_unevictable(folio)) {
+ long nr_pages = folio_nr_pages(folio);
- del_page_from_lru_list(page, lruvec, lru);
- SetPageActive(page);
- lru += LRU_ACTIVE;
- add_page_to_lru_list(page, lruvec, lru);
- trace_mm_lru_activate(page);
+ lruvec_del_folio(lruvec, folio);
+ folio_set_active(folio);
+ lruvec_add_folio(lruvec, folio);
+ trace_mm_lru_activate(folio);
__count_vm_events(PGACTIVATE, nr_pages);
__count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE,
@@ -335,79 +341,109 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
}
#ifdef CONFIG_SMP
-static void activate_page_drain(int cpu)
+static void folio_activate_drain(int cpu)
{
- struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);
+ struct folio_batch *fbatch = &per_cpu(cpu_fbatches.activate, cpu);
- if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, __activate_page, NULL);
+ if (folio_batch_count(fbatch))
+ folio_batch_move_lru(fbatch, folio_activate_fn);
}
-static bool need_activate_page_drain(int cpu)
+void folio_activate(struct folio *folio)
{
- return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
-}
-
-static void activate_page(struct page *page)
-{
- page = compound_head(page);
- if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- struct pagevec *pvec;
-
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.activate_page);
- get_page(page);
- if (!pagevec_add(pvec, page) || PageCompound(page))
- pagevec_lru_move_fn(pvec, __activate_page, NULL);
- local_unlock(&lru_pvecs.lock);
+ if (folio_test_lru(folio) && !folio_test_active(folio) &&
+ !folio_test_unevictable(folio)) {
+ struct folio_batch *fbatch;
+
+ folio_get(folio);
+ local_lock(&cpu_fbatches.lock);
+ fbatch = this_cpu_ptr(&cpu_fbatches.activate);
+ folio_batch_add_and_move(fbatch, folio, folio_activate_fn);
+ local_unlock(&cpu_fbatches.lock);
}
}
#else
-static inline void activate_page_drain(int cpu)
+static inline void folio_activate_drain(int cpu)
{
}
-static void activate_page(struct page *page)
+void folio_activate(struct folio *folio)
{
- pg_data_t *pgdat = page_pgdat(page);
+ struct lruvec *lruvec;
- page = compound_head(page);
- spin_lock_irq(&pgdat->lru_lock);
- __activate_page(page, mem_cgroup_page_lruvec(page, pgdat), NULL);
- spin_unlock_irq(&pgdat->lru_lock);
+ if (folio_test_clear_lru(folio)) {
+ lruvec = folio_lruvec_lock_irq(folio);
+ folio_activate_fn(lruvec, folio);
+ unlock_page_lruvec_irq(lruvec);
+ folio_set_lru(folio);
+ }
}
#endif
-static void __lru_cache_activate_page(struct page *page)
+static void __lru_cache_activate_folio(struct folio *folio)
{
- struct pagevec *pvec;
+ struct folio_batch *fbatch;
int i;
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.lru_add);
+ local_lock(&cpu_fbatches.lock);
+ fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);
/*
- * Search backwards on the optimistic assumption that the page being
- * activated has just been added to this pagevec. Note that only
- * the local pagevec is examined as a !PageLRU page could be in the
+ * Search backwards on the optimistic assumption that the folio being
+ * activated has just been added to this batch. Note that only
+ * the local batch is examined as a !LRU folio could be in the
* process of being released, reclaimed, migrated or on a remote
- * pagevec that is currently being drained. Furthermore, marking
- * a remote pagevec's page PageActive potentially hits a race where
- * a page is marked PageActive just after it is added to the inactive
+ * batch that is currently being drained. Furthermore, marking
+ * a remote batch's folio active potentially hits a race where
+ * a folio is marked active just after it is added to the inactive
* list causing accounting errors and BUG_ON checks to trigger.
*/
- for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
- struct page *pagevec_page = pvec->pages[i];
+ for (i = folio_batch_count(fbatch) - 1; i >= 0; i--) {
+ struct folio *batch_folio = fbatch->folios[i];
- if (pagevec_page == page) {
- SetPageActive(page);
+ if (batch_folio == folio) {
+ folio_set_active(folio);
break;
}
}
- local_unlock(&lru_pvecs.lock);
+ local_unlock(&cpu_fbatches.lock);
+}
+
+#ifdef CONFIG_LRU_GEN
+static void folio_inc_refs(struct folio *folio)
+{
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+
+ if (folio_test_unevictable(folio))
+ return;
+
+ if (!folio_test_referenced(folio)) {
+ folio_set_referenced(folio);
+ return;
+ }
+
+ if (!folio_test_workingset(folio)) {
+ folio_set_workingset(folio);
+ return;
+ }
+
+ /* see the comment on MAX_NR_TIERS */
+ do {
+ new_flags = old_flags & LRU_REFS_MASK;
+ if (new_flags == LRU_REFS_MASK)
+ break;
+
+ new_flags += BIT(LRU_REFS_PGOFF);
+ new_flags |= old_flags & ~LRU_REFS_MASK;
+ } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+}
+#else
+static void folio_inc_refs(struct folio *folio)
+{
}
+#endif /* CONFIG_LRU_GEN */
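The counter bump in folio_inc_refs() above is a saturating compare-and-swap loop over a few bits packed into folio->flags. The standalone sketch below (not kernel code) models just that loop with C11 atomics; the field width and bit offset are hypothetical, and the referenced/workingset handling from the function above is omitted.

/* Standalone sketch of a saturating cmpxchg loop on a packed counter.
 * REFS_PGOFF/REFS_MASK are illustrative, not the kernel's layout.
 */
#include <stdatomic.h>
#include <stdio.h>

#define REFS_PGOFF 8UL
#define REFS_MASK  (0x3UL << REFS_PGOFF)	/* 2-bit counter, illustrative */

static void inc_refs(atomic_ulong *flags)
{
	unsigned long old_flags = atomic_load(flags), new_flags;

	do {
		unsigned long refs = old_flags & REFS_MASK;

		if (refs == REFS_MASK)		/* already saturated */
			return;

		new_flags = (old_flags & ~REFS_MASK) +
			    (refs + (1UL << REFS_PGOFF));
	} while (!atomic_compare_exchange_weak(flags, &old_flags, new_flags));
}

int main(void)
{
	atomic_ulong flags = 0;

	for (int i = 0; i < 5; i++)
		inc_refs(&flags);
	printf("%#lx\n", atomic_load(&flags));	/* 0x300: saturated after 3 bumps */
	return 0;
}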
/*
* Mark a page as having seen activity.
@@ -419,150 +455,140 @@ static void __lru_cache_activate_page(struct page *page)
* When a newly allocated page is not yet visible, so safe for non-atomic ops,
* __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
*/
-void mark_page_accessed(struct page *page)
+void folio_mark_accessed(struct folio *folio)
{
- page = compound_head(page);
+ if (lru_gen_enabled()) {
+ folio_inc_refs(folio);
+ return;
+ }
- if (!PageReferenced(page)) {
- SetPageReferenced(page);
- } else if (PageUnevictable(page)) {
+ if (!folio_test_referenced(folio)) {
+ folio_set_referenced(folio);
+ } else if (folio_test_unevictable(folio)) {
/*
* Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
* this list is never rotated or maintained, so marking an
- * evictable page accessed has no effect.
+ * unevictable page accessed has no effect.
*/
- } else if (!PageActive(page)) {
+ } else if (!folio_test_active(folio)) {
/*
- * If the page is on the LRU, queue it for activation via
- * lru_pvecs.activate_page. Otherwise, assume the page is on a
- * pagevec, mark it active and it'll be moved to the active
+ * If the folio is on the LRU, queue it for activation via
+ * cpu_fbatches.activate. Otherwise, assume the folio is in a
+ * folio_batch, mark it active and it'll be moved to the active
* LRU on the next drain.
*/
- if (PageLRU(page))
- activate_page(page);
+ if (folio_test_lru(folio))
+ folio_activate(folio);
else
- __lru_cache_activate_page(page);
- ClearPageReferenced(page);
- workingset_activation(page);
+ __lru_cache_activate_folio(folio);
+ folio_clear_referenced(folio);
+ workingset_activation(folio);
}
- if (page_is_idle(page))
- clear_page_idle(page);
+ if (folio_test_idle(folio))
+ folio_clear_idle(folio);
}
-EXPORT_SYMBOL(mark_page_accessed);
+EXPORT_SYMBOL(folio_mark_accessed);
/**
- * lru_cache_add - add a page to a page list
- * @page: the page to be added to the LRU.
+ * folio_add_lru - Add a folio to an LRU list.
+ * @folio: The folio to be added to the LRU.
*
- * Queue the page for addition to the LRU via pagevec. The decision on whether
+ * Queue the folio for addition to the LRU. The decision on whether
* to add the page to the [in]active [file|anon] list is deferred until the
- * pagevec is drained. This gives a chance for the caller of lru_cache_add()
- * have the page added to the active list using mark_page_accessed().
+ * folio_batch is drained. This gives a chance for the caller of folio_add_lru()
+ * to have the folio added to the active list using folio_mark_accessed().
*/
-void lru_cache_add(struct page *page)
+void folio_add_lru(struct folio *folio)
{
- struct pagevec *pvec;
-
- VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
- VM_BUG_ON_PAGE(PageLRU(page), page);
-
- get_page(page);
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.lru_add);
- if (!pagevec_add(pvec, page) || PageCompound(page))
- __pagevec_lru_add(pvec);
- local_unlock(&lru_pvecs.lock);
+ struct folio_batch *fbatch;
+
+ VM_BUG_ON_FOLIO(folio_test_active(folio) &&
+ folio_test_unevictable(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+
+ /* see the comment in lru_gen_add_folio() */
+ if (lru_gen_enabled() && !folio_test_unevictable(folio) &&
+ lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
+ folio_set_active(folio);
+
+ folio_get(folio);
+ local_lock(&cpu_fbatches.lock);
+ fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);
+ folio_batch_add_and_move(fbatch, folio, lru_add_fn);
+ local_unlock(&cpu_fbatches.lock);
}
-EXPORT_SYMBOL(lru_cache_add);
+EXPORT_SYMBOL(folio_add_lru);
/**
- * lru_cache_add_inactive_or_unevictable
- * @page: the page to be added to LRU
- * @vma: vma in which page is mapped for determining reclaimability
+ * folio_add_lru_vma() - Add a folio to the appropriate LRU list for this VMA.
+ * @folio: The folio to be added to the LRU.
+ * @vma: VMA in which the folio is mapped.
*
- * Place @page on the inactive or unevictable LRU list, depending on its
- * evictability.
+ * If the VMA is mlocked, @folio is added to the unevictable list.
+ * Otherwise, it is treated the same way as folio_add_lru().
*/
-void lru_cache_add_inactive_or_unevictable(struct page *page,
- struct vm_area_struct *vma)
+void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma)
{
- bool unevictable;
-
- VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
- if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
- int nr_pages = thp_nr_pages(page);
- /*
- * We use the irq-unsafe __mod_zone_page_stat because this
- * counter is not modified from interrupt context, and the pte
- * lock is held(spinlock), which implies preemption disabled.
- */
- __mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
- count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
- }
- lru_cache_add(page);
+ if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED))
+ mlock_new_folio(folio);
+ else
+ folio_add_lru(folio);
}
/*
- * If the page can not be invalidated, it is moved to the
+ * If the folio cannot be invalidated, it is moved to the
* inactive list to speed up its reclaim. It is moved to the
* head of the list, rather than the tail, to give the flusher
* threads some time to write it out, as this is much more
* effective than the single-page writeout from reclaim.
*
- * If the page isn't page_mapped and dirty/writeback, the page
- * could reclaim asap using PG_reclaim.
+ * If the folio isn't mapped and dirty/writeback, the folio
+ * could be reclaimed asap using the reclaim flag.
*
- * 1. active, mapped page -> none
- * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
- * 3. inactive, mapped page -> none
- * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
+ * 1. active, mapped folio -> none
+ * 2. active, dirty/writeback folio -> inactive, head, reclaim
+ * 3. inactive, mapped folio -> none
+ * 4. inactive, dirty/writeback folio -> inactive, head, reclaim
* 5. inactive, clean -> inactive, tail
* 6. Others -> none
*
- * In 4, why it moves inactive's head, the VM expects the page would
- * be write it out by flusher threads as this is much more effective
+ * In 4, it moves to the head of the inactive list so the folio is
+ * written out by flusher threads as this is much more efficient
* than the single-page writeout from reclaim.
*/
-static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
- void *arg)
+static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio)
{
- int lru;
- bool active;
- int nr_pages = thp_nr_pages(page);
+ bool active = folio_test_active(folio);
+ long nr_pages = folio_nr_pages(folio);
- if (!PageLRU(page))
+ if (folio_test_unevictable(folio))
return;
- if (PageUnevictable(page))
+ /* Some processes are using the folio */
+ if (folio_mapped(folio))
return;
- /* Some processes are using the page */
- if (page_mapped(page))
- return;
-
- active = PageActive(page);
- lru = page_lru_base_type(page);
-
- del_page_from_lru_list(page, lruvec, lru + active);
- ClearPageActive(page);
- ClearPageReferenced(page);
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_active(folio);
+ folio_clear_referenced(folio);
- if (PageWriteback(page) || PageDirty(page)) {
+ if (folio_test_writeback(folio) || folio_test_dirty(folio)) {
/*
- * PG_reclaim could be raced with end_page_writeback
- * It can make readahead confusing. But race window
- * is _really_ small and it's non-critical problem.
+ * Setting the reclaim flag could race with
+ * folio_end_writeback() and confuse readahead. But the
+ * race window is _really_ small and it's not a critical
+ * problem.
*/
- add_page_to_lru_list(page, lruvec, lru);
- SetPageReclaim(page);
+ lruvec_add_folio(lruvec, folio);
+ folio_set_reclaim(folio);
} else {
/*
- * The page's writeback ends up during pagevec
- * We moves tha page into tail of inactive.
+ * The folio's writeback ended while it was in the batch.
+ * We move that folio to the tail of the inactive list.
*/
- add_page_to_lru_list_tail(page, lruvec, lru);
+ lruvec_add_folio_tail(lruvec, folio);
__count_vm_events(PGROTATED, nr_pages);
}
@@ -573,17 +599,15 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
}
}
-static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
- void *arg)
+static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio)
{
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
- int lru = page_lru_base_type(page);
- int nr_pages = thp_nr_pages(page);
+ if (!folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) {
+ long nr_pages = folio_nr_pages(folio);
- del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
- ClearPageActive(page);
- ClearPageReferenced(page);
- add_page_to_lru_list(page, lruvec, lru);
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_active(folio);
+ folio_clear_referenced(folio);
+ lruvec_add_folio(lruvec, folio);
__count_vm_events(PGDEACTIVATE, nr_pages);
__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
@@ -591,25 +615,22 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
}
}
-static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
- void *arg)
+static void lru_lazyfree_fn(struct lruvec *lruvec, struct folio *folio)
{
- if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
- !PageSwapCache(page) && !PageUnevictable(page)) {
- bool active = PageActive(page);
- int nr_pages = thp_nr_pages(page);
-
- del_page_from_lru_list(page, lruvec,
- LRU_INACTIVE_ANON + active);
- ClearPageActive(page);
- ClearPageReferenced(page);
+ if (folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) {
+ long nr_pages = folio_nr_pages(folio);
+
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_active(folio);
+ folio_clear_referenced(folio);
/*
- * Lazyfree pages are clean anonymous pages. They have
- * PG_swapbacked flag cleared, to distinguish them from normal
- * anonymous pages
+ * Lazyfree folios are clean anonymous folios. They have
+ * the swapbacked flag cleared, to distinguish them from normal
+ * anonymous folios
*/
- ClearPageSwapBacked(page);
- add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
+ folio_clear_swapbacked(folio);
+ lruvec_add_folio(lruvec, folio);
__count_vm_events(PGLAZYFREE, nr_pages);
__count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE,
@@ -618,129 +639,143 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
}
/*
- * Drain pages out of the cpu's pagevecs.
+ * Drain pages out of the cpu's folio_batch.
* Either "cpu" is the current CPU, and preemption has already been
* disabled; or "cpu" is being hot-unplugged, and is already dead.
*/
void lru_add_drain_cpu(int cpu)
{
- struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu);
+ struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);
+ struct folio_batch *fbatch = &fbatches->lru_add;
- if (pagevec_count(pvec))
- __pagevec_lru_add(pvec);
+ if (folio_batch_count(fbatch))
+ folio_batch_move_lru(fbatch, lru_add_fn);
- pvec = &per_cpu(lru_rotate.pvec, cpu);
+ fbatch = &per_cpu(lru_rotate.fbatch, cpu);
/* Disabling interrupts below acts as a compiler barrier. */
- if (data_race(pagevec_count(pvec))) {
+ if (data_race(folio_batch_count(fbatch))) {
unsigned long flags;
/* No harm done if a racing interrupt already did this */
local_lock_irqsave(&lru_rotate.lock, flags);
- pagevec_move_tail(pvec);
+ folio_batch_move_lru(fbatch, lru_move_tail_fn);
local_unlock_irqrestore(&lru_rotate.lock, flags);
}
- pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu);
- if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+ fbatch = &fbatches->lru_deactivate_file;
+ if (folio_batch_count(fbatch))
+ folio_batch_move_lru(fbatch, lru_deactivate_file_fn);
- pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu);
- if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ fbatch = &fbatches->lru_deactivate;
+ if (folio_batch_count(fbatch))
+ folio_batch_move_lru(fbatch, lru_deactivate_fn);
- pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu);
- if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
+ fbatch = &fbatches->lru_lazyfree;
+ if (folio_batch_count(fbatch))
+ folio_batch_move_lru(fbatch, lru_lazyfree_fn);
- activate_page_drain(cpu);
+ folio_activate_drain(cpu);
}
/**
- * deactivate_file_page - forcefully deactivate a file page
- * @page: page to deactivate
+ * deactivate_file_folio() - Deactivate a file folio.
+ * @folio: Folio to deactivate.
*
- * This function hints the VM that @page is a good reclaim candidate,
- * for example if its invalidation fails due to the page being dirty
+ * This function hints to the VM that @folio is a good reclaim candidate,
+ * for example if its invalidation fails due to the folio being dirty
* or under writeback.
+ *
+ * Context: Caller holds a reference on the folio.
*/
-void deactivate_file_page(struct page *page)
+void deactivate_file_folio(struct folio *folio)
{
- /*
- * In a workload with many unevictable page such as mprotect,
- * unevictable page deactivation for accelerating reclaim is pointless.
- */
- if (PageUnevictable(page))
- return;
-
- if (likely(get_page_unless_zero(page))) {
- struct pagevec *pvec;
+ struct folio_batch *fbatch;
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
+ /* Deactivating an unevictable folio will not accelerate reclaim */
+ if (folio_test_unevictable(folio))
+ return;
- if (!pagevec_add(pvec, page) || PageCompound(page))
- pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
- local_unlock(&lru_pvecs.lock);
- }
+ folio_get(folio);
+ local_lock(&cpu_fbatches.lock);
+ fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file);
+ folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn);
+ local_unlock(&cpu_fbatches.lock);
}
/*
- * deactivate_page - deactivate a page
- * @page: page to deactivate
+ * folio_deactivate - deactivate a folio
+ * @folio: folio to deactivate
*
- * deactivate_page() moves @page to the inactive list if @page was on the active
- * list and was not an unevictable page. This is done to accelerate the reclaim
- * of @page.
+ * folio_deactivate() moves @folio to the inactive list if @folio was on the
+ * active list and was not unevictable. This is done to accelerate the
+ * reclaim of @folio.
*/
-void deactivate_page(struct page *page)
+void folio_deactivate(struct folio *folio)
{
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
- struct pagevec *pvec;
-
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
- get_page(page);
- if (!pagevec_add(pvec, page) || PageCompound(page))
- pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
- local_unlock(&lru_pvecs.lock);
+ if (folio_test_lru(folio) && !folio_test_unevictable(folio) &&
+ (folio_test_active(folio) || lru_gen_enabled())) {
+ struct folio_batch *fbatch;
+
+ folio_get(folio);
+ local_lock(&cpu_fbatches.lock);
+ fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate);
+ folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn);
+ local_unlock(&cpu_fbatches.lock);
}
}
/**
- * mark_page_lazyfree - make an anon page lazyfree
- * @page: page to deactivate
+ * folio_mark_lazyfree - make an anon folio lazyfree
+ * @folio: folio to deactivate
*
- * mark_page_lazyfree() moves @page to the inactive file list.
- * This is done to accelerate the reclaim of @page.
+ * folio_mark_lazyfree() moves @folio to the inactive file list.
+ * This is done to accelerate the reclaim of @folio.
*/
-void mark_page_lazyfree(struct page *page)
+void folio_mark_lazyfree(struct folio *folio)
{
- if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
- !PageSwapCache(page) && !PageUnevictable(page)) {
- struct pagevec *pvec;
-
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
- get_page(page);
- if (!pagevec_add(pvec, page) || PageCompound(page))
- pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
- local_unlock(&lru_pvecs.lock);
+ if (folio_test_lru(folio) && folio_test_anon(folio) &&
+ folio_test_swapbacked(folio) && !folio_test_swapcache(folio) &&
+ !folio_test_unevictable(folio)) {
+ struct folio_batch *fbatch;
+
+ folio_get(folio);
+ local_lock(&cpu_fbatches.lock);
+ fbatch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree);
+ folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn);
+ local_unlock(&cpu_fbatches.lock);
}
}
void lru_add_drain(void)
{
- local_lock(&lru_pvecs.lock);
+ local_lock(&cpu_fbatches.lock);
+ lru_add_drain_cpu(smp_processor_id());
+ local_unlock(&cpu_fbatches.lock);
+ mlock_drain_local();
+}
+
+/*
+ * It's called from per-cpu workqueue context in SMP case so
+ * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on
+ * the same cpu. It shouldn't be a problem in !SMP case since
+ * the core is only one and the locks will disable preemption.
+ */
+static void lru_add_and_bh_lrus_drain(void)
+{
+ local_lock(&cpu_fbatches.lock);
lru_add_drain_cpu(smp_processor_id());
- local_unlock(&lru_pvecs.lock);
+ local_unlock(&cpu_fbatches.lock);
+ invalidate_bh_lrus_cpu();
+ mlock_drain_local();
}
void lru_add_drain_cpu_zone(struct zone *zone)
{
- local_lock(&lru_pvecs.lock);
+ local_lock(&cpu_fbatches.lock);
lru_add_drain_cpu(smp_processor_id());
drain_local_pages(zone);
- local_unlock(&lru_pvecs.lock);
+ local_unlock(&cpu_fbatches.lock);
+ mlock_drain_local();
}
#ifdef CONFIG_SMP
@@ -749,7 +784,22 @@ static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
- lru_add_drain();
+ lru_add_and_bh_lrus_drain();
+}
+
+static bool cpu_needs_drain(unsigned int cpu)
+{
+ struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);
+
+ /* Check these in order of likelihood that they're not zero */
+ return folio_batch_count(&fbatches->lru_add) ||
+ data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) ||
+ folio_batch_count(&fbatches->lru_deactivate_file) ||
+ folio_batch_count(&fbatches->lru_deactivate) ||
+ folio_batch_count(&fbatches->lru_lazyfree) ||
+ folio_batch_count(&fbatches->activate) ||
+ need_mlock_drain(cpu) ||
+ has_bh_in_lru(cpu, NULL);
}
/*
@@ -759,7 +809,7 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
* Calling this function with cpu hotplug locks held can actually lead
* to obscure indirect dependencies via WQ context.
*/
-void lru_add_drain_all(void)
+static inline void __lru_add_drain_all(bool force_all_cpus)
{
/*
* lru_drain_gen - Global pages generation number
@@ -784,8 +834,9 @@ void lru_add_drain_all(void)
return;
/*
- * Guarantee pagevec counter stores visible by this CPU are visible to
- * other CPUs before loading the current drain generation.
+ * Guarantee folio_batch counter stores visible by this CPU
+ * are visible to other CPUs before loading the current drain
+ * generation.
*/
smp_mb();
@@ -804,21 +855,22 @@ void lru_add_drain_all(void)
* (C) Exit the draining operation if a newer generation, from another
* lru_add_drain_all(), was already scheduled for draining. Check (A).
*/
- if (unlikely(this_gen != lru_drain_gen))
+ if (unlikely(this_gen != lru_drain_gen && !force_all_cpus))
goto done;
/*
* (D) Increment global generation number
*
* Pairs with smp_load_acquire() at (B), outside of the critical
- * section. Use a full memory barrier to guarantee that the new global
- * drain generation number is stored before loading pagevec counters.
+ * section. Use a full memory barrier to guarantee that the
+ * new global drain generation number is stored before loading
+ * folio_batch counters.
*
* This pairing must be done here, before the for_each_online_cpu loop
* below which drains the page vectors.
*
* Let x, y, and z represent some system CPU numbers, where x < y < z.
- * Assume CPU #z is is in the middle of the for_each_online_cpu loop
+ * Assume CPU #z is in the middle of the for_each_online_cpu loop
* below and has already reached CPU #y's per-cpu data. CPU #x comes
* along, adds some pages to its per-cpu vectors, then calls
* lru_add_drain_all().
@@ -834,12 +886,7 @@ void lru_add_drain_all(void)
for_each_online_cpu(cpu) {
struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
- if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
- data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) ||
- pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
- pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
- pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
- need_activate_page_drain(cpu)) {
+ if (cpu_needs_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
queue_work_on(cpu, mm_percpu_wq, work);
__cpumask_set_cpu(cpu, &has_work);
@@ -852,6 +899,11 @@ void lru_add_drain_all(void)
done:
mutex_unlock(&lock);
}
+
+void lru_add_drain_all(void)
+{
+ __lru_add_drain_all(false);
+}
#else
void lru_add_drain_all(void)
{
@@ -859,94 +911,132 @@ void lru_add_drain_all(void)
}
#endif /* CONFIG_SMP */
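
For readers unfamiliar with the generation trick described in the comments above, the sketch below reduces __lru_add_drain_all() to its skeleton. It is illustrative only: the names drain_gen, drain_lock and drain_all_sketch() are hypothetical, and the real function additionally tracks a cpumask of queued work and flushes it before returning.

static unsigned int drain_gen;		/* hypothetical stand-in for lru_drain_gen */
static DEFINE_MUTEX(drain_lock);

static void drain_all_sketch(bool force_all_cpus)
{
	unsigned int this_gen;

	/* (B) snapshot the generation before taking the lock */
	this_gen = smp_load_acquire(&drain_gen);

	mutex_lock(&drain_lock);

	/*
	 * (C) a newer generation was already drained while we waited for
	 * the mutex; that drain covered our batches, so nothing to do.
	 */
	if (this_gen != drain_gen && !force_all_cpus)
		goto done;

	/* (D) open a new generation, then queue work only where needed */
	WRITE_ONCE(drain_gen, drain_gen + 1);
	smp_mb();
	/* for_each_online_cpu(): if (cpu_needs_drain(cpu)) queue_work_on(...) */
done:
	mutex_unlock(&drain_lock);
}
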
+atomic_t lru_disable_count = ATOMIC_INIT(0);
+
+/*
+ * lru_cache_disable() needs to be called before we start compiling
+ * a list of pages to be migrated using isolate_lru_page().
+ * It drains the pages on the LRU caches and then disables the caches
+ * on all CPUs until lru_cache_enable() is called.
+ *
+ * Must be paired with a call to lru_cache_enable().
+ */
+void lru_cache_disable(void)
+{
+ atomic_inc(&lru_disable_count);
+ /*
+ * Readers of lru_disable_count are protected by either disabling
+ * preemption or rcu_read_lock:
+ *
+ * preempt_disable, local_irq_disable [bh_lru_lock()]
+ * rcu_read_lock [rt_spin_lock CONFIG_PREEMPT_RT]
+ * preempt_disable [local_lock !CONFIG_PREEMPT_RT]
+ *
+ * Since kernel v5.1, synchronize_rcu() is guaranteed to wait on
+ * preempt_disable() regions of code. So any CPU which sees
+ * lru_disable_count = 0 will have exited the critical
+ * section when synchronize_rcu() returns.
+ */
+ synchronize_rcu_expedited();
+#ifdef CONFIG_SMP
+ __lru_add_drain_all(true);
+#else
+ lru_add_and_bh_lrus_drain();
+#endif
+}
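
The pairing the comment asks for looks roughly like the sketch below. It is not taken from this patch; lru_cache_enable() is only referenced above, and build_migration_list()/migrate_list() are placeholder names for whatever the caller does between the two calls.

static int migrate_range_sketch(struct list_head *pagelist)
{
	int ret;

	lru_cache_disable();		/* drain and keep per-CPU LRU batches off */

	ret = build_migration_list(pagelist);	/* placeholder: isolate_lru_page() etc. */
	if (!ret)
		ret = migrate_list(pagelist);	/* placeholder */

	lru_cache_enable();		/* re-enable per-CPU LRU batching */
	return ret;
}
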
+
/**
* release_pages - batched put_page()
- * @pages: array of pages to release
+ * @arg: array of pages to release
* @nr: number of pages
*
- * Decrement the reference count on all the pages in @pages. If it
+ * Decrement the reference count on all the pages in @arg. If it
* fell to zero, remove the page from the LRU and free it.
+ *
+ * Note that the argument can be an array of pages, encoded pages,
+ * or folio pointers. We ignore any encoded bits, and turn any of
+ * them into just a folio that gets freed.
*/
-void release_pages(struct page **pages, int nr)
+void release_pages(release_pages_arg arg, int nr)
{
int i;
+ struct encoded_page **encoded = arg.encoded_pages;
LIST_HEAD(pages_to_free);
- struct pglist_data *locked_pgdat = NULL;
- struct lruvec *lruvec;
- unsigned long flags;
+ struct lruvec *lruvec = NULL;
+ unsigned long flags = 0;
unsigned int lock_batch;
for (i = 0; i < nr; i++) {
- struct page *page = pages[i];
+ struct folio *folio;
+
+ /* Turn any of the argument types into a folio */
+ folio = page_folio(encoded_page_ptr(encoded[i]));
/*
* Make sure the IRQ-safe lock-holding time does not get
* excessive with a continuous string of pages from the
- * same pgdat. The lock is held only if pgdat != NULL.
+ * same lruvec. The lock is held only if lruvec != NULL.
*/
- if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
- spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
- locked_pgdat = NULL;
+ if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = NULL;
}
- page = compound_head(page);
- if (is_huge_zero_page(page))
+ if (is_huge_zero_page(&folio->page))
continue;
- if (is_zone_device_page(page)) {
- if (locked_pgdat) {
- spin_unlock_irqrestore(&locked_pgdat->lru_lock,
- flags);
- locked_pgdat = NULL;
+ if (folio_is_zone_device(folio)) {
+ if (lruvec) {
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = NULL;
}
- /*
- * ZONE_DEVICE pages that return 'false' from
- * page_is_devmap_managed() do not require special
- * processing, and instead, expect a call to
- * put_page_testzero().
- */
- if (page_is_devmap_managed(page)) {
- put_devmap_managed_page(page);
+ if (put_devmap_managed_page(&folio->page))
continue;
- }
+ if (folio_put_testzero(folio))
+ free_zone_device_page(&folio->page);
+ continue;
}
- if (!put_page_testzero(page))
+ if (!folio_put_testzero(folio))
continue;
- if (PageCompound(page)) {
- if (locked_pgdat) {
- spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
- locked_pgdat = NULL;
+ if (folio_test_large(folio)) {
+ if (lruvec) {
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = NULL;
}
- __put_compound_page(page);
+ __folio_put_large(folio);
continue;
}
- if (PageLRU(page)) {
- struct pglist_data *pgdat = page_pgdat(page);
+ if (folio_test_lru(folio)) {
+ struct lruvec *prev_lruvec = lruvec;
- if (pgdat != locked_pgdat) {
- if (locked_pgdat)
- spin_unlock_irqrestore(&locked_pgdat->lru_lock,
- flags);
+ lruvec = folio_lruvec_relock_irqsave(folio, lruvec,
+ &flags);
+ if (prev_lruvec != lruvec)
lock_batch = 0;
- locked_pgdat = pgdat;
- spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
- }
- lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
- VM_BUG_ON_PAGE(!PageLRU(page), page);
- __ClearPageLRU(page);
- del_page_from_lru_list(page, lruvec, page_off_lru(page));
+ lruvec_del_folio(lruvec, folio);
+ __folio_clear_lru_flags(folio);
}
- __ClearPageWaiters(page);
+ /*
+ * In rare cases, when truncation or holepunching raced with
+ * munlock after VM_LOCKED was cleared, Mlocked may still be
+ * found set here. This does not indicate a problem, unless
+ * "unevictable_pgs_cleared" appears worryingly large.
+ */
+ if (unlikely(folio_test_mlocked(folio))) {
+ __folio_clear_mlocked(folio);
+ zone_stat_sub_folio(folio, NR_MLOCK);
+ count_vm_event(UNEVICTABLE_PGCLEARED);
+ }
- list_add(&page->lru, &pages_to_free);
+ list_add(&folio->lru, &pages_to_free);
}
- if (locked_pgdat)
- spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);
mem_cgroup_uncharge_list(&pages_to_free);
free_unref_page_list(&pages_to_free);
@@ -954,225 +1044,47 @@ void release_pages(struct page **pages, int nr)
EXPORT_SYMBOL(release_pages);
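
Because release_pages() now takes a release_pages_arg (assumed here to be a transparent union over struct page **, struct folio ** and struct encoded_page **), existing callers keep compiling unchanged and folio arrays can be passed directly. A minimal sketch:

static void put_page_array_sketch(struct page **pages, int nr)
{
	release_pages(pages, nr);	/* refcounts dropped; zero-ref pages freed */
}

static void put_folio_array_sketch(struct folio **folios, int nr)
{
	release_pages(folios, nr);	/* folio arrays are accepted as-is */
}
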
/*
- * The pages which we're about to release may be in the deferred lru-addition
+ * The folios which we're about to release may be in the deferred lru-addition
* queues. That would prevent them from really being freed right now. That's
- * OK from a correctness point of view but is inefficient - those pages may be
+ * OK from a correctness point of view but is inefficient - those folios may be
* cache-warm and we want to give them back to the page allocator ASAP.
*
- * So __pagevec_release() will drain those queues here. __pagevec_lru_add()
- * and __pagevec_lru_add_active() call release_pages() directly to avoid
+ * So __folio_batch_release() will drain those queues here.
+ * folio_batch_move_lru() calls folios_put() directly to avoid
* mutual recursion.
*/
-void __pagevec_release(struct pagevec *pvec)
+void __folio_batch_release(struct folio_batch *fbatch)
{
- if (!pvec->percpu_pvec_drained) {
+ if (!fbatch->percpu_pvec_drained) {
lru_add_drain();
- pvec->percpu_pvec_drained = true;
- }
- release_pages(pvec->pages, pagevec_count(pvec));
- pagevec_reinit(pvec);
-}
-EXPORT_SYMBOL(__pagevec_release);
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/* used by __split_huge_page_refcount() */
-void lru_add_page_tail(struct page *page, struct page *page_tail,
- struct lruvec *lruvec, struct list_head *list)
-{
- VM_BUG_ON_PAGE(!PageHead(page), page);
- VM_BUG_ON_PAGE(PageCompound(page_tail), page);
- VM_BUG_ON_PAGE(PageLRU(page_tail), page);
- lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock);
-
- if (!list)
- SetPageLRU(page_tail);
-
- if (likely(PageLRU(page)))
- list_add_tail(&page_tail->lru, &page->lru);
- else if (list) {
- /* page reclaim is reclaiming a huge page */
- get_page(page_tail);
- list_add_tail(&page_tail->lru, list);
- } else {
- /*
- * Head page has not yet been counted, as an hpage,
- * so we must account for each subpage individually.
- *
- * Put page_tail on the list at the correct position
- * so they all end up in order.
- */
- add_page_to_lru_list_tail(page_tail, lruvec,
- page_lru(page_tail));
- }
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
- void *arg)
-{
- enum lru_list lru;
- int was_unevictable = TestClearPageUnevictable(page);
- int nr_pages = thp_nr_pages(page);
-
- VM_BUG_ON_PAGE(PageLRU(page), page);
-
- /*
- * Page becomes evictable in two ways:
- * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
- * 2) Before acquiring LRU lock to put the page to correct LRU and then
- * a) do PageLRU check with lock [check_move_unevictable_pages]
- * b) do PageLRU check before lock [clear_page_mlock]
- *
- * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need
- * following strict ordering:
- *
- * #0: __pagevec_lru_add_fn #1: clear_page_mlock
- *
- * SetPageLRU() TestClearPageMlocked()
- * smp_mb() // explicit ordering // above provides strict
- * // ordering
- * PageMlocked() PageLRU()
- *
- *
- * if '#1' does not observe setting of PG_lru by '#0' and fails
- * isolation, the explicit barrier will make sure that page_evictable
- * check will put the page in correct LRU. Without smp_mb(), SetPageLRU
- * can be reordered after PageMlocked check and can make '#1' to fail
- * the isolation of the page whose Mlocked bit is cleared (#0 is also
- * looking at the same page) and the evictable page will be stranded
- * in an unevictable LRU.
- */
- SetPageLRU(page);
- smp_mb__after_atomic();
-
- if (page_evictable(page)) {
- lru = page_lru(page);
- if (was_unevictable)
- __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
- } else {
- lru = LRU_UNEVICTABLE;
- ClearPageActive(page);
- SetPageUnevictable(page);
- if (!was_unevictable)
- __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
+ fbatch->percpu_pvec_drained = true;
}
-
- add_page_to_lru_list(page, lruvec, lru);
- trace_mm_lru_insertion(page, lru);
-}
-
-/*
- * Add the passed pages to the LRU, then drop the caller's refcount
- * on them. Reinitialises the caller's pagevec.
- */
-void __pagevec_lru_add(struct pagevec *pvec)
-{
- pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
-}
-
-/**
- * pagevec_lookup_entries - gang pagecache lookup
- * @pvec: Where the resulting entries are placed
- * @mapping: The address_space to search
- * @start: The starting entry index
- * @nr_entries: The maximum number of pages
- * @indices: The cache indices corresponding to the entries in @pvec
- *
- * pagevec_lookup_entries() will search for and return a group of up
- * to @nr_pages pages and shadow entries in the mapping. All
- * entries are placed in @pvec. pagevec_lookup_entries() takes a
- * reference against actual pages in @pvec.
- *
- * The search returns a group of mapping-contiguous entries with
- * ascending indexes. There may be holes in the indices due to
- * not-present entries.
- *
- * Only one subpage of a Transparent Huge Page is returned in one call:
- * allowing truncate_inode_pages_range() to evict the whole THP without
- * cycling through a pagevec of extra references.
- *
- * pagevec_lookup_entries() returns the number of entries which were
- * found.
- */
-unsigned pagevec_lookup_entries(struct pagevec *pvec,
- struct address_space *mapping,
- pgoff_t start, unsigned nr_entries,
- pgoff_t *indices)
-{
- pvec->nr = find_get_entries(mapping, start, nr_entries,
- pvec->pages, indices);
- return pagevec_count(pvec);
+ release_pages(fbatch->folios, folio_batch_count(fbatch));
+ folio_batch_reinit(fbatch);
}
+EXPORT_SYMBOL(__folio_batch_release);
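
Typical gather-and-release use of a folio_batch, assuming the folio_batch_init()/folio_batch_add()/folio_batch_release() helpers that replaced the pagevec equivalents; folio_batch_release() is the wrapper that ends up in __folio_batch_release() when the batch is non-empty.

static void drop_folios_sketch(struct folio **folios, unsigned int nr)
{
	struct folio_batch fbatch;
	unsigned int i;

	folio_batch_init(&fbatch);
	for (i = 0; i < nr; i++) {
		/* folio_batch_add() returns the space left; flush when full */
		if (!folio_batch_add(&fbatch, folios[i]))
			folio_batch_release(&fbatch);
	}
	folio_batch_release(&fbatch);
}
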
/**
- * pagevec_remove_exceptionals - pagevec exceptionals pruning
- * @pvec: The pagevec to prune
+ * folio_batch_remove_exceptionals() - Prune non-folios from a batch.
+ * @fbatch: The batch to prune
*
- * pagevec_lookup_entries() fills both pages and exceptional radix
- * tree entries into the pagevec. This function prunes all
- * exceptionals from @pvec without leaving holes, so that it can be
- * passed on to page-only pagevec operations.
+ * find_get_entries() fills a batch with both folios and shadow/swap/DAX
+ * entries. This function prunes all the non-folio entries from @fbatch
+ * without leaving holes, so that it can be passed on to folio-only batch
+ * operations.
*/
-void pagevec_remove_exceptionals(struct pagevec *pvec)
+void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
{
- int i, j;
+ unsigned int i, j;
- for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
- if (!xa_is_value(page))
- pvec->pages[j++] = page;
+ for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) {
+ struct folio *folio = fbatch->folios[i];
+ if (!xa_is_value(folio))
+ fbatch->folios[j++] = folio;
}
- pvec->nr = j;
-}
-
-/**
- * pagevec_lookup_range - gang pagecache lookup
- * @pvec: Where the resulting pages are placed
- * @mapping: The address_space to search
- * @start: The starting page index
- * @end: The final page index
- *
- * pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE
- * pages in the mapping starting from index @start and upto index @end
- * (inclusive). The pages are placed in @pvec. pagevec_lookup() takes a
- * reference against the pages in @pvec.
- *
- * The search returns a group of mapping-contiguous pages with ascending
- * indexes. There may be holes in the indices due to not-present pages. We
- * also update @start to index the next page for the traversal.
- *
- * pagevec_lookup_range() returns the number of pages which were found. If this
- * number is smaller than PAGEVEC_SIZE, the end of specified range has been
- * reached.
- */
-unsigned pagevec_lookup_range(struct pagevec *pvec,
- struct address_space *mapping, pgoff_t *start, pgoff_t end)
-{
- pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE,
- pvec->pages);
- return pagevec_count(pvec);
-}
-EXPORT_SYMBOL(pagevec_lookup_range);
-
-unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
- struct address_space *mapping, pgoff_t *index, pgoff_t end,
- xa_mark_t tag)
-{
- pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
- PAGEVEC_SIZE, pvec->pages);
- return pagevec_count(pvec);
+ fbatch->nr = j;
}
-EXPORT_SYMBOL(pagevec_lookup_range_tag);
-unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
- struct address_space *mapping, pgoff_t *index, pgoff_t end,
- xa_mark_t tag, unsigned max_pages)
-{
- pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
- min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages);
- return pagevec_count(pvec);
-}
-EXPORT_SYMBOL(pagevec_lookup_range_nr_tag);
/*
* Perform any setup for the swap system
*/
@@ -1190,26 +1102,3 @@ void __init swap_setup(void)
* _really_ don't want to cluster much more
*/
}
-
-#ifdef CONFIG_DEV_PAGEMAP_OPS
-void put_devmap_managed_page(struct page *page)
-{
- int count;
-
- if (WARN_ON_ONCE(!page_is_devmap_managed(page)))
- return;
-
- count = page_ref_dec_return(page);
-
- /*
- * devmap page refcounts are 1-based, rather than 0-based: if
- * refcount is 1, then the page is free and the refcount is
- * stable because nobody holds a reference on the page.
- */
- if (count == 1)
- free_devmap_managed_page(page);
- else if (!count)
- __put_page(page);
-}
-EXPORT_SYMBOL(put_devmap_managed_page);
-#endif
diff --git a/mm/swap.h b/mm/swap.h
new file mode 100644
index 000000000000..7c033d793f15
--- /dev/null
+++ b/mm/swap.h
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MM_SWAP_H
+#define _MM_SWAP_H
+
+#ifdef CONFIG_SWAP
+#include <linux/blk_types.h> /* for bio_end_io_t */
+
+/* linux/mm/page_io.c */
+int sio_pool_init(void);
+struct swap_iocb;
+void swap_readpage(struct page *page, bool do_poll, struct swap_iocb **plug);
+void __swap_read_unplug(struct swap_iocb *plug);
+static inline void swap_read_unplug(struct swap_iocb *plug)
+{
+ if (unlikely(plug))
+ __swap_read_unplug(plug);
+}
+void swap_write_unplug(struct swap_iocb *sio);
+int swap_writepage(struct page *page, struct writeback_control *wbc);
+void __swap_writepage(struct page *page, struct writeback_control *wbc);
+
+/* linux/mm/swap_state.c */
+/* One swap address space for each 64M swap space */
+#define SWAP_ADDRESS_SPACE_SHIFT 14
+#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT)
+extern struct address_space *swapper_spaces[];
+#define swap_address_space(entry) \
+ (&swapper_spaces[swp_type(entry)][swp_offset(entry) \
+ >> SWAP_ADDRESS_SPACE_SHIFT])
+
+void show_swap_cache_info(void);
+bool add_to_swap(struct folio *folio);
+void *get_shadow_from_swap_cache(swp_entry_t entry);
+int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
+ gfp_t gfp, void **shadowp);
+void __delete_from_swap_cache(struct folio *folio,
+ swp_entry_t entry, void *shadow);
+void delete_from_swap_cache(struct folio *folio);
+void clear_shadow_from_swap_cache(int type, unsigned long begin,
+ unsigned long end);
+struct folio *swap_cache_get_folio(swp_entry_t entry,
+ struct vm_area_struct *vma, unsigned long addr);
+struct folio *filemap_get_incore_folio(struct address_space *mapping,
+ pgoff_t index);
+
+struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ bool do_poll,
+ struct swap_iocb **plug);
+struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ bool *new_page_allocated);
+struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
+ struct vm_fault *vmf);
+struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
+ struct vm_fault *vmf);
+
+static inline unsigned int folio_swap_flags(struct folio *folio)
+{
+ return page_swap_info(&folio->page)->flags;
+}
+#else /* CONFIG_SWAP */
+struct swap_iocb;
+static inline void swap_readpage(struct page *page, bool do_poll,
+ struct swap_iocb **plug)
+{
+}
+static inline void swap_write_unplug(struct swap_iocb *sio)
+{
+}
+
+static inline struct address_space *swap_address_space(swp_entry_t entry)
+{
+ return NULL;
+}
+
+static inline void show_swap_cache_info(void)
+{
+}
+
+static inline struct page *swap_cluster_readahead(swp_entry_t entry,
+ gfp_t gfp_mask, struct vm_fault *vmf)
+{
+ return NULL;
+}
+
+static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
+ struct vm_fault *vmf)
+{
+ return NULL;
+}
+
+static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
+{
+ return 0;
+}
+
+static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ return NULL;
+}
+
+static inline
+struct folio *filemap_get_incore_folio(struct address_space *mapping,
+ pgoff_t index)
+{
+ return filemap_get_folio(mapping, index);
+}
+
+static inline bool add_to_swap(struct folio *folio)
+{
+ return false;
+}
+
+static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
+{
+ return NULL;
+}
+
+static inline int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
+ gfp_t gfp_mask, void **shadowp)
+{
+ return -1;
+}
+
+static inline void __delete_from_swap_cache(struct folio *folio,
+ swp_entry_t entry, void *shadow)
+{
+}
+
+static inline void delete_from_swap_cache(struct folio *folio)
+{
+}
+
+static inline void clear_shadow_from_swap_cache(int type, unsigned long begin,
+ unsigned long end)
+{
+}
+
+static inline unsigned int folio_swap_flags(struct folio *folio)
+{
+ return 0;
+}
+#endif /* CONFIG_SWAP */
+#endif /* _MM_SWAP_H */
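
As a quick check on the constants above (illustrative, assuming 4 KiB pages): a SWAP_ADDRESS_SPACE_SHIFT of 14 gives 16384 slots per address_space, i.e. 64 MiB of swap apiece, matching the "one swap address space for each 64M swap space" comment.

/* illustrative only, assumes PAGE_SIZE == 4096 */
static_assert((1UL << SWAP_ADDRESS_SPACE_SHIFT) * 4096 == 64UL << 20,
	      "one swap address_space covers 64 MiB of swap");
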
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index 7f34343c075a..db6c4a26cf59 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -167,14 +167,15 @@ unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
int swap_cgroup_swapon(int type, unsigned long max_pages)
{
void *array;
- unsigned long array_size;
unsigned long length;
struct swap_cgroup_ctrl *ctrl;
+ if (mem_cgroup_disabled())
+ return 0;
+
length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
- array_size = length * sizeof(void *);
- array = vzalloc(array_size);
+ array = vcalloc(length, sizeof(void *));
if (!array)
goto nomem;
@@ -206,6 +207,9 @@ void swap_cgroup_swapoff(int type)
unsigned long i, length;
struct swap_cgroup_ctrl *ctrl;
+ if (mem_cgroup_disabled())
+ return;
+
mutex_lock(&swap_cgroup_mutex);
ctrl = &swap_cgroup_ctrl[type];
map = ctrl->map;
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 0357fbe70645..0bec1f705f8e 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -16,7 +16,7 @@
* to local caches without needing to acquire swap_info
* lock. We do not reuse the returned slots directly but
* move them back to the global pool in a batch. This
- * allows the slots to coaellesce and reduce fragmentation.
+ * allows the slots to coalesce and reduce fragmentation.
*
* The swap entry allocated is marked with SWAP_HAS_CACHE
* flag in map_count that prevents it from being allocated
@@ -30,6 +30,7 @@
#include <linux/swap_slots.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
+#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mutex.h>
#include <linux/mm.h>
@@ -43,8 +44,6 @@ static DEFINE_MUTEX(swap_slots_cache_mutex);
static DEFINE_MUTEX(swap_slots_cache_enable_mutex);
static void __drain_swap_slots_cache(unsigned int type);
-static void deactivate_swap_slots_cache(void);
-static void reactivate_swap_slots_cache(void);
#define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled)
#define SLOTS_CACHE 0x1
@@ -72,9 +71,9 @@ void disable_swap_slots_cache_lock(void)
swap_slot_cache_enabled = false;
if (swap_slot_cache_initialized) {
/* serialize with cpu hotplug operations */
- get_online_cpus();
+ cpus_read_lock();
__drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
- put_online_cpus();
+ cpus_read_unlock();
}
}
@@ -118,7 +117,7 @@ static int alloc_swap_slot_cache(unsigned int cpu)
/*
* Do allocation outside swap_slots_cache_mutex
- * as kvzalloc could trigger reclaim and get_swap_page,
+ * as kvzalloc could trigger reclaim and folio_alloc_swap,
* which can lock swap_slots_cache_mutex.
*/
slots = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t),
@@ -193,8 +192,7 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
cache->slots_ret = NULL;
}
spin_unlock_irq(&cache->free_lock);
- if (slots)
- kvfree(slots);
+ kvfree(slots);
}
}
@@ -215,7 +213,7 @@ static void __drain_swap_slots_cache(unsigned int type)
* this function can be invoked in the cpu
* hot plug path:
* cpu_up -> lock cpu_hotplug -> cpu hotplug state callback
- * -> memory allocation -> direct reclaim -> get_swap_page
+ * -> memory allocation -> direct reclaim -> folio_alloc_swap
* -> drain_swap_slots_cache
*
* Hence the loop over current online cpu below could miss cpu that
@@ -260,7 +258,7 @@ out_unlock:
/* called with swap slot cache's alloc lock held */
static int refill_swap_slots_cache(struct swap_slots_cache *cache)
{
- if (!use_swap_slot_cache || cache->nr)
+ if (!use_swap_slot_cache)
return 0;
cache->cur = 0;
@@ -271,7 +269,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
return cache->nr;
}
-int free_swap_slot(swp_entry_t entry)
+void free_swap_slot(swp_entry_t entry)
{
struct swap_slots_cache *cache;
@@ -299,20 +297,18 @@ int free_swap_slot(swp_entry_t entry)
direct_free:
swapcache_free_entries(&entry, 1);
}
-
- return 0;
}
-swp_entry_t get_swap_page(struct page *page)
+swp_entry_t folio_alloc_swap(struct folio *folio)
{
swp_entry_t entry;
struct swap_slots_cache *cache;
entry.val = 0;
- if (PageTransHuge(page)) {
- if (IS_ENABLED(CONFIG_THP_SWAP))
- get_swap_pages(1, &entry, HPAGE_PMD_NR);
+ if (folio_test_large(folio)) {
+ if (IS_ENABLED(CONFIG_THP_SWAP) && arch_thp_swp_supported())
+ get_swap_pages(1, &entry, folio_nr_pages(folio));
goto out;
}
@@ -346,8 +342,8 @@ repeat:
get_swap_pages(1, &entry, 1);
out:
- if (mem_cgroup_try_charge_swap(page, entry)) {
- put_swap_page(page, entry);
+ if (mem_cgroup_try_charge_swap(folio, entry)) {
+ put_swap_folio(folio, entry);
entry.val = 0;
}
return entry;
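
A caller-side sketch of the renamed allocator (not from this patch): failure is still signalled by entry.val == 0, and a slot that cannot be used must be handed back with put_swap_folio().

static bool alloc_swap_slot_sketch(struct folio *folio)
{
	swp_entry_t entry = folio_alloc_swap(folio);

	if (!entry.val)			/* no slot, or the cgroup charge failed */
		return false;

	/*
	 * ... install the entry; if anything goes wrong afterwards,
	 * return the slot with put_swap_folio(folio, entry).
	 */
	return true;
}
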
diff --git a/mm/swap_state.c b/mm/swap_state.c
index aa40e706604c..f8ea7015bad4 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -16,13 +16,13 @@
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
-#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
+#include "swap.h"
/*
* swapper_space is a fiction, retained to simplify the path through
@@ -30,9 +30,9 @@
*/
static const struct address_space_operations swap_aops = {
.writepage = swap_writepage,
- .set_page_dirty = swap_set_page_dirty,
+ .dirty_folio = noop_dirty_folio,
#ifdef CONFIG_MIGRATION
- .migratepage = migrate_page,
+ .migrate_folio = migrate_folio,
#endif
};
@@ -58,50 +58,11 @@ static bool enable_vma_readahead __read_mostly = true;
#define GET_SWAP_RA_VAL(vma) \
(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
-#define INC_CACHE_INFO(x) data_race(swap_cache_info.x++)
-#define ADD_CACHE_INFO(x, nr) data_race(swap_cache_info.x += (nr))
-
-static struct {
- unsigned long add_total;
- unsigned long del_total;
- unsigned long find_success;
- unsigned long find_total;
-} swap_cache_info;
-
-unsigned long total_swapcache_pages(void)
-{
- unsigned int i, j, nr;
- unsigned long ret = 0;
- struct address_space *spaces;
- struct swap_info_struct *si;
-
- for (i = 0; i < MAX_SWAPFILES; i++) {
- swp_entry_t entry = swp_entry(i, 1);
-
- /* Avoid get_swap_device() to warn for bad swap entry */
- if (!swp_swap_info(entry))
- continue;
- /* Prevent swapoff to free swapper_spaces */
- si = get_swap_device(entry);
- if (!si)
- continue;
- nr = nr_swapper_spaces[i];
- spaces = swapper_spaces[i];
- for (j = 0; j < nr; j++)
- ret += spaces[j].nrpages;
- put_swap_device(si);
- }
- return ret;
-}
-
static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
void show_swap_cache_info(void)
{
printk("%lu pages in swap cache\n", total_swapcache_pages());
- printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
- swap_cache_info.add_total, swap_cache_info.del_total,
- swap_cache_info.find_success, swap_cache_info.find_total);
printk("Free swap = %ldkB\n",
get_nr_swap_pages() << (PAGE_SHIFT - 10));
printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
@@ -113,57 +74,53 @@ void *get_shadow_from_swap_cache(swp_entry_t entry)
pgoff_t idx = swp_offset(entry);
struct page *page;
- page = find_get_entry(address_space, idx);
+ page = xa_load(&address_space->i_pages, idx);
if (xa_is_value(page))
return page;
- if (page)
- put_page(page);
return NULL;
}
/*
- * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
+ * add_to_swap_cache resembles filemap_add_folio on swapper_space,
* but sets SwapCache flag and private instead of mapping and index.
*/
-int add_to_swap_cache(struct page *page, swp_entry_t entry,
+int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
gfp_t gfp, void **shadowp)
{
struct address_space *address_space = swap_address_space(entry);
pgoff_t idx = swp_offset(entry);
- XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
- unsigned long i, nr = thp_nr_pages(page);
+ XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio));
+ unsigned long i, nr = folio_nr_pages(folio);
void *old;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(PageSwapCache(page), page);
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+ xas_set_update(&xas, workingset_update_node);
- page_ref_add(page, nr);
- SetPageSwapCache(page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
- do {
- unsigned long nr_shadows = 0;
+ folio_ref_add(folio, nr);
+ folio_set_swapcache(folio);
+ do {
xas_lock_irq(&xas);
xas_create_range(&xas);
if (xas_error(&xas))
goto unlock;
for (i = 0; i < nr; i++) {
- VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
+ VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio);
old = xas_load(&xas);
if (xa_is_value(old)) {
- nr_shadows++;
if (shadowp)
*shadowp = old;
}
- set_page_private(page + i, entry.val + i);
- xas_store(&xas, page);
+ set_page_private(folio_page(folio, i), entry.val + i);
+ xas_store(&xas, folio);
xas_next(&xas);
}
- address_space->nrexceptional -= nr_shadows;
address_space->nrpages += nr;
- __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
- ADD_CACHE_INFO(add_total, nr);
+ __node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
+ __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));
@@ -171,59 +128,63 @@ unlock:
if (!xas_error(&xas))
return 0;
- ClearPageSwapCache(page);
- page_ref_sub(page, nr);
+ folio_clear_swapcache(folio);
+ folio_ref_sub(folio, nr);
return xas_error(&xas);
}
/*
- * This must be called only on pages that have
+ * This must be called only on folios that have
* been verified to be in the swap cache.
*/
-void __delete_from_swap_cache(struct page *page,
+void __delete_from_swap_cache(struct folio *folio,
swp_entry_t entry, void *shadow)
{
struct address_space *address_space = swap_address_space(entry);
- int i, nr = thp_nr_pages(page);
+ int i;
+ long nr = folio_nr_pages(folio);
pgoff_t idx = swp_offset(entry);
XA_STATE(xas, &address_space->i_pages, idx);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- VM_BUG_ON_PAGE(PageWriteback(page), page);
+ xas_set_update(&xas, workingset_update_node);
+
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
for (i = 0; i < nr; i++) {
void *entry = xas_store(&xas, shadow);
- VM_BUG_ON_PAGE(entry != page, entry);
- set_page_private(page + i, 0);
+ VM_BUG_ON_PAGE(entry != folio, entry);
+ set_page_private(folio_page(folio, i), 0);
xas_next(&xas);
}
- ClearPageSwapCache(page);
- if (shadow)
- address_space->nrexceptional += nr;
+ folio_clear_swapcache(folio);
address_space->nrpages -= nr;
- __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
- ADD_CACHE_INFO(del_total, nr);
+ __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
+ __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
}
/**
- * add_to_swap - allocate swap space for a page
- * @page: page we want to move to swap
+ * add_to_swap - allocate swap space for a folio
+ * @folio: folio we want to move to swap
+ *
+ * Allocate swap space for the folio and add the folio to the
+ * swap cache.
*
- * Allocate swap space for the page and add the page to the
- * swap cache. Caller needs to hold the page lock.
+ * Context: Caller needs to hold the folio lock.
+ * Return: Whether the folio was added to the swap cache.
*/
-int add_to_swap(struct page *page)
+bool add_to_swap(struct folio *folio)
{
swp_entry_t entry;
int err;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageUptodate(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
- entry = get_swap_page(page);
+ entry = folio_alloc_swap(folio);
if (!entry.val)
- return 0;
+ return false;
/*
* XArray node allocations from PF_MEMALLOC contexts could
@@ -236,7 +197,7 @@ int add_to_swap(struct page *page)
/*
* Add it to the swap cache.
*/
- err = add_to_swap_cache(page, entry,
+ err = add_to_swap_cache(folio, entry,
__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
if (err)
/*
@@ -245,41 +206,42 @@ int add_to_swap(struct page *page)
*/
goto fail;
/*
- * Normally the page will be dirtied in unmap because its pte should be
- * dirty. A special case is MADV_FREE page. The page'e pte could have
- * dirty bit cleared but the page's SwapBacked bit is still set because
- * clearing the dirty bit and SwapBacked bit has no lock protected. For
- * such page, unmap will not set dirty bit for it, so page reclaim will
- * not write the page out. This can cause data corruption when the page
- * is swap in later. Always setting the dirty bit for the page solves
- * the problem.
+ * Normally the folio will be dirtied in unmap because its
+ * pte should be dirty. A special case is MADV_FREE page. The
+ * page's pte could have dirty bit cleared but the folio's
+ * SwapBacked flag is still set because clearing the dirty bit
+ * and the SwapBacked flag is not protected by a lock. For such a folio,
+ * unmap will not set dirty bit for it, so folio reclaim will
+ * not write the folio out. This can cause data corruption when
+ * the folio is swapped in later. Always setting the dirty flag
+ * for the folio solves the problem.
*/
- set_page_dirty(page);
+ folio_mark_dirty(folio);
- return 1;
+ return true;
fail:
- put_swap_page(page, entry);
- return 0;
+ put_swap_folio(folio, entry);
+ return false;
}
/*
- * This must be called only on pages that have
+ * This must be called only on folios that have
* been verified to be in the swap cache and locked.
- * It will never put the page into the free list,
- * the caller has a reference on the page.
+ * It will never put the folio into the free list:
+ * the caller has a reference on the folio.
*/
-void delete_from_swap_cache(struct page *page)
+void delete_from_swap_cache(struct folio *folio)
{
- swp_entry_t entry = { .val = page_private(page) };
+ swp_entry_t entry = folio_swap_entry(folio);
struct address_space *address_space = swap_address_space(entry);
xa_lock_irq(&address_space->i_pages);
- __delete_from_swap_cache(page, entry, NULL);
+ __delete_from_swap_cache(folio, entry, NULL);
xa_unlock_irq(&address_space->i_pages);
- put_swap_page(page, entry);
- page_ref_sub(page, thp_nr_pages(page));
+ put_swap_folio(folio, entry);
+ folio_ref_sub(folio, folio_nr_pages(folio));
}
void clear_shadow_from_swap_cache(int type, unsigned long begin,
@@ -289,19 +251,18 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
void *old;
for (;;) {
- unsigned long nr_shadows = 0;
swp_entry_t entry = swp_entry(type, curr);
struct address_space *address_space = swap_address_space(entry);
XA_STATE(xas, &address_space->i_pages, curr);
+ xas_set_update(&xas, workingset_update_node);
+
xa_lock_irq(&address_space->i_pages);
xas_for_each(&xas, old, end) {
if (!xa_is_value(old))
continue;
xas_store(&xas, NULL);
- nr_shadows++;
}
- address_space->nrexceptional -= nr_shadows;
xa_unlock_irq(&address_space->i_pages);
/* search the next swapcache until we meet end */
@@ -313,23 +274,26 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
}
}
-/*
- * If we are the only user, then try to free up the swap cache.
- *
- * Its ok to check for PageSwapCache without the page lock
+/*
+ * If we are the only user, then try to free up the swap cache.
+ *
+ * It's ok to check the swapcache flag without the folio lock
* here because we are going to recheck again inside
- * try_to_free_swap() _with_ the lock.
+ * folio_free_swap() _with_ the lock.
* - Marcelo
*/
-static inline void free_swap_cache(struct page *page)
+void free_swap_cache(struct page *page)
{
- if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
- try_to_free_swap(page);
- unlock_page(page);
+ struct folio *folio = page_folio(page);
+
+ if (folio_test_swapcache(folio) && !folio_mapped(folio) &&
+ folio_trylock(folio)) {
+ folio_free_swap(folio);
+ folio_unlock(folio);
}
}
-/*
+/*
* Perform a free_page(), also freeing any swap cache associated with
* this page if it is the last user of the page.
*/
@@ -344,15 +308,12 @@ void free_page_and_swap_cache(struct page *page)
* Passed an array of pages, drop them all from swapcache and then release
* them. They are removed from the LRU and freed if this is their last use.
*/
-void free_pages_and_swap_cache(struct page **pages, int nr)
+void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
{
- struct page **pagep = pages;
- int i;
-
lru_add_drain();
- for (i = 0; i < nr; i++)
- free_swap_cache(pagep[i]);
- release_pages(pagep, nr);
+ for (int i = 0; i < nr; i++)
+ free_swap_cache(encoded_page_ptr(pages[i]));
+ release_pages(pages, nr);
}
static inline bool swap_use_vma_readahead(void)
@@ -361,37 +322,31 @@ static inline bool swap_use_vma_readahead(void)
}
/*
- * Lookup a swap entry in the swap cache. A found page will be returned
+ * Lookup a swap entry in the swap cache. A found folio will be returned
* unlocked and with its refcount incremented - we rely on the kernel
- * lock getting page table operations atomic even if we drop the page
+ * lock getting page table operations atomic even if we drop the folio
* lock before returning.
+ *
+ * Caller must lock the swap device or hold a reference to keep it valid.
*/
-struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
- unsigned long addr)
+struct folio *swap_cache_get_folio(swp_entry_t entry,
+ struct vm_area_struct *vma, unsigned long addr)
{
- struct page *page;
- struct swap_info_struct *si;
+ struct folio *folio;
- si = get_swap_device(entry);
- if (!si)
- return NULL;
- page = find_get_page(swap_address_space(entry), swp_offset(entry));
- put_swap_device(si);
-
- INC_CACHE_INFO(find_total);
- if (page) {
+ folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
+ if (!IS_ERR(folio)) {
bool vma_ra = swap_use_vma_readahead();
bool readahead;
- INC_CACHE_INFO(find_success);
/*
* At the moment, we don't support PG_readahead for anon THP
* so let's bail out rather than confusing the readahead stat.
*/
- if (unlikely(PageTransCompound(page)))
- return page;
+ if (unlikely(folio_test_large(folio)))
+ return folio;
- readahead = TestClearPageReadahead(page);
+ readahead = folio_test_clear_readahead(folio);
if (vma && vma_ra) {
unsigned long ra_val;
int win, hits;
@@ -410,42 +365,49 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
if (!vma || !vma_ra)
atomic_inc(&swapin_readahead_hits);
}
+ } else {
+ folio = NULL;
}
- return page;
+ return folio;
}
/**
- * find_get_incore_page - Find and get a page from the page or swap caches.
+ * filemap_get_incore_folio - Find and get a folio from the page or swap caches.
* @mapping: The address_space to search.
* @index: The page cache index.
*
- * This differs from find_get_page() in that it will also look for the
- * page in the swap cache.
+ * This differs from filemap_get_folio() in that it will also look for the
+ * folio in the swap cache.
*
- * Return: The found page or %NULL.
+ * Return: The found folio or %NULL.
*/
-struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
+struct folio *filemap_get_incore_folio(struct address_space *mapping,
+ pgoff_t index)
{
swp_entry_t swp;
struct swap_info_struct *si;
- struct page *page = find_get_entry(mapping, index);
+ struct folio *folio = filemap_get_entry(mapping, index);
- if (!page)
- return page;
- if (!xa_is_value(page))
- return find_subpage(page, index);
+ if (!folio)
+ return ERR_PTR(-ENOENT);
+ if (!xa_is_value(folio))
+ return folio;
if (!shmem_mapping(mapping))
- return NULL;
+ return ERR_PTR(-ENOENT);
- swp = radix_to_swp_entry(page);
+ swp = radix_to_swp_entry(folio);
+ /* There might be swapin error entries in shmem mapping. */
+ if (non_swap_entry(swp))
+ return ERR_PTR(-ENOENT);
/* Prevent swapoff from happening to us */
si = get_swap_device(swp);
if (!si)
- return NULL;
- page = find_get_page(swap_address_space(swp), swp_offset(swp));
+ return ERR_PTR(-ENOENT);
+ index = swp_offset(swp);
+ folio = filemap_get_folio(swap_address_space(swp), index);
put_swap_device(si);
- return page;
+ return folio;
}
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
@@ -453,26 +415,28 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
bool *new_page_allocated)
{
struct swap_info_struct *si;
+ struct folio *folio;
struct page *page;
void *shadow = NULL;
*new_page_allocated = false;
+ si = get_swap_device(entry);
+ if (!si)
+ return NULL;
for (;;) {
int err;
/*
* First check the swap cache. Since this is normally
- * called after lookup_swap_cache() failed, re-calling
+ * called after swap_cache_get_folio() failed, re-calling
* that would confuse statistics.
*/
- si = get_swap_device(entry);
- if (!si)
- return NULL;
- page = find_get_page(swap_address_space(entry),
- swp_offset(entry));
- put_swap_device(si);
- if (page)
- return page;
+ folio = filemap_get_folio(swap_address_space(entry),
+ swp_offset(entry));
+ if (!IS_ERR(folio)) {
+ page = folio_file_page(folio, swp_offset(entry));
+ goto got_page;
+ }
/*
* Just skip read ahead for unused swap slot.
@@ -482,17 +446,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* as SWAP_HAS_CACHE. That's done in later part of code or
* else swap_off will be aborted if we return NULL.
*/
- if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
- return NULL;
+ if (!swap_swapcount(si, entry) && swap_slot_cache_enabled)
+ goto fail_put_swap;
/*
* Get a new page to read into from swap. Allocate it now,
* before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
* cause any racers to loop around until we add it to cache.
*/
- page = alloc_page_vma(gfp_mask, vma, addr);
- if (!page)
- return NULL;
+ folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false);
+ if (!folio)
+ goto fail_put_swap;
/*
* Swap entry may have been freed since our caller observed it.
@@ -501,9 +465,9 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
if (!err)
break;
- put_page(page);
+ folio_put(folio);
if (err != -EEXIST)
- return NULL;
+ goto fail_put_swap;
/*
* We might race against __delete_from_swap_cache(), and
@@ -512,39 +476,42 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* __read_swap_cache_async(), which has set SWAP_HAS_CACHE
* in swap_map, but not yet added its page to swap cache.
*/
- cond_resched();
+ schedule_timeout_uninterruptible(1);
}
/*
* The swap entry is ours to swap in. Prepare the new page.
*/
- __SetPageLocked(page);
- __SetPageSwapBacked(page);
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
- /* May fail (-ENOMEM) if XArray node allocation failed. */
- if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) {
- put_swap_page(page, entry);
+ if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry))
goto fail_unlock;
- }
- if (mem_cgroup_charge(page, NULL, gfp_mask)) {
- delete_from_swap_cache(page);
+ /* May fail (-ENOMEM) if XArray node allocation failed. */
+ if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
goto fail_unlock;
- }
+
+ mem_cgroup_swapin_uncharge_swap(entry);
if (shadow)
- workingset_refault(page, shadow);
+ workingset_refault(folio, shadow);
- /* Caller will initiate read into locked page */
- SetPageWorkingset(page);
- lru_cache_add(page);
+ /* Caller will initiate read into locked folio */
+ folio_add_lru(folio);
*new_page_allocated = true;
+ page = &folio->page;
+got_page:
+ put_swap_device(si);
return page;
fail_unlock:
- unlock_page(page);
- put_page(page);
+ put_swap_folio(folio, entry);
+ folio_unlock(folio);
+ folio_put(folio);
+fail_put_swap:
+ put_swap_device(si);
return NULL;
}
@@ -553,16 +520,22 @@ fail_unlock:
* and reading the disk if it is not already cached.
* A failure return means that either the page allocation failed or that
* the swap entry is no longer in use.
+ *
+ * get/put_swap_device() aren't needed to call this function, because
+ * __read_swap_cache_async() calls them and swap_readpage() holds the
+ * swap cache folio lock.
*/
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_area_struct *vma, unsigned long addr, bool do_poll)
+ struct vm_area_struct *vma,
+ unsigned long addr, bool do_poll,
+ struct swap_iocb **plug)
{
bool page_was_allocated;
struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
vma, addr, &page_was_allocated);
if (page_was_allocated)
- swap_readpage(retpage, do_poll);
+ swap_readpage(retpage, do_poll, plug);
return retpage;
}
@@ -656,6 +629,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
unsigned long mask;
struct swap_info_struct *si = swp_swap_info(entry);
struct blk_plug plug;
+ struct swap_iocb *splug = NULL;
bool do_poll = true, page_allocated;
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address;
@@ -664,13 +638,6 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
if (!mask)
goto skip;
- /* Test swap type to make sure the dereference is safe */
- if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
- struct inode *inode = si->swap_file->f_mapping->host;
- if (inode_read_congested(inode))
- goto skip;
- }
-
do_poll = false;
/* Read a page_cluster sized and aligned cluster around offset. */
start_offset = offset & ~mask;
@@ -689,7 +656,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
if (!page)
continue;
if (page_allocated) {
- swap_readpage(page, false);
+ swap_readpage(page, false, &splug);
if (offset != entry_offset) {
SetPageReadahead(page);
count_vm_event(SWAP_RA);
@@ -698,10 +665,12 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
put_page(page);
}
blk_finish_plug(&plug);
+ swap_read_unplug(splug);
lru_add_drain(); /* Push any new pages onto the LRU now */
skip:
- return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
+ /* The page was likely read above, so no need for plugging here */
+ return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll, NULL);
}
int init_swap_address_space(unsigned int type, unsigned long nr_pages)
@@ -729,37 +698,32 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages)
void exit_swap_address_space(unsigned int type)
{
- kvfree(swapper_spaces[type]);
+ int i;
+ struct address_space *spaces = swapper_spaces[type];
+
+ for (i = 0; i < nr_swapper_spaces[type]; i++)
+ VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
+ kvfree(spaces);
nr_swapper_spaces[type] = 0;
swapper_spaces[type] = NULL;
}
-static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
- unsigned long faddr,
- unsigned long lpfn,
- unsigned long rpfn,
- unsigned long *start,
- unsigned long *end)
-{
- *start = max3(lpfn, PFN_DOWN(vma->vm_start),
- PFN_DOWN(faddr & PMD_MASK));
- *end = min3(rpfn, PFN_DOWN(vma->vm_end),
- PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
-}
+#define SWAP_RA_ORDER_CEILING 5
+
+struct vma_swap_readahead {
+ unsigned short win;
+ unsigned short offset;
+ unsigned short nr_pte;
+};
static void swap_ra_info(struct vm_fault *vmf,
- struct vma_swap_readahead *ra_info)
+ struct vma_swap_readahead *ra_info)
{
struct vm_area_struct *vma = vmf->vma;
unsigned long ra_val;
- swp_entry_t entry;
- unsigned long faddr, pfn, fpfn;
+ unsigned long faddr, pfn, fpfn, lpfn, rpfn;
unsigned long start, end;
- pte_t *pte, *orig_pte;
- unsigned int max_win, hits, prev_win, win, left;
-#ifndef CONFIG_64BIT
- pte_t *tpte;
-#endif
+ unsigned int max_win, hits, prev_win, win;
max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
SWAP_RA_ORDER_CEILING);
@@ -769,13 +733,6 @@ static void swap_ra_info(struct vm_fault *vmf,
}
faddr = vmf->address;
- orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
- entry = pte_to_swp_entry(*pte);
- if ((unlikely(non_swap_entry(entry)))) {
- pte_unmap(orig_pte);
- return;
- }
-
fpfn = PFN_DOWN(faddr);
ra_val = GET_SWAP_RA_VAL(vma);
pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
@@ -785,34 +742,28 @@ static void swap_ra_info(struct vm_fault *vmf,
max_win, prev_win);
atomic_long_set(&vma->swap_readahead_info,
SWAP_RA_VAL(faddr, win, 0));
-
- if (win == 1) {
- pte_unmap(orig_pte);
+ if (win == 1)
return;
- }
- /* Copy the PTEs because the page table may be unmapped */
- if (fpfn == pfn + 1)
- swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
- else if (pfn == fpfn + 1)
- swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
- &start, &end);
- else {
- left = (win - 1) / 2;
- swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
- &start, &end);
+ if (fpfn == pfn + 1) {
+ lpfn = fpfn;
+ rpfn = fpfn + win;
+ } else if (pfn == fpfn + 1) {
+ lpfn = fpfn - win + 1;
+ rpfn = fpfn + 1;
+ } else {
+ unsigned int left = (win - 1) / 2;
+
+ lpfn = fpfn - left;
+ rpfn = fpfn + win - left;
}
+ start = max3(lpfn, PFN_DOWN(vma->vm_start),
+ PFN_DOWN(faddr & PMD_MASK));
+ end = min3(rpfn, PFN_DOWN(vma->vm_end),
+ PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
+
ra_info->nr_pte = end - start;
ra_info->offset = fpfn - start;
- pte -= ra_info->offset;
-#ifdef CONFIG_64BIT
- ra_info->ptes = pte;
-#else
- tpte = ra_info->ptes;
- for (pfn = start; pfn != end; pfn++)
- *tpte++ = *pte++;
-#endif
- pte_unmap(orig_pte);
}
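
A worked example of the clamping now done inline (illustrative numbers, hypothetical helper name): with fpfn = 1000, the previous fault at pfn 999 and win = 8, the forward-stride case gives lpfn = 1000 and rpfn = 1008 before the window is clamped to the VMA and to the faulting PMD.

static void ra_window_sketch(unsigned long fpfn, unsigned long win,
			     unsigned long vma_lo, unsigned long vma_hi,
			     unsigned long pmd_lo, unsigned long pmd_hi,
			     unsigned long *start, unsigned long *end)
{
	unsigned long lpfn = fpfn;		/* forward-stride case only */
	unsigned long rpfn = fpfn + win;

	*start = max3(lpfn, vma_lo, pmd_lo);
	*end   = min3(rpfn, vma_hi, pmd_hi);
	/* nr_pte = *end - *start;  offset = fpfn - *start */
}
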
/**
@@ -823,7 +774,7 @@ static void swap_ra_info(struct vm_fault *vmf,
*
* Returns the struct page for entry and addr, after queueing swapin.
*
- * Primitive swap readahead code. We simply read in a few pages whoes
+ * Primitive swap readahead code. We simply read in a few pages whose
* virtual addresses are around the fault address in the same vma.
*
* Caller must hold read mmap_lock if vmf->vma is not NULL.
@@ -833,35 +784,45 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
struct vm_fault *vmf)
{
struct blk_plug plug;
+ struct swap_iocb *splug = NULL;
struct vm_area_struct *vma = vmf->vma;
struct page *page;
- pte_t *pte, pentry;
+ pte_t *pte = NULL, pentry;
+ unsigned long addr;
swp_entry_t entry;
unsigned int i;
bool page_allocated;
- struct vma_swap_readahead ra_info = {0,};
+ struct vma_swap_readahead ra_info = {
+ .win = 1,
+ };
swap_ra_info(vmf, &ra_info);
if (ra_info.win == 1)
goto skip;
+ addr = vmf->address - (ra_info.offset * PAGE_SIZE);
+
blk_start_plug(&plug);
- for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
- i++, pte++) {
- pentry = *pte;
- if (pte_none(pentry))
- continue;
- if (pte_present(pentry))
+ for (i = 0; i < ra_info.nr_pte; i++, addr += PAGE_SIZE) {
+ if (!pte++) {
+ pte = pte_offset_map(vmf->pmd, addr);
+ if (!pte)
+ break;
+ }
+ pentry = ptep_get_lockless(pte);
+ if (!is_swap_pte(pentry))
continue;
entry = pte_to_swp_entry(pentry);
if (unlikely(non_swap_entry(entry)))
continue;
+ pte_unmap(pte);
+ pte = NULL;
page = __read_swap_cache_async(entry, gfp_mask, vma,
- vmf->address, &page_allocated);
+ addr, &page_allocated);
if (!page)
continue;
if (page_allocated) {
- swap_readpage(page, false);
+ swap_readpage(page, false, &splug);
if (i != ra_info.offset) {
SetPageReadahead(page);
count_vm_event(SWAP_RA);
@@ -869,11 +830,15 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
}
put_page(page);
}
+ if (pte)
+ pte_unmap(pte);
blk_finish_plug(&plug);
+ swap_read_unplug(splug);
lru_add_drain();
skip:
+ /* The page was likely read above, so no need for plugging here */
return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
- ra_info.win == 1);
+ ra_info.win == 1, NULL);
}
/**
@@ -900,31 +865,29 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%s\n", enable_vma_readahead ? "true" : "false");
+ return sysfs_emit(buf, "%s\n",
+ enable_vma_readahead ? "true" : "false");
}
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
- enable_vma_readahead = true;
- else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
- enable_vma_readahead = false;
- else
- return -EINVAL;
+ ssize_t ret;
+
+ ret = kstrtobool(buf, &enable_vma_readahead);
+ if (ret)
+ return ret;
return count;
}
-static struct kobj_attribute vma_ra_enabled_attr =
- __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show,
- vma_ra_enabled_store);
+static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);
static struct attribute *swap_attrs[] = {
&vma_ra_enabled_attr.attr,
NULL,
};
-static struct attribute_group swap_attr_group = {
+static const struct attribute_group swap_attr_group = {
.attrs = swap_attrs,
};
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c4a613688a17..b15112b1f1a8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -6,6 +6,7 @@
* Swap reorganised 29.12.95, Stephen Tweedie
*/
+#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
@@ -18,7 +19,7 @@
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shmem_fs.h>
-#include <linux/blkdev.h>
+#include <linux/blk-cgroup.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
@@ -39,17 +40,19 @@
#include <linux/export.h>
#include <linux/swap_slots.h>
#include <linux/sort.h>
+#include <linux/completion.h>
+#include <linux/suspend.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>
+#include "swap.h"
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
-static sector_t map_swap_entry(swp_entry_t, struct block_device**);
-DEFINE_SPINLOCK(swap_lock);
+static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
/*
@@ -61,6 +64,10 @@ EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority = -1;
+unsigned long swapfile_maximum_size;
+#ifdef CONFIG_MIGRATION
+bool swap_migration_ad_supported;
+#endif /* CONFIG_MIGRATION */
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
@@ -71,14 +78,14 @@ static const char Unused_offset[] = "Unused swap offset entry ";
* all active swap_info_structs
* protected with swap_lock, and ordered by priority.
*/
-PLIST_HEAD(swap_active_head);
+static PLIST_HEAD(swap_active_head);
/*
* all available (active, not full) swap_info_structs
* protected with swap_avail_lock, ordered by priority.
- * This is used by get_swap_page() instead of swap_active_head
+ * This is used by folio_alloc_swap() instead of swap_active_head
* because swap_active_head includes all swap_info_structs,
- * but get_swap_page() doesn't need to look at full ones.
+ * but folio_alloc_swap() doesn't need to look at full ones.
* This uses its own lock instead of swap_lock because when a
* swap_info_struct changes between not-full/full, it needs to
* add/remove itself to/from this list, but the swap_info_struct->lock
@@ -100,11 +107,10 @@ atomic_t nr_rotate_swap = ATOMIC_INIT(0);
static struct swap_info_struct *swap_type_to_swap_info(int type)
{
- if (type >= READ_ONCE(nr_swapfiles))
+ if (type >= MAX_SWAPFILES)
return NULL;
- smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */
- return READ_ONCE(swap_info[type]);
+ return READ_ONCE(swap_info[type]); /* rcu_dereference() */
}
static inline unsigned char swap_count(unsigned char ent)
@@ -127,27 +133,27 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
unsigned long offset, unsigned long flags)
{
swp_entry_t entry = swp_entry(si->type, offset);
- struct page *page;
+ struct folio *folio;
int ret = 0;
- page = find_get_page(swap_address_space(entry), offset);
- if (!page)
+ folio = filemap_get_folio(swap_address_space(entry), offset);
+ if (IS_ERR(folio))
return 0;
/*
* When this function is called from scan_swap_map_slots() and it's
- * called by vmscan.c at reclaiming pages. So, we hold a lock on a page,
+ * called by vmscan.c at reclaiming folios. So we hold a folio lock
* here. We have to use trylock for avoiding deadlock. This is a special
- * case and you should use try_to_free_swap() with explicit lock_page()
+ * case and you should use folio_free_swap() with explicit folio_lock()
* in usual operations.
*/
- if (trylock_page(page)) {
+ if (folio_trylock(folio)) {
if ((flags & TTRS_ANYWAY) ||
- ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
- ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
- ret = try_to_free_swap(page);
- unlock_page(page);
+ ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
+ ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)))
+ ret = folio_free_swap(folio);
+ folio_unlock(folio);
}
- put_page(page);
+ folio_put(folio);
return ret;
}
@@ -180,7 +186,7 @@ static int discard_swap(struct swap_info_struct *si)
nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
if (nr_blocks) {
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL, 0);
+ nr_blocks, GFP_KERNEL);
if (err)
return err;
cond_resched();
@@ -191,7 +197,7 @@ static int discard_swap(struct swap_info_struct *si)
nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL, 0);
+ nr_blocks, GFP_KERNEL);
if (err)
break;
@@ -220,6 +226,19 @@ offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
BUG();
}
+sector_t swap_page_sector(struct page *page)
+{
+ struct swap_info_struct *sis = page_swap_info(page);
+ struct swap_extent *se;
+ sector_t sector;
+ pgoff_t offset;
+
+ offset = __page_file_index(page);
+ se = offset_to_swap_extent(sis, offset);
+ sector = se->start_block + (offset - se->start_page);
+ return sector << (PAGE_SHIFT - 9);
+}
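/*
 * Editor's worked example (not part of this patch): with 4KiB pages,
 * PAGE_SHIFT - 9 == 3, so each page covers 8 sectors of 512 bytes.  A
 * page five pages into an extent whose start_block is 1000 therefore
 * maps to sector (1000 + 5) << 3 == 8040.
 */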
+
/*
* swap allocation tells the device that a cluster of swap can now be discarded,
* to allow the swap device to optimize its wear-levelling.
@@ -242,7 +261,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
start_block <<= PAGE_SHIFT - 9;
nr_blocks <<= PAGE_SHIFT - 9;
if (blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_NOIO, 0))
+ nr_blocks, GFP_NOIO))
break;
se = next_se(se);
@@ -440,10 +459,10 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
unsigned int idx)
{
/*
- * If scan_swap_map() can't find a free cluster, it will check
+ * If scan_swap_map_slots() can't find a free cluster, it will check
* si->swap_map directly. To make sure the discarding cluster isn't
- * taken by scan_swap_map(), mark the swap entries bad (occupied). It
- * will be cleared after discard
+ * taken by scan_swap_map_slots(), mark the swap entries bad (occupied).
+ * It will be cleared after discard
*/
memset(si->swap_map + idx * SWAPFILE_CLUSTER,
SWAP_MAP_BAD, SWAPFILE_CLUSTER);
@@ -499,6 +518,14 @@ static void swap_discard_work(struct work_struct *work)
spin_unlock(&si->lock);
}
+static void swap_users_ref_free(struct percpu_ref *ref)
+{
+ struct swap_info_struct *si;
+
+ si = container_of(ref, struct swap_info_struct, users);
+ complete(&si->comp);
+}
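/*
 * Illustrative summary (not part of this patch) of the si->users
 * lifecycle this callback completes, pieced together from the rest of
 * this file:
 *
 *	swapon:   percpu_ref_init(&p->users, swap_users_ref_free,
 *				  PERCPU_REF_INIT_DEAD, GFP_KERNEL);
 *	          percpu_ref_resurrect(&p->users);	(enable_swap_info)
 *	readers:  percpu_ref_tryget_live(&si->users) ... percpu_ref_put()
 *	swapoff:  percpu_ref_kill(&p->users);
 *	          wait_for_completion(&p->comp);	(released here, above)
 */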
+
static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
{
struct swap_cluster_info *ci = si->cluster_info;
@@ -568,7 +595,7 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
}
/*
- * It's possible scan_swap_map() uses a free cluster in the middle of free
+ * It's possible scan_swap_map_slots() uses a free cluster in the middle of free
* cluster list. Avoid such abuse to prevent list corruption.
*/
static bool
@@ -653,6 +680,7 @@ static void __del_from_avail_list(struct swap_info_struct *p)
{
int nid;
+ assert_spin_locked(&p->lock);
for_each_node(nid)
plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
}
@@ -673,7 +701,7 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
si->lowest_bit += nr_entries;
if (end == si->highest_bit)
WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
- si->inuse_pages += nr_entries;
+ WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);
if (si->inuse_pages == si->pages) {
si->lowest_bit = si->max;
si->highest_bit = 0;
@@ -710,7 +738,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
add_to_avail_list(si);
}
atomic_long_add(nr_entries, &nr_swap_pages);
- si->inuse_pages -= nr_entries;
+ WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
if (si->flags & SWP_BLKDEV)
swap_slot_free_notify =
si->bdev->bd_disk->fops->swap_slot_free_notify;
@@ -746,14 +774,29 @@ static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
/* No free swap slots available */
if (si->highest_bit <= si->lowest_bit)
return;
- next = si->lowest_bit +
- prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
+ next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit);
next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
next = max_t(unsigned int, next, si->lowest_bit);
}
this_cpu_write(*si->cluster_next_cpu, next);
}
+static bool swap_offset_available_and_locked(struct swap_info_struct *si,
+ unsigned long offset)
+{
+ if (data_race(!si->swap_map[offset])) {
+ spin_lock(&si->lock);
+ return true;
+ }
+
+ if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
+ spin_lock(&si->lock);
+ return true;
+ }
+
+ return false;
+}
+
static int scan_swap_map_slots(struct swap_info_struct *si,
unsigned char usage, int nr,
swp_entry_t slots[])
@@ -931,37 +974,23 @@ done:
scan:
spin_unlock(&si->lock);
while (++offset <= READ_ONCE(si->highest_bit)) {
- if (data_race(!si->swap_map[offset])) {
- spin_lock(&si->lock);
- goto checks;
- }
- if (vm_swap_full() &&
- READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
- spin_lock(&si->lock);
- goto checks;
- }
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
scanned_many = true;
}
+ if (swap_offset_available_and_locked(si, offset))
+ goto checks;
}
offset = si->lowest_bit;
while (offset < scan_base) {
- if (data_race(!si->swap_map[offset])) {
- spin_lock(&si->lock);
- goto checks;
- }
- if (vm_swap_full() &&
- READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
- spin_lock(&si->lock);
- goto checks;
- }
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
scanned_many = true;
}
+ if (swap_offset_available_and_locked(si, offset))
+ goto checks;
offset++;
}
spin_lock(&si->lock);
@@ -975,8 +1004,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
{
unsigned long idx;
struct swap_cluster_info *ci;
- unsigned long offset, i;
- unsigned char *map;
+ unsigned long offset;
/*
* Should not even be attempting cluster allocations when huge
@@ -996,9 +1024,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
alloc_cluster(si, idx);
cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
- map = si->swap_map + offset;
- for (i = 0; i < SWAPFILE_CLUSTER; i++)
- map[i] = SWAP_HAS_CACHE;
+ memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER);
unlock_cluster(ci);
swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
*slot = swp_entry(si->type, offset);
@@ -1019,21 +1045,6 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
swap_range_free(si, offset, SWAPFILE_CLUSTER);
}
-static unsigned long scan_swap_map(struct swap_info_struct *si,
- unsigned char usage)
-{
- swp_entry_t entry;
- int n_ret;
-
- n_ret = scan_swap_map_slots(si, usage, 1, &entry);
-
- if (n_ret)
- return swp_offset(entry);
- else
- return 0;
-
-}
-
int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
{
unsigned long size = swap_entry_size(entry_size);
@@ -1045,16 +1056,18 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
/* Only single cluster request supported */
WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
+ spin_lock(&swap_avail_lock);
+
avail_pgs = atomic_long_read(&nr_swap_pages) / size;
- if (avail_pgs <= 0)
+ if (avail_pgs <= 0) {
+ spin_unlock(&swap_avail_lock);
goto noswap;
+ }
n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
atomic_long_sub(n_goal * size, &nr_swap_pages);
- spin_lock(&swap_avail_lock);
-
start_over:
node = numa_node_id();
plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
@@ -1087,21 +1100,20 @@ start_over:
spin_unlock(&si->lock);
if (n_ret || size == SWAPFILE_CLUSTER)
goto check_out;
- pr_debug("scan_swap_map of si %d failed to find offset\n",
- si->type);
+ cond_resched();
spin_lock(&swap_avail_lock);
nextsi:
/*
* if we got here, it's likely that si was almost full before,
- * and since scan_swap_map() can drop the si->lock, multiple
- * callers probably all tried to get a page from the same si
- * and it filled up before we could get one; or, the si filled
- * up between us dropping swap_avail_lock and taking si->lock.
- * Since we dropped the swap_avail_lock, the swap_avail_head
- * list may have been modified; so if next is still in the
- * swap_avail_head list then try it, otherwise start over
- * if we have not gotten any slots.
+ * and since scan_swap_map_slots() can drop the si->lock,
+ * multiple callers probably all tried to get a page from the
+ * same si and it filled up before we could get one; or, the si
+ * filled up between us dropping swap_avail_lock and taking
+ * si->lock. Since we dropped the swap_avail_lock, the
+ * swap_avail_head list may have been modified; so if next is
+ * still in the swap_avail_head list then try it, otherwise
+ * start over if we have not gotten any slots.
*/
if (plist_node_empty(&next->avail_lists[node]))
goto start_over;
@@ -1117,32 +1129,7 @@ noswap:
return n_ret;
}
-/* The only caller of this function is now suspend routine */
-swp_entry_t get_swap_page_of_type(int type)
-{
- struct swap_info_struct *si = swap_type_to_swap_info(type);
- pgoff_t offset;
-
- if (!si)
- goto fail;
-
- spin_lock(&si->lock);
- if (si->flags & SWP_WRITEOK) {
- atomic_long_dec(&nr_swap_pages);
- /* This is called for allocating swap entry, not cache */
- offset = scan_swap_map(si, 1);
- if (offset) {
- spin_unlock(&si->lock);
- return swp_entry(type, offset);
- }
- atomic_long_inc(&nr_swap_pages);
- }
- spin_unlock(&si->lock);
-fail:
- return (swp_entry_t) {0};
-}
-
-static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
+static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
{
struct swap_info_struct *p;
unsigned long offset;
@@ -1157,47 +1144,25 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
offset = swp_offset(entry);
if (offset >= p->max)
goto bad_offset;
+ if (data_race(!p->swap_map[swp_offset(entry)]))
+ goto bad_free;
return p;
+bad_free:
+ pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val);
+ goto out;
bad_offset:
- pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
+ pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
goto out;
bad_device:
- pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
+ pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val);
goto out;
bad_nofile:
- pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
-out:
- return NULL;
-}
-
-static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
-{
- struct swap_info_struct *p;
-
- p = __swap_info_get(entry);
- if (!p)
- goto out;
- if (data_race(!p->swap_map[swp_offset(entry)]))
- goto bad_free;
- return p;
-
-bad_free:
- pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
+ pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
out:
return NULL;
}
-static struct swap_info_struct *swap_info_get(swp_entry_t entry)
-{
- struct swap_info_struct *p;
-
- p = _swap_info_get(entry);
- if (p)
- spin_lock(&p->lock);
- return p;
-}
-
static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
struct swap_info_struct *q)
{
@@ -1255,23 +1220,23 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
}
/*
+ * When we get a swap entry, if there is nothing else preventing
+ * swapoff (such as the folio in the swap cache being locked, or the
+ * page table lock being held), the swap entry may become invalid
+ * because of swapoff.  Then all swap related functions must be
+ * enclosed by get_swap_device() and put_swap_device(), unless those
+ * functions call get/put_swap_device() by themselves.
+ *
* Check whether swap entry is valid in the swap device. If so,
* return pointer to swap_info_struct, and keep the swap entry valid
* via preventing the swap device from being swapoff, until
* put_swap_device() is called. Otherwise return NULL.
*
- * The entirety of the RCU read critical section must come before the
- * return from or after the call to synchronize_rcu() in
- * enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is
- * true, the si->map, si->cluster_info, etc. must be valid in the
- * critical section.
- *
* Notice that swapoff or swapoff+swapon can still happen before the
- * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
- * in put_swap_device() if there isn't any other way to prevent
- * swapoff, such as page lock, page table lock, etc. The caller must
- * be prepared for that. For example, the following situation is
- * possible.
+ * percpu_ref_tryget_live() in get_swap_device() or after the
+ * percpu_ref_put() in put_swap_device() if there isn't any other way
+ * to prevent swapoff. The caller must be prepared for that. For
+ * example, the following situation is possible.
*
* CPU1 CPU2
* do_swap_page()
@@ -1299,21 +1264,28 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
si = swp_swap_info(entry);
if (!si)
goto bad_nofile;
-
- rcu_read_lock();
- if (data_race(!(si->flags & SWP_VALID)))
- goto unlock_out;
+ if (!percpu_ref_tryget_live(&si->users))
+ goto out;
+ /*
+ * Guarantee the si->users are checked before accessing other
+ * fields of swap_info_struct.
+ *
+ * Paired with the spin_unlock() after setup_swap_info() in
+ * enable_swap_info().
+ */
+ smp_rmb();
offset = swp_offset(entry);
if (offset >= si->max)
- goto unlock_out;
+ goto put_out;
return si;
bad_nofile:
pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
out:
return NULL;
-unlock_out:
- rcu_read_unlock();
+put_out:
+ pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
+ percpu_ref_put(&si->users);
return NULL;
}
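/*
 * Illustrative sketch (not part of this patch): the calling convention
 * described above.  example_entry_in_use() is a made-up helper, used
 * only to show the get/put pairing.
 */
static bool example_entry_in_use(swp_entry_t entry)
{
	struct swap_info_struct *si;
	bool in_use;

	si = get_swap_device(entry);
	if (!si)
		return false;	/* bad entry, or device being swapped off */
	/* si->swap_map etc. are stable until put_swap_device() */
	in_use = swap_count(si->swap_map[swp_offset(entry)]) != 0;
	put_swap_device(si);
	return in_use;
}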
@@ -1366,7 +1338,7 @@ void swap_free(swp_entry_t entry)
/*
* Called after dropping swapcache to decrease refcnt to swap entries.
*/
-void put_swap_page(struct page *page, swp_entry_t entry)
+void put_swap_folio(struct folio *folio, swp_entry_t entry)
{
unsigned long offset = swp_offset(entry);
unsigned long idx = offset / SWAPFILE_CLUSTER;
@@ -1375,7 +1347,7 @@ void put_swap_page(struct page *page, swp_entry_t entry)
unsigned char *map;
unsigned int i, free_entries = 0;
unsigned char val;
- int size = swap_entry_size(thp_nr_pages(page));
+ int size = swap_entry_size(folio_nr_pages(folio));
si = _swap_info_get(entry);
if (!si)
@@ -1465,54 +1437,12 @@ void swapcache_free_entries(swp_entry_t *entries, int n)
spin_unlock(&p->lock);
}
-/*
- * How many references to page are currently swapped out?
- * This does not give an exact answer when swap count is continued,
- * but does include the high COUNT_CONTINUED flag to allow for that.
- */
-int page_swapcount(struct page *page)
-{
- int count = 0;
- struct swap_info_struct *p;
- struct swap_cluster_info *ci;
- swp_entry_t entry;
- unsigned long offset;
-
- entry.val = page_private(page);
- p = _swap_info_get(entry);
- if (p) {
- offset = swp_offset(entry);
- ci = lock_cluster_or_swap_info(p, offset);
- count = swap_count(p->swap_map[offset]);
- unlock_cluster_or_swap_info(p, ci);
- }
- return count;
-}
-
int __swap_count(swp_entry_t entry)
{
- struct swap_info_struct *si;
- pgoff_t offset = swp_offset(entry);
- int count = 0;
-
- si = get_swap_device(entry);
- if (si) {
- count = swap_count(si->swap_map[offset]);
- put_swap_device(si);
- }
- return count;
-}
-
-static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
-{
- int count = 0;
+ struct swap_info_struct *si = swp_swap_info(entry);
pgoff_t offset = swp_offset(entry);
- struct swap_cluster_info *ci;
- ci = lock_cluster_or_swap_info(si, offset);
- count = swap_count(si->swap_map[offset]);
- unlock_cluster_or_swap_info(si, ci);
- return count;
+ return swap_count(si->swap_map[offset]);
}
/*
@@ -1520,16 +1450,15 @@ static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
* This does not give an exact answer when swap count is continued,
* but does include the high COUNT_CONTINUED flag to allow for that.
*/
-int __swp_swapcount(swp_entry_t entry)
+int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
{
- int count = 0;
- struct swap_info_struct *si;
+ pgoff_t offset = swp_offset(entry);
+ struct swap_cluster_info *ci;
+ int count;
- si = get_swap_device(entry);
- if (si) {
- count = swap_swapcount(si, entry);
- put_swap_device(si);
- }
+ ci = lock_cluster_or_swap_info(si, offset);
+ count = swap_count(si->swap_map[offset]);
+ unlock_cluster_or_swap_info(si, ci);
return count;
}
@@ -1606,167 +1535,61 @@ unlock_out:
return ret;
}
-static bool page_swapped(struct page *page)
-{
- swp_entry_t entry;
- struct swap_info_struct *si;
-
- if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
- return page_swapcount(page) != 0;
-
- page = compound_head(page);
- entry.val = page_private(page);
- si = _swap_info_get(entry);
- if (si)
- return swap_page_trans_huge_swapped(si, entry);
- return false;
-}
-
-static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
- int *total_swapcount)
+static bool folio_swapped(struct folio *folio)
{
- int i, map_swapcount, _total_mapcount, _total_swapcount;
- unsigned long offset = 0;
- struct swap_info_struct *si;
- struct swap_cluster_info *ci = NULL;
- unsigned char *map = NULL;
- int mapcount, swapcount = 0;
-
- /* hugetlbfs shouldn't call it */
- VM_BUG_ON_PAGE(PageHuge(page), page);
+ swp_entry_t entry = folio_swap_entry(folio);
+ struct swap_info_struct *si = _swap_info_get(entry);
- if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
- mapcount = page_trans_huge_mapcount(page, total_mapcount);
- if (PageSwapCache(page))
- swapcount = page_swapcount(page);
- if (total_swapcount)
- *total_swapcount = swapcount;
- return mapcount + swapcount;
- }
-
- page = compound_head(page);
+ if (!si)
+ return false;
- _total_mapcount = _total_swapcount = map_swapcount = 0;
- if (PageSwapCache(page)) {
- swp_entry_t entry;
+ if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio)))
+ return swap_swapcount(si, entry) != 0;
- entry.val = page_private(page);
- si = _swap_info_get(entry);
- if (si) {
- map = si->swap_map;
- offset = swp_offset(entry);
- }
- }
- if (map)
- ci = lock_cluster(si, offset);
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- mapcount = atomic_read(&page[i]._mapcount) + 1;
- _total_mapcount += mapcount;
- if (map) {
- swapcount = swap_count(map[offset + i]);
- _total_swapcount += swapcount;
- }
- map_swapcount = max(map_swapcount, mapcount + swapcount);
- }
- unlock_cluster(ci);
- if (PageDoubleMap(page)) {
- map_swapcount -= 1;
- _total_mapcount -= HPAGE_PMD_NR;
- }
- mapcount = compound_mapcount(page);
- map_swapcount += mapcount;
- _total_mapcount += mapcount;
- if (total_mapcount)
- *total_mapcount = _total_mapcount;
- if (total_swapcount)
- *total_swapcount = _total_swapcount;
-
- return map_swapcount;
+ return swap_page_trans_huge_swapped(si, entry);
}
-/*
- * We can write to an anon page without COW if there are no other references
- * to it. And as a side-effect, free up its swap: because the old content
- * on disk will never be read, and seeking back there to write new content
- * later would only waste time away from clustering.
+/**
+ * folio_free_swap() - Free the swap space used for this folio.
+ * @folio: The folio whose swap space should be freed.
*
- * NOTE: total_map_swapcount should not be relied upon by the caller if
- * reuse_swap_page() returns false, but it may be always overwritten
- * (see the other implementation for CONFIG_SWAP=n).
+ * If swap is getting full, or if there are no more mappings of this folio,
+ * then call folio_free_swap to free its swap space.
+ *
+ * Return: true if we were able to release the swap space.
*/
-bool reuse_swap_page(struct page *page, int *total_map_swapcount)
+bool folio_free_swap(struct folio *folio)
{
- int count, total_mapcount, total_swapcount;
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (unlikely(PageKsm(page)))
+ if (!folio_test_swapcache(folio))
+ return false;
+ if (folio_test_writeback(folio))
+ return false;
+ if (folio_swapped(folio))
return false;
- count = page_trans_huge_map_swapcount(page, &total_mapcount,
- &total_swapcount);
- if (total_map_swapcount)
- *total_map_swapcount = total_mapcount + total_swapcount;
- if (count == 1 && PageSwapCache(page) &&
- (likely(!PageTransCompound(page)) ||
- /* The remaining swap count will be freed soon */
- total_swapcount == page_swapcount(page))) {
- if (!PageWriteback(page)) {
- page = compound_head(page);
- delete_from_swap_cache(page);
- SetPageDirty(page);
- } else {
- swp_entry_t entry;
- struct swap_info_struct *p;
-
- entry.val = page_private(page);
- p = swap_info_get(entry);
- if (p->flags & SWP_STABLE_WRITES) {
- spin_unlock(&p->lock);
- return false;
- }
- spin_unlock(&p->lock);
- }
- }
-
- return count <= 1;
-}
-
-/*
- * If swap is getting full, or if there are no more mappings of this page,
- * then try_to_free_swap is called to free its swap space.
- */
-int try_to_free_swap(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageLocked(page), page);
-
- if (!PageSwapCache(page))
- return 0;
- if (PageWriteback(page))
- return 0;
- if (page_swapped(page))
- return 0;
/*
* Once hibernation has begun to create its image of memory,
- * there's a danger that one of the calls to try_to_free_swap()
+ * there's a danger that one of the calls to folio_free_swap()
* - most probably a call from __try_to_reclaim_swap() while
* hibernation is allocating its own swap pages for the image,
* but conceivably even a call from memory reclaim - will free
- * the swap from a page which has already been recorded in the
- * image as a clean swapcache page, and then reuse its swap for
+ * the swap from a folio which has already been recorded in the
+ * image as a clean swapcache folio, and then reuse its swap for
* another page of the image. On waking from hibernation, the
- * original page might be freed under memory pressure, then
+ * original folio might be freed under memory pressure, then
* later read back in from swap, now with the wrong data.
*
* Hibernation suspends storage while it is writing the image
* to disk so check that here.
*/
if (pm_suspended_storage())
- return 0;
+ return false;
- page = compound_head(page);
- delete_from_swap_cache(page);
- SetPageDirty(page);
- return 1;
+ delete_from_swap_cache(folio);
+ folio_set_dirty(folio);
+ return true;
}
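/*
 * Illustrative sketch (not part of this patch): a typical call site,
 * with the folio already locked, freeing swap early when space is tight:
 *
 *	if (folio_test_swapcache(folio) &&
 *	    (vm_swap_full() || mem_cgroup_swap_full(folio)))
 *		folio_free_swap(folio);
 */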
/*
@@ -1793,6 +1616,24 @@ int free_swap_and_cache(swp_entry_t entry)
}
#ifdef CONFIG_HIBERNATION
+
+swp_entry_t get_swap_page_of_type(int type)
+{
+ struct swap_info_struct *si = swap_type_to_swap_info(type);
+ swp_entry_t entry = {0};
+
+ if (!si)
+ goto fail;
+
+ /* This is called for allocating swap entry, not cache */
+ spin_lock(&si->lock);
+ if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry))
+ atomic_long_dec(&nr_swap_pages);
+ spin_unlock(&si->lock);
+fail:
+ return entry;
+}
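/*
 * Illustrative sketch (not part of this patch), loosely following the
 * hibernation caller in kernel/power/swap.c:
 *
 *	swp_entry_t entry = get_swap_page_of_type(swap);
 *
 *	if (!entry.val)
 *		return 0;		(that swap device is full)
 *	offset = swp_offset(entry);
 */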
+
/*
* Find the swap type that corresponds to given device (if any).
*
@@ -1852,12 +1693,13 @@ int find_first_swap(dev_t *device)
*/
sector_t swapdev_block(int type, pgoff_t offset)
{
- struct block_device *bdev;
struct swap_info_struct *si = swap_type_to_swap_info(type);
+ struct swap_extent *se;
if (!si || !(si->flags & SWP_WRITEOK))
return 0;
- return map_swap_entry(swp_entry(type, offset), &bdev);
+ se = offset_to_swap_extent(si, offset);
+ return se->start_block + (offset - se->start_page);
}
/*
@@ -1889,7 +1731,7 @@ unsigned int count_swap_pages(int type, int free)
static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
{
- return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
+ return pte_same(pte_swp_clear_flags(pte), swp_pte);
}
/*
@@ -1898,38 +1740,81 @@ static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
* force COW, vm_page_prot omits write permission from any private vma.
*/
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, swp_entry_t entry, struct page *page)
+ unsigned long addr, swp_entry_t entry, struct folio *folio)
{
+ struct page *page = folio_file_page(folio, swp_offset(entry));
struct page *swapcache;
spinlock_t *ptl;
- pte_t *pte;
+ pte_t *pte, new_pte, old_pte;
+ bool hwpoisoned = PageHWPoison(page);
int ret = 1;
swapcache = page;
page = ksm_might_need_to_copy(page, vma, addr);
if (unlikely(!page))
return -ENOMEM;
+ else if (unlikely(PTR_ERR(page) == -EHWPOISON))
+ hwpoisoned = true;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
+ if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte),
+ swp_entry_to_pte(entry)))) {
ret = 0;
goto out;
}
+ old_pte = ptep_get(pte);
+
+ if (unlikely(hwpoisoned || !PageUptodate(page))) {
+ swp_entry_t swp_entry;
+
+ dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
+ if (hwpoisoned) {
+ swp_entry = make_hwpoison_entry(swapcache);
+ page = swapcache;
+ } else {
+ swp_entry = make_swapin_error_entry();
+ }
+ new_pte = swp_entry_to_pte(swp_entry);
+ ret = 0;
+ goto setpte;
+ }
+
+ /* See do_swap_page() */
+ BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
+ BUG_ON(PageAnon(page) && PageAnonExclusive(page));
+
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
get_page(page);
- set_pte_at(vma->vm_mm, addr, pte,
- pte_mkold(mk_pte(page, vma->vm_page_prot)));
if (page == swapcache) {
- page_add_anon_rmap(page, vma, addr, false);
+ rmap_t rmap_flags = RMAP_NONE;
+
+ /*
+ * See do_swap_page(): PageWriteback() would be problematic.
+ * However, we do a wait_on_page_writeback() just before this
+ * call and have the page locked.
+ */
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
+ if (pte_swp_exclusive(old_pte))
+ rmap_flags |= RMAP_EXCLUSIVE;
+
+ page_add_anon_rmap(page, vma, addr, rmap_flags);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, addr, false);
+ page_add_new_anon_rmap(page, vma, addr);
lru_cache_add_inactive_or_unevictable(page, vma);
}
+ new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
+ if (pte_swp_soft_dirty(old_pte))
+ new_pte = pte_mksoft_dirty(new_pte);
+ if (pte_swp_uffd_wp(old_pte))
+ new_pte = pte_mkuffd_wp(new_pte);
+setpte:
+ set_pte_at(vma->vm_mm, addr, pte, new_pte);
swap_free(entry);
out:
- pte_unmap_unlock(pte, ptl);
+ if (pte)
+ pte_unmap_unlock(pte, ptl);
if (page != swapcache) {
unlock_page(page);
put_page(page);
@@ -1939,80 +1824,83 @@ out:
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
- unsigned int type, bool frontswap,
- unsigned long *fs_pages_to_unuse)
+ unsigned int type)
{
- struct page *page;
- swp_entry_t entry;
- pte_t *pte;
+ pte_t *pte = NULL;
struct swap_info_struct *si;
- unsigned long offset;
- int ret = 0;
- volatile unsigned char *swap_map;
si = swap_info[type];
- pte = pte_offset_map(pmd, addr);
do {
- struct vm_fault vmf;
+ struct folio *folio;
+ unsigned long offset;
+ unsigned char swp_count;
+ swp_entry_t entry;
+ int ret;
+ pte_t ptent;
- if (!is_swap_pte(*pte))
+ if (!pte++) {
+ pte = pte_offset_map(pmd, addr);
+ if (!pte)
+ break;
+ }
+
+ ptent = ptep_get_lockless(pte);
+
+ if (!is_swap_pte(ptent))
continue;
- entry = pte_to_swp_entry(*pte);
+ entry = pte_to_swp_entry(ptent);
if (swp_type(entry) != type)
continue;
offset = swp_offset(entry);
- if (frontswap && !frontswap_test(si, offset))
- continue;
-
pte_unmap(pte);
- swap_map = &si->swap_map[offset];
- page = lookup_swap_cache(entry, vma, addr);
- if (!page) {
- vmf.vma = vma;
- vmf.address = addr;
- vmf.pmd = pmd;
+ pte = NULL;
+
+ folio = swap_cache_get_folio(entry, vma, addr);
+ if (!folio) {
+ struct page *page;
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = addr,
+ .real_address = addr,
+ .pmd = pmd,
+ };
+
page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
&vmf);
+ if (page)
+ folio = page_folio(page);
}
- if (!page) {
- if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
- goto try_next;
+ if (!folio) {
+ swp_count = READ_ONCE(si->swap_map[offset]);
+ if (swp_count == 0 || swp_count == SWAP_MAP_BAD)
+ continue;
return -ENOMEM;
}
- lock_page(page);
- wait_on_page_writeback(page);
- ret = unuse_pte(vma, pmd, addr, entry, page);
+ folio_lock(folio);
+ folio_wait_writeback(folio);
+ ret = unuse_pte(vma, pmd, addr, entry, folio);
if (ret < 0) {
- unlock_page(page);
- put_page(page);
- goto out;
+ folio_unlock(folio);
+ folio_put(folio);
+ return ret;
}
- try_to_free_swap(page);
- unlock_page(page);
- put_page(page);
-
- if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
- ret = FRONTSWAP_PAGES_UNUSED;
- goto out;
- }
-try_next:
- pte = pte_offset_map(pmd, addr);
- } while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
+ folio_free_swap(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ } while (addr += PAGE_SIZE, addr != end);
- ret = 0;
-out:
- return ret;
+ if (pte)
+ pte_unmap(pte);
+ return 0;
}
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
- unsigned int type, bool frontswap,
- unsigned long *fs_pages_to_unuse)
+ unsigned int type)
{
pmd_t *pmd;
unsigned long next;
@@ -2022,10 +1910,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
do {
cond_resched();
next = pmd_addr_end(addr, end);
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
- continue;
- ret = unuse_pte_range(vma, pmd, addr, next, type,
- frontswap, fs_pages_to_unuse);
+ ret = unuse_pte_range(vma, pmd, addr, next, type);
if (ret)
return ret;
} while (pmd++, addr = next, addr != end);
@@ -2034,8 +1919,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
unsigned long addr, unsigned long end,
- unsigned int type, bool frontswap,
- unsigned long *fs_pages_to_unuse)
+ unsigned int type)
{
pud_t *pud;
unsigned long next;
@@ -2046,8 +1930,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- ret = unuse_pmd_range(vma, pud, addr, next, type,
- frontswap, fs_pages_to_unuse);
+ ret = unuse_pmd_range(vma, pud, addr, next, type);
if (ret)
return ret;
} while (pud++, addr = next, addr != end);
@@ -2056,8 +1939,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
- unsigned int type, bool frontswap,
- unsigned long *fs_pages_to_unuse)
+ unsigned int type)
{
p4d_t *p4d;
unsigned long next;
@@ -2068,16 +1950,14 @@ static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(p4d))
continue;
- ret = unuse_pud_range(vma, p4d, addr, next, type,
- frontswap, fs_pages_to_unuse);
+ ret = unuse_pud_range(vma, p4d, addr, next, type);
if (ret)
return ret;
} while (p4d++, addr = next, addr != end);
return 0;
}
-static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
- bool frontswap, unsigned long *fs_pages_to_unuse)
+static int unuse_vma(struct vm_area_struct *vma, unsigned int type)
{
pgd_t *pgd;
unsigned long addr, end, next;
@@ -2091,28 +1971,27 @@ static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- ret = unuse_p4d_range(vma, pgd, addr, next, type,
- frontswap, fs_pages_to_unuse);
+ ret = unuse_p4d_range(vma, pgd, addr, next, type);
if (ret)
return ret;
} while (pgd++, addr = next, addr != end);
return 0;
}
-static int unuse_mm(struct mm_struct *mm, unsigned int type,
- bool frontswap, unsigned long *fs_pages_to_unuse)
+static int unuse_mm(struct mm_struct *mm, unsigned int type)
{
struct vm_area_struct *vma;
int ret = 0;
+ VMA_ITERATOR(vmi, mm, 0);
mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (vma->anon_vma) {
- ret = unuse_vma(vma, type, frontswap,
- fs_pages_to_unuse);
+ ret = unuse_vma(vma, type);
if (ret)
break;
}
+
cond_resched();
}
mmap_read_unlock(mm);
@@ -2120,12 +1999,12 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type,
}
/*
- * Scan swap_map (or frontswap_map if frontswap parameter is true)
- * from current position to next entry still in use. Return 0
- * if there are no inuse entries after prev till end of the map.
+ * Scan swap_map from current position to next entry still in use.
+ * Return 0 if there are no inuse entries after prev till end of
+ * the map.
*/
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
- unsigned int prev, bool frontswap)
+ unsigned int prev)
{
unsigned int i;
unsigned char count;
@@ -2139,8 +2018,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
for (i = prev + 1; i < si->max; i++) {
count = READ_ONCE(si->swap_map[i]);
if (count && swap_count(count) != SWAP_MAP_BAD)
- if (!frontswap || frontswap_test(si, i))
- break;
+ break;
if ((i % LATENCY_LIMIT) == 0)
cond_resched();
}
@@ -2151,32 +2029,24 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
return i;
}
-/*
- * If the boolean frontswap is true, only unuse pages_to_unuse pages;
- * pages_to_unuse==0 means all pages; ignored if frontswap is false
- */
-int try_to_unuse(unsigned int type, bool frontswap,
- unsigned long pages_to_unuse)
+static int try_to_unuse(unsigned int type)
{
struct mm_struct *prev_mm;
struct mm_struct *mm;
struct list_head *p;
int retval = 0;
struct swap_info_struct *si = swap_info[type];
- struct page *page;
+ struct folio *folio;
swp_entry_t entry;
unsigned int i;
if (!READ_ONCE(si->inuse_pages))
return 0;
- if (!frontswap)
- pages_to_unuse = 0;
-
retry:
- retval = shmem_unuse(type, frontswap, &pages_to_unuse);
+ retval = shmem_unuse(type);
if (retval)
- goto out;
+ return retval;
prev_mm = &init_mm;
mmget(prev_mm);
@@ -2193,11 +2063,10 @@ retry:
spin_unlock(&mmlist_lock);
mmput(prev_mm);
prev_mm = mm;
- retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
-
+ retval = unuse_mm(mm, type);
if (retval) {
mmput(prev_mm);
- goto out;
+ return retval;
}
/*
@@ -2214,32 +2083,24 @@ retry:
i = 0;
while (READ_ONCE(si->inuse_pages) &&
!signal_pending(current) &&
- (i = find_next_to_unuse(si, i, frontswap)) != 0) {
+ (i = find_next_to_unuse(si, i)) != 0) {
entry = swp_entry(type, i);
- page = find_get_page(swap_address_space(entry), i);
- if (!page)
+ folio = filemap_get_folio(swap_address_space(entry), i);
+ if (IS_ERR(folio))
continue;
/*
- * It is conceivable that a racing task removed this page from
- * swap cache just before we acquired the page lock. The page
+ * It is conceivable that a racing task removed this folio from
+ * swap cache just before we acquired the page lock. The folio
* might even be back in swap cache on another swap area. But
- * that is okay, try_to_free_swap() only removes stale pages.
+ * that is okay, folio_free_swap() only removes stale folios.
*/
- lock_page(page);
- wait_on_page_writeback(page);
- try_to_free_swap(page);
- unlock_page(page);
- put_page(page);
-
- /*
- * For frontswap, we just need to unuse pages_to_unuse, if
- * it was specified. Need not check frontswap again here as
- * we already zeroed out pages_to_unuse if not frontswap.
- */
- if (pages_to_unuse && --pages_to_unuse == 0)
- goto out;
+ folio_lock(folio);
+ folio_wait_writeback(folio);
+ folio_free_swap(folio);
+ folio_unlock(folio);
+ folio_put(folio);
}
/*
@@ -2248,19 +2109,20 @@ retry:
* Under global memory pressure, swap entries can be reinserted back
* into process space after the mmlist loop above passes over them.
*
- * Limit the number of retries? No: when mmget_not_zero() above fails,
- * that mm is likely to be freeing swap from exit_mmap(), which proceeds
- * at its own independent pace; and even shmem_writepage() could have
- * been preempted after get_swap_page(), temporarily hiding that swap.
- * It's easy and robust (though cpu-intensive) just to keep retrying.
+ * Limit the number of retries? No: when mmget_not_zero()
+ * above fails, that mm is likely to be freeing swap from
+ * exit_mmap(), which proceeds at its own independent pace;
+ * and even shmem_writepage() could have been preempted after
+ * folio_alloc_swap(), temporarily hiding that swap. It's easy
+ * and robust (though cpu-intensive) just to keep retrying.
*/
if (READ_ONCE(si->inuse_pages)) {
if (!signal_pending(current))
goto retry;
- retval = -EINTR;
+ return -EINTR;
}
-out:
- return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
+
+ return 0;
}
/*
@@ -2284,36 +2146,6 @@ static void drain_mmlist(void)
}
/*
- * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
- * corresponds to page offset for the specified swap entry.
- * Note that the type of this function is sector_t, but it returns page offset
- * into the bdev, not sector offset.
- */
-static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
-{
- struct swap_info_struct *sis;
- struct swap_extent *se;
- pgoff_t offset;
-
- sis = swp_swap_info(entry);
- *bdev = sis->bdev;
-
- offset = swp_offset(entry);
- se = offset_to_swap_extent(sis, offset);
- return se->start_block + (offset - se->start_page);
-}
-
-/*
- * Returns the page offset into bdev for the specified page's swap entry.
- */
-sector_t map_swap_page(struct page *page, struct block_device **bdev)
-{
- swp_entry_t entry;
- entry.val = page_private(page);
- return map_swap_entry(entry, bdev);
-}
-
-/*
* Free all of a swapdev's extent information
*/
static void destroy_swap_extents(struct swap_info_struct *sis)
@@ -2385,8 +2217,8 @@ EXPORT_SYMBOL_GPL(add_swap_extent);
/*
* A `swap extent' is a simple thing which maps a contiguous range of pages
- * onto a contiguous range of disk blocks. An ordered list of swap extents
- * is built at swapon time and is then used at swap_writepage/swap_readpage
+ * onto a contiguous range of disk blocks. An rbtree of swap extents is
+ * built at swapon time and is then used at swap_writepage/swap_readpage
* time for locating where on disk a page belongs.
*
* If the swapfile is an S_ISBLK block device, a single extent is installed.
@@ -2394,12 +2226,12 @@ EXPORT_SYMBOL_GPL(add_swap_extent);
* swap files identically.
*
* Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
- * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
+ * extent rbtree operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
* swapfiles are handled *identically* after swapon time.
*
* For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
- * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If
- * some stray blocks are found which do not fall within the PAGE_SIZE alignment
+ * and will parse them into an rbtree, in PAGE_SIZE chunks. If some stray
+ * blocks are found which do not fall within the PAGE_SIZE alignment
* requirements, they are simply tossed out - we will never use those blocks
* for swapping.
*
@@ -2408,10 +2240,7 @@ EXPORT_SYMBOL_GPL(add_swap_extent);
*
* The amount of disk space which a single swap extent represents varies.
* Typically it is in the 1-4 megabyte range. So we can have hundreds of
- * extents in the list. To avoid much list walking, we cache the previous
- * search location in `curr_swap_extent', and start new searches from there.
- * This is extremely effective. The average number of iterations in
- * map_swap_page() has been measured at about 0.3 per page. - akpm.
+ * extents in the rbtree. - akpm.
*/
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
@@ -2428,12 +2257,13 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
if (mapping->a_ops->swap_activate) {
ret = mapping->a_ops->swap_activate(sis, swap_file, span);
- if (ret >= 0)
- sis->flags |= SWP_ACTIVATED;
- if (!ret) {
- sis->flags |= SWP_FS_OPS;
- ret = add_swap_extent(sis, 0, sis->max, 0);
- *span = sis->pages;
+ if (ret < 0)
+ return ret;
+ sis->flags |= SWP_ACTIVATED;
+ if ((sis->flags & SWP_FS_OPS) &&
+ sio_pool_init() != 0) {
+ destroy_swap_extents(sis);
+ return -ENOMEM;
}
return ret;
}
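/*
 * Illustrative note (not part of this patch): each extent built by
 * setup_swap_extents()/add_swap_extent() describes a linear mapping
 * (field names as in include/linux/swap.h):
 *
 *	swapfile pages  [se->start_page,  se->start_page  + se->nr_pages)
 *	disk blocks     [se->start_block, se->start_block + se->nr_pages)
 *
 * so a lookup inside an extent is just
 *
 *	block = se->start_block + (offset - se->start_page);
 *
 * as done by swap_page_sector() and swapdev_block() earlier in this file.
 */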
@@ -2484,7 +2314,7 @@ static void setup_swap_info(struct swap_info_struct *p, int prio,
static void _enable_swap_info(struct swap_info_struct *p)
{
- p->flags |= SWP_WRITEOK | SWP_VALID;
+ p->flags |= SWP_WRITEOK;
atomic_long_add(p->pages, &nr_swap_pages);
total_swap_pages += p->pages;
@@ -2495,7 +2325,7 @@ static void _enable_swap_info(struct swap_info_struct *p)
* which on removal of any swap_info_struct with an auto-assigned
* (i.e. negative) priority increments the auto-assigned priority
* of any lower-priority swap_info_structs.
- * swap_avail_head needs to be priority ordered for get_swap_page(),
+ * swap_avail_head needs to be priority ordered for folio_alloc_swap(),
* which allocates swap pages from the highest available priority
* swap_info_struct.
*/
@@ -2508,17 +2338,17 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
struct swap_cluster_info *cluster_info,
unsigned long *frontswap_map)
{
- frontswap_init(p->type, frontswap_map);
+ if (IS_ENABLED(CONFIG_FRONTSWAP))
+ frontswap_init(p->type, frontswap_map);
spin_lock(&swap_lock);
spin_lock(&p->lock);
setup_swap_info(p, prio, swap_map, cluster_info);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
/*
- * Guarantee swap_map, cluster_info, etc. fields are valid
- * between get/put_swap_device() if SWP_VALID bit is set
+ * Finished initializing swap device, now it's safe to reference it.
*/
- synchronize_rcu();
+ percpu_ref_resurrect(&p->users);
spin_lock(&swap_lock);
spin_lock(&p->lock);
_enable_swap_info(p);
@@ -2596,8 +2426,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
goto out_dput;
}
- del_from_avail_list(p);
spin_lock(&p->lock);
+ del_from_avail_list(p);
if (p->prio < 0) {
struct swap_info_struct *si = p;
int nid;
@@ -2622,7 +2452,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
disable_swap_slots_cache_lock();
set_current_oom_origin();
- err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
+ err = try_to_unuse(p->type);
clear_current_oom_origin();
if (err) {
@@ -2634,16 +2464,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
reenable_swap_slots_cache_unlock();
- spin_lock(&swap_lock);
- spin_lock(&p->lock);
- p->flags &= ~SWP_VALID; /* mark swap device as invalid */
- spin_unlock(&p->lock);
- spin_unlock(&swap_lock);
/*
- * wait for swap operations protected by get/put_swap_device()
- * to complete
+ * Wait for swap operations protected by get/put_swap_device()
+ * to complete.
+ *
+ * We need synchronize_rcu() here to protect access to the
+ * swap cache data structure.
*/
+ percpu_ref_kill(&p->users);
synchronize_rcu();
+ wait_for_completion(&p->comp);
flush_work(&p->discard_work);
@@ -2651,7 +2481,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (p->flags & SWP_CONTINUED)
free_swap_count_continuations(p);
- if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
+ if (!p->bdev || !bdev_nonrot(p->bdev))
atomic_dec(&nr_rotate_swap);
mutex_lock(&swapon_mutex);
@@ -2659,7 +2489,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_lock(&p->lock);
drain_mmlist();
- /* wait for anyone still in scan_swap_map */
+ /* wait for anyone still in scan_swap_map_slots */
p->highest_bit = 0; /* cuts scans short */
while (p->flags >= SWP_SCANNING) {
spin_unlock(&p->lock);
@@ -2700,7 +2530,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct block_device *bdev = I_BDEV(inode);
set_blocksize(bdev, old_block_size);
- blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ blkdev_put(bdev, p);
}
inode_lock(inode);
@@ -2795,19 +2625,19 @@ static int swap_show(struct seq_file *swap, void *v)
struct swap_info_struct *si = v;
struct file *file;
int len;
- unsigned int bytes, inuse;
+ unsigned long bytes, inuse;
if (si == SEQ_START_TOKEN) {
- seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
+ seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
return 0;
}
bytes = si->pages << (PAGE_SHIFT - 10);
- inuse = si->inuse_pages << (PAGE_SHIFT - 10);
+ inuse = READ_ONCE(si->inuse_pages) << (PAGE_SHIFT - 10);
file = si->swap_file;
len = seq_file_path(swap, file, " \t\n\\");
- seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
+ seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
len < 40 ? 40 - len : 1, " ",
S_ISBLK(file_inode(file)->i_mode) ?
"partition" : "file\t",
@@ -2867,6 +2697,7 @@ late_initcall(max_swapfiles_check);
static struct swap_info_struct *alloc_swap_info(void)
{
struct swap_info_struct *p;
+ struct swap_info_struct *defer = NULL;
unsigned int type;
int i;
@@ -2874,6 +2705,12 @@ static struct swap_info_struct *alloc_swap_info(void)
if (!p)
return ERR_PTR(-ENOMEM);
+ if (percpu_ref_init(&p->users, swap_users_ref_free,
+ PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
+ kvfree(p);
+ return ERR_PTR(-ENOMEM);
+ }
+
spin_lock(&swap_lock);
for (type = 0; type < nr_swapfiles; type++) {
if (!(swap_info[type]->flags & SWP_USED))
@@ -2881,21 +2718,20 @@ static struct swap_info_struct *alloc_swap_info(void)
}
if (type >= MAX_SWAPFILES) {
spin_unlock(&swap_lock);
+ percpu_ref_exit(&p->users);
kvfree(p);
return ERR_PTR(-EPERM);
}
if (type >= nr_swapfiles) {
p->type = type;
- WRITE_ONCE(swap_info[type], p);
/*
- * Write swap_info[type] before nr_swapfiles, in case a
- * racing procfs swap_start() or swap_next() is reading them.
- * (We never shrink nr_swapfiles, we never free this entry.)
+ * Publish the swap_info_struct after initializing it.
+ * Note that kvzalloc() above zeroes all its fields.
*/
- smp_wmb();
- WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
+ smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */
+ nr_swapfiles++;
} else {
- kvfree(p);
+ defer = p;
p = swap_info[type];
/*
* Do not memset this entry: a racing procfs swap_next()
@@ -2908,8 +2744,13 @@ static struct swap_info_struct *alloc_swap_info(void)
plist_node_init(&p->avail_lists[i], 0);
p->flags = SWP_USED;
spin_unlock(&swap_lock);
+ if (defer) {
+ percpu_ref_exit(&defer->users);
+ kvfree(defer);
+ }
spin_lock_init(&p->lock);
spin_lock_init(&p->cont_lock);
+ init_completion(&p->comp);
return p;
}
@@ -2920,7 +2761,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
if (S_ISBLK(inode->i_mode)) {
p->bdev = blkdev_get_by_dev(inode->i_rdev,
- FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
+ BLK_OPEN_READ | BLK_OPEN_WRITE, p, NULL);
if (IS_ERR(p->bdev)) {
error = PTR_ERR(p->bdev);
p->bdev = NULL;
@@ -2935,7 +2776,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
* write only restriction. Hence zoned block devices are not
* suitable for swapping. Disallow them here.
*/
- if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
+ if (bdev_is_zoned(p->bdev))
return -EINVAL;
p->flags |= SWP_BLKDEV;
} else if (S_ISREG(inode->i_mode)) {
@@ -2969,7 +2810,7 @@ unsigned long generic_max_swapfile_size(void)
}
/* Can be overridden by an architecture for additional checks. */
-__weak unsigned long max_swapfile_size(void)
+__weak unsigned long arch_max_swapfile_size(void)
{
return generic_max_swapfile_size();
}
@@ -2988,7 +2829,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
return 0;
}
- /* swap partition endianess hack... */
+ /* swap partition endianness hack... */
if (swab32(swap_header->info.version) == 1) {
swab32s(&swap_header->info.version);
swab32s(&swap_header->info.last_page);
@@ -3009,7 +2850,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
p->cluster_next = 1;
p->cluster_nr = 0;
- maxpages = max_swapfile_size();
+ maxpages = swapfile_maximum_size;
last_page = swap_header->info.last_page;
if (!last_page) {
pr_warn("Empty swap-file\n");
@@ -3131,26 +2972,13 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
return nr_extents;
}
-/*
- * Helper to sys_swapon determining if a given swap
- * backing device queue supports DISCARD operations.
- */
-static bool swap_discardable(struct swap_info_struct *si)
-{
- struct request_queue *q = bdev_get_queue(si->bdev);
-
- if (!q || !blk_queue_discard(q))
- return false;
-
- return true;
-}
-
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
struct swap_info_struct *p;
struct filename *name;
struct file *swap_file = NULL;
struct address_space *mapping;
+ struct dentry *dentry;
int prio;
int error;
union swap_header *swap_header;
@@ -3194,6 +3022,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
p->swap_file = swap_file;
mapping = swap_file->f_mapping;
+ dentry = swap_file->f_path.dentry;
inode = mapping->host;
error = claim_swapfile(p, inode);
@@ -3201,6 +3030,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap;
inode_lock(inode);
+ if (d_unlinked(dentry) || cant_mount(dentry)) {
+ error = -ENOENT;
+ goto bad_swap_unlock_inode;
+ }
if (IS_SWAPFILE(inode)) {
error = -EBUSY;
goto bad_swap_unlock_inode;
@@ -3209,7 +3042,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
/*
* Read the swap header.
*/
- if (!mapping->a_ops->readpage) {
+ if (!mapping->a_ops->read_folio) {
error = -EINVAL;
goto bad_swap_unlock_inode;
}
@@ -3233,13 +3066,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap_unlock_inode;
}
- if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue))
+ if (p->bdev && bdev_stable_writes(p->bdev))
p->flags |= SWP_STABLE_WRITES;
- if (p->bdev && p->bdev->bd_disk->fops->rw_page)
+ if (p->bdev && bdev_synchronous(p->bdev))
p->flags |= SWP_SYNCHRONOUS_IO;
- if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+ if (p->bdev && bdev_nonrot(p->bdev)) {
int cpu;
unsigned long ci, nr_cluster;
@@ -3255,7 +3088,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
*/
for_each_possible_cpu(cpu) {
per_cpu(*p->cluster_next_cpu, cpu) =
- 1 + prandom_u32_max(p->highest_bit);
+ get_random_u32_inclusive(1, p->highest_bit);
}
nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
@@ -3300,7 +3133,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
sizeof(long),
GFP_KERNEL);
- if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
+ if ((swap_flags & SWAP_FLAG_DISCARD) &&
+ p->bdev && bdev_max_discard_sectors(p->bdev)) {
/*
* When discard is enabled for swap with no particular
* policy flagged, we set all swap discard flags here in
@@ -3378,7 +3212,7 @@ bad_swap:
p->cluster_next_cpu = NULL;
if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
set_blocksize(p->bdev, p->old_block_size);
- blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ blkdev_put(p->bdev, p);
}
inode = NULL;
destroy_swap_extents(p);
@@ -3418,7 +3252,7 @@ void si_swapinfo(struct sysinfo *val)
struct swap_info_struct *si = swap_info[type];
if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
- nr_to_be_unused += si->inuse_pages;
+ nr_to_be_unused += READ_ONCE(si->inuse_pages);
}
val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
val->totalswap = total_swap_pages + nr_to_be_unused;
@@ -3443,11 +3277,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unsigned long offset;
unsigned char count;
unsigned char has_cache;
- int err = -EINVAL;
+ int err;
- p = get_swap_device(entry);
- if (!p)
- goto out;
+ p = swp_swap_info(entry);
offset = swp_offset(entry);
ci = lock_cluster_or_swap_info(p, offset);
@@ -3494,9 +3326,6 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unlock_out:
unlock_cluster_or_swap_info(p, ci);
-out:
- if (p)
- put_swap_device(p);
return err;
}
@@ -3550,13 +3379,13 @@ struct swap_info_struct *page_swap_info(struct page *page)
}
/*
- * out-of-line __page_file_ methods to avoid include hell.
+ * out-of-line methods to avoid include hell.
*/
-struct address_space *__page_file_mapping(struct page *page)
+struct address_space *swapcache_mapping(struct folio *folio)
{
- return page_swap_info(page)->swap_file->f_mapping;
+ return page_swap_info(&folio->page)->swap_file->f_mapping;
}
-EXPORT_SYMBOL_GPL(__page_file_mapping);
+EXPORT_SYMBOL_GPL(swapcache_mapping);
pgoff_t __page_file_index(struct page *page)
{
@@ -3611,7 +3440,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
ci = lock_cluster(si, offset);
- count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
+ count = swap_count(si->swap_map[offset]);
if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
/*
@@ -3627,11 +3456,6 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
goto out;
}
- /*
- * We are fortunate that although vmalloc_to_page uses pte_offset_map,
- * no architecture is using highmem pages for kernel page tables: so it
- * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
- */
head = vmalloc_to_page(si->swap_map + offset);
offset &= ~PAGE_MASK;
@@ -3795,12 +3619,12 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
+void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
struct swap_info_struct *si, *next;
- int nid = page_to_nid(page);
+ int nid = folio_nid(folio);
- if (!(gfp_mask & __GFP_IO))
+ if (!(gfp & __GFP_IO))
return;
if (!blk_cgroup_congested())
@@ -3810,14 +3634,14 @@ void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
* We've already scheduled a throttle, avoid taking the global swap
* lock.
*/
- if (current->throttle_queue)
+ if (current->throttle_disk)
return;
spin_lock(&swap_avail_lock);
plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
avail_lists[nid]) {
if (si->bdev) {
- blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
+ blkcg_schedule_throttle(si->bdev->bd_disk, true);
break;
}
}
@@ -3839,6 +3663,13 @@ static int __init swapfile_init(void)
for_each_node(nid)
plist_head_init(&swap_avail_heads[nid]);
+ swapfile_maximum_size = arch_max_swapfile_size();
+
+#ifdef CONFIG_MIGRATION
+ if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
+ swap_migration_ad_supported = true;
+#endif /* CONFIG_MIGRATION */
+
return 0;
}
subsys_initcall(swapfile_init);
diff --git a/mm/truncate.c b/mm/truncate.c
index 6bbe0f0b3ce9..95d1291d269b 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -19,10 +19,8 @@
#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
-#include <linux/buffer_head.h> /* grr. try_to_release_page,
- do_invalidatepage */
+#include <linux/buffer_head.h> /* grr. try_to_release_page */
#include <linux/shmem_fs.h>
-#include <linux/cleancache.h>
#include <linux/rmap.h>
#include "internal.h"
@@ -40,68 +38,72 @@ static inline void __clear_shadow_entry(struct address_space *mapping,
if (xas_load(&xas) != entry)
return;
xas_store(&xas, NULL);
- mapping->nrexceptional--;
}
static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
void *entry)
{
+ spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
__clear_shadow_entry(mapping, index, entry);
xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
}
/*
* Unconditionally remove exceptional entries. Usually called from truncate
- * path. Note that the pagevec may be altered by this function by removing
- * exceptional entries similar to what pagevec_remove_exceptionals does.
+ * path. Note that the folio_batch may be altered by this function by removing
+ * exceptional entries similar to what folio_batch_remove_exceptionals() does.
*/
-static void truncate_exceptional_pvec_entries(struct address_space *mapping,
- struct pagevec *pvec, pgoff_t *indices,
- pgoff_t end)
+static void truncate_folio_batch_exceptionals(struct address_space *mapping,
+ struct folio_batch *fbatch, pgoff_t *indices)
{
int i, j;
- bool dax, lock;
+ bool dax;
/* Handled by shmem itself */
if (shmem_mapping(mapping))
return;
- for (j = 0; j < pagevec_count(pvec); j++)
- if (xa_is_value(pvec->pages[j]))
+ for (j = 0; j < folio_batch_count(fbatch); j++)
+ if (xa_is_value(fbatch->folios[j]))
break;
- if (j == pagevec_count(pvec))
+ if (j == folio_batch_count(fbatch))
return;
dax = dax_mapping(mapping);
- lock = !dax && indices[j] < end;
- if (lock)
+ if (!dax) {
+ spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
+ }
- for (i = j; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
+ for (i = j; i < folio_batch_count(fbatch); i++) {
+ struct folio *folio = fbatch->folios[i];
pgoff_t index = indices[i];
- if (!xa_is_value(page)) {
- pvec->pages[j++] = page;
+ if (!xa_is_value(folio)) {
+ fbatch->folios[j++] = folio;
continue;
}
- if (index >= end)
- continue;
-
if (unlikely(dax)) {
dax_delete_mapping_entry(mapping, index);
continue;
}
- __clear_shadow_entry(mapping, index, page);
+ __clear_shadow_entry(mapping, index, folio);
}
- if (lock)
+ if (!dax) {
xa_unlock_irq(&mapping->i_pages);
- pvec->nr = j;
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
+ }
+ fbatch->nr = j;
}
/*
@@ -135,98 +137,115 @@ static int invalidate_exceptional_entry2(struct address_space *mapping,
}
/**
- * do_invalidatepage - invalidate part or all of a page
- * @page: the page which is affected
+ * folio_invalidate - Invalidate part or all of a folio.
+ * @folio: The folio which is affected.
* @offset: start of the range to invalidate
* @length: length of the range to invalidate
*
- * do_invalidatepage() is called when all or part of the page has become
+ * folio_invalidate() is called when all or part of the folio has become
* invalidated by a truncate operation.
*
- * do_invalidatepage() does not have to release all buffers, but it must
+ * folio_invalidate() does not have to release all buffers, but it must
* ensure that no dirty buffer is left outside @offset and that no I/O
* is underway against any of the blocks which are outside the truncation
* point. Because the caller is about to free (and possibly reuse) those
* blocks on-disk.
*/
-void do_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+void folio_invalidate(struct folio *folio, size_t offset, size_t length)
{
- void (*invalidatepage)(struct page *, unsigned int, unsigned int);
-
- invalidatepage = page->mapping->a_ops->invalidatepage;
-#ifdef CONFIG_BLOCK
- if (!invalidatepage)
- invalidatepage = block_invalidatepage;
-#endif
- if (invalidatepage)
- (*invalidatepage)(page, offset, length);
+ const struct address_space_operations *aops = folio->mapping->a_ops;
+
+ if (aops->invalidate_folio)
+ aops->invalidate_folio(folio, offset, length);
}
+EXPORT_SYMBOL_GPL(folio_invalidate);
/*
* If truncate cannot remove the fs-private metadata from the page, the page
* becomes orphaned. It will be left on the LRU and may even be mapped into
* user pagetables if we're racing with filemap_fault().
*
- * We need to bale out if page->mapping is no longer equal to the original
+ * We need to bail out if page->mapping is no longer equal to the original
* mapping. This happens a) when the VM reclaimed the page while we waited on
* its lock, b) when a concurrent invalidate_mapping_pages got there first and
* c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
*/
-static void
-truncate_cleanup_page(struct address_space *mapping, struct page *page)
+static void truncate_cleanup_folio(struct folio *folio)
{
- if (page_mapped(page)) {
- pgoff_t nr = PageTransHuge(page) ? HPAGE_PMD_NR : 1;
- unmap_mapping_pages(mapping, page->index, nr, false);
- }
+ if (folio_mapped(folio))
+ unmap_mapping_folio(folio);
- if (page_has_private(page))
- do_invalidatepage(page, 0, PAGE_SIZE);
+ if (folio_has_private(folio))
+ folio_invalidate(folio, 0, folio_size(folio));
/*
* Some filesystems seem to re-dirty the page even after
* the VM has canceled the dirty bit (eg ext3 journaling).
* Hence dirty accounting check is placed after invalidation.
*/
- cancel_dirty_page(page);
- ClearPageMappedToDisk(page);
+ folio_cancel_dirty(folio);
+ folio_clear_mappedtodisk(folio);
}
-/*
- * This is for invalidate_mapping_pages(). That function can be called at
- * any time, and is not supposed to throw away dirty pages. But pages can
- * be marked dirty at any time too, so use remove_mapping which safely
- * discards clean, unused pages.
- *
- * Returns non-zero if the page was successfully invalidated.
- */
-static int
-invalidate_complete_page(struct address_space *mapping, struct page *page)
+int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
{
- int ret;
-
- if (page->mapping != mapping)
- return 0;
-
- if (page_has_private(page) && !try_to_release_page(page, 0))
- return 0;
-
- ret = remove_mapping(mapping, page);
+ if (folio->mapping != mapping)
+ return -EIO;
- return ret;
+ truncate_cleanup_folio(folio);
+ filemap_remove_folio(folio);
+ return 0;
}
-int truncate_inode_page(struct address_space *mapping, struct page *page)
+/*
+ * Handle partial folios. The folio may be entirely within the
+ * range if a split has raced with us. If not, we zero the part of the
+ * folio that's within the [start, end] range, and then split the folio if
+ * it's large. split_folio() will discard pages which now lie beyond
+ * i_size, and we rely on the caller to discard pages which lie within a
+ * newly created hole.
+ *
+ * Returns false if splitting failed so the caller can avoid
+ * discarding the entire folio which is stubbornly unsplit.
+ */
+bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
{
- VM_BUG_ON_PAGE(PageTail(page), page);
+ loff_t pos = folio_pos(folio);
+ unsigned int offset, length;
- if (page->mapping != mapping)
- return -EIO;
+ if (pos < start)
+ offset = start - pos;
+ else
+ offset = 0;
+ length = folio_size(folio);
+ if (pos + length <= (u64)end)
+ length = length - offset;
+ else
+ length = end + 1 - pos - offset;
- truncate_cleanup_page(mapping, page);
- delete_from_page_cache(page);
- return 0;
+ folio_wait_writeback(folio);
+ if (length == folio_size(folio)) {
+ truncate_inode_folio(folio->mapping, folio);
+ return true;
+ }
+
+ /*
+ * We may be zeroing pages we're about to discard, but it avoids
+ * doing a complex calculation here, and then doing the zeroing
+ * anyway if the page split fails.
+ */
+ folio_zero_range(folio, offset, length);
+
+ if (folio_has_private(folio))
+ folio_invalidate(folio, offset, length);
+ if (!folio_test_large(folio))
+ return true;
+ if (split_folio(folio) == 0)
+ return true;
+ if (folio_test_dirty(folio))
+ return false;
+ truncate_inode_folio(folio->mapping, folio);
+ return true;
}
/*
@@ -234,6 +253,8 @@ int truncate_inode_page(struct address_space *mapping, struct page *page)
*/
int generic_error_remove_page(struct address_space *mapping, struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
if (!mapping)
return -EINVAL;
/*
@@ -242,26 +263,44 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page)
*/
if (!S_ISREG(mapping->host->i_mode))
return -EIO;
- return truncate_inode_page(mapping, page);
+ return truncate_inode_folio(mapping, page_folio(page));
}
EXPORT_SYMBOL(generic_error_remove_page);
-/*
+static long mapping_evict_folio(struct address_space *mapping,
+ struct folio *folio)
+{
+ if (folio_test_dirty(folio) || folio_test_writeback(folio))
+ return 0;
+ /* The refcount will be elevated if any page in the folio is mapped */
+ if (folio_ref_count(folio) >
+ folio_nr_pages(folio) + folio_has_private(folio) + 1)
+ return 0;
+ if (folio_has_private(folio) && !filemap_release_folio(folio, 0))
+ return 0;
+
+ return remove_mapping(mapping, folio);
+}
+
+/**
+ * invalidate_inode_page() - Remove an unused page from the pagecache.
+ * @page: The page to remove.
+ *
* Safely invalidate one page from its pagecache mapping.
- * It only drops clean, unused pages. The page must be locked.
+ * It only drops clean, unused pages.
*
- * Returns 1 if the page is successfully invalidated, otherwise 0.
+ * Context: Page must be locked.
+ * Return: The number of pages successfully removed.
*/
-int invalidate_inode_page(struct page *page)
+long invalidate_inode_page(struct page *page)
{
- struct address_space *mapping = page_mapping(page);
+ struct folio *folio = page_folio(page);
+ struct address_space *mapping = folio_mapping(folio);
+
+ /* The page may have been truncated before it was locked */
if (!mapping)
return 0;
- if (PageDirty(page) || PageWriteback(page))
- return 0;
- if (page_mapped(page))
- return 0;
- return invalidate_complete_page(mapping, page);
+ return mapping_evict_folio(mapping, folio);
}
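mapping_evict_folio() above keys off an expected refcount: one reference per page held by the page cache, one for private data if present, and one for the caller. A small illustrative sketch of that accounting (the helper and numbers are invented; only the formula mirrors the check above):

#include <stdbool.h>
#include <stdio.h>

/* ref_count/nr_pages/has_private stand in for folio_ref_count(),
 * folio_nr_pages() and folio_has_private(). */
static bool evictable(int ref_count, int nr_pages, bool has_private)
{
	/* page cache refs + optional private-data ref + the caller's ref */
	int expected = nr_pages + (has_private ? 1 : 0) + 1;

	return ref_count <= expected;
}

int main(void)
{
	/* 4-page folio, no buffers: only the cache and the caller hold it. */
	printf("%d\n", evictable(5, 4, false));	/* 1: can be evicted */

	/* Same folio with one extra reference, e.g. a mapping: skip it. */
	printf("%d\n", evictable(6, 4, false));	/* 0 */
	return 0;
}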
/**
@@ -284,7 +323,7 @@ int invalidate_inode_page(struct page *page)
* mapping is large, it is probably the case that the final pages are the most
* recently touched, and freeing happens in ascending file offset order.
*
- * Note that since ->invalidatepage() accepts range to invalidate
+ * Note that since ->invalidate_folio() accepts range to invalidate
* truncate_inode_pages_range is able to handle cases where lend + 1 is not
* page aligned properly.
*/
@@ -293,19 +332,15 @@ void truncate_inode_pages_range(struct address_space *mapping,
{
pgoff_t start; /* inclusive */
pgoff_t end; /* exclusive */
- unsigned int partial_start; /* inclusive */
- unsigned int partial_end; /* exclusive */
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t indices[PAGEVEC_SIZE];
pgoff_t index;
int i;
+ struct folio *folio;
+ bool same_folio;
- if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
- goto out;
-
- /* Offsets within partial pages */
- partial_start = lstart & (PAGE_SIZE - 1);
- partial_end = (lend + 1) & (PAGE_SIZE - 1);
+ if (mapping_empty(mapping))
+ return;
/*
* 'start' and 'end' always covers the range of pages to be fully
@@ -324,97 +359,50 @@ void truncate_inode_pages_range(struct address_space *mapping,
else
end = (lend + 1) >> PAGE_SHIFT;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
index = start;
- while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
- indices)) {
- /*
- * Pagevec array has exceptional entries and we may also fail
- * to lock some pages. So we store pages that can be deleted
- * in a new pagevec.
- */
- struct pagevec locked_pvec;
-
- pagevec_init(&locked_pvec);
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
-
- /* We rely upon deletion not changing page->index */
- index = indices[i];
- if (index >= end)
- break;
-
- if (xa_is_value(page))
- continue;
-
- if (!trylock_page(page))
- continue;
- WARN_ON(page_to_index(page) != index);
- if (PageWriteback(page)) {
- unlock_page(page);
- continue;
- }
- if (page->mapping != mapping) {
- unlock_page(page);
- continue;
- }
- pagevec_add(&locked_pvec, page);
- }
- for (i = 0; i < pagevec_count(&locked_pvec); i++)
- truncate_cleanup_page(mapping, locked_pvec.pages[i]);
- delete_from_page_cache_batch(mapping, &locked_pvec);
- for (i = 0; i < pagevec_count(&locked_pvec); i++)
- unlock_page(locked_pvec.pages[i]);
- truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
- pagevec_release(&pvec);
+ while (index < end && find_lock_entries(mapping, &index, end - 1,
+ &fbatch, indices)) {
+ truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
+ for (i = 0; i < folio_batch_count(&fbatch); i++)
+ truncate_cleanup_folio(fbatch.folios[i]);
+ delete_from_page_cache_batch(mapping, &fbatch);
+ for (i = 0; i < folio_batch_count(&fbatch); i++)
+ folio_unlock(fbatch.folios[i]);
+ folio_batch_release(&fbatch);
cond_resched();
- index++;
}
- if (partial_start) {
- struct page *page = find_lock_page(mapping, start - 1);
- if (page) {
- unsigned int top = PAGE_SIZE;
- if (start > end) {
- /* Truncation within a single page */
- top = partial_end;
- partial_end = 0;
- }
- wait_on_page_writeback(page);
- zero_user_segment(page, partial_start, top);
- cleancache_invalidate_page(mapping, page);
- if (page_has_private(page))
- do_invalidatepage(page, partial_start,
- top - partial_start);
- unlock_page(page);
- put_page(page);
+
+ same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
+ folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0);
+ if (!IS_ERR(folio)) {
+ same_folio = lend < folio_pos(folio) + folio_size(folio);
+ if (!truncate_inode_partial_folio(folio, lstart, lend)) {
+ start = folio->index + folio_nr_pages(folio);
+ if (same_folio)
+ end = folio->index;
}
+ folio_unlock(folio);
+ folio_put(folio);
+ folio = NULL;
}
- if (partial_end) {
- struct page *page = find_lock_page(mapping, end);
- if (page) {
- wait_on_page_writeback(page);
- zero_user_segment(page, 0, partial_end);
- cleancache_invalidate_page(mapping, page);
- if (page_has_private(page))
- do_invalidatepage(page, 0,
- partial_end);
- unlock_page(page);
- put_page(page);
+
+ if (!same_folio) {
+ folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT,
+ FGP_LOCK, 0);
+ if (!IS_ERR(folio)) {
+ if (!truncate_inode_partial_folio(folio, lstart, lend))
+ end = folio->index;
+ folio_unlock(folio);
+ folio_put(folio);
}
}
- /*
- * If the truncation happened within a single page no pages
- * will be released, just zeroed, so we can bail out now.
- */
- if (start >= end)
- goto out;
index = start;
- for ( ; ; ) {
+ while (index < end) {
cond_resched();
- if (!pagevec_lookup_entries(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) {
+ if (!find_get_entries(mapping, &index, end - 1, &fbatch,
+ indices)) {
/* If all gone from start onwards, we're done */
if (index == start)
break;
@@ -422,40 +410,24 @@ void truncate_inode_pages_range(struct address_space *mapping,
index = start;
continue;
}
- if (index == start && indices[0] >= end) {
- /* All gone out of hole to be punched, we're done */
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
- break;
- }
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing page->index */
- index = indices[i];
- if (index >= end) {
- /* Restart punch to make sure all gone */
- index = start - 1;
- break;
- }
- if (xa_is_value(page))
+ if (xa_is_value(folio))
continue;
- lock_page(page);
- WARN_ON(page_to_index(page) != index);
- wait_on_page_writeback(page);
- truncate_inode_page(mapping, page);
- unlock_page(page);
+ folio_lock(folio);
+ VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
+ folio_wait_writeback(folio);
+ truncate_inode_folio(mapping, folio);
+ folio_unlock(folio);
}
- truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
- pagevec_release(&pvec);
- index++;
+ truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
+ folio_batch_release(&fbatch);
}
-
-out:
- cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(truncate_inode_pages_range);
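As a side note on the index arithmetic feeding this function: 'start' is conventionally rounded up and 'end' (from lend + 1) rounded down, so only pages wholly inside [lstart, lend] fall into the batched loop, while the partial head and tail folios are handled by truncate_inode_partial_folio(). A hedged standalone sketch of that rounding (the page size and the start formula are assumptions, since only the end computation appears in the hunk above):

#include <stdio.h>

#define EX_PAGE_SHIFT	12			/* assume 4KiB pages */
#define EX_PAGE_SIZE	(1UL << EX_PAGE_SHIFT)

static void full_page_range(unsigned long lstart, unsigned long lend,
			    unsigned long *start, unsigned long *end)
{
	*start = (lstart + EX_PAGE_SIZE - 1) >> EX_PAGE_SHIFT;	/* round up */
	*end = (lend + 1) >> EX_PAGE_SHIFT;			/* round down */
}

int main(void)
{
	unsigned long start, end;

	/* Bytes [1000, 10239]: only page 1 (bytes 4096..8191) is fully covered. */
	full_page_range(1000, 10239, &start, &end);
	printf("start=%lu end=%lu\n", start, end);	/* start=1 end=2 (exclusive) */
	return 0;
}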
@@ -464,10 +436,11 @@ EXPORT_SYMBOL(truncate_inode_pages_range);
* @mapping: mapping to truncate
* @lstart: offset from which to truncate
*
- * Called under (and serialised by) inode->i_mutex.
+ * Called under (and serialised by) inode->i_rwsem and
+ * mapping->invalidate_lock.
*
* Note: When this function returns, there can be a page in the process of
- * deletion (inside __delete_from_page_cache()) in the specified range. Thus
+ * deletion (inside __filemap_remove_folio()) in the specified range. Thus
* mapping->nrpages can be non-zero when this function returns even after
* truncation of the whole mapping.
*/
@@ -481,16 +454,13 @@ EXPORT_SYMBOL(truncate_inode_pages);
* truncate_inode_pages_final - truncate *all* pages before inode dies
* @mapping: mapping to truncate
*
- * Called under (and serialized by) inode->i_mutex.
+ * Called under (and serialized by) inode->i_rwsem.
*
* Filesystems have to use this in the .evict_inode path to inform the
* VM that this is the final truncate and the inode is going away.
*/
void truncate_inode_pages_final(struct address_space *mapping)
{
- unsigned long nrexceptional;
- unsigned long nrpages;
-
/*
* Page reclaim can not participate in regular inode lifetime
* management (can't call iput()) and thus can race with the
@@ -500,16 +470,7 @@ void truncate_inode_pages_final(struct address_space *mapping)
*/
mapping_set_exiting(mapping);
- /*
- * When reclaim installs eviction entries, it increases
- * nrexceptional first, then decreases nrpages. Make sure we see
- * this in the right order or we might miss an entry.
- */
- nrpages = mapping->nrpages;
- smp_rmb();
- nrexceptional = mapping->nrexceptional;
-
- if (nrpages || nrexceptional) {
+ if (!mapping_empty(mapping)) {
/*
* As truncation uses a lockless tree lookup, cycle
* the tree lock to make sure any ongoing tree
@@ -520,176 +481,129 @@ void truncate_inode_pages_final(struct address_space *mapping)
xa_unlock_irq(&mapping->i_pages);
}
- /*
- * Cleancache needs notification even if there are no pages or shadow
- * entries.
- */
truncate_inode_pages(mapping, 0);
}
EXPORT_SYMBOL(truncate_inode_pages_final);
-unsigned long __invalidate_mapping_pages(struct address_space *mapping,
- pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
+/**
+ * mapping_try_invalidate - Invalidate all the evictable folios of one inode
+ * @mapping: the address_space which holds the folios to invalidate
+ * @start: the offset 'from' which to invalidate
+ * @end: the offset 'to' which to invalidate (inclusive)
+ * @nr_failed: How many folio invalidations failed
+ *
+ * This function is similar to invalidate_mapping_pages(), except that it
+ * returns the number of folios which could not be evicted in @nr_failed.
+ */
+unsigned long mapping_try_invalidate(struct address_space *mapping,
+ pgoff_t start, pgoff_t end, unsigned long *nr_failed)
{
pgoff_t indices[PAGEVEC_SIZE];
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t index = start;
unsigned long ret;
unsigned long count = 0;
int i;
- pagevec_init(&pvec);
- while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
- indices)) {
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
-
- /* We rely upon deletion not changing page->index */
- index = indices[i];
- if (index > end)
- break;
-
- if (xa_is_value(page)) {
- invalidate_exceptional_entry(mapping, index,
- page);
- continue;
- }
-
- if (!trylock_page(page))
- continue;
+ folio_batch_init(&fbatch);
+ while (find_lock_entries(mapping, &index, end, &fbatch, indices)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
- WARN_ON(page_to_index(page) != index);
+ /* We rely upon deletion not changing folio->index */
- /* Middle of THP: skip */
- if (PageTransTail(page)) {
- unlock_page(page);
+ if (xa_is_value(folio)) {
+ count += invalidate_exceptional_entry(mapping,
+ indices[i], folio);
continue;
- } else if (PageTransHuge(page)) {
- index += HPAGE_PMD_NR - 1;
- i += HPAGE_PMD_NR - 1;
- /*
- * 'end' is in the middle of THP. Don't
- * invalidate the page as the part outside of
- * 'end' could be still useful.
- */
- if (index > end) {
- unlock_page(page);
- continue;
- }
-
- /* Take a pin outside pagevec */
- get_page(page);
-
- /*
- * Drop extra pins before trying to invalidate
- * the huge page.
- */
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
}
- ret = invalidate_inode_page(page);
- unlock_page(page);
+ ret = mapping_evict_folio(mapping, folio);
+ folio_unlock(folio);
/*
- * Invalidation is a hint that the page is no longer
+ * Invalidation is a hint that the folio is no longer
* of interest and try to speed up its reclaim.
*/
if (!ret) {
- deactivate_file_page(page);
- /* It is likely on the pagevec of a remote CPU */
- if (nr_pagevec)
- (*nr_pagevec)++;
+ deactivate_file_folio(folio);
+ /* Likely in the lru cache of a remote CPU */
+ if (nr_failed)
+ (*nr_failed)++;
}
-
- if (PageTransHuge(page))
- put_page(page);
count += ret;
}
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
+ folio_batch_remove_exceptionals(&fbatch);
+ folio_batch_release(&fbatch);
cond_resched();
- index++;
}
return count;
}
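mapping_try_invalidate() follows the usual find-a-batch / process / release / cond_resched() rhythm, and the new @nr_failed counter only counts folios that could not be evicted. A stripped-down userspace sketch of that control flow (fetch_batch() and try_evict() are hypothetical stand-ins for find_lock_entries() and mapping_evict_folio()):

#include <stddef.h>
#include <stdio.h>

#define BATCH	15	/* PAGEVEC_SIZE-style batching */

/* Fill buf with up to BATCH consecutive items, return how many. */
static size_t fetch_batch(int *cursor, int end, int *buf)
{
	size_t n = 0;

	while (*cursor <= end && n < BATCH)
		buf[n++] = (*cursor)++;
	return n;
}

/* Pretend every third item is busy and cannot be evicted. */
static int try_evict(int item)
{
	return (item % 3) ? 1 : 0;
}

static unsigned long try_invalidate(int start, int end, unsigned long *nr_failed)
{
	unsigned long count = 0;
	int buf[BATCH], cursor = start;
	size_t i, n;

	while ((n = fetch_batch(&cursor, end, buf)) != 0) {
		for (i = 0; i < n; i++) {
			int ret = try_evict(buf[i]);

			if (!ret && nr_failed)
				(*nr_failed)++;
			count += ret;
		}
		/* batch released here; the kernel also cond_resched()s */
	}
	return count;
}

int main(void)
{
	unsigned long failed = 0;
	unsigned long done = try_invalidate(0, 29, &failed);

	printf("invalidated=%lu failed=%lu\n", done, failed);	/* 20 and 10 */
	return 0;
}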
/**
- * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
- * @mapping: the address_space which holds the pages to invalidate
+ * invalidate_mapping_pages - Invalidate all clean, unlocked cache of one inode
+ * @mapping: the address_space which holds the cache to invalidate
* @start: the offset 'from' which to invalidate
* @end: the offset 'to' which to invalidate (inclusive)
*
- * This function only removes the unlocked pages, if you want to
- * remove all the pages of one inode, you must call truncate_inode_pages.
+ * This function removes pages that are clean, unmapped and unlocked,
+ * as well as shadow entries. It will not block on IO activity.
*
- * invalidate_mapping_pages() will not block on IO activity. It will not
- * invalidate pages which are dirty, locked, under writeback or mapped into
- * pagetables.
+ * If you want to remove all the pages of one inode, regardless of
+ * their use and writeback state, use truncate_inode_pages().
*
- * Return: the number of the pages that were invalidated
+ * Return: The number of indices that had their contents invalidated
*/
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
- return __invalidate_mapping_pages(mapping, start, end, NULL);
+ return mapping_try_invalidate(mapping, start, end, NULL);
}
EXPORT_SYMBOL(invalidate_mapping_pages);
-/**
- * This helper is similar with the above one, except that it accounts for pages
- * that are likely on a pagevec and count them in @nr_pagevec, which will used by
- * the caller.
- */
-void invalidate_mapping_pagevec(struct address_space *mapping,
- pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
-{
- __invalidate_mapping_pages(mapping, start, end, nr_pagevec);
-}
-
/*
- * This is like invalidate_complete_page(), except it ignores the page's
+ * This is like invalidate_inode_page(), except it ignores the page's
* refcount. We do this because invalidate_inode_pages2() needs stronger
* invalidation guarantees, and cannot afford to leave pages behind because
* shrink_page_list() has a temp ref on them, or because they're transiently
- * sitting in the lru_cache_add() pagevecs.
+ * sitting in the folio_add_lru() caches.
*/
-static int
-invalidate_complete_page2(struct address_space *mapping, struct page *page)
+static int invalidate_complete_folio2(struct address_space *mapping,
+ struct folio *folio)
{
- unsigned long flags;
-
- if (page->mapping != mapping)
+ if (folio->mapping != mapping)
return 0;
- if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
+ if (folio_has_private(folio) &&
+ !filemap_release_folio(folio, GFP_KERNEL))
return 0;
- xa_lock_irqsave(&mapping->i_pages, flags);
- if (PageDirty(page))
+ spin_lock(&mapping->host->i_lock);
+ xa_lock_irq(&mapping->i_pages);
+ if (folio_test_dirty(folio))
goto failed;
- BUG_ON(page_has_private(page));
- __delete_from_page_cache(page, NULL);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
-
- if (mapping->a_ops->freepage)
- mapping->a_ops->freepage(page);
+ BUG_ON(folio_has_private(folio));
+ __filemap_remove_folio(folio, NULL);
+ xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
- put_page(page); /* pagecache ref */
+ filemap_free_folio(mapping, folio);
return 1;
failed:
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
+ spin_unlock(&mapping->host->i_lock);
return 0;
}
-static int do_launder_page(struct address_space *mapping, struct page *page)
+static int folio_launder(struct address_space *mapping, struct folio *folio)
{
- if (!PageDirty(page))
+ if (!folio_test_dirty(folio))
return 0;
- if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
+ if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL)
return 0;
- return mapping->a_ops->launder_page(page);
+ return mapping->a_ops->launder_folio(folio);
}
/**
@@ -707,73 +621,65 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
pgoff_t indices[PAGEVEC_SIZE];
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t index;
int i;
int ret = 0;
int ret2 = 0;
int did_range_unmap = 0;
- if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
- goto out;
+ if (mapping_empty(mapping))
+ return 0;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
index = start;
- while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
- indices)) {
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
+ while (find_get_entries(mapping, &index, end, &fbatch, indices)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
- /* We rely upon deletion not changing page->index */
- index = indices[i];
- if (index > end)
- break;
+ /* We rely upon deletion not changing folio->index */
- if (xa_is_value(page)) {
+ if (xa_is_value(folio)) {
if (!invalidate_exceptional_entry2(mapping,
- index, page))
+ indices[i], folio))
ret = -EBUSY;
continue;
}
- lock_page(page);
- WARN_ON(page_to_index(page) != index);
- if (page->mapping != mapping) {
- unlock_page(page);
- continue;
+ if (!did_range_unmap && folio_mapped(folio)) {
+ /*
+ * If folio is mapped, before taking its lock,
+ * zap the rest of the file in one hit.
+ */
+ unmap_mapping_pages(mapping, indices[i],
+ (1 + end - indices[i]), false);
+ did_range_unmap = 1;
}
- wait_on_page_writeback(page);
- if (page_mapped(page)) {
- if (!did_range_unmap) {
- /*
- * Zap the rest of the file in one hit.
- */
- unmap_mapping_pages(mapping, index,
- (1 + end - index), false);
- did_range_unmap = 1;
- } else {
- /*
- * Just zap this page
- */
- unmap_mapping_pages(mapping, index,
- 1, false);
- }
+
+ folio_lock(folio);
+ VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
+ continue;
}
- BUG_ON(page_mapped(page));
- ret2 = do_launder_page(mapping, page);
+ folio_wait_writeback(folio);
+
+ if (folio_mapped(folio))
+ unmap_mapping_folio(folio);
+ BUG_ON(folio_mapped(folio));
+
+ ret2 = folio_launder(mapping, folio);
if (ret2 == 0) {
- if (!invalidate_complete_page2(mapping, page))
+ if (!invalidate_complete_folio2(mapping, folio))
ret2 = -EBUSY;
}
if (ret2 < 0)
ret = ret2;
- unlock_page(page);
+ folio_unlock(folio);
}
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
+ folio_batch_remove_exceptionals(&fbatch);
+ folio_batch_release(&fbatch);
cond_resched();
- index++;
}
/*
* For DAX we invalidate page tables after invalidating page cache. We
@@ -785,8 +691,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
if (dax_mapping(mapping)) {
unmap_mapping_pages(mapping, start, end - start + 1, false);
}
-out:
- cleancache_invalidate_inode(mapping);
return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
@@ -851,7 +755,7 @@ EXPORT_SYMBOL(truncate_pagecache);
* setattr function when ATTR_SIZE is passed in.
*
* Must be called with a lock serializing truncates and writes (generally
- * i_mutex but e.g. xfs uses a different lock) and before all filesystem
+ * i_rwsem but e.g. xfs uses a different lock) and before all filesystem
* specific block truncation has been performed.
*/
void truncate_setsize(struct inode *inode, loff_t newsize)
@@ -880,7 +784,7 @@ EXPORT_SYMBOL(truncate_setsize);
*
* The function must be called after i_size is updated so that page fault
* coming after we unlock the page will already see the new i_size.
- * The function must be called while we still hold i_mutex - this not only
+ * The function must be called while we still hold i_rwsem - this not only
* makes sure i_size is stable but also that userspace cannot observe new
* i_size value before we are prepared to store mmap writes at new inode size.
*/
diff --git a/mm/usercopy.c b/mm/usercopy.c
index b3de3c4eefba..83c164aba6e0 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -12,14 +12,17 @@
#include <linux/mm.h>
#include <linux/highmem.h>
+#include <linux/kstrtox.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/thread_info.h>
+#include <linux/vmalloc.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
#include <asm/sections.h>
+#include "slab.h"
/*
* Checks if a given pointer and length is contained by the current
@@ -28,7 +31,7 @@
* Returns:
* NOT_STACK: not at all on the stack
* GOOD_FRAME: fully within a valid stack frame
- * GOOD_STACK: fully on the stack (when can't do frame-checking)
+ * GOOD_STACK: within the current stack (when can't frame-check exactly)
* BAD_STACK: error condition (invalid stack position or bad stack frame)
*/
static noinline int check_stack_object(const void *obj, unsigned long len)
@@ -54,6 +57,17 @@ static noinline int check_stack_object(const void *obj, unsigned long len)
if (ret)
return ret;
+ /* Finally, check stack depth if possible. */
+#ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER
+ if (IS_ENABLED(CONFIG_STACK_GROWSUP)) {
+ if ((void *)current_stack_pointer < obj + len)
+ return BAD_STACK;
+ } else {
+ if (obj < (void *)current_stack_pointer)
+ return BAD_STACK;
+ }
+#endif
+
return GOOD_STACK;
}
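The new depth check only makes sense relative to the stack's growth direction: on a downward-growing stack, everything still live sits at or above the current stack pointer, so an object starting below it cannot be a live local. A hedged sketch of just that comparison, with plain numbers standing in for current_stack_pointer and the copy window:

#include <stdbool.h>
#include <stdio.h>

static bool bad_stack_object(unsigned long sp, unsigned long obj,
			     unsigned long len, bool grows_up)
{
	if (grows_up)
		return sp < obj + len;	/* object extends past the live top */
	return obj < sp;		/* object starts below the live bottom */
}

int main(void)
{
	/* Downward-growing stack: the live region is [sp, stack top). */
	printf("%d\n", bad_stack_object(0x7000, 0x7100, 0x80, false));	/* 0: ok */
	printf("%d\n", bad_stack_object(0x7000, 0x6f00, 0x80, false));	/* 1: stale frame */
	return 0;
}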
@@ -69,17 +83,6 @@ static noinline int check_stack_object(const void *obj, unsigned long len)
* kmem_cache_create_usercopy() function to create the cache (and
* carefully audit the whitelist range).
*/
-void usercopy_warn(const char *name, const char *detail, bool to_user,
- unsigned long offset, unsigned long len)
-{
- WARN_ONCE(1, "Bad or missing usercopy whitelist? Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n",
- to_user ? "exposure" : "overwrite",
- to_user ? "from" : "to",
- name ? : "unknown?!",
- detail ? " '" : "", detail ? : "", detail ? "'" : "",
- offset, len);
-}
-
void __noreturn usercopy_abort(const char *name, const char *detail,
bool to_user, unsigned long offset,
unsigned long len)
@@ -156,91 +159,45 @@ static inline void check_bogus_address(const unsigned long ptr, unsigned long n,
usercopy_abort("null address", NULL, to_user, ptr, n);
}
-/* Checks for allocs that are marked in some way as spanning multiple pages. */
-static inline void check_page_span(const void *ptr, unsigned long n,
- struct page *page, bool to_user)
+static inline void check_heap_object(const void *ptr, unsigned long n,
+ bool to_user)
{
-#ifdef CONFIG_HARDENED_USERCOPY_PAGESPAN
- const void *end = ptr + n - 1;
- struct page *endpage;
- bool is_reserved, is_cma;
-
- /*
- * Sometimes the kernel data regions are not marked Reserved (see
- * check below). And sometimes [_sdata,_edata) does not cover
- * rodata and/or bss, so check each range explicitly.
- */
-
- /* Allow reads of kernel rodata region (if not marked as Reserved). */
- if (ptr >= (const void *)__start_rodata &&
- end <= (const void *)__end_rodata) {
- if (!to_user)
- usercopy_abort("rodata", NULL, to_user, 0, n);
+ unsigned long addr = (unsigned long)ptr;
+ unsigned long offset;
+ struct folio *folio;
+
+ if (is_kmap_addr(ptr)) {
+ offset = offset_in_page(ptr);
+ if (n > PAGE_SIZE - offset)
+ usercopy_abort("kmap", NULL, to_user, offset, n);
return;
}
- /* Allow kernel data region (if not marked as Reserved). */
- if (ptr >= (const void *)_sdata && end <= (const void *)_edata)
- return;
+ if (is_vmalloc_addr(ptr) && !pagefault_disabled()) {
+ struct vmap_area *area = find_vmap_area(addr);
- /* Allow kernel bss region (if not marked as Reserved). */
- if (ptr >= (const void *)__bss_start &&
- end <= (const void *)__bss_stop)
- return;
+ if (!area)
+ usercopy_abort("vmalloc", "no area", to_user, 0, n);
- /* Is the object wholly within one base page? */
- if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) ==
- ((unsigned long)end & (unsigned long)PAGE_MASK)))
+ if (n > area->va_end - addr) {
+ offset = addr - area->va_start;
+ usercopy_abort("vmalloc", NULL, to_user, offset, n);
+ }
return;
-
- /* Allow if fully inside the same compound (__GFP_COMP) page. */
- endpage = virt_to_head_page(end);
- if (likely(endpage == page))
- return;
-
- /*
- * Reject if range is entirely either Reserved (i.e. special or
- * device memory), or CMA. Otherwise, reject since the object spans
- * several independently allocated pages.
- */
- is_reserved = PageReserved(page);
- is_cma = is_migrate_cma_page(page);
- if (!is_reserved && !is_cma)
- usercopy_abort("spans multiple pages", NULL, to_user, 0, n);
-
- for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) {
- page = virt_to_head_page(ptr);
- if (is_reserved && !PageReserved(page))
- usercopy_abort("spans Reserved and non-Reserved pages",
- NULL, to_user, 0, n);
- if (is_cma && !is_migrate_cma_page(page))
- usercopy_abort("spans CMA and non-CMA pages", NULL,
- to_user, 0, n);
}
-#endif
-}
-
-static inline void check_heap_object(const void *ptr, unsigned long n,
- bool to_user)
-{
- struct page *page;
if (!virt_addr_valid(ptr))
return;
- /*
- * When CONFIG_HIGHMEM=y, kmap_to_page() will give either the
- * highmem page or fallback to virt_to_page(). The following
- * is effectively a highmem-aware virt_to_head_page().
- */
- page = compound_head(kmap_to_page((void *)ptr));
+ folio = virt_to_folio(ptr);
- if (PageSlab(page)) {
+ if (folio_test_slab(folio)) {
/* Check slab allocator for flags and size. */
- __check_heap_object(ptr, n, page, to_user);
- } else {
- /* Verify object does not incorrectly span multiple pages. */
- check_page_span(ptr, n, page, to_user);
+ __check_heap_object(ptr, n, folio_slab(folio), to_user);
+ } else if (folio_test_large(folio)) {
+ offset = ptr - folio_address(folio);
+ if (n > folio_size(folio) - offset)
+ usercopy_abort("page alloc", NULL, to_user, offset, n);
}
}
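The kmap, vmalloc and large-folio branches above all reduce to the same bound: the copy window must fit between the pointer's offset inside the allocation and the allocation's end. A minimal sketch of that check with a hypothetical allocation descriptor in place of folios and vmap areas:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical allocation: base address and total size, like a folio or vmap area. */
struct alloc {
	unsigned long start;
	unsigned long size;
};

static bool copy_fits(const struct alloc *a, unsigned long ptr, unsigned long n)
{
	unsigned long offset = ptr - a->start;

	/* the inverse of the usercopy_abort() conditions above: true means OK */
	return offset < a->size && n <= a->size - offset;
}

int main(void)
{
	struct alloc a = { .start = 0x1000, .size = 0x2000 };	/* 8KiB object */

	printf("%d\n", copy_fits(&a, 0x1800, 0x800));	/* 1: stays inside */
	printf("%d\n", copy_fits(&a, 0x2800, 0x900));	/* 0: overruns by 0x100 */
	return 0;
}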
@@ -279,7 +236,15 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
*/
return;
default:
- usercopy_abort("process stack", NULL, to_user, 0, n);
+ usercopy_abort("process stack", NULL, to_user,
+#ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER
+ IS_ENABLED(CONFIG_STACK_GROWSUP) ?
+ ptr - (void *)current_stack_pointer :
+ (void *)current_stack_pointer - ptr,
+#else
+ 0,
+#endif
+ n);
}
/* Check for bad heap object. */
@@ -294,7 +259,10 @@ static bool enable_checks __initdata = true;
static int __init parse_hardened_usercopy(char *str)
{
- return strtobool(str, &enable_checks);
+ if (kstrtobool(str, &enable_checks))
+ pr_warn("Invalid option string for hardened_usercopy: '%s'\n",
+ str);
+ return 1;
}
__setup("hardened_usercopy=", parse_hardened_usercopy);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 9a3d451402d7..a2bf37ee276d 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -16,6 +16,7 @@
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
+#include <asm/tlb.h>
#include "internal.h"
static __always_inline
@@ -30,11 +31,7 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
struct vm_area_struct *dst_vma;
dst_vma = find_vma(dst_mm, dst_start);
- if (!dst_vma)
- return NULL;
-
- if (dst_start < dst_vma->vm_start ||
- dst_start + len > dst_vma->vm_end)
+ if (!range_in_vma(dst_vma, dst_start, dst_start + len))
return NULL;
/*
@@ -48,103 +45,168 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
return dst_vma;
}
-static int mcopy_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- struct page **pagep,
- bool wp_copy)
+/*
+ * Install PTEs, to map dst_addr (within dst_vma) to page.
+ *
+ * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
+ * and anon, and for both shared and private VMAs.
+ */
+int mfill_atomic_install_pte(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr, struct page *page,
+ bool newly_allocated, uffd_flags_t flags)
{
+ int ret;
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
pte_t _dst_pte, *dst_pte;
+ bool writable = dst_vma->vm_flags & VM_WRITE;
+ bool vm_shared = dst_vma->vm_flags & VM_SHARED;
+ bool page_in_cache = page_mapping(page);
spinlock_t *ptl;
- void *page_kaddr;
- int ret;
- struct page *page;
- pgoff_t offset, max_off;
+ struct folio *folio;
struct inode *inode;
+ pgoff_t offset, max_off;
+
+ _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+ _dst_pte = pte_mkdirty(_dst_pte);
+ if (page_in_cache && !vm_shared)
+ writable = false;
+ if (writable)
+ _dst_pte = pte_mkwrite(_dst_pte);
+ if (flags & MFILL_ATOMIC_WP)
+ _dst_pte = pte_mkuffd_wp(_dst_pte);
+
+ ret = -EAGAIN;
+ dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ if (!dst_pte)
+ goto out;
+
+ if (vma_is_shmem(dst_vma)) {
+ /* serialize against truncate with the page table lock */
+ inode = dst_vma->vm_file->f_inode;
+ offset = linear_page_index(dst_vma, dst_addr);
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ ret = -EFAULT;
+ if (unlikely(offset >= max_off))
+ goto out_unlock;
+ }
+
+ ret = -EEXIST;
+ /*
+ * We allow to overwrite a pte marker: consider when both MISSING|WP
+ * registered, we firstly wr-protect a none pte which has no page cache
+ * page backing it, then access the page.
+ */
+ if (!pte_none_mostly(ptep_get(dst_pte)))
+ goto out_unlock;
+
+ folio = page_folio(page);
+ if (page_in_cache) {
+ /* Usually, cache pages are already added to LRU */
+ if (newly_allocated)
+ folio_add_lru(folio);
+ page_add_file_rmap(page, dst_vma, false);
+ } else {
+ page_add_new_anon_rmap(page, dst_vma, dst_addr);
+ folio_add_lru_vma(folio, dst_vma);
+ }
+
+ /*
+ * Must happen after rmap, as mm_counter() checks mapping (via
+ * PageAnon()), which is set by __page_set_anon_rmap().
+ */
+ inc_mm_counter(dst_mm, mm_counter(page));
+
+ set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+ ret = 0;
+out_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+out:
+ return ret;
+}
- if (!*pagep) {
+static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ uffd_flags_t flags,
+ struct folio **foliop)
+{
+ void *kaddr;
+ int ret;
+ struct folio *folio;
+
+ if (!*foliop) {
ret = -ENOMEM;
- page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
- if (!page)
+ folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
+ dst_addr, false);
+ if (!folio)
goto out;
- page_kaddr = kmap_atomic(page);
- ret = copy_from_user(page_kaddr,
- (const void __user *) src_addr,
+ kaddr = kmap_local_folio(folio, 0);
+ /*
+ * The read mmap_lock is held here. Despite the
+ * mmap_lock being read recursive a deadlock is still
+ * possible if a writer has taken a lock. For example:
+ *
+ * process A thread 1 takes read lock on own mmap_lock
+ * process A thread 2 calls mmap, blocks taking write lock
+ * process B thread 1 takes page fault, read lock on own mmap lock
+ * process B thread 2 calls mmap, blocks taking write lock
+ * process A thread 1 blocks taking read lock on process B
+ * process B thread 1 blocks taking read lock on process A
+ *
+ * Disable page faults to prevent potential deadlock
+ * and retry the copy outside the mmap_lock.
+ */
+ pagefault_disable();
+ ret = copy_from_user(kaddr, (const void __user *) src_addr,
PAGE_SIZE);
- kunmap_atomic(page_kaddr);
+ pagefault_enable();
+ kunmap_local(kaddr);
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
- *pagep = page;
+ *foliop = folio;
/* don't free the page */
goto out;
}
+
+ flush_dcache_folio(folio);
} else {
- page = *pagep;
- *pagep = NULL;
+ folio = *foliop;
+ *foliop = NULL;
}
/*
- * The memory barrier inside __SetPageUptodate makes sure that
+ * The memory barrier inside __folio_mark_uptodate makes sure that
* preceding stores to the page contents become visible before
* the set_pte_at() write.
*/
- __SetPageUptodate(page);
+ __folio_mark_uptodate(folio);
ret = -ENOMEM;
- if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
+ if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
goto out_release;
- _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
- if (dst_vma->vm_flags & VM_WRITE) {
- if (wp_copy)
- _dst_pte = pte_mkuffd_wp(_dst_pte);
- else
- _dst_pte = pte_mkwrite(_dst_pte);
- }
-
- dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
- if (dst_vma->vm_file) {
- /* the shmem MAP_PRIVATE case requires checking the i_size */
- inode = dst_vma->vm_file->f_inode;
- offset = linear_page_index(dst_vma, dst_addr);
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- ret = -EFAULT;
- if (unlikely(offset >= max_off))
- goto out_release_uncharge_unlock;
- }
- ret = -EEXIST;
- if (!pte_none(*dst_pte))
- goto out_release_uncharge_unlock;
-
- inc_mm_counter(dst_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
- lru_cache_add_inactive_or_unevictable(page, dst_vma);
-
- set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
-
- /* No need to invalidate - it was non-present before */
- update_mmu_cache(dst_vma, dst_addr, dst_pte);
-
- pte_unmap_unlock(dst_pte, ptl);
- ret = 0;
+ ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
+ &folio->page, true, flags);
+ if (ret)
+ goto out_release;
out:
return ret;
-out_release_uncharge_unlock:
- pte_unmap_unlock(dst_pte, ptl);
out_release:
- put_page(page);
+ folio_put(folio);
goto out;
}
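The -ENOENT path above is purely a control-flow contract: the in-lock copy runs with page faults disabled, and if it cannot complete, the freshly allocated folio is handed back through *foliop so the caller can redo the copy outside the mmap_lock and retry. A hedged, self-contained sketch of that shape (try_copy() is a made-up stand-in; the real code copies from userspace):

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Pretend the first, faults-disabled attempt cannot touch the source. */
static int try_copy(char *dst, const char *src, int faults_disabled)
{
	if (faults_disabled)
		return -ENOENT;		/* would have faulted: bail out */
	strcpy(dst, src);
	return 0;
}

int main(void)
{
	char buf[32];
	const char *src = "payload";
	int ret;

	/* Fast path: under the (imaginary) lock, faults disabled. */
	ret = try_copy(buf, src, 1);
	if (ret == -ENOENT) {
		/* Slow path: drop the lock, copy with faults enabled, retry. */
		ret = try_copy(buf, src, 0);
	}
	printf("ret=%d buf=%s\n", ret, buf);	/* ret=0 buf=payload */
	return 0;
}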
-static int mfill_zeropage_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr)
+static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr)
{
pte_t _dst_pte, *dst_pte;
spinlock_t *ptl;
@@ -154,7 +216,10 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm,
_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
dst_vma->vm_page_prot));
- dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ ret = -EAGAIN;
+ dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
+ if (!dst_pte)
+ goto out;
if (dst_vma->vm_file) {
/* the shmem MAP_PRIVATE case requires checking the i_size */
inode = dst_vma->vm_file->f_inode;
@@ -165,15 +230,60 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm,
goto out_unlock;
}
ret = -EEXIST;
- if (!pte_none(*dst_pte))
+ if (!pte_none(ptep_get(dst_pte)))
goto out_unlock;
- set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+ set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
/* No need to invalidate - it was non-present before */
update_mmu_cache(dst_vma, dst_addr, dst_pte);
ret = 0;
out_unlock:
pte_unmap_unlock(dst_pte, ptl);
+out:
+ return ret;
+}
+
+/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
+static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ uffd_flags_t flags)
+{
+ struct inode *inode = file_inode(dst_vma->vm_file);
+ pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+ struct folio *folio;
+ struct page *page;
+ int ret;
+
+ ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
+ /* Our caller expects us to return -EFAULT if we failed to find folio */
+ if (ret == -ENOENT)
+ ret = -EFAULT;
+ if (ret)
+ goto out;
+ if (!folio) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ page = folio_file_page(folio, pgoff);
+ if (PageHWPoison(page)) {
+ ret = -EIO;
+ goto out_release;
+ }
+
+ ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
+ page, false, flags);
+ if (ret)
+ goto out_release;
+
+ folio_unlock(folio);
+ ret = 0;
+out:
return ret;
+out_release:
+ folio_unlock(folio);
+ folio_put(folio);
+ goto out;
}
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
@@ -199,23 +309,23 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
#ifdef CONFIG_HUGETLB_PAGE
/*
- * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is
+ * mfill_atomic processing for HUGETLB vmas. Note that this routine is
* called with mmap_lock held, it will release mmap_lock before returning.
*/
-static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+static __always_inline ssize_t mfill_atomic_hugetlb(
struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- bool zeropage)
+ uffd_flags_t flags)
{
- int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
ssize_t err;
pte_t *dst_pte;
unsigned long src_addr, dst_addr;
long copied;
- struct page *page;
+ struct folio *folio;
unsigned long vma_hpagesize;
pgoff_t idx;
u32 hash;
@@ -227,7 +337,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
* by THP. Since we can not reliably insert a zero page, this
* feature is not supported.
*/
- if (zeropage) {
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
mmap_read_unlock(dst_mm);
return -EINVAL;
}
@@ -235,7 +345,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
src_addr = src_start;
dst_addr = dst_start;
copied = 0;
- page = NULL;
+ folio = NULL;
vma_hpagesize = vma_kernel_pagesize(dst_vma);
/*
@@ -273,55 +383,50 @@ retry:
}
while (src_addr < src_start + len) {
- pte_t dst_pteval;
-
BUG_ON(dst_addr >= dst_start + len);
/*
- * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
- * i_mmap_rwsem ensures the dst_pte remains valid even
+ * Serialize via vma_lock and hugetlb_fault_mutex.
+ * vma_lock ensures the dst_pte remains valid even
* in the case of shared pmds. fault mutex prevents
* races with other faulting threads.
*/
- mapping = dst_vma->vm_file->f_mapping;
- i_mmap_lock_read(mapping);
idx = linear_page_index(dst_vma, dst_addr);
+ mapping = dst_vma->vm_file->f_mapping;
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_vma_lock_read(dst_vma);
err = -ENOMEM;
- dst_pte = huge_pte_alloc(dst_mm, dst_addr, vma_hpagesize);
+ dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
if (!dst_pte) {
+ hugetlb_vma_unlock_read(dst_vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
goto out_unlock;
}
- err = -EEXIST;
- dst_pteval = huge_ptep_get(dst_pte);
- if (!huge_pte_none(dst_pteval)) {
+ if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
+ !huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
+ err = -EEXIST;
+ hugetlb_vma_unlock_read(dst_vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
goto out_unlock;
}
- err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
- dst_addr, src_addr, &page);
+ err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
+ src_addr, flags, &folio);
+ hugetlb_vma_unlock_read(dst_vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
- vm_alloc_shared = vm_shared;
cond_resched();
if (unlikely(err == -ENOENT)) {
mmap_read_unlock(dst_mm);
- BUG_ON(!page);
+ BUG_ON(!folio);
- err = copy_huge_page_from_user(page,
- (const void __user *)src_addr,
- vma_hpagesize / PAGE_SIZE,
- true);
+ err = copy_folio_from_user(folio,
+ (const void __user *)src_addr, true);
if (unlikely(err)) {
err = -EFAULT;
goto out;
@@ -331,7 +436,7 @@ retry:
dst_vma = NULL;
goto retry;
} else
- BUG_ON(page);
+ BUG_ON(folio);
if (!err) {
dst_addr += vma_hpagesize;
@@ -348,54 +453,8 @@ retry:
out_unlock:
mmap_read_unlock(dst_mm);
out:
- if (page) {
- /*
- * We encountered an error and are about to free a newly
- * allocated huge page.
- *
- * Reservation handling is very subtle, and is different for
- * private and shared mappings. See the routine
- * restore_reserve_on_error for details. Unfortunately, we
- * can not call restore_reserve_on_error now as it would
- * require holding mmap_lock.
- *
- * If a reservation for the page existed in the reservation
- * map of a private mapping, the map was modified to indicate
- * the reservation was consumed when the page was allocated.
- * We clear the PagePrivate flag now so that the global
- * reserve count will not be incremented in free_huge_page.
- * The reservation map will still indicate the reservation
- * was consumed and possibly prevent later page allocation.
- * This is better than leaking a global reservation. If no
- * reservation existed, it is still safe to clear PagePrivate
- * as no adjustments to reservation counts were made during
- * allocation.
- *
- * The reservation map for shared mappings indicates which
- * pages have reservations. When a huge page is allocated
- * for an address with a reservation, no change is made to
- * the reserve map. In this case PagePrivate will be set
- * to indicate that the global reservation count should be
- * incremented when the page is freed. This is the desired
- * behavior. However, when a huge page is allocated for an
- * address without a reservation a reservation entry is added
- * to the reservation map, and PagePrivate will not be set.
- * When the page is freed, the global reserve count will NOT
- * be incremented and it will appear as though we have leaked
- * reserved page. In this case, set PagePrivate so that the
- * global reserve count will be incremented to match the
- * reservation map entry which was created.
- *
- * Note that vm_alloc_shared is based on the flags of the vma
- * for which the page was originally allocated. dst_vma could
- * be different or NULL on error.
- */
- if (vm_alloc_shared)
- SetPagePrivate(page);
- else
- ClearPagePrivate(page);
- put_page(page);
- }
+ if (folio)
+ folio_put(folio);
BUG_ON(copied < 0);
BUG_ON(err > 0);
BUG_ON(!copied && !err);
@@ -403,25 +462,27 @@ out:
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
-extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
- struct vm_area_struct *dst_vma,
- unsigned long dst_start,
- unsigned long src_start,
- unsigned long len,
- bool zeropage);
+extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
+ unsigned long dst_start,
+ unsigned long src_start,
+ unsigned long len,
+ uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */
-static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
+static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
- struct page **page,
- bool zeropage,
- bool wp_copy)
+ uffd_flags_t flags,
+ struct folio **foliop)
{
ssize_t err;
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
+ return mfill_atomic_pte_continue(dst_pmd, dst_vma,
+ dst_addr, flags);
+ }
+
/*
* The normal page fault path for a shmem will invoke the
* fault, fill the hole in the file and COW it right away. The
@@ -433,42 +494,35 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
* and not in the radix tree.
*/
if (!(dst_vma->vm_flags & VM_SHARED)) {
- if (!zeropage)
- err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, src_addr, page,
- wp_copy);
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
+ err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
+ dst_addr, src_addr,
+ flags, foliop);
else
- err = mfill_zeropage_pte(dst_mm, dst_pmd,
+ err = mfill_atomic_pte_zeropage(dst_pmd,
dst_vma, dst_addr);
} else {
- VM_WARN_ON_ONCE(wp_copy);
- if (!zeropage)
- err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
- dst_vma, dst_addr,
- src_addr, page);
- else
- err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
- dst_vma, dst_addr);
+ err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
+ dst_addr, src_addr,
+ flags, foliop);
}
return err;
}
-static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
- unsigned long dst_start,
- unsigned long src_start,
- unsigned long len,
- bool zeropage,
- bool *mmap_changing,
- __u64 mode)
+static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long src_start,
+ unsigned long len,
+ atomic_t *mmap_changing,
+ uffd_flags_t flags)
{
struct vm_area_struct *dst_vma;
ssize_t err;
pmd_t *dst_pmd;
unsigned long src_addr, dst_addr;
long copied;
- struct page *page;
- bool wp_copy;
+ struct folio *folio;
/*
* Sanitize the command parameters:
@@ -483,7 +537,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
src_addr = src_start;
dst_addr = dst_start;
copied = 0;
- page = NULL;
+ folio = NULL;
retry:
mmap_read_lock(dst_mm);
@@ -493,7 +547,7 @@ retry:
* request the user to retry later
*/
err = -EAGAIN;
- if (mmap_changing && READ_ONCE(*mmap_changing))
+ if (mmap_changing && atomic_read(mmap_changing))
goto out_unlock;
/*
@@ -518,19 +572,21 @@ retry:
* validate 'mode' now that we know the dst_vma: don't allow
* a wrprotect copy if the userfaultfd didn't register as WP.
*/
- wp_copy = mode & UFFDIO_COPY_MODE_WP;
- if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP))
+ if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
goto out_unlock;
/*
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
- return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
- src_start, len, zeropage);
+ return mfill_atomic_hugetlb(dst_vma, dst_start,
+ src_start, len, flags);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
+ if (!vma_is_shmem(dst_vma) &&
+ uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
+ goto out_unlock;
/*
* Ensure the dst_vma has an anon_vma or this page
@@ -553,7 +609,7 @@ retry:
break;
}
- dst_pmdval = pmd_read_atomic(dst_pmd);
+ dst_pmdval = pmdp_get_lockless(dst_pmd);
/*
* If the dst_pmd is mapped as THP don't
* override it and just be strict.
@@ -576,28 +632,29 @@ retry:
BUG_ON(pmd_none(*dst_pmd));
BUG_ON(pmd_trans_huge(*dst_pmd));
- err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- src_addr, &page, zeropage, wp_copy);
+ err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
+ src_addr, flags, &folio);
cond_resched();
if (unlikely(err == -ENOENT)) {
- void *page_kaddr;
+ void *kaddr;
mmap_read_unlock(dst_mm);
- BUG_ON(!page);
+ BUG_ON(!folio);
- page_kaddr = kmap(page);
- err = copy_from_user(page_kaddr,
+ kaddr = kmap_local_folio(folio, 0);
+ err = copy_from_user(kaddr,
(const void __user *) src_addr,
PAGE_SIZE);
- kunmap(page);
+ kunmap_local(kaddr);
if (unlikely(err)) {
err = -EFAULT;
goto out;
}
+ flush_dcache_folio(folio);
goto retry;
} else
- BUG_ON(page);
+ BUG_ON(folio);
if (!err) {
dst_addr += PAGE_SIZE;
@@ -614,34 +671,76 @@ retry:
out_unlock:
mmap_read_unlock(dst_mm);
out:
- if (page)
- put_page(page);
+ if (folio)
+ folio_put(folio);
BUG_ON(copied < 0);
BUG_ON(err > 0);
BUG_ON(!copied && !err);
return copied ? copied : err;
}
-ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long src_start, unsigned long len,
- bool *mmap_changing, __u64 mode)
+ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
+ unsigned long src_start, unsigned long len,
+ atomic_t *mmap_changing, uffd_flags_t flags)
+{
+ return mfill_atomic(dst_mm, dst_start, src_start, len, mmap_changing,
+ uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
+}
+
+ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len, atomic_t *mmap_changing)
{
- return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
- mmap_changing, mode);
+ return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+ uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
}
-ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, bool *mmap_changing)
+ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len, atomic_t *mmap_changing,
+ uffd_flags_t flags)
{
- return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0);
+ return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+ uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
+}
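The three wrappers above differ only in which mode they stamp into the flags word with uffd_flags_set_mode(). A simplified, purely illustrative encoding (the real uffd_flags_t layout differs; the mask below is an assumption) shows how a mode-in-low-bits scheme keeps the mutually exclusive mode and orthogonal bits such as MFILL_ATOMIC_WP in one value:

#include <stdio.h>

typedef unsigned int ex_flags_t;	/* illustrative, not the kernel's uffd_flags_t */

#define EX_MODE_MASK		0x3u	/* low bits: mutually exclusive mode */
#define EX_MODE_COPY		0x0u
#define EX_MODE_ZEROPAGE	0x1u
#define EX_MODE_CONTINUE	0x2u
#define EX_FLAG_WP		0x4u	/* independent flag bit */

static ex_flags_t set_mode(ex_flags_t flags, ex_flags_t mode)
{
	return (flags & ~EX_MODE_MASK) | mode;
}

static int mode_is(ex_flags_t flags, ex_flags_t mode)
{
	return (flags & EX_MODE_MASK) == mode;
}

int main(void)
{
	ex_flags_t f = set_mode(EX_FLAG_WP, EX_MODE_COPY);

	printf("copy=%d wp=%d\n", mode_is(f, EX_MODE_COPY), !!(f & EX_FLAG_WP));
	f = set_mode(f, EX_MODE_CONTINUE);	/* mode changes, WP bit survives */
	printf("copy=%d continue=%d wp=%d\n",
	       mode_is(f, EX_MODE_COPY), mode_is(f, EX_MODE_CONTINUE),
	       !!(f & EX_FLAG_WP));
	return 0;
}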
+
+long uffd_wp_range(struct vm_area_struct *dst_vma,
+ unsigned long start, unsigned long len, bool enable_wp)
+{
+ unsigned int mm_cp_flags;
+ struct mmu_gather tlb;
+ long ret;
+
+ VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end,
+ "The address range exceeds VMA boundary.\n");
+ if (enable_wp)
+ mm_cp_flags = MM_CP_UFFD_WP;
+ else
+ mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;
+
+ /*
+ * vma->vm_page_prot already reflects that uffd-wp is enabled for this
+ * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
+ * to be write-protected as default whenever protection changes.
+ * Try upgrading write permissions manually.
+ */
+ if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
+ mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
+ tlb_gather_mmu(&tlb, dst_vma->vm_mm);
+ ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
+ tlb_finish_mmu(&tlb);
+
+ return ret;
}
int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, bool enable_wp, bool *mmap_changing)
+ unsigned long len, bool enable_wp,
+ atomic_t *mmap_changing)
{
+ unsigned long end = start + len;
+ unsigned long _start, _end;
struct vm_area_struct *dst_vma;
- pgprot_t newprot;
- int err;
+ unsigned long page_mask;
+ long err;
+ VMA_ITERATOR(vmi, dst_mm, start);
/*
* Sanitize the command parameters:
@@ -660,31 +759,34 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
* request the user to retry later
*/
err = -EAGAIN;
- if (mmap_changing && READ_ONCE(*mmap_changing))
+ if (mmap_changing && atomic_read(mmap_changing))
goto out_unlock;
err = -ENOENT;
- dst_vma = find_dst_vma(dst_mm, start, len);
- /*
- * Make sure the vma is not shared, that the dst range is
- * both valid and fully within a single existing vma.
- */
- if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
- goto out_unlock;
- if (!userfaultfd_wp(dst_vma))
- goto out_unlock;
- if (!vma_is_anonymous(dst_vma))
- goto out_unlock;
+ for_each_vma_range(vmi, dst_vma, end) {
- if (enable_wp)
- newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
- else
- newprot = vm_get_page_prot(dst_vma->vm_flags);
+ if (!userfaultfd_wp(dst_vma)) {
+ err = -ENOENT;
+ break;
+ }
- change_protection(dst_vma, start, start + len, newprot,
- enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
+ if (is_vm_hugetlb_page(dst_vma)) {
+ err = -EINVAL;
+ page_mask = vma_kernel_pagesize(dst_vma) - 1;
+ if ((start & page_mask) || (len & page_mask))
+ break;
+ }
- err = 0;
+ _start = max(dst_vma->vm_start, start);
+ _end = min(dst_vma->vm_end, end);
+
+ err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp);
+
+ /* Return 0 on success, <0 on failures */
+ if (err < 0)
+ break;
+ err = 0;
+ }
out_unlock:
mmap_read_unlock(dst_mm);
return err;
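
A minimal caller sketch for the rewritten write-protect path: protect a range, let userspace resolve the faults, then drop the protection. The ctx layout (mm pointer plus an atomic mmap_changing counter) is assumed here and the surrounding handler is illustrative, not part of this patch.

	err = mwriteprotect_range(ctx->mm, addr, len, true, &ctx->mmap_changing);
	if (err)
		return err;
	/* ... userspace services the wp faults ... */
	err = mwriteprotect_range(ctx->mm, addr, len, false, &ctx->mmap_changing);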
diff --git a/mm/util.c b/mm/util.c
index 4e21fe7eae27..406634f26918 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -27,6 +27,7 @@
#include <linux/uaccess.h>
#include "internal.h"
+#include "swap.h"
/**
* kfree_const - conditionally free memory
@@ -48,6 +49,7 @@ EXPORT_SYMBOL(kfree_const);
*
* Return: newly allocated copy of @s or %NULL in case of error
*/
+noinline
char *kstrdup(const char *s, gfp_t gfp)
{
size_t len;
@@ -69,7 +71,8 @@ EXPORT_SYMBOL(kstrdup);
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*
- * Note: Strings allocated by kstrdup_const should be freed by kfree_const.
+ * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
+ * must not be passed to krealloc().
*
* Return: source string if it is in .rodata section otherwise
* fallback to kstrdup.
@@ -118,7 +121,8 @@ EXPORT_SYMBOL(kstrndup);
* @len: memory region length
* @gfp: GFP mask to use
*
- * Return: newly allocated copy of @src or %NULL in case of error
+ * Return: newly allocated copy of @src or %NULL in case of error,
+ * result is physically contiguous. Use kfree() to free.
*/
void *kmemdup(const void *src, size_t len, gfp_t gfp)
{
@@ -132,6 +136,27 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
EXPORT_SYMBOL(kmemdup);
/**
+ * kvmemdup - duplicate region of memory
+ *
+ * @src: memory region to duplicate
+ * @len: memory region length
+ * @gfp: GFP mask to use
+ *
+ * Return: newly allocated copy of @src or %NULL in case of error,
+ * result may not be physically contiguous. Use kvfree() to free.

+ */
+void *kvmemdup(const void *src, size_t len, gfp_t gfp)
+{
+ void *p;
+
+ p = kvmalloc(len, gfp);
+ if (p)
+ memcpy(p, src, len);
+ return p;
+}
+EXPORT_SYMBOL(kvmemdup);
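
A minimal usage sketch, assuming a caller-provided src/len pair; the point is that the copy may come from vmalloc space, so it must be released with kvfree() rather than kfree().

	void *copy = kvmemdup(src, len, GFP_KERNEL);

	if (!copy)
		return -ENOMEM;
	/* ... use the copy ... */
	kvfree(copy);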
+
+/**
* kmemdup_nul - Create a NUL-terminated string from unterminated data
* @s: The data to stringify
* @len: The size of the data
@@ -270,38 +295,6 @@ void *memdup_user_nul(const void __user *src, size_t len)
}
EXPORT_SYMBOL(memdup_user_nul);
-void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev)
-{
- struct vm_area_struct *next;
-
- vma->vm_prev = prev;
- if (prev) {
- next = prev->vm_next;
- prev->vm_next = vma;
- } else {
- next = mm->mmap;
- mm->mmap = vma;
- }
- vma->vm_next = next;
- if (next)
- next->vm_prev = vma;
-}
-
-void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma)
-{
- struct vm_area_struct *prev, *next;
-
- next = vma->vm_next;
- prev = vma->vm_prev;
- if (prev)
- prev->vm_next = next;
- else
- mm->mmap = next;
- if (next)
- next->vm_prev = prev;
-}
-
/* Check if the vma is being used as a stack by this task */
int vma_is_stack_for_current(struct vm_area_struct *vma)
{
@@ -310,6 +303,18 @@ int vma_is_stack_for_current(struct vm_area_struct *vma)
return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}
+/*
+ * Change backing file, only valid to use during initial VMA setup.
+ */
+void vma_set_file(struct vm_area_struct *vma, struct file *file)
+{
+ /* Changing an anonymous vma with this is illegal */
+ get_file(file);
+ swap(vma->vm_file, file);
+ fput(file);
+}
+EXPORT_SYMBOL(vma_set_file);
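
A sketch of the intended use during mmap setup, loosely modelled on how GEM-style drivers redirect a VMA to an object's backing file. struct my_obj and its backing_file member are hypothetical.

struct my_obj {
	struct file *backing_file;
	/* ... */
};

static int my_drv_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct my_obj *obj = filp->private_data;

	/* Only valid while the VMA is still being set up. */
	vma_set_file(vma, obj->backing_file);
	vma->vm_pgoff = 0;
	return 0;
}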
+
#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
#endif
@@ -330,8 +335,40 @@ unsigned long randomize_stack_top(unsigned long stack_top)
#endif
}
+/**
+ * randomize_page - Generate a random, page aligned address
+ * @start: The smallest acceptable address the caller will take.
+ * @range: The size of the area, starting at @start, within which the
+ * random address must fall.
+ *
+ * If @start + @range would overflow, @range is capped.
+ *
+ * NOTE: Historical use of randomize_range, which this replaces, presumed that
+ * @start was already page aligned. We now align it regardless.
+ *
+ * Return: A page aligned address within [start, start + range). On error,
+ * @start is returned.
+ */
+unsigned long randomize_page(unsigned long start, unsigned long range)
+{
+ if (!PAGE_ALIGNED(start)) {
+ range -= PAGE_ALIGN(start) - start;
+ start = PAGE_ALIGN(start);
+ }
+
+ if (start > ULONG_MAX - range)
+ range = ULONG_MAX - start;
+
+ range >>= PAGE_SHIFT;
+
+ if (range == 0)
+ return start;
+
+ return start + (get_random_long() % range << PAGE_SHIFT);
+}
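
A usage sketch, assuming a caller that wants a random, page-aligned address somewhere in an 8 MiB window above a fixed base address (base is a placeholder).

	unsigned long addr = randomize_page(base, SZ_8M);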
+
#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
-unsigned long arch_randomize_brk(struct mm_struct *mm)
+unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
{
/* Is the current task 32bit ? */
if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
@@ -536,13 +573,10 @@ EXPORT_SYMBOL(vm_mmap);
* Uses kmalloc to get the memory but if the allocation fails then falls back
* to the vmalloc allocator. Use kvfree for freeing the memory.
*
- * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported.
+ * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
* __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
* preferable to the vmalloc fallback, due to visible performance drawbacks.
*
- * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
- * fall back to vmalloc.
- *
 * Return: pointer to the allocated memory or %NULL in case of failure
*/
void *kvmalloc_node(size_t size, gfp_t flags, int node)
@@ -551,13 +585,6 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
void *ret;
/*
- * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
- * so the given set of flags has to be compatible.
- */
- if ((flags & GFP_KERNEL) != GFP_KERNEL)
- return kmalloc_node(size, flags, node);
-
- /*
* We want to attempt a large physically contiguous block first because
* it is less likely to fragment multiple larger blocks and therefore
* contribute to a long term fragmentation less than vmalloc fallback.
@@ -569,6 +596,9 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
kmalloc_flags |= __GFP_NORETRY;
+
+ /* nofail semantic is implemented by the vmalloc fallback */
+ kmalloc_flags &= ~__GFP_NOFAIL;
}
ret = kmalloc_node(size, kmalloc_flags, node);
@@ -580,8 +610,25 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
if (ret || size <= PAGE_SIZE)
return ret;
- return __vmalloc_node(size, 1, flags, node,
- __builtin_return_address(0));
+ /* non-sleeping allocations are not supported by vmalloc */
+ if (!gfpflags_allow_blocking(flags))
+ return NULL;
+
+ /* Don't even allow crazy sizes */
+ if (unlikely(size > INT_MAX)) {
+ WARN_ON_ONCE(!(flags & __GFP_NOWARN));
+ return NULL;
+ }
+
+ /*
+ * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
+ * since the callers already cannot assume anything
+ * about the resulting pointer, and cannot play
+ * protection games.
+ */
+ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+ flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+ node, __builtin_return_address(0));
}
EXPORT_SYMBOL(kvmalloc_node);
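
A sketch of the common pattern the comment above describes: a potentially large table allocated with a blocking GFP_KERNEL mask so the vmalloc fallback remains available, freed with kvfree(). nr and struct entry are placeholders.

	struct entry *tbl = kvmalloc_array(nr, sizeof(*tbl), GFP_KERNEL | __GFP_ZERO);

	if (!tbl)
		return -ENOMEM;
	/* ... */
	kvfree(tbl);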
@@ -622,111 +669,139 @@ void kvfree_sensitive(const void *addr, size_t len)
}
EXPORT_SYMBOL(kvfree_sensitive);
-static inline void *__page_rmapping(struct page *page)
+void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
{
- unsigned long mapping;
+ void *newp;
- mapping = (unsigned long)page->mapping;
- mapping &= ~PAGE_MAPPING_FLAGS;
+ if (oldsize >= newsize)
+ return (void *)p;
+ newp = kvmalloc(newsize, flags);
+ if (!newp)
+ return NULL;
+ memcpy(newp, p, oldsize);
+ kvfree(p);
+ return newp;
+}
+EXPORT_SYMBOL(kvrealloc);
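
Note that this kvrealloc() does not free the old buffer on failure, so callers should not overwrite their only pointer with the return value. A defensive sketch (buf, old_bytes and new_bytes are placeholders):

	void *tmp = kvrealloc(buf, old_bytes, new_bytes, GFP_KERNEL);

	if (!tmp)
		return -ENOMEM;	/* "buf" is still valid and still old_bytes long */
	buf = tmp;
	old_bytes = new_bytes;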
- return (void *)mapping;
+/**
+ * __vmalloc_array - allocate memory for a virtually contiguous array.
+ * @n: number of elements.
+ * @size: element size.
+ * @flags: the type of memory to allocate (see kmalloc).
+ */
+void *__vmalloc_array(size_t n, size_t size, gfp_t flags)
+{
+ size_t bytes;
+
+ if (unlikely(check_mul_overflow(n, size, &bytes)))
+ return NULL;
+ return __vmalloc(bytes, flags);
}
+EXPORT_SYMBOL(__vmalloc_array);
-/* Neutral page->mapping pointer to address_space or anon_vma or other */
-void *page_rmapping(struct page *page)
+/**
+ * vmalloc_array - allocate memory for a virtually contiguous array.
+ * @n: number of elements.
+ * @size: element size.
+ */
+void *vmalloc_array(size_t n, size_t size)
{
- page = compound_head(page);
- return __page_rmapping(page);
+ return __vmalloc_array(n, size, GFP_KERNEL);
}
+EXPORT_SYMBOL(vmalloc_array);
-/*
- * Return true if this page is mapped into pagetables.
- * For compound page it returns true if any subpage of compound page is mapped.
- */
-bool page_mapped(struct page *page)
-{
- int i;
-
- if (likely(!PageCompound(page)))
- return atomic_read(&page->_mapcount) >= 0;
- page = compound_head(page);
- if (atomic_read(compound_mapcount_ptr(page)) >= 0)
- return true;
- if (PageHuge(page))
- return false;
- for (i = 0; i < compound_nr(page); i++) {
- if (atomic_read(&page[i]._mapcount) >= 0)
- return true;
- }
- return false;
+/**
+ * __vcalloc - allocate and zero memory for a virtually contiguous array.
+ * @n: number of elements.
+ * @size: element size.
+ * @flags: the type of memory to allocate (see kmalloc).
+ */
+void *__vcalloc(size_t n, size_t size, gfp_t flags)
+{
+ return __vmalloc_array(n, size, flags | __GFP_ZERO);
+}
+EXPORT_SYMBOL(__vcalloc);
+
+/**
+ * vcalloc - allocate and zero memory for a virtually contiguous array.
+ * @n: number of elements.
+ * @size: element size.
+ */
+void *vcalloc(size_t n, size_t size)
+{
+ return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO);
+}
+EXPORT_SYMBOL(vcalloc);
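
These helpers mirror kmalloc_array()/kcalloc() but only promise virtual contiguity; a brief sketch (nr_items and struct item are placeholders), released with vfree():

	struct item *items = vcalloc(nr_items, sizeof(*items));

	if (!items)
		return -ENOMEM;
	/* ... */
	vfree(items);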
+
+/* Neutral page->mapping pointer to address_space or anon_vma or other */
+void *page_rmapping(struct page *page)
+{
+ return folio_raw_mapping(page_folio(page));
}
-EXPORT_SYMBOL(page_mapped);
-struct anon_vma *page_anon_vma(struct page *page)
+struct anon_vma *folio_anon_vma(struct folio *folio)
{
- unsigned long mapping;
+ unsigned long mapping = (unsigned long)folio->mapping;
- page = compound_head(page);
- mapping = (unsigned long)page->mapping;
if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
return NULL;
- return __page_rmapping(page);
+ return (void *)(mapping - PAGE_MAPPING_ANON);
}
-struct address_space *page_mapping(struct page *page)
+/**
+ * folio_mapping - Find the mapping where this folio is stored.
+ * @folio: The folio.
+ *
+ * For folios which are in the page cache, return the mapping that this
+ * page belongs to. Folios in the swap cache return the swap mapping
+ * this page is stored in (which is different from the mapping for the
+ * swap file or swap device where the data is stored).
+ *
+ * You can call this for folios which aren't in the swap cache or page
+ * cache and it will return NULL.
+ */
+struct address_space *folio_mapping(struct folio *folio)
{
struct address_space *mapping;
- page = compound_head(page);
-
/* This happens if someone calls flush_dcache_page on slab page */
- if (unlikely(PageSlab(page)))
+ if (unlikely(folio_test_slab(folio)))
return NULL;
- if (unlikely(PageSwapCache(page))) {
- swp_entry_t entry;
+ if (unlikely(folio_test_swapcache(folio)))
+ return swap_address_space(folio_swap_entry(folio));
- entry.val = page_private(page);
- return swap_address_space(entry);
- }
-
- mapping = page->mapping;
- if ((unsigned long)mapping & PAGE_MAPPING_ANON)
+ mapping = folio->mapping;
+ if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
return NULL;
- return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
+ return mapping;
}
-EXPORT_SYMBOL(page_mapping);
+EXPORT_SYMBOL(folio_mapping);
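
A small sketch of the typical check a caller makes on the return value; anything anonymous, slab-backed, or outside the page/swap cache simply yields NULL.

	struct address_space *mapping = folio_mapping(folio);

	if (!mapping)
		return;		/* not a page-cache or swap-cache folio */
	/* ... operate on mapping ... */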
-/*
- * For file cache pages, return the address_space, otherwise return NULL
+/**
+ * folio_copy - Copy the contents of one folio to another.
+ * @dst: Folio to copy to.
+ * @src: Folio to copy from.
+ *
+ * The bytes in the folio represented by @src are copied to @dst.
+ * Assumes the caller has validated that @dst is at least as large as @src.
+ * Can be called in atomic context for order-0 folios, but if the folio is
+ * larger, it may sleep.
*/
-struct address_space *page_mapping_file(struct page *page)
-{
- if (unlikely(PageSwapCache(page)))
- return NULL;
- return page_mapping(page);
-}
-
-/* Slow path of page_mapcount() for compound pages */
-int __page_mapcount(struct page *page)
+void folio_copy(struct folio *dst, struct folio *src)
{
- int ret;
-
- ret = atomic_read(&page->_mapcount) + 1;
- /*
- * For file THP page->_mapcount contains total number of mapping
- * of the page: no need to look into compound_mapcount.
- */
- if (!PageAnon(page) && !PageHuge(page))
- return ret;
- page = compound_head(page);
- ret += atomic_read(compound_mapcount_ptr(page)) + 1;
- if (PageDoubleMap(page))
- ret--;
- return ret;
+ long i = 0;
+ long nr = folio_nr_pages(src);
+
+ for (;;) {
+ copy_highpage(folio_page(dst, i), folio_page(src, i));
+ if (++i == nr)
+ break;
+ cond_resched();
+ }
}
-EXPORT_SYMBOL_GPL(__page_mapcount);
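
A migration-style sketch of folio_copy(): allocate a destination of the same order, then copy page by page (which may sleep for large folios, as noted above). folio_alloc() is the standard allocator; error handling is trimmed.

	struct folio *dst = folio_alloc(GFP_KERNEL, folio_order(src));

	if (!dst)
		return -ENOMEM;
	folio_copy(dst, src);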
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
int sysctl_overcommit_ratio __read_mostly = 50;
@@ -755,14 +830,14 @@ int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
- int new_policy;
+ int new_policy = -1;
int ret;
/*
* The deviation of sync_overcommit_as could be big with loose policy
* like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
* strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
- * with the strict "NEVER", and to avoid possible race condtion (even
+ * with the strict "NEVER", and to avoid possible race condition (even
* though user usually won't too frequently do the switching to policy
* OVERCOMMIT_NEVER), the switch is done in the following order:
* 1. changing the batch
@@ -773,7 +848,7 @@ int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
t = *table;
t.data = &new_policy;
ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
- if (ret)
+ if (ret || new_policy == -1)
return ret;
mm_compute_batch(new_policy);
@@ -846,7 +921,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
* succeed and -ENOMEM implies there is not.
*
* We currently support three overcommit policies, which are set via the
- * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting.rst
+ * vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst
*
* Strict overcommit modes added 2002 Feb 26 by Alan Cox.
* Additional code 2002 Jul 20 by Robert Love.
@@ -893,6 +968,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
error:
+ pr_warn_ratelimited("%s: pid: %d, comm: %s, not enough memory for the allocation\n",
+ __func__, current->pid, current->comm);
vm_unacct_memory(pages);
return -ENOMEM;
@@ -969,3 +1046,94 @@ int __weak memcmp_pages(struct page *page1, struct page *page2)
kunmap_atomic(addr1);
return ret;
}
+
+#ifdef CONFIG_PRINTK
+/**
+ * mem_dump_obj - Print available provenance information
+ * @object: object for which to find provenance information.
+ *
+ * This function uses pr_cont(), so that the caller is expected to have
+ * printed out whatever preamble is appropriate. The provenance information
+ * depends on the type of object and on how much debugging is enabled.
+ * For example, for a slab-cache object, the slab name is printed, and,
+ * if available, the return address and stack trace from the allocation
+ * and last free path of that object.
+ */
+void mem_dump_obj(void *object)
+{
+ const char *type;
+
+ if (kmem_valid_obj(object)) {
+ kmem_dump_obj(object);
+ return;
+ }
+
+ if (vmalloc_dump_obj(object))
+ return;
+
+ if (is_vmalloc_addr(object))
+ type = "vmalloc memory";
+ else if (virt_addr_valid(object))
+ type = "non-slab/vmalloc memory";
+ else if (object == NULL)
+ type = "NULL pointer";
+ else if (object == ZERO_SIZE_PTR)
+ type = "zero-size pointer";
+ else
+ type = "non-paged memory";
+
+ pr_cont(" %s\n", type);
+}
+EXPORT_SYMBOL_GPL(mem_dump_obj);
+#endif
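
Because the output is emitted with pr_cont(), the caller prints the preamble itself without a trailing newline. A hedged sketch (obj is a placeholder pointer being diagnosed):

	pr_err("unexpected object state, addr=%px:", obj);
	mem_dump_obj(obj);	/* appends e.g. " slab kmalloc-128 ..." or " vmalloc memory ..." */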
+
+/*
+ * A driver might set a page logically offline -- PageOffline() -- and
+ * turn the page inaccessible in the hypervisor; after that, access to page
+ * content can be fatal.
+ *
+ * Some special PFN walkers -- e.g., /proc/kcore -- read content of random
+ * pages after checking PageOffline(); however, these PFN walkers can race
+ * with drivers that set PageOffline().
+ *
+ * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
+ * synchronize with such drivers, achieving that a page cannot be set
+ * PageOffline() while frozen.
+ *
+ * page_offline_begin()/page_offline_end() is used by drivers that care about
+ * such races when setting a page PageOffline().
+ */
+static DECLARE_RWSEM(page_offline_rwsem);
+
+void page_offline_freeze(void)
+{
+ down_read(&page_offline_rwsem);
+}
+
+void page_offline_thaw(void)
+{
+ up_read(&page_offline_rwsem);
+}
+
+void page_offline_begin(void)
+{
+ down_write(&page_offline_rwsem);
+}
+EXPORT_SYMBOL(page_offline_begin);
+
+void page_offline_end(void)
+{
+ up_write(&page_offline_rwsem);
+}
+EXPORT_SYMBOL(page_offline_end);
+
+#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO
+void flush_dcache_folio(struct folio *folio)
+{
+ long i, nr = folio_nr_pages(folio);
+
+ for (i = 0; i < nr; i++)
+ flush_dcache_page(folio_page(folio, i));
+}
+EXPORT_SYMBOL(flush_dcache_folio);
+#endif
diff --git a/mm/vmacache.c b/mm/vmacache.c
deleted file mode 100644
index 01a6e6688ec1..000000000000
--- a/mm/vmacache.c
+++ /dev/null
@@ -1,117 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2014 Davidlohr Bueso.
- */
-#include <linux/sched/signal.h>
-#include <linux/sched/task.h>
-#include <linux/mm.h>
-#include <linux/vmacache.h>
-
-/*
- * Hash based on the pmd of addr if configured with MMU, which provides a good
- * hit rate for workloads with spatial locality. Otherwise, use pages.
- */
-#ifdef CONFIG_MMU
-#define VMACACHE_SHIFT PMD_SHIFT
-#else
-#define VMACACHE_SHIFT PAGE_SHIFT
-#endif
-#define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK)
-
-/*
- * This task may be accessing a foreign mm via (for example)
- * get_user_pages()->find_vma(). The vmacache is task-local and this
- * task's vmacache pertains to a different mm (ie, its own). There is
- * nothing we can do here.
- *
- * Also handle the case where a kernel thread has adopted this mm via
- * kthread_use_mm(). That kernel thread's vmacache is not applicable to this mm.
- */
-static inline bool vmacache_valid_mm(struct mm_struct *mm)
-{
- return current->mm == mm && !(current->flags & PF_KTHREAD);
-}
-
-void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
-{
- if (vmacache_valid_mm(newvma->vm_mm))
- current->vmacache.vmas[VMACACHE_HASH(addr)] = newvma;
-}
-
-static bool vmacache_valid(struct mm_struct *mm)
-{
- struct task_struct *curr;
-
- if (!vmacache_valid_mm(mm))
- return false;
-
- curr = current;
- if (mm->vmacache_seqnum != curr->vmacache.seqnum) {
- /*
- * First attempt will always be invalid, initialize
- * the new cache for this task here.
- */
- curr->vmacache.seqnum = mm->vmacache_seqnum;
- vmacache_flush(curr);
- return false;
- }
- return true;
-}
-
-struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
-{
- int idx = VMACACHE_HASH(addr);
- int i;
-
- count_vm_vmacache_event(VMACACHE_FIND_CALLS);
-
- if (!vmacache_valid(mm))
- return NULL;
-
- for (i = 0; i < VMACACHE_SIZE; i++) {
- struct vm_area_struct *vma = current->vmacache.vmas[idx];
-
- if (vma) {
-#ifdef CONFIG_DEBUG_VM_VMACACHE
- if (WARN_ON_ONCE(vma->vm_mm != mm))
- break;
-#endif
- if (vma->vm_start <= addr && vma->vm_end > addr) {
- count_vm_vmacache_event(VMACACHE_FIND_HITS);
- return vma;
- }
- }
- if (++idx == VMACACHE_SIZE)
- idx = 0;
- }
-
- return NULL;
-}
-
-#ifndef CONFIG_MMU
-struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
- unsigned long start,
- unsigned long end)
-{
- int idx = VMACACHE_HASH(start);
- int i;
-
- count_vm_vmacache_event(VMACACHE_FIND_CALLS);
-
- if (!vmacache_valid(mm))
- return NULL;
-
- for (i = 0; i < VMACACHE_SIZE; i++) {
- struct vm_area_struct *vma = current->vmacache.vmas[idx];
-
- if (vma && vma->vm_start == start && vma->vm_end == end) {
- count_vm_vmacache_event(VMACACHE_FIND_HITS);
- return vma;
- }
- if (++idx == VMACACHE_SIZE)
- idx = 0;
- }
-
- return NULL;
-}
-#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 04ac98bf5045..ef8599d394fd 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/mm/vmalloc.c
- *
* Copyright (C) 1993 Linus Torvalds
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
@@ -27,26 +25,59 @@
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
+#include <linux/io.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
+#include <linux/memcontrol.h>
#include <linux/llist.h>
+#include <linux/uio.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
-
-#include <linux/uaccess.h>
+#include <linux/pgtable.h>
+#include <linux/hugetlb.h>
+#include <linux/sched/mm.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/vmalloc.h>
+
#include "internal.h"
#include "pgalloc-track.h"
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1;
+
+static int __init set_nohugeiomap(char *str)
+{
+ ioremap_max_page_shift = PAGE_SHIFT;
+ return 0;
+}
+early_param("nohugeiomap", set_nohugeiomap);
+#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+static bool __ro_after_init vmap_allow_huge = true;
+
+static int __init set_nohugevmalloc(char *str)
+{
+ vmap_allow_huge = false;
+ return 0;
+}
+early_param("nohugevmalloc", set_nohugevmalloc);
+#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+static const bool vmap_allow_huge = false;
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
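
Both switches are ordinary early boot parameters; as an illustration, huge ioremap and huge vmalloc mappings can be disabled by appending the following to the kernel command line:

	... nohugeiomap nohugevmalloc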
+
bool is_vmalloc_addr(const void *x)
{
- unsigned long addr = (unsigned long)x;
+ unsigned long addr = (unsigned long)kasan_reset_tag(x);
return addr >= VMALLOC_START && addr < VMALLOC_END;
}
@@ -58,18 +89,234 @@ struct vfree_deferred {
};
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
-static void __vunmap(const void *, int);
+/*** Page table manipulation functions ***/
+static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ pte_t *pte;
+ u64 pfn;
+ unsigned long size = PAGE_SIZE;
+
+ pfn = phys_addr >> PAGE_SHIFT;
+ pte = pte_alloc_kernel_track(pmd, addr, mask);
+ if (!pte)
+ return -ENOMEM;
+ do {
+ BUG_ON(!pte_none(ptep_get(pte)));
+
+#ifdef CONFIG_HUGETLB_PAGE
+ size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
+ if (size != PAGE_SIZE) {
+ pte_t entry = pfn_pte(pfn, prot);
+
+ entry = arch_make_huge_pte(entry, ilog2(size), 0);
+ set_huge_pte_at(&init_mm, addr, pte, entry);
+ pfn += PFN_DOWN(size);
+ continue;
+ }
+#endif
+ set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
+ pfn++;
+ } while (pte += PFN_DOWN(size), addr += size, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
+ return 0;
+}
-static void free_work(struct work_struct *w)
+static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
{
- struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
- struct llist_node *t, *llnode;
+ if (max_page_shift < PMD_SHIFT)
+ return 0;
- llist_for_each_safe(llnode, t, llist_del_all(&p->list))
- __vunmap((void *)llnode, 1);
+ if (!arch_vmap_pmd_supported(prot))
+ return 0;
+
+ if ((end - addr) != PMD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PMD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PMD_SIZE))
+ return 0;
+
+ if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
+ return 0;
+
+ return pmd_set_huge(pmd, phys_addr, prot);
}
-/*** Page table manipulation functions ***/
+static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
+ if (!pmd)
+ return -ENOMEM;
+ do {
+ next = pmd_addr_end(addr, end);
+
+ if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
+ max_page_shift)) {
+ *mask |= PGTBL_PMD_MODIFIED;
+ continue;
+ }
+
+ if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask))
+ return -ENOMEM;
+ } while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (max_page_shift < PUD_SHIFT)
+ return 0;
+
+ if (!arch_vmap_pud_supported(prot))
+ return 0;
+
+ if ((end - addr) != PUD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PUD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PUD_SIZE))
+ return 0;
+
+ if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
+ return 0;
+
+ return pud_set_huge(pud, phys_addr, prot);
+}
+
+static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_alloc_track(&init_mm, p4d, addr, mask);
+ if (!pud)
+ return -ENOMEM;
+ do {
+ next = pud_addr_end(addr, end);
+
+ if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
+ max_page_shift)) {
+ *mask |= PGTBL_PUD_MODIFIED;
+ continue;
+ }
+
+ if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
+ max_page_shift, mask))
+ return -ENOMEM;
+ } while (pud++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (max_page_shift < P4D_SHIFT)
+ return 0;
+
+ if (!arch_vmap_p4d_supported(prot))
+ return 0;
+
+ if ((end - addr) != P4D_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, P4D_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, P4D_SIZE))
+ return 0;
+
+ if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
+ return 0;
+
+ return p4d_set_huge(p4d, phys_addr, prot);
+}
+
+static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
+ if (!p4d)
+ return -ENOMEM;
+ do {
+ next = p4d_addr_end(addr, end);
+
+ if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
+ max_page_shift)) {
+ *mask |= PGTBL_P4D_MODIFIED;
+ continue;
+ }
+
+ if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
+ max_page_shift, mask))
+ return -ENOMEM;
+ } while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_range_noflush(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ pgd_t *pgd;
+ unsigned long start;
+ unsigned long next;
+ int err;
+ pgtbl_mod_mask mask = 0;
+
+ might_sleep();
+ BUG_ON(addr >= end);
+
+ start = addr;
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
+ max_page_shift, &mask);
+ if (err)
+ break;
+ } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
+
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
+
+ return err;
+}
+
+int ioremap_page_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot)
+{
+ int err;
+
+ err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
+ ioremap_max_page_shift);
+ flush_cache_vmap(addr, end);
+ if (!err)
+ err = kmsan_ioremap_page_range(addr, end, phys_addr, prot,
+ ioremap_max_page_shift);
+ return err;
+}
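
For context, a condensed sketch of how a generic ioremap() implementation might drive this helper (in the spirit of mm/ioremap.c; offset handling and flag setup are simplified, and the surrounding function is assumed):

	struct vm_struct *area;
	unsigned long vaddr;

	area = get_vm_area_caller(size, VM_IOREMAP, __builtin_return_address(0));
	if (!area)
		return NULL;
	vaddr = (unsigned long)area->addr;
	if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
		free_vm_area(area);
		return NULL;
	}
	return (void __iomem *)vaddr;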
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
@@ -137,40 +384,35 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
{
p4d_t *p4d;
unsigned long next;
- int cleared;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
- cleared = p4d_clear_huge(p4d);
- if (cleared || p4d_bad(*p4d))
+ p4d_clear_huge(p4d);
+ if (p4d_bad(*p4d))
*mask |= PGTBL_P4D_MODIFIED;
- if (cleared)
- continue;
if (p4d_none_or_clear_bad(p4d))
continue;
vunmap_pud_range(p4d, addr, next, mask);
} while (p4d++, addr = next, addr != end);
}
-/**
- * unmap_kernel_range_noflush - unmap kernel VM area
- * @start: start of the VM area to unmap
- * @size: size of the VM area to unmap
+/*
+ * vunmap_range_noflush is similar to vunmap_range, but does not
+ * flush caches or TLBs.
*
- * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify
- * should have been allocated using get_vm_area() and its friends.
+ * The caller is responsible for calling flush_cache_vunmap() before calling
+ * this function, and flush_tlb_kernel_range() after it has returned
+ * successfully (and before the addresses are expected to cause a page fault
+ * or be re-mapped for something else, if TLB flushes are being delayed or
+ * coalesced).
*
- * NOTE:
- * This function does NOT do any cache flushing. The caller is responsible
- * for calling flush_cache_vunmap() on to-be-mapped areas before calling this
- * function and flush_tlb_kernel_range() after.
+ * This is an internal function only. Do not use outside mm/.
*/
-void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
+void __vunmap_range_noflush(unsigned long start, unsigned long end)
{
- unsigned long end = start + size;
unsigned long next;
pgd_t *pgd;
unsigned long addr = start;
@@ -191,7 +433,29 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
arch_sync_kernel_mappings(start, end);
}
-static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
+void vunmap_range_noflush(unsigned long start, unsigned long end)
+{
+ kmsan_vunmap_range_noflush(start, end);
+ __vunmap_range_noflush(start, end);
+}
+
+/**
+ * vunmap_range - unmap kernel virtual addresses
+ * @addr: start of the VM area to unmap
+ * @end: end of the VM area to unmap (non-inclusive)
+ *
+ * Clears any present PTEs in the virtual address range, flushes TLBs and
+ * caches. Any subsequent access to the address before it has been re-mapped
+ * is a kernel bug.
+ */
+void vunmap_range(unsigned long addr, unsigned long end)
+{
+ flush_cache_vunmap(addr, end);
+ vunmap_range_noflush(addr, end);
+ flush_tlb_kernel_range(addr, end);
+}
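
The flushing contract above, spelled out as a sketch: the *_noflush variant with caller-managed flushes (only valid inside mm/), versus the one-call form. start and end are placeholders.

	/* Inside mm/, with caller-managed flushing: */
	flush_cache_vunmap(start, end);
	vunmap_range_noflush(start, end);
	flush_tlb_kernel_range(start, end);

	/* Everywhere else: */
	vunmap_range(start, end);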
+
+static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -208,10 +472,13 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
do {
struct page *page = pages[*nr];
- if (WARN_ON(!pte_none(*pte)))
+ if (WARN_ON(!pte_none(ptep_get(pte))))
return -EBUSY;
if (WARN_ON(!page))
return -ENOMEM;
+ if (WARN_ON(!pfn_valid(page_to_pfn(page))))
+ return -EINVAL;
+
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -219,7 +486,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
-static int vmap_pmd_range(pud_t *pud, unsigned long addr,
+static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -231,13 +498,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr,
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
- if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask))
+ if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pmd++, addr = next, addr != end);
return 0;
}
-static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
+static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -249,13 +516,13 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
- if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask))
+ if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pud++, addr = next, addr != end);
return 0;
}
-static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
+static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -267,37 +534,18 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
return -ENOMEM;
do {
next = p4d_addr_end(addr, end);
- if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask))
+ if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (p4d++, addr = next, addr != end);
return 0;
}
-/**
- * map_kernel_range_noflush - map kernel VM area with the specified pages
- * @addr: start of the VM area to map
- * @size: size of the VM area to map
- * @prot: page protection flags to use
- * @pages: pages to map
- *
- * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should
- * have been allocated using get_vm_area() and its friends.
- *
- * NOTE:
- * This function does NOT do any cache flushing. The caller is responsible for
- * calling flush_cache_vmap() on to-be-mapped areas before calling this
- * function.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int map_kernel_range_noflush(unsigned long addr, unsigned long size,
- pgprot_t prot, struct page **pages)
+static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages)
{
unsigned long start = addr;
- unsigned long end = addr + size;
- unsigned long next;
pgd_t *pgd;
+ unsigned long next;
int err = 0;
int nr = 0;
pgtbl_mod_mask mask = 0;
@@ -308,7 +556,7 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
next = pgd_addr_end(addr, end);
if (pgd_bad(*pgd))
mask |= PGTBL_PGD_MODIFIED;
- err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
+ err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
return err;
} while (pgd++, addr = next, addr != end);
@@ -319,14 +567,72 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
return 0;
}
-int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
- struct page **pages)
+/*
+ * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
+ * flush caches.
+ *
+ * The caller is responsible for calling flush_cache_vmap() after this
+ * function returns successfully and before the addresses are accessed.
+ *
+ * This is an internal function only. Do not use outside mm/.
+ */
+int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
{
- int ret;
+ unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
+
+ WARN_ON(page_shift < PAGE_SHIFT);
+
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
+ page_shift == PAGE_SHIFT)
+ return vmap_small_pages_range_noflush(addr, end, prot, pages);
+
+ for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
+ int err;
+
+ err = vmap_range_noflush(addr, addr + (1UL << page_shift),
+ page_to_phys(pages[i]), prot,
+ page_shift);
+ if (err)
+ return err;
+
+ addr += 1UL << page_shift;
+ }
+
+ return 0;
+}
+
+int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+ int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
+ page_shift);
+
+ if (ret)
+ return ret;
+ return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
+}
+
+/**
+ * vmap_pages_range - map pages to a kernel virtual address
+ * @addr: start of the VM area to map
+ * @end: end of the VM area to map (non-inclusive)
+ * @prot: page protection flags to use
+ * @pages: pages to map (always PAGE_SIZE pages)
+ * @page_shift: maximum shift that the pages may be mapped with, @pages must
+ * be aligned and contiguous up to at least this shift.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int vmap_pages_range(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+ int err;
- ret = map_kernel_range_noflush(start, size, prot, pages);
- flush_cache_vmap(start, start + size);
- return ret;
+ err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
+ flush_cache_vmap(addr, end);
+ return err;
}
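
vmap_pages_range() itself is static; the usual public route into this path is vmap(). A minimal sketch mapping an array of order-0 pages and tearing the mapping down again (pages and nr_pages are placeholders):

	void *va = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);

	if (!va)
		return -ENOMEM;
	/* ... */
	vunmap(va);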
int is_vmalloc_or_module_addr(const void *x)
@@ -337,15 +643,18 @@ int is_vmalloc_or_module_addr(const void *x)
* just put it in the vmalloc space.
*/
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
- unsigned long addr = (unsigned long)x;
+ unsigned long addr = (unsigned long)kasan_reset_tag(x);
if (addr >= MODULES_VADDR && addr < MODULES_END)
return 1;
#endif
return is_vmalloc_addr(x);
}
+EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);
/*
- * Walk a vmap address to the struct page it maps.
+ * Walk a vmap address to the struct page it maps. Huge vmap mappings will
+ * return the tail page that corresponds to the base page address, which
+ * matches small vmap mappings.
*/
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
@@ -365,32 +674,40 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
if (pgd_none(*pgd))
return NULL;
+ if (WARN_ON_ONCE(pgd_leaf(*pgd)))
+ return NULL; /* XXX: no allowance for huge pgd */
+ if (WARN_ON_ONCE(pgd_bad(*pgd)))
+ return NULL;
+
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d))
return NULL;
- pud = pud_offset(p4d, addr);
+ if (p4d_leaf(*p4d))
+ return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(p4d_bad(*p4d)))
+ return NULL;
- /*
- * Don't dereference bad PUD or PMD (below) entries. This will also
- * identify huge mappings, which we may encounter on architectures
- * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
- * identified as vmalloc addresses by is_vmalloc_addr(), but are
- * not [unambiguously] associated with a struct page, so there is
- * no correct value to return for them.
- */
- WARN_ON_ONCE(pud_bad(*pud));
- if (pud_none(*pud) || pud_bad(*pud))
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
+ return NULL;
+ if (pud_leaf(*pud))
+ return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(pud_bad(*pud)))
return NULL;
+
pmd = pmd_offset(pud, addr);
- WARN_ON_ONCE(pmd_bad(*pmd));
- if (pmd_none(*pmd) || pmd_bad(*pmd))
+ if (pmd_none(*pmd))
+ return NULL;
+ if (pmd_leaf(*pmd))
+ return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(pmd_bad(*pmd)))
return NULL;
- ptep = pte_offset_map(pmd, addr);
- pte = *ptep;
+ ptep = pte_offset_kernel(pmd, addr);
+ pte = ptep_get(ptep);
if (pte_present(pte))
page = pte_page(pte);
- pte_unmap(ptep);
+
return page;
}
EXPORT_SYMBOL(vmalloc_to_page);
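
A typical caller walks a vmalloc buffer page by page, for example to build a scatterlist; a sketch where buf (a char *) and i are placeholders:

	struct page *page = vmalloc_to_page(buf + i * PAGE_SIZE);
	unsigned long pfn = page_to_pfn(page);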
@@ -415,10 +732,13 @@ static DEFINE_SPINLOCK(vmap_area_lock);
static DEFINE_SPINLOCK(free_vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
-static LLIST_HEAD(vmap_purge_list);
static struct rb_root vmap_area_root = RB_ROOT;
static bool vmap_initialized __read_mostly;
+static struct rb_root purge_vmap_area_root = RB_ROOT;
+static LIST_HEAD(purge_vmap_area_list);
+static DEFINE_SPINLOCK(purge_vmap_area_lock);
+
/*
* This kmem_cache is used for vmap_area objects. Instead of
* allocating from slab we reuse an object from this cache to
@@ -467,23 +787,13 @@ get_subtree_max_size(struct rb_node *node)
return va ? va->subtree_max_size : 0;
}
-/*
- * Gets called when remove the node and rotate.
- */
-static __always_inline unsigned long
-compute_subtree_max_size(struct vmap_area *va)
-{
- return max3(va_size(va),
- get_subtree_max_size(va->rb_node.rb_left),
- get_subtree_max_size(va->rb_node.rb_right));
-}
-
RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
-static void purge_vmap_area_lazy(void);
+static void reclaim_and_purge_vmap_areas(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
-static unsigned long lazy_max_pages(void);
+static void drain_vmap_area_work(struct work_struct *work);
+static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
static atomic_long_t nr_vmalloc_pages;
@@ -492,10 +802,37 @@ unsigned long vmalloc_nr_pages(void)
return atomic_long_read(&nr_vmalloc_pages);
}
-static struct vmap_area *__find_vmap_area(unsigned long addr)
+/* Look up the first VA which satisfies addr < va_end, NULL if none. */
+static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
{
+ struct vmap_area *va = NULL;
struct rb_node *n = vmap_area_root.rb_node;
+ addr = (unsigned long)kasan_reset_tag((void *)addr);
+
+ while (n) {
+ struct vmap_area *tmp;
+
+ tmp = rb_entry(n, struct vmap_area, rb_node);
+ if (tmp->va_end > addr) {
+ va = tmp;
+ if (tmp->va_start <= addr)
+ break;
+
+ n = n->rb_left;
+ } else
+ n = n->rb_right;
+ }
+
+ return va;
+}
+
+static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
+{
+ struct rb_node *n = root->rb_node;
+
+ addr = (unsigned long)kasan_reset_tag((void *)addr);
+
while (n) {
struct vmap_area *va;
@@ -550,11 +887,9 @@ find_va_links(struct vmap_area *va,
* Trigger the BUG() if there are sides(left/right)
* or full overlaps.
*/
- if (va->va_start < tmp_va->va_end &&
- va->va_end <= tmp_va->va_start)
+ if (va->va_end <= tmp_va->va_start)
link = &(*link)->rb_left;
- else if (va->va_end > tmp_va->va_start &&
- va->va_start >= tmp_va->va_end)
+ else if (va->va_start >= tmp_va->va_end)
link = &(*link)->rb_right;
else {
WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
@@ -587,8 +922,9 @@ get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
}
static __always_inline void
-link_va(struct vmap_area *va, struct rb_root *root,
- struct rb_node *parent, struct rb_node **link, struct list_head *head)
+__link_va(struct vmap_area *va, struct rb_root *root,
+ struct rb_node *parent, struct rb_node **link,
+ struct list_head *head, bool augment)
{
/*
* VA is still not in the list, but we can
@@ -602,12 +938,12 @@ link_va(struct vmap_area *va, struct rb_root *root,
/* Insert to the rb-tree */
rb_link_node(&va->rb_node, parent, link);
- if (root == &free_vmap_area_root) {
+ if (augment) {
/*
* Some explanation here. Just perform simple insertion
* to the tree. We do not set va->subtree_max_size to
* its current size before calling rb_insert_augmented().
- * It is because of we populate the tree from the bottom
+ * It is because we populate the tree from the bottom
* to parent levels when the node _is_ in the tree.
*
* Therefore we set subtree_max_size to zero after insertion,
@@ -626,22 +962,61 @@ link_va(struct vmap_area *va, struct rb_root *root,
}
static __always_inline void
-unlink_va(struct vmap_area *va, struct rb_root *root)
+link_va(struct vmap_area *va, struct rb_root *root,
+ struct rb_node *parent, struct rb_node **link,
+ struct list_head *head)
+{
+ __link_va(va, root, parent, link, head, false);
+}
+
+static __always_inline void
+link_va_augment(struct vmap_area *va, struct rb_root *root,
+ struct rb_node *parent, struct rb_node **link,
+ struct list_head *head)
+{
+ __link_va(va, root, parent, link, head, true);
+}
+
+static __always_inline void
+__unlink_va(struct vmap_area *va, struct rb_root *root, bool augment)
{
if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
return;
- if (root == &free_vmap_area_root)
+ if (augment)
rb_erase_augmented(&va->rb_node,
root, &free_vmap_area_rb_augment_cb);
else
rb_erase(&va->rb_node, root);
- list_del(&va->list);
+ list_del_init(&va->list);
RB_CLEAR_NODE(&va->rb_node);
}
+static __always_inline void
+unlink_va(struct vmap_area *va, struct rb_root *root)
+{
+ __unlink_va(va, root, false);
+}
+
+static __always_inline void
+unlink_va_augment(struct vmap_area *va, struct rb_root *root)
+{
+ __unlink_va(va, root, true);
+}
+
#if DEBUG_AUGMENT_PROPAGATE_CHECK
+/*
+ * Gets called when remove the node and rotate.
+ */
+static __always_inline unsigned long
+compute_subtree_max_size(struct vmap_area *va)
+{
+ return max3(va_size(va),
+ get_subtree_max_size(va->rb_node.rb_left),
+ get_subtree_max_size(va->rb_node.rb_right));
+}
+
static void
augment_tree_propagate_check(void)
{
@@ -725,7 +1100,7 @@ insert_vmap_area_augment(struct vmap_area *va,
link = find_va_links(va, root, NULL, &parent);
if (link) {
- link_va(va, root, parent, link, head);
+ link_va_augment(va, root, parent, link, head);
augment_tree_propagate_from(va);
}
}
@@ -742,8 +1117,8 @@ insert_vmap_area_augment(struct vmap_area *va,
* ongoing.
*/
static __always_inline struct vmap_area *
-merge_or_add_vmap_area(struct vmap_area *va,
- struct rb_root *root, struct list_head *head)
+__merge_or_add_vmap_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head, bool augment)
{
struct vmap_area *sibling;
struct list_head *next;
@@ -805,7 +1180,7 @@ merge_or_add_vmap_area(struct vmap_area *va,
* "normalized" because of rotation operations.
*/
if (merged)
- unlink_va(va, root);
+ __unlink_va(va, root, augment);
sibling->va_end = va->va_end;
@@ -820,12 +1195,26 @@ merge_or_add_vmap_area(struct vmap_area *va,
insert:
if (!merged)
- link_va(va, root, parent, link, head);
+ __link_va(va, root, parent, link, head, augment);
+
+ return va;
+}
+
+static __always_inline struct vmap_area *
+merge_or_add_vmap_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head)
+{
+ return __merge_or_add_vmap_area(va, root, head, false);
+}
+
+static __always_inline struct vmap_area *
+merge_or_add_vmap_area_augment(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head)
+{
+ va = __merge_or_add_vmap_area(va, root, head, true);
+ if (va)
+ augment_tree_propagate_from(va);
- /*
- * Last step is to check and update the tree.
- */
- augment_tree_propagate_from(va);
return va;
}
@@ -851,21 +1240,23 @@ is_within_this_va(struct vmap_area *va, unsigned long size,
/*
* Find the first free block(lowest start address) in the tree,
* that will accomplish the request corresponding to passing
- * parameters.
+ * parameters. Please note, with an alignment bigger than PAGE_SIZE,
+ * a search length is adjusted to account for worst case alignment
+ * overhead.
*/
static __always_inline struct vmap_area *
-find_vmap_lowest_match(unsigned long size,
- unsigned long align, unsigned long vstart)
+find_vmap_lowest_match(struct rb_root *root, unsigned long size,
+ unsigned long align, unsigned long vstart, bool adjust_search_size)
{
struct vmap_area *va;
struct rb_node *node;
unsigned long length;
/* Start from the root. */
- node = free_vmap_area_root.rb_node;
+ node = root->rb_node;
/* Adjust the search size for alignment overhead. */
- length = size + align - 1;
+ length = adjust_search_size ? size + align - 1 : size;
while (node) {
va = rb_entry(node, struct vmap_area, rb_node);
@@ -890,7 +1281,8 @@ find_vmap_lowest_match(unsigned long size,
/*
* OK. We roll back and find the first right sub-tree,
* that will satisfy the search criteria. It can happen
- * only once due to "vstart" restriction.
+ * due to "vstart" restriction or an alignment overhead
+ * that is bigger than PAGE_SIZE.
*/
while ((node = rb_parent(node))) {
va = rb_entry(node, struct vmap_area, rb_node);
@@ -899,6 +1291,13 @@ find_vmap_lowest_match(unsigned long size,
if (get_subtree_max_size(node->rb_right) >= length &&
vstart <= va->va_start) {
+ /*
+ * Shift the vstart forward. Please note, we update it with
+ * parent's start address adding "1" because we do not want
+ * to enter the same sub-tree after it has already been checked
+ * and no suitable free block found there.
+ */
+ vstart = va->va_start + 1;
node = node->rb_right;
break;
}
@@ -913,12 +1312,12 @@ find_vmap_lowest_match(unsigned long size,
#include <linux/random.h>
static struct vmap_area *
-find_vmap_lowest_linear_match(unsigned long size,
+find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
unsigned long align, unsigned long vstart)
{
struct vmap_area *va;
- list_for_each_entry(va, &free_vmap_area_list, list) {
+ list_for_each_entry(va, head, list) {
if (!is_within_this_va(va, size, align, vstart))
continue;
@@ -929,7 +1328,8 @@ find_vmap_lowest_linear_match(unsigned long size,
}
static void
-find_vmap_lowest_match_check(unsigned long size)
+find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head,
+ unsigned long size, unsigned long align)
{
struct vmap_area *va_1, *va_2;
unsigned long vstart;
@@ -938,8 +1338,8 @@ find_vmap_lowest_match_check(unsigned long size)
get_random_bytes(&rnd, sizeof(rnd));
vstart = VMALLOC_START + rnd;
- va_1 = find_vmap_lowest_match(size, 1, vstart);
- va_2 = find_vmap_lowest_linear_match(size, 1, vstart);
+ va_1 = find_vmap_lowest_match(root, size, align, vstart, false);
+ va_2 = find_vmap_lowest_linear_match(head, size, align, vstart);
if (va_1 != va_2)
pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
@@ -982,11 +1382,12 @@ classify_va_fit_type(struct vmap_area *va,
}
static __always_inline int
-adjust_va_to_fit_type(struct vmap_area *va,
- unsigned long nva_start_addr, unsigned long size,
- enum fit_type type)
+adjust_va_to_fit_type(struct rb_root *root, struct list_head *head,
+ struct vmap_area *va, unsigned long nva_start_addr,
+ unsigned long size)
{
struct vmap_area *lva = NULL;
+ enum fit_type type = classify_va_fit_type(va, nva_start_addr, size);
if (type == FL_FIT_TYPE) {
/*
@@ -996,7 +1397,7 @@ adjust_va_to_fit_type(struct vmap_area *va,
* V NVA V
* |---------------|
*/
- unlink_va(va, &free_vmap_area_root);
+ unlink_va_augment(va, root);
kmem_cache_free(vmap_area_cachep, va);
} else if (type == LE_FIT_TYPE) {
/*
@@ -1074,8 +1475,7 @@ adjust_va_to_fit_type(struct vmap_area *va,
augment_tree_propagate_from(va);
if (lva) /* type == NE_FIT_TYPE */
- insert_vmap_area_augment(lva, &va->rb_node,
- &free_vmap_area_root, &free_vmap_area_list);
+ insert_vmap_area_augment(lva, &va->rb_node, root, head);
}
return 0;
@@ -1086,15 +1486,28 @@ adjust_va_to_fit_type(struct vmap_area *va,
* Otherwise a vend is returned that indicates failure.
*/
static __always_inline unsigned long
-__alloc_vmap_area(unsigned long size, unsigned long align,
+__alloc_vmap_area(struct rb_root *root, struct list_head *head,
+ unsigned long size, unsigned long align,
unsigned long vstart, unsigned long vend)
{
+ bool adjust_search_size = true;
unsigned long nva_start_addr;
struct vmap_area *va;
- enum fit_type type;
int ret;
- va = find_vmap_lowest_match(size, align, vstart);
+ /*
+ * Do not adjust when:
+ * a) align <= PAGE_SIZE, because it does not make any sense.
+ * All blocks(their start addresses) are at least PAGE_SIZE
+ * aligned anyway;
+ * b) a short range where a requested size corresponds to exactly
+ * specified [vstart:vend] interval and an alignment > PAGE_SIZE.
+ * With adjusted search length an allocation would not succeed.
+ */
+ if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
+ adjust_search_size = false;
+
+ va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
if (unlikely(!va))
return vend;
@@ -1107,18 +1520,13 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
if (nva_start_addr + size > vend)
return vend;
- /* Classify what we have found. */
- type = classify_va_fit_type(va, nva_start_addr, size);
- if (WARN_ON_ONCE(type == NOTHING_FIT))
- return vend;
-
/* Update the free vmap_area. */
- ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
- if (ret)
+ ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size);
+ if (WARN_ON_ONCE(ret))
return vend;
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
- find_vmap_lowest_match_check(size);
+ find_vmap_lowest_match_check(root, head, size, align);
#endif
return nva_start_addr;
@@ -1140,10 +1548,33 @@ static void free_vmap_area(struct vmap_area *va)
* Insert/Merge it back to the free tree/list.
*/
spin_lock(&free_vmap_area_lock);
- merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
+ merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
spin_unlock(&free_vmap_area_lock);
}
+static inline void
+preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
+{
+ struct vmap_area *va = NULL;
+
+ /*
+ * Preload this CPU with one extra vmap_area object. It is used
+ * when fit type of free area is NE_FIT_TYPE. It guarantees that
+ * a CPU that does an allocation is preloaded.
+ *
+ * We do it in non-atomic context, thus it allows us to use more
+ * permissive allocation masks to be more stable under low memory
+ * condition and high memory pressure.
+ */
+ if (!this_cpu_read(ne_fit_preload_node))
+ va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
+
+ spin_lock(lock);
+
+ if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va))
+ kmem_cache_free(vmap_area_cachep, va);
+}
+
/*
* Allocate a region of KVA of the specified size and alignment, within the
* vstart and vend.
@@ -1151,16 +1582,17 @@ static void free_vmap_area(struct vmap_area *va)
static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long align,
unsigned long vstart, unsigned long vend,
- int node, gfp_t gfp_mask)
+ int node, gfp_t gfp_mask,
+ unsigned long va_flags)
{
- struct vmap_area *va, *pva;
+ struct vmap_area *va;
+ unsigned long freed;
unsigned long addr;
int purged = 0;
int ret;
- BUG_ON(!size);
- BUG_ON(offset_in_page(size));
- BUG_ON(!is_power_of_2(align));
+ if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align)))
+ return ERR_PTR(-EINVAL);
if (unlikely(!vmap_initialized))
return ERR_PTR(-EBUSY);
@@ -1179,50 +1611,24 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
retry:
- /*
- * Preload this CPU with one extra vmap_area object. It is used
- * when fit type of free area is NE_FIT_TYPE. Please note, it
- * does not guarantee that an allocation occurs on a CPU that
- * is preloaded, instead we minimize the case when it is not.
- * It can happen because of cpu migration, because there is a
- * race until the below spinlock is taken.
- *
- * The preload is done in non-atomic context, thus it allows us
- * to use more permissive allocation masks to be more stable under
- * low memory condition and high memory pressure. In rare case,
- * if not preloaded, GFP_NOWAIT is used.
- *
- * Set "pva" to NULL here, because of "retry" path.
- */
- pva = NULL;
-
- if (!this_cpu_read(ne_fit_preload_node))
- /*
- * Even if it fails we do not really care about that.
- * Just proceed as it is. If needed "overflow" path
- * will refill the cache we allocate from.
- */
- pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
-
- spin_lock(&free_vmap_area_lock);
+ preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
+ addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
+ size, align, vstart, vend);
+ spin_unlock(&free_vmap_area_lock);
- if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
- kmem_cache_free(vmap_area_cachep, pva);
+ trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);
/*
* If an allocation fails, the "vend" address is
* returned. Therefore trigger the overflow path.
*/
- addr = __alloc_vmap_area(size, align, vstart, vend);
- spin_unlock(&free_vmap_area_lock);
-
if (unlikely(addr == vend))
goto overflow;
va->va_start = addr;
va->va_end = addr + size;
va->vm = NULL;
-
+ va->flags = va_flags;
spin_lock(&vmap_area_lock);
insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
@@ -1242,18 +1648,17 @@ retry:
overflow:
if (!purged) {
- purge_vmap_area_lazy();
+ reclaim_and_purge_vmap_areas();
purged = 1;
goto retry;
}
- if (gfpflags_allow_blocking(gfp_mask)) {
- unsigned long freed = 0;
- blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
- if (freed > 0) {
- purged = 0;
- goto retry;
- }
+ freed = 0;
+ blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
+
+ if (freed > 0) {
+ purged = 0;
+ goto retry;
}
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
@@ -1304,8 +1709,8 @@ static unsigned long lazy_max_pages(void)
static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
/*
- * Serialize vmap purging. There is no actual criticial section protected
- * by this look, but we want to avoid concurrent calls for performance
+ * Serialize vmap purging. There is no actual critical section protected
+ * by this lock, but we want to avoid concurrent calls for performance
* reasons and to make the pcpu_get_vm_areas more deterministic.
*/
static DEFINE_MUTEX(vmap_purge_lock);
@@ -1314,46 +1719,38 @@ static DEFINE_MUTEX(vmap_purge_lock);
static void purge_fragmented_blocks_allcpus(void);
/*
- * called before a call to iounmap() if the caller wants vm_area_struct's
- * immediately freed.
- */
-void set_iounmap_nonlazy(void)
-{
- atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
-}
-
-/*
* Purges all lazily-freed vmap areas.
*/
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
unsigned long resched_threshold;
- struct llist_node *valist;
- struct vmap_area *va;
- struct vmap_area *n_va;
+ unsigned int num_purged_areas = 0;
+ struct list_head local_purge_list;
+ struct vmap_area *va, *n_va;
lockdep_assert_held(&vmap_purge_lock);
- valist = llist_del_all(&vmap_purge_list);
- if (unlikely(valist == NULL))
- return false;
+ spin_lock(&purge_vmap_area_lock);
+ purge_vmap_area_root = RB_ROOT;
+ list_replace_init(&purge_vmap_area_list, &local_purge_list);
+ spin_unlock(&purge_vmap_area_lock);
- /*
- * TODO: to calculate a flush range without looping.
- * The list can be up to lazy_max_pages() elements.
- */
- llist_for_each_entry(va, valist, purge_list) {
- if (va->va_start < start)
- start = va->va_start;
- if (va->va_end > end)
- end = va->va_end;
- }
+ if (unlikely(list_empty(&local_purge_list)))
+ goto out;
+
+ start = min(start,
+ list_first_entry(&local_purge_list,
+ struct vmap_area, list)->va_start);
+
+ end = max(end,
+ list_last_entry(&local_purge_list,
+ struct vmap_area, list)->va_end);
flush_tlb_kernel_range(start, end);
resched_threshold = lazy_max_pages() << 1;
spin_lock(&free_vmap_area_lock);
- llist_for_each_entry_safe(va, n_va, valist, purge_list) {
+ list_for_each_entry_safe(va, n_va, &local_purge_list, list) {
unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
unsigned long orig_start = va->va_start;
unsigned long orig_end = va->va_end;
@@ -1363,8 +1760,8 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
* detached and there is no need to "unlink" it from
* anything.
*/
- va = merge_or_add_vmap_area(va, &free_vmap_area_root,
- &free_vmap_area_list);
+ va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root,
+ &free_vmap_area_list);
if (!va)
continue;
@@ -1374,30 +1771,23 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
va->va_start, va->va_end);
atomic_long_sub(nr, &vmap_lazy_nr);
+ num_purged_areas++;
if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
cond_resched_lock(&free_vmap_area_lock);
}
spin_unlock(&free_vmap_area_lock);
- return true;
-}
-/*
- * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
- * is already purging.
- */
-static void try_purge_vmap_area_lazy(void)
-{
- if (mutex_trylock(&vmap_purge_lock)) {
- __purge_vmap_area_lazy(ULONG_MAX, 0);
- mutex_unlock(&vmap_purge_lock);
- }
+out:
+ trace_purge_vmap_area_lazy(start, end, num_purged_areas);
+ return num_purged_areas > 0;
}
/*
- * Kick off a purge of the outstanding lazy areas.
+ * Reclaim vmap areas by purging fragmented blocks and purge_vmap_area_list.
*/
-static void purge_vmap_area_lazy(void)
+static void reclaim_and_purge_vmap_areas(void)
{
mutex_lock(&vmap_purge_lock);
purge_fragmented_blocks_allcpus();
@@ -1405,27 +1795,50 @@ static void purge_vmap_area_lazy(void)
mutex_unlock(&vmap_purge_lock);
}
+static void drain_vmap_area_work(struct work_struct *work)
+{
+ unsigned long nr_lazy;
+
+ do {
+ mutex_lock(&vmap_purge_lock);
+ __purge_vmap_area_lazy(ULONG_MAX, 0);
+ mutex_unlock(&vmap_purge_lock);
+
+ /* Recheck if further work is required. */
+ nr_lazy = atomic_long_read(&vmap_lazy_nr);
+ } while (nr_lazy > lazy_max_pages());
+}
+
/*
- * Free a vmap area, caller ensuring that the area has been unmapped
- * and flush_cache_vunmap had been called for the correct range
- * previously.
+ * Free a vmap area; the caller must ensure that the area has been
+ * unmapped, unlinked and that flush_cache_vunmap() has been called
+ * for the correct range previously.
*/
static void free_vmap_area_noflush(struct vmap_area *va)
{
+ unsigned long nr_lazy_max = lazy_max_pages();
+ unsigned long va_start = va->va_start;
unsigned long nr_lazy;
- spin_lock(&vmap_area_lock);
- unlink_va(va, &vmap_area_root);
- spin_unlock(&vmap_area_lock);
+ if (WARN_ON_ONCE(!list_empty(&va->list)))
+ return;
nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
PAGE_SHIFT, &vmap_lazy_nr);
- /* After this point, we may free va at any time */
- llist_add(&va->purge_list, &vmap_purge_list);
+ /*
+ * Merge or place it into the purge tree/list.
+ */
+ spin_lock(&purge_vmap_area_lock);
+ merge_or_add_vmap_area(va,
+ &purge_vmap_area_root, &purge_vmap_area_list);
+ spin_unlock(&purge_vmap_area_lock);
- if (unlikely(nr_lazy > lazy_max_pages()))
- try_purge_vmap_area_lazy();
+ trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);
+
+ /* After this point, we may free va at any time */
+ if (unlikely(nr_lazy > nr_lazy_max))
+ schedule_work(&drain_vmap_work);
}
/*
@@ -1434,19 +1847,32 @@ static void free_vmap_area_noflush(struct vmap_area *va)
static void free_unmap_vmap_area(struct vmap_area *va)
{
flush_cache_vunmap(va->va_start, va->va_end);
- unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
+ vunmap_range_noflush(va->va_start, va->va_end);
if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(va->va_start, va->va_end);
free_vmap_area_noflush(va);
}
-static struct vmap_area *find_vmap_area(unsigned long addr)
+struct vmap_area *find_vmap_area(unsigned long addr)
{
struct vmap_area *va;
spin_lock(&vmap_area_lock);
- va = __find_vmap_area(addr);
+ va = __find_vmap_area(addr, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
+
+ return va;
+}
+
+static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
+{
+ struct vmap_area *va;
+
+ spin_lock(&vmap_area_lock);
+ va = __find_vmap_area(addr, &vmap_area_root);
+ if (va)
+ unlink_va(va, &vmap_area_root);
spin_unlock(&vmap_area_lock);
return va;
@@ -1482,15 +1908,33 @@ static struct vmap_area *find_vmap_area(unsigned long addr)
#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
+/*
+ * Purge threshold to prevent overeager purging of fragmented blocks for
+ * regular operations: Purge if vb->free is less than 1/4 of the capacity.
+ */
+#define VMAP_PURGE_THRESHOLD (VMAP_BBMAP_BITS / 4)
+
+#define VMAP_RAM 0x1 /* indicates vm_map_ram area*/
+#define VMAP_BLOCK 0x2 /* mark out the vmap_block sub-type*/
+#define VMAP_FLAGS_MASK 0x3
+
struct vmap_block_queue {
spinlock_t lock;
struct list_head free;
+
+ /*
+ * An xarray requires extra memory to be allocated
+ * dynamically. If that becomes an issue, we can use an
+ * rb-tree instead.
+ */
+ struct xarray vmap_blocks;
};
struct vmap_block {
spinlock_t lock;
struct vmap_area *va;
unsigned long free, dirty;
+ DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS);
unsigned long dirty_min, dirty_max; /*< dirty range */
struct list_head free_list;
struct rcu_head rcu_head;
@@ -1501,11 +1945,48 @@ struct vmap_block {
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
/*
- * XArray of vmap blocks, indexed by address, to quickly find a vmap block
- * in the free path. Could get rid of this if we change the API to return a
- * "cookie" from alloc, to be passed to free. But no big deal yet.
+ * In order to provide fast access to any "vmap_block" associated
+ * with a specific address, we use a hash.
+ *
+ * A per-cpu vmap_block_queue is used in two ways: it serializes
+ * access to the free block chains among CPUs (alloc path), and it
+ * also acts as a vmap_block hash (alloc/free paths). In other words
+ * we overload it, since we already have a per-cpu array that can
+ * serve as a hash table. When used as a hash, the 'cpu' passed to
+ * per_cpu() is not actually a CPU but rather a hash index.
+ *
+ * The hash function is addr_to_vb_xa(), which maps any address to
+ * the hash index it belongs to. The per_cpu() macro is then used to
+ * access the array element at the generated index.
+ *
+ * An example:
+ *
+ * CPU_1 CPU_2 CPU_0
+ * | | |
+ * V V V
+ * 0 10 20 30 40 50 60
+ * |------|------|------|------|------|------|...<vmap address space>
+ * CPU0 CPU1 CPU2 CPU0 CPU1 CPU2
+ *
+ * - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus
+ * it accesses: CPU0/INDEX0 -> vmap_blocks -> xa_lock;
+ *
+ * - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus
+ * it accesses: CPU1/INDEX1 -> vmap_blocks -> xa_lock;
+ *
+ * - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus
+ * it accesses: CPU2/INDEX2 -> vmap_blocks -> xa_lock.
+ *
+ * This technique almost always avoids lock contention on insert/remove,
+ * however xarray spinlocks protect against any contention that remains.
*/
-static DEFINE_XARRAY(vmap_blocks);
+static struct xarray *
+addr_to_vb_xa(unsigned long addr)
+{
+ int index = (addr / VMAP_BLOCK_SIZE) % num_possible_cpus();
+
+ return &per_cpu(vmap_block_queue, index).vmap_blocks;
+}
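The hashing itself is just a block-granular modulo over the number of possible CPUs; a minimal sketch of the same computation (the helper name and parameters below are illustrative, not part of the patch):

/* Illustrative sketch of the mapping performed by addr_to_vb_xa(). */
static unsigned int vb_hash_index(unsigned long addr, unsigned long block_size,
				  unsigned int nr_cpus)
{
	return (addr / block_size) % nr_cpus;
}

For example, with a 4 MB VMAP_BLOCK_SIZE and four possible CPUs, all addresses inside the same 4 MB block hash to the same per-CPU xarray, which is what keeps insert/erase for one block largely contention-free across CPUs.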
/*
* We should probably have a fallback mechanism to allocate virtual memory
@@ -1543,6 +2024,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
struct vmap_block_queue *vbq;
struct vmap_block *vb;
struct vmap_area *va;
+ struct xarray *xa;
unsigned long vb_idx;
int node, err;
void *vaddr;
@@ -1556,7 +2038,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
VMALLOC_START, VMALLOC_END,
- node, gfp_mask);
+ node, gfp_mask,
+ VMAP_RAM|VMAP_BLOCK);
if (IS_ERR(va)) {
kfree(vb);
return ERR_CAST(va);
@@ -1567,25 +2050,27 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
vb->va = va;
/* At least something should be left free */
BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
+ bitmap_zero(vb->used_map, VMAP_BBMAP_BITS);
vb->free = VMAP_BBMAP_BITS - (1UL << order);
vb->dirty = 0;
vb->dirty_min = VMAP_BBMAP_BITS;
vb->dirty_max = 0;
+ bitmap_set(vb->used_map, 0, (1UL << order));
INIT_LIST_HEAD(&vb->free_list);
+ xa = addr_to_vb_xa(va->va_start);
vb_idx = addr_to_vb_idx(va->va_start);
- err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
+ err = xa_insert(xa, vb_idx, vb, gfp_mask);
if (err) {
kfree(vb);
free_vmap_area(va);
return ERR_PTR(err);
}
- vbq = &get_cpu_var(vmap_block_queue);
+ vbq = raw_cpu_ptr(&vmap_block_queue);
spin_lock(&vbq->lock);
list_add_tail_rcu(&vb->free_list, &vbq->free);
spin_unlock(&vbq->lock);
- put_cpu_var(vmap_block_queue);
return vaddr;
}
@@ -1593,47 +2078,76 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
static void free_vmap_block(struct vmap_block *vb)
{
struct vmap_block *tmp;
+ struct xarray *xa;
- tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
+ xa = addr_to_vb_xa(vb->va->va_start);
+ tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start));
BUG_ON(tmp != vb);
+ spin_lock(&vmap_area_lock);
+ unlink_va(vb->va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
+
free_vmap_area_noflush(vb->va);
kfree_rcu(vb, rcu_head);
}
+static bool purge_fragmented_block(struct vmap_block *vb,
+ struct vmap_block_queue *vbq, struct list_head *purge_list,
+ bool force_purge)
+{
+ if (vb->free + vb->dirty != VMAP_BBMAP_BITS ||
+ vb->dirty == VMAP_BBMAP_BITS)
+ return false;
+
+ /* Don't overeagerly purge usable blocks unless requested */
+ if (!(force_purge || vb->free < VMAP_PURGE_THRESHOLD))
+ return false;
+
+ /* prevent further allocs after releasing lock */
+ WRITE_ONCE(vb->free, 0);
+ /* prevent purging it again */
+ WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS);
+ vb->dirty_min = 0;
+ vb->dirty_max = VMAP_BBMAP_BITS;
+ spin_lock(&vbq->lock);
+ list_del_rcu(&vb->free_list);
+ spin_unlock(&vbq->lock);
+ list_add_tail(&vb->purge, purge_list);
+ return true;
+}
+
+static void free_purged_blocks(struct list_head *purge_list)
+{
+ struct vmap_block *vb, *n_vb;
+
+ list_for_each_entry_safe(vb, n_vb, purge_list, purge) {
+ list_del(&vb->purge);
+ free_vmap_block(vb);
+ }
+}
+
static void purge_fragmented_blocks(int cpu)
{
LIST_HEAD(purge);
struct vmap_block *vb;
- struct vmap_block *n_vb;
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
rcu_read_lock();
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+ unsigned long free = READ_ONCE(vb->free);
+ unsigned long dirty = READ_ONCE(vb->dirty);
- if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
+ if (free + dirty != VMAP_BBMAP_BITS ||
+ dirty == VMAP_BBMAP_BITS)
continue;
spin_lock(&vb->lock);
- if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
- vb->free = 0; /* prevent further allocs after releasing lock */
- vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
- vb->dirty_min = 0;
- vb->dirty_max = VMAP_BBMAP_BITS;
- spin_lock(&vbq->lock);
- list_del_rcu(&vb->free_list);
- spin_unlock(&vbq->lock);
- spin_unlock(&vb->lock);
- list_add_tail(&vb->purge, &purge);
- } else
- spin_unlock(&vb->lock);
+ purge_fragmented_block(vb, vbq, &purge, true);
+ spin_unlock(&vb->lock);
}
rcu_read_unlock();
-
- list_for_each_entry_safe(vb, n_vb, &purge, purge) {
- list_del(&vb->purge);
- free_vmap_block(vb);
- }
+ free_purged_blocks(&purge);
}
static void purge_fragmented_blocks_allcpus(void)
@@ -1664,10 +2178,13 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
order = get_order(size);
rcu_read_lock();
- vbq = &get_cpu_var(vmap_block_queue);
+ vbq = raw_cpu_ptr(&vmap_block_queue);
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
unsigned long pages_off;
+ if (READ_ONCE(vb->free) < (1UL << order))
+ continue;
+
spin_lock(&vb->lock);
if (vb->free < (1UL << order)) {
spin_unlock(&vb->lock);
@@ -1676,7 +2193,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
pages_off = VMAP_BBMAP_BITS - vb->free;
vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
- vb->free -= 1UL << order;
+ WRITE_ONCE(vb->free, vb->free - (1UL << order));
+ bitmap_set(vb->used_map, pages_off, (1UL << order));
if (vb->free == 0) {
spin_lock(&vbq->lock);
list_del_rcu(&vb->free_list);
@@ -1687,7 +2205,6 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
break;
}
- put_cpu_var(vmap_block_queue);
rcu_read_unlock();
/* Allocate new block if nothing was found */
@@ -1702,6 +2219,7 @@ static void vb_free(unsigned long addr, unsigned long size)
unsigned long offset;
unsigned int order;
struct vmap_block *vb;
+ struct xarray *xa;
BUG_ON(offset_in_page(size));
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -1710,20 +2228,26 @@ static void vb_free(unsigned long addr, unsigned long size)
order = get_order(size);
offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
- vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
- unmap_kernel_range_noflush(addr, size);
+ xa = addr_to_vb_xa(addr);
+ vb = xa_load(xa, addr_to_vb_idx(addr));
+
+ spin_lock(&vb->lock);
+ bitmap_clear(vb->used_map, offset, (1UL << order));
+ spin_unlock(&vb->lock);
+
+ vunmap_range_noflush(addr, addr + size);
if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(addr, addr + size);
spin_lock(&vb->lock);
- /* Expand dirty range */
+ /* Expand the not yet TLB flushed dirty range */
vb->dirty_min = min(vb->dirty_min, offset);
vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
- vb->dirty += 1UL << order;
+ WRITE_ONCE(vb->dirty, vb->dirty + (1UL << order));
if (vb->dirty == VMAP_BBMAP_BITS) {
BUG_ON(vb->free);
spin_unlock(&vb->lock);
@@ -1734,21 +2258,30 @@ static void vb_free(unsigned long addr, unsigned long size)
static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
{
+ LIST_HEAD(purge_list);
int cpu;
if (unlikely(!vmap_initialized))
return;
- might_sleep();
+ mutex_lock(&vmap_purge_lock);
for_each_possible_cpu(cpu) {
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
struct vmap_block *vb;
+ unsigned long idx;
rcu_read_lock();
- list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+ xa_for_each(&vbq->vmap_blocks, idx, vb) {
spin_lock(&vb->lock);
- if (vb->dirty) {
+
+ /*
+ * Try to purge a fragmented block first. If it's
+ * not purgeable, check whether there is dirty
+ * space to be flushed.
+ */
+ if (!purge_fragmented_block(vb, vbq, &purge_list, false) &&
+ vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) {
unsigned long va_start = vb->va->va_start;
unsigned long s, e;
@@ -1758,15 +2291,18 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
start = min(s, start);
end = max(e, end);
+ /* Prevent this from being flushed again */
+ vb->dirty_min = VMAP_BBMAP_BITS;
+ vb->dirty_max = 0;
+
flush = 1;
}
spin_unlock(&vb->lock);
}
rcu_read_unlock();
}
+ free_purged_blocks(&purge_list);
- mutex_lock(&vmap_purge_lock);
- purge_fragmented_blocks_allcpus();
if (!__purge_vmap_area_lazy(start, end) && flush)
flush_tlb_kernel_range(start, end);
mutex_unlock(&vmap_purge_lock);
@@ -1802,7 +2338,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases);
void vm_unmap_ram(const void *mem, unsigned int count)
{
unsigned long size = (unsigned long)count << PAGE_SHIFT;
- unsigned long addr = (unsigned long)mem;
+ unsigned long addr = (unsigned long)kasan_reset_tag(mem);
struct vmap_area *va;
might_sleep();
@@ -1819,8 +2355,10 @@ void vm_unmap_ram(const void *mem, unsigned int count)
return;
}
- va = find_vmap_area(addr);
- BUG_ON(!va);
+ va = find_unlink_vmap_area(addr);
+ if (WARN_ON_ONCE(!va))
+ return;
+
debug_check_no_locks_freed((void *)va->va_start,
(va->va_end - va->va_start));
free_unmap_vmap_area(va);
@@ -1855,7 +2393,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
} else {
struct vmap_area *va;
va = alloc_vmap_area(size, PAGE_SIZE,
- VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
+ VMALLOC_START, VMALLOC_END,
+ node, GFP_KERNEL, VMAP_RAM);
if (IS_ERR(va))
return NULL;
@@ -1863,18 +2402,43 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
mem = (void *)addr;
}
- kasan_unpoison_vmalloc(mem, size);
-
- if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) {
+ if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
+ pages, PAGE_SHIFT) < 0) {
vm_unmap_ram(mem, count);
return NULL;
}
+
+ /*
+ * Mark the pages as accessible, now that they are mapped.
+ * With hardware tag-based KASAN, marking is skipped for
+ * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+ */
+ mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL);
+
return mem;
}
EXPORT_SYMBOL(vm_map_ram);
static struct vm_struct *vmlist __initdata;
+static inline unsigned int vm_area_page_order(struct vm_struct *vm)
+{
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+ return vm->page_order;
+#else
+ return 0;
+#endif
+}
+
+static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
+{
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+ vm->page_order = order;
+#else
+ BUG_ON(order != 0);
+#endif
+}
+
/**
* vm_area_add_early - add vmap area early during boot
* @vm: vm_struct to add
@@ -1915,15 +2479,22 @@ void __init vm_area_add_early(struct vm_struct *vm)
*/
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
{
- static size_t vm_init_off __initdata;
- unsigned long addr;
+ unsigned long addr = ALIGN(VMALLOC_START, align);
+ struct vm_struct *cur, **p;
- addr = ALIGN(VMALLOC_START + vm_init_off, align);
- vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
+ BUG_ON(vmap_initialized);
- vm->addr = (void *)addr;
+ for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) {
+ if ((unsigned long)cur->addr - addr >= vm->size)
+ break;
+ addr = ALIGN((unsigned long)cur->addr + cur->size, align);
+ }
- vm_area_add_early(vm);
+ BUG_ON(addr > VMALLOC_END - vm->size);
+ vm->addr = (void *)addr;
+ vm->next = *p;
+ *p = vm;
+ kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}
static void vmap_init_free_space(void)
@@ -1967,65 +2538,6 @@ static void vmap_init_free_space(void)
}
}
-void __init vmalloc_init(void)
-{
- struct vmap_area *va;
- struct vm_struct *tmp;
- int i;
-
- /*
- * Create the cache for vmap_area objects.
- */
- vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
-
- for_each_possible_cpu(i) {
- struct vmap_block_queue *vbq;
- struct vfree_deferred *p;
-
- vbq = &per_cpu(vmap_block_queue, i);
- spin_lock_init(&vbq->lock);
- INIT_LIST_HEAD(&vbq->free);
- p = &per_cpu(vfree_deferred, i);
- init_llist_head(&p->list);
- INIT_WORK(&p->wq, free_work);
- }
-
- /* Import existing vmlist entries. */
- for (tmp = vmlist; tmp; tmp = tmp->next) {
- va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
- if (WARN_ON_ONCE(!va))
- continue;
-
- va->va_start = (unsigned long)tmp->addr;
- va->va_end = va->va_start + tmp->size;
- va->vm = tmp;
- insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
- }
-
- /*
- * Now we can initialize a free vmap space.
- */
- vmap_init_free_space();
- vmap_initialized = true;
-}
-
-/**
- * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
- * @addr: start of the VM area to unmap
- * @size: size of the VM area to unmap
- *
- * Similar to unmap_kernel_range_noflush() but flushes vcache before
- * the unmapping and tlb after.
- */
-void unmap_kernel_range(unsigned long addr, unsigned long size)
-{
- unsigned long end = addr + size;
-
- flush_cache_vunmap(addr, end);
- unmap_kernel_range_noflush(addr, size);
- flush_tlb_kernel_range(addr, end);
-}
-
static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
{
@@ -2056,15 +2568,16 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
}
static struct vm_struct *__get_vm_area_node(unsigned long size,
- unsigned long align, unsigned long flags, unsigned long start,
- unsigned long end, int node, gfp_t gfp_mask, const void *caller)
+ unsigned long align, unsigned long shift, unsigned long flags,
+ unsigned long start, unsigned long end, int node,
+ gfp_t gfp_mask, const void *caller)
{
struct vmap_area *va;
struct vm_struct *area;
unsigned long requested_size = size;
BUG_ON(in_interrupt());
- size = PAGE_ALIGN(size);
+ size = ALIGN(size, 1ul << shift);
if (unlikely(!size))
return NULL;
@@ -2079,16 +2592,26 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
if (!(flags & VM_NO_GUARD))
size += PAGE_SIZE;
- va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
+ va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
if (IS_ERR(va)) {
kfree(area);
return NULL;
}
- kasan_unpoison_vmalloc((void *)va->va_start, requested_size);
-
setup_vmalloc_vm(area, va, flags, caller);
+ /*
+ * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
+ * best-effort approach, as they can be mapped outside of vmalloc code.
+ * For VM_ALLOC mappings, the pages are marked as accessible after
+ * getting mapped in __vmalloc_node_range().
+ * With hardware tag-based KASAN, marking is skipped for
+ * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+ */
+ if (!(flags & VM_ALLOC))
+ area->addr = kasan_unpoison_vmalloc(area->addr, requested_size,
+ KASAN_VMALLOC_PROT_NORMAL);
+
return area;
}
@@ -2096,8 +2619,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end,
const void *caller)
{
- return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
- GFP_KERNEL, caller);
+ return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
+ NUMA_NO_NODE, GFP_KERNEL, caller);
}
/**
@@ -2113,7 +2636,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
*/
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
- return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
+ return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
+ VMALLOC_START, VMALLOC_END,
NUMA_NO_NODE, GFP_KERNEL,
__builtin_return_address(0));
}
@@ -2121,7 +2645,8 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
const void *caller)
{
- return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
+ return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
+ VMALLOC_START, VMALLOC_END,
NUMA_NO_NODE, GFP_KERNEL, caller);
}
@@ -2159,25 +2684,26 @@ struct vm_struct *find_vm_area(const void *addr)
struct vm_struct *remove_vm_area(const void *addr)
{
struct vmap_area *va;
+ struct vm_struct *vm;
might_sleep();
- spin_lock(&vmap_area_lock);
- va = __find_vmap_area((unsigned long)addr);
- if (va && va->vm) {
- struct vm_struct *vm = va->vm;
-
- va->vm = NULL;
- spin_unlock(&vmap_area_lock);
+ if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
+ addr))
+ return NULL;
- kasan_free_shadow(vm);
- free_unmap_vmap_area(va);
+ va = find_unlink_vmap_area((unsigned long)addr);
+ if (!va || !va->vm)
+ return NULL;
+ vm = va->vm;
- return vm;
- }
+ debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm));
+ debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm));
+ kasan_free_module_shadow(vm);
+ kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm));
- spin_unlock(&vmap_area_lock);
- return NULL;
+ free_unmap_vmap_area(va);
+ return vm;
}
static inline void set_area_direct_map(const struct vm_struct *area,
@@ -2185,44 +2711,35 @@ static inline void set_area_direct_map(const struct vm_struct *area,
{
int i;
+ /* HUGE_VMALLOC passes small pages to set_direct_map */
for (i = 0; i < area->nr_pages; i++)
if (page_address(area->pages[i]))
set_direct_map(area->pages[i]);
}
-/* Handle removing and resetting vm mappings related to the vm_struct. */
-static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
+/*
+ * Flush the vm mapping and reset the direct map.
+ */
+static void vm_reset_perms(struct vm_struct *area)
{
unsigned long start = ULONG_MAX, end = 0;
- int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
+ unsigned int page_order = vm_area_page_order(area);
int flush_dmap = 0;
int i;
- remove_vm_area(area->addr);
-
- /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
- if (!flush_reset)
- return;
-
- /*
- * If not deallocating pages, just do the flush of the VM area and
- * return.
- */
- if (!deallocate_pages) {
- vm_unmap_aliases();
- return;
- }
-
/*
- * If execution gets here, flush the vm mapping and reset the direct
- * map. Find the start and end range of the direct mappings to make sure
+ * Find the start and end range of the direct mappings to make sure that
* the vm_unmap_aliases() flush includes the direct map.
*/
- for (i = 0; i < area->nr_pages; i++) {
+ for (i = 0; i < area->nr_pages; i += 1U << page_order) {
unsigned long addr = (unsigned long)page_address(area->pages[i]);
+
if (addr) {
+ unsigned long page_size;
+
+ page_size = PAGE_SIZE << page_order;
start = min(addr, start);
- end = max(addr + PAGE_SIZE, end);
+ end = max(addr + page_size, end);
flush_dmap = 1;
}
}
@@ -2237,61 +2754,13 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
set_area_direct_map(area, set_direct_map_default_noflush);
}
-static void __vunmap(const void *addr, int deallocate_pages)
-{
- struct vm_struct *area;
-
- if (!addr)
- return;
-
- if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
- addr))
- return;
-
- area = find_vm_area(addr);
- if (unlikely(!area)) {
- WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
- addr);
- return;
- }
-
- debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
- debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
-
- kasan_poison_vmalloc(area->addr, area->size);
-
- vm_remove_mappings(area, deallocate_pages);
-
- if (deallocate_pages) {
- int i;
-
- for (i = 0; i < area->nr_pages; i++) {
- struct page *page = area->pages[i];
-
- BUG_ON(!page);
- __free_pages(page, 0);
- }
- atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
-
- kvfree(area->pages);
- }
-
- kfree(area);
- return;
-}
-
-static inline void __vfree_deferred(const void *addr)
+static void delayed_vfree_work(struct work_struct *w)
{
- /*
- * Use raw_cpu_ptr() because this can be called from preemptible
- * context. Preemption is absolutely fine here, because the llist_add()
- * implementation is lockless, so it works even if we are adding to
- * another cpu's list. schedule_work() should be fine with this too.
- */
- struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
+ struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
+ struct llist_node *t, *llnode;
- if (llist_add((struct llist_node *)addr, &p->list))
- schedule_work(&p->wq);
+ llist_for_each_safe(llnode, t, llist_del_all(&p->list))
+ vfree(llnode);
}
/**
@@ -2303,51 +2772,79 @@ static inline void __vfree_deferred(const void *addr)
*/
void vfree_atomic(const void *addr)
{
- BUG_ON(in_nmi());
+ struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
+ BUG_ON(in_nmi());
kmemleak_free(addr);
- if (!addr)
- return;
- __vfree_deferred(addr);
-}
-
-static void __vfree(const void *addr)
-{
- if (unlikely(in_interrupt()))
- __vfree_deferred(addr);
- else
- __vunmap(addr, 1);
+ /*
+ * Use raw_cpu_ptr() because this can be called from preemptible
+ * context. Preemption is absolutely fine here, because the llist_add()
+ * implementation is lockless, so it works even if we are adding to
+ * another cpu's list. schedule_work() should be fine with this too.
+ */
+ if (addr && llist_add((struct llist_node *)addr, &p->list))
+ schedule_work(&p->wq);
}
/**
- * vfree - release memory allocated by vmalloc()
- * @addr: memory base address
+ * vfree - Release memory allocated by vmalloc()
+ * @addr: Memory base address
*
- * Free the virtually continuous memory area starting at @addr, as
- * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
- * NULL, no operation is performed.
+ * Free the virtually contiguous memory area starting at @addr, as obtained
+ * from one of the vmalloc() family of APIs. This will usually also free the
+ * physical memory underlying the virtual allocation, but that memory is
+ * reference counted, so it will not be freed until the last user goes away.
*
- * Must not be called in NMI context (strictly speaking, only if we don't
- * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
- * conventions for vfree() arch-depenedent would be a really bad idea)
+ * If @addr is NULL, no operation is performed.
*
+ * Context:
* May sleep if called *not* from interrupt context.
- *
- * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
+ * Must not be called in NMI context (strictly speaking, it could be
+ * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
+ * conventions for vfree() arch-dependent would be a really bad idea).
*/
void vfree(const void *addr)
{
- BUG_ON(in_nmi());
+ struct vm_struct *vm;
+ int i;
- kmemleak_free(addr);
+ if (unlikely(in_interrupt())) {
+ vfree_atomic(addr);
+ return;
+ }
- might_sleep_if(!in_interrupt());
+ BUG_ON(in_nmi());
+ kmemleak_free(addr);
+ might_sleep();
if (!addr)
return;
- __vfree(addr);
+ vm = remove_vm_area(addr);
+ if (unlikely(!vm)) {
+ WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
+ addr);
+ return;
+ }
+
+ if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
+ vm_reset_perms(vm);
+ for (i = 0; i < vm->nr_pages; i++) {
+ struct page *page = vm->pages[i];
+
+ BUG_ON(!page);
+ mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
+ /*
+ * High-order allocs for huge vmallocs are split, so they
+ * can be freed as an array of order-0 allocations.
+ */
+ __free_page(page);
+ cond_resched();
+ }
+ atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
+ kvfree(vm->pages);
+ kfree(vm);
}
EXPORT_SYMBOL(vfree);
@@ -2362,10 +2859,20 @@ EXPORT_SYMBOL(vfree);
*/
void vunmap(const void *addr)
{
+ struct vm_struct *vm;
+
BUG_ON(in_interrupt());
might_sleep();
- if (addr)
- __vunmap(addr, 0);
+
+ if (!addr)
+ return;
+ vm = remove_vm_area(addr);
+ if (unlikely(!vm)) {
+ WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n",
+ addr);
+ return;
+ }
+ kfree(vm);
}
EXPORT_SYMBOL(vunmap);
@@ -2376,8 +2883,11 @@ EXPORT_SYMBOL(vunmap);
* @flags: vm_area->flags
* @prot: page protection for the mapping
*
- * Maps @count pages from @pages into contiguous kernel virtual
- * space.
+ * Maps @count pages from @pages into contiguous kernel virtual space.
+ * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
+ * (which must be kmalloc or vmalloc memory) and one reference per page in it
+ * are transferred from the caller to vmap(), and will be freed / dropped when
+ * vfree() is called on the return value.
*
* Return: the address of the area or %NULL on failure
*/
@@ -2385,10 +2895,21 @@ void *vmap(struct page **pages, unsigned int count,
unsigned long flags, pgprot_t prot)
{
struct vm_struct *area;
+ unsigned long addr;
unsigned long size; /* In bytes */
might_sleep();
+ if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS))
+ return NULL;
+
+ /*
+ * Your top guard is someone else's bottom guard. Not having a top
+ * guard compromises someone else's mappings too.
+ */
+ if (WARN_ON_ONCE(flags & VM_NO_GUARD))
+ flags &= ~VM_NO_GUARD;
+
if (count > totalram_pages())
return NULL;
@@ -2397,78 +2918,289 @@ void *vmap(struct page **pages, unsigned int count,
if (!area)
return NULL;
- if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot),
- pages) < 0) {
+ addr = (unsigned long)area->addr;
+ if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
+ pages, PAGE_SHIFT) < 0) {
vunmap(area->addr);
return NULL;
}
+ if (flags & VM_MAP_PUT_PAGES) {
+ area->pages = pages;
+ area->nr_pages = count;
+ }
return area->addr;
}
EXPORT_SYMBOL(vmap);
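As a usage sketch of the VM_MAP_PUT_PAGES semantics described above (the function name is hypothetical and error handling is kept minimal; not part of the patch itself):

/* Illustrative only: hand both the page and the array over to vmap(). */
static void *map_one_private_page(void)
{
	struct page **pages;
	void *va;

	pages = kvmalloc_array(1, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return NULL;

	pages[0] = alloc_page(GFP_KERNEL);
	if (!pages[0]) {
		kvfree(pages);
		return NULL;
	}

	va = vmap(pages, 1, VM_MAP | VM_MAP_PUT_PAGES, PAGE_KERNEL);
	if (!va) {
		/* On failure ownership stays with the caller. */
		__free_page(pages[0]);
		kvfree(pages);
		return NULL;
	}

	/* Later, a single vfree(va) drops the page reference and kvfree()s @pages. */
	return va;
}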
+#ifdef CONFIG_VMAP_PFN
+struct vmap_pfn_data {
+ unsigned long *pfns;
+ pgprot_t prot;
+ unsigned int idx;
+};
+
+static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
+{
+ struct vmap_pfn_data *data = private;
+ unsigned long pfn = data->pfns[data->idx];
+ pte_t ptent;
+
+ if (WARN_ON_ONCE(pfn_valid(pfn)))
+ return -EINVAL;
+
+ ptent = pte_mkspecial(pfn_pte(pfn, data->prot));
+ set_pte_at(&init_mm, addr, pte, ptent);
+
+ data->idx++;
+ return 0;
+}
+
+/**
+ * vmap_pfn - map an array of PFNs into virtually contiguous space
+ * @pfns: array of PFNs
+ * @count: number of pages to map
+ * @prot: page protection for the mapping
+ *
+ * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
+ * the start address of the mapping.
+ */
+void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
+{
+ struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
+ struct vm_struct *area;
+
+ area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
+ __builtin_return_address(0));
+ if (!area)
+ return NULL;
+ if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
+ count * PAGE_SIZE, vmap_pfn_apply, &data)) {
+ free_vm_area(area);
+ return NULL;
+ }
+
+ flush_cache_vmap((unsigned long)area->addr,
+ (unsigned long)area->addr + count * PAGE_SIZE);
+
+ return area->addr;
+}
+EXPORT_SYMBOL_GPL(vmap_pfn);
+#endif /* CONFIG_VMAP_PFN */
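A minimal sketch of a vmap_pfn() caller, assuming a driver that already has an array of PFNs with no struct page behind them (the helper name and pgprot choice are illustrative, not part of the patch):

/* Illustrative only: map @nr such PFNs; tear the mapping down with vunmap(). */
static void *map_device_pfns(unsigned long *pfns, unsigned int nr)
{
	return vmap_pfn(pfns, nr, pgprot_noncached(PAGE_KERNEL));
}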
+
+static inline unsigned int
+vm_area_alloc_pages(gfp_t gfp, int nid,
+ unsigned int order, unsigned int nr_pages, struct page **pages)
+{
+ unsigned int nr_allocated = 0;
+ gfp_t alloc_gfp = gfp;
+ bool nofail = false;
+ struct page *page;
+ int i;
+
+ /*
+ * For order-0 pages we make use of the bulk allocator. If the
+ * page array ends up only partly populated, or not populated at
+ * all, due to failures, fall back to the single page allocator,
+ * which is more permissive.
+ */
+ if (!order) {
+ /* bulk allocator doesn't support nofail req. officially */
+ gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL;
+
+ while (nr_allocated < nr_pages) {
+ unsigned int nr, nr_pages_request;
+
+ /*
+ * The maximum allowed request is hard-coded at 100 pages
+ * per call. That is done in order to prevent a long
+ * preemption-off scenario in the bulk allocator, so the
+ * range is [1:100].
+ */
+ nr_pages_request = min(100U, nr_pages - nr_allocated);
+
+ /* Memory allocation should consider mempolicy: we must not
+ * wrongly use the nearest node when nid == NUMA_NO_NODE,
+ * otherwise memory may be allocated in only one node even
+ * though mempolicy wants memory allocated by interleaving.
+ */
+ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
+ nr = alloc_pages_bulk_array_mempolicy(bulk_gfp,
+ nr_pages_request,
+ pages + nr_allocated);
+
+ else
+ nr = alloc_pages_bulk_array_node(bulk_gfp, nid,
+ nr_pages_request,
+ pages + nr_allocated);
+
+ nr_allocated += nr;
+ cond_resched();
+
+ /*
+ * If no pages, or only some of the requested pages, were
+ * obtained, fall back to the single page allocator.
+ */
+ if (nr != nr_pages_request)
+ break;
+ }
+ } else if (gfp & __GFP_NOFAIL) {
+ /*
+ * Higher order nofail allocations are really expensive and
+ * potentially dangerous (premature OOM, disruptive reclaim,
+ * compaction, etc.).
+ */
+ alloc_gfp &= ~__GFP_NOFAIL;
+ nofail = true;
+ }
+
+ /* High-order pages or fallback path if "bulk" fails. */
+ while (nr_allocated < nr_pages) {
+ if (fatal_signal_pending(current))
+ break;
+
+ if (nid == NUMA_NO_NODE)
+ page = alloc_pages(alloc_gfp, order);
+ else
+ page = alloc_pages_node(nid, alloc_gfp, order);
+ if (unlikely(!page)) {
+ if (!nofail)
+ break;
+
+ /* fall back to the zero order allocations */
+ alloc_gfp |= __GFP_NOFAIL;
+ order = 0;
+ continue;
+ }
+
+ /*
+ * Higher order allocations must be able to be treated as
+ * independent small pages by callers (as they can with
+ * small-page vmallocs). Some drivers do their own refcounting
+ * on vmalloc_to_page() pages, some use page->mapping,
+ * page->lru, etc.
+ */
+ if (order)
+ split_page(page, order);
+
+ /*
+ * Careful, we allocate and map page-order pages, but
+ * tracking is done per PAGE_SIZE page so as to keep the
+ * vm_struct APIs independent of the physical/mapped size.
+ */
+ for (i = 0; i < (1U << order); i++)
+ pages[nr_allocated + i] = page + i;
+
+ cond_resched();
+ nr_allocated += 1U << order;
+ }
+
+ return nr_allocated;
+}
+
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
- pgprot_t prot, int node)
+ pgprot_t prot, unsigned int page_shift,
+ int node)
{
- struct page **pages;
- unsigned int nr_pages, array_size, i;
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
- const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
- const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
- 0 :
- __GFP_HIGHMEM;
+ bool nofail = gfp_mask & __GFP_NOFAIL;
+ unsigned long addr = (unsigned long)area->addr;
+ unsigned long size = get_vm_area_size(area);
+ unsigned long array_size;
+ unsigned int nr_small_pages = size >> PAGE_SHIFT;
+ unsigned int page_order;
+ unsigned int flags;
+ int ret;
+
+ array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
- nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
- array_size = (nr_pages * sizeof(struct page *));
+ if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
+ gfp_mask |= __GFP_HIGHMEM;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
- pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
- node, area->caller);
+ area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
+ area->caller);
} else {
- pages = kmalloc_node(array_size, nested_gfp, node);
+ area->pages = kmalloc_node(array_size, nested_gfp, node);
}
- if (!pages) {
- remove_vm_area(area->addr);
- kfree(area);
+ if (!area->pages) {
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc error: size %lu, failed to allocated page array size %lu",
+ nr_small_pages * PAGE_SIZE, array_size);
+ free_vm_area(area);
return NULL;
}
- area->pages = pages;
- area->nr_pages = nr_pages;
+ set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
+ page_order = vm_area_page_order(area);
- for (i = 0; i < area->nr_pages; i++) {
- struct page *page;
+ area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN,
+ node, page_order, nr_small_pages, area->pages);
- if (node == NUMA_NO_NODE)
- page = alloc_page(alloc_mask|highmem_mask);
- else
- page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
+ atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
+ if (gfp_mask & __GFP_ACCOUNT) {
+ int i;
- if (unlikely(!page)) {
- /* Successfully allocated i pages, free them in __vfree() */
- area->nr_pages = i;
- atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
- goto fail;
- }
- area->pages[i] = page;
- if (gfpflags_allow_blocking(gfp_mask))
- cond_resched();
+ for (i = 0; i < area->nr_pages; i++)
+ mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
}
- atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
- if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
- prot, pages) < 0)
+ /*
+ * If not enough pages were obtained to accomplish the
+ * allocation request, free whatever was allocated via vfree().
+ */
+ if (area->nr_pages != nr_small_pages) {
+ /*
+ * vm_area_alloc_pages() can fail due to insufficient memory, but
+ * also due to:
+ *
+ * - a pending fatal signal
+ * - insufficient huge page-order pages
+ *
+ * Since we always retry allocations at order-0 in the huge page
+ * case, a warning for either is spurious.
+ */
+ if (!fatal_signal_pending(current) && page_order == 0)
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc error: size %lu, failed to allocate pages",
+ area->nr_pages * PAGE_SIZE);
+ goto fail;
+ }
+
+ /*
+ * Page table allocations ignore the external gfp mask; enforce it
+ * via the scope API.
+ */
+ if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+ flags = memalloc_nofs_save();
+ else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
+ flags = memalloc_noio_save();
+
+ do {
+ ret = vmap_pages_range(addr, addr + size, prot, area->pages,
+ page_shift);
+ if (nofail && (ret < 0))
+ schedule_timeout_uninterruptible(1);
+ } while (nofail && (ret < 0));
+
+ if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+ memalloc_nofs_restore(flags);
+ else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
+ memalloc_noio_restore(flags);
+
+ if (ret < 0) {
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc error: size %lu, failed to map pages",
+ area->nr_pages * PAGE_SIZE);
goto fail;
+ }
return area->addr;
fail:
- warn_alloc(gfp_mask, NULL,
- "vmalloc: allocation failure, allocated %ld of %ld bytes",
- (area->nr_pages*PAGE_SIZE), area->size);
- __vfree(area->addr);
+ vfree(area->addr);
return NULL;
}
@@ -2485,8 +3217,18 @@ fail:
* @caller: caller's return address
*
* Allocate enough pages to cover @size from the page level
- * allocator with @gfp_mask flags. Map them into contiguous
- * kernel virtual space, using a pagetable protection of @prot.
+ * allocator with @gfp_mask flags. Please note that the full set of gfp
+ * flags is not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
+ * supported.
+ * Zone modifiers are not supported. Of the reclaim modifiers,
+ * __GFP_DIRECT_RECLAIM is required (i.e. GFP_NOWAIT is not supported)
+ * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
+ * __GFP_RETRY_MAYFAIL are not supported).
+ *
+ * __GFP_NOWARN can be used to suppress failure messages.
+ *
+ * Map them into contiguous kernel virtual space, using a pagetable
+ * protection of @prot.
*
* Return: the address of the area or %NULL on failure
*/
@@ -2496,21 +3238,103 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
const void *caller)
{
struct vm_struct *area;
- void *addr;
+ void *ret;
+ kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
unsigned long real_size = size;
+ unsigned long real_align = align;
+ unsigned int shift = PAGE_SHIFT;
- size = PAGE_ALIGN(size);
- if (!size || (size >> PAGE_SHIFT) > totalram_pages())
+ if (WARN_ON_ONCE(!size))
+ return NULL;
+
+ if ((size >> PAGE_SHIFT) > totalram_pages()) {
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc error: size %lu, exceeds total pages",
+ real_size);
+ return NULL;
+ }
+
+ if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
+ unsigned long size_per_node;
+
+ /*
+ * Try huge pages. Only try for PAGE_KERNEL allocations;
+ * others, like modules, don't yet expect huge pages in
+ * their allocations because apply_to_page_range() does
+ * not support them.
+ */
+
+ size_per_node = size;
+ if (node == NUMA_NO_NODE)
+ size_per_node /= num_online_nodes();
+ if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
+ shift = PMD_SHIFT;
+ else
+ shift = arch_vmap_pte_supported_shift(size_per_node);
+
+ align = max(real_align, 1UL << shift);
+ size = ALIGN(real_size, 1UL << shift);
+ }
+
+again:
+ area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
+ VM_UNINITIALIZED | vm_flags, start, end, node,
+ gfp_mask, caller);
+ if (!area) {
+ bool nofail = gfp_mask & __GFP_NOFAIL;
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc error: size %lu, vm_struct allocation failed%s",
+ real_size, (nofail) ? ". Retrying." : "");
+ if (nofail) {
+ schedule_timeout_uninterruptible(1);
+ goto again;
+ }
goto fail;
+ }
- area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |
- vm_flags, start, end, node, gfp_mask, caller);
- if (!area)
+ /*
+ * Prepare arguments for __vmalloc_area_node() and
+ * kasan_unpoison_vmalloc().
+ */
+ if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
+ if (kasan_hw_tags_enabled()) {
+ /*
+ * Modify protection bits to allow tagging.
+ * This must be done before mapping.
+ */
+ prot = arch_vmap_pgprot_tagged(prot);
+
+ /*
+ * Skip page_alloc poisoning and zeroing for physical
+ * pages backing VM_ALLOC mapping. Memory is instead
+ * poisoned and zeroed by kasan_unpoison_vmalloc().
+ */
+ gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO;
+ }
+
+ /* Take note that the mapping is PAGE_KERNEL. */
+ kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;
+ }
+
+ /* Allocate physical pages and map them into vmalloc space. */
+ ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
+ if (!ret)
goto fail;
- addr = __vmalloc_area_node(area, gfp_mask, prot, node);
- if (!addr)
- return NULL;
+ /*
+ * Mark the pages as accessible, now that they are mapped.
+ * The condition for setting KASAN_VMALLOC_INIT should complement the
+ * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check
+ * to make sure that memory is initialized under the same conditions.
+ * Tag-based KASAN modes only assign tags to normal non-executable
+ * allocations, see __kasan_unpoison_vmalloc().
+ */
+ kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
+ if (!want_init_on_free() && want_init_on_alloc(gfp_mask) &&
+ (gfp_mask & __GFP_SKIP_ZERO))
+ kasan_flags |= KASAN_VMALLOC_INIT;
+ /* KASAN_VMALLOC_PROT_NORMAL already set if required. */
+ area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -2519,13 +3343,20 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
*/
clear_vm_uninitialized_flag(area);
- kmemleak_vmalloc(area, size, gfp_mask);
+ size = PAGE_ALIGN(size);
+ if (!(vm_flags & VM_DEFER_KMEMLEAK))
+ kmemleak_vmalloc(area, size, gfp_mask);
- return addr;
+ return area->addr;
fail:
- warn_alloc(gfp_mask, NULL,
- "vmalloc: allocation failure: %lu bytes", real_size);
+ if (shift > PAGE_SHIFT) {
+ shift = PAGE_SHIFT;
+ align = real_align;
+ size = real_size;
+ goto again;
+ }
+
return NULL;
}
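Given the gfp restrictions spelled out in the kernel-doc above, a supported caller sticks to GFP_KERNEL/GFP_NOFS/GFP_NOIO plus at most __GFP_NOFAIL, __GFP_ZERO and __GFP_NOWARN; a minimal sketch (the 1 MB size is arbitrary, not taken from the patch):

/* Illustrative only: a blocking, zeroed 1 MB allocation in vmalloc space. */
void *buf = __vmalloc_node_range(SZ_1M, 1, VMALLOC_START, VMALLOC_END,
				 GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
				 0, NUMA_NO_NODE, __builtin_return_address(0));
/* GFP_NOWAIT, __GFP_NORETRY or __GFP_RETRY_MAYFAIL here would not be honoured. */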
@@ -2590,6 +3421,26 @@ void *vmalloc(unsigned long size)
EXPORT_SYMBOL(vmalloc);
/**
+ * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
+ * @size: allocation size
+ * @gfp_mask: flags for the page level allocator
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * If @size is greater than or equal to PMD_SIZE, allow using
+ * huge pages for the memory.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
+{
+ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+ gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+ NUMA_NO_NODE, __builtin_return_address(0));
+}
+EXPORT_SYMBOL_GPL(vmalloc_huge);
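For illustration, a caller that may benefit from the huge-page path could look like the sketch below; the 8 MB size is arbitrary, and whether PMD mappings are actually used still depends on arch support and the final size/alignment:

/* Illustrative only: a large table that may be PMD-mapped. */
void *table = vmalloc_huge(8 * SZ_1M, GFP_KERNEL);

if (table) {
	/* ... use the allocation ... */
	vfree(table);	/* vfree() copes with huge-page backed areas too */
}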
+
+/**
* vzalloc - allocate virtually contiguous memory with zero fill
* @size: allocation size
*
@@ -2674,7 +3525,7 @@ EXPORT_SYMBOL(vzalloc_node);
* 64b systems should always have either DMA or DMA32 zones. For others
* GFP_DMA32 should do the right thing and use the normal zone.
*/
-#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL
+#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#endif
/**
@@ -2712,91 +3563,163 @@ void *vmalloc_32_user(unsigned long size)
EXPORT_SYMBOL(vmalloc_32_user);
/*
- * small helper routine , copy contents to buf from addr.
- * If the page is not present, fill zero.
+ * Atomically zero bytes in the iterator.
+ *
+ * Returns the number of zeroed bytes.
*/
+static size_t zero_iter(struct iov_iter *iter, size_t count)
+{
+ size_t remains = count;
+
+ while (remains > 0) {
+ size_t num, copied;
+
+ num = min_t(size_t, remains, PAGE_SIZE);
+ copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter);
+ remains -= copied;
+
+ if (copied < num)
+ break;
+ }
+
+ return count - remains;
+}
-static int aligned_vread(char *buf, char *addr, unsigned long count)
+/*
+ * Small helper routine: copy contents from addr to the iterator.
+ * If a page is not present, fill with zeroes.
+ *
+ * Returns the number of copied bytes.
+ */
+static size_t aligned_vread_iter(struct iov_iter *iter,
+ const char *addr, size_t count)
{
- struct page *p;
- int copied = 0;
+ size_t remains = count;
+ struct page *page;
- while (count) {
+ while (remains > 0) {
unsigned long offset, length;
+ size_t copied = 0;
offset = offset_in_page(addr);
length = PAGE_SIZE - offset;
- if (length > count)
- length = count;
- p = vmalloc_to_page(addr);
+ if (length > remains)
+ length = remains;
+ page = vmalloc_to_page(addr);
/*
- * To do safe access to this _mapped_ area, we need
- * lock. But adding lock here means that we need to add
- * overhead of vmalloc()/vfree() calles for this _debug_
- * interface, rarely used. Instead of that, we'll use
- * kmap() and get small overhead in this access function.
+ * To do safe access to this _mapped_ area, we need a lock. But
+ * adding a lock here means adding the overhead of
+ * vmalloc()/vfree() calls to this rarely used _debug_
+ * interface. Instead of that, we'll use a local mapping via
+ * copy_page_to_iter_nofault() and accept a small overhead in
+ * this access function.
*/
- if (p) {
- /*
- * we can expect USER0 is not used (see vread/vwrite's
- * function description)
- */
- void *map = kmap_atomic(p);
- memcpy(buf, map + offset, length);
- kunmap_atomic(map);
- } else
- memset(buf, 0, length);
+ if (page)
+ copied = copy_page_to_iter_nofault(page, offset,
+ length, iter);
+ else
+ copied = zero_iter(iter, length);
- addr += length;
- buf += length;
- copied += length;
- count -= length;
+ addr += copied;
+ remains -= copied;
+
+ if (copied != length)
+ break;
}
- return copied;
+
+ return count - remains;
}
-static int aligned_vwrite(char *buf, char *addr, unsigned long count)
+/*
+ * Read from a vm_map_ram region of memory.
+ *
+ * Returns the number of copied bytes.
+ */
+static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
+ size_t count, unsigned long flags)
{
- struct page *p;
- int copied = 0;
+ char *start;
+ struct vmap_block *vb;
+ struct xarray *xa;
+ unsigned long offset;
+ unsigned int rs, re;
+ size_t remains, n;
- while (count) {
- unsigned long offset, length;
+ /*
+ * If the area was created by the vm_map_ram() interface directly,
+ * without further subdividing it and delegating management to
+ * vmap_block, handle it here.
+ */
+ if (!(flags & VMAP_BLOCK))
+ return aligned_vread_iter(iter, addr, count);
- offset = offset_in_page(addr);
- length = PAGE_SIZE - offset;
- if (length > count)
- length = count;
- p = vmalloc_to_page(addr);
- /*
- * To do safe access to this _mapped_ area, we need
- * lock. But adding lock here means that we need to add
- * overhead of vmalloc()/vfree() calles for this _debug_
- * interface, rarely used. Instead of that, we'll use
- * kmap() and get small overhead in this access function.
- */
- if (p) {
- /*
- * we can expect USER0 is not used (see vread/vwrite's
- * function description)
- */
- void *map = kmap_atomic(p);
- memcpy(map + offset, buf, length);
- kunmap_atomic(map);
+ remains = count;
+
+ /*
+ * The area is split into regions and tracked with vmap_block; read
+ * out each region and zero-fill the holes between regions.
+ */
+ xa = addr_to_vb_xa((unsigned long) addr);
+ vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr));
+ if (!vb)
+ goto finished_zero;
+
+ spin_lock(&vb->lock);
+ if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
+ spin_unlock(&vb->lock);
+ goto finished_zero;
+ }
+
+ for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
+ size_t copied;
+
+ if (remains == 0)
+ goto finished;
+
+ start = vmap_block_vaddr(vb->va->va_start, rs);
+
+ if (addr < start) {
+ size_t to_zero = min_t(size_t, start - addr, remains);
+ size_t zeroed = zero_iter(iter, to_zero);
+
+ addr += zeroed;
+ remains -= zeroed;
+
+ if (remains == 0 || zeroed != to_zero)
+ goto finished;
}
- addr += length;
- buf += length;
- copied += length;
- count -= length;
+
+ /* It could start reading from the middle of a used region. */
+ offset = offset_in_page(addr);
+ n = ((re - rs + 1) << PAGE_SHIFT) - offset;
+ if (n > remains)
+ n = remains;
+
+ copied = aligned_vread_iter(iter, start + offset, n);
+
+ addr += copied;
+ remains -= copied;
+
+ if (copied != n)
+ goto finished;
}
- return copied;
+
+ spin_unlock(&vb->lock);
+
+finished_zero:
+ /* zero-fill the remaining dirty or free regions */
+ return count - remains + zero_iter(iter, remains);
+finished:
+ /* We couldn't copy/zero everything */
+ spin_unlock(&vb->lock);
+ return count - remains;
}
/**
- * vread() - read vmalloc area in a safe way.
- * @buf: buffer for reading data
- * @addr: vm address.
- * @count: number of bytes to be read.
+ * vread_iter() - read vmalloc area in a safe way to an iterator.
+ * @iter: the iterator to which data should be written.
+ * @addr: vm address.
+ * @count: number of bytes to be read.
*
* This function checks that addr is a valid vmalloc'ed area, and
* copy data from that area to a given buffer. If the given memory range
@@ -2810,139 +3733,103 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
* Note: In usual ops, vread() is never necessary because the caller
* should know vmalloc() area is valid and can use memcpy().
* This is for routines which have to access vmalloc area without
- * any information, as /dev/kmem.
+ * any information, such as /proc/kcore.
*
* Return: number of bytes for which addr and buf should be increased
* (same number as @count) or %0 if [addr...addr+count) doesn't
* include any intersection with valid vmalloc area
*/
-long vread(char *buf, char *addr, unsigned long count)
+long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
{
struct vmap_area *va;
struct vm_struct *vm;
- char *vaddr, *buf_start = buf;
- unsigned long buflen = count;
- unsigned long n;
+ char *vaddr;
+ size_t n, size, flags, remains;
+
+ addr = kasan_reset_tag(addr);
/* Don't allow overflow */
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
+ remains = count;
+
spin_lock(&vmap_area_lock);
- list_for_each_entry(va, &vmap_area_list, list) {
- if (!count)
- break;
+ va = find_vmap_area_exceed_addr((unsigned long)addr);
+ if (!va)
+ goto finished_zero;
- if (!va->vm)
- continue;
+ /* no intersects with alive vmap_area */
+ if ((unsigned long)addr + remains <= va->va_start)
+ goto finished_zero;
+
+ list_for_each_entry_from(va, &vmap_area_list, list) {
+ size_t copied;
+
+ if (remains == 0)
+ goto finished;
vm = va->vm;
- vaddr = (char *) vm->addr;
- if (addr >= vaddr + get_vm_area_size(vm))
+ flags = va->flags & VMAP_FLAGS_MASK;
+ /*
+ * VMAP_BLOCK indicates a sub-type of vm_map_ram area; it needs
+ * to be set together with VMAP_RAM.
+ */
+ WARN_ON(flags == VMAP_BLOCK);
+
+ if (!vm && !flags)
continue;
- while (addr < vaddr) {
- if (count == 0)
- goto finished;
- *buf = '\0';
- buf++;
- addr++;
- count--;
- }
- n = vaddr + get_vm_area_size(vm) - addr;
- if (n > count)
- n = count;
- if (!(vm->flags & VM_IOREMAP))
- aligned_vread(buf, addr, n);
- else /* IOREMAP area is treated as memory hole */
- memset(buf, 0, n);
- buf += n;
- addr += n;
- count -= n;
- }
-finished:
- spin_unlock(&vmap_area_lock);
- if (buf == buf_start)
- return 0;
- /* zero-fill memory holes */
- if (buf != buf_start + buflen)
- memset(buf, 0, buflen - (buf - buf_start));
+ if (vm && (vm->flags & VM_UNINITIALIZED))
+ continue;
- return buflen;
-}
+ /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+ smp_rmb();
-/**
- * vwrite() - write vmalloc area in a safe way.
- * @buf: buffer for source data
- * @addr: vm address.
- * @count: number of bytes to be read.
- *
- * This function checks that addr is a valid vmalloc'ed area, and
- * copy data from a buffer to the given addr. If specified range of
- * [addr...addr+count) includes some valid address, data is copied from
- * proper area of @buf. If there are memory holes, no copy to hole.
- * IOREMAP area is treated as memory hole and no copy is done.
- *
- * If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0. @buf should be kernel's buffer.
- *
- * Note: In usual ops, vwrite() is never necessary because the caller
- * should know vmalloc() area is valid and can use memcpy().
- * This is for routines which have to access vmalloc area without
- * any information, as /dev/kmem.
- *
- * Return: number of bytes for which addr and buf should be
- * increased (same number as @count) or %0 if [addr...addr+count)
- * doesn't include any intersection with valid vmalloc area
- */
-long vwrite(char *buf, char *addr, unsigned long count)
-{
- struct vmap_area *va;
- struct vm_struct *vm;
- char *vaddr;
- unsigned long n, buflen;
- int copied = 0;
+ vaddr = (char *) va->va_start;
+ size = vm ? get_vm_area_size(vm) : va_size(va);
- /* Don't allow overflow */
- if ((unsigned long) addr + count < count)
- count = -(unsigned long) addr;
- buflen = count;
+ if (addr >= vaddr + size)
+ continue;
- spin_lock(&vmap_area_lock);
- list_for_each_entry(va, &vmap_area_list, list) {
- if (!count)
- break;
+ if (addr < vaddr) {
+ size_t to_zero = min_t(size_t, vaddr - addr, remains);
+ size_t zeroed = zero_iter(iter, to_zero);
- if (!va->vm)
- continue;
+ addr += zeroed;
+ remains -= zeroed;
- vm = va->vm;
- vaddr = (char *) vm->addr;
- if (addr >= vaddr + get_vm_area_size(vm))
- continue;
- while (addr < vaddr) {
- if (count == 0)
+ if (remains == 0 || zeroed != to_zero)
goto finished;
- buf++;
- addr++;
- count--;
- }
- n = vaddr + get_vm_area_size(vm) - addr;
- if (n > count)
- n = count;
- if (!(vm->flags & VM_IOREMAP)) {
- aligned_vwrite(buf, addr, n);
- copied++;
}
- buf += n;
- addr += n;
- count -= n;
+
+ n = vaddr + size - addr;
+ if (n > remains)
+ n = remains;
+
+ if (flags & VMAP_RAM)
+ copied = vmap_ram_vread_iter(iter, addr, n, flags);
+ else if (!(vm->flags & VM_IOREMAP))
+ copied = aligned_vread_iter(iter, addr, n);
+ else /* IOREMAP area is treated as memory hole */
+ copied = zero_iter(iter, n);
+
+ addr += copied;
+ remains -= copied;
+
+ if (copied != n)
+ goto finished;
}
+
+finished_zero:
+ spin_unlock(&vmap_area_lock);
+ /* zero-fill memory holes */
+ return count - remains + zero_iter(iter, remains);
finished:
+ /* Nothing remains, or we couldn't copy/zero everything. */
spin_unlock(&vmap_area_lock);
- if (!copied)
- return 0;
- return buflen;
+
+ return count - remains;
}
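For context, a minimal sketch of how a /proc/kcore-style caller might drive vread_iter() through an iov_iter built over a kernel buffer. It is illustrative only and not part of this patch; the helper name is hypothetical, and the declarations are assumed to come from <linux/uio.h> and <linux/vmalloc.h>.

#include <linux/uio.h>
#include <linux/vmalloc.h>

/* Hypothetical helper: copy a vmalloc range into a kernel buffer. */
static long read_vmalloc_range(void *dst, const char *src, size_t len)
{
	struct kvec kvec = { .iov_base = dst, .iov_len = len };
	struct iov_iter iter;

	iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, len);

	/*
	 * Holes and IOREMAP areas come back zero-filled; 0 is returned
	 * only if [src, src + len) has no intersection with a live
	 * vmalloc area.
	 */
	return vread_iter(&iter, src, len);
}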
/**
@@ -3003,11 +3890,10 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
size -= PAGE_SIZE;
} while (size > 0);
- vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+ vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP);
return 0;
}
-EXPORT_SYMBOL(remap_vmalloc_range_partial);
/**
* remap_vmalloc_range - map vmalloc pages to userspace
@@ -3032,54 +3918,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);
-static int f(pte_t *pte, unsigned long addr, void *data)
-{
- pte_t ***p = data;
-
- if (p) {
- *(*p) = pte;
- (*p)++;
- }
- return 0;
-}
-
-/**
- * alloc_vm_area - allocate a range of kernel address space
- * @size: size of the area
- * @ptes: returns the PTEs for the address space
- *
- * Returns: NULL on failure, vm_struct on success
- *
- * This function reserves a range of kernel address space, and
- * allocates pagetables to map that range. No actual mappings
- * are created.
- *
- * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
- * allocated for the VM area are returned.
- */
-struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
-{
- struct vm_struct *area;
-
- area = get_vm_area_caller(size, VM_IOREMAP,
- __builtin_return_address(0));
- if (area == NULL)
- return NULL;
-
- /*
- * This ensures that page tables are constructed for this region
- * of kernel virtual address space and mapped into init_mm.
- */
- if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
- size, f, ptes ? &ptes : NULL)) {
- free_vm_area(area);
- return NULL;
- }
-
- return area;
-}
-EXPORT_SYMBOL_GPL(alloc_vm_area);
-
void free_vm_area(struct vm_struct *area)
{
struct vm_struct *ret;
@@ -3135,6 +3973,7 @@ pvm_find_va_enclose_addr(unsigned long addr)
* @va:
* in - the VA we start the search(reverse order);
* out - the VA with the highest aligned end address.
+ * @align: alignment for required highest address
*
* Returns: determined end address within vmap_area
*/
@@ -3191,7 +4030,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
int area, area2, last_area, term_area;
unsigned long base, start, size, end, last_end, orig_start, orig_end;
bool purged = false;
- enum fit_type type;
/* verify parameters and allocate data structures */
BUG_ON(offset_in_page(align) || !is_power_of_2(align));
@@ -3302,15 +4140,13 @@ retry:
/* It is a BUG(), but trigger recovery instead. */
goto recovery;
- type = classify_va_fit_type(va, start, size);
- if (WARN_ON_ONCE(type == NOTHING_FIT))
+ ret = adjust_va_to_fit_type(&free_vmap_area_root,
+ &free_vmap_area_list,
+ va, start, size);
+ if (WARN_ON_ONCE(unlikely(ret)))
/* It is a BUG(), but trigger recovery instead. */
goto recovery;
- ret = adjust_va_to_fit_type(va, start, size, type);
- if (unlikely(ret))
- goto recovery;
-
/* Allocated area. */
va = vas[area];
va->va_start = start;
@@ -3323,9 +4159,6 @@ retry:
for (area = 0; area < nr_vms; area++) {
if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
goto err_free_shadow;
-
- kasan_unpoison_vmalloc((void *)vas[area]->va_start,
- sizes[area]);
}
/* insert all vm's */
@@ -3338,6 +4171,16 @@ retry:
}
spin_unlock(&vmap_area_lock);
+ /*
+ * Mark allocated areas as accessible. Do it now as a best-effort
+ * approach, as they can be mapped outside of vmalloc code.
+ * With hardware tag-based KASAN, marking is skipped for
+ * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+ */
+ for (area = 0; area < nr_vms; area++)
+ vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
+ vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);
+
kfree(vas);
return vms;
@@ -3351,8 +4194,8 @@ recovery:
while (area--) {
orig_start = vas[area]->va_start;
orig_end = vas[area]->va_end;
- va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
- &free_vmap_area_list);
+ va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
+ &free_vmap_area_list);
if (va)
kasan_release_vmalloc(orig_start, orig_end,
va->va_start, va->va_end);
@@ -3362,7 +4205,7 @@ recovery:
overflow:
spin_unlock(&free_vmap_area_lock);
if (!purged) {
- purge_vmap_area_lazy();
+ reclaim_and_purge_vmap_areas();
purged = true;
/* Before "retry", check if we recover. */
@@ -3401,8 +4244,8 @@ err_free_shadow:
for (area = 0; area < nr_vms; area++) {
orig_start = vas[area]->va_start;
orig_end = vas[area]->va_end;
- va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
- &free_vmap_area_list);
+ va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
+ &free_vmap_area_list);
if (va)
kasan_release_vmalloc(orig_start, orig_end,
va->va_start, va->va_end);
@@ -3432,6 +4275,39 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
}
#endif /* CONFIG_SMP */
+#ifdef CONFIG_PRINTK
+bool vmalloc_dump_obj(void *object)
+{
+ void *objp = (void *)PAGE_ALIGN((unsigned long)object);
+ const void *caller;
+ struct vm_struct *vm;
+ struct vmap_area *va;
+ unsigned long addr;
+ unsigned int nr_pages;
+
+ if (!spin_trylock(&vmap_area_lock))
+ return false;
+ va = __find_vmap_area((unsigned long)objp, &vmap_area_root);
+ if (!va) {
+ spin_unlock(&vmap_area_lock);
+ return false;
+ }
+
+ vm = va->vm;
+ if (!vm) {
+ spin_unlock(&vmap_area_lock);
+ return false;
+ }
+ addr = (unsigned long)vm->addr;
+ caller = vm->caller;
+ nr_pages = vm->nr_pages;
+ spin_unlock(&vmap_area_lock);
+ pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
+ nr_pages, addr, caller);
+ return true;
+}
+#endif
+
#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
__acquires(&vmap_purge_lock)
@@ -3449,17 +4325,18 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
}
static void s_stop(struct seq_file *m, void *p)
- __releases(&vmap_purge_lock)
__releases(&vmap_area_lock)
+ __releases(&vmap_purge_lock)
{
- mutex_unlock(&vmap_purge_lock);
spin_unlock(&vmap_area_lock);
+ mutex_unlock(&vmap_purge_lock);
}
static void show_numa_info(struct seq_file *m, struct vm_struct *v)
{
if (IS_ENABLED(CONFIG_NUMA)) {
unsigned int nr, *counters = m->private;
+ unsigned int step = 1U << vm_area_page_order(v);
if (!counters)
return;
@@ -3471,9 +4348,8 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
memset(counters, 0, nr_node_ids * sizeof(unsigned int));
- for (nr = 0; nr < v->nr_pages; nr++)
- counters[page_to_nid(v->pages[nr])]++;
-
+ for (nr = 0; nr < v->nr_pages; nr += step)
+ counters[page_to_nid(v->pages[nr])] += step;
for_each_node_state(nr, N_HIGH_MEMORY)
if (counters[nr])
seq_printf(m, " N%u=%u", nr, counters[nr]);
@@ -3482,18 +4358,15 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
static void show_purge_info(struct seq_file *m)
{
- struct llist_node *head;
struct vmap_area *va;
- head = READ_ONCE(vmap_purge_list.first);
- if (head == NULL)
- return;
-
- llist_for_each_entry(va, head, purge_list) {
+ spin_lock(&purge_vmap_area_lock);
+ list_for_each_entry(va, &purge_vmap_area_list, list) {
seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
(void *)va->va_start, (void *)va->va_end,
va->va_end - va->va_start);
}
+ spin_unlock(&purge_vmap_area_lock);
}
static int s_show(struct seq_file *m, void *p)
@@ -3503,16 +4376,13 @@ static int s_show(struct seq_file *m, void *p)
va = list_entry(p, struct vmap_area, list);
- /*
- * s_show can encounter race with remove_vm_area, !vm on behalf
- * of vmap area is being tear down or vm_map_ram allocation.
- */
if (!va->vm) {
- seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
- (void *)va->va_start, (void *)va->va_end,
- va->va_end - va->va_start);
+ if (va->flags & VMAP_RAM)
+ seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
+ (void *)va->va_start, (void *)va->va_end,
+ va->va_end - va->va_start);
- return 0;
+ goto final;
}
v = va->vm;
@@ -3551,11 +4421,9 @@ static int s_show(struct seq_file *m, void *p)
seq_putc(m, '\n');
/*
- * As a final step, dump "unpurged" areas. Note,
- * that entire "/proc/vmallocinfo" output will not
- * be address sorted, because the purge list is not
- * sorted.
+ * As a final step, dump "unpurged" areas.
*/
+final:
if (list_is_last(&va->list, &vmap_area_list))
show_purge_info(m);
@@ -3582,3 +4450,46 @@ static int __init proc_vmalloc_init(void)
module_init(proc_vmalloc_init);
#endif
+
+void __init vmalloc_init(void)
+{
+ struct vmap_area *va;
+ struct vm_struct *tmp;
+ int i;
+
+ /*
+ * Create the cache for vmap_area objects.
+ */
+ vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
+
+ for_each_possible_cpu(i) {
+ struct vmap_block_queue *vbq;
+ struct vfree_deferred *p;
+
+ vbq = &per_cpu(vmap_block_queue, i);
+ spin_lock_init(&vbq->lock);
+ INIT_LIST_HEAD(&vbq->free);
+ p = &per_cpu(vfree_deferred, i);
+ init_llist_head(&p->list);
+ INIT_WORK(&p->wq, delayed_vfree_work);
+ xa_init(&vbq->vmap_blocks);
+ }
+
+ /* Import existing vmlist entries. */
+ for (tmp = vmlist; tmp; tmp = tmp->next) {
+ va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+ if (WARN_ON_ONCE(!va))
+ continue;
+
+ va->va_start = (unsigned long)tmp->addr;
+ va->va_end = va->va_start + tmp->size;
+ va->vm = tmp;
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ }
+
+ /*
+ * Now we can initialize the free vmap space.
+ */
+ vmap_init_free_space();
+ vmap_initialized = true;
+}
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index d69019fc3789..22c6689d9302 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -74,8 +74,7 @@ static struct vmpressure *work_to_vmpressure(struct work_struct *work)
static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
{
- struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = vmpressure_to_memcg(vmpr);
memcg = parent_mem_cgroup(memcg);
if (!memcg)
@@ -240,7 +239,20 @@ static void vmpressure_work_fn(struct work_struct *work)
void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed)
{
- struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+ struct vmpressure *vmpr;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ /*
+ * The in-kernel users only care about the reclaim efficiency
+ * for this @memcg rather than the whole subtree, and there
+ * isn't and won't be any in-kernel user in a legacy cgroup.
+ */
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !tree)
+ return;
+
+ vmpr = memcg_to_vmpressure(memcg);
/*
* Here we only want to account pressure that userland is able to
@@ -304,7 +316,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
* asserted for a second in which subsequent
* pressure events can occur.
*/
- memcg->socket_pressure = jiffies + HZ;
+ WRITE_ONCE(memcg->socket_pressure, jiffies + HZ);
}
}
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 879fb57c5045..da152407bc2b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * linux/mm/vmscan.c
- *
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* Swap reorganised 29.12.95, Stephen Tweedie.
@@ -28,8 +26,7 @@
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
-#include <linux/buffer_head.h> /* for try_to_release_page(),
- buffer_heads_over_limit */
+#include <linux/buffer_head.h> /* for buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
@@ -43,22 +40,33 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
+#include <linux/migrate.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
+#include <linux/memory-tiers.h>
#include <linux/oom.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>
#include <linux/psi.h>
+#include <linux/pagewalk.h>
+#include <linux/shmem_fs.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/khugepaged.h>
+#include <linux/rculist_nulls.h>
+#include <linux/random.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include <linux/swapops.h>
#include <linux/balloon_compaction.h>
+#include <linux/sched/sysctl.h>
#include "internal.h"
+#include "swap.h"
#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>
@@ -85,7 +93,7 @@ struct scan_control {
unsigned long anon_cost;
unsigned long file_cost;
- /* Can active pages be deactivated as part of reclaim? */
+ /* Can active folios be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2
unsigned int may_deactivate:2;
@@ -95,16 +103,22 @@ struct scan_control {
/* Writepage batching in laptop mode; RECLAIM_WRITE */
unsigned int may_writepage:1;
- /* Can mapped pages be reclaimed? */
+ /* Can mapped folios be reclaimed? */
unsigned int may_unmap:1;
- /* Can pages be swapped as part of reclaim? */
+ /* Can folios be swapped as part of reclaim? */
unsigned int may_swap:1;
+ /* Proactive reclaim invoked by userspace through memory.reclaim */
+ unsigned int proactive:1;
+
/*
- * Cgroups are not reclaimed below their configured memory.low,
- * unless we threaten to OOM. If any cgroups are skipped due to
- * memory.low and nothing was reclaimed, go back for memory.low.
+ * Cgroup memory below memory.low is protected as long as we
+ * don't threaten to OOM. If any cgroup is reclaimed at
+ * reduced force or passed over entirely due to its memory.low
+ * setting (memcg_low_skipped), and nothing is reclaimed as a
+ * result, then go back for one more cycle that reclaims the protected
+ * memory (memcg_low_reclaim) to avert OOM.
*/
unsigned int memcg_low_reclaim:1;
unsigned int memcg_low_skipped:1;
@@ -117,16 +131,19 @@ struct scan_control {
/* There is easily reclaimable cold cache in the current node */
unsigned int cache_trim_mode:1;
- /* The file pages on the current node are dangerously low */
+ /* The file folios on the current node are dangerously low */
unsigned int file_is_tiny:1;
+ /* Always discard instead of demoting to lower tier memory */
+ unsigned int no_demotion:1;
+
/* Allocation order */
s8 order;
/* Scan (total_size >> priority) pages at once */
s8 priority;
- /* The highest zone to isolate pages for reclaim from */
+ /* The highest zone to isolate folios for reclaim from */
s8 reclaim_idx;
/* This context's GFP mask */
@@ -153,17 +170,17 @@ struct scan_control {
};
#ifdef ARCH_HAS_PREFETCHW
-#define prefetchw_prev_lru_page(_page, _base, _field) \
+#define prefetchw_prev_lru_folio(_folio, _base, _field) \
do { \
- if ((_page)->lru.prev != _base) { \
- struct page *prev; \
+ if ((_folio)->lru.prev != _base) { \
+ struct folio *prev; \
\
- prev = lru_to_page(&(_page->lru)); \
+ prev = lru_to_folio(&(_folio->lru)); \
prefetchw(&prev->_field); \
} \
} while (0)
#else
-#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
+#define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0)
#endif
/*
@@ -171,55 +188,186 @@ struct scan_control {
*/
int vm_swappiness = 60;
-static void set_task_reclaim_state(struct task_struct *task,
- struct reclaim_state *rs)
+LIST_HEAD(shrinker_list);
+DECLARE_RWSEM(shrinker_rwsem);
+
+#ifdef CONFIG_MEMCG
+static int shrinker_nr_max;
+
+/* The shrinker_info is expanded in batches of BITS_PER_LONG */
+static inline int shrinker_map_size(int nr_items)
{
- /* Check for an overwrite */
- WARN_ON_ONCE(rs && task->reclaim_state);
+ return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
+}
- /* Check for the nulling of an already-nulled member */
- WARN_ON_ONCE(!rs && !task->reclaim_state);
+static inline int shrinker_defer_size(int nr_items)
+{
+ return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
+}
- task->reclaim_state = rs;
+static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
+ int nid)
+{
+ return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
+ lockdep_is_held(&shrinker_rwsem));
}
-static LIST_HEAD(shrinker_list);
-static DECLARE_RWSEM(shrinker_rwsem);
+static int expand_one_shrinker_info(struct mem_cgroup *memcg,
+ int map_size, int defer_size,
+ int old_map_size, int old_defer_size,
+ int new_nr_max)
+{
+ struct shrinker_info *new, *old;
+ struct mem_cgroup_per_node *pn;
+ int nid;
+ int size = map_size + defer_size;
-#ifdef CONFIG_MEMCG
-/*
- * We allow subsystems to populate their shrinker-related
- * LRU lists before register_shrinker_prepared() is called
- * for the shrinker, since we don't want to impose
- * restrictions on their internal registration order.
- * In this case shrink_slab_memcg() may find corresponding
- * bit is set in the shrinkers map.
- *
- * This value is used by the function to detect registering
- * shrinkers and to skip do_shrink_slab() calls for them.
- */
-#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
+ for_each_node(nid) {
+ pn = memcg->nodeinfo[nid];
+ old = shrinker_info_protected(memcg, nid);
+ /* Not yet online memcg */
+ if (!old)
+ return 0;
+
+ /* Already expanded this shrinker_info */
+ if (new_nr_max <= old->map_nr_max)
+ continue;
+
+ new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
+ if (!new)
+ return -ENOMEM;
+
+ new->nr_deferred = (atomic_long_t *)(new + 1);
+ new->map = (void *)new->nr_deferred + defer_size;
+ new->map_nr_max = new_nr_max;
+
+ /* map: set all old bits, clear all new bits */
+ memset(new->map, (int)0xff, old_map_size);
+ memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
+ /* nr_deferred: copy old values, clear all new values */
+ memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
+ memset((void *)new->nr_deferred + old_defer_size, 0,
+ defer_size - old_defer_size);
+
+ rcu_assign_pointer(pn->shrinker_info, new);
+ kvfree_rcu(old, rcu);
+ }
+
+ return 0;
+}
+
+void free_shrinker_info(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup_per_node *pn;
+ struct shrinker_info *info;
+ int nid;
+
+ for_each_node(nid) {
+ pn = memcg->nodeinfo[nid];
+ info = rcu_dereference_protected(pn->shrinker_info, true);
+ kvfree(info);
+ rcu_assign_pointer(pn->shrinker_info, NULL);
+ }
+}
+
+int alloc_shrinker_info(struct mem_cgroup *memcg)
+{
+ struct shrinker_info *info;
+ int nid, size, ret = 0;
+ int map_size, defer_size = 0;
+
+ down_write(&shrinker_rwsem);
+ map_size = shrinker_map_size(shrinker_nr_max);
+ defer_size = shrinker_defer_size(shrinker_nr_max);
+ size = map_size + defer_size;
+ for_each_node(nid) {
+ info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
+ if (!info) {
+ free_shrinker_info(memcg);
+ ret = -ENOMEM;
+ break;
+ }
+ info->nr_deferred = (atomic_long_t *)(info + 1);
+ info->map = (void *)info->nr_deferred + defer_size;
+ info->map_nr_max = shrinker_nr_max;
+ rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
+ }
+ up_write(&shrinker_rwsem);
+
+ return ret;
+}
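As an aside, the per-node allocation made above packs the deferred counters and the bitmap behind the struct in a single block; a layout sketch, assuming BITS_PER_LONG == 64 and shrinker_nr_max == 128 (both values illustrative):

/*
 * Layout of one per-node shrinker_info allocation (illustrative,
 * BITS_PER_LONG == 64, shrinker_nr_max == 128):
 *
 *   struct shrinker_info           header (map, nr_deferred, map_nr_max)
 *   atomic_long_t nr_deferred[128] defer_size = 128 * 8 = 1024 bytes
 *   unsigned long map[2]           map_size   =   2 * 8 =   16 bytes
 *
 * info->nr_deferred points just past the struct; info->map sits
 * defer_size bytes after nr_deferred.
 */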
+
+static int expand_shrinker_info(int new_id)
+{
+ int ret = 0;
+ int new_nr_max = round_up(new_id + 1, BITS_PER_LONG);
+ int map_size, defer_size = 0;
+ int old_map_size, old_defer_size = 0;
+ struct mem_cgroup *memcg;
+
+ if (!root_mem_cgroup)
+ goto out;
+
+ lockdep_assert_held(&shrinker_rwsem);
+
+ map_size = shrinker_map_size(new_nr_max);
+ defer_size = shrinker_defer_size(new_nr_max);
+ old_map_size = shrinker_map_size(shrinker_nr_max);
+ old_defer_size = shrinker_defer_size(shrinker_nr_max);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ ret = expand_one_shrinker_info(memcg, map_size, defer_size,
+ old_map_size, old_defer_size,
+ new_nr_max);
+ if (ret) {
+ mem_cgroup_iter_break(NULL, memcg);
+ goto out;
+ }
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+out:
+ if (!ret)
+ shrinker_nr_max = new_nr_max;
+
+ return ret;
+}
+
+void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
+{
+ if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
+ struct shrinker_info *info;
+
+ rcu_read_lock();
+ info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+ if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
+ /* Pairs with smp mb in shrink_slab() */
+ smp_mb__before_atomic();
+ set_bit(shrinker_id, info->map);
+ }
+ rcu_read_unlock();
+ }
+}
static DEFINE_IDR(shrinker_idr);
-static int shrinker_nr_max;
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
int id, ret = -ENOMEM;
+ if (mem_cgroup_disabled())
+ return -ENOSYS;
+
down_write(&shrinker_rwsem);
/* This may call shrinker, so it must use down_read_trylock() */
- id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
+ id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
if (id < 0)
goto unlock;
if (id >= shrinker_nr_max) {
- if (memcg_expand_shrinker_maps(id)) {
+ if (expand_shrinker_info(id)) {
idr_remove(&shrinker_idr, id);
goto unlock;
}
-
- shrinker_nr_max = id + 1;
}
shrinker->id = id;
ret = 0;
@@ -234,23 +382,75 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
BUG_ON(id < 0);
- down_write(&shrinker_rwsem);
+ lockdep_assert_held(&shrinker_rwsem);
+
idr_remove(&shrinker_idr, id);
- up_write(&shrinker_rwsem);
}
+static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
+ struct mem_cgroup *memcg)
+{
+ struct shrinker_info *info;
+
+ info = shrinker_info_protected(memcg, nid);
+ return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
+}
+
+static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
+ struct mem_cgroup *memcg)
+{
+ struct shrinker_info *info;
+
+ info = shrinker_info_protected(memcg, nid);
+ return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
+}
+
+void reparent_shrinker_deferred(struct mem_cgroup *memcg)
+{
+ int i, nid;
+ long nr;
+ struct mem_cgroup *parent;
+ struct shrinker_info *child_info, *parent_info;
+
+ parent = parent_mem_cgroup(memcg);
+ if (!parent)
+ parent = root_mem_cgroup;
+
+ /* Prevent concurrent shrinker_info expansion */
+ down_read(&shrinker_rwsem);
+ for_each_node(nid) {
+ child_info = shrinker_info_protected(memcg, nid);
+ parent_info = shrinker_info_protected(parent, nid);
+ for (i = 0; i < child_info->map_nr_max; i++) {
+ nr = atomic_long_read(&child_info->nr_deferred[i]);
+ atomic_long_add(nr, &parent_info->nr_deferred[i]);
+ }
+ }
+ up_read(&shrinker_rwsem);
+}
+
+/* Returns true for reclaim through cgroup limits or cgroup interfaces. */
static bool cgroup_reclaim(struct scan_control *sc)
{
return sc->target_mem_cgroup;
}
+/*
+ * Returns true for reclaim on the root cgroup. This is true for direct
+ * allocator reclaim and reclaim through cgroup interfaces on the root cgroup.
+ */
+static bool root_reclaim(struct scan_control *sc)
+{
+ return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup);
+}
+
/**
* writeback_throttling_sane - is the usual dirty throttling mechanism available?
* @sc: scan_control in question
*
* The normal page dirty throttling mechanism in balance_dirty_pages() is
* completely broken with the legacy memcg and direct stalling in
- * shrink_page_list() is used for throttling instead, which lacks all the
+ * shrink_folio_list() is used for throttling instead, which lacks all the
* niceties such as fairness, adaptive pausing, bandwidth proportional
* allocation and configurability.
*
@@ -270,28 +470,167 @@ static bool writeback_throttling_sane(struct scan_control *sc)
#else
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
- return 0;
+ return -ENOSYS;
}
static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
}
+static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
+ struct mem_cgroup *memcg)
+{
+ return 0;
+}
+
+static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
+ struct mem_cgroup *memcg)
+{
+ return 0;
+}
+
static bool cgroup_reclaim(struct scan_control *sc)
{
return false;
}
+static bool root_reclaim(struct scan_control *sc)
+{
+ return true;
+}
+
static bool writeback_throttling_sane(struct scan_control *sc)
{
return true;
}
#endif
+static void set_task_reclaim_state(struct task_struct *task,
+ struct reclaim_state *rs)
+{
+ /* Check for an overwrite */
+ WARN_ON_ONCE(rs && task->reclaim_state);
+
+ /* Check for the nulling of an already-nulled member */
+ WARN_ON_ONCE(!rs && !task->reclaim_state);
+
+ task->reclaim_state = rs;
+}
+
+/*
+ * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
+ * scan_control->nr_reclaimed.
+ */
+static void flush_reclaim_state(struct scan_control *sc)
+{
+ /*
+ * Currently, reclaim_state->reclaimed includes three types of pages
+ * freed outside of vmscan:
+ * (1) Slab pages.
+ * (2) Clean file pages from pruned inodes (on highmem systems).
+ * (3) XFS freed buffer pages.
+ *
+ * For all of these cases, we cannot universally link the pages to a
+ * single memcg. For example, a memcg-aware shrinker can free one object
+ * charged to the target memcg, causing an entire page to be freed.
+ * If we count the entire page as reclaimed from the memcg, we end up
+ * overestimating the reclaimed amount (potentially under-reclaiming).
+ *
+ * Only count such pages for global reclaim to prevent under-reclaiming
+ * from the target memcg; preventing unnecessary retries during memcg
+ * charging and false positives from proactive reclaim.
+ *
+ * For uncommon cases where the freed pages were actually mostly
+ * charged to the target memcg, we end up underestimating the reclaimed
+ * amount. This should be fine. The freed pages will be uncharged
+ * anyway, even if they are not counted here properly, and we will be
+ * able to make forward progress in charging (which is usually in a
+ * retry loop).
+ *
+ * We can go one step further, and report the uncharged objcg pages in
+ * memcg reclaim, to make reporting more accurate and reduce
+ * underestimation, but it's probably not worth the complexity for now.
+ */
+ if (current->reclaim_state && root_reclaim(sc)) {
+ sc->nr_reclaimed += current->reclaim_state->reclaimed;
+ current->reclaim_state->reclaimed = 0;
+ }
+}
+
+static long xchg_nr_deferred(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ int nid = sc->nid;
+
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+ nid = 0;
+
+ if (sc->memcg &&
+ (shrinker->flags & SHRINKER_MEMCG_AWARE))
+ return xchg_nr_deferred_memcg(nid, shrinker,
+ sc->memcg);
+
+ return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+}
+
+
+static long add_nr_deferred(long nr, struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ int nid = sc->nid;
+
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+ nid = 0;
+
+ if (sc->memcg &&
+ (shrinker->flags & SHRINKER_MEMCG_AWARE))
+ return add_nr_deferred_memcg(nr, nid, shrinker,
+ sc->memcg);
+
+ return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
+}
+
+static bool can_demote(int nid, struct scan_control *sc)
+{
+ if (!numa_demotion_enabled)
+ return false;
+ if (sc && sc->no_demotion)
+ return false;
+ if (next_demotion_node(nid) == NUMA_NO_NODE)
+ return false;
+
+ return true;
+}
+
+static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
+ int nid,
+ struct scan_control *sc)
+{
+ if (memcg == NULL) {
+ /*
+ * For non-memcg reclaim, is there
+ * space in any swap device?
+ */
+ if (get_nr_swap_pages() > 0)
+ return true;
+ } else {
+ /* Is the memcg below its swap limit? */
+ if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
+ return true;
+ }
+
+ /*
+ * The page cannot be swapped.
+ *
+ * Can it be reclaimed from this node via demotion?
+ */
+ return can_demote(nid, sc);
+}
+
/*
- * This misses isolated pages which are not accounted for to save counters.
+ * This misses isolated folios which are not accounted for to save counters.
* As the data only determines if reclaim or compaction continues, it is
- * not expected that isolated pages will be a dominating factor.
+ * not expected that isolated folios will be a dominating factor.
*/
unsigned long zone_reclaimable_pages(struct zone *zone)
{
@@ -299,7 +638,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
- if (get_nr_swap_pages() > 0)
+ if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
@@ -310,14 +649,15 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
* lruvec_lru_size - Returns the number of pages on the given LRU list.
* @lruvec: lru vector
* @lru: lru to use
- * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
+ * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list)
*/
-unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
+static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
+ int zone_idx)
{
unsigned long size = 0;
int zid;
- for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) {
+ for (zid = 0; zid <= zone_idx; zid++) {
struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
if (!managed_zone(zone))
@@ -334,10 +674,20 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
/*
* Add a shrinker callback to be called from the vm.
*/
-int prealloc_shrinker(struct shrinker *shrinker)
+static int __prealloc_shrinker(struct shrinker *shrinker)
{
- unsigned int size = sizeof(*shrinker->nr_deferred);
+ unsigned int size;
+ int err;
+
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+ err = prealloc_memcg_shrinker(shrinker);
+ if (err != -ENOSYS)
+ return err;
+
+ shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
+ }
+ size = sizeof(*shrinker->nr_deferred);
if (shrinker->flags & SHRINKER_NUMA_AWARE)
size *= nr_node_ids;
@@ -345,26 +695,48 @@ int prealloc_shrinker(struct shrinker *shrinker)
if (!shrinker->nr_deferred)
return -ENOMEM;
- if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
- if (prealloc_memcg_shrinker(shrinker))
- goto free_deferred;
- }
-
return 0;
+}
-free_deferred:
- kfree(shrinker->nr_deferred);
- shrinker->nr_deferred = NULL;
- return -ENOMEM;
+#ifdef CONFIG_SHRINKER_DEBUG
+int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+ va_list ap;
+ int err;
+
+ va_start(ap, fmt);
+ shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
+ va_end(ap);
+ if (!shrinker->name)
+ return -ENOMEM;
+
+ err = __prealloc_shrinker(shrinker);
+ if (err) {
+ kfree_const(shrinker->name);
+ shrinker->name = NULL;
+ }
+
+ return err;
}
+#else
+int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+ return __prealloc_shrinker(shrinker);
+}
+#endif
void free_prealloced_shrinker(struct shrinker *shrinker)
{
- if (!shrinker->nr_deferred)
- return;
-
- if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+#ifdef CONFIG_SHRINKER_DEBUG
+ kfree_const(shrinker->name);
+ shrinker->name = NULL;
+#endif
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+ down_write(&shrinker_rwsem);
unregister_memcg_shrinker(shrinker);
+ up_write(&shrinker_rwsem);
+ return;
+ }
kfree(shrinker->nr_deferred);
shrinker->nr_deferred = NULL;
@@ -374,22 +746,46 @@ void register_shrinker_prepared(struct shrinker *shrinker)
{
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
-#ifdef CONFIG_MEMCG
- if (shrinker->flags & SHRINKER_MEMCG_AWARE)
- idr_replace(&shrinker_idr, shrinker, shrinker->id);
-#endif
+ shrinker->flags |= SHRINKER_REGISTERED;
+ shrinker_debugfs_add(shrinker);
up_write(&shrinker_rwsem);
}
-int register_shrinker(struct shrinker *shrinker)
+static int __register_shrinker(struct shrinker *shrinker)
{
- int err = prealloc_shrinker(shrinker);
+ int err = __prealloc_shrinker(shrinker);
if (err)
return err;
register_shrinker_prepared(shrinker);
return 0;
}
+
+#ifdef CONFIG_SHRINKER_DEBUG
+int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+ va_list ap;
+ int err;
+
+ va_start(ap, fmt);
+ shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
+ va_end(ap);
+ if (!shrinker->name)
+ return -ENOMEM;
+
+ err = __register_shrinker(shrinker);
+ if (err) {
+ kfree_const(shrinker->name);
+ shrinker->name = NULL;
+ }
+ return err;
+}
+#else
+int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+ return __register_shrinker(shrinker);
+}
+#endif
EXPORT_SYMBOL(register_shrinker);
/*
@@ -397,18 +793,42 @@ EXPORT_SYMBOL(register_shrinker);
*/
void unregister_shrinker(struct shrinker *shrinker)
{
- if (!shrinker->nr_deferred)
+ struct dentry *debugfs_entry;
+ int debugfs_id;
+
+ if (!(shrinker->flags & SHRINKER_REGISTERED))
return;
- if (shrinker->flags & SHRINKER_MEMCG_AWARE)
- unregister_memcg_shrinker(shrinker);
+
down_write(&shrinker_rwsem);
list_del(&shrinker->list);
+ shrinker->flags &= ~SHRINKER_REGISTERED;
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+ unregister_memcg_shrinker(shrinker);
+ debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
up_write(&shrinker_rwsem);
+
+ shrinker_debugfs_remove(debugfs_entry, debugfs_id);
+
kfree(shrinker->nr_deferred);
shrinker->nr_deferred = NULL;
}
EXPORT_SYMBOL(unregister_shrinker);
+/**
+ * synchronize_shrinkers - Wait for all running shrinkers to complete.
+ *
+ * This is equivalent to calling unregister_shrinker() and register_shrinker(),
+ * but atomically and with less overhead. This is useful to guarantee that all
+ * shrinker invocations have seen an update, before freeing memory, similar to
+ * rcu.
+ */
+void synchronize_shrinkers(void)
+{
+ down_write(&shrinker_rwsem);
+ up_write(&shrinker_rwsem);
+}
+EXPORT_SYMBOL(synchronize_shrinkers);
+
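A hedged usage sketch of the helper above; the teardown function and the two cache helpers it calls are hypothetical, only synchronize_shrinkers() is real.

/* Hypothetical teardown path for a driver-private object cache. */
static void my_cache_teardown(void)
{
	/* Make sure new shrinker scans can no longer find the objects. */
	my_cache_detach_objects();		/* hypothetical helper */

	/* Wait out any shrinker callback that is already running. */
	synchronize_shrinkers();

	/* Only now is it safe to free the backing memory. */
	my_cache_free_backing();		/* hypothetical helper */
}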
#define SHRINK_BATCH 128
static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
@@ -420,14 +840,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
long freeable;
long nr;
long new_nr;
- int nid = shrinkctl->nid;
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
long scanned = 0, next_deferred;
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
- nid = 0;
-
freeable = shrinker->count_objects(shrinker, shrinkctl);
if (freeable == 0 || freeable == SHRINK_EMPTY)
return freeable;
@@ -437,9 +853,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
* and zero it so that other concurrent shrinker invocations
* don't also do this scanning work.
*/
- nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+ nr = xchg_nr_deferred(shrinker, shrinkctl);
- total_scan = nr;
if (shrinker->seeks) {
delta = freeable >> priority;
delta *= 4;
@@ -453,37 +868,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
delta = freeable / 2;
}
+ total_scan = nr >> priority;
total_scan += delta;
- if (total_scan < 0) {
- pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
- shrinker->scan_objects, total_scan);
- total_scan = freeable;
- next_deferred = nr;
- } else
- next_deferred = total_scan;
-
- /*
- * We need to avoid excessive windup on filesystem shrinkers
- * due to large numbers of GFP_NOFS allocations causing the
- * shrinkers to return -1 all the time. This results in a large
- * nr being built up so when a shrink that can do some work
- * comes along it empties the entire cache due to nr >>>
- * freeable. This is bad for sustaining a working set in
- * memory.
- *
- * Hence only allow the shrinker to scan the entire cache when
- * a large delta change is calculated directly.
- */
- if (delta < freeable / 4)
- total_scan = min(total_scan, freeable / 2);
-
- /*
- * Avoid risking looping forever due to too large nr value:
- * never try to free more than twice the estimate number of
- * freeable entries.
- */
- if (total_scan > freeable * 2)
- total_scan = freeable * 2;
+ total_scan = min(total_scan, (2 * freeable));
trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
freeable, delta, total_scan, priority);
@@ -522,22 +909,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
cond_resched();
}
- if (next_deferred >= scanned)
- next_deferred -= scanned;
- else
- next_deferred = 0;
+ /*
+ * The deferred work is increased by any new work (delta) that wasn't
+ * done, decreased by old deferred work that was done now.
+ *
+ * And it is capped at twice the number of freeable items.
+ */
+ next_deferred = max_t(long, (nr + delta - scanned), 0);
+ next_deferred = min(next_deferred, (2 * freeable));
+
/*
* move the unused scan count back into the shrinker in a
- * manner that handles concurrent updates. If we exhausted the
- * scan, there is no need to do an update.
+ * manner that handles concurrent updates.
*/
- if (next_deferred > 0)
- new_nr = atomic_long_add_return(next_deferred,
- &shrinker->nr_deferred[nid]);
- else
- new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+ new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
- trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
+ trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
return freed;
}
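A worked example of the new deferred-work arithmetic; all numbers are illustrative.

/*
 * Illustrative numbers: freeable = 1000, priority = 2,
 * seeks = DEFAULT_SEEKS (2), previously deferred nr = 900, and the
 * shrinker scanned 300 objects this invocation:
 *
 *   delta         = 4 * (1000 >> 2) / 2      = 500
 *   total_scan    = (900 >> 2) + 500         = 725  (cap: 2 * 1000)
 *   next_deferred = max(900 + 500 - 300, 0)  = 1100 (cap: 2 * 1000)
 *
 * The 2 * freeable cap bounds the wind-up that repeated GFP_NOFS
 * invocations used to cause.
 */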
@@ -545,7 +932,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg, int priority)
{
- struct memcg_shrinker_map *map;
+ struct shrinker_info *info;
unsigned long ret, freed = 0;
int i;
@@ -555,12 +942,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
if (!down_read_trylock(&shrinker_rwsem))
return 0;
- map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
- true);
- if (unlikely(!map))
+ info = shrinker_info_protected(memcg, nid);
+ if (unlikely(!info))
goto unlock;
- for_each_set_bit(i, map->map, shrinker_nr_max) {
+ for_each_set_bit(i, info->map, info->map_nr_max) {
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
@@ -569,20 +955,20 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct shrinker *shrinker;
shrinker = idr_find(&shrinker_idr, i);
- if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
+ if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
if (!shrinker)
- clear_bit(i, map->map);
+ clear_bit(i, info->map);
continue;
}
/* Call non-slab shrinkers even though kmem is disabled */
- if (!memcg_kmem_enabled() &&
+ if (!memcg_kmem_online() &&
!(shrinker->flags & SHRINKER_NONSLAB))
continue;
ret = do_shrink_slab(&sc, shrinker, priority);
if (ret == SHRINK_EMPTY) {
- clear_bit(i, map->map);
+ clear_bit(i, info->map);
/*
* After the shrinker reported that it had no objects to
* free, but before we cleared the corresponding bit in
@@ -591,7 +977,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
* case, we invoke the shrinker one more time and reset
* the bit if it reports that it is not empty anymore.
* The memory barrier here pairs with the barrier in
- * memcg_set_shrinker_bit():
+ * set_shrinker_bit():
*
* list_lru_add() shrink_slab_memcg()
* list_add_tail() clear_bit()
@@ -603,7 +989,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
if (ret == SHRINK_EMPTY)
ret = 0;
else
- memcg_set_shrinker_bit(memcg, nid, i);
+ set_shrinker_bit(memcg, nid, i);
}
freed += ret;
@@ -692,121 +1078,260 @@ out:
return freed;
}
-void drop_slab_node(int nid)
+static unsigned long drop_slab_node(int nid)
{
- unsigned long freed;
+ unsigned long freed = 0;
+ struct mem_cgroup *memcg = NULL;
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
- struct mem_cgroup *memcg = NULL;
-
- if (fatal_signal_pending(current))
- return;
+ freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
- freed = 0;
- memcg = mem_cgroup_iter(NULL, NULL, NULL);
- do {
- freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
- } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
- } while (freed > 10);
+ return freed;
}
void drop_slab(void)
{
int nid;
+ int shift = 0;
+ unsigned long freed;
- for_each_online_node(nid)
- drop_slab_node(nid);
+ do {
+ freed = 0;
+ for_each_online_node(nid) {
+ if (fatal_signal_pending(current))
+ return;
+
+ freed += drop_slab_node(nid);
+ }
+ } while ((freed >> shift++) > 1);
}
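The termination condition above demands exponentially more progress on each pass; a short illustration with hypothetical per-pass totals:

/*
 * Illustration of the (freed >> shift++) > 1 cutoff (hypothetical
 * per-pass totals):
 *
 *   pass 1: freed = 4096, 4096 >> 0 = 4096 > 1  -> continue, shift = 1
 *   pass 2: freed = 1024, 1024 >> 1 =  512 > 1  -> continue, shift = 2
 *   pass 3: freed =    5,    5 >> 2 =    1 > 1? -> no, stop
 *
 * A pass must free at least 2 << shift objects to keep the loop going,
 * so it cannot spin indefinitely on a trickle of freed slab objects.
 */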
-static inline int is_page_cache_freeable(struct page *page)
+static int reclaimer_offset(void)
{
- /*
- * A freeable page cache page is referenced only by the caller
- * that isolated the page, the page cache and optional buffer
- * heads at page->private.
- */
- int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ?
- HPAGE_PMD_NR : 1;
- return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
+ BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
+ PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
+ BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
+ PGSCAN_DIRECT - PGSCAN_KSWAPD);
+ BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
+ PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
+ BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
+ PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
+
+ if (current_is_kswapd())
+ return 0;
+ if (current_is_khugepaged())
+ return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
+ return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
}
-static int may_write_to_inode(struct inode *inode)
+static inline int is_page_cache_freeable(struct folio *folio)
{
- if (current->flags & PF_SWAPWRITE)
- return 1;
- if (!inode_write_congested(inode))
- return 1;
- if (inode_to_bdi(inode) == current->backing_dev_info)
- return 1;
- return 0;
+ /*
+ * A freeable page cache folio is referenced only by the caller
+ * that isolated the folio, the page cache and optional filesystem
+ * private data at folio->private.
+ */
+ return folio_ref_count(folio) - folio_test_private(folio) ==
+ 1 + folio_nr_pages(folio);
}
/*
- * We detected a synchronous write error writing a page out. Probably
+ * We detected a synchronous write error writing a folio out. Probably
* -ENOSPC. We need to propagate that into the address_space for a subsequent
* fsync(), msync() or close().
*
* The tricky part is that after writepage we cannot touch the mapping: nothing
- * prevents it from being freed up. But we have a ref on the page and once
- * that page is locked, the mapping is pinned.
+ * prevents it from being freed up. But we have a ref on the folio and once
+ * that folio is locked, the mapping is pinned.
*
- * We're allowed to run sleeping lock_page() here because we know the caller has
+ * We're allowed to run sleeping folio_lock() here because we know the caller has
* __GFP_FS.
*/
static void handle_write_error(struct address_space *mapping,
- struct page *page, int error)
+ struct folio *folio, int error)
{
- lock_page(page);
- if (page_mapping(page) == mapping)
+ folio_lock(folio);
+ if (folio_mapping(folio) == mapping)
mapping_set_error(mapping, error);
- unlock_page(page);
+ folio_unlock(folio);
+}
+
+static bool skip_throttle_noprogress(pg_data_t *pgdat)
+{
+ int reclaimable = 0, write_pending = 0;
+ int i;
+
+ /*
+ * If kswapd is disabled, reschedule if necessary but do not
+ * throttle as the system is likely near OOM.
+ */
+ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+ return true;
+
+ /*
+ * If there are a lot of dirty/writeback folios then do not
+ * throttle as throttling will occur when the folios cycle
+ * towards the end of the LRU if still under writeback.
+ */
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ if (!managed_zone(zone))
+ continue;
+
+ reclaimable += zone_reclaimable_pages(zone);
+ write_pending += zone_page_state_snapshot(zone,
+ NR_ZONE_WRITE_PENDING);
+ }
+ if (2 * write_pending <= reclaimable)
+ return true;
+
+ return false;
+}
+
+void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
+{
+ wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
+ long timeout, ret;
+ DEFINE_WAIT(wait);
+
+ /*
+ * Do not throttle user workers, kthreads other than kswapd or
+ * workqueues. They may be required for reclaim to make
+ * forward progress (e.g. journalling workqueues or kthreads).
+ */
+ if (!current_is_kswapd() &&
+ current->flags & (PF_USER_WORKER|PF_KTHREAD)) {
+ cond_resched();
+ return;
+ }
+
+ /*
+ * These figures are pulled out of thin air.
+ * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many
+ * parallel reclaimers which is a short-lived event so the timeout is
+ * short. Failing to make progress or waiting on writeback are
+ * potentially long-lived events so use a longer timeout. This is shaky
+ * logic as a failure to make progress could be due to anything from
+ * writeback to a slow device to excessive referenced folios at the tail
+ * of the inactive LRU.
+ */
+ switch(reason) {
+ case VMSCAN_THROTTLE_WRITEBACK:
+ timeout = HZ/10;
+
+ if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) {
+ WRITE_ONCE(pgdat->nr_reclaim_start,
+ node_page_state(pgdat, NR_THROTTLED_WRITTEN));
+ }
+
+ break;
+ case VMSCAN_THROTTLE_CONGESTED:
+ fallthrough;
+ case VMSCAN_THROTTLE_NOPROGRESS:
+ if (skip_throttle_noprogress(pgdat)) {
+ cond_resched();
+ return;
+ }
+
+ timeout = 1;
+
+ break;
+ case VMSCAN_THROTTLE_ISOLATED:
+ timeout = HZ/50;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ timeout = HZ;
+ break;
+ }
+
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ ret = schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+
+ if (reason == VMSCAN_THROTTLE_WRITEBACK)
+ atomic_dec(&pgdat->nr_writeback_throttled);
+
+ trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout),
+ jiffies_to_usecs(timeout - ret),
+ reason);
+}
+
+/*
+ * Account for folios written if tasks are throttled waiting on dirty
+ * folios to clean. If enough folios have been cleaned since throttling
+ * started then wakeup the throttled tasks.
+ */
+void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
+ int nr_throttled)
+{
+ unsigned long nr_written;
+
+ node_stat_add_folio(folio, NR_THROTTLED_WRITTEN);
+
+ /*
+ * This is an inaccurate read as the per-cpu deltas may not
+ * be synchronised. However, given that the system is
+ * writeback throttled, it is not worth taking the penalty
+ * of getting an accurate count. At worst, the throttle
+ * timeout guarantees forward progress.
+ */
+ nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) -
+ READ_ONCE(pgdat->nr_reclaim_start);
+
+ if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
+ wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
}
/* possible outcome of pageout() */
typedef enum {
- /* failed to write page out, page is locked */
+ /* failed to write folio out, folio is locked */
PAGE_KEEP,
- /* move page to the active list, page is locked */
+ /* move folio to the active list, folio is locked */
PAGE_ACTIVATE,
- /* page has been sent to the disk successfully, page is unlocked */
+ /* folio has been sent to the disk successfully, folio is unlocked */
PAGE_SUCCESS,
- /* page is clean and locked */
+ /* folio is clean and locked */
PAGE_CLEAN,
} pageout_t;
/*
- * pageout is called by shrink_page_list() for each dirty page.
+ * pageout is called by shrink_folio_list() for each dirty folio.
* Calls ->writepage().
*/
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct folio *folio, struct address_space *mapping,
+ struct swap_iocb **plug)
{
/*
- * If the page is dirty, only perform writeback if that write
+ * If the folio is dirty, only perform writeback if that write
* will be non-blocking. To prevent this allocation from being
* stalled by pagecache activity. But note that there may be
* stalls if we need to run get_block(). We could test
* PagePrivate for that.
*
* If this process is currently in __generic_file_write_iter() against
- * this page's queue, we can perform writeback even if that
+ * this folio's queue, we can perform writeback even if that
* will block.
*
- * If the page is swapcache, write it back even if that would
+ * If the folio is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
* swap_backing_dev_info is bust: it doesn't reflect the
* congestion state of the swapdevs. Easy to fix, if needed.
*/
- if (!is_page_cache_freeable(page))
+ if (!is_page_cache_freeable(folio))
return PAGE_KEEP;
if (!mapping) {
/*
- * Some data journaling orphaned pages can have
- * page->mapping == NULL while being dirty with clean buffers.
+ * Some data journaling orphaned folios can have
+ * folio->mapping == NULL while being dirty with clean buffers.
*/
- if (page_has_private(page)) {
- if (try_to_free_buffers(page)) {
- ClearPageDirty(page);
- pr_info("%s: orphaned page\n", __func__);
+ if (folio_test_private(folio)) {
+ if (try_to_free_buffers(folio)) {
+ folio_clear_dirty(folio);
+ pr_info("%s: orphaned folio\n", __func__);
return PAGE_CLEAN;
}
}
@@ -814,10 +1339,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
- if (!may_write_to_inode(mapping->host))
- return PAGE_KEEP;
- if (clear_page_dirty_for_io(page)) {
+ if (folio_clear_dirty_for_io(folio)) {
int res;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
@@ -825,23 +1348,24 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
.range_start = 0,
.range_end = LLONG_MAX,
.for_reclaim = 1,
+ .swap_plug = plug,
};
- SetPageReclaim(page);
- res = mapping->a_ops->writepage(page, &wbc);
+ folio_set_reclaim(folio);
+ res = mapping->a_ops->writepage(&folio->page, &wbc);
if (res < 0)
- handle_write_error(mapping, page, res);
+ handle_write_error(mapping, folio, res);
if (res == AOP_WRITEPAGE_ACTIVATE) {
- ClearPageReclaim(page);
+ folio_clear_reclaim(folio);
return PAGE_ACTIVATE;
}
- if (!PageWriteback(page)) {
+ if (!folio_test_writeback(folio)) {
/* synchronous write or broken a_ops? */
- ClearPageReclaim(page);
+ folio_clear_reclaim(folio);
}
- trace_mm_vmscan_writepage(page);
- inc_node_page_state(page, NR_VMSCAN_WRITE);
+ trace_mm_vmscan_write_folio(folio);
+ node_stat_add_folio(folio, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
@@ -849,66 +1373,68 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
}
/*
- * Same as remove_mapping, but if the page is removed from the mapping, it
+ * Same as remove_mapping, but if the folio is removed from the mapping, it
* gets returned with a refcount of 0.
*/
-static int __remove_mapping(struct address_space *mapping, struct page *page,
+static int __remove_mapping(struct address_space *mapping, struct folio *folio,
bool reclaimed, struct mem_cgroup *target_memcg)
{
- unsigned long flags;
int refcount;
void *shadow = NULL;
- BUG_ON(!PageLocked(page));
- BUG_ON(mapping != page_mapping(page));
+ BUG_ON(!folio_test_locked(folio));
+ BUG_ON(mapping != folio_mapping(folio));
- xa_lock_irqsave(&mapping->i_pages, flags);
+ if (!folio_test_swapcache(folio))
+ spin_lock(&mapping->host->i_lock);
+ xa_lock_irq(&mapping->i_pages);
/*
- * The non racy check for a busy page.
+ * The non racy check for a busy folio.
*
* Must be careful with the order of the tests. When someone has
- * a ref to the page, it may be possible that they dirty it then
- * drop the reference. So if PageDirty is tested before page_count
- * here, then the following race may occur:
+ * a ref to the folio, it may be possible that they dirty it then
+ * drop the reference. So if the dirty flag is tested before the
+ * refcount here, then the following race may occur:
*
* get_user_pages(&page);
* [user mapping goes away]
* write_to(page);
- * !PageDirty(page) [good]
- * SetPageDirty(page);
- * put_page(page);
- * !page_count(page) [good, discard it]
+ * !folio_test_dirty(folio) [good]
+ * folio_set_dirty(folio);
+ * folio_put(folio);
+ * !refcount(folio) [good, discard it]
*
* [oops, our write_to data is lost]
*
* Reversing the order of the tests ensures such a situation cannot
- * escape unnoticed. The smp_rmb is needed to ensure the page->flags
- * load is not satisfied before that of page->_refcount.
+ * escape unnoticed. The smp_rmb is needed to ensure the folio->flags
+ * load is not satisfied before that of folio->_refcount.
*
- * Note that if SetPageDirty is always performed via set_page_dirty,
+ * Note that if the dirty flag is always set via folio_mark_dirty,
* and thus under the i_pages lock, then this ordering is not required.
*/
- refcount = 1 + compound_nr(page);
- if (!page_ref_freeze(page, refcount))
+ refcount = 1 + folio_nr_pages(folio);
+ if (!folio_ref_freeze(folio, refcount))
goto cannot_free;
- /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
- if (unlikely(PageDirty(page))) {
- page_ref_unfreeze(page, refcount);
+ /* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */
+ if (unlikely(folio_test_dirty(folio))) {
+ folio_ref_unfreeze(folio, refcount);
goto cannot_free;
}
- if (PageSwapCache(page)) {
- swp_entry_t swap = { .val = page_private(page) };
- mem_cgroup_swapout(page, swap);
+ if (folio_test_swapcache(folio)) {
+ swp_entry_t swap = folio_swap_entry(folio);
+
if (reclaimed && !mapping_exiting(mapping))
- shadow = workingset_eviction(page, target_memcg);
- __delete_from_swap_cache(page, swap, shadow);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
- put_swap_page(page, swap);
+ shadow = workingset_eviction(folio, target_memcg);
+ __delete_from_swap_cache(folio, swap, shadow);
+ mem_cgroup_swapout(folio, swap);
+ xa_unlock_irq(&mapping->i_pages);
+ put_swap_folio(folio, swap);
} else {
- void (*freepage)(struct page *);
+ void (*free_folio)(struct folio *);
- freepage = mapping->a_ops->freepage;
+ free_folio = mapping->a_ops->free_folio;
/*
* Remember a shadow entry for reclaimed file cache in
* order to detect refaults, thus thrashing, later on.
@@ -920,262 +1446,370 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* back.
*
* We also don't store shadows for DAX mappings because the
- * only page cache pages found in these are zero pages
+ * only page cache folios found in these are zero pages
* covering holes, and because we don't want to mix DAX
* exceptional entries and shadow exceptional entries in the
* same address_space.
*/
- if (reclaimed && page_is_file_lru(page) &&
+ if (reclaimed && folio_is_file_lru(folio) &&
!mapping_exiting(mapping) && !dax_mapping(mapping))
- shadow = workingset_eviction(page, target_memcg);
- __delete_from_page_cache(page, shadow);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
-
- if (freepage != NULL)
- freepage(page);
+ shadow = workingset_eviction(folio, target_memcg);
+ __filemap_remove_folio(folio, shadow);
+ xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
+
+ if (free_folio)
+ free_folio(folio);
}
return 1;
cannot_free:
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
+ if (!folio_test_swapcache(folio))
+ spin_unlock(&mapping->host->i_lock);
return 0;
}
-/*
- * Attempt to detach a locked page from its ->mapping. If it is dirty or if
- * someone else has a ref on the page, abort and return 0. If it was
- * successfully detached, return 1. Assumes the caller has a single ref on
- * this page.
+/**
+ * remove_mapping() - Attempt to remove a folio from its mapping.
+ * @mapping: The address space.
+ * @folio: The folio to remove.
+ *
+ * If the folio is dirty, under writeback or if someone else has a ref
+ * on it, removal will fail.
+ * Return: The number of pages removed from the mapping. 0 if the folio
+ * could not be removed.
+ * Context: The caller should have a single refcount on the folio and
+ * hold its lock.
*/
-int remove_mapping(struct address_space *mapping, struct page *page)
+long remove_mapping(struct address_space *mapping, struct folio *folio)
{
- if (__remove_mapping(mapping, page, false, NULL)) {
+ if (__remove_mapping(mapping, folio, false, NULL)) {
/*
- * Unfreezing the refcount with 1 rather than 2 effectively
+ * Unfreezing the refcount with 1 effectively
* drops the pagecache ref for us without requiring another
* atomic operation.
*/
- page_ref_unfreeze(page, 1);
- return 1;
+ folio_ref_unfreeze(folio, 1);
+ return folio_nr_pages(folio);
}
return 0;
}
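
For illustration only, a minimal sketch of the calling convention documented above: the caller holds the folio lock and a single reference before calling remove_mapping(). The helper name drop_folio_from_cache() is hypothetical and not part of this patch.

	static long drop_folio_from_cache(struct folio *folio)
	{
		struct address_space *mapping = folio_mapping(folio);
		long nr = 0;

		/* We hold the folio lock and exactly one reference here. */
		if (mapping && !folio_test_dirty(folio) &&
		    !folio_test_writeback(folio))
			nr = remove_mapping(mapping, folio);

		folio_unlock(folio);
		return nr;	/* pages removed from the mapping, 0 on failure */
	}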
/**
- * putback_lru_page - put previously isolated page onto appropriate LRU list
- * @page: page to be put back to appropriate lru list
+ * folio_putback_lru - Put previously isolated folio onto appropriate LRU list.
+ * @folio: Folio to be returned to an LRU list.
*
- * Add previously isolated @page to appropriate LRU list.
- * Page may still be unevictable for other reasons.
+ * Add previously isolated @folio to appropriate LRU list.
+ * The folio may still be unevictable for other reasons.
*
- * lru_lock must not be held, interrupts must be enabled.
+ * Context: lru_lock must not be held, interrupts must be enabled.
*/
-void putback_lru_page(struct page *page)
+void folio_putback_lru(struct folio *folio)
{
- lru_cache_add(page);
- put_page(page); /* drop ref from isolate */
+ folio_add_lru(folio);
+ folio_put(folio); /* drop ref from isolate */
}
-enum page_references {
- PAGEREF_RECLAIM,
- PAGEREF_RECLAIM_CLEAN,
- PAGEREF_KEEP,
- PAGEREF_ACTIVATE,
+enum folio_references {
+ FOLIOREF_RECLAIM,
+ FOLIOREF_RECLAIM_CLEAN,
+ FOLIOREF_KEEP,
+ FOLIOREF_ACTIVATE,
};
-static enum page_references page_check_references(struct page *page,
+static enum folio_references folio_check_references(struct folio *folio,
struct scan_control *sc)
{
- int referenced_ptes, referenced_page;
+ int referenced_ptes, referenced_folio;
unsigned long vm_flags;
- referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
- &vm_flags);
- referenced_page = TestClearPageReferenced(page);
+ referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
+ &vm_flags);
+ referenced_folio = folio_test_clear_referenced(folio);
/*
- * Mlock lost the isolation race with us. Let try_to_unmap()
- * move the page to the unevictable list.
+ * The supposedly reclaimable folio was found to be in a VM_LOCKED vma.
+ * Let the folio, now marked Mlocked, be moved to the unevictable list.
*/
if (vm_flags & VM_LOCKED)
- return PAGEREF_RECLAIM;
+ return FOLIOREF_ACTIVATE;
+
+ /* rmap lock contention: rotate */
+ if (referenced_ptes == -1)
+ return FOLIOREF_KEEP;
if (referenced_ptes) {
/*
- * All mapped pages start out with page table
+ * All mapped folios start out with page table
* references from the instantiating fault, so we need
- * to look twice if a mapped file page is used more
+ * to look twice if a mapped file/anon folio is used more
* than once.
*
* Mark it and spare it for another trip around the
* inactive list. Another page table reference will
* lead to its activation.
*
- * Note: the mark is set for activated pages as well
- * so that recently deactivated but used pages are
+ * Note: the mark is set for activated folios as well
+ * so that recently deactivated but used folios are
* quickly recovered.
*/
- SetPageReferenced(page);
+ folio_set_referenced(folio);
- if (referenced_page || referenced_ptes > 1)
- return PAGEREF_ACTIVATE;
+ if (referenced_folio || referenced_ptes > 1)
+ return FOLIOREF_ACTIVATE;
/*
- * Activate file-backed executable pages after first usage.
+ * Activate file-backed executable folios after first usage.
*/
- if ((vm_flags & VM_EXEC) && !PageSwapBacked(page))
- return PAGEREF_ACTIVATE;
+ if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio))
+ return FOLIOREF_ACTIVATE;
- return PAGEREF_KEEP;
+ return FOLIOREF_KEEP;
}
- /* Reclaim if clean, defer dirty pages to writeback */
- if (referenced_page && !PageSwapBacked(page))
- return PAGEREF_RECLAIM_CLEAN;
+ /* Reclaim if clean, defer dirty folios to writeback */
+ if (referenced_folio && folio_is_file_lru(folio))
+ return FOLIOREF_RECLAIM_CLEAN;
- return PAGEREF_RECLAIM;
+ return FOLIOREF_RECLAIM;
}
-/* Check if a page is dirty or under writeback */
-static void page_check_dirty_writeback(struct page *page,
+/* Check if a folio is dirty or under writeback */
+static void folio_check_dirty_writeback(struct folio *folio,
bool *dirty, bool *writeback)
{
struct address_space *mapping;
/*
- * Anonymous pages are not handled by flushers and must be written
- * from reclaim context. Do not stall reclaim based on them
+ * Anonymous folios are not handled by flushers and must be written
+ * from reclaim context. Do not stall reclaim based on them.
+	 * MADV_FREE anonymous folios are put on the inactive file list too.
+	 * They could be mistakenly treated as file LRU folios, so a further
+	 * anon test is needed.
*/
- if (!page_is_file_lru(page) ||
- (PageAnon(page) && !PageSwapBacked(page))) {
+ if (!folio_is_file_lru(folio) ||
+ (folio_test_anon(folio) && !folio_test_swapbacked(folio))) {
*dirty = false;
*writeback = false;
return;
}
- /* By default assume that the page flags are accurate */
- *dirty = PageDirty(page);
- *writeback = PageWriteback(page);
+ /* By default assume that the folio flags are accurate */
+ *dirty = folio_test_dirty(folio);
+ *writeback = folio_test_writeback(folio);
/* Verify dirty/writeback state if the filesystem supports it */
- if (!page_has_private(page))
+ if (!folio_test_private(folio))
return;
- mapping = page_mapping(page);
+ mapping = folio_mapping(folio);
if (mapping && mapping->a_ops->is_dirty_writeback)
- mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
+ mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
+}
+
+static struct folio *alloc_demote_folio(struct folio *src,
+ unsigned long private)
+{
+ struct folio *dst;
+ nodemask_t *allowed_mask;
+ struct migration_target_control *mtc;
+
+ mtc = (struct migration_target_control *)private;
+
+ allowed_mask = mtc->nmask;
+ /*
+	 * Make sure we allocate from the target node first, also trying to
+	 * demote or reclaim pages from the target node via kswapd if we are
+	 * low on free memory there. If we don't do this and free memory is
+	 * available on a slower (lower) memory tier, we would start
+	 * allocating pages from that slower tier without even forcing a
+	 * demotion of cold pages from the target tier. This can result in
+	 * the kernel placing hot pages in slower (lower) memory tiers.
+ */
+ mtc->nmask = NULL;
+ mtc->gfp_mask |= __GFP_THISNODE;
+ dst = alloc_migration_target(src, (unsigned long)mtc);
+ if (dst)
+ return dst;
+
+ mtc->gfp_mask &= ~__GFP_THISNODE;
+ mtc->nmask = allowed_mask;
+
+ return alloc_migration_target(src, (unsigned long)mtc);
}
/*
- * shrink_page_list() returns the number of reclaimed pages
+ * Take folios on @demote_folios and attempt to demote them to another node.
+ * Folios which are not demoted are left on @demote_folios.
*/
-static unsigned int shrink_page_list(struct list_head *page_list,
- struct pglist_data *pgdat,
- struct scan_control *sc,
- enum ttu_flags ttu_flags,
- struct reclaim_stat *stat,
- bool ignore_references)
-{
- LIST_HEAD(ret_pages);
- LIST_HEAD(free_pages);
+static unsigned int demote_folio_list(struct list_head *demote_folios,
+ struct pglist_data *pgdat)
+{
+ int target_nid = next_demotion_node(pgdat->node_id);
+ unsigned int nr_succeeded;
+ nodemask_t allowed_mask;
+
+ struct migration_target_control mtc = {
+ /*
+		 * Allocate from the target node, or fail quickly and quietly.
+		 * When the allocation fails, the folio will likely just be
+		 * discarded instead of migrated.
+ */
+ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
+ __GFP_NOMEMALLOC | GFP_NOWAIT,
+ .nid = target_nid,
+ .nmask = &allowed_mask
+ };
+
+ if (list_empty(demote_folios))
+ return 0;
+
+ if (target_nid == NUMA_NO_NODE)
+ return 0;
+
+ node_get_allowed_targets(pgdat, &allowed_mask);
+
+ /* Demotion ignores all cpuset and mempolicy settings */
+ migrate_pages(demote_folios, alloc_demote_folio, NULL,
+ (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
+ &nr_succeeded);
+
+ __count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded);
+
+ return nr_succeeded;
+}
+
+static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
+{
+ if (gfp_mask & __GFP_FS)
+ return true;
+ if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO))
+ return false;
+ /*
+ * We can "enter_fs" for swap-cache with only __GFP_IO
+ * providing this isn't SWP_FS_OPS.
+	 * ->flags can be updated non-atomically (scan_swap_map_slots),
+ * but that will never affect SWP_FS_OPS, so the data_race
+ * is safe.
+ */
+ return !data_race(folio_swap_flags(folio) & SWP_FS_OPS);
+}
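
Restated as a decision table (this only summarizes the checks in may_enter_fs() above; it adds no new policy):

	/*
	 *   gfp_mask has __GFP_FS                          -> may enter fs
	 *   swapcache folio, gfp_mask has __GFP_IO,
	 *   and the swap device is not SWP_FS_OPS          -> may enter fs
	 *   anything else                                   -> may not
	 */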
+
+/*
+ * shrink_folio_list() returns the number of reclaimed pages
+ */
+static unsigned int shrink_folio_list(struct list_head *folio_list,
+ struct pglist_data *pgdat, struct scan_control *sc,
+ struct reclaim_stat *stat, bool ignore_references)
+{
+ LIST_HEAD(ret_folios);
+ LIST_HEAD(free_folios);
+ LIST_HEAD(demote_folios);
unsigned int nr_reclaimed = 0;
unsigned int pgactivate = 0;
+ bool do_demote_pass;
+ struct swap_iocb *plug = NULL;
memset(stat, 0, sizeof(*stat));
cond_resched();
+ do_demote_pass = can_demote(pgdat->node_id, sc);
- while (!list_empty(page_list)) {
+retry:
+ while (!list_empty(folio_list)) {
struct address_space *mapping;
- struct page *page;
- enum page_references references = PAGEREF_RECLAIM;
- bool dirty, writeback, may_enter_fs;
+ struct folio *folio;
+ enum folio_references references = FOLIOREF_RECLAIM;
+ bool dirty, writeback;
unsigned int nr_pages;
cond_resched();
- page = lru_to_page(page_list);
- list_del(&page->lru);
+ folio = lru_to_folio(folio_list);
+ list_del(&folio->lru);
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto keep;
- VM_BUG_ON_PAGE(PageActive(page), page);
+ VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
- nr_pages = compound_nr(page);
+ nr_pages = folio_nr_pages(folio);
- /* Account the number of base pages even though THP */
+ /* Account the number of base pages */
sc->nr_scanned += nr_pages;
- if (unlikely(!page_evictable(page)))
+ if (unlikely(!folio_evictable(folio)))
goto activate_locked;
- if (!sc->may_unmap && page_mapped(page))
+ if (!sc->may_unmap && folio_mapped(folio))
goto keep_locked;
- may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
- (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+ /* folio_update_gen() tried to promote this page? */
+ if (lru_gen_enabled() && !ignore_references &&
+ folio_mapped(folio) && folio_test_referenced(folio))
+ goto keep_locked;
/*
* The number of dirty pages determines if a node is marked
- * reclaim_congested which affects wait_iff_congested. kswapd
- * will stall and start writing pages if the tail of the LRU
- * is all dirty unqueued pages.
+ * reclaim_congested. kswapd will stall and start writing
+ * folios if the tail of the LRU is all dirty unqueued folios.
*/
- page_check_dirty_writeback(page, &dirty, &writeback);
+ folio_check_dirty_writeback(folio, &dirty, &writeback);
if (dirty || writeback)
- stat->nr_dirty++;
+ stat->nr_dirty += nr_pages;
if (dirty && !writeback)
- stat->nr_unqueued_dirty++;
+ stat->nr_unqueued_dirty += nr_pages;
/*
- * Treat this page as congested if the underlying BDI is or if
- * pages are cycling through the LRU so quickly that the
- * pages marked for immediate reclaim are making it to the
- * end of the LRU a second time.
+ * Treat this folio as congested if folios are cycling
+ * through the LRU so quickly that the folios marked
+ * for immediate reclaim are making it to the end of
+ * the LRU a second time.
*/
- mapping = page_mapping(page);
- if (((dirty || writeback) && mapping &&
- inode_write_congested(mapping->host)) ||
- (writeback && PageReclaim(page)))
- stat->nr_congested++;
+ if (writeback && folio_test_reclaim(folio))
+ stat->nr_congested += nr_pages;
/*
- * If a page at the tail of the LRU is under writeback, there
+ * If a folio at the tail of the LRU is under writeback, there
* are three cases to consider.
*
- * 1) If reclaim is encountering an excessive number of pages
- * under writeback and this page is both under writeback and
- * PageReclaim then it indicates that pages are being queued
- * for IO but are being recycled through the LRU before the
- * IO can complete. Waiting on the page itself risks an
- * indefinite stall if it is impossible to writeback the
- * page due to IO error or disconnected storage so instead
- * note that the LRU is being scanned too quickly and the
- * caller can stall after page list has been processed.
+ * 1) If reclaim is encountering an excessive number
+ * of folios under writeback and this folio has both
+ * the writeback and reclaim flags set, then it
+ * indicates that folios are being queued for I/O but
+ * are being recycled through the LRU before the I/O
+ * can complete. Waiting on the folio itself risks an
+ * indefinite stall if it is impossible to writeback
+ * the folio due to I/O error or disconnected storage
+ * so instead note that the LRU is being scanned too
+ * quickly and the caller can stall after the folio
+ * list has been processed.
*
- * 2) Global or new memcg reclaim encounters a page that is
+ * 2) Global or new memcg reclaim encounters a folio that is
* not marked for immediate reclaim, or the caller does not
* have __GFP_FS (or __GFP_IO if it's simply going to swap,
- * not to fs). In this case mark the page for immediate
+ * not to fs). In this case mark the folio for immediate
* reclaim and continue scanning.
*
- * Require may_enter_fs because we would wait on fs, which
- * may not have submitted IO yet. And the loop driver might
- * enter reclaim, and deadlock if it waits on a page for
+ * Require may_enter_fs() because we would wait on fs, which
+ * may not have submitted I/O yet. And the loop driver might
+ * enter reclaim, and deadlock if it waits on a folio for
* which it is needed to do the write (loop masks off
* __GFP_IO|__GFP_FS for this reason); but more thought
* would probably show more reasons.
*
- * 3) Legacy memcg encounters a page that is already marked
- * PageReclaim. memcg does not have any dirty pages
+ * 3) Legacy memcg encounters a folio that already has the
+ * reclaim flag set. memcg does not have any dirty folio
* throttling so we could easily OOM just because too many
- * pages are in writeback and there is nothing else to
+ * folios are in writeback and there is nothing else to
* reclaim. Wait for the writeback to complete.
*
- * In cases 1) and 2) we activate the pages to get them out of
- * the way while we continue scanning for clean pages on the
+ * In cases 1) and 2) we activate the folios to get them out of
+ * the way while we continue scanning for clean folios on the
* inactive list and refilling from the active list. The
* observation here is that waiting for disk writes is more
* expensive than potentially causing reloads down the line.
@@ -1183,274 +1817,308 @@ static unsigned int shrink_page_list(struct list_head *page_list,
* memory pressure on the cache working set any longer than it
* takes to write them to disk.
*/
- if (PageWriteback(page)) {
+ if (folio_test_writeback(folio)) {
/* Case 1 above */
if (current_is_kswapd() &&
- PageReclaim(page) &&
+ folio_test_reclaim(folio) &&
test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
- stat->nr_immediate++;
+ stat->nr_immediate += nr_pages;
goto activate_locked;
/* Case 2 above */
} else if (writeback_throttling_sane(sc) ||
- !PageReclaim(page) || !may_enter_fs) {
+ !folio_test_reclaim(folio) ||
+ !may_enter_fs(folio, sc->gfp_mask)) {
/*
- * This is slightly racy - end_page_writeback()
- * might have just cleared PageReclaim, then
- * setting PageReclaim here end up interpreted
- * as PageReadahead - but that does not matter
- * enough to care. What we do want is for this
- * page to have PageReclaim set next time memcg
- * reclaim reaches the tests above, so it will
- * then wait_on_page_writeback() to avoid OOM;
- * and it's also appropriate in global reclaim.
+ * This is slightly racy -
+ * folio_end_writeback() might have
+ * just cleared the reclaim flag, then
+ * setting the reclaim flag here ends up
+ * interpreted as the readahead flag - but
+ * that does not matter enough to care.
+ * What we do want is for this folio to
+ * have the reclaim flag set next time
+ * memcg reclaim reaches the tests above,
+ * so it will then wait for writeback to
+ * avoid OOM; and it's also appropriate
+ * in global reclaim.
*/
- SetPageReclaim(page);
- stat->nr_writeback++;
+ folio_set_reclaim(folio);
+ stat->nr_writeback += nr_pages;
goto activate_locked;
/* Case 3 above */
} else {
- unlock_page(page);
- wait_on_page_writeback(page);
- /* then go back and try same page again */
- list_add_tail(&page->lru, page_list);
+ folio_unlock(folio);
+ folio_wait_writeback(folio);
+ /* then go back and try same folio again */
+ list_add_tail(&folio->lru, folio_list);
continue;
}
}
if (!ignore_references)
- references = page_check_references(page, sc);
+ references = folio_check_references(folio, sc);
switch (references) {
- case PAGEREF_ACTIVATE:
+ case FOLIOREF_ACTIVATE:
goto activate_locked;
- case PAGEREF_KEEP:
+ case FOLIOREF_KEEP:
stat->nr_ref_keep += nr_pages;
goto keep_locked;
- case PAGEREF_RECLAIM:
- case PAGEREF_RECLAIM_CLEAN:
- ; /* try to reclaim the page below */
+ case FOLIOREF_RECLAIM:
+ case FOLIOREF_RECLAIM_CLEAN:
+ ; /* try to reclaim the folio below */
+ }
+
+ /*
+ * Before reclaiming the folio, try to relocate
+ * its contents to another node.
+ */
+ if (do_demote_pass &&
+ (thp_migration_supported() || !folio_test_large(folio))) {
+ list_add(&folio->lru, &demote_folios);
+ folio_unlock(folio);
+ continue;
}
/*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
- * Lazyfree page could be freed directly
+ * Lazyfree folio could be freed directly
*/
- if (PageAnon(page) && PageSwapBacked(page)) {
- if (!PageSwapCache(page)) {
+ if (folio_test_anon(folio) && folio_test_swapbacked(folio)) {
+ if (!folio_test_swapcache(folio)) {
if (!(sc->gfp_mask & __GFP_IO))
goto keep_locked;
- if (PageTransHuge(page)) {
- /* cannot split THP, skip it */
- if (!can_split_huge_page(page, NULL))
+ if (folio_maybe_dma_pinned(folio))
+ goto keep_locked;
+ if (folio_test_large(folio)) {
+ /* cannot split folio, skip it */
+ if (!can_split_folio(folio, NULL))
goto activate_locked;
/*
- * Split pages without a PMD map right
+ * Split folios without a PMD map right
* away. Chances are some or all of the
* tail pages can be freed without IO.
*/
- if (!compound_mapcount(page) &&
- split_huge_page_to_list(page,
- page_list))
+ if (!folio_entire_mapcount(folio) &&
+ split_folio_to_list(folio,
+ folio_list))
goto activate_locked;
}
- if (!add_to_swap(page)) {
- if (!PageTransHuge(page))
+ if (!add_to_swap(folio)) {
+ if (!folio_test_large(folio))
goto activate_locked_split;
/* Fallback to swap normal pages */
- if (split_huge_page_to_list(page,
- page_list))
+ if (split_folio_to_list(folio,
+ folio_list))
goto activate_locked;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
count_vm_event(THP_SWPOUT_FALLBACK);
#endif
- if (!add_to_swap(page))
+ if (!add_to_swap(folio))
goto activate_locked_split;
}
-
- may_enter_fs = true;
-
- /* Adding to swap updated mapping */
- mapping = page_mapping(page);
}
- } else if (unlikely(PageTransHuge(page))) {
- /* Split file THP */
- if (split_huge_page_to_list(page, page_list))
+ } else if (folio_test_swapbacked(folio) &&
+ folio_test_large(folio)) {
+ /* Split shmem folio */
+ if (split_folio_to_list(folio, folio_list))
goto keep_locked;
}
/*
- * THP may get split above, need minus tail pages and update
- * nr_pages to avoid accounting tail pages twice.
- *
- * The tail pages that are added into swap cache successfully
- * reach here.
+ * If the folio was split above, the tail pages will make
+ * their own pass through this function and be accounted
+ * then.
*/
- if ((nr_pages > 1) && !PageTransHuge(page)) {
+ if ((nr_pages > 1) && !folio_test_large(folio)) {
sc->nr_scanned -= (nr_pages - 1);
nr_pages = 1;
}
/*
- * The page is mapped into the page tables of one or more
+ * The folio is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
- if (page_mapped(page)) {
- enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
- bool was_swapbacked = PageSwapBacked(page);
+ if (folio_mapped(folio)) {
+ enum ttu_flags flags = TTU_BATCH_FLUSH;
+ bool was_swapbacked = folio_test_swapbacked(folio);
- if (unlikely(PageTransHuge(page)))
+ if (folio_test_pmd_mappable(folio))
flags |= TTU_SPLIT_HUGE_PMD;
- if (!try_to_unmap(page, flags)) {
+ try_to_unmap(folio, flags);
+ if (folio_mapped(folio)) {
stat->nr_unmap_fail += nr_pages;
- if (!was_swapbacked && PageSwapBacked(page))
+ if (!was_swapbacked &&
+ folio_test_swapbacked(folio))
stat->nr_lazyfree_fail += nr_pages;
goto activate_locked;
}
}
- if (PageDirty(page)) {
+ /*
+ * Folio is unmapped now so it cannot be newly pinned anymore.
+ * No point in trying to reclaim folio if it is pinned.
+ * Furthermore we don't want to reclaim underlying fs metadata
+ * if the folio is pinned and thus potentially modified by the
+ * pinning process as that may upset the filesystem.
+ */
+ if (folio_maybe_dma_pinned(folio))
+ goto activate_locked;
+
+ mapping = folio_mapping(folio);
+ if (folio_test_dirty(folio)) {
/*
- * Only kswapd can writeback filesystem pages
+ * Only kswapd can writeback filesystem folios
* to avoid risk of stack overflow. But avoid
- * injecting inefficient single-page IO into
+ * injecting inefficient single-folio I/O into
* flusher writeback as much as possible: only
- * write pages when we've encountered many
- * dirty pages, and when we've already scanned
- * the rest of the LRU for clean pages and see
- * the same dirty pages again (PageReclaim).
+ * write folios when we've encountered many
+ * dirty folios, and when we've already scanned
+ * the rest of the LRU for clean folios and see
+ * the same dirty folios again (with the reclaim
+ * flag set).
*/
- if (page_is_file_lru(page) &&
- (!current_is_kswapd() || !PageReclaim(page) ||
+ if (folio_is_file_lru(folio) &&
+ (!current_is_kswapd() ||
+ !folio_test_reclaim(folio) ||
!test_bit(PGDAT_DIRTY, &pgdat->flags))) {
/*
* Immediately reclaim when written back.
- * Similar in principal to deactivate_page()
- * except we already have the page isolated
+ * Similar in principle to folio_deactivate()
+ * except we already have the folio isolated
* and know it's dirty
*/
- inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
- SetPageReclaim(page);
+ node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE,
+ nr_pages);
+ folio_set_reclaim(folio);
goto activate_locked;
}
- if (references == PAGEREF_RECLAIM_CLEAN)
+ if (references == FOLIOREF_RECLAIM_CLEAN)
goto keep_locked;
- if (!may_enter_fs)
+ if (!may_enter_fs(folio, sc->gfp_mask))
goto keep_locked;
if (!sc->may_writepage)
goto keep_locked;
/*
- * Page is dirty. Flush the TLB if a writable entry
- * potentially exists to avoid CPU writes after IO
+ * Folio is dirty. Flush the TLB if a writable entry
+ * potentially exists to avoid CPU writes after I/O
* starts and then write it out here.
*/
try_to_unmap_flush_dirty();
- switch (pageout(page, mapping)) {
+ switch (pageout(folio, mapping, &plug)) {
case PAGE_KEEP:
goto keep_locked;
case PAGE_ACTIVATE:
goto activate_locked;
case PAGE_SUCCESS:
- stat->nr_pageout += thp_nr_pages(page);
+ stat->nr_pageout += nr_pages;
- if (PageWriteback(page))
+ if (folio_test_writeback(folio))
goto keep;
- if (PageDirty(page))
+ if (folio_test_dirty(folio))
goto keep;
/*
* A synchronous write - probably a ramdisk. Go
- * ahead and try to reclaim the page.
+ * ahead and try to reclaim the folio.
*/
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto keep;
- if (PageDirty(page) || PageWriteback(page))
+ if (folio_test_dirty(folio) ||
+ folio_test_writeback(folio))
goto keep_locked;
- mapping = page_mapping(page);
+ mapping = folio_mapping(folio);
+ fallthrough;
case PAGE_CLEAN:
- ; /* try to free the page below */
+ ; /* try to free the folio below */
}
}
/*
- * If the page has buffers, try to free the buffer mappings
- * associated with this page. If we succeed we try to free
- * the page as well.
+ * If the folio has buffers, try to free the buffer
+ * mappings associated with this folio. If we succeed
+ * we try to free the folio as well.
*
- * We do this even if the page is PageDirty().
- * try_to_release_page() does not perform I/O, but it is
- * possible for a page to have PageDirty set, but it is actually
- * clean (all its buffers are clean). This happens if the
- * buffers were written out directly, with submit_bh(). ext3
- * will do this, as well as the blockdev mapping.
- * try_to_release_page() will discover that cleanness and will
- * drop the buffers and mark the page clean - it can be freed.
+ * We do this even if the folio is dirty.
+ * filemap_release_folio() does not perform I/O, but it
+ * is possible for a folio to have the dirty flag set,
+ * but it is actually clean (all its buffers are clean).
+ * This happens if the buffers were written out directly,
+ * with submit_bh(). ext3 will do this, as well as
+ * the blockdev mapping. filemap_release_folio() will
+ * discover that cleanness and will drop the buffers
+ * and mark the folio clean - it can be freed.
*
- * Rarely, pages can have buffers and no ->mapping. These are
- * the pages which were not successfully invalidated in
- * truncate_complete_page(). We try to drop those buffers here
- * and if that worked, and the page is no longer mapped into
- * process address space (page_count == 1) it can be freed.
- * Otherwise, leave the page on the LRU so it is swappable.
+ * Rarely, folios can have buffers and no ->mapping.
+ * These are the folios which were not successfully
+ * invalidated in truncate_cleanup_folio(). We try to
+ * drop those buffers here and if that worked, and the
+ * folio is no longer mapped into process address space
+ * (refcount == 1) it can be freed. Otherwise, leave
+ * the folio on the LRU so it is swappable.
*/
- if (page_has_private(page)) {
- if (!try_to_release_page(page, sc->gfp_mask))
+ if (folio_has_private(folio)) {
+ if (!filemap_release_folio(folio, sc->gfp_mask))
goto activate_locked;
- if (!mapping && page_count(page) == 1) {
- unlock_page(page);
- if (put_page_testzero(page))
+ if (!mapping && folio_ref_count(folio) == 1) {
+ folio_unlock(folio);
+ if (folio_put_testzero(folio))
goto free_it;
else {
/*
* rare race with speculative reference.
* the speculative reference will free
- * this page shortly, so we may
+ * this folio shortly, so we may
* increment nr_reclaimed here (and
* leave it off the LRU).
*/
- nr_reclaimed++;
+ nr_reclaimed += nr_pages;
continue;
}
}
}
- if (PageAnon(page) && !PageSwapBacked(page)) {
+ if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
/* follow __remove_mapping for reference */
- if (!page_ref_freeze(page, 1))
- goto keep_locked;
- if (PageDirty(page)) {
- page_ref_unfreeze(page, 1);
+ if (!folio_ref_freeze(folio, 1))
goto keep_locked;
- }
-
- count_vm_event(PGLAZYFREED);
- count_memcg_page_event(page, PGLAZYFREED);
- } else if (!mapping || !__remove_mapping(mapping, page, true,
+ /*
+ * The folio has only one reference left, which is
+ * from the isolation. After the caller puts the
+ * folio back on the lru and drops the reference, the
+ * folio will be freed anyway. It doesn't matter
+ * which lru it goes on. So we don't bother checking
+ * the dirty flag here.
+ */
+ count_vm_events(PGLAZYFREED, nr_pages);
+ count_memcg_folio_events(folio, PGLAZYFREED, nr_pages);
+ } else if (!mapping || !__remove_mapping(mapping, folio, true,
sc->target_mem_cgroup))
goto keep_locked;
- unlock_page(page);
+ folio_unlock(folio);
free_it:
/*
- * THP may get swapped out in a whole, need account
- * all base pages.
+		 * The folio may get swapped out as a whole, so we need to
+		 * account all pages in it.
*/
nr_reclaimed += nr_pages;
/*
- * Is there need to periodically free_page_list? It would
+		 * Is there a need to periodically free the free_folios list? It would
* appear not as the counts should be low
*/
- if (unlikely(PageTransHuge(page)))
- destroy_compound_page(page);
+ if (unlikely(folio_test_large(folio)))
+ destroy_large_folio(folio);
else
- list_add(&page->lru, &free_pages);
+ list_add(&folio->lru, &free_folios);
continue;
activate_locked_split:
@@ -1464,60 +2132,104 @@ activate_locked_split:
}
activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
- if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
- PageMlocked(page)))
- try_to_free_swap(page);
- VM_BUG_ON_PAGE(PageActive(page), page);
- if (!PageMlocked(page)) {
- int type = page_is_file_lru(page);
- SetPageActive(page);
+ if (folio_test_swapcache(folio) &&
+ (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio)))
+ folio_free_swap(folio);
+ VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
+ if (!folio_test_mlocked(folio)) {
+ int type = folio_is_file_lru(folio);
+ folio_set_active(folio);
stat->nr_activate[type] += nr_pages;
- count_memcg_page_event(page, PGACTIVATE);
+ count_memcg_folio_events(folio, PGACTIVATE, nr_pages);
}
keep_locked:
- unlock_page(page);
+ folio_unlock(folio);
keep:
- list_add(&page->lru, &ret_pages);
- VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
+ list_add(&folio->lru, &ret_folios);
+ VM_BUG_ON_FOLIO(folio_test_lru(folio) ||
+ folio_test_unevictable(folio), folio);
+ }
+ /* 'folio_list' is always empty here */
+
+ /* Migrate folios selected for demotion */
+ nr_reclaimed += demote_folio_list(&demote_folios, pgdat);
+ /* Folios that could not be demoted are still in @demote_folios */
+ if (!list_empty(&demote_folios)) {
+ /* Folios which weren't demoted go back on @folio_list */
+ list_splice_init(&demote_folios, folio_list);
+
+ /*
+ * goto retry to reclaim the undemoted folios in folio_list if
+ * desired.
+ *
+		 * Reclaiming directly from top tier nodes is not often desired
+		 * because it breaks the LRU ordering: in general, memory
+		 * should be reclaimed from lower tier nodes and demoted from
+		 * top tier nodes.
+		 *
+		 * However, disabling reclaim from top tier nodes entirely
+		 * would cause OOMs in edge scenarios where lower tier memory
+		 * is unreclaimable for whatever reason, e.g. memory being
+		 * mlocked or too hot to reclaim. We can disable reclaim
+		 * from top tier nodes during proactive reclaim, though, as
+		 * that is not real memory pressure.
+ */
+ if (!sc->proactive) {
+ do_demote_pass = false;
+ goto retry;
+ }
}
pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
- mem_cgroup_uncharge_list(&free_pages);
+ mem_cgroup_uncharge_list(&free_folios);
try_to_unmap_flush();
- free_unref_page_list(&free_pages);
+ free_unref_page_list(&free_folios);
- list_splice(&ret_pages, page_list);
+ list_splice(&ret_folios, folio_list);
count_vm_events(PGACTIVATE, pgactivate);
+ if (plug)
+ swap_write_unplug(plug);
return nr_reclaimed;
}
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
- struct list_head *page_list)
+ struct list_head *folio_list)
{
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
- .priority = DEF_PRIORITY,
.may_unmap = 1,
};
struct reclaim_stat stat;
unsigned int nr_reclaimed;
- struct page *page, *next;
- LIST_HEAD(clean_pages);
-
- list_for_each_entry_safe(page, next, page_list, lru) {
- if (page_is_file_lru(page) && !PageDirty(page) &&
- !__PageMovable(page) && !PageUnevictable(page)) {
- ClearPageActive(page);
- list_move(&page->lru, &clean_pages);
+ struct folio *folio, *next;
+ LIST_HEAD(clean_folios);
+ unsigned int noreclaim_flag;
+
+ list_for_each_entry_safe(folio, next, folio_list, lru) {
+ if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
+ !folio_test_dirty(folio) && !__folio_test_movable(folio) &&
+ !folio_test_unevictable(folio)) {
+ folio_clear_active(folio);
+ list_move(&folio->lru, &clean_folios);
}
}
- nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
- TTU_IGNORE_ACCESS, &stat, true);
- list_splice(&clean_pages, page_list);
- mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -nr_reclaimed);
+ /*
+ * We should be safe here since we are only dealing with file pages and
+ * we are not kswapd and therefore cannot write dirty file pages. But
+ * call memalloc_noreclaim_save() anyway, just in case these conditions
+ * change in the future.
+ */
+ noreclaim_flag = memalloc_noreclaim_save();
+ nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
+ &stat, true);
+ memalloc_noreclaim_restore(noreclaim_flag);
+
+ list_splice(&clean_folios, folio_list);
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
+ -(long)nr_reclaimed);
/*
* Since lazyfree pages are isolated from file LRU from the beginning,
* they will rotate back to anonymous LRU in the end if it failed to
@@ -1527,89 +2239,11 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
stat.nr_lazyfree_fail);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
- -stat.nr_lazyfree_fail);
+ -(long)stat.nr_lazyfree_fail);
return nr_reclaimed;
}
/*
- * Attempt to remove the specified page from its LRU. Only take this page
- * if it is of the appropriate PageActive status. Pages which are being
- * freed elsewhere are also ignored.
- *
- * page: page to consider
- * mode: one of the LRU isolation modes defined above
- *
- * returns 0 on success, -ve errno on failure.
- */
-int __isolate_lru_page(struct page *page, isolate_mode_t mode)
-{
- int ret = -EINVAL;
-
- /* Only take pages on the LRU. */
- if (!PageLRU(page))
- return ret;
-
- /* Compaction should not handle unevictable pages but CMA can do so */
- if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
- return ret;
-
- ret = -EBUSY;
-
- /*
- * To minimise LRU disruption, the caller can indicate that it only
- * wants to isolate pages it will be able to operate on without
- * blocking - clean pages for the most part.
- *
- * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages
- * that it is possible to migrate without blocking
- */
- if (mode & ISOLATE_ASYNC_MIGRATE) {
- /* All the caller can do on PageWriteback is block */
- if (PageWriteback(page))
- return ret;
-
- if (PageDirty(page)) {
- struct address_space *mapping;
- bool migrate_dirty;
-
- /*
- * Only pages without mappings or that have a
- * ->migratepage callback are possible to migrate
- * without blocking. However, we can be racing with
- * truncation so it's necessary to lock the page
- * to stabilise the mapping as truncation holds
- * the page lock until after the page is removed
- * from the page cache.
- */
- if (!trylock_page(page))
- return ret;
-
- mapping = page_mapping(page);
- migrate_dirty = !mapping || mapping->a_ops->migratepage;
- unlock_page(page);
- if (!migrate_dirty)
- return ret;
- }
- }
-
- if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
- return ret;
-
- if (likely(get_page_unless_zero(page))) {
- /*
- * Be careful not to clear PageLRU until after we're
- * sure the page is not being freed elsewhere -- the
- * page release code relies on it.
- */
- ClearPageLRU(page);
- ret = 0;
- }
-
- return ret;
-}
-
-
-/*
* Update LRU sizes after isolating pages. The LRU size updates must
* be complete before mem_cgroup_update_lru_size due to a sanity check.
*/
@@ -1627,15 +2261,36 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
}
-/**
- * pgdat->lru_lock is heavily contended. Some of the functions that
+#ifdef CONFIG_CMA
+/*
+ * It is a waste of effort to scan and reclaim CMA pages if they are not
+ * available to the current allocation context. Kswapd cannot be enrolled
+ * here because, with sc->gfp_mask = GFP_KERNEL, it cannot distinguish
+ * this scenario.
+ */
+static bool skip_cma(struct folio *folio, struct scan_control *sc)
+{
+ return !current_is_kswapd() &&
+ gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE &&
+ get_pageblock_migratetype(&folio->page) == MIGRATE_CMA;
+}
+#else
+static bool skip_cma(struct folio *folio, struct scan_control *sc)
+{
+ return false;
+}
+#endif
+
+/*
+ * Isolate folios from the lruvec to fill the @dst list, scanning up to
+ * nr_to_scan pages.
+ *
+ * lruvec->lru_lock is heavily contended. Some of the functions that
* shrink the lists perform better by taking out a batch of pages
* and working on them outside the LRU lock.
*
* For pagecache intensive workloads, this function is the hottest
* spot in the kernel (apart from copy_*_user functions).
*
- * Appropriate locks must be held before calling this function.
+ * The lru_lock must be held before calling this function.
*
* @nr_to_scan: The number of eligible pages to look through on the list.
* @lruvec: The LRU vector to pull pages from.
@@ -1646,7 +2301,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
*
* returns how many pages were moved onto *@dst.
*/
-static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
+static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
struct lruvec *lruvec, struct list_head *dst,
unsigned long *nr_scanned, struct scan_control *sc,
enum lru_list lru)
@@ -1657,67 +2312,73 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
unsigned long skipped = 0;
unsigned long scan, total_scan, nr_pages;
- LIST_HEAD(pages_skipped);
- isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
+ LIST_HEAD(folios_skipped);
total_scan = 0;
scan = 0;
while (scan < nr_to_scan && !list_empty(src)) {
- struct page *page;
-
- page = lru_to_page(src);
- prefetchw_prev_lru_page(page, src, flags);
+ struct list_head *move_to = src;
+ struct folio *folio;
- VM_BUG_ON_PAGE(!PageLRU(page), page);
+ folio = lru_to_folio(src);
+ prefetchw_prev_lru_folio(folio, src, flags);
- nr_pages = compound_nr(page);
+ nr_pages = folio_nr_pages(folio);
total_scan += nr_pages;
- if (page_zonenum(page) > sc->reclaim_idx) {
- list_move(&page->lru, &pages_skipped);
- nr_skipped[page_zonenum(page)] += nr_pages;
- continue;
+ if (folio_zonenum(folio) > sc->reclaim_idx ||
+ skip_cma(folio, sc)) {
+ nr_skipped[folio_zonenum(folio)] += nr_pages;
+ move_to = &folios_skipped;
+ goto move;
}
/*
- * Do not count skipped pages because that makes the function
- * return with no isolated pages if the LRU mostly contains
- * ineligible pages. This causes the VM to not reclaim any
- * pages, triggering a premature OOM.
- *
- * Account all tail pages of THP. This would not cause
- * premature OOM since __isolate_lru_page() returns -EBUSY
- * only when the page is being freed somewhere else.
+ * Do not count skipped folios because that makes the function
+ * return with no isolated folios if the LRU mostly contains
+ * ineligible folios. This causes the VM to not reclaim any
+ * folios, triggering a premature OOM.
+ * Account all pages in a folio.
*/
scan += nr_pages;
- switch (__isolate_lru_page(page, mode)) {
- case 0:
- nr_taken += nr_pages;
- nr_zone_taken[page_zonenum(page)] += nr_pages;
- list_move(&page->lru, dst);
- break;
- case -EBUSY:
- /* else it is being freed elsewhere */
- list_move(&page->lru, src);
- continue;
+ if (!folio_test_lru(folio))
+ goto move;
+ if (!sc->may_unmap && folio_mapped(folio))
+ goto move;
- default:
- BUG();
+ /*
+ * Be careful not to clear the lru flag until after we're
+ * sure the folio is not being freed elsewhere -- the
+ * folio release code relies on it.
+ */
+ if (unlikely(!folio_try_get(folio)))
+ goto move;
+
+ if (!folio_test_clear_lru(folio)) {
+ /* Another thread is already isolating this folio */
+ folio_put(folio);
+ goto move;
}
+
+ nr_taken += nr_pages;
+ nr_zone_taken[folio_zonenum(folio)] += nr_pages;
+ move_to = dst;
+move:
+ list_move(&folio->lru, move_to);
}
/*
- * Splice any skipped pages to the start of the LRU list. Note that
+ * Splice any skipped folios to the start of the LRU list. Note that
* this disrupts the LRU order when reclaiming for lower zones but
* we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
- * scanning would soon rescan the same pages to skip and put the
- * system at risk of premature OOM.
+ * scanning would soon rescan the same folios to skip and waste lots
+ * of cpu cycles.
*/
- if (!list_empty(&pages_skipped)) {
+ if (!list_empty(&folios_skipped)) {
int zid;
- list_splice(&pages_skipped, src);
+ list_splice(&folios_skipped, src);
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
if (!nr_skipped[zid])
continue;
@@ -1728,59 +2389,51 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
}
*nr_scanned = total_scan;
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
- total_scan, skipped, nr_taken, mode, lru);
+ total_scan, skipped, nr_taken,
+ sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru);
update_lru_sizes(lruvec, lru, nr_zone_taken);
return nr_taken;
}
/**
- * isolate_lru_page - tries to isolate a page from its LRU list
- * @page: page to isolate from its LRU list
- *
- * Isolates a @page from an LRU list, clears PageLRU and adjusts the
- * vmstat statistic corresponding to whatever LRU list the page was on.
+ * folio_isolate_lru() - Try to isolate a folio from its LRU list.
+ * @folio: Folio to isolate from its LRU list.
*
- * Returns 0 if the page was removed from an LRU list.
- * Returns -EBUSY if the page was not on an LRU list.
+ * Isolate a @folio from an LRU list and adjust the vmstat statistic
+ * corresponding to whatever LRU list the folio was on.
*
- * The returned page will have PageLRU() cleared. If it was found on
- * the active list, it will have PageActive set. If it was found on
- * the unevictable list, it will have the PageUnevictable bit set. That flag
+ * The folio will have its LRU flag cleared. If it was found on the
+ * active list, it will have the Active flag set. If it was found on the
+ * unevictable list, it will have the Unevictable flag set. These flags
* may need to be cleared by the caller before letting the page go.
*
- * The vmstat statistic corresponding to the list on which the page was
- * found will be decremented.
+ * Context:
*
- * Restrictions:
- *
- * (1) Must be called with an elevated refcount on the page. This is a
- * fundamental difference from isolate_lru_pages (which is called
+ * (1) Must be called with an elevated refcount on the folio. This is a
+ * fundamental difference from isolate_lru_folios() (which is called
* without a stable reference).
- * (2) the lru_lock must not be held.
- * (3) interrupts must be enabled.
+ * (2) The lru_lock must not be held.
+ * (3) Interrupts must be enabled.
+ *
+ * Return: true if the folio was removed from an LRU list.
+ * false if the folio was not on an LRU list.
*/
-int isolate_lru_page(struct page *page)
+bool folio_isolate_lru(struct folio *folio)
{
- int ret = -EBUSY;
+ bool ret = false;
- VM_BUG_ON_PAGE(!page_count(page), page);
- WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
+ VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio);
- if (PageLRU(page)) {
- pg_data_t *pgdat = page_pgdat(page);
+ if (folio_test_clear_lru(folio)) {
struct lruvec *lruvec;
- spin_lock_irq(&pgdat->lru_lock);
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
- if (PageLRU(page)) {
- int lru = page_lru(page);
- get_page(page);
- ClearPageLRU(page);
- del_page_from_lru_list(page, lruvec, lru);
- ret = 0;
- }
- spin_unlock_irq(&pgdat->lru_lock);
+ folio_get(folio);
+ lruvec = folio_lruvec_lock_irq(folio);
+ lruvec_del_folio(lruvec, folio);
+ unlock_page_lruvec_irq(lruvec);
+ ret = true;
}
+
return ret;
}
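
A minimal sketch of the isolate/putback pairing implied by the kdoc above, assuming a hypothetical caller that already holds a reference on the folio (restriction (1)) and wants to take it off its LRU temporarily:

	static void with_folio_off_lru(struct folio *folio)
	{
		if (!folio_isolate_lru(folio))
			return;		/* was not on an LRU list */

		/* ... operate on the isolated folio here ... */

		/* Put it back on an appropriate LRU and drop the isolation ref. */
		folio_putback_lru(folio);
	}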
@@ -1795,6 +2448,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
struct scan_control *sc)
{
unsigned long inactive, isolated;
+ bool too_many;
if (current_is_kswapd())
return 0;
@@ -1815,108 +2469,107 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
* won't get blocked by normal direct-reclaimers, forming a circular
* deadlock.
*/
- if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
+ if (gfp_has_io_fs(sc->gfp_mask))
inactive >>= 3;
- return isolated > inactive;
+ too_many = isolated > inactive;
+
+ /* Wake up tasks throttled due to too_many_isolated. */
+ if (!too_many)
+ wake_throttle_isolated(pgdat);
+
+ return too_many;
}
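
As a worked illustration of the shift above (based on the __GFP_IO | __GFP_FS check that gfp_has_io_fs() replaces): with 80,000 inactive pages on the relevant LRU, a GFP_KERNEL direct reclaimer is treated as "too many isolated" once more than 10,000 pages (80,000 >> 3) are isolated, while a reclaimer lacking one of the flags (e.g. GFP_NOFS) is only throttled once isolated pages exceed the full 80,000.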
/*
- * This moves pages from @list to corresponding LRU list.
- *
- * We move them the other way if the page is referenced by one or more
- * processes, from rmap.
- *
- * If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold zone_lru_lock across the whole operation. But if
- * the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop zone_lru_lock around each page. It's impossible to balance
- * this, so instead we remove the pages from the LRU while processing them.
- * It is safe to rely on PG_active against the non-LRU pages in here because
- * nobody will play with that bit on a non-LRU page.
- *
- * The downside is that we have to touch page->_refcount against each page.
- * But we had to alter page->flags anyway.
+ * move_folios_to_lru() moves folios from private @list to appropriate LRU list.
+ * On return, @list is reused as a list of folios to be freed by the caller.
*
* Returns the number of pages moved to the given lruvec.
*/
-
-static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
- struct list_head *list)
+static unsigned int move_folios_to_lru(struct lruvec *lruvec,
+ struct list_head *list)
{
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
int nr_pages, nr_moved = 0;
- LIST_HEAD(pages_to_free);
- struct page *page;
- enum lru_list lru;
+ LIST_HEAD(folios_to_free);
while (!list_empty(list)) {
- page = lru_to_page(list);
- VM_BUG_ON_PAGE(PageLRU(page), page);
- if (unlikely(!page_evictable(page))) {
- list_del(&page->lru);
- spin_unlock_irq(&pgdat->lru_lock);
- putback_lru_page(page);
- spin_lock_irq(&pgdat->lru_lock);
+ struct folio *folio = lru_to_folio(list);
+
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ list_del(&folio->lru);
+ if (unlikely(!folio_evictable(folio))) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ folio_putback_lru(folio);
+ spin_lock_irq(&lruvec->lru_lock);
continue;
}
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
-
- SetPageLRU(page);
- lru = page_lru(page);
- nr_pages = thp_nr_pages(page);
- update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
- list_move(&page->lru, &lruvec->lists[lru]);
+ /*
+ * The folio_set_lru needs to be kept here for list integrity.
+		 * Otherwise, if the flag were only set after the
+		 * folio_put_testzero() below, the following race with
+		 * release_pages() could corrupt the lists:
+		 *
+		 *   #0 move_folios_to_lru:  if (!folio_put_testzero())
+		 *   #1 release_pages:       if (folio_put_testzero())
+		 *   #1 release_pages:         !lru	// skips lru_lock
+		 *   #0 move_folios_to_lru:  folio_set_lru()
+		 *   #0 move_folios_to_lru:  list_add(&folio->lru, ...)
+		 *   #1 release_pages:       list_add(&folio->lru, ...)
+ */
+ folio_set_lru(folio);
- if (put_page_testzero(page)) {
- __ClearPageLRU(page);
- __ClearPageActive(page);
- del_page_from_lru_list(page, lruvec, lru);
+ if (unlikely(folio_put_testzero(folio))) {
+ __folio_clear_lru_flags(folio);
- if (unlikely(PageCompound(page))) {
- spin_unlock_irq(&pgdat->lru_lock);
- destroy_compound_page(page);
- spin_lock_irq(&pgdat->lru_lock);
+ if (unlikely(folio_test_large(folio))) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ destroy_large_folio(folio);
+ spin_lock_irq(&lruvec->lru_lock);
} else
- list_add(&page->lru, &pages_to_free);
- } else {
- nr_moved += nr_pages;
- if (PageActive(page))
- workingset_age_nonresident(lruvec, nr_pages);
+ list_add(&folio->lru, &folios_to_free);
+
+ continue;
}
+
+ /*
+ * All pages were isolated from the same lruvec (and isolation
+ * inhibits memcg migration).
+ */
+ VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
+ lruvec_add_folio(lruvec, folio);
+ nr_pages = folio_nr_pages(folio);
+ nr_moved += nr_pages;
+ if (folio_test_active(folio))
+ workingset_age_nonresident(lruvec, nr_pages);
}
/*
* To save our caller's stack, now use input list for pages to free.
*/
- list_splice(&pages_to_free, list);
+ list_splice(&folios_to_free, list);
return nr_moved;
}
/*
- * If a kernel thread (such as nfsd for loop-back mounts) services
- * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE.
- * In that case we should only throttle if the backing device it is
- * writing to is congested. In other cases it is safe to throttle.
+ * If a kernel thread (such as nfsd for loop-back mounts) services a backing
+ * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case
+ * we should not throttle. Otherwise it is safe to do so.
*/
static int current_may_throttle(void)
{
- return !(current->flags & PF_LOCAL_THROTTLE) ||
- current->backing_dev_info == NULL ||
- bdi_write_congested(current->backing_dev_info);
+ return !(current->flags & PF_LOCAL_THROTTLE);
}
/*
* shrink_inactive_list() is a helper for shrink_node(). It returns the number
* of reclaimed pages
*/
-static noinline_for_stack unsigned long
-shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
- struct scan_control *sc, enum lru_list lru)
+static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
+ struct lruvec *lruvec, struct scan_control *sc,
+ enum lru_list lru)
{
- LIST_HEAD(page_list);
+ LIST_HEAD(folio_list);
unsigned long nr_scanned;
unsigned int nr_reclaimed = 0;
unsigned long nr_taken;
@@ -1931,8 +2584,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
return 0;
/* wait a bit for the reclaimer. */
- msleep(100);
stalled = true;
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
/* We are about to die and free our memory. Return now. */
if (fatal_signal_pending(current))
@@ -1941,56 +2594,65 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
lru_add_drain();
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);
- nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
+ nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
- item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+ item = PGSCAN_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
__count_vm_events(PGSCAN_ANON + file, nr_scanned);
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
- &stat, false);
-
- spin_lock_irq(&pgdat->lru_lock);
+ nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
- move_pages_to_lru(lruvec, &page_list);
+ spin_lock_irq(&lruvec->lru_lock);
+ move_folios_to_lru(lruvec, &folio_list);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- lru_note_cost(lruvec, file, stat.nr_pageout);
- item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+ item = PGSTEAL_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
+ spin_unlock_irq(&lruvec->lru_lock);
- spin_unlock_irq(&pgdat->lru_lock);
-
- mem_cgroup_uncharge_list(&page_list);
- free_unref_page_list(&page_list);
+ lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
+ mem_cgroup_uncharge_list(&folio_list);
+ free_unref_page_list(&folio_list);
/*
- * If dirty pages are scanned that are not queued for IO, it
+ * If dirty folios are scanned that are not queued for IO, it
* implies that flushers are not doing their job. This can
- * happen when memory pressure pushes dirty pages to the end of
+ * happen when memory pressure pushes dirty folios to the end of
* the LRU before the dirty limits are breached and the dirty
* data has expired. It can also happen when the proportion of
- * dirty pages grows not through writes but through memory
+ * dirty folios grows not through writes but through memory
* pressure reclaiming all the clean cache. And in some cases,
* the flushers simply cannot keep up with the allocation
* rate. Nudge the flusher threads in case they are asleep.
*/
- if (stat.nr_unqueued_dirty == nr_taken)
+ if (stat.nr_unqueued_dirty == nr_taken) {
wakeup_flusher_threads(WB_REASON_VMSCAN);
+ /*
+		 * For cgroupv1, dirty throttling is achieved by waking up
+ * the kernel flusher here and later waiting on folios
+ * which are in writeback to finish (see shrink_folio_list()).
+ *
+ * Flusher may not be able to issue writeback quickly
+ * enough for cgroupv1 writeback throttling to work
+ * on a large system.
+ */
+ if (!writeback_throttling_sane(sc))
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
+ }
sc->nr.dirty += stat.nr_dirty;
sc->nr.congested += stat.nr_congested;
@@ -2006,6 +2668,23 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
return nr_reclaimed;
}
+/*
+ * shrink_active_list() moves folios from the active LRU to the inactive LRU.
+ *
+ * We move them the other way if the folio is referenced by one or more
+ * processes.
+ *
+ * If the folios are mostly unmapped, the processing is fast and it is
+ * appropriate to hold lru_lock across the whole operation. But if
+ * the folios are mapped, the processing is slow (folio_referenced()), so
+ * we should drop lru_lock around each folio. It's impossible to balance
+ * this, so instead we remove the folios from the LRU while processing them.
+ * It is safe to rely on the active flag against the non-LRU folios in here
+ * because nobody will play with that bit on a non-LRU folio.
+ *
+ * The downside is that we have to touch folio->_refcount against each folio.
+ * But we had to alter folio->flags anyway.
+ */
static void shrink_active_list(unsigned long nr_to_scan,
struct lruvec *lruvec,
struct scan_control *sc,
@@ -2014,10 +2693,9 @@ static void shrink_active_list(unsigned long nr_to_scan,
unsigned long nr_taken;
unsigned long nr_scanned;
unsigned long vm_flags;
- LIST_HEAD(l_hold); /* The pages which were snipped off */
+ LIST_HEAD(l_hold); /* The folios which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
- struct page *page;
unsigned nr_deactivate, nr_activate;
unsigned nr_rotated = 0;
int file = is_file_lru(lru);
@@ -2025,9 +2703,9 @@ static void shrink_active_list(unsigned long nr_to_scan,
lru_add_drain();
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);
- nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
+ nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
@@ -2036,123 +2714,129 @@ static void shrink_active_list(unsigned long nr_to_scan,
__count_vm_events(PGREFILL, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);
while (!list_empty(&l_hold)) {
+ struct folio *folio;
+
cond_resched();
- page = lru_to_page(&l_hold);
- list_del(&page->lru);
+ folio = lru_to_folio(&l_hold);
+ list_del(&folio->lru);
- if (unlikely(!page_evictable(page))) {
- putback_lru_page(page);
+ if (unlikely(!folio_evictable(folio))) {
+ folio_putback_lru(folio);
continue;
}
if (unlikely(buffer_heads_over_limit)) {
- if (page_has_private(page) && trylock_page(page)) {
- if (page_has_private(page))
- try_to_release_page(page, 0);
- unlock_page(page);
+ if (folio_test_private(folio) && folio_trylock(folio)) {
+ if (folio_test_private(folio))
+ filemap_release_folio(folio, 0);
+ folio_unlock(folio);
}
}
- if (page_referenced(page, 0, sc->target_mem_cgroup,
- &vm_flags)) {
+ /* Referenced or rmap lock contention: rotate */
+ if (folio_referenced(folio, 0, sc->target_mem_cgroup,
+ &vm_flags) != 0) {
/*
- * Identify referenced, file-backed active pages and
+ * Identify referenced, file-backed active folios and
* give them one more trip around the active list. So
* that executable code gets better chances to stay in
- * memory under moderate memory pressure. Anon pages
+ * memory under moderate memory pressure. Anon folios
* are not likely to be evicted by use-once streaming
- * IO, plus JVM can create lots of anon VM_EXEC pages,
+ * IO, plus JVM can create lots of anon VM_EXEC folios,
* so we ignore them here.
*/
- if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) {
- nr_rotated += thp_nr_pages(page);
- list_add(&page->lru, &l_active);
+ if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) {
+ nr_rotated += folio_nr_pages(folio);
+ list_add(&folio->lru, &l_active);
continue;
}
}
- ClearPageActive(page); /* we are de-activating */
- SetPageWorkingset(page);
- list_add(&page->lru, &l_inactive);
+ folio_clear_active(folio); /* we are de-activating */
+ folio_set_workingset(folio);
+ list_add(&folio->lru, &l_inactive);
}
/*
- * Move pages back to the lru list.
+ * Move folios back to the lru list.
*/
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);
- nr_activate = move_pages_to_lru(lruvec, &l_active);
- nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
- /* Keep all free pages in l_active list */
+ nr_activate = move_folios_to_lru(lruvec, &l_active);
+ nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
+ /* Keep all free folios in l_active list */
list_splice(&l_inactive, &l_active);
__count_vm_events(PGDEACTIVATE, nr_deactivate);
__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);
+ if (nr_rotated)
+ lru_note_cost(lruvec, file, 0, nr_rotated);
mem_cgroup_uncharge_list(&l_active);
free_unref_page_list(&l_active);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
nr_deactivate, nr_rotated, sc->priority, file);
}
-unsigned long reclaim_pages(struct list_head *page_list)
+static unsigned int reclaim_folio_list(struct list_head *folio_list,
+ struct pglist_data *pgdat)
{
- int nid = NUMA_NO_NODE;
- unsigned int nr_reclaimed = 0;
- LIST_HEAD(node_page_list);
struct reclaim_stat dummy_stat;
- struct page *page;
+ unsigned int nr_reclaimed;
+ struct folio *folio;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
- .priority = DEF_PRIORITY,
.may_writepage = 1,
.may_unmap = 1,
.may_swap = 1,
+ .no_demotion = 1,
};
- while (!list_empty(page_list)) {
- page = lru_to_page(page_list);
- if (nid == NUMA_NO_NODE) {
- nid = page_to_nid(page);
- INIT_LIST_HEAD(&node_page_list);
- }
+ nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false);
+ while (!list_empty(folio_list)) {
+ folio = lru_to_folio(folio_list);
+ list_del(&folio->lru);
+ folio_putback_lru(folio);
+ }
+
+ return nr_reclaimed;
+}
+
+unsigned long reclaim_pages(struct list_head *folio_list)
+{
+ int nid;
+ unsigned int nr_reclaimed = 0;
+ LIST_HEAD(node_folio_list);
+ unsigned int noreclaim_flag;
+
+ if (list_empty(folio_list))
+ return nr_reclaimed;
+
+ noreclaim_flag = memalloc_noreclaim_save();
+
+ nid = folio_nid(lru_to_folio(folio_list));
+ do {
+ struct folio *folio = lru_to_folio(folio_list);
- if (nid == page_to_nid(page)) {
- ClearPageActive(page);
- list_move(&page->lru, &node_page_list);
+ if (nid == folio_nid(folio)) {
+ folio_clear_active(folio);
+ list_move(&folio->lru, &node_folio_list);
continue;
}
- nr_reclaimed += shrink_page_list(&node_page_list,
- NODE_DATA(nid),
- &sc, 0,
- &dummy_stat, false);
- while (!list_empty(&node_page_list)) {
- page = lru_to_page(&node_page_list);
- list_del(&page->lru);
- putback_lru_page(page);
- }
+ nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
+ nid = folio_nid(lru_to_folio(folio_list));
+ } while (!list_empty(folio_list));
- nid = NUMA_NO_NODE;
- }
+ nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
- if (!list_empty(&node_page_list)) {
- nr_reclaimed += shrink_page_list(&node_page_list,
- NODE_DATA(nid),
- &sc, 0,
- &dummy_stat, false);
- while (!list_empty(&node_page_list)) {
- page = lru_to_page(&node_page_list);
- list_del(&page->lru);
- putback_lru_page(page);
- }
- }
+ memalloc_noreclaim_restore(noreclaim_flag);
return nr_reclaimed;
}
@@ -2180,13 +2864,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
* but large enough to avoid thrashing the aggregate readahead window.
*
* Both inactive lists should also be large enough that each inactive
- * page has a chance to be referenced again before it is reclaimed.
+ * folio has a chance to be referenced again before it is reclaimed.
*
* If that fails and refaulting is observed, the inactive list grows.
*
- * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
+ * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios
* on this LRU, maintained by the pageout code. An inactive_ratio
- * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
+ * of 3 means 3:1 or 25% of the folios are kept on the inactive list.
*
* total target max
* memory ratio inactive
@@ -2225,29 +2909,134 @@ enum scan_balance {
SCAN_FILE,
};
+static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
+{
+ unsigned long file;
+ struct lruvec *target_lruvec;
+
+ if (lru_gen_enabled())
+ return;
+
+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+
+ /*
+ * Flush the memory cgroup stats, so that we read accurate per-memcg
+ * lruvec stats for heuristics.
+ */
+ mem_cgroup_flush_stats();
+
+ /*
+ * Determine the scan balance between anon and file LRUs.
+ */
+ spin_lock_irq(&target_lruvec->lru_lock);
+ sc->anon_cost = target_lruvec->anon_cost;
+ sc->file_cost = target_lruvec->file_cost;
+ spin_unlock_irq(&target_lruvec->lru_lock);
+
+ /*
+ * Target desirable inactive:active list ratios for the anon
+ * and file LRU lists.
+ */
+ if (!sc->force_deactivate) {
+ unsigned long refaults;
+
+ /*
+ * When refaults are being observed, it means a new
+ * workingset is being established. Deactivate to get
+ * rid of any stale active pages quickly.
+ */
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_ANON);
+ if (refaults != target_lruvec->refaults[WORKINGSET_ANON] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+ sc->may_deactivate |= DEACTIVATE_ANON;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_ANON;
+
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_FILE);
+ if (refaults != target_lruvec->refaults[WORKINGSET_FILE] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
+ sc->may_deactivate |= DEACTIVATE_FILE;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_FILE;
+ } else
+ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
+
+ /*
+ * If we have plenty of inactive file pages that aren't
+ * thrashing, try to reclaim those first before touching
+ * anonymous pages.
+ */
+ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
+ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
+ sc->cache_trim_mode = 1;
+ else
+ sc->cache_trim_mode = 0;
+
+ /*
+ * Prevent the reclaimer from falling into the cache trap: as
+ * cache pages start out inactive, every cache fault will tip
+ * the scan balance towards the file LRU. And as the file LRU
+ * shrinks, so does the window for rotation from references.
+ * This means we have a runaway feedback loop where a tiny
+ * thrashing file LRU becomes infinitely more attractive than
+ * anon pages. Try to detect this based on file LRU size.
+ */
+ if (!cgroup_reclaim(sc)) {
+ unsigned long total_high_wmark = 0;
+ unsigned long free, anon;
+ int z;
+
+ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+ file = node_page_state(pgdat, NR_ACTIVE_FILE) +
+ node_page_state(pgdat, NR_INACTIVE_FILE);
+
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = &pgdat->node_zones[z];
+
+ if (!managed_zone(zone))
+ continue;
+
+ total_high_wmark += high_wmark_pages(zone);
+ }
+
+ /*
+ * Consider anon: if that's low too, this isn't a
+ * runaway file reclaim problem, but rather just
+ * extreme pressure. Reclaim as per usual then.
+ */
+ anon = node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ sc->file_is_tiny =
+ file + free <= total_high_wmark &&
+ !(sc->may_deactivate & DEACTIVATE_ANON) &&
+ anon >> sc->priority;
+ }
+}
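
To make the file_is_tiny heuristic above concrete, here is a minimal userspace sketch, not part of this patch, that plugs invented numbers into the same condition; DEACTIVATE_ANON's value and the priority are assumptions for illustration only.

#include <stdbool.h>
#include <stdio.h>

#define DEACTIVATE_ANON 1	/* assumed flag value, for illustration only */

int main(void)
{
	/* hypothetical node state, in pages */
	unsigned long file = 2000;		/* active + inactive file */
	unsigned long free = 3000;		/* NR_FREE_PAGES */
	unsigned long total_high_wmark = 6000;	/* sum of zone high watermarks */
	unsigned long anon = 80000;		/* NR_INACTIVE_ANON */
	unsigned int may_deactivate = 0;	/* DEACTIVATE_ANON not set */
	int priority = 12;			/* assumed DEF_PRIORITY */

	/* same condition as at the end of prepare_scan_count() */
	bool file_is_tiny = file + free <= total_high_wmark &&
			    !(may_deactivate & DEACTIVATE_ANON) &&
			    anon >> priority;

	/* 5000 <= 6000 and anon >> 12 = 19, so the file LRU counts as tiny */
	printf("file_is_tiny = %d\n", file_is_tiny);
	return 0;
}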
+
/*
* Determine how aggressively the anon and file LRU lists should be
- * scanned. The relative value of each set of LRU lists is determined
- * by looking at the fraction of the pages scanned we did rotate back
- * onto the active list instead of evict.
+ * scanned.
*
- * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
- * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
+ * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan
+ * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan
*/
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long *nr)
{
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
unsigned long anon_cost, file_cost, total_cost;
int swappiness = mem_cgroup_swappiness(memcg);
- u64 fraction[2];
+ u64 fraction[ANON_AND_FILE];
u64 denominator = 0; /* gcc */
enum scan_balance scan_balance;
unsigned long ap, fp;
enum lru_list lru;
- /* If we have no swap space, do not bother scanning anon pages. */
- if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
+ /* If we have no swap space, do not bother scanning anon folios. */
+ if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2325,15 +3114,14 @@ out:
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
unsigned long lruvec_size;
+ unsigned long low, min;
unsigned long scan;
- unsigned long protection;
lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- protection = mem_cgroup_protection(sc->target_mem_cgroup,
- memcg,
- sc->memcg_low_reclaim);
+ mem_cgroup_protection(sc->target_mem_cgroup, memcg,
+ &min, &low);
- if (protection) {
+ if (min || low) {
/*
* Scale a cgroup's reclaim pressure by proportioning
* its current usage to its memory.low or memory.min
@@ -2364,12 +3152,21 @@ out:
* hard protection.
*/
unsigned long cgroup_size = mem_cgroup_size(memcg);
+ unsigned long protection;
+
+ /* memory.low scaling, make sure we retry before OOM */
+ if (!sc->memcg_low_reclaim && low > min) {
+ protection = low;
+ sc->memcg_low_skipped = 1;
+ } else {
+ protection = min;
+ }
/* Avoid TOCTOU with earlier protection check */
cgroup_size = max(cgroup_size, protection);
scan = lruvec_size - lruvec_size * protection /
- cgroup_size;
+ (cgroup_size + 1);
/*
* Minimally target SWAP_CLUSTER_MAX pages to keep
@@ -2422,6 +3219,3056 @@ out:
}
}
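
For a feel of the proportional memory.low/memory.min scaling in the hunk above, the following standalone sketch, with invented page counts and not part of this patch, evaluates scan = lruvec_size - lruvec_size * protection / (cgroup_size + 1).

#include <stdio.h>

int main(void)
{
	/* hypothetical values, in pages */
	unsigned long lruvec_size = 1000;	/* size of this LRU list */
	unsigned long protection = 512;		/* the cgroup's memory.low (or .min) */
	unsigned long cgroup_size = 640;	/* current usage of the cgroup */

	/* avoid TOCTOU as in the patch: cgroup_size = max(cgroup_size, protection) */
	if (cgroup_size < protection)
		cgroup_size = protection;

	unsigned long scan = lruvec_size -
			     lruvec_size * protection / (cgroup_size + 1);

	/* 1000 - 1000 * 512 / 641 = 1000 - 798 = 202 pages eligible for scanning */
	printf("scan = %lu\n", scan);
	return 0;
}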
+/*
+ * Anonymous LRU management is a waste if there is
+ * ultimately no way to reclaim the memory.
+ */
+static bool can_age_anon_pages(struct pglist_data *pgdat,
+ struct scan_control *sc)
+{
+ /* Aging the anon LRU is valuable if swap is present: */
+ if (total_swap_pages > 0)
+ return true;
+
+ /* Also valuable if anon pages can be demoted: */
+ return can_demote(pgdat->node_id, sc);
+}
+
+#ifdef CONFIG_LRU_GEN
+
+#ifdef CONFIG_LRU_GEN_ENABLED
+DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap])
+#else
+DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap])
+#endif
+
+static bool should_walk_mmu(void)
+{
+ return arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK);
+}
+
+static bool should_clear_pmd_young(void)
+{
+ return arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG);
+}
+
+/******************************************************************************
+ * shorthand helpers
+ ******************************************************************************/
+
+#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
+
+#define DEFINE_MAX_SEQ(lruvec) \
+ unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
+
+#define DEFINE_MIN_SEQ(lruvec) \
+ unsigned long min_seq[ANON_AND_FILE] = { \
+ READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \
+ READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
+ }
+
+#define for_each_gen_type_zone(gen, type, zone) \
+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+
+#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
+#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
+
+static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+
+#ifdef CONFIG_MEMCG
+ if (memcg) {
+ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
+
+ /* see the comment in mem_cgroup_lruvec() */
+ if (!lruvec->pgdat)
+ lruvec->pgdat = pgdat;
+
+ return lruvec;
+ }
+#endif
+ VM_WARN_ON_ONCE(!mem_cgroup_disabled());
+
+ return &pgdat->__lruvec;
+}
+
+static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
+{
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ if (!sc->may_swap)
+ return 0;
+
+ if (!can_demote(pgdat->node_id, sc) &&
+ mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
+ return 0;
+
+ return mem_cgroup_swappiness(memcg);
+}
+
+static int get_nr_gens(struct lruvec *lruvec, int type)
+{
+ return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1;
+}
+
+static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
+{
+ /* see the comment on lru_gen_folio */
+ return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
+ get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
+ get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
+}
+
+/******************************************************************************
+ * Bloom filters
+ ******************************************************************************/
+
+/*
+ * Bloom filters with m=1<<15 and k=2 have false positive rates of ~1/5 when
+ * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
+ * bits in the bitmap, k is the number of hash functions and n is the number of
+ * inserted items.
+ *
+ * Page table walkers use one of the two filters to reduce their search space.
+ * To get rid of non-leaf entries that no longer have enough leaf entries, the
+ * aging uses the double-buffering technique to flip to the other filter each
+ * time it produces a new generation. For non-leaf entries that have enough
+ * leaf entries, the aging carries them over to the next generation in
+ * walk_pmd_range(); the eviction also report them when walking the rmap
+ * in lru_gen_look_around().
+ *
+ * For future optimizations:
+ * 1. It's not necessary to keep both filters all the time. The spare one can be
+ * freed after the RCU grace period and reallocated if needed again.
+ * 2. When reallocating, it's worth scaling its size according to the number
+ * of inserted entries in the other filter, to reduce the memory overhead on
+ * small systems and false positives on large systems.
+ * 3. Jenkins' hash function is an alternative to Knuth's.
+ */
+#define BLOOM_FILTER_SHIFT 15
+
+static inline int filter_gen_from_seq(unsigned long seq)
+{
+ return seq % NR_BLOOM_FILTERS;
+}
+
+static void get_item_key(void *item, int *key)
+{
+ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
+
+ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
+
+ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
+ key[1] = hash >> BLOOM_FILTER_SHIFT;
+}
+
+static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+{
+ int key[2];
+ unsigned long *filter;
+ int gen = filter_gen_from_seq(seq);
+
+ filter = READ_ONCE(lruvec->mm_state.filters[gen]);
+ if (!filter)
+ return true;
+
+ get_item_key(item, key);
+
+ return test_bit(key[0], filter) && test_bit(key[1], filter);
+}
+
+static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+{
+ int key[2];
+ unsigned long *filter;
+ int gen = filter_gen_from_seq(seq);
+
+ filter = READ_ONCE(lruvec->mm_state.filters[gen]);
+ if (!filter)
+ return;
+
+ get_item_key(item, key);
+
+ if (!test_bit(key[0], filter))
+ set_bit(key[0], filter);
+ if (!test_bit(key[1], filter))
+ set_bit(key[1], filter);
+}
+
+static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
+{
+ unsigned long *filter;
+ int gen = filter_gen_from_seq(seq);
+
+ filter = lruvec->mm_state.filters[gen];
+ if (filter) {
+ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
+ return;
+ }
+
+ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
+}
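
The helpers above boil down to deriving two bit indices from a single hash of the PMD pointer and setting/testing them in one bitmap. Below is a self-contained userspace sketch of the same idea, not part of this patch; the hash function is a stand-in for hash_ptr(), and the false positive arithmetic from the comment is worked out inline.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SHIFT	15			/* m = 1 << 15 bits, as in BLOOM_FILTER_SHIFT */
#define NBITS	(1UL << SHIFT)

static uint64_t bits[NBITS / 64];

/* stand-in for hash_ptr(); any decent 64-bit mix works for illustration */
static uint32_t hash_item(const void *item)
{
	uint64_t x = (uintptr_t)item;

	x *= 0x9e3779b97f4a7c15ULL;
	return x >> (64 - 2 * SHIFT);	/* keep 2 * SHIFT = 30 bits */
}

static void keys(const void *item, uint32_t key[2])
{
	uint32_t hash = hash_item(item);

	key[0] = hash & (NBITS - 1);	/* low 15 bits */
	key[1] = hash >> SHIFT;		/* high 15 bits */
}

static void bloom_add(const void *item)
{
	uint32_t key[2];

	keys(item, key);
	bits[key[0] / 64] |= 1ULL << (key[0] % 64);
	bits[key[1] / 64] |= 1ULL << (key[1] % 64);
}

static bool bloom_test(const void *item)
{
	uint32_t key[2];

	keys(item, key);
	return (bits[key[0] / 64] >> (key[0] % 64) & 1) &&
	       (bits[key[1] / 64] >> (key[1] % 64) & 1);
}

int main(void)
{
	int x;

	bloom_add(&x);
	/*
	 * False positive rate (1 - e^(-k*n/m))^k with k=2, m=32768:
	 * n=10,000 -> (1 - e^-0.61)^2 ~= 0.21 (~1/5)
	 * n=20,000 -> (1 - e^-1.22)^2 ~= 0.50 (~1/2)
	 */
	printf("present: %d\n", bloom_test(&x));
	return 0;
}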
+
+/******************************************************************************
+ * mm_struct list
+ ******************************************************************************/
+
+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
+{
+ static struct lru_gen_mm_list mm_list = {
+ .fifo = LIST_HEAD_INIT(mm_list.fifo),
+ .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
+ };
+
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ return &memcg->mm_list;
+#endif
+ VM_WARN_ON_ONCE(!mem_cgroup_disabled());
+
+ return &mm_list;
+}
+
+void lru_gen_add_mm(struct mm_struct *mm)
+{
+ int nid;
+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+
+ VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list));
+#ifdef CONFIG_MEMCG
+ VM_WARN_ON_ONCE(mm->lru_gen.memcg);
+ mm->lru_gen.memcg = memcg;
+#endif
+ spin_lock(&mm_list->lock);
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ /* the first addition since the last iteration */
+ if (lruvec->mm_state.tail == &mm_list->fifo)
+ lruvec->mm_state.tail = &mm->lru_gen.list;
+ }
+
+ list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
+
+ spin_unlock(&mm_list->lock);
+}
+
+void lru_gen_del_mm(struct mm_struct *mm)
+{
+ int nid;
+ struct lru_gen_mm_list *mm_list;
+ struct mem_cgroup *memcg = NULL;
+
+ if (list_empty(&mm->lru_gen.list))
+ return;
+
+#ifdef CONFIG_MEMCG
+ memcg = mm->lru_gen.memcg;
+#endif
+ mm_list = get_mm_list(memcg);
+
+ spin_lock(&mm_list->lock);
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ /* where the current iteration continues after */
+ if (lruvec->mm_state.head == &mm->lru_gen.list)
+ lruvec->mm_state.head = lruvec->mm_state.head->prev;
+
+ /* where the last iteration ended before */
+ if (lruvec->mm_state.tail == &mm->lru_gen.list)
+ lruvec->mm_state.tail = lruvec->mm_state.tail->next;
+ }
+
+ list_del_init(&mm->lru_gen.list);
+
+ spin_unlock(&mm_list->lock);
+
+#ifdef CONFIG_MEMCG
+ mem_cgroup_put(mm->lru_gen.memcg);
+ mm->lru_gen.memcg = NULL;
+#endif
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+ struct mem_cgroup *memcg;
+ struct task_struct *task = rcu_dereference_protected(mm->owner, true);
+
+ VM_WARN_ON_ONCE(task->mm != mm);
+ lockdep_assert_held(&task->alloc_lock);
+
+ /* for mm_update_next_owner() */
+ if (mem_cgroup_disabled())
+ return;
+
+ /* migration can happen before addition */
+ if (!mm->lru_gen.memcg)
+ return;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(task);
+ rcu_read_unlock();
+ if (memcg == mm->lru_gen.memcg)
+ return;
+
+ VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
+
+ lru_gen_del_mm(mm);
+ lru_gen_add_mm(mm);
+}
+#endif
+
+static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
+{
+ int i;
+ int hist;
+
+ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
+
+ if (walk) {
+ hist = lru_hist_from_seq(walk->max_seq);
+
+ for (i = 0; i < NR_MM_STATS; i++) {
+ WRITE_ONCE(lruvec->mm_state.stats[hist][i],
+ lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]);
+ walk->mm_stats[i] = 0;
+ }
+ }
+
+ if (NR_HIST_GENS > 1 && last) {
+ hist = lru_hist_from_seq(lruvec->mm_state.seq + 1);
+
+ for (i = 0; i < NR_MM_STATS; i++)
+ WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0);
+ }
+}
+
+static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
+{
+ int type;
+ unsigned long size = 0;
+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+ int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
+
+ if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
+ return true;
+
+ clear_bit(key, &mm->lru_gen.bitmap);
+
+ for (type = !walk->can_swap; type < ANON_AND_FILE; type++) {
+ size += type ? get_mm_counter(mm, MM_FILEPAGES) :
+ get_mm_counter(mm, MM_ANONPAGES) +
+ get_mm_counter(mm, MM_SHMEMPAGES);
+ }
+
+ if (size < MIN_LRU_BATCH)
+ return true;
+
+ return !mmget_not_zero(mm);
+}
+
+static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
+ struct mm_struct **iter)
+{
+ bool first = false;
+ bool last = false;
+ struct mm_struct *mm = NULL;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+ struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
+
+ /*
+ * mm_state->seq is incremented after each iteration of mm_list. There
+ * are three interesting cases for this page table walker:
+ * 1. It tries to start a new iteration with a stale max_seq: there is
+ * nothing left to do.
+ * 2. It started the next iteration: it needs to reset the Bloom filter
+ * so that a fresh set of PTE tables can be recorded.
+ * 3. It ended the current iteration: it needs to reset the mm stats
+ * counters and tell its caller to increment max_seq.
+ */
+ spin_lock(&mm_list->lock);
+
+ VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
+
+ if (walk->max_seq <= mm_state->seq)
+ goto done;
+
+ if (!mm_state->head)
+ mm_state->head = &mm_list->fifo;
+
+ if (mm_state->head == &mm_list->fifo)
+ first = true;
+
+ do {
+ mm_state->head = mm_state->head->next;
+ if (mm_state->head == &mm_list->fifo) {
+ WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+ last = true;
+ break;
+ }
+
+ /* force scan for those added after the last iteration */
+ if (!mm_state->tail || mm_state->tail == mm_state->head) {
+ mm_state->tail = mm_state->head->next;
+ walk->force_scan = true;
+ }
+
+ mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
+ if (should_skip_mm(mm, walk))
+ mm = NULL;
+ } while (!mm);
+done:
+ if (*iter || last)
+ reset_mm_stats(lruvec, walk, last);
+
+ spin_unlock(&mm_list->lock);
+
+ if (mm && first)
+ reset_bloom_filter(lruvec, walk->max_seq + 1);
+
+ if (*iter)
+ mmput_async(*iter);
+
+ *iter = mm;
+
+ return last;
+}
+
+static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
+{
+ bool success = false;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+ struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
+
+ spin_lock(&mm_list->lock);
+
+ VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
+
+ if (max_seq > mm_state->seq) {
+ mm_state->head = NULL;
+ mm_state->tail = NULL;
+ WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+ reset_mm_stats(lruvec, NULL, true);
+ success = true;
+ }
+
+ spin_unlock(&mm_list->lock);
+
+ return success;
+}
+
+/******************************************************************************
+ * PID controller
+ ******************************************************************************/
+
+/*
+ * A feedback loop based on Proportional-Integral-Derivative (PID) controller.
+ *
+ * The P term is refaulted/(evicted+protected) from a tier in the generation
+ * currently being evicted; the I term is the exponential moving average of the
+ * P term over the generations previously evicted, using the smoothing factor
+ * 1/2; the D term isn't supported.
+ *
+ * The setpoint (SP) is always the first tier of one type; the process variable
+ * (PV) is either any tier of the other type or any other tier of the same
+ * type.
+ *
+ * The error is the difference between the SP and the PV; the correction is to
+ * turn off protection when SP>PV or turn on protection when SP<PV.
+ *
+ * For future optimizations:
+ * 1. The D term may discount the other two terms over time so that long-lived
+ * generations can resist stale information.
+ */
+struct ctrl_pos {
+ unsigned long refaulted;
+ unsigned long total;
+ int gain;
+};
+
+static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
+ struct ctrl_pos *pos)
+{
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
+
+ pos->refaulted = lrugen->avg_refaulted[type][tier] +
+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ pos->total = lrugen->avg_total[type][tier] +
+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ pos->total += lrugen->protected[hist][type][tier - 1];
+ pos->gain = gain;
+}
+
+static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
+{
+ int hist, tier;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
+ unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
+
+ lockdep_assert_held(&lruvec->lru_lock);
+
+ if (!carryover && !clear)
+ return;
+
+ hist = lru_hist_from_seq(seq);
+
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+ if (carryover) {
+ unsigned long sum;
+
+ sum = lrugen->avg_refaulted[type][tier] +
+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
+
+ sum = lrugen->avg_total[type][tier] +
+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ sum += lrugen->protected[hist][type][tier - 1];
+ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
+ }
+
+ if (clear) {
+ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
+ atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
+ if (tier)
+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
+ }
+ }
+}
+
+static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
+{
+ /*
+ * Return true if the PV has a limited number of refaults or a lower
+ * refaulted/total than the SP.
+ */
+ return pv->refaulted < MIN_LRU_BATCH ||
+ pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
+ (sp->refaulted + 1) * pv->total * pv->gain;
+}
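
To see how the PID comparison plays out numerically, here is a standalone sketch with invented tier statistics; it is not part of this patch, and MIN_LRU_BATCH's value is assumed for illustration only.

#include <stdbool.h>
#include <stdio.h>

#define MIN_LRU_BATCH 64	/* assumed value, for illustration only */

struct ctrl_pos {
	unsigned long refaulted;
	unsigned long total;
	int gain;
};

/* same comparison as positive_ctrl_err() above */
static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
{
	return pv->refaulted < MIN_LRU_BATCH ||
	       pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
	       (sp->refaulted + 1) * pv->total * pv->gain;
}

int main(void)
{
	/* setpoint: first tier, 100 refaults out of 1000 evictions */
	struct ctrl_pos sp = { .refaulted = 100, .total = 1000, .gain = 1 };
	/* process variable: another tier, 300 refaults out of 1000 */
	struct ctrl_pos pv = { .refaulted = 300, .total = 1000, .gain = 1 };

	/* 300 * 1064 = 319200 > 101 * 1000 = 101000, so this prints 0 */
	printf("positive error: %d\n", positive_ctrl_err(&sp, &pv));
	return 0;
}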
+
+/******************************************************************************
+ * the aging
+ ******************************************************************************/
+
+/* promote pages accessed through page tables */
+static int folio_update_gen(struct folio *folio, int gen)
+{
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+
+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(!rcu_read_lock_held());
+
+ do {
+ /* lru_gen_del_folio() has isolated this page? */
+ if (!(old_flags & LRU_GEN_MASK)) {
+ /* for shrink_folio_list() */
+ new_flags = old_flags | BIT(PG_referenced);
+ continue;
+ }
+
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+ } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+
+ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
+/* protect pages accessed multiple times through file descriptors */
+static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ int type = folio_is_file_lru(folio);
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+
+ VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
+
+ do {
+ new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+ /* folio_update_gen() has promoted this page? */
+ if (new_gen >= 0 && new_gen != old_gen)
+ return new_gen;
+
+ new_gen = (old_gen + 1) % MAX_NR_GENS;
+
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+ new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
+ /* for folio_end_writeback() */
+ if (reclaiming)
+ new_flags |= BIT(PG_reclaim);
+ } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+
+ lru_gen_update_size(lruvec, folio, old_gen, new_gen);
+
+ return new_gen;
+}
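
The +1 bias that folio_update_gen() and folio_inc_gen() apply when packing the generation into folio->flags is what lets an all-zero field mean "not on a multi-gen LRU list". A minimal sketch of that encoding, with an invented field position purely for illustration, not the real LRU_GEN_MASK layout:

#include <stdio.h>

/* toy layout: 3-bit generation field at bit 8 of a flags word */
#define GEN_PGOFF	8
#define GEN_MASK	(0x7UL << GEN_PGOFF)

/* store gen 0..MAX-1 as gen+1 so that a zero field means "no gen" */
static unsigned long set_gen(unsigned long flags, int gen)
{
	return (flags & ~GEN_MASK) | ((gen + 1UL) << GEN_PGOFF);
}

static int get_gen(unsigned long flags)
{
	return (int)((flags & GEN_MASK) >> GEN_PGOFF) - 1;
}

int main(void)
{
	unsigned long flags = 0;

	printf("%d\n", get_gen(flags));		/* -1: not on an lrugen list */
	flags = set_gen(flags, 2);
	printf("%d\n", get_gen(flags));		/* 2 */
	return 0;
}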
+
+static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
+ int old_gen, int new_gen)
+{
+ int type = folio_is_file_lru(folio);
+ int zone = folio_zonenum(folio);
+ int delta = folio_nr_pages(folio);
+
+ VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS);
+
+ walk->batched++;
+
+ walk->nr_pages[old_gen][type][zone] -= delta;
+ walk->nr_pages[new_gen][type][zone] += delta;
+}
+
+static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
+{
+ int gen, type, zone;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+
+ walk->batched = 0;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ enum lru_list lru = type * LRU_INACTIVE_FILE;
+ int delta = walk->nr_pages[gen][type][zone];
+
+ if (!delta)
+ continue;
+
+ walk->nr_pages[gen][type][zone] = 0;
+ WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
+ lrugen->nr_pages[gen][type][zone] + delta);
+
+ if (lru_gen_is_active(lruvec, gen))
+ lru += LRU_ACTIVE;
+ __update_lru_size(lruvec, lru, zone, delta);
+ }
+}
+
+static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
+{
+ struct address_space *mapping;
+ struct vm_area_struct *vma = args->vma;
+ struct lru_gen_mm_walk *walk = args->private;
+
+ if (!vma_is_accessible(vma))
+ return true;
+
+ if (is_vm_hugetlb_page(vma))
+ return true;
+
+ if (!vma_has_recency(vma))
+ return true;
+
+ if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL))
+ return true;
+
+ if (vma == get_gate_vma(vma->vm_mm))
+ return true;
+
+ if (vma_is_anonymous(vma))
+ return !walk->can_swap;
+
+ if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
+ return true;
+
+ mapping = vma->vm_file->f_mapping;
+ if (mapping_unevictable(mapping))
+ return true;
+
+ if (shmem_mapping(mapping))
+ return !walk->can_swap;
+
+ /* to exclude special mappings like dax, etc. */
+ return !mapping->a_ops->read_folio;
+}
+
+/*
+ * Some userspace memory allocators map many single-page VMAs. Instead of
+ * returning to the PGD table for each such VMA, finish an entire PMD
+ * table to reduce zigzags and improve cache performance.
+ */
+static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args,
+ unsigned long *vm_start, unsigned long *vm_end)
+{
+ unsigned long start = round_up(*vm_end, size);
+ unsigned long end = (start | ~mask) + 1;
+ VMA_ITERATOR(vmi, args->mm, start);
+
+ VM_WARN_ON_ONCE(mask & size);
+ VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask));
+
+ for_each_vma(vmi, args->vma) {
+ if (end && end <= args->vma->vm_start)
+ return false;
+
+ if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args))
+ continue;
+
+ *vm_start = max(start, args->vma->vm_start);
+ *vm_end = min(end - 1, args->vma->vm_end - 1) + 1;
+
+ return true;
+ }
+
+ return false;
+}
+
+static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
+{
+ unsigned long pfn = pte_pfn(pte);
+
+ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
+
+ if (!pte_present(pte) || is_zero_pfn(pfn))
+ return -1;
+
+ if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
+ return -1;
+
+ if (WARN_ON_ONCE(!pfn_valid(pfn)))
+ return -1;
+
+ return pfn;
+}
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
+static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
+{
+ unsigned long pfn = pmd_pfn(pmd);
+
+ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
+
+ if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
+ return -1;
+
+ if (WARN_ON_ONCE(pmd_devmap(pmd)))
+ return -1;
+
+ if (WARN_ON_ONCE(!pfn_valid(pfn)))
+ return -1;
+
+ return pfn;
+}
+#endif
+
+static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
+ struct pglist_data *pgdat, bool can_swap)
+{
+ struct folio *folio;
+
+ /* try to avoid unnecessary memory loads */
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+ return NULL;
+
+ folio = pfn_folio(pfn);
+ if (folio_nid(folio) != pgdat->node_id)
+ return NULL;
+
+ if (folio_memcg_rcu(folio) != memcg)
+ return NULL;
+
+ /* file VMAs can contain anon pages from COW */
+ if (!folio_is_file_lru(folio) && !can_swap)
+ return NULL;
+
+ return folio;
+}
+
+static bool suitable_to_scan(int total, int young)
+{
+ int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
+
+ /* suitable if the average number of young PTEs per cacheline is >=1 */
+ return young * n >= total;
+}
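
With 64-byte cachelines and 8-byte PTEs the clamp above yields n = 8, so a PTE table qualifies when at least one in eight of its scanned entries was young. A trivial sketch, not part of this patch and with n hard-coded to that assumed value:

#include <stdbool.h>
#include <stdio.h>

/* same test as suitable_to_scan(), with n = 64-byte line / 8-byte PTE = 8 */
static bool suitable(int total, int young)
{
	int n = 8;	/* assumed cache_line_size() / sizeof(pte_t) */

	return young * n >= total;
}

int main(void)
{
	/* 512 PTEs scanned, 70 of them young: 70 * 8 = 560 >= 512 -> scan it */
	printf("%d\n", suitable(512, 70));
	/* only 50 young: 400 < 512 -> not worth a full walk */
	printf("%d\n", suitable(512, 50));
	return 0;
}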
+
+static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
+ struct mm_walk *args)
+{
+ int i;
+ pte_t *pte;
+ spinlock_t *ptl;
+ unsigned long addr;
+ int total = 0;
+ int young = 0;
+ struct lru_gen_mm_walk *walk = args->private;
+ struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
+
+ pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl);
+ if (!pte)
+ return false;
+ if (!spin_trylock(ptl)) {
+ pte_unmap(pte);
+ return false;
+ }
+
+ arch_enter_lazy_mmu_mode();
+restart:
+ for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
+ unsigned long pfn;
+ struct folio *folio;
+ pte_t ptent = ptep_get(pte + i);
+
+ total++;
+ walk->mm_stats[MM_LEAF_TOTAL]++;
+
+ pfn = get_pte_pfn(ptent, args->vma, addr);
+ if (pfn == -1)
+ continue;
+
+ if (!pte_young(ptent)) {
+ walk->mm_stats[MM_LEAF_OLD]++;
+ continue;
+ }
+
+ folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
+ if (!folio)
+ continue;
+
+ if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
+ VM_WARN_ON_ONCE(true);
+
+ young++;
+ walk->mm_stats[MM_LEAF_YOUNG]++;
+
+ if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen >= 0 && old_gen != new_gen)
+ update_batch_size(walk, folio, old_gen, new_gen);
+ }
+
+ if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
+ goto restart;
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(pte, ptl);
+
+ return suitable_to_scan(total, young);
+}
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
+static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma,
+ struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
+{
+ int i;
+ pmd_t *pmd;
+ spinlock_t *ptl;
+ struct lru_gen_mm_walk *walk = args->private;
+ struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
+
+ VM_WARN_ON_ONCE(pud_leaf(*pud));
+
+ /* try to batch at most 1+MIN_LRU_BATCH+1 entries */
+ if (*first == -1) {
+ *first = addr;
+ bitmap_zero(bitmap, MIN_LRU_BATCH);
+ return;
+ }
+
+ i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first);
+ if (i && i <= MIN_LRU_BATCH) {
+ __set_bit(i - 1, bitmap);
+ return;
+ }
+
+ pmd = pmd_offset(pud, *first);
+
+ ptl = pmd_lockptr(args->mm, pmd);
+ if (!spin_trylock(ptl))
+ goto done;
+
+ arch_enter_lazy_mmu_mode();
+
+ do {
+ unsigned long pfn;
+ struct folio *folio;
+
+ /* don't round down the first address */
+ addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first;
+
+ pfn = get_pmd_pfn(pmd[i], vma, addr);
+ if (pfn == -1)
+ goto next;
+
+ if (!pmd_trans_huge(pmd[i])) {
+ if (should_clear_pmd_young())
+ pmdp_test_and_clear_young(vma, addr, pmd + i);
+ goto next;
+ }
+
+ folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
+ if (!folio)
+ goto next;
+
+ if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
+ goto next;
+
+ walk->mm_stats[MM_LEAF_YOUNG]++;
+
+ if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen >= 0 && old_gen != new_gen)
+ update_batch_size(walk, folio, old_gen, new_gen);
+next:
+ i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
+ } while (i <= MIN_LRU_BATCH);
+
+ arch_leave_lazy_mmu_mode();
+ spin_unlock(ptl);
+done:
+ *first = -1;
+}
+#else
+static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma,
+ struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
+{
+}
+#endif
+
+static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
+ struct mm_walk *args)
+{
+ int i;
+ pmd_t *pmd;
+ unsigned long next;
+ unsigned long addr;
+ struct vm_area_struct *vma;
+ DECLARE_BITMAP(bitmap, MIN_LRU_BATCH);
+ unsigned long first = -1;
+ struct lru_gen_mm_walk *walk = args->private;
+
+ VM_WARN_ON_ONCE(pud_leaf(*pud));
+
+ /*
+ * Finish an entire PMD in two passes: the first only reaches PTE
+ * tables to avoid taking the PMD lock; the second, if necessary, takes
+ * the PMD lock to clear the accessed bit in PMD entries.
+ */
+ pmd = pmd_offset(pud, start & PUD_MASK);
+restart:
+ /* walk_pte_range() may call get_next_vma() */
+ vma = args->vma;
+ for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
+ pmd_t val = pmdp_get_lockless(pmd + i);
+
+ next = pmd_addr_end(addr, end);
+
+ if (!pmd_present(val) || is_huge_zero_pmd(val)) {
+ walk->mm_stats[MM_LEAF_TOTAL]++;
+ continue;
+ }
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge(val)) {
+ unsigned long pfn = pmd_pfn(val);
+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+
+ walk->mm_stats[MM_LEAF_TOTAL]++;
+
+ if (!pmd_young(val)) {
+ walk->mm_stats[MM_LEAF_OLD]++;
+ continue;
+ }
+
+ /* try to avoid unnecessary memory loads */
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+ continue;
+
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
+ continue;
+ }
+#endif
+ walk->mm_stats[MM_NONLEAF_TOTAL]++;
+
+ if (should_clear_pmd_young()) {
+ if (!pmd_young(val))
+ continue;
+
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
+ }
+
+ if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
+ continue;
+
+ walk->mm_stats[MM_NONLEAF_FOUND]++;
+
+ if (!walk_pte_range(&val, addr, next, args))
+ continue;
+
+ walk->mm_stats[MM_NONLEAF_ADDED]++;
+
+ /* carry over to the next generation */
+ update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i);
+ }
+
+ walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first);
+
+ if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))
+ goto restart;
+}
+
+static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
+ struct mm_walk *args)
+{
+ int i;
+ pud_t *pud;
+ unsigned long addr;
+ unsigned long next;
+ struct lru_gen_mm_walk *walk = args->private;
+
+ VM_WARN_ON_ONCE(p4d_leaf(*p4d));
+
+ pud = pud_offset(p4d, start & P4D_MASK);
+restart:
+ for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
+ pud_t val = READ_ONCE(pud[i]);
+
+ next = pud_addr_end(addr, end);
+
+ if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
+ continue;
+
+ walk_pmd_range(&val, addr, next, args);
+
+ if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
+ end = (addr | ~PUD_MASK) + 1;
+ goto done;
+ }
+ }
+
+ if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end))
+ goto restart;
+
+ end = round_up(end, P4D_SIZE);
+done:
+ if (!end || !args->vma)
+ return 1;
+
+ walk->next_addr = max(end, args->vma->vm_start);
+
+ return -EAGAIN;
+}
+
+static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk)
+{
+ static const struct mm_walk_ops mm_walk_ops = {
+ .test_walk = should_skip_vma,
+ .p4d_entry = walk_pud_range,
+ .walk_lock = PGWALK_RDLOCK,
+ };
+
+ int err;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+ walk->next_addr = FIRST_USER_ADDRESS;
+
+ do {
+ DEFINE_MAX_SEQ(lruvec);
+
+ err = -EBUSY;
+
+ /* another thread might have called inc_max_seq() */
+ if (walk->max_seq != max_seq)
+ break;
+
+ /* folio_update_gen() requires stable folio_memcg() */
+ if (!mem_cgroup_trylock_pages(memcg))
+ break;
+
+ /* the caller might be holding the lock for write */
+ if (mmap_read_trylock(mm)) {
+ err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
+
+ mmap_read_unlock(mm);
+ }
+
+ mem_cgroup_unlock_pages();
+
+ if (walk->batched) {
+ spin_lock_irq(&lruvec->lru_lock);
+ reset_batch_size(lruvec, walk);
+ spin_unlock_irq(&lruvec->lru_lock);
+ }
+
+ cond_resched();
+ } while (err == -EAGAIN);
+}
+
+static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
+{
+ struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
+
+ if (pgdat && current_is_kswapd()) {
+ VM_WARN_ON_ONCE(walk);
+
+ walk = &pgdat->mm_walk;
+ } else if (!walk && force_alloc) {
+ VM_WARN_ON_ONCE(current_is_kswapd());
+
+ walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ }
+
+ current->reclaim_state->mm_walk = walk;
+
+ return walk;
+}
+
+static void clear_mm_walk(void)
+{
+ struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
+
+ VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages)));
+ VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats)));
+
+ current->reclaim_state->mm_walk = NULL;
+
+ if (!current_is_kswapd())
+ kfree(walk);
+}
+
+static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
+{
+ int zone;
+ int remaining = MAX_LRU_BATCH;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+ if (type == LRU_GEN_ANON && !can_swap)
+ goto done;
+
+ /* prevent cold/hot inversion if force_scan is true */
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ struct list_head *head = &lrugen->folios[old_gen][type][zone];
+
+ while (!list_empty(head)) {
+ struct folio *folio = lru_to_folio(head);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
+
+ new_gen = folio_inc_gen(lruvec, folio, false);
+ list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+done:
+ reset_ctrl_pos(lruvec, type, true);
+ WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
+
+ return true;
+}
+
+static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
+{
+ int gen, type, zone;
+ bool success = false;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ DEFINE_MIN_SEQ(lruvec);
+
+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+
+ /* find the oldest populated generation */
+ for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
+ gen = lru_gen_from_seq(min_seq[type]);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ if (!list_empty(&lrugen->folios[gen][type][zone]))
+ goto next;
+ }
+
+ min_seq[type]++;
+ }
+next:
+ ;
+ }
+
+ /* see the comment on lru_gen_folio */
+ if (can_swap) {
+ min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
+ min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
+ }
+
+ for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ if (min_seq[type] == lrugen->min_seq[type])
+ continue;
+
+ reset_ctrl_pos(lruvec, type, true);
+ WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
+ success = true;
+ }
+
+ return success;
+}
+
+static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
+{
+ int prev, next;
+ int type, zone;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+restart:
+ spin_lock_irq(&lruvec->lru_lock);
+
+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+
+ for (type = ANON_AND_FILE - 1; type >= 0; type--) {
+ if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
+ continue;
+
+ VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
+
+ if (inc_min_seq(lruvec, type, can_swap))
+ continue;
+
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ goto restart;
+ }
+
+ /*
+ * Update the active/inactive LRU sizes for compatibility. Both sides of
+ * the current max_seq need to be covered, since max_seq+1 can overlap
+ * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do
+ * overlap, cold/hot inversion happens.
+ */
+ prev = lru_gen_from_seq(lrugen->max_seq - 1);
+ next = lru_gen_from_seq(lrugen->max_seq + 1);
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ enum lru_list lru = type * LRU_INACTIVE_FILE;
+ long delta = lrugen->nr_pages[prev][type][zone] -
+ lrugen->nr_pages[next][type][zone];
+
+ if (!delta)
+ continue;
+
+ __update_lru_size(lruvec, lru, zone, delta);
+ __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
+ }
+ }
+
+ for (type = 0; type < ANON_AND_FILE; type++)
+ reset_ctrl_pos(lruvec, type, false);
+
+ WRITE_ONCE(lrugen->timestamps[next], jiffies);
+ /* make sure preceding modifications appear */
+ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
+
+ spin_unlock_irq(&lruvec->lru_lock);
+}
+
+static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+ struct scan_control *sc, bool can_swap, bool force_scan)
+{
+ bool success;
+ struct lru_gen_mm_walk *walk;
+ struct mm_struct *mm = NULL;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
+
+ /* see the comment in iterate_mm_list() */
+ if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) {
+ success = false;
+ goto done;
+ }
+
+ /*
+ * If the hardware doesn't automatically set the accessed bit, fall back
+ * to lru_gen_look_around(), which only clears the accessed bit in a
+ * handful of PTEs. Spreading the work out over a period of time is
+ * usually less efficient, but it avoids bursty page faults.
+ */
+ if (!should_walk_mmu()) {
+ success = iterate_mm_list_nowalk(lruvec, max_seq);
+ goto done;
+ }
+
+ walk = set_mm_walk(NULL, true);
+ if (!walk) {
+ success = iterate_mm_list_nowalk(lruvec, max_seq);
+ goto done;
+ }
+
+ walk->lruvec = lruvec;
+ walk->max_seq = max_seq;
+ walk->can_swap = can_swap;
+ walk->force_scan = force_scan;
+
+ do {
+ success = iterate_mm_list(lruvec, walk, &mm);
+ if (mm)
+ walk_mm(lruvec, mm, walk);
+ } while (mm);
+done:
+ if (success)
+ inc_max_seq(lruvec, can_swap, force_scan);
+
+ return success;
+}
+
+/******************************************************************************
+ * working set protection
+ ******************************************************************************/
+
+static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
+{
+ int gen, type, zone;
+ unsigned long total = 0;
+ bool can_swap = get_swappiness(lruvec, sc);
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ unsigned long seq;
+
+ for (seq = min_seq[type]; seq <= max_seq; seq++) {
+ gen = lru_gen_from_seq(seq);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+ }
+ }
+
+ /* whether the size is big enough to be helpful */
+ return mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
+}
+
+static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc,
+ unsigned long min_ttl)
+{
+ int gen;
+ unsigned long birth;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ /* see the comment on lru_gen_folio */
+ gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+ if (time_is_after_jiffies(birth + min_ttl))
+ return false;
+
+ if (!lruvec_is_sizable(lruvec, sc))
+ return false;
+
+ mem_cgroup_calculate_protection(NULL, memcg);
+
+ return !mem_cgroup_below_min(NULL, memcg);
+}
+
+/* to protect the working set of the last N jiffies */
+static unsigned long lru_gen_min_ttl __read_mostly;
+
+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ struct mem_cgroup *memcg;
+ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
+
+ VM_WARN_ON_ONCE(!current_is_kswapd());
+
+ /* check the order to exclude compaction-induced reclaim */
+ if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
+ return;
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+ if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
+ mem_cgroup_iter_break(NULL, memcg);
+ return;
+ }
+
+ cond_resched();
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+ /*
+ * The main goal is to OOM kill if every generation from all memcgs is
+ * younger than min_ttl. However, another possibility is that all memcgs are
+ * either too small or below min.
+ */
+ if (mutex_trylock(&oom_lock)) {
+ struct oom_control oc = {
+ .gfp_mask = sc->gfp_mask,
+ };
+
+ out_of_memory(&oc);
+
+ mutex_unlock(&oom_lock);
+ }
+}
+
+/******************************************************************************
+ * rmap/PT walk feedback
+ ******************************************************************************/
+
+/*
+ * This function exploits spatial locality when shrink_folio_list() walks the
+ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
+ * the scan was done cacheline efficiently, it adds the PMD entry pointing to
+ * the PTE table to the Bloom filter. This forms a feedback loop between the
+ * eviction and the aging.
+ */
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+{
+ int i;
+ unsigned long start;
+ unsigned long end;
+ struct lru_gen_mm_walk *walk;
+ int young = 0;
+ pte_t *pte = pvmw->pte;
+ unsigned long addr = pvmw->address;
+ struct folio *folio = pfn_folio(pvmw->pfn);
+ struct mem_cgroup *memcg = folio_memcg(folio);
+ struct pglist_data *pgdat = folio_pgdat(folio);
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ DEFINE_MAX_SEQ(lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(max_seq);
+
+ lockdep_assert_held(pvmw->ptl);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
+
+ if (spin_is_contended(pvmw->ptl))
+ return;
+
+ /* avoid taking the LRU lock under the PTL when possible */
+ walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
+
+ start = max(addr & PMD_MASK, pvmw->vma->vm_start);
+ end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
+
+ if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
+ if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ end = start + MIN_LRU_BATCH * PAGE_SIZE;
+ else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ start = end - MIN_LRU_BATCH * PAGE_SIZE;
+ else {
+ start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2;
+ end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2;
+ }
+ }
+
+ /* folio_update_gen() requires stable folio_memcg() */
+ if (!mem_cgroup_trylock_pages(memcg))
+ return;
+
+ arch_enter_lazy_mmu_mode();
+
+ pte -= (addr - start) / PAGE_SIZE;
+
+ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
+ unsigned long pfn;
+ pte_t ptent = ptep_get(pte + i);
+
+ pfn = get_pte_pfn(ptent, pvmw->vma, addr);
+ if (pfn == -1)
+ continue;
+
+ if (!pte_young(ptent))
+ continue;
+
+ folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap);
+ if (!folio)
+ continue;
+
+ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
+ VM_WARN_ON_ONCE(true);
+
+ young++;
+
+ if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ if (walk) {
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen >= 0 && old_gen != new_gen)
+ update_batch_size(walk, folio, old_gen, new_gen);
+
+ continue;
+ }
+
+ old_gen = folio_lru_gen(folio);
+ if (old_gen < 0)
+ folio_set_referenced(folio);
+ else if (old_gen != new_gen)
+ folio_activate(folio);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ mem_cgroup_unlock_pages();
+
+ /* feedback from rmap walkers to page table walkers */
+ if (suitable_to_scan(i, young))
+ update_bloom_filter(lruvec, max_seq, pvmw->pmd);
+}
+
+/******************************************************************************
+ * memcg LRU
+ ******************************************************************************/
+
+/* see the comment on MEMCG_NR_GENS */
+enum {
+ MEMCG_LRU_NOP,
+ MEMCG_LRU_HEAD,
+ MEMCG_LRU_TAIL,
+ MEMCG_LRU_OLD,
+ MEMCG_LRU_YOUNG,
+};
+
+#ifdef CONFIG_MEMCG
+
+static int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return READ_ONCE(lruvec->lrugen.seg);
+}
+
+static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+{
+ int seg;
+ int old, new;
+ unsigned long flags;
+ int bin = get_random_u32_below(MEMCG_NR_BINS);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ spin_lock_irqsave(&pgdat->memcg_lru.lock, flags);
+
+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ seg = 0;
+ new = old = lruvec->lrugen.gen;
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (op == MEMCG_LRU_HEAD)
+ seg = MEMCG_LRU_HEAD;
+ else if (op == MEMCG_LRU_TAIL)
+ seg = MEMCG_LRU_TAIL;
+ else if (op == MEMCG_LRU_OLD)
+ new = get_memcg_gen(pgdat->memcg_lru.seq);
+ else if (op == MEMCG_LRU_YOUNG)
+ new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
+ else
+ VM_WARN_ON_ONCE(true);
+
+ hlist_nulls_del_rcu(&lruvec->lrugen.list);
+
+ if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
+ hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+ else
+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+
+ pgdat->memcg_lru.nr_memcgs[old]--;
+ pgdat->memcg_lru.nr_memcgs[new]++;
+
+ lruvec->lrugen.gen = new;
+ WRITE_ONCE(lruvec->lrugen.seg, seg);
+
+ if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+ spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags);
+}
+
+void lru_gen_online_memcg(struct mem_cgroup *memcg)
+{
+ int gen;
+ int nid;
+ int bin = get_random_u32_below(MEMCG_NR_BINS);
+
+ for_each_node(nid) {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ spin_lock_irq(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ gen = get_memcg_gen(pgdat->memcg_lru.seq);
+
+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
+ pgdat->memcg_lru.nr_memcgs[gen]++;
+
+ lruvec->lrugen.gen = gen;
+
+ spin_unlock_irq(&pgdat->memcg_lru.lock);
+ }
+}
+
+void lru_gen_offline_memcg(struct mem_cgroup *memcg)
+{
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
+ }
+}
+
+void lru_gen_release_memcg(struct mem_cgroup *memcg)
+{
+ int gen;
+ int nid;
+
+ for_each_node(nid) {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ spin_lock_irq(&pgdat->memcg_lru.lock);
+
+ if (hlist_nulls_unhashed(&lruvec->lrugen.list))
+ goto unlock;
+
+ gen = lruvec->lrugen.gen;
+
+ hlist_nulls_del_init_rcu(&lruvec->lrugen.list);
+ pgdat->memcg_lru.nr_memcgs[gen]--;
+
+ if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+unlock:
+ spin_unlock_irq(&pgdat->memcg_lru.lock);
+ }
+}
+
+void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
+{
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
+}
+
+#else /* !CONFIG_MEMCG */
+
+static int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return 0;
+}
+
+#endif
+
+/******************************************************************************
+ * the eviction
+ ******************************************************************************/
+
+static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc,
+ int tier_idx)
+{
+ bool success;
+ int gen = folio_lru_gen(folio);
+ int type = folio_is_file_lru(folio);
+ int zone = folio_zonenum(folio);
+ int delta = folio_nr_pages(folio);
+ int refs = folio_lru_refs(folio);
+ int tier = lru_tier_from_refs(refs);
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
+
+ /* unevictable */
+ if (!folio_evictable(folio)) {
+ success = lru_gen_del_folio(lruvec, folio, true);
+ VM_WARN_ON_ONCE_FOLIO(!success, folio);
+ folio_set_unevictable(folio);
+ lruvec_add_folio(lruvec, folio);
+ __count_vm_events(UNEVICTABLE_PGCULLED, delta);
+ return true;
+ }
+
+ /* dirty lazyfree */
+ if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) {
+ success = lru_gen_del_folio(lruvec, folio, true);
+ VM_WARN_ON_ONCE_FOLIO(!success, folio);
+ folio_set_swapbacked(folio);
+ lruvec_add_folio_tail(lruvec, folio);
+ return true;
+ }
+
+ /* promoted */
+ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
+ list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
+ return true;
+ }
+
+ /* protected */
+ if (tier > tier_idx) {
+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
+
+ gen = folio_inc_gen(lruvec, folio, false);
+ list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
+
+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
+ lrugen->protected[hist][type][tier - 1] + delta);
+ return true;
+ }
+
+ /* ineligible */
+ if (zone > sc->reclaim_idx) {
+ gen = folio_inc_gen(lruvec, folio, false);
+ list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
+ return true;
+ }
+
+ /* waiting for writeback */
+ if (folio_test_locked(folio) || folio_test_writeback(folio) ||
+ (type == LRU_GEN_FILE && folio_test_dirty(folio))) {
+ gen = folio_inc_gen(lruvec, folio, true);
+ list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
+ return true;
+ }
+
+ return false;
+}
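+
+/*
+ * Note on the return value above: true means the folio was already dealt
+ * with here (culled as unevictable, deferred as dirty lazyfree, promoted,
+ * protected, ineligible for sc->reclaim_idx, or deferred while locked,
+ * dirty or under writeback) and stays on an LRU list; false means the
+ * caller should try to isolate it for eviction.
+ */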
+
+static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc)
+{
+ bool success;
+
+ /* swapping inhibited */
+ if (!(sc->gfp_mask & __GFP_IO) &&
+ (folio_test_dirty(folio) ||
+ (folio_test_anon(folio) && !folio_test_swapcache(folio))))
+ return false;
+
+ /* raced with release_pages() */
+ if (!folio_try_get(folio))
+ return false;
+
+ /* raced with another isolation */
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return false;
+ }
+
+ /* see the comment on MAX_NR_TIERS */
+ if (!folio_test_referenced(folio))
+ set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
+
+ /* for shrink_folio_list() */
+ folio_clear_reclaim(folio);
+ folio_clear_referenced(folio);
+
+ success = lru_gen_del_folio(lruvec, folio, true);
+ VM_WARN_ON_ONCE_FOLIO(!success, folio);
+
+ return true;
+}
+
+static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
+ int type, int tier, struct list_head *list)
+{
+ int i;
+ int gen;
+ enum vm_event_item item;
+ int sorted = 0;
+ int scanned = 0;
+ int isolated = 0;
+ int remaining = MAX_LRU_BATCH;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+ VM_WARN_ON_ONCE(!list_empty(list));
+
+ if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
+ return 0;
+
+ gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+ for (i = MAX_NR_ZONES; i > 0; i--) {
+ LIST_HEAD(moved);
+ int skipped = 0;
+ int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
+ struct list_head *head = &lrugen->folios[gen][type][zone];
+
+ while (!list_empty(head)) {
+ struct folio *folio = lru_to_folio(head);
+ int delta = folio_nr_pages(folio);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
+
+ scanned += delta;
+
+ if (sort_folio(lruvec, folio, sc, tier))
+ sorted += delta;
+ else if (isolate_folio(lruvec, folio, sc)) {
+ list_add(&folio->lru, list);
+ isolated += delta;
+ } else {
+ list_move(&folio->lru, &moved);
+ skipped += delta;
+ }
+
+ if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH)
+ break;
+ }
+
+ if (skipped) {
+ list_splice(&moved, head);
+ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
+ }
+
+ if (!remaining || isolated >= MIN_LRU_BATCH)
+ break;
+ }
+
+ item = PGSCAN_KSWAPD + reclaimer_offset();
+ if (!cgroup_reclaim(sc)) {
+ __count_vm_events(item, isolated);
+ __count_vm_events(PGREFILL, sorted);
+ }
+ __count_memcg_events(memcg, item, isolated);
+ __count_memcg_events(memcg, PGREFILL, sorted);
+ __count_vm_events(PGSCAN_ANON + type, isolated);
+
+ /*
+ * There might not be eligible folios due to reclaim_idx. Check the
+ * remaining to prevent livelock if it's not making progress.
+ */
+ return isolated || !remaining ? scanned : 0;
+}
+
+static int get_tier_idx(struct lruvec *lruvec, int type)
+{
+ int tier;
+ struct ctrl_pos sp, pv;
+
+ /*
+ * To leave a margin for fluctuations, use a larger gain factor (1:2).
+ * This value is chosen because any other tier would have at least twice
+ * as many refaults as the first tier.
+ */
+ read_ctrl_pos(lruvec, type, 0, 1, &sp);
+ for (tier = 1; tier < MAX_NR_TIERS; tier++) {
+ read_ctrl_pos(lruvec, type, tier, 2, &pv);
+ if (!positive_ctrl_err(&sp, &pv))
+ break;
+ }
+
+ return tier - 1;
+}
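+
+/*
+ * The index returned above is consumed by sort_folio(): folios in any
+ * higher tier (the "tier > tier_idx" case there) are protected, i.e.
+ * moved up a generation instead of being evicted, and the 1:2 gain in
+ * the loop leaves the stated margin before a tier qualifies.
+ */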
+
+static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
+{
+ int type, tier;
+ struct ctrl_pos sp, pv;
+ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
+
+ /*
+ * Compare the first tier of anon with that of file to determine which
+ * type to scan. Also need to compare other tiers of the selected type
+ * with the first tier of the other type to determine the last tier (of
+ * the selected type) to evict.
+ */
+ read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
+ read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
+ type = positive_ctrl_err(&sp, &pv);
+
+ read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
+ for (tier = 1; tier < MAX_NR_TIERS; tier++) {
+ read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
+ if (!positive_ctrl_err(&sp, &pv))
+ break;
+ }
+
+ *tier_idx = tier - 1;
+
+ return type;
+}
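+
+/*
+ * The gains above follow the usual swappiness convention: anon is weighted
+ * by swappiness and file by 200 - swappiness, so e.g. swappiness 120 pits
+ * the anon tier-0 refault rate at weight 120 against the file tier-0 rate
+ * at weight 80.
+ */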
+
+static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ int *type_scanned, struct list_head *list)
+{
+ int i;
+ int type;
+ int scanned;
+ int tier = -1;
+ DEFINE_MIN_SEQ(lruvec);
+
+ /*
+ * Try to make the obvious choice first. When anon and file are both
+ * available from the same generation, interpret swappiness 1 as file
+ * first and 200 as anon first.
+ */
+ if (!swappiness)
+ type = LRU_GEN_FILE;
+ else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
+ type = LRU_GEN_ANON;
+ else if (swappiness == 1)
+ type = LRU_GEN_FILE;
+ else if (swappiness == 200)
+ type = LRU_GEN_ANON;
+ else
+ type = get_type_to_scan(lruvec, swappiness, &tier);
+
+ for (i = !swappiness; i < ANON_AND_FILE; i++) {
+ if (tier < 0)
+ tier = get_tier_idx(lruvec, type);
+
+ scanned = scan_folios(lruvec, sc, type, tier, list);
+ if (scanned)
+ break;
+
+ type = !type;
+ tier = -1;
+ }
+
+ *type_scanned = type;
+
+ return scanned;
+}
+
+static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
+{
+ int type;
+ int scanned;
+ int reclaimed;
+ LIST_HEAD(list);
+ LIST_HEAD(clean);
+ struct folio *folio;
+ struct folio *next;
+ enum vm_event_item item;
+ struct reclaim_stat stat;
+ struct lru_gen_mm_walk *walk;
+ bool skip_retry = false;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ scanned = isolate_folios(lruvec, sc, swappiness, &type, &list);
+
+ scanned += try_to_inc_min_seq(lruvec, swappiness);
+
+ if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
+ scanned = 0;
+
+ spin_unlock_irq(&lruvec->lru_lock);
+
+ if (list_empty(&list))
+ return scanned;
+retry:
+ reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+ sc->nr_reclaimed += reclaimed;
+
+ list_for_each_entry_safe_reverse(folio, next, &list, lru) {
+ if (!folio_evictable(folio)) {
+ list_del(&folio->lru);
+ folio_putback_lru(folio);
+ continue;
+ }
+
+ if (folio_test_reclaim(folio) &&
+ (folio_test_dirty(folio) || folio_test_writeback(folio))) {
+ /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
+ if (folio_test_workingset(folio))
+ folio_set_referenced(folio);
+ continue;
+ }
+
+ if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
+ folio_mapped(folio) || folio_test_locked(folio) ||
+ folio_test_dirty(folio) || folio_test_writeback(folio)) {
+ /* don't add rejected folios to the oldest generation */
+ set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
+ BIT(PG_active));
+ continue;
+ }
+
+ /* retry folios that may have missed folio_rotate_reclaimable() */
+ list_move(&folio->lru, &clean);
+ sc->nr_scanned -= folio_nr_pages(folio);
+ }
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ move_folios_to_lru(lruvec, &list);
+
+ walk = current->reclaim_state->mm_walk;
+ if (walk && walk->batched)
+ reset_batch_size(lruvec, walk);
+
+ item = PGSTEAL_KSWAPD + reclaimer_offset();
+ if (!cgroup_reclaim(sc))
+ __count_vm_events(item, reclaimed);
+ __count_memcg_events(memcg, item, reclaimed);
+ __count_vm_events(PGSTEAL_ANON + type, reclaimed);
+
+ spin_unlock_irq(&lruvec->lru_lock);
+
+ mem_cgroup_uncharge_list(&list);
+ free_unref_page_list(&list);
+
+ INIT_LIST_HEAD(&list);
+ list_splice_init(&clean, &list);
+
+ if (!list_empty(&list)) {
+ skip_retry = true;
+ goto retry;
+ }
+
+ return scanned;
+}
+
+static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
+ struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
+{
+ int gen, type, zone;
+ unsigned long old = 0;
+ unsigned long young = 0;
+ unsigned long total = 0;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ /* whether this lruvec is completely out of cold folios */
+ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
+ *nr_to_scan = 0;
+ return true;
+ }
+
+ for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ unsigned long seq;
+
+ for (seq = min_seq[type]; seq <= max_seq; seq++) {
+ unsigned long size = 0;
+
+ gen = lru_gen_from_seq(seq);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+
+ total += size;
+ if (seq == max_seq)
+ young += size;
+ else if (seq + MIN_NR_GENS == max_seq)
+ old += size;
+ }
+ }
+
+ /* try to scrape all its memory if this memcg was deleted */
+ *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
+
+ /*
+ * The aging tries to be lazy to reduce the overhead, while the eviction
+ * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
+ * ideal number of generations is MIN_NR_GENS+1.
+ */
+ if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
+ return false;
+
+ /*
+ * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
+ * of the total number of pages for each generation. A reasonable range
+ * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
+ * aging cares about the upper bound of hot pages, while the eviction
+ * cares about the lower bound of cold pages.
+ */
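+ /*
+ * For example, assuming MIN_NR_GENS == 2 (its value elsewhere in this
+ * series): aging is requested when the youngest generation (seq == max_seq)
+ * holds more than half of the pages counted above, or when the generation
+ * at max_seq - 2 holds less than a quarter of them.
+ */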
+ if (young * MIN_NR_GENS > total)
+ return true;
+ if (old * (MIN_NR_GENS + 2) < total)
+ return true;
+
+ return false;
+}
+
+/*
+ * For future optimizations:
+ * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
+ * reclaim.
+ */
+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
+{
+ unsigned long nr_to_scan;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
+ return 0;
+
+ if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
+ return nr_to_scan;
+
+ /* skip the aging path at the default priority */
+ if (sc->priority == DEF_PRIORITY)
+ return nr_to_scan;
+
+ /* skip this lruvec as it's low on cold folios */
+ return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
+}
+
+static unsigned long get_nr_to_reclaim(struct scan_control *sc)
+{
+ /* don't abort memcg reclaim to ensure fairness */
+ if (!root_reclaim(sc))
+ return -1;
+
+ return max(sc->nr_to_reclaim, compact_gap(sc->order));
+}
+
+static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ long nr_to_scan;
+ unsigned long scanned = 0;
+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+ int swappiness = get_swappiness(lruvec, sc);
+
+ /* clean file folios are more likely to exist */
+ if (swappiness && !(sc->gfp_mask & __GFP_IO))
+ swappiness = 1;
+
+ while (true) {
+ int delta;
+
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
+ if (nr_to_scan <= 0)
+ break;
+
+ delta = evict_folios(lruvec, sc, swappiness);
+ if (!delta)
+ break;
+
+ scanned += delta;
+ if (scanned >= nr_to_scan)
+ break;
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ break;
+
+ cond_resched();
+ }
+
+ /* whether try_to_inc_max_seq() was successful */
+ return nr_to_scan < 0;
+}
+
+static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
+{
+ bool success;
+ unsigned long scanned = sc->nr_scanned;
+ unsigned long reclaimed = sc->nr_reclaimed;
+ int seg = lru_gen_memcg_seg(lruvec);
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (!lruvec_is_sizable(lruvec, sc))
+ return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
+
+ mem_cgroup_calculate_protection(NULL, memcg);
+
+ if (mem_cgroup_below_min(NULL, memcg))
+ return MEMCG_LRU_YOUNG;
+
+ if (mem_cgroup_below_low(NULL, memcg)) {
+ /* see the comment on MEMCG_NR_GENS */
+ if (seg != MEMCG_LRU_TAIL)
+ return MEMCG_LRU_TAIL;
+
+ memcg_memory_event(memcg, MEMCG_LOW);
+ }
+
+ success = try_to_shrink_lruvec(lruvec, sc);
+
+ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
+
+ if (!sc->proactive)
+ vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
+ sc->nr_reclaimed - reclaimed);
+
+ flush_reclaim_state(sc);
+
+ return success ? MEMCG_LRU_YOUNG : 0;
+}
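+
+/*
+ * The MEMCG_LRU_* hint returned above is acted on by shrink_many(), which
+ * feeds it to lru_gen_rotate_memcg(); when memcgs are disabled,
+ * lru_gen_shrink_node() calls shrink_one() directly and ignores the hint.
+ */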
+
+#ifdef CONFIG_MEMCG
+
+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int op;
+ int gen;
+ int bin;
+ int first_bin;
+ struct lruvec *lruvec;
+ struct lru_gen_folio *lrugen;
+ struct mem_cgroup *memcg;
+ const struct hlist_nulls_node *pos;
+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+
+ bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
+restart:
+ op = 0;
+ memcg = NULL;
+ gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
+
+ rcu_read_lock();
+
+ hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
+ if (op) {
+ lru_gen_rotate_memcg(lruvec, op);
+ op = 0;
+ }
+
+ mem_cgroup_put(memcg);
+
+ lruvec = container_of(lrugen, struct lruvec, lrugen);
+ memcg = lruvec_memcg(lruvec);
+
+ if (!mem_cgroup_tryget(memcg)) {
+ lru_gen_release_memcg(memcg);
+ memcg = NULL;
+ continue;
+ }
+
+ rcu_read_unlock();
+
+ op = shrink_one(lruvec, sc);
+
+ rcu_read_lock();
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ break;
+ }
+
+ rcu_read_unlock();
+
+ if (op)
+ lru_gen_rotate_memcg(lruvec, op);
+
+ mem_cgroup_put(memcg);
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ return;
+
+ /* restart if raced with lru_gen_rotate_memcg() */
+ if (gen != get_nulls_value(pos))
+ goto restart;
+
+ /* try the rest of the bins of the current generation */
+ bin = get_memcg_bin(bin + 1);
+ if (bin != first_bin)
+ goto restart;
+}
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ struct blk_plug plug;
+
+ VM_WARN_ON_ONCE(root_reclaim(sc));
+ VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap);
+
+ lru_add_drain();
+
+ blk_start_plug(&plug);
+
+ set_mm_walk(NULL, sc->proactive);
+
+ if (try_to_shrink_lruvec(lruvec, sc))
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
+
+ clear_mm_walk();
+
+ blk_finish_plug(&plug);
+}
+
+#else /* !CONFIG_MEMCG */
+
+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ BUILD_BUG();
+}
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ BUILD_BUG();
+}
+
+#endif
+
+static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int priority;
+ unsigned long reclaimable;
+ struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
+
+ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
+ return;
+ /*
+ * Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
+ * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
+ * estimated reclaimed_to_scanned_ratio = inactive / total.
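+ *
+ * For instance (illustrative numbers only): with reclaimable == 65536
+ * pages after the division by MEMCG_NR_GENS below and nr_to_reclaim ==
+ * 256, the computation below gives priority = (fls_long(65536) - 1) -
+ * fls_long(255) = 16 - 8 = 8, and indeed 65536 >> 8 == 256.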
+ */
+ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
+ if (get_swappiness(lruvec, sc))
+ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ reclaimable /= MEMCG_NR_GENS;
+
+ /* round down reclaimable and round up sc->nr_to_reclaim */
+ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
+
+ sc->priority = clamp(priority, 0, DEF_PRIORITY);
+}
+
+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ struct blk_plug plug;
+ unsigned long reclaimed = sc->nr_reclaimed;
+
+ VM_WARN_ON_ONCE(!root_reclaim(sc));
+
+ /*
+ * Unmapped clean folios are already prioritized. Scanning for more of
+ * them is likely futile and can cause high reclaim latency when there
+ * is a large number of memcgs.
+ */
+ if (!sc->may_writepage || !sc->may_unmap)
+ goto done;
+
+ lru_add_drain();
+
+ blk_start_plug(&plug);
+
+ set_mm_walk(pgdat, sc->proactive);
+
+ set_initial_priority(pgdat, sc);
+
+ if (current_is_kswapd())
+ sc->nr_reclaimed = 0;
+
+ if (mem_cgroup_disabled())
+ shrink_one(&pgdat->__lruvec, sc);
+ else
+ shrink_many(pgdat, sc);
+
+ if (current_is_kswapd())
+ sc->nr_reclaimed += reclaimed;
+
+ clear_mm_walk();
+
+ blk_finish_plug(&plug);
+done:
+ /* kswapd should never fail */
+ pgdat->kswapd_failures = 0;
+}
+
+/******************************************************************************
+ * state change
+ ******************************************************************************/
+
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
+{
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+
+ if (lrugen->enabled) {
+ enum lru_list lru;
+
+ for_each_evictable_lru(lru) {
+ if (!list_empty(&lruvec->lists[lru]))
+ return false;
+ }
+ } else {
+ int gen, type, zone;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ if (!list_empty(&lrugen->folios[gen][type][zone]))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool fill_evictable(struct lruvec *lruvec)
+{
+ enum lru_list lru;
+ int remaining = MAX_LRU_BATCH;
+
+ for_each_evictable_lru(lru) {
+ int type = is_file_lru(lru);
+ bool active = is_active_lru(lru);
+ struct list_head *head = &lruvec->lists[lru];
+
+ while (!list_empty(head)) {
+ bool success;
+ struct folio *folio = lru_to_folio(head);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio);
+
+ lruvec_del_folio(lruvec, folio);
+ success = lru_gen_add_folio(lruvec, folio, false);
+ VM_WARN_ON_ONCE(!success);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool drain_evictable(struct lruvec *lruvec)
+{
+ int gen, type, zone;
+ int remaining = MAX_LRU_BATCH;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ struct list_head *head = &lruvec->lrugen.folios[gen][type][zone];
+
+ while (!list_empty(head)) {
+ bool success;
+ struct folio *folio = lru_to_folio(head);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
+
+ success = lru_gen_del_folio(lruvec, folio, false);
+ VM_WARN_ON_ONCE(!success);
+ lruvec_add_folio(lruvec, folio);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static void lru_gen_change_state(bool enabled)
+{
+ static DEFINE_MUTEX(state_mutex);
+
+ struct mem_cgroup *memcg;
+
+ cgroup_lock();
+ cpus_read_lock();
+ get_online_mems();
+ mutex_lock(&state_mutex);
+
+ if (enabled == lru_gen_enabled())
+ goto unlock;
+
+ if (enabled)
+ static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+ else
+ static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+ VM_WARN_ON_ONCE(!state_is_valid(lruvec));
+
+ lruvec->lrugen.enabled = enabled;
+
+ while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ spin_lock_irq(&lruvec->lru_lock);
+ }
+
+ spin_unlock_irq(&lruvec->lru_lock);
+ }
+
+ cond_resched();
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+unlock:
+ mutex_unlock(&state_mutex);
+ put_online_mems();
+ cpus_read_unlock();
+ cgroup_unlock();
+}
+
+/******************************************************************************
+ * sysfs interface
+ ******************************************************************************/
+
+static ssize_t min_ttl_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
+}
+
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
+static ssize_t min_ttl_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ unsigned int msecs;
+
+ if (kstrtouint(buf, 0, &msecs))
+ return -EINVAL;
+
+ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
+
+ return len;
+}
+
+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR_RW(min_ttl_ms);
+
+static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ unsigned int caps = 0;
+
+ if (get_cap(LRU_GEN_CORE))
+ caps |= BIT(LRU_GEN_CORE);
+
+ if (should_walk_mmu())
+ caps |= BIT(LRU_GEN_MM_WALK);
+
+ if (should_clear_pmd_young())
+ caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
+
+ return sysfs_emit(buf, "0x%04x\n", caps);
+}
+
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
+static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ int i;
+ unsigned int caps;
+
+ if (tolower(*buf) == 'n')
+ caps = 0;
+ else if (tolower(*buf) == 'y')
+ caps = -1;
+ else if (kstrtouint(buf, 0, &caps))
+ return -EINVAL;
+
+ for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
+ bool enabled = caps & BIT(i);
+
+ if (i == LRU_GEN_CORE)
+ lru_gen_change_state(enabled);
+ else if (enabled)
+ static_branch_enable(&lru_gen_caps[i]);
+ else
+ static_branch_disable(&lru_gen_caps[i]);
+ }
+
+ return len;
+}
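+
+/*
+ * For example (assuming the LRU_GEN_* capabilities occupy bits 0-2 in the
+ * order shown in enabled_show() above): "echo y" sets every capability,
+ * "echo n" clears them all, and "echo 0x0001" enables only LRU_GEN_CORE
+ * while disabling the other static branches.
+ */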
+
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR_RW(enabled);
+
+static struct attribute *lru_gen_attrs[] = {
+ &lru_gen_min_ttl_attr.attr,
+ &lru_gen_enabled_attr.attr,
+ NULL
+};
+
+static const struct attribute_group lru_gen_attr_group = {
+ .name = "lru_gen",
+ .attrs = lru_gen_attrs,
+};
+
+/******************************************************************************
+ * debugfs interface
+ ******************************************************************************/
+
+static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
+{
+ struct mem_cgroup *memcg;
+ loff_t nr_to_skip = *pos;
+
+ m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
+ if (!m->private)
+ return ERR_PTR(-ENOMEM);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ if (!nr_to_skip--)
+ return get_lruvec(memcg, nid);
+ }
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+ return NULL;
+}
+
+static void lru_gen_seq_stop(struct seq_file *m, void *v)
+{
+ if (!IS_ERR_OR_NULL(v))
+ mem_cgroup_iter_break(NULL, lruvec_memcg(v));
+
+ kvfree(m->private);
+ m->private = NULL;
+}
+
+static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ int nid = lruvec_pgdat(v)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(v);
+
+ ++*pos;
+
+ nid = next_memory_node(nid);
+ if (nid == MAX_NUMNODES) {
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
+ if (!memcg)
+ return NULL;
+
+ nid = first_memory_node;
+ }
+
+ return get_lruvec(memcg, nid);
+}
+
+static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
+ unsigned long max_seq, unsigned long *min_seq,
+ unsigned long seq)
+{
+ int i;
+ int type, tier;
+ int hist = lru_hist_from_seq(seq);
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+ seq_printf(m, " %10d", tier);
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ const char *s = "   ";
+ unsigned long n[3] = {};
+
+ if (seq == max_seq) {
+ s = "RT ";
+ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
+ n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
+ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
+ s = "rep";
+ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
+ }
+
+ for (i = 0; i < 3; i++)
+ seq_printf(m, " %10lu%c", n[i], s[i]);
+ }
+ seq_putc(m, '\n');
+ }
+
+ seq_puts(m, " ");
+ for (i = 0; i < NR_MM_STATS; i++) {
+ const char *s = "      ";
+ unsigned long n = 0;
+
+ if (seq == max_seq && NR_HIST_GENS == 1) {
+ s = "LOYNFA";
+ n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+ } else if (seq != max_seq && NR_HIST_GENS > 1) {
+ s = "loynfa";
+ n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+ }
+
+ seq_printf(m, " %10lu%c", n, s[i]);
+ }
+ seq_putc(m, '\n');
+}
+
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
+static int lru_gen_seq_show(struct seq_file *m, void *v)
+{
+ unsigned long seq;
+ bool full = !debugfs_real_fops(m->file)->write;
+ struct lruvec *lruvec = v;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ int nid = lruvec_pgdat(lruvec)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (nid == first_memory_node) {
+ const char *path = memcg ? m->private : "";
+
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
+#endif
+ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
+ }
+
+ seq_printf(m, " node %5d\n", nid);
+
+ if (!full)
+ seq = min_seq[LRU_GEN_ANON];
+ else if (max_seq >= MAX_NR_GENS)
+ seq = max_seq - MAX_NR_GENS + 1;
+ else
+ seq = 0;
+
+ for (; seq <= max_seq; seq++) {
+ int type, zone;
+ int gen = lru_gen_from_seq(seq);
+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+ seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth));
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ unsigned long size = 0;
+ char mark = full && seq < min_seq[type] ? 'x' : ' ';
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+
+ seq_printf(m, " %10lu%c", size, mark);
+ }
+
+ seq_putc(m, '\n');
+
+ if (full)
+ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations lru_gen_seq_ops = {
+ .start = lru_gen_seq_start,
+ .stop = lru_gen_seq_stop,
+ .next = lru_gen_seq_next,
+ .show = lru_gen_seq_show,
+};
+
+static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+ bool can_swap, bool force_scan)
+{
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (seq < max_seq)
+ return 0;
+
+ if (seq > max_seq)
+ return -EINVAL;
+
+ if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
+ return -ERANGE;
+
+ try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);
+
+ return 0;
+}
+
+static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+ int swappiness, unsigned long nr_to_reclaim)
+{
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (seq + MIN_NR_GENS > max_seq)
+ return -EINVAL;
+
+ sc->nr_reclaimed = 0;
+
+ while (!signal_pending(current)) {
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (seq < min_seq[!swappiness])
+ return 0;
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ return 0;
+
+ if (!evict_folios(lruvec, sc, swappiness))
+ return 0;
+
+ cond_resched();
+ }
+
+ return -EINTR;
+}
+
+static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
+ struct scan_control *sc, int swappiness, unsigned long opt)
+{
+ struct lruvec *lruvec;
+ int err = -EINVAL;
+ struct mem_cgroup *memcg = NULL;
+
+ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
+ return -EINVAL;
+
+ if (!mem_cgroup_disabled()) {
+ rcu_read_lock();
+
+ memcg = mem_cgroup_from_id(memcg_id);
+ if (!mem_cgroup_tryget(memcg))
+ memcg = NULL;
+
+ rcu_read_unlock();
+
+ if (!memcg)
+ return -EINVAL;
+ }
+
+ if (memcg_id != mem_cgroup_id(memcg))
+ goto done;
+
+ lruvec = get_lruvec(memcg, nid);
+
+ if (swappiness < 0)
+ swappiness = get_swappiness(lruvec, sc);
+ else if (swappiness > 200)
+ goto done;
+
+ switch (cmd) {
+ case '+':
+ err = run_aging(lruvec, seq, sc, swappiness, opt);
+ break;
+ case '-':
+ err = run_eviction(lruvec, seq, sc, swappiness, opt);
+ break;
+ }
+done:
+ mem_cgroup_put(memcg);
+
+ return err;
+}
+
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
+static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
+ size_t len, loff_t *pos)
+{
+ void *buf;
+ char *cur, *next;
+ unsigned int flags;
+ struct blk_plug plug;
+ int err = -EINVAL;
+ struct scan_control sc = {
+ .may_writepage = true,
+ .may_unmap = true,
+ .may_swap = true,
+ .reclaim_idx = MAX_NR_ZONES - 1,
+ .gfp_mask = GFP_KERNEL,
+ };
+
+ buf = kvmalloc(len + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, src, len)) {
+ kvfree(buf);
+ return -EFAULT;
+ }
+
+ set_task_reclaim_state(current, &sc.reclaim_state);
+ flags = memalloc_noreclaim_save();
+ blk_start_plug(&plug);
+ if (!set_mm_walk(NULL, true)) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ next = buf;
+ next[len] = '\0';
+
+ while ((cur = strsep(&next, ",;\n"))) {
+ int n;
+ int end;
+ char cmd;
+ unsigned int memcg_id;
+ unsigned int nid;
+ unsigned long seq;
+ unsigned int swappiness = -1;
+ unsigned long opt = -1;
+
+ cur = skip_spaces(cur);
+ if (!*cur)
+ continue;
+
+ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
+ &seq, &end, &swappiness, &end, &opt, &end);
+ if (n < 4 || cur[end]) {
+ err = -EINVAL;
+ break;
+ }
+
+ err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
+ if (err)
+ break;
+ }
+done:
+ clear_mm_walk();
+ blk_finish_plug(&plug);
+ memalloc_noreclaim_restore(flags);
+ set_task_reclaim_state(current, NULL);
+
+ kvfree(buf);
+
+ return err ? : len;
+}
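+
+/*
+ * A sketch of the command format parsed above (the admin guide referenced
+ * above is authoritative): each line is
+ *   <+|-> memcg_id node_id seq [swappiness [opt]]
+ * where '+' runs aging up to seq (opt is taken as force_scan) and '-' runs
+ * eviction on generations up to seq (opt is taken as nr_to_reclaim).
+ */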
+
+static int lru_gen_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &lru_gen_seq_ops);
+}
+
+static const struct file_operations lru_gen_rw_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .write = lru_gen_seq_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations lru_gen_ro_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/******************************************************************************
+ * initialization
+ ******************************************************************************/
+
+void lru_gen_init_lruvec(struct lruvec *lruvec)
+{
+ int i;
+ int gen, type, zone;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+
+ lrugen->max_seq = MIN_NR_GENS + 1;
+ lrugen->enabled = lru_gen_enabled();
+
+ for (i = 0; i <= MIN_NR_GENS + 1; i++)
+ lrugen->timestamps[i] = jiffies;
+
+ for_each_gen_type_zone(gen, type, zone)
+ INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);
+
+ lruvec->mm_state.seq = MIN_NR_GENS;
+}
+
+#ifdef CONFIG_MEMCG
+
+void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+ int i, j;
+
+ spin_lock_init(&pgdat->memcg_lru.lock);
+
+ for (i = 0; i < MEMCG_NR_GENS; i++) {
+ for (j = 0; j < MEMCG_NR_BINS; j++)
+ INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
+ }
+}
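+
+/*
+ * Each nulls list head above is initialized with its generation index as
+ * the nulls value; shrink_many() reads it back via get_nulls_value() to
+ * detect that it raced with lru_gen_rotate_memcg() and must restart.
+ */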
+
+void lru_gen_init_memcg(struct mem_cgroup *memcg)
+{
+ INIT_LIST_HEAD(&memcg->mm_list.fifo);
+ spin_lock_init(&memcg->mm_list.lock);
+}
+
+void lru_gen_exit_memcg(struct mem_cgroup *memcg)
+{
+ int i;
+ int nid;
+
+ VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo));
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
+ sizeof(lruvec->lrugen.nr_pages)));
+
+ lruvec->lrugen.list.next = LIST_POISON1;
+
+ for (i = 0; i < NR_BLOOM_FILTERS; i++) {
+ bitmap_free(lruvec->mm_state.filters[i]);
+ lruvec->mm_state.filters[i] = NULL;
+ }
+ }
+}
+
+#endif /* CONFIG_MEMCG */
+
+static int __init init_lru_gen(void)
+{
+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+
+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
+ pr_err("lru_gen: failed to create sysfs group\n");
+
+ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
+ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
+
+ return 0;
+};
+late_initcall(init_lru_gen);
+
+#else /* !CONFIG_LRU_GEN */
+
+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+}
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+}
+
+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
@@ -2430,8 +6277,13 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
enum lru_list lru;
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+ bool proportional_reclaim;
struct blk_plug plug;
- bool scan_adjusted;
+
+ if (lru_gen_enabled() && !root_reclaim(sc)) {
+ lru_gen_shrink_lruvec(lruvec, sc);
+ return;
+ }
get_scan_count(lruvec, sc, nr);
@@ -2449,8 +6301,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* abort proportional reclaim if either the file or anon lru has already
* dropped to zero at the first pass.
*/
- scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
- sc->priority == DEF_PRIORITY);
+ proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
+ sc->priority == DEF_PRIORITY);
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -2470,7 +6322,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
cond_resched();
- if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
+ if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
continue;
/*
@@ -2521,8 +6373,6 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
nr_scanned = targets[lru] - nr[lru];
nr[lru] = targets[lru] * (100 - percentage) / 100;
nr[lru] -= min(nr[lru], nr_scanned);
-
- scan_adjusted = true;
}
blk_finish_plug(&plug);
sc->nr_reclaimed += nr_reclaimed;
@@ -2531,7 +6381,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
+ if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+ inactive_is_low(lruvec, LRU_INACTIVE_ANON))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
}
@@ -2585,14 +6436,13 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
if (!managed_zone(zone))
continue;
- switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
- case COMPACT_SUCCESS:
- case COMPACT_CONTINUE:
+ /* Allocation can already succeed, nothing to do */
+ if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
+ sc->reclaim_idx, 0))
+ return false;
+
+ if (compaction_suitable(zone, sc->order, sc->reclaim_idx))
return false;
- default:
- /* check next zone */
- ;
- }
}
/*
@@ -2601,7 +6451,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
*/
pages_for_compaction = compact_gap(sc->order);
inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
- if (get_nr_swap_pages() > 0)
+ if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
return inactive_lru_pages > pages_for_compaction;
@@ -2628,13 +6478,13 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
mem_cgroup_calculate_protection(target_memcg, memcg);
- if (mem_cgroup_below_min(memcg)) {
+ if (mem_cgroup_below_min(target_memcg, memcg)) {
/*
* Hard protection.
* If there is no reclaimable memory, OOM.
*/
continue;
- } else if (mem_cgroup_below_low(memcg)) {
+ } else if (mem_cgroup_below_low(target_memcg, memcg)) {
/*
* Soft protection.
* Respect the protection only as long as
@@ -2657,20 +6507,24 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
sc->priority);
/* Record the group's reclaim efficiency */
- vmpressure(sc->gfp_mask, memcg, false,
- sc->nr_scanned - scanned,
- sc->nr_reclaimed - reclaimed);
+ if (!sc->proactive)
+ vmpressure(sc->gfp_mask, memcg, false,
+ sc->nr_scanned - scanned,
+ sc->nr_reclaimed - reclaimed);
} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
}
static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
- struct reclaim_state *reclaim_state = current->reclaim_state;
- unsigned long nr_reclaimed, nr_scanned;
+ unsigned long nr_reclaimed, nr_scanned, nr_node_reclaimed;
struct lruvec *target_lruvec;
bool reclaimable = false;
- unsigned long file;
+
+ if (lru_gen_enabled() && root_reclaim(sc)) {
+ lru_gen_shrink_node(pgdat, sc);
+ return;
+ }
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
@@ -2680,107 +6534,20 @@ again:
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
- /*
- * Determine the scan balance between anon and file LRUs.
- */
- spin_lock_irq(&pgdat->lru_lock);
- sc->anon_cost = target_lruvec->anon_cost;
- sc->file_cost = target_lruvec->file_cost;
- spin_unlock_irq(&pgdat->lru_lock);
-
- /*
- * Target desirable inactive:active list ratios for the anon
- * and file LRU lists.
- */
- if (!sc->force_deactivate) {
- unsigned long refaults;
-
- refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE_ANON);
- if (refaults != target_lruvec->refaults[0] ||
- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
- sc->may_deactivate |= DEACTIVATE_ANON;
- else
- sc->may_deactivate &= ~DEACTIVATE_ANON;
-
- /*
- * When refaults are being observed, it means a new
- * workingset is being established. Deactivate to get
- * rid of any stale active pages quickly.
- */
- refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE_FILE);
- if (refaults != target_lruvec->refaults[1] ||
- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
- sc->may_deactivate |= DEACTIVATE_FILE;
- else
- sc->may_deactivate &= ~DEACTIVATE_FILE;
- } else
- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
-
- /*
- * If we have plenty of inactive file pages that aren't
- * thrashing, try to reclaim those first before touching
- * anonymous pages.
- */
- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
- sc->cache_trim_mode = 1;
- else
- sc->cache_trim_mode = 0;
-
- /*
- * Prevent the reclaimer from falling into the cache trap: as
- * cache pages start out inactive, every cache fault will tip
- * the scan balance towards the file LRU. And as the file LRU
- * shrinks, so does the window for rotation from references.
- * This means we have a runaway feedback loop where a tiny
- * thrashing file LRU becomes infinitely more attractive than
- * anon pages. Try to detect this based on file LRU size.
- */
- if (!cgroup_reclaim(sc)) {
- unsigned long total_high_wmark = 0;
- unsigned long free, anon;
- int z;
-
- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
- file = node_page_state(pgdat, NR_ACTIVE_FILE) +
- node_page_state(pgdat, NR_INACTIVE_FILE);
-
- for (z = 0; z < MAX_NR_ZONES; z++) {
- struct zone *zone = &pgdat->node_zones[z];
- if (!managed_zone(zone))
- continue;
-
- total_high_wmark += high_wmark_pages(zone);
- }
-
- /*
- * Consider anon: if that's low too, this isn't a
- * runaway file reclaim problem, but rather just
- * extreme pressure. Reclaim as per usual then.
- */
- anon = node_page_state(pgdat, NR_INACTIVE_ANON);
-
- sc->file_is_tiny =
- file + free <= total_high_wmark &&
- !(sc->may_deactivate & DEACTIVATE_ANON) &&
- anon >> sc->priority;
- }
+ prepare_scan_count(pgdat, sc);
shrink_node_memcgs(pgdat, sc);
- if (reclaim_state) {
- sc->nr_reclaimed += reclaim_state->reclaimed_slab;
- reclaim_state->reclaimed_slab = 0;
- }
+ flush_reclaim_state(sc);
+
+ nr_node_reclaimed = sc->nr_reclaimed - nr_reclaimed;
/* Record the subtree's reclaim efficiency */
- vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
- sc->nr_scanned - nr_scanned,
- sc->nr_reclaimed - nr_reclaimed);
+ if (!sc->proactive)
+ vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
+ sc->nr_scanned - nr_scanned, nr_node_reclaimed);
- if (sc->nr_reclaimed - nr_reclaimed)
+ if (nr_node_reclaimed)
reclaimable = true;
if (current_is_kswapd()) {
@@ -2812,38 +6579,41 @@ again:
* If kswapd scans pages marked for immediate
* reclaim and under writeback (nr_immediate), it
* implies that pages are cycling through the LRU
- * faster than they are written so also forcibly stall.
+ * faster than they are written so forcibly stall
+ * until some pages complete writeback.
*/
if (sc->nr.immediate)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
}
/*
- * Tag a node/memcg as congested if all the dirty pages
- * scanned were backed by a congested BDI and
- * wait_iff_congested will stall.
+ * Tag a node/memcg as congested if all the dirty pages were marked
+ * for writeback and immediate reclaim (counted in nr.congested).
*
* Legacy memcg will stall in page writeback so avoid forcibly
- * stalling in wait_iff_congested().
+ * stalling in reclaim_throttle().
*/
- if ((current_is_kswapd() ||
- (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
- sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
- set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
+ if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) {
+ if (cgroup_reclaim(sc) && writeback_throttling_sane(sc))
+ set_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags);
+
+ if (current_is_kswapd())
+ set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags);
+ }
/*
- * Stall direct reclaim for IO completions if underlying BDIs
- * and node is congested. Allow kswapd to continue until it
+ * Stall direct reclaim for IO completions if the lruvec or
+ * node is congested. Allow kswapd to continue until it
* starts encountering unqueued dirty pages or cycling through
* the LRU too quickly.
*/
if (!current_is_kswapd() && current_may_throttle() &&
!sc->hibernation_mode &&
- test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
- wait_iff_congested(BLK_RW_ASYNC, HZ/10);
+ (test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) ||
+ test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags)))
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED);
- if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
- sc))
+ if (should_continue_reclaim(pgdat, nr_node_reclaimed, sc))
goto again;
/*
@@ -2864,14 +6634,14 @@ again:
static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
{
unsigned long watermark;
- enum compact_result suitable;
- suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
- if (suitable == COMPACT_SUCCESS)
- /* Allocation should succeed already. Don't reclaim. */
+ /* Allocation can already succeed, nothing to do */
+ if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
+ sc->reclaim_idx, 0))
return true;
- if (suitable == COMPACT_SKIPPED)
- /* Compaction cannot yet proceed. Do reclaim. */
+
+ /* Compaction cannot yet proceed. Do reclaim. */
+ if (!compaction_suitable(zone, sc->order, sc->reclaim_idx))
return false;
/*
@@ -2888,6 +6658,36 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
}
+static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
+{
+ /*
+ * If reclaim is making progress greater than 12% efficiency then
+ * wake all the NOPROGRESS throttled tasks.
+ */
+ if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) {
+ wait_queue_head_t *wqh;
+
+ wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
+ if (waitqueue_active(wqh))
+ wake_up(wqh);
+
+ return;
+ }
+
+ /*
+ * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will
+ * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages
+ * under writeback and marked for immediate reclaim at the tail of the
+ * LRU.
+ */
+ if (current_is_kswapd() || cgroup_reclaim(sc))
+ return;
+
+ /* Throttle if making no progress at high priorities. */
+ if (sc->priority == 1 && !sc->nr_reclaimed)
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
+}
+
/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
@@ -2904,6 +6704,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
unsigned long nr_soft_scanned;
gfp_t orig_mask;
pg_data_t *last_pgdat = NULL;
+ pg_data_t *first_pgdat = NULL;
/*
* If the number of buffer_heads in the machine exceeds the maximum
@@ -2967,6 +6768,9 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
/* need some check for avoid more shrink_zone() */
}
+ if (!first_pgdat)
+ first_pgdat = zone->zone_pgdat;
+
/* See comment about same check for global reclaim above */
if (zone->zone_pgdat == last_pgdat)
continue;
@@ -2974,6 +6778,9 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
shrink_node(zone->zone_pgdat, sc);
}
+ if (first_pgdat)
+ consider_reclaim_throttle(first_pgdat, sc);
+
/*
* Restore to original mask to avoid the impact on the caller if we
* promoted it to __GFP_HIGHMEM.
@@ -2986,11 +6793,14 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
struct lruvec *target_lruvec;
unsigned long refaults;
+ if (lru_gen_enabled())
+ return;
+
target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
- target_lruvec->refaults[0] = refaults;
+ target_lruvec->refaults[WORKINGSET_ANON] = refaults;
refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
- target_lruvec->refaults[1] = refaults;
+ target_lruvec->refaults[WORKINGSET_FILE] = refaults;
}
/*
@@ -3023,8 +6833,9 @@ retry:
__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
do {
- vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
- sc->priority);
+ if (!sc->proactive)
+ vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
+ sc->priority);
sc->nr_scanned = 0;
shrink_zones(zonelist, sc);
@@ -3056,7 +6867,7 @@ retry:
lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
zone->zone_pgdat);
- clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
+ clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags);
}
}
@@ -3117,7 +6928,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
continue;
pfmemalloc_reserve += min_wmark_pages(zone);
- free_pages += zone_page_state(zone, NR_FREE_PAGES);
+ free_pages += zone_page_state_snapshot(zone, NR_FREE_PAGES);
}
/* If there are no reserves (unexpected config) then do not throttle */
@@ -3211,18 +7022,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
* blocked waiting on the same lock. Instead, throttle for up to a
* second before continuing.
*/
- if (!(gfp_mask & __GFP_FS)) {
+ if (!(gfp_mask & __GFP_FS))
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
allow_direct_reclaim(pgdat), HZ);
+ else
+ /* Throttle until kswapd wakes the process */
+ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+ allow_direct_reclaim(pgdat));
- goto check_pending;
- }
-
- /* Throttle until kswapd wakes the process */
- wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
- allow_direct_reclaim(pgdat));
-
-check_pending:
if (fatal_signal_pending(current))
return true;
@@ -3250,7 +7057,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
* scan_control uses s8 fields for order, priority, and reclaim_idx.
* Confirm they are large enough for max values.
*/
- BUILD_BUG_ON(MAX_ORDER > S8_MAX);
+ BUILD_BUG_ON(MAX_ORDER >= S8_MAX);
BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
@@ -3318,7 +7125,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
- bool may_swap)
+ unsigned int reclaim_options)
{
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
@@ -3331,7 +7138,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.priority = DEF_PRIORITY,
.may_writepage = !laptop_mode,
.may_unmap = 1,
- .may_swap = may_swap,
+ .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
+ .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
};
/*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put
@@ -3354,13 +7162,17 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
}
#endif
-static void age_active_anon(struct pglist_data *pgdat,
- struct scan_control *sc)
+static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct mem_cgroup *memcg;
struct lruvec *lruvec;
- if (!total_swap_pages)
+ if (lru_gen_enabled()) {
+ lru_gen_age_node(pgdat, sc);
+ return;
+ }
+
+ if (!can_age_anon_pages(pgdat, sc))
return;
lruvec = mem_cgroup_lruvec(NULL, pgdat);
@@ -3420,13 +7232,16 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
if (!managed_zone(zone))
continue;
- mark = high_wmark_pages(zone);
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
+ mark = wmark_pages(zone, WMARK_PROMO);
+ else
+ mark = high_wmark_pages(zone);
if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
return true;
}
/*
- * If a node has no populated zone within highest_zoneidx, it does not
+ * If a node has no managed zone within highest_zoneidx, it does not
* need balancing by definition. This can happen if a zone-restricted
* allocation tries to wake a remote kswapd.
*/
@@ -3441,7 +7256,8 @@ static void clear_pgdat_congested(pg_data_t *pgdat)
{
struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
- clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
+ clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags);
+ clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags);
clear_bit(PGDAT_DIRTY, &pgdat->flags);
clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
}
@@ -3526,6 +7342,38 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
return sc->nr_scanned >= sc->nr_to_reclaim;
}
+/* Page allocator PCP high watermark is lowered if reclaim is active. */
+static inline void
+update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
+{
+ int i;
+ struct zone *zone;
+
+ for (i = 0; i <= highest_zoneidx; i++) {
+ zone = pgdat->node_zones + i;
+
+ if (!managed_zone(zone))
+ continue;
+
+ if (active)
+ set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
+ else
+ clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
+ }
+}
+
+static inline void
+set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
+{
+ update_reclaim_active(pgdat, highest_zoneidx, true);
+}
+
+static inline void
+clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
+{
+ update_reclaim_active(pgdat, highest_zoneidx, false);
+}
+
/*
* For kswapd, balance_pgdat() will reclaim pages across a node from zones
* that are eligible for use by the caller until at least one zone is
@@ -3557,7 +7405,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
set_task_reclaim_state(current, &sc.reclaim_state);
psi_memstall_enter(&pflags);
- __fs_reclaim_acquire();
+ __fs_reclaim_acquire(_THIS_IP_);
count_vm_event(PAGEOUTRUN);
@@ -3578,6 +7426,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
boosted = nr_boost_reclaim;
restart:
+ set_reclaim_active(pgdat, highest_zoneidx);
sc.priority = DEF_PRIORITY;
do {
unsigned long nr_reclaimed = sc.nr_reclaimed;
@@ -3643,12 +7492,11 @@ restart:
sc.may_swap = !nr_boost_reclaim;
/*
- * Do some background aging of the anon list, to give
- * pages a chance to be referenced before reclaiming. All
- * pages are rotated regardless of classzone as this is
- * about consistent aging.
+ * Do some background aging, to give pages a chance to be
+ * referenced before reclaiming. All pages are rotated
+ * regardless of classzone as this is about consistent aging.
*/
- age_active_anon(pgdat, &sc);
+ kswapd_age_node(pgdat, &sc);
/*
* If we're getting trouble reclaiming, start doing writepage
@@ -3682,9 +7530,9 @@ restart:
wake_up_all(&pgdat->pfmemalloc_wait);
/* Check if kswapd should be suspending */
- __fs_reclaim_release();
+ __fs_reclaim_release(_THIS_IP_);
ret = try_to_freeze();
- __fs_reclaim_acquire();
+ __fs_reclaim_acquire(_THIS_IP_);
if (ret || kthread_should_stop())
break;
@@ -3711,6 +7559,8 @@ restart:
pgdat->kswapd_failures++;
out:
+ clear_reclaim_active(pgdat, highest_zoneidx);
+
/* If reclaim was boosted, account for the reclaim done in this pass */
if (boosted) {
unsigned long flags;
@@ -3734,7 +7584,7 @@ out:
}
snapshot_refaults(NULL, pgdat);
- __fs_reclaim_release();
+ __fs_reclaim_release(_THIS_IP_);
psi_memstall_leave(&pflags);
set_task_reclaim_state(current, NULL);
@@ -3863,7 +7713,7 @@ static int kswapd(void *p)
{
unsigned int alloc_order, reclaim_order;
unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
- pg_data_t *pgdat = (pg_data_t*)p;
+ pg_data_t *pgdat = (pg_data_t *)p;
struct task_struct *tsk = current;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
@@ -3882,11 +7732,12 @@ static int kswapd(void *p)
* us from recursively trying to free more memory as we're
* trying to free the first piece of memory in the first place).
*/
- tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+ tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
set_freezable();
WRITE_ONCE(pgdat->kswapd_order, 0);
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
+ atomic_set(&pgdat->nr_writeback_throttled, 0);
for ( ; ; ) {
bool ret;
@@ -3899,7 +7750,7 @@ kswapd_try_sleep:
highest_zoneidx);
/* Read the new order and highest_zoneidx */
- alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
+ alloc_order = READ_ONCE(pgdat->kswapd_order);
highest_zoneidx = kswapd_highest_zoneidx(pgdat,
highest_zoneidx);
WRITE_ONCE(pgdat->kswapd_order, 0);
@@ -3932,7 +7783,7 @@ kswapd_try_sleep:
goto kswapd_try_sleep;
}
- tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
+ tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
return 0;
}
@@ -4030,39 +7881,40 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
/*
* This kswapd start function will be called by init and node-hot-add.
- * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
*/
-int kswapd_run(int nid)
+void __meminit kswapd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
- int ret = 0;
-
- if (pgdat->kswapd)
- return 0;
- pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
- if (IS_ERR(pgdat->kswapd)) {
- /* failure at boot is fatal */
- BUG_ON(system_state < SYSTEM_RUNNING);
- pr_err("Failed to start kswapd on node %d\n", nid);
- ret = PTR_ERR(pgdat->kswapd);
- pgdat->kswapd = NULL;
+ pgdat_kswapd_lock(pgdat);
+ if (!pgdat->kswapd) {
+ pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+ if (IS_ERR(pgdat->kswapd)) {
+ /* failure at boot is fatal */
+ BUG_ON(system_state < SYSTEM_RUNNING);
+ pr_err("Failed to start kswapd on node %d\n", nid);
+ pgdat->kswapd = NULL;
+ }
}
- return ret;
+ pgdat_kswapd_unlock(pgdat);
}
/*
* Called by memory hotplug when all memory in a node is offlined. Caller must
- * hold mem_hotplug_begin/end().
+ * be holding mem_hotplug_begin/done().
*/
-void kswapd_stop(int nid)
+void __meminit kswapd_stop(int nid)
{
- struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
+ pg_data_t *pgdat = NODE_DATA(nid);
+ struct task_struct *kswapd;
+ pgdat_kswapd_lock(pgdat);
+ kswapd = pgdat->kswapd;
if (kswapd) {
kthread_stop(kswapd);
- NODE_DATA(nid)->kswapd = NULL;
+ pgdat->kswapd = NULL;
}
+ pgdat_kswapd_unlock(pgdat);
}
static int __init kswapd_init(void)
@@ -4086,9 +7938,6 @@ module_init(kswapd_init)
*/
int node_reclaim_mode __read_mostly;
-#define RECLAIM_WRITE (1<<0) /* Writeout pages during reclaim */
-#define RECLAIM_UNMAP (1<<1) /* Unmap pages during reclaim */
-
/*
* Priority for NODE_RECLAIM. This determines the fraction of pages
* of a node considered for each zone_reclaim. 4 scans 1/16th of
@@ -4169,22 +8018,22 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
.may_swap = 1,
.reclaim_idx = gfp_zone(gfp_mask),
};
+ unsigned long pflags;
trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
sc.gfp_mask);
cond_resched();
+ psi_memstall_enter(&pflags);
fs_reclaim_acquire(sc.gfp_mask);
/*
* We need to be able to allocate from the reserves for RECLAIM_UNMAP
- * and we also need to be able to write out pages for RECLAIM_WRITE
- * and RECLAIM_UNMAP.
*/
noreclaim_flag = memalloc_noreclaim_save();
- p->flags |= PF_SWAPWRITE;
set_task_reclaim_state(p, &sc.reclaim_state);
- if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
+ if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages ||
+ node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) {
/*
* Free memory by calling shrink node with increasing
* priorities until we have enough memory freed.
@@ -4195,9 +8044,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
}
set_task_reclaim_state(p, NULL);
- current->flags &= ~PF_SWAPWRITE;
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(sc.gfp_mask);
+ psi_memstall_leave(&pflags);
trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
@@ -4252,59 +8101,47 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
#endif
/**
- * check_move_unevictable_pages - check pages for evictability and move to
- * appropriate zone lru list
- * @pvec: pagevec with lru pages to check
+ * check_move_unevictable_folios - Move evictable folios to appropriate zone
+ * lru list
+ * @fbatch: Batch of lru folios to check.
*
- * Checks pages for evictability, if an evictable page is in the unevictable
+ * Checks folios for evictability, if an evictable folio is in the unevictable
* lru list, moves it to the appropriate evictable lru list. This function
- * should be only used for lru pages.
+ * should be only used for lru folios.
*/
-void check_move_unevictable_pages(struct pagevec *pvec)
+void check_move_unevictable_folios(struct folio_batch *fbatch)
{
- struct lruvec *lruvec;
- struct pglist_data *pgdat = NULL;
+ struct lruvec *lruvec = NULL;
int pgscanned = 0;
int pgrescued = 0;
int i;
- for (i = 0; i < pvec->nr; i++) {
- struct page *page = pvec->pages[i];
- struct pglist_data *pagepgdat = page_pgdat(page);
- int nr_pages;
-
- if (PageTransTail(page))
- continue;
+ for (i = 0; i < fbatch->nr; i++) {
+ struct folio *folio = fbatch->folios[i];
+ int nr_pages = folio_nr_pages(folio);
- nr_pages = thp_nr_pages(page);
pgscanned += nr_pages;
- if (pagepgdat != pgdat) {
- if (pgdat)
- spin_unlock_irq(&pgdat->lru_lock);
- pgdat = pagepgdat;
- spin_lock_irq(&pgdat->lru_lock);
- }
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
-
- if (!PageLRU(page) || !PageUnevictable(page))
+ /* block memcg migration while the folio moves between lrus */
+ if (!folio_test_clear_lru(folio))
continue;
- if (page_evictable(page)) {
- enum lru_list lru = page_lru_base_type(page);
-
- VM_BUG_ON_PAGE(PageActive(page), page);
- ClearPageUnevictable(page);
- del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
- add_page_to_lru_list(page, lruvec, lru);
+ lruvec = folio_lruvec_relock_irq(folio, lruvec);
+ if (folio_evictable(folio) && folio_test_unevictable(folio)) {
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_unevictable(folio);
+ lruvec_add_folio(lruvec, folio);
pgrescued += nr_pages;
}
+ folio_set_lru(folio);
}
- if (pgdat) {
+ if (lruvec) {
__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
- spin_unlock_irq(&pgdat->lru_lock);
+ unlock_page_lruvec_irq(lruvec);
+ } else if (pgscanned) {
+ count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
}
}
-EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
+EXPORT_SYMBOL_GPL(check_move_unevictable_folios);
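check_move_unevictable_folios() above walks the whole batch and only drops and retakes the lru lock when consecutive folios belong to different lruvecs, rather than locking per pgdat up front. A standalone sketch of that relock-on-change batching, with made-up types and two lock domains standing in for lruvecs:

#include <pthread.h>
#include <stdio.h>

#define NR_LRUVECS 2

static pthread_mutex_t lruvec_lock[NR_LRUVECS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

struct item { int lruvec; int evictable; };

static void move_batch(struct item *batch, int nr)
{
	int locked = -1, scanned = 0, rescued = 0;

	for (int i = 0; i < nr; i++) {
		scanned++;
		if (batch[i].lruvec != locked) {	/* relock only when the lruvec changes */
			if (locked >= 0)
				pthread_mutex_unlock(&lruvec_lock[locked]);
			locked = batch[i].lruvec;
			pthread_mutex_lock(&lruvec_lock[locked]);
		}
		if (batch[i].evictable)			/* the kernel would move lru lists here */
			rescued++;
	}
	if (locked >= 0)
		pthread_mutex_unlock(&lruvec_lock[locked]);
	printf("scanned=%d rescued=%d\n", scanned, rescued);
}

int main(void)
{
	struct item batch[] = { {0, 1}, {0, 0}, {1, 1}, {1, 1} };

	move_batch(batch, 4);	/* scanned=4 rescued=3 */
	return 0;
}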
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4f7b4ee6aa12..b731d57996c5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -28,11 +28,10 @@
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
+#include <linux/sched/isolation.h>
#include "internal.h"
-#define NUMA_STATS_THRESHOLD (U16_MAX - 2)
-
#ifdef CONFIG_NUMA
int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
@@ -41,11 +40,12 @@ static void zero_zone_numa_counters(struct zone *zone)
{
int item, cpu;
- for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) {
- atomic_long_set(&zone->vm_numa_stat[item], 0);
- for_each_online_cpu(cpu)
- per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]
+ for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
+ atomic_long_set(&zone->vm_numa_event[item], 0);
+ for_each_online_cpu(cpu) {
+ per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
= 0;
+ }
}
}
@@ -63,8 +63,8 @@ static void zero_global_numa_counters(void)
{
int item;
- for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++)
- atomic_long_set(&vm_numa_stat[item], 0);
+ for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
+ atomic_long_set(&vm_numa_event[item], 0);
}
static void invalid_numa_statistics(void)
@@ -130,9 +130,9 @@ static void sum_vm_events(unsigned long *ret)
*/
void all_vm_events(unsigned long *ret)
{
- get_online_cpus();
+ cpus_read_lock();
sum_vm_events(ret);
- put_online_cpus();
+ cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(all_vm_events);
@@ -161,12 +161,39 @@ void vm_events_fold_cpu(int cpu)
* vm_stat contains the global counters
*/
atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
-atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
+atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_zone_stat);
-EXPORT_SYMBOL(vm_numa_stat);
EXPORT_SYMBOL(vm_node_stat);
+#ifdef CONFIG_NUMA
+static void fold_vm_zone_numa_events(struct zone *zone)
+{
+ unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
+ int cpu;
+ enum numa_stat_item item;
+
+ for_each_online_cpu(cpu) {
+ struct per_cpu_zonestat *pzstats;
+
+ pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+ for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
+ zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
+ }
+
+ for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
+ zone_numa_event_add(zone_numa_events[item], zone, item);
+}
+
+void fold_vm_numa_events(void)
+{
+ struct zone *zone;
+
+ for_each_populated_zone(zone)
+ fold_vm_zone_numa_events(zone);
+}
+#endif
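fold_vm_zone_numa_events() drains each per-CPU event counter with an atomic exchange, so increments that race with the fold are either picked up now or left intact for the next pass, never lost. A minimal userspace model of that drain, assuming a fixed, illustrative number of CPUs and event items:

#include <stdatomic.h>
#include <stdio.h>

#define NR_EVENTS 3
#define NR_CPUS   4

static atomic_ulong percpu_events[NR_CPUS][NR_EVENTS];
static unsigned long zone_events[NR_EVENTS];

static void fold_events(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		for (int i = 0; i < NR_EVENTS; i++)
			/* swap the per-CPU count to zero and keep the old value */
			zone_events[i] += atomic_exchange(&percpu_events[cpu][i], 0);
}

int main(void)
{
	atomic_fetch_add(&percpu_events[1][0], 5);
	atomic_fetch_add(&percpu_events[3][0], 2);
	fold_events();
	printf("event 0 total = %lu\n", zone_events[0]);	/* 7 */
	return 0;
}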
+
#ifdef CONFIG_SMP
int calculate_pressure_threshold(struct zone *zone)
@@ -206,7 +233,7 @@ int calculate_normal_threshold(struct zone *zone)
*
* Some sample thresholds:
*
- * Threshold Processors (fls) Zonesize fls(mem+1)
+ * Threshold Processors (fls) Zonesize fls(mem)+1
* ------------------------------------------------------------------
* 8 1 1 0.9-1 GB 4
* 16 2 2 0.9-1 GB 4
@@ -266,7 +293,7 @@ void refresh_zone_stat_thresholds(void)
for_each_online_cpu(cpu) {
int pgdat_threshold;
- per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
= threshold;
/* Base nodestat threshold on the largest populated zone. */
@@ -303,7 +330,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
threshold = (*calculate_pressure)(zone);
for_each_online_cpu(cpu)
- per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
= threshold;
}
}
@@ -316,20 +343,31 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
long delta)
{
- struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
s8 __percpu *p = pcp->vm_stat_diff + item;
long x;
long t;
+ /*
+ * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
+ * atomicity is provided by IRQs being disabled -- either explicitly
+ * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
+ * CPU migrations and preemption potentially corrupts a counter so
+ * disable preemption.
+ */
+ preempt_disable_nested();
+
x = delta + __this_cpu_read(*p);
t = __this_cpu_read(pcp->stat_threshold);
- if (unlikely(x > t || x < -t)) {
+ if (unlikely(abs(x) > t)) {
zone_page_state_add(x, zone, item);
x = 0;
}
__this_cpu_write(*p, x);
+
+ preempt_enable_nested();
}
EXPORT_SYMBOL(__mod_zone_page_state);
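__mod_zone_page_state() accumulates deltas in a small per-CPU differential and only folds them into the shared atomic counter once the magnitude crosses the per-CPU threshold; the abs(x) > t form above is equivalent to the old x > t || x < -t check. A self-contained model of that scheme, with a single "CPU" slot and illustrative names:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct stat_counter {
	atomic_long global;	/* the shared zone/node counter */
	long cpu_diff;		/* one slot per CPU in the real code */
	long threshold;		/* stat_threshold */
};

static void mod_state(struct stat_counter *c, long delta)
{
	long x = c->cpu_diff + delta;

	if (labs(x) > c->threshold) {	/* same test as abs(x) > t above */
		atomic_fetch_add(&c->global, x);
		x = 0;
	}
	c->cpu_diff = x;
}

int main(void)
{
	struct stat_counter c = { .threshold = 8 };

	for (int i = 0; i < 100; i++)
		mod_state(&c, 1);
	/* eleven folds of 9 pages each leave one page in the local diff */
	printf("global=%ld diff=%ld\n", (long)atomic_load(&c.global), c.cpu_diff);
	return 0;
}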
@@ -342,19 +380,30 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
long t;
if (vmstat_item_in_bytes(item)) {
+ /*
+ * Only cgroups use subpage accounting right now; at
+ * the global level, these items still change in
+ * multiples of whole pages. Store them as pages
+ * internally to keep the per-cpu counters compact.
+ */
VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
delta >>= PAGE_SHIFT;
}
+ /* See __mod_zone_page_state */
+ preempt_disable_nested();
+
x = delta + __this_cpu_read(*p);
t = __this_cpu_read(pcp->stat_threshold);
- if (unlikely(x > t || x < -t)) {
+ if (unlikely(abs(x) > t)) {
node_page_state_add(x, pgdat, item);
x = 0;
}
__this_cpu_write(*p, x);
+
+ preempt_enable_nested();
}
EXPORT_SYMBOL(__mod_node_page_state);
@@ -383,10 +432,13 @@ EXPORT_SYMBOL(__mod_node_page_state);
*/
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
s8 __percpu *p = pcp->vm_stat_diff + item;
s8 v, t;
+ /* See __mod_zone_page_state */
+ preempt_disable_nested();
+
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v > t)) {
@@ -395,6 +447,8 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
zone_page_state_add(v + overstep, zone, item);
__this_cpu_write(*p, -overstep);
}
+
+ preempt_enable_nested();
}
void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -405,6 +459,9 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+ /* See __mod_zone_page_state */
+ preempt_disable_nested();
+
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v > t)) {
@@ -413,6 +470,8 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
node_page_state_add(v + overstep, pgdat, item);
__this_cpu_write(*p, -overstep);
}
+
+ preempt_enable_nested();
}
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -429,10 +488,13 @@ EXPORT_SYMBOL(__inc_node_page_state);
void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
s8 __percpu *p = pcp->vm_stat_diff + item;
s8 v, t;
+ /* See __mod_zone_page_state */
+ preempt_disable_nested();
+
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v < - t)) {
@@ -441,6 +503,8 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
zone_page_state_add(v - overstep, zone, item);
__this_cpu_write(*p, overstep);
}
+
+ preempt_enable_nested();
}
void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -451,6 +515,9 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+ /* See __mod_zone_page_state */
+ preempt_disable_nested();
+
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v < - t)) {
@@ -459,6 +526,8 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
node_page_state_add(v - overstep, pgdat, item);
__this_cpu_write(*p, overstep);
}
+
+ preempt_enable_nested();
}
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -489,7 +558,7 @@ EXPORT_SYMBOL(__dec_node_page_state);
static inline void mod_zone_state(struct zone *zone,
enum zone_stat_item item, long delta, int overstep_mode)
{
- struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
s8 __percpu *p = pcp->vm_stat_diff + item;
long o, n, t, z;
@@ -511,7 +580,7 @@ static inline void mod_zone_state(struct zone *zone,
o = this_cpu_read(*p);
n = delta + o;
- if (n > t || n < -t) {
+ if (abs(n) > t) {
int os = overstep_mode * (t >> 1) ;
/* Overflow must be added to zone counters */
@@ -551,6 +620,12 @@ static inline void mod_node_state(struct pglist_data *pgdat,
long o, n, t, z;
if (vmstat_item_in_bytes(item)) {
+ /*
+ * Only cgroups use subpage accounting right now; at
+ * the global level, these items still change in
+ * multiples of whole pages. Store them as pages
+ * internally to keep the per-cpu counters compact.
+ */
VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
delta >>= PAGE_SHIFT;
}
@@ -573,7 +648,7 @@ static inline void mod_node_state(struct pglist_data *pgdat,
o = this_cpu_read(*p);
n = delta + o;
- if (n > t || n < -t) {
+ if (abs(n) > t) {
int os = overstep_mode * (t >> 1) ;
/* Overflow must be added to node counters */
@@ -694,32 +769,6 @@ EXPORT_SYMBOL(dec_node_page_state);
* Fold a differential into the global counters.
* Returns the number of counters updated.
*/
-#ifdef CONFIG_NUMA
-static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
-{
- int i;
- int changes = 0;
-
- for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- if (zone_diff[i]) {
- atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
- changes++;
- }
-
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
- if (numa_diff[i]) {
- atomic_long_add(numa_diff[i], &vm_numa_stat[i]);
- changes++;
- }
-
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- if (node_diff[i]) {
- atomic_long_add(node_diff[i], &vm_node_stat[i]);
- changes++;
- }
- return changes;
-}
-#else
static int fold_diff(int *zone_diff, int *node_diff)
{
int i;
@@ -738,7 +787,6 @@ static int fold_diff(int *zone_diff, int *node_diff)
}
return changes;
}
-#endif /* CONFIG_NUMA */
/*
* Update the zone counters for the current cpu.
@@ -762,41 +810,30 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
struct zone *zone;
int i;
int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
-#ifdef CONFIG_NUMA
- int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
-#endif
int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
int changes = 0;
for_each_populated_zone(zone) {
- struct per_cpu_pageset __percpu *p = zone->pageset;
+ struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
+#ifdef CONFIG_NUMA
+ struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
+#endif
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
int v;
- v = this_cpu_xchg(p->vm_stat_diff[i], 0);
+ v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
if (v) {
atomic_long_add(v, &zone->vm_stat[i]);
global_zone_diff[i] += v;
#ifdef CONFIG_NUMA
/* 3 seconds idle till flush */
- __this_cpu_write(p->expire, 3);
+ __this_cpu_write(pcp->expire, 3);
#endif
}
}
#ifdef CONFIG_NUMA
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
- int v;
-
- v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0);
- if (v) {
-
- atomic_long_add(v, &zone->vm_numa_stat[i]);
- global_numa_diff[i] += v;
- __this_cpu_write(p->expire, 3);
- }
- }
if (do_pagesets) {
cond_resched();
@@ -807,23 +844,23 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
* Check if there are pages remaining in this pageset
* if not then there is nothing to expire.
*/
- if (!__this_cpu_read(p->expire) ||
- !__this_cpu_read(p->pcp.count))
+ if (!__this_cpu_read(pcp->expire) ||
+ !__this_cpu_read(pcp->count))
continue;
/*
* We never drain zones local to this processor.
*/
if (zone_to_nid(zone) == numa_node_id()) {
- __this_cpu_write(p->expire, 0);
+ __this_cpu_write(pcp->expire, 0);
continue;
}
- if (__this_cpu_dec_return(p->expire))
+ if (__this_cpu_dec_return(pcp->expire))
continue;
- if (__this_cpu_read(p->pcp.count)) {
- drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
+ if (__this_cpu_read(pcp->count)) {
+ drain_zone_pages(zone, this_cpu_ptr(pcp));
changes++;
}
}
@@ -844,12 +881,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
}
}
-#ifdef CONFIG_NUMA
- changes += fold_diff(global_zone_diff, global_numa_diff,
- global_node_diff);
-#else
changes += fold_diff(global_zone_diff, global_node_diff);
-#endif
return changes;
}
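The remote-pageset handling in refresh_cpu_vm_stats() gives each zone a small expire countdown: any stat activity resets it to 3, each periodic pass decrements it, and only a zone that stayed idle for three passes and still holds cached pages gets drained. A standalone model of that countdown, with hypothetical struct and function names:

#include <stdio.h>

struct pcp { int expire; int count; };

static int shepherd_pass(struct pcp *p, int stats_changed)
{
	if (stats_changed) {
		p->expire = 3;		/* three idle passes until a flush */
		return 0;
	}
	if (!p->expire || !p->count)
		return 0;
	if (--p->expire)
		return 0;
	if (p->count) {			/* drain the cached pages */
		p->count = 0;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct pcp p = { .expire = 0, .count = 42 };
	int drained = 0;

	shepherd_pass(&p, 1);			/* activity: expire = 3 */
	for (int i = 0; i < 4; i++)
		drained += shepherd_pass(&p, 0);
	printf("drained=%d count=%d\n", drained, p.count);	/* drained=1 count=0 */
	return 0;
}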
@@ -864,36 +896,33 @@ void cpu_vm_stats_fold(int cpu)
struct zone *zone;
int i;
int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
-#ifdef CONFIG_NUMA
- int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
-#endif
int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
for_each_populated_zone(zone) {
- struct per_cpu_pageset *p;
+ struct per_cpu_zonestat *pzstats;
- p = per_cpu_ptr(zone->pageset, cpu);
+ pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
- for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- if (p->vm_stat_diff[i]) {
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+ if (pzstats->vm_stat_diff[i]) {
int v;
- v = p->vm_stat_diff[i];
- p->vm_stat_diff[i] = 0;
+ v = pzstats->vm_stat_diff[i];
+ pzstats->vm_stat_diff[i] = 0;
atomic_long_add(v, &zone->vm_stat[i]);
global_zone_diff[i] += v;
}
-
+ }
#ifdef CONFIG_NUMA
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
- if (p->vm_numa_stat_diff[i]) {
- int v;
+ for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
+ if (pzstats->vm_numa_event[i]) {
+ unsigned long v;
- v = p->vm_numa_stat_diff[i];
- p->vm_numa_stat_diff[i] = 0;
- atomic_long_add(v, &zone->vm_numa_stat[i]);
- global_numa_diff[i] += v;
+ v = pzstats->vm_numa_event[i];
+ pzstats->vm_numa_event[i] = 0;
+ zone_numa_event_add(v, zone, i);
}
+ }
#endif
}
@@ -913,58 +942,39 @@ void cpu_vm_stats_fold(int cpu)
}
}
-#ifdef CONFIG_NUMA
- fold_diff(global_zone_diff, global_numa_diff, global_node_diff);
-#else
fold_diff(global_zone_diff, global_node_diff);
-#endif
}
/*
* this is only called if !populated_zone(zone), which implies no other users of
- * pset->vm_stat_diff[] exsist.
+ * pset->vm_stat_diff[] exist.
*/
-void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
+void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
{
+ unsigned long v;
int i;
- for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- if (pset->vm_stat_diff[i]) {
- int v = pset->vm_stat_diff[i];
- pset->vm_stat_diff[i] = 0;
- atomic_long_add(v, &zone->vm_stat[i]);
- atomic_long_add(v, &vm_zone_stat[i]);
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+ if (pzstats->vm_stat_diff[i]) {
+ v = pzstats->vm_stat_diff[i];
+ pzstats->vm_stat_diff[i] = 0;
+ zone_page_state_add(v, zone, i);
}
+ }
#ifdef CONFIG_NUMA
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
- if (pset->vm_numa_stat_diff[i]) {
- int v = pset->vm_numa_stat_diff[i];
-
- pset->vm_numa_stat_diff[i] = 0;
- atomic_long_add(v, &zone->vm_numa_stat[i]);
- atomic_long_add(v, &vm_numa_stat[i]);
+ for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
+ if (pzstats->vm_numa_event[i]) {
+ v = pzstats->vm_numa_event[i];
+ pzstats->vm_numa_event[i] = 0;
+ zone_numa_event_add(v, zone, i);
}
+ }
#endif
}
#endif
#ifdef CONFIG_NUMA
-void __inc_numa_state(struct zone *zone,
- enum numa_stat_item item)
-{
- struct per_cpu_pageset __percpu *pcp = zone->pageset;
- u16 __percpu *p = pcp->vm_numa_stat_diff + item;
- u16 v;
-
- v = __this_cpu_inc_return(*p);
-
- if (unlikely(v > NUMA_STATS_THRESHOLD)) {
- zone_numa_state_add(v, zone, item);
- __this_cpu_write(*p, 0);
- }
-}
-
/*
* Determine the per node value of a stat item. This function
* is called frequently in a NUMA machine, so try to be as
@@ -983,19 +993,16 @@ unsigned long sum_zone_node_page_state(int node,
return count;
}
-/*
- * Determine the per node value of a numa stat item. To avoid deviation,
- * the per cpu stat number in vm_numa_stat_diff[] is also included.
- */
-unsigned long sum_zone_numa_state(int node,
+/* Determine the per node value of a numa stat item. */
+unsigned long sum_zone_numa_event_state(int node,
enum numa_stat_item item)
{
struct zone *zones = NODE_DATA(node)->node_zones;
- int i;
unsigned long count = 0;
+ int i;
for (i = 0; i < MAX_NR_ZONES; i++)
- count += zone_numa_state_snapshot(zones + i, item);
+ count += zone_numa_event_state(zones + i, item);
return count;
}
@@ -1049,11 +1056,16 @@ static void fill_contig_page_info(struct zone *zone,
info->free_blocks_total = 0;
info->free_blocks_suitable = 0;
- for (order = 0; order < MAX_ORDER; order++) {
+ for (order = 0; order <= MAX_ORDER; order++) {
unsigned long blocks;
- /* Count number of free blocks */
- blocks = zone->free_area[order].nr_free;
+ /*
+ * Count number of free blocks.
+ *
+ * Access to nr_free is lockless as nr_free is used only for
+ * diagnostic purposes. Use data_race to avoid KCSAN warning.
+ */
+ blocks = data_race(zone->free_area[order].nr_free);
info->free_blocks_total += blocks;
/* Count free base pages */
@@ -1077,7 +1089,7 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in
{
unsigned long requested = 1UL << order;
- if (WARN_ON_ONCE(order >= MAX_ORDER))
+ if (WARN_ON_ONCE(order > MAX_ORDER))
return 0;
if (!info->free_blocks_total)
@@ -1144,8 +1156,15 @@ int fragmentation_index(struct zone *zone, unsigned int order)
#define TEXT_FOR_HIGHMEM(xx)
#endif
+#ifdef CONFIG_ZONE_DEVICE
+#define TEXT_FOR_DEVICE(xx) xx "_device",
+#else
+#define TEXT_FOR_DEVICE(xx)
+#endif
+
#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
- TEXT_FOR_HIGHMEM(xx) xx "_movable",
+ TEXT_FOR_HIGHMEM(xx) xx "_movable", \
+ TEXT_FOR_DEVICE(xx)
const char * const vmstat_text[] = {
/* enum zone_stat_item counters */
@@ -1157,12 +1176,14 @@ const char * const vmstat_text[] = {
"nr_zone_unevictable",
"nr_zone_write_pending",
"nr_mlock",
- "nr_page_table_pages",
"nr_bounce",
#if IS_ENABLED(CONFIG_ZSMALLOC)
"nr_zspages",
#endif
"nr_free_cma",
+#ifdef CONFIG_UNACCEPTED_MEMORY
+ "nr_unaccepted",
+#endif
/* enum numa_stat_item counters */
#ifdef CONFIG_NUMA
@@ -1208,6 +1229,7 @@ const char * const vmstat_text[] = {
"nr_vmscan_immediate_reclaim",
"nr_dirtied",
"nr_written",
+ "nr_throttled_written",
"nr_kernel_misc_reclaimable",
"nr_foll_pin_acquired",
"nr_foll_pin_released",
@@ -1215,6 +1237,15 @@ const char * const vmstat_text[] = {
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
"nr_shadow_call_stack",
#endif
+ "nr_page_table_pages",
+ "nr_sec_page_table_pages",
+#ifdef CONFIG_SWAP
+ "nr_swapcached",
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ "pgpromote_success",
+ "pgpromote_candidate",
+#endif
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
@@ -1244,8 +1275,13 @@ const char * const vmstat_text[] = {
"pgreuse",
"pgsteal_kswapd",
"pgsteal_direct",
+ "pgsteal_khugepaged",
+ "pgdemote_kswapd",
+ "pgdemote_direct",
+ "pgdemote_khugepaged",
"pgscan_kswapd",
"pgscan_direct",
+ "pgscan_khugepaged",
"pgscan_direct_throttle",
"pgscan_anon",
"pgscan_file",
@@ -1298,6 +1334,10 @@ const char * const vmstat_text[] = {
"htlb_buddy_alloc_success",
"htlb_buddy_alloc_fail",
#endif
+#ifdef CONFIG_CMA
+ "cma_alloc_success",
+ "cma_alloc_fail",
+#endif
"unevictable_pgs_culled",
"unevictable_pgs_scanned",
"unevictable_pgs_rescued",
@@ -1320,6 +1360,9 @@ const char * const vmstat_text[] = {
"thp_split_page_failed",
"thp_deferred_split_page",
"thp_split_pmd",
+ "thp_scan_exceed_none_pte",
+ "thp_scan_exceed_swap_pte",
+ "thp_scan_exceed_share_pte",
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
"thp_split_pud",
#endif
@@ -1342,13 +1385,29 @@ const char * const vmstat_text[] = {
"nr_tlb_local_flush_one",
#endif /* CONFIG_DEBUG_TLBFLUSH */
-#ifdef CONFIG_DEBUG_VM_VMACACHE
- "vmacache_find_calls",
- "vmacache_find_hits",
-#endif
#ifdef CONFIG_SWAP
"swap_ra",
"swap_ra_hit",
+#ifdef CONFIG_KSM
+ "ksm_swpin_copy",
+#endif
+#endif
+#ifdef CONFIG_KSM
+ "cow_ksm",
+#endif
+#ifdef CONFIG_ZSWAP
+ "zswpin",
+ "zswpout",
+#endif
+#ifdef CONFIG_X86
+ "direct_map_level2_splits",
+ "direct_map_level3_splits",
+#endif
+#ifdef CONFIG_PER_VMA_LOCK_STATS
+ "vma_lock_success",
+ "vma_lock_abort",
+ "vma_lock_retry",
+ "vma_lock_miss",
#endif
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
};
@@ -1413,8 +1472,12 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
int order;
seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
- for (order = 0; order < MAX_ORDER; ++order)
- seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
+ for (order = 0; order <= MAX_ORDER; ++order)
+ /*
+ * Access to nr_free is lockless as nr_free is used only for
+ * printing purposes. Use data_race to avoid KCSAN warning.
+ */
+ seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free));
seq_putc(m, '\n');
}
@@ -1438,7 +1501,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m,
pgdat->node_id,
zone->name,
migratetype_names[mtype]);
- for (order = 0; order < MAX_ORDER; ++order) {
+ for (order = 0; order <= MAX_ORDER; ++order) {
unsigned long freecount = 0;
struct free_area *area;
struct list_head *curr;
@@ -1471,20 +1534,18 @@ static void pagetypeinfo_showfree_print(struct seq_file *m,
}
/* Print out the free pages at each order for each migratetype */
-static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
+static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
{
int order;
pg_data_t *pgdat = (pg_data_t *)arg;
/* Print header */
seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
- for (order = 0; order < MAX_ORDER; ++order)
+ for (order = 0; order <= MAX_ORDER; ++order)
seq_printf(m, "%6d ", order);
seq_putc(m, '\n');
walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
-
- return 0;
}
static void pagetypeinfo_showblockcount_print(struct seq_file *m,
@@ -1503,10 +1564,6 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
if (!page)
continue;
- /* Watch for unexpected holes punched in the memmap */
- if (!memmap_valid_within(pfn, page, zone))
- continue;
-
if (page_zone(page) != zone)
continue;
@@ -1524,7 +1581,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
}
/* Print out the number of pageblocks for each migratetype */
-static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
+static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
{
int mtype;
pg_data_t *pgdat = (pg_data_t *)arg;
@@ -1535,8 +1592,6 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
seq_putc(m, '\n');
walk_zones_in_node(m, pgdat, true, false,
pagetypeinfo_showblockcount_print);
-
- return 0;
}
/*
@@ -1623,25 +1678,33 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
if (is_zone_first_populated(pgdat, zone)) {
seq_printf(m, "\n per-node stats");
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ unsigned long pages = node_page_state_pages(pgdat, i);
+
+ if (vmstat_item_print_in_thp(i))
+ pages /= HPAGE_PMD_NR;
seq_printf(m, "\n %-12s %lu", node_stat_name(i),
- node_page_state_pages(pgdat, i));
+ pages);
}
}
seq_printf(m,
"\n pages free %lu"
+ "\n boost %lu"
"\n min %lu"
"\n low %lu"
"\n high %lu"
"\n spanned %lu"
"\n present %lu"
- "\n managed %lu",
+ "\n managed %lu"
+ "\n cma %lu",
zone_page_state(zone, NR_FREE_PAGES),
+ zone->watermark_boost,
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
zone->spanned_pages,
zone->present_pages,
- zone_managed_pages(zone));
+ zone_managed_pages(zone),
+ zone_cma_pages(zone));
seq_printf(m,
"\n protection: (%ld",
@@ -1661,28 +1724,30 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
zone_page_state(zone, i));
#ifdef CONFIG_NUMA
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+ for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
seq_printf(m, "\n %-12s %lu", numa_stat_name(i),
- zone_numa_state_snapshot(zone, i));
+ zone_numa_event_state(zone, i));
#endif
seq_printf(m, "\n pagesets");
for_each_online_cpu(i) {
- struct per_cpu_pageset *pageset;
+ struct per_cpu_pages *pcp;
+ struct per_cpu_zonestat __maybe_unused *pzstats;
- pageset = per_cpu_ptr(zone->pageset, i);
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
seq_printf(m,
"\n cpu: %i"
"\n count: %i"
"\n high: %i"
"\n batch: %i",
i,
- pageset->pcp.count,
- pageset->pcp.high,
- pageset->pcp.batch);
+ pcp->count,
+ pcp->high,
+ pcp->batch);
#ifdef CONFIG_SMP
+ pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
seq_printf(m, "\n vm stats threshold: %d",
- pageset->stat_threshold);
+ pzstats->stat_threshold);
#endif
}
seq_printf(m,
@@ -1715,7 +1780,7 @@ static const struct seq_operations zoneinfo_op = {
};
#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
- NR_VM_NUMA_STAT_ITEMS + \
+ NR_VM_NUMA_EVENT_ITEMS + \
NR_VM_NODE_STAT_ITEMS + \
NR_VM_WRITEBACK_STAT_ITEMS + \
(IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
@@ -1730,6 +1795,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
return NULL;
BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
+ fold_vm_numa_events();
v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
m->private = v;
if (!v)
@@ -1739,13 +1805,16 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
v += NR_VM_ZONE_STAT_ITEMS;
#ifdef CONFIG_NUMA
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
- v[i] = global_numa_state(i);
- v += NR_VM_NUMA_STAT_ITEMS;
+ for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
+ v[i] = global_numa_event_state(i);
+ v += NR_VM_NUMA_EVENT_ITEMS;
#endif
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
v[i] = global_node_page_state_pages(i);
+ if (vmstat_item_print_in_thp(i))
+ v[i] /= HPAGE_PMD_NR;
+ }
v += NR_VM_NODE_STAT_ITEMS;
global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
@@ -1834,25 +1903,34 @@ int vmstat_refresh(struct ctl_table *table, int write,
if (err)
return err;
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+ /*
+ * Skip checking stats known to go negative occasionally.
+ */
+ switch (i) {
+ case NR_ZONE_WRITE_PENDING:
+ case NR_FREE_CMA_PAGES:
+ continue;
+ }
val = atomic_long_read(&vm_zone_stat[i]);
if (val < 0) {
pr_warn("%s: %s %ld\n",
__func__, zone_stat_name(i), val);
- err = -EINVAL;
}
}
-#ifdef CONFIG_NUMA
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
- val = atomic_long_read(&vm_numa_stat[i]);
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ /*
+ * Skip checking stats known to go negative occasionally.
+ */
+ switch (i) {
+ case NR_WRITEBACK:
+ continue;
+ }
+ val = atomic_long_read(&vm_node_stat[i]);
if (val < 0) {
pr_warn("%s: %s %ld\n",
- __func__, numa_stat_name(i), val);
- err = -EINVAL;
+ __func__, node_stat_name(i), val);
}
}
-#endif
- if (err)
- return err;
if (write)
*ppos += *lenp;
else
@@ -1876,37 +1954,30 @@ static void vmstat_update(struct work_struct *w)
}
/*
- * Switch off vmstat processing and then fold all the remaining differentials
- * until the diffs stay at zero. The function is used by NOHZ and can only be
- * invoked when tick processing is not active.
- */
-/*
* Check if the diffs for a certain cpu indicate that
* an update is needed.
*/
static bool need_update(int cpu)
{
+ pg_data_t *last_pgdat = NULL;
struct zone *zone;
for_each_populated_zone(zone) {
- struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
-
- BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
-#ifdef CONFIG_NUMA
- BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2);
-#endif
+ struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+ struct per_cpu_nodestat *n;
/*
* The fast way of checking if there are any vmstat diffs.
*/
- if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
- sizeof(p->vm_stat_diff[0])))
+ if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff)))
return true;
-#ifdef CONFIG_NUMA
- if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS *
- sizeof(p->vm_numa_stat_diff[0])))
+
+ if (last_pgdat == zone->zone_pgdat)
+ continue;
+ last_pgdat = zone->zone_pgdat;
+ n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
+ if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff)))
return true;
-#endif
}
return false;
}
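need_update() relies on memchr_inv() to scan the whole per-CPU diff array in one call: a NULL result means every differential byte is zero and no flush is needed. A userspace reimplementation of that check; memchr_inv() is written out by hand here because it is a kernel-only helper:

#include <stddef.h>
#include <stdio.h>

/* stand-in for the kernel's memchr_inv(): first byte that is not c, or NULL */
static const void *memchr_inv(const void *p, int c, size_t len)
{
	const unsigned char *s = p;

	for (size_t i = 0; i < len; i++)
		if (s[i] != (unsigned char)c)
			return s + i;
	return NULL;
}

int main(void)
{
	signed char vm_stat_diff[8] = { 0 };

	printf("update needed: %d\n",
	       memchr_inv(vm_stat_diff, 0, sizeof(vm_stat_diff)) != NULL);	/* 0 */
	vm_stat_diff[3] = -2;
	printf("update needed: %d\n",
	       memchr_inv(vm_stat_diff, 0, sizeof(vm_stat_diff)) != NULL);	/* 1 */
	return 0;
}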
@@ -1950,15 +2021,31 @@ static void vmstat_shepherd(struct work_struct *w)
{
int cpu;
- get_online_cpus();
+ cpus_read_lock();
/* Check processors whose vmstat worker threads have been disabled */
for_each_online_cpu(cpu) {
struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
+ /*
+ * In kernel users of vmstat counters either require the precise value and
+ * they are using zone_page_state_snapshot interface or they can live with
+ * an imprecision as the regular flushing can happen at arbitrary time and
+ * cumulative error can grow (see calculate_normal_threshold).
+ *
+ * From that POV the regular flushing can be postponed for CPUs that have
+ * been isolated from the kernel interference without critical
+ * infrastructure ever noticing. Skip regular flushing from vmstat_shepherd
+ * for all isolated CPUs to avoid interference with the isolated workload.
+ */
+ if (cpu_is_isolated(cpu))
+ continue;
+
if (!delayed_work_pending(dw) && need_update(cpu))
queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
+
+ cond_resched();
}
- put_online_cpus();
+ cpus_read_unlock();
schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
@@ -1981,7 +2068,7 @@ static void __init init_cpu_node_state(void)
int node;
for_each_online_node(node) {
- if (cpumask_weight(cpumask_of_node(node)) > 0)
+ if (!cpumask_empty(cpumask_of_node(node)))
node_set_state(node, N_CPU);
}
}
@@ -1989,7 +2076,11 @@ static void __init init_cpu_node_state(void)
static int vmstat_cpu_online(unsigned int cpu)
{
refresh_zone_stat_thresholds();
- node_set_state(cpu_to_node(cpu), N_CPU);
+
+ if (!node_state(cpu_to_node(cpu), N_CPU)) {
+ node_set_state(cpu_to_node(cpu), N_CPU);
+ }
+
return 0;
}
@@ -2008,10 +2099,11 @@ static int vmstat_cpu_dead(unsigned int cpu)
refresh_zone_stat_thresholds();
node_cpus = cpumask_of_node(node);
- if (cpumask_weight(node_cpus) > 0)
+ if (!cpumask_empty(node_cpus))
return 0;
node_clear_state(node, N_CPU);
+
return 0;
}
@@ -2037,9 +2129,9 @@ void __init init_mm_internals(void)
if (ret < 0)
pr_err("vmstat: failed to register 'online' hotplug state\n");
- get_online_cpus();
+ cpus_read_lock();
init_cpu_node_state();
- put_online_cpus();
+ cpus_read_unlock();
start_shepherd_timer();
#endif
@@ -2085,7 +2177,7 @@ static void unusable_show_print(struct seq_file *m,
seq_printf(m, "Node %d, zone %8s ",
pgdat->node_id,
zone->name);
- for (order = 0; order < MAX_ORDER; ++order) {
+ for (order = 0; order <= MAX_ORDER; ++order) {
fill_contig_page_info(zone, order, &info);
index = unusable_free_index(order, &info);
seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
@@ -2137,10 +2229,10 @@ static void extfrag_show_print(struct seq_file *m,
seq_printf(m, "Node %d, zone %8s ",
pgdat->node_id,
zone->name);
- for (order = 0; order < MAX_ORDER; ++order) {
+ for (order = 0; order <= MAX_ORDER; ++order) {
fill_contig_page_info(zone, order, &info);
index = __fragmentation_index(order, &info);
- seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
+ seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
}
seq_putc(m, '\n');
diff --git a/mm/workingset.c b/mm/workingset.c
index 92e66113a577..4686ae363000 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -111,9 +111,20 @@
*
* NR_inactive + (R - E) <= NR_inactive + NR_active
*
- * which can be further simplified to
+ * If we have swap we should also consider NR_inactive_anon and
+ * NR_active_anon, so for page cache and anonymous respectively:
*
- * (R - E) <= NR_active
+ * NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file
+ * + NR_inactive_anon + NR_active_anon
+ *
+ * NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon
+ * + NR_inactive_file + NR_active_file
+ *
+ * Which can be further simplified to:
+ *
+ * (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon
+ *
+ * (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file
*
* Put into words, the refault distance (out-of-cache) can be seen as
* a deficit in inactive list space (in-cache). If the inactive list
@@ -130,14 +141,14 @@
* are no longer in active use.
*
* So when a refault distance of (R - E) is observed and there are at
- * least (R - E) active pages, the refaulting page is activated
- * optimistically in the hope that (R - E) active pages are actually
+ * least (R - E) pages in the userspace workingset, the refaulting page
+ * is activated optimistically in the hope that (R - E) pages are actually
* used less frequently than the refaulting page - or even not used at
* all anymore.
*
* That means if inactive cache is refaulting with a suitable refault
* distance, we assume the cache workingset is transitioning and put
- * pressure on the current active list.
+ * pressure on the current workingset.
*
* If this is wrong and demotion kicks in, the pages which are truly
* used more frequently will be reactivated while the less frequently
@@ -168,8 +179,10 @@
* refault distance will immediately activate the refaulting page.
*/
+#define WORKINGSET_SHIFT 1
#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
- 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
+ WORKINGSET_SHIFT + NODES_SHIFT + \
+ MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
/*
@@ -185,11 +198,10 @@ static unsigned int bucket_order __read_mostly;
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
bool workingset)
{
- eviction >>= bucket_order;
eviction &= EVICTION_MASK;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
- eviction = (eviction << 1) | workingset;
+ eviction = (eviction << WORKINGSET_SHIFT) | workingset;
return xa_mk_value(eviction);
}
@@ -201,8 +213,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
int memcgid, nid;
bool workingset;
- workingset = entry & 1;
- entry >>= 1;
+ workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1);
+ entry >>= WORKINGSET_SHIFT;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
@@ -210,13 +222,129 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
*memcgidp = memcgid;
*pgdat = NODE_DATA(nid);
- *evictionp = entry << bucket_order;
+ *evictionp = entry;
*workingsetp = workingset;
}
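pack_shadow()/unpack_shadow() squeeze the eviction counter, memcg id, node id and workingset bit into the value bits of a single word, shifting fields in at eviction and peeling them off again at refault (the bucket_order scaling now happens in the callers). A standalone round-trip model; the field widths below are illustrative, not the real configuration values:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define WORKINGSET_SHIFT 1
#define NODES_SHIFT      6
#define MEMCG_ID_SHIFT   16

static unsigned long pack_shadow(int memcgid, int nid,
				 unsigned long eviction, bool workingset)
{
	eviction = (eviction << MEMCG_ID_SHIFT) | memcgid;
	eviction = (eviction << NODES_SHIFT) | nid;
	eviction = (eviction << WORKINGSET_SHIFT) | workingset;
	return eviction;
}

static void unpack_shadow(unsigned long entry, int *memcgid, int *nid,
			  unsigned long *eviction, bool *workingset)
{
	*workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1);
	entry >>= WORKINGSET_SHIFT;
	*nid = entry & ((1UL << NODES_SHIFT) - 1);
	entry >>= NODES_SHIFT;
	*memcgid = entry & ((1UL << MEMCG_ID_SHIFT) - 1);
	entry >>= MEMCG_ID_SHIFT;
	*eviction = entry;
}

int main(void)
{
	int memcgid, nid;
	unsigned long eviction;
	bool ws;

	unpack_shadow(pack_shadow(42, 3, 12345, true),
		      &memcgid, &nid, &eviction, &ws);
	assert(memcgid == 42 && nid == 3 && eviction == 12345 && ws);
	printf("round trip ok\n");
	return 0;
}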
+#ifdef CONFIG_LRU_GEN
+
+static void *lru_gen_eviction(struct folio *folio)
+{
+ int hist;
+ unsigned long token;
+ unsigned long min_seq;
+ struct lruvec *lruvec;
+ struct lru_gen_folio *lrugen;
+ int type = folio_is_file_lru(folio);
+ int delta = folio_nr_pages(folio);
+ int refs = folio_lru_refs(folio);
+ int tier = lru_tier_from_refs(refs);
+ struct mem_cgroup *memcg = folio_memcg(folio);
+ struct pglist_data *pgdat = folio_pgdat(folio);
+
+ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ lrugen = &lruvec->lrugen;
+ min_seq = READ_ONCE(lrugen->min_seq[type]);
+ token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
+
+ hist = lru_hist_from_seq(min_seq);
+ atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
+
+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
+}
+
+/*
+ * Tests if the shadow entry is for a folio that was recently evicted.
+ * Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
+ */
+static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
+ unsigned long *token, bool *workingset)
+{
+ int memcg_id;
+ unsigned long min_seq;
+ struct mem_cgroup *memcg;
+ struct pglist_data *pgdat;
+
+ unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);
+
+ memcg = mem_cgroup_from_id(memcg_id);
+ *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+ min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]);
+ return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
+}
+
+static void lru_gen_refault(struct folio *folio, void *shadow)
+{
+ bool recent;
+ int hist, tier, refs;
+ bool workingset;
+ unsigned long token;
+ struct lruvec *lruvec;
+ struct lru_gen_folio *lrugen;
+ int type = folio_is_file_lru(folio);
+ int delta = folio_nr_pages(folio);
+
+ rcu_read_lock();
+
+ recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset);
+ if (lruvec != folio_lruvec(folio))
+ goto unlock;
+
+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
+
+ if (!recent)
+ goto unlock;
+
+ lrugen = &lruvec->lrugen;
+
+ hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
+ /* see the comment in folio_lru_refs() */
+ refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
+ tier = lru_tier_from_refs(refs);
+
+ atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
+ mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
+
+ /*
+ * Count the following two cases as stalls:
+ * 1. For pages accessed through page tables, hotter pages pushed out
+ * hot pages which refaulted immediately.
+ * 2. For pages accessed multiple times through file descriptors,
+ * numbers of accesses might have been out of the range.
+ */
+ if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) {
+ folio_set_workingset(folio);
+ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
+ }
+unlock:
+ rcu_read_unlock();
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static void *lru_gen_eviction(struct folio *folio)
+{
+ return NULL;
+}
+
+static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
+ unsigned long *token, bool *workingset)
+{
+ return false;
+}
+
+static void lru_gen_refault(struct folio *folio, void *shadow)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
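Under CONFIG_LRU_GEN the shadow token packs the generation minimum sequence number together with a saturating reference count, and lru_gen_test_recent() treats a refault as recent only while the generation part still matches the lruvec's current min_seq. A simplified model of that token; LRU_REFS_WIDTH is hard-coded to 2 purely for illustration and the EVICTION_MASK truncation is omitted:

#include <stdbool.h>
#include <stdio.h>

#define LRU_REFS_WIDTH 2	/* illustrative width, not the real config value */

static unsigned long make_token(unsigned long min_seq, int refs)
{
	int r = refs > 1 ? refs - 1 : 0;	/* max(refs - 1, 0) */

	return (min_seq << LRU_REFS_WIDTH) | r;
}

static bool token_recent(unsigned long token, unsigned long cur_min_seq)
{
	return (token >> LRU_REFS_WIDTH) == cur_min_seq;
}

int main(void)
{
	unsigned long token = make_token(7, 3);

	/* still recent while min_seq is 7, stale once it advances to 9 */
	printf("%d %d\n", token_recent(token, 7), token_recent(token, 9));
	return 0;
}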
+
/**
* workingset_age_nonresident - age non-resident entries as LRU ages
- * @memcg: the lruvec that was aged
+ * @lruvec: the lruvec that was aged
* @nr_pages: the number of pages to count
*
* As in-memory pages are aged, non-resident pages need to be aged as
@@ -243,66 +371,72 @@ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
}
/**
- * workingset_eviction - note the eviction of a page from memory
+ * workingset_eviction - note the eviction of a folio from memory
* @target_memcg: the cgroup that is causing the reclaim
- * @page: the page being evicted
+ * @folio: the folio being evicted
*
- * Returns a shadow entry to be stored in @page->mapping->i_pages in place
- * of the evicted @page so that a later refault can be detected.
+ * Return: a shadow entry to be stored in @folio->mapping->i_pages in place
+ * of the evicted @folio so that a later refault can be detected.
*/
-void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
+void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
{
- struct pglist_data *pgdat = page_pgdat(page);
+ struct pglist_data *pgdat = folio_pgdat(folio);
unsigned long eviction;
struct lruvec *lruvec;
int memcgid;
- /* Page is fully exclusive and pins page->mem_cgroup */
- VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(page_count(page), page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ /* Folio is fully exclusive and pins folio's memory cgroup pointer */
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ if (lru_gen_enabled())
+ return lru_gen_eviction(folio);
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
- workingset_age_nonresident(lruvec, thp_nr_pages(page));
/* XXX: target_memcg can be NULL, go through lruvec */
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
eviction = atomic_long_read(&lruvec->nonresident_age);
- return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
+ eviction >>= bucket_order;
+ workingset_age_nonresident(lruvec, folio_nr_pages(folio));
+ return pack_shadow(memcgid, pgdat, eviction,
+ folio_test_workingset(folio));
}
/**
- * workingset_refault - evaluate the refault of a previously evicted page
- * @page: the freshly allocated replacement page
- * @shadow: shadow entry of the evicted page
- *
- * Calculates and evaluates the refault distance of the previously
- * evicted page in the context of the node and the memcg whose memory
- * pressure caused the eviction.
+ * workingset_test_recent - tests if the shadow entry is for a folio that was
+ * recently evicted. Also fills in @workingset with the value unpacked from
+ * shadow.
+ * @shadow: the shadow entry to be tested.
+ * @file: whether the corresponding folio is from the file lru.
+ * @workingset: where the workingset value unpacked from shadow should
+ * be stored.
+ *
+ * Return: true if the shadow is for a recently evicted folio; false otherwise.
*/
-void workingset_refault(struct page *page, void *shadow)
+bool workingset_test_recent(void *shadow, bool file, bool *workingset)
{
- bool file = page_is_file_lru(page);
struct mem_cgroup *eviction_memcg;
struct lruvec *eviction_lruvec;
unsigned long refault_distance;
unsigned long workingset_size;
- struct pglist_data *pgdat;
- struct mem_cgroup *memcg;
- unsigned long eviction;
- struct lruvec *lruvec;
unsigned long refault;
- bool workingset;
int memcgid;
+ struct pglist_data *pgdat;
+ unsigned long eviction;
- unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
+ if (lru_gen_enabled())
+ return lru_gen_test_recent(shadow, file, &eviction_lruvec, &eviction, workingset);
+
+ unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
+ eviction <<= bucket_order;
- rcu_read_lock();
/*
* Look up the memcg associated with the stored ID. It might
- * have been deleted since the page's eviction.
+ * have been deleted since the folio's eviction.
*
* Note that in rare events the ID could have been recycled
- * for a new cgroup that refaults a shared page. This is
+ * for a new cgroup that refaults a shared folio. This is
* impossible to tell from the available data. However, this
* should be a rare and limited disturbance, and activations
* are always speculative anyway. Ultimately, it's the aging
@@ -315,7 +449,8 @@ void workingset_refault(struct page *page, void *shadow)
*/
eviction_memcg = mem_cgroup_from_id(memcgid);
if (!mem_cgroup_disabled() && !eviction_memcg)
- goto out;
+ return false;
+
eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
refault = atomic_long_read(&eviction_lruvec->nonresident_age);
@@ -338,31 +473,18 @@ void workingset_refault(struct page *page, void *shadow)
refault_distance = (refault - eviction) & EVICTION_MASK;
/*
- * The activation decision for this page is made at the level
- * where the eviction occurred, as that is where the LRU order
- * during page reclaim is being determined.
- *
- * However, the cgroup that will own the page is the one that
- * is actually experiencing the refault event.
- */
- memcg = page_memcg(page);
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
-
- inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file);
-
- /*
* Compare the distance to the existing workingset size. We
* don't activate pages that couldn't stay resident even if
* all the memory was available to the workingset. Whether
* workingset competition needs to consider anon or not depends
- * on having swap.
+ * on having free swap space.
*/
workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
if (!file) {
workingset_size += lruvec_page_state(eviction_lruvec,
NR_INACTIVE_FILE);
}
- if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
+ if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
workingset_size += lruvec_page_state(eviction_lruvec,
NR_ACTIVE_ANON);
if (file) {
@@ -370,21 +492,69 @@ void workingset_refault(struct page *page, void *shadow)
NR_INACTIVE_ANON);
}
}
- if (refault_distance > workingset_size)
+
+ return refault_distance <= workingset_size;
+}
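workingset_test_recent() computes the refault distance as the wrapping difference between the current nonresident age and the age stored at eviction, masked to the shadow-entry width, and declares the folio recent when that distance fits within the eligible workingset. A small model with an assumed 36-bit eviction field, showing that the comparison survives counter wraparound:

#include <stdbool.h>
#include <stdio.h>

#define EVICTION_BITS 36UL	/* assumed width; the real one depends on the config */
#define EVICTION_MASK ((1UL << EVICTION_BITS) - 1)

static bool test_recent(unsigned long refault, unsigned long eviction,
			unsigned long workingset_size)
{
	/* wrapping distance, as in (refault - eviction) & EVICTION_MASK above */
	unsigned long refault_distance = (refault - eviction) & EVICTION_MASK;

	return refault_distance <= workingset_size;
}

int main(void)
{
	/* the counter wrapped between eviction and refault; distance is still 10 */
	unsigned long eviction = EVICTION_MASK - 4;
	unsigned long refault = 5;

	printf("recent: %d\n", test_recent(refault, eviction, 1000));	/* 1 */
	printf("recent: %d\n", test_recent(refault, eviction, 5));	/* 0 */
	return 0;
}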
+
+/**
+ * workingset_refault - Evaluate the refault of a previously evicted folio.
+ * @folio: The freshly allocated replacement folio.
+ * @shadow: Shadow entry of the evicted folio.
+ *
+ * Calculates and evaluates the refault distance of the previously
+ * evicted folio in the context of the node and the memcg whose memory
+ * pressure caused the eviction.
+ */
+void workingset_refault(struct folio *folio, void *shadow)
+{
+ bool file = folio_is_file_lru(folio);
+ struct pglist_data *pgdat;
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+ bool workingset;
+ long nr;
+
+ if (lru_gen_enabled()) {
+ lru_gen_refault(folio, shadow);
+ return;
+ }
+
+ /* Flush stats (and potentially sleep) before holding RCU read lock */
+ mem_cgroup_flush_stats_ratelimited();
+
+ rcu_read_lock();
+
+ /*
+ * The activation decision for this folio is made at the level
+ * where the eviction occurred, as that is where the LRU order
+ * during folio reclaim is being determined.
+ *
+ * However, the cgroup that will own the folio is the one that
+ * is actually experiencing the refault event.
+ */
+ nr = folio_nr_pages(folio);
+ memcg = folio_memcg(folio);
+ pgdat = folio_pgdat(folio);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
+
+ if (!workingset_test_recent(shadow, file, &workingset))
goto out;
- SetPageActive(page);
- workingset_age_nonresident(lruvec, thp_nr_pages(page));
- inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file);
+ folio_set_active(folio);
+ workingset_age_nonresident(lruvec, nr);
+ mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);
- /* Page was active prior to eviction */
+ /* Folio was active prior to eviction */
if (workingset) {
- SetPageWorkingset(page);
- /* XXX: Move to lru_cache_add() when it supports new vs putback */
- spin_lock_irq(&page_pgdat(page)->lru_lock);
- lru_note_cost_page(page);
- spin_unlock_irq(&page_pgdat(page)->lru_lock);
- inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file);
+ folio_set_workingset(folio);
+ /*
+ * XXX: Move to folio_add_lru() when it supports new vs
+ * putback
+ */
+ lru_note_cost_refault(folio);
+ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
}
out:
rcu_read_unlock();
@@ -392,12 +562,11 @@ out:
/**
* workingset_activation - note a page activation
- * @page: page that is being activated
+ * @folio: Folio that is being activated.
*/
-void workingset_activation(struct page *page)
+void workingset_activation(struct folio *folio)
{
struct mem_cgroup *memcg;
- struct lruvec *lruvec;
rcu_read_lock();
/*
@@ -407,11 +576,10 @@ void workingset_activation(struct page *page)
* XXX: See workingset_refault() - this should return
* root_mem_cgroup even for !CONFIG_MEMCG.
*/
- memcg = page_memcg_rcu(page);
+ memcg = folio_memcg_rcu(folio);
if (!mem_cgroup_disabled() && !memcg)
goto out;
- lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
- workingset_age_nonresident(lruvec, thp_nr_pages(page));
+ workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
out:
rcu_read_unlock();
}
@@ -428,10 +596,12 @@ out:
* point where they would still be useful.
*/
-static struct list_lru shadow_nodes;
+struct list_lru shadow_nodes;
void workingset_update_node(struct xa_node *node)
{
+ struct address_space *mapping;
+
/*
* Track non-empty nodes that contain only shadow entries;
* unlink those that contain pages or are being freed.
@@ -440,17 +610,18 @@ void workingset_update_node(struct xa_node *node)
* already where they should be. The list_empty() test is safe
* as node->private_list is protected by the i_pages lock.
*/
- VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */
+ mapping = container_of(node->array, struct address_space, i_pages);
+ lockdep_assert_held(&mapping->i_pages.xa_lock);
if (node->count && node->count == node->nr_values) {
if (list_empty(&node->private_list)) {
list_lru_add(&shadow_nodes, &node->private_list);
- __inc_lruvec_slab_state(node, WORKINGSET_NODES);
+ __inc_lruvec_kmem_state(node, WORKINGSET_NODES);
}
} else {
if (!list_empty(&node->private_list)) {
list_lru_del(&shadow_nodes, &node->private_list);
- __dec_lruvec_slab_state(node, WORKINGSET_NODES);
+ __dec_lruvec_kmem_state(node, WORKINGSET_NODES);
}
}
}
@@ -463,6 +634,8 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
unsigned long pages;
nodes = list_lru_shrink_count(&shadow_nodes, sc);
+ if (!nodes)
+ return SHRINK_EMPTY;
/*
* Approximate a reasonable limit for the nodes
@@ -505,9 +678,6 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
max_nodes = pages >> (XA_CHUNK_SHIFT - 3);
- if (!nodes)
- return SHRINK_EMPTY;
-
if (nodes <= max_nodes)
return 0;
return nodes - max_nodes;
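count_shadow_nodes() reports SHRINK_EMPTY when the list is empty so the shrinker core can skip it, zero while the node count is within budget, and otherwise the excess to hand to the scan side; the reordering above simply moves the empty check ahead of the budget calculation. A sketch of that three-way accounting:

#include <stdio.h>

#define SHRINK_EMPTY (~0UL - 1)		/* the value used by the kernel shrinker core */

static unsigned long count_objects(unsigned long nodes, unsigned long max_nodes)
{
	if (!nodes)
		return SHRINK_EMPTY;	/* nothing cached: let the core skip us */
	if (nodes <= max_nodes)
		return 0;		/* within budget: nothing to scan */
	return nodes - max_nodes;	/* excess handed to the scan callback */
}

int main(void)
{
	printf("%lu\n", count_objects(0, 100));		/* SHRINK_EMPTY */
	printf("%lu\n", count_objects(50, 100));	/* 0 */
	printf("%lu\n", count_objects(180, 100));	/* 80 */
	return 0;
}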
@@ -519,12 +689,11 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
void *arg) __must_hold(lru_lock)
{
struct xa_node *node = container_of(item, struct xa_node, private_list);
- XA_STATE(xas, node->array, 0);
struct address_space *mapping;
int ret;
/*
- * Page cache insertions and deletions synchroneously maintain
+ * Page cache insertions and deletions synchronously maintain
* the shadow node LRU under the i_pages lock and the
* lru_lock. Because the page cache tree is emptied before
* the inode can be destroyed, holding the lru_lock pins any
@@ -544,8 +713,18 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
goto out;
}
+ /* For page cache we need to hold i_lock */
+ if (mapping->host != NULL) {
+ if (!spin_trylock(&mapping->host->i_lock)) {
+ xa_unlock(&mapping->i_pages);
+ spin_unlock_irq(lru_lock);
+ ret = LRU_RETRY;
+ goto out;
+ }
+ }
+
list_lru_isolate(lru, item);
- __dec_lruvec_slab_state(node, WORKINGSET_NODES);
+ __dec_lruvec_kmem_state(node, WORKINGSET_NODES);
spin_unlock(lru_lock);
@@ -558,20 +737,16 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
goto out_invalid;
if (WARN_ON_ONCE(node->count != node->nr_values))
goto out_invalid;
- mapping->nrexceptional -= node->nr_values;
- xas.xa_node = xa_parent_locked(&mapping->i_pages, node);
- xas.xa_offset = node->offset;
- xas.xa_shift = node->shift + XA_CHUNK_SHIFT;
- xas_set_update(&xas, workingset_update_node);
- /*
- * We could store a shadow entry here which was the minimum of the
- * shadow entries we were tracking ...
- */
- xas_store(&xas, NULL);
- __inc_lruvec_slab_state(node, WORKINGSET_NODERECLAIM);
+ xa_delete_node(node, workingset_update_node);
+ __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);
out_invalid:
xa_unlock_irq(&mapping->i_pages);
+ if (mapping->host != NULL) {
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
+ }
ret = LRU_REMOVED_RETRY;
out:
cond_resched();
@@ -621,7 +796,7 @@ static int __init workingset_init(void)
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
timestamp_bits, max_order, bucket_order);
- ret = prealloc_shrinker(&workingset_shadow_shrinker);
+ ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow");
if (ret)
goto err;
ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
diff --git a/mm/z3fold.c b/mm/z3fold.c
index c203623dece1..6f1f77ec49ac 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -34,15 +34,11 @@
#include <linux/node.h>
#include <linux/compaction.h>
#include <linux/percpu.h>
-#include <linux/mount.h>
-#include <linux/pseudo_fs.h>
-#include <linux/fs.h>
#include <linux/preempt.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/zpool.h>
-#include <linux/magic.h>
#include <linux/kmemleak.h>
/*
@@ -62,7 +58,7 @@
#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
-#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
+#define NCHUNKS (TOTAL_CHUNKS - ZHDR_CHUNKS)
#define BUDDY_MASK (0x3)
#define BUDDY_SHIFT 2
@@ -72,9 +68,6 @@
* Structures
*****************/
struct z3fold_pool;
-struct z3fold_ops {
- int (*evict)(struct z3fold_pool *pool, unsigned long handle);
-};
enum buddy {
HEADLESS = 0,
@@ -90,7 +83,7 @@ struct z3fold_buddy_slots {
* be enough slots to hold all possible variants
*/
unsigned long slot[BUDDY_MASK + 1];
- unsigned long pool; /* back link + flags */
+ unsigned long pool; /* back link */
rwlock_t lock;
};
#define HANDLE_FLAG_MASK (0x03)
@@ -132,24 +125,19 @@ struct z3fold_header {
/**
* struct z3fold_pool - stores metadata for each z3fold pool
* @name: pool name
- * @lock: protects pool unbuddied/lru lists
+ * @lock: protects pool unbuddied lists
* @stale_lock: protects pool stale page list
* @unbuddied: per-cpu array of lists tracking z3fold pages that contain 2-
* buddies; the list each z3fold page is added to depends on
* the size of its free region.
- * @lru: list tracking the z3fold pages in LRU order by most recently
- * added buddy.
* @stale: list of pages marked for freeing
* @pages_nr: number of z3fold pages in the pool.
* @c_handle: cache for z3fold_buddy_slots allocation
- * @ops: pointer to a structure of user defined operations specified at
- * pool creation time.
+ * @zpool: zpool driver
+ * @zpool_ops: zpool operations structure with an evict callback
* @compact_wq: workqueue for page layout background optimization
* @release_wq: workqueue for safe page release
* @work: work_struct for safe page release
- * @inode: inode for z3fold pseudo filesystem
- * @destroying: bool to stop migration once we start destruction
- * @isolated: int to count the number of pages currently in isolation
*
* This structure is allocated at pool creation time and maintains metadata
* pertaining to a particular z3fold pool.
@@ -159,20 +147,13 @@ struct z3fold_pool {
spinlock_t lock;
spinlock_t stale_lock;
struct list_head *unbuddied;
- struct list_head lru;
struct list_head stale;
atomic64_t pages_nr;
struct kmem_cache *c_handle;
- const struct z3fold_ops *ops;
- struct zpool *zpool;
- const struct zpool_ops *zpool_ops;
struct workqueue_struct *compact_wq;
struct workqueue_struct *release_wq;
struct wait_queue_head isolate_wait;
struct work_struct work;
- struct inode *inode;
- bool destroying;
- int isolated;
};
/*
@@ -184,13 +165,14 @@ enum z3fold_page_flags {
NEEDS_COMPACTING,
PAGE_STALE,
PAGE_CLAIMED, /* by either reclaim or free */
+ PAGE_MIGRATED, /* page is migrated and soon to be released */
};
/*
* handle flags, go under HANDLE_FLAG_MASK
*/
enum z3fold_handle_flags {
- HANDLES_ORPHANED = 0,
+ HANDLES_NOFREE = 0,
};
/*
@@ -215,10 +197,8 @@ static int size_to_chunks(size_t size)
static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
gfp_t gfp)
{
- struct z3fold_buddy_slots *slots;
-
- slots = kmem_cache_zalloc(pool->c_handle,
- (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));
+ struct z3fold_buddy_slots *slots = kmem_cache_zalloc(pool->c_handle,
+ gfp);
if (slots) {
/* It will be freed separately in free_handle(). */
@@ -258,9 +238,8 @@ static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
spin_unlock(&zhdr->page_lock);
}
-
-static inline struct z3fold_header *__get_z3fold_header(unsigned long handle,
- bool lock)
+/* return locked z3fold page if it's not headless */
+static inline struct z3fold_header *get_z3fold_header(unsigned long handle)
{
struct z3fold_buddy_slots *slots;
struct z3fold_header *zhdr;
@@ -274,13 +253,17 @@ static inline struct z3fold_header *__get_z3fold_header(unsigned long handle,
read_lock(&slots->lock);
addr = *(unsigned long *)handle;
zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
- if (lock)
- locked = z3fold_page_trylock(zhdr);
+ locked = z3fold_page_trylock(zhdr);
read_unlock(&slots->lock);
- if (locked)
- break;
+ if (locked) {
+ struct page *page = virt_to_page(zhdr);
+
+ if (!test_bit(PAGE_MIGRATED, &page->private))
+ break;
+ z3fold_page_unlock(zhdr);
+ }
cpu_relax();
- } while (lock);
+ } while (true);
} else {
zhdr = (struct z3fold_header *)(handle & PAGE_MASK);
}
@@ -288,18 +271,6 @@ static inline struct z3fold_header *__get_z3fold_header(unsigned long handle,
return zhdr;
}
-/* Returns the z3fold page where a given handle is stored */
-static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
-{
- return __get_z3fold_header(h, false);
-}
-
-/* return locked z3fold page if it's not headless */
-static inline struct z3fold_header *get_z3fold_header(unsigned long h)
-{
- return __get_z3fold_header(h, true);
-}
-
static inline void put_z3fold_header(struct z3fold_header *zhdr)
{
struct page *page = virt_to_page(zhdr);
@@ -308,35 +279,28 @@ static inline void put_z3fold_header(struct z3fold_header *zhdr)
z3fold_page_unlock(zhdr);
}
-static inline void free_handle(unsigned long handle)
+static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr)
{
struct z3fold_buddy_slots *slots;
- struct z3fold_header *zhdr;
int i;
bool is_free;
- if (handle & (1 << PAGE_HEADLESS))
- return;
-
if (WARN_ON(*(unsigned long *)handle == 0))
return;
- zhdr = handle_to_z3fold_header(handle);
slots = handle_to_slots(handle);
write_lock(&slots->lock);
*(unsigned long *)handle = 0;
- if (zhdr->slots == slots) {
+
+ if (test_bit(HANDLES_NOFREE, &slots->pool)) {
write_unlock(&slots->lock);
return; /* simple case, nothing else to do */
}
- /* we are freeing a foreign handle if we are here */
- zhdr->foreign_handles--;
+ if (zhdr->slots != slots)
+ zhdr->foreign_handles--;
+
is_free = true;
- if (!test_bit(HANDLES_ORPHANED, &slots->pool)) {
- write_unlock(&slots->lock);
- return;
- }
for (i = 0; i <= BUDDY_MASK; i++) {
if (slots->slot[i]) {
is_free = false;
@@ -348,58 +312,12 @@ static inline void free_handle(unsigned long handle)
if (is_free) {
struct z3fold_pool *pool = slots_to_pool(slots);
+ if (zhdr->slots == slots)
+ zhdr->slots = NULL;
kmem_cache_free(pool->c_handle, slots);
}
}
-static int z3fold_init_fs_context(struct fs_context *fc)
-{
- return init_pseudo(fc, Z3FOLD_MAGIC) ? 0 : -ENOMEM;
-}
-
-static struct file_system_type z3fold_fs = {
- .name = "z3fold",
- .init_fs_context = z3fold_init_fs_context,
- .kill_sb = kill_anon_super,
-};
-
-static struct vfsmount *z3fold_mnt;
-static int z3fold_mount(void)
-{
- int ret = 0;
-
- z3fold_mnt = kern_mount(&z3fold_fs);
- if (IS_ERR(z3fold_mnt))
- ret = PTR_ERR(z3fold_mnt);
-
- return ret;
-}
-
-static void z3fold_unmount(void)
-{
- kern_unmount(z3fold_mnt);
-}
-
-static const struct address_space_operations z3fold_aops;
-static int z3fold_register_migration(struct z3fold_pool *pool)
-{
- pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb);
- if (IS_ERR(pool->inode)) {
- pool->inode = NULL;
- return 1;
- }
-
- pool->inode->i_mapping->private_data = pool;
- pool->inode->i_mapping->a_ops = &z3fold_aops;
- return 0;
-}
-
-static void z3fold_unregister_migration(struct z3fold_pool *pool)
-{
- if (pool->inode)
- iput(pool->inode);
- }
-
/* Initializes the z3fold header of a newly allocated z3fold page */
static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
struct z3fold_pool *pool, gfp_t gfp)
@@ -407,12 +325,12 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
struct z3fold_header *zhdr = page_address(page);
struct z3fold_buddy_slots *slots;
- INIT_LIST_HEAD(&page->lru);
clear_bit(PAGE_HEADLESS, &page->private);
clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
clear_bit(NEEDS_COMPACTING, &page->private);
clear_bit(PAGE_STALE, &page->private);
clear_bit(PAGE_CLAIMED, &page->private);
+ clear_bit(PAGE_MIGRATED, &page->private);
if (headless)
return zhdr;
@@ -420,16 +338,10 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
if (!slots)
return NULL;
+ memset(zhdr, 0, sizeof(*zhdr));
spin_lock_init(&zhdr->page_lock);
kref_init(&zhdr->refcount);
- zhdr->first_chunks = 0;
- zhdr->middle_chunks = 0;
- zhdr->last_chunks = 0;
- zhdr->first_num = 0;
- zhdr->start_middle = 0;
zhdr->cpu = -1;
- zhdr->foreign_handles = 0;
- zhdr->mapped_count = 0;
zhdr->slots = slots;
zhdr->pool = pool;
INIT_LIST_HEAD(&zhdr->buddy);
@@ -445,7 +357,6 @@ static void free_z3fold_page(struct page *page, bool headless)
__ClearPageMovable(page);
unlock_page(page);
}
- ClearPagePrivate(page);
__free_page(page);
}
@@ -530,32 +441,13 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
{
struct page *page = virt_to_page(zhdr);
struct z3fold_pool *pool = zhdr_to_pool(zhdr);
- bool is_free = true;
- int i;
WARN_ON(!list_empty(&zhdr->buddy));
set_bit(PAGE_STALE, &page->private);
clear_bit(NEEDS_COMPACTING, &page->private);
spin_lock(&pool->lock);
- if (!list_empty(&page->lru))
- list_del_init(&page->lru);
spin_unlock(&pool->lock);
- /* If there are no foreign handles, free the handles array */
- read_lock(&zhdr->slots->lock);
- for (i = 0; i <= BUDDY_MASK; i++) {
- if (zhdr->slots->slot[i]) {
- is_free = false;
- break;
- }
- }
- if (!is_free)
- set_bit(HANDLES_ORPHANED, &zhdr->slots->pool);
- read_unlock(&zhdr->slots->lock);
-
- if (is_free)
- kmem_cache_free(pool->c_handle, zhdr->slots);
-
if (locked)
z3fold_page_unlock(zhdr);
@@ -563,14 +455,8 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
list_add(&zhdr->buddy, &pool->stale);
queue_work(pool->release_wq, &pool->work);
spin_unlock(&pool->stale_lock);
-}
-static void __attribute__((__unused__))
- release_z3fold_page(struct kref *ref)
-{
- struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
- refcount);
- __release_z3fold_page(zhdr, false);
+ atomic64_dec(&pool->pages_nr);
}
static void release_z3fold_page_locked(struct kref *ref)
@@ -647,15 +533,39 @@ static inline void add_to_unbuddied(struct z3fold_pool *pool,
{
if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
zhdr->middle_chunks == 0) {
- struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
-
+ struct list_head *unbuddied;
int freechunks = num_free_chunks(zhdr);
+
+ migrate_disable();
+ unbuddied = this_cpu_ptr(pool->unbuddied);
spin_lock(&pool->lock);
list_add(&zhdr->buddy, &unbuddied[freechunks]);
spin_unlock(&pool->lock);
zhdr->cpu = smp_processor_id();
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
+ }
+}
+
+static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks)
+{
+ enum buddy bud = HEADLESS;
+
+ if (zhdr->middle_chunks) {
+ if (!zhdr->first_chunks &&
+ chunks <= zhdr->start_middle - ZHDR_CHUNKS)
+ bud = FIRST;
+ else if (!zhdr->last_chunks)
+ bud = LAST;
+ } else {
+ if (!zhdr->first_chunks)
+ bud = FIRST;
+ else if (!zhdr->last_chunks)
+ bud = LAST;
+ else
+ bud = MIDDLE;
}
+
+ return bud;
}
static inline void *mchunk_memmove(struct z3fold_header *zhdr,
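The add_to_unbuddied() change above swaps get_cpu_ptr()/put_cpu_ptr(), which disable preemption, for migrate_disable()/this_cpu_ptr()/migrate_enable(), which only pin the task to its CPU. The per-CPU list stays stable, while taking the spinlock may now sleep on PREEMPT_RT. A generic sketch of the pattern, using a hypothetical per-CPU structure rather than z3fold's:

    struct my_pcpu {
            spinlock_t lock;
            struct list_head list;
    };
    static DEFINE_PER_CPU(struct my_pcpu, my_pcpu);

    static void my_add(struct list_head *item)
    {
            struct my_pcpu *p;

            migrate_disable();      /* stay on this CPU, but remain preemptible */
            p = this_cpu_ptr(&my_pcpu);
            spin_lock(&p->lock);    /* may sleep on PREEMPT_RT; now permitted */
            list_add(item, &p->list);
            spin_unlock(&p->lock);
            migrate_enable();
    }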
@@ -719,18 +629,7 @@ static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
if (WARN_ON(new_zhdr == zhdr))
goto out_fail;
- if (new_zhdr->first_chunks == 0) {
- if (new_zhdr->middle_chunks != 0 &&
- chunks >= new_zhdr->start_middle) {
- new_bud = LAST;
- } else {
- new_bud = FIRST;
- }
- } else if (new_zhdr->last_chunks == 0) {
- new_bud = LAST;
- } else if (new_zhdr->middle_chunks == 0) {
- new_bud = MIDDLE;
- }
+ new_bud = get_free_buddy(new_zhdr, chunks);
q = new_zhdr;
switch (new_bud) {
case FIRST:
@@ -768,13 +667,9 @@ static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
return new_zhdr;
out_fail:
- if (new_zhdr) {
- if (kref_put(&new_zhdr->refcount, release_z3fold_page_locked))
- atomic64_dec(&pool->pages_nr);
- else {
- add_to_unbuddied(pool, new_zhdr);
- z3fold_page_unlock(new_zhdr);
- }
+ if (new_zhdr && !kref_put(&new_zhdr->refcount, release_z3fold_page_locked)) {
+ add_to_unbuddied(pool, new_zhdr);
+ z3fold_page_unlock(new_zhdr);
}
return NULL;
@@ -847,29 +742,27 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
list_del_init(&zhdr->buddy);
spin_unlock(&pool->lock);
- if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
- atomic64_dec(&pool->pages_nr);
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
return;
- }
- if (unlikely(PageIsolated(page) ||
- test_bit(PAGE_CLAIMED, &page->private) ||
- test_bit(PAGE_STALE, &page->private))) {
+ if (test_bit(PAGE_STALE, &page->private) ||
+ test_and_set_bit(PAGE_CLAIMED, &page->private)) {
z3fold_page_unlock(zhdr);
return;
}
if (!zhdr->foreign_handles && buddy_single(zhdr) &&
zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) {
- if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
- atomic64_dec(&pool->pages_nr);
- else
+ if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
+ clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
+ }
return;
}
z3fold_compact_page(zhdr);
add_to_unbuddied(pool, zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
}
@@ -891,8 +784,9 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
int chunks = size_to_chunks(size), i;
lookup:
+ migrate_disable();
/* First, try to find an unbuddied z3fold page. */
- unbuddied = get_cpu_ptr(pool->unbuddied);
+ unbuddied = this_cpu_ptr(pool->unbuddied);
for_each_unbuddied_list(i, chunks) {
struct list_head *l = &unbuddied[i];
@@ -904,13 +798,12 @@ lookup:
/* Re-check under lock. */
spin_lock(&pool->lock);
- l = &unbuddied[i];
if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
struct z3fold_header, buddy)) ||
!z3fold_page_trylock(zhdr)) {
spin_unlock(&pool->lock);
zhdr = NULL;
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
if (can_sleep)
cond_resched();
goto lookup;
@@ -924,7 +817,7 @@ lookup:
test_bit(PAGE_CLAIMED, &page->private)) {
z3fold_page_unlock(zhdr);
zhdr = NULL;
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
if (can_sleep)
cond_resched();
goto lookup;
@@ -939,7 +832,7 @@ lookup:
kref_get(&zhdr->refcount);
break;
}
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
if (!zhdr) {
int cpu;
@@ -978,7 +871,19 @@ lookup:
}
}
+ if (zhdr && !zhdr->slots) {
+ zhdr->slots = alloc_slots(pool, GFP_ATOMIC);
+ if (!zhdr->slots)
+ goto out_fail;
+ }
return zhdr;
+
+out_fail:
+ if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
+ add_to_unbuddied(pool, zhdr);
+ z3fold_page_unlock(zhdr);
+ }
+ return NULL;
}
/*
@@ -989,13 +894,11 @@ lookup:
* z3fold_create_pool() - create a new z3fold pool
* @name: pool name
* @gfp: gfp flags when allocating the z3fold pool structure
- * @ops: user-defined operations for the z3fold pool
*
* Return: pointer to the new z3fold pool or NULL if the metadata allocation
* failed.
*/
-static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
- const struct z3fold_ops *ops)
+static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp)
{
struct z3fold_pool *pool = NULL;
int i, cpu;
@@ -1010,8 +913,8 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
goto out_c;
spin_lock_init(&pool->lock);
spin_lock_init(&pool->stale_lock);
- init_waitqueue_head(&pool->isolate_wait);
- pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
+ pool->unbuddied = __alloc_percpu(sizeof(struct list_head) * NCHUNKS,
+ __alignof__(struct list_head));
if (!pool->unbuddied)
goto out_pool;
for_each_possible_cpu(cpu) {
@@ -1020,7 +923,6 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
for_each_unbuddied_list(i, 0)
INIT_LIST_HEAD(&unbuddied[i]);
}
- INIT_LIST_HEAD(&pool->lru);
INIT_LIST_HEAD(&pool->stale);
atomic64_set(&pool->pages_nr, 0);
pool->name = name;
@@ -1030,14 +932,9 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
pool->release_wq = create_singlethread_workqueue(pool->name);
if (!pool->release_wq)
goto out_wq;
- if (z3fold_register_migration(pool))
- goto out_rwq;
INIT_WORK(&pool->work, free_pages_work);
- pool->ops = ops;
return pool;
-out_rwq:
- destroy_workqueue(pool->release_wq);
out_wq:
destroy_workqueue(pool->compact_wq);
out_unbuddied:
@@ -1080,10 +977,12 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool)
destroy_workqueue(pool->compact_wq);
destroy_workqueue(pool->release_wq);
- z3fold_unregister_migration(pool);
+ free_percpu(pool->unbuddied);
kfree(pool);
}
+static const struct movable_operations z3fold_mops;
+
/**
* z3fold_alloc() - allocates a region of a given size
* @pool: z3fold pool from which to allocate
@@ -1096,9 +995,6 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool)
* performed first. If no suitable free region is found, then a new page is
* allocated and added to the pool to satisfy the request.
*
- * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
- * as z3fold pool pages.
- *
* Return: 0 if success and handle is set, otherwise -EINVAL if the size or
* gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
* a new page.
@@ -1112,7 +1008,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
enum buddy bud;
bool can_sleep = gfpflags_allow_blocking(gfp);
- if (!size)
+ if (!size || (gfp & __GFP_HIGHMEM))
return -EINVAL;
if (size > PAGE_SIZE)
@@ -1124,21 +1020,10 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
retry:
zhdr = __z3fold_alloc(pool, size, can_sleep);
if (zhdr) {
- if (zhdr->first_chunks == 0) {
- if (zhdr->middle_chunks != 0 &&
- chunks >= zhdr->start_middle)
- bud = LAST;
- else
- bud = FIRST;
- } else if (zhdr->last_chunks == 0)
- bud = LAST;
- else if (zhdr->middle_chunks == 0)
- bud = MIDDLE;
- else {
- if (kref_put(&zhdr->refcount,
+ bud = get_free_buddy(zhdr, chunks);
+ if (bud == HEADLESS) {
+ if (!kref_put(&zhdr->refcount,
release_z3fold_page_locked))
- atomic64_dec(&pool->pages_nr);
- else
z3fold_page_unlock(zhdr);
pr_err("No free chunks in unbuddied\n");
WARN_ON(1);
@@ -1150,28 +1035,7 @@ retry:
bud = FIRST;
}
- page = NULL;
- if (can_sleep) {
- spin_lock(&pool->stale_lock);
- zhdr = list_first_entry_or_null(&pool->stale,
- struct z3fold_header, buddy);
- /*
- * Before allocating a page, let's see if we can take one from
- * the stale pages list. cancel_work_sync() can sleep so we
- * limit this case to the contexts where we can sleep
- */
- if (zhdr) {
- list_del(&zhdr->buddy);
- spin_unlock(&pool->stale_lock);
- cancel_work_sync(&zhdr->work);
- page = virt_to_page(zhdr);
- } else {
- spin_unlock(&pool->stale_lock);
- }
- }
- if (!page)
- page = alloc_page(gfp);
-
+ page = alloc_page(gfp);
if (!page)
return -ENOMEM;
@@ -1188,13 +1052,12 @@ retry:
}
if (can_sleep) {
lock_page(page);
- __SetPageMovable(page, pool->inode->i_mapping);
+ __SetPageMovable(page, &z3fold_mops);
unlock_page(page);
} else {
- if (trylock_page(page)) {
- __SetPageMovable(page, pool->inode->i_mapping);
- unlock_page(page);
- }
+ WARN_ON(!trylock_page(page));
+ __SetPageMovable(page, &z3fold_mops);
+ unlock_page(page);
}
z3fold_page_lock(zhdr);
@@ -1211,12 +1074,6 @@ found:
headless:
spin_lock(&pool->lock);
- /* Add/move z3fold page to beginning of LRU */
- if (!list_empty(&page->lru))
- list_del(&page->lru);
-
- list_add(&page->lru, &pool->lru);
-
*handle = encode_handle(zhdr, bud);
spin_unlock(&pool->lock);
if (bud != HEADLESS)
@@ -1231,9 +1088,9 @@ headless:
* @handle: handle associated with the allocation returned by z3fold_alloc()
*
* In the case that the z3fold page in which the allocation resides is under
- * reclaim, as indicated by the PG_reclaim flag being set, this function
- * only sets the first|last_chunks to 0. The page is actually freed
- * once both buddies are evicted (see z3fold_reclaim_page() below).
+ * reclaim, as indicated by the PAGE_CLAIMED flag being set, this function
+ * only sets the first|middle|last_chunks to 0. The page is actually freed
+ * once all buddies are evicted (see z3fold_reclaim_page() below).
*/
static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
{
@@ -1253,9 +1110,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
* immediately so we don't care about its value any more.
*/
if (!page_claimed) {
- spin_lock(&pool->lock);
- list_del(&page->lru);
- spin_unlock(&pool->lock);
put_z3fold_header(zhdr);
free_z3fold_page(page, true);
atomic64_dec(&pool->pages_nr);
@@ -1280,31 +1134,24 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
pr_err("%s: unknown bud %d\n", __func__, bud);
WARN_ON(1);
put_z3fold_header(zhdr);
- clear_bit(PAGE_CLAIMED, &page->private);
return;
}
if (!page_claimed)
- free_handle(handle);
- if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
- atomic64_dec(&pool->pages_nr);
+ free_handle(handle, zhdr);
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list))
return;
- }
if (page_claimed) {
/* the page has not been claimed by us */
- z3fold_page_unlock(zhdr);
+ put_z3fold_header(zhdr);
return;
}
- if (unlikely(PageIsolated(page)) ||
- test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
- put_z3fold_header(zhdr);
+ if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
clear_bit(PAGE_CLAIMED, &page->private);
+ put_z3fold_header(zhdr);
return;
}
if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
- spin_lock(&pool->lock);
- list_del_init(&zhdr->buddy);
- spin_unlock(&pool->lock);
zhdr->cpu = -1;
kref_get(&zhdr->refcount);
clear_bit(PAGE_CLAIMED, &page->private);
@@ -1318,184 +1165,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
}
/**
- * z3fold_reclaim_page() - evicts allocations from a pool page and frees it
- * @pool: pool from which a page will attempt to be evicted
- * @retries: number of pages on the LRU list for which eviction will
- * be attempted before failing
- *
- * z3fold reclaim is different from normal system reclaim in that it is done
- * from the bottom, up. This is because only the bottom layer, z3fold, has
- * information on how the allocations are organized within each z3fold page.
- * This has the potential to create interesting locking situations between
- * z3fold and the user, however.
- *
- * To avoid these, this is how z3fold_reclaim_page() should be called:
- *
- * The user detects a page should be reclaimed and calls z3fold_reclaim_page().
- * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and
- * call the user-defined eviction handler with the pool and handle as
- * arguments.
- *
- * If the handle can not be evicted, the eviction handler should return
- * non-zero. z3fold_reclaim_page() will add the z3fold page back to the
- * appropriate list and try the next z3fold page on the LRU up to
- * a user defined number of retries.
- *
- * If the handle is successfully evicted, the eviction handler should
- * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free()
- * contains logic to delay freeing the page if the page is under reclaim,
- * as indicated by the setting of the PG_reclaim flag on the underlying page.
- *
- * If all buddies in the z3fold page are successfully evicted, then the
- * z3fold page can be freed.
- *
- * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
- * no pages to evict or an eviction handler is not registered, -EAGAIN if
- * the retry limit was hit.
- */
-static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
-{
- int i, ret = -1;
- struct z3fold_header *zhdr = NULL;
- struct page *page = NULL;
- struct list_head *pos;
- unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
-
- spin_lock(&pool->lock);
- if (!pool->ops || !pool->ops->evict || retries == 0) {
- spin_unlock(&pool->lock);
- return -EINVAL;
- }
- for (i = 0; i < retries; i++) {
- if (list_empty(&pool->lru)) {
- spin_unlock(&pool->lock);
- return -EINVAL;
- }
- list_for_each_prev(pos, &pool->lru) {
- page = list_entry(pos, struct page, lru);
-
- /* this bit could have been set by free, in which case
- * we pass over to the next page in the pool.
- */
- if (test_and_set_bit(PAGE_CLAIMED, &page->private)) {
- page = NULL;
- continue;
- }
-
- if (unlikely(PageIsolated(page))) {
- clear_bit(PAGE_CLAIMED, &page->private);
- page = NULL;
- continue;
- }
- zhdr = page_address(page);
- if (test_bit(PAGE_HEADLESS, &page->private))
- break;
-
- if (!z3fold_page_trylock(zhdr)) {
- clear_bit(PAGE_CLAIMED, &page->private);
- zhdr = NULL;
- continue; /* can't evict at this point */
- }
- if (zhdr->foreign_handles) {
- clear_bit(PAGE_CLAIMED, &page->private);
- z3fold_page_unlock(zhdr);
- zhdr = NULL;
- continue; /* can't evict such page */
- }
- kref_get(&zhdr->refcount);
- list_del_init(&zhdr->buddy);
- zhdr->cpu = -1;
- break;
- }
-
- if (!zhdr)
- break;
-
- list_del_init(&page->lru);
- spin_unlock(&pool->lock);
-
- if (!test_bit(PAGE_HEADLESS, &page->private)) {
- /*
- * We need encode the handles before unlocking, and
- * use our local slots structure because z3fold_free
- * can zero out zhdr->slots and we can't do much
- * about that
- */
- first_handle = 0;
- last_handle = 0;
- middle_handle = 0;
- if (zhdr->first_chunks)
- first_handle = encode_handle(zhdr, FIRST);
- if (zhdr->middle_chunks)
- middle_handle = encode_handle(zhdr, MIDDLE);
- if (zhdr->last_chunks)
- last_handle = encode_handle(zhdr, LAST);
- /*
- * it's safe to unlock here because we hold a
- * reference to this page
- */
- z3fold_page_unlock(zhdr);
- } else {
- first_handle = encode_handle(zhdr, HEADLESS);
- last_handle = middle_handle = 0;
- }
- /* Issue the eviction callback(s) */
- if (middle_handle) {
- ret = pool->ops->evict(pool, middle_handle);
- if (ret)
- goto next;
- free_handle(middle_handle);
- }
- if (first_handle) {
- ret = pool->ops->evict(pool, first_handle);
- if (ret)
- goto next;
- free_handle(first_handle);
- }
- if (last_handle) {
- ret = pool->ops->evict(pool, last_handle);
- if (ret)
- goto next;
- free_handle(last_handle);
- }
-next:
- if (test_bit(PAGE_HEADLESS, &page->private)) {
- if (ret == 0) {
- free_z3fold_page(page, true);
- atomic64_dec(&pool->pages_nr);
- return 0;
- }
- spin_lock(&pool->lock);
- list_add(&page->lru, &pool->lru);
- spin_unlock(&pool->lock);
- clear_bit(PAGE_CLAIMED, &page->private);
- } else {
- z3fold_page_lock(zhdr);
- if (kref_put(&zhdr->refcount,
- release_z3fold_page_locked)) {
- atomic64_dec(&pool->pages_nr);
- return 0;
- }
- /*
- * if we are here, the page is still not completely
- * free. Take the global pool lock then to be able
- * to add it back to the lru list
- */
- spin_lock(&pool->lock);
- list_add(&page->lru, &pool->lru);
- spin_unlock(&pool->lock);
- z3fold_page_unlock(zhdr);
- clear_bit(PAGE_CLAIMED, &page->private);
- }
-
- /* We started off locked to we need to lock the pool back */
- spin_lock(&pool->lock);
- }
- spin_unlock(&pool->lock);
- return -EAGAIN;
-}
-
-/**
* z3fold_map() - maps the allocation associated with the given handle
* @pool: pool in which the allocation resides
* @handle: handle associated with the allocation to be mapped
@@ -1607,11 +1276,9 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
struct z3fold_header *zhdr;
struct z3fold_pool *pool;
- VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(PageIsolated(page), page);
- if (test_bit(PAGE_HEADLESS, &page->private) ||
- test_bit(PAGE_CLAIMED, &page->private))
+ if (test_bit(PAGE_HEADLESS, &page->private))
return false;
zhdr = page_address(page);
@@ -1623,12 +1290,12 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0)
goto out;
+ if (test_and_set_bit(PAGE_CLAIMED, &page->private))
+ goto out;
pool = zhdr_to_pool(zhdr);
spin_lock(&pool->lock);
if (!list_empty(&zhdr->buddy))
list_del_init(&zhdr->buddy);
- if (!list_empty(&page->lru))
- list_del_init(&page->lru);
spin_unlock(&pool->lock);
kref_get(&zhdr->refcount);
@@ -1640,24 +1307,23 @@ out:
return false;
}
-static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage,
- struct page *page, enum migrate_mode mode)
+static int z3fold_page_migrate(struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
struct z3fold_header *zhdr, *new_zhdr;
struct z3fold_pool *pool;
- struct address_space *new_mapping;
- VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
+ VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
zhdr = page_address(page);
pool = zhdr_to_pool(zhdr);
- if (!z3fold_page_trylock(zhdr)) {
+ if (!z3fold_page_trylock(zhdr))
return -EAGAIN;
- }
if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
+ clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
return -EBUSY;
}
@@ -1668,7 +1334,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
new_zhdr = page_address(newpage);
memcpy(new_zhdr, zhdr, PAGE_SIZE);
newpage->private = page->private;
- page->private = 0;
+ set_bit(PAGE_MIGRATED, &page->private);
z3fold_page_unlock(zhdr);
spin_lock_init(&new_zhdr->page_lock);
INIT_WORK(&new_zhdr->work, compact_page_work);
@@ -1677,9 +1343,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
* so we only have to reinitialize it.
*/
INIT_LIST_HEAD(&new_zhdr->buddy);
- new_mapping = page_mapping(page);
__ClearPageMovable(page);
- ClearPagePrivate(page);
get_page(newpage);
z3fold_page_lock(new_zhdr);
@@ -1691,19 +1355,13 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
encode_handle(new_zhdr, MIDDLE);
set_bit(NEEDS_COMPACTING, &newpage->private);
new_zhdr->cpu = smp_processor_id();
- spin_lock(&pool->lock);
- list_add(&newpage->lru, &pool->lru);
- spin_unlock(&pool->lock);
- __SetPageMovable(newpage, new_mapping);
+ __SetPageMovable(newpage, &z3fold_mops);
z3fold_page_unlock(new_zhdr);
queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
- spin_lock(&pool->lock);
- z3fold_dec_isolated(pool);
- spin_unlock(&pool->lock);
-
- page_mapcount_reset(page);
+ /* PAGE_CLAIMED and PAGE_MIGRATED are cleared now. */
+ page->private = 0;
put_page(page);
return 0;
}
@@ -1720,23 +1378,17 @@ static void z3fold_page_putback(struct page *page)
if (!list_empty(&zhdr->buddy))
list_del_init(&zhdr->buddy);
INIT_LIST_HEAD(&page->lru);
- if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
- atomic64_dec(&pool->pages_nr);
- spin_lock(&pool->lock);
- z3fold_dec_isolated(pool);
- spin_unlock(&pool->lock);
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
return;
- }
- spin_lock(&pool->lock);
- list_add(&page->lru, &pool->lru);
- z3fold_dec_isolated(pool);
- spin_unlock(&pool->lock);
+ if (list_empty(&zhdr->buddy))
+ add_to_unbuddied(pool, zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
}
-static const struct address_space_operations z3fold_aops = {
+static const struct movable_operations z3fold_mops = {
.isolate_page = z3fold_page_isolate,
- .migratepage = z3fold_page_migrate,
+ .migrate_page = z3fold_page_migrate,
.putback_page = z3fold_page_putback,
};
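With the z3fold pseudo filesystem removed earlier in this file, movable pages are no longer tagged with an address_space; the hunk above registers a struct movable_operations instead, and the __SetPageMovable() calls elsewhere in this diff pass &z3fold_mops. A bare-bones sketch of the same interface for a hypothetical driver (the stub bodies are placeholders, not the z3fold logic):

    static bool my_isolate(struct page *page, isolate_mode_t mode)
    {
            return false;           /* hypothetical: pin the object, say whether it may move */
    }

    static int my_migrate(struct page *dst, struct page *src, enum migrate_mode mode)
    {
            return -EAGAIN;         /* hypothetical: copy contents, fix up metadata */
    }

    static void my_putback(struct page *page)
    {
            /* hypothetical: undo the isolation */
    }

    static const struct movable_operations my_mops = {
            .isolate_page = my_isolate,
            .migrate_page = my_migrate,
            .putback_page = my_putback,
    };

    static void my_mark_movable(struct page *page)
    {
            /* the page must be locked here, as in z3fold_alloc() above */
            __SetPageMovable(page, &my_mops);
    }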
@@ -1744,31 +1396,9 @@ static const struct address_space_operations z3fold_aops = {
* zpool
****************/
-static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle)
-{
- if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
- return pool->zpool_ops->evict(pool->zpool, handle);
- else
- return -ENOENT;
-}
-
-static const struct z3fold_ops z3fold_zpool_ops = {
- .evict = z3fold_zpool_evict
-};
-
-static void *z3fold_zpool_create(const char *name, gfp_t gfp,
- const struct zpool_ops *zpool_ops,
- struct zpool *zpool)
+static void *z3fold_zpool_create(const char *name, gfp_t gfp)
{
- struct z3fold_pool *pool;
-
- pool = z3fold_create_pool(name, gfp,
- zpool_ops ? &z3fold_zpool_ops : NULL);
- if (pool) {
- pool->zpool = zpool;
- pool->zpool_ops = zpool_ops;
- }
- return pool;
+ return z3fold_create_pool(name, gfp);
}
static void z3fold_zpool_destroy(void *pool)
@@ -1786,25 +1416,6 @@ static void z3fold_zpool_free(void *pool, unsigned long handle)
z3fold_free(pool, handle);
}
-static int z3fold_zpool_shrink(void *pool, unsigned int pages,
- unsigned int *reclaimed)
-{
- unsigned int total = 0;
- int ret = -EINVAL;
-
- while (total < pages) {
- ret = z3fold_reclaim_page(pool, 8);
- if (ret < 0)
- break;
- total++;
- }
-
- if (reclaimed)
- *reclaimed = total;
-
- return ret;
-}
-
static void *z3fold_zpool_map(void *pool, unsigned long handle,
enum zpool_mapmode mm)
{
@@ -1822,12 +1433,12 @@ static u64 z3fold_zpool_total_size(void *pool)
static struct zpool_driver z3fold_zpool_driver = {
.type = "z3fold",
+ .sleep_mapped = true,
.owner = THIS_MODULE,
.create = z3fold_zpool_create,
.destroy = z3fold_zpool_destroy,
.malloc = z3fold_zpool_malloc,
.free = z3fold_zpool_free,
- .shrink = z3fold_zpool_shrink,
.map = z3fold_zpool_map,
.unmap = z3fold_zpool_unmap,
.total_size = z3fold_zpool_total_size,
@@ -1837,14 +1448,11 @@ MODULE_ALIAS("zpool-z3fold");
static int __init init_z3fold(void)
{
- int ret;
-
- /* Make sure the z3fold header is not larger than the page size */
- BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE);
- ret = z3fold_mount();
- if (ret)
- return ret;
-
+ /*
+ * Make sure the z3fold header is not larger than the page size and
+	 * that there is room left for at least one buddy.
+ */
+ BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE);
zpool_register_driver(&z3fold_zpool_driver);
return 0;
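The tightened BUILD_BUG_ON above ensures that, after the aligned header is carved off, the page still has room for buddy data, i.e. NCHUNKS = TOTAL_CHUNKS - ZHDR_CHUNKS is at least 1. A rough worked example, assuming 4 KiB pages and z3fold's usual 64-byte chunk (the real values follow PAGE_SHIFT and CHUNK_SHIFT):

    /* assumed numbers for illustration: PAGE_SIZE = 4096, CHUNK_SIZE = 64 */
    /* TOTAL_CHUNKS      = 4096 / 64 = 64                                  */
    /* ZHDR_SIZE_ALIGNED = round_up(sizeof(struct z3fold_header), 64)      */
    /*                     (roughly two chunks on 64-bit builds)           */
    /* NCHUNKS           = 64 - 2 = 62 chunks left for the three buddies   */
    BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE); /* guarantees NCHUNKS >= 1 */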
@@ -1852,7 +1460,6 @@ static int __init init_z3fold(void)
static void __exit exit_z3fold(void)
{
- z3fold_unmount();
zpool_unregister_driver(&z3fold_zpool_driver);
}
diff --git a/mm/zbud.c b/mm/zbud.c
index c49966ece674..2190cc1f37b3 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -51,7 +51,6 @@
#include <linux/preempt.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <linux/zbud.h>
#include <linux/zpool.h>
/*****************
@@ -73,6 +72,8 @@
#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
+struct zbud_pool;
+
/**
* struct zbud_pool - stores metadata for each zbud pool
* @lock: protects all pool fields and first|last_chunk fields of any
@@ -82,141 +83,37 @@
* its free region.
* @buddied: list tracking the zbud pages that contain two buddies;
* these zbud pages are full
- * @lru: list tracking the zbud pages in LRU order by most recently
- * added buddy.
* @pages_nr: number of zbud pages in the pool.
- * @ops: pointer to a structure of user defined operations specified at
- * pool creation time.
*
* This structure is allocated at pool creation time and maintains metadata
* pertaining to a particular zbud pool.
*/
struct zbud_pool {
spinlock_t lock;
- struct list_head unbuddied[NCHUNKS];
- struct list_head buddied;
- struct list_head lru;
+ union {
+ /*
+ * Reuse unbuddied[0] as buddied on the ground that
+ * unbuddied[0] is unused.
+ */
+ struct list_head buddied;
+ struct list_head unbuddied[NCHUNKS];
+ };
u64 pages_nr;
- const struct zbud_ops *ops;
-#ifdef CONFIG_ZPOOL
- struct zpool *zpool;
- const struct zpool_ops *zpool_ops;
-#endif
};
/*
* struct zbud_header - zbud page metadata occupying the first chunk of each
* zbud page.
* @buddy: links the zbud page into the unbuddied/buddied lists in the pool
- * @lru: links the zbud page into the lru list in the pool
* @first_chunks: the size of the first buddy in chunks, 0 if free
* @last_chunks: the size of the last buddy in chunks, 0 if free
*/
struct zbud_header {
struct list_head buddy;
- struct list_head lru;
unsigned int first_chunks;
unsigned int last_chunks;
- bool under_reclaim;
-};
-
-/*****************
- * zpool
- ****************/
-
-#ifdef CONFIG_ZPOOL
-
-static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
-{
- if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
- return pool->zpool_ops->evict(pool->zpool, handle);
- else
- return -ENOENT;
-}
-
-static const struct zbud_ops zbud_zpool_ops = {
- .evict = zbud_zpool_evict
-};
-
-static void *zbud_zpool_create(const char *name, gfp_t gfp,
- const struct zpool_ops *zpool_ops,
- struct zpool *zpool)
-{
- struct zbud_pool *pool;
-
- pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
- if (pool) {
- pool->zpool = zpool;
- pool->zpool_ops = zpool_ops;
- }
- return pool;
-}
-
-static void zbud_zpool_destroy(void *pool)
-{
- zbud_destroy_pool(pool);
-}
-
-static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp,
- unsigned long *handle)
-{
- return zbud_alloc(pool, size, gfp, handle);
-}
-static void zbud_zpool_free(void *pool, unsigned long handle)
-{
- zbud_free(pool, handle);
-}
-
-static int zbud_zpool_shrink(void *pool, unsigned int pages,
- unsigned int *reclaimed)
-{
- unsigned int total = 0;
- int ret = -EINVAL;
-
- while (total < pages) {
- ret = zbud_reclaim_page(pool, 8);
- if (ret < 0)
- break;
- total++;
- }
-
- if (reclaimed)
- *reclaimed = total;
-
- return ret;
-}
-
-static void *zbud_zpool_map(void *pool, unsigned long handle,
- enum zpool_mapmode mm)
-{
- return zbud_map(pool, handle);
-}
-static void zbud_zpool_unmap(void *pool, unsigned long handle)
-{
- zbud_unmap(pool, handle);
-}
-
-static u64 zbud_zpool_total_size(void *pool)
-{
- return zbud_get_pool_size(pool) * PAGE_SIZE;
-}
-
-static struct zpool_driver zbud_zpool_driver = {
- .type = "zbud",
- .owner = THIS_MODULE,
- .create = zbud_zpool_create,
- .destroy = zbud_zpool_destroy,
- .malloc = zbud_zpool_malloc,
- .free = zbud_zpool_free,
- .shrink = zbud_zpool_shrink,
- .map = zbud_zpool_map,
- .unmap = zbud_zpool_unmap,
- .total_size = zbud_zpool_total_size,
};
-MODULE_ALIAS("zpool-zbud");
-#endif /* CONFIG_ZPOOL */
-
/*****************
* Helpers
*****************/
@@ -242,8 +139,6 @@ static struct zbud_header *init_zbud_page(struct page *page)
zhdr->first_chunks = 0;
zhdr->last_chunks = 0;
INIT_LIST_HEAD(&zhdr->buddy);
- INIT_LIST_HEAD(&zhdr->lru);
- zhdr->under_reclaim = false;
return zhdr;
}
@@ -298,12 +193,11 @@ static int num_free_chunks(struct zbud_header *zhdr)
/**
* zbud_create_pool() - create a new zbud pool
* @gfp: gfp flags when allocating the zbud pool structure
- * @ops: user-defined operations for the zbud pool
*
* Return: pointer to the new zbud pool or NULL if the metadata allocation
* failed.
*/
-struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
+static struct zbud_pool *zbud_create_pool(gfp_t gfp)
{
struct zbud_pool *pool;
int i;
@@ -315,9 +209,7 @@ struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
for_each_unbuddied_list(i, 0)
INIT_LIST_HEAD(&pool->unbuddied[i]);
INIT_LIST_HEAD(&pool->buddied);
- INIT_LIST_HEAD(&pool->lru);
pool->pages_nr = 0;
- pool->ops = ops;
return pool;
}
@@ -327,7 +219,7 @@ struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
*
* The pool should be emptied before this function is called.
*/
-void zbud_destroy_pool(struct zbud_pool *pool)
+static void zbud_destroy_pool(struct zbud_pool *pool)
{
kfree(pool);
}
@@ -351,7 +243,7 @@ void zbud_destroy_pool(struct zbud_pool *pool)
* gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
* a new page.
*/
-int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
+static int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
unsigned long *handle)
{
int chunks, i, freechunks;
@@ -405,11 +297,6 @@ found:
list_add(&zhdr->buddy, &pool->buddied);
}
- /* Add/move zbud page to beginning of LRU */
- if (!list_empty(&zhdr->lru))
- list_del(&zhdr->lru);
- list_add(&zhdr->lru, &pool->lru);
-
*handle = encode_handle(zhdr, bud);
spin_unlock(&pool->lock);
@@ -420,13 +307,8 @@ found:
* zbud_free() - frees the allocation associated with the given handle
* @pool: pool in which the allocation resided
* @handle: handle associated with the allocation returned by zbud_alloc()
- *
- * In the case that the zbud page in which the allocation resides is under
- * reclaim, as indicated by the PG_reclaim flag being set, this function
- * only sets the first|last_chunks to 0. The page is actually freed
- * once both buddies are evicted (see zbud_reclaim_page() below).
*/
-void zbud_free(struct zbud_pool *pool, unsigned long handle)
+static void zbud_free(struct zbud_pool *pool, unsigned long handle)
{
struct zbud_header *zhdr;
int freechunks;
@@ -440,18 +322,11 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle)
else
zhdr->first_chunks = 0;
- if (zhdr->under_reclaim) {
- /* zbud page is under reclaim, reclaim will free */
- spin_unlock(&pool->lock);
- return;
- }
-
/* Remove from existing buddy list */
list_del(&zhdr->buddy);
if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
/* zbud page is empty, free */
- list_del(&zhdr->lru);
free_zbud_page(zhdr);
pool->pages_nr--;
} else {
@@ -464,111 +339,6 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle)
}
/**
- * zbud_reclaim_page() - evicts allocations from a pool page and frees it
- * @pool: pool from which a page will attempt to be evicted
- * @retries: number of pages on the LRU list for which eviction will
- * be attempted before failing
- *
- * zbud reclaim is different from normal system reclaim in that the reclaim is
- * done from the bottom, up. This is because only the bottom layer, zbud, has
- * information on how the allocations are organized within each zbud page. This
- * has the potential to create interesting locking situations between zbud and
- * the user, however.
- *
- * To avoid these, this is how zbud_reclaim_page() should be called:
- *
- * The user detects a page should be reclaimed and calls zbud_reclaim_page().
- * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call
- * the user-defined eviction handler with the pool and handle as arguments.
- *
- * If the handle can not be evicted, the eviction handler should return
- * non-zero. zbud_reclaim_page() will add the zbud page back to the
- * appropriate list and try the next zbud page on the LRU up to
- * a user defined number of retries.
- *
- * If the handle is successfully evicted, the eviction handler should
- * return 0 _and_ should have called zbud_free() on the handle. zbud_free()
- * contains logic to delay freeing the page if the page is under reclaim,
- * as indicated by the setting of the PG_reclaim flag on the underlying page.
- *
- * If all buddies in the zbud page are successfully evicted, then the
- * zbud page can be freed.
- *
- * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
- * no pages to evict or an eviction handler is not registered, -EAGAIN if
- * the retry limit was hit.
- */
-int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
-{
- int i, ret, freechunks;
- struct zbud_header *zhdr;
- unsigned long first_handle = 0, last_handle = 0;
-
- spin_lock(&pool->lock);
- if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
- retries == 0) {
- spin_unlock(&pool->lock);
- return -EINVAL;
- }
- for (i = 0; i < retries; i++) {
- zhdr = list_last_entry(&pool->lru, struct zbud_header, lru);
- list_del(&zhdr->lru);
- list_del(&zhdr->buddy);
- /* Protect zbud page against free */
- zhdr->under_reclaim = true;
- /*
- * We need encode the handles before unlocking, since we can
- * race with free that will set (first|last)_chunks to 0
- */
- first_handle = 0;
- last_handle = 0;
- if (zhdr->first_chunks)
- first_handle = encode_handle(zhdr, FIRST);
- if (zhdr->last_chunks)
- last_handle = encode_handle(zhdr, LAST);
- spin_unlock(&pool->lock);
-
- /* Issue the eviction callback(s) */
- if (first_handle) {
- ret = pool->ops->evict(pool, first_handle);
- if (ret)
- goto next;
- }
- if (last_handle) {
- ret = pool->ops->evict(pool, last_handle);
- if (ret)
- goto next;
- }
-next:
- spin_lock(&pool->lock);
- zhdr->under_reclaim = false;
- if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
- /*
- * Both buddies are now free, free the zbud page and
- * return success.
- */
- free_zbud_page(zhdr);
- pool->pages_nr--;
- spin_unlock(&pool->lock);
- return 0;
- } else if (zhdr->first_chunks == 0 ||
- zhdr->last_chunks == 0) {
- /* add to unbuddied list */
- freechunks = num_free_chunks(zhdr);
- list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
- } else {
- /* add to buddied list */
- list_add(&zhdr->buddy, &pool->buddied);
- }
-
- /* add to beginning of LRU */
- list_add(&zhdr->lru, &pool->lru);
- }
- spin_unlock(&pool->lock);
- return -EAGAIN;
-}
-
-/**
* zbud_map() - maps the allocation associated with the given handle
* @pool: pool in which the allocation resides
* @handle: handle associated with the allocation to be mapped
@@ -580,7 +350,7 @@ next:
*
* Returns: a pointer to the mapped allocation
*/
-void *zbud_map(struct zbud_pool *pool, unsigned long handle)
+static void *zbud_map(struct zbud_pool *pool, unsigned long handle)
{
return (void *)(handle);
}
@@ -590,7 +360,7 @@ void *zbud_map(struct zbud_pool *pool, unsigned long handle)
* @pool: pool in which the allocation resides
* @handle: handle associated with the allocation to be unmapped
*/
-void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
+static void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
{
}
@@ -601,30 +371,79 @@ void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
* Returns: size in pages of the given pool. The pool lock need not be
* taken to access pages_nr.
*/
-u64 zbud_get_pool_size(struct zbud_pool *pool)
+static u64 zbud_get_pool_size(struct zbud_pool *pool)
{
return pool->pages_nr;
}
+/*****************
+ * zpool
+ ****************/
+
+static void *zbud_zpool_create(const char *name, gfp_t gfp)
+{
+ return zbud_create_pool(gfp);
+}
+
+static void zbud_zpool_destroy(void *pool)
+{
+ zbud_destroy_pool(pool);
+}
+
+static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp,
+ unsigned long *handle)
+{
+ return zbud_alloc(pool, size, gfp, handle);
+}
+static void zbud_zpool_free(void *pool, unsigned long handle)
+{
+ zbud_free(pool, handle);
+}
+
+static void *zbud_zpool_map(void *pool, unsigned long handle,
+ enum zpool_mapmode mm)
+{
+ return zbud_map(pool, handle);
+}
+static void zbud_zpool_unmap(void *pool, unsigned long handle)
+{
+ zbud_unmap(pool, handle);
+}
+
+static u64 zbud_zpool_total_size(void *pool)
+{
+ return zbud_get_pool_size(pool) * PAGE_SIZE;
+}
+
+static struct zpool_driver zbud_zpool_driver = {
+ .type = "zbud",
+ .sleep_mapped = true,
+ .owner = THIS_MODULE,
+ .create = zbud_zpool_create,
+ .destroy = zbud_zpool_destroy,
+ .malloc = zbud_zpool_malloc,
+ .free = zbud_zpool_free,
+ .map = zbud_zpool_map,
+ .unmap = zbud_zpool_unmap,
+ .total_size = zbud_zpool_total_size,
+};
+
+MODULE_ALIAS("zpool-zbud");
+
static int __init init_zbud(void)
{
/* Make sure the zbud header will fit in one chunk */
BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
pr_info("loaded\n");
-#ifdef CONFIG_ZPOOL
zpool_register_driver(&zbud_zpool_driver);
-#endif
return 0;
}
static void __exit exit_zbud(void)
{
-#ifdef CONFIG_ZPOOL
zpool_unregister_driver(&zbud_zpool_driver);
-#endif
-
pr_info("unloaded\n");
}
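After these changes every backend implements the slimmed-down zpool_driver interface visible in the zbud hunks above: create() no longer receives a struct zpool_ops or a struct zpool back-pointer, there is no shrink() callback, and sleep_mapped advertises whether map() leaves the caller able to sleep. A compact sketch of a hypothetical backend against that interface ("mybackend" and its trivial stubs are illustrative only):

    struct mybackend_pool { u64 pages_nr; };    /* hypothetical pool state */

    static void *mybackend_create(const char *name, gfp_t gfp)
    {
            return kzalloc(sizeof(struct mybackend_pool), gfp);
    }
    static void mybackend_destroy(void *pool) { kfree(pool); }
    static int mybackend_malloc(void *pool, size_t size, gfp_t gfp,
                                unsigned long *handle)
    {
            return -ENOMEM;         /* hypothetical: allocate, set *handle */
    }
    static void mybackend_free(void *pool, unsigned long handle) { }
    static void *mybackend_map(void *pool, unsigned long handle,
                               enum zpool_mapmode mm)
    {
            return NULL;            /* hypothetical: return a kernel mapping of the object */
    }
    static void mybackend_unmap(void *pool, unsigned long handle) { }
    static u64 mybackend_total_size(void *pool) { return 0; }

    static struct zpool_driver mybackend_zpool_driver = {
            .type         = "mybackend",
            .sleep_mapped = true,   /* map() keeps the caller preemptible */
            .owner        = THIS_MODULE,
            .create       = mybackend_create,
            .destroy      = mybackend_destroy,
            .malloc       = mybackend_malloc,
            .free         = mybackend_free,
            .map          = mybackend_map,
            .unmap        = mybackend_unmap,
            .total_size   = mybackend_total_size,
            /* no .shrink: pool-level eviction has been removed */
    };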
diff --git a/mm/zpool.c b/mm/zpool.c
index 3744a2d1a624..846410479c2f 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -21,18 +21,11 @@
struct zpool {
struct zpool_driver *driver;
void *pool;
- const struct zpool_ops *ops;
- bool evictable;
-
- struct list_head list;
};
static LIST_HEAD(drivers_head);
static DEFINE_SPINLOCK(drivers_lock);
-static LIST_HEAD(pools_head);
-static DEFINE_SPINLOCK(pools_lock);
-
/**
* zpool_register_driver() - register a zpool implementation.
* @driver: driver to register
@@ -140,7 +133,6 @@ EXPORT_SYMBOL(zpool_has_pool);
* @type: The type of the zpool to create (e.g. zbud, zsmalloc)
* @name: The name of the zpool (e.g. zram0, zswap)
* @gfp: The GFP flags to use when allocating the pool.
- * @ops: The optional ops callback.
*
* This creates a new zpool of the specified type. The gfp flags will be
* used when allocating memory, if the implementation supports it. If the
@@ -152,8 +144,7 @@ EXPORT_SYMBOL(zpool_has_pool);
*
* Returns: New zpool on success, NULL on failure.
*/
-struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
- const struct zpool_ops *ops)
+struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp)
{
struct zpool_driver *driver;
struct zpool *zpool;
@@ -180,9 +171,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
}
zpool->driver = driver;
- zpool->pool = driver->create(name, gfp, ops, zpool);
- zpool->ops = ops;
- zpool->evictable = driver->shrink && ops && ops->evict;
+ zpool->pool = driver->create(name, gfp);
if (!zpool->pool) {
pr_err("couldn't create %s pool\n", type);
@@ -193,10 +182,6 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
pr_debug("created pool type %s\n", type);
- spin_lock(&pools_lock);
- list_add(&zpool->list, &pools_head);
- spin_unlock(&pools_lock);
-
return zpool;
}
@@ -215,9 +200,6 @@ void zpool_destroy_pool(struct zpool *zpool)
{
pr_debug("destroying pool type %s\n", zpool->driver->type);
- spin_lock(&pools_lock);
- list_del(&zpool->list);
- spin_unlock(&pools_lock);
zpool->driver->destroy(zpool->pool);
zpool_put_driver(zpool->driver);
kfree(zpool);
@@ -296,30 +278,6 @@ void zpool_free(struct zpool *zpool, unsigned long handle)
}
/**
- * zpool_shrink() - Shrink the pool size
- * @zpool: The zpool to shrink.
- * @pages: The number of pages to shrink the pool.
- * @reclaimed: The number of pages successfully evicted.
- *
- * This attempts to shrink the actual memory size of the pool
- * by evicting currently used handle(s). If the pool was
- * created with no zpool_ops, or the evict call fails for any
- * of the handles, this will fail. If non-NULL, the @reclaimed
- * parameter will be set to the number of pages reclaimed,
- * which may be more than the number of pages requested.
- *
- * Implementations must guarantee this to be thread-safe.
- *
- * Returns: 0 on success, negative value on error/failure.
- */
-int zpool_shrink(struct zpool *zpool, unsigned int pages,
- unsigned int *reclaimed)
-{
- return zpool->driver->shrink ?
- zpool->driver->shrink(zpool->pool, pages, reclaimed) : -EINVAL;
-}
-
-/**
* zpool_map_handle() - Map a previously allocated handle into memory
* @zpool: The zpool that the handle was allocated from
* @handle: The handle to map
@@ -334,7 +292,7 @@ int zpool_shrink(struct zpool *zpool, unsigned int pages,
* This may hold locks, disable interrupts, and/or preemption,
* and the zpool_unmap_handle() must be called to undo those
* actions. The code that uses the mapped handle should complete
- * its operatons on the mapped handle memory quickly and unmap
+ * its operations on the mapped handle memory quickly and unmap
* as soon as possible. As the implementation may use per-cpu
* data, multiple handles should not be mapped concurrently on
* any cpu.
@@ -376,23 +334,22 @@ u64 zpool_get_total_size(struct zpool *zpool)
}
/**
- * zpool_evictable() - Test if zpool is potentially evictable
+ * zpool_can_sleep_mapped - Test if the zpool backend can sleep while an object is mapped.
* @zpool: The zpool to test
*
- * Zpool is only potentially evictable when it's created with struct
- * zpool_ops.evict and its driver implements struct zpool_driver.shrink.
- *
- * However, it doesn't necessarily mean driver will use zpool_ops.evict
- * in its implementation of zpool_driver.shrink. It could do internal
- * defragmentation instead.
+ * Some allocators enter non-preemptible context in ->map() callback (e.g.
+ * disable pagefaults) and exit that context in ->unmap(), which limits what
+ * we can do with the mapped object. For instance, we cannot wait for
+ * asynchronous crypto API to decompress such an object or take mutexes
+ * since those will call into the scheduler. This function tells us whether
+ * the backend in use is such an allocator.
*
- * Returns: true if potentially evictable; false otherwise.
+ * Returns: true if zpool can sleep; false otherwise.
*/
-bool zpool_evictable(struct zpool *zpool)
+bool zpool_can_sleep_mapped(struct zpool *zpool)
{
- return zpool->evictable;
+ return zpool->driver->sleep_mapped;
}
-MODULE_LICENSE("GPL");
MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
MODULE_DESCRIPTION("Common API for compressed memory storage");
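On the consumer side, zpool_evictable() is gone and zpool_can_sleep_mapped() takes its place as the question callers actually need answered: may I block while the object is mapped? A hedged sketch of bouncing the data through a preallocated buffer when the backend cannot sleep while mapped (decompress() and the buffers are hypothetical; this mirrors the kind of handling a zpool user needs):

    static int read_object(struct zpool *pool, unsigned long handle,
                           u8 *tmp, u8 *dst, size_t obj_len)
    {
            u8 *src = zpool_map_handle(pool, handle, ZPOOL_MM_RO);

            if (!zpool_can_sleep_mapped(pool)) {
                    /*
                     * The backend is atomic between map and unmap, so copy
                     * the object out before doing anything that might sleep.
                     */
                    memcpy(tmp, src, obj_len);
                    zpool_unmap_handle(pool, handle);
                    src = tmp;
            }

            decompress(src, obj_len, dst);  /* hypothetical, may sleep */

            if (zpool_can_sleep_mapped(pool))
                    zpool_unmap_handle(pool, handle);
            return 0;
    }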
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index c36fdff9a371..32916d28d9d9 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -17,10 +17,10 @@
*
* Usage of struct page fields:
* page->private: points to zspage
- * page->freelist(index): links together all component pages of a zspage
+ * page->index: links together all component pages of a zspage
* For the huge page, this is always 0, so we use this field
* to store handle.
- * page->units: first object offset in a subpage of zspage
+ * page->page_type: first object offset in a subpage of zspage
*
* Usage of struct page flags:
* PG_private: identifies the first component page
@@ -30,10 +30,16 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+/*
+ * lock ordering:
+ * page_lock
+ * pool->lock
+ * zspage->lock
+ */
+
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
-#include <linux/magic.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
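The new lock-ordering comment above pins down page_lock -> pool->lock -> zspage->lock, with the per-class spinlock folded into a single pool->lock later in this diff. A sketch of what that nesting looks like on a migration-style path (the exact sequence is illustrative; the helpers are the ones declared further down in this file):

    static void nesting_example(struct page *page, struct zs_pool *pool,
                                struct zspage *zspage)
    {
            lock_page(page);                /* 1. page lock, outermost */
            spin_lock(&pool->lock);         /* 2. pool->lock nests inside it */
            migrate_write_lock(zspage);     /* 3. zspage->lock is innermost */

            /* ... move objects, fix up page links ... */

            migrate_write_unlock(zspage);
            spin_unlock(&pool->lock);
            unlock_page(page);
    }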
@@ -51,30 +57,22 @@
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>
-#include <linux/mount.h>
-#include <linux/pseudo_fs.h>
#include <linux/migrate.h>
#include <linux/wait.h>
#include <linux/pagemap.h>
#include <linux/fs.h>
+#include <linux/local_lock.h>
#define ZSPAGE_MAGIC 0x58
/*
- * This must be power of 2 and greater than of equal to sizeof(link_free).
+ * This must be power of 2 and greater than or equal to sizeof(link_free).
* These two conditions ensure that any 'struct link_free' itself doesn't
* span more than 1 page which avoids complex case of mapping 2 pages simply
* to restore link_free pointer values.
*/
#define ZS_ALIGN 8
-/*
- * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
- * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
- */
-#define ZS_MAX_ZSPAGE_ORDER 2
-#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
-
#define ZS_HANDLE_SIZE (sizeof(unsigned long))
/*
@@ -101,15 +99,6 @@
#define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
/*
- * Memory for allocating for handle keeps object position by
- * encoding <page, obj_idx> and the encoded value has a room
- * in least bit(ie, look at obj_to_location).
- * We use the bit to synchronize between object access by
- * user and migration.
- */
-#define HANDLE_PIN_BIT 0
-
-/*
* Head in allocated object should have OBJ_ALLOCATED_TAG
* to identify the object was allocated or not.
* It's okay to add the status bit in the least bit because
@@ -117,16 +106,23 @@
* have room for two bit at least.
*/
#define OBJ_ALLOCATED_TAG 1
-#define OBJ_TAG_BITS 1
+
+#define OBJ_TAG_BITS 1
+#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG
+
#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
-#define FULLNESS_BITS 2
+#define HUGE_BITS 1
+#define FULLNESS_BITS 4
#define CLASS_BITS 8
-#define ISOLATED_BITS 3
+#define ISOLATED_BITS 5
#define MAGIC_VAL_BITS 8
#define MAX(a, b) ((a) >= (b) ? (a) : (b))
+
+#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(CONFIG_ZSMALLOC_CHAIN_SIZE, UL))
+
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
@@ -150,56 +146,44 @@
#define ZS_SIZE_CLASSES (DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \
ZS_SIZE_CLASS_DELTA) + 1)
+/*
+ * Pages are distinguished by the ratio of used memory (that is the ratio
+ * of ->inuse objects to all objects that page can store). For example,
+ * INUSE_RATIO_10 means that the ratio of used objects is > 0% and <= 10%.
+ *
+ * The number of fullness groups is not random. It allows us to keep
+ * difference between the least busy page in the group (minimum permitted
+ * number of ->inuse objects) and the most busy page (maximum permitted
+ * number of ->inuse objects) at a reasonable value.
+ */
enum fullness_group {
- ZS_EMPTY,
- ZS_ALMOST_EMPTY,
- ZS_ALMOST_FULL,
- ZS_FULL,
- NR_ZS_FULLNESS,
+ ZS_INUSE_RATIO_0,
+ ZS_INUSE_RATIO_10,
+ /* NOTE: 8 more fullness groups here */
+ ZS_INUSE_RATIO_99 = 10,
+ ZS_INUSE_RATIO_100,
+ NR_FULLNESS_GROUPS,
};
-enum zs_stat_type {
- CLASS_EMPTY,
- CLASS_ALMOST_EMPTY,
- CLASS_ALMOST_FULL,
- CLASS_FULL,
- OBJ_ALLOCATED,
- OBJ_USED,
- NR_ZS_STAT_TYPE,
+enum class_stat_type {
+ /* NOTE: stats for 12 fullness groups here: from inuse 0 to 100 */
+ ZS_OBJS_ALLOCATED = NR_FULLNESS_GROUPS,
+ ZS_OBJS_INUSE,
+ NR_CLASS_STAT_TYPES,
};
struct zs_size_stat {
- unsigned long objs[NR_ZS_STAT_TYPE];
+ unsigned long objs[NR_CLASS_STAT_TYPES];
};
#ifdef CONFIG_ZSMALLOC_STAT
static struct dentry *zs_stat_root;
#endif
-#ifdef CONFIG_COMPACTION
-static struct vfsmount *zsmalloc_mnt;
-#endif
-
-/*
- * We assign a page to ZS_ALMOST_EMPTY fullness group when:
- * n <= N / f, where
- * n = number of allocated objects
- * N = total number of objects zspage can store
- * f = fullness_threshold_frac
- *
- * Similarly, we assign zspage to:
- * ZS_ALMOST_FULL when n > N / f
- * ZS_EMPTY when n == 0
- * ZS_FULL when n == N
- *
- * (see: fix_fullness_group())
- */
-static const int fullness_threshold_frac = 4;
static size_t huge_class_size;
struct size_class {
- spinlock_t lock;
- struct list_head fullness_list[NR_ZS_FULLNESS];
+ struct list_head fullness_list[NR_FULLNESS_GROUPS];
/*
* Size of objects stored in this class. Must be multiple
* of ZS_ALIGN.
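The ZS_EMPTY / ZS_ALMOST_* buckets above are replaced by fullness groups keyed on the used-object ratio, in 10% steps from ZS_INUSE_RATIO_0 to ZS_INUSE_RATIO_100, which also become the per-class stat slots. A hedged sketch of how a zspage could be mapped to its group (the helper is illustrative and may differ from the in-tree rounding):

    static int fullness_group(unsigned int inuse, unsigned int objs_per_zspage)
    {
            unsigned int ratio;

            if (inuse == 0)
                    return ZS_INUSE_RATIO_0;
            if (inuse == objs_per_zspage)
                    return ZS_INUSE_RATIO_100;

            ratio = 100 * inuse / objs_per_zspage;
            /* e.g. 3 objects in use out of 40 -> ratio 7 -> ZS_INUSE_RATIO_10 */
            return ratio / 10 + 1;
    }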
@@ -213,22 +197,6 @@ struct size_class {
struct zs_size_stat stats;
};
-/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
-static void SetPageHugeObject(struct page *page)
-{
- SetPageOwnerPriv1(page);
-}
-
-static void ClearPageHugeObject(struct page *page)
-{
- ClearPageOwnerPriv1(page);
-}
-
-static int PageHugeObject(struct page *page)
-{
- return PageOwnerPriv1(page);
-}
-
/*
* Placed within free objects to form a singly linked list.
* For every zspage, zspage->freeobj gives head of this list.
@@ -267,17 +235,15 @@ struct zs_pool {
struct dentry *stat_dentry;
#endif
#ifdef CONFIG_COMPACTION
- struct inode *inode;
struct work_struct free_work;
- /* A wait queue for when migration races with async_free_zspage() */
- struct wait_queue_head migration_wait;
- atomic_long_t isolated_pages;
- bool destroying;
#endif
+ spinlock_t lock;
+ atomic_t compaction_in_progress;
};
struct zspage {
struct {
+ unsigned int huge:HUGE_BITS;
unsigned int fullness:FULLNESS_BITS;
unsigned int class:CLASS_BITS + 1;
unsigned int isolated:ISOLATED_BITS;
@@ -287,38 +253,43 @@ struct zspage {
unsigned int freeobj;
struct page *first_page;
struct list_head list; /* fullness list */
-#ifdef CONFIG_COMPACTION
+ struct zs_pool *pool;
rwlock_t lock;
-#endif
};
struct mapping_area {
-#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING
- struct vm_struct *vm; /* vm area for mapping object that span pages */
-#else
+ local_lock_t lock;
char *vm_buf; /* copy buffer for objects that span pages */
-#endif
char *vm_addr; /* address of kmap_atomic()'ed pages */
enum zs_mapmode vm_mm; /* mapping mode */
};
-#ifdef CONFIG_COMPACTION
-static int zs_register_migration(struct zs_pool *pool);
-static void zs_unregister_migration(struct zs_pool *pool);
+/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
+static void SetZsHugePage(struct zspage *zspage)
+{
+ zspage->huge = 1;
+}
+
+static bool ZsHugePage(struct zspage *zspage)
+{
+ return zspage->huge;
+}
+
static void migrate_lock_init(struct zspage *zspage);
static void migrate_read_lock(struct zspage *zspage);
static void migrate_read_unlock(struct zspage *zspage);
+
+#ifdef CONFIG_COMPACTION
+static void migrate_write_lock(struct zspage *zspage);
+static void migrate_write_lock_nested(struct zspage *zspage);
+static void migrate_write_unlock(struct zspage *zspage);
static void kick_deferred_free(struct zs_pool *pool);
static void init_deferred_free(struct zs_pool *pool);
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);
#else
-static int zsmalloc_mount(void) { return 0; }
-static void zsmalloc_unmount(void) {}
-static int zs_register_migration(struct zs_pool *pool) { return 0; }
-static void zs_unregister_migration(struct zs_pool *pool) {}
-static void migrate_lock_init(struct zspage *zspage) {}
-static void migrate_read_lock(struct zspage *zspage) {}
-static void migrate_read_unlock(struct zspage *zspage) {}
+static void migrate_write_lock(struct zspage *zspage) {}
+static void migrate_write_lock_nested(struct zspage *zspage) {}
+static void migrate_write_unlock(struct zspage *zspage) {}
static void kick_deferred_free(struct zs_pool *pool) {}
static void init_deferred_free(struct zs_pool *pool) {}
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
@@ -361,7 +332,7 @@ static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
{
- return kmem_cache_alloc(pool->zspage_cachep,
+ return kmem_cache_zalloc(pool->zspage_cachep,
flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
}
@@ -370,23 +341,17 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
kmem_cache_free(pool->zspage_cachep, zspage);
}
+/* pool->lock (which owns the handle) synchronizes races */
static void record_obj(unsigned long handle, unsigned long obj)
{
- /*
- * lsb of @obj represents handle lock while other bits
- * represent object value the handle is pointing so
- * updating shouldn't do store tearing.
- */
- WRITE_ONCE(*(unsigned long *)handle, obj);
+ *(unsigned long *)handle = obj;
}
/* zpool driver */
#ifdef CONFIG_ZPOOL
-static void *zs_zpool_create(const char *name, gfp_t gfp,
- const struct zpool_ops *zpool_ops,
- struct zpool *zpool)
+static void *zs_zpool_create(const char *name, gfp_t gfp)
{
/*
* Ignore global gfp flags: zs_malloc() may be invoked from
@@ -405,7 +370,10 @@ static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
unsigned long *handle)
{
*handle = zs_malloc(pool, size, gfp);
- return *handle ? 0 : -1;
+
+ if (IS_ERR_VALUE(*handle))
+ return PTR_ERR((void *)*handle);
+ return 0;
}
static void zs_zpool_free(void *pool, unsigned long handle)
{
@@ -459,19 +427,16 @@ MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */
/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
-static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
-
-static bool is_zspage_isolated(struct zspage *zspage)
-{
- return zspage->isolated;
-}
+static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
static __maybe_unused int is_first_page(struct page *page)
{
return PagePrivate(page);
}
-/* Protected by class->lock */
+/* Protected by pool->lock */
static inline int get_zspage_inuse(struct zspage *zspage)
{
return zspage->inuse;
@@ -491,14 +456,14 @@ static inline struct page *get_first_page(struct zspage *zspage)
return first_page;
}
-static inline int get_first_obj_offset(struct page *page)
+static inline unsigned int get_first_obj_offset(struct page *page)
{
- return page->units;
+ return page->page_type;
}
-static inline void set_first_obj_offset(struct page *page, int offset)
+static inline void set_first_obj_offset(struct page *page, unsigned int offset)
{
- page->units = offset;
+ page->page_type = offset;
}
static inline unsigned int get_freeobj(struct zspage *zspage)
@@ -512,8 +477,8 @@ static inline void set_freeobj(struct zspage *zspage, unsigned int obj)
}
static void get_zspage_mapping(struct zspage *zspage,
- unsigned int *class_idx,
- enum fullness_group *fullness)
+ unsigned int *class_idx,
+ int *fullness)
{
BUG_ON(zspage->magic != ZSPAGE_MAGIC);
@@ -521,9 +486,15 @@ static void get_zspage_mapping(struct zspage *zspage,
*class_idx = zspage->class;
}
+static struct size_class *zspage_class(struct zs_pool *pool,
+ struct zspage *zspage)
+{
+ return pool->size_class[zspage->class];
+}
+
static void set_zspage_mapping(struct zspage *zspage,
- unsigned int class_idx,
- enum fullness_group fullness)
+ unsigned int class_idx,
+ int fullness)
{
zspage->class = class_idx;
zspage->fullness = fullness;
@@ -534,7 +505,7 @@ static void set_zspage_mapping(struct zspage *zspage,
* class maintains a list of zspages where each zspage is divided
* into equal sized chunks. Each allocation falls into one of these
* classes depending on its size. This function returns index of the
- * size class which has chunk size big enough to hold the give size.
+ * size class which has chunk size big enough to hold the given size.
*/
static int get_size_class_index(int size)
{
@@ -547,23 +518,19 @@ static int get_size_class_index(int size)
return min_t(int, ZS_SIZE_CLASSES - 1, idx);
}
-/* type can be of enum type zs_stat_type or fullness_group */
-static inline void zs_stat_inc(struct size_class *class,
+static inline void class_stat_inc(struct size_class *class,
int type, unsigned long cnt)
{
class->stats.objs[type] += cnt;
}
-/* type can be of enum type zs_stat_type or fullness_group */
-static inline void zs_stat_dec(struct size_class *class,
+static inline void class_stat_dec(struct size_class *class,
int type, unsigned long cnt)
{
class->stats.objs[type] -= cnt;
}
-/* type can be of enum type zs_stat_type or fullness_group */
-static inline unsigned long zs_stat_get(struct size_class *class,
- int type)
+static inline unsigned long zs_stat_get(struct size_class *class, int type)
{
return class->stats.objs[type];
}
@@ -589,47 +556,49 @@ static unsigned long zs_can_compact(struct size_class *class);
static int zs_stats_size_show(struct seq_file *s, void *v)
{
- int i;
+ int i, fg;
struct zs_pool *pool = s->private;
struct size_class *class;
int objs_per_zspage;
- unsigned long class_almost_full, class_almost_empty;
unsigned long obj_allocated, obj_used, pages_used, freeable;
- unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
unsigned long total_freeable = 0;
+ unsigned long inuse_totals[NR_FULLNESS_GROUPS] = {0, };
- seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n",
- "class", "size", "almost_full", "almost_empty",
+ seq_printf(s, " %5s %5s %9s %9s %9s %9s %9s %9s %9s %9s %9s %9s %9s %13s %10s %10s %16s %8s\n",
+ "class", "size", "10%", "20%", "30%", "40%",
+ "50%", "60%", "70%", "80%", "90%", "99%", "100%",
"obj_allocated", "obj_used", "pages_used",
"pages_per_zspage", "freeable");
for (i = 0; i < ZS_SIZE_CLASSES; i++) {
+
class = pool->size_class[i];
if (class->index != i)
continue;
- spin_lock(&class->lock);
- class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
- class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
- obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
- obj_used = zs_stat_get(class, OBJ_USED);
+ spin_lock(&pool->lock);
+
+ seq_printf(s, " %5u %5u ", i, class->size);
+ for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++) {
+ inuse_totals[fg] += zs_stat_get(class, fg);
+ seq_printf(s, "%9lu ", zs_stat_get(class, fg));
+ }
+
+ obj_allocated = zs_stat_get(class, ZS_OBJS_ALLOCATED);
+ obj_used = zs_stat_get(class, ZS_OBJS_INUSE);
freeable = zs_can_compact(class);
- spin_unlock(&class->lock);
+ spin_unlock(&pool->lock);
objs_per_zspage = class->objs_per_zspage;
pages_used = obj_allocated / objs_per_zspage *
class->pages_per_zspage;
- seq_printf(s, " %5u %5u %11lu %12lu %13lu"
- " %10lu %10lu %16d %8lu\n",
- i, class->size, class_almost_full, class_almost_empty,
- obj_allocated, obj_used, pages_used,
- class->pages_per_zspage, freeable);
+ seq_printf(s, "%13lu %10lu %10lu %16d %8lu\n",
+ obj_allocated, obj_used, pages_used,
+ class->pages_per_zspage, freeable);
- total_class_almost_full += class_almost_full;
- total_class_almost_empty += class_almost_empty;
total_objs += obj_allocated;
total_used_objs += obj_used;
total_pages += pages_used;
@@ -637,10 +606,14 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
}
seq_puts(s, "\n");
- seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n",
- "Total", "", total_class_almost_full,
- total_class_almost_empty, total_objs,
- total_used_objs, total_pages, "", total_freeable);
+ seq_printf(s, " %5s %5s ", "Total", "");
+
+ for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++)
+ seq_printf(s, "%9lu ", inuse_totals[fg]);
+
+ seq_printf(s, "%13lu %10lu %10lu %16s %8lu\n",
+ total_objs, total_used_objs, total_pages, "",
+ total_freeable);
return 0;
}
@@ -685,30 +658,28 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool)
/*
* For each size class, zspages are divided into different groups
- * depending on how "full" they are. This was done so that we could
- * easily find empty or nearly empty zspages when we try to shrink
- * the pool (not yet implemented). This function returns fullness
+ * depending on their usage ratio. This function returns the fullness
* status of the given page.
*/
-static enum fullness_group get_fullness_group(struct size_class *class,
- struct zspage *zspage)
+static int get_fullness_group(struct size_class *class, struct zspage *zspage)
{
- int inuse, objs_per_zspage;
- enum fullness_group fg;
+ int inuse, objs_per_zspage, ratio;
inuse = get_zspage_inuse(zspage);
objs_per_zspage = class->objs_per_zspage;
if (inuse == 0)
- fg = ZS_EMPTY;
- else if (inuse == objs_per_zspage)
- fg = ZS_FULL;
- else if (inuse <= 3 * objs_per_zspage / fullness_threshold_frac)
- fg = ZS_ALMOST_EMPTY;
- else
- fg = ZS_ALMOST_FULL;
+ return ZS_INUSE_RATIO_0;
+ if (inuse == objs_per_zspage)
+ return ZS_INUSE_RATIO_100;
- return fg;
+ ratio = 100 * inuse / objs_per_zspage;
+ /*
+ * Take integer division into consideration: a zspage with one inuse
+ * object out of 127 possible will end up with a usage ratio of 0,
+ * which is wrong, as it belongs in the ZS_INUSE_RATIO_10 fullness group.
+ */
+ return ratio / 10 + 1;
}
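/*
 * Editor's note -- illustrative worked example, not part of the patch.
 * Assuming a hypothetical class with 127 objects per zspage, the mapping
 * above (ratio / 10 + 1, with the 0 and 100 cases handled separately)
 * gives:
 *
 *   inuse =   1  ->  100 * 1   / 127 =  0  ->  0 / 10 + 1 = ZS_INUSE_RATIO_10
 *   inuse =  64  ->  100 * 64  / 127 = 50  -> 50 / 10 + 1 = ZS_INUSE_RATIO_60
 *   inuse = 126  ->  100 * 126 / 127 = 99  -> 99 / 10 + 1 = ZS_INUSE_RATIO_99
 *   inuse = 127  ->  caught by the inuse == objs_per_zspage check above,
 *                    i.e. ZS_INUSE_RATIO_100
 */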
/*
@@ -719,23 +690,9 @@ static enum fullness_group get_fullness_group(struct size_class *class,
*/
static void insert_zspage(struct size_class *class,
struct zspage *zspage,
- enum fullness_group fullness)
+ int fullness)
{
- struct zspage *head;
-
- zs_stat_inc(class, fullness, 1);
- head = list_first_entry_or_null(&class->fullness_list[fullness],
- struct zspage, list);
- /*
- * We want to see more ZS_FULL pages and less almost empty/full.
- * Put pages with higher ->inuse first.
- */
- if (head) {
- if (get_zspage_inuse(zspage) < get_zspage_inuse(head)) {
- list_add(&zspage->list, &head->list);
- return;
- }
- }
+ class_stat_inc(class, fullness, 1);
list_add(&zspage->list, &class->fullness_list[fullness]);
}
@@ -745,85 +702,43 @@ static void insert_zspage(struct size_class *class,
*/
static void remove_zspage(struct size_class *class,
struct zspage *zspage,
- enum fullness_group fullness)
+ int fullness)
{
VM_BUG_ON(list_empty(&class->fullness_list[fullness]));
- VM_BUG_ON(is_zspage_isolated(zspage));
list_del_init(&zspage->list);
- zs_stat_dec(class, fullness, 1);
+ class_stat_dec(class, fullness, 1);
}
/*
* Each size class maintains zspages in different fullness groups depending
* on the number of live objects they contain. When allocating or freeing
- * objects, the fullness status of the page can change, say, from ALMOST_FULL
- * to ALMOST_EMPTY when freeing an object. This function checks if such
- * a status change has occurred for the given page and accordingly moves the
- * page from the freelist of the old fullness group to that of the new
- * fullness group.
+ * objects, the fullness status of the page can change, for instance, from
+ * INUSE_RATIO_80 to INUSE_RATIO_70 when freeing an object. This function
+ * checks if such a status change has occurred for the given page and
+ * accordingly moves the page from the list of the old fullness group to that
+ * of the new fullness group.
*/
-static enum fullness_group fix_fullness_group(struct size_class *class,
- struct zspage *zspage)
+static int fix_fullness_group(struct size_class *class, struct zspage *zspage)
{
int class_idx;
- enum fullness_group currfg, newfg;
+ int currfg, newfg;
get_zspage_mapping(zspage, &class_idx, &currfg);
newfg = get_fullness_group(class, zspage);
if (newfg == currfg)
goto out;
- if (!is_zspage_isolated(zspage)) {
- remove_zspage(class, zspage, currfg);
- insert_zspage(class, zspage, newfg);
- }
-
+ remove_zspage(class, zspage, currfg);
+ insert_zspage(class, zspage, newfg);
set_zspage_mapping(zspage, class_idx, newfg);
-
out:
return newfg;
}
-/*
- * We have to decide on how many pages to link together
- * to form a zspage for each size class. This is important
- * to reduce wastage due to unusable space left at end of
- * each zspage which is given as:
- * wastage = Zp % class_size
- * usage = Zp - wastage
- * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
- *
- * For example, for size class of 3/8 * PAGE_SIZE, we should
- * link together 3 PAGE_SIZE sized pages to form a zspage
- * since then we can perfectly fit in 8 such objects.
- */
-static int get_pages_per_zspage(int class_size)
-{
- int i, max_usedpc = 0;
- /* zspage order which gives maximum used size per KB */
- int max_usedpc_order = 1;
-
- for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
- int zspage_size;
- int waste, usedpc;
-
- zspage_size = i * PAGE_SIZE;
- waste = zspage_size % class_size;
- usedpc = (zspage_size - waste) * 100 / zspage_size;
-
- if (usedpc > max_usedpc) {
- max_usedpc = usedpc;
- max_usedpc_order = i;
- }
- }
-
- return max_usedpc_order;
-}
-
static struct zspage *get_zspage(struct page *page)
{
- struct zspage *zspage = (struct zspage *)page->private;
+ struct zspage *zspage = (struct zspage *)page_private(page);
BUG_ON(zspage->magic != ZSPAGE_MAGIC);
return zspage;
@@ -831,10 +746,12 @@ static struct zspage *get_zspage(struct page *page)
static struct page *get_next_page(struct page *page)
{
- if (unlikely(PageHugeObject(page)))
+ struct zspage *zspage = get_zspage(page);
+
+ if (unlikely(ZsHugePage(zspage)))
return NULL;
- return page->freelist;
+ return (struct page *)page->index;
}
/**
@@ -851,6 +768,12 @@ static void obj_to_location(unsigned long obj, struct page **page,
*obj_idx = (obj & OBJ_INDEX_MASK);
}
+static void obj_to_page(unsigned long obj, struct page **page)
+{
+ obj >>= OBJ_TAG_BITS;
+ *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
+}
+
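/*
 * Editor's sketch (illustrative only, not part of the patch): an encoded
 * obj value packs the page frame number, the object index within the
 * zspage and the low tag bits, which is why obj_to_location() and
 * obj_to_page() above simply shift and mask:
 *
 *   obj >>= OBJ_TAG_BITS;                          strip the tag bits
 *   idx   = obj & OBJ_INDEX_MASK;                  low OBJ_INDEX_BITS select the object
 *   page  = pfn_to_page(obj >> OBJ_INDEX_BITS);    remaining bits are the PFN
 */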
/**
* location_to_obj - get obj value encoded from (<page>, <obj_idx>)
* @page: page object resides in zspage
@@ -872,33 +795,29 @@ static unsigned long handle_to_obj(unsigned long handle)
return *(unsigned long *)handle;
}
-static unsigned long obj_to_head(struct page *page, void *obj)
+static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle,
+ int tag)
{
- if (unlikely(PageHugeObject(page))) {
+ unsigned long handle;
+ struct zspage *zspage = get_zspage(page);
+
+ if (unlikely(ZsHugePage(zspage))) {
VM_BUG_ON_PAGE(!is_first_page(page), page);
- return page->index;
+ handle = page->index;
} else
- return *(unsigned long *)obj;
-}
-
-static inline int testpin_tag(unsigned long handle)
-{
- return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
-}
+ handle = *(unsigned long *)obj;
-static inline int trypin_tag(unsigned long handle)
-{
- return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
-}
+ if (!(handle & tag))
+ return false;
-static void pin_tag(unsigned long handle) __acquires(bitlock)
-{
- bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
+ /* Clear all tags before returning the handle */
+ *phandle = handle & ~OBJ_TAG_MASK;
+ return true;
}
-static void unpin_tag(unsigned long handle) __releases(bitlock)
+static inline bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
{
- bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
+ return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG);
}
static void reset_page(struct page *page)
@@ -907,8 +826,7 @@ static void reset_page(struct page *page)
ClearPagePrivate(page);
set_page_private(page, 0);
page_mapcount_reset(page);
- ClearPageHugeObject(page);
- page->freelist = NULL;
+ page->index = 0;
}
static int trylock_zspage(struct zspage *zspage)
@@ -936,15 +854,15 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage)
{
struct page *page, *next;
- enum fullness_group fg;
+ int fg;
unsigned int class_idx;
get_zspage_mapping(zspage, &class_idx, &fg);
- assert_spin_locked(&class->lock);
+ assert_spin_locked(&pool->lock);
VM_BUG_ON(get_zspage_inuse(zspage));
- VM_BUG_ON(fg != ZS_EMPTY);
+ VM_BUG_ON(fg != ZS_INUSE_RATIO_0);
next = page = get_first_page(zspage);
do {
@@ -959,9 +877,8 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
cache_free_zspage(pool, zspage);
- zs_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage);
- atomic_long_sub(class->pages_per_zspage,
- &pool->pages_allocated);
+ class_stat_dec(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
+ atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated);
}
static void free_zspage(struct zs_pool *pool, struct size_class *class,
@@ -970,12 +887,17 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class,
VM_BUG_ON(get_zspage_inuse(zspage));
VM_BUG_ON(list_empty(&zspage->list));
+ /*
+ * Since zs_free() cannot sleep, this function cannot call
+ * lock_page(). The page locks taken by trylock_zspage() will be
+ * released by __free_zspage().
+ */
if (!trylock_zspage(zspage)) {
kick_deferred_free(pool);
return;
}
- remove_zspage(class, zspage, ZS_EMPTY);
+ remove_zspage(class, zspage, ZS_INUSE_RATIO_0);
__free_zspage(pool, class, zspage);
}
@@ -1034,7 +956,7 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage,
/*
* Allocate individual pages and link them together as:
- * 1. all pages are linked together using page->freelist
+ * 1. all pages are linked together using page->index
* 2. each sub-page point to zspage using page->private
*
* we set PG_private to identify the first page (i.e. no other sub-page
@@ -1043,15 +965,15 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage,
for (i = 0; i < nr_pages; i++) {
page = pages[i];
set_page_private(page, (unsigned long)zspage);
- page->freelist = NULL;
+ page->index = 0;
if (i == 0) {
zspage->first_page = page;
SetPagePrivate(page);
if (unlikely(class->objs_per_zspage == 1 &&
class->pages_per_zspage == 1))
- SetPageHugeObject(page);
+ SetZsHugePage(zspage);
} else {
- prev_page->freelist = page;
+ prev_page->index = (unsigned long)page;
}
prev_page = page;
}
@@ -1071,7 +993,6 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
if (!zspage)
return NULL;
- memset(zspage, 0, sizeof(struct zspage));
zspage->magic = ZSPAGE_MAGIC;
migrate_lock_init(zspage);
@@ -1094,6 +1015,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
create_page_chain(class, zspage, pages);
init_zspage(class, zspage);
+ zspage->pool = pool;
return zspage;
}
@@ -1103,9 +1025,9 @@ static struct zspage *find_get_zspage(struct size_class *class)
int i;
struct zspage *zspage;
- for (i = ZS_ALMOST_FULL; i >= ZS_EMPTY; i--) {
+ for (i = ZS_INUSE_RATIO_99; i >= ZS_INUSE_RATIO_0; i--) {
zspage = list_first_entry_or_null(&class->fullness_list[i],
- struct zspage, list);
+ struct zspage, list);
if (zspage)
break;
}
@@ -1113,48 +1035,6 @@ static struct zspage *find_get_zspage(struct size_class *class)
return zspage;
}
-#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING
-static inline int __zs_cpu_up(struct mapping_area *area)
-{
- /*
- * Make sure we don't leak memory if a cpu UP notification
- * and zs_init() race and both call zs_cpu_up() on the same cpu
- */
- if (area->vm)
- return 0;
- area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
- if (!area->vm)
- return -ENOMEM;
- return 0;
-}
-
-static inline void __zs_cpu_down(struct mapping_area *area)
-{
- if (area->vm)
- free_vm_area(area->vm);
- area->vm = NULL;
-}
-
-static inline void *__zs_map_object(struct mapping_area *area,
- struct page *pages[2], int off, int size)
-{
- unsigned long addr = (unsigned long)area->vm->addr;
-
- BUG_ON(map_kernel_range(addr, PAGE_SIZE * 2, PAGE_KERNEL, pages) < 0);
- area->vm_addr = area->vm->addr;
- return area->vm_addr + off;
-}
-
-static inline void __zs_unmap_object(struct mapping_area *area,
- struct page *pages[2], int off, int size)
-{
- unsigned long addr = (unsigned long)area->vm_addr;
-
- unmap_kernel_range(addr, PAGE_SIZE * 2);
-}
-
-#else /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */
-
static inline int __zs_cpu_up(struct mapping_area *area)
{
/*
@@ -1235,8 +1115,6 @@ out:
pagefault_enable();
}
-#endif /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */
-
static int zs_cpu_prepare(unsigned int cpu)
{
struct mapping_area *area;
@@ -1269,6 +1147,27 @@ static bool zspage_full(struct size_class *class, struct zspage *zspage)
return get_zspage_inuse(zspage) == class->objs_per_zspage;
}
+/**
+ * zs_lookup_class_index() - Returns index of the zsmalloc &size_class
+ * that holds objects of the provided size.
+ * @pool: zsmalloc pool to use
+ * @size: object size
+ *
+ * Context: Any context.
+ *
+ * Return: the index of the zsmalloc &size_class that holds objects of the
+ * provided size.
+ */
+unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size)
+{
+ struct size_class *class;
+
+ class = pool->size_class[get_size_class_index(size)];
+
+ return class->index;
+}
+EXPORT_SYMBOL_GPL(zs_lookup_class_index);
+
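/*
 * Editor's usage sketch (hypothetical caller, not part of the patch):
 * a backend can bucket objects by size class without knowing zsmalloc
 * internals, e.g. to keep per-class counters; "nr_objs_per_class" is a
 * made-up array used only for illustration.
 *
 *   u32 idx = zs_lookup_class_index(pool, obj_len);
 *   nr_objs_per_class[idx]++;
 */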
unsigned long zs_get_total_pages(struct zs_pool *pool)
{
return atomic_long_read(&pool->pages_allocated);
@@ -1279,7 +1178,7 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages);
* zs_map_object - get address of allocated object from handle.
* @pool: pool from which the object was allocated
* @handle: handle returned from zs_malloc
- * @mm: maping mode to use
+ * @mm: mapping mode to use
*
* Before using an object allocated from zs_malloc, it must be mapped using
* this function. When done with the object, it must be unmapped using
@@ -1298,8 +1197,6 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
unsigned long obj, off;
unsigned int obj_idx;
- unsigned int class_idx;
- enum fullness_group fg;
struct size_class *class;
struct mapping_area *area;
struct page *pages[2];
@@ -1312,21 +1209,26 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
*/
BUG_ON(in_interrupt());
- /* From now on, migration cannot move the object */
- pin_tag(handle);
-
+ /* Guarantees the zspage can be obtained from the handle safely */
+ spin_lock(&pool->lock);
obj = handle_to_obj(handle);
obj_to_location(obj, &page, &obj_idx);
zspage = get_zspage(page);
- /* migration cannot move any subpage in this zspage */
+ /*
+ * Migration cannot move any zpages in this zspage. Holding pool->lock
+ * for the whole mapping would be too heavy, since callers may take a
+ * while before calling the zs_unmap_object API, so delegate the locking
+ * from the pool to the zspage, which is a smaller granularity.
+ */
migrate_read_lock(zspage);
+ spin_unlock(&pool->lock);
- get_zspage_mapping(zspage, &class_idx, &fg);
- class = pool->size_class[class_idx];
- off = (class->size * obj_idx) & ~PAGE_MASK;
+ class = zspage_class(pool, zspage);
+ off = offset_in_page(class->size * obj_idx);
- area = &get_cpu_var(zs_map_area);
+ local_lock(&zs_map_area.lock);
+ area = this_cpu_ptr(&zs_map_area);
area->vm_mm = mm;
if (off + class->size <= PAGE_SIZE) {
/* this object is contained entirely within a page */
@@ -1342,7 +1244,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
ret = __zs_map_object(area, pages, off, class->size);
out:
- if (likely(!PageHugeObject(page)))
+ if (likely(!ZsHugePage(zspage)))
ret += ZS_HANDLE_SIZE;
return ret;
@@ -1356,17 +1258,14 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
unsigned long obj, off;
unsigned int obj_idx;
- unsigned int class_idx;
- enum fullness_group fg;
struct size_class *class;
struct mapping_area *area;
obj = handle_to_obj(handle);
obj_to_location(obj, &page, &obj_idx);
zspage = get_zspage(page);
- get_zspage_mapping(zspage, &class_idx, &fg);
- class = pool->size_class[class_idx];
- off = (class->size * obj_idx) & ~PAGE_MASK;
+ class = zspage_class(pool, zspage);
+ off = offset_in_page(class->size * obj_idx);
area = this_cpu_ptr(&zs_map_area);
if (off + class->size <= PAGE_SIZE)
@@ -1380,10 +1279,9 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
__zs_unmap_object(area, pages, off, class->size);
}
- put_cpu_var(zs_map_area);
+ local_unlock(&zs_map_area.lock);
migrate_read_unlock(zspage);
- unpin_tag(handle);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);
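/*
 * Editor's usage sketch (illustrative only, not part of the patch): an
 * object must be mapped before it is accessed and unmapped as soon as
 * possible afterwards, since the mapping holds a per-CPU buffer and the
 * zspage's migration read lock. "src" and "len" are placeholders.
 *
 *   void *dst = zs_map_object(pool, handle, ZS_MM_WO);
 *   memcpy(dst, src, len);
 *   zs_unmap_object(pool, handle);
 */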
@@ -1406,23 +1304,25 @@ size_t zs_huge_class_size(struct zs_pool *pool)
}
EXPORT_SYMBOL_GPL(zs_huge_class_size);
-static unsigned long obj_malloc(struct size_class *class,
+static unsigned long obj_malloc(struct zs_pool *pool,
struct zspage *zspage, unsigned long handle)
{
int i, nr_page, offset;
unsigned long obj;
struct link_free *link;
+ struct size_class *class;
struct page *m_page;
unsigned long m_offset;
void *vaddr;
+ class = pool->size_class[zspage->class];
handle |= OBJ_ALLOCATED_TAG;
obj = get_freeobj(zspage);
offset = obj * class->size;
nr_page = offset >> PAGE_SHIFT;
- m_offset = offset & ~PAGE_MASK;
+ m_offset = offset_in_page(offset);
m_page = get_first_page(zspage);
for (i = 0; i < nr_page; i++)
@@ -1431,7 +1331,7 @@ static unsigned long obj_malloc(struct size_class *class,
vaddr = kmap_atomic(m_page);
link = (struct link_free *)vaddr + m_offset / sizeof(*link);
set_freeobj(zspage, link->next >> OBJ_TAG_BITS);
- if (likely(!PageHugeObject(m_page)))
+ if (likely(!ZsHugePage(zspage)))
/* record handle in the header of allocated chunk */
link->handle = handle;
else
@@ -1440,7 +1340,6 @@ static unsigned long obj_malloc(struct size_class *class,
kunmap_atomic(vaddr);
mod_zspage_inuse(zspage, 1);
- zs_stat_inc(class, OBJ_USED, 1);
obj = location_to_obj(m_page, obj);
@@ -1455,66 +1354,68 @@ static unsigned long obj_malloc(struct size_class *class,
* @gfp: gfp flags when allocating object
*
* On success, handle to the allocated object is returned,
- * otherwise 0.
+ * otherwise an ERR_PTR().
* Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
*/
unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
{
unsigned long handle, obj;
struct size_class *class;
- enum fullness_group newfg;
+ int newfg;
struct zspage *zspage;
if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
- return 0;
+ return (unsigned long)ERR_PTR(-EINVAL);
handle = cache_alloc_handle(pool, gfp);
if (!handle)
- return 0;
+ return (unsigned long)ERR_PTR(-ENOMEM);
/* extra space in chunk to keep the handle */
size += ZS_HANDLE_SIZE;
class = pool->size_class[get_size_class_index(size)];
- spin_lock(&class->lock);
+ /* pool->lock effectively protects the zpage migration */
+ spin_lock(&pool->lock);
zspage = find_get_zspage(class);
if (likely(zspage)) {
- obj = obj_malloc(class, zspage, handle);
+ obj = obj_malloc(pool, zspage, handle);
/* Now move the zspage to another fullness group, if required */
fix_fullness_group(class, zspage);
record_obj(handle, obj);
- spin_unlock(&class->lock);
+ class_stat_inc(class, ZS_OBJS_INUSE, 1);
- return handle;
+ goto out;
}
- spin_unlock(&class->lock);
+ spin_unlock(&pool->lock);
zspage = alloc_zspage(pool, class, gfp);
if (!zspage) {
cache_free_handle(pool, handle);
- return 0;
+ return (unsigned long)ERR_PTR(-ENOMEM);
}
- spin_lock(&class->lock);
- obj = obj_malloc(class, zspage, handle);
+ spin_lock(&pool->lock);
+ obj = obj_malloc(pool, zspage, handle);
newfg = get_fullness_group(class, zspage);
insert_zspage(class, zspage, newfg);
set_zspage_mapping(zspage, class->index, newfg);
record_obj(handle, obj);
- atomic_long_add(class->pages_per_zspage,
- &pool->pages_allocated);
- zs_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage);
+ atomic_long_add(class->pages_per_zspage, &pool->pages_allocated);
+ class_stat_inc(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
+ class_stat_inc(class, ZS_OBJS_INUSE, 1);
/* We completely set up zspage so mark them as movable */
SetZsPageMovable(pool, zspage);
- spin_unlock(&class->lock);
+out:
+ spin_unlock(&pool->lock);
return handle;
}
EXPORT_SYMBOL_GPL(zs_malloc);
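/*
 * Editor's caller-side sketch for the new return convention
 * (illustrative only, not part of the patch): a failed allocation now
 * returns an ERR_PTR()-encoded value instead of 0, so callers test the
 * handle with IS_ERR_VALUE() rather than checking for zero, as the
 * zs_zpool_malloc() change above does:
 *
 *   unsigned long handle = zs_malloc(pool, len, GFP_NOIO);
 *   if (IS_ERR_VALUE(handle))
 *           return PTR_ERR((void *)handle);
 */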
-static void obj_free(struct size_class *class, unsigned long obj)
+static void obj_free(int class_size, unsigned long obj)
{
struct link_free *link;
struct zspage *zspage;
@@ -1523,20 +1424,22 @@ static void obj_free(struct size_class *class, unsigned long obj)
unsigned int f_objidx;
void *vaddr;
- obj &= ~OBJ_ALLOCATED_TAG;
obj_to_location(obj, &f_page, &f_objidx);
- f_offset = (class->size * f_objidx) & ~PAGE_MASK;
+ f_offset = offset_in_page(class_size * f_objidx);
zspage = get_zspage(f_page);
vaddr = kmap_atomic(f_page);
+ link = (struct link_free *)(vaddr + f_offset);
/* Insert this object in containing zspage's freelist */
- link = (struct link_free *)(vaddr + f_offset);
- link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
- kunmap_atomic(vaddr);
+ if (likely(!ZsHugePage(zspage)))
+ link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
+ else
+ f_page->index = 0;
set_freeobj(zspage, f_objidx);
+
+ kunmap_atomic(vaddr);
mod_zspage_inuse(zspage, -1);
- zs_stat_dec(class, OBJ_USED, 1);
}
void zs_free(struct zs_pool *pool, unsigned long handle)
@@ -1544,42 +1447,30 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
struct zspage *zspage;
struct page *f_page;
unsigned long obj;
- unsigned int f_objidx;
- int class_idx;
struct size_class *class;
- enum fullness_group fullness;
- bool isolated;
+ int fullness;
- if (unlikely(!handle))
+ if (IS_ERR_OR_NULL((void *)handle))
return;
- pin_tag(handle);
+ /*
+ * The pool->lock protects against races with zpage migration,
+ * so it's safe to get the page from the handle.
+ */
+ spin_lock(&pool->lock);
obj = handle_to_obj(handle);
- obj_to_location(obj, &f_page, &f_objidx);
+ obj_to_page(obj, &f_page);
zspage = get_zspage(f_page);
+ class = zspage_class(pool, zspage);
- migrate_read_lock(zspage);
-
- get_zspage_mapping(zspage, &class_idx, &fullness);
- class = pool->size_class[class_idx];
+ class_stat_dec(class, ZS_OBJS_INUSE, 1);
+ obj_free(class->size, obj);
- spin_lock(&class->lock);
- obj_free(class, obj);
fullness = fix_fullness_group(class, zspage);
- if (fullness != ZS_EMPTY) {
- migrate_read_unlock(zspage);
- goto out;
- }
-
- isolated = is_zspage_isolated(zspage);
- migrate_read_unlock(zspage);
- /* If zspage is isolated, zs_page_putback will free the zspage */
- if (likely(!isolated))
+ if (fullness == ZS_INUSE_RATIO_0)
free_zspage(pool, class, zspage);
-out:
- spin_unlock(&class->lock);
- unpin_tag(handle);
+ spin_unlock(&pool->lock);
cache_free_handle(pool, handle);
}
EXPORT_SYMBOL_GPL(zs_free);
@@ -1599,8 +1490,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
obj_to_location(src, &s_page, &s_objidx);
obj_to_location(dst, &d_page, &d_objidx);
- s_off = (class->size * s_objidx) & ~PAGE_MASK;
- d_off = (class->size * d_objidx) & ~PAGE_MASK;
+ s_off = offset_in_page(class->size * s_objidx);
+ d_off = offset_in_page(class->size * d_objidx);
if (s_off + class->size > PAGE_SIZE)
s_size = PAGE_SIZE - s_off;
@@ -1624,6 +1515,13 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
d_off += size;
d_size -= size;
+ /*
+ * Calling kunmap_atomic(d_addr) is necessary. kunmap_atomic()
+ * calls must occur in reverse order of calls to kmap_atomic().
+ * So, to call kunmap_atomic(s_addr) we should first call
+ * kunmap_atomic(d_addr). For more details see
+ * Documentation/mm/highmem.rst.
+ */
if (s_off >= PAGE_SIZE) {
kunmap_atomic(d_addr);
kunmap_atomic(s_addr);
@@ -1648,14 +1546,13 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
}
/*
- * Find alloced object in zspage from index object and
+ * Find object with a certain tag in zspage from index object and
* return handle.
*/
-static unsigned long find_alloced_obj(struct size_class *class,
- struct page *page, int *obj_idx)
+static unsigned long find_tagged_obj(struct size_class *class,
+ struct page *page, int *obj_idx, int tag)
{
- unsigned long head;
- int offset = 0;
+ unsigned int offset;
int index = *obj_idx;
unsigned long handle = 0;
void *addr = kmap_atomic(page);
@@ -1664,13 +1561,8 @@ static unsigned long find_alloced_obj(struct size_class *class,
offset += class->size * index;
while (offset < PAGE_SIZE) {
- head = obj_to_head(page, addr + offset);
- if (head & OBJ_ALLOCATED_TAG) {
- handle = head & ~OBJ_ALLOCATED_TAG;
- if (trypin_tag(handle))
- break;
- handle = 0;
- }
+ if (obj_tagged(page, addr + offset, &handle, tag))
+ break;
offset += class->size;
index++;
@@ -1683,6 +1575,16 @@ static unsigned long find_alloced_obj(struct size_class *class,
return handle;
}
+/*
+ * Find alloced object in zspage from index object and
+ * return handle.
+ */
+static unsigned long find_alloced_obj(struct size_class *class,
+ struct page *page, int *obj_idx)
+{
+ return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG);
+}
+
struct zs_compact_control {
/* Source spage for migration which could be a subpage of zspage */
struct page *s_page;
@@ -1694,15 +1596,14 @@ struct zs_compact_control {
int obj_idx;
};
-static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
- struct zs_compact_control *cc)
+static void migrate_zspage(struct zs_pool *pool, struct size_class *class,
+ struct zs_compact_control *cc)
{
unsigned long used_obj, free_obj;
unsigned long handle;
struct page *s_page = cc->s_page;
struct page *d_page = cc->d_page;
int obj_idx = cc->obj_idx;
- int ret = 0;
while (1) {
handle = find_alloced_obj(class, s_page, &obj_idx);
@@ -1715,52 +1616,49 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
}
/* Stop if there is no more space */
- if (zspage_full(class, get_zspage(d_page))) {
- unpin_tag(handle);
- ret = -ENOMEM;
+ if (zspage_full(class, get_zspage(d_page)))
break;
- }
used_obj = handle_to_obj(handle);
- free_obj = obj_malloc(class, get_zspage(d_page), handle);
+ free_obj = obj_malloc(pool, get_zspage(d_page), handle);
zs_object_copy(class, free_obj, used_obj);
obj_idx++;
- /*
- * record_obj updates handle's value to free_obj and it will
- * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which
- * breaks synchronization using pin_tag(e,g, zs_free) so
- * let's keep the lock bit.
- */
- free_obj |= BIT(HANDLE_PIN_BIT);
record_obj(handle, free_obj);
- unpin_tag(handle);
- obj_free(class, used_obj);
+ obj_free(class->size, used_obj);
}
/* Remember last position in this iteration */
cc->s_page = s_page;
cc->obj_idx = obj_idx;
-
- return ret;
}
-static struct zspage *isolate_zspage(struct size_class *class, bool source)
+static struct zspage *isolate_src_zspage(struct size_class *class)
{
- int i;
struct zspage *zspage;
- enum fullness_group fg[2] = {ZS_ALMOST_EMPTY, ZS_ALMOST_FULL};
+ int fg;
- if (!source) {
- fg[0] = ZS_ALMOST_FULL;
- fg[1] = ZS_ALMOST_EMPTY;
+ for (fg = ZS_INUSE_RATIO_10; fg <= ZS_INUSE_RATIO_99; fg++) {
+ zspage = list_first_entry_or_null(&class->fullness_list[fg],
+ struct zspage, list);
+ if (zspage) {
+ remove_zspage(class, zspage, fg);
+ return zspage;
+ }
}
- for (i = 0; i < 2; i++) {
- zspage = list_first_entry_or_null(&class->fullness_list[fg[i]],
- struct zspage, list);
+ return zspage;
+}
+
+static struct zspage *isolate_dst_zspage(struct size_class *class)
+{
+ struct zspage *zspage;
+ int fg;
+
+ for (fg = ZS_INUSE_RATIO_99; fg >= ZS_INUSE_RATIO_10; fg--) {
+ zspage = list_first_entry_or_null(&class->fullness_list[fg],
+ struct zspage, list);
if (zspage) {
- VM_BUG_ON(is_zspage_isolated(zspage));
- remove_zspage(class, zspage, fg[i]);
+ remove_zspage(class, zspage, fg);
return zspage;
}
}
@@ -1773,14 +1671,11 @@ static struct zspage *isolate_zspage(struct size_class *class, bool source)
* @class: destination class
* @zspage: target page
*
- * Return @zspage's fullness_group
+ * Return @zspage's fullness status
*/
-static enum fullness_group putback_zspage(struct size_class *class,
- struct zspage *zspage)
+static int putback_zspage(struct size_class *class, struct zspage *zspage)
{
- enum fullness_group fullness;
-
- VM_BUG_ON(is_zspage_isolated(zspage));
+ int fullness;
fullness = get_fullness_group(class, zspage);
insert_zspage(class, zspage, fullness);
@@ -1796,39 +1691,42 @@ static enum fullness_group putback_zspage(struct size_class *class,
*/
static void lock_zspage(struct zspage *zspage)
{
- struct page *page = get_first_page(zspage);
+ struct page *curr_page, *page;
- do {
- lock_page(page);
- } while ((page = get_next_page(page)) != NULL);
-}
-
-static int zs_init_fs_context(struct fs_context *fc)
-{
- return init_pseudo(fc, ZSMALLOC_MAGIC) ? 0 : -ENOMEM;
-}
-
-static struct file_system_type zsmalloc_fs = {
- .name = "zsmalloc",
- .init_fs_context = zs_init_fs_context,
- .kill_sb = kill_anon_super,
-};
-
-static int zsmalloc_mount(void)
-{
- int ret = 0;
-
- zsmalloc_mnt = kern_mount(&zsmalloc_fs);
- if (IS_ERR(zsmalloc_mnt))
- ret = PTR_ERR(zsmalloc_mnt);
-
- return ret;
-}
+ /*
+ * Pages we haven't locked yet can be migrated off the list while we're
+ * trying to lock them, so we need to be careful and only attempt to
+ * lock each page under migrate_read_lock(). Otherwise, the page we lock
+ * may no longer belong to the zspage. This means that we may wait for
+ * the wrong page to unlock, so we must take a reference to the page
+ * prior to waiting for it to unlock outside migrate_read_lock().
+ */
+ while (1) {
+ migrate_read_lock(zspage);
+ page = get_first_page(zspage);
+ if (trylock_page(page))
+ break;
+ get_page(page);
+ migrate_read_unlock(zspage);
+ wait_on_page_locked(page);
+ put_page(page);
+ }
-static void zsmalloc_unmount(void)
-{
- kern_unmount(zsmalloc_mnt);
+ curr_page = page;
+ while ((page = get_next_page(curr_page))) {
+ if (trylock_page(page)) {
+ curr_page = page;
+ } else {
+ get_page(page);
+ migrate_read_unlock(zspage);
+ wait_on_page_locked(page);
+ put_page(page);
+ migrate_read_lock(zspage);
+ }
+ }
+ migrate_read_unlock(zspage);
}
+#endif /* CONFIG_COMPACTION */
static void migrate_lock_init(struct zspage *zspage)
{
@@ -1845,11 +1743,17 @@ static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
read_unlock(&zspage->lock);
}
+#ifdef CONFIG_COMPACTION
static void migrate_write_lock(struct zspage *zspage)
{
write_lock(&zspage->lock);
}
+static void migrate_write_lock_nested(struct zspage *zspage)
+{
+ write_lock_nested(&zspage->lock, SINGLE_DEPTH_NESTING);
+}
+
static void migrate_write_unlock(struct zspage *zspage)
{
write_unlock(&zspage->lock);
@@ -1863,33 +1767,11 @@ static void inc_zspage_isolation(struct zspage *zspage)
static void dec_zspage_isolation(struct zspage *zspage)
{
+ VM_BUG_ON(zspage->isolated == 0);
zspage->isolated--;
}
-static void putback_zspage_deferred(struct zs_pool *pool,
- struct size_class *class,
- struct zspage *zspage)
-{
- enum fullness_group fg;
-
- fg = putback_zspage(class, zspage);
- if (fg == ZS_EMPTY)
- schedule_work(&pool->free_work);
-
-}
-
-static inline void zs_pool_dec_isolated(struct zs_pool *pool)
-{
- VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
- atomic_long_dec(&pool->isolated_pages);
- /*
- * There's no possibility of racing, since wait_for_isolated_drain()
- * checks the isolated count under &class->lock after enqueuing
- * on migration_wait.
- */
- if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
- wake_up_all(&pool->migration_wait);
-}
+static const struct movable_operations zsmalloc_mops;
static void replace_sub_page(struct size_class *class, struct zspage *zspage,
struct page *newpage, struct page *oldpage)
@@ -1909,82 +1791,43 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage,
create_page_chain(class, zspage, pages);
set_first_obj_offset(newpage, get_first_obj_offset(oldpage));
- if (unlikely(PageHugeObject(oldpage)))
+ if (unlikely(ZsHugePage(zspage)))
newpage->index = oldpage->index;
- __SetPageMovable(newpage, page_mapping(oldpage));
+ __SetPageMovable(newpage, &zsmalloc_mops);
}
static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
{
struct zs_pool *pool;
- struct size_class *class;
- int class_idx;
- enum fullness_group fullness;
struct zspage *zspage;
- struct address_space *mapping;
/*
* Page is locked so zspage couldn't be destroyed. For detail, look at
* lock_zspage in free_zspage.
*/
- VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(PageIsolated(page), page);
zspage = get_zspage(page);
-
- /*
- * Without class lock, fullness could be stale while class_idx is okay
- * because class_idx is constant unless page is freed so we should get
- * fullness again under class lock.
- */
- get_zspage_mapping(zspage, &class_idx, &fullness);
- mapping = page_mapping(page);
- pool = mapping->private_data;
- class = pool->size_class[class_idx];
-
- spin_lock(&class->lock);
- if (get_zspage_inuse(zspage) == 0) {
- spin_unlock(&class->lock);
- return false;
- }
-
- /* zspage is isolated for object migration */
- if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
- spin_unlock(&class->lock);
- return false;
- }
-
- /*
- * If this is first time isolation for the zspage, isolate zspage from
- * size_class to prevent further object allocation from the zspage.
- */
- if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
- get_zspage_mapping(zspage, &class_idx, &fullness);
- atomic_long_inc(&pool->isolated_pages);
- remove_zspage(class, zspage, fullness);
- }
-
+ pool = zspage->pool;
+ spin_lock(&pool->lock);
inc_zspage_isolation(zspage);
- spin_unlock(&class->lock);
+ spin_unlock(&pool->lock);
return true;
}
-static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
- struct page *page, enum migrate_mode mode)
+static int zs_page_migrate(struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
struct zs_pool *pool;
struct size_class *class;
- int class_idx;
- enum fullness_group fullness;
struct zspage *zspage;
struct page *dummy;
void *s_addr, *d_addr, *addr;
- int offset, pos;
- unsigned long handle, head;
+ unsigned int offset;
+ unsigned long handle;
unsigned long old_obj, new_obj;
unsigned int obj_idx;
- int ret = -EAGAIN;
/*
* We cannot support the _NO_COPY case here, because copy needs to
@@ -1994,38 +1837,24 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
if (mode == MIGRATE_SYNC_NO_COPY)
return -EINVAL;
- VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
+ /* The page is locked, so this pointer must remain valid */
zspage = get_zspage(page);
+ pool = zspage->pool;
- /* Concurrent compactor cannot migrate any subpage in zspage */
- migrate_write_lock(zspage);
- get_zspage_mapping(zspage, &class_idx, &fullness);
- pool = mapping->private_data;
- class = pool->size_class[class_idx];
- offset = get_first_obj_offset(page);
+ /*
+ * The pool's lock protects the race between zpage migration
+ * and zs_free.
+ */
+ spin_lock(&pool->lock);
+ class = zspage_class(pool, zspage);
- spin_lock(&class->lock);
- if (!get_zspage_inuse(zspage)) {
- /*
- * Set "offset" to end of the page so that every loops
- * skips unnecessary object scanning.
- */
- offset = PAGE_SIZE;
- }
+ /* the migrate_write_lock protects zpage access via zs_map_object */
+ migrate_write_lock(zspage);
- pos = offset;
+ offset = get_first_obj_offset(page);
s_addr = kmap_atomic(page);
- while (pos < PAGE_SIZE) {
- head = obj_to_head(page, s_addr + pos);
- if (head & OBJ_ALLOCATED_TAG) {
- handle = head & ~OBJ_ALLOCATED_TAG;
- if (!trypin_tag(handle))
- goto unpin_objects;
- }
- pos += class->size;
- }
/*
 * Here, no user can access any object in the zspage, so let's move.
@@ -2034,43 +1863,29 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
memcpy(d_addr, s_addr, PAGE_SIZE);
kunmap_atomic(d_addr);
- for (addr = s_addr + offset; addr < s_addr + pos;
+ for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE;
addr += class->size) {
- head = obj_to_head(page, addr);
- if (head & OBJ_ALLOCATED_TAG) {
- handle = head & ~OBJ_ALLOCATED_TAG;
- if (!testpin_tag(handle))
- BUG();
+ if (obj_allocated(page, addr, &handle)) {
old_obj = handle_to_obj(handle);
obj_to_location(old_obj, &dummy, &obj_idx);
new_obj = (unsigned long)location_to_obj(newpage,
obj_idx);
- new_obj |= BIT(HANDLE_PIN_BIT);
record_obj(handle, new_obj);
}
}
+ kunmap_atomic(s_addr);
replace_sub_page(class, zspage, newpage, page);
- get_page(newpage);
-
dec_zspage_isolation(zspage);
-
/*
- * Page migration is done so let's putback isolated zspage to
- * the list if @page is final isolated subpage in the zspage.
+ * Since the data copy is complete and the new zspage structure is
+ * set up, it's okay to release the pool's lock.
*/
- if (!is_zspage_isolated(zspage)) {
- /*
- * We cannot race with zs_destroy_pool() here because we wait
- * for isolation to hit zero before we start destroying.
- * Also, we ensure that everyone can see pool->destroying before
- * we start waiting.
- */
- putback_zspage_deferred(pool, class, zspage);
- zs_pool_dec_isolated(pool);
- }
+ spin_unlock(&pool->lock);
+ migrate_write_unlock(zspage);
+ get_page(newpage);
if (page_zone(newpage) != page_zone(page)) {
dec_zone_page_state(page, NR_ZSPAGES);
inc_zone_page_state(newpage, NR_ZSPAGES);
@@ -2078,111 +1893,30 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
reset_page(page);
put_page(page);
- page = newpage;
-
- ret = MIGRATEPAGE_SUCCESS;
-unpin_objects:
- for (addr = s_addr + offset; addr < s_addr + pos;
- addr += class->size) {
- head = obj_to_head(page, addr);
- if (head & OBJ_ALLOCATED_TAG) {
- handle = head & ~OBJ_ALLOCATED_TAG;
- if (!testpin_tag(handle))
- BUG();
- unpin_tag(handle);
- }
- }
- kunmap_atomic(s_addr);
- spin_unlock(&class->lock);
- migrate_write_unlock(zspage);
- return ret;
+ return MIGRATEPAGE_SUCCESS;
}
static void zs_page_putback(struct page *page)
{
struct zs_pool *pool;
- struct size_class *class;
- int class_idx;
- enum fullness_group fg;
- struct address_space *mapping;
struct zspage *zspage;
- VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
zspage = get_zspage(page);
- get_zspage_mapping(zspage, &class_idx, &fg);
- mapping = page_mapping(page);
- pool = mapping->private_data;
- class = pool->size_class[class_idx];
-
- spin_lock(&class->lock);
+ pool = zspage->pool;
+ spin_lock(&pool->lock);
dec_zspage_isolation(zspage);
- if (!is_zspage_isolated(zspage)) {
- /*
- * Due to page_lock, we cannot free zspage immediately
- * so let's defer.
- */
- putback_zspage_deferred(pool, class, zspage);
- zs_pool_dec_isolated(pool);
- }
- spin_unlock(&class->lock);
+ spin_unlock(&pool->lock);
}
-static const struct address_space_operations zsmalloc_aops = {
+static const struct movable_operations zsmalloc_mops = {
.isolate_page = zs_page_isolate,
- .migratepage = zs_page_migrate,
+ .migrate_page = zs_page_migrate,
.putback_page = zs_page_putback,
};
-static int zs_register_migration(struct zs_pool *pool)
-{
- pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb);
- if (IS_ERR(pool->inode)) {
- pool->inode = NULL;
- return 1;
- }
-
- pool->inode->i_mapping->private_data = pool;
- pool->inode->i_mapping->a_ops = &zsmalloc_aops;
- return 0;
-}
-
-static bool pool_isolated_are_drained(struct zs_pool *pool)
-{
- return atomic_long_read(&pool->isolated_pages) == 0;
-}
-
-/* Function for resolving migration */
-static void wait_for_isolated_drain(struct zs_pool *pool)
-{
-
- /*
- * We're in the process of destroying the pool, so there are no
- * active allocations. zs_page_isolate() fails for completely free
- * zspages, so we need only wait for the zs_pool's isolated
- * count to hit zero.
- */
- wait_event(pool->migration_wait,
- pool_isolated_are_drained(pool));
-}
-
-static void zs_unregister_migration(struct zs_pool *pool)
-{
- pool->destroying = true;
- /*
- * We need a memory barrier here to ensure global visibility of
- * pool->destroying. Thus pool->isolated pages will either be 0 in which
- * case we don't care, or it will be > 0 and pool->destroying will
- * ensure that we wake up once isolation hits 0.
- */
- smp_mb();
- wait_for_isolated_drain(pool); /* This can block */
- flush_work(&pool->free_work);
- iput(pool->inode);
-}
-
/*
* Caller should hold page_lock of all pages in the zspage
* In here, we cannot use zspage meta data.
@@ -2192,7 +1926,7 @@ static void async_free_zspage(struct work_struct *work)
int i;
struct size_class *class;
unsigned int class_idx;
- enum fullness_group fullness;
+ int fullness;
struct zspage *zspage, *tmp;
LIST_HEAD(free_pages);
struct zs_pool *pool = container_of(work, struct zs_pool,
@@ -2203,22 +1937,22 @@ static void async_free_zspage(struct work_struct *work)
if (class->index != i)
continue;
- spin_lock(&class->lock);
- list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages);
- spin_unlock(&class->lock);
+ spin_lock(&pool->lock);
+ list_splice_init(&class->fullness_list[ZS_INUSE_RATIO_0],
+ &free_pages);
+ spin_unlock(&pool->lock);
}
-
list_for_each_entry_safe(zspage, tmp, &free_pages, list) {
list_del(&zspage->list);
lock_zspage(zspage);
get_zspage_mapping(zspage, &class_idx, &fullness);
- VM_BUG_ON(fullness != ZS_EMPTY);
+ VM_BUG_ON(fullness != ZS_INUSE_RATIO_0);
class = pool->size_class[class_idx];
- spin_lock(&class->lock);
- __free_zspage(pool, pool->size_class[class_idx], zspage);
- spin_unlock(&class->lock);
+ spin_lock(&pool->lock);
+ __free_zspage(pool, class, zspage);
+ spin_unlock(&pool->lock);
}
};
@@ -2227,6 +1961,11 @@ static void kick_deferred_free(struct zs_pool *pool)
schedule_work(&pool->free_work);
}
+static void zs_flush_migration(struct zs_pool *pool)
+{
+ flush_work(&pool->free_work);
+}
+
static void init_deferred_free(struct zs_pool *pool)
{
INIT_WORK(&pool->free_work, async_free_zspage);
@@ -2238,10 +1977,12 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
do {
WARN_ON(!trylock_page(page));
- __SetPageMovable(page, pool->inode->i_mapping);
+ __SetPageMovable(page, &zsmalloc_mops);
unlock_page(page);
} while ((page = get_next_page(page)) != NULL);
}
+#else
+static inline void zs_flush_migration(struct zs_pool *pool) { }
#endif
/*
@@ -2252,8 +1993,8 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
static unsigned long zs_can_compact(struct size_class *class)
{
unsigned long obj_wasted;
- unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
- unsigned long obj_used = zs_stat_get(class, OBJ_USED);
+ unsigned long obj_allocated = zs_stat_get(class, ZS_OBJS_ALLOCATED);
+ unsigned long obj_used = zs_stat_get(class, ZS_OBJS_INUSE);
if (obj_allocated <= obj_used)
return 0;
@@ -2264,68 +2005,99 @@ static unsigned long zs_can_compact(struct size_class *class)
return obj_wasted * class->pages_per_zspage;
}
-static void __zs_compact(struct zs_pool *pool, struct size_class *class)
+static unsigned long __zs_compact(struct zs_pool *pool,
+ struct size_class *class)
{
struct zs_compact_control cc;
- struct zspage *src_zspage;
+ struct zspage *src_zspage = NULL;
struct zspage *dst_zspage = NULL;
+ unsigned long pages_freed = 0;
+
+ /*
+ * Protect against races between zpage migration and zs_free,
+ * as well as zpage allocation/free.
+ */
+ spin_lock(&pool->lock);
+ while (zs_can_compact(class)) {
+ int fg;
- spin_lock(&class->lock);
- while ((src_zspage = isolate_zspage(class, true))) {
+ if (!dst_zspage) {
+ dst_zspage = isolate_dst_zspage(class);
+ if (!dst_zspage)
+ break;
+ migrate_write_lock(dst_zspage);
+ cc.d_page = get_first_page(dst_zspage);
+ }
- if (!zs_can_compact(class))
+ src_zspage = isolate_src_zspage(class);
+ if (!src_zspage)
break;
+ migrate_write_lock_nested(src_zspage);
+
cc.obj_idx = 0;
cc.s_page = get_first_page(src_zspage);
+ migrate_zspage(pool, class, &cc);
+ fg = putback_zspage(class, src_zspage);
+ migrate_write_unlock(src_zspage);
- while ((dst_zspage = isolate_zspage(class, false))) {
- cc.d_page = get_first_page(dst_zspage);
- /*
- * If there is no more space in dst_page, resched
- * and see if anyone had allocated another zspage.
- */
- if (!migrate_zspage(pool, class, &cc))
- break;
-
- putback_zspage(class, dst_zspage);
+ if (fg == ZS_INUSE_RATIO_0) {
+ free_zspage(pool, class, src_zspage);
+ pages_freed += class->pages_per_zspage;
}
+ src_zspage = NULL;
- /* Stop if we couldn't find slot */
- if (dst_zspage == NULL)
- break;
+ if (get_fullness_group(class, dst_zspage) == ZS_INUSE_RATIO_100
+ || spin_is_contended(&pool->lock)) {
+ putback_zspage(class, dst_zspage);
+ migrate_write_unlock(dst_zspage);
+ dst_zspage = NULL;
- putback_zspage(class, dst_zspage);
- if (putback_zspage(class, src_zspage) == ZS_EMPTY) {
- free_zspage(pool, class, src_zspage);
- pool->stats.pages_compacted += class->pages_per_zspage;
+ spin_unlock(&pool->lock);
+ cond_resched();
+ spin_lock(&pool->lock);
}
- spin_unlock(&class->lock);
- cond_resched();
- spin_lock(&class->lock);
}
- if (src_zspage)
+ if (src_zspage) {
putback_zspage(class, src_zspage);
+ migrate_write_unlock(src_zspage);
+ }
+
+ if (dst_zspage) {
+ putback_zspage(class, dst_zspage);
+ migrate_write_unlock(dst_zspage);
+ }
+ spin_unlock(&pool->lock);
- spin_unlock(&class->lock);
+ return pages_freed;
}
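/*
 * Editor's note -- simplified locking-order sketch for the loop above
 * (illustrative only, not part of the patch). Both zspage rwlocks share
 * a lockdep class, so the second write lock is taken with the nested
 * variant:
 *
 *   spin_lock(&pool->lock);
 *   migrate_write_lock(dst_zspage);           first zspage lock
 *   migrate_write_lock_nested(src_zspage);    SINGLE_DEPTH_NESTING
 *       ... migrate_zspage(pool, class, &cc) ...
 *   migrate_write_unlock(src_zspage);
 *   migrate_write_unlock(dst_zspage);
 *   spin_unlock(&pool->lock);
 */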
unsigned long zs_compact(struct zs_pool *pool)
{
int i;
struct size_class *class;
+ unsigned long pages_freed = 0;
+
+ /*
+ * Pool compaction is performed under pool->lock so it is basically
+ * single-threaded. Having more than one thread in __zs_compact()
+ * will increase pool->lock contention, which will impact other
+ * zsmalloc operations that need pool->lock.
+ */
+ if (atomic_xchg(&pool->compaction_in_progress, 1))
+ return 0;
for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
class = pool->size_class[i];
- if (!class)
- continue;
if (class->index != i)
continue;
- __zs_compact(pool, class);
+ pages_freed += __zs_compact(pool, class);
}
+ atomic_long_add(pages_freed, &pool->stats.pages_compacted);
+ atomic_set(&pool->compaction_in_progress, 0);
- return pool->stats.pages_compacted;
+ return pages_freed;
}
EXPORT_SYMBOL_GPL(zs_compact);
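/*
 * Editor's note (illustrative only, not part of the patch): the
 * compaction_in_progress flag above implements a simple single-flight
 * guard. The first caller atomically flips 0 -> 1 and proceeds, any
 * concurrent caller reads back 1 and returns immediately, and the flag
 * is cleared once compaction is done. "do_the_compaction" is a
 * placeholder for the per-class loop above:
 *
 *   if (atomic_xchg(&pool->compaction_in_progress, 1))
 *           return 0;                        someone else is compacting
 *   pages_freed = do_the_compaction(pool);
 *   atomic_set(&pool->compaction_in_progress, 0);
 */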
@@ -2342,13 +2114,12 @@ static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
struct zs_pool *pool = container_of(shrinker, struct zs_pool,
shrinker);
- pages_freed = pool->stats.pages_compacted;
/*
* Compact classes and calculate compaction delta.
* Can run concurrently with a manually triggered
* (by user) compaction.
*/
- pages_freed = zs_compact(pool) - pages_freed;
+ pages_freed = zs_compact(pool);
return pages_freed ? pages_freed : SHRINK_STOP;
}
@@ -2364,8 +2135,6 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker,
for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
class = pool->size_class[i];
- if (!class)
- continue;
if (class->index != i)
continue;
@@ -2387,7 +2156,29 @@ static int zs_register_shrinker(struct zs_pool *pool)
pool->shrinker.batch = 0;
pool->shrinker.seeks = DEFAULT_SEEKS;
- return register_shrinker(&pool->shrinker);
+ return register_shrinker(&pool->shrinker, "mm-zspool:%s",
+ pool->name);
+}
+
+static int calculate_zspage_chain_size(int class_size)
+{
+ int i, min_waste = INT_MAX;
+ int chain_size = 1;
+
+ if (is_power_of_2(class_size))
+ return chain_size;
+
+ for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
+ int waste;
+
+ waste = (i * PAGE_SIZE) % class_size;
+ if (waste < min_waste) {
+ min_waste = waste;
+ chain_size = i;
+ }
+ }
+
+ return chain_size;
}
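/*
 * Editor's worked example (illustrative only, not part of the patch),
 * assuming 4KiB pages and CONFIG_ZSMALLOC_CHAIN_SIZE >= 3: for a
 * class_size of 1536 bytes (3/8 of a page) the per-chain waste is
 *
 *   1 page  :  4096 % 1536 = 1024 bytes wasted
 *   2 pages :  8192 % 1536 =  512 bytes wasted
 *   3 pages : 12288 % 1536 =    0 bytes wasted
 *
 * so a chain of 3 pages is chosen, fitting exactly 8 objects.
 * Power-of-two class sizes short-circuit to a single page.
 */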
/**
@@ -2411,15 +2202,13 @@ struct zs_pool *zs_create_pool(const char *name)
return NULL;
init_deferred_free(pool);
+ spin_lock_init(&pool->lock);
+ atomic_set(&pool->compaction_in_progress, 0);
pool->name = kstrdup(name, GFP_KERNEL);
if (!pool->name)
goto err;
-#ifdef CONFIG_COMPACTION
- init_waitqueue_head(&pool->migration_wait);
-#endif
-
if (create_cache(pool))
goto err;
@@ -2432,12 +2221,12 @@ struct zs_pool *zs_create_pool(const char *name)
int pages_per_zspage;
int objs_per_zspage;
struct size_class *class;
- int fullness = 0;
+ int fullness;
size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
if (size > ZS_MAX_ALLOC_SIZE)
size = ZS_MAX_ALLOC_SIZE;
- pages_per_zspage = get_pages_per_zspage(size);
+ pages_per_zspage = calculate_zspage_chain_size(size);
objs_per_zspage = pages_per_zspage * PAGE_SIZE / size;
/*
@@ -2485,11 +2274,13 @@ struct zs_pool *zs_create_pool(const char *name)
class->index = i;
class->pages_per_zspage = pages_per_zspage;
class->objs_per_zspage = objs_per_zspage;
- spin_lock_init(&class->lock);
pool->size_class[i] = class;
- for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;
- fullness++)
+
+ fullness = ZS_INUSE_RATIO_0;
+ while (fullness < NR_FULLNESS_GROUPS) {
INIT_LIST_HEAD(&class->fullness_list[fullness]);
+ fullness++;
+ }
prev_class = class;
}
@@ -2497,9 +2288,6 @@ struct zs_pool *zs_create_pool(const char *name)
/* debug only, don't abort if it fails */
zs_pool_stat_create(pool, name);
- if (zs_register_migration(pool))
- goto err;
-
/*
* Not critical since shrinker is only used to trigger internal
* defragmentation of the pool which is pretty optional thing. If
@@ -2521,7 +2309,7 @@ void zs_destroy_pool(struct zs_pool *pool)
int i;
zs_unregister_shrinker(pool);
- zs_unregister_migration(pool);
+ zs_flush_migration(pool);
zs_pool_stat_destroy(pool);
for (i = 0; i < ZS_SIZE_CLASSES; i++) {
@@ -2534,11 +2322,12 @@ void zs_destroy_pool(struct zs_pool *pool)
if (class->index != i)
continue;
- for (fg = ZS_EMPTY; fg < NR_ZS_FULLNESS; fg++) {
- if (!list_empty(&class->fullness_list[fg])) {
- pr_info("Freeing non-empty class with size %db, fullness group %d\n",
- class->size, fg);
- }
+ for (fg = ZS_INUSE_RATIO_0; fg < NR_FULLNESS_GROUPS; fg++) {
+ if (list_empty(&class->fullness_list[fg]))
+ continue;
+
+ pr_err("Class-%d fullness group %d is not empty\n",
+ class->size, fg);
}
kfree(class);
}
@@ -2553,14 +2342,10 @@ static int __init zs_init(void)
{
int ret;
- ret = zsmalloc_mount();
- if (ret)
- goto out;
-
ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare",
zs_cpu_prepare, zs_cpu_dead);
if (ret)
- goto hp_setup_fail;
+ goto out;
#ifdef CONFIG_ZPOOL
zpool_register_driver(&zs_zpool_driver);
@@ -2570,8 +2355,6 @@ static int __init zs_init(void)
return 0;
-hp_setup_fail:
- zsmalloc_unmount();
out:
return ret;
}
@@ -2581,7 +2364,6 @@ static void __exit zs_exit(void)
#ifdef CONFIG_ZPOOL
zpool_unregister_driver(&zs_zpool_driver);
#endif
- zsmalloc_unmount();
cpuhp_remove_state(CPUHP_MM_ZS_PREPARE);
zs_stat_exit();
diff --git a/mm/zswap.c b/mm/zswap.c
index fbb782924ccc..62195f72bf56 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -24,8 +24,10 @@
#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
+#include <linux/scatterlist.h>
#include <linux/mempool.h>
#include <linux/zpool.h>
+#include <crypto/acompress.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
@@ -34,13 +36,16 @@
#include <linux/pagemap.h>
#include <linux/workqueue.h>
+#include "swap.h"
+#include "internal.h"
+
/*********************************
* statistics
**********************************/
/* Total bytes used by the compressed storage */
-static u64 zswap_pool_total_size;
+u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
-static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
+atomic_t zswap_stored_pages = ATOMIC_INIT(0);
/* The number of same-value filled pages currently stored in zswap */
static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);
@@ -77,11 +82,13 @@ static bool zswap_pool_reached_full;
#define ZSWAP_PARAM_UNSET ""
+static int zswap_setup(void);
+
/* Enable/disable zswap */
static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
static int zswap_enabled_param_set(const char *,
const struct kernel_param *);
-static struct kernel_param_ops zswap_enabled_param_ops = {
+static const struct kernel_param_ops zswap_enabled_param_ops = {
.set = zswap_enabled_param_set,
.get = param_get_bool,
};
@@ -91,7 +98,7 @@ module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);
static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
static int zswap_compressor_param_set(const char *,
const struct kernel_param *);
-static struct kernel_param_ops zswap_compressor_param_ops = {
+static const struct kernel_param_ops zswap_compressor_param_ops = {
.set = zswap_compressor_param_set,
.get = param_get_charp,
.free = param_free_charp,
@@ -102,7 +109,7 @@ module_param_cb(compressor, &zswap_compressor_param_ops,
/* Compressed storage zpool to use */
static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
static int zswap_zpool_param_set(const char *, const struct kernel_param *);
-static struct kernel_param_ops zswap_zpool_param_ops = {
+static const struct kernel_param_ops zswap_zpool_param_ops = {
.set = zswap_zpool_param_set,
.get = param_get_charp,
.free = param_free_charp,
@@ -118,24 +125,52 @@ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
uint, 0644);
-/* Enable/disable handling same-value filled pages (enabled by default) */
+/*
+ * Enable/disable handling same-value filled pages (enabled by default).
+ * If disabled every page is considered non-same-value filled.
+ */
static bool zswap_same_filled_pages_enabled = true;
module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
bool, 0644);
+/* Enable/disable handling non-same-value filled pages (enabled by default) */
+static bool zswap_non_same_filled_pages_enabled = true;
+module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
+ bool, 0644);
+
+static bool zswap_exclusive_loads_enabled = IS_ENABLED(
+ CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
+module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
+
/*********************************
* data structures
**********************************/
+struct crypto_acomp_ctx {
+ struct crypto_acomp *acomp;
+ struct acomp_req *req;
+ struct crypto_wait wait;
+ u8 *dstmem;
+ struct mutex *mutex;
+};
+
+/*
+ * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
+ * The only case where lru_lock is not acquired while holding tree.lock is
+ * when a zswap_entry is taken off the lru for writeback; in that case it
+ * must be verified that the entry is still valid in the tree.
+ */
struct zswap_pool {
struct zpool *zpool;
- struct crypto_comp * __percpu *tfm;
+ struct crypto_acomp_ctx __percpu *acomp_ctx;
struct kref kref;
struct list_head list;
struct work_struct release_work;
struct work_struct shrink_work;
struct hlist_node node;
char tfm_name[CRYPTO_MAX_ALG_NAME];
+ struct list_head lru;
+ spinlock_t lru_lock;
};
/*
@@ -153,14 +188,16 @@ struct zswap_pool {
* be held while changing the refcount. Since the lock must
* be held, there is no reason to also make refcount atomic.
* length - the length in bytes of the compressed page data. Needed during
- * decompression. For a same value filled page length is 0.
+ * decompression. For a same value filled page length is 0, and both
+ * pool and lru are invalid and must be ignored.
* pool - the zswap_pool the entry's data is in
* handle - zpool allocation handle that stores the compressed page data
* value - value of the same-value filled pages which have same content
+ * lru - handle to the pool's lru used to evict pages.
*/
struct zswap_entry {
struct rb_node rbnode;
- pgoff_t offset;
+ swp_entry_t swpentry;
int refcount;
unsigned int length;
struct zswap_pool *pool;
@@ -168,10 +205,8 @@ struct zswap_entry {
unsigned long handle;
unsigned long value;
};
-};
-
-struct zswap_header {
- swp_entry_t swpentry;
+ struct obj_cgroup *objcg;
+ struct list_head lru;
};
/*
@@ -193,11 +228,16 @@ static DEFINE_SPINLOCK(zswap_pools_lock);
/* pool counter to provide unique names to zpool */
static atomic_t zswap_pools_count = ATOMIC_INIT(0);
-/* used by param callback function */
-static bool zswap_init_started;
+enum zswap_init_type {
+ ZSWAP_UNINIT,
+ ZSWAP_INIT_SUCCEED,
+ ZSWAP_INIT_FAILED
+};
+
+static enum zswap_init_type zswap_init_state;
-/* fatal error during init */
-static bool zswap_init_failed;
+/* used to ensure the integrity of initialization */
+static DEFINE_MUTEX(zswap_init_lock);
/* init completed, but couldn't create the initial pool */
static bool zswap_has_pool;
@@ -210,14 +250,11 @@ static bool zswap_has_pool;
pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
zpool_get_type((p)->zpool))
-static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
+static int zswap_writeback_entry(struct zswap_entry *entry,
+ struct zswap_tree *tree);
static int zswap_pool_get(struct zswap_pool *pool);
static void zswap_pool_put(struct zswap_pool *pool);
-static const struct zpool_ops zswap_zpool_ops = {
- .evict = zswap_writeback_entry
-};
-
static bool zswap_is_full(void)
{
return totalram_pages() * zswap_max_pool_percent / 100 <
@@ -251,17 +288,6 @@ static void zswap_update_total_size(void)
**********************************/
static struct kmem_cache *zswap_entry_cache;
-static int __init zswap_entry_cache_create(void)
-{
- zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
- return zswap_entry_cache == NULL;
-}
-
-static void __init zswap_entry_cache_destroy(void)
-{
- kmem_cache_destroy(zswap_entry_cache);
-}
-
static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
{
struct zswap_entry *entry;
@@ -285,12 +311,14 @@ static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
{
struct rb_node *node = root->rb_node;
struct zswap_entry *entry;
+ pgoff_t entry_offset;
while (node) {
entry = rb_entry(node, struct zswap_entry, rbnode);
- if (entry->offset > offset)
+ entry_offset = swp_offset(entry->swpentry);
+ if (entry_offset > offset)
node = node->rb_left;
- else if (entry->offset < offset)
+ else if (entry_offset < offset)
node = node->rb_right;
else
return entry;
@@ -307,13 +335,15 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
{
struct rb_node **link = &root->rb_node, *parent = NULL;
struct zswap_entry *myentry;
+ pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
while (*link) {
parent = *link;
myentry = rb_entry(parent, struct zswap_entry, rbnode);
- if (myentry->offset > entry->offset)
+ myentry_offset = swp_offset(myentry->swpentry);
+ if (myentry_offset > entry_offset)
link = &(*link)->rb_left;
- else if (myentry->offset < entry->offset)
+ else if (myentry_offset < entry_offset)
link = &(*link)->rb_right;
else {
*dupentry = myentry;
@@ -325,12 +355,14 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
return 0;
}
-static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
+static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
{
if (!RB_EMPTY_NODE(&entry->rbnode)) {
rb_erase(&entry->rbnode, root);
RB_CLEAR_NODE(&entry->rbnode);
+ return true;
}
+ return false;
}
/*
@@ -339,9 +371,16 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
*/
static void zswap_free_entry(struct zswap_entry *entry)
{
+ if (entry->objcg) {
+ obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
+ obj_cgroup_put(entry->objcg);
+ }
if (!entry->length)
atomic_dec(&zswap_same_filled_pages);
else {
+ spin_lock(&entry->pool->lru_lock);
+ list_del(&entry->lru);
+ spin_unlock(&entry->pool->lru_lock);
zpool_free(entry->pool->zpool, entry->handle);
zswap_pool_put(entry->pool);
}
@@ -388,23 +427,43 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
* per-cpu code
**********************************/
static DEFINE_PER_CPU(u8 *, zswap_dstmem);
+/*
+ * If users dynamically change the zpool type and compressor at runtime, i.e.
+ * while zswap is running, zswap can have more than one zpool on one cpu, but
+ * they are sharing dstmem. So we need this mutex to be per-cpu.
+ */
+static DEFINE_PER_CPU(struct mutex *, zswap_mutex);
static int zswap_dstmem_prepare(unsigned int cpu)
{
+ struct mutex *mutex;
u8 *dst;
dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
if (!dst)
return -ENOMEM;
+ mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu));
+ if (!mutex) {
+ kfree(dst);
+ return -ENOMEM;
+ }
+
+ mutex_init(mutex);
per_cpu(zswap_dstmem, cpu) = dst;
+ per_cpu(zswap_mutex, cpu) = mutex;
return 0;
}
static int zswap_dstmem_dead(unsigned int cpu)
{
+ struct mutex *mutex;
u8 *dst;
+ mutex = per_cpu(zswap_mutex, cpu);
+ kfree(mutex);
+ per_cpu(zswap_mutex, cpu) = NULL;
+
dst = per_cpu(zswap_dstmem, cpu);
kfree(dst);
per_cpu(zswap_dstmem, cpu) = NULL;
@@ -415,30 +474,54 @@ static int zswap_dstmem_dead(unsigned int cpu)
static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
- struct crypto_comp *tfm;
-
- if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
- return 0;
+ struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+ struct crypto_acomp *acomp;
+ struct acomp_req *req;
+
+ acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
+ if (IS_ERR(acomp)) {
+ pr_err("could not alloc crypto acomp %s : %ld\n",
+ pool->tfm_name, PTR_ERR(acomp));
+ return PTR_ERR(acomp);
+ }
+ acomp_ctx->acomp = acomp;
- tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
- if (IS_ERR_OR_NULL(tfm)) {
- pr_err("could not alloc crypto comp %s : %ld\n",
- pool->tfm_name, PTR_ERR(tfm));
+ req = acomp_request_alloc(acomp_ctx->acomp);
+ if (!req) {
+ pr_err("could not alloc crypto acomp_request %s\n",
+ pool->tfm_name);
+ crypto_free_acomp(acomp_ctx->acomp);
return -ENOMEM;
}
- *per_cpu_ptr(pool->tfm, cpu) = tfm;
+ acomp_ctx->req = req;
+
+ crypto_init_wait(&acomp_ctx->wait);
+ /*
+ * If the acomp backend is an async zip driver, crypto_req_done() will wake
+ * up crypto_wait_req(); if the backend is scomp, the callback is never
+ * called and crypto_wait_req() returns without blocking.
+ */
+ acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &acomp_ctx->wait);
+
+ acomp_ctx->mutex = per_cpu(zswap_mutex, cpu);
+ acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu);
+
return 0;
}
static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
{
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
- struct crypto_comp *tfm;
+ struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+
+ if (!IS_ERR_OR_NULL(acomp_ctx)) {
+ if (!IS_ERR_OR_NULL(acomp_ctx->req))
+ acomp_request_free(acomp_ctx->req);
+ if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
+ crypto_free_acomp(acomp_ctx->acomp);
+ }
- tfm = *per_cpu_ptr(pool->tfm, cpu);
- if (!IS_ERR_OR_NULL(tfm))
- crypto_free_comp(tfm);
- *per_cpu_ptr(pool->tfm, cpu) = NULL;
return 0;
}
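Because every pool's acomp context on a given CPU borrows that CPU's zswap_dstmem scratch buffer, the lock that serializes access to the buffer also has to be per-CPU rather than per-pool. A minimal user-space sketch of the pairing, with pthread mutexes standing in for the kernel mutex and a fixed "CPU" count (all names here are illustrative):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_CPUS         4
#define PAGE_SIZE       4096

/* one scratch buffer and one lock per "cpu", shared by all pools */
static unsigned char *dstmem[NR_CPUS];
static pthread_mutex_t dstmem_lock[NR_CPUS];

static int cpu_prepare(int cpu)
{
        dstmem[cpu] = malloc(PAGE_SIZE * 2);
        if (!dstmem[cpu])
                return -1;
        pthread_mutex_init(&dstmem_lock[cpu], NULL);
        return 0;
}

static void compress_on(int cpu, const unsigned char *page)
{
        /* any pool compressing on this cpu must take that cpu's lock
         * before touching the shared scratch buffer */
        pthread_mutex_lock(&dstmem_lock[cpu]);
        memcpy(dstmem[cpu], page, PAGE_SIZE);   /* stand-in for compression */
        pthread_mutex_unlock(&dstmem_lock[cpu]);
}

int main(void)
{
        unsigned char page[PAGE_SIZE] = { 0 };

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                if (cpu_prepare(cpu))
                        return 1;
        compress_on(0, page);
        puts("ok");
        return 0;
}
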
@@ -518,13 +601,95 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
return NULL;
}
+/*
+ * If the entry is still valid in the tree, drop the initial ref and remove it
+ * from the tree. This function must be called with an additional ref held,
+ * otherwise it may race with another invalidation freeing the entry.
+ */
+static void zswap_invalidate_entry(struct zswap_tree *tree,
+ struct zswap_entry *entry)
+{
+ if (zswap_rb_erase(&tree->rbroot, entry))
+ zswap_entry_put(tree, entry);
+}
+
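zswap_rb_erase() now reports whether it actually removed the node, so zswap_invalidate_entry() drops the initial (tree-owned) reference exactly once even if two paths race to invalidate the same entry. A tiny user-space sketch of that idiom, with a plain flag in place of RB_EMPTY_NODE() (names invented for the sketch):

#include <stdbool.h>
#include <stdio.h>

struct entry { int refcount; bool on_tree; };

static bool tree_erase(struct entry *e)
{
        if (e->on_tree) {       /* analogous to !RB_EMPTY_NODE() */
                e->on_tree = false;
                return true;
        }
        return false;           /* someone else already erased it */
}

static void invalidate(struct entry *e)
{
        /* drop the tree's reference only if we did the erase */
        if (tree_erase(e))
                e->refcount--;
}

int main(void)
{
        struct entry e = { .refcount = 2, .on_tree = true };

        invalidate(&e);         /* first invalidation drops the tree ref */
        invalidate(&e);         /* second one is a no-op */
        printf("refcount=%d\n", e.refcount);    /* prints 1 */
        return 0;
}
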
+static int zswap_reclaim_entry(struct zswap_pool *pool)
+{
+ struct zswap_entry *entry;
+ struct zswap_tree *tree;
+ pgoff_t swpoffset;
+ int ret;
+
+ /* Get an entry off the LRU */
+ spin_lock(&pool->lru_lock);
+ if (list_empty(&pool->lru)) {
+ spin_unlock(&pool->lru_lock);
+ return -EINVAL;
+ }
+ entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
+ list_del_init(&entry->lru);
+ /*
+ * Once the lru lock is dropped, the entry might get freed. The
+ * swpoffset is copied to the stack, and entry isn't deref'd again
+ * until the entry is verified to still be alive in the tree.
+ */
+ swpoffset = swp_offset(entry->swpentry);
+ tree = zswap_trees[swp_type(entry->swpentry)];
+ spin_unlock(&pool->lru_lock);
+
+ /* Check for invalidate() race */
+ spin_lock(&tree->lock);
+ if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
+ ret = -EAGAIN;
+ goto unlock;
+ }
+ /* Hold a reference to prevent a free during writeback */
+ zswap_entry_get(entry);
+ spin_unlock(&tree->lock);
+
+ ret = zswap_writeback_entry(entry, tree);
+
+ spin_lock(&tree->lock);
+ if (ret) {
+ /* Writeback failed, put entry back on LRU */
+ spin_lock(&pool->lru_lock);
+ list_move(&entry->lru, &pool->lru);
+ spin_unlock(&pool->lru_lock);
+ goto put_unlock;
+ }
+
+ /*
+ * Writeback started successfully, the page now belongs to the
+ * swapcache. Drop the entry from zswap - unless invalidate already
+ * took it out while we had the tree->lock released for IO.
+ */
+ zswap_invalidate_entry(tree, entry);
+
+put_unlock:
+ /* Drop local reference */
+ zswap_entry_put(tree, entry);
+unlock:
+ spin_unlock(&tree->lock);
+ return ret ? -EAGAIN : 0;
+}
+
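zswap_reclaim_entry() pops the tail of the pool LRU under lru_lock, remembers only the swap offset on the stack, drops lru_lock, and then re-looks the entry up under tree->lock before dereferencing it again, because an invalidation may have freed it in between (zswap_writeback_entry() repeats the same recheck once the swapcache page is secured). A condensed user-space sketch of that "stash the key, revalidate under the other lock" pattern, with pthread mutexes standing in for the spinlocks and a one-slot stand-in for the rbtree (names are illustrative):

#include <pthread.h>
#include <stdio.h>

struct entry { long offset; struct entry *lru_prev; };

static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *lru_tail;          /* stand-in for pool->lru */
static struct entry *tree_entry;        /* stand-in for the rbtree */

static struct entry *tree_search(long offset)
{
        return (tree_entry && tree_entry->offset == offset) ? tree_entry : NULL;
}

static int reclaim_one(void)
{
        struct entry *entry;
        long offset;

        pthread_mutex_lock(&lru_lock);
        if (!lru_tail) {
                pthread_mutex_unlock(&lru_lock);
                return -1;
        }
        entry = lru_tail;
        lru_tail = entry->lru_prev;
        offset = entry->offset;         /* copy the key: 'entry' may be freed
                                         * as soon as lru_lock is dropped */
        pthread_mutex_unlock(&lru_lock);

        pthread_mutex_lock(&tree_lock);
        if (tree_search(offset) != entry) {     /* an invalidation raced us */
                pthread_mutex_unlock(&tree_lock);
                return -1;
        }
        /* entry is still live in the tree: safe to pin and write it back */
        pthread_mutex_unlock(&tree_lock);
        return 0;
}

int main(void)
{
        struct entry e = { .offset = 7 };

        lru_tail = &e;
        tree_entry = &e;
        printf("reclaim: %d\n", reclaim_one());
        return 0;
}
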
static void shrink_worker(struct work_struct *w)
{
struct zswap_pool *pool = container_of(w, typeof(*pool),
shrink_work);
+ int ret, failures = 0;
- if (zpool_shrink(pool->zpool, 1, NULL))
- zswap_reject_reclaim_fail++;
+ do {
+ ret = zswap_reclaim_entry(pool);
+ if (ret) {
+ zswap_reject_reclaim_fail++;
+ if (ret != -EAGAIN)
+ break;
+ if (++failures == MAX_RECLAIM_RETRIES)
+ break;
+ }
+ cond_resched();
+ } while (!zswap_can_accept());
zswap_pool_put(pool);
}
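shrink_worker() keeps evicting entries until the pool is back under the acceptance threshold, counts -EAGAIN failures and gives up once they reach MAX_RECLAIM_RETRIES, and stops immediately on any other error. A small sketch of that retry policy with a stubbed-out reclaim step (the MAX_RECLAIM_RETRIES value and both stubs are assumptions of the sketch):

#include <errno.h>
#include <stdio.h>

#define MAX_RECLAIM_RETRIES     16      /* assumed value for the sketch */

/* stub: pretend the first few reclaim attempts lose a race (-EAGAIN) */
static int reclaim_entry(void)
{
        static int calls;

        return (++calls <= 3) ? -EAGAIN : 0;
}

/* stub for zswap_can_accept(): accept again after a few evictions */
static int can_accept(void)
{
        static int reclaimed;

        return ++reclaimed >= 8;
}

static void shrink(void)
{
        int ret, failures = 0;

        do {
                ret = reclaim_entry();
                if (ret) {
                        if (ret != -EAGAIN)
                                break;          /* hard failure: stop */
                        if (++failures == MAX_RECLAIM_RETRIES)
                                break;          /* too many races: stop */
                }
        } while (!can_accept());
}

int main(void)
{
        shrink();
        puts("shrink pass done");
        return 0;
}
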
@@ -553,16 +718,17 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
/* unique name for each pool specifically required by zsmalloc */
snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
- pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops);
+ pool->zpool = zpool_create_pool(type, name, gfp);
if (!pool->zpool) {
pr_err("%s zpool not available\n", type);
goto error;
}
pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
- strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
- pool->tfm = alloc_percpu(struct crypto_comp *);
- if (!pool->tfm) {
+ strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
+
+ pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
+ if (!pool->acomp_ctx) {
pr_err("percpu alloc failed\n");
goto error;
}
@@ -578,6 +744,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
*/
kref_init(&pool->kref);
INIT_LIST_HEAD(&pool->list);
+ INIT_LIST_HEAD(&pool->lru);
+ spin_lock_init(&pool->lru_lock);
INIT_WORK(&pool->shrink_work, shrink_worker);
zswap_pool_debug("created", pool);
@@ -585,25 +753,26 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
return pool;
error:
- free_percpu(pool->tfm);
+ if (pool->acomp_ctx)
+ free_percpu(pool->acomp_ctx);
if (pool->zpool)
zpool_destroy_pool(pool->zpool);
kfree(pool);
return NULL;
}
-static __init struct zswap_pool *__zswap_pool_create_fallback(void)
+static struct zswap_pool *__zswap_pool_create_fallback(void)
{
bool has_comp, has_zpool;
- has_comp = crypto_has_comp(zswap_compressor, 0, 0);
+ has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
if (!has_comp && strcmp(zswap_compressor,
CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
pr_err("compressor %s not available, using default %s\n",
zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
param_free_charp(&zswap_compressor);
zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
- has_comp = crypto_has_comp(zswap_compressor, 0, 0);
+ has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
}
if (!has_comp) {
pr_err("default compressor %s not available\n",
@@ -639,7 +808,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
zswap_pool_debug("destroying", pool);
cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
- free_percpu(pool->tfm);
+ free_percpu(pool->acomp_ctx);
zpool_destroy_pool(pool->zpool);
kfree(pool);
}
@@ -693,28 +862,43 @@ static void zswap_pool_put(struct zswap_pool *pool)
* param callbacks
**********************************/
+static bool zswap_pool_changed(const char *s, const struct kernel_param *kp)
+{
+ /* no change required */
+ if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
+ return false;
+ return true;
+}
+
/* val must be a null-terminated string */
static int __zswap_param_set(const char *val, const struct kernel_param *kp,
char *type, char *compressor)
{
struct zswap_pool *pool, *put_pool = NULL;
char *s = strstrip((char *)val);
- int ret;
-
- if (zswap_init_failed) {
+ int ret = 0;
+ bool new_pool = false;
+
+ mutex_lock(&zswap_init_lock);
+ switch (zswap_init_state) {
+ case ZSWAP_UNINIT:
+ /* if this is load-time (pre-init) param setting,
+ * don't create a pool; that's done during init.
+ */
+ ret = param_set_charp(s, kp);
+ break;
+ case ZSWAP_INIT_SUCCEED:
+ new_pool = zswap_pool_changed(s, kp);
+ break;
+ case ZSWAP_INIT_FAILED:
pr_err("can't set param, initialization failed\n");
- return -ENODEV;
+ ret = -ENODEV;
}
+ mutex_unlock(&zswap_init_lock);
- /* no change required */
- if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
- return 0;
-
- /* if this is load-time (pre-init) param setting,
- * don't create a pool; that's done during init.
- */
- if (!zswap_init_started)
- return param_set_charp(s, kp);
+ /* no need to create a new pool, return directly */
+ if (!new_pool)
+ return ret;
if (!type) {
if (!zpool_has_pool(s)) {
@@ -723,7 +907,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
}
type = s;
} else if (!compressor) {
- if (!crypto_has_comp(s, 0, 0)) {
+ if (!crypto_has_acomp(s, 0, 0)) {
pr_err("compressor %s not available\n", s);
return -ENOENT;
}
@@ -774,7 +958,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
* failed, maybe both compressor and zpool params were bad.
* Allow changing this param, so pool creation will succeed
* when the other param is changed. We already verified this
- * param is ok in the zpool_has_pool() or crypto_has_comp()
+ * param is ok in the zpool_has_pool() or crypto_has_acomp()
* checks above.
*/
ret = param_set_charp(s, kp);
@@ -804,16 +988,30 @@ static int zswap_zpool_param_set(const char *val,
static int zswap_enabled_param_set(const char *val,
const struct kernel_param *kp)
{
- if (zswap_init_failed) {
+ int ret = -ENODEV;
+
+ /* if this is load-time (pre-init) param setting, only set param. */
+ if (system_state != SYSTEM_RUNNING)
+ return param_set_bool(val, kp);
+
+ mutex_lock(&zswap_init_lock);
+ switch (zswap_init_state) {
+ case ZSWAP_UNINIT:
+ if (zswap_setup())
+ break;
+ fallthrough;
+ case ZSWAP_INIT_SUCCEED:
+ if (!zswap_has_pool)
+ pr_err("can't enable, no pool configured\n");
+ else
+ ret = param_set_bool(val, kp);
+ break;
+ case ZSWAP_INIT_FAILED:
pr_err("can't enable, initialization failed\n");
- return -ENODEV;
- }
- if (!zswap_has_pool && zswap_init_started) {
- pr_err("can't enable, no pool configured\n");
- return -ENODEV;
}
+ mutex_unlock(&zswap_init_lock);
- return param_set_bool(val, kp);
+ return ret;
}
/*********************************
@@ -868,39 +1066,27 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
* the swap cache, the compressed version stored by zswap can be
* freed.
*/
-static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
+static int zswap_writeback_entry(struct zswap_entry *entry,
+ struct zswap_tree *tree)
{
- struct zswap_header *zhdr;
- swp_entry_t swpentry;
- struct zswap_tree *tree;
- pgoff_t offset;
- struct zswap_entry *entry;
+ swp_entry_t swpentry = entry->swpentry;
struct page *page;
- struct crypto_comp *tfm;
- u8 *src, *dst;
+ struct scatterlist input, output;
+ struct crypto_acomp_ctx *acomp_ctx;
+ struct zpool *pool = entry->pool->zpool;
+
+ u8 *src, *tmp = NULL;
unsigned int dlen;
int ret;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
};
- /* extract swpentry from data */
- zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
- swpentry = zhdr->swpentry; /* here */
- tree = zswap_trees[swp_type(swpentry)];
- offset = swp_offset(swpentry);
-
- /* find and ref zswap entry */
- spin_lock(&tree->lock);
- entry = zswap_entry_find_get(&tree->rbroot, offset);
- if (!entry) {
- /* entry was invalidated */
- spin_unlock(&tree->lock);
- zpool_unmap_handle(pool, handle);
- return 0;
+ if (!zpool_can_sleep_mapped(pool)) {
+ tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
}
- spin_unlock(&tree->lock);
- BUG_ON(offset != entry->offset);
/* try to allocate swap cache page */
switch (zswap_get_swap_cache_page(swpentry, &page)) {
@@ -915,15 +1101,47 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
goto fail;
case ZSWAP_SWAPCACHE_NEW: /* page is locked */
+ /*
+ * Having a local reference to the zswap entry doesn't exclude
+ * swapping from invalidating and recycling the swap slot. Once
+ * the swapcache is secured against concurrent swapping to and
+ * from the slot, recheck that the entry is still current before
+ * writing.
+ */
+ spin_lock(&tree->lock);
+ if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
+ spin_unlock(&tree->lock);
+ delete_from_swap_cache(page_folio(page));
+ ret = -ENOMEM;
+ goto fail;
+ }
+ spin_unlock(&tree->lock);
+
/* decompress */
+ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
dlen = PAGE_SIZE;
- src = (u8 *)zhdr + sizeof(struct zswap_header);
- dst = kmap_atomic(page);
- tfm = *get_cpu_ptr(entry->pool->tfm);
- ret = crypto_comp_decompress(tfm, src, entry->length,
- dst, &dlen);
- put_cpu_ptr(entry->pool->tfm);
- kunmap_atomic(dst);
+
+ src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
+ if (!zpool_can_sleep_mapped(pool)) {
+ memcpy(tmp, src, entry->length);
+ src = tmp;
+ zpool_unmap_handle(pool, entry->handle);
+ }
+
+ mutex_lock(acomp_ctx->mutex);
+ sg_init_one(&input, src, entry->length);
+ sg_init_table(&output, 1);
+ sg_set_page(&output, page, PAGE_SIZE, 0);
+ acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
+ ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
+ dlen = acomp_ctx->req->dlen;
+ mutex_unlock(acomp_ctx->mutex);
+
+ if (!zpool_can_sleep_mapped(pool))
+ kfree(tmp);
+ else
+ zpool_unmap_handle(pool, entry->handle);
+
BUG_ON(ret);
BUG_ON(dlen != PAGE_SIZE);
@@ -935,55 +1153,43 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
SetPageReclaim(page);
/* start writeback */
- __swap_writepage(page, &wbc, end_swap_bio_write);
+ __swap_writepage(page, &wbc);
put_page(page);
zswap_written_back_pages++;
- spin_lock(&tree->lock);
- /* drop local reference */
- zswap_entry_put(tree, entry);
-
- /*
- * There are two possible situations for entry here:
- * (1) refcount is 1(normal case), entry is valid and on the tree
- * (2) refcount is 0, entry is freed and not on the tree
- * because invalidate happened during writeback
- * search the tree and free the entry if find entry
- */
- if (entry == zswap_rb_search(&tree->rbroot, offset))
- zswap_entry_put(tree, entry);
- spin_unlock(&tree->lock);
-
- goto end;
+ return ret;
+fail:
+ if (!zpool_can_sleep_mapped(pool))
+ kfree(tmp);
/*
* if we get here due to ZSWAP_SWAPCACHE_EXIST
- * a load may happening concurrently
- * it is safe and okay to not free the entry
- * if we free the entry in the following put
- * it it either okay to return !0
+ * a load may be happening concurrently.
+ * it is safe and okay not to free the entry;
+ * it is also okay to return !0.
*/
-fail:
- spin_lock(&tree->lock);
- zswap_entry_put(tree, entry);
- spin_unlock(&tree->lock);
-
-end:
- zpool_unmap_handle(pool, handle);
return ret;
}
static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
{
- unsigned int pos;
unsigned long *page;
+ unsigned long val;
+ unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;
page = (unsigned long *)ptr;
- for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) {
- if (page[pos] != page[0])
+ val = page[0];
+
+ if (val != page[last_pos])
+ return 0;
+
+ for (pos = 1; pos < last_pos; pos++) {
+ if (val != page[pos])
return 0;
}
- *value = page[0];
+
+ *value = val;
+
return 1;
}
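zswap_is_page_same_filled() now reads the last word first: on pages that are not same-value filled the last word usually differs from the first, so most non-matching pages are rejected after two reads instead of a full scan. A user-space sketch of the check (the 4 KiB page size is assumed here):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE       4096

static bool page_same_filled(const void *ptr, unsigned long *value)
{
        const unsigned long *page = ptr;
        unsigned long val = page[0];
        unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;

        /* cheap early exit: a non-uniform page very likely differs here */
        if (val != page[last_pos])
                return false;

        for (pos = 1; pos < last_pos; pos++)
                if (page[pos] != val)
                        return false;

        *value = val;
        return true;
}

int main(void)
{
        unsigned long page[PAGE_SIZE / sizeof(unsigned long)];
        unsigned long value;

        memset(page, 0xaa, sizeof(page));
        printf("uniform page: %d\n", page_same_filled(page, &value));

        page[17] = 0;   /* poke one word: no longer same-filled */
        printf("dirty page:   %d\n", page_same_filled(page, &value));
        return 0;
}
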
@@ -1004,13 +1210,15 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
{
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry, *dupentry;
- struct crypto_comp *tfm;
+ struct scatterlist input, output;
+ struct crypto_acomp_ctx *acomp_ctx;
+ struct obj_cgroup *objcg = NULL;
+ struct zswap_pool *pool;
int ret;
- unsigned int hlen, dlen = PAGE_SIZE;
+ unsigned int dlen = PAGE_SIZE;
unsigned long handle, value;
char *buf;
u8 *src, *dst;
- struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
gfp_t gfp;
/* THP isn't supported */
@@ -1024,23 +1232,28 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
goto reject;
}
+ /*
+ * XXX: zswap reclaim does not work with cgroups yet. Without a
+ * cgroup-aware entry LRU, we will push out entries system-wide based on
+ * local cgroup limits.
+ */
+ objcg = get_obj_cgroup_from_page(page);
+ if (objcg && !obj_cgroup_may_zswap(objcg)) {
+ ret = -ENOMEM;
+ goto reject;
+ }
+
/* reclaim space if needed */
if (zswap_is_full()) {
- struct zswap_pool *pool;
-
zswap_pool_limit_hit++;
zswap_pool_reached_full = true;
- pool = zswap_pool_last_get();
- if (pool)
- queue_work(shrink_wq, &pool->shrink_work);
- ret = -ENOMEM;
- goto reject;
+ goto shrink;
}
if (zswap_pool_reached_full) {
if (!zswap_can_accept()) {
ret = -ENOMEM;
- goto reject;
+ goto shrink;
} else
zswap_pool_reached_full = false;
}
@@ -1057,7 +1270,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
src = kmap_atomic(page);
if (zswap_is_page_same_filled(src, &value)) {
kunmap_atomic(src);
- entry->offset = offset;
+ entry->swpentry = swp_entry(type, offset);
entry->length = 0;
entry->value = value;
atomic_inc(&zswap_same_filled_pages);
@@ -1066,6 +1279,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
kunmap_atomic(src);
}
+ if (!zswap_non_same_filled_pages_enabled) {
+ ret = -EINVAL;
+ goto freepage;
+ }
+
/* if entry is successfully added, it keeps the reference */
entry->pool = zswap_pool_current_get();
if (!entry->pool) {
@@ -1074,23 +1292,42 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
}
/* compress */
- dst = get_cpu_var(zswap_dstmem);
- tfm = *get_cpu_ptr(entry->pool->tfm);
- src = kmap_atomic(page);
- ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
- kunmap_atomic(src);
- put_cpu_ptr(entry->pool->tfm);
+ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+
+ mutex_lock(acomp_ctx->mutex);
+
+ dst = acomp_ctx->dstmem;
+ sg_init_table(&input, 1);
+ sg_set_page(&input, page, PAGE_SIZE, 0);
+
+	/* zswap_dstmem is of size (PAGE_SIZE * 2). Reflect that size in the sg_list */
+ sg_init_one(&output, dst, PAGE_SIZE * 2);
+ acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
+ /*
+	 * It may look a little silly to issue an asynchronous request and then
+	 * wait for its completion synchronously; in practice the operation is
+	 * synchronous.
+	 * In theory acomp lets users queue multiple requests on one acomp
+	 * instance and have them completed concurrently, but frontswap stores
+	 * and loads pages one at a time, so a single thread has no way to
+	 * submit a second page before the first one is done.
+	 * Threads running on different cpus do use different acomp instances,
+	 * however, so multiple threads can still (de)compress in parallel.
+ */
+ ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
+ dlen = acomp_ctx->req->dlen;
+
if (ret) {
ret = -EINVAL;
goto put_dstmem;
}
/* store */
- hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
if (zpool_malloc_support_movable(entry->pool->zpool))
gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
- ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle);
+ ret = zpool_malloc(entry->pool->zpool, dlen, gfp, &handle);
if (ret == -ENOSPC) {
zswap_reject_compress_poor++;
goto put_dstmem;
@@ -1099,18 +1336,24 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
zswap_reject_alloc_fail++;
goto put_dstmem;
}
- buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
- memcpy(buf, &zhdr, hlen);
- memcpy(buf + hlen, dst, dlen);
+ buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
+ memcpy(buf, dst, dlen);
zpool_unmap_handle(entry->pool->zpool, handle);
- put_cpu_var(zswap_dstmem);
+ mutex_unlock(acomp_ctx->mutex);
/* populate entry */
- entry->offset = offset;
+ entry->swpentry = swp_entry(type, offset);
entry->handle = handle;
entry->length = dlen;
insert_entry:
+ entry->objcg = objcg;
+ if (objcg) {
+ obj_cgroup_charge_zswap(objcg, entry->length);
+ /* Account before objcg ref is moved to tree */
+ count_objcg_event(objcg, ZSWPOUT);
+ }
+
/* map */
spin_lock(&tree->lock);
do {
@@ -1122,21 +1365,36 @@ insert_entry:
zswap_entry_put(tree, dupentry);
}
} while (ret == -EEXIST);
+ if (entry->length) {
+ spin_lock(&entry->pool->lru_lock);
+ list_add(&entry->lru, &entry->pool->lru);
+ spin_unlock(&entry->pool->lru_lock);
+ }
spin_unlock(&tree->lock);
/* update stats */
atomic_inc(&zswap_stored_pages);
zswap_update_total_size();
+ count_vm_event(ZSWPOUT);
return 0;
put_dstmem:
- put_cpu_var(zswap_dstmem);
+ mutex_unlock(acomp_ctx->mutex);
zswap_pool_put(entry->pool);
freepage:
zswap_entry_cache_free(entry);
reject:
+ if (objcg)
+ obj_cgroup_put(objcg);
return ret;
+
+shrink:
+ pool = zswap_pool_last_get();
+ if (pool)
+ queue_work(shrink_wq, &pool->shrink_work);
+ ret = -ENOMEM;
+ goto reject;
}
/*
@@ -1144,12 +1402,13 @@ reject:
* return -1 on entry not found or error
*/
static int zswap_frontswap_load(unsigned type, pgoff_t offset,
- struct page *page)
+ struct page *page, bool *exclusive)
{
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry;
- struct crypto_comp *tfm;
- u8 *src, *dst;
+ struct scatterlist input, output;
+ struct crypto_acomp_ctx *acomp_ctx;
+ u8 *src, *dst, *tmp;
unsigned int dlen;
int ret;
@@ -1167,28 +1426,61 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
dst = kmap_atomic(page);
zswap_fill_page(dst, entry->value);
kunmap_atomic(dst);
- goto freeentry;
+ ret = 0;
+ goto stats;
+ }
+
+ if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+ tmp = kmalloc(entry->length, GFP_KERNEL);
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto freeentry;
+ }
}
/* decompress */
dlen = PAGE_SIZE;
src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
- if (zpool_evictable(entry->pool->zpool))
- src += sizeof(struct zswap_header);
- dst = kmap_atomic(page);
- tfm = *get_cpu_ptr(entry->pool->tfm);
- ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
- put_cpu_ptr(entry->pool->tfm);
- kunmap_atomic(dst);
- zpool_unmap_handle(entry->pool->zpool, entry->handle);
- BUG_ON(ret);
+ if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+ memcpy(tmp, src, entry->length);
+ src = tmp;
+ zpool_unmap_handle(entry->pool->zpool, entry->handle);
+ }
+
+ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+ mutex_lock(acomp_ctx->mutex);
+ sg_init_one(&input, src, entry->length);
+ sg_init_table(&output, 1);
+ sg_set_page(&output, page, PAGE_SIZE, 0);
+ acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
+ ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
+ mutex_unlock(acomp_ctx->mutex);
+
+ if (zpool_can_sleep_mapped(entry->pool->zpool))
+ zpool_unmap_handle(entry->pool->zpool, entry->handle);
+ else
+ kfree(tmp);
+
+ BUG_ON(ret);
+stats:
+ count_vm_event(ZSWPIN);
+ if (entry->objcg)
+ count_objcg_event(entry->objcg, ZSWPIN);
freeentry:
spin_lock(&tree->lock);
+ if (!ret && zswap_exclusive_loads_enabled) {
+ zswap_invalidate_entry(tree, entry);
+ *exclusive = true;
+ } else if (entry->length) {
+ spin_lock(&entry->pool->lru_lock);
+ list_move(&entry->lru, &entry->pool->lru);
+ spin_unlock(&entry->pool->lru_lock);
+ }
zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);
- return 0;
+ return ret;
}
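With exclusive_loads enabled, a successful load immediately invalidates the compressed copy, so the data is never held in memory twice (once decompressed, once in zswap); otherwise the entry is kept and rotated to the head of the LRU. A toy user-space sketch of the two policies (names invented for the sketch):

#include <stdbool.h>
#include <stdio.h>

struct cached { long key; bool valid; int lru_hits; };

static bool exclusive_loads = true;     /* module-parameter stand-in */

static int load(struct cached *c, long key, long *out)
{
        if (!c->valid || c->key != key)
                return -1;

        *out = c->key;                  /* "decompress" into *out */

        if (exclusive_loads)
                c->valid = false;       /* drop the compressed copy now */
        else
                c->lru_hits++;          /* keep it, but refresh its LRU slot */
        return 0;
}

int main(void)
{
        struct cached c = { .key = 42, .valid = true };
        long page;

        printf("first load:  %d\n", load(&c, 42, &page));
        printf("second load: %d (entry was dropped by the first load)\n",
               load(&c, 42, &page));
        return 0;
}
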
/* frees an entry in zswap */
@@ -1205,13 +1497,7 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
spin_unlock(&tree->lock);
return;
}
-
- /* remove from rbtree */
- zswap_rb_erase(&tree->rbroot, entry);
-
- /* drop the initial reference from entry creation */
- zswap_entry_put(tree, entry);
-
+ zswap_invalidate_entry(tree, entry);
spin_unlock(&tree->lock);
}
@@ -1249,7 +1535,7 @@ static void zswap_frontswap_init(unsigned type)
zswap_trees[type] = tree;
}
-static struct frontswap_ops zswap_frontswap_ops = {
+static const struct frontswap_ops zswap_frontswap_ops = {
.store = zswap_frontswap_store,
.load = zswap_frontswap_load,
.invalidate_page = zswap_frontswap_invalidate_page,
@@ -1265,7 +1551,7 @@ static struct frontswap_ops zswap_frontswap_ops = {
static struct dentry *zswap_debugfs_root;
-static int __init zswap_debugfs_init(void)
+static int zswap_debugfs_init(void)
{
if (!debugfs_initialized())
return -ENODEV;
@@ -1295,31 +1581,23 @@ static int __init zswap_debugfs_init(void)
return 0;
}
-
-static void __exit zswap_debugfs_exit(void)
-{
- debugfs_remove_recursive(zswap_debugfs_root);
-}
#else
-static int __init zswap_debugfs_init(void)
+static int zswap_debugfs_init(void)
{
return 0;
}
-
-static void __exit zswap_debugfs_exit(void) { }
#endif
/*********************************
* module init and exit
**********************************/
-static int __init init_zswap(void)
+static int zswap_setup(void)
{
struct zswap_pool *pool;
int ret;
- zswap_init_started = true;
-
- if (zswap_entry_cache_create()) {
+ zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
+ if (!zswap_entry_cache) {
pr_err("entry cache creation failed\n");
goto cache_fail;
}
@@ -1353,27 +1631,38 @@ static int __init init_zswap(void)
if (!shrink_wq)
goto fallback_fail;
- frontswap_register_ops(&zswap_frontswap_ops);
+ ret = frontswap_register_ops(&zswap_frontswap_ops);
+ if (ret)
+ goto destroy_wq;
if (zswap_debugfs_init())
pr_warn("debugfs initialization failed\n");
+ zswap_init_state = ZSWAP_INIT_SUCCEED;
return 0;
+destroy_wq:
+ destroy_workqueue(shrink_wq);
fallback_fail:
if (pool)
zswap_pool_destroy(pool);
hp_fail:
cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
dstmem_fail:
- zswap_entry_cache_destroy();
+ kmem_cache_destroy(zswap_entry_cache);
cache_fail:
/* if built-in, we aren't unloaded on failure; don't allow use */
- zswap_init_failed = true;
+ zswap_init_state = ZSWAP_INIT_FAILED;
zswap_enabled = false;
return -ENOMEM;
}
+
+static int __init zswap_init(void)
+{
+ if (!zswap_enabled)
+ return 0;
+ return zswap_setup();
+}
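With this change the heavy setup only runs when zswap is actually wanted: late_initcall() calls zswap_setup() only if the boot parameter enabled zswap, and enabling it later through the module parameter performs the same one-time setup under zswap_init_lock, recording success or failure in zswap_init_state. A compact user-space sketch of that lazy, state-tracked initialization, with a pthread mutex in place of the kernel mutex (names are illustrative):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

enum init_state { UNINIT, INIT_SUCCEED, INIT_FAILED };

static enum init_state state = UNINIT;
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
static bool enabled;

static int setup(void)
{
        /* one-time allocation/registration work would go here */
        state = INIT_SUCCEED;
        return 0;
}

static int enable(void)
{
        int ret = -1;

        pthread_mutex_lock(&init_lock);
        switch (state) {
        case UNINIT:
                if (setup())
                        break;
                /* fall through */
        case INIT_SUCCEED:
                enabled = true;
                ret = 0;
                break;
        case INIT_FAILED:
                break;          /* setup already failed for good */
        }
        pthread_mutex_unlock(&init_lock);
        return ret;
}

int main(void)
{
        /* nothing happens at "boot" unless zswap was enabled there;
         * the first runtime enable triggers setup exactly once */
        int ret = enable();

        printf("enable: %d, state: %d\n", ret, state);
        printf("enable again: %d\n", enable());
        return 0;
}
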
/* must be late so crypto has time to come up */
-late_initcall(init_zswap);
+late_initcall(zswap_init);
-MODULE_LICENSE("GPL");
MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
MODULE_DESCRIPTION("Compressed cache for swap pages");