aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorBruce Ashfield <bruce.ashfield@gmail.com>2020-10-20 22:36:19 -0400
committerBruce Ashfield <bruce.ashfield@gmail.com>2020-10-20 22:36:19 -0400
commite148bcd802d3867d24336344ae1aa1b56623ba44 (patch)
treefd6c0538b4f8a0c759752e5f399e29e3cf27ed28 /mm
parentde88a4c8528b0177a92f739769a5c90f2a9d991f (diff)
parentc4d6fe7311762f2e03b3c27ad38df7c40c80cc93 (diff)
downloadlinux-yocto-master.tar.gz
linux-yocto-master.tar.bz2
linux-yocto-master.zip
Merge branch 'master-next'HEADmaster
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig5
-rw-r--r--mm/compaction.c6
-rw-r--r--mm/debug_vm_pgtable.c207
-rw-r--r--mm/filemap.c66
-rw-r--r--mm/gup.c61
-rw-r--r--mm/gup_benchmark.c15
-rw-r--r--mm/highmem.c4
-rw-r--r--mm/huge_memory.c45
-rw-r--r--mm/hwpoison-inject.c18
-rw-r--r--mm/internal.h27
-rw-r--r--mm/khugepaged.c2
-rw-r--r--mm/ksm.c2
-rw-r--r--mm/madvise.c177
-rw-r--r--mm/memblock.c8
-rw-r--r--mm/memcontrol.c75
-rw-r--r--mm/memory-failure.c329
-rw-r--r--mm/memory.c23
-rw-r--r--mm/memory_hotplug.c257
-rw-r--r--mm/memremap.c3
-rw-r--r--mm/migrate.c82
-rw-r--r--mm/mmap.c81
-rw-r--r--mm/mmu_notifier.c2
-rw-r--r--mm/nommu.c7
-rw-r--r--mm/page-writeback.c1
-rw-r--r--mm/page_alloc.c241
-rw-r--r--mm/page_isolation.c16
-rw-r--r--mm/page_owner.c10
-rw-r--r--mm/page_poison.c20
-rw-r--r--mm/page_reporting.c4
-rw-r--r--mm/percpu.c3
-rw-r--r--mm/readahead.c148
-rw-r--r--mm/rmap.c10
-rw-r--r--mm/shmem.c2
-rw-r--r--mm/shuffle.c2
-rw-r--r--mm/slab.c2
-rw-r--r--mm/slab.h4
-rw-r--r--mm/slub.c2
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swap_state.c2
-rw-r--r--mm/truncate.c6
-rw-r--r--mm/util.c3
-rw-r--r--mm/vmalloc.c147
-rw-r--r--mm/vmscan.c5
-rw-r--r--mm/vmstat.c8
-rw-r--r--mm/workingset.c15
-rw-r--r--mm/zsmalloc.c10
46 files changed, 1157 insertions, 1008 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e72e61c1d62e..d42423f884a7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -152,6 +152,7 @@ config HAVE_BOOTMEM_INFO_NODE
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
+ select MEMORY_ISOLATION
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on ARCH_ENABLE_MEMORY_HOTPLUG
depends on 64BIT || BROKEN
@@ -178,7 +179,6 @@ config MEMORY_HOTPLUG_DEFAULT_ONLINE
config MEMORY_HOTREMOVE
bool "Allow for memory hot remove"
- select MEMORY_ISOLATION
select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64)
depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
depends on MIGRATION
@@ -816,6 +816,9 @@ config DEVICE_PRIVATE
memory; i.e., memory that is only accessible from the device (or
group of devices). You likely also want to select HMM_MIRROR.
+config VMAP_PFN
+ bool
+
config FRAME_VECTOR
bool
diff --git a/mm/compaction.c b/mm/compaction.c
index 6c63844fc061..6e0ee5641788 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -625,7 +625,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
}
/* Found a free page, will break it into order-0 pages */
- order = page_order(page);
+ order = buddy_order(page);
isolated = __isolate_free_page(page, order);
if (!isolated)
break;
@@ -898,7 +898,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* potential isolation targets.
*/
if (PageBuddy(page)) {
- unsigned long freepage_order = page_order_unsafe(page);
+ unsigned long freepage_order = buddy_order_unsafe(page);
/*
* Without lock, we cannot be sure that what we got is
@@ -1172,7 +1172,7 @@ static bool suitable_migration_target(struct compact_control *cc,
* the only small danger is that we skip a potentially suitable
* pageblock, so it's not worth to check order for valid range.
*/
- if (page_order_unsafe(page) >= pageblock_order)
+ if (buddy_order_unsafe(page) >= pageblock_order)
return false;
}
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 086309fb9b6f..c05d9dcf7891 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -28,6 +28,7 @@
#include <linux/swapops.h>
#include <linux/start_kernel.h>
#include <linux/sched/mm.h>
+#include <linux/io.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
@@ -44,10 +45,17 @@
* entry type. But these bits might affect the ability to clear entries with
* pxx_clear() because of how dynamic page table folding works on s390. So
* while loading up the entries do not change the lower 4 bits. It does not
- * have affect any other platform.
+ * have affect any other platform. Also avoid the 62nd bit on ppc64 that is
+ * used to mark a pte entry.
*/
-#define S390_MASK_BITS 4
-#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS)
+#define S390_SKIP_MASK GENMASK(3, 0)
+#if __BITS_PER_LONG == 64
+#define PPC64_SKIP_MASK GENMASK(62, 62)
+#else
+#define PPC64_SKIP_MASK 0x0
+#endif
+#define ARCH_SKIP_MASK (S390_SKIP_MASK | PPC64_SKIP_MASK)
+#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK)
#define RANDOM_NZVALUE GENMASK(7, 0)
static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
@@ -71,15 +79,18 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
{
pte_t pte = pfn_pte(pfn, prot);
+ /*
+ * Architectures optimize set_pte_at by avoiding TLB flush.
+ * This requires set_pte_at to be not used to update an
+ * existing pte entry. Clear pte before we do set_pte_at
+ */
+
pr_debug("Validating PTE advanced\n");
pte = pfn_pte(pfn, prot);
set_pte_at(mm, vaddr, ptep, pte);
ptep_set_wrprotect(mm, vaddr, ptep);
pte = ptep_get(ptep);
WARN_ON(pte_write(pte));
-
- pte = pfn_pte(pfn, prot);
- set_pte_at(mm, vaddr, ptep, pte);
ptep_get_and_clear(mm, vaddr, ptep);
pte = ptep_get(ptep);
WARN_ON(!pte_none(pte));
@@ -93,13 +104,11 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
pte = ptep_get(ptep);
WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
-
- pte = pfn_pte(pfn, prot);
- set_pte_at(mm, vaddr, ptep, pte);
ptep_get_and_clear_full(mm, vaddr, ptep, 1);
pte = ptep_get(ptep);
WARN_ON(!pte_none(pte));
+ pte = pfn_pte(pfn, prot);
pte = pte_mkyoung(pte);
set_pte_at(mm, vaddr, ptep, pte);
ptep_test_and_clear_young(vma, vaddr, ptep);
@@ -111,10 +120,14 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
{
pte_t pte = pfn_pte(pfn, prot);
+ if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+ return;
+
pr_debug("Validating PTE saved write\n");
WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));
WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte))));
}
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
{
@@ -141,7 +154,7 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
static void __init pmd_advanced_tests(struct mm_struct *mm,
struct vm_area_struct *vma, pmd_t *pmdp,
unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
+ pgprot_t prot, pgtable_t pgtable)
{
pmd_t pmd = pfn_pmd(pfn, prot);
@@ -152,14 +165,13 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
/* Align the address wrt HPAGE_PMD_SIZE */
vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
+ pgtable_trans_huge_deposit(mm, pmdp, pgtable);
+
pmd = pfn_pmd(pfn, prot);
set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_set_wrprotect(mm, vaddr, pmdp);
pmd = READ_ONCE(*pmdp);
WARN_ON(pmd_write(pmd));
-
- pmd = pfn_pmd(pfn, prot);
- set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_huge_get_and_clear(mm, vaddr, pmdp);
pmd = READ_ONCE(*pmdp);
WARN_ON(!pmd_none(pmd));
@@ -173,18 +185,20 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1);
pmd = READ_ONCE(*pmdp);
WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
-
- pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
- set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
pmd = READ_ONCE(*pmdp);
WARN_ON(!pmd_none(pmd));
+ pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
pmd = pmd_mkyoung(pmd);
set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_test_and_clear_young(vma, vaddr, pmdp);
pmd = READ_ONCE(*pmdp);
WARN_ON(pmd_young(pmd));
+
+ /* Clear the pte entries */
+ pmdp_huge_get_and_clear(mm, vaddr, pmdp);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
}
static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
@@ -199,11 +213,12 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pmd_leaf(pmd));
}
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
{
pmd_t pmd;
- if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ if (!arch_ioremap_pmd_supported())
return;
pr_debug("Validating PMD huge\n");
@@ -217,11 +232,17 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
pmd = READ_ONCE(*pmdp);
WARN_ON(!pmd_none(pmd));
}
+#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
{
pmd_t pmd = pfn_pmd(pfn, prot);
+ if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+ return;
+
pr_debug("Validating PMD saved write\n");
WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd))));
@@ -272,17 +293,9 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
WARN_ON(pud_write(pud));
#ifndef __PAGETABLE_PMD_FOLDED
- pud = pfn_pud(pfn, prot);
- set_pud_at(mm, vaddr, pudp, pud);
pudp_huge_get_and_clear(mm, vaddr, pudp);
pud = READ_ONCE(*pudp);
WARN_ON(!pud_none(pud));
-
- pud = pfn_pud(pfn, prot);
- set_pud_at(mm, vaddr, pudp, pud);
- pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
- pud = READ_ONCE(*pudp);
- WARN_ON(!pud_none(pud));
#endif /* __PAGETABLE_PMD_FOLDED */
pud = pfn_pud(pfn, prot);
pud = pud_wrprotect(pud);
@@ -294,11 +307,20 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
pud = READ_ONCE(*pudp);
WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
+#ifndef __PAGETABLE_PMD_FOLDED
+ pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!pud_none(pud));
+#endif /* __PAGETABLE_PMD_FOLDED */
+
+ pud = pfn_pud(pfn, prot);
pud = pud_mkyoung(pud);
set_pud_at(mm, vaddr, pudp, pud);
pudp_test_and_clear_young(vma, vaddr, pudp);
pud = READ_ONCE(*pudp);
WARN_ON(pud_young(pud));
+
+ pudp_huge_get_and_clear(mm, vaddr, pudp);
}
static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
@@ -313,11 +335,12 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pud_leaf(pud));
}
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
{
pud_t pud;
- if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ if (!arch_ioremap_pud_supported())
return;
pr_debug("Validating PUD huge\n");
@@ -331,6 +354,10 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
pud = READ_ONCE(*pudp);
WARN_ON(!pud_none(pud));
}
+#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { }
+#endif /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+
#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
static void __init pud_advanced_tests(struct mm_struct *mm,
@@ -350,7 +377,7 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
static void __init pmd_advanced_tests(struct mm_struct *mm,
struct vm_area_struct *vma, pmd_t *pmdp,
unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
+ pgprot_t prot, pgtable_t pgtable)
{
}
static void __init pud_advanced_tests(struct mm_struct *mm,
@@ -417,8 +444,6 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
* This entry points to next level page table page.
* Hence this must not qualify as pud_bad().
*/
- pmd_clear(pmdp);
- pud_clear(pudp);
pud_populate(mm, pudp, pmdp);
pud = READ_ONCE(*pudp);
WARN_ON(pud_bad(pud));
@@ -515,12 +540,15 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
#endif /* PAGETABLE_P4D_FOLDED */
static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
- unsigned long vaddr)
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
{
- pte_t pte = ptep_get(ptep);
+ pte_t pte = pfn_pte(pfn, prot);
pr_debug("Validating PTE clear\n");
+#ifndef CONFIG_RISCV
pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
+#endif
set_pte_at(mm, vaddr, ptep, pte);
barrier();
pte_clear(mm, vaddr, ptep);
@@ -550,7 +578,6 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
* This entry points to next level page table page.
* Hence this must not qualify as pmd_bad().
*/
- pmd_clear(pmdp);
pmd_populate(mm, pmdp, pgtable);
pmd = READ_ONCE(*pmdp);
WARN_ON(pmd_bad(pmd));
@@ -784,57 +811,8 @@ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pte_huge(pte_mkhuge(pte)));
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
}
-
-static void __init hugetlb_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma,
- pte_t *ptep, unsigned long pfn,
- unsigned long vaddr, pgprot_t prot)
-{
- struct page *page = pfn_to_page(pfn);
- pte_t pte = ptep_get(ptep);
- unsigned long paddr = __pfn_to_phys(pfn) & PMD_MASK;
-
- pr_debug("Validating HugeTLB advanced\n");
- pte = pte_mkhuge(mk_pte(pfn_to_page(PHYS_PFN(paddr)), prot));
- set_huge_pte_at(mm, vaddr, ptep, pte);
- barrier();
- WARN_ON(!pte_same(pte, huge_ptep_get(ptep)));
- huge_pte_clear(mm, vaddr, ptep, PMD_SIZE);
- pte = huge_ptep_get(ptep);
- WARN_ON(!huge_pte_none(pte));
-
- pte = mk_huge_pte(page, prot);
- set_huge_pte_at(mm, vaddr, ptep, pte);
- barrier();
- huge_ptep_set_wrprotect(mm, vaddr, ptep);
- pte = huge_ptep_get(ptep);
- WARN_ON(huge_pte_write(pte));
-
- pte = mk_huge_pte(page, prot);
- set_huge_pte_at(mm, vaddr, ptep, pte);
- barrier();
- huge_ptep_get_and_clear(mm, vaddr, ptep);
- pte = huge_ptep_get(ptep);
- WARN_ON(!huge_pte_none(pte));
-
- pte = mk_huge_pte(page, prot);
- pte = huge_pte_wrprotect(pte);
- set_huge_pte_at(mm, vaddr, ptep, pte);
- barrier();
- pte = huge_pte_mkwrite(pte);
- pte = huge_pte_mkdirty(pte);
- huge_ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
- pte = huge_ptep_get(ptep);
- WARN_ON(!(huge_pte_write(pte) && huge_pte_dirty(pte)));
-}
#else /* !CONFIG_HUGETLB_PAGE */
static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init hugetlb_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma,
- pte_t *ptep, unsigned long pfn,
- unsigned long vaddr, pgprot_t prot)
-{
-}
#endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -965,7 +943,13 @@ static int __init debug_vm_pgtable(void)
p4dp = p4d_alloc(mm, pgdp, vaddr);
pudp = pud_alloc(mm, p4dp, vaddr);
pmdp = pmd_alloc(mm, pudp, vaddr);
- ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl);
+ /*
+ * Allocate pgtable_t
+ */
+ if (pte_alloc(mm, pmdp)) {
+ pr_err("pgtable allocation failed\n");
+ return 1;
+ }
/*
* Save all the page table page addresses as the page table
@@ -985,32 +969,11 @@ static int __init debug_vm_pgtable(void)
p4d_basic_tests(p4d_aligned, prot);
pgd_basic_tests(pgd_aligned, prot);
- pte_clear_tests(mm, ptep, vaddr);
- pmd_clear_tests(mm, pmdp);
- pud_clear_tests(mm, pudp);
- p4d_clear_tests(mm, p4dp);
- pgd_clear_tests(mm, pgdp);
-
- pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
- pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot);
- pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
- hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
-
pmd_leaf_tests(pmd_aligned, prot);
pud_leaf_tests(pud_aligned, prot);
- pmd_huge_tests(pmdp, pmd_aligned, prot);
- pud_huge_tests(pudp, pud_aligned, prot);
-
- pte_savedwrite_tests(pte_aligned, prot);
- pmd_savedwrite_tests(pmd_aligned, prot);
-
- pte_unmap_unlock(ptep, ptl);
-
- pmd_populate_tests(mm, pmdp, saved_ptep);
- pud_populate_tests(mm, pudp, saved_pmdp);
- p4d_populate_tests(mm, p4dp, saved_pudp);
- pgd_populate_tests(mm, pgdp, saved_p4dp);
+ pte_savedwrite_tests(pte_aligned, protnone);
+ pmd_savedwrite_tests(pmd_aligned, protnone);
pte_special_tests(pte_aligned, prot);
pte_protnone_tests(pte_aligned, protnone);
@@ -1029,11 +992,43 @@ static int __init debug_vm_pgtable(void)
pmd_swap_tests(pmd_aligned, prot);
swap_migration_tests();
- hugetlb_basic_tests(pte_aligned, prot);
pmd_thp_tests(pmd_aligned, prot);
pud_thp_tests(pud_aligned, prot);
+ hugetlb_basic_tests(pte_aligned, prot);
+
+ /*
+ * Page table modifying tests. They need to hold
+ * proper page table lock.
+ */
+
+ ptep = pte_offset_map_lock(mm, pmdp, vaddr, &ptl);
+ pte_clear_tests(mm, ptep, pte_aligned, vaddr, prot);
+ pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+ pte_unmap_unlock(ptep, ptl);
+
+ ptl = pmd_lock(mm, pmdp);
+ pmd_clear_tests(mm, pmdp);
+ pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep);
+ pmd_huge_tests(pmdp, pmd_aligned, prot);
+ pmd_populate_tests(mm, pmdp, saved_ptep);
+ spin_unlock(ptl);
+
+ ptl = pud_lock(mm, pudp);
+ pud_clear_tests(mm, pudp);
+ pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
+ pud_huge_tests(pudp, pud_aligned, prot);
+ pud_populate_tests(mm, pudp, saved_pmdp);
+ spin_unlock(ptl);
+
+ spin_lock(&mm->page_table_lock);
+ p4d_clear_tests(mm, p4dp);
+ pgd_clear_tests(mm, pgdp);
+ p4d_populate_tests(mm, p4dp, saved_pudp);
+ pgd_populate_tests(mm, pgdp, saved_p4dp);
+ spin_unlock(&mm->page_table_lock);
+
p4d_free(mm, saved_p4dp);
pud_free(mm, saved_pudp);
pmd_free(mm, saved_pmdp);
diff --git a/mm/filemap.c b/mm/filemap.c
index e3b8987153e6..e4101b5bfa82 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -249,7 +249,7 @@ static void page_cache_free_page(struct address_space *mapping,
freepage(page);
if (PageTransHuge(page) && !PageHuge(page)) {
- page_ref_sub(page, HPAGE_PMD_NR);
+ page_ref_sub(page, thp_nr_pages(page));
VM_BUG_ON_PAGE(page_count(page) <= 0, page);
} else {
put_page(page);
@@ -829,13 +829,12 @@ EXPORT_SYMBOL_GPL(replace_page_cache_page);
noinline int __add_to_page_cache_locked(struct page *page,
struct address_space *mapping,
- pgoff_t offset, gfp_t gfp_mask,
+ pgoff_t offset, gfp_t gfp,
void **shadowp)
{
XA_STATE(xas, &mapping->i_pages, offset);
int huge = PageHuge(page);
int error;
- void *old;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
@@ -846,25 +845,46 @@ noinline int __add_to_page_cache_locked(struct page *page,
page->index = offset;
if (!huge) {
- error = mem_cgroup_charge(page, current->mm, gfp_mask);
+ error = mem_cgroup_charge(page, current->mm, gfp);
if (error)
goto error;
}
+ gfp &= GFP_RECLAIM_MASK;
+
do {
+ unsigned int order = xa_get_order(xas.xa, xas.xa_index);
+ void *entry, *old = NULL;
+
+ if (order > thp_order(page))
+ xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
+ order, gfp);
xas_lock_irq(&xas);
- old = xas_load(&xas);
- if (old && !xa_is_value(old))
- xas_set_err(&xas, -EEXIST);
+ xas_for_each_conflict(&xas, entry) {
+ old = entry;
+ if (!xa_is_value(entry)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ }
+
+ if (old) {
+ if (shadowp)
+ *shadowp = old;
+ /* entry may have been split before we acquired lock */
+ order = xa_get_order(xas.xa, xas.xa_index);
+ if (order > thp_order(page)) {
+ xas_split(&xas, old, order);
+ xas_reset(&xas);
+ }
+ }
+
xas_store(&xas, page);
if (xas_error(&xas))
goto unlock;
- if (xa_is_value(old)) {
+ if (old)
mapping->nrexceptional--;
- if (shadowp)
- *shadowp = old;
- }
mapping->nrpages++;
/* hugetlb pages do not participate in page cache accounting */
@@ -872,7 +892,7 @@ noinline int __add_to_page_cache_locked(struct page *page,
__inc_lruvec_page_state(page, NR_FILE_PAGES);
unlock:
xas_unlock_irq(&xas);
- } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
+ } while (xas_nomem(&xas, gfp));
if (xas_error(&xas)) {
error = xas_error(&xas);
@@ -1425,7 +1445,7 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem
* unlock_page - unlock a locked page
* @page: the page
*
- * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
+ * Unlocks the page and wakes up sleepers in wait_on_page_locked().
* Also wakes sleepers in wait_on_page_writeback() because the wakeup
* mechanism between PageLocked pages and PageWriteback pages is shared.
* But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
@@ -2179,6 +2199,14 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
offset = *ppos & ~PAGE_MASK;
+ /*
+ * If we've already successfully copied some data, then we
+ * can no longer safely return -EIOCBQUEUED. Hence mark
+ * an async read NOWAIT at that point.
+ */
+ if (written && (iocb->ki_flags & IOCB_WAITQ))
+ iocb->ki_flags |= IOCB_NOWAIT;
+
for (;;) {
struct page *page;
pgoff_t end_index;
@@ -2568,8 +2596,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
struct file *file = vmf->vma->vm_file;
struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
+ DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff);
struct file *fpin = NULL;
- pgoff_t offset = vmf->pgoff;
unsigned int mmap_miss;
/* If we don't want any read-ahead, don't bother */
@@ -2580,8 +2608,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
if (vmf->vma->vm_flags & VM_SEQ_READ) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- page_cache_sync_readahead(mapping, ra, file, offset,
- ra->ra_pages);
+ page_cache_sync_ra(&ractl, ra, ra->ra_pages);
return fpin;
}
@@ -2601,10 +2628,11 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
* mmap read-around
*/
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
+ ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
ra->size = ra->ra_pages;
ra->async_size = ra->ra_pages / 4;
- ra_submit(ra, mapping, file);
+ ractl._index = ra->start;
+ do_page_cache_ra(&ractl, ra->size, ra->async_size);
return fpin;
}
@@ -2984,7 +3012,7 @@ filler:
goto out;
/*
- * Page is not up to date and may be locked due one of the following
+ * Page is not up to date and may be locked due to one of the following
* case a: Page is being filled and the page lock is held
* case b: Read/write error clearing the page uptodate status
* case c: Truncation in progress (page locked)
diff --git a/mm/gup.c b/mm/gup.c
index ad617e7f22f5..102877ed77a4 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1490,35 +1490,6 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
mmap_read_unlock(mm);
return ret; /* 0 or negative error code */
}
-
-/**
- * get_dump_page() - pin user page in memory while writing it to core dump
- * @addr: user address
- *
- * Returns struct page pointer of user page pinned for dump,
- * to be freed afterwards by put_page().
- *
- * Returns NULL on any kind of failure - a hole must then be inserted into
- * the corefile, to preserve alignment with its headers; and also returns
- * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
- * allowing a hole to be left in the corefile to save diskspace.
- *
- * Called without mmap_lock, but after all other threads have been killed.
- */
-#ifdef CONFIG_ELF_CORE
-struct page *get_dump_page(unsigned long addr)
-{
- struct vm_area_struct *vma;
- struct page *page;
-
- if (__get_user_pages(current->mm, addr, 1,
- FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
- NULL) < 1)
- return NULL;
- flush_cache_page(vma, addr, page_to_pfn(page));
- return page;
-}
-#endif /* CONFIG_ELF_CORE */
#else /* CONFIG_MMU */
static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
unsigned long nr_pages, struct page **pages,
@@ -1564,6 +1535,38 @@ finish_or_fault:
}
#endif /* !CONFIG_MMU */
+/**
+ * get_dump_page() - pin user page in memory while writing it to core dump
+ * @addr: user address
+ *
+ * Returns struct page pointer of user page pinned for dump,
+ * to be freed afterwards by put_page().
+ *
+ * Returns NULL on any kind of failure - a hole must then be inserted into
+ * the corefile, to preserve alignment with its headers; and also returns
+ * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
+ * allowing a hole to be left in the corefile to save diskspace.
+ *
+ * Called without mmap_lock (takes and releases the mmap_lock by itself).
+ */
+#ifdef CONFIG_ELF_CORE
+struct page *get_dump_page(unsigned long addr)
+{
+ struct mm_struct *mm = current->mm;
+ struct page *page;
+ int locked = 1;
+ int ret;
+
+ if (mmap_read_lock_killable(mm))
+ return NULL;
+ ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked,
+ FOLL_FORCE | FOLL_DUMP | FOLL_GET);
+ if (locked)
+ mmap_read_unlock(mm);
+ return (ret == 1) ? page : NULL;
+}
+#endif /* CONFIG_ELF_CORE */
+
#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
{
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index 464cae1fa3ea..8b3e5b5cd8fa 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -72,6 +72,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
int nr;
struct page **pages;
int ret = 0;
+ bool needs_mmap_lock =
+ cmd != GUP_FAST_BENCHMARK && cmd != PIN_FAST_BENCHMARK;
if (gup->size > ULONG_MAX)
return -EINVAL;
@@ -81,6 +83,11 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
if (!pages)
return -ENOMEM;
+ if (needs_mmap_lock && mmap_read_lock_killable(current->mm)) {
+ ret = -EINTR;
+ goto free_pages;
+ }
+
i = 0;
nr = gup->nr_pages_per_call;
start_time = ktime_get();
@@ -120,9 +127,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
pages + i, NULL);
break;
default:
- kvfree(pages);
ret = -EINVAL;
- goto out;
+ goto unlock;
}
if (nr <= 0)
@@ -150,8 +156,11 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
end_time = ktime_get();
gup->put_delta_usec = ktime_us_delta(end_time, start_time);
+unlock:
+ if (needs_mmap_lock)
+ mmap_read_unlock(current->mm);
+free_pages:
kvfree(pages);
-out:
return ret;
}
diff --git a/mm/highmem.c b/mm/highmem.c
index 64d8dea47dd1..1352a27951e3 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -369,7 +369,7 @@ void kunmap_high(struct page *page)
}
EXPORT_SYMBOL(kunmap_high);
-#endif
+#endif /* CONFIG_HIGHMEM */
#if defined(HASHED_PAGE_VIRTUAL)
@@ -481,4 +481,4 @@ void __init page_address_init(void)
}
}
-#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
+#endif /* defined(HASHED_PAGE_VIRTUAL) */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 65c289c13b58..9474dbc150ed 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2335,13 +2335,13 @@ static void unmap_page(struct page *page)
VM_BUG_ON_PAGE(!unmap_success, page);
}
-static void remap_page(struct page *page)
+static void remap_page(struct page *page, unsigned int nr)
{
int i;
if (PageTransHuge(page)) {
remove_migration_ptes(page, page, true);
} else {
- for (i = 0; i < HPAGE_PMD_NR; i++)
+ for (i = 0; i < nr; i++)
remove_migration_ptes(page + i, page + i, true);
}
}
@@ -2419,6 +2419,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
struct lruvec *lruvec;
struct address_space *swap_cache = NULL;
unsigned long offset = 0;
+ unsigned int nr = thp_nr_pages(head);
int i;
lruvec = mem_cgroup_page_lruvec(head, pgdat);
@@ -2434,7 +2435,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
xa_lock(&swap_cache->i_pages);
}
- for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
+ for (i = nr - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
/* Some pages can be beyond i_size: drop them from page cache */
if (head[i].index >= end) {
@@ -2454,7 +2455,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
ClearPageCompound(head);
- split_page_owner(head, HPAGE_PMD_ORDER);
+ split_page_owner(head, nr);
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
@@ -2473,9 +2474,15 @@ static void __split_huge_page(struct page *page, struct list_head *list,
spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- remap_page(head);
+ remap_page(head, nr);
- for (i = 0; i < HPAGE_PMD_NR; i++) {
+ if (PageSwapCache(head)) {
+ swp_entry_t entry = { .val = page_private(head) };
+
+ split_swap_cluster(entry);
+ }
+
+ for (i = 0; i < nr; i++) {
struct page *subpage = head + i;
if (subpage == page)
continue;
@@ -2494,7 +2501,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
int total_mapcount(struct page *page)
{
- int i, compound, ret;
+ int i, compound, nr, ret;
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -2502,16 +2509,17 @@ int total_mapcount(struct page *page)
return atomic_read(&page->_mapcount) + 1;
compound = compound_mapcount(page);
+ nr = compound_nr(page);
if (PageHuge(page))
return compound;
ret = compound;
- for (i = 0; i < HPAGE_PMD_NR; i++)
+ for (i = 0; i < nr; i++)
ret += atomic_read(&page[i]._mapcount) + 1;
/* File pages has compound_mapcount included in _mapcount */
if (!PageAnon(page))
- return ret - compound * HPAGE_PMD_NR;
+ return ret - compound * nr;
if (PageDoubleMap(page))
- ret -= HPAGE_PMD_NR;
+ ret -= nr;
return ret;
}
@@ -2556,14 +2564,14 @@ int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
page = compound_head(page);
_total_mapcount = ret = 0;
- for (i = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0; i < thp_nr_pages(page); i++) {
mapcount = atomic_read(&page[i]._mapcount) + 1;
ret = max(ret, mapcount);
_total_mapcount += mapcount;
}
if (PageDoubleMap(page)) {
ret -= 1;
- _total_mapcount -= HPAGE_PMD_NR;
+ _total_mapcount -= thp_nr_pages(page);
}
mapcount = compound_mapcount(page);
ret += mapcount;
@@ -2580,9 +2588,9 @@ bool can_split_huge_page(struct page *page, int *pextra_pins)
/* Additional pins from page cache */
if (PageAnon(page))
- extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
+ extra_pins = PageSwapCache(page) ? thp_nr_pages(page) : 0;
else
- extra_pins = HPAGE_PMD_NR;
+ extra_pins = thp_nr_pages(page);
if (pextra_pins)
*pextra_pins = extra_pins;
return total_mapcount(page) == page_count(page) - extra_pins - 1;
@@ -2709,12 +2717,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
__split_huge_page(page, list, end, flags);
- if (PageSwapCache(head)) {
- swp_entry_t entry = { .val = page_private(head) };
-
- ret = split_swap_cluster(entry);
- } else
- ret = 0;
+ ret = 0;
} else {
if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
pr_alert("total_mapcount: %u, page_count(): %u\n",
@@ -2728,7 +2731,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
fail: if (mapping)
xa_unlock(&mapping->i_pages);
spin_unlock_irqrestore(&pgdata->lru_lock, flags);
- remap_page(head);
+ remap_page(head, thp_nr_pages(head));
ret = -EBUSY;
}
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index e488876b168a..1ae1ebc2b9b1 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -26,11 +26,6 @@ static int hwpoison_inject(void *data, u64 val)
p = pfn_to_page(pfn);
hpage = compound_head(p);
- /*
- * This implies unable to support free buddy pages.
- */
- if (!get_hwpoison_page(p))
- return 0;
if (!hwpoison_filter_enable)
goto inject;
@@ -40,23 +35,20 @@ static int hwpoison_inject(void *data, u64 val)
* This implies unable to support non-LRU pages.
*/
if (!PageLRU(hpage) && !PageHuge(p))
- goto put_out;
+ return 0;
/*
- * do a racy check with elevated page count, to make sure PG_hwpoison
- * will only be set for the targeted owner (or on a free page).
+ * do a racy check to make sure PG_hwpoison will only be set for
+ * the targeted owner (or on a free page).
* memory_failure() will redo the check reliably inside page lock.
*/
err = hwpoison_filter(hpage);
if (err)
- goto put_out;
+ return 0;
inject:
pr_info("Injecting memory failure at pfn %#lx\n", pfn);
- return memory_failure(pfn, MF_COUNT_INCREASED);
-put_out:
- put_hwpoison_page(p);
- return 0;
+ return memory_failure(pfn, 0);
}
static int hwpoison_unpoison(void *data, u64 val)
diff --git a/mm/internal.h b/mm/internal.h
index a801a4d51f26..c43ccdddb0f6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -49,20 +49,15 @@ void unmap_page_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
struct zap_details *details);
-void force_page_cache_readahead(struct address_space *, struct file *,
- pgoff_t index, unsigned long nr_to_read);
-void __do_page_cache_readahead(struct address_space *, struct file *,
- pgoff_t index, unsigned long nr_to_read,
+void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read,
unsigned long lookahead_size);
-
-/*
- * Submit IO for the read-ahead request in file_ra_state.
- */
-static inline void ra_submit(struct file_ra_state *ra,
- struct address_space *mapping, struct file *filp)
+void force_page_cache_ra(struct readahead_control *, struct file_ra_state *,
+ unsigned long nr);
+static inline void force_page_cache_readahead(struct address_space *mapping,
+ struct file *file, pgoff_t index, unsigned long nr_to_read)
{
- __do_page_cache_readahead(mapping, filp,
- ra->start, ra->size, ra->async_size);
+ DEFINE_READAHEAD(ractl, file, mapping, index);
+ force_page_cache_ra(&ractl, &file->f_ra, nr_to_read);
}
struct page *find_get_entry(struct address_space *mapping, pgoff_t index);
@@ -275,16 +270,16 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
* page from being allocated in parallel and returning garbage as the order.
* If a caller does not hold page_zone(page)->lock, it must guarantee that the
* page cannot be allocated or merged in parallel. Alternatively, it must
- * handle invalid values gracefully, and use page_order_unsafe() below.
+ * handle invalid values gracefully, and use buddy_order_unsafe() below.
*/
-static inline unsigned int page_order(struct page *page)
+static inline unsigned int buddy_order(struct page *page)
{
/* PageBuddy() must be checked by the caller */
return page_private(page);
}
/*
- * Like page_order(), but for callers who cannot afford to hold the zone lock.
+ * Like buddy_order(), but for callers who cannot afford to hold the zone lock.
* PageBuddy() should be checked first by the caller to minimize race window,
* and invalid values must be handled gracefully.
*
@@ -294,7 +289,7 @@ static inline unsigned int page_order(struct page *page)
* times, potentially observing different values in the tests and the actual
* use of the result.
*/
-#define page_order_unsafe(page) READ_ONCE(page_private(page))
+#define buddy_order_unsafe(page) READ_ONCE(page_private(page))
static inline bool is_cow_mapping(vm_flags_t flags)
{
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 58b0d9c502a1..4e3dff13eb70 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -434,7 +434,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
- return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm);
+ return atomic_read(&mm->mm_users) == 0;
}
static bool hugepage_vma_check(struct vm_area_struct *vma,
diff --git a/mm/ksm.c b/mm/ksm.c
index 9afccc36dbd2..0960750bb316 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -81,7 +81,7 @@
* different KSM page copy of that content
*
* Internally, the regular nodes, "dups" and "chains" are represented
- * using the same :c:type:`struct stable_node` structure.
+ * using the same struct stable_node structure.
*
* In addition to the stable tree, KSM uses a second data structure called the
* unstable tree: this tree holds pointers to pages which have been found to
diff --git a/mm/madvise.c b/mm/madvise.c
index 9b065d412e5f..416a56b8e757 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -17,6 +17,8 @@
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
@@ -27,7 +29,6 @@
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>
-#include <linux/sched/mm.h>
#include <asm/tlb.h>
@@ -258,6 +259,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end)
{
+ struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
loff_t offset;
@@ -294,10 +296,10 @@ static long madvise_willneed(struct vm_area_struct *vma,
get_file(file);
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
fput(file);
- mmap_read_lock(current->mm);
+ mmap_read_lock(mm);
return 0;
}
@@ -766,6 +768,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
int behavior)
{
+ struct mm_struct *mm = vma->vm_mm;
+
*prev = vma;
if (!can_madv_lru_vma(vma))
return -EINVAL;
@@ -773,8 +777,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
if (!userfaultfd_remove(vma, start, end)) {
*prev = NULL; /* mmap_lock has been dropped, prev is stale */
- mmap_read_lock(current->mm);
- vma = find_vma(current->mm, start);
+ mmap_read_lock(mm);
+ vma = find_vma(mm, start);
if (!vma)
return -ENOMEM;
if (start < vma->vm_start) {
@@ -828,6 +832,7 @@ static long madvise_remove(struct vm_area_struct *vma,
loff_t offset;
int error;
struct file *f;
+ struct mm_struct *mm = vma->vm_mm;
*prev = NULL; /* tell sys_madvise we drop mmap_lock */
@@ -855,13 +860,13 @@ static long madvise_remove(struct vm_area_struct *vma,
get_file(f);
if (userfaultfd_remove(vma, start, end)) {
/* mmap_lock was not released by userfaultfd_remove() */
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
}
error = vfs_fallocate(f,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
offset, end - start);
fput(f);
- mmap_read_lock(current->mm);
+ mmap_read_lock(mm);
return error;
}
@@ -872,7 +877,6 @@ static long madvise_remove(struct vm_area_struct *vma,
static int madvise_inject_error(int behavior,
unsigned long start, unsigned long end)
{
- struct page *page;
struct zone *zone;
unsigned long size;
@@ -882,6 +886,7 @@ static int madvise_inject_error(int behavior,
for (; start < end; start += size) {
unsigned long pfn;
+ struct page *page;
int ret;
ret = get_user_pages_fast(start, 1, 0, &page);
@@ -896,32 +901,23 @@ static int madvise_inject_error(int behavior,
*/
size = page_size(compound_head(page));
- if (PageHWPoison(page)) {
- put_page(page);
- continue;
- }
-
if (behavior == MADV_SOFT_OFFLINE) {
pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
- pfn, start);
-
+ pfn, start);
ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
- if (ret)
- return ret;
- continue;
+ } else {
+ pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
+ pfn, start);
+ /*
+ * Drop the page reference taken by get_user_pages_fast(). In
+ * the absence of MF_COUNT_INCREASED the memory_failure()
+ * routine is responsible for pinning the page to prevent it
+ * from being released back to the page allocator.
+ */
+ put_page(page);
+ ret = memory_failure(pfn, 0);
}
- pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
- pfn, start);
-
- /*
- * Drop the page reference taken by get_user_pages_fast(). In
- * the absence of MF_COUNT_INCREASED the memory_failure()
- * routine is responsible for pinning the page to prevent it
- * from being released back to the page allocator.
- */
- put_page(page);
- ret = memory_failure(pfn, 0);
if (ret)
return ret;
}
@@ -993,6 +989,18 @@ madvise_behavior_valid(int behavior)
}
}
+static bool
+process_madvise_behavior_valid(int behavior)
+{
+ switch (behavior) {
+ case MADV_COLD:
+ case MADV_PAGEOUT:
+ return true;
+ default:
+ return false;
+ }
+}
+
/*
* The madvise(2) system call.
*
@@ -1040,6 +1048,11 @@ madvise_behavior_valid(int behavior)
* MADV_DONTDUMP - the application wants to prevent pages in the given range
* from being included in its core dump.
* MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
+ * MADV_COLD - the application is not expected to use this memory soon,
+ * deactivate pages in this range so that they can be reclaimed
+ * easily if memory pressure hanppens.
+ * MADV_PAGEOUT - the application is not expected to use this memory soon,
+ * page out the pages in this range immediately.
*
* return values:
* zero - success
@@ -1054,7 +1067,7 @@ madvise_behavior_valid(int behavior)
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
*/
-int do_madvise(unsigned long start, size_t len_in, int behavior)
+int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
unsigned long end, tmp;
struct vm_area_struct *vma, *prev;
@@ -1092,27 +1105,10 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
write = madvise_need_mmap_write(behavior);
if (write) {
- if (mmap_write_lock_killable(current->mm))
- return -EINTR;
-
- /*
- * We may have stolen the mm from another process
- * that is undergoing core dumping.
- *
- * Right now that's io_ring, in the future it may
- * be remote process management and not "current"
- * at all.
- *
- * We need to fix core dumping to not do this,
- * but for now we have the mmget_still_valid()
- * model.
- */
- if (!mmget_still_valid(current->mm)) {
- mmap_write_unlock(current->mm);
+ if (mmap_write_lock_killable(mm))
return -EINTR;
- }
} else {
- mmap_read_lock(current->mm);
+ mmap_read_lock(mm);
}
/*
@@ -1120,7 +1116,7 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
* ranges, just ignore them, but return -ENOMEM at the end.
* - different from the way of handling in mlock etc.
*/
- vma = find_vma_prev(current->mm, start, &prev);
+ vma = find_vma_prev(mm, start, &prev);
if (vma && start > vma->vm_start)
prev = vma;
@@ -1157,19 +1153,92 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
if (prev)
vma = prev->vm_next;
else /* madvise_remove dropped mmap_lock */
- vma = find_vma(current->mm, start);
+ vma = find_vma(mm, start);
}
out:
blk_finish_plug(&plug);
if (write)
- mmap_write_unlock(current->mm);
+ mmap_write_unlock(mm);
else
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
return error;
}
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
- return do_madvise(start, len_in, behavior);
+ return do_madvise(current->mm, start, len_in, behavior);
+}
+
+SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
+ size_t, vlen, int, behavior, unsigned int, flags)
+{
+ ssize_t ret;
+ struct iovec iovstack[UIO_FASTIOV], iovec;
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ struct pid *pid;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ size_t total_len;
+ unsigned int f_flags;
+
+ if (flags != 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+ if (ret < 0)
+ goto out;
+
+ pid = pidfd_get_pid(pidfd, &f_flags);
+ if (IS_ERR(pid)) {
+ ret = PTR_ERR(pid);
+ goto free_iov;
+ }
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ ret = -ESRCH;
+ goto put_pid;
+ }
+
+ if (task->mm != current->mm &&
+ !process_madvise_behavior_valid(behavior)) {
+ ret = -EINVAL;
+ goto release_task;
+ }
+
+ mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
+ if (IS_ERR_OR_NULL(mm)) {
+ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+ goto release_task;
+ }
+
+ total_len = iov_iter_count(&iter);
+
+ while (iov_iter_count(&iter)) {
+ iovec = iov_iter_iovec(&iter);
+ ret = do_madvise(mm, (unsigned long)iovec.iov_base,
+ iovec.iov_len, behavior);
+ if (ret < 0)
+ break;
+ iov_iter_advance(&iter, iovec.iov_len);
+ }
+
+ if (ret == 0)
+ ret = total_len - iov_iter_count(&iter);
+
+ mmput(mm);
+ return ret;
+
+release_task:
+ put_task_struct(task);
+put_pid:
+ put_pid(pid);
+free_iov:
+ kfree(iov);
+out:
+ return ret;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 165f40a8a254..b68ee86788af 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -48,12 +48,12 @@
* boot regardless of the possible restrictions and memory hot(un)plug;
* the ``physmem`` type is only available on some architectures.
*
- * Each region is represented by :c:type:`struct memblock_region` that
+ * Each region is represented by struct memblock_region that
* defines the region extents, its attributes and NUMA node id on NUMA
- * systems. Every memory type is described by the :c:type:`struct
- * memblock_type` which contains an array of memory regions along with
+ * systems. Every memory type is described by the struct memblock_type
+ * which contains an array of memory regions along with
* the allocator metadata. The "memory" and "reserved" types are nicely
- * wrapped with :c:type:`struct memblock`. This structure is statically
+ * wrapped with struct memblock. This structure is statically
* initialized at build time. The region arrays are initially sized to
* %INIT_MEMBLOCK_REGIONS for "memory" and %INIT_MEMBLOCK_RESERVED_REGIONS
* for "reserved". The region array for "physmem" is initially sized to
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b98911c88bab..7fe93470f59f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -73,6 +73,9 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
struct mem_cgroup *root_mem_cgroup __read_mostly;
+/* Active memory cgroup to use from an interrupt context */
+DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
+
/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;
@@ -1061,23 +1064,56 @@ struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
}
EXPORT_SYMBOL(get_mem_cgroup_from_page);
-/**
- * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
- */
-static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
+static __always_inline struct mem_cgroup *active_memcg(void)
{
- if (unlikely(current->active_memcg)) {
- struct mem_cgroup *memcg;
+ if (in_interrupt())
+ return this_cpu_read(int_active_memcg);
+ else
+ return current->active_memcg;
+}
- rcu_read_lock();
+static __always_inline struct mem_cgroup *get_active_memcg(void)
+{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = active_memcg();
+ if (memcg) {
/* current->active_memcg must hold a ref. */
- if (WARN_ON_ONCE(!css_tryget(&current->active_memcg->css)))
+ if (WARN_ON_ONCE(!css_tryget(&memcg->css)))
memcg = root_mem_cgroup;
else
memcg = current->active_memcg;
- rcu_read_unlock();
- return memcg;
}
+ rcu_read_unlock();
+
+ return memcg;
+}
+
+static __always_inline bool memcg_kmem_bypass(void)
+{
+ /* Allow remote memcg charging from any context. */
+ if (unlikely(active_memcg()))
+ return false;
+
+ /* Memcg to charge can't be determined. */
+ if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
+ return true;
+
+ return false;
+}
+
+/**
+ * If active memcg is set, do not fallback to current->mm->memcg.
+ */
+static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
+{
+ if (memcg_kmem_bypass())
+ return NULL;
+
+ if (unlikely(active_memcg()))
+ return get_active_memcg();
+
return get_mem_cgroup_from_mm(current->mm);
}
@@ -2933,12 +2969,12 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg;
- if (unlikely(!current->mm && !current->active_memcg))
+ if (memcg_kmem_bypass())
return NULL;
rcu_read_lock();
- if (unlikely(current->active_memcg))
- memcg = rcu_dereference(current->active_memcg);
+ if (unlikely(active_memcg()))
+ memcg = active_memcg();
else
memcg = mem_cgroup_from_task(current);
@@ -3059,19 +3095,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
struct mem_cgroup *memcg;
int ret = 0;
- if (memcg_kmem_bypass())
- return 0;
-
memcg = get_mem_cgroup_from_current();
- if (!mem_cgroup_is_root(memcg)) {
+ if (memcg && !mem_cgroup_is_root(memcg)) {
ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
if (!ret) {
page->mem_cgroup = memcg;
__SetPageKmemcg(page);
return 0;
}
+ css_put(&memcg->css);
}
- css_put(&memcg->css);
return ret;
}
@@ -5296,12 +5329,12 @@ static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg, *old_memcg;
long error = -ENOMEM;
- memalloc_use_memcg(parent);
+ old_memcg = set_active_memcg(parent);
memcg = mem_cgroup_alloc();
- memalloc_unuse_memcg();
+ set_active_memcg(old_memcg);
if (IS_ERR(memcg))
return ERR_CAST(memcg);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 990e3b2e37d5..c0bb186bba62 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -65,6 +65,33 @@ int sysctl_memory_failure_recovery __read_mostly = 1;
atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
+static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
+{
+ if (hugepage_or_freepage) {
+ /*
+ * Doing this check for free pages is also fine since dissolve_free_huge_page
+ * returns 0 for non-hugetlb pages as well.
+ */
+ if (dissolve_free_huge_page(page) || !take_page_off_buddy(page))
+ /*
+ * We could fail to take off the target page from buddy
+ * for example due to racy page allocaiton, but that's
+ * acceptable because soft-offlined page is not broken
+ * and if someone really want to use it, they should
+ * take it.
+ */
+ return false;
+ }
+
+ SetPageHWPoison(page);
+ if (release)
+ put_page(page);
+ page_ref_inc(page);
+ num_poisoned_pages_inc();
+
+ return true;
+}
+
#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
u32 hwpoison_filter_enable = 0;
@@ -555,6 +582,7 @@ static const char * const action_page_types[] = {
[MF_MSG_BUDDY] = "free buddy page",
[MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
[MF_MSG_DAX] = "dax page",
+ [MF_MSG_UNSPLIT_THP] = "unsplit thp",
[MF_MSG_UNKNOWN] = "unknown page",
};
@@ -924,7 +952,7 @@ static int page_action(struct page_state *ps, struct page *p,
* Return: return 0 if failed to grab the refcount, otherwise true (some
* non-zero value.)
*/
-int get_hwpoison_page(struct page *page)
+static int get_hwpoison_page(struct page *page)
{
struct page *head = compound_head(page);
@@ -953,7 +981,6 @@ int get_hwpoison_page(struct page *page)
return 0;
}
-EXPORT_SYMBOL_GPL(get_hwpoison_page);
/*
* Do all that is necessary to remove user space mappings. Unmap
@@ -1103,6 +1130,25 @@ static int identify_page_state(unsigned long pfn, struct page *p,
return page_action(ps, p, pfn);
}
+static int try_to_split_thp_page(struct page *page, const char *msg)
+{
+ lock_page(page);
+ if (!PageAnon(page) || unlikely(split_huge_page(page))) {
+ unsigned long pfn = page_to_pfn(page);
+
+ unlock_page(page);
+ if (!PageAnon(page))
+ pr_info("%s: %#lx: non anonymous thp\n", msg, pfn);
+ else
+ pr_info("%s: %#lx: thp split failed\n", msg, pfn);
+ put_page(page);
+ return -EBUSY;
+ }
+ unlock_page(page);
+
+ return 0;
+}
+
static int memory_failure_hugetlb(unsigned long pfn, int flags)
{
struct page *p = pfn_to_page(pfn);
@@ -1144,7 +1190,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
num_poisoned_pages_dec();
unlock_page(head);
- put_hwpoison_page(head);
+ put_page(head);
return 0;
}
@@ -1325,23 +1371,11 @@ int memory_failure(unsigned long pfn, int flags)
}
if (PageTransHuge(hpage)) {
- lock_page(p);
- if (!PageAnon(p) || unlikely(split_huge_page(p))) {
- unlock_page(p);
- if (!PageAnon(p))
- pr_err("Memory failure: %#lx: non anonymous thp\n",
- pfn);
- else
- pr_err("Memory failure: %#lx: thp split failed\n",
- pfn);
- if (TestClearPageHWPoison(p))
- num_poisoned_pages_dec();
- put_hwpoison_page(p);
+ if (try_to_split_thp_page(p, "Memory Failure") < 0) {
+ action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
return -EBUSY;
}
- unlock_page(p);
VM_BUG_ON_PAGE(!page_count(p), p);
- hpage = compound_head(p);
}
/*
@@ -1381,10 +1415,7 @@ int memory_failure(unsigned long pfn, int flags)
* page_remove_rmap() in try_to_unmap_one(). So to determine page status
* correctly, we save a copy of the page flags at this time.
*/
- if (PageHuge(p))
- page_flags = hpage->flags;
- else
- page_flags = p->flags;
+ page_flags = p->flags;
/*
* unpoison always clear PG_hwpoison inside page lock
@@ -1393,14 +1424,14 @@ int memory_failure(unsigned long pfn, int flags)
pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
num_poisoned_pages_dec();
unlock_page(p);
- put_hwpoison_page(p);
+ put_page(p);
return 0;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
unlock_page(p);
- put_hwpoison_page(p);
+ put_page(p);
return 0;
}
@@ -1416,11 +1447,8 @@ int memory_failure(unsigned long pfn, int flags)
/*
* Now take care of user space mappings.
* Abort on fail: __delete_from_page_cache() assumes unmapped page.
- *
- * When the raw error page is thp tail page, hpage points to the raw
- * page after thp split.
*/
- if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) {
+ if (!hwpoison_user_mappings(p, pfn, flags, &p)) {
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
goto out;
@@ -1637,24 +1665,14 @@ int unpoison_memory(unsigned long pfn)
}
unlock_page(page);
- put_hwpoison_page(page);
+ put_page(page);
if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
- put_hwpoison_page(page);
+ put_page(page);
return 0;
}
EXPORT_SYMBOL(unpoison_memory);
-static struct page *new_page(struct page *p, unsigned long private)
-{
- struct migration_target_control mtc = {
- .nid = page_to_nid(p),
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
- };
-
- return alloc_migration_target(p, (unsigned long)&mtc);
-}
-
/*
* Safely get reference count of an arbitrary page.
* Returns 0 for a free page, -EIO for a zero refcount page
@@ -1679,6 +1697,9 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
} else if (is_free_buddy_page(p)) {
pr_info("%s: %#lx free buddy page\n", __func__, pfn);
ret = 0;
+ } else if (page_count(p)) {
+ /* raced with allocation */
+ ret = -EBUSY;
} else {
pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
__func__, pfn, p->flags);
@@ -1695,12 +1716,15 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
{
int ret = __get_any_page(page, pfn, flags);
+ if (ret == -EBUSY)
+ ret = __get_any_page(page, pfn, flags);
+
if (ret == 1 && !PageHuge(page) &&
!PageLRU(page) && !__PageMovable(page)) {
/*
* Try to free it.
*/
- put_hwpoison_page(page);
+ put_page(page);
shake_page(page, 1);
/*
@@ -1709,7 +1733,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
ret = __get_any_page(page, pfn, 0);
if (ret == 1 && !PageLRU(page)) {
/* Drop page reference which is from __get_any_page() */
- put_hwpoison_page(page);
+ put_page(page);
pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
pfn, page->flags, &page->flags);
return -EIO;
@@ -1718,69 +1742,55 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
return ret;
}
-static int soft_offline_huge_page(struct page *page, int flags)
+static bool isolate_page(struct page *page, struct list_head *pagelist)
{
- int ret;
- unsigned long pfn = page_to_pfn(page);
- struct page *hpage = compound_head(page);
- LIST_HEAD(pagelist);
+ bool isolated = false;
+ bool lru = PageLRU(page);
- /*
- * This double-check of PageHWPoison is to avoid the race with
- * memory_failure(). See also comment in __soft_offline_page().
- */
- lock_page(hpage);
- if (PageHWPoison(hpage)) {
- unlock_page(hpage);
- put_hwpoison_page(hpage);
- pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
- return -EBUSY;
+ if (PageHuge(page)) {
+ isolated = isolate_huge_page(page, pagelist);
+ } else {
+ if (lru)
+ isolated = !isolate_lru_page(page);
+ else
+ isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE);
+
+ if (isolated)
+ list_add(&page->lru, pagelist);
}
- unlock_page(hpage);
- ret = isolate_huge_page(hpage, &pagelist);
+ if (isolated && lru)
+ inc_node_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_lru(page));
+
/*
- * get_any_page() and isolate_huge_page() takes a refcount each,
- * so need to drop one here.
+ * If we succeed to isolate the page, we grabbed another refcount on
+ * the page, so we can safely drop the one we got from get_any_pages().
+ * If we failed to isolate the page, it means that we cannot go further
+ * and we will return an error, so drop the reference we got from
+ * get_any_pages() as well.
*/
- put_hwpoison_page(hpage);
- if (!ret) {
- pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
- return -EBUSY;
- }
-
- ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
- MIGRATE_SYNC, MR_MEMORY_FAILURE);
- if (ret) {
- pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n",
- pfn, ret, page->flags, &page->flags);
- if (!list_empty(&pagelist))
- putback_movable_pages(&pagelist);
- if (ret > 0)
- ret = -EIO;
- } else {
- /*
- * We set PG_hwpoison only when the migration source hugepage
- * was successfully dissolved, because otherwise hwpoisoned
- * hugepage remains on free hugepage list, then userspace will
- * find it as SIGBUS by allocation failure. That's not expected
- * in soft-offlining.
- */
- ret = dissolve_free_huge_page(page);
- if (!ret) {
- if (set_hwpoison_free_buddy_page(page))
- num_poisoned_pages_inc();
- else
- ret = -EBUSY;
- }
- }
- return ret;
+ put_page(page);
+ return isolated;
}
-static int __soft_offline_page(struct page *page, int flags)
+/*
+ * __soft_offline_page handles hugetlb-pages and non-hugetlb pages.
+ * If the page is a non-dirty unmapped page-cache page, it simply invalidates.
+ * If the page is mapped, it migrates the contents over.
+ */
+static int __soft_offline_page(struct page *page)
{
- int ret;
+ int ret = 0;
unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_head(page);
+ char const *msg_page[] = {"page", "hugepage"};
+ bool huge = PageHuge(page);
+ LIST_HEAD(pagelist);
+ struct migration_target_control mtc = {
+ .nid = NUMA_NO_NODE,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
/*
* Check PageHWPoison again inside page lock because PageHWPoison
@@ -1789,121 +1799,75 @@ static int __soft_offline_page(struct page *page, int flags)
* so there's no race between soft_offline_page() and memory_failure().
*/
lock_page(page);
- wait_on_page_writeback(page);
+ if (!PageHuge(page))
+ wait_on_page_writeback(page);
if (PageHWPoison(page)) {
unlock_page(page);
- put_hwpoison_page(page);
+ put_page(page);
pr_info("soft offline: %#lx page already poisoned\n", pfn);
- return -EBUSY;
+ return 0;
}
- /*
- * Try to invalidate first. This should work for
- * non dirty unmapped page cache pages.
- */
- ret = invalidate_inode_page(page);
+
+ if (!PageHuge(page))
+ /*
+ * Try to invalidate first. This should work for
+ * non dirty unmapped page cache pages.
+ */
+ ret = invalidate_inode_page(page);
unlock_page(page);
+
/*
* RED-PEN would be better to keep it isolated here, but we
* would need to fix isolation locking first.
*/
- if (ret == 1) {
- put_hwpoison_page(page);
+ if (ret) {
pr_info("soft_offline: %#lx: invalidated\n", pfn);
- SetPageHWPoison(page);
- num_poisoned_pages_inc();
+ page_handle_poison(page, false, true);
return 0;
}
- /*
- * Simple invalidation didn't work.
- * Try to migrate to a new page instead. migrate.c
- * handles a large number of cases for us.
- */
- if (PageLRU(page))
- ret = isolate_lru_page(page);
- else
- ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
- /*
- * Drop page reference which is came from get_any_page()
- * successful isolate_lru_page() already took another one.
- */
- put_hwpoison_page(page);
- if (!ret) {
- LIST_HEAD(pagelist);
- /*
- * After isolated lru page, the PageLRU will be cleared,
- * so use !__PageMovable instead for LRU page's mapping
- * cannot have PAGE_MAPPING_MOVABLE.
- */
- if (!__PageMovable(page))
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_lru(page));
- list_add(&page->lru, &pagelist);
- ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
- MIGRATE_SYNC, MR_MEMORY_FAILURE);
- if (ret) {
+ if (isolate_page(hpage, &pagelist)) {
+ ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE);
+ if (!ret) {
+ bool release = !huge;
+
+ if (!page_handle_poison(page, huge, release))
+ ret = -EBUSY;
+ } else {
if (!list_empty(&pagelist))
putback_movable_pages(&pagelist);
- pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
- pfn, ret, page->flags, &page->flags);
+ pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n",
+ pfn, msg_page[huge], ret, page->flags, &page->flags);
if (ret > 0)
ret = -EIO;
}
} else {
- pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
- pfn, ret, page_count(page), page->flags, &page->flags);
+ pr_info("soft offline: %#lx: %s isolation failed: %d, page count %d, type %lx (%pGp)\n",
+ pfn, msg_page[huge], ret, page_count(page), page->flags, &page->flags);
+ ret = -EBUSY;
}
return ret;
}
-static int soft_offline_in_use_page(struct page *page, int flags)
+static int soft_offline_in_use_page(struct page *page)
{
- int ret;
- int mt;
struct page *hpage = compound_head(page);
- if (!PageHuge(page) && PageTransHuge(hpage)) {
- lock_page(page);
- if (!PageAnon(page) || unlikely(split_huge_page(page))) {
- unlock_page(page);
- if (!PageAnon(page))
- pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
- else
- pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
- put_hwpoison_page(page);
+ if (!PageHuge(page) && PageTransHuge(hpage))
+ if (try_to_split_thp_page(page, "soft offline") < 0)
return -EBUSY;
- }
- unlock_page(page);
- }
-
- /*
- * Setting MIGRATE_ISOLATE here ensures that the page will be linked
- * to free list immediately (not via pcplist) when released after
- * successful page migration. Otherwise we can't guarantee that the
- * page is really free after put_page() returns, so
- * set_hwpoison_free_buddy_page() highly likely fails.
- */
- mt = get_pageblock_migratetype(page);
- set_pageblock_migratetype(page, MIGRATE_ISOLATE);
- if (PageHuge(page))
- ret = soft_offline_huge_page(page, flags);
- else
- ret = __soft_offline_page(page, flags);
- set_pageblock_migratetype(page, mt);
- return ret;
+ return __soft_offline_page(page);
}
static int soft_offline_free_page(struct page *page)
{
- int rc = dissolve_free_huge_page(page);
+ int rc = 0;
+
+ if (!page_handle_poison(page, true, false))
+ rc = -EBUSY;
- if (!rc) {
- if (set_hwpoison_free_buddy_page(page))
- num_poisoned_pages_inc();
- else
- rc = -EBUSY;
- }
return rc;
}
@@ -1933,6 +1897,7 @@ int soft_offline_page(unsigned long pfn, int flags)
{
int ret;
struct page *page;
+ bool try_again = true;
if (!pfn_valid(pfn))
return -ENXIO;
@@ -1944,18 +1909,22 @@ int soft_offline_page(unsigned long pfn, int flags)
if (PageHWPoison(page)) {
pr_info("soft offline: %#lx page already poisoned\n", pfn);
if (flags & MF_COUNT_INCREASED)
- put_hwpoison_page(page);
- return -EBUSY;
+ put_page(page);
+ return 0;
}
+retry:
get_online_mems();
ret = get_any_page(page, pfn, flags);
put_online_mems();
if (ret > 0)
- ret = soft_offline_in_use_page(page, flags);
+ ret = soft_offline_in_use_page(page);
else if (ret == 0)
- ret = soft_offline_free_page(page);
+ if (soft_offline_free_page(page) && try_again) {
+ try_again = false;
+ goto retry;
+ }
return ret;
}
diff --git a/mm/memory.c b/mm/memory.c
index 2afb01ea1307..c48f8df6e502 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2391,13 +2391,15 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
arch_enter_lazy_mmu_mode();
- do {
- if (create || !pte_none(*pte)) {
- err = fn(pte++, addr, data);
- if (err)
- break;
- }
- } while (addr += PAGE_SIZE, addr != end);
+ if (fn) {
+ do {
+ if (create || !pte_none(*pte)) {
+ err = fn(pte++, addr, data);
+ if (err)
+ break;
+ }
+ } while (addr += PAGE_SIZE, addr != end);
+ }
*mask |= PGTBL_PTE_MODIFIED;
arch_leave_lazy_mmu_mode();
@@ -3709,13 +3711,14 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t entry;
int i;
- vm_fault_t ret;
+ vm_fault_t ret = VM_FAULT_FALLBACK;
if (!transhuge_vma_suitable(vma, haddr))
- return VM_FAULT_FALLBACK;
+ return ret;
- ret = VM_FAULT_FALLBACK;
page = compound_head(page);
+ if (compound_order(page) != HPAGE_PMD_ORDER)
+ return ret;
/*
* Archs like ppc64 need additonal space to store information
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 8e9e2d44cdad..b44d4c7ba73b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -105,7 +105,7 @@ static struct resource *register_memory_resource(u64 start, u64 size,
unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
if (strcmp(resource_name, "System RAM"))
- flags |= IORESOURCE_MEM_DRIVER_MANAGED;
+ flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED;
/*
* Make sure value parsed from 'mem=' only restricts memory adding
@@ -625,31 +625,22 @@ void generic_online_page(struct page *page, unsigned int order)
}
EXPORT_SYMBOL_GPL(generic_online_page);
-static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
- void *arg)
+static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
{
const unsigned long end_pfn = start_pfn + nr_pages;
unsigned long pfn;
- int order;
/*
- * Online the pages. The callback might decide to keep some pages
- * PG_reserved (to add them to the buddy later), but we still account
- * them as being online/belonging to this zone ("present").
+ * Online the pages in MAX_ORDER - 1 aligned chunks. The callback might
+ * decide to not expose all pages to the buddy (e.g., expose them
+ * later). We account all pages as being online and belonging to this
+ * zone ("present").
*/
- for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) {
- order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn)));
- /* __free_pages_core() wants pfns to be aligned to the order */
- if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order)))
- order = 0;
- (*online_page_callback)(pfn_to_page(pfn), order);
- }
+ for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES)
+ (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1);
/* mark all involved sections as online */
online_mem_sections(start_pfn, end_pfn);
-
- *(unsigned long *)arg += nr_pages;
- return 0;
}
/* check which state of node_states will be changed when online memory */
@@ -710,9 +701,14 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
* Associate the pfn range with the given zone, initializing the memmaps
* and resizing the pgdat/zone data to span the added pages. After this
* call, all affected pages are PG_reserved.
+ *
+ * All aligned pageblocks are initialized to the specified migratetype
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
*/
void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap)
+ unsigned long nr_pages,
+ struct vmem_altmap *altmap, int migratetype)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nid = pgdat->node_id;
@@ -737,7 +733,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
* are reserved so nobody should be touching them so we should be safe
*/
memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn,
- MEMINIT_HOTPLUG, altmap);
+ MEMINIT_HOTPLUG, altmap, migratetype);
set_zone_contiguous(zone);
}
@@ -803,17 +799,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
int online_type, int nid)
{
unsigned long flags;
- unsigned long onlined_pages = 0;
struct zone *zone;
int need_zonelists_rebuild = 0;
int ret;
struct memory_notify arg;
+ /* We can only online full sections (e.g., SECTION_IS_ONLINE) */
+ if (WARN_ON_ONCE(!nr_pages ||
+ !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)))
+ return -EINVAL;
+
mem_hotplug_begin();
/* associate pfn range with the zone */
zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
- move_pfn_range_to_zone(zone, pfn, nr_pages, NULL);
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
@@ -825,6 +825,14 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
goto failed_addition;
/*
+ * Fixup the number of isolated pageblocks before marking the sections
+ * onlining, such that undo_isolate_page_range() works correctly.
+ */
+ spin_lock_irqsave(&zone->lock, flags);
+ zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ /*
* If this zone is not populated, then it is not in zonelist.
* This means the page allocator ignores this zone.
* So, zonelist must be updated after online.
@@ -834,36 +842,29 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
setup_zone_pageset(zone);
}
- ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
- online_pages_range);
- if (ret) {
- /* not a single memory resource was applicable */
- if (need_zonelists_rebuild)
- zone_pcp_reset(zone);
- goto failed_addition;
- }
-
- zone->present_pages += onlined_pages;
+ online_pages_range(pfn, nr_pages);
+ zone->present_pages += nr_pages;
pgdat_resize_lock(zone->zone_pgdat, &flags);
- zone->zone_pgdat->node_present_pages += onlined_pages;
+ zone->zone_pgdat->node_present_pages += nr_pages;
pgdat_resize_unlock(zone->zone_pgdat, &flags);
- /*
- * When exposing larger, physically contiguous memory areas to the
- * buddy, shuffling in the buddy (when freeing onlined pages, putting
- * them either to the head or the tail of the freelist) is only helpful
- * for maintaining the shuffle, but not for creating the initial
- * shuffle. Shuffle the whole zone to make sure the just onlined pages
- * are properly distributed across the whole freelist.
- */
- shuffle_zone(zone);
-
node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
build_all_zonelists(NULL);
zone_pcp_update(zone);
+ /* Basic onlining is complete, allow allocation of onlined pages. */
+ undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
+
+ /*
+ * Freshly onlined pages aren't shuffled (e.g., all pages are placed to
+ * the tail of the freelist when undoing isolation). Shuffle the whole
+ * zone to make sure the just onlined pages are properly distributed
+ * across the whole freelist - to create an initial shuffle.
+ */
+ shuffle_zone(zone);
+
init_per_zone_wmark_min();
kswapd_run(nid);
@@ -1035,7 +1036,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
*
* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
*/
-int __ref add_memory_resource(int nid, struct resource *res)
+int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
struct mhp_params params = { .pgprot = PAGE_KERNEL };
u64 start, size;
@@ -1088,9 +1089,8 @@ int __ref add_memory_resource(int nid, struct resource *res)
}
/* link memory sections under this node.*/
- ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
- MEMINIT_HOTPLUG);
- BUG_ON(ret);
+ link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
+ MEMINIT_HOTPLUG);
/* create new memmap entry */
if (!strcmp(res->name, "System RAM"))
@@ -1099,6 +1099,13 @@ int __ref add_memory_resource(int nid, struct resource *res)
/* device_online() will take the lock when calling online_pages() */
mem_hotplug_done();
+ /*
+ * In case we're allowed to merge the resource, flag it and trigger
+ * merging now that adding succeeded.
+ */
+ if (mhp_flags & MEMHP_MERGE_RESOURCE)
+ merge_system_ram_resource(res);
+
/* online pages if requested */
if (memhp_default_online_type != MMOP_OFFLINE)
walk_memory_blocks(start, size, NULL, online_memory_block);
@@ -1115,7 +1122,7 @@ error:
}
/* requires device_hotplug_lock, see add_memory_resource() */
-int __ref __add_memory(int nid, u64 start, u64 size)
+int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
struct resource *res;
int ret;
@@ -1124,18 +1131,18 @@ int __ref __add_memory(int nid, u64 start, u64 size)
if (IS_ERR(res))
return PTR_ERR(res);
- ret = add_memory_resource(nid, res);
+ ret = add_memory_resource(nid, res, mhp_flags);
if (ret < 0)
release_memory_resource(res);
return ret;
}
-int add_memory(int nid, u64 start, u64 size)
+int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
int rc;
lock_device_hotplug();
- rc = __add_memory(nid, start, size);
+ rc = __add_memory(nid, start, size, mhp_flags);
unlock_device_hotplug();
return rc;
@@ -1157,14 +1164,14 @@ EXPORT_SYMBOL_GPL(add_memory);
*
* For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
* memory map") are created. Also, the created memory resource is flagged
- * with IORESOURCE_MEM_DRIVER_MANAGED, so in-kernel users can special-case
+ * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case
* this memory as well (esp., not place kexec images onto it).
*
* The resource_name (visible via /proc/iomem) has to have the format
* "System RAM ($DRIVER)".
*/
int add_memory_driver_managed(int nid, u64 start, u64 size,
- const char *resource_name)
+ const char *resource_name, mhp_t mhp_flags)
{
struct resource *res;
int rc;
@@ -1182,7 +1189,7 @@ int add_memory_driver_managed(int nid, u64 start, u64 size,
goto out_unlock;
}
- rc = add_memory_resource(nid, res);
+ rc = add_memory_resource(nid, res, mhp_flags);
if (rc < 0)
release_memory_resource(res);
@@ -1283,27 +1290,6 @@ found:
return 0;
}
-static struct page *new_node_page(struct page *page, unsigned long private)
-{
- nodemask_t nmask = node_states[N_MEMORY];
- struct migration_target_control mtc = {
- .nid = page_to_nid(page),
- .nmask = &nmask,
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
- };
-
- /*
- * try to allocate from a different node but reuse this node if there
- * are no other online nodes to be used (e.g. we are offlining a part
- * of the only existing node)
- */
- node_clear(mtc.nid, nmask);
- if (nodes_empty(nmask))
- node_set(mtc.nid, nmask);
-
- return alloc_migration_target(page, (unsigned long)&mtc);
-}
-
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
@@ -1363,9 +1349,28 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
put_page(page);
}
if (!list_empty(&source)) {
- /* Allocate a new page from the nearest neighbor node */
- ret = migrate_pages(&source, new_node_page, NULL, 0,
- MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
+ nodemask_t nmask = node_states[N_MEMORY];
+ struct migration_target_control mtc = {
+ .nmask = &nmask,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
+
+ /*
+ * We have checked that migration range is on a single zone so
+ * we can use the nid of the first page to all the others.
+ */
+ mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
+
+ /*
+ * try to allocate from a different node but reuse this node
+ * if there are no other online nodes to be used (e.g. we are
+ * offlining a part of the only existing node)
+ */
+ node_clear(mtc.nid, nmask);
+ if (nodes_empty(nmask))
+ node_set(mtc.nid, nmask);
+ ret = migrate_pages(&source, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
if (ret) {
list_for_each_entry(page, &source, lru) {
pr_warn("migrating pfn %lx failed ret:%d ",
@@ -1379,28 +1384,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
return ret;
}
-/* Mark all sections offline and remove all free pages from the buddy. */
-static int
-offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
- void *data)
-{
- unsigned long *offlined_pages = (unsigned long *)data;
-
- *offlined_pages += __offline_isolated_pages(start, start + nr_pages);
- return 0;
-}
-
-/*
- * Check all pages in range, recorded as memory resource, are isolated.
- */
-static int
-check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
- void *data)
-{
- return test_pages_isolated(start_pfn, start_pfn + nr_pages,
- MEMORY_OFFLINE);
-}
-
static int __init cmdline_parse_movable_node(char *p)
{
movable_node_enabled = true;
@@ -1484,17 +1467,21 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
return 0;
}
-static int __ref __offline_pages(unsigned long start_pfn,
- unsigned long end_pfn)
+int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
- unsigned long pfn, nr_pages = 0;
- unsigned long offlined_pages = 0;
- int ret, node, nr_isolate_pageblock;
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn, system_ram_pages = 0;
unsigned long flags;
struct zone *zone;
struct memory_notify arg;
+ int ret, node;
char *reason;
+ /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */
+ if (WARN_ON_ONCE(!nr_pages ||
+ !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION)))
+ return -EINVAL;
+
mem_hotplug_begin();
/*
@@ -1505,9 +1492,9 @@ static int __ref __offline_pages(unsigned long start_pfn,
* memory holes PG_reserved, don't need pfn_valid() checks, and can
* avoid using walk_system_ram_range() later.
*/
- walk_system_ram_range(start_pfn, end_pfn - start_pfn, &nr_pages,
+ walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
count_system_ram_pages_cb);
- if (nr_pages != end_pfn - start_pfn) {
+ if (system_ram_pages != nr_pages) {
ret = -EINVAL;
reason = "memory holes";
goto failed_removal;
@@ -1527,11 +1514,10 @@ static int __ref __offline_pages(unsigned long start_pfn,
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE,
MEMORY_OFFLINE | REPORT_FAILURE);
- if (ret < 0) {
+ if (ret) {
reason = "failure to isolate range";
goto failed_removal;
}
- nr_isolate_pageblock = ret;
arg.start_pfn = start_pfn;
arg.nr_pages = nr_pages;
@@ -1581,9 +1567,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
reason = "failure to dissolve huge pages";
goto failed_removal_isolated;
}
- /* check again */
- ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
- NULL, check_pages_isolated_cb);
+
/*
* per-cpu pages are drained in start_isolate_page_range, but if
* there are still pages that are not free, make sure that we
@@ -1596,30 +1580,30 @@ static int __ref __offline_pages(unsigned long start_pfn,
* because has_unmovable_pages explicitly checks for
* PageBuddy on freed pages on other zones.
*/
+ ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
if (ret)
drain_all_pages(zone);
} while (ret);
- /* Ok, all of our target is isolated.
- We cannot do rollback at this point. */
- walk_system_ram_range(start_pfn, end_pfn - start_pfn,
- &offlined_pages, offline_isolated_pages_cb);
- pr_info("Offlined Pages %ld\n", offlined_pages);
+ /* Mark all sections offline and remove free pages from the buddy. */
+ __offline_isolated_pages(start_pfn, end_pfn);
+ pr_info("Offlined Pages %ld\n", nr_pages);
+
/*
- * Onlining will reset pagetype flags and makes migrate type
- * MOVABLE, so just need to decrease the number of isolated
- * pageblocks zone counter here.
+ * The memory sections are marked offline, and the pageblock flags
+ * effectively stale; nobody should be touching them. Fixup the number
+ * of isolated pageblocks, memory onlining will properly revert this.
*/
spin_lock_irqsave(&zone->lock, flags);
- zone->nr_isolate_pageblock -= nr_isolate_pageblock;
+ zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
spin_unlock_irqrestore(&zone->lock, flags);
/* removal success */
- adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
- zone->present_pages -= offlined_pages;
+ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
+ zone->present_pages -= nr_pages;
pgdat_resize_lock(zone->zone_pgdat, &flags);
- zone->zone_pgdat->node_present_pages -= offlined_pages;
+ zone->zone_pgdat->node_present_pages -= nr_pages;
pgdat_resize_unlock(zone->zone_pgdat, &flags);
init_per_zone_wmark_min();
@@ -1656,11 +1640,6 @@ failed_removal:
return ret;
}
-int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
-{
- return __offline_pages(start_pfn, start_pfn + nr_pages);
-}
-
static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
int ret = !is_memblock_offlined(mem);
@@ -1749,26 +1728,6 @@ void try_offline_node(int nid)
}
EXPORT_SYMBOL(try_offline_node);
-static void __release_memory_resource(resource_size_t start,
- resource_size_t size)
-{
- int ret;
-
- /*
- * When removing memory in the same granularity as it was added,
- * this function never fails. It might only fail if resources
- * have to be adjusted or split. We'll ignore the error, as
- * removing of memory cannot fail.
- */
- ret = release_mem_region_adjustable(&iomem_resource, start, size);
- if (ret) {
- resource_size_t endres = start + size - 1;
-
- pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
- &start, &endres, ret);
- }
-}
-
static int __ref try_remove_memory(int nid, u64 start, u64 size)
{
int rc = 0;
@@ -1802,7 +1761,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
memblock_remove(start, size);
}
- __release_memory_resource(start, size);
+ release_mem_region_adjustable(start, size);
try_offline_node(nid);
diff --git a/mm/memremap.c b/mm/memremap.c
index 198083453182..73a206d0f645 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -266,7 +266,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
move_pfn_range_to_zone(zone, PHYS_PFN(range->start),
- PHYS_PFN(range_len(range)), params->altmap);
+ PHYS_PFN(range_len(range)), params->altmap,
+ MIGRATE_MOVABLE);
}
mem_hotplug_done();
diff --git a/mm/migrate.c b/mm/migrate.c
index f94d7c7eeddf..5ca5842df5db 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1223,16 +1223,11 @@ out:
* we want to retry.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- put_page(page);
- if (reason == MR_MEMORY_FAILURE) {
+ if (reason != MR_MEMORY_FAILURE)
/*
- * Set PG_HWPoison on just freed page
- * intentionally. Although it's rather weird,
- * it's how HWPoison flag works at the moment.
+ * We release the page in page_handle_poison.
*/
- if (set_hwpoison_free_buddy_page(page))
- num_poisoned_pages_inc();
- }
+ put_page(page);
} else {
if (rc != -EAGAIN) {
if (likely(!__PageMovable(page))) {
@@ -1869,33 +1864,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
return nr_pages ? -EFAULT : 0;
}
-/*
- * Move a list of pages in the address space of the currently executing
- * process.
- */
-static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
- const void __user * __user *pages,
- const int __user *nodes,
- int __user *status, int flags)
+static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
{
struct task_struct *task;
struct mm_struct *mm;
- int err;
- nodemask_t task_nodes;
- /* Check flags */
- if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
- return -EINVAL;
-
- if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
- return -EPERM;
+ /*
+ * There is no need to check if current process has the right to modify
+ * the specified process when they are same.
+ */
+ if (!pid) {
+ mmget(current->mm);
+ *mem_nodes = cpuset_mems_allowed(current);
+ return current->mm;
+ }
/* Find the mm_struct */
rcu_read_lock();
- task = pid ? find_task_by_vpid(pid) : current;
+ task = find_task_by_vpid(pid);
if (!task) {
rcu_read_unlock();
- return -ESRCH;
+ return ERR_PTR(-ESRCH);
}
get_task_struct(task);
@@ -1905,22 +1894,47 @@ static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
*/
if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
rcu_read_unlock();
- err = -EPERM;
+ mm = ERR_PTR(-EPERM);
goto out;
}
rcu_read_unlock();
- err = security_task_movememory(task);
- if (err)
+ mm = ERR_PTR(security_task_movememory(task));
+ if (IS_ERR(mm))
goto out;
-
- task_nodes = cpuset_mems_allowed(task);
+ *mem_nodes = cpuset_mems_allowed(task);
mm = get_task_mm(task);
+out:
put_task_struct(task);
-
if (!mm)
+ mm = ERR_PTR(-EINVAL);
+ return mm;
+}
+
+/*
+ * Move a list of pages in the address space of the currently executing
+ * process.
+ */
+static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
+ const void __user * __user *pages,
+ const int __user *nodes,
+ int __user *status, int flags)
+{
+ struct mm_struct *mm;
+ int err;
+ nodemask_t task_nodes;
+
+ /* Check flags */
+ if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
return -EINVAL;
+ if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+ return -EPERM;
+
+ mm = find_mm_struct(pid, &task_nodes);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+
if (nodes)
err = do_pages_move(mm, task_nodes, nr_pages, pages,
nodes, status, flags);
@@ -1929,10 +1943,6 @@ static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
mmput(mm);
return err;
-
-out:
- put_task_struct(task);
- return err;
}
SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
diff --git a/mm/mmap.c b/mm/mmap.c
index 67d11ad6df24..d91ecb00d38c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -558,6 +558,50 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
return 0;
}
+/*
+ * vma_next() - Get the next VMA.
+ * @mm: The mm_struct.
+ * @vma: The current vma.
+ *
+ * If @vma is NULL, return the first vma in the mm.
+ *
+ * Returns: The next VMA after @vma.
+ */
+static inline struct vm_area_struct *vma_next(struct mm_struct *mm,
+ struct vm_area_struct *vma)
+{
+ if (!vma)
+ return mm->mmap;
+
+ return vma->vm_next;
+}
+
+/*
+ * munmap_vma_range() - munmap VMAs that overlap a range.
+ * @mm: The mm struct
+ * @start: The start of the range.
+ * @len: The length of the range.
+ * @pprev: pointer to the pointer that will be set to previous vm_area_struct
+ * @rb_link: the rb_node
+ * @rb_parent: the parent rb_node
+ *
+ * Find all the vm_area_struct that overlap from @start to
+ * @end and munmap them. Set @pprev to the previous vm_area_struct.
+ *
+ * Returns: -ENOMEM on munmap failure or 0 on success.
+ */
+static inline int
+munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len,
+ struct vm_area_struct **pprev, struct rb_node ***link,
+ struct rb_node **parent, struct list_head *uf)
+{
+
+ while (find_vma_links(mm, start, start + len, pprev, link, parent))
+ if (do_munmap(mm, start, len, uf))
+ return -ENOMEM;
+
+ return 0;
+}
static unsigned long count_vma_pages_range(struct mm_struct *mm,
unsigned long addr, unsigned long end)
{
@@ -619,7 +663,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
struct address_space *mapping = file->f_mapping;
if (vma->vm_flags & VM_DENYWRITE)
- atomic_dec(&file_inode(file)->i_writecount);
+ put_write_access(file_inode(file));
if (vma->vm_flags & VM_SHARED)
mapping_allow_writable(mapping);
@@ -1128,10 +1172,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
if (vm_flags & VM_SPECIAL)
return NULL;
- if (prev)
- next = prev->vm_next;
- else
- next = mm->mmap;
+ next = vma_next(mm, prev);
area = next;
if (area && area->vm_end == end) /* cases 6, 7, 8 */
next = next->vm_next;
@@ -1707,13 +1748,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
return -ENOMEM;
}
- /* Clear old maps */
- while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
- &rb_parent)) {
- if (do_munmap(mm, addr, len, uf))
- return -ENOMEM;
- }
-
+ /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
+ if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
+ return -ENOMEM;
/*
* Private writable mapping: check memory availability
*/
@@ -2562,7 +2599,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
if (vma && (vma->vm_start <= addr))
return vma;
/* don't alter vm_end if the coredump is running */
- if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr))
+ if (!prev || expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED)
populate_vma_page_range(prev, addr, prev->vm_end, NULL);
@@ -2588,9 +2625,6 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
return vma;
if (!(vma->vm_flags & VM_GROWSDOWN))
return NULL;
- /* don't alter vm_start if the coredump is running */
- if (!mmget_still_valid(mm))
- return NULL;
start = vma->vm_start;
if (expand_stack(vma, addr))
return NULL;
@@ -2635,7 +2669,7 @@ static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end)
{
- struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
+ struct vm_area_struct *next = vma_next(mm, prev);
struct mmu_gather tlb;
lru_add_drain();
@@ -2834,7 +2868,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
if (error)
return error;
}
- vma = prev ? prev->vm_next : mm->mmap;
+ vma = vma_next(mm, prev);
if (unlikely(uf)) {
/*
@@ -3052,14 +3086,9 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
if (error)
return error;
- /*
- * Clear old maps. this also does some error checking for us
- */
- while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
- &rb_parent)) {
- if (do_munmap(mm, addr, len, uf))
- return -ENOMEM;
- }
+ /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
+ if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
+ return -ENOMEM;
/* Check against address space limits *after* clearing old maps... */
if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 4fc918163dd3..5654dd19addc 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -913,7 +913,7 @@ static int __mmu_interval_notifier_insert(
return -EOVERFLOW;
/* Must call with a mmget() held */
- if (WARN_ON(atomic_read(&mm->mm_count) <= 0))
+ if (WARN_ON(atomic_read(&mm->mm_users) <= 0))
return -EINVAL;
/* pairs with mmdrop in mmu_interval_notifier_remove() */
diff --git a/mm/nommu.c b/mm/nommu.c
index 0df7ca321314..0faf39b32cdb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -354,13 +354,6 @@ void vm_unmap_aliases(void)
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
-struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
-{
- BUG();
- return NULL;
-}
-EXPORT_SYMBOL_GPL(alloc_vm_area);
-
void free_vm_area(struct vm_struct *area)
{
BUG();
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 358d6f28c627..7709f0e223f5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2849,6 +2849,7 @@ EXPORT_SYMBOL_GPL(wait_on_page_writeback);
*/
void wait_for_stable_page(struct page *page)
{
+ page = thp_head(page);
if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
wait_on_page_writeback(page);
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e0ff3a811ec5..23f5066bd4a5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -78,6 +78,34 @@
#include "shuffle.h"
#include "page_reporting.h"
+/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
+typedef int __bitwise fpi_t;
+
+/* No special request */
+#define FPI_NONE ((__force fpi_t)0)
+
+/*
+ * Skip free page reporting notification for the (possibly merged) page.
+ * This does not hinder free page reporting from grabbing the page,
+ * reporting it and marking it "reported" - it only skips notifying
+ * the free page reporting infrastructure about a newly freed page. For
+ * example, used when temporarily pulling a page from a freelist and
+ * putting it back unmodified.
+ */
+#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
+
+/*
+ * Place the (possibly merged) page to the tail of the freelist. Will ignore
+ * page shuffling (relevant code - e.g., memory onlining - is expected to
+ * shuffle the whole zone).
+ *
+ * Note: No code should rely on this flag for correctness - it's purely
+ * to allow for optimizations when handing back either fresh pages
+ * (memory onlining) or untouched pages (page isolation, free page
+ * reporting).
+ */
+#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
+
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_FRACTION (8)
@@ -247,7 +275,8 @@ bool pm_suspended_storage(void)
unsigned int pageblock_order __read_mostly;
#endif
-static void __free_pages_ok(struct page *page, unsigned int order);
+static void __free_pages_ok(struct page *page, unsigned int order,
+ fpi_t fpi_flags);
/*
* results with 256, 32 in the lowmem_reserve sysctl:
@@ -659,7 +688,7 @@ out:
void free_compound_page(struct page *page)
{
mem_cgroup_uncharge(page);
- __free_pages_ok(page, compound_order(page));
+ __free_pages_ok(page, compound_order(page), FPI_NONE);
}
void prep_compound_page(struct page *page, unsigned int order)
@@ -763,7 +792,7 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype) {}
#endif
-static inline void set_page_order(struct page *page, unsigned int order)
+static inline void set_buddy_order(struct page *page, unsigned int order)
{
set_page_private(page, order);
__SetPageBuddy(page);
@@ -788,7 +817,7 @@ static inline bool page_is_buddy(struct page *page, struct page *buddy,
if (!page_is_guard(buddy) && !PageBuddy(buddy))
return false;
- if (page_order(buddy) != order)
+ if (buddy_order(buddy) != order)
return false;
/*
@@ -873,13 +902,17 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
area->nr_free++;
}
-/* Used for pages which are on another list */
+/*
+ * Used for pages which are on another list. Move the pages to the tail
+ * of the list - so the moved pages won't immediately be considered for
+ * allocation again (e.g., optimization for memory onlining).
+ */
static inline void move_to_free_list(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
struct free_area *area = &zone->free_area[order];
- list_move(&page->lru, &area->free_list[migratetype]);
+ list_move_tail(&page->lru, &area->free_list[migratetype]);
}
static inline void del_page_from_free_list(struct page *page, struct zone *zone,
@@ -952,7 +985,7 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
static inline void __free_one_page(struct page *page,
unsigned long pfn,
struct zone *zone, unsigned int order,
- int migratetype, bool report)
+ int migratetype, fpi_t fpi_flags)
{
struct capture_control *capc = task_capc(zone);
unsigned long buddy_pfn;
@@ -1026,9 +1059,11 @@ continue_merging:
}
done_merging:
- set_page_order(page, order);
+ set_buddy_order(page, order);
- if (is_shuffle_order(order))
+ if (fpi_flags & FPI_TO_TAIL)
+ to_tail = true;
+ else if (is_shuffle_order(order))
to_tail = shuffle_pick_tail();
else
to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
@@ -1039,7 +1074,7 @@ done_merging:
add_to_free_list(page, zone, order, migratetype);
/* Notify page reporting subsystem of freed page */
- if (report)
+ if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
page_reporting_notify_free(order);
}
@@ -1174,6 +1209,17 @@ static __always_inline bool free_pages_prepare(struct page *page,
trace_mm_page_free(page, order);
+ if (unlikely(PageHWPoison(page)) && !order) {
+ /*
+ * Do not let hwpoison pages hit pcplists/buddy
+ * Untie memcg state and reset page's owner
+ */
+ if (memcg_kmem_enabled() && PageKmemcg(page))
+ __memcg_kmem_uncharge_page(page, order);
+ reset_page_owner(page, order);
+ return false;
+ }
+
/*
* Check tail pages before head page information is cleared to
* avoid checking PageCompound for order-0 pages.
@@ -1369,7 +1415,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);
- __free_one_page(page, page_to_pfn(page), zone, 0, mt, true);
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
trace_mm_page_pcpu_drain(page, 0, mt);
}
spin_unlock(&zone->lock);
@@ -1378,14 +1424,14 @@ static void free_pcppages_bulk(struct zone *zone, int count,
static void free_one_page(struct zone *zone,
struct page *page, unsigned long pfn,
unsigned int order,
- int migratetype)
+ int migratetype, fpi_t fpi_flags)
{
spin_lock(&zone->lock);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
- __free_one_page(page, pfn, zone, order, migratetype, true);
+ __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
spin_unlock(&zone->lock);
}
@@ -1463,7 +1509,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
}
}
-static void __free_pages_ok(struct page *page, unsigned int order)
+static void __free_pages_ok(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
{
unsigned long flags;
int migratetype;
@@ -1475,7 +1522,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
migratetype = get_pfnblock_migratetype(page, pfn);
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
- free_one_page(page_zone(page), page, pfn, order, migratetype);
+ free_one_page(page_zone(page), page, pfn, order, migratetype,
+ fpi_flags);
local_irq_restore(flags);
}
@@ -1485,6 +1533,11 @@ void __free_pages_core(struct page *page, unsigned int order)
struct page *p = page;
unsigned int loop;
+ /*
+ * When initializing the memmap, __init_single_page() sets the refcount
+ * of all pages to 1 ("allocated"/"not free"). We have to set the
+ * refcount of all involved pages to 0.
+ */
prefetchw(p);
for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
prefetchw(p + 1);
@@ -1495,8 +1548,12 @@ void __free_pages_core(struct page *page, unsigned int order)
set_page_count(p, 0);
atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
- set_page_refcounted(page);
- __free_pages(page, order);
+
+ /*
+ * Bypass PCP and place fresh pages right to the tail, primarily
+ * relevant for memory onlining.
+ */
+ __free_pages_ok(page, order, FPI_TO_TAIL);
}
#ifdef CONFIG_NEED_MULTIPLE_NODES
@@ -2121,7 +2178,7 @@ static inline void expand(struct zone *zone, struct page *page,
continue;
add_to_free_list(&page[size], zone, high, migratetype);
- set_page_order(&page[size], high);
+ set_buddy_order(&page[size], high);
}
}
@@ -2299,7 +2356,7 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
#endif
/*
- * Move the free pages in a range to the free lists of the requested type.
+ * Move the free pages in a range to the freelist tail of the requested type.
* Note that start_page and end_pages are not aligned on a pageblock
* boundary. If alignment is required, use move_freepages_block()
*/
@@ -2335,7 +2392,7 @@ static int move_freepages(struct zone *zone,
VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
VM_BUG_ON_PAGE(page_zone(page) != zone, page);
- order = page_order(page);
+ order = buddy_order(page);
move_to_free_list(page, zone, order, migratetype);
page += 1 << order;
pages_moved += 1 << order;
@@ -2459,7 +2516,7 @@ static inline void boost_watermark(struct zone *zone)
static void steal_suitable_fallback(struct zone *zone, struct page *page,
unsigned int alloc_flags, int start_type, bool whole_block)
{
- unsigned int current_order = page_order(page);
+ unsigned int current_order = buddy_order(page);
int free_pages, movable_pages, alike_pages;
int old_block_type;
@@ -3123,7 +3180,8 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn)
*/
if (migratetype >= MIGRATE_PCPTYPES) {
if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(zone, page, pfn, 0, migratetype);
+ free_one_page(zone, page, pfn, 0, migratetype,
+ FPI_NONE);
return;
}
migratetype = MIGRATE_MOVABLE;
@@ -3209,7 +3267,7 @@ void split_page(struct page *page, unsigned int order)
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
- split_page_owner(page, order);
+ split_page_owner(page, 1 << order);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -3278,7 +3336,8 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt)
lockdep_assert_held(&zone->lock);
/* Return isolated page to tail of freelist. */
- __free_one_page(page, page_to_pfn(page), zone, order, mt, false);
+ __free_one_page(page, page_to_pfn(page), zone, order, mt,
+ FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
}
/*
@@ -4945,7 +5004,7 @@ static inline void free_the_page(struct page *page, unsigned int order)
if (order == 0) /* Via pcp? */
free_unref_page(page);
else
- __free_pages_ok(page, order);
+ __free_pages_ok(page, order, FPI_NONE);
}
void __free_pages(struct page *page, unsigned int order)
@@ -5979,10 +6038,15 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
* Initially all pages are reserved - free ones are freed
* up by memblock_free_all() once the early boot process is
* done. Non-atomic initialization, single-pass.
+ *
+ * All aligned pageblocks are initialized to the specified migratetype
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
- unsigned long start_pfn, enum meminit_context context,
- struct vmem_altmap *altmap)
+ unsigned long start_pfn,
+ enum meminit_context context,
+ struct vmem_altmap *altmap, int migratetype)
{
unsigned long pfn, end_pfn = start_pfn + size;
struct page *page;
@@ -6026,19 +6090,12 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
__SetPageReserved(page);
/*
- * Mark the block movable so that blocks are reserved for
- * movable at startup. This will force kernel allocations
- * to reserve their blocks rather than leaking throughout
- * the address space during boot when many long-lived
- * kernel allocations are made.
- *
- * bitmap is created for zone's valid pfn range. but memmap
- * can be created for invalid pages (for alignment)
- * check here not to call set_pageblock_migratetype() against
- * pfn out of zone.
+ * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
+ * such that unmovable allocations won't be scattered all
+ * over the place during system boot.
*/
- if (!(pfn & (pageblock_nr_pages - 1))) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
+ set_pageblock_migratetype(page, migratetype);
cond_resched();
}
pfn++;
@@ -6100,15 +6157,10 @@ void __ref memmap_init_zone_device(struct zone *zone,
* the address space during boot when many long-lived
* kernel allocations are made.
*
- * bitmap is created for zone's valid pfn range. but memmap
- * can be created for invalid pages (for alignment)
- * check here not to call set_pageblock_migratetype() against
- * pfn out of zone.
- *
* Please note that MEMINIT_HOTPLUG path doesn't clear memmap
* because this is done early in section_activate()
*/
- if (!(pfn & (pageblock_nr_pages - 1))) {
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
}
@@ -6143,7 +6195,7 @@ void __meminit __weak memmap_init(unsigned long size, int nid,
if (end_pfn > start_pfn) {
size = end_pfn - start_pfn;
memmap_init_zone(size, nid, zone, start_pfn,
- MEMINIT_EARLY, NULL);
+ MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
}
}
}
@@ -8292,7 +8344,7 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
*/
if (!page_ref_count(page)) {
if (PageBuddy(page))
- iter += (1 << page_order(page)) - 1;
+ iter += (1 << buddy_order(page)) - 1;
continue;
}
@@ -8457,7 +8509,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
ret = start_isolate_page_range(pfn_max_align_down(start),
pfn_max_align_up(end), migratetype, 0);
- if (ret < 0)
+ if (ret)
return ret;
/*
@@ -8505,7 +8557,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
}
if (outer_start != start) {
- order = page_order(pfn_to_page(outer_start));
+ order = buddy_order(pfn_to_page(outer_start));
/*
* outer_start page could be small order buddy page and
@@ -8693,35 +8745,21 @@ void zone_pcp_reset(struct zone *zone)
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
- * All pages in the range must be in a single zone and isolated
- * before calling this.
+ * All pages in the range must be in a single zone, must not contain holes,
+ * must span full sections, and must be isolated before calling this function.
*/
-unsigned long
-__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
+void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
+ unsigned long pfn = start_pfn;
struct page *page;
struct zone *zone;
unsigned int order;
- unsigned long pfn;
unsigned long flags;
- unsigned long offlined_pages = 0;
-
- /* find the first valid pfn */
- for (pfn = start_pfn; pfn < end_pfn; pfn++)
- if (pfn_valid(pfn))
- break;
- if (pfn == end_pfn)
- return offlined_pages;
offline_mem_sections(pfn, end_pfn);
zone = page_zone(pfn_to_page(pfn));
spin_lock_irqsave(&zone->lock, flags);
- pfn = start_pfn;
while (pfn < end_pfn) {
- if (!pfn_valid(pfn)) {
- pfn++;
- continue;
- }
page = pfn_to_page(pfn);
/*
* The HWPoisoned page may be not in buddy system, and
@@ -8729,7 +8767,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
*/
if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
pfn++;
- offlined_pages++;
continue;
}
/*
@@ -8740,20 +8777,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(page_count(page));
BUG_ON(PageBuddy(page));
pfn++;
- offlined_pages++;
continue;
}
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
- order = page_order(page);
- offlined_pages += 1 << order;
+ order = buddy_order(page);
del_page_from_free_list(page, zone, order);
pfn += (1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
-
- return offlined_pages;
}
#endif
@@ -8768,7 +8801,7 @@ bool is_free_buddy_page(struct page *page)
for (order = 0; order < MAX_ORDER; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
- if (PageBuddy(page_head) && page_order(page_head) >= order)
+ if (PageBuddy(page_head) && buddy_order(page_head) >= order)
break;
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -8778,30 +8811,70 @@ bool is_free_buddy_page(struct page *page)
#ifdef CONFIG_MEMORY_FAILURE
/*
- * Set PG_hwpoison flag if a given page is confirmed to be a free page. This
- * test is performed under the zone lock to prevent a race against page
- * allocation.
+ * Break down a higher-order page in sub-pages, and keep our target out of
+ * buddy allocator.
*/
-bool set_hwpoison_free_buddy_page(struct page *page)
+static void break_down_buddy_pages(struct zone *zone, struct page *page,
+ struct page *target, int low, int high,
+ int migratetype)
+{
+ unsigned long size = 1 << high;
+ struct page *current_buddy, *next_page;
+
+ while (high > low) {
+ high--;
+ size >>= 1;
+
+ if (target >= &page[size]) {
+ next_page = page + size;
+ current_buddy = page;
+ } else {
+ next_page = page;
+ current_buddy = page + size;
+ }
+
+ if (set_page_guard(zone, current_buddy, high, migratetype))
+ continue;
+
+ if (current_buddy != target) {
+ add_to_free_list(current_buddy, zone, high, migratetype);
+ set_buddy_order(current_buddy, high);
+ page = next_page;
+ }
+ }
+}
+
+/*
+ * Take a page that will be marked as poisoned off the buddy allocator.
+ */
+bool take_page_off_buddy(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long pfn = page_to_pfn(page);
unsigned long flags;
unsigned int order;
- bool hwpoisoned = false;
+ bool ret = false;
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
+ int page_order = buddy_order(page_head);
- if (PageBuddy(page_head) && page_order(page_head) >= order) {
- if (!TestSetPageHWPoison(page))
- hwpoisoned = true;
+ if (PageBuddy(page_head) && page_order >= order) {
+ unsigned long pfn_head = page_to_pfn(page_head);
+ int migratetype = get_pfnblock_migratetype(page_head,
+ pfn_head);
+
+ del_page_from_free_list(page_head, zone, page_order);
+ break_down_buddy_pages(zone, page_head, page, 0,
+ page_order, migratetype);
+ ret = true;
break;
}
+ if (page_count(page_head) > 0)
+ break;
}
spin_unlock_irqrestore(&zone->lock, flags);
-
- return hwpoisoned;
+ return ret;
}
#endif
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index aa94afb63823..abbf42214485 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -88,7 +88,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
* these pages to be merged.
*/
if (PageBuddy(page)) {
- order = page_order(page);
+ order = buddy_order(page);
if (order >= pageblock_order) {
pfn = page_to_pfn(page);
buddy_pfn = __find_buddy_pfn(pfn, order);
@@ -106,6 +106,11 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
* If we isolate freepage with more than pageblock_order, there
* should be no freepage in the range, so we could avoid costly
* pageblock scanning for freepage moving.
+ *
+ * We didn't actually touch any of the isolated pages, so place them
+ * to the tail of the freelist. This is an optimization for memory
+ * onlining - just onlined memory won't immediately be considered for
+ * allocation.
*/
if (!isolated_page) {
nr_pages = move_freepages_block(zone, page, migratetype, NULL);
@@ -173,8 +178,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* (e.g. __offline_pages will need to call it after check for isolated range for
* a next retry).
*
- * Return: the number of isolated pageblocks on success and -EBUSY if any part
- * of range cannot be isolated.
+ * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
*/
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned migratetype, int flags)
@@ -182,7 +186,6 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned long pfn;
unsigned long undo_pfn;
struct page *page;
- int nr_isolate_pageblock = 0;
BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
@@ -196,10 +199,9 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
undo_pfn = pfn;
goto undo;
}
- nr_isolate_pageblock++;
}
}
- return nr_isolate_pageblock;
+ return 0;
undo:
for (pfn = start_pfn;
pfn < undo_pfn;
@@ -259,7 +261,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
* the correct MIGRATE_ISOLATE freelist. There is no
* simple way to verify that as VM_BUG_ON(), though.
*/
- pfn += 1 << page_order(page);
+ pfn += 1 << buddy_order(page);
else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
/* A HWPoisoned page cannot be also PageBuddy */
pfn++;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 360461509423..b735a8eafcdb 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -204,7 +204,7 @@ void __set_page_owner_migrate_reason(struct page *page, int reason)
page_owner->last_migrate_reason = reason;
}
-void __split_page_owner(struct page *page, unsigned int order)
+void __split_page_owner(struct page *page, unsigned int nr)
{
int i;
struct page_ext *page_ext = lookup_page_ext(page);
@@ -213,7 +213,7 @@ void __split_page_owner(struct page *page, unsigned int order)
if (unlikely(!page_ext))
return;
- for (i = 0; i < (1 << order); i++) {
+ for (i = 0; i < nr; i++) {
page_owner = get_page_owner(page_ext);
page_owner->order = 0;
page_ext = page_ext_next(page_ext);
@@ -295,7 +295,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
if (PageBuddy(page)) {
unsigned long freepage_order;
- freepage_order = page_order_unsafe(page);
+ freepage_order = buddy_order_unsafe(page);
if (freepage_order < MAX_ORDER)
pfn += (1UL << freepage_order) - 1;
continue;
@@ -490,7 +490,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
page = pfn_to_page(pfn);
if (PageBuddy(page)) {
- unsigned long freepage_order = page_order_unsafe(page);
+ unsigned long freepage_order = buddy_order_unsafe(page);
if (freepage_order < MAX_ORDER)
pfn += (1UL << freepage_order) - 1;
@@ -584,7 +584,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
* heavy lock contention.
*/
if (PageBuddy(page)) {
- unsigned long order = page_order_unsafe(page);
+ unsigned long order = buddy_order_unsafe(page);
if (order > 0 && order < MAX_ORDER)
pfn += (1UL << order) - 1;
diff --git a/mm/page_poison.c b/mm/page_poison.c
index 34b9181ee5d1..ae0482cded87 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -8,13 +8,23 @@
#include <linux/ratelimit.h>
#include <linux/kasan.h>
-static bool want_page_poisoning __read_mostly;
+static DEFINE_STATIC_KEY_FALSE_RO(want_page_poisoning);
static int __init early_page_poison_param(char *buf)
{
- if (!buf)
- return -EINVAL;
- return strtobool(buf, &want_page_poisoning);
+ int ret;
+ bool tmp;
+
+ ret = strtobool(buf, &tmp);
+ if (ret)
+ return ret;
+
+ if (tmp)
+ static_branch_enable(&want_page_poisoning);
+ else
+ static_branch_disable(&want_page_poisoning);
+
+ return 0;
}
early_param("page_poison", early_page_poison_param);
@@ -31,7 +41,7 @@ bool page_poisoning_enabled(void)
* Page poisoning is debug page alloc for some arches. If
* either of those options are enabled, enable poisoning.
*/
- return (want_page_poisoning ||
+ return (static_branch_unlikely(&want_page_poisoning) ||
(!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
debug_pagealloc_enabled()));
}
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index 3bbd471cfc81..cd8e13d41df4 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -92,7 +92,7 @@ page_reporting_drain(struct page_reporting_dev_info *prdev,
* report on the new larger page when we make our way
* up to that higher order.
*/
- if (PageBuddy(page) && page_order(page) == order)
+ if (PageBuddy(page) && buddy_order(page) == order)
__SetPageReported(page);
} while ((sg = sg_next(sg)));
@@ -178,7 +178,7 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
* the new head of the free list before we release the
* zone lock.
*/
- if (&page->lru != list && !list_is_first(&page->lru, list))
+ if (!list_is_first(&page->lru, list))
list_rotate_to_front(&page->lru, list);
/* release lock before waiting on report processing */
diff --git a/mm/percpu.c b/mm/percpu.c
index 1ed1a349eab8..66a93f096394 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1584,8 +1584,7 @@ static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
{
struct obj_cgroup *objcg;
- if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT) ||
- memcg_kmem_bypass())
+ if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT))
return PCPU_CHUNK_ROOT;
objcg = get_obj_cgroup_from_current();
diff --git a/mm/readahead.c b/mm/readahead.c
index 3c9a8dd7c56c..c5b0457415be 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -158,10 +158,8 @@ out:
}
/**
- * page_cache_readahead_unbounded - Start unchecked readahead.
- * @mapping: File address space.
- * @file: This instance of the open file; used for authentication.
- * @index: First page index to read.
+ * page_cache_ra_unbounded - Start unchecked readahead.
+ * @ractl: Readahead control.
* @nr_to_read: The number of pages to read.
* @lookahead_size: Where to start the next readahead.
*
@@ -173,17 +171,13 @@ out:
* Context: File is referenced by caller. Mutexes may be held by caller.
* May sleep, but will not reenter filesystem to reclaim memory.
*/
-void page_cache_readahead_unbounded(struct address_space *mapping,
- struct file *file, pgoff_t index, unsigned long nr_to_read,
- unsigned long lookahead_size)
+void page_cache_ra_unbounded(struct readahead_control *ractl,
+ unsigned long nr_to_read, unsigned long lookahead_size)
{
+ struct address_space *mapping = ractl->mapping;
+ unsigned long index = readahead_index(ractl);
LIST_HEAD(page_pool);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
- struct readahead_control rac = {
- .mapping = mapping,
- .file = file,
- ._index = index,
- };
unsigned long i;
/*
@@ -204,7 +198,7 @@ void page_cache_readahead_unbounded(struct address_space *mapping,
for (i = 0; i < nr_to_read; i++) {
struct page *page = xa_load(&mapping->i_pages, index + i);
- BUG_ON(index + i != rac._index + rac._nr_pages);
+ BUG_ON(index + i != ractl->_index + ractl->_nr_pages);
if (page && !xa_is_value(page)) {
/*
@@ -215,7 +209,7 @@ void page_cache_readahead_unbounded(struct address_space *mapping,
* have a stable reference to this page, and it's
* not worth getting one just for that.
*/
- read_pages(&rac, &page_pool, true);
+ read_pages(ractl, &page_pool, true);
continue;
}
@@ -228,12 +222,12 @@ void page_cache_readahead_unbounded(struct address_space *mapping,
} else if (add_to_page_cache_lru(page, mapping, index + i,
gfp_mask) < 0) {
put_page(page);
- read_pages(&rac, &page_pool, true);
+ read_pages(ractl, &page_pool, true);
continue;
}
if (i == nr_to_read - lookahead_size)
SetPageReadahead(page);
- rac._nr_pages++;
+ ractl->_nr_pages++;
}
/*
@@ -241,22 +235,22 @@ void page_cache_readahead_unbounded(struct address_space *mapping,
* uptodate then the caller will launch readpage again, and
* will then handle the error.
*/
- read_pages(&rac, &page_pool, false);
+ read_pages(ractl, &page_pool, false);
memalloc_nofs_restore(nofs);
}
-EXPORT_SYMBOL_GPL(page_cache_readahead_unbounded);
+EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
/*
- * __do_page_cache_readahead() actually reads a chunk of disk. It allocates
+ * do_page_cache_ra() actually reads a chunk of disk. It allocates
* the pages first, then submits them for I/O. This avoids the very bad
* behaviour which would occur if page allocations are causing VM writeback.
* We really don't want to intermingle reads and writes like that.
*/
-void __do_page_cache_readahead(struct address_space *mapping,
- struct file *file, pgoff_t index, unsigned long nr_to_read,
- unsigned long lookahead_size)
+void do_page_cache_ra(struct readahead_control *ractl,
+ unsigned long nr_to_read, unsigned long lookahead_size)
{
- struct inode *inode = mapping->host;
+ struct inode *inode = ractl->mapping->host;
+ unsigned long index = readahead_index(ractl);
loff_t isize = i_size_read(inode);
pgoff_t end_index; /* The last page we want to read */
@@ -270,20 +264,19 @@ void __do_page_cache_readahead(struct address_space *mapping,
if (nr_to_read > end_index - index)
nr_to_read = end_index - index + 1;
- page_cache_readahead_unbounded(mapping, file, index, nr_to_read,
- lookahead_size);
+ page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}
/*
* Chunk the readahead into 2 megabyte units, so that we don't pin too much
* memory at once.
*/
-void force_page_cache_readahead(struct address_space *mapping,
- struct file *filp, pgoff_t index, unsigned long nr_to_read)
+void force_page_cache_ra(struct readahead_control *ractl,
+ struct file_ra_state *ra, unsigned long nr_to_read)
{
+ struct address_space *mapping = ractl->mapping;
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
- struct file_ra_state *ra = &filp->f_ra;
- unsigned long max_pages;
+ unsigned long max_pages, index;
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
!mapping->a_ops->readahead))
@@ -293,14 +286,16 @@ void force_page_cache_readahead(struct address_space *mapping,
* If the request exceeds the readahead window, allow the read to
* be up to the optimal hardware IO size
*/
+ index = readahead_index(ractl);
max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
- nr_to_read = min(nr_to_read, max_pages);
+ nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
while (nr_to_read) {
unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
if (this_chunk > nr_to_read)
this_chunk = nr_to_read;
- __do_page_cache_readahead(mapping, filp, index, this_chunk, 0);
+ ractl->_index = index;
+ do_page_cache_ra(ractl, this_chunk, 0);
index += this_chunk;
nr_to_read -= this_chunk;
@@ -437,14 +432,14 @@ static int try_context_readahead(struct address_space *mapping,
/*
* A minimal readahead algorithm for trivial sequential/random reads.
*/
-static void ondemand_readahead(struct address_space *mapping,
- struct file_ra_state *ra, struct file *filp,
- bool hit_readahead_marker, pgoff_t index,
+static void ondemand_readahead(struct readahead_control *ractl,
+ struct file_ra_state *ra, bool hit_readahead_marker,
unsigned long req_size)
{
- struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
unsigned long max_pages = ra->ra_pages;
unsigned long add_pages;
+ unsigned long index = readahead_index(ractl);
pgoff_t prev_index;
/*
@@ -482,7 +477,8 @@ static void ondemand_readahead(struct address_space *mapping,
pgoff_t start;
rcu_read_lock();
- start = page_cache_next_miss(mapping, index + 1, max_pages);
+ start = page_cache_next_miss(ractl->mapping, index + 1,
+ max_pages);
rcu_read_unlock();
if (!start || start - index > max_pages)
@@ -515,14 +511,15 @@ static void ondemand_readahead(struct address_space *mapping,
* Query the page cache and look for the traces(cached history pages)
* that a sequential stream would leave behind.
*/
- if (try_context_readahead(mapping, ra, index, req_size, max_pages))
+ if (try_context_readahead(ractl->mapping, ra, index, req_size,
+ max_pages))
goto readit;
/*
* standalone, small random read
* Read as is, and do not pollute the readahead state.
*/
- __do_page_cache_readahead(mapping, filp, index, req_size, 0);
+ do_page_cache_ra(ractl, req_size, 0);
return;
initial_readahead:
@@ -548,63 +545,42 @@ readit:
}
}
- ra_submit(ra, mapping, filp);
+ ractl->_index = ra->start;
+ do_page_cache_ra(ractl, ra->size, ra->async_size);
}
-/**
- * page_cache_sync_readahead - generic file readahead
- * @mapping: address_space which holds the pagecache and I/O vectors
- * @ra: file_ra_state which holds the readahead state
- * @filp: passed on to ->readpage() and ->readpages()
- * @index: Index of first page to be read.
- * @req_count: Total number of pages being read by the caller.
- *
- * page_cache_sync_readahead() should be called when a cache miss happened:
- * it will submit the read. The readahead logic may decide to piggyback more
- * pages onto the read request if access patterns suggest it will improve
- * performance.
- */
-void page_cache_sync_readahead(struct address_space *mapping,
- struct file_ra_state *ra, struct file *filp,
- pgoff_t index, unsigned long req_count)
+void page_cache_sync_ra(struct readahead_control *ractl,
+ struct file_ra_state *ra, unsigned long req_count)
{
- /* no read-ahead */
- if (!ra->ra_pages)
- return;
+ bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);
- if (blk_cgroup_congested())
- return;
+ /*
+ * Even if read-ahead is disabled, issue this request as read-ahead
+ * as we'll need it to satisfy the requested range. The forced
+ * read-ahead will do the right thing and limit the read to just the
+ * requested range, which we'll set to 1 page for this case.
+ */
+ if (!ra->ra_pages || blk_cgroup_congested()) {
+ if (!ractl->file)
+ return;
+ req_count = 1;
+ do_forced_ra = true;
+ }
/* be dumb */
- if (filp && (filp->f_mode & FMODE_RANDOM)) {
- force_page_cache_readahead(mapping, filp, index, req_count);
+ if (do_forced_ra) {
+ force_page_cache_ra(ractl, ra, req_count);
return;
}
/* do read-ahead */
- ondemand_readahead(mapping, ra, filp, false, index, req_count);
+ ondemand_readahead(ractl, ra, false, req_count);
}
-EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
+EXPORT_SYMBOL_GPL(page_cache_sync_ra);
-/**
- * page_cache_async_readahead - file readahead for marked pages
- * @mapping: address_space which holds the pagecache and I/O vectors
- * @ra: file_ra_state which holds the readahead state
- * @filp: passed on to ->readpage() and ->readpages()
- * @page: The page at @index which triggered the readahead call.
- * @index: Index of first page to be read.
- * @req_count: Total number of pages being read by the caller.
- *
- * page_cache_async_readahead() should be called when a page is used which
- * is marked as PageReadahead; this is a marker to suggest that the application
- * has used up enough of the readahead window that we should start pulling in
- * more pages.
- */
-void
-page_cache_async_readahead(struct address_space *mapping,
- struct file_ra_state *ra, struct file *filp,
- struct page *page, pgoff_t index,
- unsigned long req_count)
+void page_cache_async_ra(struct readahead_control *ractl,
+ struct file_ra_state *ra, struct page *page,
+ unsigned long req_count)
{
/* no read-ahead */
if (!ra->ra_pages)
@@ -621,16 +597,16 @@ page_cache_async_readahead(struct address_space *mapping,
/*
* Defer asynchronous read-ahead on IO congestion.
*/
- if (inode_read_congested(mapping->host))
+ if (inode_read_congested(ractl->mapping->host))
return;
if (blk_cgroup_congested())
return;
/* do read-ahead */
- ondemand_readahead(mapping, ra, filp, true, index, req_count);
+ ondemand_readahead(ractl, ra, true, req_count);
}
-EXPORT_SYMBOL_GPL(page_cache_async_readahead);
+EXPORT_SYMBOL_GPL(page_cache_async_ra);
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
diff --git a/mm/rmap.c b/mm/rmap.c
index 9425260774a1..1b84945d655c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1205,7 +1205,7 @@ void page_add_file_rmap(struct page *page, bool compound)
VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
lock_page_memcg(page);
if (compound && PageTransHuge(page)) {
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
if (atomic_inc_and_test(&page[i]._mapcount))
nr++;
}
@@ -1246,7 +1246,7 @@ static void page_remove_file_rmap(struct page *page, bool compound)
/* page still mapped by someone else? */
if (compound && PageTransHuge(page)) {
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
if (atomic_add_negative(-1, &page[i]._mapcount))
nr++;
}
@@ -1293,7 +1293,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
* Subpages can be mapped with PTEs too. Check how many of
* them are still mapped.
*/
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
if (atomic_add_negative(-1, &page[i]._mapcount))
nr++;
}
@@ -1303,10 +1303,10 @@ static void page_remove_anon_compound_rmap(struct page *page)
* page of the compound page is unmapped, but at least one
* small page is still mapped.
*/
- if (nr && nr < HPAGE_PMD_NR)
+ if (nr && nr < thp_nr_pages(page))
deferred_split_huge_page(page);
} else {
- nr = HPAGE_PMD_NR;
+ nr = thp_nr_pages(page);
}
if (unlikely(PageMlocked(page)))
diff --git a/mm/shmem.c b/mm/shmem.c
index 6d4ddef4a24f..537c137698f8 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3984,7 +3984,7 @@ static struct file_system_type shmem_fs_type = {
.parameters = shmem_fs_parameters,
#endif
.kill_sb = kill_litter_super,
- .fs_flags = FS_USERNS_MOUNT,
+ .fs_flags = FS_USERNS_MOUNT | FS_THP_SUPPORT,
};
int __init shmem_init(void)
diff --git a/mm/shuffle.c b/mm/shuffle.c
index 9b5cd4b004b0..9c2e145a747a 100644
--- a/mm/shuffle.c
+++ b/mm/shuffle.c
@@ -60,7 +60,7 @@ static struct page * __meminit shuffle_valid_page(struct zone *zone,
* ...is the page on the same list as the page we will
* shuffle it with?
*/
- if (page_order(page) != order)
+ if (buddy_order(page) != order)
return NULL;
return page;
diff --git a/mm/slab.c b/mm/slab.c
index 399a9d185b0f..b1113561b98b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1062,7 +1062,7 @@ int slab_prepare_cpu(unsigned int cpu)
* Even if all the cpus of a node are down, we don't free the
* kmem_cache_node of any cache. This to avoid a race between cpu_down, and
* a kmalloc allocation from another cpu for memory from the node of
- * the cpu going down. The list3 structure is usually allocated from
+ * the cpu going down. The kmem_cache_node structure is usually allocated from
* kmem_cache_create() and gets destroyed at kmem_cache_destroy().
*/
int slab_dead_cpu(unsigned int cpu)
diff --git a/mm/slab.h b/mm/slab.h
index 6dd4b702888a..6d7c6a5056ba 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -46,7 +46,6 @@ struct kmem_cache {
#include <linux/kmemleak.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
-#include <linux/kmemleak.h>
/*
* State of the slab allocator.
@@ -281,9 +280,6 @@ static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
{
struct obj_cgroup *objcg;
- if (memcg_kmem_bypass())
- return NULL;
-
objcg = get_obj_cgroup_from_current();
if (!objcg)
return NULL;
diff --git a/mm/slub.c b/mm/slub.c
index 61d0d2968413..b30be2385d1c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1956,7 +1956,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
/*
* Racy check. If we mistakenly see no partial slabs then we
* just allocate an empty slab. If we mistakenly try to get a
- * partial slab and there is none available then get_partials()
+ * partial slab and there is none available then get_partial()
* will return NULL.
*/
if (!n || !n->nr_partial)
diff --git a/mm/sparse.c b/mm/sparse.c
index b25ad8e64839..7bd23f9d6cef 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -312,6 +312,7 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p
return coded_mem_map;
}
+#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Decode mem_map from the coded memmap
*/
@@ -321,6 +322,7 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
coded_mem_map &= SECTION_MAP_MASK;
return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}
+#endif /* CONFIG_MEMORY_HOTPLUG */
static void __meminit sparse_init_one_section(struct mem_section *ms,
unsigned long pnum, struct page *mem_map,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index aa40e706604c..ee465827420e 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -246,7 +246,7 @@ int add_to_swap(struct page *page)
goto fail;
/*
* Normally the page will be dirtied in unmap because its pte should be
- * dirty. A special case is MADV_FREE page. The page'e pte could have
+ * dirty. A special case is MADV_FREE page. The page's pte could have
* dirty bit cleared but the page's SwapBacked bit is still set because
* clearing the dirty bit and SwapBacked bit has no lock protected. For
* such page, unmap will not set dirty bit for it, so page reclaim will
diff --git a/mm/truncate.c b/mm/truncate.c
index 6bbe0f0b3ce9..18cec39a9f53 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -168,7 +168,7 @@ void do_invalidatepage(struct page *page, unsigned int offset,
* becomes orphaned. It will be left on the LRU and may even be mapped into
* user pagetables if we're racing with filemap_fault().
*
- * We need to bale out if page->mapping is no longer equal to the original
+ * We need to bail out if page->mapping is no longer equal to the original
* mapping. This happens a) when the VM reclaimed the page while we waited on
* its lock, b) when a concurrent invalidate_mapping_pages got there first and
* c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
@@ -177,12 +177,12 @@ static void
truncate_cleanup_page(struct address_space *mapping, struct page *page)
{
if (page_mapped(page)) {
- pgoff_t nr = PageTransHuge(page) ? HPAGE_PMD_NR : 1;
+ unsigned int nr = thp_nr_pages(page);
unmap_mapping_pages(mapping, page->index, nr, false);
}
if (page_has_private(page))
- do_invalidatepage(page, 0, PAGE_SIZE);
+ do_invalidatepage(page, 0, thp_size(page));
/*
* Some filesystems seem to re-dirty the page even after
diff --git a/mm/util.c b/mm/util.c
index 4e21fe7eae27..4ddb6e186dd5 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -69,7 +69,8 @@ EXPORT_SYMBOL(kstrdup);
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*
- * Note: Strings allocated by kstrdup_const should be freed by kfree_const.
+ * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
+ * must not be passed to krealloc().
*
* Return: source string if it is in .rodata section otherwise
* fallback to kstrdup.
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 04ac98bf5045..6ae491a8b210 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/mm/vmalloc.c
- *
* Copyright (C) 1993 Linus Torvalds
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
@@ -2321,20 +2319,21 @@ static void __vfree(const void *addr)
}
/**
- * vfree - release memory allocated by vmalloc()
- * @addr: memory base address
+ * vfree - Release memory allocated by vmalloc()
+ * @addr: Memory base address
*
- * Free the virtually continuous memory area starting at @addr, as
- * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
- * NULL, no operation is performed.
+ * Free the virtually continuous memory area starting at @addr, as obtained
+ * from one of the vmalloc() family of APIs. This will usually also free the
+ * physical memory underlying the virtual allocation, but that memory is
+ * reference counted, so it will not be freed until the last user goes away.
*
- * Must not be called in NMI context (strictly speaking, only if we don't
- * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
- * conventions for vfree() arch-depenedent would be a really bad idea)
+ * If @addr is NULL, no operation is performed.
*
+ * Context:
* May sleep if called *not* from interrupt context.
- *
- * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
+ * Must not be called in NMI context (strictly speaking, it could be
+ * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
+ * conventions for vfree() arch-depenedent would be a really bad idea).
*/
void vfree(const void *addr)
{
@@ -2376,8 +2375,11 @@ EXPORT_SYMBOL(vunmap);
* @flags: vm_area->flags
* @prot: page protection for the mapping
*
- * Maps @count pages from @pages into contiguous kernel virtual
- * space.
+ * Maps @count pages from @pages into contiguous kernel virtual space.
+ * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
+ * (which must be kmalloc or vmalloc memory) and one reference per pages in it
+ * are transferred from the caller to vmap(), and will be freed / dropped when
+ * vfree() is called on the return value.
*
* Return: the address of the area or %NULL on failure
*/
@@ -2403,28 +2405,73 @@ void *vmap(struct page **pages, unsigned int count,
return NULL;
}
+ if (flags & VM_MAP_PUT_PAGES)
+ area->pages = pages;
return area->addr;
}
EXPORT_SYMBOL(vmap);
+#ifdef CONFIG_VMAP_PFN
+struct vmap_pfn_data {
+ unsigned long *pfns;
+ pgprot_t prot;
+ unsigned int idx;
+};
+
+static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
+{
+ struct vmap_pfn_data *data = private;
+
+ if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx])))
+ return -EINVAL;
+ *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot));
+ return 0;
+}
+
+/**
+ * vmap_pfn - map an array of PFNs into virtually contiguous space
+ * @pfns: array of PFNs
+ * @count: number of pages to map
+ * @prot: page protection for the mapping
+ *
+ * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
+ * the start address of the mapping.
+ */
+void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
+{
+ struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
+ struct vm_struct *area;
+
+ area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
+ __builtin_return_address(0));
+ if (!area)
+ return NULL;
+ if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
+ count * PAGE_SIZE, vmap_pfn_apply, &data)) {
+ free_vm_area(area);
+ return NULL;
+ }
+ return area->addr;
+}
+EXPORT_SYMBOL_GPL(vmap_pfn);
+#endif /* CONFIG_VMAP_PFN */
+
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
{
- struct page **pages;
- unsigned int nr_pages, array_size, i;
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
- const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
- const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
- 0 :
- __GFP_HIGHMEM;
+ unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
+ unsigned int array_size = nr_pages * sizeof(struct page *), i;
+ struct page **pages;
- nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
- array_size = (nr_pages * sizeof(struct page *));
+ gfp_mask |= __GFP_NOWARN;
+ if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
+ gfp_mask |= __GFP_HIGHMEM;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
- pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
- node, area->caller);
+ pages = __vmalloc_node(array_size, 1, nested_gfp, node,
+ area->caller);
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
@@ -2442,9 +2489,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
struct page *page;
if (node == NUMA_NO_NODE)
- page = alloc_page(alloc_mask|highmem_mask);
+ page = alloc_page(gfp_mask);
else
- page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
+ page = alloc_pages_node(node, gfp_mask, 0);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vfree() */
@@ -3032,54 +3079,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);
-static int f(pte_t *pte, unsigned long addr, void *data)
-{
- pte_t ***p = data;
-
- if (p) {
- *(*p) = pte;
- (*p)++;
- }
- return 0;
-}
-
-/**
- * alloc_vm_area - allocate a range of kernel address space
- * @size: size of the area
- * @ptes: returns the PTEs for the address space
- *
- * Returns: NULL on failure, vm_struct on success
- *
- * This function reserves a range of kernel address space, and
- * allocates pagetables to map that range. No actual mappings
- * are created.
- *
- * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
- * allocated for the VM area are returned.
- */
-struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
-{
- struct vm_struct *area;
-
- area = get_vm_area_caller(size, VM_IOREMAP,
- __builtin_return_address(0));
- if (area == NULL)
- return NULL;
-
- /*
- * This ensures that page tables are constructed for this region
- * of kernel virtual address space and mapped into init_mm.
- */
- if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
- size, f, ptes ? &ptes : NULL)) {
- free_vm_area(area);
- return NULL;
- }
-
- return area;
-}
-EXPORT_SYMBOL_GPL(alloc_vm_area);
-
void free_vm_area(struct vm_struct *area)
{
struct vm_struct *ret;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 879fb57c5045..1b8f0e059767 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -725,8 +725,7 @@ static inline int is_page_cache_freeable(struct page *page)
* that isolated the page, the page cache and optional buffer
* heads at page->private.
*/
- int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ?
- HPAGE_PMD_NR : 1;
+ int page_cache_pins = thp_nr_pages(page);
return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
}
@@ -2240,7 +2239,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
unsigned long anon_cost, file_cost, total_cost;
int swappiness = mem_cgroup_swappiness(memcg);
- u64 fraction[2];
+ u64 fraction[ANON_AND_FILE];
u64 denominator = 0; /* gcc */
enum scan_balance scan_balance;
unsigned long ap, fp;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4f7b4ee6aa12..698bc0bc18d1 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -325,7 +325,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
t = __this_cpu_read(pcp->stat_threshold);
- if (unlikely(x > t || x < -t)) {
+ if (unlikely(abs(x) > t)) {
zone_page_state_add(x, zone, item);
x = 0;
}
@@ -350,7 +350,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
t = __this_cpu_read(pcp->stat_threshold);
- if (unlikely(x > t || x < -t)) {
+ if (unlikely(abs(x) > t)) {
node_page_state_add(x, pgdat, item);
x = 0;
}
@@ -511,7 +511,7 @@ static inline void mod_zone_state(struct zone *zone,
o = this_cpu_read(*p);
n = delta + o;
- if (n > t || n < -t) {
+ if (abs(n) > t) {
int os = overstep_mode * (t >> 1) ;
/* Overflow must be added to zone counters */
@@ -573,7 +573,7 @@ static inline void mod_node_state(struct pglist_data *pgdat,
o = this_cpu_read(*p);
n = delta + o;
- if (n > t || n < -t) {
+ if (abs(n) > t) {
int os = overstep_mode * (t >> 1) ;
/* Overflow must be added to node counters */
diff --git a/mm/workingset.c b/mm/workingset.c
index 92e66113a577..975a4d2dd02e 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -216,7 +216,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
/**
* workingset_age_nonresident - age non-resident entries as LRU ages
- * @memcg: the lruvec that was aged
+ * @lruvec: the lruvec that was aged
* @nr_pages: the number of pages to count
*
* As in-memory pages are aged, non-resident pages need to be aged as
@@ -519,12 +519,11 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
void *arg) __must_hold(lru_lock)
{
struct xa_node *node = container_of(item, struct xa_node, private_list);
- XA_STATE(xas, node->array, 0);
struct address_space *mapping;
int ret;
/*
- * Page cache insertions and deletions synchroneously maintain
+ * Page cache insertions and deletions synchronously maintain
* the shadow node LRU under the i_pages lock and the
* lru_lock. Because the page cache tree is emptied before
* the inode can be destroyed, holding the lru_lock pins any
@@ -559,15 +558,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
if (WARN_ON_ONCE(node->count != node->nr_values))
goto out_invalid;
mapping->nrexceptional -= node->nr_values;
- xas.xa_node = xa_parent_locked(&mapping->i_pages, node);
- xas.xa_offset = node->offset;
- xas.xa_shift = node->shift + XA_CHUNK_SHIFT;
- xas_set_update(&xas, workingset_update_node);
- /*
- * We could store a shadow entry here which was the minimum of the
- * shadow entries we were tracking ...
- */
- xas_store(&xas, NULL);
+ xa_delete_node(node, workingset_update_node);
__inc_lruvec_slab_state(node, WORKINGSET_NODERECLAIM);
out_invalid:
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index c36fdff9a371..918c7b019b3d 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1122,10 +1122,16 @@ static inline int __zs_cpu_up(struct mapping_area *area)
*/
if (area->vm)
return 0;
- area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
+ area->vm = get_vm_area(PAGE_SIZE * 2, 0);
if (!area->vm)
return -ENOMEM;
- return 0;
+
+ /*
+ * Populate ptes in advance to avoid pte allocation with GFP_KERNEL
+ * in non-preemtible context of zs_map_object.
+ */
+ return apply_to_page_range(&init_mm, (unsigned long)area->vm->addr,
+ PAGE_SIZE * 2, NULL, NULL);
}
static inline void __zs_cpu_down(struct mapping_area *area)