Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 12
-rw-r--r--  mm/backing-dev.c | 12
-rw-r--r--  mm/compaction.c | 25
-rw-r--r--  mm/filemap.c | 26
-rw-r--r--  mm/gup.c | 86
-rw-r--r--  mm/gup_benchmark.c | 8
-rw-r--r--  mm/highmem.c | 6
-rw-r--r--  mm/huge_memory.c | 98
-rw-r--r--  mm/hugetlb.c | 91
-rw-r--r--  mm/hugetlb_cgroup.c | 2
-rw-r--r--  mm/internal.h | 21
-rw-r--r--  mm/kasan/Makefile | 8
-rw-r--r--  mm/kasan/generic.c | 1
-rw-r--r--  mm/kasan/tags.c | 1
-rw-r--r--  mm/khugepaged.c | 7
-rw-r--r--  mm/kmemleak.c | 92
-rw-r--r--  mm/ksm.c | 26
-rw-r--r--  mm/maccess.c | 167
-rw-r--r--  mm/madvise.c | 2
-rw-r--r--  mm/memcontrol.c | 98
-rw-r--r--  mm/memory.c | 38
-rw-r--r--  mm/memory_hotplug.c | 103
-rw-r--r--  mm/mempolicy.c | 26
-rw-r--r--  mm/migrate.c | 50
-rw-r--r--  mm/mmap.c | 23
-rw-r--r--  mm/mmu_context.c | 2
-rw-r--r--  mm/mmu_gather.c | 16
-rw-r--r--  mm/mprotect.c | 38
-rw-r--r--  mm/mremap.c | 2
-rw-r--r--  mm/nommu.c | 10
-rw-r--r--  mm/page-writeback.c | 4
-rw-r--r--  mm/page_alloc.c | 255
-rw-r--r--  mm/page_counter.c | 6
-rw-r--r--  mm/page_io.c | 6
-rw-r--r--  mm/shmem.c | 42
-rw-r--r--  mm/slab.c | 90
-rw-r--r--  mm/slab.h | 2
-rw-r--r--  mm/slab_common.c | 67
-rw-r--r--  mm/slub.c | 215
-rw-r--r--  mm/swap.c | 74
-rw-r--r--  mm/swap_state.c | 4
-rw-r--r--  mm/util.c | 18
-rw-r--r--  mm/vmalloc.c | 52
-rw-r--r--  mm/vmscan.c | 9
-rw-r--r--  mm/vmstat.c | 14
-rw-r--r--  mm/workingset.c | 5
-rw-r--r--  mm/zsmalloc.c | 85
-rw-r--r--  mm/zswap.c | 12
48 files changed, 1479 insertions(+), 578 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index f0c76ba47695..3925eacd96d6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -373,7 +373,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
config TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
- depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
select COMPACTION
select XARRAY_MULTI
help
@@ -765,4 +765,14 @@ config GUP_BENCHMARK
config ARCH_HAS_PTE_SPECIAL
bool
+#
+# Some architectures require a special hugepage directory format to
+# support multiple hugepage sizes. For example, a4fe3ce76
+# "powerpc/mm: Allow more flexible layouts for hugepage pagetables"
+# introduced it on powerpc, allowing for more flexible hugepage
+# pagetable layouts.
+#
+config ARCH_HAS_HUGEPD
+ bool
+
endmenu
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 909dae445ea7..18ddd26967b6 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -20,6 +20,7 @@ struct backing_dev_info noop_backing_dev_info = {
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
static struct class *bdi_class;
+static const char *bdi_unknown_name = "(unknown)";
/*
* bdi_lock protects updates to bdi_list. bdi_list has RCU reader side
@@ -880,7 +881,8 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
if (bdi->dev) /* The driver needs to use separate queues per device */
return 0;
- dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0), bdi, fmt, args);
+ vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
+ dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
if (IS_ERR(dev))
return PTR_ERR(dev);
@@ -976,6 +978,14 @@ void bdi_put(struct backing_dev_info *bdi)
}
EXPORT_SYMBOL(bdi_put);
+const char *bdi_dev_name(struct backing_dev_info *bdi)
+{
+ if (!bdi || !bdi->dev)
+ return bdi_unknown_name;
+ return bdi->dev_name;
+}
+EXPORT_SYMBOL_GPL(bdi_dev_name);
+
static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
diff --git a/mm/compaction.c b/mm/compaction.c
index 5ab9c2b22693..a0e19d40ecc2 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2241,10 +2241,12 @@ check_drain:
block_start_pfn(cc->migrate_pfn, cc->order);
if (last_migrated_pfn < current_block_start) {
- cpu = get_cpu();
+ cpu = get_cpu_light();
+ local_lock_irq(swapvec_lock);
lru_add_drain_cpu(cpu);
+ local_unlock_irq(swapvec_lock);
drain_local_pages(cc->zone);
- put_cpu();
+ put_cpu_light();
/* No more flushing until we migrate again */
last_migrated_pfn = 0;
}
@@ -2311,16 +2313,26 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
.page = NULL,
};
- if (capture)
- current->capture_control = &capc;
+ /*
+ * Make sure the structs are really initialized before we expose the
+ * capture control, in case we are interrupted and the interrupt handler
+ * frees a page.
+ */
+ barrier();
+ WRITE_ONCE(current->capture_control, &capc);
ret = compact_zone(&cc, &capc);
VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages));
- *capture = capc.page;
- current->capture_control = NULL;
+ /*
+ * Make sure we hide capture control first before we read the captured
+ * page pointer, otherwise an interrupt could free and capture a page
+ * and we would leak it.
+ */
+ WRITE_ONCE(current->capture_control, NULL);
+ *capture = READ_ONCE(capc.page);
return ret;
}
@@ -2334,6 +2346,7 @@ int sysctl_extfrag_threshold = 500;
* @alloc_flags: The allocation flags of the current allocation
* @ac: The context of current allocation
* @prio: Determines how hard direct compaction should try to succeed
+ * @capture: Pointer to free page created by compaction will be stored here
*
* This is the main entry point for direct page compaction.
*/
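
The capture_control hunk above relies on ordering: the struct must be fully initialized before the pointer is published, and the pointer must be hidden again before the captured page is read, or an interrupt could free and capture a page that is then leaked. Below is a minimal user-space sketch of that publish/hide ordering, using C11 atomics in place of the kernel's barrier()/WRITE_ONCE()/READ_ONCE(); the simplified struct and function names are illustrative, not taken from the patch.

#include <stdatomic.h>
#include <stddef.h>

struct capture_control {
	_Atomic(void *) page;		/* set asynchronously by the "interrupt" side */
};

/* Stands in for current->capture_control. */
static _Atomic(struct capture_control *) capture_slot;

void *compact_with_capture(void)
{
	struct capture_control capc = { .page = NULL };

	/* Publish only after capc is fully initialized (release ordering
	 * mirrors barrier() + WRITE_ONCE() in the patch). */
	atomic_store_explicit(&capture_slot, &capc, memory_order_release);

	/* ... compaction runs here; an interrupt may store into capc.page ... */

	/* Hide the slot first, then read the captured page, so nothing can
	 * be captured after we stop looking at capc.page. */
	atomic_store_explicit(&capture_slot, NULL, memory_order_release);
	return atomic_load_explicit(&capc.page, memory_order_acquire);
}
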
diff --git a/mm/filemap.c b/mm/filemap.c
index 7ac5b439794c..7301431fb34e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -408,7 +408,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
.range_end = end,
};
- if (!mapping_cap_writeback_dirty(mapping))
+ if (!mapping_cap_writeback_dirty(mapping) ||
+ !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
wbc_attach_fdatawrite_inode(&wbc, mapping->host);
@@ -2349,27 +2350,6 @@ EXPORT_SYMBOL(generic_file_read_iter);
#ifdef CONFIG_MMU
#define MMAP_LOTSAMISS (100)
-static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
- struct file *fpin)
-{
- int flags = vmf->flags;
-
- if (fpin)
- return fpin;
-
- /*
- * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
- * anything, so we only pin the file and drop the mmap_sem if only
- * FAULT_FLAG_ALLOW_RETRY is set.
- */
- if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
- FAULT_FLAG_ALLOW_RETRY) {
- fpin = get_file(vmf->vma->vm_file);
- up_read(&vmf->vma->vm_mm->mmap_sem);
- }
- return fpin;
-}
-
/*
* lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem
* @vmf - the vm_fault for this fault.
@@ -2479,7 +2459,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
pgoff_t offset = vmf->pgoff;
/* If we don't want any read-ahead, don't bother */
- if (vmf->vma->vm_flags & VM_RAND_READ)
+ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
return fpin;
if (ra->mmap_miss > 0)
ra->mmap_miss--;
diff --git a/mm/gup.c b/mm/gup.c
index d2c14fc4b5d4..68177ec67de7 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1881,8 +1881,92 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
}
#endif
+#ifdef CONFIG_ARCH_HAS_HUGEPD
+static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
+ unsigned long sz)
+{
+ unsigned long __boundary = (addr + sz) & ~(sz-1);
+ return (__boundary - 1 < end - 1) ? __boundary : end;
+}
+
+static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ unsigned long pte_end;
+ struct page *head, *page;
+ pte_t pte;
+ int refs;
+
+ pte_end = (addr + sz) & ~(sz-1);
+ if (pte_end < end)
+ end = pte_end;
+
+ pte = READ_ONCE(*ptep);
+
+ if (!pte_access_permitted(pte, flags & FOLL_WRITE))
+ return 0;
+
+ /* hugepages are never "special" */
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+ refs = 0;
+ head = pte_page(pte);
+
+ page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
+ do {
+ VM_BUG_ON(compound_head(page) != head);
+ pages[*nr] = page;
+ (*nr)++;
+ page++;
+ refs++;
+ } while (addr += PAGE_SIZE, addr != end);
+
+ if (!page_cache_add_speculative(head, refs)) {
+ *nr -= refs;
+ return 0;
+ }
+
+ if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ /* Could be optimized better */
+ *nr -= refs;
+ while (refs--)
+ put_page(head);
+ return 0;
+ }
+
+ return 1;
+}
+
+static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
+ unsigned int pdshift, unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ pte_t *ptep;
+ unsigned long sz = 1UL << hugepd_shift(hugepd);
+ unsigned long next;
+
+ ptep = hugepte_offset(hugepd, addr, pdshift);
+ do {
+ next = hugepte_addr_end(addr, end, sz);
+ if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
+ return 0;
+ } while (ptep++, addr = next, addr != end);
+
+ return 1;
+}
+#else
+static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
+ unsigned int pdshift, unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ return 0;
+}
+#endif /* CONFIG_ARCH_HAS_HUGEPD */
+
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
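
The hugepd walk added above steps through a huge-page directory in sz-sized chunks, and hugepte_addr_end() clamps each step to whichever comes first: the next sz-aligned boundary or the overall end. A standalone sketch of that arithmetic follows; the addresses and the 16 MB size are made up for illustration.

#include <stdio.h>

/* Same rounding as hugepte_addr_end(): next sz-aligned boundary, capped at end. */
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long boundary = (addr + sz) & ~(sz - 1);

	return (boundary - 1 < end - 1) ? boundary : end;
}

int main(void)
{
	unsigned long sz = 1UL << 24;	/* a 16 MB huge page, purely illustrative */

	/* The step stops at the next boundary when it lies below 'end'... */
	printf("%#lx\n", hugepte_addr_end(0x1000000, 0x4000000, sz));	/* 0x2000000 */
	/* ...and is clamped to 'end' otherwise. */
	printf("%#lx\n", hugepte_addr_end(0x1000000, 0x1800000, sz));	/* 0x1800000 */
	return 0;
}
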
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index 7dd602d7f8db..ad9d5b1c4473 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -26,6 +26,7 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
unsigned long i, nr_pages, addr, next;
int nr;
struct page **pages;
+ int ret = 0;
if (gup->size > ULONG_MAX)
return -EINVAL;
@@ -63,7 +64,9 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
NULL);
break;
default:
- return -1;
+ kvfree(pages);
+ ret = -EINVAL;
+ goto out;
}
if (nr <= 0)
@@ -85,7 +88,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
gup->put_delta_usec = ktime_us_delta(end_time, start_time);
kvfree(pages);
- return 0;
+out:
+ return ret;
}
static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd,
diff --git a/mm/highmem.c b/mm/highmem.c
index 107b10f9878e..4b67b9d2fc49 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -30,9 +30,11 @@
#include <linux/kgdb.h>
#include <asm/tlbflush.h>
-
+#ifndef CONFIG_PREEMPT_RT_FULL
#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
DEFINE_PER_CPU(int, __kmap_atomic_idx);
+EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
+#endif
#endif
/*
@@ -108,8 +110,6 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
atomic_long_t _totalhigh_pages __read_mostly;
EXPORT_SYMBOL(_totalhigh_pages);
-EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
-
unsigned int nr_free_highpages (void)
{
struct zone *zone;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 83236e22a8a8..7cc039aa63ab 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -172,16 +172,13 @@ static ssize_t enabled_store(struct kobject *kobj,
{
ssize_t ret = count;
- if (!memcmp("always", buf,
- min(sizeof("always")-1, count))) {
+ if (sysfs_streq(buf, "always")) {
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("madvise", buf,
- min(sizeof("madvise")-1, count))) {
+ } else if (sysfs_streq(buf, "madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("never", buf,
- min(sizeof("never")-1, count))) {
+ } else if (sysfs_streq(buf, "never")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
} else
@@ -245,32 +242,27 @@ static ssize_t defrag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- if (!memcmp("always", buf,
- min(sizeof("always")-1, count))) {
+ if (sysfs_streq(buf, "always")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("defer+madvise", buf,
- min(sizeof("defer+madvise")-1, count))) {
+ } else if (sysfs_streq(buf, "defer+madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("defer", buf,
- min(sizeof("defer")-1, count))) {
+ } else if (sysfs_streq(buf, "defer")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("madvise", buf,
- min(sizeof("madvise")-1, count))) {
+ } else if (sysfs_streq(buf, "madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("never", buf,
- min(sizeof("never")-1, count))) {
+ } else if (sysfs_streq(buf, "never")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
@@ -508,13 +500,13 @@ void prep_transhuge_page(struct page *page)
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}
-static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
+static unsigned long __thp_get_unmapped_area(struct file *filp,
+ unsigned long addr, unsigned long len,
loff_t off, unsigned long flags, unsigned long size)
{
- unsigned long addr;
loff_t off_end = off + len;
loff_t off_align = round_up(off, size);
- unsigned long len_pad;
+ unsigned long len_pad, ret;
if (off_end <= off_align || (off_end - off_align) < size)
return 0;
@@ -523,30 +515,40 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long le
if (len_pad < len || (off + len_pad) < off)
return 0;
- addr = current->mm->get_unmapped_area(filp, 0, len_pad,
+ ret = current->mm->get_unmapped_area(filp, addr, len_pad,
off >> PAGE_SHIFT, flags);
- if (IS_ERR_VALUE(addr))
+
+ /*
+ * The failure might be due to length padding. The caller will retry
+ * without the padding.
+ */
+ if (IS_ERR_VALUE(ret))
return 0;
- addr += (off - addr) & (size - 1);
- return addr;
+ /*
+ * Do not try to align to THP boundary if allocation at the address
+ * hint succeeds.
+ */
+ if (ret == addr)
+ return addr;
+
+ ret += (off - ret) & (size - 1);
+ return ret;
}
unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
+ unsigned long ret;
loff_t off = (loff_t)pgoff << PAGE_SHIFT;
- if (addr)
- goto out;
if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
goto out;
- addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
- if (addr)
- return addr;
-
- out:
+ ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
+ if (ret)
+ return ret;
+out:
return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
@@ -2281,6 +2283,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
{
spinlock_t *ptl;
struct mmu_notifier_range range;
+ bool was_locked = false;
+ pmd_t _pmd;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address & HPAGE_PMD_MASK,
@@ -2293,11 +2297,32 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* pmd against. Otherwise we can end up replacing wrong page.
*/
VM_BUG_ON(freeze && !page);
- if (page && page != pmd_page(*pmd))
- goto out;
+ if (page) {
+ VM_WARN_ON_ONCE(!PageLocked(page));
+ was_locked = true;
+ if (page != pmd_page(*pmd))
+ goto out;
+ }
+repeat:
if (pmd_trans_huge(*pmd)) {
- page = pmd_page(*pmd);
+ if (!page) {
+ page = pmd_page(*pmd);
+ if (unlikely(!trylock_page(page))) {
+ get_page(page);
+ _pmd = *pmd;
+ spin_unlock(ptl);
+ lock_page(page);
+ spin_lock(ptl);
+ if (unlikely(!pmd_same(*pmd, _pmd))) {
+ unlock_page(page);
+ put_page(page);
+ page = NULL;
+ goto repeat;
+ }
+ put_page(page);
+ }
+ }
if (PageMlocked(page))
clear_page_mlock(page);
} else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
@@ -2305,6 +2330,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
__split_huge_pmd_locked(vma, pmd, range.start, freeze);
out:
spin_unlock(ptl);
+ if (!was_locked && page)
+ unlock_page(page);
/*
* No need to double call mmu_notifier->invalidate_range() callback.
* They are 3 cases to consider inside __split_huge_pmd_locked():
@@ -2664,7 +2691,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
unsigned long flags;
pgoff_t end;
- VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
+ VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageCompound(page), page);
@@ -2953,8 +2980,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
return;
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
- pmdval = *pvmw->pmd;
- pmdp_invalidate(vma, address, pvmw->pmd);
+ pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
if (pmd_dirty(pmdval))
set_page_dirty(page);
entry = make_migration_entry(page, pmd_write(pmdval));
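
The enabled_store()/defrag_store() hunks above replace open-coded memcmp() checks with sysfs_streq(). The old pattern clamps the comparison length to the write size, so any prefix of a keyword (or a keyword followed by junk) is accepted; sysfs_streq() demands a full match and only tolerates a trailing newline. A user-space sketch of the difference is below; sysfs_streq_like() re-creates the helper's behaviour for illustration and is not the kernel function itself.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Full-string comparison that ignores one trailing newline on either side,
 * approximating what sysfs_streq() does. */
static bool sysfs_streq_like(const char *s1, const char *s2)
{
	while (*s1 && *s1 == *s2) {
		s1++;
		s2++;
	}
	if (*s1 == *s2)
		return true;
	if (!*s1 && *s2 == '\n' && !s2[1])
		return true;
	if (*s1 == '\n' && !s1[1] && !*s2)
		return true;
	return false;
}

int main(void)
{
	const char *buf = "a";		/* what a user echoed into the sysfs file */
	size_t count = strlen(buf);
	size_t n = count < sizeof("always") - 1 ? count : sizeof("always") - 1;

	/* Old check: the prefix "a" is wrongly accepted as "always". */
	printf("memcmp: %d\n", !memcmp("always", buf, n));		/* 1 */

	/* New check: only the full keyword matches, newline or not. */
	printf("streq:  %d\n", sysfs_streq_like(buf, "always"));	/* 0 */
	printf("streq:  %d\n", sysfs_streq_like("always\n", "always"));	/* 1 */
	return 0;
}
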
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4c842adf14e0..1eedc7003380 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -27,6 +27,7 @@
#include <linux/swapops.h>
#include <linux/jhash.h>
#include <linux/numa.h>
+#include <linux/llist.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -1255,7 +1256,7 @@ static inline void ClearPageHugeTemporary(struct page *page)
page[2].mapping = NULL;
}
-void free_huge_page(struct page *page)
+static void __free_huge_page(struct page *page)
{
/*
* Can't pass hstate in here because it is called from the
@@ -1318,6 +1319,54 @@ void free_huge_page(struct page *page)
spin_unlock(&hugetlb_lock);
}
+/*
+ * As free_huge_page() can be called from a non-task context, we have
+ * to defer the actual freeing in a workqueue to prevent potential
+ * hugetlb_lock deadlock.
+ *
+ * free_hpage_workfn() locklessly retrieves the linked list of pages to
+ * be freed and frees them one-by-one. As the page->mapping pointer is
+ * going to be cleared in __free_huge_page() anyway, it is reused as the
+ * llist_node structure of a lockless linked list of huge pages to be freed.
+ */
+static LLIST_HEAD(hpage_freelist);
+
+static void free_hpage_workfn(struct work_struct *work)
+{
+ struct llist_node *node;
+ struct page *page;
+
+ node = llist_del_all(&hpage_freelist);
+
+ while (node) {
+ page = container_of((struct address_space **)node,
+ struct page, mapping);
+ node = node->next;
+ __free_huge_page(page);
+ }
+}
+static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
+
+void free_huge_page(struct page *page)
+{
+ /*
+ * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
+ */
+ if (!in_task()) {
+ /*
+ * Only call schedule_work() if hpage_freelist is previously
+ * empty. Otherwise, schedule_work() had been called but the
+ * workfn hasn't retrieved the list yet.
+ */
+ if (llist_add((struct llist_node *)&page->mapping,
+ &hpage_freelist))
+ schedule_work(&free_hpage_work);
+ return;
+ }
+
+ __free_huge_page(page);
+}
+
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
INIT_LIST_HEAD(&page->lru);
@@ -2964,6 +3013,22 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
}
#ifdef CONFIG_SYSCTL
+static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *length,
+ loff_t *ppos, unsigned long *out)
+{
+ struct ctl_table dup_table;
+
+ /*
+ * In order to avoid races with __do_proc_doulongvec_minmax(), we
+ * can duplicate the @table and alter the duplicate of it.
+ */
+ dup_table = *table;
+ dup_table.data = out;
+
+ return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
+}
+
static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
@@ -2975,9 +3040,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
if (!hugepages_supported())
return -EOPNOTSUPP;
- table->data = &tmp;
- table->maxlen = sizeof(unsigned long);
- ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
+ &tmp);
if (ret)
goto out;
@@ -3021,9 +3085,8 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
if (write && hstate_is_gigantic(h))
return -EINVAL;
- table->data = &tmp;
- table->maxlen = sizeof(unsigned long);
- ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
+ &tmp);
if (ret)
goto out;
@@ -4879,8 +4942,8 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
{
pgd_t *pgd;
p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
+ pud_t *pud, pud_entry;
+ pmd_t *pmd, pmd_entry;
pgd = pgd_offset(mm, addr);
if (!pgd_present(*pgd))
@@ -4890,17 +4953,19 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
return NULL;
pud = pud_offset(p4d, addr);
- if (sz != PUD_SIZE && pud_none(*pud))
+ pud_entry = READ_ONCE(*pud);
+ if (sz != PUD_SIZE && pud_none(pud_entry))
return NULL;
/* hugepage or swap? */
- if (pud_huge(*pud) || !pud_present(*pud))
+ if (pud_huge(pud_entry) || !pud_present(pud_entry))
return (pte_t *)pud;
pmd = pmd_offset(pud, addr);
- if (sz != PMD_SIZE && pmd_none(*pmd))
+ pmd_entry = READ_ONCE(*pmd);
+ if (sz != PMD_SIZE && pmd_none(pmd_entry))
return NULL;
/* hugepage or swap? */
- if (pmd_huge(*pmd) || !pmd_present(*pmd))
+ if (pmd_huge(pmd_entry) || !pmd_present(pmd_entry))
return (pte_t *)pmd;
return NULL;
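
free_huge_page() above defers the real work to a workqueue when it runs in a context that must not take hugetlb_lock, pushing pages onto a lock-free list and scheduling the worker only on the empty-to-non-empty transition. Below is a user-space analogue of that pattern using C11 atomics; llist_add_like(), free_workfn() and the node type are illustrative stand-ins, not the kernel llist API.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct node {
	struct node *next;
};

static _Atomic(struct node *) freelist;		/* plays the role of hpage_freelist */

/* Lock-free push; returns true only when the list was empty, which is the
 * one case where the kernel code needs to call schedule_work(). */
static bool llist_add_like(struct node *n)
{
	struct node *first = atomic_load_explicit(&freelist, memory_order_relaxed);

	do {
		n->next = first;
	} while (!atomic_compare_exchange_weak_explicit(&freelist, &first, n,
							memory_order_release,
							memory_order_relaxed));
	return first == NULL;
}

/* Worker side: detach the whole list at once, then free entries one by one,
 * like free_hpage_workfn() calling __free_huge_page(). */
static void free_workfn(void)
{
	struct node *n = atomic_exchange_explicit(&freelist, NULL,
						  memory_order_acquire);

	while (n) {
		struct node *next = n->next;

		printf("freeing %p\n", (void *)n);
		n = next;
	}
}

int main(void)
{
	static struct node a, b;

	if (llist_add_like(&a))
		printf("schedule work\n");	/* first push: worker gets scheduled */
	if (llist_add_like(&b))
		printf("schedule work\n");	/* not printed: worker already pending */
	free_workfn();
	return 0;
}
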
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 68c2f2f3c05b..7a93e1e439dd 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -196,7 +196,7 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
again:
rcu_read_lock();
h_cg = hugetlb_cgroup_from_task(current);
- if (!css_tryget_online(&h_cg->css)) {
+ if (!css_tryget(&h_cg->css)) {
rcu_read_unlock();
goto again;
}
diff --git a/mm/internal.h b/mm/internal.h
index e32390802fd3..5d91f9e90741 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -362,6 +362,27 @@ vma_address(struct page *page, struct vm_area_struct *vma)
return max(start, vma->vm_start);
}
+static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
+ struct file *fpin)
+{
+ int flags = vmf->flags;
+
+ if (fpin)
+ return fpin;
+
+ /*
+ * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
+ * anything, so we only pin the file and drop the mmap_sem if only
+ * FAULT_FLAG_ALLOW_RETRY is set.
+ */
+ if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
+ FAULT_FLAG_ALLOW_RETRY) {
+ fpin = get_file(vmf->vma->vm_file);
+ up_read(&vmf->vma->vm_mm->mmap_sem);
+ }
+ return fpin;
+}
+
#else /* !CONFIG_MMU */
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 08b43de2383b..f36ffc090f5f 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -14,10 +14,10 @@ CFLAGS_REMOVE_tags.o = $(CC_FLAGS_FTRACE)
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
-CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
-CFLAGS_generic.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
-CFLAGS_generic_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
-CFLAGS_tags.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING
+CFLAGS_generic.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING
+CFLAGS_generic_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING
+CFLAGS_tags.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING
obj-$(CONFIG_KASAN) := common.o init.o report.o
obj-$(CONFIG_KASAN_GENERIC) += generic.o generic_report.o quarantine.o
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 504c79363a34..4acdf7e1f965 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -15,7 +15,6 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define DISABLE_BRANCH_PROFILING
#include <linux/export.h>
#include <linux/interrupt.h>
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index 63fca3172659..0d83b7fc21fd 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -12,7 +12,6 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define DISABLE_BRANCH_PROFILING
#include <linux/export.h>
#include <linux/interrupt.h>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index eaaa21b23215..5ce6d8728e2b 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1016,12 +1016,13 @@ static void collapse_huge_page(struct mm_struct *mm,
anon_vma_lock_write(vma->anon_vma);
- pte = pte_offset_map(pmd, address);
- pte_ptl = pte_lockptr(mm, pmd);
-
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
address, address + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
+
+ pte = pte_offset_map(pmd, address);
+ pte_ptl = pte_lockptr(mm, pmd);
+
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
* After this gup_fast can't run anymore. This also removes
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 3afb01bce736..355dd95d0611 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -13,7 +13,7 @@
*
* The following locks and mutexes are used by kmemleak:
*
- * - kmemleak_lock (rwlock): protects the object_list modifications and
+ * - kmemleak_lock (raw spinlock): protects the object_list modifications and
* accesses to the object_tree_root. The object_list is the main list
* holding the metadata (struct kmemleak_object) for the allocated memory
* blocks. The object_tree_root is a red black tree used to look-up
@@ -135,7 +135,7 @@ struct kmemleak_scan_area {
* (use_count) and freed using the RCU mechanism.
*/
struct kmemleak_object {
- spinlock_t lock;
+ raw_spinlock_t lock;
unsigned int flags; /* object status flags */
struct list_head object_list;
struct list_head gray_list;
@@ -186,7 +186,7 @@ static LIST_HEAD(gray_list);
/* search tree for object boundaries */
static struct rb_root object_tree_root = RB_ROOT;
/* rw_lock protecting the access to object_list and object_tree_root */
-static DEFINE_RWLOCK(kmemleak_lock);
+static DEFINE_RAW_SPINLOCK(kmemleak_lock);
/* allocation caches for kmemleak internal data */
static struct kmem_cache *object_cache;
@@ -497,9 +497,9 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
struct kmemleak_object *object;
rcu_read_lock();
- read_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = lookup_object(ptr, alias);
- read_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
/* check whether the object is still available */
if (object && !get_object(object))
@@ -519,13 +519,13 @@ static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int ali
unsigned long flags;
struct kmemleak_object *object;
- write_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = lookup_object(ptr, alias);
if (object) {
rb_erase(&object->rb_node, &object_tree_root);
list_del_rcu(&object->object_list);
}
- write_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
return object;
}
@@ -560,7 +560,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
INIT_LIST_HEAD(&object->object_list);
INIT_LIST_HEAD(&object->gray_list);
INIT_HLIST_HEAD(&object->area_list);
- spin_lock_init(&object->lock);
+ raw_spin_lock_init(&object->lock);
atomic_set(&object->use_count, 1);
object->flags = OBJECT_ALLOCATED;
object->pointer = ptr;
@@ -592,7 +592,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
/* kernel backtrace */
object->trace_len = __save_stack_trace(object->trace);
- write_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
min_addr = min(min_addr, untagged_ptr);
@@ -624,7 +624,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
list_add_tail_rcu(&object->object_list, &object_list);
out:
- write_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
return object;
}
@@ -642,9 +642,9 @@ static void __delete_object(struct kmemleak_object *object)
* Locking here also ensures that the corresponding memory block
* cannot be freed when it is being scanned.
*/
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
object->flags &= ~OBJECT_ALLOCATED;
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -716,9 +716,9 @@ static void paint_it(struct kmemleak_object *object, int color)
{
unsigned long flags;
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
__paint_it(object, color);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
static void paint_ptr(unsigned long ptr, int color)
@@ -778,7 +778,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
goto out;
}
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if (size == SIZE_MAX) {
size = object->pointer + object->size - ptr;
} else if (ptr + size > object->pointer + object->size) {
@@ -794,7 +794,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
hlist_add_head(&area->node, &object->area_list);
out_unlock:
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
out:
put_object(object);
}
@@ -817,9 +817,9 @@ static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref)
return;
}
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
object->excess_ref = excess_ref;
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -839,9 +839,9 @@ static void object_no_scan(unsigned long ptr)
return;
}
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
object->flags |= OBJECT_NO_SCAN;
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -902,11 +902,11 @@ static void early_alloc(struct early_log *log)
log->min_count, GFP_ATOMIC);
if (!object)
goto out;
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
for (i = 0; i < log->trace_len; i++)
object->trace[i] = log->trace[i];
object->trace_len = log->trace_len;
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
out:
rcu_read_unlock();
}
@@ -1096,9 +1096,9 @@ void __ref kmemleak_update_trace(const void *ptr)
return;
}
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
object->trace_len = __save_stack_trace(object->trace);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -1311,7 +1311,7 @@ static void scan_block(void *_start, void *_end,
unsigned long flags;
unsigned long untagged_ptr;
- read_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
for (ptr = start; ptr < end; ptr++) {
struct kmemleak_object *object;
unsigned long pointer;
@@ -1346,7 +1346,7 @@ static void scan_block(void *_start, void *_end,
* previously acquired in scan_object(). These locks are
* enclosed by scan_mutex.
*/
- spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
/* only pass surplus references (object already gray) */
if (color_gray(object)) {
excess_ref = object->excess_ref;
@@ -1355,7 +1355,7 @@ static void scan_block(void *_start, void *_end,
excess_ref = 0;
update_refs(object);
}
- spin_unlock(&object->lock);
+ raw_spin_unlock(&object->lock);
if (excess_ref) {
object = lookup_object(excess_ref, 0);
@@ -1364,12 +1364,12 @@ static void scan_block(void *_start, void *_end,
if (object == scanned)
/* circular reference, ignore */
continue;
- spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
update_refs(object);
- spin_unlock(&object->lock);
+ raw_spin_unlock(&object->lock);
}
}
- read_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
}
/*
@@ -1402,7 +1402,7 @@ static void scan_object(struct kmemleak_object *object)
* Once the object->lock is acquired, the corresponding memory block
* cannot be freed (the same lock is acquired in delete_object).
*/
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if (object->flags & OBJECT_NO_SCAN)
goto out;
if (!(object->flags & OBJECT_ALLOCATED))
@@ -1421,9 +1421,9 @@ static void scan_object(struct kmemleak_object *object)
if (start >= end)
break;
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
cond_resched();
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
} while (object->flags & OBJECT_ALLOCATED);
} else
hlist_for_each_entry(area, &object->area_list, node)
@@ -1431,7 +1431,7 @@ static void scan_object(struct kmemleak_object *object)
(void *)(area->start + area->size),
object);
out:
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
/*
@@ -1484,7 +1484,7 @@ static void kmemleak_scan(void)
/* prepare the kmemleak_object's */
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
#ifdef DEBUG
/*
* With a few exceptions there should be a maximum of
@@ -1501,7 +1501,7 @@ static void kmemleak_scan(void)
if (color_gray(object) && get_object(object))
list_add_tail(&object->gray_list, &gray_list);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
@@ -1569,14 +1569,14 @@ static void kmemleak_scan(void)
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
&& update_checksum(object) && get_object(object)) {
/* color it gray temporarily */
object->count = object->min_count;
list_add_tail(&object->gray_list, &gray_list);
}
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
@@ -1596,7 +1596,7 @@ static void kmemleak_scan(void)
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if (unreferenced_object(object) &&
!(object->flags & OBJECT_REPORTED)) {
object->flags |= OBJECT_REPORTED;
@@ -1606,7 +1606,7 @@ static void kmemleak_scan(void)
new_leaks++;
}
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
@@ -1758,10 +1758,10 @@ static int kmemleak_seq_show(struct seq_file *seq, void *v)
struct kmemleak_object *object = v;
unsigned long flags;
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object))
print_unreferenced(seq, object);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
return 0;
}
@@ -1791,9 +1791,9 @@ static int dump_str_object_info(const char *str)
return -EINVAL;
}
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
dump_object_info(object);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
return 0;
@@ -1812,11 +1812,11 @@ static void kmemleak_clear(void)
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if ((object->flags & OBJECT_REPORTED) &&
unreferenced_object(object))
__paint_it(object, KMEMLEAK_GREY);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
diff --git a/mm/ksm.c b/mm/ksm.c
index 3dc4346411e4..d614a49ab82c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -885,13 +885,13 @@ static int remove_stable_node(struct stable_node *stable_node)
return 0;
}
- if (WARN_ON_ONCE(page_mapped(page))) {
- /*
- * This should not happen: but if it does, just refuse to let
- * merge_across_nodes be switched - there is no need to panic.
- */
- err = -EBUSY;
- } else {
+ /*
+ * Page could be still mapped if this races with __mmput() running in
+ * between ksm_exit() and exit_mmap(). Just refuse to let
+ * merge_across_nodes/max_page_sharing be switched.
+ */
+ err = -EBUSY;
+ if (!page_mapped(page)) {
/*
* The stable node did not yet appear stale to get_ksm_page(),
* since that allows for an unmapped ksm page to be recognized
@@ -2130,8 +2130,16 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
down_read(&mm->mmap_sem);
vma = find_mergeable_vma(mm, rmap_item->address);
- err = try_to_merge_one_page(vma, page,
- ZERO_PAGE(rmap_item->address));
+ if (vma) {
+ err = try_to_merge_one_page(vma, page,
+ ZERO_PAGE(rmap_item->address));
+ } else {
+ /*
+ * If the vma is out of date, we do not need to
+ * continue.
+ */
+ err = 0;
+ }
up_read(&mm->mmap_sem);
/*
* In case of failure, the page was not really empty, so we
diff --git a/mm/maccess.c b/mm/maccess.c
index 482d4d670f19..2d3c3d01064c 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -6,8 +6,32 @@
#include <linux/mm.h>
#include <linux/uaccess.h>
+static __always_inline long
+probe_read_common(void *dst, const void __user *src, size_t size)
+{
+ long ret;
+
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst, src, size);
+ pagefault_enable();
+
+ return ret ? -EFAULT : 0;
+}
+
+static __always_inline long
+probe_write_common(void __user *dst, const void *src, size_t size)
+{
+ long ret;
+
+ pagefault_disable();
+ ret = __copy_to_user_inatomic(dst, src, size);
+ pagefault_enable();
+
+ return ret ? -EFAULT : 0;
+}
+
/**
- * probe_kernel_read(): safely attempt to read from a location
+ * probe_kernel_read(): safely attempt to read from a kernel-space location
* @dst: pointer to the buffer that shall take the data
* @src: address to read from
* @size: size of the data chunk
@@ -30,17 +54,41 @@ long __probe_kernel_read(void *dst, const void *src, size_t size)
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
- pagefault_disable();
- ret = __copy_from_user_inatomic(dst,
- (__force const void __user *)src, size);
- pagefault_enable();
+ ret = probe_read_common(dst, (__force const void __user *)src, size);
set_fs(old_fs);
- return ret ? -EFAULT : 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(probe_kernel_read);
/**
+ * probe_user_read(): safely attempt to read from a user-space location
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from. This must be a user address.
+ * @size: size of the data chunk
+ *
+ * Safely read from user address @src to the buffer at @dst. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+
+long __weak probe_user_read(void *dst, const void __user *src, size_t size)
+ __attribute__((alias("__probe_user_read")));
+
+long __probe_user_read(void *dst, const void __user *src, size_t size)
+{
+ long ret = -EFAULT;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(USER_DS);
+ if (access_ok(src, size))
+ ret = probe_read_common(dst, src, size);
+ set_fs(old_fs);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(probe_user_read);
+
+/**
* probe_kernel_write(): safely attempt to write to a location
* @dst: address to write to
* @src: pointer to the data that shall be written
@@ -49,6 +97,7 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
* Safely write to address @dst from the buffer at @src. If a kernel fault
* happens, handle that and return -EFAULT.
*/
+
long __weak probe_kernel_write(void *dst, const void *src, size_t size)
__attribute__((alias("__probe_kernel_write")));
@@ -58,16 +107,41 @@ long __probe_kernel_write(void *dst, const void *src, size_t size)
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
- pagefault_disable();
- ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
- pagefault_enable();
+ ret = probe_write_common((__force void __user *)dst, src, size);
set_fs(old_fs);
- return ret ? -EFAULT : 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(probe_kernel_write);
/**
+ * probe_user_write(): safely attempt to write to a user-space location
+ * @dst: address to write to
+ * @src: pointer to the data that shall be written
+ * @size: size of the data chunk
+ *
+ * Safely write to address @dst from the buffer at @src. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+
+long __weak probe_user_write(void __user *dst, const void *src, size_t size)
+ __attribute__((alias("__probe_user_write")));
+
+long __probe_user_write(void __user *dst, const void *src, size_t size)
+{
+ long ret = -EFAULT;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(USER_DS);
+ if (access_ok(dst, size))
+ ret = probe_write_common(dst, src, size);
+ set_fs(old_fs);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(probe_user_write);
+
+/**
* strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address.
* @dst: Destination address, in kernel space. This buffer must be at
* least @count bytes long.
@@ -106,3 +180,76 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
return ret ? -EFAULT : src - unsafe_addr;
}
+
+/**
+ * strncpy_from_unsafe_user: - Copy a NUL terminated string from unsafe user
+ * address.
+ * @dst: Destination address, in kernel space. This buffer must be at
+ * least @count bytes long.
+ * @unsafe_addr: Unsafe user address.
+ * @count: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Copies a NUL-terminated string from unsafe user address to kernel buffer.
+ *
+ * On success, returns the length of the string INCLUDING the trailing NUL.
+ *
+ * If access fails, returns -EFAULT (some data may have been copied
+ * and the trailing NUL added).
+ *
+ * If @count is smaller than the length of the string, copies @count-1 bytes,
+ * sets the last byte of @dst buffer to NUL and returns @count.
+ */
+long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr,
+ long count)
+{
+ mm_segment_t old_fs = get_fs();
+ long ret;
+
+ if (unlikely(count <= 0))
+ return 0;
+
+ set_fs(USER_DS);
+ pagefault_disable();
+ ret = strncpy_from_user(dst, unsafe_addr, count);
+ pagefault_enable();
+ set_fs(old_fs);
+
+ if (ret >= count) {
+ ret = count;
+ dst[ret - 1] = '\0';
+ } else if (ret > 0) {
+ ret++;
+ }
+
+ return ret;
+}
+
+/**
+ * strnlen_unsafe_user: - Get the size of a user string INCLUDING final NUL.
+ * @unsafe_addr: The string to measure.
+ * @count: Maximum count (including NUL)
+ *
+ * Get the size of a NUL-terminated string in user space without pagefault.
+ *
+ * Returns the size of the string INCLUDING the terminating NUL.
+ *
+ * If the string is too long, returns a number larger than @count. User
+ * has to check the return value against "> count".
+ * On exception (or invalid count), returns 0.
+ *
+ * Unlike strnlen_user, this can be used from IRQ handler etc. because
+ * it disables pagefaults.
+ */
+long strnlen_unsafe_user(const void __user *unsafe_addr, long count)
+{
+ mm_segment_t old_fs = get_fs();
+ int ret;
+
+ set_fs(USER_DS);
+ pagefault_disable();
+ ret = strnlen_user(unsafe_addr, count);
+ pagefault_enable();
+ set_fs(old_fs);
+
+ return ret;
+}
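
strncpy_from_unsafe_user() above wraps strncpy_from_user() and then massages the result into its documented contract: on success the returned length includes the trailing NUL, and on truncation the buffer is force-terminated and @count is returned. The sketch below shows just that fixup; fixup_unsafe_copy() and the plain strncpy()/strlen() stand in for the pagefault-safe user access, so this is illustrative only.

#include <stdio.h>
#include <string.h>

/* 'raw' models what strncpy_from_user() returns: the copied length without
 * the NUL, or a value >= count when the string did not fit. */
static long fixup_unsafe_copy(char *dst, const char *src, long count)
{
	long raw;

	if (count <= 0)
		return 0;

	strncpy(dst, src, count);	/* stand-in for the faulting user copy */
	raw = (long)strlen(src);

	if (raw >= count) {
		raw = count;
		dst[raw - 1] = '\0';	/* force termination, report count */
	} else if (raw > 0) {
		raw++;			/* count the trailing NUL as documented */
	}
	return raw;
}

int main(void)
{
	char buf[8];

	printf("%ld \"%s\"\n", fixup_unsafe_copy(buf, "hi", sizeof(buf)), buf);
	printf("%ld \"%s\"\n", fixup_unsafe_copy(buf, "much-too-long", sizeof(buf)), buf);
	return 0;
}
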
diff --git a/mm/madvise.c b/mm/madvise.c
index ae56d0ef337d..9c43c8515cef 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -308,9 +308,9 @@ static long madvise_willneed(struct vm_area_struct *vma,
*/
*prev = NULL; /* tell sys_madvise we drop mmap_sem */
get_file(file);
- up_read(&current->mm->mmap_sem);
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ up_read(&current->mm->mmap_sem);
vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
fput(file);
down_read(&current->mm->mmap_sem);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3ef243948993..0172f7557e08 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -61,6 +61,7 @@
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"
+#include <linux/locallock.h>
#include <linux/uaccess.h>
@@ -86,6 +87,8 @@ int do_swap_account __read_mostly;
#define do_swap_account 0
#endif
+static DEFINE_LOCAL_IRQ_LOCK(event_lock);
+
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
@@ -411,8 +414,10 @@ int memcg_expand_shrinker_maps(int new_id)
if (mem_cgroup_is_root(memcg))
continue;
ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
- if (ret)
+ if (ret) {
+ mem_cgroup_iter_break(NULL, memcg);
goto unlock;
+ }
}
unlock:
if (!ret)
@@ -938,7 +943,7 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
if (unlikely(!memcg))
memcg = root_mem_cgroup;
}
- } while (!css_tryget_online(&memcg->css));
+ } while (!css_tryget(&memcg->css));
rcu_read_unlock();
return memcg;
}
@@ -2184,7 +2189,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
* as well as workers from this path always operate on the local
* per-cpu data. CPU up doesn't touch memcg_stock at all.
*/
- curcpu = get_cpu();
+ curcpu = get_cpu_light();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *memcg;
@@ -2204,7 +2209,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
}
css_put(&memcg->css);
}
- put_cpu();
+ put_cpu_light();
mutex_unlock(&percpu_charge_mutex);
}
@@ -2331,6 +2336,15 @@ retry:
}
/*
+ * Memcg doesn't have a dedicated reserve for atomic
+ * allocations. But like the global atomic pool, we need to
+ * put the burden of reclaim on regular allocation requests
+ * and let these go through as privileged allocations.
+ */
+ if (gfp_mask & __GFP_ATOMIC)
+ goto force;
+
+ /*
* Unlike in global OOM situations, memcg is not in a physical
* memory shortage. Allow dying and OOM-killed tasks to
* bypass the last charges so that they can exit quickly and
@@ -3879,7 +3893,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
unsigned long usage;
- int i, j, size;
+ int i, j, size, entries;
mutex_lock(&memcg->thresholds_lock);
@@ -3899,14 +3913,20 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
/* Calculate new number of threshold */
- size = 0;
+ size = entries = 0;
for (i = 0; i < thresholds->primary->size; i++) {
if (thresholds->primary->entries[i].eventfd != eventfd)
size++;
+ else
+ entries++;
}
new = thresholds->spare;
+ /* If no items related to eventfd have been cleared, nothing to do */
+ if (!entries)
+ goto unlock;
+
/* Set thresholds array to NULL if we don't have thresholds */
if (!size) {
kfree(new);
@@ -4613,12 +4633,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- /*
- * Flush percpu vmstats and vmevents to guarantee the value correctness
- * on parent's and all ancestor levels.
- */
- memcg_flush_percpu_vmstats(memcg);
- memcg_flush_percpu_vmevents(memcg);
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
@@ -4629,6 +4643,12 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
memcg_wb_domain_exit(memcg);
+ /*
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
+ * on parent's and all ancestor levels.
+ */
+ memcg_flush_percpu_vmstats(memcg);
+ memcg_flush_percpu_vmevents(memcg);
__mem_cgroup_free(memcg);
}
@@ -4637,19 +4657,22 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
struct mem_cgroup *memcg;
unsigned int size;
int node;
+ long error = -ENOMEM;
size = sizeof(struct mem_cgroup);
size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
memcg = kzalloc(size, GFP_KERNEL);
if (!memcg)
- return NULL;
+ return ERR_PTR(error);
memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
1, MEM_CGROUP_ID_MAX,
GFP_KERNEL);
- if (memcg->id.id < 0)
+ if (memcg->id.id < 0) {
+ error = memcg->id.id;
goto fail;
+ }
memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
if (!memcg->vmstats_local)
@@ -4686,7 +4709,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
fail:
mem_cgroup_id_remove(memcg);
__mem_cgroup_free(memcg);
- return NULL;
+ return ERR_PTR(error);
}
static struct cgroup_subsys_state * __ref
@@ -4697,8 +4720,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
long error = -ENOMEM;
memcg = mem_cgroup_alloc();
- if (!memcg)
- return ERR_PTR(error);
+ if (IS_ERR(memcg))
+ return ERR_CAST(memcg);
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -4745,7 +4768,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
fail:
mem_cgroup_id_remove(memcg);
mem_cgroup_free(memcg);
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(error);
}
static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
@@ -5063,12 +5086,12 @@ static int mem_cgroup_move_account(struct page *page,
ret = 0;
- local_irq_disable();
+ local_lock_irq(event_lock);
mem_cgroup_charge_statistics(to, page, compound, nr_pages);
memcg_check_events(to, page);
mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
memcg_check_events(from, page);
- local_irq_enable();
+ local_unlock_irq(event_lock);
out_unlock:
unlock_page(page);
out:
@@ -5280,7 +5303,6 @@ static void __mem_cgroup_clear_mc(void)
if (!mem_cgroup_is_root(mc.to))
page_counter_uncharge(&mc.to->memory, mc.moved_swap);
- mem_cgroup_id_get_many(mc.to, mc.moved_swap);
css_put_many(&mc.to->css, mc.moved_swap);
mc.moved_swap = 0;
@@ -5471,7 +5493,8 @@ put: /* get_mctgt_type() gets the page */
ent = target.ent;
if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
mc.precharge--;
- /* we fixup refcnts and charges later. */
+ mem_cgroup_id_get_many(mc.to, 1);
+ /* we fixup other refcnts and charges later. */
mc.moved_swap++;
}
break;
@@ -6182,10 +6205,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
commit_charge(page, memcg, lrucare);
- local_irq_disable();
+ local_lock_irq(event_lock);
mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
memcg_check_events(memcg, page);
- local_irq_enable();
+ local_unlock_irq(event_lock);
if (do_memsw_account() && PageSwapCache(page)) {
swp_entry_t entry = { .val = page_private(page) };
@@ -6254,7 +6277,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
memcg_oom_recover(ug->memcg);
}
- local_irq_save(flags);
+ local_lock_irqsave(event_lock, flags);
__mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
__mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
__mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
@@ -6262,7 +6285,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages);
memcg_check_events(ug->memcg, ug->dummy_page);
- local_irq_restore(flags);
+ local_unlock_irqrestore(event_lock, flags);
if (!mem_cgroup_is_root(ug->memcg))
css_put_many(&ug->memcg->css, nr_pages);
@@ -6425,10 +6448,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
commit_charge(newpage, memcg, false);
- local_irq_save(flags);
+ local_lock_irqsave(event_lock, flags);
mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
memcg_check_events(memcg, newpage);
- local_irq_restore(flags);
+ local_unlock_irqrestore(event_lock, flags);
}
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
@@ -6441,19 +6464,9 @@ void mem_cgroup_sk_alloc(struct sock *sk)
if (!mem_cgroup_sockets_enabled)
return;
- /*
- * Socket cloning can throw us here with sk_memcg already
- * filled. It won't however, necessarily happen from
- * process context. So the test for root memcg given
- * the current task's memcg won't help us in this case.
- *
- * Respecting the original socket's memcg is a better
- * decision in this case.
- */
- if (sk->sk_memcg) {
- css_get(&sk->sk_memcg->css);
+ /* Do not associate the sock with unrelated interrupted task's memcg. */
+ if (in_interrupt())
return;
- }
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
@@ -6620,6 +6633,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
struct mem_cgroup *memcg, *swap_memcg;
unsigned int nr_entries;
unsigned short oldid;
+ unsigned long flags;
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
@@ -6665,13 +6679,17 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* important here to have the interrupts disabled because it is the
* only synchronisation we have for updating the per-CPU variables.
*/
+ local_lock_irqsave(event_lock, flags);
+#ifndef CONFIG_PREEMPT_RT_BASE
VM_BUG_ON(!irqs_disabled());
+#endif
mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
-nr_entries);
memcg_check_events(memcg, page);
if (!mem_cgroup_is_root(memcg))
css_put_many(&memcg->css, nr_entries);
+ local_unlock_irqrestore(event_lock, flags);
}
/**
diff --git a/mm/memory.c b/mm/memory.c
index b0efc69b2634..339f8ba20470 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2257,10 +2257,11 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
*
* The function expects the page to be locked and unlocks it.
*/
-static void fault_dirty_shared_page(struct vm_area_struct *vma,
- struct page *page)
+static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
{
+ struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping;
+ struct page *page = vmf->page;
bool dirtied;
bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
@@ -2275,16 +2276,30 @@ static void fault_dirty_shared_page(struct vm_area_struct *vma,
mapping = page_rmapping(page);
unlock_page(page);
+ if (!page_mkwrite)
+ file_update_time(vma->vm_file);
+
+ /*
+ * Throttle page dirtying rate down to writeback speed.
+ *
+ * mapping may be NULL here because some device drivers do not
+ * set page.mapping but still dirty their pages
+ *
+ * Drop the mmap_sem before waiting on IO, if we can. The file
+ * is pinning the mapping, as per above.
+ */
if ((dirtied || page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping
- * but still dirty their pages
- */
+ struct file *fpin;
+
+ fpin = maybe_unlock_mmap_for_io(vmf, NULL);
balance_dirty_pages_ratelimited(mapping);
+ if (fpin) {
+ fput(fpin);
+ return VM_FAULT_RETRY;
+ }
}
- if (!page_mkwrite)
- file_update_time(vma->vm_file);
+ return 0;
}
/*
@@ -2527,6 +2542,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
+ vm_fault_t ret = VM_FAULT_WRITE;
get_page(vmf->page);
@@ -2550,10 +2566,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
wp_page_reuse(vmf);
lock_page(vmf->page);
}
- fault_dirty_shared_page(vma, vmf->page);
+ ret |= fault_dirty_shared_page(vmf);
put_page(vmf->page);
- return VM_FAULT_WRITE;
+ return ret;
}
/*
@@ -3615,7 +3631,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
return ret;
}
- fault_dirty_shared_page(vma, vmf->page);
+ ret |= fault_dirty_shared_page(vmf);
return ret;
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 77d1f69cdead..e59a1c6e9e01 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -324,12 +324,8 @@ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
unsigned long end_pfn)
{
- struct mem_section *ms;
-
for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(start_pfn);
-
- if (unlikely(!valid_section(ms)))
+ if (unlikely(!pfn_to_online_page(start_pfn)))
continue;
if (unlikely(pfn_to_nid(start_pfn) != nid))
@@ -349,15 +345,12 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
unsigned long end_pfn)
{
- struct mem_section *ms;
unsigned long pfn;
/* pfn is the end pfn of a memory section. */
pfn = end_pfn - 1;
for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
-
- if (unlikely(!valid_section(ms)))
+ if (unlikely(!pfn_to_online_page(pfn)))
continue;
if (unlikely(pfn_to_nid(pfn) != nid))
@@ -379,7 +372,6 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
unsigned long zone_end_pfn = z;
unsigned long pfn;
- struct mem_section *ms;
int nid = zone_to_nid(zone);
zone_span_writelock(zone);
@@ -417,9 +409,7 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
*/
pfn = zone_start_pfn;
for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
-
- if (unlikely(!valid_section(ms)))
+ if (unlikely(!pfn_to_online_page(pfn)))
continue;
if (page_zone(pfn_to_page(pfn)) != zone)
@@ -440,70 +430,33 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
zone_span_writeunlock(zone);
}
-static void shrink_pgdat_span(struct pglist_data *pgdat,
- unsigned long start_pfn, unsigned long end_pfn)
+static void update_pgdat_span(struct pglist_data *pgdat)
{
- unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
- unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
- unsigned long pgdat_end_pfn = p;
- unsigned long pfn;
- struct mem_section *ms;
- int nid = pgdat->node_id;
-
- if (pgdat_start_pfn == start_pfn) {
- /*
- * If the section is smallest section in the pgdat, it need
- * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
- * In this case, we find second smallest valid mem_section
- * for shrinking zone.
- */
- pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
- pgdat_end_pfn);
- if (pfn) {
- pgdat->node_start_pfn = pfn;
- pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
- }
- } else if (pgdat_end_pfn == end_pfn) {
- /*
- * If the section is biggest section in the pgdat, it need
- * shrink pgdat->node_spanned_pages.
- * In this case, we find second biggest valid mem_section for
- * shrinking zone.
- */
- pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
- start_pfn);
- if (pfn)
- pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
- }
-
- /*
- * If the section is not biggest or smallest mem_section in the pgdat,
- * it only creates a hole in the pgdat. So in this case, we need not
- * change the pgdat.
- * But perhaps, the pgdat has only hole data. Thus it check the pgdat
- * has only hole or not.
- */
- pfn = pgdat_start_pfn;
- for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
+ unsigned long node_start_pfn = 0, node_end_pfn = 0;
+ struct zone *zone;
- if (unlikely(!valid_section(ms)))
- continue;
+ for (zone = pgdat->node_zones;
+ zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
+ unsigned long zone_end_pfn = zone->zone_start_pfn +
+ zone->spanned_pages;
- if (pfn_to_nid(pfn) != nid)
+ /* No need to lock the zones, they can't change. */
+ if (!zone->spanned_pages)
continue;
-
- /* If the section is current section, it continues the loop */
- if (start_pfn == pfn)
+ if (!node_end_pfn) {
+ node_start_pfn = zone->zone_start_pfn;
+ node_end_pfn = zone_end_pfn;
continue;
+ }
- /* If we find valid section, we have nothing to do */
- return;
+ if (zone_end_pfn > node_end_pfn)
+ node_end_pfn = zone_end_pfn;
+ if (zone->zone_start_pfn < node_start_pfn)
+ node_start_pfn = zone->zone_start_pfn;
}
- /* The pgdat has no valid section */
- pgdat->node_start_pfn = 0;
- pgdat->node_spanned_pages = 0;
+ pgdat->node_start_pfn = node_start_pfn;
+ pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
}
static void __remove_zone(struct zone *zone, unsigned long start_pfn)
@@ -512,9 +465,19 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
int nr_pages = PAGES_PER_SECTION;
unsigned long flags;
+#ifdef CONFIG_ZONE_DEVICE
+ /*
+ * Zone shrinking code cannot properly deal with ZONE_DEVICE. So
+ * we will not try to shrink the zones - which is okay as
+ * set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
+ */
+ if (zone_idx(zone) == ZONE_DEVICE)
+ return;
+#endif
+
pgdat_resize_lock(zone->zone_pgdat, &flags);
shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
- shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
+ update_pgdat_span(pgdat);
pgdat_resize_unlock(zone->zone_pgdat, &flags);
}
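
The replacement of shrink_pgdat_span() with update_pgdat_span() above reduces the node-span bookkeeping to one rule: the node spans the union (lowest start, highest end) of its populated zones, and empty zones are skipped. A self-contained sketch of that computation; struct zone_span and the sample numbers are invented for illustration.

#include <stdio.h>

struct zone_span {
	unsigned long start_pfn;
	unsigned long spanned_pages;	/* 0 means the zone is empty */
};

static void update_node_span(const struct zone_span *zones, int nr,
			     unsigned long *node_start, unsigned long *node_pages)
{
	unsigned long start = 0, end = 0;

	for (int i = 0; i < nr; i++) {
		unsigned long zend = zones[i].start_pfn + zones[i].spanned_pages;

		if (!zones[i].spanned_pages)
			continue;		/* skip empty zones */
		if (!end) {			/* first populated zone */
			start = zones[i].start_pfn;
			end = zend;
			continue;
		}
		if (zend > end)
			end = zend;
		if (zones[i].start_pfn < start)
			start = zones[i].start_pfn;
	}
	*node_start = start;
	*node_pages = end - start;		/* 0/0 if every zone was empty */
}

int main(void)
{
	struct zone_span zones[] = {
		{ .start_pfn = 0x1000, .spanned_pages = 0x800 },
		{ .start_pfn = 0,      .spanned_pages = 0     },	/* empty */
		{ .start_pfn = 0x4000, .spanned_pages = 0x400 },
	};
	unsigned long start, pages;

	update_node_span(zones, 3, &start, &pages);
	printf("node: start_pfn=%#lx spanned=%#lx\n", start, pages);
	return 0;
}
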
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ca3f443c8fc1..b28d0616f002 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -666,7 +666,9 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
* 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
* specified.
* 0 - queue pages successfully or no misplaced page.
- * -EIO - there is misplaced page and only MPOL_MF_STRICT was specified.
+ * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
+ * memory range specified by nodemask and maxnode points outside
+ * your accessible address space (-EFAULT)
*/
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
@@ -1287,7 +1289,7 @@ static long do_mbind(unsigned long start, unsigned long len,
flags | MPOL_MF_INVERT, &pagelist);
if (ret < 0) {
- err = -EIO;
+ err = ret;
goto up_out;
}
@@ -1306,10 +1308,12 @@ static long do_mbind(unsigned long start, unsigned long len,
if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
err = -EIO;
- } else
- putback_movable_pages(&pagelist);
-
+ } else {
up_out:
+ if (!list_empty(&pagelist))
+ putback_movable_pages(&pagelist);
+ }
+
up_write(&mm->mmap_sem);
mpol_out:
mpol_put(new);
@@ -2788,6 +2792,9 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
char *flags = strchr(str, '=');
int err = 1, mode;
+ if (flags)
+ *flags++ = '\0'; /* terminate mode string */
+
if (nodelist) {
/* NUL-terminate mode or flags string */
*nodelist++ = '\0';
@@ -2798,9 +2805,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
} else
nodes_clear(nodes);
- if (flags)
- *flags++ = '\0'; /* terminate mode string */
-
mode = match_string(policy_modes, MPOL_MAX, str);
if (mode < 0)
goto out;
@@ -2808,7 +2812,9 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
switch (mode) {
case MPOL_PREFERRED:
/*
- * Insist on a nodelist of one node only
+ * Insist on a nodelist of one node only, although later
+ * we use first_node(nodes) to grab a single node, so here
+ * nodelist (or nodes) cannot be empty.
*/
if (nodelist) {
char *rest = nodelist;
@@ -2816,6 +2822,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
rest++;
if (*rest)
goto out;
+ if (nodes_empty(nodes))
+ goto out;
}
break;
case MPOL_INTERLEAVE:
diff --git a/mm/migrate.c b/mm/migrate.c
index dbb3b5bee4ee..8631d8d3730c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1522,9 +1522,11 @@ static int do_move_pages_to_node(struct mm_struct *mm,
/*
* Resolves the given address to a struct page, isolates it from the LRU and
* puts it to the given pagelist.
- * Returns -errno if the page cannot be found/isolated or 0 when it has been
- * queued or the page doesn't need to be migrated because it is already on
- * the target node
+ * Returns:
+ * errno - if the page cannot be found/isolated
+ * 0 - when it doesn't have to be migrated because it is already on the
+ * target node
+ * 1 - when it has been queued
*/
static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
int node, struct list_head *pagelist, bool migrate_all)
@@ -1563,7 +1565,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
if (PageHuge(page)) {
if (PageHead(page)) {
isolate_huge_page(page, pagelist);
- err = 0;
+ err = 1;
}
} else {
struct page *head;
@@ -1573,7 +1575,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
if (err)
goto out_putpage;
- err = 0;
+ err = 1;
list_add_tail(&head->lru, pagelist);
mod_node_page_state(page_pgdat(head),
NR_ISOLATED_ANON + page_is_file_cache(head),
@@ -1635,8 +1637,19 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
start = i;
} else if (node != current_node) {
err = do_move_pages_to_node(mm, &pagelist, current_node);
- if (err)
+ if (err) {
+ /*
+ * Positive err means the number of failed
+ * pages to migrate. Since we are going to
+ * abort and return the number of non-migrated
+ * pages, we need to include the rest of the
+ * nr_pages that have not been attempted as
+ * well.
+ */
+ if (err > 0)
+ err += nr_pages - i - 1;
goto out;
+ }
err = store_status(status, start, current_node, i - start);
if (err)
goto out;
@@ -1650,16 +1663,28 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
*/
err = add_page_for_migration(mm, addr, current_node,
&pagelist, flags & MPOL_MF_MOVE_ALL);
- if (!err)
+
+ if (!err) {
+ /* The page is already on the target node */
+ err = store_status(status, i, current_node, 1);
+ if (err)
+ goto out_flush;
continue;
+ } else if (err > 0) {
+ /* The page is successfully queued for migration */
+ continue;
+ }
err = store_status(status, i, err, 1);
if (err)
goto out_flush;
err = do_move_pages_to_node(mm, &pagelist, current_node);
- if (err)
+ if (err) {
+ if (err > 0)
+ err += nr_pages - i - 1;
goto out;
+ }
if (i > start) {
err = store_status(status, start, current_node, i - start);
if (err)
@@ -1673,9 +1698,16 @@ out_flush:
/* Make sure we do not overwrite the existing error */
err1 = do_move_pages_to_node(mm, &pagelist, current_node);
+ /*
+ * Don't have to report non-attempted pages here since:
+ * - If the above loop is done gracefully all pages have been
+ * attempted.
+ * - If the above loop is aborted it means a fatal error
+ * happened, so ret should be returned.
+ */
if (!err1)
err1 = store_status(status, start, current_node, i - start);
- if (!err)
+ if (err >= 0)
err = err1;
out:
return err;
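
The do_pages_move() hunks above settle on the convention that a positive return value is the number of pages that were not migrated, which is why an early abort adds the pages that were never even attempted. The arithmetic in isolation, with a toy move_one() that reports one failed page; all names here are illustrative.

#include <stdio.h>

/* pretend that moving page 5 reports one failed page */
static int move_one(int i)
{
	return i == 5 ? 1 : 0;
}

static int move_pages(int nr_pages)
{
	int err = 0;

	for (int i = 0; i < nr_pages; i++) {
		err = move_one(i);
		if (err > 0) {
			/* pages i+1 .. nr_pages-1 were never attempted */
			err += nr_pages - i - 1;
			break;
		}
	}
	return err;
}

int main(void)
{
	/* 1 failed + 10 never attempted = 11 non-migrated pages */
	printf("non-migrated pages: %d\n", move_pages(16));
	return 0;
}
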
diff --git a/mm/mmap.c b/mm/mmap.c
index d8a903f61262..bc841e98bb92 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -90,12 +90,6 @@ static void unmap_region(struct mm_struct *mm,
* MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
* w: (no) no w: (no) no w: (copy) copy w: (no) no
* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
- *
- * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
- * MAP_PRIVATE:
- * r: (no) no
- * w: (no) no
- * x: (yes) yes
*/
pgprot_t protection_map[16] __ro_after_init = {
__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
@@ -2617,7 +2611,7 @@ static void unmap_region(struct mm_struct *mm,
* Create a list of vma's touched by the unmap, removing them from the mm's
* vma list as we go..
*/
-static void
+static bool
detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, unsigned long end)
{
@@ -2642,6 +2636,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
/* Kill the cache */
vmacache_invalidate(mm);
+
+ /*
+ * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
+ * VM_GROWSUP VMA. Such VMAs can change their size under
+ * down_read(mmap_lock) and collide with the VMA we are about to unmap.
+ */
+ if (vma && (vma->vm_flags & VM_GROWSDOWN))
+ return false;
+ if (prev && (prev->vm_flags & VM_GROWSUP))
+ return false;
+ return true;
}
/*
@@ -2822,7 +2827,8 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
}
/* Detach vmas from rbtree */
- detach_vmas_to_be_unmapped(mm, vma, prev, end);
+ if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
+ downgrade = false;
if (downgrade)
downgrade_write(&mm->mmap_sem);
@@ -3171,6 +3177,7 @@ void exit_mmap(struct mm_struct *mm)
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
vma = remove_vma(vma);
+ cond_resched();
}
vm_unacct_memory(nr_accounted);
}
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 3e612ae748e9..d0ccc070979f 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -25,6 +25,7 @@ void use_mm(struct mm_struct *mm)
struct task_struct *tsk = current;
task_lock(tsk);
+ preempt_disable_rt();
active_mm = tsk->active_mm;
if (active_mm != mm) {
mmgrab(mm);
@@ -32,6 +33,7 @@ void use_mm(struct mm_struct *mm)
}
tsk->mm = mm;
switch_mm(active_mm, mm, tsk);
+ preempt_enable_rt();
task_unlock(tsk);
#ifdef finish_arch_post_lock_switch
finish_arch_post_lock_switch();
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 8c943a6e1696..a490ff445644 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -102,14 +102,14 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
*/
static inline void tlb_table_invalidate(struct mmu_gather *tlb)
{
-#ifndef CONFIG_HAVE_RCU_TABLE_NO_INVALIDATE
- /*
- * Invalidate page-table caches used by hardware walkers. Then we still
- * need to RCU-sched wait while freeing the pages because software
- * walkers can still be in-flight.
- */
- tlb_flush_mmu_tlbonly(tlb);
-#endif
+ if (tlb_needs_table_invalidate()) {
+ /*
+ * Invalidate page-table caches used by hardware walkers. Then
+ * we still need to RCU-sched wait while freeing the pages
+ * because software walkers can still be in-flight.
+ */
+ tlb_flush_mmu_tlbonly(tlb);
+ }
}
static void tlb_remove_table_smp_sync(void *arg)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index bf38dfbbb4b4..c5add7c033a0 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -161,6 +161,31 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
return pages;
}
+/*
+ * Used when setting automatic NUMA hinting protection where it is
+ * critical that a numa hinting PMD is not confused with a bad PMD.
+ */
+static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
+{
+ pmd_t pmdval = pmd_read_atomic(pmd);
+
+ /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ barrier();
+#endif
+
+ if (pmd_none(pmdval))
+ return 1;
+ if (pmd_trans_huge(pmdval))
+ return 0;
+ if (unlikely(pmd_bad(pmdval))) {
+ pmd_clear_bad(pmd);
+ return 1;
+ }
+
+ return 0;
+}
+
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pud_t *pud, unsigned long addr, unsigned long end,
pgprot_t newprot, int dirty_accountable, int prot_numa)
@@ -178,8 +203,17 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
unsigned long this_pages;
next = pmd_addr_end(addr, end);
- if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
- && pmd_none_or_clear_bad(pmd))
+
+ /*
+ * Automatic NUMA balancing walks the tables with mmap_sem
+ * held for read. It's possible for a parallel update to occur
+ * between pmd_trans_huge() and a pmd_none_or_clear_bad()
+ * check leading to a false positive and clearing.
+ * Hence, it's necessary to atomically read the PMD value
+ * for all the checks.
+ */
+ if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) &&
+ pmd_none_or_clear_bad_unless_trans_huge(pmd))
goto next;
/* invoke the mmu notifier if the pmd is populated */
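
pmd_none_or_clear_bad_unless_trans_huge() above takes a single atomic snapshot of the PMD and runs every check against that snapshot, so a concurrent transition between two of the checks cannot be misread as a bad entry and cleared. A userspace sketch of the same discipline using C11 atomics; the entry encoding (ENTRY_NONE / ENTRY_HUGE) is made up.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define ENTRY_NONE	0u
#define ENTRY_HUGE	0x80000000u

static _Atomic uint32_t shared_entry = ENTRY_HUGE;

/* returns 1 if the caller should skip this entry, 0 if it must handle it */
static int skip_unless_huge(void)
{
	/* one atomic snapshot; never re-read shared_entry below this point */
	uint32_t val = atomic_load_explicit(&shared_entry, memory_order_relaxed);

	if (val == ENTRY_NONE)
		return 1;
	if (val & ENTRY_HUGE)
		return 0;	/* huge entry: the caller handles it */
	/* anything else would be treated as bad and cleared by the caller */
	return 1;
}

int main(void)
{
	printf("skip=%d\n", skip_unless_huge());
	return 0;
}
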
diff --git a/mm/mremap.c b/mm/mremap.c
index fc241d23cd97..8b6edb259495 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -266,7 +266,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
if (!new_pmd)
break;
- if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) {
+ if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || pmd_devmap(*old_pmd)) {
if (extent == HPAGE_PMD_SIZE) {
bool moved;
/* See comment in move_ptes() */
diff --git a/mm/nommu.c b/mm/nommu.c
index d7e4abcaca3e..690c26ac8b27 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -447,10 +447,14 @@ void vm_unmap_aliases(void)
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
/*
- * Implement a stub for vmalloc_sync_all() if the architecture chose not to
- * have one.
+ * Implement a stub for vmalloc_sync_[un]mapping() if the architecture
+ * chose not to have one.
*/
-void __weak vmalloc_sync_all(void)
+void __weak vmalloc_sync_mappings(void)
+{
+}
+
+void __weak vmalloc_sync_unmappings(void)
{
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bdbe8b6b1225..dde05d75c471 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -201,11 +201,11 @@ static void wb_min_max_ratio(struct bdi_writeback *wb,
if (this_bw < tot_bw) {
if (min) {
min *= this_bw;
- do_div(min, tot_bw);
+ min = div64_ul(min, tot_bw);
}
if (max < 100) {
max *= this_bw;
- do_div(max, tot_bw);
+ max = div64_ul(max, tot_bw);
}
}
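
The do_div() to div64_ul() switch above matters because do_div() divides a 64-bit value by a 32-bit divisor: once the summed writeback bandwidth no longer fits in 32 bits, the divisor is silently truncated and the computed min/max ratios come out far too large. The effect can be reproduced in userspace by forcing the truncation with a cast; the numbers below are arbitrary.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t min_ratio = 10;		/* percent */
	uint64_t this_bw   = 3ull << 30;	/* ~3G */
	uint64_t tot_bw    = 6ull << 30;	/* ~6G: does not fit in 32 bits */

	uint64_t scaled = min_ratio * this_bw;

	uint64_t wrong = scaled / (uint32_t)tot_bw;	/* 64/32 division, divisor truncated */
	uint64_t right = scaled / tot_bw;		/* full 64/64 division */

	printf("truncated divisor: %llu, full divisor: %llu\n",
	       (unsigned long long)wrong, (unsigned long long)right);
	return 0;
}

With the truncated divisor the ratio comes out as 15 instead of the expected 5.
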
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8b6a295fdb02..3a777cff1bea 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -62,6 +62,7 @@
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
#include <linux/sched/mm.h>
+#include <linux/locallock.h>
#include <linux/page_owner.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
@@ -311,6 +312,18 @@ EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
+static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
+
+#ifdef CONFIG_PREEMPT_RT_BASE
+# define cpu_lock_irqsave(cpu, flags) \
+ local_lock_irqsave_on(pa_lock, flags, cpu)
+# define cpu_unlock_irqrestore(cpu, flags) \
+ local_unlock_irqrestore_on(pa_lock, flags, cpu)
+#else
+# define cpu_lock_irqsave(cpu, flags) local_irq_save(flags)
+# define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags)
+#endif
+
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -1188,7 +1201,7 @@ static inline void prefetch_buddy(struct page *page)
}
/*
- * Frees a number of pages from the PCP lists
+ * Frees a number of pages which have been collected from the pcp lists.
* Assumes all pages on list are in same zone, and of same order.
* count is the number of pages to free.
*
@@ -1198,15 +1211,57 @@ static inline void prefetch_buddy(struct page *page)
* And clear the zone's pages_scanned counter, to hold off the "all pages are
* pinned" detection logic.
*/
-static void free_pcppages_bulk(struct zone *zone, int count,
- struct per_cpu_pages *pcp)
+static void free_pcppages_bulk(struct zone *zone, struct list_head *head,
+ bool zone_retry)
+{
+ bool isolated_pageblocks;
+ struct page *page, *tmp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ isolated_pageblocks = has_isolate_pageblock(zone);
+
+ /*
+ * Use safe version since after __free_one_page(),
+ * page->lru.next will not point to original list.
+ */
+ list_for_each_entry_safe(page, tmp, head, lru) {
+ int mt = get_pcppage_migratetype(page);
+
+ if (page_zone(page) != zone) {
+ /*
+ * free_unref_page_list() sorts pages by zone. If we end
+ * up with pages from different NUMA nodes belonging
+ * to the same ZONE index then we need to redo with the
+ * correct ZONE pointer. Skip the page for now, redo it
+ * on the next iteration.
+ */
+ WARN_ON_ONCE(zone_retry == false);
+ if (zone_retry)
+ continue;
+ }
+
+ /* MIGRATE_ISOLATE page should not go to pcplists */
+ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
+ /* Pageblock could have been isolated meanwhile */
+ if (unlikely(isolated_pageblocks))
+ mt = get_pageblock_migratetype(page);
+
+ list_del(&page->lru);
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt);
+ trace_mm_page_pcpu_drain(page, 0, mt);
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+static void isolate_pcp_pages(int count, struct per_cpu_pages *pcp,
+ struct list_head *dst)
+
{
int migratetype = 0;
int batch_free = 0;
int prefetch_nr = 0;
- bool isolated_pageblocks;
- struct page *page, *tmp;
- LIST_HEAD(head);
+ struct page *page;
while (count) {
struct list_head *list;
@@ -1238,7 +1293,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
if (bulkfree_pcp_prepare(page))
continue;
- list_add_tail(&page->lru, &head);
+ list_add_tail(&page->lru, dst);
/*
* We are going to put the page back to the global
@@ -1253,26 +1308,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
prefetch_buddy(page);
} while (--count && --batch_free && !list_empty(list));
}
-
- spin_lock(&zone->lock);
- isolated_pageblocks = has_isolate_pageblock(zone);
-
- /*
- * Use safe version since after __free_one_page(),
- * page->lru.next will not point to original list.
- */
- list_for_each_entry_safe(page, tmp, &head, lru) {
- int mt = get_pcppage_migratetype(page);
- /* MIGRATE_ISOLATE page should not go to pcplists */
- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
- /* Pageblock could have been isolated meanwhile */
- if (unlikely(isolated_pageblocks))
- mt = get_pageblock_migratetype(page);
-
- __free_one_page(page, page_to_pfn(page), zone, 0, mt);
- trace_mm_page_pcpu_drain(page, 0, mt);
- }
- spin_unlock(&zone->lock);
}
static void free_one_page(struct zone *zone,
@@ -1373,10 +1408,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
return;
migratetype = get_pfnblock_migratetype(page, pfn);
- local_irq_save(flags);
+ local_lock_irqsave(pa_lock, flags);
__count_vm_events(PGFREE, 1 << order);
free_one_page(page_zone(page), page, pfn, order, migratetype);
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
}
void __free_pages_core(struct page *page, unsigned int order)
@@ -1507,6 +1542,7 @@ void set_zone_contiguous(struct zone *zone)
if (!__pageblock_pfn_to_page(block_start_pfn,
block_end_pfn, zone))
return;
+ cond_resched();
}
/* We confirm that there is no hole */
@@ -1591,7 +1627,6 @@ static void __init deferred_free_pages(unsigned long pfn,
} else if (!(pfn & nr_pgmask)) {
deferred_free_range(pfn - nr_free, nr_free);
nr_free = 1;
- touch_nmi_watchdog();
} else {
nr_free++;
}
@@ -1621,7 +1656,6 @@ static unsigned long __init deferred_init_pages(struct zone *zone,
continue;
} else if (!page || !(pfn & nr_pgmask)) {
page = pfn_to_page(pfn);
- touch_nmi_watchdog();
} else {
page++;
}
@@ -1744,6 +1778,13 @@ static int __init deferred_init_memmap(void *data)
BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
pgdat->first_deferred_pfn = ULONG_MAX;
+ /*
+ * Once we unlock here, the zone cannot be grown anymore, thus if an
+ * interrupt thread must allocate this early in boot, zone must be
+ * pre-grown prior to start of deferred page initialization.
+ */
+ pgdat_resize_unlock(pgdat, &flags);
+
/* Only the highest zone is deferred so find it */
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
zone = pgdat->node_zones + zid;
@@ -1761,11 +1802,11 @@ static int __init deferred_init_memmap(void *data)
* that we can avoid introducing any issues with the buddy
* allocator.
*/
- while (spfn < epfn)
+ while (spfn < epfn) {
nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ cond_resched();
+ }
zone_empty:
- pgdat_resize_unlock(pgdat, &flags);
-
/* Sanity check that the next zone really is unpopulated */
WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
@@ -1808,17 +1849,6 @@ deferred_grow_zone(struct zone *zone, unsigned int order)
pgdat_resize_lock(pgdat, &flags);
/*
- * If deferred pages have been initialized while we were waiting for
- * the lock, return true, as the zone was grown. The caller will retry
- * this zone. We won't return to this function since the caller also
- * has this static branch.
- */
- if (!static_branch_unlikely(&deferred_pages)) {
- pgdat_resize_unlock(pgdat, &flags);
- return true;
- }
-
- /*
* If someone grew this zone while we were waiting for spinlock, return
* true, as there might be enough pages already.
*/
@@ -1846,6 +1876,7 @@ deferred_grow_zone(struct zone *zone, unsigned int order)
first_deferred_pfn = spfn;
nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ touch_nmi_watchdog();
/* We should only stop along section boundaries */
if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
@@ -1893,6 +1924,14 @@ void __init page_alloc_init_late(void)
wait_for_completion(&pgdat_init_all_done_comp);
/*
+ * The number of managed pages has changed due to the initialisation
+ * so the pcpu batch and high limits need to be updated or the limits
+ * will be artificially small.
+ */
+ for_each_populated_zone(zone)
+ zone_pcp_update(zone);
+
+ /*
* We initialized the rest of the deferred pages. Permanently disable
* on-demand struct page initialization.
*/
@@ -2281,6 +2320,14 @@ static inline void boost_watermark(struct zone *zone)
if (!watermark_boost_factor)
return;
+ /*
+ * Don't bother in zones that are unlikely to produce results.
+ * On small machines, including kdump capture kernels running
+ * in a small area, boosting the watermark can cause an out of
+ * memory situation immediately.
+ */
+ if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
+ return;
max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
watermark_boost_factor, 10000);
@@ -2719,13 +2766,18 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
unsigned long flags;
int to_drain, batch;
+ LIST_HEAD(dst);
- local_irq_save(flags);
+ local_lock_irqsave(pa_lock, flags);
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0)
- free_pcppages_bulk(zone, to_drain, pcp);
- local_irq_restore(flags);
+ isolate_pcp_pages(to_drain, pcp, &dst);
+
+ local_unlock_irqrestore(pa_lock, flags);
+
+ if (to_drain > 0)
+ free_pcppages_bulk(zone, &dst, false);
}
#endif
@@ -2741,14 +2793,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
unsigned long flags;
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
+ LIST_HEAD(dst);
+ int count;
- local_irq_save(flags);
+ cpu_lock_irqsave(cpu, flags);
pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
- if (pcp->count)
- free_pcppages_bulk(zone, pcp->count, pcp);
- local_irq_restore(flags);
+ count = pcp->count;
+ if (count)
+ isolate_pcp_pages(count, pcp, &dst);
+
+ cpu_unlock_irqrestore(cpu, flags);
+
+ if (count)
+ free_pcppages_bulk(zone, &dst, false);
}
/*
@@ -2783,6 +2842,7 @@ void drain_local_pages(struct zone *zone)
drain_pages(cpu);
}
+#ifndef CONFIG_PREEMPT_RT_BASE
static void drain_local_pages_wq(struct work_struct *work)
{
struct pcpu_drain *drain;
@@ -2800,6 +2860,7 @@ static void drain_local_pages_wq(struct work_struct *work)
drain_local_pages(drain->zone);
preempt_enable();
}
+#endif
/*
* Spill all the per-cpu pages from all CPUs back into the buddy allocator.
@@ -2867,6 +2928,14 @@ void drain_all_pages(struct zone *zone)
cpumask_clear_cpu(cpu, &cpus_with_pcps);
}
+#ifdef CONFIG_PREEMPT_RT_BASE
+ for_each_cpu(cpu, &cpus_with_pcps) {
+ if (zone)
+ drain_pages_zone(cpu, zone);
+ else
+ drain_pages(cpu);
+ }
+#else
for_each_cpu(cpu, &cpus_with_pcps) {
struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
@@ -2876,6 +2945,7 @@ void drain_all_pages(struct zone *zone)
}
for_each_cpu(cpu, &cpus_with_pcps)
flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
+#endif
mutex_unlock(&pcpu_drain_mutex);
}
@@ -2947,7 +3017,8 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
return true;
}
-static void free_unref_page_commit(struct page *page, unsigned long pfn)
+static void free_unref_page_commit(struct page *page, unsigned long pfn,
+ struct list_head *dst)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
@@ -2976,7 +3047,8 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn)
pcp->count++;
if (pcp->count >= pcp->high) {
unsigned long batch = READ_ONCE(pcp->batch);
- free_pcppages_bulk(zone, batch, pcp);
+
+ isolate_pcp_pages(batch, pcp, dst);
}
}
@@ -2987,13 +3059,17 @@ void free_unref_page(struct page *page)
{
unsigned long flags;
unsigned long pfn = page_to_pfn(page);
+ struct zone *zone = page_zone(page);
+ LIST_HEAD(dst);
if (!free_unref_page_prepare(page, pfn))
return;
- local_irq_save(flags);
- free_unref_page_commit(page, pfn);
- local_irq_restore(flags);
+ local_lock_irqsave(pa_lock, flags);
+ free_unref_page_commit(page, pfn, &dst);
+ local_unlock_irqrestore(pa_lock, flags);
+ if (!list_empty(&dst))
+ free_pcppages_bulk(zone, &dst, false);
}
/*
@@ -3004,6 +3080,11 @@ void free_unref_page_list(struct list_head *list)
struct page *page, *next;
unsigned long flags, pfn;
int batch_count = 0;
+ struct list_head dsts[__MAX_NR_ZONES];
+ int i;
+
+ for (i = 0; i < __MAX_NR_ZONES; i++)
+ INIT_LIST_HEAD(&dsts[i]);
/* Prepare pages for freeing */
list_for_each_entry_safe(page, next, list, lru) {
@@ -3013,25 +3094,42 @@ void free_unref_page_list(struct list_head *list)
set_page_private(page, pfn);
}
- local_irq_save(flags);
+ local_lock_irqsave(pa_lock, flags);
list_for_each_entry_safe(page, next, list, lru) {
unsigned long pfn = page_private(page);
+ enum zone_type type;
set_page_private(page, 0);
trace_mm_page_free_batched(page);
- free_unref_page_commit(page, pfn);
+ type = page_zonenum(page);
+ free_unref_page_commit(page, pfn, &dsts[type]);
/*
* Guard against excessive IRQ disabled times when we get
* a large list of pages to free.
*/
if (++batch_count == SWAP_CLUSTER_MAX) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
batch_count = 0;
- local_irq_save(flags);
+ local_lock_irqsave(pa_lock, flags);
+ }
+ }
+ local_unlock_irqrestore(pa_lock, flags);
+
+ for (i = 0; i < __MAX_NR_ZONES; ) {
+ struct page *page;
+ struct zone *zone;
+
+ if (list_empty(&dsts[i])) {
+ i++;
+ continue;
}
+
+ page = list_first_entry(&dsts[i], struct page, lru);
+ zone = page_zone(page);
+
+ free_pcppages_bulk(zone, &dsts[i], true);
}
- local_irq_restore(flags);
}
/*
@@ -3166,7 +3264,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct page *page;
unsigned long flags;
- local_irq_save(flags);
+ local_lock_irqsave(pa_lock, flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
@@ -3174,7 +3272,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
zone_statistics(preferred_zone, zone);
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
return page;
}
@@ -3201,7 +3299,7 @@ struct page *rmqueue(struct zone *preferred_zone,
* allocate greater than order-1 page units with __GFP_NOFAIL.
*/
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
- spin_lock_irqsave(&zone->lock, flags);
+ local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
do {
page = NULL;
@@ -3221,7 +3319,7 @@ struct page *rmqueue(struct zone *preferred_zone,
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone);
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
out:
/* Separate test+clear to avoid unnecessary atomics */
@@ -3234,7 +3332,7 @@ out:
return page;
failed:
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
return NULL;
}
@@ -6840,7 +6938,8 @@ static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
* This function also addresses a similar issue where struct pages are left
* uninitialized because the physical address range is not covered by
* memblock.memory or memblock.reserved. That could happen when memblock
- * layout is manually configured via memmap=.
+ * layout is manually configured via memmap=, or when the highest physical
+ * address (max_pfn) does not end on a section boundary.
*/
void __init zero_resv_unavail(void)
{
@@ -6858,7 +6957,16 @@ void __init zero_resv_unavail(void)
pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
next = end;
}
- pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn);
+
+ /*
+ * Early sections always have a fully populated memmap for the whole
+ * section - see pfn_valid(). If the last section has holes at the
+ * end and that section is marked "online", the memmap will be
+ * considered initialized. Make sure that memmap has a well defined
+ * state.
+ */
+ pgcnt += zero_pfn_range(PFN_DOWN(next),
+ round_up(max_pfn, PAGES_PER_SECTION));
/*
* Struct pages that do not have backing memory. This could be because
@@ -7492,8 +7600,9 @@ void __init free_area_init(unsigned long *zones_size)
static int page_alloc_cpu_dead(unsigned int cpu)
{
-
+ local_lock_irq_on(swapvec_lock, cpu);
lru_add_drain_cpu(cpu);
+ local_unlock_irq_on(swapvec_lock, cpu);
drain_pages(cpu);
/*
@@ -8400,7 +8509,6 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages)
WARN(count != 0, "%d pages are still in use!\n", count);
}
-#ifdef CONFIG_MEMORY_HOTPLUG
/*
* The zone indicated has a new number of managed_pages; batch sizes and percpu
* page high values need to be recalulated.
@@ -8414,7 +8522,6 @@ void __meminit zone_pcp_update(struct zone *zone)
per_cpu_ptr(zone->pageset, cpu));
mutex_unlock(&pcp_batch_high_lock);
}
-#endif
void zone_pcp_reset(struct zone *zone)
{
@@ -8423,7 +8530,7 @@ void zone_pcp_reset(struct zone *zone)
struct per_cpu_pageset *pset;
/* avoid races with drain_pages() */
- local_irq_save(flags);
+ local_lock_irqsave(pa_lock, flags);
if (zone->pageset != &boot_pageset) {
for_each_online_cpu(cpu) {
pset = per_cpu_ptr(zone->pageset, cpu);
@@ -8432,7 +8539,7 @@ void zone_pcp_reset(struct zone *zone)
free_percpu(zone->pageset);
zone->pageset = &boot_pageset;
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
diff --git a/mm/page_counter.c b/mm/page_counter.c
index de31470655f6..147ff99187b8 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -77,7 +77,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
long new;
new = atomic_long_add_return(nr_pages, &c->usage);
- propagate_protected_usage(counter, new);
+ propagate_protected_usage(c, new);
/*
* This is indeed racy, but we can live with some
* inaccuracy in the watermark.
@@ -121,7 +121,7 @@ bool page_counter_try_charge(struct page_counter *counter,
new = atomic_long_add_return(nr_pages, &c->usage);
if (new > c->max) {
atomic_long_sub(nr_pages, &c->usage);
- propagate_protected_usage(counter, new);
+ propagate_protected_usage(c, new);
/*
* This is racy, but we can live with some
* inaccuracy in the failcnt.
@@ -130,7 +130,7 @@ bool page_counter_try_charge(struct page_counter *counter,
*fail = c;
goto failed;
}
- propagate_protected_usage(counter, new);
+ propagate_protected_usage(c, new);
/*
* Just like with failcnt, we can live with some
* inaccuracy in the watermark.
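
The page_counter.c fix above is a bug pattern worth spelling out: while walking up a counter hierarchy, the per-level bookkeeping must be applied to the current ancestor c, not to the leaf counter the caller passed in. A tiny stand-alone illustration with a made-up two-field counter.

#include <stdio.h>

struct counter {
	long usage;
	long watermark;			/* per-level book-keeping */
	struct counter *parent;
};

static void update_watermark(struct counter *c, long new_usage)
{
	if (new_usage > c->watermark)
		c->watermark = new_usage;	/* must touch *this* level */
}

static void charge(struct counter *counter, long nr)
{
	for (struct counter *c = counter; c; c = c->parent) {
		long new_usage = (c->usage += nr);

		/* the fixed bug was passing "counter" here instead of "c" */
		update_watermark(c, new_usage);
	}
}

int main(void)
{
	struct counter root = { 0, 0, NULL };
	struct counter child = { 0, 0, &root };

	charge(&child, 4);
	printf("child wm=%ld root wm=%ld\n", child.watermark, root.watermark);
	return 0;
}
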
diff --git a/mm/page_io.c b/mm/page_io.c
index a39aac2f8c8d..fd5cd06aef3f 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -73,6 +73,7 @@ static void swap_slot_free_notify(struct page *page)
{
struct swap_info_struct *sis;
struct gendisk *disk;
+ swp_entry_t entry;
/*
* There is no guarantee that the page is in swap cache - the software
@@ -104,11 +105,10 @@ static void swap_slot_free_notify(struct page *page)
* we again wish to reclaim it.
*/
disk = sis->bdev->bd_disk;
- if (disk->fops->swap_slot_free_notify) {
- swp_entry_t entry;
+ entry.val = page_private(page);
+ if (disk->fops->swap_slot_free_notify && __swap_count(entry) == 1) {
unsigned long offset;
- entry.val = page_private(page);
offset = swp_offset(entry);
SetPageDirty(page);
diff --git a/mm/shmem.c b/mm/shmem.c
index f4dce9c8670d..2238a3df07a0 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2005,16 +2005,14 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
shmem_falloc->waitq &&
vmf->pgoff >= shmem_falloc->start &&
vmf->pgoff < shmem_falloc->next) {
+ struct file *fpin;
wait_queue_head_t *shmem_falloc_waitq;
DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
ret = VM_FAULT_NOPAGE;
- if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
- !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
- /* It's polite to up mmap_sem if we can */
- up_read(&vma->vm_mm->mmap_sem);
+ fpin = maybe_unlock_mmap_for_io(vmf, NULL);
+ if (fpin)
ret = VM_FAULT_RETRY;
- }
shmem_falloc_waitq = shmem_falloc->waitq;
prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
@@ -2032,6 +2030,9 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
spin_lock(&inode->i_lock);
finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
spin_unlock(&inode->i_lock);
+
+ if (fpin)
+ fput(fpin);
return ret;
}
spin_unlock(&inode->i_lock);
@@ -2088,9 +2089,10 @@ unsigned long shmem_get_unmapped_area(struct file *file,
/*
* Our priority is to support MAP_SHARED mapped hugely;
* and support MAP_PRIVATE mapped hugely too, until it is COWed.
- * But if caller specified an address hint, respect that as before.
+ * But if caller specified an address hint and we allocated area there
+ * successfully, respect that as before.
*/
- if (uaddr)
+ if (uaddr == addr)
return addr;
if (shmem_huge != SHMEM_HUGE_FORCE) {
@@ -2124,7 +2126,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
if (inflated_len < len)
return addr;
- inflated_addr = get_area(NULL, 0, inflated_len, 0, flags);
+ inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);
if (IS_ERR_VALUE(inflated_addr))
return addr;
if (inflated_addr & ~PAGE_MASK)
@@ -2164,7 +2166,11 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
struct shmem_inode_info *info = SHMEM_I(inode);
int retval = -ENOMEM;
- spin_lock_irq(&info->lock);
+ /*
+ * What serializes the accesses to info->flags?
+ * ipc_lock_object() when called from shmctl_do_lock(),
+ * no serialization needed when called from shm_destroy().
+ */
if (lock && !(info->flags & VM_LOCKED)) {
if (!user_shm_lock(inode->i_size, user))
goto out_nomem;
@@ -2179,7 +2185,6 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
retval = 0;
out_nomem:
- spin_unlock_irq(&info->lock);
return retval;
}
@@ -2196,11 +2201,14 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
return -EPERM;
/*
- * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED
- * read-only mapping, take care to not allow mprotect to revert
- * protections.
+ * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
+ * MAP_SHARED and read-only, take care to not allow mprotect to
+ * revert protections on such mappings. Do this only for shared
+ * mappings. For private mappings, we don't need to mask
+ * VM_MAYWRITE as we still want them to be COW-writable.
*/
- vma->vm_flags &= ~(VM_MAYWRITE);
+ if (vma->vm_flags & VM_SHARED)
+ vma->vm_flags &= ~(VM_MAYWRITE);
}
file_accessed(file);
@@ -2381,11 +2389,11 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
lru_cache_add_anon(page);
- spin_lock(&info->lock);
+ spin_lock_irq(&info->lock);
info->alloced++;
inode->i_blocks += BLOCKS_PER_PAGE;
shmem_recalc_inode(inode);
- spin_unlock(&info->lock);
+ spin_unlock_irq(&info->lock);
inc_mm_counter(dst_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
@@ -2725,7 +2733,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
}
shmem_falloc.waitq = &shmem_falloc_waitq;
- shmem_falloc.start = unmap_start >> PAGE_SHIFT;
+ shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
spin_lock(&inode->i_lock);
inode->i_private = &shmem_falloc;
diff --git a/mm/slab.c b/mm/slab.c
index f7117ad9b3a3..780f4e46688b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -233,7 +233,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
parent->shared = NULL;
parent->alien = NULL;
parent->colour_next = 0;
- spin_lock_init(&parent->list_lock);
+ raw_spin_lock_init(&parent->list_lock);
parent->free_objects = 0;
parent->free_touched = 0;
}
@@ -564,9 +564,9 @@ static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
page_node = page_to_nid(page);
n = get_node(cachep, page_node);
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
free_block(cachep, &objp, 1, page_node, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
}
@@ -694,7 +694,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
struct kmem_cache_node *n = get_node(cachep, node);
if (ac->avail) {
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
/*
* Stuff objects into the remote nodes shared array first.
* That way we could avoid the overhead of putting the objects
@@ -705,7 +705,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
free_block(cachep, ac->entry, ac->avail, node, list);
ac->avail = 0;
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
}
}
@@ -778,9 +778,9 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
slabs_destroy(cachep, &list);
} else {
n = get_node(cachep, page_node);
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
free_block(cachep, &objp, 1, page_node, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
}
return 1;
@@ -821,10 +821,10 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
*/
n = get_node(cachep, node);
if (n) {
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount +
cachep->num;
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
return 0;
}
@@ -903,7 +903,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep,
goto fail;
n = get_node(cachep, node);
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
if (n->shared && force_change) {
free_block(cachep, n->shared->entry,
n->shared->avail, node, &list);
@@ -921,7 +921,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep,
new_alien = NULL;
}
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
/*
@@ -960,7 +960,7 @@ static void cpuup_canceled(long cpu)
if (!n)
continue;
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
/* Free limit for this kmem_cache_node */
n->free_limit -= cachep->batchcount;
@@ -971,7 +971,7 @@ static void cpuup_canceled(long cpu)
nc->avail = 0;
if (!cpumask_empty(mask)) {
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
goto free_slab;
}
@@ -985,7 +985,7 @@ static void cpuup_canceled(long cpu)
alien = n->alien;
n->alien = NULL;
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
kfree(shared);
if (alien) {
@@ -1169,7 +1169,7 @@ static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *
/*
* Do not assume that spinlocks can be initialized via memcpy:
*/
- spin_lock_init(&ptr->list_lock);
+ raw_spin_lock_init(&ptr->list_lock);
MAKE_ALL_LISTS(cachep, ptr, nodeid);
cachep->node[nodeid] = ptr;
@@ -1340,11 +1340,11 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
for_each_kmem_cache_node(cachep, node, n) {
unsigned long total_slabs, free_slabs, free_objs;
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
total_slabs = n->total_slabs;
free_slabs = n->free_slabs;
free_objs = n->free_objects;
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
node, total_slabs - free_slabs, total_slabs,
@@ -2107,7 +2107,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep)
{
#ifdef CONFIG_SMP
check_irq_off();
- assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
+ assert_raw_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
#endif
}
@@ -2115,7 +2115,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
{
#ifdef CONFIG_SMP
check_irq_off();
- assert_spin_locked(&get_node(cachep, node)->list_lock);
+ assert_raw_spin_locked(&get_node(cachep, node)->list_lock);
#endif
}
@@ -2155,9 +2155,9 @@ static void do_drain(void *arg)
check_irq_off();
ac = cpu_cache_get(cachep);
n = get_node(cachep, node);
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
free_block(cachep, ac->entry, ac->avail, node, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
ac->avail = 0;
}
@@ -2175,9 +2175,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
drain_alien_cache(cachep, n->alien);
for_each_kmem_cache_node(cachep, node, n) {
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
drain_array_locked(cachep, n->shared, node, true, &list);
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
}
@@ -2199,10 +2199,10 @@ static int drain_freelist(struct kmem_cache *cache,
nr_freed = 0;
while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
p = n->slabs_free.prev;
if (p == &n->slabs_free) {
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
goto out;
}
@@ -2215,7 +2215,7 @@ static int drain_freelist(struct kmem_cache *cache,
* to the cache.
*/
n->free_objects -= cache->num;
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slab_destroy(cache, page);
nr_freed++;
}
@@ -2664,7 +2664,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
INIT_LIST_HEAD(&page->slab_list);
n = get_node(cachep, page_to_nid(page));
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
n->total_slabs++;
if (!page->active) {
list_add_tail(&page->slab_list, &n->slabs_free);
@@ -2674,7 +2674,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
STATS_INC_GROWN(cachep);
n->free_objects += cachep->num - page->active;
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
}
@@ -2840,7 +2840,7 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
{
struct page *page;
- assert_spin_locked(&n->list_lock);
+ assert_raw_spin_locked(&n->list_lock);
page = list_first_entry_or_null(&n->slabs_partial, struct page,
slab_list);
if (!page) {
@@ -2867,10 +2867,10 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
if (!gfp_pfmemalloc_allowed(flags))
return NULL;
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
page = get_first_slab(n, true);
if (!page) {
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
return NULL;
}
@@ -2879,7 +2879,7 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
fixup_slab_list(cachep, n, page, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
return obj;
@@ -2938,7 +2938,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
if (!n->free_objects && (!shared || !shared->avail))
goto direct_grow;
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
shared = READ_ONCE(n->shared);
/* See if we can refill from the shared array */
@@ -2962,7 +2962,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
must_grow:
n->free_objects -= ac->avail;
alloc_done:
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
direct_grow:
@@ -3187,7 +3187,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
BUG_ON(!n);
check_irq_off();
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
page = get_first_slab(n, false);
if (!page)
goto must_grow;
@@ -3205,12 +3205,12 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
fixup_slab_list(cachep, n, page, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
return obj;
must_grow:
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
if (page) {
/* This slab isn't counted yet so don't update free_objects */
@@ -3386,7 +3386,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
check_irq_off();
n = get_node(cachep, node);
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
if (n->shared) {
struct array_cache *shared_array = n->shared;
int max = shared_array->limit - shared_array->avail;
@@ -3415,7 +3415,7 @@ free_done:
STATS_SET_FREEABLE(cachep, i);
}
#endif
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
ac->avail -= batchcount;
memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
@@ -3829,9 +3829,9 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
node = cpu_to_mem(cpu);
n = get_node(cachep, node);
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
free_block(cachep, ac->entry, ac->avail, node, &list);
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
}
free_percpu(prev);
@@ -3956,9 +3956,9 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
return;
}
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
drain_array_locked(cachep, ac, node, false, &list);
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
}
@@ -4042,7 +4042,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
for_each_kmem_cache_node(cachep, node, n) {
check_irq_on();
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
total_slabs += n->total_slabs;
free_slabs += n->free_slabs;
@@ -4051,7 +4051,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
if (n->shared)
shared_avail += n->shared->avail;
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
}
num_objs = total_slabs * cachep->num;
active_slabs = total_slabs - free_slabs;
diff --git a/mm/slab.h b/mm/slab.h
index 43ac818b8592..514f5be61baa 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -449,7 +449,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
* The slab lists for all objects.
*/
struct kmem_cache_node {
- spinlock_t list_lock;
+ raw_spinlock_t list_lock;
#ifdef CONFIG_SLAB
struct list_head slabs_partial; /* partial list first, better asm code */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index cbd3411f644e..70fb7eec7360 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -130,6 +130,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
#ifdef CONFIG_MEMCG_KMEM
LIST_HEAD(slab_root_caches);
+static DEFINE_SPINLOCK(memcg_kmem_wq_lock);
void slab_init_memcg_params(struct kmem_cache *s)
{
@@ -310,6 +311,14 @@ int slab_unmergeable(struct kmem_cache *s)
if (s->refcount < 0)
return 1;
+#ifdef CONFIG_MEMCG_KMEM
+ /*
+ * Skip the dying kmem_cache.
+ */
+ if (s->memcg_params.dying)
+ return 1;
+#endif
+
return 0;
}
@@ -744,14 +753,22 @@ void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
WARN_ON_ONCE(s->memcg_params.deact_fn))
return;
+ /*
+ * memcg_kmem_wq_lock is used to synchronize the memcg_params.dying
+ * flag and to make sure that no new kmem_cache deactivation tasks
+ * are queued (see flush_memcg_workqueue()).
+ */
+ spin_lock_irq(&memcg_kmem_wq_lock);
if (s->memcg_params.root_cache->memcg_params.dying)
- return;
+ goto unlock;
/* pin memcg so that @s doesn't get destroyed in the middle */
css_get(&s->memcg_params.memcg->css);
s->memcg_params.deact_fn = deact_fn;
call_rcu(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn);
+unlock:
+ spin_unlock_irq(&memcg_kmem_wq_lock);
}
void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
@@ -859,12 +876,15 @@ static int shutdown_memcg_caches(struct kmem_cache *s)
return 0;
}
-static void flush_memcg_workqueue(struct kmem_cache *s)
+static void memcg_set_kmem_cache_dying(struct kmem_cache *s)
{
- mutex_lock(&slab_mutex);
+ spin_lock_irq(&memcg_kmem_wq_lock);
s->memcg_params.dying = true;
- mutex_unlock(&slab_mutex);
+ spin_unlock_irq(&memcg_kmem_wq_lock);
+}
+static void flush_memcg_workqueue(struct kmem_cache *s)
+{
/*
* SLUB deactivates the kmem_caches through call_rcu. Make
* sure all registered rcu callbacks have been invoked.
@@ -877,17 +897,26 @@ static void flush_memcg_workqueue(struct kmem_cache *s)
* deactivates the memcg kmem_caches through workqueue. Make sure all
* previous workitems on workqueue are processed.
*/
- flush_workqueue(memcg_kmem_cache_wq);
+ if (likely(memcg_kmem_cache_wq))
+ flush_workqueue(memcg_kmem_cache_wq);
+
+ /*
+ * If we're racing with children kmem_cache deactivation, it might
+ * take another rcu grace period to complete their destruction.
+ * At this moment the corresponding percpu_ref_kill() call should be
+ * done, but it might take another rcu grace period to complete
+ * switching to the atomic mode.
+ * Note that we check without grabbing the slab_mutex. It's safe
+ * because at this moment the children list can't grow.
+ */
+ if (!list_empty(&s->memcg_params.children))
+ rcu_barrier();
}
#else
static inline int shutdown_memcg_caches(struct kmem_cache *s)
{
return 0;
}
-
-static inline void flush_memcg_workqueue(struct kmem_cache *s)
-{
-}
#endif /* CONFIG_MEMCG_KMEM */
void slab_kmem_cache_release(struct kmem_cache *s)
@@ -905,8 +934,6 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (unlikely(!s))
return;
- flush_memcg_workqueue(s);
-
get_online_cpus();
get_online_mems();
@@ -916,6 +943,22 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->refcount)
goto out_unlock;
+#ifdef CONFIG_MEMCG_KMEM
+ memcg_set_kmem_cache_dying(s);
+
+ mutex_unlock(&slab_mutex);
+
+ put_online_mems();
+ put_online_cpus();
+
+ flush_memcg_workqueue(s);
+
+ get_online_cpus();
+ get_online_mems();
+
+ mutex_lock(&slab_mutex);
+#endif
+
err = shutdown_memcg_caches(s);
if (!err)
err = shutdown_cache(s);
@@ -1593,7 +1636,7 @@ void kzfree(const void *p)
if (unlikely(ZERO_OR_NULL_PTR(mem)))
return;
ks = ksize(mem);
- memset(mem, 0, ks);
+ memzero_explicit(mem, ks);
kfree(mem);
}
EXPORT_SYMBOL(kzfree);
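
The kzfree() change above replaces memset() with memzero_explicit() because a store into memory that is freed immediately afterwards is a dead store the compiler is allowed to drop. In userspace the usual countermeasures are explicit_bzero() or routing the call through a volatile function pointer, as in this sketch; secure_free() is an invented helper, and the kernel helper itself relies on a compiler barrier rather than this pointer trick.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* a volatile function pointer keeps the compiler from eliding the memset */
static void *(*volatile force_memset)(void *, int, size_t) = memset;

static void zero_explicit(void *p, size_t len)
{
	force_memset(p, 0, len);
}

static void secure_free(void *p, size_t len)
{
	if (!p)
		return;
	zero_explicit(p, len);		/* survives dead-store elimination */
	free(p);
}

int main(void)
{
	char *secret = malloc(32);

	if (!secret)
		return 1;
	snprintf(secret, 32, "hunter2");
	secure_free(secret, 32);
	printf("secret wiped before free\n");
	return 0;
}
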
diff --git a/mm/slub.c b/mm/slub.c
index 5308227e65db..34394571e7b0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -261,7 +261,7 @@ static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
* freepointer to be restored incorrectly.
*/
return (void *)((unsigned long)ptr ^ s->random ^
- (unsigned long)kasan_reset_tag((void *)ptr_addr));
+ swab((unsigned long)kasan_reset_tag((void *)ptr_addr)));
#else
return ptr;
#endif
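
The swab() added to freelist_ptr() above exists because ptr and ptr_addr point into the same slab: their XOR cancels the identical high bits, so the stored value exposes the high bits of s->random. Byte-swapping ptr_addr folds its varying low bits into the high bits instead. A userspace demonstration of the cancellation, using the GCC/Clang __builtin_bswap64() in place of the kernel's swab() and arbitrary example addresses.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t random   = 0xdeadbeefcafef00dull;	/* stand-in for s->random */
	uint64_t ptr      = 0xffff888004030100ull;	/* next free object */
	uint64_t ptr_addr = 0xffff888004030140ull;	/* where that pointer is stored */

	uint64_t old_scheme = ptr ^ random ^ ptr_addr;
	uint64_t new_scheme = ptr ^ random ^ __builtin_bswap64(ptr_addr);

	/* old: everything above the low bits of the addresses cancels out */
	printf("old: stored ^ random = %#018llx\n",
	       (unsigned long long)(old_scheme ^ random));
	/* new: the swapped address scrambles the high bits as well */
	printf("new: stored ^ random = %#018llx\n",
	       (unsigned long long)(new_scheme ^ random));
	return 0;
}

The first line prints 0x0000000000000040 (only the bits where the two addresses differ survive), the second a well-mixed value.
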
@@ -644,6 +644,20 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...)
va_end(args);
}
+static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
+ void **freelist, void *nextfree)
+{
+ if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
+ !check_valid_pointer(s, page, nextfree) && freelist) {
+ object_err(s, page, *freelist, "Freechain corrupt");
+ *freelist = NULL;
+ slab_fix(s, "Isolate corrupted freechain");
+ return true;
+ }
+
+ return false;
+}
+
static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
{
unsigned int off; /* Offset of last byte */
@@ -1175,7 +1189,7 @@ static noinline int free_debug_processing(
unsigned long uninitialized_var(flags);
int ret = 0;
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
slab_lock(page);
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
@@ -1210,7 +1224,7 @@ out:
bulk_cnt, cnt);
slab_unlock(page);
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
if (!ret)
slab_fix(s, "Object at 0x%p not freed", object);
return ret;
@@ -1376,8 +1390,19 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
static inline void dec_slabs_node(struct kmem_cache *s, int node,
int objects) {}
+static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
+ void **freelist, void *nextfree)
+{
+ return false;
+}
#endif /* CONFIG_SLUB_DEBUG */
+struct slub_free_list {
+ raw_spinlock_t lock;
+ struct list_head list;
+};
+static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
+
/*
* Hooks for other subsystems that check memory allocations. In a typical
* production configuration these hooks all should produce no code at all.
@@ -1615,10 +1640,18 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
void *start, *p, *next;
int idx, order;
bool shuffle;
+ bool enableirqs = false;
flags &= gfp_allowed_mask;
if (gfpflags_allow_blocking(flags))
+ enableirqs = true;
+
+#ifdef CONFIG_PREEMPT_RT_FULL
+ if (system_state > SYSTEM_BOOTING)
+ enableirqs = true;
+#endif
+ if (enableirqs)
local_irq_enable();
flags |= s->allocflags;
@@ -1678,7 +1711,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
page->frozen = 1;
out:
- if (gfpflags_allow_blocking(flags))
+ if (enableirqs)
local_irq_disable();
if (!page)
return NULL;
@@ -1736,6 +1769,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
__free_pages(page, order);
}
+static void free_delayed(struct list_head *h)
+{
+ while (!list_empty(h)) {
+ struct page *page = list_first_entry(h, struct page, lru);
+
+ list_del(&page->lru);
+ __free_slab(page->slab_cache, page);
+ }
+}
+
static void rcu_free_slab(struct rcu_head *h)
{
struct page *page = container_of(h, struct page, rcu_head);
@@ -1747,6 +1790,12 @@ static void free_slab(struct kmem_cache *s, struct page *page)
{
if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
call_rcu(&page->rcu_head, rcu_free_slab);
+ } else if (irqs_disabled()) {
+ struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
+
+ raw_spin_lock(&f->lock);
+ list_add(&page->lru, &f->list);
+ raw_spin_unlock(&f->lock);
} else
__free_slab(s, page);
}
@@ -1854,7 +1903,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
if (!n || !n->nr_partial)
return NULL;
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
list_for_each_entry_safe(page, page2, &n->partial, slab_list) {
void *t;
@@ -1879,7 +1928,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
break;
}
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
return object;
}
@@ -1958,8 +2007,6 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
if (node == NUMA_NO_NODE)
searchnode = numa_mem_id();
- else if (!node_present_pages(node))
- searchnode = node_to_mem_node(node);
object = get_partial_node(s, get_node(s, searchnode), c, flags);
if (object || node != NUMA_NO_NODE)
@@ -2067,6 +2114,14 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
void *prior;
unsigned long counters;
+ /*
+ * If 'nextfree' is invalid, it is possible that the object at
+ * 'freelist' is already corrupted. So isolate all objects
+ * starting at 'freelist'.
+ */
+ if (freelist_corrupted(s, page, &freelist, nextfree))
+ break;
+
do {
prior = page->freelist;
counters = page->counters;
@@ -2125,7 +2180,7 @@ redo:
* that acquire_slab() will see a slab page that
* is frozen
*/
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
}
} else {
m = M_FULL;
@@ -2136,7 +2191,7 @@ redo:
* slabs from diagnostic functions will not see
* any frozen slabs.
*/
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
}
}
@@ -2160,7 +2215,7 @@ redo:
goto redo;
if (lock)
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
if (m == M_PARTIAL)
stat(s, tail);
@@ -2199,10 +2254,10 @@ static void unfreeze_partials(struct kmem_cache *s,
n2 = get_node(s, page_to_nid(page));
if (n != n2) {
if (n)
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
n = n2;
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
}
do {
@@ -2231,7 +2286,7 @@ static void unfreeze_partials(struct kmem_cache *s,
}
if (n)
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
while (discard_page) {
page = discard_page;
@@ -2268,14 +2323,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
pobjects = oldpage->pobjects;
pages = oldpage->pages;
if (drain && pobjects > s->cpu_partial) {
+ struct slub_free_list *f;
unsigned long flags;
+ LIST_HEAD(tofree);
/*
* partial array is full. Move the existing
* set to the per node partial list.
*/
local_irq_save(flags);
unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
+ f = this_cpu_ptr(&slub_free_list);
+ raw_spin_lock(&f->lock);
+ list_splice_init(&f->list, &tofree);
+ raw_spin_unlock(&f->lock);
local_irq_restore(flags);
+ free_delayed(&tofree);
oldpage = NULL;
pobjects = 0;
pages = 0;
@@ -2343,7 +2405,19 @@ static bool has_cpu_slab(int cpu, void *info)
static void flush_all(struct kmem_cache *s)
{
+ LIST_HEAD(tofree);
+ int cpu;
+
on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
+ for_each_online_cpu(cpu) {
+ struct slub_free_list *f;
+
+ f = &per_cpu(slub_free_list, cpu);
+ raw_spin_lock_irq(&f->lock);
+ list_splice_init(&f->list, &tofree);
+ raw_spin_unlock_irq(&f->lock);
+ free_delayed(&tofree);
+ }
}
/*
@@ -2398,10 +2472,10 @@ static unsigned long count_partial(struct kmem_cache_node *n,
unsigned long x = 0;
struct page *page;
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, slab_list)
x += get_count(page);
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
return x;
}
#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
@@ -2540,23 +2614,35 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
* already disabled (which is the case for bulk allocation).
*/
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
- unsigned long addr, struct kmem_cache_cpu *c)
+ unsigned long addr, struct kmem_cache_cpu *c,
+ struct list_head *to_free)
{
+ struct slub_free_list *f;
void *freelist;
struct page *page;
page = c->page;
- if (!page)
+ if (!page) {
+ /*
+ * if the node is not online or has no normal memory, just
+ * ignore the node constraint
+ */
+ if (unlikely(node != NUMA_NO_NODE &&
+ !node_state(node, N_NORMAL_MEMORY)))
+ node = NUMA_NO_NODE;
goto new_slab;
+ }
redo:
if (unlikely(!node_match(page, node))) {
- int searchnode = node;
-
- if (node != NUMA_NO_NODE && !node_present_pages(node))
- searchnode = node_to_mem_node(node);
-
- if (unlikely(!node_match(page, searchnode))) {
+ /*
+ * same as above but node_match() being false already
+ * implies node != NUMA_NO_NODE
+ */
+ if (!node_state(node, N_NORMAL_MEMORY)) {
+ node = NUMA_NO_NODE;
+ goto redo;
+ } else {
stat(s, ALLOC_NODE_MISMATCH);
deactivate_slab(s, page, c->freelist, c);
goto new_slab;
@@ -2597,6 +2683,13 @@ load_freelist:
VM_BUG_ON(!c->page->frozen);
c->freelist = get_freepointer(s, freelist);
c->tid = next_tid(c->tid);
+
+out:
+ f = this_cpu_ptr(&slub_free_list);
+ raw_spin_lock(&f->lock);
+ list_splice_init(&f->list, to_free);
+ raw_spin_unlock(&f->lock);
+
return freelist;
new_slab:
@@ -2612,7 +2705,7 @@ new_slab:
if (unlikely(!freelist)) {
slab_out_of_memory(s, gfpflags, node);
- return NULL;
+ goto out;
}
page = c->page;
@@ -2625,7 +2718,7 @@ new_slab:
goto new_slab; /* Slab failed checks. Next slab needed */
deactivate_slab(s, page, get_freepointer(s, freelist), c);
- return freelist;
+ goto out;
}
/*
@@ -2637,6 +2730,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
{
void *p;
unsigned long flags;
+ LIST_HEAD(tofree);
local_irq_save(flags);
#ifdef CONFIG_PREEMPT
@@ -2648,8 +2742,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
c = this_cpu_ptr(s->cpu_slab);
#endif
- p = ___slab_alloc(s, gfpflags, node, addr, c);
+ p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
local_irq_restore(flags);
+ free_delayed(&tofree);
return p;
}
@@ -2835,7 +2930,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
do {
if (unlikely(n)) {
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
n = NULL;
}
prior = page->freelist;
@@ -2867,7 +2962,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
* Otherwise the list_lock will synchronize with
* other processors updating the list of slabs.
*/
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
}
}
@@ -2908,7 +3003,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
add_partial(n, page, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
return;
slab_empty:
@@ -2923,7 +3018,7 @@ slab_empty:
remove_full(s, n, page);
}
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
stat(s, FREE_SLAB);
discard_slab(s, page);
}
@@ -2967,11 +3062,13 @@ redo:
barrier();
if (likely(page == c->page)) {
- set_freepointer(s, tail_obj, c->freelist);
+ void **freelist = READ_ONCE(c->freelist);
+
+ set_freepointer(s, tail_obj, freelist);
if (unlikely(!this_cpu_cmpxchg_double(
s->cpu_slab->freelist, s->cpu_slab->tid,
- c->freelist, tid,
+ freelist, tid,
head, next_tid(tid)))) {
note_cmpxchg_failure("slab_free", s, tid);
@@ -3126,6 +3223,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
void **p)
{
struct kmem_cache_cpu *c;
+ LIST_HEAD(to_free);
int i;
/* memcg and kmem_cache debug support */
@@ -3145,11 +3243,20 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
if (unlikely(!object)) {
/*
+ * We may have removed an object from c->freelist using
+ * the fastpath in the previous iteration; in that case,
+ * c->tid has not been bumped yet.
+ * Since ___slab_alloc() may reenable interrupts while
+ * allocating memory, we should bump c->tid now.
+ */
+ c->tid = next_tid(c->tid);
+
+ /*
* Invoking slow path likely have side-effect
* of re-populating per CPU c->freelist
*/
p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
- _RET_IP_, c);
+ _RET_IP_, c, &to_free);
if (unlikely(!p[i]))
goto error;
@@ -3161,6 +3268,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
}
c->tid = next_tid(c->tid);
local_irq_enable();
+ free_delayed(&to_free);
/* Clear memory outside IRQ disabled fastpath loop */
if (unlikely(flags & __GFP_ZERO)) {
@@ -3175,6 +3283,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
return i;
error:
local_irq_enable();
+ free_delayed(&to_free);
slab_post_alloc_hook(s, flags, i, p);
__kmem_cache_free_bulk(s, i, p);
return 0;
@@ -3310,7 +3419,7 @@ static void
init_kmem_cache_node(struct kmem_cache_node *n)
{
n->nr_partial = 0;
- spin_lock_init(&n->list_lock);
+ raw_spin_lock_init(&n->list_lock);
INIT_LIST_HEAD(&n->partial);
#ifdef CONFIG_SLUB_DEBUG
atomic_long_set(&n->nr_slabs, 0);
@@ -3663,6 +3772,11 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
const char *text)
{
#ifdef CONFIG_SLUB_DEBUG
+#ifdef CONFIG_PREEMPT_RT_BASE
+ /* XXX move out of irq-off section */
+ slab_err(s, page, text, s->name);
+#else
+
void *addr = page_address(page);
void *p;
unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC);
@@ -3682,8 +3796,10 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
slab_unlock(page);
bitmap_free(map);
#endif
+#endif
}
+
/*
* Attempt to free all partial slabs on a node.
* This is called from __kmem_cache_shutdown(). We must take list_lock
@@ -3695,7 +3811,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
struct page *page, *h;
BUG_ON(irqs_disabled());
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
list_for_each_entry_safe(page, h, &n->partial, slab_list) {
if (!page->inuse) {
remove_partial(n, page);
@@ -3705,7 +3821,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
"Objects remaining in %s on __kmem_cache_shutdown()");
}
}
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
list_for_each_entry_safe(page, h, &discard, slab_list)
discard_slab(s, page);
@@ -3979,7 +4095,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
INIT_LIST_HEAD(promote + i);
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
/*
* Build lists of slabs to discard or promote.
@@ -4010,7 +4126,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
list_splice(promote + i, &n->partial);
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
/* Release empty slabs */
list_for_each_entry_safe(page, t, &discard, slab_list)
@@ -4223,6 +4339,12 @@ void __init kmem_cache_init(void)
{
static __initdata struct kmem_cache boot_kmem_cache,
boot_kmem_cache_node;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
+ INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
+ }
if (debug_guardpage_minorder())
slub_max_order = 0;
@@ -4424,7 +4546,7 @@ static int validate_slab_node(struct kmem_cache *s,
struct page *page;
unsigned long flags;
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, slab_list) {
validate_slab_slab(s, page, map);
@@ -4446,7 +4568,7 @@ static int validate_slab_node(struct kmem_cache *s,
s->name, count, atomic_long_read(&n->nr_slabs));
out:
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
return count;
}
@@ -4632,12 +4754,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
if (!atomic_long_read(&n->nr_slabs))
continue;
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, slab_list)
process_slab(&t, s, page, alloc, map);
list_for_each_entry(page, &n->full, slab_list)
process_slab(&t, s, page, alloc, map);
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
}
for (i = 0; i < t.count; i++) {
@@ -5601,7 +5723,8 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
*/
if (buffer)
buf = buffer;
- else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf))
+ else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf) &&
+ !IS_ENABLED(CONFIG_SLUB_STATS))
buf = mbuf;
else {
buffer = (char *) get_zeroed_page(GFP_KERNEL);
@@ -5756,8 +5879,10 @@ static int sysfs_slab_add(struct kmem_cache *s)
s->kobj.kset = kset;
err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
- if (err)
+ if (err) {
+ kobject_put(&s->kobj);
goto out;
+ }
err = sysfs_create_group(&s->kobj, &slab_attr_group);
if (err)
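Taken together, the slub_free_list/free_delayed() hunks above implement a defer-and-drain pattern: a slab page that becomes free while interrupts are disabled is parked on a per-CPU list and only handed back to the page allocator once the caller is back in a context where freeing may sleep (the PREEMPT_RT constraint). A minimal single-threaded userspace sketch of that pattern follows; free_deferred_obj() and drain_deferred() are invented names, not kernel code.

#include <stdlib.h>

struct deferred {
	struct deferred *next;
	void *obj;
};

static struct deferred *deferred_head;

/* Called from the "cannot free here" context: just queue the object. */
static void free_deferred_obj(void *obj)
{
	struct deferred *d = malloc(sizeof(*d));

	if (!d)
		return;	/* sketch only; a real implementation must not lose obj */
	d->obj = obj;
	d->next = deferred_head;
	deferred_head = d;
}

/* Called later, from a context where freeing is allowed. */
static void drain_deferred(void)
{
	while (deferred_head) {
		struct deferred *d = deferred_head;

		deferred_head = d->next;
		free(d->obj);
		free(d);
	}
}

int main(void)
{
	free_deferred_obj(malloc(32));
	free_deferred_obj(malloc(32));
	drain_deferred();
	return 0;
}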
diff --git a/mm/swap.c b/mm/swap.c
index 607c48229a1d..182fb73f589f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -33,6 +33,7 @@
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
+#include <linux/locallock.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
@@ -51,6 +52,8 @@ static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
#endif
+static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
+DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
/*
* This path almost never happens for VM activity - pages are normally
@@ -253,11 +256,11 @@ void rotate_reclaimable_page(struct page *page)
unsigned long flags;
get_page(page);
- local_irq_save(flags);
+ local_lock_irqsave(rotate_lock, flags);
pvec = this_cpu_ptr(&lru_rotate_pvecs);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_move_tail(pvec);
- local_irq_restore(flags);
+ local_unlock_irqrestore(rotate_lock, flags);
}
}
@@ -307,12 +310,13 @@ void activate_page(struct page *page)
{
page = compound_head(page);
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
+ activate_page_pvecs);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, __activate_page, NULL);
- put_cpu_var(activate_page_pvecs);
+ put_locked_var(swapvec_lock, activate_page_pvecs);
}
}
@@ -334,7 +338,7 @@ void activate_page(struct page *page)
static void __lru_cache_activate_page(struct page *page)
{
- struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
+ struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
int i;
/*
@@ -356,7 +360,7 @@ static void __lru_cache_activate_page(struct page *page)
}
}
- put_cpu_var(lru_add_pvec);
+ put_locked_var(swapvec_lock, lru_add_pvec);
}
/*
@@ -398,12 +402,12 @@ EXPORT_SYMBOL(mark_page_accessed);
static void __lru_cache_add(struct page *page)
{
- struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
+ struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
__pagevec_lru_add(pvec);
- put_cpu_var(lru_add_pvec);
+ put_locked_var(swapvec_lock, lru_add_pvec);
}
/**
@@ -581,9 +585,15 @@ void lru_add_drain_cpu(int cpu)
unsigned long flags;
/* No harm done if a racing interrupt already did this */
- local_irq_save(flags);
+#ifdef CONFIG_PREEMPT_RT_BASE
+ local_lock_irqsave_on(rotate_lock, flags, cpu);
pagevec_move_tail(pvec);
- local_irq_restore(flags);
+ local_unlock_irqrestore_on(rotate_lock, flags, cpu);
+#else
+ local_lock_irqsave(rotate_lock, flags);
+ pagevec_move_tail(pvec);
+ local_unlock_irqrestore(rotate_lock, flags);
+#endif
}
pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
@@ -615,11 +625,12 @@ void deactivate_file_page(struct page *page)
return;
if (likely(get_page_unless_zero(page))) {
- struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
+ lru_deactivate_file_pvecs);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
- put_cpu_var(lru_deactivate_file_pvecs);
+ put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
}
}
@@ -634,23 +645,34 @@ void mark_page_lazyfree(struct page *page)
{
if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
!PageSwapCache(page) && !PageUnevictable(page)) {
- struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
+ lru_lazyfree_pvecs);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
- put_cpu_var(lru_lazyfree_pvecs);
+ put_locked_var(swapvec_lock, lru_lazyfree_pvecs);
}
}
void lru_add_drain(void)
{
- lru_add_drain_cpu(get_cpu());
- put_cpu();
+ lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
+ local_unlock_cpu(swapvec_lock);
}
#ifdef CONFIG_SMP
+#ifdef CONFIG_PREEMPT_RT_BASE
+static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
+{
+ local_lock_on(swapvec_lock, cpu);
+ lru_add_drain_cpu(cpu);
+ local_unlock_on(swapvec_lock, cpu);
+}
+
+#else
+
static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
static void lru_add_drain_per_cpu(struct work_struct *dummy)
@@ -658,6 +680,16 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
lru_add_drain();
}
+static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
+{
+ struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
+
+ INIT_WORK(work, lru_add_drain_per_cpu);
+ queue_work_on(cpu, mm_percpu_wq, work);
+ cpumask_set_cpu(cpu, has_work);
+}
+#endif
+
/*
* Doesn't need any cpu hotplug locking because we do rely on per-cpu
* kworkers being shut down before our page_alloc_cpu_dead callback is
@@ -682,21 +714,19 @@ void lru_add_drain_all(void)
cpumask_clear(&has_work);
for_each_online_cpu(cpu) {
- struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
- need_activate_page_drain(cpu)) {
- INIT_WORK(work, lru_add_drain_per_cpu);
- queue_work_on(cpu, mm_percpu_wq, work);
- cpumask_set_cpu(cpu, &has_work);
- }
+ need_activate_page_drain(cpu))
+ remote_lru_add_drain(cpu, &has_work);
}
+#ifndef CONFIG_PREEMPT_RT_BASE
for_each_cpu(cpu, &has_work)
flush_work(&per_cpu(lru_add_drain_work, cpu));
+#endif
mutex_unlock(&lock);
}
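The swap.c changes above replace get_cpu_var()/put_cpu_var(), which pin the task by disabling preemption, with get_locked_var()/put_locked_var() on a local lock, so the pagevec sections stay preemptible on PREEMPT_RT. A rough userspace analogue of "per-CPU data guarded by its own lock instead of by preemption" follows, using one pthread mutex per slot; struct pagevec_demo and the helper names are invented.

#include <pthread.h>
#include <stdio.h>

struct pagevec_demo {
	pthread_mutex_t lock;	/* plays the role of the local lock */
	int nr_pages;
};

/* one slot per "CPU"; two slots are enough for the demo */
static struct pagevec_demo pvecs[2] = {
	{ PTHREAD_MUTEX_INITIALIZER, 0 },
	{ PTHREAD_MUTEX_INITIALIZER, 0 },
};

static struct pagevec_demo *get_locked_pvec(int cpu)
{
	pthread_mutex_lock(&pvecs[cpu].lock);
	return &pvecs[cpu];
}

static void put_locked_pvec(int cpu)
{
	pthread_mutex_unlock(&pvecs[cpu].lock);
}

int main(void)
{
	struct pagevec_demo *pvec = get_locked_pvec(0);

	pvec->nr_pages++;		/* "queue a page on this CPU's pagevec" */
	put_locked_pvec(0);
	printf("pages queued on cpu0: %d\n", pvecs[0].nr_pages);
	return 0;
}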
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 61453f1faf72..3f2e8873ccfe 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -21,7 +21,7 @@
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
-
+#include "internal.h"
#include <asm/pgtable.h>
/*
@@ -417,7 +417,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/* May fail (-ENOMEM) if XArray node allocation failed. */
__SetPageLocked(new_page);
__SetPageSwapBacked(new_page);
- err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
+ err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_RECLAIM_MASK);
if (likely(!err)) {
/* Initiate read into locked page */
SetPageWorkingset(new_page);
diff --git a/mm/util.c b/mm/util.c
index 9834c4ab7d8e..be52aa2c9733 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -461,6 +461,24 @@ void kvfree(const void *addr)
}
EXPORT_SYMBOL(kvfree);
+/**
+ * kvfree_sensitive - Free a data object containing sensitive information.
+ * @addr: address of the data object to be freed.
+ * @len: length of the data object.
+ *
+ * Use the special memzero_explicit() function to clear the content of a
+ * kvmalloc'ed object containing sensitive data to make sure that the
+ * compiler won't optimize out the data clearing.
+ */
+void kvfree_sensitive(const void *addr, size_t len)
+{
+ if (likely(!ZERO_OR_NULL_PTR(addr))) {
+ memzero_explicit((void *)addr, len);
+ kvfree(addr);
+ }
+}
+EXPORT_SYMBOL(kvfree_sensitive);
+
static inline void *__page_rmapping(struct page *page)
{
unsigned long mapping;
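A hedged, kernel-style sketch of how a caller might use the kvfree_sensitive() helper added above, for example for a temporary copy of key material. demo_handle_key() and its parameters are made up, and the declaration is assumed to live in include/linux/mm.h as part of the same series.

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>

static int demo_handle_key(const void *src, size_t len)
{
	void *key = kvmalloc(len, GFP_KERNEL);

	if (!key)
		return -ENOMEM;

	memcpy(key, src, len);
	/* ... hand the key to whatever needs it ... */

	/* clears the buffer with memzero_explicit() before kvfree() */
	kvfree_sensitive(key, len);
	return 0;
}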
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 080d30408ce3..558a7a94695f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -34,6 +34,7 @@
#include <linux/llist.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
+#include <linux/overflow.h>
#include <linux/uaccess.h>
#include <asm/tlbflush.h>
@@ -1217,7 +1218,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
* First make sure the mappings are removed from all page-tables
* before they are freed.
*/
- vmalloc_sync_all();
+ vmalloc_sync_unmappings();
/*
* TODO: to calculate a flush range without looping.
@@ -1406,7 +1407,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
struct vmap_block *vb;
struct vmap_area *va;
unsigned long vb_idx;
- int node, err;
+ int node, err, cpu;
void *vaddr;
node = numa_node_id();
@@ -1449,11 +1450,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
BUG_ON(err);
radix_tree_preload_end();
- vbq = &get_cpu_var(vmap_block_queue);
+ cpu = get_cpu_light();
+ vbq = this_cpu_ptr(&vmap_block_queue);
spin_lock(&vbq->lock);
list_add_tail_rcu(&vb->free_list, &vbq->free);
spin_unlock(&vbq->lock);
- put_cpu_var(vmap_block_queue);
+ put_cpu_light();
return vaddr;
}
@@ -1522,6 +1524,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
struct vmap_block *vb;
void *vaddr = NULL;
unsigned int order;
+ int cpu;
BUG_ON(offset_in_page(size));
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -1536,7 +1539,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
order = get_order(size);
rcu_read_lock();
- vbq = &get_cpu_var(vmap_block_queue);
+ cpu = get_cpu_light();
+ vbq = this_cpu_ptr(&vmap_block_queue);
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
unsigned long pages_off;
@@ -1559,7 +1563,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
break;
}
- put_cpu_var(vmap_block_queue);
+ put_cpu_light();
rcu_read_unlock();
/* Allocate new block if nothing was found */
@@ -2931,6 +2935,7 @@ finished:
* @vma: vma to cover
* @uaddr: target user address to start at
* @kaddr: virtual address of vmalloc kernel memory
+ * @pgoff: offset from @kaddr to start at
* @size: size of map area
*
* Returns: 0 for success, -Exxx on failure
@@ -2943,9 +2948,15 @@ finished:
* Similar to remap_pfn_range() (see mm/memory.c)
*/
int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
- void *kaddr, unsigned long size)
+ void *kaddr, unsigned long pgoff,
+ unsigned long size)
{
struct vm_struct *area;
+ unsigned long off;
+ unsigned long end_index;
+
+ if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
+ return -EINVAL;
size = PAGE_ALIGN(size);
@@ -2959,8 +2970,10 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
if (!(area->flags & VM_USERMAP))
return -EINVAL;
- if (kaddr + size > area->addr + get_vm_area_size(area))
+ if (check_add_overflow(size, off, &end_index) ||
+ end_index > get_vm_area_size(area))
return -EINVAL;
+ kaddr += off;
do {
struct page *page = vmalloc_to_page(kaddr);
@@ -2999,22 +3012,25 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
unsigned long pgoff)
{
return remap_vmalloc_range_partial(vma, vma->vm_start,
- addr + (pgoff << PAGE_SHIFT),
+ addr, pgoff,
vma->vm_end - vma->vm_start);
}
EXPORT_SYMBOL(remap_vmalloc_range);
/*
- * Implement a stub for vmalloc_sync_all() if the architecture chose not to
- * have one.
+ * Implement stubs for vmalloc_sync_[un]mappings() if the architecture chose
+ * not to have them.

*
* The purpose of this function is to make sure the vmalloc area
* mappings are identical in all page-tables in the system.
*/
-void __weak vmalloc_sync_all(void)
+void __weak vmalloc_sync_mappings(void)
{
}
+void __weak vmalloc_sync_unmappings(void)
+{
+}
static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
{
@@ -3242,9 +3258,19 @@ retry:
goto overflow;
/*
+ * If required width exceeds current VA block, move
+ * base downwards and then recheck.
+ */
+ if (base + end > va->va_end) {
+ base = pvm_determine_end_from_reverse(&va, align) - end;
+ term_area = area;
+ continue;
+ }
+
+ /*
* If this VA does not fit, move base downwards and recheck.
*/
- if (base + start < va->va_start || base + end > va->va_end) {
+ if (base + start < va->va_start) {
va = node_to_va(rb_prev(&va->rb_node));
base = pvm_determine_end_from_reverse(&va, align) - end;
term_area = area;
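The remap_vmalloc_range_partial() change above rejects offsets whose byte conversion or end index would wrap, using check_shl_overflow()/check_add_overflow(). Below is a small userspace analogue of that bounds test built on the GCC/Clang __builtin_*_overflow() primitives, with the shift expressed as a checked multiplication; PAGE_SHIFT_DEMO, range_ok() and the sizes in main() are invented.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define PAGE_SHIFT_DEMO 12

static bool range_ok(size_t area_size, size_t pgoff, size_t size)
{
	size_t off, end;

	/* pgoff << PAGE_SHIFT, written as a checked multiplication */
	if (__builtin_mul_overflow(pgoff, (size_t)1 << PAGE_SHIFT_DEMO, &off))
		return false;
	/* off + size must not wrap and must stay inside the area */
	if (__builtin_add_overflow(size, off, &end))
		return false;
	return end <= area_size;
}

int main(void)
{
	printf("%d\n", range_ok(1UL << 20, 1, 4096));		/* 1: fits */
	printf("%d\n", range_ok(1UL << 20, (size_t)-1, 4096));	/* 0: would wrap */
	return 0;
}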
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1b1b42de1ee8..fdd307634c08 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2379,10 +2379,13 @@ out:
/*
* Scan types proportional to swappiness and
* their relative recent reclaim efficiency.
- * Make sure we don't miss the last page
- * because of a round-off error.
+ * Make sure we don't miss the last page on
+ * the offlined memory cgroups because of a
+ * round-off error.
*/
- scan = DIV64_U64_ROUND_UP(scan * fraction[file],
+ scan = mem_cgroup_online(memcg) ?
+ div64_u64(scan * fraction[file], denominator) :
+ DIV64_U64_ROUND_UP(scan * fraction[file],
denominator);
break;
case SCAN_FILE:
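The vmscan.c hunk above keeps DIV64_U64_ROUND_UP() only for offlined memcgs, so the last partial share of the scan target is not lost, while online memcgs go back to truncating division. A plain-C illustration of the difference between the two; the numbers are arbitrary.

#include <stdint.h>
#include <stdio.h>

static uint64_t div_round_up_u64(uint64_t n, uint64_t d)
{
	return (n + d - 1) / d;		/* same idea as DIV64_U64_ROUND_UP */
}

int main(void)
{
	uint64_t scan = 7, fraction = 3, denominator = 10;

	printf("truncating: %llu\n",
	       (unsigned long long)(scan * fraction / denominator));		/* 2 */
	printf("round-up:   %llu\n",
	       (unsigned long long)div_round_up_u64(scan * fraction,
						    denominator));		/* 3 */
	return 0;
}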
diff --git a/mm/vmstat.c b/mm/vmstat.c
index fd7e16ca6996..33f0670d7756 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -321,6 +321,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
long x;
long t;
+ preempt_disable_rt();
x = delta + __this_cpu_read(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -330,6 +331,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
x = 0;
}
__this_cpu_write(*p, x);
+ preempt_enable_rt();
}
EXPORT_SYMBOL(__mod_zone_page_state);
@@ -341,6 +343,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
long x;
long t;
+ preempt_disable_rt();
x = delta + __this_cpu_read(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -350,6 +353,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
x = 0;
}
__this_cpu_write(*p, x);
+ preempt_enable_rt();
}
EXPORT_SYMBOL(__mod_node_page_state);
@@ -382,6 +386,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
s8 __percpu *p = pcp->vm_stat_diff + item;
s8 v, t;
+ preempt_disable_rt();
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v > t)) {
@@ -390,6 +395,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
zone_page_state_add(v + overstep, zone, item);
__this_cpu_write(*p, -overstep);
}
+ preempt_enable_rt();
}
void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -398,6 +404,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
s8 __percpu *p = pcp->vm_node_stat_diff + item;
s8 v, t;
+ preempt_disable_rt();
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v > t)) {
@@ -406,6 +413,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
node_page_state_add(v + overstep, pgdat, item);
__this_cpu_write(*p, -overstep);
}
+ preempt_enable_rt();
}
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -426,6 +434,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
s8 __percpu *p = pcp->vm_stat_diff + item;
s8 v, t;
+ preempt_disable_rt();
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v < - t)) {
@@ -434,6 +443,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
zone_page_state_add(v - overstep, zone, item);
__this_cpu_write(*p, overstep);
}
+ preempt_enable_rt();
}
void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -442,6 +452,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
s8 __percpu *p = pcp->vm_node_stat_diff + item;
s8 v, t;
+ preempt_disable_rt();
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v < - t)) {
@@ -450,6 +461,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
node_page_state_add(v - overstep, pgdat, item);
__this_cpu_write(*p, overstep);
}
+ preempt_enable_rt();
}
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -1970,7 +1982,7 @@ void __init init_mm_internals(void)
#endif
#ifdef CONFIG_PROC_FS
proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
- proc_create_seq("pagetypeinfo", 0444, NULL, &pagetypeinfo_op);
+ proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
#endif
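The vm counter helpers that gain preempt_disable_rt()/preempt_enable_rt() above all share the same shape: accumulate a small per-CPU delta and fold it into the global counter once it crosses a threshold, which is why the read-modify-write must not be interrupted midway. A single-threaded userspace sketch of that folding logic follows; the names and the threshold value are invented.

#include <stdio.h>

static long global_count;
static long cpu_diff;			/* stands in for one CPU's vm_stat_diff slot */
static const long threshold = 32;	/* stands in for pcp->stat_threshold */

static void mod_state_demo(long delta)
{
	long x = cpu_diff + delta;

	if (x > threshold || x < -threshold) {
		global_count += x;	/* fold into the global counter */
		x = 0;
	}
	cpu_diff = x;
}

int main(void)
{
	int i;

	for (i = 0; i < 100; i++)
		mod_state_demo(1);
	printf("global=%ld per-cpu diff=%ld\n", global_count, cpu_diff);
	return 0;
}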
diff --git a/mm/workingset.c b/mm/workingset.c
index e0b4edcb88c8..67b00ddcdab2 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -367,6 +367,8 @@ static struct list_lru shadow_nodes;
void workingset_update_node(struct xa_node *node)
{
+ struct address_space *mapping;
+
/*
* Track non-empty nodes that contain only shadow entries;
* unlink those that contain pages or are being freed.
@@ -375,7 +377,8 @@ void workingset_update_node(struct xa_node *node)
* already where they should be. The list_empty() test is safe
* as node->private_list is protected by the i_pages lock.
*/
- VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */
+ mapping = container_of(node->array, struct address_space, i_pages);
+ lockdep_assert_held(&mapping->i_pages.xa_lock);
if (node->count && node->count == node->nr_values) {
if (list_empty(&node->private_list)) {
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 7d62ef2daf83..863512987563 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -56,6 +56,7 @@
#include <linux/wait.h>
#include <linux/pagemap.h>
#include <linux/fs.h>
+#include <linux/locallock.h>
#define ZSPAGE_MAGIC 0x58
@@ -73,9 +74,22 @@
*/
#define ZS_MAX_ZSPAGE_ORDER 2
#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
-
#define ZS_HANDLE_SIZE (sizeof(unsigned long))
+#ifdef CONFIG_PREEMPT_RT_FULL
+
+struct zsmalloc_handle {
+ unsigned long addr;
+ struct mutex lock;
+};
+
+#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
+
+#else
+
+#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
+#endif
+
/*
* Object location (<PFN>, <obj_idx>) is encoded as
* as single (unsigned long) handle value.
@@ -325,7 +339,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
static int create_cache(struct zs_pool *pool)
{
- pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
+ pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
0, 0, NULL);
if (!pool->handle_cachep)
return 1;
@@ -349,10 +363,27 @@ static void destroy_cache(struct zs_pool *pool)
static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
{
- return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
- gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+ void *p;
+
+ p = kmem_cache_alloc(pool->handle_cachep,
+ gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+#ifdef CONFIG_PREEMPT_RT_FULL
+ if (p) {
+ struct zsmalloc_handle *zh = p;
+
+ mutex_init(&zh->lock);
+ }
+#endif
+ return (unsigned long)p;
}
+#ifdef CONFIG_PREEMPT_RT_FULL
+static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
+{
+ return (void *)(handle & ~((1 << OBJ_TAG_BITS) - 1));
+}
+#endif
+
static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
{
kmem_cache_free(pool->handle_cachep, (void *)handle);
@@ -371,12 +402,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
static void record_obj(unsigned long handle, unsigned long obj)
{
+#ifdef CONFIG_PREEMPT_RT_FULL
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ WRITE_ONCE(zh->addr, obj);
+#else
/*
* lsb of @obj represents handle lock while other bits
* represent object value the handle is pointing so
* updating shouldn't do store tearing.
*/
WRITE_ONCE(*(unsigned long *)handle, obj);
+#endif
}
/* zpool driver */
@@ -458,6 +495,7 @@ MODULE_ALIAS("zpool-zsmalloc");
/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
+static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
static bool is_zspage_isolated(struct zspage *zspage)
{
@@ -887,7 +925,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
static unsigned long handle_to_obj(unsigned long handle)
{
+#ifdef CONFIG_PREEMPT_RT_FULL
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ return zh->addr;
+#else
return *(unsigned long *)handle;
+#endif
}
static unsigned long obj_to_head(struct page *page, void *obj)
@@ -901,22 +945,46 @@ static unsigned long obj_to_head(struct page *page, void *obj)
static inline int testpin_tag(unsigned long handle)
{
+#ifdef CONFIG_PREEMPT_RT_FULL
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ return mutex_is_locked(&zh->lock);
+#else
return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif
}
static inline int trypin_tag(unsigned long handle)
{
+#ifdef CONFIG_PREEMPT_RT_FULL
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ return mutex_trylock(&zh->lock);
+#else
return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif
}
static void pin_tag(unsigned long handle)
{
+#ifdef CONFIG_PREEMPT_RT_FULL
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ return mutex_lock(&zh->lock);
+#else
bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif
}
static void unpin_tag(unsigned long handle)
{
+#ifdef CONFIG_PREEMPT_RT_FULL
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ return mutex_unlock(&zh->lock);
+#else
bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif
}
static void reset_page(struct page *page)
@@ -1342,7 +1410,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
class = pool->size_class[class_idx];
off = (class->size * obj_idx) & ~PAGE_MASK;
- area = &get_cpu_var(zs_map_area);
+ area = &get_locked_var(zs_map_area_lock, zs_map_area);
area->vm_mm = mm;
if (off + class->size <= PAGE_SIZE) {
/* this object is contained entirely within a page */
@@ -1396,7 +1464,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
__zs_unmap_object(area, pages, off, class->size);
}
- put_cpu_var(zs_map_area);
+ put_locked_var(zs_map_area_lock, zs_map_area);
migrate_read_unlock(zspage);
unpin_tag(handle);
@@ -2092,6 +2160,11 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
zs_pool_dec_isolated(pool);
}
+ if (page_zone(newpage) != page_zone(page)) {
+ dec_zone_page_state(page, NR_ZSPAGES);
+ inc_zone_page_state(newpage, NR_ZSPAGES);
+ }
+
reset_page(page);
put_page(page);
page = newpage;
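On PREEMPT_RT the zsmalloc hunks above turn a handle from a bare unsigned long, pinned via a bit spinlock in its low bit, into a small struct carrying the value plus a sleeping lock. A userspace sketch of that layout using a pthread mutex follows; all *_demo names are invented, and the unsigned-long/pointer round trip assumes an LP64 target.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct zsmalloc_handle_demo {
	unsigned long addr;		/* the value the old handle stored directly */
	pthread_mutex_t lock;		/* replaces the HANDLE_PIN_BIT spinlock */
};

static unsigned long cache_alloc_handle_demo(void)
{
	struct zsmalloc_handle_demo *zh = calloc(1, sizeof(*zh));

	if (!zh)
		return 0;
	pthread_mutex_init(&zh->lock, NULL);
	return (unsigned long)zh;
}

static void pin_tag_demo(unsigned long handle)
{
	pthread_mutex_lock(&((struct zsmalloc_handle_demo *)handle)->lock);
}

static void unpin_tag_demo(unsigned long handle)
{
	pthread_mutex_unlock(&((struct zsmalloc_handle_demo *)handle)->lock);
}

int main(void)
{
	unsigned long handle = cache_alloc_handle_demo();

	if (!handle)
		return 1;
	pin_tag_demo(handle);
	((struct zsmalloc_handle_demo *)handle)->addr = 0xdeadbeef;
	unpin_tag_demo(handle);
	printf("addr=%lx\n", ((struct zsmalloc_handle_demo *)handle)->addr);
	free((void *)handle);
	return 0;
}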
diff --git a/mm/zswap.c b/mm/zswap.c
index 2412042f5550..54d523914c1e 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -18,6 +18,7 @@
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/locallock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/frontswap.h>
@@ -981,6 +982,8 @@ static void zswap_fill_page(void *ptr, unsigned long value)
memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
}
+/* protect zswap_dstmem from concurrency */
+static DEFINE_LOCAL_IRQ_LOCK(zswap_dstmem_lock);
/*********************************
* frontswap hooks
**********************************/
@@ -1057,12 +1060,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
}
/* compress */
- dst = get_cpu_var(zswap_dstmem);
- tfm = *get_cpu_ptr(entry->pool->tfm);
+ dst = get_locked_var(zswap_dstmem_lock, zswap_dstmem);
+ tfm = *this_cpu_ptr(entry->pool->tfm);
src = kmap_atomic(page);
ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
kunmap_atomic(src);
- put_cpu_ptr(entry->pool->tfm);
if (ret) {
ret = -EINVAL;
goto put_dstmem;
@@ -1085,7 +1087,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
memcpy(buf, &zhdr, hlen);
memcpy(buf + hlen, dst, dlen);
zpool_unmap_handle(entry->pool->zpool, handle);
- put_cpu_var(zswap_dstmem);
+ put_locked_var(zswap_dstmem_lock, zswap_dstmem);
/* populate entry */
entry->offset = offset;
@@ -1113,7 +1115,7 @@ insert_entry:
return 0;
put_dstmem:
- put_cpu_var(zswap_dstmem);
+ put_locked_var(zswap_dstmem_lock, zswap_dstmem);
zswap_pool_put(entry->pool);
freepage:
zswap_entry_cache_free(entry);