aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/cma.c2
-rw-r--r--mm/compaction.c27
-rw-r--r--mm/filemap.c239
-rw-r--r--mm/frame_vector.c31
-rw-r--r--mm/frontswap.c2
-rw-r--r--mm/gup.c15
-rw-r--r--mm/huge_memory.c3
-rw-r--r--mm/hugetlb.c41
-rw-r--r--mm/kasan/report.c4
-rw-r--r--mm/khugepaged.c45
-rw-r--r--mm/kmemleak.c8
-rw-r--r--mm/ksm.c2
-rw-r--r--mm/madvise.c7
-rw-r--r--mm/memcontrol.c27
-rw-r--r--mm/memory-failure.c4
-rw-r--r--mm/memory.c8
-rw-r--r--mm/mempolicy.c9
-rw-r--r--mm/memtest.c4
-rw-r--r--mm/migrate.c11
-rw-r--r--mm/mmap.c26
-rw-r--r--mm/mmu_gather.c5
-rw-r--r--mm/mremap.c6
-rw-r--r--mm/page-writeback.c10
-rw-r--r--mm/page_alloc.c91
-rw-r--r--mm/pagewalk.c13
-rw-r--r--mm/percpu.c2
-rw-r--r--mm/readahead.c3
-rw-r--r--mm/rmap.c31
-rw-r--r--mm/shmem.c28
-rw-r--r--mm/slub.c13
-rw-r--r--mm/swap.c4
-rw-r--r--mm/swapfile.c16
-rw-r--r--mm/userfaultfd.c14
-rw-r--r--mm/util.c32
-rw-r--r--mm/vmscan.c5
-rw-r--r--mm/zsmalloc.c37
36 files changed, 608 insertions, 217 deletions
diff --git a/mm/cma.c b/mm/cma.c
index 7de520c0a1db..a9635a560094 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -481,7 +481,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
*/
if (page) {
for (i = 0; i < count; i++)
- page_kasan_tag_reset(page + i);
+ page_kasan_tag_reset(nth_page(page, i));
}
if (ret && !no_warn) {
diff --git a/mm/compaction.c b/mm/compaction.c
index d686887856fe..138edfa834b9 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1219,7 +1219,7 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage)
}
static void
-fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated)
+fast_isolate_around(struct compact_control *cc, unsigned long pfn)
{
unsigned long start_pfn, end_pfn;
struct page *page = pfn_to_page(pfn);
@@ -1236,21 +1236,13 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long
start_pfn = pageblock_start_pfn(pfn);
end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone)) - 1;
- /* Scan before */
- if (start_pfn != pfn) {
- isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, 1, false);
- if (cc->nr_freepages >= cc->nr_migratepages)
- return;
- }
-
- /* Scan after */
- start_pfn = pfn + nr_isolated;
- if (start_pfn < end_pfn)
- isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
+ isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
/* Skip this pageblock in the future as it's full or nearly full */
if (cc->nr_freepages < cc->nr_migratepages)
set_pageblock_skip(page);
+
+ return;
}
/* Search orders in round-robin fashion */
@@ -1422,7 +1414,7 @@ fast_isolate_freepages(struct compact_control *cc)
return cc->free_pfn;
low_pfn = page_to_pfn(page);
- fast_isolate_around(cc, low_pfn, nr_isolated);
+ fast_isolate_around(cc, low_pfn);
return low_pfn;
}
@@ -1709,6 +1701,8 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
update_fast_start_pfn(cc, free_pfn);
pfn = pageblock_start_pfn(free_pfn);
+ if (pfn < cc->zone->zone_start_pfn)
+ pfn = cc->zone->zone_start_pfn;
cc->fast_search_fail = 0;
found_block = true;
set_pageblock_skip(freepage);
@@ -2349,16 +2343,11 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio, struct page **capture)
{
- int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z;
struct zone *zone;
enum compact_result rc = COMPACT_SKIPPED;
- /*
- * Check if the GFP flags allow compaction - GFP_NOIO is really
- * tricky context because the migration might require IO
- */
- if (!may_perform_io)
+ if (!gfp_compaction_allowed(gfp_mask))
return COMPACT_SKIPPED;
trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
diff --git a/mm/filemap.c b/mm/filemap.c
index c89e9be5c6bd..7ec4fcd8a34d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1044,8 +1044,43 @@ struct wait_page_queue {
wait_queue_entry_t wait;
};
+/*
+ * The page wait code treats the "wait->flags" somewhat unusually, because
+ * we have multiple different kinds of waits, not just he usual "exclusive"
+ * one.
+ *
+ * We have:
+ *
+ * (a) no special bits set:
+ *
+ * We're just waiting for the bit to be released, and when a waker
+ * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
+ * and remove it from the wait queue.
+ *
+ * Simple and straightforward.
+ *
+ * (b) WQ_FLAG_EXCLUSIVE:
+ *
+ * The waiter is waiting to get the lock, and only one waiter should
+ * be woken up to avoid any thundering herd behavior. We'll set the
+ * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
+ *
+ * This is the traditional exclusive wait.
+ *
+ * (b) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
+ *
+ * The waiter is waiting to get the bit, and additionally wants the
+ * lock to be transferred to it for fair lock behavior. If the lock
+ * cannot be taken, we stop walking the wait queue without waking
+ * the waiter.
+ *
+ * This is the "fair lock handoff" case, and in addition to setting
+ * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
+ * that it now has the lock.
+ */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
+ unsigned int flags;
struct wait_page_key *key = arg;
struct wait_page_queue *wait_page
= container_of(wait, struct wait_page_queue, wait);
@@ -1058,17 +1093,44 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
return 0;
/*
- * Stop walking if it's locked.
- * Is this safe if put_and_wait_on_page_locked() is in use?
- * Yes: the waker must hold a reference to this page, and if PG_locked
- * has now already been set by another task, that task must also hold
- * a reference to the *same usage* of this page; so there is no need
- * to walk on to wake even the put_and_wait_on_page_locked() callers.
+ * If it's a lock handoff wait, we get the bit for it, and
+ * stop walking (and do not wake it up) if we can't.
*/
- if (test_bit(key->bit_nr, &key->page->flags))
- return -1;
+ flags = wait->flags;
+ if (flags & WQ_FLAG_EXCLUSIVE) {
+ if (test_bit(key->bit_nr, &key->page->flags))
+ return -1;
+ if (flags & WQ_FLAG_CUSTOM) {
+ if (test_and_set_bit(key->bit_nr, &key->page->flags))
+ return -1;
+ flags |= WQ_FLAG_DONE;
+ }
+ }
- return autoremove_wake_function(wait, mode, sync, key);
+ /*
+ * We are holding the wait-queue lock, but the waiter that
+ * is waiting for this will be checking the flags without
+ * any locking.
+ *
+ * So update the flags atomically, and wake up the waiter
+ * afterwards to avoid any races. This store-release pairs
+ * with the load-acquire in wait_on_page_bit_common().
+ */
+ smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
+ wake_up_state(wait->private, mode);
+
+ /*
+ * Ok, we have successfully done what we're waiting for,
+ * and we can unconditionally remove the wait entry.
+ *
+ * Note that this pairs with the "finish_wait()" in the
+ * waiter, and has to be the absolute last thing we do.
+ * After this list_del_init(&wait->entry) the wait entry
+ * might be de-allocated and the process might even have
+ * exited.
+ */
+ list_del_init_careful(&wait->entry);
+ return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}
static void wake_up_page_bit(struct page *page, int bit_nr)
@@ -1147,16 +1209,35 @@ enum behavior {
*/
};
+/*
+ * Attempt to check (or get) the page bit, and mark us done
+ * if successful.
+ */
+static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
+ struct wait_queue_entry *wait)
+{
+ if (wait->flags & WQ_FLAG_EXCLUSIVE) {
+ if (test_and_set_bit(bit_nr, &page->flags))
+ return false;
+ } else if (test_bit(bit_nr, &page->flags))
+ return false;
+
+ wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
+ return true;
+}
+
+/* How many times do we accept lock stealing from under a waiter? */
+int sysctl_page_lock_unfairness = 5;
+
static inline int wait_on_page_bit_common(wait_queue_head_t *q,
struct page *page, int bit_nr, int state, enum behavior behavior)
{
+ int unfairness = sysctl_page_lock_unfairness;
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
- bool bit_is_set;
bool thrashing = false;
bool delayacct = false;
unsigned long pflags;
- int ret = 0;
if (bit_nr == PG_locked &&
!PageUptodate(page) && PageWorkingset(page)) {
@@ -1169,55 +1250,97 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
}
init_wait(wait);
- wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0;
wait->func = wake_page_function;
wait_page.page = page;
wait_page.bit_nr = bit_nr;
- for (;;) {
- spin_lock_irq(&q->lock);
-
- if (likely(list_empty(&wait->entry))) {
- __add_wait_queue_entry_tail(q, wait);
- SetPageWaiters(page);
- }
+repeat:
+ wait->flags = 0;
+ if (behavior == EXCLUSIVE) {
+ wait->flags = WQ_FLAG_EXCLUSIVE;
+ if (--unfairness < 0)
+ wait->flags |= WQ_FLAG_CUSTOM;
+ }
- set_current_state(state);
+ /*
+ * Do one last check whether we can get the
+ * page bit synchronously.
+ *
+ * Do the SetPageWaiters() marking before that
+ * to let any waker we _just_ missed know they
+ * need to wake us up (otherwise they'll never
+ * even go to the slow case that looks at the
+ * page queue), and add ourselves to the wait
+ * queue if we need to sleep.
+ *
+ * This part needs to be done under the queue
+ * lock to avoid races.
+ */
+ spin_lock_irq(&q->lock);
+ SetPageWaiters(page);
+ if (!trylock_page_bit_common(page, bit_nr, wait))
+ __add_wait_queue_entry_tail(q, wait);
+ spin_unlock_irq(&q->lock);
- spin_unlock_irq(&q->lock);
+ /*
+ * From now on, all the logic will be based on
+ * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
+ * see whether the page bit testing has already
+ * been done by the wake function.
+ *
+ * We can drop our reference to the page.
+ */
+ if (behavior == DROP)
+ put_page(page);
- bit_is_set = test_bit(bit_nr, &page->flags);
- if (behavior == DROP)
- put_page(page);
+ /*
+ * Note that until the "finish_wait()", or until
+ * we see the WQ_FLAG_WOKEN flag, we need to
+ * be very careful with the 'wait->flags', because
+ * we may race with a waker that sets them.
+ */
+ for (;;) {
+ unsigned int flags;
- if (likely(bit_is_set))
- io_schedule();
+ set_current_state(state);
- if (behavior == EXCLUSIVE) {
- if (!test_and_set_bit_lock(bit_nr, &page->flags))
- break;
- } else if (behavior == SHARED) {
- if (!test_bit(bit_nr, &page->flags))
+ /* Loop until we've been woken or interrupted */
+ flags = smp_load_acquire(&wait->flags);
+ if (!(flags & WQ_FLAG_WOKEN)) {
+ if (signal_pending_state(state, current))
break;
+
+ io_schedule();
+ continue;
}
- if (signal_pending_state(state, current)) {
- ret = -EINTR;
+ /* If we were non-exclusive, we're done */
+ if (behavior != EXCLUSIVE)
break;
- }
- if (behavior == DROP) {
- /*
- * We can no longer safely access page->flags:
- * even if CONFIG_MEMORY_HOTREMOVE is not enabled,
- * there is a risk of waiting forever on a page reused
- * for something that keeps it locked indefinitely.
- * But best check for -EINTR above before breaking.
- */
+ /* If the waker got the lock for us, we're done */
+ if (flags & WQ_FLAG_DONE)
break;
- }
+
+ /*
+ * Otherwise, if we're getting the lock, we need to
+ * try to get it ourselves.
+ *
+ * And if that fails, we'll have to retry this all.
+ */
+ if (unlikely(test_and_set_bit(bit_nr, &page->flags)))
+ goto repeat;
+
+ wait->flags |= WQ_FLAG_DONE;
+ break;
}
+ /*
+ * If a signal happened, this 'finish_wait()' may remove the last
+ * waiter from the wait-queues, but the PageWaiters bit will remain
+ * set. That's ok. The next wakeup will take care of it, and trying
+ * to do it here would be difficult and prone to races.
+ */
finish_wait(q, wait);
if (thrashing) {
@@ -1227,14 +1350,22 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
}
/*
- * A signal could leave PageWaiters set. Clearing it here if
- * !waitqueue_active would be possible (by open-coding finish_wait),
- * but still fail to catch it in the case of wait hash collision. We
- * already can fail to clear wait hash collision cases, so don't
- * bother with signals either.
+ * NOTE! The wait->flags weren't stable until we've done the
+ * 'finish_wait()', and we could have exited the loop above due
+ * to a signal, and had a wakeup event happen after the signal
+ * test but before the 'finish_wait()'.
+ *
+ * So only after the finish_wait() can we reliably determine
+ * if we got woken up or not, so we can now figure out the final
+ * return value based on that state without races.
+ *
+ * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
+ * waiter, but an exclusive one requires WQ_FLAG_DONE.
*/
+ if (behavior == EXCLUSIVE)
+ return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;
- return ret;
+ return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}
void wait_on_page_bit(struct page *page, int bit_nr)
@@ -1355,11 +1486,19 @@ void end_page_writeback(struct page *page)
rotate_reclaimable_page(page);
}
+ /*
+ * Writeback does not hold a page reference of its own, relying
+ * on truncation to wait for the clearing of PG_writeback.
+ * But here we must make sure that the page is not freed and
+ * reused before the wake_up_page().
+ */
+ get_page(page);
if (!test_clear_page_writeback(page))
BUG();
smp_mb__after_atomic();
wake_up_page(page, PG_writeback);
+ put_page(page);
}
EXPORT_SYMBOL(end_page_writeback);
@@ -3281,7 +3420,7 @@ ssize_t generic_perform_write(struct file *file,
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
size_t copied; /* Bytes copied from user */
- void *fsdata;
+ void *fsdata = NULL;
offset = (pos & (PAGE_SIZE - 1));
bytes = min_t(unsigned long, PAGE_SIZE - offset,
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index c431ca81dad5..5779c1bc6f44 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -37,7 +37,6 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
int ret = 0;
- int err;
int locked;
if (nr_frames == 0)
@@ -74,32 +73,14 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
vec->is_pfns = false;
ret = get_user_pages_locked(start, nr_frames,
gup_flags, (struct page **)(vec->ptrs), &locked);
- goto out;
+ if (likely(ret > 0))
+ goto out;
}
- vec->got_ref = false;
- vec->is_pfns = true;
- do {
- unsigned long *nums = frame_vector_pfns(vec);
-
- while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) {
- err = follow_pfn(vma, start, &nums[ret]);
- if (err) {
- if (ret == 0)
- ret = err;
- goto out;
- }
- start += PAGE_SIZE;
- ret++;
- }
- /*
- * We stop if we have enough pages or if VMA doesn't completely
- * cover the tail page.
- */
- if (ret >= nr_frames || start < vma->vm_end)
- break;
- vma = find_vma_intersection(mm, start, start + 1);
- } while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP));
+ /* This used to (racily) return non-refcounted pfns. Let people know */
+ WARN_ONCE(1, "get_vaddr_frames() cannot follow VM_IO mapping");
+ vec->nr_frames = 0;
+
out:
if (locked)
up_read(&mm->mmap_sem);
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 60bb20e8a951..fd3337702513 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -446,7 +446,7 @@ static int __frontswap_shrink(unsigned long target_pages,
void frontswap_shrink(unsigned long target_pages)
{
unsigned long pages_to_unuse = 0;
- int uninitialized_var(type), ret;
+ int type, ret;
/*
* we don't want to hold swap_lock while doing a very
diff --git a/mm/gup.c b/mm/gup.c
index 3ef769529548..2cff8b4d412c 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -188,6 +188,17 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
spinlock_t *ptl;
pte_t *ptep, pte;
+ /*
+ * Considering PTE level hugetlb, like continuous-PTE hugetlb on
+ * ARM64 architecture.
+ */
+ if (is_vm_hugetlb_page(vma)) {
+ page = follow_huge_pmd_pte(vma, address, flags);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+
retry:
if (unlikely(pmd_bad(*pmd)))
return no_page_table(vma, flags);
@@ -333,7 +344,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
if (pmd_none(pmdval))
return no_page_table(vma, flags);
if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) {
- page = follow_huge_pmd(mm, address, pmd, flags);
+ page = follow_huge_pmd_pte(vma, address, flags);
if (page)
return page;
return no_page_table(vma, flags);
@@ -2240,7 +2251,7 @@ static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned lo
next = pud_addr_end(addr, end);
if (pud_none(pud))
return 0;
- if (unlikely(pud_huge(pud))) {
+ if (unlikely(pud_huge(pud) || pud_devmap(pud))) {
if (!gup_huge_pud(pud, pudp, addr, next, flags,
pages, nr))
return 0;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e50799d7002e..03b57323c53b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2912,6 +2912,9 @@ void deferred_split_huge_page(struct page *page)
if (PageSwapCache(page))
return;
+ if (!list_empty(page_deferred_list(page)))
+ return;
+
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (list_empty(page_deferred_list(page))) {
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 20da6ede7704..e4478b62d0f7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2218,11 +2218,11 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
if (!page)
goto out_uncharge_cgroup;
+ spin_lock(&hugetlb_lock);
if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
SetPagePrivate(page);
h->resv_huge_pages--;
}
- spin_lock(&hugetlb_lock);
list_move(&page->lru, &h->hugepage_activelist);
/* Fall through */
}
@@ -5033,7 +5033,14 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
pud_clear(pud);
put_page(virt_to_page(ptep));
mm_dec_nr_pmds(mm);
- *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+ /*
+ * This update of passed address optimizes loops sequentially
+ * processing addresses in increments of huge page size (PMD_SIZE
+ * in this case). By clearing the pud, a PUD_SIZE area is unmapped.
+ * Update address to the 'last page' in the cleared area so that
+ * calling loop can move to first page past this area.
+ */
+ *addr |= PUD_SIZE - PMD_SIZE;
return 1;
}
#define want_pmd_share() (1)
@@ -5150,30 +5157,30 @@ follow_huge_pd(struct vm_area_struct *vma,
}
struct page * __weak
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd, int flags)
+follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags)
{
+ struct hstate *h = hstate_vma(vma);
+ struct mm_struct *mm = vma->vm_mm;
struct page *page = NULL;
spinlock_t *ptl;
- pte_t pte;
+ pte_t *ptep, pte;
+
retry:
- ptl = pmd_lockptr(mm, pmd);
- spin_lock(ptl);
- /*
- * make sure that the address range covered by this pmd is not
- * unmapped from other threads.
- */
- if (!pmd_huge(*pmd))
- goto out;
- pte = huge_ptep_get((pte_t *)pmd);
+ ptep = huge_pte_offset(mm, address, huge_page_size(h));
+ if (!ptep)
+ return NULL;
+
+ ptl = huge_pte_lock(h, mm, ptep);
+ pte = huge_ptep_get(ptep);
if (pte_present(pte)) {
- page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
+ page = pte_page(pte) +
+ ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
if (flags & FOLL_GET)
get_page(page);
} else {
if (is_hugetlb_entry_migration(pte)) {
spin_unlock(ptl);
- __migration_entry_wait(mm, (pte_t *)pmd, ptl);
+ __migration_entry_wait(mm, ptep, ptl);
goto retry;
}
/*
@@ -5181,7 +5188,7 @@ retry:
* follow_page_mask().
*/
}
-out:
+
spin_unlock(ptl);
return page;
}
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 621782100eaa..4d87df96acc1 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -92,8 +92,8 @@ static void end_report(unsigned long *flags)
pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, *flags);
- if (panic_on_warn)
- panic("panic_on_warn set ...\n");
+ if (!test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
+ check_panic_on_warn("KASAN");
kasan_enable_current();
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 3c2326568193..f1f98305433e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1060,6 +1060,7 @@ static void collapse_huge_page(struct mm_struct *mm,
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(&range);
+ tlb_remove_table_sync_one();
spin_lock(pte_ptl);
isolated = __collapse_huge_page_isolate(vma, address, pte);
@@ -1312,6 +1313,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
spinlock_t *ptl;
int count = 0;
int i;
+ struct mmu_notifier_range range;
if (!vma || !vma->vm_file ||
vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
@@ -1338,6 +1340,19 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
if (!pmd)
goto drop_hpage;
+ /*
+ * We need to lock the mapping so that from here on, only GUP-fast and
+ * hardware page walks can access the parts of the page tables that
+ * we're operating on.
+ */
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+
+ /*
+ * This spinlock should be unnecessary: Nobody else should be accessing
+ * the page tables under spinlock protection here, only
+ * lockless_pages_from_mm() and the hardware page walker can access page
+ * tables while all the high-level locks are held in write mode.
+ */
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
/* step 1: check all mapped PTEs are to the right huge page */
@@ -1384,12 +1399,23 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
}
/* step 4: collapse pmd */
- ptl = pmd_lock(vma->vm_mm, pmd);
+ /* we make no change to anon, but protect concurrent anon page lookup */
+ if (vma->anon_vma)
+ anon_vma_lock_write(vma->anon_vma);
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, haddr,
+ haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
_pmd = pmdp_collapse_flush(vma, haddr, pmd);
- spin_unlock(ptl);
mm_dec_nr_ptes(mm);
+ tlb_remove_table_sync_one();
+ mmu_notifier_invalidate_range_end(&range);
pte_free(mm, pmd_pgtable(_pmd));
+ if (vma->anon_vma)
+ anon_vma_unlock_write(vma->anon_vma);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+
drop_hpage:
unlock_page(hpage);
put_page(hpage);
@@ -1397,6 +1423,7 @@ drop_hpage:
abort:
pte_unmap_unlock(start_pte, ptl);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
goto drop_hpage;
}
@@ -1446,7 +1473,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* An alternative would be drop the check, but check that page
* table is clear before calling pmdp_collapse_flush() under
* ptl. It has higher chance to recover THP for the VMA, but
- * has higher cost too.
+ * has higher cost too. It would also probably require locking
+ * the anon_vma.
*/
if (vma->anon_vma)
continue;
@@ -1468,12 +1496,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
*/
if (down_write_trylock(&mm->mmap_sem)) {
if (!khugepaged_test_exit(mm)) {
- spinlock_t *ptl = pmd_lock(mm, pmd);
+ struct mmu_notifier_range range;
+
+ mmu_notifier_range_init(&range,
+ MMU_NOTIFY_CLEAR, 0,
+ NULL, mm, addr,
+ addr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
/* assume page table is clear */
_pmd = pmdp_collapse_flush(vma, addr, pmd);
- spin_unlock(ptl);
mm_dec_nr_ptes(mm);
+ tlb_remove_table_sync_one();
pte_free(mm, pmd_pgtable(_pmd));
+ mmu_notifier_invalidate_range_end(&range);
}
up_write(&mm->mmap_sem);
} else {
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 3761c79137b1..d8cde7292bf9 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1123,7 +1123,7 @@ EXPORT_SYMBOL(kmemleak_no_scan);
void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count,
gfp_t gfp)
{
- if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
+ if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
kmemleak_alloc(__va(phys), size, min_count, gfp);
}
EXPORT_SYMBOL(kmemleak_alloc_phys);
@@ -1137,7 +1137,7 @@ EXPORT_SYMBOL(kmemleak_alloc_phys);
*/
void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size)
{
- if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
+ if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
kmemleak_free_part(__va(phys), size);
}
EXPORT_SYMBOL(kmemleak_free_part_phys);
@@ -1149,7 +1149,7 @@ EXPORT_SYMBOL(kmemleak_free_part_phys);
*/
void __ref kmemleak_not_leak_phys(phys_addr_t phys)
{
- if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
+ if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
kmemleak_not_leak(__va(phys));
}
EXPORT_SYMBOL(kmemleak_not_leak_phys);
@@ -1161,7 +1161,7 @@ EXPORT_SYMBOL(kmemleak_not_leak_phys);
*/
void __ref kmemleak_ignore_phys(phys_addr_t phys)
{
- if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
+ if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
kmemleak_ignore(__va(phys));
}
EXPORT_SYMBOL(kmemleak_ignore_phys);
diff --git a/mm/ksm.c b/mm/ksm.c
index 0bbae78aaaa0..93e1898cd705 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2388,7 +2388,7 @@ next_mm:
static void ksm_do_scan(unsigned int scan_npages)
{
struct rmap_item *rmap_item;
- struct page *uninitialized_var(page);
+ struct page *page;
while (scan_npages-- && likely(!freezing(current))) {
cond_resched();
diff --git a/mm/madvise.c b/mm/madvise.c
index 1107e99e498b..ac8d68c488b5 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -428,8 +428,11 @@ regular_page:
continue;
}
- /* Do not interfere with other mappings of this page */
- if (page_mapcount(page) != 1)
+ /*
+ * Do not interfere with other mappings of this page and
+ * non-LRU page.
+ */
+ if (!PageLRU(page) || page_mapcount(page) != 1)
continue;
VM_BUG_ON_PAGE(PageTransCompound(page), page);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8fc663545498..0cc7ee52d6bd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1044,7 +1044,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
struct mem_cgroup_reclaim_cookie *reclaim)
{
- struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
+ struct mem_cgroup_reclaim_iter *iter;
struct cgroup_subsys_state *css = NULL;
struct mem_cgroup *memcg = NULL;
struct mem_cgroup *pos = NULL;
@@ -2214,6 +2214,9 @@ static void drain_stock(struct memcg_stock_pcp *stock)
{
struct mem_cgroup *old = stock->cached;
+ if (!old)
+ return;
+
if (stock->nr_pages) {
page_counter_uncharge(&old->memory, stock->nr_pages);
if (do_memsw_account())
@@ -2221,6 +2224,8 @@ static void drain_stock(struct memcg_stock_pcp *stock)
css_put_many(&old->css, stock->nr_pages);
stock->nr_pages = 0;
}
+
+ css_put(&old->css);
stock->cached = NULL;
}
@@ -2256,6 +2261,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
stock = this_cpu_ptr(&memcg_stock);
if (stock->cached != memcg) { /* reset if necessary */
drain_stock(stock);
+ css_get(&memcg->css);
stock->cached = memcg;
}
stock->nr_pages += nr_pages;
@@ -3775,6 +3781,10 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
+
if (val & ~MOVE_MASK)
return -EINVAL;
@@ -4709,6 +4719,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
unsigned int efd, cfd;
struct fd efile;
struct fd cfile;
+ struct dentry *cdentry;
const char *name;
char *endp;
int ret;
@@ -4760,6 +4771,16 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
goto out_put_cfile;
/*
+ * The control file must be a regular cgroup1 file. As a regular cgroup
+ * file can't be renamed, it's safe to access its name afterwards.
+ */
+ cdentry = cfile.file->f_path.dentry;
+ if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
+ ret = -EINVAL;
+ goto out_put_cfile;
+ }
+
+ /*
* Determine the event callbacks and set them in @event. This used
* to be done via struct cftype but cgroup core no longer knows
* about these events. The following is crude but the whole thing
@@ -4767,7 +4788,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
*
* DO NOT ADD NEW FILES.
*/
- name = cfile.file->f_path.dentry->d_name.name;
+ name = cdentry->d_name.name;
if (!strcmp(name, "memory.usage_in_bytes")) {
event->register_event = mem_cgroup_usage_register_event;
@@ -4791,7 +4812,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
* automatically removed on cgroup destruction but the removal is
* asynchronous, so take an extra ref on @css.
*/
- cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
+ cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
&memory_cgrp_subsys);
ret = -EINVAL;
if (IS_ERR(cfile_css))
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9030ab0d9d97..0e7566c25939 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -986,7 +986,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* This check implies we don't kill processes if their pages
* are in the swap cache early. Those are always late kills.
*/
- if (!page_mapped(hpage))
+ if (!page_mapped(p))
return true;
if (PageKsm(p)) {
@@ -1033,7 +1033,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
unmap_success = try_to_unmap(hpage, ttu);
if (!unmap_success)
pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(hpage));
+ pfn, page_mapcount(p));
/*
* try_to_unmap() might put mlocked page in lru cache, so call
diff --git a/mm/memory.c b/mm/memory.c
index d416e329442d..f8d76c66311d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2872,8 +2872,8 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows)
{
- pgoff_t hba = holebegin >> PAGE_SHIFT;
- pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT;
+ pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT;
/* Check for overflow. */
if (sizeof(holelen) > sizeof(hlen)) {
@@ -4428,6 +4428,10 @@ int follow_phys(struct vm_area_struct *vma,
goto out;
pte = *ptep;
+ /* Never return PFNs of anon folios in COW mappings. */
+ if (vm_normal_page(vma, address, pte))
+ goto unlock;
+
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d79ab5116a7b..2bf4ab7b2713 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -348,7 +348,7 @@ static void mpol_rebind_preferred(struct mempolicy *pol,
*/
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
- if (!pol)
+ if (!pol || pol->mode == MPOL_LOCAL)
return;
if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
@@ -571,7 +571,8 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
goto unlock;
/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
if (flags & (MPOL_MF_MOVE_ALL) ||
- (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
+ (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 &&
+ !hugetlb_pmd_shared(pte)))
isolate_huge_page(page, qp->pagelist);
unlock:
spin_unlock(ptl);
@@ -1159,7 +1160,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
static struct page *new_page(struct page *page, unsigned long start)
{
struct vm_area_struct *vma;
- unsigned long uninitialized_var(address);
+ unsigned long address;
vma = find_vma(current->mm, start);
while (vma) {
@@ -1554,7 +1555,7 @@ static int kernel_get_mempolicy(int __user *policy,
unsigned long flags)
{
int err;
- int uninitialized_var(pval);
+ int pval;
nodemask_t nodes;
addr = untagged_addr(addr);
diff --git a/mm/memtest.c b/mm/memtest.c
index f53ace709ccd..d407373f225b 100644
--- a/mm/memtest.c
+++ b/mm/memtest.c
@@ -46,10 +46,10 @@ static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size
last_bad = 0;
for (p = start; p < end; p++)
- *p = pattern;
+ WRITE_ONCE(*p, pattern);
for (p = start; p < end; p++, start_phys_aligned += incr) {
- if (*p == pattern)
+ if (READ_ONCE(*p) == pattern)
continue;
if (start_phys_aligned == last_bad + incr) {
last_bad += incr;
diff --git a/mm/migrate.c b/mm/migrate.c
index 6948d6ec0fd0..9cfd53eaeb4e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -441,8 +441,12 @@ int migrate_page_move_mapping(struct address_space *mapping,
if (PageSwapBacked(page)) {
__SetPageSwapBacked(newpage);
if (PageSwapCache(page)) {
+ int i;
+
SetPageSwapCache(newpage);
- set_page_private(newpage, page_private(page));
+ for (i = 0; i < (1 << compound_order(page)); i++)
+ set_page_private(newpage + i,
+ page_private(page + i));
}
} else {
VM_BUG_ON_PAGE(PageSwapCache(page), page);
@@ -2343,13 +2347,14 @@ next:
migrate->dst[migrate->npages] = 0;
migrate->src[migrate->npages++] = mpfn;
}
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(ptep - 1, ptl);
/* Only flush the TLB if we actually modified any entries */
if (unmapped)
flush_tlb_range(walk->vma, start, end);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep - 1, ptl);
+
return 0;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index f30f2a49ff2f..dad2ca2d27fc 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1679,8 +1679,12 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
return 0;
- /* Do we need to track softdirty? */
- if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY))
+ /*
+ * Do we need to track softdirty? hugetlb does not support softdirty
+ * tracking yet.
+ */
+ if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY) &&
+ !is_vm_hugetlb_page(vma))
return 1;
/* Specialty mapping? */
@@ -1860,7 +1864,6 @@ unmap_and_free_vma:
/* Undo any partial mapping done by a device driver. */
unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
- charged = 0;
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
allow_write_and_free_vma:
@@ -2602,11 +2605,28 @@ static void unmap_region(struct mm_struct *mm,
{
struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
struct mmu_gather tlb;
+ struct vm_area_struct *cur_vma;
lru_add_drain();
tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end);
+
+ /*
+ * Ensure we have no stale TLB entries by the time this mapping is
+ * removed from the rmap.
+ * Note that we don't have to worry about nested flushes here because
+ * we're holding the mm semaphore for removing the mapping - so any
+ * concurrent flush in this region has to be coming through the rmap,
+ * and we synchronize against that using the rmap lock.
+ */
+ for (cur_vma = vma; cur_vma; cur_vma = cur_vma->vm_next) {
+ if ((cur_vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) != 0) {
+ tlb_flush_mmu(&tlb);
+ break;
+ }
+ }
+
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, start, end);
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 7c1b8f67af7b..341aa036b03c 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -117,6 +117,11 @@ static void tlb_remove_table_smp_sync(void *arg)
/* Simply deliver the interrupt */
}
+void tlb_remove_table_sync_one(void)
+{
+ smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+}
+
static void tlb_remove_table_one(void *table)
{
/*
diff --git a/mm/mremap.c b/mm/mremap.c
index 8ce1b7632fbb..f6b8c009dd05 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -293,12 +293,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
*/
bool moved;
- if (need_rmap_locks)
- take_rmap_locks(vma);
+ take_rmap_locks(vma);
moved = move_normal_pmd(vma, old_addr, new_addr,
old_end, old_pmd, new_pmd);
- if (need_rmap_locks)
- drop_rmap_locks(vma);
+ drop_rmap_locks(vma);
if (moved)
continue;
#endif
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2d658b208319..fdebfaf1873c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1530,7 +1530,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
*/
dtc->wb_thresh = __wb_calc_thresh(dtc);
dtc->wb_bg_thresh = dtc->thresh ?
- div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
+ div64_u64(dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
/*
* In order to avoid the stacked BDI deadlock we need
@@ -2746,12 +2746,6 @@ int test_clear_page_writeback(struct page *page)
} else {
ret = TestClearPageWriteback(page);
}
- /*
- * NOTE: Page might be free now! Writeback doesn't hold a page
- * reference on its own, it relies on truncation to wait for
- * the clearing of PG_writeback. The below can only access
- * page state that is static across allocation cycles.
- */
if (ret) {
dec_lruvec_state(lruvec, NR_WRITEBACK);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
@@ -2817,7 +2811,7 @@ EXPORT_SYMBOL(__test_set_page_writeback);
*/
void wait_on_page_writeback(struct page *page)
{
- if (PageWriteback(page)) {
+ while (PageWriteback(page)) {
trace_wait_on_page_writeback(page, page_mapping(page));
wait_on_page_bit(page, PG_writeback);
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f08ce248af2a..0ad582945f54 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4123,6 +4123,30 @@ void fs_reclaim_release(gfp_t gfp_mask)
EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif
+/*
+ * Zonelists may change due to hotplug during allocation. Detect when zonelists
+ * have been rebuilt so allocation retries. Reader side does not lock and
+ * retries the allocation if zonelist changes. Writer side is protected by the
+ * embedded spin_lock.
+ */
+static DEFINE_SEQLOCK(zonelist_update_seq);
+
+static unsigned int zonelist_iter_begin(void)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return read_seqbegin(&zonelist_update_seq);
+
+ return 0;
+}
+
+static unsigned int check_retry_zonelist(unsigned int seq)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return read_seqretry(&zonelist_update_seq, seq);
+
+ return seq;
+}
+
/* Perform direct synchronous page reclaim */
static int
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -4421,6 +4445,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
+ bool can_compact = gfp_compaction_allowed(gfp_mask);
const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
struct page *page = NULL;
unsigned int alloc_flags;
@@ -4430,6 +4455,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
int compaction_retries;
int no_progress_loops;
unsigned int cpuset_mems_cookie;
+ unsigned int zonelist_iter_cookie;
int reserve_flags;
/*
@@ -4440,11 +4466,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
-retry_cpuset:
+restart:
compaction_retries = 0;
no_progress_loops = 0;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist_iter_cookie = zonelist_iter_begin();
/*
* The fast path uses conservative alloc_flags to succeed only until
@@ -4484,7 +4511,7 @@ retry_cpuset:
* Don't try this for allocations that are allowed to ignore
* watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
*/
- if (can_direct_reclaim &&
+ if (can_direct_reclaim && can_compact &&
(costly_order ||
(order > 0 && ac->migratetype != MIGRATE_MOVABLE))
&& !gfp_pfmemalloc_allowed(gfp_mask)) {
@@ -4595,9 +4622,10 @@ retry:
/*
* Do not retry costly high order allocations unless they are
- * __GFP_RETRY_MAYFAIL
+ * __GFP_RETRY_MAYFAIL and we can compact
*/
- if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
+ if (costly_order && (!can_compact ||
+ !(gfp_mask & __GFP_RETRY_MAYFAIL)))
goto nopage;
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
@@ -4610,16 +4638,20 @@ retry:
* implementation of the compaction depends on the sufficient amount
* of free memory (see __compaction_suitable)
*/
- if (did_some_progress > 0 &&
+ if (did_some_progress > 0 && can_compact &&
should_compact_retry(ac, order, alloc_flags,
compact_result, &compact_priority,
&compaction_retries))
goto retry;
- /* Deal with possible cpuset update races before we start OOM killing */
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
- goto retry_cpuset;
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * a unnecessary OOM kill.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
@@ -4639,9 +4671,13 @@ retry:
}
nopage:
- /* Deal with possible cpuset update races before we fail */
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
- goto retry_cpuset;
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * a unnecessary OOM kill.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
/*
* Make sure that __GFP_NOFAIL request doesn't leak out and make sure
@@ -4945,6 +4981,18 @@ refill:
/* reset page count bias and offset to start of new frag */
nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
offset = size - fragsz;
+ if (unlikely(offset < 0)) {
+ /*
+ * The caller is trying to allocate a fragment
+ * with fragsz > PAGE_SIZE but the cache isn't big
+ * enough to satisfy the request, this may
+ * happen in low memory conditions.
+ * We don't release the cache page because
+ * it could make memory pressure worse
+ * so we simply return NULL here.
+ */
+ return NULL;
+ }
}
nc->pagecnt_bias--;
@@ -5770,9 +5818,22 @@ static void __build_all_zonelists(void *data)
int nid;
int __maybe_unused cpu;
pg_data_t *self = data;
- static DEFINE_SPINLOCK(lock);
+ unsigned long flags;
- spin_lock(&lock);
+ /*
+ * Explicitly disable this CPU's interrupts before taking seqlock
+ * to prevent any IRQ handler from calling into the page allocator
+ * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock.
+ */
+ local_irq_save(flags);
+ /*
+ * Explicitly disable this CPU's synchronous printk() before taking
+ * seqlock to prevent any printk() from trying to hold port->lock, for
+ * tty_insert_flip_string_and_push_buffer() on other CPU might be
+ * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
+ */
+ printk_deferred_enter();
+ write_seqlock(&zonelist_update_seq);
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
@@ -5805,7 +5866,9 @@ static void __build_all_zonelists(void *data)
#endif
}
- spin_unlock(&lock);
+ write_sequnlock(&zonelist_update_seq);
+ printk_deferred_exit();
+ local_irq_restore(flags);
}
static noinline void __init
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 4eb09e089881..ec41e7552f37 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -38,7 +38,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
do {
again:
next = pmd_addr_end(addr, end);
- if (pmd_none(*pmd) || !walk->vma) {
+ if (pmd_none(*pmd)) {
if (ops->pte_hole)
err = ops->pte_hole(addr, next, walk);
if (err)
@@ -84,7 +84,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
do {
again:
next = pud_addr_end(addr, end);
- if (pud_none(*pud) || !walk->vma) {
+ if (pud_none(*pud)) {
if (ops->pte_hole)
err = ops->pte_hole(addr, next, walk);
if (err)
@@ -254,7 +254,7 @@ static int __walk_page_range(unsigned long start, unsigned long end,
int err = 0;
struct vm_area_struct *vma = walk->vma;
- if (vma && is_vm_hugetlb_page(vma)) {
+ if (is_vm_hugetlb_page(vma)) {
if (walk->ops->hugetlb_entry)
err = walk_hugetlb_range(start, end, walk);
} else
@@ -324,9 +324,13 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
if (!vma) { /* after the last vma */
walk.vma = NULL;
next = end;
+ if (ops->pte_hole)
+ err = ops->pte_hole(start, next, &walk);
} else if (start < vma->vm_start) { /* outside vma */
walk.vma = NULL;
next = min(end, vma->vm_start);
+ if (ops->pte_hole)
+ err = ops->pte_hole(start, next, &walk);
} else { /* inside vma */
walk.vma = vma;
next = min(end, vma->vm_end);
@@ -344,9 +348,8 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
}
if (err < 0)
break;
- }
- if (walk.vma || walk.ops->pte_hole)
err = __walk_page_range(start, next, &walk);
+ }
if (err)
break;
} while (start = next, start < end);
diff --git a/mm/percpu.c b/mm/percpu.c
index 806bc16f88eb..2e4b286c88c8 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -2536,7 +2536,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
const size_t static_size = __per_cpu_end - __per_cpu_start;
int nr_groups = 1, nr_units = 0;
size_t size_sum, min_unit_size, alloc_size;
- int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
+ int upa, max_upa, best_upa; /* units_per_alloc */
int last_allocs, group, unit;
unsigned int cpu, tcpu;
struct pcpu_alloc_info *ai;
diff --git a/mm/readahead.c b/mm/readahead.c
index 2fe72cd29b47..95fbc02cae6a 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -592,7 +592,8 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
*/
ret = -EINVAL;
if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
- !S_ISREG(file_inode(f.file)->i_mode))
+ (!S_ISREG(file_inode(f.file)->i_mode) &&
+ !S_ISBLK(file_inode(f.file)->i_mode)))
goto out;
ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
diff --git a/mm/rmap.c b/mm/rmap.c
index 6d80e92688fe..c64da910bb73 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -83,7 +83,8 @@ static inline struct anon_vma *anon_vma_alloc(void)
anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
if (anon_vma) {
atomic_set(&anon_vma->refcount, 1);
- anon_vma->degree = 1; /* Reference for first vma */
+ anon_vma->num_children = 0;
+ anon_vma->num_active_vmas = 0;
anon_vma->parent = anon_vma;
/*
* Initialise the anon_vma root to point to itself. If called
@@ -191,6 +192,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
anon_vma = anon_vma_alloc();
if (unlikely(!anon_vma))
goto out_enomem_free_avc;
+ anon_vma->num_children++; /* self-parent link for new root */
allocated = anon_vma;
}
@@ -200,8 +202,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
anon_vma_chain_link(vma, avc, anon_vma);
- /* vma reference or self-parent link for new root */
- anon_vma->degree++;
+ anon_vma->num_active_vmas++;
allocated = NULL;
avc = NULL;
}
@@ -280,19 +281,19 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
anon_vma_chain_link(dst, avc, anon_vma);
/*
- * Reuse existing anon_vma if its degree lower than two,
- * that means it has no vma and only one anon_vma child.
+ * Reuse existing anon_vma if it has no vma and only one
+ * anon_vma child.
*
- * Do not chose parent anon_vma, otherwise first child
- * will always reuse it. Root anon_vma is never reused:
+ * Root anon_vma is never reused:
* it has self-parent reference and at least one child.
*/
- if (!dst->anon_vma && anon_vma != src->anon_vma &&
- anon_vma->degree < 2)
+ if (!dst->anon_vma &&
+ anon_vma->num_children < 2 &&
+ anon_vma->num_active_vmas == 0)
dst->anon_vma = anon_vma;
}
if (dst->anon_vma)
- dst->anon_vma->degree++;
+ dst->anon_vma->num_active_vmas++;
unlock_anon_vma_root(root);
return 0;
@@ -342,6 +343,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
anon_vma = anon_vma_alloc();
if (!anon_vma)
goto out_error;
+ anon_vma->num_active_vmas++;
avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto out_error_free_anon_vma;
@@ -362,7 +364,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
vma->anon_vma = anon_vma;
anon_vma_lock_write(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma);
- anon_vma->parent->degree++;
+ anon_vma->parent->num_children++;
anon_vma_unlock_write(anon_vma);
return 0;
@@ -394,7 +396,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
* to free them outside the lock.
*/
if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
- anon_vma->parent->degree--;
+ anon_vma->parent->num_children--;
continue;
}
@@ -402,7 +404,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
anon_vma_chain_free(avc);
}
if (vma->anon_vma)
- vma->anon_vma->degree--;
+ vma->anon_vma->num_active_vmas--;
unlock_anon_vma_root(root);
/*
@@ -413,7 +415,8 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
- VM_WARN_ON(anon_vma->degree);
+ VM_WARN_ON(anon_vma->num_children);
+ VM_WARN_ON(anon_vma->num_active_vmas);
put_anon_vma(anon_vma);
list_del(&avc->same_vma);
diff --git a/mm/shmem.c b/mm/shmem.c
index aae2f408f905..264229680ad7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3417,6 +3417,8 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
unsigned long long size;
char *rest;
int opt;
+ kuid_t kuid;
+ kgid_t kgid;
opt = fs_parse(fc, &shmem_fs_parameters, param, &result);
if (opt < 0)
@@ -3452,14 +3454,32 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
ctx->mode = result.uint_32 & 07777;
break;
case Opt_uid:
- ctx->uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(ctx->uid))
+ kuid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(kuid))
goto bad_value;
+
+ /*
+ * The requested uid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kuid_has_mapping(fc->user_ns, kuid))
+ goto bad_value;
+
+ ctx->uid = kuid;
break;
case Opt_gid:
- ctx->gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(ctx->gid))
+ kgid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(kgid))
goto bad_value;
+
+ /*
+ * The requested gid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kgid_has_mapping(fc->user_ns, kgid))
+ goto bad_value;
+
+ ctx->gid = kgid;
break;
case Opt_huge:
ctx->huge = result.uint_32;
diff --git a/mm/slub.c b/mm/slub.c
index 63fe43c8d332..e978f647e92a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1198,7 +1198,7 @@ static noinline int free_debug_processing(
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
void *object = head;
int cnt = 0;
- unsigned long uninitialized_var(flags);
+ unsigned long flags;
int ret = 0;
spin_lock_irqsave(&n->list_lock, flags);
@@ -2214,6 +2214,7 @@ redo:
c->page = NULL;
c->freelist = NULL;
+ c->tid = next_tid(c->tid);
}
/*
@@ -2347,8 +2348,6 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
stat(s, CPUSLAB_FLUSH);
deactivate_slab(s, c->page, c->freelist, c);
-
- c->tid = next_tid(c->tid);
}
/*
@@ -2632,6 +2631,7 @@ redo:
if (!freelist) {
c->page = NULL;
+ c->tid = next_tid(c->tid);
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
}
@@ -2888,7 +2888,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
struct page new;
unsigned long counters;
struct kmem_cache_node *n = NULL;
- unsigned long uninitialized_var(flags);
+ unsigned long flags;
stat(s, FREE_SLOWPATH);
@@ -5743,7 +5743,8 @@ static char *create_unique_id(struct kmem_cache *s)
char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
char *p = name;
- BUG_ON(!name);
+ if (!name)
+ return ERR_PTR(-ENOMEM);
*p++ = ':';
/*
@@ -5825,6 +5826,8 @@ static int sysfs_slab_add(struct kmem_cache *s)
* for the symlinks.
*/
name = create_unique_id(s);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
}
s->kobj.kset = kset;
diff --git a/mm/swap.c b/mm/swap.c
index 38c3fa4308e2..1c5021b57e2c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -763,8 +763,8 @@ void release_pages(struct page **pages, int nr)
LIST_HEAD(pages_to_free);
struct pglist_data *locked_pgdat = NULL;
struct lruvec *lruvec;
- unsigned long uninitialized_var(flags);
- unsigned int uninitialized_var(lock_batch);
+ unsigned long flags;
+ unsigned int lock_batch;
for (i = 0; i < nr; i++) {
struct page *page = pages[i];
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f6964212c6c8..5e444be061a8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -672,6 +672,7 @@ static void __del_from_avail_list(struct swap_info_struct *p)
{
int nid;
+ assert_spin_locked(&p->lock);
for_each_node(nid)
plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
}
@@ -1061,6 +1062,7 @@ start_over:
goto check_out;
pr_debug("scan_swap_map of si %d failed to find offset\n",
si->type);
+ cond_resched();
spin_lock(&swap_avail_lock);
nextsi:
@@ -1950,10 +1952,14 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pte_unmap(pte);
swap_map = &si->swap_map[offset];
- vmf.vma = vma;
- vmf.address = addr;
- vmf.pmd = pmd;
- page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf);
+ page = lookup_swap_cache(entry, vma, addr);
+ if (!page) {
+ vmf.vma = vma;
+ vmf.address = addr;
+ vmf.pmd = pmd;
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+ &vmf);
+ }
if (!page) {
if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
goto try_next;
@@ -2574,8 +2580,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
goto out_dput;
}
- del_from_avail_list(p);
spin_lock(&p->lock);
+ del_from_avail_list(p);
if (p->prio < 0) {
struct swap_info_struct *si = p;
int nid;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 6fa66e2111ea..e8758304a9f3 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -177,6 +177,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
+ bool *mmap_changing,
bool zeropage)
{
int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
@@ -308,6 +309,15 @@ retry:
goto out;
}
down_read(&dst_mm->mmap_sem);
+ /*
+ * If memory mappings are changing because of non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later
+ */
+ if (mmap_changing && READ_ONCE(*mmap_changing)) {
+ err = -EAGAIN;
+ break;
+ }
dst_vma = NULL;
goto retry;
@@ -389,6 +399,7 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
+ bool *mmap_changing,
bool zeropage);
#endif /* CONFIG_HUGETLB_PAGE */
@@ -506,7 +517,8 @@ retry:
*/
if (is_vm_hugetlb_page(dst_vma))
return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
- src_start, len, zeropage);
+ src_start, len, mmap_changing,
+ zeropage);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
diff --git a/mm/util.c b/mm/util.c
index ab358c64bbd3..04ebc76588aa 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -320,6 +320,38 @@ unsigned long randomize_stack_top(unsigned long stack_top)
#endif
}
+/**
+ * randomize_page - Generate a random, page aligned address
+ * @start: The smallest acceptable address the caller will take.
+ * @range: The size of the area, starting at @start, within which the
+ * random address must fall.
+ *
+ * If @start + @range would overflow, @range is capped.
+ *
+ * NOTE: Historical use of randomize_range, which this replaces, presumed that
+ * @start was already page aligned. We now align it regardless.
+ *
+ * Return: A page aligned address within [start, start + range). On error,
+ * @start is returned.
+ */
+unsigned long randomize_page(unsigned long start, unsigned long range)
+{
+ if (!PAGE_ALIGNED(start)) {
+ range -= PAGE_ALIGN(start) - start;
+ start = PAGE_ALIGN(start);
+ }
+
+ if (start > ULONG_MAX - range)
+ range = ULONG_MAX - start;
+
+ range >>= PAGE_SHIFT;
+
+ if (range == 0)
+ return start;
+
+ return start + (get_random_long() % range << PAGE_SHIFT);
+}
+
#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index de94881eaa92..cefd9cd27405 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2691,7 +2691,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
/* Use reclaim/compaction for costly allocs or under memory pressure */
static bool in_reclaim_compaction(struct scan_control *sc)
{
- if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
+ if (gfp_compaction_allowed(sc->gfp_mask) && sc->order &&
(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
sc->priority < DEF_PRIORITY - 2))
return true;
@@ -2940,6 +2940,9 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
unsigned long watermark;
enum compact_result suitable;
+ if (!gfp_compaction_allowed(sc->gfp_mask))
+ return false;
+
suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
if (suitable == COMPACT_SUCCESS)
/* Allocation should succeed already. Don't reclaim. */
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 490e5f3ae614..6b100f02ee43 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1748,11 +1748,40 @@ static enum fullness_group putback_zspage(struct size_class *class,
*/
static void lock_zspage(struct zspage *zspage)
{
- struct page *page = get_first_page(zspage);
+ struct page *curr_page, *page;
- do {
- lock_page(page);
- } while ((page = get_next_page(page)) != NULL);
+ /*
+ * Pages we haven't locked yet can be migrated off the list while we're
+ * trying to lock them, so we need to be careful and only attempt to
+ * lock each page under migrate_read_lock(). Otherwise, the page we lock
+ * may no longer belong to the zspage. This means that we may wait for
+ * the wrong page to unlock, so we must take a reference to the page
+ * prior to waiting for it to unlock outside migrate_read_lock().
+ */
+ while (1) {
+ migrate_read_lock(zspage);
+ page = get_first_page(zspage);
+ if (trylock_page(page))
+ break;
+ get_page(page);
+ migrate_read_unlock(zspage);
+ wait_on_page_locked(page);
+ put_page(page);
+ }
+
+ curr_page = page;
+ while ((page = get_next_page(curr_page))) {
+ if (trylock_page(page)) {
+ curr_page = page;
+ } else {
+ get_page(page);
+ migrate_read_unlock(zspage);
+ wait_on_page_locked(page);
+ put_page(page);
+ migrate_read_lock(zspage);
+ }
+ }
+ migrate_read_unlock(zspage);
}
static int zs_init_fs_context(struct fs_context *fc)