diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/cma.c | 2 | ||||
-rw-r--r-- | mm/compaction.c | 27 | ||||
-rw-r--r-- | mm/filemap.c | 239 | ||||
-rw-r--r-- | mm/frame_vector.c | 31 | ||||
-rw-r--r-- | mm/frontswap.c | 2 | ||||
-rw-r--r-- | mm/gup.c | 15 | ||||
-rw-r--r-- | mm/huge_memory.c | 3 | ||||
-rw-r--r-- | mm/hugetlb.c | 41 | ||||
-rw-r--r-- | mm/kasan/report.c | 4 | ||||
-rw-r--r-- | mm/khugepaged.c | 45 | ||||
-rw-r--r-- | mm/kmemleak.c | 8 | ||||
-rw-r--r-- | mm/ksm.c | 2 | ||||
-rw-r--r-- | mm/madvise.c | 7 | ||||
-rw-r--r-- | mm/memcontrol.c | 27 | ||||
-rw-r--r-- | mm/memory-failure.c | 4 | ||||
-rw-r--r-- | mm/memory.c | 8 | ||||
-rw-r--r-- | mm/mempolicy.c | 9 | ||||
-rw-r--r-- | mm/memtest.c | 4 | ||||
-rw-r--r-- | mm/migrate.c | 11 | ||||
-rw-r--r-- | mm/mmap.c | 26 | ||||
-rw-r--r-- | mm/mmu_gather.c | 5 | ||||
-rw-r--r-- | mm/mremap.c | 6 | ||||
-rw-r--r-- | mm/page-writeback.c | 10 | ||||
-rw-r--r-- | mm/page_alloc.c | 91 | ||||
-rw-r--r-- | mm/pagewalk.c | 13 | ||||
-rw-r--r-- | mm/percpu.c | 2 | ||||
-rw-r--r-- | mm/readahead.c | 3 | ||||
-rw-r--r-- | mm/rmap.c | 31 | ||||
-rw-r--r-- | mm/shmem.c | 28 | ||||
-rw-r--r-- | mm/slub.c | 13 | ||||
-rw-r--r-- | mm/swap.c | 4 | ||||
-rw-r--r-- | mm/swapfile.c | 16 | ||||
-rw-r--r-- | mm/userfaultfd.c | 14 | ||||
-rw-r--r-- | mm/util.c | 32 | ||||
-rw-r--r-- | mm/vmscan.c | 5 | ||||
-rw-r--r-- | mm/zsmalloc.c | 37 |
36 files changed, 608 insertions, 217 deletions
@@ -481,7 +481,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, */ if (page) { for (i = 0; i < count; i++) - page_kasan_tag_reset(page + i); + page_kasan_tag_reset(nth_page(page, i)); } if (ret && !no_warn) { diff --git a/mm/compaction.c b/mm/compaction.c index d686887856fe..138edfa834b9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1219,7 +1219,7 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage) } static void -fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated) +fast_isolate_around(struct compact_control *cc, unsigned long pfn) { unsigned long start_pfn, end_pfn; struct page *page = pfn_to_page(pfn); @@ -1236,21 +1236,13 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long start_pfn = pageblock_start_pfn(pfn); end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone)) - 1; - /* Scan before */ - if (start_pfn != pfn) { - isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, 1, false); - if (cc->nr_freepages >= cc->nr_migratepages) - return; - } - - /* Scan after */ - start_pfn = pfn + nr_isolated; - if (start_pfn < end_pfn) - isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); + isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); /* Skip this pageblock in the future as it's full or nearly full */ if (cc->nr_freepages < cc->nr_migratepages) set_pageblock_skip(page); + + return; } /* Search orders in round-robin fashion */ @@ -1422,7 +1414,7 @@ fast_isolate_freepages(struct compact_control *cc) return cc->free_pfn; low_pfn = page_to_pfn(page); - fast_isolate_around(cc, low_pfn, nr_isolated); + fast_isolate_around(cc, low_pfn); return low_pfn; } @@ -1709,6 +1701,8 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) update_fast_start_pfn(cc, free_pfn); pfn = pageblock_start_pfn(free_pfn); + if (pfn < cc->zone->zone_start_pfn) + pfn = cc->zone->zone_start_pfn; cc->fast_search_fail = 0; found_block = true; set_pageblock_skip(freepage); @@ -2349,16 +2343,11 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum compact_priority prio, struct page **capture) { - int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; enum compact_result rc = COMPACT_SKIPPED; - /* - * Check if the GFP flags allow compaction - GFP_NOIO is really - * tricky context because the migration might require IO - */ - if (!may_perform_io) + if (!gfp_compaction_allowed(gfp_mask)) return COMPACT_SKIPPED; trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); diff --git a/mm/filemap.c b/mm/filemap.c index c89e9be5c6bd..7ec4fcd8a34d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1044,8 +1044,43 @@ struct wait_page_queue { wait_queue_entry_t wait; }; +/* + * The page wait code treats the "wait->flags" somewhat unusually, because + * we have multiple different kinds of waits, not just he usual "exclusive" + * one. + * + * We have: + * + * (a) no special bits set: + * + * We're just waiting for the bit to be released, and when a waker + * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up, + * and remove it from the wait queue. + * + * Simple and straightforward. + * + * (b) WQ_FLAG_EXCLUSIVE: + * + * The waiter is waiting to get the lock, and only one waiter should + * be woken up to avoid any thundering herd behavior. We'll set the + * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue. + * + * This is the traditional exclusive wait. + * + * (b) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM: + * + * The waiter is waiting to get the bit, and additionally wants the + * lock to be transferred to it for fair lock behavior. If the lock + * cannot be taken, we stop walking the wait queue without waking + * the waiter. + * + * This is the "fair lock handoff" case, and in addition to setting + * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see + * that it now has the lock. + */ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { + unsigned int flags; struct wait_page_key *key = arg; struct wait_page_queue *wait_page = container_of(wait, struct wait_page_queue, wait); @@ -1058,17 +1093,44 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, return 0; /* - * Stop walking if it's locked. - * Is this safe if put_and_wait_on_page_locked() is in use? - * Yes: the waker must hold a reference to this page, and if PG_locked - * has now already been set by another task, that task must also hold - * a reference to the *same usage* of this page; so there is no need - * to walk on to wake even the put_and_wait_on_page_locked() callers. + * If it's a lock handoff wait, we get the bit for it, and + * stop walking (and do not wake it up) if we can't. */ - if (test_bit(key->bit_nr, &key->page->flags)) - return -1; + flags = wait->flags; + if (flags & WQ_FLAG_EXCLUSIVE) { + if (test_bit(key->bit_nr, &key->page->flags)) + return -1; + if (flags & WQ_FLAG_CUSTOM) { + if (test_and_set_bit(key->bit_nr, &key->page->flags)) + return -1; + flags |= WQ_FLAG_DONE; + } + } - return autoremove_wake_function(wait, mode, sync, key); + /* + * We are holding the wait-queue lock, but the waiter that + * is waiting for this will be checking the flags without + * any locking. + * + * So update the flags atomically, and wake up the waiter + * afterwards to avoid any races. This store-release pairs + * with the load-acquire in wait_on_page_bit_common(). + */ + smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN); + wake_up_state(wait->private, mode); + + /* + * Ok, we have successfully done what we're waiting for, + * and we can unconditionally remove the wait entry. + * + * Note that this pairs with the "finish_wait()" in the + * waiter, and has to be the absolute last thing we do. + * After this list_del_init(&wait->entry) the wait entry + * might be de-allocated and the process might even have + * exited. + */ + list_del_init_careful(&wait->entry); + return (flags & WQ_FLAG_EXCLUSIVE) != 0; } static void wake_up_page_bit(struct page *page, int bit_nr) @@ -1147,16 +1209,35 @@ enum behavior { */ }; +/* + * Attempt to check (or get) the page bit, and mark us done + * if successful. + */ +static inline bool trylock_page_bit_common(struct page *page, int bit_nr, + struct wait_queue_entry *wait) +{ + if (wait->flags & WQ_FLAG_EXCLUSIVE) { + if (test_and_set_bit(bit_nr, &page->flags)) + return false; + } else if (test_bit(bit_nr, &page->flags)) + return false; + + wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; + return true; +} + +/* How many times do we accept lock stealing from under a waiter? */ +int sysctl_page_lock_unfairness = 5; + static inline int wait_on_page_bit_common(wait_queue_head_t *q, struct page *page, int bit_nr, int state, enum behavior behavior) { + int unfairness = sysctl_page_lock_unfairness; struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; - bool bit_is_set; bool thrashing = false; bool delayacct = false; unsigned long pflags; - int ret = 0; if (bit_nr == PG_locked && !PageUptodate(page) && PageWorkingset(page)) { @@ -1169,55 +1250,97 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, } init_wait(wait); - wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0; wait->func = wake_page_function; wait_page.page = page; wait_page.bit_nr = bit_nr; - for (;;) { - spin_lock_irq(&q->lock); - - if (likely(list_empty(&wait->entry))) { - __add_wait_queue_entry_tail(q, wait); - SetPageWaiters(page); - } +repeat: + wait->flags = 0; + if (behavior == EXCLUSIVE) { + wait->flags = WQ_FLAG_EXCLUSIVE; + if (--unfairness < 0) + wait->flags |= WQ_FLAG_CUSTOM; + } - set_current_state(state); + /* + * Do one last check whether we can get the + * page bit synchronously. + * + * Do the SetPageWaiters() marking before that + * to let any waker we _just_ missed know they + * need to wake us up (otherwise they'll never + * even go to the slow case that looks at the + * page queue), and add ourselves to the wait + * queue if we need to sleep. + * + * This part needs to be done under the queue + * lock to avoid races. + */ + spin_lock_irq(&q->lock); + SetPageWaiters(page); + if (!trylock_page_bit_common(page, bit_nr, wait)) + __add_wait_queue_entry_tail(q, wait); + spin_unlock_irq(&q->lock); - spin_unlock_irq(&q->lock); + /* + * From now on, all the logic will be based on + * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to + * see whether the page bit testing has already + * been done by the wake function. + * + * We can drop our reference to the page. + */ + if (behavior == DROP) + put_page(page); - bit_is_set = test_bit(bit_nr, &page->flags); - if (behavior == DROP) - put_page(page); + /* + * Note that until the "finish_wait()", or until + * we see the WQ_FLAG_WOKEN flag, we need to + * be very careful with the 'wait->flags', because + * we may race with a waker that sets them. + */ + for (;;) { + unsigned int flags; - if (likely(bit_is_set)) - io_schedule(); + set_current_state(state); - if (behavior == EXCLUSIVE) { - if (!test_and_set_bit_lock(bit_nr, &page->flags)) - break; - } else if (behavior == SHARED) { - if (!test_bit(bit_nr, &page->flags)) + /* Loop until we've been woken or interrupted */ + flags = smp_load_acquire(&wait->flags); + if (!(flags & WQ_FLAG_WOKEN)) { + if (signal_pending_state(state, current)) break; + + io_schedule(); + continue; } - if (signal_pending_state(state, current)) { - ret = -EINTR; + /* If we were non-exclusive, we're done */ + if (behavior != EXCLUSIVE) break; - } - if (behavior == DROP) { - /* - * We can no longer safely access page->flags: - * even if CONFIG_MEMORY_HOTREMOVE is not enabled, - * there is a risk of waiting forever on a page reused - * for something that keeps it locked indefinitely. - * But best check for -EINTR above before breaking. - */ + /* If the waker got the lock for us, we're done */ + if (flags & WQ_FLAG_DONE) break; - } + + /* + * Otherwise, if we're getting the lock, we need to + * try to get it ourselves. + * + * And if that fails, we'll have to retry this all. + */ + if (unlikely(test_and_set_bit(bit_nr, &page->flags))) + goto repeat; + + wait->flags |= WQ_FLAG_DONE; + break; } + /* + * If a signal happened, this 'finish_wait()' may remove the last + * waiter from the wait-queues, but the PageWaiters bit will remain + * set. That's ok. The next wakeup will take care of it, and trying + * to do it here would be difficult and prone to races. + */ finish_wait(q, wait); if (thrashing) { @@ -1227,14 +1350,22 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, } /* - * A signal could leave PageWaiters set. Clearing it here if - * !waitqueue_active would be possible (by open-coding finish_wait), - * but still fail to catch it in the case of wait hash collision. We - * already can fail to clear wait hash collision cases, so don't - * bother with signals either. + * NOTE! The wait->flags weren't stable until we've done the + * 'finish_wait()', and we could have exited the loop above due + * to a signal, and had a wakeup event happen after the signal + * test but before the 'finish_wait()'. + * + * So only after the finish_wait() can we reliably determine + * if we got woken up or not, so we can now figure out the final + * return value based on that state without races. + * + * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive + * waiter, but an exclusive one requires WQ_FLAG_DONE. */ + if (behavior == EXCLUSIVE) + return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR; - return ret; + return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; } void wait_on_page_bit(struct page *page, int bit_nr) @@ -1355,11 +1486,19 @@ void end_page_writeback(struct page *page) rotate_reclaimable_page(page); } + /* + * Writeback does not hold a page reference of its own, relying + * on truncation to wait for the clearing of PG_writeback. + * But here we must make sure that the page is not freed and + * reused before the wake_up_page(). + */ + get_page(page); if (!test_clear_page_writeback(page)) BUG(); smp_mb__after_atomic(); wake_up_page(page, PG_writeback); + put_page(page); } EXPORT_SYMBOL(end_page_writeback); @@ -3281,7 +3420,7 @@ ssize_t generic_perform_write(struct file *file, unsigned long offset; /* Offset into pagecache page */ unsigned long bytes; /* Bytes to write to page */ size_t copied; /* Bytes copied from user */ - void *fsdata; + void *fsdata = NULL; offset = (pos & (PAGE_SIZE - 1)); bytes = min_t(unsigned long, PAGE_SIZE - offset, diff --git a/mm/frame_vector.c b/mm/frame_vector.c index c431ca81dad5..5779c1bc6f44 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -37,7 +37,6 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int ret = 0; - int err; int locked; if (nr_frames == 0) @@ -74,32 +73,14 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, vec->is_pfns = false; ret = get_user_pages_locked(start, nr_frames, gup_flags, (struct page **)(vec->ptrs), &locked); - goto out; + if (likely(ret > 0)) + goto out; } - vec->got_ref = false; - vec->is_pfns = true; - do { - unsigned long *nums = frame_vector_pfns(vec); - - while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) { - err = follow_pfn(vma, start, &nums[ret]); - if (err) { - if (ret == 0) - ret = err; - goto out; - } - start += PAGE_SIZE; - ret++; - } - /* - * We stop if we have enough pages or if VMA doesn't completely - * cover the tail page. - */ - if (ret >= nr_frames || start < vma->vm_end) - break; - vma = find_vma_intersection(mm, start, start + 1); - } while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP)); + /* This used to (racily) return non-refcounted pfns. Let people know */ + WARN_ONCE(1, "get_vaddr_frames() cannot follow VM_IO mapping"); + vec->nr_frames = 0; + out: if (locked) up_read(&mm->mmap_sem); diff --git a/mm/frontswap.c b/mm/frontswap.c index 60bb20e8a951..fd3337702513 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -446,7 +446,7 @@ static int __frontswap_shrink(unsigned long target_pages, void frontswap_shrink(unsigned long target_pages) { unsigned long pages_to_unuse = 0; - int uninitialized_var(type), ret; + int type, ret; /* * we don't want to hold swap_lock while doing a very @@ -188,6 +188,17 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, spinlock_t *ptl; pte_t *ptep, pte; + /* + * Considering PTE level hugetlb, like continuous-PTE hugetlb on + * ARM64 architecture. + */ + if (is_vm_hugetlb_page(vma)) { + page = follow_huge_pmd_pte(vma, address, flags); + if (page) + return page; + return no_page_table(vma, flags); + } + retry: if (unlikely(pmd_bad(*pmd))) return no_page_table(vma, flags); @@ -333,7 +344,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, if (pmd_none(pmdval)) return no_page_table(vma, flags); if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) { - page = follow_huge_pmd(mm, address, pmd, flags); + page = follow_huge_pmd_pte(vma, address, flags); if (page) return page; return no_page_table(vma, flags); @@ -2240,7 +2251,7 @@ static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned lo next = pud_addr_end(addr, end); if (pud_none(pud)) return 0; - if (unlikely(pud_huge(pud))) { + if (unlikely(pud_huge(pud) || pud_devmap(pud))) { if (!gup_huge_pud(pud, pudp, addr, next, flags, pages, nr)) return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e50799d7002e..03b57323c53b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2912,6 +2912,9 @@ void deferred_split_huge_page(struct page *page) if (PageSwapCache(page)) return; + if (!list_empty(page_deferred_list(page))) + return; + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (list_empty(page_deferred_list(page))) { count_vm_event(THP_DEFERRED_SPLIT_PAGE); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 20da6ede7704..e4478b62d0f7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2218,11 +2218,11 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, page = alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; + spin_lock(&hugetlb_lock); if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { SetPagePrivate(page); h->resv_huge_pages--; } - spin_lock(&hugetlb_lock); list_move(&page->lru, &h->hugepage_activelist); /* Fall through */ } @@ -5033,7 +5033,14 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) pud_clear(pud); put_page(virt_to_page(ptep)); mm_dec_nr_pmds(mm); - *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; + /* + * This update of passed address optimizes loops sequentially + * processing addresses in increments of huge page size (PMD_SIZE + * in this case). By clearing the pud, a PUD_SIZE area is unmapped. + * Update address to the 'last page' in the cleared area so that + * calling loop can move to first page past this area. + */ + *addr |= PUD_SIZE - PMD_SIZE; return 1; } #define want_pmd_share() (1) @@ -5150,30 +5157,30 @@ follow_huge_pd(struct vm_area_struct *vma, } struct page * __weak -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int flags) +follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags) { + struct hstate *h = hstate_vma(vma); + struct mm_struct *mm = vma->vm_mm; struct page *page = NULL; spinlock_t *ptl; - pte_t pte; + pte_t *ptep, pte; + retry: - ptl = pmd_lockptr(mm, pmd); - spin_lock(ptl); - /* - * make sure that the address range covered by this pmd is not - * unmapped from other threads. - */ - if (!pmd_huge(*pmd)) - goto out; - pte = huge_ptep_get((pte_t *)pmd); + ptep = huge_pte_offset(mm, address, huge_page_size(h)); + if (!ptep) + return NULL; + + ptl = huge_pte_lock(h, mm, ptep); + pte = huge_ptep_get(ptep); if (pte_present(pte)) { - page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); + page = pte_page(pte) + + ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); if (flags & FOLL_GET) get_page(page); } else { if (is_hugetlb_entry_migration(pte)) { spin_unlock(ptl); - __migration_entry_wait(mm, (pte_t *)pmd, ptl); + __migration_entry_wait(mm, ptep, ptl); goto retry; } /* @@ -5181,7 +5188,7 @@ retry: * follow_page_mask(). */ } -out: + spin_unlock(ptl); return page; } diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 621782100eaa..4d87df96acc1 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -92,8 +92,8 @@ static void end_report(unsigned long *flags) pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); - if (panic_on_warn) - panic("panic_on_warn set ...\n"); + if (!test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) + check_panic_on_warn("KASAN"); kasan_enable_current(); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 3c2326568193..f1f98305433e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1060,6 +1060,7 @@ static void collapse_huge_page(struct mm_struct *mm, _pmd = pmdp_collapse_flush(vma, address, pmd); spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(&range); + tlb_remove_table_sync_one(); spin_lock(pte_ptl); isolated = __collapse_huge_page_isolate(vma, address, pte); @@ -1312,6 +1313,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) spinlock_t *ptl; int count = 0; int i; + struct mmu_notifier_range range; if (!vma || !vma->vm_file || vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE) @@ -1338,6 +1340,19 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (!pmd) goto drop_hpage; + /* + * We need to lock the mapping so that from here on, only GUP-fast and + * hardware page walks can access the parts of the page tables that + * we're operating on. + */ + i_mmap_lock_write(vma->vm_file->f_mapping); + + /* + * This spinlock should be unnecessary: Nobody else should be accessing + * the page tables under spinlock protection here, only + * lockless_pages_from_mm() and the hardware page walker can access page + * tables while all the high-level locks are held in write mode. + */ start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); /* step 1: check all mapped PTEs are to the right huge page */ @@ -1384,12 +1399,23 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) } /* step 4: collapse pmd */ - ptl = pmd_lock(vma->vm_mm, pmd); + /* we make no change to anon, but protect concurrent anon page lookup */ + if (vma->anon_vma) + anon_vma_lock_write(vma->anon_vma); + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, haddr, + haddr + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); _pmd = pmdp_collapse_flush(vma, haddr, pmd); - spin_unlock(ptl); mm_dec_nr_ptes(mm); + tlb_remove_table_sync_one(); + mmu_notifier_invalidate_range_end(&range); pte_free(mm, pmd_pgtable(_pmd)); + if (vma->anon_vma) + anon_vma_unlock_write(vma->anon_vma); + i_mmap_unlock_write(vma->vm_file->f_mapping); + drop_hpage: unlock_page(hpage); put_page(hpage); @@ -1397,6 +1423,7 @@ drop_hpage: abort: pte_unmap_unlock(start_pte, ptl); + i_mmap_unlock_write(vma->vm_file->f_mapping); goto drop_hpage; } @@ -1446,7 +1473,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * An alternative would be drop the check, but check that page * table is clear before calling pmdp_collapse_flush() under * ptl. It has higher chance to recover THP for the VMA, but - * has higher cost too. + * has higher cost too. It would also probably require locking + * the anon_vma. */ if (vma->anon_vma) continue; @@ -1468,12 +1496,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) */ if (down_write_trylock(&mm->mmap_sem)) { if (!khugepaged_test_exit(mm)) { - spinlock_t *ptl = pmd_lock(mm, pmd); + struct mmu_notifier_range range; + + mmu_notifier_range_init(&range, + MMU_NOTIFY_CLEAR, 0, + NULL, mm, addr, + addr + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); /* assume page table is clear */ _pmd = pmdp_collapse_flush(vma, addr, pmd); - spin_unlock(ptl); mm_dec_nr_ptes(mm); + tlb_remove_table_sync_one(); pte_free(mm, pmd_pgtable(_pmd)); + mmu_notifier_invalidate_range_end(&range); } up_write(&mm->mmap_sem); } else { diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 3761c79137b1..d8cde7292bf9 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1123,7 +1123,7 @@ EXPORT_SYMBOL(kmemleak_no_scan); void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count, gfp_t gfp) { - if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) + if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) kmemleak_alloc(__va(phys), size, min_count, gfp); } EXPORT_SYMBOL(kmemleak_alloc_phys); @@ -1137,7 +1137,7 @@ EXPORT_SYMBOL(kmemleak_alloc_phys); */ void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size) { - if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) + if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) kmemleak_free_part(__va(phys), size); } EXPORT_SYMBOL(kmemleak_free_part_phys); @@ -1149,7 +1149,7 @@ EXPORT_SYMBOL(kmemleak_free_part_phys); */ void __ref kmemleak_not_leak_phys(phys_addr_t phys) { - if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) + if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) kmemleak_not_leak(__va(phys)); } EXPORT_SYMBOL(kmemleak_not_leak_phys); @@ -1161,7 +1161,7 @@ EXPORT_SYMBOL(kmemleak_not_leak_phys); */ void __ref kmemleak_ignore_phys(phys_addr_t phys) { - if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) + if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) kmemleak_ignore(__va(phys)); } EXPORT_SYMBOL(kmemleak_ignore_phys); @@ -2388,7 +2388,7 @@ next_mm: static void ksm_do_scan(unsigned int scan_npages) { struct rmap_item *rmap_item; - struct page *uninitialized_var(page); + struct page *page; while (scan_npages-- && likely(!freezing(current))) { cond_resched(); diff --git a/mm/madvise.c b/mm/madvise.c index 1107e99e498b..ac8d68c488b5 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -428,8 +428,11 @@ regular_page: continue; } - /* Do not interfere with other mappings of this page */ - if (page_mapcount(page) != 1) + /* + * Do not interfere with other mappings of this page and + * non-LRU page. + */ + if (!PageLRU(page) || page_mapcount(page) != 1) continue; VM_BUG_ON_PAGE(PageTransCompound(page), page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8fc663545498..0cc7ee52d6bd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1044,7 +1044,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, struct mem_cgroup_reclaim_cookie *reclaim) { - struct mem_cgroup_reclaim_iter *uninitialized_var(iter); + struct mem_cgroup_reclaim_iter *iter; struct cgroup_subsys_state *css = NULL; struct mem_cgroup *memcg = NULL; struct mem_cgroup *pos = NULL; @@ -2214,6 +2214,9 @@ static void drain_stock(struct memcg_stock_pcp *stock) { struct mem_cgroup *old = stock->cached; + if (!old) + return; + if (stock->nr_pages) { page_counter_uncharge(&old->memory, stock->nr_pages); if (do_memsw_account()) @@ -2221,6 +2224,8 @@ static void drain_stock(struct memcg_stock_pcp *stock) css_put_many(&old->css, stock->nr_pages); stock->nr_pages = 0; } + + css_put(&old->css); stock->cached = NULL; } @@ -2256,6 +2261,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) stock = this_cpu_ptr(&memcg_stock); if (stock->cached != memcg) { /* reset if necessary */ drain_stock(stock); + css_get(&memcg->css); stock->cached = memcg; } stock->nr_pages += nr_pages; @@ -3775,6 +3781,10 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); + if (val & ~MOVE_MASK) return -EINVAL; @@ -4709,6 +4719,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, unsigned int efd, cfd; struct fd efile; struct fd cfile; + struct dentry *cdentry; const char *name; char *endp; int ret; @@ -4760,6 +4771,16 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, goto out_put_cfile; /* + * The control file must be a regular cgroup1 file. As a regular cgroup + * file can't be renamed, it's safe to access its name afterwards. + */ + cdentry = cfile.file->f_path.dentry; + if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { + ret = -EINVAL; + goto out_put_cfile; + } + + /* * Determine the event callbacks and set them in @event. This used * to be done via struct cftype but cgroup core no longer knows * about these events. The following is crude but the whole thing @@ -4767,7 +4788,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, * * DO NOT ADD NEW FILES. */ - name = cfile.file->f_path.dentry->d_name.name; + name = cdentry->d_name.name; if (!strcmp(name, "memory.usage_in_bytes")) { event->register_event = mem_cgroup_usage_register_event; @@ -4791,7 +4812,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, * automatically removed on cgroup destruction but the removal is * asynchronous, so take an extra ref on @css. */ - cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, + cfile_css = css_tryget_online_from_dir(cdentry->d_parent, &memory_cgrp_subsys); ret = -EINVAL; if (IS_ERR(cfile_css)) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9030ab0d9d97..0e7566c25939 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -986,7 +986,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * This check implies we don't kill processes if their pages * are in the swap cache early. Those are always late kills. */ - if (!page_mapped(hpage)) + if (!page_mapped(p)) return true; if (PageKsm(p)) { @@ -1033,7 +1033,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, unmap_success = try_to_unmap(hpage, ttu); if (!unmap_success) pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(hpage)); + pfn, page_mapcount(p)); /* * try_to_unmap() might put mlocked page in lru cache, so call diff --git a/mm/memory.c b/mm/memory.c index d416e329442d..f8d76c66311d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2872,8 +2872,8 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { - pgoff_t hba = holebegin >> PAGE_SHIFT; - pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; + pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT; + pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT; /* Check for overflow. */ if (sizeof(holelen) > sizeof(hlen)) { @@ -4428,6 +4428,10 @@ int follow_phys(struct vm_area_struct *vma, goto out; pte = *ptep; + /* Never return PFNs of anon folios in COW mappings. */ + if (vm_normal_page(vma, address, pte)) + goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d79ab5116a7b..2bf4ab7b2713 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -348,7 +348,7 @@ static void mpol_rebind_preferred(struct mempolicy *pol, */ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { - if (!pol) + if (!pol || pol->mode == MPOL_LOCAL) return; if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) @@ -571,7 +571,8 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, goto unlock; /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ if (flags & (MPOL_MF_MOVE_ALL) || - (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) + (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 && + !hugetlb_pmd_shared(pte))) isolate_huge_page(page, qp->pagelist); unlock: spin_unlock(ptl); @@ -1159,7 +1160,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, static struct page *new_page(struct page *page, unsigned long start) { struct vm_area_struct *vma; - unsigned long uninitialized_var(address); + unsigned long address; vma = find_vma(current->mm, start); while (vma) { @@ -1554,7 +1555,7 @@ static int kernel_get_mempolicy(int __user *policy, unsigned long flags) { int err; - int uninitialized_var(pval); + int pval; nodemask_t nodes; addr = untagged_addr(addr); diff --git a/mm/memtest.c b/mm/memtest.c index f53ace709ccd..d407373f225b 100644 --- a/mm/memtest.c +++ b/mm/memtest.c @@ -46,10 +46,10 @@ static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size last_bad = 0; for (p = start; p < end; p++) - *p = pattern; + WRITE_ONCE(*p, pattern); for (p = start; p < end; p++, start_phys_aligned += incr) { - if (*p == pattern) + if (READ_ONCE(*p) == pattern) continue; if (start_phys_aligned == last_bad + incr) { last_bad += incr; diff --git a/mm/migrate.c b/mm/migrate.c index 6948d6ec0fd0..9cfd53eaeb4e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -441,8 +441,12 @@ int migrate_page_move_mapping(struct address_space *mapping, if (PageSwapBacked(page)) { __SetPageSwapBacked(newpage); if (PageSwapCache(page)) { + int i; + SetPageSwapCache(newpage); - set_page_private(newpage, page_private(page)); + for (i = 0; i < (1 << compound_order(page)); i++) + set_page_private(newpage + i, + page_private(page + i)); } } else { VM_BUG_ON_PAGE(PageSwapCache(page), page); @@ -2343,13 +2347,14 @@ next: migrate->dst[migrate->npages] = 0; migrate->src[migrate->npages++] = mpfn; } - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(ptep - 1, ptl); /* Only flush the TLB if we actually modified any entries */ if (unmapped) flush_tlb_range(walk->vma, start, end); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(ptep - 1, ptl); + return 0; } diff --git a/mm/mmap.c b/mm/mmap.c index f30f2a49ff2f..dad2ca2d27fc 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1679,8 +1679,12 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags))) return 0; - /* Do we need to track softdirty? */ - if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY)) + /* + * Do we need to track softdirty? hugetlb does not support softdirty + * tracking yet. + */ + if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY) && + !is_vm_hugetlb_page(vma)) return 1; /* Specialty mapping? */ @@ -1860,7 +1864,6 @@ unmap_and_free_vma: /* Undo any partial mapping done by a device driver. */ unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); - charged = 0; if (vm_flags & VM_SHARED) mapping_unmap_writable(file->f_mapping); allow_write_and_free_vma: @@ -2602,11 +2605,28 @@ static void unmap_region(struct mm_struct *mm, { struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap; struct mmu_gather tlb; + struct vm_area_struct *cur_vma; lru_add_drain(); tlb_gather_mmu(&tlb, mm, start, end); update_hiwater_rss(mm); unmap_vmas(&tlb, vma, start, end); + + /* + * Ensure we have no stale TLB entries by the time this mapping is + * removed from the rmap. + * Note that we don't have to worry about nested flushes here because + * we're holding the mm semaphore for removing the mapping - so any + * concurrent flush in this region has to be coming through the rmap, + * and we synchronize against that using the rmap lock. + */ + for (cur_vma = vma; cur_vma; cur_vma = cur_vma->vm_next) { + if ((cur_vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) != 0) { + tlb_flush_mmu(&tlb); + break; + } + } + free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, start, end); diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 7c1b8f67af7b..341aa036b03c 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -117,6 +117,11 @@ static void tlb_remove_table_smp_sync(void *arg) /* Simply deliver the interrupt */ } +void tlb_remove_table_sync_one(void) +{ + smp_call_function(tlb_remove_table_smp_sync, NULL, 1); +} + static void tlb_remove_table_one(void *table) { /* diff --git a/mm/mremap.c b/mm/mremap.c index 8ce1b7632fbb..f6b8c009dd05 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -293,12 +293,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma, */ bool moved; - if (need_rmap_locks) - take_rmap_locks(vma); + take_rmap_locks(vma); moved = move_normal_pmd(vma, old_addr, new_addr, old_end, old_pmd, new_pmd); - if (need_rmap_locks) - drop_rmap_locks(vma); + drop_rmap_locks(vma); if (moved) continue; #endif diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2d658b208319..fdebfaf1873c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1530,7 +1530,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) */ dtc->wb_thresh = __wb_calc_thresh(dtc); dtc->wb_bg_thresh = dtc->thresh ? - div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; + div64_u64(dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; /* * In order to avoid the stacked BDI deadlock we need @@ -2746,12 +2746,6 @@ int test_clear_page_writeback(struct page *page) } else { ret = TestClearPageWriteback(page); } - /* - * NOTE: Page might be free now! Writeback doesn't hold a page - * reference on its own, it relies on truncation to wait for - * the clearing of PG_writeback. The below can only access - * page state that is static across allocation cycles. - */ if (ret) { dec_lruvec_state(lruvec, NR_WRITEBACK); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); @@ -2817,7 +2811,7 @@ EXPORT_SYMBOL(__test_set_page_writeback); */ void wait_on_page_writeback(struct page *page) { - if (PageWriteback(page)) { + while (PageWriteback(page)) { trace_wait_on_page_writeback(page, page_mapping(page)); wait_on_page_bit(page, PG_writeback); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f08ce248af2a..0ad582945f54 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4123,6 +4123,30 @@ void fs_reclaim_release(gfp_t gfp_mask) EXPORT_SYMBOL_GPL(fs_reclaim_release); #endif +/* + * Zonelists may change due to hotplug during allocation. Detect when zonelists + * have been rebuilt so allocation retries. Reader side does not lock and + * retries the allocation if zonelist changes. Writer side is protected by the + * embedded spin_lock. + */ +static DEFINE_SEQLOCK(zonelist_update_seq); + +static unsigned int zonelist_iter_begin(void) +{ + if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) + return read_seqbegin(&zonelist_update_seq); + + return 0; +} + +static unsigned int check_retry_zonelist(unsigned int seq) +{ + if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) + return read_seqretry(&zonelist_update_seq, seq); + + return seq; +} + /* Perform direct synchronous page reclaim */ static int __perform_reclaim(gfp_t gfp_mask, unsigned int order, @@ -4421,6 +4445,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; + bool can_compact = gfp_compaction_allowed(gfp_mask); const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; unsigned int alloc_flags; @@ -4430,6 +4455,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int compaction_retries; int no_progress_loops; unsigned int cpuset_mems_cookie; + unsigned int zonelist_iter_cookie; int reserve_flags; /* @@ -4440,11 +4466,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; -retry_cpuset: +restart: compaction_retries = 0; no_progress_loops = 0; compact_priority = DEF_COMPACT_PRIORITY; cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist_iter_cookie = zonelist_iter_begin(); /* * The fast path uses conservative alloc_flags to succeed only until @@ -4484,7 +4511,7 @@ retry_cpuset: * Don't try this for allocations that are allowed to ignore * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. */ - if (can_direct_reclaim && + if (can_direct_reclaim && can_compact && (costly_order || (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) && !gfp_pfmemalloc_allowed(gfp_mask)) { @@ -4595,9 +4622,10 @@ retry: /* * Do not retry costly high order allocations unless they are - * __GFP_RETRY_MAYFAIL + * __GFP_RETRY_MAYFAIL and we can compact */ - if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL)) + if (costly_order && (!can_compact || + !(gfp_mask & __GFP_RETRY_MAYFAIL))) goto nopage; if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, @@ -4610,16 +4638,20 @@ retry: * implementation of the compaction depends on the sufficient amount * of free memory (see __compaction_suitable) */ - if (did_some_progress > 0 && + if (did_some_progress > 0 && can_compact && should_compact_retry(ac, order, alloc_flags, compact_result, &compact_priority, &compaction_retries)) goto retry; - /* Deal with possible cpuset update races before we start OOM killing */ - if (check_retry_cpuset(cpuset_mems_cookie, ac)) - goto retry_cpuset; + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * a unnecessary OOM kill. + */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; /* Reclaim has failed us, start killing things */ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); @@ -4639,9 +4671,13 @@ retry: } nopage: - /* Deal with possible cpuset update races before we fail */ - if (check_retry_cpuset(cpuset_mems_cookie, ac)) - goto retry_cpuset; + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * a unnecessary OOM kill. + */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; /* * Make sure that __GFP_NOFAIL request doesn't leak out and make sure @@ -4945,6 +4981,18 @@ refill: /* reset page count bias and offset to start of new frag */ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; offset = size - fragsz; + if (unlikely(offset < 0)) { + /* + * The caller is trying to allocate a fragment + * with fragsz > PAGE_SIZE but the cache isn't big + * enough to satisfy the request, this may + * happen in low memory conditions. + * We don't release the cache page because + * it could make memory pressure worse + * so we simply return NULL here. + */ + return NULL; + } } nc->pagecnt_bias--; @@ -5770,9 +5818,22 @@ static void __build_all_zonelists(void *data) int nid; int __maybe_unused cpu; pg_data_t *self = data; - static DEFINE_SPINLOCK(lock); + unsigned long flags; - spin_lock(&lock); + /* + * Explicitly disable this CPU's interrupts before taking seqlock + * to prevent any IRQ handler from calling into the page allocator + * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock. + */ + local_irq_save(flags); + /* + * Explicitly disable this CPU's synchronous printk() before taking + * seqlock to prevent any printk() from trying to hold port->lock, for + * tty_insert_flip_string_and_push_buffer() on other CPU might be + * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held. + */ + printk_deferred_enter(); + write_seqlock(&zonelist_update_seq); #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); @@ -5805,7 +5866,9 @@ static void __build_all_zonelists(void *data) #endif } - spin_unlock(&lock); + write_sequnlock(&zonelist_update_seq); + printk_deferred_exit(); + local_irq_restore(flags); } static noinline void __init diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 4eb09e089881..ec41e7552f37 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -38,7 +38,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, do { again: next = pmd_addr_end(addr, end); - if (pmd_none(*pmd) || !walk->vma) { + if (pmd_none(*pmd)) { if (ops->pte_hole) err = ops->pte_hole(addr, next, walk); if (err) @@ -84,7 +84,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, do { again: next = pud_addr_end(addr, end); - if (pud_none(*pud) || !walk->vma) { + if (pud_none(*pud)) { if (ops->pte_hole) err = ops->pte_hole(addr, next, walk); if (err) @@ -254,7 +254,7 @@ static int __walk_page_range(unsigned long start, unsigned long end, int err = 0; struct vm_area_struct *vma = walk->vma; - if (vma && is_vm_hugetlb_page(vma)) { + if (is_vm_hugetlb_page(vma)) { if (walk->ops->hugetlb_entry) err = walk_hugetlb_range(start, end, walk); } else @@ -324,9 +324,13 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, if (!vma) { /* after the last vma */ walk.vma = NULL; next = end; + if (ops->pte_hole) + err = ops->pte_hole(start, next, &walk); } else if (start < vma->vm_start) { /* outside vma */ walk.vma = NULL; next = min(end, vma->vm_start); + if (ops->pte_hole) + err = ops->pte_hole(start, next, &walk); } else { /* inside vma */ walk.vma = vma; next = min(end, vma->vm_end); @@ -344,9 +348,8 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, } if (err < 0) break; - } - if (walk.vma || walk.ops->pte_hole) err = __walk_page_range(start, next, &walk); + } if (err) break; } while (start = next, start < end); diff --git a/mm/percpu.c b/mm/percpu.c index 806bc16f88eb..2e4b286c88c8 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2536,7 +2536,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( const size_t static_size = __per_cpu_end - __per_cpu_start; int nr_groups = 1, nr_units = 0; size_t size_sum, min_unit_size, alloc_size; - int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ + int upa, max_upa, best_upa; /* units_per_alloc */ int last_allocs, group, unit; unsigned int cpu, tcpu; struct pcpu_alloc_info *ai; diff --git a/mm/readahead.c b/mm/readahead.c index 2fe72cd29b47..95fbc02cae6a 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -592,7 +592,8 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count) */ ret = -EINVAL; if (!f.file->f_mapping || !f.file->f_mapping->a_ops || - !S_ISREG(file_inode(f.file)->i_mode)) + (!S_ISREG(file_inode(f.file)->i_mode) && + !S_ISBLK(file_inode(f.file)->i_mode))) goto out; ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED); diff --git a/mm/rmap.c b/mm/rmap.c index 6d80e92688fe..c64da910bb73 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -83,7 +83,8 @@ static inline struct anon_vma *anon_vma_alloc(void) anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); if (anon_vma) { atomic_set(&anon_vma->refcount, 1); - anon_vma->degree = 1; /* Reference for first vma */ + anon_vma->num_children = 0; + anon_vma->num_active_vmas = 0; anon_vma->parent = anon_vma; /* * Initialise the anon_vma root to point to itself. If called @@ -191,6 +192,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma) anon_vma = anon_vma_alloc(); if (unlikely(!anon_vma)) goto out_enomem_free_avc; + anon_vma->num_children++; /* self-parent link for new root */ allocated = anon_vma; } @@ -200,8 +202,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma) if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; anon_vma_chain_link(vma, avc, anon_vma); - /* vma reference or self-parent link for new root */ - anon_vma->degree++; + anon_vma->num_active_vmas++; allocated = NULL; avc = NULL; } @@ -280,19 +281,19 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) anon_vma_chain_link(dst, avc, anon_vma); /* - * Reuse existing anon_vma if its degree lower than two, - * that means it has no vma and only one anon_vma child. + * Reuse existing anon_vma if it has no vma and only one + * anon_vma child. * - * Do not chose parent anon_vma, otherwise first child - * will always reuse it. Root anon_vma is never reused: + * Root anon_vma is never reused: * it has self-parent reference and at least one child. */ - if (!dst->anon_vma && anon_vma != src->anon_vma && - anon_vma->degree < 2) + if (!dst->anon_vma && + anon_vma->num_children < 2 && + anon_vma->num_active_vmas == 0) dst->anon_vma = anon_vma; } if (dst->anon_vma) - dst->anon_vma->degree++; + dst->anon_vma->num_active_vmas++; unlock_anon_vma_root(root); return 0; @@ -342,6 +343,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) anon_vma = anon_vma_alloc(); if (!anon_vma) goto out_error; + anon_vma->num_active_vmas++; avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto out_error_free_anon_vma; @@ -362,7 +364,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) vma->anon_vma = anon_vma; anon_vma_lock_write(anon_vma); anon_vma_chain_link(vma, avc, anon_vma); - anon_vma->parent->degree++; + anon_vma->parent->num_children++; anon_vma_unlock_write(anon_vma); return 0; @@ -394,7 +396,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) * to free them outside the lock. */ if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) { - anon_vma->parent->degree--; + anon_vma->parent->num_children--; continue; } @@ -402,7 +404,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) anon_vma_chain_free(avc); } if (vma->anon_vma) - vma->anon_vma->degree--; + vma->anon_vma->num_active_vmas--; unlock_anon_vma_root(root); /* @@ -413,7 +415,8 @@ void unlink_anon_vmas(struct vm_area_struct *vma) list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; - VM_WARN_ON(anon_vma->degree); + VM_WARN_ON(anon_vma->num_children); + VM_WARN_ON(anon_vma->num_active_vmas); put_anon_vma(anon_vma); list_del(&avc->same_vma); diff --git a/mm/shmem.c b/mm/shmem.c index aae2f408f905..264229680ad7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3417,6 +3417,8 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) unsigned long long size; char *rest; int opt; + kuid_t kuid; + kgid_t kgid; opt = fs_parse(fc, &shmem_fs_parameters, param, &result); if (opt < 0) @@ -3452,14 +3454,32 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) ctx->mode = result.uint_32 & 07777; break; case Opt_uid: - ctx->uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(ctx->uid)) + kuid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(kuid)) goto bad_value; + + /* + * The requested uid must be representable in the + * filesystem's idmapping. + */ + if (!kuid_has_mapping(fc->user_ns, kuid)) + goto bad_value; + + ctx->uid = kuid; break; case Opt_gid: - ctx->gid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(ctx->gid)) + kgid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(kgid)) goto bad_value; + + /* + * The requested gid must be representable in the + * filesystem's idmapping. + */ + if (!kgid_has_mapping(fc->user_ns, kgid)) + goto bad_value; + + ctx->gid = kgid; break; case Opt_huge: ctx->huge = result.uint_32; diff --git a/mm/slub.c b/mm/slub.c index 63fe43c8d332..e978f647e92a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1198,7 +1198,7 @@ static noinline int free_debug_processing( struct kmem_cache_node *n = get_node(s, page_to_nid(page)); void *object = head; int cnt = 0; - unsigned long uninitialized_var(flags); + unsigned long flags; int ret = 0; spin_lock_irqsave(&n->list_lock, flags); @@ -2214,6 +2214,7 @@ redo: c->page = NULL; c->freelist = NULL; + c->tid = next_tid(c->tid); } /* @@ -2347,8 +2348,6 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); deactivate_slab(s, c->page, c->freelist, c); - - c->tid = next_tid(c->tid); } /* @@ -2632,6 +2631,7 @@ redo: if (!freelist) { c->page = NULL; + c->tid = next_tid(c->tid); stat(s, DEACTIVATE_BYPASS); goto new_slab; } @@ -2888,7 +2888,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, struct page new; unsigned long counters; struct kmem_cache_node *n = NULL; - unsigned long uninitialized_var(flags); + unsigned long flags; stat(s, FREE_SLOWPATH); @@ -5743,7 +5743,8 @@ static char *create_unique_id(struct kmem_cache *s) char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); char *p = name; - BUG_ON(!name); + if (!name) + return ERR_PTR(-ENOMEM); *p++ = ':'; /* @@ -5825,6 +5826,8 @@ static int sysfs_slab_add(struct kmem_cache *s) * for the symlinks. */ name = create_unique_id(s); + if (IS_ERR(name)) + return PTR_ERR(name); } s->kobj.kset = kset; diff --git a/mm/swap.c b/mm/swap.c index 38c3fa4308e2..1c5021b57e2c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -763,8 +763,8 @@ void release_pages(struct page **pages, int nr) LIST_HEAD(pages_to_free); struct pglist_data *locked_pgdat = NULL; struct lruvec *lruvec; - unsigned long uninitialized_var(flags); - unsigned int uninitialized_var(lock_batch); + unsigned long flags; + unsigned int lock_batch; for (i = 0; i < nr; i++) { struct page *page = pages[i]; diff --git a/mm/swapfile.c b/mm/swapfile.c index f6964212c6c8..5e444be061a8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -672,6 +672,7 @@ static void __del_from_avail_list(struct swap_info_struct *p) { int nid; + assert_spin_locked(&p->lock); for_each_node(nid) plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); } @@ -1061,6 +1062,7 @@ start_over: goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", si->type); + cond_resched(); spin_lock(&swap_avail_lock); nextsi: @@ -1950,10 +1952,14 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_unmap(pte); swap_map = &si->swap_map[offset]; - vmf.vma = vma; - vmf.address = addr; - vmf.pmd = pmd; - page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); + page = lookup_swap_cache(entry, vma, addr); + if (!page) { + vmf.vma = vma; + vmf.address = addr; + vmf.pmd = pmd; + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, + &vmf); + } if (!page) { if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) goto try_next; @@ -2574,8 +2580,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - del_from_avail_list(p); spin_lock(&p->lock); + del_from_avail_list(p); if (p->prio < 0) { struct swap_info_struct *si = p; int nid; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 6fa66e2111ea..e8758304a9f3 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -177,6 +177,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, + bool *mmap_changing, bool zeropage) { int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED; @@ -308,6 +309,15 @@ retry: goto out; } down_read(&dst_mm->mmap_sem); + /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. mremap) running in parallel, bail out and + * request the user to retry later + */ + if (mmap_changing && READ_ONCE(*mmap_changing)) { + err = -EAGAIN; + break; + } dst_vma = NULL; goto retry; @@ -389,6 +399,7 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, + bool *mmap_changing, bool zeropage); #endif /* CONFIG_HUGETLB_PAGE */ @@ -506,7 +517,8 @@ retry: */ if (is_vm_hugetlb_page(dst_vma)) return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, - src_start, len, zeropage); + src_start, len, mmap_changing, + zeropage); if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; diff --git a/mm/util.c b/mm/util.c index ab358c64bbd3..04ebc76588aa 100644 --- a/mm/util.c +++ b/mm/util.c @@ -320,6 +320,38 @@ unsigned long randomize_stack_top(unsigned long stack_top) #endif } +/** + * randomize_page - Generate a random, page aligned address + * @start: The smallest acceptable address the caller will take. + * @range: The size of the area, starting at @start, within which the + * random address must fall. + * + * If @start + @range would overflow, @range is capped. + * + * NOTE: Historical use of randomize_range, which this replaces, presumed that + * @start was already page aligned. We now align it regardless. + * + * Return: A page aligned address within [start, start + range). On error, + * @start is returned. + */ +unsigned long randomize_page(unsigned long start, unsigned long range) +{ + if (!PAGE_ALIGNED(start)) { + range -= PAGE_ALIGN(start) - start; + start = PAGE_ALIGN(start); + } + + if (start > ULONG_MAX - range) + range = ULONG_MAX - start; + + range >>= PAGE_SHIFT; + + if (range == 0) + return start; + + return start + (get_random_long() % range << PAGE_SHIFT); +} + #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT unsigned long arch_randomize_brk(struct mm_struct *mm) { diff --git a/mm/vmscan.c b/mm/vmscan.c index de94881eaa92..cefd9cd27405 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2691,7 +2691,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc /* Use reclaim/compaction for costly allocs or under memory pressure */ static bool in_reclaim_compaction(struct scan_control *sc) { - if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + if (gfp_compaction_allowed(sc->gfp_mask) && sc->order && (sc->order > PAGE_ALLOC_COSTLY_ORDER || sc->priority < DEF_PRIORITY - 2)) return true; @@ -2940,6 +2940,9 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) unsigned long watermark; enum compact_result suitable; + if (!gfp_compaction_allowed(sc->gfp_mask)) + return false; + suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx); if (suitable == COMPACT_SUCCESS) /* Allocation should succeed already. Don't reclaim. */ diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 490e5f3ae614..6b100f02ee43 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1748,11 +1748,40 @@ static enum fullness_group putback_zspage(struct size_class *class, */ static void lock_zspage(struct zspage *zspage) { - struct page *page = get_first_page(zspage); + struct page *curr_page, *page; - do { - lock_page(page); - } while ((page = get_next_page(page)) != NULL); + /* + * Pages we haven't locked yet can be migrated off the list while we're + * trying to lock them, so we need to be careful and only attempt to + * lock each page under migrate_read_lock(). Otherwise, the page we lock + * may no longer belong to the zspage. This means that we may wait for + * the wrong page to unlock, so we must take a reference to the page + * prior to waiting for it to unlock outside migrate_read_lock(). + */ + while (1) { + migrate_read_lock(zspage); + page = get_first_page(zspage); + if (trylock_page(page)) + break; + get_page(page); + migrate_read_unlock(zspage); + wait_on_page_locked(page); + put_page(page); + } + + curr_page = page; + while ((page = get_next_page(curr_page))) { + if (trylock_page(page)) { + curr_page = page; + } else { + get_page(page); + migrate_read_unlock(zspage); + wait_on_page_locked(page); + put_page(page); + migrate_read_lock(zspage); + } + } + migrate_read_unlock(zspage); } static int zs_init_fs_context(struct fs_context *fc) |