diff options
Diffstat (limited to 'mm/swap_state.c')
-rw-r--r-- | mm/swap_state.c | 665 |
1 files changed, 330 insertions, 335 deletions
diff --git a/mm/swap_state.c b/mm/swap_state.c index aa40e706604c..7255c01a1e4e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -10,19 +10,20 @@ #include <linux/mm.h> #include <linux/gfp.h> #include <linux/kernel_stat.h> +#include <linux/mempolicy.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/init.h> #include <linux/pagemap.h> #include <linux/backing-dev.h> #include <linux/blkdev.h> -#include <linux/pagevec.h> #include <linux/migrate.h> #include <linux/vmalloc.h> #include <linux/swap_slots.h> #include <linux/huge_mm.h> #include <linux/shmem_fs.h> #include "internal.h" +#include "swap.h" /* * swapper_space is a fiction, retained to simplify the path through @@ -30,9 +31,9 @@ */ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, - .set_page_dirty = swap_set_page_dirty, + .dirty_folio = noop_dirty_folio, #ifdef CONFIG_MIGRATION - .migratepage = migrate_page, + .migrate_folio = migrate_folio, #endif }; @@ -58,53 +59,13 @@ static bool enable_vma_readahead __read_mostly = true; #define GET_SWAP_RA_VAL(vma) \ (atomic_long_read(&(vma)->swap_readahead_info) ? : 4) -#define INC_CACHE_INFO(x) data_race(swap_cache_info.x++) -#define ADD_CACHE_INFO(x, nr) data_race(swap_cache_info.x += (nr)) - -static struct { - unsigned long add_total; - unsigned long del_total; - unsigned long find_success; - unsigned long find_total; -} swap_cache_info; - -unsigned long total_swapcache_pages(void) -{ - unsigned int i, j, nr; - unsigned long ret = 0; - struct address_space *spaces; - struct swap_info_struct *si; - - for (i = 0; i < MAX_SWAPFILES; i++) { - swp_entry_t entry = swp_entry(i, 1); - - /* Avoid get_swap_device() to warn for bad swap entry */ - if (!swp_swap_info(entry)) - continue; - /* Prevent swapoff to free swapper_spaces */ - si = get_swap_device(entry); - if (!si) - continue; - nr = nr_swapper_spaces[i]; - spaces = swapper_spaces[i]; - for (j = 0; j < nr; j++) - ret += spaces[j].nrpages; - put_swap_device(si); - } - return ret; -} - static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); void show_swap_cache_info(void) { printk("%lu pages in swap cache\n", total_swapcache_pages()); - printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", - swap_cache_info.add_total, swap_cache_info.del_total, - swap_cache_info.find_success, swap_cache_info.find_total); - printk("Free swap = %ldkB\n", - get_nr_swap_pages() << (PAGE_SHIFT - 10)); - printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); + printk("Free swap = %ldkB\n", K(get_nr_swap_pages())); + printk("Total swap = %lukB\n", K(total_swap_pages)); } void *get_shadow_from_swap_cache(swp_entry_t entry) @@ -113,57 +74,53 @@ void *get_shadow_from_swap_cache(swp_entry_t entry) pgoff_t idx = swp_offset(entry); struct page *page; - page = find_get_entry(address_space, idx); + page = xa_load(&address_space->i_pages, idx); if (xa_is_value(page)) return page; - if (page) - put_page(page); return NULL; } /* - * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, + * add_to_swap_cache resembles filemap_add_folio on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ -int add_to_swap_cache(struct page *page, swp_entry_t entry, +int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp, void **shadowp) { struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); - XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); - unsigned long i, nr = thp_nr_pages(page); + XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); + unsigned long i, nr = folio_nr_pages(folio); void *old; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageSwapCache(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); + xas_set_update(&xas, workingset_update_node); - page_ref_add(page, nr); - SetPageSwapCache(page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); - do { - unsigned long nr_shadows = 0; + folio_ref_add(folio, nr); + folio_set_swapcache(folio); + folio->swap = entry; + do { xas_lock_irq(&xas); xas_create_range(&xas); if (xas_error(&xas)) goto unlock; for (i = 0; i < nr; i++) { - VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); - old = xas_load(&xas); - if (xa_is_value(old)) { - nr_shadows++; - if (shadowp) + VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio); + if (shadowp) { + old = xas_load(&xas); + if (xa_is_value(old)) *shadowp = old; } - set_page_private(page + i, entry.val + i); - xas_store(&xas, page); + xas_store(&xas, folio); xas_next(&xas); } - address_space->nrexceptional -= nr_shadows; address_space->nrpages += nr; - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); - ADD_CACHE_INFO(add_total, nr); + __node_stat_mod_folio(folio, NR_FILE_PAGES, nr); + __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); @@ -171,59 +128,63 @@ unlock: if (!xas_error(&xas)) return 0; - ClearPageSwapCache(page); - page_ref_sub(page, nr); + folio_clear_swapcache(folio); + folio_ref_sub(folio, nr); return xas_error(&xas); } /* - * This must be called only on pages that have + * This must be called only on folios that have * been verified to be in the swap cache. */ -void __delete_from_swap_cache(struct page *page, +void __delete_from_swap_cache(struct folio *folio, swp_entry_t entry, void *shadow) { struct address_space *address_space = swap_address_space(entry); - int i, nr = thp_nr_pages(page); + int i; + long nr = folio_nr_pages(folio); pgoff_t idx = swp_offset(entry); XA_STATE(xas, &address_space->i_pages, idx); - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageSwapCache(page), page); - VM_BUG_ON_PAGE(PageWriteback(page), page); + xas_set_update(&xas, workingset_update_node); + + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); + VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); for (i = 0; i < nr; i++) { void *entry = xas_store(&xas, shadow); - VM_BUG_ON_PAGE(entry != page, entry); - set_page_private(page + i, 0); + VM_BUG_ON_PAGE(entry != folio, entry); xas_next(&xas); } - ClearPageSwapCache(page); - if (shadow) - address_space->nrexceptional += nr; + folio->swap.val = 0; + folio_clear_swapcache(folio); address_space->nrpages -= nr; - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); - ADD_CACHE_INFO(del_total, nr); + __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr); + __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr); } /** - * add_to_swap - allocate swap space for a page - * @page: page we want to move to swap + * add_to_swap - allocate swap space for a folio + * @folio: folio we want to move to swap + * + * Allocate swap space for the folio and add the folio to the + * swap cache. * - * Allocate swap space for the page and add the page to the - * swap cache. Caller needs to hold the page lock. + * Context: Caller needs to hold the folio lock. + * Return: Whether the folio was added to the swap cache. */ -int add_to_swap(struct page *page) +bool add_to_swap(struct folio *folio) { swp_entry_t entry; int err; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageUptodate(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio); - entry = get_swap_page(page); + entry = folio_alloc_swap(folio); if (!entry.val) - return 0; + return false; /* * XArray node allocations from PF_MEMALLOC contexts could @@ -236,7 +197,7 @@ int add_to_swap(struct page *page) /* * Add it to the swap cache. */ - err = add_to_swap_cache(page, entry, + err = add_to_swap_cache(folio, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL); if (err) /* @@ -245,41 +206,42 @@ int add_to_swap(struct page *page) */ goto fail; /* - * Normally the page will be dirtied in unmap because its pte should be - * dirty. A special case is MADV_FREE page. The page'e pte could have - * dirty bit cleared but the page's SwapBacked bit is still set because - * clearing the dirty bit and SwapBacked bit has no lock protected. For - * such page, unmap will not set dirty bit for it, so page reclaim will - * not write the page out. This can cause data corruption when the page - * is swap in later. Always setting the dirty bit for the page solves - * the problem. + * Normally the folio will be dirtied in unmap because its + * pte should be dirty. A special case is MADV_FREE page. The + * page's pte could have dirty bit cleared but the folio's + * SwapBacked flag is still set because clearing the dirty bit + * and SwapBacked flag has no lock protected. For such folio, + * unmap will not set dirty bit for it, so folio reclaim will + * not write the folio out. This can cause data corruption when + * the folio is swapped in later. Always setting the dirty flag + * for the folio solves the problem. */ - set_page_dirty(page); + folio_mark_dirty(folio); - return 1; + return true; fail: - put_swap_page(page, entry); - return 0; + put_swap_folio(folio, entry); + return false; } /* - * This must be called only on pages that have + * This must be called only on folios that have * been verified to be in the swap cache and locked. - * It will never put the page into the free list, - * the caller has a reference on the page. + * It will never put the folio into the free list, + * the caller has a reference on the folio. */ -void delete_from_swap_cache(struct page *page) +void delete_from_swap_cache(struct folio *folio) { - swp_entry_t entry = { .val = page_private(page) }; + swp_entry_t entry = folio->swap; struct address_space *address_space = swap_address_space(entry); xa_lock_irq(&address_space->i_pages); - __delete_from_swap_cache(page, entry, NULL); + __delete_from_swap_cache(folio, entry, NULL); xa_unlock_irq(&address_space->i_pages); - put_swap_page(page, entry); - page_ref_sub(page, thp_nr_pages(page)); + put_swap_folio(folio, entry); + folio_ref_sub(folio, folio_nr_pages(folio)); } void clear_shadow_from_swap_cache(int type, unsigned long begin, @@ -289,19 +251,18 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, void *old; for (;;) { - unsigned long nr_shadows = 0; swp_entry_t entry = swp_entry(type, curr); struct address_space *address_space = swap_address_space(entry); XA_STATE(xas, &address_space->i_pages, curr); + xas_set_update(&xas, workingset_update_node); + xa_lock_irq(&address_space->i_pages); xas_for_each(&xas, old, end) { if (!xa_is_value(old)) continue; xas_store(&xas, NULL); - nr_shadows++; } - address_space->nrexceptional -= nr_shadows; xa_unlock_irq(&address_space->i_pages); /* search the next swapcache until we meet end */ @@ -313,23 +274,26 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, } } -/* - * If we are the only user, then try to free up the swap cache. - * - * Its ok to check for PageSwapCache without the page lock +/* + * If we are the only user, then try to free up the swap cache. + * + * Its ok to check the swapcache flag without the folio lock * here because we are going to recheck again inside - * try_to_free_swap() _with_ the lock. + * folio_free_swap() _with_ the lock. * - Marcelo */ -static inline void free_swap_cache(struct page *page) +void free_swap_cache(struct page *page) { - if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) { - try_to_free_swap(page); - unlock_page(page); + struct folio *folio = page_folio(page); + + if (folio_test_swapcache(folio) && !folio_mapped(folio) && + folio_trylock(folio)) { + folio_free_swap(folio); + folio_unlock(folio); } } -/* +/* * Perform a free_page(), also freeing any swap cache associated with * this page if it is the last user of the page. */ @@ -344,15 +308,12 @@ void free_page_and_swap_cache(struct page *page) * Passed an array of pages, drop them all from swapcache and then release * them. They are removed from the LRU and freed if this is their last use. */ -void free_pages_and_swap_cache(struct page **pages, int nr) +void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { - struct page **pagep = pages; - int i; - lru_add_drain(); - for (i = 0; i < nr; i++) - free_swap_cache(pagep[i]); - release_pages(pagep, nr); + for (int i = 0; i < nr; i++) + free_swap_cache(encoded_page_ptr(pages[i])); + release_pages(pages, nr); } static inline bool swap_use_vma_readahead(void) @@ -361,37 +322,31 @@ static inline bool swap_use_vma_readahead(void) } /* - * Lookup a swap entry in the swap cache. A found page will be returned + * Lookup a swap entry in the swap cache. A found folio will be returned * unlocked and with its refcount incremented - we rely on the kernel - * lock getting page table operations atomic even if we drop the page + * lock getting page table operations atomic even if we drop the folio * lock before returning. + * + * Caller must lock the swap device or hold a reference to keep it valid. */ -struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, - unsigned long addr) +struct folio *swap_cache_get_folio(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr) { - struct page *page; - struct swap_info_struct *si; + struct folio *folio; - si = get_swap_device(entry); - if (!si) - return NULL; - page = find_get_page(swap_address_space(entry), swp_offset(entry)); - put_swap_device(si); - - INC_CACHE_INFO(find_total); - if (page) { + folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); + if (!IS_ERR(folio)) { bool vma_ra = swap_use_vma_readahead(); bool readahead; - INC_CACHE_INFO(find_success); /* * At the moment, we don't support PG_readahead for anon THP * so let's bail out rather than confusing the readahead stat. */ - if (unlikely(PageTransCompound(page))) - return page; + if (unlikely(folio_test_large(folio))) + return folio; - readahead = TestClearPageReadahead(page); + readahead = folio_test_clear_readahead(folio); if (vma && vma_ra) { unsigned long ra_val; int win, hits; @@ -410,69 +365,75 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, if (!vma || !vma_ra) atomic_inc(&swapin_readahead_hits); } + } else { + folio = NULL; } - return page; + return folio; } /** - * find_get_incore_page - Find and get a page from the page or swap caches. + * filemap_get_incore_folio - Find and get a folio from the page or swap caches. * @mapping: The address_space to search. * @index: The page cache index. * - * This differs from find_get_page() in that it will also look for the - * page in the swap cache. + * This differs from filemap_get_folio() in that it will also look for the + * folio in the swap cache. * - * Return: The found page or %NULL. + * Return: The found folio or %NULL. */ -struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) +struct folio *filemap_get_incore_folio(struct address_space *mapping, + pgoff_t index) { swp_entry_t swp; struct swap_info_struct *si; - struct page *page = find_get_entry(mapping, index); + struct folio *folio = filemap_get_entry(mapping, index); - if (!page) - return page; - if (!xa_is_value(page)) - return find_subpage(page, index); + if (!folio) + return ERR_PTR(-ENOENT); + if (!xa_is_value(folio)) + return folio; if (!shmem_mapping(mapping)) - return NULL; + return ERR_PTR(-ENOENT); - swp = radix_to_swp_entry(page); + swp = radix_to_swp_entry(folio); + /* There might be swapin error entries in shmem mapping. */ + if (non_swap_entry(swp)) + return ERR_PTR(-ENOENT); /* Prevent swapoff from happening to us */ si = get_swap_device(swp); if (!si) - return NULL; - page = find_get_page(swap_address_space(swp), swp_offset(swp)); + return ERR_PTR(-ENOENT); + index = swp_offset(swp); + folio = filemap_get_folio(swap_address_space(swp), index); put_swap_device(si); - return page; + return folio; } -struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, unsigned long addr, - bool *new_page_allocated) +struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, + bool skip_if_exists) { struct swap_info_struct *si; - struct page *page; + struct folio *folio; void *shadow = NULL; *new_page_allocated = false; + si = get_swap_device(entry); + if (!si) + return NULL; for (;;) { int err; /* * First check the swap cache. Since this is normally - * called after lookup_swap_cache() failed, re-calling + * called after swap_cache_get_folio() failed, re-calling * that would confuse statistics. */ - si = get_swap_device(entry); - if (!si) - return NULL; - page = find_get_page(swap_address_space(entry), - swp_offset(entry)); - put_swap_device(si); - if (page) - return page; + folio = filemap_get_folio(swap_address_space(entry), + swp_offset(entry)); + if (!IS_ERR(folio)) + goto got_folio; /* * Just skip read ahead for unused swap slot. @@ -482,17 +443,18 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * as SWAP_HAS_CACHE. That's done in later part of code or * else swap_off will be aborted if we return NULL. */ - if (!__swp_swapcount(entry) && swap_slot_cache_enabled) - return NULL; + if (!swap_swapcount(si, entry) && swap_slot_cache_enabled) + goto fail_put_swap; /* - * Get a new page to read into from swap. Allocate it now, + * Get a new folio to read into from swap. Allocate it now, * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will * cause any racers to loop around until we add it to cache. */ - page = alloc_page_vma(gfp_mask, vma, addr); - if (!page) - return NULL; + folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0, + mpol, ilx, numa_node_id()); + if (!folio) + goto fail_put_swap; /* * Swap entry may have been freed since our caller observed it. @@ -501,50 +463,63 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (!err) break; - put_page(page); + folio_put(folio); if (err != -EEXIST) - return NULL; + goto fail_put_swap; + + /* + * Protect against a recursive call to __read_swap_cache_async() + * on the same entry waiting forever here because SWAP_HAS_CACHE + * is set but the folio is not the swap cache yet. This can + * happen today if mem_cgroup_swapin_charge_folio() below + * triggers reclaim through zswap, which may call + * __read_swap_cache_async() in the writeback path. + */ + if (skip_if_exists) + goto fail_put_swap; /* * We might race against __delete_from_swap_cache(), and * stumble across a swap_map entry whose SWAP_HAS_CACHE * has not yet been cleared. Or race against another * __read_swap_cache_async(), which has set SWAP_HAS_CACHE - * in swap_map, but not yet added its page to swap cache. + * in swap_map, but not yet added its folio to swap cache. */ - cond_resched(); + schedule_timeout_uninterruptible(1); } /* - * The swap entry is ours to swap in. Prepare the new page. + * The swap entry is ours to swap in. Prepare the new folio. */ - __SetPageLocked(page); - __SetPageSwapBacked(page); + __folio_set_locked(folio); + __folio_set_swapbacked(folio); - /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) { - put_swap_page(page, entry); + if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry)) goto fail_unlock; - } - if (mem_cgroup_charge(page, NULL, gfp_mask)) { - delete_from_swap_cache(page); + /* May fail (-ENOMEM) if XArray node allocation failed. */ + if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; - } + + mem_cgroup_swapin_uncharge_swap(entry); if (shadow) - workingset_refault(page, shadow); + workingset_refault(folio, shadow); - /* Caller will initiate read into locked page */ - SetPageWorkingset(page); - lru_cache_add(page); + /* Caller will initiate read into locked folio */ + folio_add_lru(folio); *new_page_allocated = true; - return page; +got_folio: + put_swap_device(si); + return folio; fail_unlock: - unlock_page(page); - put_page(page); + put_swap_folio(folio, entry); + folio_unlock(folio); + folio_put(folio); +fail_put_swap: + put_swap_device(si); return NULL; } @@ -553,18 +528,28 @@ fail_unlock: * and reading the disk if it is not already cached. * A failure return means that either the page allocation failed or that * the swap entry is no longer in use. + * + * get/put_swap_device() aren't needed to call this function, because + * __read_swap_cache_async() call them and swap_read_folio() holds the + * swap cache folio lock. */ -struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, unsigned long addr, bool do_poll) +struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, unsigned long addr, + struct swap_iocb **plug) { - bool page_was_allocated; - struct page *retpage = __read_swap_cache_async(entry, gfp_mask, - vma, addr, &page_was_allocated); - - if (page_was_allocated) - swap_readpage(retpage, do_poll); - - return retpage; + bool page_allocated; + struct mempolicy *mpol; + pgoff_t ilx; + struct folio *folio; + + mpol = get_vma_policy(vma, addr, 0, &ilx); + folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated, false); + mpol_cond_put(mpol); + + if (page_allocated) + swap_read_folio(folio, false, plug); + return folio; } static unsigned int __swapin_nr_pages(unsigned long prev_offset, @@ -632,46 +617,37 @@ static unsigned long swapin_nr_pages(unsigned long offset) * swap_cluster_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory * @gfp_mask: memory allocation flags - * @vmf: fault information + * @mpol: NUMA memory allocation policy to be applied + * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * - * Returns the struct page for entry and addr, after queueing swapin. + * Returns the struct folio for entry and addr, after queueing swapin. * * Primitive swap readahead code. We simply read an aligned block of * (1 << page_cluster) entries in the swap area. This method is chosen * because it doesn't cost us any seek time. We also make sure to queue * the 'original' request together with the readahead ones... * - * This has been extended to use the NUMA policies from the mm triggering - * the readahead. - * - * Caller must hold read mmap_lock if vmf->vma is not NULL. + * Note: it is intentional that the same NUMA policy and interleave index + * are used for every page of the readahead: neighbouring pages on swap + * are fairly likely to have been swapped out from the same node. */ -struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, - struct vm_fault *vmf) +struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, + struct mempolicy *mpol, pgoff_t ilx) { - struct page *page; + struct folio *folio; unsigned long entry_offset = swp_offset(entry); unsigned long offset = entry_offset; unsigned long start_offset, end_offset; unsigned long mask; struct swap_info_struct *si = swp_swap_info(entry); struct blk_plug plug; - bool do_poll = true, page_allocated; - struct vm_area_struct *vma = vmf->vma; - unsigned long addr = vmf->address; + struct swap_iocb *splug = NULL; + bool page_allocated; mask = swapin_nr_pages(offset) - 1; if (!mask) goto skip; - /* Test swap type to make sure the dereference is safe */ - if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) { - struct inode *inode = si->swap_file->f_mapping->host; - if (inode_read_congested(inode)) - goto skip; - } - - do_poll = false; /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; end_offset = offset | mask; @@ -683,25 +659,32 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ - page = __read_swap_cache_async( - swp_entry(swp_type(entry), offset), - gfp_mask, vma, addr, &page_allocated); - if (!page) + folio = __read_swap_cache_async( + swp_entry(swp_type(entry), offset), + gfp_mask, mpol, ilx, &page_allocated, false); + if (!folio) continue; if (page_allocated) { - swap_readpage(page, false); + swap_read_folio(folio, false, &splug); if (offset != entry_offset) { - SetPageReadahead(page); + folio_set_readahead(folio); count_vm_event(SWAP_RA); } } - put_page(page); + folio_put(folio); } blk_finish_plug(&plug); - + swap_read_unplug(splug); lru_add_drain(); /* Push any new pages onto the LRU now */ skip: - return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll); + /* The page was likely read above, so no need for plugging here */ + folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated, false); + if (unlikely(page_allocated)) { + zswap_folio_swapin(folio); + swap_read_folio(folio, false, NULL); + } + return folio; } int init_swap_address_space(unsigned int type, unsigned long nr_pages) @@ -729,37 +712,32 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages) void exit_swap_address_space(unsigned int type) { - kvfree(swapper_spaces[type]); + int i; + struct address_space *spaces = swapper_spaces[type]; + + for (i = 0; i < nr_swapper_spaces[type]; i++) + VM_WARN_ON_ONCE(!mapping_empty(&spaces[i])); + kvfree(spaces); nr_swapper_spaces[type] = 0; swapper_spaces[type] = NULL; } -static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma, - unsigned long faddr, - unsigned long lpfn, - unsigned long rpfn, - unsigned long *start, - unsigned long *end) -{ - *start = max3(lpfn, PFN_DOWN(vma->vm_start), - PFN_DOWN(faddr & PMD_MASK)); - *end = min3(rpfn, PFN_DOWN(vma->vm_end), - PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); -} +#define SWAP_RA_ORDER_CEILING 5 + +struct vma_swap_readahead { + unsigned short win; + unsigned short offset; + unsigned short nr_pte; +}; static void swap_ra_info(struct vm_fault *vmf, - struct vma_swap_readahead *ra_info) + struct vma_swap_readahead *ra_info) { struct vm_area_struct *vma = vmf->vma; unsigned long ra_val; - swp_entry_t entry; - unsigned long faddr, pfn, fpfn; + unsigned long faddr, pfn, fpfn, lpfn, rpfn; unsigned long start, end; - pte_t *pte, *orig_pte; - unsigned int max_win, hits, prev_win, win, left; -#ifndef CONFIG_64BIT - pte_t *tpte; -#endif + unsigned int max_win, hits, prev_win, win; max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING); @@ -769,13 +747,6 @@ static void swap_ra_info(struct vm_fault *vmf, } faddr = vmf->address; - orig_pte = pte = pte_offset_map(vmf->pmd, faddr); - entry = pte_to_swp_entry(*pte); - if ((unlikely(non_swap_entry(entry)))) { - pte_unmap(orig_pte); - return; - } - fpfn = PFN_DOWN(faddr); ra_val = GET_SWAP_RA_VAL(vma); pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val)); @@ -785,95 +756,111 @@ static void swap_ra_info(struct vm_fault *vmf, max_win, prev_win); atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0)); - - if (win == 1) { - pte_unmap(orig_pte); + if (win == 1) return; - } - /* Copy the PTEs because the page table may be unmapped */ - if (fpfn == pfn + 1) - swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end); - else if (pfn == fpfn + 1) - swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1, - &start, &end); - else { - left = (win - 1) / 2; - swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left, - &start, &end); + if (fpfn == pfn + 1) { + lpfn = fpfn; + rpfn = fpfn + win; + } else if (pfn == fpfn + 1) { + lpfn = fpfn - win + 1; + rpfn = fpfn + 1; + } else { + unsigned int left = (win - 1) / 2; + + lpfn = fpfn - left; + rpfn = fpfn + win - left; } + start = max3(lpfn, PFN_DOWN(vma->vm_start), + PFN_DOWN(faddr & PMD_MASK)); + end = min3(rpfn, PFN_DOWN(vma->vm_end), + PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); + ra_info->nr_pte = end - start; ra_info->offset = fpfn - start; - pte -= ra_info->offset; -#ifdef CONFIG_64BIT - ra_info->ptes = pte; -#else - tpte = ra_info->ptes; - for (pfn = start; pfn != end; pfn++) - *tpte++ = *pte++; -#endif - pte_unmap(orig_pte); } /** * swap_vma_readahead - swap in pages in hope we need them soon - * @fentry: swap entry of this memory + * @targ_entry: swap entry of the targeted memory * @gfp_mask: memory allocation flags + * @mpol: NUMA memory allocation policy to be applied + * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * @vmf: fault information * - * Returns the struct page for entry and addr, after queueing swapin. + * Returns the struct folio for entry and addr, after queueing swapin. * - * Primitive swap readahead code. We simply read in a few pages whoes + * Primitive swap readahead code. We simply read in a few pages whose * virtual addresses are around the fault address in the same vma. * * Caller must hold read mmap_lock if vmf->vma is not NULL. * */ -static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, - struct vm_fault *vmf) +static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, + struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf) { struct blk_plug plug; - struct vm_area_struct *vma = vmf->vma; - struct page *page; - pte_t *pte, pentry; + struct swap_iocb *splug = NULL; + struct folio *folio; + pte_t *pte = NULL, pentry; + unsigned long addr; swp_entry_t entry; + pgoff_t ilx; unsigned int i; bool page_allocated; - struct vma_swap_readahead ra_info = {0,}; + struct vma_swap_readahead ra_info = { + .win = 1, + }; swap_ra_info(vmf, &ra_info); if (ra_info.win == 1) goto skip; + addr = vmf->address - (ra_info.offset * PAGE_SIZE); + ilx = targ_ilx - ra_info.offset; + blk_start_plug(&plug); - for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte; - i++, pte++) { - pentry = *pte; - if (pte_none(pentry)) - continue; - if (pte_present(pentry)) + for (i = 0; i < ra_info.nr_pte; i++, ilx++, addr += PAGE_SIZE) { + if (!pte++) { + pte = pte_offset_map(vmf->pmd, addr); + if (!pte) + break; + } + pentry = ptep_get_lockless(pte); + if (!is_swap_pte(pentry)) continue; entry = pte_to_swp_entry(pentry); if (unlikely(non_swap_entry(entry))) continue; - page = __read_swap_cache_async(entry, gfp_mask, vma, - vmf->address, &page_allocated); - if (!page) + pte_unmap(pte); + pte = NULL; + folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated, false); + if (!folio) continue; if (page_allocated) { - swap_readpage(page, false); + swap_read_folio(folio, false, &splug); if (i != ra_info.offset) { - SetPageReadahead(page); + folio_set_readahead(folio); count_vm_event(SWAP_RA); } } - put_page(page); + folio_put(folio); } + if (pte) + pte_unmap(pte); blk_finish_plug(&plug); + swap_read_unplug(splug); lru_add_drain(); skip: - return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, - ra_info.win == 1); + /* The folio was likely read above, so no need for plugging here */ + folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, + &page_allocated, false); + if (unlikely(page_allocated)) { + zswap_folio_swapin(folio); + swap_read_folio(folio, false, NULL); + } + return folio; } /** @@ -891,40 +878,48 @@ skip: struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_fault *vmf) { - return swap_use_vma_readahead() ? - swap_vma_readahead(entry, gfp_mask, vmf) : - swap_cluster_readahead(entry, gfp_mask, vmf); + struct mempolicy *mpol; + pgoff_t ilx; + struct folio *folio; + + mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx); + folio = swap_use_vma_readahead() ? + swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) : + swap_cluster_readahead(entry, gfp_mask, mpol, ilx); + mpol_cond_put(mpol); + + if (!folio) + return NULL; + return folio_file_page(folio, swp_offset(entry)); } #ifdef CONFIG_SYSFS static ssize_t vma_ra_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%s\n", enable_vma_readahead ? "true" : "false"); + return sysfs_emit(buf, "%s\n", + enable_vma_readahead ? "true" : "false"); } static ssize_t vma_ra_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) - enable_vma_readahead = true; - else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) - enable_vma_readahead = false; - else - return -EINVAL; + ssize_t ret; + + ret = kstrtobool(buf, &enable_vma_readahead); + if (ret) + return ret; return count; } -static struct kobj_attribute vma_ra_enabled_attr = - __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show, - vma_ra_enabled_store); +static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled); static struct attribute *swap_attrs[] = { &vma_ra_enabled_attr.attr, NULL, }; -static struct attribute_group swap_attr_group = { +static const struct attribute_group swap_attr_group = { .attrs = swap_attrs, }; |