diff options
Diffstat (limited to 'mm/util.c')
-rw-r--r-- | mm/util.c | 440 |
1 files changed, 307 insertions, 133 deletions
diff --git a/mm/util.c b/mm/util.c index 4e21fe7eae27..5a6a9802583b 100644 --- a/mm/util.c +++ b/mm/util.c @@ -27,6 +27,7 @@ #include <linux/uaccess.h> #include "internal.h" +#include "swap.h" /** * kfree_const - conditionally free memory @@ -48,6 +49,7 @@ EXPORT_SYMBOL(kfree_const); * * Return: newly allocated copy of @s or %NULL in case of error */ +noinline char *kstrdup(const char *s, gfp_t gfp) { size_t len; @@ -69,7 +71,8 @@ EXPORT_SYMBOL(kstrdup); * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * - * Note: Strings allocated by kstrdup_const should be freed by kfree_const. + * Note: Strings allocated by kstrdup_const should be freed by kfree_const and + * must not be passed to krealloc(). * * Return: source string if it is in .rodata section otherwise * fallback to kstrdup. @@ -118,7 +121,8 @@ EXPORT_SYMBOL(kstrndup); * @len: memory region length * @gfp: GFP mask to use * - * Return: newly allocated copy of @src or %NULL in case of error + * Return: newly allocated copy of @src or %NULL in case of error, + * result is physically contiguous. Use kfree() to free. */ void *kmemdup(const void *src, size_t len, gfp_t gfp) { @@ -132,6 +136,27 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) EXPORT_SYMBOL(kmemdup); /** + * kvmemdup - duplicate region of memory + * + * @src: memory region to duplicate + * @len: memory region length + * @gfp: GFP mask to use + * + * Return: newly allocated copy of @src or %NULL in case of error, + * result may be not physically contiguous. Use kvfree() to free. + */ +void *kvmemdup(const void *src, size_t len, gfp_t gfp) +{ + void *p; + + p = kvmalloc(len, gfp); + if (p) + memcpy(p, src, len); + return p; +} +EXPORT_SYMBOL(kvmemdup); + +/** * kmemdup_nul - Create a NUL-terminated string from unterminated data * @s: The data to stringify * @len: The size of the data @@ -270,38 +295,6 @@ void *memdup_user_nul(const void __user *src, size_t len) } EXPORT_SYMBOL(memdup_user_nul); -void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev) -{ - struct vm_area_struct *next; - - vma->vm_prev = prev; - if (prev) { - next = prev->vm_next; - prev->vm_next = vma; - } else { - next = mm->mmap; - mm->mmap = vma; - } - vma->vm_next = next; - if (next) - next->vm_prev = vma; -} - -void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma) -{ - struct vm_area_struct *prev, *next; - - next = vma->vm_next; - prev = vma->vm_prev; - if (prev) - prev->vm_next = next; - else - mm->mmap = next; - if (next) - next->vm_prev = prev; -} - /* Check if the vma is being used as a stack by this task */ int vma_is_stack_for_current(struct vm_area_struct *vma) { @@ -310,6 +303,18 @@ int vma_is_stack_for_current(struct vm_area_struct *vma) return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } +/* + * Change backing file, only valid to use during initial VMA setup. + */ +void vma_set_file(struct vm_area_struct *vma, struct file *file) +{ + /* Changing an anonymous vma with this is illegal */ + get_file(file); + swap(vma->vm_file, file); + fput(file); +} +EXPORT_SYMBOL(vma_set_file); + #ifndef STACK_RND_MASK #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ #endif @@ -330,8 +335,40 @@ unsigned long randomize_stack_top(unsigned long stack_top) #endif } +/** + * randomize_page - Generate a random, page aligned address + * @start: The smallest acceptable address the caller will take. + * @range: The size of the area, starting at @start, within which the + * random address must fall. + * + * If @start + @range would overflow, @range is capped. + * + * NOTE: Historical use of randomize_range, which this replaces, presumed that + * @start was already page aligned. We now align it regardless. + * + * Return: A page aligned address within [start, start + range). On error, + * @start is returned. + */ +unsigned long randomize_page(unsigned long start, unsigned long range) +{ + if (!PAGE_ALIGNED(start)) { + range -= PAGE_ALIGN(start) - start; + start = PAGE_ALIGN(start); + } + + if (start > ULONG_MAX - range) + range = ULONG_MAX - start; + + range >>= PAGE_SHIFT; + + if (range == 0) + return start; + + return start + (get_random_long() % range << PAGE_SHIFT); +} + #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT -unsigned long arch_randomize_brk(struct mm_struct *mm) +unsigned long __weak arch_randomize_brk(struct mm_struct *mm) { /* Is the current task 32bit ? */ if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) @@ -359,7 +396,10 @@ static int mmap_is_legacy(struct rlimit *rlim_stack) if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlim_stack->rlim_cur == RLIM_INFINITY) + /* On parisc the stack always grows up - so a unlimited stack should + * not be an indicator to use the legacy memory layout. */ + if (rlim_stack->rlim_cur == RLIM_INFINITY && + !IS_ENABLED(CONFIG_STACK_GROWSUP)) return 1; return sysctl_legacy_va_layout; @@ -374,6 +414,15 @@ static int mmap_is_legacy(struct rlimit *rlim_stack) static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { +#ifdef CONFIG_STACK_GROWSUP + /* + * For an upwards growing stack the calculation is much simpler. + * Memory for the maximum stack size is reserved at the top of the + * task. mmap_base starts directly below the stack and grows + * downwards. + */ + return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd); +#else unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_guard_gap; @@ -391,6 +440,7 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) gap = MAX_GAP; return PAGE_ALIGN(STACK_TOP - gap - rnd); +#endif } void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) @@ -503,7 +553,7 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, if (!ret) { if (mmap_write_lock_killable(mm)) return -EINTR; - ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate, + ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate, &uf); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); @@ -536,13 +586,10 @@ EXPORT_SYMBOL(vm_mmap); * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * - * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. + * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is * preferable to the vmalloc fallback, due to visible performance drawbacks. * - * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not - * fall back to vmalloc. - * * Return: pointer to the allocated memory of %NULL in case of failure */ void *kvmalloc_node(size_t size, gfp_t flags, int node) @@ -551,13 +598,6 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) void *ret; /* - * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables) - * so the given set of flags has to be compatible. - */ - if ((flags & GFP_KERNEL) != GFP_KERNEL) - return kmalloc_node(size, flags, node); - - /* * We want to attempt a large physically contiguous block first because * it is less likely to fragment multiple larger blocks and therefore * contribute to a long term fragmentation less than vmalloc fallback. @@ -569,6 +609,9 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL)) kmalloc_flags |= __GFP_NORETRY; + + /* nofail semantic is implemented by the vmalloc fallback */ + kmalloc_flags &= ~__GFP_NOFAIL; } ret = kmalloc_node(size, kmalloc_flags, node); @@ -580,8 +623,25 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (ret || size <= PAGE_SIZE) return ret; - return __vmalloc_node(size, 1, flags, node, - __builtin_return_address(0)); + /* non-sleeping allocations are not supported by vmalloc */ + if (!gfpflags_allow_blocking(flags)) + return NULL; + + /* Don't even allow crazy sizes */ + if (unlikely(size > INT_MAX)) { + WARN_ON_ONCE(!(flags & __GFP_NOWARN)); + return NULL; + } + + /* + * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, + * since the callers already cannot assume anything + * about the resulting pointer, and cannot play + * protection games. + */ + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, + node, __builtin_return_address(0)); } EXPORT_SYMBOL(kvmalloc_node); @@ -622,111 +682,134 @@ void kvfree_sensitive(const void *addr, size_t len) } EXPORT_SYMBOL(kvfree_sensitive); -static inline void *__page_rmapping(struct page *page) +void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) { - unsigned long mapping; + void *newp; + + if (oldsize >= newsize) + return (void *)p; + newp = kvmalloc(newsize, flags); + if (!newp) + return NULL; + memcpy(newp, p, oldsize); + kvfree(p); + return newp; +} +EXPORT_SYMBOL(kvrealloc); - mapping = (unsigned long)page->mapping; - mapping &= ~PAGE_MAPPING_FLAGS; +/** + * __vmalloc_array - allocate memory for a virtually contiguous array. + * @n: number of elements. + * @size: element size. + * @flags: the type of memory to allocate (see kmalloc). + */ +void *__vmalloc_array(size_t n, size_t size, gfp_t flags) +{ + size_t bytes; - return (void *)mapping; + if (unlikely(check_mul_overflow(n, size, &bytes))) + return NULL; + return __vmalloc(bytes, flags); } +EXPORT_SYMBOL(__vmalloc_array); -/* Neutral page->mapping pointer to address_space or anon_vma or other */ -void *page_rmapping(struct page *page) +/** + * vmalloc_array - allocate memory for a virtually contiguous array. + * @n: number of elements. + * @size: element size. + */ +void *vmalloc_array(size_t n, size_t size) { - page = compound_head(page); - return __page_rmapping(page); + return __vmalloc_array(n, size, GFP_KERNEL); } +EXPORT_SYMBOL(vmalloc_array); -/* - * Return true if this page is mapped into pagetables. - * For compound page it returns true if any subpage of compound page is mapped. - */ -bool page_mapped(struct page *page) -{ - int i; - - if (likely(!PageCompound(page))) - return atomic_read(&page->_mapcount) >= 0; - page = compound_head(page); - if (atomic_read(compound_mapcount_ptr(page)) >= 0) - return true; - if (PageHuge(page)) - return false; - for (i = 0; i < compound_nr(page); i++) { - if (atomic_read(&page[i]._mapcount) >= 0) - return true; - } - return false; +/** + * __vcalloc - allocate and zero memory for a virtually contiguous array. + * @n: number of elements. + * @size: element size. + * @flags: the type of memory to allocate (see kmalloc). + */ +void *__vcalloc(size_t n, size_t size, gfp_t flags) +{ + return __vmalloc_array(n, size, flags | __GFP_ZERO); } -EXPORT_SYMBOL(page_mapped); +EXPORT_SYMBOL(__vcalloc); -struct anon_vma *page_anon_vma(struct page *page) +/** + * vcalloc - allocate and zero memory for a virtually contiguous array. + * @n: number of elements. + * @size: element size. + */ +void *vcalloc(size_t n, size_t size) +{ + return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO); +} +EXPORT_SYMBOL(vcalloc); + +struct anon_vma *folio_anon_vma(struct folio *folio) { - unsigned long mapping; + unsigned long mapping = (unsigned long)folio->mapping; - page = compound_head(page); - mapping = (unsigned long)page->mapping; if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) return NULL; - return __page_rmapping(page); + return (void *)(mapping - PAGE_MAPPING_ANON); } -struct address_space *page_mapping(struct page *page) +/** + * folio_mapping - Find the mapping where this folio is stored. + * @folio: The folio. + * + * For folios which are in the page cache, return the mapping that this + * page belongs to. Folios in the swap cache return the swap mapping + * this page is stored in (which is different from the mapping for the + * swap file or swap device where the data is stored). + * + * You can call this for folios which aren't in the swap cache or page + * cache and it will return NULL. + */ +struct address_space *folio_mapping(struct folio *folio) { struct address_space *mapping; - page = compound_head(page); - /* This happens if someone calls flush_dcache_page on slab page */ - if (unlikely(PageSlab(page))) + if (unlikely(folio_test_slab(folio))) return NULL; - if (unlikely(PageSwapCache(page))) { - swp_entry_t entry; - - entry.val = page_private(page); - return swap_address_space(entry); - } + if (unlikely(folio_test_swapcache(folio))) + return swap_address_space(folio->swap); - mapping = page->mapping; - if ((unsigned long)mapping & PAGE_MAPPING_ANON) + mapping = folio->mapping; + if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) return NULL; - return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS); + return mapping; } -EXPORT_SYMBOL(page_mapping); +EXPORT_SYMBOL(folio_mapping); -/* - * For file cache pages, return the address_space, otherwise return NULL +/** + * folio_copy - Copy the contents of one folio to another. + * @dst: Folio to copy to. + * @src: Folio to copy from. + * + * The bytes in the folio represented by @src are copied to @dst. + * Assumes the caller has validated that @dst is at least as large as @src. + * Can be called in atomic context for order-0 folios, but if the folio is + * larger, it may sleep. */ -struct address_space *page_mapping_file(struct page *page) -{ - if (unlikely(PageSwapCache(page))) - return NULL; - return page_mapping(page); -} - -/* Slow path of page_mapcount() for compound pages */ -int __page_mapcount(struct page *page) +void folio_copy(struct folio *dst, struct folio *src) { - int ret; - - ret = atomic_read(&page->_mapcount) + 1; - /* - * For file THP page->_mapcount contains total number of mapping - * of the page: no need to look into compound_mapcount. - */ - if (!PageAnon(page) && !PageHuge(page)) - return ret; - page = compound_head(page); - ret += atomic_read(compound_mapcount_ptr(page)) + 1; - if (PageDoubleMap(page)) - ret--; - return ret; + long i = 0; + long nr = folio_nr_pages(src); + + for (;;) { + copy_highpage(folio_page(dst, i), folio_page(src, i)); + if (++i == nr) + break; + cond_resched(); + } } -EXPORT_SYMBOL_GPL(__page_mapcount); +EXPORT_SYMBOL(folio_copy); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; int sysctl_overcommit_ratio __read_mostly = 50; @@ -755,14 +838,14 @@ int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; - int new_policy; + int new_policy = -1; int ret; /* * The deviation of sync_overcommit_as could be big with loose policy * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply - * with the strict "NEVER", and to avoid possible race condtion (even + * with the strict "NEVER", and to avoid possible race condition (even * though user usually won't too frequently do the switching to policy * OVERCOMMIT_NEVER), the switch is done in the following order: * 1. changing the batch @@ -773,7 +856,7 @@ int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer, t = *table; t.data = &new_policy; ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); - if (ret) + if (ret || new_policy == -1) return ret; mm_compute_batch(new_policy); @@ -846,7 +929,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); * succeed and -ENOMEM implies there is not. * * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting.rst + * vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst * * Strict overcommit modes added 2002 Feb 26 by Alan Cox. * Additional code 2002 Jul 20 by Robert Love. @@ -893,6 +976,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; error: + pr_warn_ratelimited("%s: pid: %d, comm: %s, not enough memory for the allocation\n", + __func__, current->pid, current->comm); vm_unacct_memory(pages); return -ENOMEM; @@ -962,10 +1047,99 @@ int __weak memcmp_pages(struct page *page1, struct page *page2) char *addr1, *addr2; int ret; - addr1 = kmap_atomic(page1); - addr2 = kmap_atomic(page2); + addr1 = kmap_local_page(page1); + addr2 = kmap_local_page(page2); ret = memcmp(addr1, addr2, PAGE_SIZE); - kunmap_atomic(addr2); - kunmap_atomic(addr1); + kunmap_local(addr2); + kunmap_local(addr1); return ret; } + +#ifdef CONFIG_PRINTK +/** + * mem_dump_obj - Print available provenance information + * @object: object for which to find provenance information. + * + * This function uses pr_cont(), so that the caller is expected to have + * printed out whatever preamble is appropriate. The provenance information + * depends on the type of object and on how much debugging is enabled. + * For example, for a slab-cache object, the slab name is printed, and, + * if available, the return address and stack trace from the allocation + * and last free path of that object. + */ +void mem_dump_obj(void *object) +{ + const char *type; + + if (kmem_dump_obj(object)) + return; + + if (vmalloc_dump_obj(object)) + return; + + if (is_vmalloc_addr(object)) + type = "vmalloc memory"; + else if (virt_addr_valid(object)) + type = "non-slab/vmalloc memory"; + else if (object == NULL) + type = "NULL pointer"; + else if (object == ZERO_SIZE_PTR) + type = "zero-size pointer"; + else + type = "non-paged memory"; + + pr_cont(" %s\n", type); +} +EXPORT_SYMBOL_GPL(mem_dump_obj); +#endif + +/* + * A driver might set a page logically offline -- PageOffline() -- and + * turn the page inaccessible in the hypervisor; after that, access to page + * content can be fatal. + * + * Some special PFN walkers -- i.e., /proc/kcore -- read content of random + * pages after checking PageOffline(); however, these PFN walkers can race + * with drivers that set PageOffline(). + * + * page_offline_freeze()/page_offline_thaw() allows for a subsystem to + * synchronize with such drivers, achieving that a page cannot be set + * PageOffline() while frozen. + * + * page_offline_begin()/page_offline_end() is used by drivers that care about + * such races when setting a page PageOffline(). + */ +static DECLARE_RWSEM(page_offline_rwsem); + +void page_offline_freeze(void) +{ + down_read(&page_offline_rwsem); +} + +void page_offline_thaw(void) +{ + up_read(&page_offline_rwsem); +} + +void page_offline_begin(void) +{ + down_write(&page_offline_rwsem); +} +EXPORT_SYMBOL(page_offline_begin); + +void page_offline_end(void) +{ + up_write(&page_offline_rwsem); +} +EXPORT_SYMBOL(page_offline_end); + +#ifndef flush_dcache_folio +void flush_dcache_folio(struct folio *folio) +{ + long i, nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) + flush_dcache_page(folio_page(folio, i)); +} +EXPORT_SYMBOL(flush_dcache_folio); +#endif |