/*
 *  linux/mm/vmalloc.c
 *
 *  Copyright (C) 1993  Linus Torvalds
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 *  Numa awareness, Christoph Lameter, SGI, June 2005
 */

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/llist.h>
#include <linux/bitops.h>

#include <asm/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>

#include "internal.h"

struct vfree_deferred {
	struct llist_head list;
	struct work_struct wq;
};
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);

static void __vunmap(const void *, int);

static void free_work(struct work_struct *w)
{
	struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
	struct llist_node *llnode = llist_del_all(&p->list);
	while (llnode) {
		void *p = llnode;
		llnode = llist_next(llnode);
		__vunmap(p, 1);
	}
}

/*** Page table manipulation functions ***/

static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
	} while (pte++, addr += PAGE_SIZE, addr != end);
}

static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_clear_huge(pmd))
			continue;
		if (pmd_none_or_clear_bad(pmd))
			continue;
		vunmap_pte_range(pmd, addr, next);
	} while (pmd++, addr = next, addr != end);
}

static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_clear_huge(pud))
			continue;
		if (pud_none_or_clear_bad(pud))
			continue;
		vunmap_pmd_range(pud, addr, next);
	} while (pud++, addr = next, addr != end);
}

static void vunmap_page_range(unsigned long addr, unsigned long end)
{
	pgd_t *pgd;
	unsigned long next;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		vunmap_pud_range(pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}

static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	pte_t *pte;

	/*
	 * nr is a running index into the array which helps higher level
	 * callers keep track of where we're up to.
	 */

	pte = pte_alloc_kernel(pmd, addr);
	if (!pte)
		return -ENOMEM;
	do {
		struct page *page = pages[*nr];

		if (WARN_ON(!pte_none(*pte)))
			return -EBUSY;
		if (WARN_ON(!page))
			return -ENOMEM;
		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
		(*nr)++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	return 0;
}

static int vmap_pmd_range(pud_t *pud, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_alloc(&init_mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_alloc(&init_mm, pgd, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}

/*
 * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
 * will have pfns corresponding to the "pages" array.
 *
 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
 */
static int vmap_page_range_noflush(unsigned long start, unsigned long end,
				   pgprot_t prot, struct page **pages)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long addr = start;
	int err = 0;
	int nr = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
		if (err)
			return err;
	} while (pgd++, addr = next, addr != end);

	return nr;
}

static int vmap_page_range(unsigned long start, unsigned long end,
			   pgprot_t prot, struct page **pages)
{
	int ret;

	ret = vmap_page_range_noflush(start, end, prot, pages);
	flush_cache_vmap(start, end);
	return ret;
}

int is_vmalloc_or_module_addr(const void *x)
{
	/*
	 * ARM, x86-64 and sparc64 put modules in a special place,
	 * and fall back on vmalloc() if that fails. Others
	 * just put it in the vmalloc space.
	 */
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
	unsigned long addr = (unsigned long)x;
	if (addr >= MODULES_VADDR && addr < MODULES_END)
		return 1;
#endif
	return is_vmalloc_addr(x);
}

/*
 * Walk a vmap address to the struct page it maps.
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
	unsigned long addr = (unsigned long) vmalloc_addr;
	struct page *page = NULL;
	pgd_t *pgd = pgd_offset_k(addr);

	/*
	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
	 * architectures that do not vmalloc module space
	 */
	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));

	if (!pgd_none(*pgd)) {
		pud_t *pud = pud_offset(pgd, addr);
		if (!pud_none(*pud)) {
			pmd_t *pmd = pmd_offset(pud, addr);
			if (!pmd_none(*pmd)) {
				pte_t *ptep, pte;

				ptep = pte_offset_map(pmd, addr);
				pte = *ptep;
				if (pte_present(pte))
					page = pte_page(pte);
				pte_unmap(ptep);
			}
		}
	}
	return page;
}
EXPORT_SYMBOL(vmalloc_to_page);

/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);


/*** Global kva allocator ***/

#define VM_LAZY_FREE	0x01
#define VM_LAZY_FREEING	0x02
#define VM_VM_AREA	0x04

static DEFINE_SPINLOCK(vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
static struct rb_root vmap_area_root = RB_ROOT;

/* The vmap cache globals are protected by vmap_area_lock */
static struct rb_node *free_vmap_cache;
static unsigned long cached_hole_size;
static unsigned long cached_vstart;
static unsigned long cached_align;

static unsigned long vmap_area_pcpu_hole;

static struct vmap_area *__find_vmap_area(unsigned long addr)
{
	struct rb_node *n = vmap_area_root.rb_node;

	while (n) {
		struct vmap_area *va;

		va = rb_entry(n, struct vmap_area, rb_node);
		if (addr < va->va_start)
			n = n->rb_left;
		else if (addr >= va->va_end)
			n = n->rb_right;
		else
			return va;
	}

	return NULL;
}

static void __insert_vmap_area(struct vmap_area *va)
{
	struct rb_node **p = &vmap_area_root.rb_node;
	struct rb_node *parent = NULL;
	struct rb_node *tmp;

	while (*p) {
		struct vmap_area *tmp_va;

		parent = *p;
		tmp_va = rb_entry(parent, struct vmap_area, rb_node);
		if (va->va_start < tmp_va->va_end)
			p = &(*p)->rb_left;
		else if (va->va_end > tmp_va->va_start)
			p = &(*p)->rb_right;
		else
			BUG();
	}

	rb_link_node(&va->rb_node, parent, p);
	rb_insert_color(&va->rb_node, &vmap_area_root);

	/* address-sort this list */
	tmp = rb_prev(&va->rb_node);
	if (tmp) {
		struct vmap_area *prev;
		prev = rb_entry(tmp, struct vmap_area, rb_node);
		list_add_rcu(&va->list, &prev->list);
	} else
		list_add_rcu(&va->list, &vmap_area_list);
}

static void purge_vmap_area_lazy(void);

/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask)
{
	struct vmap_area *va;
	struct rb_node *n;
	unsigned long addr;
	int purged = 0;
	struct vmap_area *first;

	BUG_ON(!size);
	BUG_ON(offset_in_page(size));
	BUG_ON(!is_power_of_2(align));

	va = kmalloc_node(sizeof(struct vmap_area),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!va))
		return ERR_PTR(-ENOMEM);

	/*
	 * Only scan the relevant parts containing pointers to other objects
	 * to avoid false negatives.
	 */
	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);

retry:
	spin_lock(&vmap_area_lock);
	/*
	 * Invalidate cache if we have more permissive parameters.
	 * cached_hole_size notes the largest hole noticed _below_
	 * the vmap_area cached in free_vmap_cache: if size fits
	 * into that hole, we want to scan from vstart to reuse
	 * the hole instead of allocating above free_vmap_cache.
	 * Note that __free_vmap_area may update free_vmap_cache
	 * without updating cached_hole_size or cached_align.
	 */
	if (!free_vmap_cache ||
			size < cached_hole_size ||
			vstart < cached_vstart ||
			align < cached_align) {
nocache:
		cached_hole_size = 0;
		free_vmap_cache = NULL;
	}
	/* record if we encounter less permissive parameters */
	cached_vstart = vstart;
	cached_align = align;

	/* find starting point for our search */
	if (free_vmap_cache) {
		first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
		addr = ALIGN(first->va_end, align);
		if (addr < vstart)
			goto nocache;
		if (addr + size < addr)
			goto overflow;

	} else {
		addr = ALIGN(vstart, align);
		if (addr + size < addr)
			goto overflow;

		n = vmap_area_root.rb_node;
		first = NULL;

		while (n) {
			struct vmap_area *tmp;
			tmp = rb_entry(n, struct vmap_area, rb_node);
			if (tmp->va_end >= addr) {
				first = tmp;
				if (tmp->va_start <= addr)
					break;
				n = n->rb_left;
			} else
				n = n->rb_right;
		}

		if (!first)
			goto found;
	}

	/* from the starting point, walk areas until a suitable hole is found */
	while (addr + size > first->va_start && addr + size <= vend) {
		if (addr + cached_hole_size < first->va_start)
			cached_hole_size = first->va_start - addr;
		addr = ALIGN(first->va_end, align);
		if (addr + size < addr)
			goto overflow;

		if (list_is_last(&first->list, &vmap_area_list))
			goto found;

		first = list_entry(first->list.next,
				struct vmap_area, list);
	}

found:
	if (addr + size > vend)
		goto overflow;

	va->va_start = addr;
	va->va_end = addr + size;
	va->flags = 0;
	__insert_vmap_area(va);
	free_vmap_cache = &va->rb_node;
	spin_unlock(&vmap_area_lock);

	BUG_ON(va->va_start & (align-1));
	BUG_ON(va->va_start < vstart);
	BUG_ON(va->va_end > vend);

	return va;

overflow:
	spin_unlock(&vmap_area_lock);
	if (!purged) {
		purge_vmap_area_lazy();
		purged = 1;
		goto retry;
	}
	if (printk_ratelimit())
		pr_warn("vmap allocation for size %lu failed: "
			"use vmalloc=<size> to increase size.\n", size);
	kfree(va);
	return ERR_PTR(-EBUSY);
}

static void __free_vmap_area(struct vmap_area *va)
{
	BUG_ON(RB_EMPTY_NODE(&va->rb_node));

	if (free_vmap_cache) {
		if (va->va_end < cached_vstart) {
			free_vmap_cache = NULL;
		} else {
			struct vmap_area *cache;
			cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
			if (va->va_start <= cache->va_start) {
				free_vmap_cache = rb_prev(&va->rb_node);
				/*
				 * We don't try to update cached_hole_size or
				 * cached_align, but it won't go very wrong.
				 */
			}
		}
	}
	rb_erase(&va->rb_node, &vmap_area_root);
	RB_CLEAR_NODE(&va->rb_node);
	list_del_rcu(&va->list);

	/*
	 * Track the highest possible candidate for pcpu area
	 * allocation.  Areas outside of <style>
@media only all and (prefers-color-scheme: dark) {
.highlight .hll { background-color: #49483e }
.highlight .c { color: #75715e } /* Comment */
.highlight .err { color: #960050; background-color: #1e0010 } /* Error */
.highlight .k { color: #66d9ef } /* Keyword */
.highlight .l { color: #ae81ff } /* Literal */
.highlight .n { color: #f8f8f2 } /* Name */
.highlight .o { color: #f92672 } /* Operator */
.highlight .p { color: #f8f8f2 } /* Punctuation */
.highlight .ch { color: #75715e } /* Comment.Hashbang */
.highlight .cm { color: #75715e } /* Comment.Multiline */
.highlight .cp { color: #75715e } /* Comment.Preproc */
.highlight .cpf { color: #75715e } /* Comment.PreprocFile */
.highlight .c1 { color: #75715e } /* Comment.Single */
.highlight .cs { color: #75715e } /* Comment.Special */
.highlight .gd { color: #f92672 } /* Generic.Deleted */
.highlight .ge { font-style: italic } /* Generic.Emph */
.highlight .gi { color: #a6e22e } /* Generic.Inserted */
.highlight .gs { font-weight: bold } /* Generic.Strong */
.highlight .gu { color: #75715e } /* Generic.Subheading */
.highlight .kc { color: #66d9ef } /* Keyword.Constant */
.highlight .kd { color: #66d9ef } /* Keyword.Declaration */
.highlight .kn { color: #f92672 } /* Keyword.Namespace */
.highlight .kp { color: #66d9ef } /* Keyword.Pseudo */
.highlight .kr { color: #66d9ef } /* Keyword.Reserved */
.highlight .kt { color: #66d9ef } /* Keyword.Type */
.highlight .ld { color: #e6db74 } /* Literal.Date */
.highlight .m { color: #ae81ff } /* Literal.Number */
.highlight .s { color: #e6db74 } /* Literal.String */
.highlight .na { color: #a6e22e } /* Name.Attribute */
.highlight .nb { color: #f8f8f2 } /* Name.Builtin */
.highlight .nc { color: #a6e22e } /* Name.Class */
.highlight .no { color: #66d9ef } /* Name.Constant */
.highlight .nd { color: #a6e22e } /* Name.Decorator */
.highlight .ni { color: #f8f8f2 } /* Name.Entity */
.highlight .ne { color: #a6e22e } /* Name.Exception */
.highlight .nf { color: #a6e22e } /* Name.Function */
.highlight .nl { color: #f8f8f2 } /* Name.Label */
.highlight .nn { color: #f8f8f2 } /* Name.Namespace */
.highlight .nx { color: #a6e22e } /* Name.Other */
.highlight .py { color: #f8f8f2 } /* Name.Property */
.highlight .nt { color: #f92672 } /* Name.Tag */
.highlight .nv { color: #f8f8f2 } /* Name.Variable */
.highlight .ow { color: #f92672 } /* Operator.Word */
.highlight .w { color: #f8f8f2 } /* Text.Whitespace */
.highlight .mb { color: #ae81ff } /* Literal.Number.Bin */
.highlight .mf { color: #ae81ff } /* Literal.Number.Float */
.highlight .mh { color: #ae81ff } /* Literal.Number.Hex */
.highlight .mi { color: #ae81ff } /* Literal.Number.Integer */
.highlight .mo { color: #ae81ff } /* Literal.Number.Oct */
.highlight .sa { color: #e6db74 } /* Literal.String.Affix */
.highlight .sb { color: #e6db74 } /* Literal.String.Backtick */
.highlight .sc { color: #e6db74 } /* Literal.String.Char */
.highlight .dl { color: #e6db74 } /* Literal.String.Delimiter */
.highlight .sd { color: #e6db74 } /* Literal.String.Doc */
.highlight .s2 { color: #e6db74 } /* Literal.String.Double */
.highlight .se { color: #ae81ff } /* Literal.String.Escape */
.highlight .sh { color: #e6db74 } /* Literal.String.Heredoc */
.highlight .si { color: #e6db74 } /* Literal.String.Interpol */
.highlight .sx { color: #e6db74 } /* Literal.String.Other */
.highlight .sr { color: #e6db74 } /* Literal.String.Regex */
.highlight .s1 { color: #e6db74 } /* Literal.String.Single */
.highlight .ss { color: #e6db74 } /* Literal.String.Symbol */
.highlight .bp { color: #f8f8f2 } /* Name.Builtin.Pseudo */
.highlight .fm { color: #a6e22e } /* Name.Function.Magic */
.highlight .vc { color: #f8f8f2 } /* Name.Variable.Class */
.highlight .vg { color: #f8f8f2 } /* Name.Variable.Global */
.highlight .vi { color: #f8f8f2 } /* Name.Variable.Instance */
.highlight .vm { color: #f8f8f2 } /* Name.Variable.Magic */
.highlight .il { color: #ae81ff } /* Literal.Number.Integer.Long */
}
@media (prefers-color-scheme: light) {
.highlight .hll { background-color: #ffffcc }
.highlight .c { color: #888888 } /* Comment */
.highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */
.highlight .k { color: #008800; font-weight: bold } /* Keyword */
.highlight .ch { color: #888888 } /* Comment.Hashbang */
.highlight .cm { color: #888888 } /* Comment.Multiline */
.highlight .cp { color: #cc0000; font-weight: bold } /* Comment.Preproc */
.highlight .cpf { color: #888888 } /* Comment.PreprocFile */
.highlight .c1 { color: #888888 } /* Comment.Single */
.highlight .cs { color: #cc0000; font-weight: bold; background-color: #fff0f0 } /* Comment.Special */
.highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
.highlight .ge { font-style: italic } /* Generic.Emph */
.highlight .gr { color: #aa0000 } /* Generic.Error */
.highlight .gh { color: #333333 } /* Generic.Heading */
.highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
.highlight .go { color: #888888 } /* Generic.Output */
.highlight .gp { color: #555555 } /* Generic.Prompt */
.highlight .gs { font-weight: bold } /* Generic.Strong */
.highlight .gu { color: #666666 } /* Generic.Subheading */
.highlight .gt { color: #aa0000 } /* Generic.Traceback */
.highlight .kc { color: #008800; font-weight: bold } /* Keyword.Constant */
.highlight .kd { color: #008800; font-weight: bold } /* Keyword.Declaration */
.highlight .kn { color: #008800; font-weight: bold } /* Keyword.Namespace */
.highlight .kp { color: #008800 } /* Keyword.Pseudo */
.highlight .kr { color: #008800; font-weight: bold } /* Keyword.Reserved */
.highlight .kt { color: #888888; font-weight: bold } /* Keyword.Type */
.highlight .m { color: #0000DD; font-weight: bold } /* Literal.Number */
.highlight .s { color: #dd2200; background-color: #fff0f0 } /* Literal.String */
.highlight .na { color: #336699 } /* Name.Attribute */
.highlight .nb { color: #003388 } /* Name.Builtin */
.highlight .nc { color: #bb0066; font-weight: bold } /* Name.Class */
.highlight .no { color: #003366; font-weight: bold } /* Name.Constant */
.highlight .nd { color: #555555 } /* Name.Decorator */
.highlight .ne { color: #bb0066; font-weight: bold } /* Name.Exception */
.highlight .nf { color: #0066bb; font-weight: bold } /* Name.Function */
.highlight .nl { color: #336699; font-style: italic } /* Name.Label */
.highlight .nn { color: #bb0066; font-weight: bold } /* Name.Namespace */
.highlight .py { color: #336699; font-weight: bold } /* Name.Property */
.highlight .nt { color: #bb0066; font-weight: bold } /* Name.Tag */
.highlight .nv { color: #336699 } /* Name.Variable */
.highlight .ow { color: #008800 } /* Operator.Word */
.highlight .w { color: #bbbbbb } /* Text.Whitespace */
.highlight .mb { color: #0000DD; font-weight: bold } /* Literal.Number.Bin */
.highlight .mf { color: #0000DD; font-weight: bold } /* Literal.Number.Float */
.highlight .mh { color: #0000DD; font-weight: bold } /* Literal.Number.Hex */
.highlight .mi { color: #0000DD; font-weight: bold } /* Literal.Number.Integer */
.highlight .mo { color: #0000DD; font-weight: bold } /* Literal.Number.Oct */
.highlight .sa { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Affix */
.highlight .sb { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Backtick */
.highlight .sc { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Char */
.highlight .dl { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Delimiter */
.highlight .sd { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Doc */
.highlight .s2 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Double */
.highlight .se { color: #0044dd; background-color: #fff0f0 } /* Literal.String.Escape */
.highlight .sh { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Heredoc */
.highlight .si { color: #3333bb; background-color: #fff0f0 } /* Literal.String.Interpol */
.highlight .sx { color: #22bb22; background-color: #f0fff0 } /* Literal.String.Other */
.highlight .sr { color: #008800; background-color: #fff0ff } /* Literal.String.Regex */
.highlight .s1 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Single */
.highlight .ss { color: #aa6600; background-color: #fff0f0 } /* Literal.String.Symbol */
.highlight .bp { color: #003388 } /* Name.Builtin.Pseudo */
.highlight .fm { color: #0066bb; font-weight: bold } /* Name.Function.Magic */
.highlight .vc { color: #336699 } /* Name.Variable.Class */
.highlight .vg { color: #dd7700 } /* Name.Variable.Global */
.highlight .vi { color: #3333bb } /* Name.Variable.Instance */
.highlight .vm { color: #336699 } /* Name.Variable.Magic */
.highlight .il { color: #0000DD; font-weight: bold } /* Literal.Number.Integer.Long */
}
</style><div class="highlight"><pre><span></span><span class="cm">/*</span>
<span class="cm"> *   linux/mm/fremap.c</span>
<span class="cm"> * </span>
<span class="cm"> * Explicit pagetable population and nonlinear (random) mappings support.</span>
<span class="cm"> *</span>
<span class="cm"> * started by Ingo Molnar, Copyright (C) 2002, 2003</span>
<span class="cm"> */</span>
<span class="cp">#include</span> <span class="cpf">&lt;linux/backing-dev.h&gt;</span><span class="cp"></span>
<span class="cp">#include</span> <span class="cpf">&lt;linux/mm.h&gt;</span><span class="cp"></span>
<span class="cp">#include</span> <span class="cpf">&lt;linux/swap.h&gt;</span><span class="cp"></span>
<span class="cp">#include</span> <span class="cpf">&lt;linux/file.h&gt;</span><span class="cp"></span>
<span class="cp">#include</span> <span class="cpf">&lt;linux/mman.h&gt;</span><span class="cp"></span>
<span class="cp">#include</span> <span class="cpf">&lt;linux/pagemap.h&gt;</span><span class="cp"></span>
<span class="cp">#include</span> <span class="cpf">&lt;linux/swapops.h&gt;</span><span class="cp"></span>
<span class="cp">#include</span> <span class="cpf">&lt;linux/rmap.h&gt;</span><span class="cp"></span>
<span class="cp">#include</span> <span class="cpf">&lt;linux/module.h&gt;</span><span class="cp"></span>
<span class="cp">#include</span> <span class="cpf">&lt;linux/syscalls.h&gt;</span><span class="cp"></span>

<span class="cp">#include</span> <span class="cpf">&lt;asm/mmu_context.h&gt;</span><span class="cp"></span>
<span class="cp">#include</span> <span class="cpf">&lt;asm/cacheflush.h&gt;</span><span class="cp"></span>
<span class="cp">#include</span> <span class="cpf">&lt;asm/tlbflush.h&gt;</span><span class="cp"></span>

<span class="k">static</span> <span class="kt">void</span> <span class="nf">zap_pte</span><span class="p">(</span><span class="k">struct</span> <span class="n">mm_struct</span> <span class="o">*</span><span class="n">mm</span><span class="p">,</span> <span class="k">struct</span> <span class="n">vm_area_struct</span> <span class="o">*</span><span class="n">vma</span><span class="p">,</span>
			<span class="kt">unsigned</span> <span class="kt">long</span> <span class="n">addr</span><span class="p">,</span> <span class="n">pte_t</span> <span class="o">*</span><span class="n">ptep</span><span class="p">)</span>
<span class="p">{</span>
	<span class="n">pte_t</span> <span class="n">pte</span> <span class="o">=</span> <span class="o">*</span><span class="n">ptep</span><span class="p">;</span>

	<span class="k">if</span> <span class="p">(</span><span class="n">pte_present</span><span class="p">(</span><span class="n">pte</span><span class="p">))</span> <span class="p">{</span>
		<span class="k">struct</span> <span class="n">page</span> <span class="o">*</span><span class="n">page</span><span class="p">;</span>

		<span class="n">flush_cache_page</span><span class="p">(</span><span class="n">vma</span><span class="p">,</span> <span class="n">addr</span><span class="p">,</span> <span class="n">pte_pfn</span><span class="p">(</span><span class="n">pte</span><span class="p">));</span>
		<span class="n">pte</span> <span class="o">=</span> <span class="n">ptep_clear_flush</span><span class="p">(</span><span class="n">vma</span><span class="p">,</span> <span class="n">addr</span><span class="p">,</span> <span class="n">ptep</span><span class="p">);</span>
		<span class="n">page</span> <span class="o">=</span> <span class="n">vm_normal_page</span><span class="p">(</span><span class="n">vma</span><span class="p">,</span> <span class="n">addr</span><span class="p">,</span> <span class="n">pte</span><span class="p">);</span>
		<span class="k">if</span> <span class="p">(</span><span class="n">page</span><span class="p">)</span> <span class="p">{</span>
			<span class="k">if</span> <span class="p">(</span><span class="n">pte_dirty</span><span class="p">(</span><span class="n">pte</span><span class="p">))</span>
				<span class="n">set_page_dirty</span><span class="p">(</span><span class="n">page</span><span class="p">);</span>
			<span class="n">page_remove_rmap</span><span class="p">(</span><span class="n">page</span><span class="p">,</span> <span class="n">vma</span><span class="p">);</span>
			<span class="n">page_cache_release</span><span class="p">(</span><span class="n">page</span><span class="p">);</span>
			<span class="n">update_hiwater_rss</span><span class="p">(</span><span class="n">mm</span><span class="p">);</span>
			<span class="n">dec_mm_counter</span><span class="p">(</span><span class="n">mm</span><span class="p">,</span> <span class="n">file_rss</span><span class="p">);</span>
		<span class="p">}</span>
	<span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
		<span class="k">if</span> <span class="p">(</span><span class="o">!</span><span class="n">pte_file</span><span class="p">(</span><span class="n">pte</span><span class="p">))</span>
			<span class="n">free_swap_and_cache</span><span class="p">(</span><span class="n">pte_to_swp_entry</span><span class="p">(</span><span class="n">pte</span><span class="p">));</span>
		<span class="n">pte_clear_not_present_full</span><span class="p">(</span><span class="n">mm</span><span class="p">,</span> <span class="n">addr</span><span class="p">,</span> <span class="n">ptep</span><span class="p">,</span> <span class="mi">0</span><span class="p">);</span>
	<span class="p">}</span>
<span class="p">}</span>

<span class="cm">/*</span>
<span class="cm"> * Install a file pte to a given virtual memory address, release any</span>
<span class="cm"> * previously existing mapping.</span>
<span class="cm"> */</span>
<span class="k">static</span> <span class="kt">int</span> <span class="nf">install_file_pte</span><span class="p">(</span><span class="k">struct</span> <span class="n">mm_struct</span> <span class="o">*</span><span class="n">mm</span><span class="p">,</span> <span class="k">struct</span> <span class="n">vm_area_struct</span> <span class="o">*</span><span class="n">vma</span><span class="p">,</span>
		<span class="kt">unsigned</span> <span class="kt">long</span> <span class="n">addr</span><span class="p">,</span> <span class="kt">unsigned</span> <span class="kt">long</span> <span class="n">pgoff</span><span class="p">,</span> <span class="n">pgprot_t</span> <span class="n">prot</span><span class="p">)</span>
<span class="p">{</span>
	<span class="kt">int</span> <span class="n">err</span> <span class="o">=</span> <span class="o">-</span><span class="n">ENOMEM</span><span class="p">;</span>
	<span class="n">pte_t</span> <span class="o">*</span><span class="n">pte</span><span class="p">;</span>
	<span class="n">spinlock_t</span> <span class="o">*</span><span class="n">ptl</span><span class="p">;</span>

	<span class="n">pte</span> <span class="o">=</span> <span class="n">get_locked_pte</span><span class="p">(</span><span class="n">mm</span><span class="p">,</span> <span class="n">addr</span><span class="p">,</span> <span class="o">&amp;</span><span class="n">ptl</span><span class="p">);</span>
	<span class="k">if</span> <span class="p">(</span><span class="o">!</span><span class="n">pte</span><span class="p">)</span>
		<span class="k">goto</span> <span class="n">out</span><span class="p">;</span>

	<span class="k">if</span> <span class="p">(</span><span class="o">!</span><span class="n">pte_none</span><span class="p">(</span><span class="o">*</span><span class="n">pte</span><span class="p">))</span>
		<span class="n">zap_pte</span><span class="p">(</span><span class="n">mm</span><span class="p">,</span> <span class="n">vma</span><span class="p">,</span> <span class="n">addr</span><span class="p">,</span> <span class="n">pte</span><span class="p">);</span>

	<span class="n">set_pte_at</span><span class="p">(</span><span class="n">mm</span><span class="p">,</span> <span class="n">addr</span><span class="p">,</span> <span class="n">pte</span><span class="p">,</span> <span class="n">pgoff_to_pte</span><span class="p">(</span><span class="n">pgoff</span><span class="p">));</span>
	<span class="cm">/*</span>
<span class="cm">	 * We don&#39;t need to run update_mmu_cache() here because the &quot;file pte&quot;</span>
<span class="cm">	 * being installed by install_file_pte() is not a real pte - it&#39;s a</span>
<span class="cm">	 * non-present entry (like a swap entry), noting what file offset should</span>
<span class="cm">	 * be mapped there when there&#39;s a fault (in a non-linear vma where</span>
<span class="cm">	 * that&#39;s not obvious).</span>
<span class="cm">	 */</span>
	<span class="n">pte_unmap_unlock</span><span class="p">(</span><span class="n">pte</span><span class="p">,</span> <span class="n">ptl</span><span class="p">);</span>
	<span class="n">err</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
<span class="nl">out</span><span class="p">:</span>
	<span class="k">return</span> <span class="n">err</span><span class="p">;</span>
<span class="p">}</span>

<span class="k">static</span> <span class="kt">int</span> <span class="nf">populate_range</span><span class="p">(</span><span class="k">struct</span> <span class="n">mm_struct</span> <span class="o">*</span><span class="n">mm</span><span class="p">,</span> <span class="k">struct</span> <span class="n">vm_area_struct</span> <span class="o">*</span><span class="n">vma</span><span class="p">,</span>
			<span class="kt">unsigned</span> <span class="kt">long</span> <span class="n">addr</span><span class="p">,</span> <span class="kt">unsigned</span> <span class="kt">long</span> <span class="n">size</span><span class="p">,</span> <span class="n">pgoff_t</span> <span class="n">pgoff</span><span class="p">)</span>
<span class="p">{</span>
	<span class="kt">int</span> <span class="n">err</span><span class="p">;</span>

	<span class="k">do</span> <span class="p">{</span>
		<span class="n">err</span> <span class="o">=</span> <span class="n">install_file_pte</span><span class="p">(</span><span class="n">mm</span><span class="p">,</span> <span class="n">vma</span><span class="p">,</span> <span class="n">addr</span><span class="p">,</span> <span class="n">pgoff</span><span class="p">,</span> <span class="n">vma</span><span class="o">-&gt;</span><span class="n">vm_page_prot</span><span class="p">);</span>
		<span class="k">if</span> <span class="p">(</span><span class="n">err</span><span class="p">)</span>
			<span class="k">return</span> <span class="n">err</span><span class="p">;</span>

		<span class="n">size</span> <span class="o">-=</span> <span class="n">PAGE_SIZE</span><span class="p">;</span>
		<span class="n">addr</span> <span class="o">+=</span> <span class="n">PAGE_SIZE</span><span class="p">;</span>
		<span class="n">pgoff</span><span class="o">++</span><span class="p">;</span>
	<span class="p">}</span> <span class="k">while</span> <span class="p">(</span><span class="n">size</span><span class="p">);</span>

        <span class="k">return</span> <span class="mi">0</span><span class="p">;</span>

<span class="p">}</span>

<span class="cm">/**</span>
<span class="cm"> * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma</span>
<span class="cm"> * @start: start of the remapped virtual memory range</span>
<span class="cm"> * @size: size of the remapped virtual memory range</span>
<span class="cm"> * @prot: new protection bits of the range (see NOTE)</span>
<span class="cm"> * @pgoff: to-be-mapped page of the backing store file</span>
<span class="cm"> * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO.</span>
<span class="cm"> *</span>
<span class="cm"> * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma</span>
<span class="cm"> * (shared backing store file).</span>
<span class="cm"> *</span>
<span class="cm"> * This syscall works purely via pagetables, so it&#39;s the most efficient</span>
<span class="cm"> * way to map the same (large) file into a given virtual window. Unlike</span>
<span class="cm"> * mmap()/mremap() it does not create any new vmas. The new mappings are</span>
<span class="cm"> * also safe across swapout.</span>
<span class="cm"> *</span>
<span class="cm"> * NOTE: the @prot parameter right now is ignored (but must be zero),</span>
<span class="cm"> * and the vma&#39;s default protection is used. Arbitrary protections</span>
<span class="cm"> * might be implemented in the future.</span>
<span class="cm"> */</span>
<span class="n">asmlinkage</span> <span class="kt">long</span> <span class="nf">sys_remap_file_pages</span><span class="p">(</span><span class="kt">unsigned</span> <span class="kt">long</span> <span class="n">start</span><span class="p">,</span> <span class="kt">unsigned</span> <span class="kt">long</span> <span class="n">size</span><span class="p">,</span>
	<span class="kt">unsigned</span> <span class="kt">long</span> <span class="n">prot</span><span class="p">,</span> <span class="kt">unsigned</span> <span class="kt">long</span> <span class="n">pgoff</span><span class="p">,</span> <span class="kt">unsigned</span> <span class="kt">long</span> <span class="n">flags</span><span class="p">)</span>
<span class="p">{</span>
	<span class="k">struct</span> <span class="n">mm_struct</span> <span class="o">*</span><span class="n">mm</span> <span class="o">=</span> <span class="n">current</span><span class="o">-&gt;</span><span class="n">mm</span><span class="p">;</span>
	<span class="k">struct</span> <span class="n">address_space</span> <span class="o">*</span><span class="n">mapping</span><span class="p">;</span>
	<span class="kt">unsigned</span> <span class="kt">long</span> <span class="n">end</span> <span class="o">=</span> <span class="n">start</span> <span class="o">+</span> <span class="n">size</span><span class="p">;</span>
	<span class="k">struct</span> <span class="n">vm_area_struct</span> <span class="o">*</span><span class="n">vma</span><span class="p">;</span>
	<span class="kt">int</span> <span class="n">err</span> <span class="o">=</span> <span class="o">-</span><span class="n">EINVAL</span><span class="p">;</span>
	<span class="kt">int</span> <span class="n">has_write_lock</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>

	<span class="k">if</span> <span class="p">(</span><span class="n">prot</span><span class="p">)</span>
		<span class="k">return</span> <span class="n">err</span><span class="p">;</span>
	<span class="cm">/*</span>
<span class="cm">	 * Sanitize the syscall parameters:</span>
<span class="cm">	 */</span>
	<span class="n">start</span> <span class="o">=</span> <span class="n">start</span> <span class="o">&amp;</span> <span class="n">PAGE_MASK</span><span class="p">;</span>
	<span class="n">size</span> <span class="o">=</span> <span class="n">size</span> <span class="o">&amp;</span> <span class="n">PAGE_MASK</span><span class="p">;</span>

	<span class="cm">/* Does the address range wrap, or is the span zero-sized? */</span>
	<span class="k">if</span> <span class="p">(</span><span class="n">start</span> <span class="o">+</span> <span class="n">size</span> <span class="o">&lt;=</span> <span class="n">start</span><span class="p">)</span>
		<span class="k">return</span> <span class="n">err</span><span class="p">;</span>

	<span class="cm">/* Can we represent this offset inside this architecture&#39;s pte&#39;s? */</span>
<span class="cp">#if PTE_FILE_MAX_BITS &lt; BITS_PER_LONG</span>
	<span class="k">if</span> <span class="p">(</span><span class="n">pgoff</span> <span class="o">+</span> <span class="p">(</span><span class="n">size</span> <span class="o">&gt;&gt;</span> <span class="n">PAGE_SHIFT</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="p">(</span><span class="mi">1UL</span> <span class="o">&lt;&lt;</span> <span class="n">PTE_FILE_MAX_BITS</span><span class="p">))</span>
		<span class="k">return</span> <span class="n">err</span><span class="p">;</span>
<span class="cp">#endif</span>

	<span class="cm">/* We need down_write() to change vma-&gt;vm_flags. */</span>
	<span class="n">down_read</span><span class="p">(</span><span class="o">&amp;</span><span class="n">mm</span><span class="o">-&gt;</span><span class="n">mmap_sem</span><span class="p">);</span>
 <span class="nl">retry</span><span class="p">:</span>
	<span class="n">vma</span> <span class="o">=</span> <span class="n">find_vma</span><span class="p">(</span><span class="n">mm</span><span class="p">,</span> <span class="n">start</span><span class="p">);</span>

	<span class="cm">/*</span>
<span class="cm">	 * Make sure the vma is shared, that it supports prefaulting,</span>
<span class="cm">	 * and that the remapped range is valid and fully within</span>
<span class="cm">	 * the single existing vma.  vm_private_data is used as a</span>
<span class="cm">	 * swapout cursor in a VM_NONLINEAR vma.</span>
<span class="cm">	 */</span>
	<span class="k">if</span> <span class="p">(</span><span class="o">!</span><span class="n">vma</span> <span class="o">||</span> <span class="o">!</span><span class="p">(</span><span class="n">vma</span><span class="o">-&gt;</span><span class="n">vm_flags</span> <span class="o">&amp;</span> <span class="n">VM_SHARED</span><span class="p">))</span>
		<span class="k">goto</span> <span class="n">out</span><span class="p">;</span>

	<span class="k">if</span> <span class="p">(</span><span class="n">vma</span><span class="o">-&gt;</span><span class="n">vm_private_data</span> <span class="o">&amp;&amp;</span> <span class="o">!</span><span class="p">(</span><span class="n">vma</span><span class="o">-&gt;</span><span class="n">vm_flags</span> <span class="o">&amp;</span> <span class="n">VM_NONLINEAR</span><span class="p">))</span>
		<span class="k">goto</span> <span class="n">out</span><span class="p">;</span>

	<span class="k">if</span> <span class="p">(</span><span class="o">!</span><span class="p">(</span><span class="n">vma</span><span class="o">-&gt;</span><span class="n">vm_flags</span> <span class="o">&amp;</span> <span class="n">VM_CAN_NONLINEAR</span><span class="p">))</span>
		<span class="k">goto</span> <span class="n">out</span><span class="p">;</span>

	<span class="k">if</span> <span class="p">(</span><span class="n">end</span> <span class="o">&lt;=</span> <span class="n">start</span> <span class="o">||</span> <span class="n">start</span> <span class="o">&lt;</span> <span class="n">vma</span><span class="o">-&gt;</span><span class="n">vm_start</span> <span class="o">||</span> <span class="n">end</span> <span class="o">&gt;</span> <span class="n">vma</span><span class="o">-&gt;</span><span class="n">vm_end</span><span class="p">)</span>
		<span class="k">goto</span> <span class="n">out</span><span class="p">;</span>

	<span class="cm">/* Must set VM_NONLINEAR before any pages are populated. */</span>
	<span class="k">if</span> <span class="p">(</span><span class="o">!</span><span class="p">(</span><span class="n">vma</span><span class="o">-&gt;</span><span class="n">vm_flags</span> <span class="o">&amp;</span> <span class="n">VM_NONLINEAR</span><span class="p">))</span> <span class="p">{</span>
		<span class="cm">/* Don&#39;t need a nonlinear mapping, exit success */</span>
		<span class="k">if</span> <span class="p">(</span><span class="n">pgoff</span> <span class="o">==</span> <span class="n">linear_page_index</span><span class="p">(</span><span class="n">vma</span><span class="p">,</span> <span class="n">start</span><span class="p">))</span> <span class="p">{</span>
			<span class="n">err</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
			<span class="k">goto</span> <span class="n">out</span><span class="p">;</span>
		<span class="p">}</span>

		<span class="k">if</span> <span class="p">(</span><span class="o">!</span><span class="n">has_write_lock</span><span class="p">)</span> <span class="p">{</span>
			<span class="n">up_read</span><span class="p">(</span><span class="o">&amp;</span><span class="n">mm</span><span class="o">-&gt;</span><span class="n">mmap_sem</span><span class="p">);</span>
			<span class="n">down_write</span><span class="p">(</span><span class="o">&amp;</span><span class="n">mm</span><span class="o">-&gt;</span><span class="n">mmap_sem</span><span class="p">);</span>
			<span class="n">has_write_lock</span> <span class="o">=</span> <span class="mi">1</span><span class="p">;</span>
			<span class="k">goto</span> <span class="n">retry</span><span class="p">;</span>
		<span class="p">}</span>
		<span class="n">mapping</span> <span class="o">=</span> <span class="n">vma</span><span class="o">-&gt;</span><span class="n">vm_file</span><span class="o">-&gt;</span><span class="n">f_mapping</span><span class="p">;</span>
		<span class="cm">/*</span>
<span class="cm">		 * page_mkclean doesn&#39;t work on nonlinear vmas, so if</span>
<span class="cm">		 * dirty pages need to be accounted, emulate with linear</span>
<span class="cm">		 * vmas.</span>
<span class="cm">		 */</span>
		<span class="k">if</span> <span class="p">(</span><span class="n">mapping_cap_account_dirty</span><span class="p">(</span><span class="n">mapping</span><span class="p">))</span> <span class="p">{</span>
			<span class="kt">unsigned</span> <span class="kt">long</span> <span class="n">addr</span><span class="p">;</span>
			<span class="k">struct</span> <span class="n">file</span> <span class="o">*</span><span class="n">file</span> <span class="o">=</span> <span class="n">vma</span><span class="o">-&gt;</span><span class="n">vm_file</span><span class="p">;</span>

			<span class="n">flags</span> <span class="o">&amp;=</span> <span class="n">MAP_NONBLOCK</span><span class="p">;</span>
			<span class="n">get_file</span><span class="p">(</span><span class="n">file</span><span class="p">);</span>
			<span class="n">addr</span> <span class="o">=</span> <span class="n">mmap_region</span><span class="p">(</span><span class="n">file</span><span class="p">,</span> <span class="n">start</span><span class="p">,</span> <span class="n">size</span><span class="p">,</span>
					<span class="n">flags</span><span class="p">,</span> <span class="n">vma</span><span class="o">-&gt;</span><span class="n">vm_flags</span><span class="p">,</span> <span class="n">pgoff</span><span class="p">,</span> <span class="mi">1</span><span class="p">);</span>
			<span class="n">fput</span><span class="p">(</span><span class="n">file</span><span class="p">);</span>
			<span class="k">if</span> <span class="p">(</span><span class="n">IS_ERR_VALUE</span><span class="p">(</span><span class="n">addr</span><span class="p">))</span> <span class="p">{</span>
				<span class="n">err</span> <span class="o">=</span> <span class="n">addr</span><span class="p">;</span>
			<span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
				<span class="n">BUG_ON</span><span class="p">(</span><span class="n">addr</span> <span class="o">!=</span> <span class="n">start</span><span class="p">);</span>
				<span class="n">err</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
			<span class="p">}</span>
			<span class="k">goto</span> <span class="n">out</span><span class="p">;</span>
		<span class="p">}</span>
		<span class="n">spin_lock</span><span class="p">(</span><span class="o">&amp;</span><span class="n">mapping</span><span class="o">-&gt;</span><span class="n">i_mmap_lock</span><span class="p">);</span>
		<span class="n">flush_dcache_mmap_lock</span><span class="p">(</span><span class="n">mapping</span><span class="p">);</span>
		<span class="n">vma</span><span class="o">-&gt;</span><span class="n">vm_flags</span> <span class="o">|=</span> <span class="n">VM_NONLINEAR</span><span class="p">;</span>
		<span class="n">vma_prio_tree_remove</span><span class="p">(</span><span class="n">vma</span><span class="p">,</span> <span class="o">&amp;</span><span class="n">mapping</span><span class="o">-&gt;</span><span class="n">i_mmap</span><span class="p">);</span>
		<span class="n">vma_nonlinear_insert</span><span class="p">(</span><span class="n">vma</span><span class="p">,</span> <span class="o">&amp;</span><span class="n">mapping</span><span class="o">-&gt;</span><span class="n">i_mmap_nonlinear</span><span class="p">);</span>
		<span class="n">flush_dcache_mmap_unlock</span><span class="p">(</span><span class="n">mapping</span><span class="p">);</span>
		<span class="n">spin_unlock</span><span class="p">(</span><span class="o">&amp;</span><span class="n">mapping</span><span class="o">-&gt;</span><span class="n">i_mmap_lock</span><span class="p">);</span>
	<span class="p">}</span>

	<span class="n">err</span> <span class="o">=</span> <span class="n">populate_range</span><span class="p">(</span><span class="n">mm</span><span class="p">,</span> <span class="n">vma</span><span class="p">,</span> <span class="n">start</span><span class="p">,</span> <span class="n">size</span><span class="p">,</span> <span class="n">pgoff</span><span class="p">);</span>
	<span class="k">if</span> <span class="p">(</span><span class="o">!</span><span class="n">err</span> <span class="o">&amp;&amp;</span> <span class="o">!</span><span class="p">(</span><span class="n">flags</span> <span class="o">&amp;</span> <span class="n">MAP_NONBLOCK</span><span class="p">))</span> <span class="p">{</span>
		<span class="k">if</span> <span class="p">(</span><span class="n">unlikely</span><span class="p">(</span><span class="n">has_write_lock</span><span class="p">))</span> <span class="p">{</span>
			<span class="n">downgrade_write</span><span class="p">(</span><span class="o">&amp;</span><span class="n">mm</span><span class="o">-&gt;</span><span class="n">mmap_sem</span><span class="p">);</span>
			<span class="n">has_write_lock</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
		<span class="p">}</span>
		<span class="n">make_pages_present</span><span class="p">(</span><span class="n">start</span><span class="p">,</span> <span class="n">start</span><span class="o">+</span><span class="n">size</span><span class="p">);</span>
	<span class="p">}</span>

	<span class="cm">/*</span>
<span class="cm">	 * We can&#39;t clear VM_NONLINEAR because we&#39;d have to do</span>
<span class="cm">	 * it after -&gt;populate completes, and that would prevent</span>
<span class="cm">	 * downgrading the lock.  (Locks can&#39;t be upgraded).</span>
<span class="cm">	 */</span>

<span class="nl">out</span><span class="p">:</span>
	<span class="k">if</span> <span class="p">(</span><span class="n">likely</span><span class="p">(</span><span class="o">!</span><span class="n">has_write_lock</span><span class="p">))</span>
		<span class="n">up_read</span><span class="p">(</span><span class="o">&amp;</span><span class="n">mm</span><span class="o">-&gt;</span><span class="n">mmap_sem</span><span class="p">);</span>
	<span class="k">else</span>
		<span class="n">up_write</span><span class="p">(</span><span class="o">&amp;</span><span class="n">mm</span><span class="o">-&gt;</span><span class="n">mmap_sem</span><span class="p">);</span>

	<span class="k">return</span> <span class="n">err</span><span class="p">;</span>
<span class="p">}</span>
</pre></div>
</code></pre></td></tr></table>
</div> <!-- class=content -->
<div class='footer'>generated by <a href='https://git.zx2c4.com/cgit/about/'>cgit 1.2.3-korg</a> (<a href='https://git-scm.com/'>git 2.39.0</a>) at 2024-04-30 12:26:40 +0000</div>
</div> <!-- id=cgit -->
</body>
</html>
 GFP_KERNEL
#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
#define GFP_VMALLOC32 GFP_DMA | GFP_KERNEL
#else
#define GFP_VMALLOC32 GFP_KERNEL
#endif

/**
 *	vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
 *	@size:		allocation size
 *
 *	Allocate enough 32bit PA addressable pages to cover @size from the
 *	page level allocator and map them into contiguous kernel virtual space.
 */
void *vmalloc_32(unsigned long size)
{
	return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
			      NUMA_NO_NODE, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32);

/**
 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
 *	@size:		allocation size
 *
 * The resulting memory area is 32bit addressable and zeroed so it can be
 * mapped to userspace without leaking data.
 */
void *vmalloc_32_user(unsigned long size)
{
	struct vm_struct *area;
	void *ret;

	ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
			     NUMA_NO_NODE, __builtin_return_address(0));
	if (ret) {
		area = find_vm_area(ret);
		area->flags |= VM_USERMAP;
	}
	return ret;
}
EXPORT_SYMBOL(vmalloc_32_user);

/*
 * small helper routine , copy contents to buf from addr.
 * If the page is not present, fill zero.
 */

static int aligned_vread(char *buf, char *addr, unsigned long count)
{
	struct page *p;
	int copied = 0;

	while (count) {
		unsigned long offset, length;

		offset = offset_in_page(addr);
		length = PAGE_SIZE - offset;
		if (length > count)
			length = count;
		p = vmalloc_to_page(addr);
		/*
		 * To do safe access to this _mapped_ area, we need
		 * lock. But adding lock here means that we need to add
		 * overhead of vmalloc()/vfree() calles for this _debug_
		 * interface, rarely used. Instead of that, we'll use
		 * kmap() and get small overhead in this access function.
		 */
		if (p) {
			/*
			 * we can expect USER0 is not used (see vread/vwrite's
			 * function description)
			 */
			void *map = kmap_atomic(p);
			memcpy(buf, map + offset, length);
			kunmap_atomic(map);
		} else
			memset(buf, 0, length);

		addr += length;
		buf += length;
		copied += length;
		count -= length;
	}
	return copied;
}

static int aligned_vwrite(char *buf, char *addr, unsigned long count)
{
	struct page *p;
	int copied = 0;

	while (count) {
		unsigned long offset, length;

		offset = offset_in_page(addr);
		length = PAGE_SIZE - offset;
		if (length > count)
			length = count;
		p = vmalloc_to_page(addr);
		/*
		 * To do safe access to this _mapped_ area, we need
		 * lock. But adding lock here means that we need to add
		 * overhead of vmalloc()/vfree() calles for this _debug_
		 * interface, rarely used. Instead of that, we'll use
		 * kmap() and get small overhead in this access function.
		 */
		if (p) {
			/*
			 * we can expect USER0 is not used (see vread/vwrite's
			 * function description)
			 */
			void *map = kmap_atomic(p);
			memcpy(map + offset, buf, length);
			kunmap_atomic(map);
		}
		addr += length;
		buf += length;
		copied += length;
		count -= length;
	}
	return copied;
}

/**
 *	vread() -  read vmalloc area in a safe way.
 *	@buf:		buffer for reading data
 *	@addr:		vm address.
 *	@count:		number of bytes to be read.
 *
 *	Returns # of bytes which addr and buf should be increased.
 *	(same number to @count). Returns 0 if [addr...addr+count) doesn't
 *	includes any intersect with alive vmalloc area.
 *
 *	This function checks that addr is a valid vmalloc'ed area, and
 *	copy data from that area to a given buffer. If the given memory range
 *	of [addr...addr+count) includes some valid address, data is copied to
 *	proper area of @buf. If there are memory holes, they'll be zero-filled.
 *	IOREMAP area is treated as memory hole and no copy is done.
 *
 *	If [addr...addr+count) doesn't includes any intersects with alive
 *	vm_struct area, returns 0. @buf should be kernel's buffer.
 *
 *	Note: In usual ops, vread() is never necessary because the caller
 *	should know vmalloc() area is valid and can use memcpy().
 *	This is for routines which have to access vmalloc area without
 *	any informaion, as /dev/kmem.
 *
 */

long vread(char *buf, char *addr, unsigned long count)
{
	struct vmap_area *va;
	struct vm_struct *vm;
	char *vaddr, *buf_start = buf;
	unsigned long buflen = count;
	unsigned long n;

	/* Don't allow overflow */
	if ((unsigned long) addr + count < count)
		count = -(unsigned long) addr;

	spin_lock(&vmap_area_lock);
	list_for_each_entry(va, &vmap_area_list, list) {
		if (!count)
			break;

		if (!(va->flags & VM_VM_AREA))
			continue;

		vm = va->vm;
		vaddr = (char *) vm->addr;
		if (addr >= vaddr + get_vm_area_size(vm))
			continue;
		while (addr < vaddr) {
			if (count == 0)
				goto finished;
			*buf = '\0';
			buf++;
			addr++;
			count--;
		}
		n = vaddr + get_vm_area_size(vm) - addr;
		if (n > count)
			n = count;
		if (!(vm->flags & VM_IOREMAP))
			aligned_vread(buf, addr, n);
		else /* IOREMAP area is treated as memory hole */
			memset(buf, 0, n);
		buf += n;
		addr += n;
		count -= n;
	}
finished:
	spin_unlock(&vmap_area_lock);

	if (buf == buf_start)
		return 0;
	/* zero-fill memory holes */
	if (buf != buf_start + buflen)
		memset(buf, 0, buflen - (buf - buf_start));

	return buflen;
}

/**
 *	vwrite() -  write vmalloc area in a safe way.
 *	@buf:		buffer for source data
 *	@addr:		vm address.
 *	@count:		number of bytes to be read.
 *
 *	Returns # of bytes which addr and buf should be incresed.
 *	(same number to @count).
 *	If [addr...addr+count) doesn't includes any intersect with valid
 *	vmalloc area, returns 0.
 *
 *	This function checks that addr is a valid vmalloc'ed area, and
 *	copy data from a buffer to the given addr. If specified range of
 *	[addr...addr+count) includes some valid address, data is copied from
 *	proper area of @buf. If there are memory holes, no copy to hole.
 *	IOREMAP area is treated as memory hole and no copy is done.
 *
 *	If [addr...addr+count) doesn't includes any intersects with alive
 *	vm_struct area, returns 0. @buf should be kernel's buffer.
 *
 *	Note: In usual ops, vwrite() is never necessary because the caller
 *	should know vmalloc() area is valid and can use memcpy().
 *	This is for routines which have to access vmalloc area without
 *	any informaion, as /dev/kmem.
 */

long vwrite(char *buf, char *addr, unsigned long count)
{
	struct vmap_area *va;
	struct vm_struct *vm;
	char *vaddr;
	unsigned long n, buflen;
	int copied = 0;

	/* Don't allow overflow */
	if ((unsigned long) addr + count < count)
		count = -(unsigned long) addr;
	buflen = count;

	spin_lock(&vmap_area_lock);
	list_for_each_entry(va, &vmap_area_list, list) {
		if (!count)
			break;

		if (!(va->flags & VM_VM_AREA))
			continue;

		vm = va->vm;
		vaddr = (char *) vm->addr;
		if (addr >= vaddr + get_vm_area_size(vm))
			continue;
		while (addr < vaddr) {
			if (count == 0)
				goto finished;
			buf++;
			addr++;
			count--;
		}
		n = vaddr + get_vm_area_size(vm) - addr;
		if (n > count)
			n = count;
		if (!(vm->flags & VM_IOREMAP)) {
			aligned_vwrite(buf, addr, n);
			copied++;
		}
		buf += n;
		addr += n;
		count -= n;
	}
finished:
	spin_unlock(&vmap_area_lock);
	if (!copied)
		return 0;
	return buflen;
}

/**
 *	remap_vmalloc_range_partial  -  map vmalloc pages to userspace
 *	@vma:		vma to cover
 *	@uaddr:		target user address to start at
 *	@kaddr:		virtual address of vmalloc kernel memory
 *	@size:		size of map area
 *
 *	Returns:	0 for success, -Exxx on failure
 *
 *	This function checks that @kaddr is a valid vmalloc'ed area,
 *	and that it is big enough to cover the range starting at
 *	@uaddr in @vma. Will return failure if that criteria isn't
 *	met.
 *
 *	Similar to remap_pfn_range() (see mm/memory.c)
 */
int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
				void *kaddr, unsigned long size)
{
	struct vm_struct *area;

	size = PAGE_ALIGN(size);

	if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
		return -EINVAL;

	area = find_vm_area(kaddr);
	if (!area)
		return -EINVAL;

	if (!(area->flags & VM_USERMAP))
		return -EINVAL;

	if (kaddr + size > area->addr + area->size)
		return -EINVAL;

	do {
		struct page *page = vmalloc_to_page(kaddr);
		int ret;

		ret = vm_insert_page(vma, uaddr, page);
		if (ret)
			return ret;

		uaddr += PAGE_SIZE;
		kaddr += PAGE_SIZE;
		size -= PAGE_SIZE;
	} while (size > 0);

	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;

	return 0;
}
EXPORT_SYMBOL(remap_vmalloc_range_partial);

/**
 *	remap_vmalloc_range  -  map vmalloc pages to userspace
 *	@vma:		vma to cover (map full range of vma)
 *	@addr:		vmalloc memory
 *	@pgoff:		number of pages into addr before first page to map
 *
 *	Returns:	0 for success, -Exxx on failure
 *
 *	This function checks that addr is a valid vmalloc'ed area, and
 *	that it is big enough to cover the vma. Will return failure if
 *	that criteria isn't met.
 *
 *	Similar to remap_pfn_range() (see mm/memory.c)
 */
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
						unsigned long pgoff)
{
	return remap_vmalloc_range_partial(vma, vma->vm_start,
					   addr + (pgoff << PAGE_SHIFT),
					   vma->vm_end - vma->vm_start);
}
EXPORT_SYMBOL(remap_vmalloc_range);

/*
 * Implement a stub for vmalloc_sync_all() if the architecture chose not to
 * have one.
 */
void __weak vmalloc_sync_all(void)
{
}


static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
{
	pte_t ***p = data;

	if (p) {
		*(*p) = pte;
		(*p)++;
	}
	return 0;
}

/**
 *	alloc_vm_area - allocate a range of kernel address space
 *	@size:		size of the area
 *	@ptes:		returns the PTEs for the address space
 *
 *	Returns:	NULL on failure, vm_struct on success
 *
 *	This function reserves a range of kernel address space, and
 *	allocates pagetables to map that range.  No actual mappings
 *	are created.
 *
 *	If @ptes is non-NULL, pointers to the PTEs (in init_mm)
 *	allocated for the VM area are returned.
 */
struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
{
	struct vm_struct *area;

	area = get_vm_area_caller(size, VM_IOREMAP,
				__builtin_return_address(0));
	if (area == NULL)
		return NULL;

	/*
	 * This ensures that page tables are constructed for this region
	 * of kernel virtual address space and mapped into init_mm.
	 */
	if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
				size, f, ptes ? &ptes : NULL)) {
		free_vm_area(area);
		return NULL;
	}

	return area;
}
EXPORT_SYMBOL_GPL(alloc_vm_area);

void free_vm_area(struct vm_struct *area)
{
	struct vm_struct *ret;
	ret = remove_vm_area(area->addr);
	BUG_ON(ret != area);
	kfree(area);
}
EXPORT_SYMBOL_GPL(free_vm_area);

#ifdef CONFIG_SMP
static struct vmap_area *node_to_va(struct rb_node *n)
{
	return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
}

/**
 * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
 * @end: target address
 * @pnext: out arg for the next vmap_area
 * @pprev: out arg for the previous vmap_area
 *
 * Returns: %true if either or both of next and prev are found,
 *	    %false if no vmap_area exists
 *
 * Find vmap_areas end addresses of which enclose @end.  ie. if not
 * NULL, *pnext->va_end > @end and *pprev->va_end <= @end.
 */
static bool pvm_find_next_prev(unsigned long end,
			       struct vmap_area **pnext,
			       struct vmap_area **pprev)
{
	struct rb_node *n = vmap_area_root.rb_node;
	struct vmap_area *va = NULL;

	while (n) {
		va = rb_entry(n, struct vmap_area, rb_node);
		if (end < va->va_end)
			n = n->rb_left;
		else if (end > va->va_end)
			n = n->rb_right;
		else
			break;
	}

	if (!va)
		return false;

	if (va->va_end > end) {
		*pnext = va;
		*pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
	} else {
		*pprev = va;
		*pnext = node_to_va(rb_next(&(*pprev)->rb_node));
	}
	return true;
}

/**
 * pvm_determine_end - find the highest aligned address between two vmap_areas
 * @pnext: in/out arg for the next vmap_area
 * @pprev: in/out arg for the previous vmap_area
 * @align: alignment
 *
 * Returns: determined end address
 *
 * Find the highest aligned address between *@pnext and *@pprev below
 * VMALLOC_END.  *@pnext and *@pprev are adjusted so that the aligned
 * down address is between the end addresses of the two vmap_areas.
 *
 * Please note that the address returned by this function may fall
 * inside *@pnext vmap_area.  The caller is responsible for checking
 * that.
 */
static unsigned long pvm_determine_end(struct vmap_area **pnext,
				       struct vmap_area **pprev,
				       unsigned long align)
{
	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
	unsigned long addr;

	if (*pnext)
		addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
	else
		addr = vmalloc_end;

	while (*pprev && (*pprev)->va_end > addr) {
		*pnext = *pprev;
		*pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
	}

	return addr;
}

/**
 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
 * @offsets: array containing offset of each area
 * @sizes: array containing size of each area
 * @nr_vms: the number of areas to allocate
 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
 *
 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
 *	    vm_structs on success, %NULL on failure
 *
 * Percpu allocator wants to use congruent vm areas so that it can
 * maintain the offsets among percpu areas.  This function allocates
 * congruent vmalloc areas for it with GFP_KERNEL.  These areas tend to
 * be scattered pretty far, distance between two areas easily going up
 * to gigabytes.  To avoid interacting with regular vmallocs, these
 * areas are allocated from top.
 *
 * Despite its complicated look, this allocator is rather simple.  It
 * does everything top-down and scans areas from the end looking for
 * matching slot.  While scanning, if any of the areas overlaps with
 * existing vmap_area, the base address is pulled down to fit the
 * area.  Scanning is repeated till all the areas fit and then all
 * necessary data structres are inserted and the result is returned.
 */
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
				     const size_t *sizes, int nr_vms,
				     size_t align)
{
	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
	struct vmap_area **vas, *prev, *next;
	struct vm_struct **vms;
	int area, area2, last_area, term_area;
	unsigned long base, start, end, last_end;
	bool purged = false;

	/* verify parameters and allocate data structures */
	BUG_ON(offset_in_page(align) || !is_power_of_2(align));
	for (last_area = 0, area = 0; area < nr_vms; area++) {
		start = offsets[area];
		end = start + sizes[area];

		/* is everything aligned properly? */
		BUG_ON(!IS_ALIGNED(offsets[area], align));
		BUG_ON(!IS_ALIGNED(sizes[area], align));

		/* detect the area with the highest address */
		if (start > offsets[last_area])
			last_area = area;

		for (area2 = 0; area2 < nr_vms; area2++) {
			unsigned long start2 = offsets[area2];
			unsigned long end2 = start2 + sizes[area2];

			if (area2 == area)
				continue;

			BUG_ON(start2 >= start && start2 < end);
			BUG_ON(end2 <= end && end2 > start);
		}
	}
	last_end = offsets[last_area] + sizes[last_area];

	if (vmalloc_end - vmalloc_start < last_end) {
		WARN_ON(true);
		return NULL;
	}

	vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
	vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
	if (!vas || !vms)
		goto err_free2;

	for (area = 0; area < nr_vms; area++) {
		vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
		vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
		if (!vas[area] || !vms[area])
			goto err_free;
	}
retry:
	spin_lock(&vmap_area_lock);

	/* start scanning - we scan from the top, begin with the last area */
	area = term_area = last_area;
	start = offsets[area];
	end = start + sizes[area];

	if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
		base = vmalloc_end - last_end;
		goto found;
	}
	base = pvm_determine_end(&next, &prev, align) - end;

	while (true) {
		BUG_ON(next && next->va_end <= base + end);
		BUG_ON(prev && prev->va_end > base + end);

		/*
		 * base might have underflowed, add last_end before
		 * comparing.
		 */
		if (base + last_end < vmalloc_start + last_end) {
			spin_unlock(&vmap_area_lock);
			if (!purged) {
				purge_vmap_area_lazy();
				purged = true;
				goto retry;
			}
			goto err_free;
		}

		/*
		 * If next overlaps, move base downwards so that it's
		 * right below next and then recheck.
		 */
		if (next && next->va_start < base + end) {
			base = pvm_determine_end(&next, &prev, align) - end;
			term_area = area;
			continue;
		}

		/*
		 * If prev overlaps, shift down next and prev and move
		 * base so that it's right below new next and then
		 * recheck.
		 */
		if (prev && prev->va_end > base + start)  {
			next = prev;
			prev = node_to_va(rb_prev(&next->rb_node));
			base = pvm_determine_end(&next, &prev, align) - end;
			term_area = area;
			continue;
		}

		/*
		 * This area fits, move on to the previous one.  If
		 * the previous one is the terminal one, we're done.
		 */
		area = (area + nr_vms - 1) % nr_vms;
		if (area == term_area)
			break;
		start = offsets[area];
		end = start + sizes[area];
		pvm_find_next_prev(base + end, &next, &prev);
	}
found:
	/* we've found a fitting base, insert all va's */
	for (area = 0; area < nr_vms; area++) {
		struct vmap_area *va = vas[area];

		va->va_start = base + offsets[area];
		va->va_end = va->va_start + sizes[area];
		__insert_vmap_area(va);
	}

	vmap_area_pcpu_hole = base + offsets[last_area];

	spin_unlock(&vmap_area_lock);

	/* insert all vm's */
	for (area = 0; area < nr_vms; area++)
		setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
				 pcpu_get_vm_areas);

	kfree(vas);
	return vms;

err_free:
	for (area = 0; area < nr_vms; area++) {
		kfree(vas[area]);
		kfree(vms[area]);
	}
err_free2:
	kfree(vas);
	kfree(vms);
	return NULL;
}

/**
 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
 * @nr_vms: the number of allocated areas
 *
 * Free vm_structs and the array allocated by pcpu_get_vm_areas().
 */
void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
{
	int i;

	for (i = 0; i < nr_vms; i++)
		free_vm_area(vms[i]);
	kfree(vms);
}
#endif	/* CONFIG_SMP */

#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
	__acquires(&vmap_area_lock)
{
	loff_t n = *pos;
	struct vmap_area *va;

	spin_lock(&vmap_area_lock);
	va = list_entry((&vmap_area_list)->next, typeof(*va), list);
	while (n > 0 && &va->list != &vmap_area_list) {
		n--;
		va = list_entry(va->list.next, typeof(*va), list);
	}
	if (!n && &va->list != &vmap_area_list)
		return va;

	return NULL;

}

static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
	struct vmap_area *va = p, *next;

	++*pos;
	next = list_entry(va->list.next, typeof(*va), list);
	if (&next->list != &vmap_area_list)
		return next;

	return NULL;
}

static void s_stop(struct seq_file *m, void *p)
	__releases(&vmap_area_lock)
{
	spin_unlock(&vmap_area_lock);
}

static void show_numa_info(struct seq_file *m, struct vm_struct *v)
{
	if (IS_ENABLED(CONFIG_NUMA)) {
		unsigned int nr, *counters = m->private;

		if (!counters)
			return;

		if (v->flags & VM_UNINITIALIZED)
			return;
		/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
		smp_rmb();

		memset(counters, 0, nr_node_ids * sizeof(unsigned int));

		for (nr = 0; nr < v->nr_pages; nr++)
			counters[page_to_nid(v->pages[nr])]++;

		for_each_node_state(nr, N_HIGH_MEMORY)
			if (counters[nr])
				seq_printf(m, " N%u=%u", nr, counters[nr]);
	}
}

static int s_show(struct seq_file *m, void *p)
{
	struct vmap_area *va = p;
	struct vm_struct *v;

	/*
	 * s_show can encounter race with remove_vm_area, !VM_VM_AREA on
	 * behalf of vmap area is being tear down or vm_map_ram allocation.
	 */
	if (!(va->flags & VM_VM_AREA))
		return 0;

	v = va->vm;

	seq_printf(m, "0x%pK-0x%pK %7ld",
		v->addr, v->addr + v->size, v->size);

	if (v->caller)
		seq_printf(m, " %pS", v->caller);

	if (v->nr_pages)
		seq_printf(m, " pages=%d", v->nr_pages);

	if (v->phys_addr)
		seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr);

	if (v->flags & VM_IOREMAP)
		seq_puts(m, " ioremap");

	if (v->flags & VM_ALLOC)
		seq_puts(m, " vmalloc");

	if (v->flags & VM_MAP)
		seq_puts(m, " vmap");

	if (v->flags & VM_USERMAP)
		seq_puts(m, " user");

	if (v->flags & VM_VPAGES)
		seq_puts(m, " vpages");

	show_numa_info(m, v);
	seq_putc(m, '\n');
	return 0;
}

static const struct seq_operations vmalloc_op = {
	.start = s_start,
	.next = s_next,
	.stop = s_stop,
	.show = s_show,
};

static int vmalloc_open(struct inode *inode, struct file *file)
{
	if (IS_ENABLED(CONFIG_NUMA))
		return seq_open_private(file, &vmalloc_op,
					nr_node_ids * sizeof(unsigned int));
	else
		return seq_open(file, &vmalloc_op);
}

static const struct file_operations proc_vmalloc_operations = {
	.open		= vmalloc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};

static int __init proc_vmalloc_init(void)
{
	proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
	return 0;
}
module_init(proc_vmalloc_init);

#endif