diff options
Diffstat (limited to 'mm/mprotect.c')
-rw-r--r-- | mm/mprotect.c | 556 |
1 files changed, 371 insertions, 185 deletions
diff --git a/mm/mprotect.c b/mm/mprotect.c index 56c02beb6041..81991102f785 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -29,40 +29,73 @@ #include <linux/uaccess.h> #include <linux/mm_inline.h> #include <linux/pgtable.h> +#include <linux/sched/sysctl.h> +#include <linux/userfaultfd_k.h> +#include <linux/memory-tiers.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> +#include <asm/tlb.h> #include "internal.h" -static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, pgprot_t newprot, - unsigned long cp_flags) +bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) +{ + struct page *page; + + if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE))) + return false; + + /* Don't touch entries that are not even readable. */ + if (pte_protnone(pte)) + return false; + + /* Do we need write faults for softdirty tracking? */ + if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte)) + return false; + + /* Do we need write faults for uffd-wp tracking? */ + if (userfaultfd_pte_wp(vma, pte)) + return false; + + if (!(vma->vm_flags & VM_SHARED)) { + /* + * Writable MAP_PRIVATE mapping: We can only special-case on + * exclusive anonymous pages, because we know that our + * write-fault handler similarly would map them writable without + * any additional checks while holding the PT lock. + */ + page = vm_normal_page(vma, addr, pte); + return page && PageAnon(page) && PageAnonExclusive(page); + } + + /* + * Writable MAP_SHARED mapping: "clean" might indicate that the FS still + * needs a real write-fault for writenotify + * (see vma_wants_writenotify()). If "dirty", the assumption is that the + * FS was already notified and we can simply mark the PTE writable + * just like the write-fault handler would do. + */ + return pte_dirty(pte); +} + +static long change_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) { pte_t *pte, oldpte; spinlock_t *ptl; - unsigned long pages = 0; + long pages = 0; int target_node = NUMA_NO_NODE; - bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; - /* - * Can be called with only the mmap_lock for reading by - * prot_numa so we must check the pmd isn't constantly - * changing from under us from pmd_none to pmd_trans_huge - * and/or the other way around. - */ - if (pmd_trans_unstable(pmd)) - return 0; - - /* - * The pmd points to a regular pte so the pmd can't change - * from under us even if the mmap_lock is only hold for - * reading. - */ + tlb_change_page_size(tlb, PAGE_SIZE); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!pte) + return -EAGAIN; /* Get target node for single threaded private VMAs */ if (prot_numa && !(vma->vm_flags & VM_SHARED) && @@ -72,29 +105,31 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); do { - oldpte = *pte; + oldpte = ptep_get(pte); if (pte_present(oldpte)) { pte_t ptent; - bool preserve_write = prot_numa && pte_write(oldpte); /* * Avoid trapping faults against the zero or KSM * pages. See similar comment in change_huge_pmd. */ if (prot_numa) { - struct page *page; + struct folio *folio; + int nid; + bool toptier; /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) continue; - page = vm_normal_page(vma, addr, oldpte); - if (!page || PageKsm(page)) + folio = vm_normal_folio(vma, addr, oldpte); + if (!folio || folio_is_zone_device(folio) || + folio_test_ksm(folio)) continue; /* Also skip shared copy-on-write pages */ if (is_cow_mapping(vma->vm_flags) && - page_mapcount(page) != 1) + folio_ref_count(folio) != 1) continue; /* @@ -102,67 +137,116 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * it cannot move them all from MIGRATE_ASYNC * context. */ - if (page_is_file_lru(page) && PageDirty(page)) + if (folio_is_file_lru(folio) && + folio_test_dirty(folio)) continue; /* * Don't mess with PTEs if page is already on the node * a single-threaded process is running on. */ - if (target_node == page_to_nid(page)) + nid = folio_nid(folio); + if (target_node == nid) continue; + toptier = node_is_toptier(nid); + + /* + * Skip scanning top tier node if normal numa + * balancing is disabled + */ + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && + toptier) + continue; + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && + !toptier) + folio_xchg_access_time(folio, + jiffies_to_msecs(jiffies)); } oldpte = ptep_modify_prot_start(vma, addr, pte); ptent = pte_modify(oldpte, newprot); - if (preserve_write) - ptent = pte_mk_savedwrite(ptent); - if (uffd_wp) { - ptent = pte_wrprotect(ptent); + if (uffd_wp) ptent = pte_mkuffd_wp(ptent); - } else if (uffd_wp_resolve) { - /* - * Leave the write bit to be handled - * by PF interrupt handler, then - * things like COW could be properly - * handled. - */ + else if (uffd_wp_resolve) ptent = pte_clear_uffd_wp(ptent); - } - /* Avoid taking write faults for known dirty pages */ - if (dirty_accountable && pte_dirty(ptent) && - (pte_soft_dirty(ptent) || - !(vma->vm_flags & VM_SOFTDIRTY))) { - ptent = pte_mkwrite(ptent); - } + /* + * In some writable, shared mappings, we might want + * to catch actual write access -- see + * vma_wants_writenotify(). + * + * In all writable, private mappings, we have to + * properly handle COW. + * + * In both cases, we can sometimes still change PTEs + * writable and avoid the write-fault handler, for + * example, if a PTE is already dirty and no other + * COW or special handling is required. + */ + if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && + !pte_write(ptent) && + can_change_pte_writable(vma, addr, ptent)) + ptent = pte_mkwrite(ptent, vma); + ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); + if (pte_needs_flush(oldpte, ptent)) + tlb_flush_pte_range(tlb, addr, PAGE_SIZE); pages++; } else if (is_swap_pte(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); pte_t newpte; - if (is_write_migration_entry(entry)) { + if (is_writable_migration_entry(entry)) { + struct page *page = pfn_swap_entry_to_page(entry); + /* * A protection check is difficult so * just be safe and disable write */ - make_migration_entry_read(&entry); + if (PageAnon(page)) + entry = make_readable_exclusive_migration_entry( + swp_offset(entry)); + else + entry = make_readable_migration_entry(swp_offset(entry)); newpte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(oldpte)) newpte = pte_swp_mksoft_dirty(newpte); - if (pte_swp_uffd_wp(oldpte)) - newpte = pte_swp_mkuffd_wp(newpte); - } else if (is_write_device_private_entry(entry)) { + } else if (is_writable_device_private_entry(entry)) { /* * We do not preserve soft-dirtiness. See - * copy_one_pte() for explanation. + * copy_nonpresent_pte() for explanation. */ - make_device_private_entry_read(&entry); + entry = make_readable_device_private_entry( + swp_offset(entry)); newpte = swp_entry_to_pte(entry); if (pte_swp_uffd_wp(oldpte)) newpte = pte_swp_mkuffd_wp(newpte); + } else if (is_writable_device_exclusive_entry(entry)) { + entry = make_readable_device_exclusive_entry( + swp_offset(entry)); + newpte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(oldpte)) + newpte = pte_swp_mksoft_dirty(newpte); + if (pte_swp_uffd_wp(oldpte)) + newpte = pte_swp_mkuffd_wp(newpte); + } else if (is_pte_marker_entry(entry)) { + /* + * Ignore error swap entries unconditionally, + * because any access should sigbus anyway. + */ + if (is_poisoned_swp_entry(entry)) + continue; + /* + * If this is uffd-wp pte marker and we'd like + * to unprotect it, drop it; the next page + * fault will trigger without uffd trapping. + */ + if (uffd_wp_resolve) { + pte_clear(vma->vm_mm, addr, pte); + pages++; + } + continue; } else { newpte = oldpte; } @@ -176,6 +260,28 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, set_pte_at(vma->vm_mm, addr, pte, newpte); pages++; } + } else { + /* It must be an none page, or what else?.. */ + WARN_ON_ONCE(!pte_none(oldpte)); + + /* + * Nobody plays with any none ptes besides + * userfaultfd when applying the protections. + */ + if (likely(!uffd_wp)) + continue; + + if (userfaultfd_wp_use_markers(vma)) { + /* + * For file-backed mem, we need to be able to + * wr-protect a none pte, because even if the + * pte is none, the page/swap cache could + * exist. Doing that by install a marker. + */ + set_pte_at(vma->vm_mm, addr, pte, + make_pte_marker(PTE_MARKER_UFFD_WP)); + pages++; + } } } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); @@ -185,37 +291,74 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } /* - * Used when setting automatic NUMA hinting protection where it is - * critical that a numa hinting PMD is not confused with a bad PMD. + * Return true if we want to split THPs into PTE mappings in change + * protection procedure, false otherwise. */ -static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd) +static inline bool +pgtable_split_needed(struct vm_area_struct *vma, unsigned long cp_flags) { - pmd_t pmdval = pmd_read_atomic(pmd); - - /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - barrier(); -#endif + /* + * pte markers only resides in pte level, if we need pte markers, + * we need to split. We cannot wr-protect shmem thp because file + * thp is handled differently when split by erasing the pmd so far. + */ + return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma); +} - if (pmd_none(pmdval)) - return 1; - if (pmd_trans_huge(pmdval)) - return 0; - if (unlikely(pmd_bad(pmdval))) { - pmd_clear_bad(pmd); - return 1; - } +/* + * Return true if we want to populate pgtables in change protection + * procedure, false otherwise + */ +static inline bool +pgtable_populate_needed(struct vm_area_struct *vma, unsigned long cp_flags) +{ + /* If not within ioctl(UFFDIO_WRITEPROTECT), then don't bother */ + if (!(cp_flags & MM_CP_UFFD_WP)) + return false; - return 0; + /* Populate if the userfaultfd mode requires pte markers */ + return userfaultfd_wp_use_markers(vma); } -static inline unsigned long change_pmd_range(struct vm_area_struct *vma, - pud_t *pud, unsigned long addr, unsigned long end, - pgprot_t newprot, unsigned long cp_flags) +/* + * Populate the pgtable underneath for whatever reason if requested. + * When {pte|pmd|...}_alloc() failed we treat it the same way as pgtable + * allocation failures during page faults by kicking OOM and returning + * error. + */ +#define change_pmd_prepare(vma, pmd, cp_flags) \ + ({ \ + long err = 0; \ + if (unlikely(pgtable_populate_needed(vma, cp_flags))) { \ + if (pte_alloc(vma->vm_mm, pmd)) \ + err = -ENOMEM; \ + } \ + err; \ + }) + +/* + * This is the general pud/p4d/pgd version of change_pmd_prepare(). We need to + * have separate change_pmd_prepare() because pte_alloc() returns 0 on success, + * while {pmd|pud|p4d}_alloc() returns the valid pointer on success. + */ +#define change_prepare(vma, high, low, addr, cp_flags) \ + ({ \ + long err = 0; \ + if (unlikely(pgtable_populate_needed(vma, cp_flags))) { \ + low##_t *p = low##_alloc(vma->vm_mm, high, addr); \ + if (p == NULL) \ + err = -ENOMEM; \ + } \ + err; \ + }) + +static inline long change_pmd_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pud_t *pud, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) { pmd_t *pmd; unsigned long next; - unsigned long pages = 0; + long pages = 0; unsigned long nr_huge_updates = 0; struct mmu_notifier_range range; @@ -223,39 +366,48 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pmd = pmd_offset(pud, addr); do { - unsigned long this_pages; - + long ret; + pmd_t _pmd; +again: next = pmd_addr_end(addr, end); - /* - * Automatic NUMA balancing walks the tables with mmap_lock - * held for read. It's possible a parallel update to occur - * between pmd_trans_huge() and a pmd_none_or_clear_bad() - * check leading to a false positive and clearing. - * Hence, it's necessary to atomically read the PMD value - * for all the checks. - */ - if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) && - pmd_none_or_clear_bad_unless_trans_huge(pmd)) + ret = change_pmd_prepare(vma, pmd, cp_flags); + if (ret) { + pages = ret; + break; + } + + if (pmd_none(*pmd)) goto next; /* invoke the mmu notifier if the pmd is populated */ if (!range.start) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, - vma, vma->vm_mm, addr, end); + vma->vm_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } - if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { - if (next - addr != HPAGE_PMD_SIZE) { + _pmd = pmdp_get_lockless(pmd); + if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) { + if ((next - addr != HPAGE_PMD_SIZE) || + pgtable_split_needed(vma, cp_flags)) { __split_huge_pmd(vma, pmd, addr, false, NULL); + /* + * For file-backed, the pmd could have been + * cleared; make sure pmd populated if + * necessary, then fall-through to pte level. + */ + ret = change_pmd_prepare(vma, pmd, cp_flags); + if (ret) { + pages = ret; + break; + } } else { - int nr_ptes = change_huge_pmd(vma, pmd, addr, - newprot, cp_flags); - - if (nr_ptes) { - if (nr_ptes == HPAGE_PMD_NR) { + ret = change_huge_pmd(tlb, vma, pmd, + addr, newprot, cp_flags); + if (ret) { + if (ret == HPAGE_PMD_NR) { pages += HPAGE_PMD_NR; nr_huge_updates++; } @@ -266,9 +418,12 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, } /* fall through, the trans huge pmd just split */ } - this_pages = change_pte_range(vma, pmd, addr, next, newprot, - cp_flags); - pages += this_pages; + + ret = change_pte_range(tlb, vma, pmd, addr, next, newprot, + cp_flags); + if (ret < 0) + goto again; + pages += ret; next: cond_resched(); } while (pmd++, addr = next, addr != end); @@ -281,88 +436,108 @@ next: return pages; } -static inline unsigned long change_pud_range(struct vm_area_struct *vma, - p4d_t *p4d, unsigned long addr, unsigned long end, - pgprot_t newprot, unsigned long cp_flags) +static inline long change_pud_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) { pud_t *pud; unsigned long next; - unsigned long pages = 0; + long pages = 0, ret; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); + ret = change_prepare(vma, pud, pmd, addr, cp_flags); + if (ret) + return ret; if (pud_none_or_clear_bad(pud)) continue; - pages += change_pmd_range(vma, pud, addr, next, newprot, + pages += change_pmd_range(tlb, vma, pud, addr, next, newprot, cp_flags); } while (pud++, addr = next, addr != end); return pages; } -static inline unsigned long change_p4d_range(struct vm_area_struct *vma, - pgd_t *pgd, unsigned long addr, unsigned long end, - pgprot_t newprot, unsigned long cp_flags) +static inline long change_p4d_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) { p4d_t *p4d; unsigned long next; - unsigned long pages = 0; + long pages = 0, ret; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); + ret = change_prepare(vma, p4d, pud, addr, cp_flags); + if (ret) + return ret; if (p4d_none_or_clear_bad(p4d)) continue; - pages += change_pud_range(vma, p4d, addr, next, newprot, + pages += change_pud_range(tlb, vma, p4d, addr, next, newprot, cp_flags); } while (p4d++, addr = next, addr != end); return pages; } -static unsigned long change_protection_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, pgprot_t newprot, - unsigned long cp_flags) +static long change_protection_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; unsigned long next; - unsigned long start = addr; - unsigned long pages = 0; + long pages = 0, ret; BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); - flush_cache_range(vma, addr, end); - inc_tlb_flush_pending(mm); + tlb_start_vma(tlb, vma); do { next = pgd_addr_end(addr, end); + ret = change_prepare(vma, pgd, p4d, addr, cp_flags); + if (ret) { + pages = ret; + break; + } if (pgd_none_or_clear_bad(pgd)) continue; - pages += change_p4d_range(vma, pgd, addr, next, newprot, + pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot, cp_flags); } while (pgd++, addr = next, addr != end); - /* Only flush the TLB if we actually modified any entries: */ - if (pages) - flush_tlb_range(vma, start, end); - dec_tlb_flush_pending(mm); + tlb_end_vma(tlb, vma); return pages; } -unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgprot_t newprot, - unsigned long cp_flags) +long change_protection(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long cp_flags) { - unsigned long pages; + pgprot_t newprot = vma->vm_page_prot; + long pages; BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL); +#ifdef CONFIG_NUMA_BALANCING + /* + * Ordinary protection updates (mprotect, uffd-wp, softdirty tracking) + * are expected to reflect their requirements via VMA flags such that + * vma_set_page_prot() will adjust vma->vm_page_prot accordingly. + */ + if (cp_flags & MM_CP_PROT_NUMA) + newprot = PAGE_NONE; +#else + WARN_ON_ONCE(cp_flags & MM_CP_PROT_NUMA); +#endif + if (is_vm_hugetlb_page(vma)) - pages = hugetlb_change_protection(vma, start, end, newprot); + pages = hugetlb_change_protection(vma, start, end, newprot, + cp_flags); else - pages = change_protection_range(vma, start, end, newprot, + pages = change_protection_range(tlb, vma, start, end, newprot, cp_flags); return pages; @@ -371,7 +546,8 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, static int prot_none_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { - return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? + return pfn_modify_allowed(pte_pfn(ptep_get(pte)), + *(pgprot_t *)(walk->private)) ? 0 : -EACCES; } @@ -379,7 +555,8 @@ static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long next, struct mm_walk *walk) { - return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? + return pfn_modify_allowed(pte_pfn(ptep_get(pte)), + *(pgprot_t *)(walk->private)) ? 0 : -EACCES; } @@ -393,19 +570,20 @@ static const struct mm_walk_ops prot_none_walk_ops = { .pte_entry = prot_none_pte_entry, .hugetlb_entry = prot_none_hugetlb_entry, .test_walk = prot_none_test, + .walk_lock = PGWALK_WRLOCK, }; int -mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, - unsigned long start, unsigned long end, unsigned long newflags) +mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, + struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long start, unsigned long end, unsigned long newflags) { struct mm_struct *mm = vma->vm_mm; unsigned long oldflags = vma->vm_flags; long nrpages = (end - start) >> PAGE_SHIFT; + unsigned int mm_cp_flags = 0; unsigned long charged = 0; - pgoff_t pgoff; int error; - int dirty_accountable = 0; if (newflags == oldflags) { *pprev = vma; @@ -431,8 +609,11 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, /* * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we - * make it unwritable again. hugetlb mapping were accounted for - * even if read-only so there is no need to account for them here + * make it unwritable again except in the anonymous case where no + * anon_vma has yet to be assigned. + * + * hugetlb mapping were accounted for even if read-only so there is + * no need to account for them here. */ if (newflags & VM_WRITE) { /* Check space limits when area turns into data. */ @@ -446,46 +627,33 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, return -ENOMEM; newflags |= VM_ACCOUNT; } + } else if ((oldflags & VM_ACCOUNT) && vma_is_anonymous(vma) && + !vma->anon_vma) { + newflags &= ~VM_ACCOUNT; } - /* - * First try to merge with previous and/or next vma. - */ - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *pprev = vma_merge(mm, *pprev, start, end, newflags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); - if (*pprev) { - vma = *pprev; - VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); - goto success; + vma = vma_modify_flags(vmi, *pprev, vma, start, end, newflags); + if (IS_ERR(vma)) { + error = PTR_ERR(vma); + goto fail; } *pprev = vma; - if (start != vma->vm_start) { - error = split_vma(mm, vma, start, 1); - if (error) - goto fail; - } - - if (end != vma->vm_end) { - error = split_vma(mm, vma, end, 0); - if (error) - goto fail; - } - -success: /* * vm_flags and vm_page_prot are protected by the mmap_lock * held in write mode. */ - vma->vm_flags = newflags; - dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot); + vma_start_write(vma); + vm_flags_reset(vma, newflags); + if (vma_wants_manual_pte_write_upgrade(vma)) + mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; vma_set_page_prot(vma); - change_protection(vma, start, end, vma->vm_page_prot, - dirty_accountable ? MM_CP_DIRTY_ACCT : 0); + change_protection(tlb, vma, start, end, mm_cp_flags); + + if ((oldflags & VM_ACCOUNT) && !(newflags & VM_ACCOUNT)) + vm_unacct_memory(nrpages); /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major @@ -514,10 +682,12 @@ static int do_mprotect_pkey(unsigned long start, size_t len, { unsigned long nstart, end, tmp, reqprot; struct vm_area_struct *vma, *prev; - int error = -EINVAL; + int error; const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); + struct mmu_gather tlb; + struct vma_iterator vmi; start = untagged_addr(start); @@ -549,11 +719,12 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey)) goto out; - vma = find_vma(current->mm, start); + vma_iter_init(&vmi, current->mm, start); + vma = vma_find(&vmi, end); error = -ENOMEM; if (!vma) goto out; - prev = vma->vm_prev; + if (unlikely(grows & PROT_GROWSDOWN)) { if (vma->vm_start >= end) goto out; @@ -571,15 +742,23 @@ static int do_mprotect_pkey(unsigned long start, size_t len, goto out; } } + + prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; - for (nstart = start ; ; ) { + tlb_gather_mmu(&tlb, current->mm); + nstart = start; + tmp = vma->vm_start; + for_each_vma_range(vmi, vma, end) { unsigned long mask_off_old_flags; unsigned long newflags; int new_vma_pkey; - /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + if (vma->vm_start != tmp) { + error = -ENOMEM; + break; + } /* Does the application expect PROT_READ to imply PROT_EXEC */ if (rier && (vma->vm_flags & VM_MAYEXEC)) @@ -590,8 +769,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, * If a permission is not passed to mprotect(), it must be * cleared from the VMA. */ - mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC | - VM_FLAGS_CLEAR; + mask_off_old_flags = VM_ACCESS_FLAGS | VM_FLAGS_CLEAR; new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey); newflags = calc_vm_prot_bits(prot, new_vma_pkey); @@ -600,39 +778,47 @@ static int do_mprotect_pkey(unsigned long start, size_t len, /* newflags >> 4 shift VM_MAY% in place of VM_% */ if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) { error = -EACCES; - goto out; + break; + } + + if (map_deny_write_exec(vma, newflags)) { + error = -EACCES; + break; } /* Allow architectures to sanity-check the new flags */ if (!arch_validate_flags(newflags)) { error = -EINVAL; - goto out; + break; } error = security_file_mprotect(vma, reqprot, prot); if (error) - goto out; + break; tmp = vma->vm_end; if (tmp > end) tmp = end; - error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); - if (error) - goto out; - nstart = tmp; - if (nstart < prev->vm_end) - nstart = prev->vm_end; - if (nstart >= end) - goto out; - - vma = prev->vm_next; - if (!vma || vma->vm_start != nstart) { - error = -ENOMEM; - goto out; + if (vma->vm_ops && vma->vm_ops->mprotect) { + error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags); + if (error) + break; } + + error = mprotect_fixup(&vmi, &tlb, vma, &prev, nstart, tmp, newflags); + if (error) + break; + + tmp = vma_iter_end(&vmi); + nstart = tmp; prot = reqprot; } + tlb_finish_mmu(&tlb); + + if (!error && tmp < end) + error = -ENOMEM; + out: mmap_write_unlock(current->mm); return error; @@ -691,7 +877,7 @@ SYSCALL_DEFINE1(pkey_free, int, pkey) mmap_write_unlock(current->mm); /* - * We could provie warnings or errors if any VMA still + * We could provide warnings or errors if any VMA still * has the pkey set here. */ return ret; |