aboutsummaryrefslogtreecommitdiffstats
path: root/mm/memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory.c')
-rw-r--r--mm/memory.c526
1 files changed, 325 insertions, 201 deletions
diff --git a/mm/memory.c b/mm/memory.c
index f69fbc251198..cdc4d4c1c858 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -77,6 +77,7 @@
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
+#include <linux/net_mm.h>
#include <trace/events/kmem.h>
@@ -699,15 +700,17 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
struct page *page, unsigned long address,
pte_t *ptep)
{
+ pte_t orig_pte;
pte_t pte;
swp_entry_t entry;
+ orig_pte = ptep_get(ptep);
pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
- if (pte_swp_soft_dirty(*ptep))
+ if (pte_swp_soft_dirty(orig_pte))
pte = pte_mksoft_dirty(pte);
- entry = pte_to_swp_entry(*ptep);
- if (pte_swp_uffd_wp(*ptep))
+ entry = pte_to_swp_entry(orig_pte);
+ if (pte_swp_uffd_wp(orig_pte))
pte = pte_mkuffd_wp(pte);
else if (is_writable_device_exclusive_entry(entry))
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -744,7 +747,7 @@ static int
try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
unsigned long addr)
{
- swp_entry_t entry = pte_to_swp_entry(*src_pte);
+ swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte));
struct page *page = pfn_swap_entry_to_page(entry);
if (trylock_page(page)) {
@@ -768,9 +771,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
unsigned long vm_flags = dst_vma->vm_flags;
- pte_t pte = *src_pte;
+ pte_t orig_pte = ptep_get(src_pte);
+ pte_t pte = orig_pte;
struct page *page;
- swp_entry_t entry = pte_to_swp_entry(pte);
+ swp_entry_t entry = pte_to_swp_entry(orig_pte);
if (likely(!non_swap_entry(entry))) {
if (swap_duplicate(entry) < 0)
@@ -785,8 +789,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
spin_unlock(&mmlist_lock);
}
/* Mark the swap entry as shared. */
- if (pte_swp_exclusive(*src_pte)) {
- pte = pte_swp_clear_exclusive(*src_pte);
+ if (pte_swp_exclusive(orig_pte)) {
+ pte = pte_swp_clear_exclusive(orig_pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
rss[MM_SWAPENTS]++;
@@ -805,9 +809,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
entry = make_readable_migration_entry(
swp_offset(entry));
pte = swp_entry_to_pte(entry);
- if (pte_swp_soft_dirty(*src_pte))
+ if (pte_swp_soft_dirty(orig_pte))
pte = pte_swp_mksoft_dirty(pte);
- if (pte_swp_uffd_wp(*src_pte))
+ if (pte_swp_uffd_wp(orig_pte))
pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
@@ -840,7 +844,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
entry = make_readable_device_private_entry(
swp_offset(entry));
pte = swp_entry_to_pte(entry);
- if (pte_swp_uffd_wp(*src_pte))
+ if (pte_swp_uffd_wp(orig_pte))
pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
@@ -904,7 +908,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
/* All done, just insert the new page copy in the child */
pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot);
pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
- if (userfaultfd_pte_wp(dst_vma, *src_pte))
+ if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte)))
/* Uffd-wp needs to be delivered to dest pte as well */
pte = pte_mkuffd_wp(pte);
set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
@@ -922,7 +926,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
{
struct mm_struct *src_mm = src_vma->vm_mm;
unsigned long vm_flags = src_vma->vm_flags;
- pte_t pte = *src_pte;
+ pte_t pte = ptep_get(src_pte);
struct page *page;
struct folio *folio;
@@ -1002,6 +1006,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
struct mm_struct *src_mm = src_vma->vm_mm;
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
+ pte_t ptent;
spinlock_t *src_ptl, *dst_ptl;
int progress, ret = 0;
int rss[NR_MM_COUNTERS];
@@ -1012,13 +1017,25 @@ again:
progress = 0;
init_rss_vec(rss);
+ /*
+ * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the
+ * error handling here, assume that exclusive mmap_lock on dst and src
+ * protects anon from unexpected THP transitions; with shmem and file
+ * protected by mmap_lock-less collapse skipping areas with anon_vma
+ * (whereas vma_needs_copy() skips areas without anon_vma). A rework
+ * can remove such assumptions later, but this is good enough for now.
+ */
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte) {
ret = -ENOMEM;
goto out;
}
- src_pte = pte_offset_map(src_pmd, addr);
- src_ptl = pte_lockptr(src_mm, src_pmd);
+ src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
+ if (!src_pte) {
+ pte_unmap_unlock(dst_pte, dst_ptl);
+ /* ret == 0 */
+ goto out;
+ }
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
orig_src_pte = src_pte;
orig_dst_pte = dst_pte;
@@ -1035,17 +1052,18 @@ again:
spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
break;
}
- if (pte_none(*src_pte)) {
+ ptent = ptep_get(src_pte);
+ if (pte_none(ptent)) {
progress++;
continue;
}
- if (unlikely(!pte_present(*src_pte))) {
+ if (unlikely(!pte_present(ptent))) {
ret = copy_nonpresent_pte(dst_mm, src_mm,
dst_pte, src_pte,
dst_vma, src_vma,
addr, rss);
if (ret == -EIO) {
- entry = pte_to_swp_entry(*src_pte);
+ entry = pte_to_swp_entry(ptep_get(src_pte));
break;
} else if (ret == -EBUSY) {
break;
@@ -1083,8 +1101,7 @@ again:
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
- spin_unlock(src_ptl);
- pte_unmap(orig_src_pte);
+ pte_unmap_unlock(orig_src_pte, src_ptl);
add_mm_rss_vec(dst_mm, rss);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
@@ -1388,14 +1405,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
swp_entry_t entry;
tlb_change_page_size(tlb, PAGE_SIZE);
-again:
init_rss_vec(rss);
- start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
- pte = start_pte;
+ start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return addr;
+
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
do {
- pte_t ptent = *pte;
+ pte_t ptent = ptep_get(pte);
struct page *page;
if (pte_none(ptent))
@@ -1507,17 +1525,10 @@ again:
* If we forced a TLB flush (either due to running out of
* batch buffers or because we needed to flush dirty TLB
* entries before releasing the ptl), free the batched
- * memory too. Restart if we didn't do everything.
+ * memory too. Come back again if we didn't do everything.
*/
- if (force_flush) {
- force_flush = 0;
+ if (force_flush)
tlb_flush_mmu(tlb);
- }
-
- if (addr != end) {
- cond_resched();
- goto again;
- }
return addr;
}
@@ -1536,8 +1547,10 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
__split_huge_pmd(vma, pmd, addr, false, NULL);
- else if (zap_huge_pmd(tlb, vma, pmd, addr))
- goto next;
+ else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
+ addr = next;
+ continue;
+ }
/* fall through */
} else if (details && details->single_folio &&
folio_test_pmd_mappable(details->single_folio) &&
@@ -1550,20 +1563,14 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
*/
spin_unlock(ptl);
}
-
- /*
- * Here there can be other concurrent MADV_DONTNEED or
- * trans huge page faults running, and if the pmd is
- * none or trans huge it can change under us. This is
- * because MADV_DONTNEED holds the mmap_lock in read
- * mode.
- */
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
- goto next;
- next = zap_pte_range(tlb, vma, pmd, addr, next, details);
-next:
- cond_resched();
- } while (pmd++, addr = next, addr != end);
+ if (pmd_none(*pmd)) {
+ addr = next;
+ continue;
+ }
+ addr = zap_pte_range(tlb, vma, pmd, addr, next, details);
+ if (addr != next)
+ pmd--;
+ } while (pmd++, cond_resched(), addr != end);
return addr;
}
@@ -1821,7 +1828,7 @@ static int validate_page_before_insert(struct page *page)
static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
- if (!pte_none(*pte))
+ if (!pte_none(ptep_get(pte)))
return -EBUSY;
/* Ok, finally just insert the thing.. */
get_page(page);
@@ -1905,6 +1912,10 @@ more:
const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
+ if (!start_pte) {
+ ret = -EFAULT;
+ goto out;
+ }
for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
int err = insert_page_in_batch_locked(vma, pte,
addr, pages[curr_page_idx], prot);
@@ -2111,7 +2122,8 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
return VM_FAULT_OOM;
- if (!pte_none(*pte)) {
+ entry = ptep_get(pte);
+ if (!pte_none(entry)) {
if (mkwrite) {
/*
* For read faults on private mappings the PFN passed
@@ -2123,11 +2135,11 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
* allocation and mapping invalidation so just skip the
* update.
*/
- if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
- WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
+ if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) {
+ WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
goto out_unlock;
}
- entry = pte_mkyoung(*pte);
+ entry = pte_mkyoung(entry);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (ptep_set_access_flags(vma, addr, pte, entry, 1))
update_mmu_cache(vma, addr, pte);
@@ -2339,7 +2351,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
return -ENOMEM;
arch_enter_lazy_mmu_mode();
do {
- BUG_ON(!pte_none(*pte));
+ BUG_ON(!pte_none(ptep_get(pte)));
if (!pfn_modify_allowed(pfn, prot)) {
err = -EACCES;
break;
@@ -2572,15 +2584,15 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
mapped_pte = pte = (mm == &init_mm) ?
pte_offset_kernel(pmd, addr) :
pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return -EINVAL;
}
- BUG_ON(pmd_huge(*pmd));
-
arch_enter_lazy_mmu_mode();
if (fn) {
do {
- if (create || !pte_none(*pte)) {
+ if (create || !pte_none(ptep_get(pte))) {
err = fn(pte++, addr, data);
if (err)
break;
@@ -2781,10 +2793,9 @@ static inline int pte_unmap_same(struct vm_fault *vmf)
int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
if (sizeof(pte_t) > sizeof(unsigned long)) {
- spinlock_t *ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
- spin_lock(ptl);
- same = pte_same(*vmf->pte, vmf->orig_pte);
- spin_unlock(ptl);
+ spin_lock(vmf->ptl);
+ same = pte_same(ptep_get(vmf->pte), vmf->orig_pte);
+ spin_unlock(vmf->ptl);
}
#endif
pte_unmap(vmf->pte);
@@ -2804,7 +2815,6 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
int ret;
void *kaddr;
void __user *uaddr;
- bool locked = false;
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = vmf->address;
@@ -2830,17 +2840,18 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
* On architectures with software "accessed" bits, we would
* take a double page fault, so mark it accessed here.
*/
+ vmf->pte = NULL;
if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
pte_t entry;
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
- locked = true;
- if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
/*
* Other thread has already handled the fault
* and update local tlb only
*/
- update_mmu_tlb(vma, addr, vmf->pte);
+ if (vmf->pte)
+ update_mmu_tlb(vma, addr, vmf->pte);
ret = -EAGAIN;
goto pte_unlock;
}
@@ -2857,15 +2868,15 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
* zeroes.
*/
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
- if (locked)
+ if (vmf->pte)
goto warn;
/* Re-validate under PTL if the page is still mapped */
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
- locked = true;
- if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
/* The PTE changed under us, update local tlb */
- update_mmu_tlb(vma, addr, vmf->pte);
+ if (vmf->pte)
+ update_mmu_tlb(vma, addr, vmf->pte);
ret = -EAGAIN;
goto pte_unlock;
}
@@ -2888,7 +2899,7 @@ warn:
ret = 0;
pte_unlock:
- if (locked)
+ if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
kunmap_atomic(kaddr);
flush_dcache_page(dst);
@@ -3110,7 +3121,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* Re-check the pte - we dropped the lock
*/
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
- if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
if (old_folio) {
if (!folio_test_anon(old_folio)) {
dec_mm_counter(mm, mm_counter_file(&old_folio->page));
@@ -3178,19 +3189,20 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
/* Free the old page.. */
new_folio = old_folio;
page_copied = 1;
- } else {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ } else if (vmf->pte) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
}
- if (new_folio)
- folio_put(new_folio);
-
- pte_unmap_unlock(vmf->pte, vmf->ptl);
/*
* No need to double call mmu_notifier->invalidate_range() callback as
* the above ptep_clear_flush_notify() did already call it.
*/
mmu_notifier_invalidate_range_only_end(&range);
+
+ if (new_folio)
+ folio_put(new_folio);
if (old_folio) {
if (page_copied)
free_swap_cache(&old_folio->page);
@@ -3230,11 +3242,13 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
+ if (!vmf->pte)
+ return VM_FAULT_NOPAGE;
/*
* We might have raced with another page fault while we released the
* pte_offset_map_lock.
*/
- if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+ if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) {
update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return VM_FAULT_NOPAGE;
@@ -3329,7 +3343,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
struct folio *folio = NULL;
if (likely(!unshare)) {
- if (userfaultfd_pte_wp(vma, *vmf->pte)) {
+ if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return handle_userfault(vmf, VM_UFFD_WP);
}
@@ -3388,8 +3402,8 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
goto copy;
if (!folio_test_lru(folio))
/*
- * Note: We cannot easily detect+handle references from
- * remote LRU pagevecs or references to LRU folios.
+ * We cannot easily detect+handle references from
+ * remote LRU caches or references to LRU folios.
*/
lru_add_drain();
if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
@@ -3591,10 +3605,11 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
- if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+ if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);
- pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
folio_unlock(folio);
folio_put(folio);
@@ -3625,6 +3640,8 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
{
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
+ if (!vmf->pte)
+ return 0;
/*
* Be careful so that we will only recover a special uffd-wp pte into a
* none pte. Otherwise it means the pte could have changed, so retry.
@@ -3633,7 +3650,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
* quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR.
* So is_pte_marker() check is not enough to safely drop the pte.
*/
- if (pte_same(vmf->orig_pte, *vmf->pte))
+ if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
@@ -3728,10 +3745,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vmf->page = pfn_swap_entry_to_page(entry);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
- if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
- spin_unlock(vmf->ptl);
- goto out;
- }
+ if (unlikely(!vmf->pte ||
+ !pte_same(ptep_get(vmf->pte),
+ vmf->orig_pte)))
+ goto unlock;
/*
* Get a page reference while we know the page can't be
@@ -3807,7 +3824,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
*/
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
- if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+ if (likely(vmf->pte &&
+ pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
ret = VM_FAULT_OOM;
goto unlock;
}
@@ -3863,7 +3881,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* If we want to map a page that's in the swapcache writable, we
* have to detect via the refcount if we're really the exclusive
* owner. Try removing the extra reference from the local LRU
- * pagevecs if required.
+ * caches if required.
*/
if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
!folio_test_ksm(folio) && !folio_test_lru(folio))
@@ -3877,7 +3895,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
*/
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
- if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
+ if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
goto out_nomap;
if (unlikely(!folio_test_uptodate(folio))) {
@@ -3933,6 +3951,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
}
/*
+ * Some architectures may have to restore extra metadata to the page
+ * when reading from swap. This metadata may be indexed by swap entry
+ * so this must be called before swap_free().
+ */
+ arch_swap_restore(entry, folio);
+
+ /*
* Remove the swap entry and conditionally try to free up the swapcache.
* We're already holding a reference on the page but haven't mapped it
* yet.
@@ -4003,13 +4028,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
- pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
if (si)
put_swap_device(si);
return ret;
out_nomap:
- pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
folio_unlock(folio);
out_release:
@@ -4041,22 +4068,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
/*
- * Use pte_alloc() instead of pte_alloc_map(). We can't run
- * pte_offset_map() on pmds where a huge pmd might be created
- * from a different thread.
- *
- * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
- * parallel threads are excluded by other means.
- *
- * Here we only have mmap_read_lock(mm).
+ * Use pte_alloc() instead of pte_alloc_map(), so that OOM can
+ * be distinguished from a transient failure of pte_offset_map().
*/
if (pte_alloc(vma->vm_mm, vmf->pmd))
return VM_FAULT_OOM;
- /* See comment in handle_pte_fault() */
- if (unlikely(pmd_trans_unstable(vmf->pmd)))
- return 0;
-
/* Use the zero-page for reads */
if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm)) {
@@ -4064,6 +4081,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vma->vm_page_prot));
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
+ if (!vmf->pte)
+ goto unlock;
if (vmf_pte_changed(vmf)) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
goto unlock;
@@ -4104,6 +4123,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
+ if (!vmf->pte)
+ goto release;
if (vmf_pte_changed(vmf)) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
goto release;
@@ -4131,7 +4152,8 @@ setpte:
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
- pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
release:
folio_put(folio);
@@ -4325,9 +4347,9 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
static bool vmf_pte_changed(struct vm_fault *vmf)
{
if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
- return !pte_same(*vmf->pte, vmf->orig_pte);
+ return !pte_same(ptep_get(vmf->pte), vmf->orig_pte);
- return !pte_none(*vmf->pte);
+ return !pte_none(ptep_get(vmf->pte));
}
/**
@@ -4380,15 +4402,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
return VM_FAULT_OOM;
}
- /*
- * See comment in handle_pte_fault() for how this scenario happens, we
- * need to return NOPAGE so that we drop this page.
- */
- if (pmd_devmap_trans_unstable(vmf->pmd))
- return VM_FAULT_NOPAGE;
-
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
+ if (!vmf->pte)
+ return VM_FAULT_NOPAGE;
/* Re-check under ptl */
if (likely(!vmf_pte_changed(vmf))) {
@@ -4630,17 +4647,11 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
*/
if (!vma->vm_ops->fault) {
- /*
- * If we find a migration pmd entry or a none pmd entry, which
- * should never happen, return SIGBUS
- */
- if (unlikely(!pmd_present(*vmf->pmd)))
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (unlikely(!vmf->pte))
ret = VM_FAULT_SIGBUS;
else {
- vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
- vmf->pmd,
- vmf->address,
- &vmf->ptl);
/*
* Make sure this is not a temporary clearing of pte
* by holding ptl and checking again. A R/M/W update
@@ -4648,7 +4659,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
* we don't have concurrent modification by hardware
* followed by an update.
*/
- if (unlikely(pte_none(*vmf->pte)))
+ if (unlikely(pte_none(ptep_get(vmf->pte))))
ret = VM_FAULT_SIGBUS;
else
ret = VM_FAULT_NOPAGE;
@@ -4703,9 +4714,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
* validation through pte_unmap_same(). It's of NUMA type but
* the pfn may be screwed if the read is non atomic.
*/
- vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
- if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+ if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
@@ -4774,9 +4784,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
flags |= TNF_MIGRATED;
} else {
flags |= TNF_MIGRATE_FAIL;
- vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
- spin_lock(vmf->ptl);
- if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (unlikely(!vmf->pte))
+ goto out;
+ if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
@@ -4905,38 +4917,18 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
} else {
/*
- * If a huge pmd materialized under us just retry later. Use
- * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead
- * of pmd_trans_huge() to ensure the pmd didn't become
- * pmd_trans_huge under us and then back to pmd_none, as a
- * result of MADV_DONTNEED running immediately after a huge pmd
- * fault in a different thread of this mm, in turn leading to a
- * misleading pmd_trans_huge() retval. All we have to ensure is
- * that it is a regular pmd that we can walk with
- * pte_offset_map() and we can do that through an atomic read
- * in C, which is what pmd_trans_unstable() provides.
- */
- if (pmd_devmap_trans_unstable(vmf->pmd))
- return 0;
- /*
* A regular pmd is established and it can't morph into a huge
- * pmd from under us anymore at this point because we hold the
- * mmap_lock read mode and khugepaged takes it in write mode.
- * So now it's safe to run pte_offset_map().
+ * pmd by anon khugepaged, since that takes mmap_lock in write
+ * mode; but shmem or file collapse to THP could still morph
+ * it into a huge pmd: just retry later if so.
*/
- vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
- vmf->orig_pte = *vmf->pte;
+ vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (unlikely(!vmf->pte))
+ return 0;
+ vmf->orig_pte = ptep_get_lockless(vmf->pte);
vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;
- /*
- * some architectures can have larger ptes than wordsize,
- * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
- * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
- * accesses. The code below just needs a consistent view
- * for the ifs and we later double check anyway with the
- * ptl lock held. So here a barrier will do.
- */
- barrier();
if (pte_none(vmf->orig_pte)) {
pte_unmap(vmf->pte);
vmf->pte = NULL;
@@ -4952,10 +4944,9 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
return do_numa_page(vmf);
- vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
entry = vmf->orig_pte;
- if (unlikely(!pte_same(*vmf->pte, entry))) {
+ if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
goto unlock;
}
@@ -5060,9 +5051,8 @@ retry_pud:
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- vmf.orig_pmd = *vmf.pmd;
+ vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);
- barrier();
if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
VM_BUG_ON(thp_migration_supported() &&
!is_pmd_migration_entry(vmf.orig_pmd));
@@ -5262,6 +5252,122 @@ out:
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
+#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
+#include <linux/extable.h>
+
+static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+ if (likely(mmap_read_trylock(mm)))
+ return true;
+
+ if (regs && !user_mode(regs)) {
+ unsigned long ip = instruction_pointer(regs);
+ if (!search_exception_tables(ip))
+ return false;
+ }
+
+ return !mmap_read_lock_killable(mm);
+}
+
+static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
+{
+ /*
+ * We don't have this operation yet.
+ *
+ * It should be easy enough to do: it's basically a
+ * atomic_long_try_cmpxchg_acquire()
+ * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
+ * it also needs the proper lockdep magic etc.
+ */
+ return false;
+}
+
+static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+ mmap_read_unlock(mm);
+ if (regs && !user_mode(regs)) {
+ unsigned long ip = instruction_pointer(regs);
+ if (!search_exception_tables(ip))
+ return false;
+ }
+ return !mmap_write_lock_killable(mm);
+}
+
+/*
+ * Helper for page fault handling.
+ *
+ * This is kind of equivalend to "mmap_read_lock()" followed
+ * by "find_extend_vma()", except it's a lot more careful about
+ * the locking (and will drop the lock on failure).
+ *
+ * For example, if we have a kernel bug that causes a page
+ * fault, we don't want to just use mmap_read_lock() to get
+ * the mm lock, because that would deadlock if the bug were
+ * to happen while we're holding the mm lock for writing.
+ *
+ * So this checks the exception tables on kernel faults in
+ * order to only do this all for instructions that are actually
+ * expected to fault.
+ *
+ * We can also actually take the mm lock for writing if we
+ * need to extend the vma, which helps the VM layer a lot.
+ */
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+ unsigned long addr, struct pt_regs *regs)
+{
+ struct vm_area_struct *vma;
+
+ if (!get_mmap_lock_carefully(mm, regs))
+ return NULL;
+
+ vma = find_vma(mm, addr);
+ if (likely(vma && (vma->vm_start <= addr)))
+ return vma;
+
+ /*
+ * Well, dang. We might still be successful, but only
+ * if we can extend a vma to do so.
+ */
+ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
+ mmap_read_unlock(mm);
+ return NULL;
+ }
+
+ /*
+ * We can try to upgrade the mmap lock atomically,
+ * in which case we can continue to use the vma
+ * we already looked up.
+ *
+ * Otherwise we'll have to drop the mmap lock and
+ * re-take it, and also look up the vma again,
+ * re-checking it.
+ */
+ if (!mmap_upgrade_trylock(mm)) {
+ if (!upgrade_mmap_lock_carefully(mm, regs))
+ return NULL;
+
+ vma = find_vma(mm, addr);
+ if (!vma)
+ goto fail;
+ if (vma->vm_start <= addr)
+ goto success;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto fail;
+ }
+
+ if (expand_stack_locked(vma, addr))
+ goto fail;
+
+success:
+ mmap_write_downgrade(mm);
+ return vma;
+
+fail:
+ mmap_write_unlock(mm);
+ return NULL;
+}
+#endif
+
#ifdef CONFIG_PER_VMA_LOCK
/*
* Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
@@ -5280,31 +5386,32 @@ retry:
if (!vma)
goto inval;
- /* Only anonymous vmas are supported for now */
- if (!vma_is_anonymous(vma))
- goto inval;
-
- /* find_mergeable_anon_vma uses adjacent vmas which are not locked */
- if (!vma->anon_vma)
+ /* Only anonymous and tcp vmas are supported for now */
+ if (!vma_is_anonymous(vma) && !vma_is_tcp(vma))
goto inval;
if (!vma_start_read(vma))
goto inval;
/*
+ * find_mergeable_anon_vma uses adjacent vmas which are not locked.
+ * This check must happen after vma_start_read(); otherwise, a
+ * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA
+ * from its anon_vma.
+ */
+ if (unlikely(!vma->anon_vma && !vma_is_tcp(vma)))
+ goto inval_end_read;
+
+ /*
* Due to the possibility of userfault handler dropping mmap_lock, avoid
* it for now and fall back to page fault handling under mmap_lock.
*/
- if (userfaultfd_armed(vma)) {
- vma_end_read(vma);
- goto inval;
- }
+ if (userfaultfd_armed(vma))
+ goto inval_end_read;
/* Check since vm_start/vm_end might change before we lock the VMA */
- if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
- vma_end_read(vma);
- goto inval;
- }
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ goto inval_end_read;
/* Check if the VMA got isolated after we found it */
if (vma->detached) {
@@ -5316,6 +5423,9 @@ retry:
rcu_read_unlock();
return vma;
+
+inval_end_read:
+ vma_end_read(vma);
inval:
rcu_read_unlock();
count_vm_vma_lock_event(VMA_LOCK_ABORT);
@@ -5439,11 +5549,10 @@ int follow_pte(struct mm_struct *mm, unsigned long address,
pmd = pmd_offset(pud, address);
VM_BUG_ON(pmd_trans_huge(*pmd));
- if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
- goto out;
-
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
- if (!pte_present(*ptep))
+ if (!ptep)
+ goto out;
+ if (!pte_present(ptep_get(ptep)))
goto unlock;
*ptepp = ptep;
return 0;
@@ -5480,7 +5589,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
if (ret)
return ret;
- *pfn = pte_pfn(*ptep);
+ *pfn = pte_pfn(ptep_get(ptep));
pte_unmap_unlock(ptep, ptl);
return 0;
}
@@ -5500,7 +5609,7 @@ int follow_phys(struct vm_area_struct *vma,
if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
goto out;
- pte = *ptep;
+ pte = ptep_get(ptep);
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
@@ -5544,7 +5653,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
retry:
if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
return -EINVAL;
- pte = *ptep;
+ pte = ptep_get(ptep);
pte_unmap_unlock(ptep, ptl);
prot = pgprot_val(pte_pgprot(pte));
@@ -5560,7 +5669,7 @@ retry:
if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
goto out_unmap;
- if (!pte_same(pte, *ptep)) {
+ if (!pte_same(pte, ptep_get(ptep))) {
pte_unmap_unlock(ptep, ptl);
iounmap(maddr);
@@ -5587,39 +5696,54 @@ EXPORT_SYMBOL_GPL(generic_access_phys);
int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
int len, unsigned int gup_flags)
{
- struct vm_area_struct *vma;
void *old_buf = buf;
int write = gup_flags & FOLL_WRITE;
if (mmap_read_lock_killable(mm))
return 0;
+ /* Untag the address before looking up the VMA */
+ addr = untagged_addr_remote(mm, addr);
+
+ /* Avoid triggering the temporary warning in __get_user_pages */
+ if (!vma_lookup(mm, addr) && !expand_stack(mm, addr))
+ return 0;
+
/* ignore errors, just check how much was successfully transferred */
while (len) {
- int bytes, ret, offset;
+ int bytes, offset;
void *maddr;
- struct page *page = NULL;
+ struct vm_area_struct *vma = NULL;
+ struct page *page = get_user_page_vma_remote(mm, addr,
+ gup_flags, &vma);
+
+ if (IS_ERR_OR_NULL(page)) {
+ /* We might need to expand the stack to access it */
+ vma = vma_lookup(mm, addr);
+ if (!vma) {
+ vma = expand_stack(mm, addr);
+
+ /* mmap_lock was dropped on failure */
+ if (!vma)
+ return buf - old_buf;
+
+ /* Try again if stack expansion worked */
+ continue;
+ }
+
- ret = get_user_pages_remote(mm, addr, 1,
- gup_flags, &page, &vma, NULL);
- if (ret <= 0) {
-#ifndef CONFIG_HAVE_IOREMAP_PROT
- break;
-#else
/*
* Check if this is a VM_IO | VM_PFNMAP VMA, which
* we can access using slightly different code.
*/
- vma = vma_lookup(mm, addr);
- if (!vma)
- break;
+ bytes = 0;
+#ifdef CONFIG_HAVE_IOREMAP_PROT
if (vma->vm_ops && vma->vm_ops->access)
- ret = vma->vm_ops->access(vma, addr, buf,
- len, write);
- if (ret <= 0)
- break;
- bytes = ret;
+ bytes = vma->vm_ops->access(vma, addr, buf,
+ len, write);
#endif
+ if (bytes <= 0)
+ break;
} else {
bytes = len;
offset = addr & (PAGE_SIZE-1);