Diffstat (limited to 'mm/khugepaged.c')
-rw-r--r--  mm/khugepaged.c  45
1 file changed, 40 insertions, 5 deletions
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 3c2326568193..f1f98305433e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1060,6 +1060,7 @@ static void collapse_huge_page(struct mm_struct *mm,
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(&range);
+ tlb_remove_table_sync_one();

spin_lock(pte_ptl);
isolated = __collapse_huge_page_isolate(vma, address, pte);
@@ -1312,6 +1313,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
spinlock_t *ptl;
int count = 0;
int i;
+ struct mmu_notifier_range range;

if (!vma || !vma->vm_file ||
vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
@@ -1338,6 +1340,19 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
if (!pmd)
goto drop_hpage;

+ /*
+ * We need to lock the mapping so that from here on, only GUP-fast and
+ * hardware page walks can access the parts of the page tables that
+ * we're operating on.
+ */
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+
+ /*
+ * This spinlock should be unnecessary: Nobody else should be accessing
+ * the page tables under spinlock protection here, only
+ * lockless_pages_from_mm() and the hardware page walker can access page
+ * tables while all the high-level locks are held in write mode.
+ */
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);

/* step 1: check all mapped PTEs are to the right huge page */
@@ -1384,12 +1399,23 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
}

/* step 4: collapse pmd */
- ptl = pmd_lock(vma->vm_mm, pmd);
+ /* we make no change to anon, but protect concurrent anon page lookup */
+ if (vma->anon_vma)
+ anon_vma_lock_write(vma->anon_vma);
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, haddr,
+ haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
_pmd = pmdp_collapse_flush(vma, haddr, pmd);
- spin_unlock(ptl);
mm_dec_nr_ptes(mm);
+ tlb_remove_table_sync_one();
+ mmu_notifier_invalidate_range_end(&range);
pte_free(mm, pmd_pgtable(_pmd));
+ if (vma->anon_vma)
+ anon_vma_unlock_write(vma->anon_vma);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+
drop_hpage:
unlock_page(hpage);
put_page(hpage);
@@ -1397,6 +1423,7 @@ drop_hpage:
abort:
pte_unmap_unlock(start_pte, ptl);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
goto drop_hpage;
}
@@ -1446,7 +1473,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* An alternative would be drop the check, but check that page
* table is clear before calling pmdp_collapse_flush() under
* ptl. It has higher chance to recover THP for the VMA, but
- * has higher cost too.
+ * has higher cost too. It would also probably require locking
+ * the anon_vma.
*/
if (vma->anon_vma)
continue;
@@ -1468,12 +1496,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
*/
if (down_write_trylock(&mm->mmap_sem)) {
if (!khugepaged_test_exit(mm)) {
- spinlock_t *ptl = pmd_lock(mm, pmd);
+ struct mmu_notifier_range range;
+
+ mmu_notifier_range_init(&range,
+ MMU_NOTIFY_CLEAR, 0,
+ NULL, mm, addr,
+ addr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
/* assume page table is clear */
_pmd = pmdp_collapse_flush(vma, addr, pmd);
- spin_unlock(ptl);
mm_dec_nr_ptes(mm);
+ tlb_remove_table_sync_one();
pte_free(mm, pmd_pgtable(_pmd));
+ mmu_notifier_invalidate_range_end(&range);
}
up_write(&mm->mmap_sem);
} else {
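
For readers who want the end state without mentally applying the hunks, the sketch below condenses the post-patch "step 4" sequence of collapse_pte_mapped_thp(). It is assembled from the diff above purely for illustration: the wrapper function name is invented, the earlier PTE-checking steps, the error paths, and the i_mmap_lock_write() call taken before step 1 are omitted, and it is not intended to compile outside the kernel tree.

static void collapse_pmd_step4_sketch(struct mm_struct *mm,
                                      struct vm_area_struct *vma,
                                      unsigned long haddr, pmd_t *pmd)
{
        struct mmu_notifier_range range;
        pmd_t _pmd;

        /*
         * By this point i_mmap_lock_write() is held, so only GUP-fast and
         * the hardware page walker can still reach this page table.
         */

        /* No anon mapping changes, but block concurrent anon page lookups. */
        if (vma->anon_vma)
                anon_vma_lock_write(vma->anon_vma);

        /* Notify secondary MMUs before the PMD is cleared. */
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, haddr,
                                haddr + HPAGE_PMD_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        /* Clear the PMD and flush the TLB; the pmd_lock is no longer taken. */
        _pmd = pmdp_collapse_flush(vma, haddr, pmd);
        mm_dec_nr_ptes(mm);

        /*
         * Synchronize against concurrent GUP-fast walkers that may have
         * sampled the old PMD, so the page table page cannot be freed
         * while they are still walking it.
         */
        tlb_remove_table_sync_one();
        mmu_notifier_invalidate_range_end(&range);

        /* Only now is it safe to free the page table page. */
        pte_free(mm, pmd_pgtable(_pmd));

        if (vma->anon_vma)
                anon_vma_unlock_write(vma->anon_vma);
        i_mmap_unlock_write(vma->vm_file->f_mapping);
}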