diff options
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 149 |
1 files changed, 106 insertions, 43 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6f4ce9547658..2f769a661568 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -68,6 +68,21 @@ DEFINE_SPINLOCK(hugetlb_lock); static int num_fault_mutexes; struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; +static inline bool PageHugeFreed(struct page *head) +{ + return page_private(head + 4) == -1UL; +} + +static inline void SetPageHugeFreed(struct page *head) +{ + set_page_private(head + 4, -1UL); +} + +static inline void ClearPageHugeFreed(struct page *head) +{ + set_page_private(head + 4, 0); +} + /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); @@ -573,13 +588,20 @@ void hugetlb_fix_reserve_counts(struct inode *inode) { struct hugepage_subpool *spool = subpool_inode(inode); long rsv_adjust; + bool reserved = false; rsv_adjust = hugepage_subpool_get_pages(spool, 1); - if (rsv_adjust) { + if (rsv_adjust > 0) { struct hstate *h = hstate_inode(inode); - hugetlb_acct_memory(h, 1); + if (!hugetlb_acct_memory(h, 1)) + reserved = true; + } else if (!rsv_adjust) { + reserved = true; } + + if (!reserved) + pr_warn("hugetlb: Huge Page Reserved count may go negative.\n"); } /* @@ -858,6 +880,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) list_move(&page->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; + SetPageHugeFreed(page); } static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) @@ -875,6 +898,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) return NULL; list_move(&page->lru, &h->hugepage_activelist); set_page_refcounted(page); + ClearPageHugeFreed(page); h->free_huge_pages--; h->free_huge_pages_node[nid]--; return page; @@ -1154,14 +1178,16 @@ static inline void destroy_compound_gigantic_page(struct page *page, static void update_and_free_page(struct hstate *h, struct page *page) { int i; + struct page *subpage = page; if (hstate_is_gigantic(h) && !gigantic_page_supported()) return; h->nr_huge_pages--; h->nr_huge_pages_node[page_to_nid(page)]--; - for (i = 0; i < pages_per_huge_page(h); i++) { - page[i].flags &= ~(1 << PG_locked | 1 << PG_error | + for (i = 0; i < pages_per_huge_page(h); + i++, subpage = mem_map_next(subpage, page, i)) { + subpage->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_private | 1 << PG_writeback); @@ -1196,12 +1222,11 @@ struct hstate *size_to_hstate(unsigned long size) */ bool page_huge_active(struct page *page) { - VM_BUG_ON_PAGE(!PageHuge(page), page); - return PageHead(page) && PagePrivate(&page[1]); + return PageHeadHuge(page) && PagePrivate(&page[1]); } /* never called for tail page */ -static void set_page_huge_active(struct page *page) +void set_page_huge_active(struct page *page) { VM_BUG_ON_PAGE(!PageHeadHuge(page), page); SetPagePrivate(&page[1]); @@ -1305,6 +1330,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) set_hugetlb_cgroup(page, NULL); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; + ClearPageHugeFreed(page); spin_unlock(&hugetlb_lock); } @@ -1500,6 +1526,7 @@ int dissolve_free_huge_page(struct page *page) { int rc = -EBUSY; +retry: /* Not to disrupt normal path by vainly holding hugetlb_lock */ if (!PageHuge(page)) return 0; @@ -1516,6 +1543,26 @@ int dissolve_free_huge_page(struct page *page) int nid = page_to_nid(head); if (h->free_huge_pages - h->resv_huge_pages == 0) goto out; + + /* + * We should make sure that the page is already on the free list + * when it is dissolved. + */ + if (unlikely(!PageHugeFreed(head))) { + spin_unlock(&hugetlb_lock); + cond_resched(); + + /* + * Theoretically, we should return -EBUSY when we + * encounter this race. In fact, we have a chance + * to successfully dissolve the page if we do a + * retry. Because the race window is quite small. + * If we seize this opportunity, it is an optimization + * for increasing the success rate of dissolving page. + */ + goto retry; + } + /* * Move PageHWPoison flag from head page to the raw error page, * which makes any subpages rather than the error page reusable. @@ -2610,8 +2657,10 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, return -ENOMEM; retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); - if (retval) + if (retval) { kobject_put(hstate_kobjs[hi]); + hstate_kobjs[hi] = NULL; + } return retval; } @@ -2918,6 +2967,22 @@ static unsigned int cpuset_mems_nr(unsigned int *array) } #ifdef CONFIG_SYSCTL +static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *length, + loff_t *ppos, unsigned long *out) +{ + struct ctl_table dup_table; + + /* + * In order to avoid races with __do_proc_doulongvec_minmax(), we + * can duplicate the @table and alter the duplicate of it. + */ + dup_table = *table; + dup_table.data = out; + + return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos); +} + static int hugetlb_sysctl_handler_common(bool obey_mempolicy, struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) @@ -2929,9 +2994,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, if (!hugepages_supported()) return -EOPNOTSUPP; - table->data = &tmp; - table->maxlen = sizeof(unsigned long); - ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, + &tmp); if (ret) goto out; @@ -2975,9 +3039,8 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, if (write && hstate_is_gigantic(h)) return -EINVAL; - table->data = &tmp; - table->maxlen = sizeof(unsigned long); - ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, + &tmp); if (ret) goto out; @@ -3799,7 +3862,7 @@ retry: * handling userfault. Reacquire after handling * fault to make calling code simpler. */ - hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr); + hash = hugetlb_fault_mutex_hash(h, mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); ret = handle_userfault(&vmf, VM_UFFD_MISSING); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -3838,7 +3901,7 @@ retry: * So we need to block hugepage fault by PG_hwpoison bit check. */ if (unlikely(PageHWPoison(page))) { - ret = VM_FAULT_HWPOISON | + ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; } @@ -3908,7 +3971,7 @@ backout_unlocked: #ifdef CONFIG_SMP u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, - pgoff_t idx, unsigned long address) + pgoff_t idx) { unsigned long key[2]; u32 hash; @@ -3916,7 +3979,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, key[0] = (unsigned long) mapping; key[1] = idx; - hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); + hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0); return hash & (num_fault_mutexes - 1); } @@ -3926,7 +3989,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, * return 0 and avoid the hashing overhead. */ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, - pgoff_t idx, unsigned long address) + pgoff_t idx) { return 0; } @@ -3970,7 +4033,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr); + hash = hugetlb_fault_mutex_hash(h, mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); @@ -4650,25 +4713,23 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { - unsigned long check_addr = *start; + unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE), + v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); - if (!(vma->vm_flags & VM_MAYSHARE)) + /* + * vma need span at least one aligned PUD size and the start,end range + * must at least partialy within it. + */ + if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) || + (*end <= v_start) || (*start >= v_end)) return; - for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) { - unsigned long a_start = check_addr & PUD_MASK; - unsigned long a_end = a_start + PUD_SIZE; + /* Extend the range to be PUD aligned for a worst case scenario */ + if (*start > v_start) + *start = ALIGN_DOWN(*start, PUD_SIZE); - /* - * If sharing is possible, adjust start/end if necessary. - */ - if (range_in_vma(vma, a_start, a_end)) { - if (a_start < *start) - *start = a_start; - if (a_end > *end) - *end = a_end; - } - } + if (*end < v_end) + *end = ALIGN(*end, PUD_SIZE); } /* @@ -4820,8 +4881,8 @@ pte_t *huge_pte_offset(struct mm_struct *mm, { pgd_t *pgd; p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; + pud_t *pud, pud_entry; + pmd_t *pmd, pmd_entry; pgd = pgd_offset(mm, addr); if (!pgd_present(*pgd)) @@ -4831,17 +4892,19 @@ pte_t *huge_pte_offset(struct mm_struct *mm, return NULL; pud = pud_offset(p4d, addr); - if (sz != PUD_SIZE && pud_none(*pud)) + pud_entry = READ_ONCE(*pud); + if (sz != PUD_SIZE && pud_none(pud_entry)) return NULL; /* hugepage or swap? */ - if (pud_huge(*pud) || !pud_present(*pud)) + if (pud_huge(pud_entry) || !pud_present(pud_entry)) return (pte_t *)pud; pmd = pmd_offset(pud, addr); - if (sz != PMD_SIZE && pmd_none(*pmd)) + pmd_entry = READ_ONCE(*pmd); + if (sz != PMD_SIZE && pmd_none(pmd_entry)) return NULL; /* hugepage or swap? */ - if (pmd_huge(*pmd) || !pmd_present(*pmd)) + if (pmd_huge(pmd_entry) || !pmd_present(pmd_entry)) return (pte_t *)pmd; return NULL; @@ -4928,9 +4991,9 @@ bool isolate_huge_page(struct page *page, struct list_head *list) { bool ret = true; - VM_BUG_ON_PAGE(!PageHead(page), page); spin_lock(&hugetlb_lock); - if (!page_huge_active(page) || !get_page_unless_zero(page)) { + if (!PageHeadHuge(page) || !page_huge_active(page) || + !get_page_unless_zero(page)) { ret = false; goto unlock; } |