aboutsummaryrefslogtreecommitdiffstats
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c239
1 files changed, 144 insertions, 95 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3bac76ae4b30..7b2611a055a7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,6 +69,7 @@
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
+#define MIN_PERCPU_PAGELIST_FRACTION (8)
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
@@ -798,9 +799,21 @@ void __init init_cma_reserved_pageblock(struct page *page)
set_page_count(p, 0);
} while (++p, --i);
- set_page_refcounted(page);
set_pageblock_migratetype(page, MIGRATE_CMA);
- __free_pages(page, pageblock_order);
+
+ if (pageblock_order >= MAX_ORDER) {
+ i = pageblock_nr_pages;
+ p = page;
+ do {
+ set_page_refcounted(p);
+ __free_pages(p, MAX_ORDER - 1);
+ p += MAX_ORDER_NR_PAGES;
+ } while (i -= MAX_ORDER_NR_PAGES);
+ } else {
+ set_page_refcounted(page);
+ __free_pages(page, pageblock_order);
+ }
+
adjust_managed_page_count(page, pageblock_nr_pages);
}
#endif
@@ -1238,15 +1251,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
}
local_irq_restore(flags);
}
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
- return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
-}
-#else
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
- return false;
-}
#endif
/*
@@ -1583,12 +1587,7 @@ again:
get_pageblock_migratetype(page));
}
- /*
- * NOTE: GFP_THISNODE allocations do not partake in the kswapd
- * aging protocol, so they can't be fair.
- */
- if (!gfp_thisnode_allocation(gfp_flags))
- __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+ __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1870,7 +1869,7 @@ static void __paginginit init_zone_allows_reclaim(int nid)
{
int i;
- for_each_online_node(i)
+ for_each_node_state(i, N_MEMORY)
if (node_distance(nid, i) <= RECLAIM_DISTANCE)
node_set(i, NODE_DATA(nid)->reclaim_nodes);
else
@@ -1954,23 +1953,12 @@ zonelist_scan:
* zone size to ensure fair page aging. The zone a
* page was allocated in should have no effect on the
* time the page has in memory before being reclaimed.
- *
- * Try to stay in local zones in the fastpath. If
- * that fails, the slowpath is entered, which will do
- * another pass starting with the local zones, but
- * ultimately fall back to remote zones that do not
- * partake in the fairness round-robin cycle of this
- * zonelist.
- *
- * NOTE: GFP_THISNODE allocations do not partake in
- * the kswapd aging protocol, so they can't be fair.
*/
- if ((alloc_flags & ALLOC_WMARK_LOW) &&
- !gfp_thisnode_allocation(gfp_mask)) {
- if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
- continue;
+ if (alloc_flags & ALLOC_FAIR) {
if (!zone_local(preferred_zone, zone))
continue;
+ if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0)
+ continue;
}
/*
* When allocating a page cache page for writing, we
@@ -2208,6 +2196,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
}
/*
+ * PM-freezer should be notified that there might be an OOM killer on
+ * its way to kill and wake somebody up. This is too early and we might
+ * end up not killing anything but false positives are acceptable.
+ * See freeze_processes.
+ */
+ note_oom_kill();
+
+ /*
* Go through the zonelist yet one more time, keep very high watermark
* here, this is only to catch a parallel oom killing, we must fail if
* we're still under heavy pressure.
@@ -2408,37 +2404,45 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
return page;
}
-static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist,
- enum zone_type high_zoneidx,
- struct zone *preferred_zone)
+static void reset_alloc_batches(struct zonelist *zonelist,
+ enum zone_type high_zoneidx,
+ struct zone *preferred_zone)
{
struct zoneref *z;
struct zone *zone;
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
- if (!(gfp_mask & __GFP_NO_KSWAPD))
- wakeup_kswapd(zone, order, zone_idx(preferred_zone));
/*
* Only reset the batches of zones that were actually
- * considered in the fast path, we don't want to
- * thrash fairness information for zones that are not
+ * considered in the fairness pass, we don't want to
+ * trash fairness information for zones that are not
* actually part of this zonelist's round-robin cycle.
*/
if (!zone_local(preferred_zone, zone))
continue;
mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) -
- low_wmark_pages(zone) -
- zone_page_state(zone, NR_ALLOC_BATCH));
+ high_wmark_pages(zone) - low_wmark_pages(zone) -
+ atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
}
}
+static void wake_all_kswapds(unsigned int order,
+ struct zonelist *zonelist,
+ enum zone_type high_zoneidx,
+ struct zone *preferred_zone)
+{
+ struct zoneref *z;
+ struct zone *zone;
+
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+ wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+}
+
static inline int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
- const gfp_t wait = gfp_mask & __GFP_WAIT;
+ const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -2447,20 +2451,20 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
- * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
+ * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
*/
alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
- if (!wait) {
+ if (atomic) {
/*
- * Not worth trying to allocate harder for
- * __GFP_NOMEMALLOC even if it can't schedule.
+ * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
+ * if it can't schedule.
*/
- if (!(gfp_mask & __GFP_NOMEMALLOC))
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
alloc_flags |= ALLOC_HARDER;
/*
- * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
+ * comment for __cpuset_node_allowed_softwall().
*/
alloc_flags &= ~ALLOC_CPUSET;
} else if (unlikely(rt_task(current)) && !in_interrupt())
@@ -2522,12 +2526,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
* allowed per node queues are empty and that nodes are
* over allocated.
*/
- if (gfp_thisnode_allocation(gfp_mask))
+ if (IS_ENABLED(CONFIG_NUMA) &&
+ (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
restart:
- prepare_slowpath(gfp_mask, order, zonelist,
- high_zoneidx, preferred_zone);
+ if (!(gfp_mask & __GFP_NO_KSWAPD))
+ wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -2711,7 +2716,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct page *page = NULL;
int migratetype = allocflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
- int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
+ int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
struct mem_cgroup *memcg = NULL;
gfp_mask &= gfp_allowed_mask;
@@ -2739,7 +2744,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
return NULL;
retry_cpuset:
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
/* The preferred zone is used for statistics later */
first_zones_zonelist(zonelist, high_zoneidx,
@@ -2752,12 +2757,29 @@ retry_cpuset:
if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
#endif
+retry:
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, alloc_flags,
preferred_zone, migratetype);
if (unlikely(!page)) {
/*
+ * The first pass makes sure allocations are spread
+ * fairly within the local node. However, the local
+ * node might have free pages left after the fairness
+ * batches are exhausted, and remote zones haven't
+ * even been considered yet. Try once more without
+ * fairness, and include remote zones now, before
+ * entering the slowpath and waking kswapd: prefer
+ * spilling to a remote zone over swapping locally.
+ */
+ if (alloc_flags & ALLOC_FAIR) {
+ reset_alloc_batches(zonelist, high_zoneidx,
+ preferred_zone);
+ alloc_flags &= ~ALLOC_FAIR;
+ goto retry;
+ }
+ /*
* Runtime PM, block IO and its error handling path
* can deadlock because I/O on the device might not
* complete.
@@ -2777,7 +2799,7 @@ out:
* the mask is being updated. If a page allocation is about to fail,
* check if the cpuset changed during allocation and if so, retry.
*/
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
memcg_kmem_commit_charge(page, memcg, order);
@@ -3045,9 +3067,9 @@ bool skip_free_areas_node(unsigned int flags, int nid)
goto out;
do {
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
ret = !node_isset(nid, cpuset_current_mems_allowed);
- } while (!put_mems_allowed(cpuset_mems_cookie));
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
out:
return ret;
}
@@ -4105,7 +4127,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif
-static int __meminit zone_batchsize(struct zone *zone)
+static int zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
int batch;
@@ -4221,8 +4243,8 @@ static void pageset_set_high(struct per_cpu_pageset *p,
pageset_update(&p->pcp, high, batch);
}
-static void __meminit pageset_set_high_and_batch(struct zone *zone,
- struct per_cpu_pageset *pcp)
+static void pageset_set_high_and_batch(struct zone *zone,
+ struct per_cpu_pageset *pcp)
{
if (percpu_pagelist_fraction)
pageset_set_high(pcp,
@@ -4919,7 +4941,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
- init_zone_allows_reclaim(nid);
+ if (node_state(nid, N_MEMORY))
+ init_zone_allows_reclaim(nid);
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
#endif
@@ -5647,9 +5670,8 @@ static void __setup_per_zone_wmarks(void)
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
__mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) -
- low_wmark_pages(zone) -
- zone_page_state(zone, NR_ALLOC_BATCH));
+ high_wmark_pages(zone) - low_wmark_pages(zone) -
+ atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
setup_zone_migrate_reserve(zone);
spin_unlock_irqrestore(&zone->lock, flags);
@@ -5847,23 +5869,38 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
- unsigned int cpu;
+ int old_percpu_pagelist_fraction;
int ret;
+ mutex_lock(&pcp_batch_high_lock);
+ old_percpu_pagelist_fraction = percpu_pagelist_fraction;
+
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
- if (!write || (ret < 0))
- return ret;
+ if (!write || ret < 0)
+ goto out;
+
+ /* Sanity checking to avoid pcp imbalance */
+ if (percpu_pagelist_fraction &&
+ percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
+ percpu_pagelist_fraction = old_percpu_pagelist_fraction;
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* No change? */
+ if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
+ goto out;
- mutex_lock(&pcp_batch_high_lock);
for_each_populated_zone(zone) {
- unsigned long high;
- high = zone->managed_pages / percpu_pagelist_fraction;
+ unsigned int cpu;
+
for_each_possible_cpu(cpu)
- pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
- high);
+ pageset_set_high_and_batch(zone,
+ per_cpu_ptr(zone->pageset, cpu));
}
+out:
mutex_unlock(&pcp_batch_high_lock);
- return 0;
+ return ret;
}
int hashdist = HASHDIST_DEFAULT;
@@ -6006,53 +6043,65 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
* @end_bitidx: The last bit of interest
* returns pageblock_bits flags
*/
-unsigned long get_pageblock_flags_group(struct page *page,
- int start_bitidx, int end_bitidx)
+unsigned long get_pageblock_flags_mask(struct page *page,
+ unsigned long end_bitidx,
+ unsigned long mask)
{
struct zone *zone;
unsigned long *bitmap;
- unsigned long pfn, bitidx;
- unsigned long flags = 0;
- unsigned long value = 1;
+ unsigned long pfn, bitidx, word_bitidx;
+ unsigned long word;
zone = page_zone(page);
pfn = page_to_pfn(page);
bitmap = get_pageblock_bitmap(zone, pfn);
bitidx = pfn_to_bitidx(zone, pfn);
+ word_bitidx = bitidx / BITS_PER_LONG;
+ bitidx &= (BITS_PER_LONG-1);
- for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
- if (test_bit(bitidx + start_bitidx, bitmap))
- flags |= value;
-
- return flags;
+ word = bitmap[word_bitidx];
+ bitidx += end_bitidx;
+ return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
}
/**
- * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @start_bitidx: The first bit of interest
* @end_bitidx: The last bit of interest
* @flags: The flags to set
*/
-void set_pageblock_flags_group(struct page *page, unsigned long flags,
- int start_bitidx, int end_bitidx)
+void set_pageblock_flags_mask(struct page *page, unsigned long flags,
+ unsigned long end_bitidx,
+ unsigned long mask)
{
struct zone *zone;
unsigned long *bitmap;
- unsigned long pfn, bitidx;
- unsigned long value = 1;
+ unsigned long pfn, bitidx, word_bitidx;
+ unsigned long old_word, word;
+
+ BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
zone = page_zone(page);
pfn = page_to_pfn(page);
bitmap = get_pageblock_bitmap(zone, pfn);
bitidx = pfn_to_bitidx(zone, pfn);
+ word_bitidx = bitidx / BITS_PER_LONG;
+ bitidx &= (BITS_PER_LONG-1);
+
VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
- for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
- if (flags & value)
- __set_bit(bitidx + start_bitidx, bitmap);
- else
- __clear_bit(bitidx + start_bitidx, bitmap);
+ bitidx += end_bitidx;
+ mask <<= (BITS_PER_LONG - bitidx - 1);
+ flags <<= (BITS_PER_LONG - bitidx - 1);
+
+ word = ACCESS_ONCE(bitmap[word_bitidx]);
+ for (;;) {
+ old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
+ if (word == old_word)
+ break;
+ word = old_word;
+ }
}
/*