aboutsummaryrefslogtreecommitdiffstats
path: root/mm/slub.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/slub.c')
-rw-r--r--mm/slub.c756
1 files changed, 502 insertions, 254 deletions
diff --git a/mm/slub.c b/mm/slub.c
index e32ded30506e..27660a93d456 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -44,13 +44,22 @@
/*
* Lock order:
* 1. slab_mutex (Global Mutex)
- * 2. node->list_lock
- * 3. slab_lock(page) (Only on some arches and for debugging)
+ * 2. node->list_lock (Spinlock)
+ * OR
+ * kmem_cache->cpu_slab->lock (Local lock)
+ * 3. slab_lock(page) (Only on some arches or for debugging)
+ * 4. object_map_lock (Only for debugging)
*
* slab_mutex
*
* The role of the slab_mutex is to protect the list of all the slabs
* and to synchronize major metadata changes to slab cache structures.
+ * Also synchronizes memory hotplug callbacks.
+ *
+ * slab_lock
+ *
+ * The slab_lock is a wrapper around the page lock, thus it is a bit
+ * spinlock.
*
* The slab_lock is only used for debugging and on arches that do not
* have the ability to do a cmpxchg_double. It only protects:
@@ -59,6 +68,8 @@
* C. page->objects -> Number of objects in page
* D. page->frozen -> frozen state
*
+ * Frozen slabs
+ *
* If a slab is frozen then it is exempt from list management. It is not
* on any list except per cpu partial list. The processor that froze the
* slab is the one who can perform list operations on the page. Other
@@ -66,6 +77,8 @@
* froze the slab is the only one that can retrieve the objects from the
* page's freelist.
*
+ * list_lock
+ *
* The list_lock protects the partial and full list on each node and
* the partial slab counter. If taken then no new slabs may be added or
* removed from the lists nor make the number of partial slabs be modified.
@@ -77,10 +90,36 @@
* slabs, operations can continue without any centralized lock. F.e.
* allocating a long series of objects that fill up slabs does not require
* the list lock.
- * Interrupts are disabled during allocation and deallocation in order to
- * make the slab allocator safe to use in the context of an irq. In addition
- * interrupts are disabled to ensure that the processor does not change
- * while handling per_cpu slabs, due to kernel preemption.
+ *
+ * cpu_slab->lock local lock
+ *
+ * This locks protect slowpath manipulation of all kmem_cache_cpu fields
+ * except the stat counters. This is a percpu structure manipulated only by
+ * the local cpu, so the lock protects against being preempted or interrupted
+ * by an irq. Fast path operations rely on lockless operations instead.
+ * On PREEMPT_RT, the local lock does not actually disable irqs (and thus
+ * prevent the lockless operations), so fastpath operations also need to take
+ * the lock and are no longer lockless.
+ *
+ * lockless fastpaths
+ *
+ * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
+ * are fully lockless when satisfied from the percpu slab (and when
+ * cmpxchg_double is possible to use, otherwise slab_lock is taken).
+ * They also don't disable preemption or migration or irqs. They rely on
+ * the transaction id (tid) field to detect being preempted or moved to
+ * another cpu.
+ *
+ * irq, preemption, migration considerations
+ *
+ * Interrupts are disabled as part of list_lock or local_lock operations, or
+ * around the slab_lock operation, in order to make the slab allocator safe
+ * to use in the context of an irq.
+ *
+ * In addition, preemption (or migration on PREEMPT_RT) is disabled in the
+ * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
+ * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
+ * doesn't have to be revalidated in each section protected by the local lock.
*
* SLUB assigns one slab for allocation to each processor.
* Allocations only occur from these slabs called cpu slabs.
@@ -116,6 +155,26 @@
* the fast path and disables lockless freelists.
*/
+/*
+ * We could simply use migrate_disable()/enable() but as long as it's a
+ * function call even on !PREEMPT_RT, use inline preempt_disable() there.
+ */
+#ifndef CONFIG_PREEMPT_RT
+#define slub_get_cpu_ptr(var) get_cpu_ptr(var)
+#define slub_put_cpu_ptr(var) put_cpu_ptr(var)
+#else
+#define slub_get_cpu_ptr(var) \
+({ \
+ migrate_disable(); \
+ this_cpu_ptr(var); \
+})
+#define slub_put_cpu_ptr(var) \
+do { \
+ (void)(var); \
+ migrate_enable(); \
+} while (0)
+#endif
+
#ifdef CONFIG_SLUB_DEBUG
#ifdef CONFIG_SLUB_DEBUG_ON
DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
@@ -354,25 +413,42 @@ static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
/*
* Per slab locking using the pagelock
*/
-static __always_inline void slab_lock(struct page *page)
+static __always_inline void
+__slab_lock(struct page *page, unsigned long *flags, bool disable_irqs)
{
VM_BUG_ON_PAGE(PageTail(page), page);
+ if (disable_irqs)
+ local_irq_save(*flags);
bit_spin_lock(PG_locked, &page->flags);
}
-static __always_inline void slab_unlock(struct page *page)
+static __always_inline void
+__slab_unlock(struct page *page, unsigned long *flags, bool disable_irqs)
{
VM_BUG_ON_PAGE(PageTail(page), page);
__bit_spin_unlock(PG_locked, &page->flags);
+ if (disable_irqs)
+ local_irq_restore(*flags);
}
-/* Interrupts must be disabled (for the fallback code to work right) */
-static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
+static __always_inline void
+slab_lock(struct page *page, unsigned long *flags)
+{
+ __slab_lock(page, flags, IS_ENABLED(CONFIG_PREEMPT_RT));
+}
+
+static __always_inline void slab_unlock(struct page *page, unsigned long *flags)
+{
+ __slab_unlock(page, flags, IS_ENABLED(CONFIG_PREEMPT_RT));
+}
+
+static inline bool ___cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
- const char *n)
+ const char *n, bool disable_irqs)
{
- VM_BUG_ON(!irqs_disabled());
+ if (!disable_irqs)
+ lockdep_assert_irqs_disabled();
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
if (s->flags & __CMPXCHG_DOUBLE) {
@@ -383,15 +459,17 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
} else
#endif
{
- slab_lock(page);
+ unsigned long flags;
+
+ __slab_lock(page, &flags, disable_irqs);
if (page->freelist == freelist_old &&
page->counters == counters_old) {
page->freelist = freelist_new;
page->counters = counters_new;
- slab_unlock(page);
+ __slab_unlock(page, &flags, disable_irqs);
return true;
}
- slab_unlock(page);
+ __slab_unlock(page, &flags, disable_irqs);
}
cpu_relax();
@@ -404,51 +482,46 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
return false;
}
+/*
+ * Interrupts must be disabled (for the fallback code to work right), typically
+ * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different
+ * so we disable interrupts explicitly here.
+ */
+static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
+ void *freelist_old, unsigned long counters_old,
+ void *freelist_new, unsigned long counters_new,
+ const char *n)
+{
+ return ___cmpxchg_double_slab(s, page, freelist_old, counters_old,
+ freelist_new, counters_new, n,
+ IS_ENABLED(CONFIG_PREEMPT_RT));
+}
+
static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
const char *n)
{
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
- defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
- if (s->flags & __CMPXCHG_DOUBLE) {
- if (cmpxchg_double(&page->freelist, &page->counters,
- freelist_old, counters_old,
- freelist_new, counters_new))
- return true;
- } else
-#endif
- {
- unsigned long flags;
+ return ___cmpxchg_double_slab(s, page, freelist_old, counters_old,
+ freelist_new, counters_new, n, true);
+}
- local_irq_save(flags);
- slab_lock(page);
- if (page->freelist == freelist_old &&
- page->counters == counters_old) {
- page->freelist = freelist_new;
- page->counters = counters_new;
- slab_unlock(page);
- local_irq_restore(flags);
- return true;
- }
- slab_unlock(page);
- local_irq_restore(flags);
- }
+#ifdef CONFIG_SLUB_DEBUG
+static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
+static DEFINE_RAW_SPINLOCK(object_map_lock);
- cpu_relax();
- stat(s, CMPXCHG_DOUBLE_FAIL);
+static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
+ struct page *page)
+{
+ void *addr = page_address(page);
+ void *p;
-#ifdef SLUB_DEBUG_CMPXCHG
- pr_info("%s %s: cmpxchg double redo ", n, s->name);
-#endif
+ bitmap_zero(obj_map, page->objects);
- return false;
+ for (p = page->freelist; p; p = get_freepointer(s, p))
+ set_bit(__obj_to_index(s, addr, p), obj_map);
}
-#ifdef CONFIG_SLUB_DEBUG
-static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
-static DEFINE_SPINLOCK(object_map_lock);
-
/*
* Determine a map of object in use on a page.
*
@@ -458,17 +531,11 @@ static DEFINE_SPINLOCK(object_map_lock);
static unsigned long *get_map(struct kmem_cache *s, struct page *page)
__acquires(&object_map_lock)
{
- void *p;
- void *addr = page_address(page);
-
VM_BUG_ON(!irqs_disabled());
- spin_lock(&object_map_lock);
+ raw_spin_lock(&object_map_lock);
- bitmap_zero(object_map, page->objects);
-
- for (p = page->freelist; p; p = get_freepointer(s, p))
- set_bit(__obj_to_index(s, addr, p), object_map);
+ __fill_map(object_map, s, page);
return object_map;
}
@@ -476,7 +543,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page)
static void put_map(unsigned long *map) __releases(&object_map_lock)
{
VM_BUG_ON(map != object_map);
- spin_unlock(&object_map_lock);
+ raw_spin_unlock(&object_map_lock);
}
static inline unsigned int size_from_object(struct kmem_cache *s)
@@ -963,8 +1030,6 @@ static int check_slab(struct kmem_cache *s, struct page *page)
{
int maxobj;
- VM_BUG_ON(!irqs_disabled());
-
if (!PageSlab(page)) {
slab_err(s, page, "Not a valid slab page");
return 0;
@@ -1225,11 +1290,11 @@ static noinline int free_debug_processing(
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
void *object = head;
int cnt = 0;
- unsigned long flags;
+ unsigned long flags, flags2;
int ret = 0;
spin_lock_irqsave(&n->list_lock, flags);
- slab_lock(page);
+ slab_lock(page, &flags2);
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
if (!check_slab(s, page))
@@ -1262,7 +1327,7 @@ out:
slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
bulk_cnt, cnt);
- slab_unlock(page);
+ slab_unlock(page, &flags2);
spin_unlock_irqrestore(&n->list_lock, flags);
if (!ret)
slab_fix(s, "Object at 0x%p not freed", object);
@@ -1540,20 +1605,8 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s,
{
kmemleak_free_recursive(x, s->flags);
- /*
- * Trouble is that we may no longer disable interrupts in the fast path
- * So in order to make the debug calls that expect irqs to be
- * disabled we need to disable interrupts temporarily.
- */
-#ifdef CONFIG_LOCKDEP
- {
- unsigned long flags;
+ debug_check_no_locks_freed(x, s->object_size);
- local_irq_save(flags);
- debug_check_no_locks_freed(x, s->object_size);
- local_irq_restore(flags);
- }
-#endif
if (!(s->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(x, s->object_size);
@@ -1770,9 +1823,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
flags &= gfp_allowed_mask;
- if (gfpflags_allow_blocking(flags))
- local_irq_enable();
-
flags |= s->allocflags;
/*
@@ -1831,8 +1881,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
page->frozen = 1;
out:
- if (gfpflags_allow_blocking(flags))
- local_irq_disable();
if (!page)
return NULL;
@@ -1846,6 +1894,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
if (unlikely(flags & GFP_SLAB_BUG_MASK))
flags = kmalloc_fix_flags(flags);
+ WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
+
return allocate_slab(s,
flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
}
@@ -1976,11 +2026,12 @@ static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
* Try to allocate a partial slab from a specific node.
*/
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
- struct kmem_cache_cpu *c, gfp_t flags)
+ struct page **ret_page, gfp_t gfpflags)
{
struct page *page, *page2;
void *object = NULL;
unsigned int available = 0;
+ unsigned long flags;
int objects;
/*
@@ -1992,11 +2043,11 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
if (!n || !n->nr_partial)
return NULL;
- spin_lock(&n->list_lock);
+ spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry_safe(page, page2, &n->partial, slab_list) {
void *t;
- if (!pfmemalloc_match(page, flags))
+ if (!pfmemalloc_match(page, gfpflags))
continue;
t = acquire_slab(s, n, page, object == NULL, &objects);
@@ -2005,7 +2056,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
available += objects;
if (!object) {
- c->page = page;
+ *ret_page = page;
stat(s, ALLOC_FROM_PARTIAL);
object = t;
} else {
@@ -2017,7 +2068,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
break;
}
- spin_unlock(&n->list_lock);
+ spin_unlock_irqrestore(&n->list_lock, flags);
return object;
}
@@ -2025,7 +2076,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
* Get a page from somewhere. Search in increasing NUMA distances.
*/
static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
- struct kmem_cache_cpu *c)
+ struct page **ret_page)
{
#ifdef CONFIG_NUMA
struct zonelist *zonelist;
@@ -2067,7 +2118,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
if (n && cpuset_zone_allowed(zone, flags) &&
n->nr_partial > s->min_partial) {
- object = get_partial_node(s, n, c, flags);
+ object = get_partial_node(s, n, ret_page, flags);
if (object) {
/*
* Don't check read_mems_allowed_retry()
@@ -2089,7 +2140,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
* Get a partial page, lock it and return it.
*/
static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
- struct kmem_cache_cpu *c)
+ struct page **ret_page)
{
void *object;
int searchnode = node;
@@ -2097,11 +2148,11 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
if (node == NUMA_NO_NODE)
searchnode = numa_mem_id();
- object = get_partial_node(s, get_node(s, searchnode), c, flags);
+ object = get_partial_node(s, get_node(s, searchnode), ret_page, flags);
if (object || node != NUMA_NO_NODE)
return object;
- return get_any_partial(s, flags, c);
+ return get_any_partial(s, flags, ret_page);
}
#ifdef CONFIG_PREEMPTION
@@ -2168,16 +2219,23 @@ static inline void note_cmpxchg_failure(const char *n,
static void init_kmem_cache_cpus(struct kmem_cache *s)
{
int cpu;
+ struct kmem_cache_cpu *c;
- for_each_possible_cpu(cpu)
- per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(s->cpu_slab, cpu);
+ local_lock_init(&c->lock);
+ c->tid = init_tid(cpu);
+ }
}
/*
- * Remove the cpu slab
+ * Finishes removing the cpu slab. Merges cpu's freelist with page's freelist,
+ * unfreezes the slabs and puts it on the proper list.
+ * Assumes the slab has been already safely taken away from kmem_cache_cpu
+ * by the caller.
*/
static void deactivate_slab(struct kmem_cache *s, struct page *page,
- void *freelist, struct kmem_cache_cpu *c)
+ void *freelist)
{
enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
@@ -2185,6 +2243,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
enum slab_modes l = M_NONE, m = M_NONE;
void *nextfree, *freelist_iter, *freelist_tail;
int tail = DEACTIVATE_TO_HEAD;
+ unsigned long flags = 0;
struct page new;
struct page old;
@@ -2260,7 +2319,7 @@ redo:
* that acquire_slab() will see a slab page that
* is frozen
*/
- spin_lock(&n->list_lock);
+ spin_lock_irqsave(&n->list_lock, flags);
}
} else {
m = M_FULL;
@@ -2271,7 +2330,7 @@ redo:
* slabs from diagnostic functions will not see
* any frozen slabs.
*/
- spin_lock(&n->list_lock);
+ spin_lock_irqsave(&n->list_lock, flags);
}
}
@@ -2288,14 +2347,14 @@ redo:
}
l = m;
- if (!__cmpxchg_double_slab(s, page,
+ if (!cmpxchg_double_slab(s, page,
old.freelist, old.counters,
new.freelist, new.counters,
"unfreezing slab"))
goto redo;
if (lock)
- spin_unlock(&n->list_lock);
+ spin_unlock_irqrestore(&n->list_lock, flags);
if (m == M_PARTIAL)
stat(s, tail);
@@ -2306,38 +2365,29 @@ redo:
discard_slab(s, page);
stat(s, FREE_SLAB);
}
-
- c->page = NULL;
- c->freelist = NULL;
}
-/*
- * Unfreeze all the cpu partial slabs.
- *
- * This function must be called with interrupts disabled
- * for the cpu using c (or some other guarantee must be there
- * to guarantee no concurrent accesses).
- */
-static void unfreeze_partials(struct kmem_cache *s,
- struct kmem_cache_cpu *c)
-{
#ifdef CONFIG_SLUB_CPU_PARTIAL
+static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page)
+{
struct kmem_cache_node *n = NULL, *n2 = NULL;
struct page *page, *discard_page = NULL;
+ unsigned long flags = 0;
- while ((page = slub_percpu_partial(c))) {
+ while (partial_page) {
struct page new;
struct page old;
- slub_set_percpu_partial(c, page);
+ page = partial_page;
+ partial_page = page->next;
n2 = get_node(s, page_to_nid(page));
if (n != n2) {
if (n)
- spin_unlock(&n->list_lock);
+ spin_unlock_irqrestore(&n->list_lock, flags);
n = n2;
- spin_lock(&n->list_lock);
+ spin_lock_irqsave(&n->list_lock, flags);
}
do {
@@ -2366,7 +2416,7 @@ static void unfreeze_partials(struct kmem_cache *s,
}
if (n)
- spin_unlock(&n->list_lock);
+ spin_unlock_irqrestore(&n->list_lock, flags);
while (discard_page) {
page = discard_page;
@@ -2376,10 +2426,50 @@ static void unfreeze_partials(struct kmem_cache *s,
discard_slab(s, page);
stat(s, FREE_SLAB);
}
-#endif /* CONFIG_SLUB_CPU_PARTIAL */
}
/*
+ * Unfreeze all the cpu partial slabs.
+ *
+ * This function must be called with preemption or migration
+ * disabled.
+ */
+static void unfreeze_partials(struct kmem_cache *s)
+{
+ struct page *partial_page;
+
+ do {
+ partial_page = this_cpu_read(s->cpu_slab->partial);
+
+ } while (partial_page &&
+ this_cpu_cmpxchg(s->cpu_slab->partial, partial_page, NULL)
+ != partial_page);
+
+ if (partial_page)
+ __unfreeze_partials(s, partial_page);
+}
+
+static void unfreeze_partials_cpu(struct kmem_cache *s,
+ struct kmem_cache_cpu *c)
+{
+ struct page *partial_page;
+
+ partial_page = slub_percpu_partial(c);
+ c->partial = NULL;
+
+ if (partial_page)
+ __unfreeze_partials(s, partial_page);
+}
+
+#else /* CONFIG_SLUB_CPU_PARTIAL */
+
+static void unfreeze_partials(struct kmem_cache *s) { }
+static void unfreeze_partials_cpu(struct kmem_cache *s,
+ struct kmem_cache_cpu *c) { }
+
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
+
+/*
* Put a page that was just frozen (in __slab_free|get_partial_node) into a
* partial page slot if available.
*
@@ -2393,7 +2483,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
int pages;
int pobjects;
- preempt_disable();
+ slub_get_cpu_ptr(s->cpu_slab);
do {
pages = 0;
pobjects = 0;
@@ -2403,14 +2493,11 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
pobjects = oldpage->pobjects;
pages = oldpage->pages;
if (drain && pobjects > slub_cpu_partial(s)) {
- unsigned long flags;
/*
* partial array is full. Move the existing
* set to the per node partial list.
*/
- local_irq_save(flags);
- unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
- local_irq_restore(flags);
+ unfreeze_partials(s);
oldpage = NULL;
pobjects = 0;
pages = 0;
@@ -2427,58 +2514,113 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
!= oldpage);
- if (unlikely(!slub_cpu_partial(s))) {
- unsigned long flags;
-
- local_irq_save(flags);
- unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
- local_irq_restore(flags);
- }
- preempt_enable();
+ slub_put_cpu_ptr(s->cpu_slab);
#endif /* CONFIG_SLUB_CPU_PARTIAL */
}
-static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
+static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c,
+ bool lock)
{
- stat(s, CPUSLAB_FLUSH);
- deactivate_slab(s, c->page, c->freelist, c);
+ unsigned long flags;
+ void *freelist;
+ struct page *page;
+
+ if (lock)
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
+
+ freelist = c->freelist;
+ page = c->page;
+ c->page = NULL;
+ c->freelist = NULL;
c->tid = next_tid(c->tid);
+
+ if (lock)
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+
+ if (page)
+ deactivate_slab(s, page, freelist);
+
+ stat(s, CPUSLAB_FLUSH);
}
-/*
- * Flush cpu slab.
- *
- * Called from IPI handler with interrupts disabled.
- */
static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
{
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
if (c->page)
- flush_slab(s, c);
+ flush_slab(s, c, false);
- unfreeze_partials(s, c);
+ unfreeze_partials_cpu(s, c);
}
-static void flush_cpu_slab(void *d)
+struct slub_flush_work {
+ struct work_struct work;
+ struct kmem_cache *s;
+ bool skip;
+};
+
+/*
+ * Flush cpu slab.
+ *
+ * Called from CPU work handler with migration disabled.
+ */
+static void flush_cpu_slab(struct work_struct *w)
{
- struct kmem_cache *s = d;
+ struct kmem_cache *s;
+ struct kmem_cache_cpu *c;
+ struct slub_flush_work *sfw;
- __flush_cpu_slab(s, smp_processor_id());
+ sfw = container_of(w, struct slub_flush_work, work);
+
+ s = sfw->s;
+ c = this_cpu_ptr(s->cpu_slab);
+
+ if (c->page)
+ flush_slab(s, c, true);
+
+ unfreeze_partials(s);
}
-static bool has_cpu_slab(int cpu, void *info)
+static bool has_cpu_slab(int cpu, struct kmem_cache *s)
{
- struct kmem_cache *s = info;
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
return c->page || slub_percpu_partial(c);
}
+static DEFINE_MUTEX(flush_lock);
+static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
+
static void flush_all(struct kmem_cache *s)
{
- on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
+ struct slub_flush_work *sfw;
+ unsigned int cpu;
+
+ cpus_read_lock();
+ mutex_lock(&flush_lock);
+
+ for_each_online_cpu(cpu) {
+ sfw = &per_cpu(slub_flush, cpu);
+ if (!has_cpu_slab(cpu, s)) {
+ sfw->skip = true;
+ continue;
+ }
+ INIT_WORK(&sfw->work, flush_cpu_slab);
+ sfw->skip = false;
+ sfw->s = s;
+ schedule_work_on(cpu, &sfw->work);
+ }
+
+ for_each_online_cpu(cpu) {
+ sfw = &per_cpu(slub_flush, cpu);
+ if (sfw->skip)
+ continue;
+ flush_work(&sfw->work);
+ }
+
+ mutex_unlock(&flush_lock);
+ cpus_read_unlock();
}
/*
@@ -2488,14 +2630,10 @@ static void flush_all(struct kmem_cache *s)
static int slub_cpu_dead(unsigned int cpu)
{
struct kmem_cache *s;
- unsigned long flags;
mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_caches, list) {
- local_irq_save(flags);
+ list_for_each_entry(s, &slab_caches, list)
__flush_cpu_slab(s, cpu);
- local_irq_restore(flags);
- }
mutex_unlock(&slab_mutex);
return 0;
}
@@ -2578,41 +2716,6 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
#endif
}
-static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
- int node, struct kmem_cache_cpu **pc)
-{
- void *freelist;
- struct kmem_cache_cpu *c = *pc;
- struct page *page;
-
- WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
-
- freelist = get_partial(s, flags, node, c);
-
- if (freelist)
- return freelist;
-
- page = new_slab(s, flags, node);
- if (page) {
- c = raw_cpu_ptr(s->cpu_slab);
- if (c->page)
- flush_slab(s, c);
-
- /*
- * No other reference to the page yet so we can
- * muck around with it freely without cmpxchg
- */
- freelist = page->freelist;
- page->freelist = NULL;
-
- stat(s, ALLOC_SLAB);
- c->page = page;
- *pc = c;
- }
-
- return freelist;
-}
-
static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
{
if (unlikely(PageSlabPfmemalloc(page)))
@@ -2671,7 +2774,7 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
* we need to allocate a new slab. This is the slowest path since it involves
* a call to the page allocator and the setup of a new slab.
*
- * Version of __slab_alloc to use when we know that interrupts are
+ * Version of __slab_alloc to use when we know that preemption is
* already disabled (which is the case for bulk allocation).
*/
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
@@ -2679,10 +2782,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
{
void *freelist;
struct page *page;
+ unsigned long flags;
stat(s, ALLOC_SLOWPATH);
- page = c->page;
+reread_page:
+
+ page = READ_ONCE(c->page);
if (!page) {
/*
* if the node is not online or has no normal memory, just
@@ -2705,8 +2811,7 @@ redo:
goto redo;
} else {
stat(s, ALLOC_NODE_MISMATCH);
- deactivate_slab(s, page, c->freelist, c);
- goto new_slab;
+ goto deactivate_slab;
}
}
@@ -2715,12 +2820,15 @@ redo:
* PFMEMALLOC but right now, we are losing the pfmemalloc
* information when the page leaves the per-cpu allocator
*/
- if (unlikely(!pfmemalloc_match(page, gfpflags))) {
- deactivate_slab(s, page, c->freelist, c);
- goto new_slab;
+ if (unlikely(!pfmemalloc_match(page, gfpflags)))
+ goto deactivate_slab;
+
+ /* must check again c->page in case we got preempted and it changed */
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
+ if (unlikely(page != c->page)) {
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ goto reread_page;
}
-
- /* must check again c->freelist in case of cpu migration or IRQ */
freelist = c->freelist;
if (freelist)
goto load_freelist;
@@ -2729,6 +2837,7 @@ redo:
if (!freelist) {
c->page = NULL;
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
}
@@ -2736,6 +2845,13 @@ redo:
stat(s, ALLOC_REFILL);
load_freelist:
+
+#ifdef CONFIG_PREEMPT_RT
+ lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock.lock));
+#else
+ lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
+#endif
+
/*
* freelist is pointing to the list of objects to be used.
* page is pointing to the page from which the objects are obtained.
@@ -2744,59 +2860,141 @@ load_freelist:
VM_BUG_ON(!c->page->frozen);
c->freelist = get_freepointer(s, freelist);
c->tid = next_tid(c->tid);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
return freelist;
+deactivate_slab:
+
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
+ if (page != c->page) {
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ goto reread_page;
+ }
+ freelist = c->freelist;
+ c->page = NULL;
+ c->freelist = NULL;
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ deactivate_slab(s, page, freelist);
+
new_slab:
if (slub_percpu_partial(c)) {
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
+ if (unlikely(c->page)) {
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ goto reread_page;
+ }
+ if (unlikely(!slub_percpu_partial(c))) {
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ /* we were preempted and partial list got empty */
+ goto new_objects;
+ }
+
page = c->page = slub_percpu_partial(c);
slub_set_percpu_partial(c, page);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
stat(s, CPU_PARTIAL_ALLOC);
goto redo;
}
- freelist = new_slab_objects(s, gfpflags, node, &c);
+new_objects:
+
+ freelist = get_partial(s, gfpflags, node, &page);
+ if (freelist)
+ goto check_new_page;
- if (unlikely(!freelist)) {
+ slub_put_cpu_ptr(s->cpu_slab);
+ page = new_slab(s, gfpflags, node);
+ c = slub_get_cpu_ptr(s->cpu_slab);
+
+ if (unlikely(!page)) {
slab_out_of_memory(s, gfpflags, node);
return NULL;
}
- page = c->page;
- if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
- goto load_freelist;
+ /*
+ * No other reference to the page yet so we can
+ * muck around with it freely without cmpxchg
+ */
+ freelist = page->freelist;
+ page->freelist = NULL;
- /* Only entered in the debug case */
- if (kmem_cache_debug(s) &&
- !alloc_debug_processing(s, page, freelist, addr))
- goto new_slab; /* Slab failed checks. Next slab needed */
+ stat(s, ALLOC_SLAB);
+
+check_new_page:
+
+ if (kmem_cache_debug(s)) {
+ if (!alloc_debug_processing(s, page, freelist, addr)) {
+ /* Slab failed checks. Next slab needed */
+ goto new_slab;
+ } else {
+ /*
+ * For debug case, we don't load freelist so that all
+ * allocations go through alloc_debug_processing()
+ */
+ goto return_single;
+ }
+ }
+
+ if (unlikely(!pfmemalloc_match(page, gfpflags)))
+ /*
+ * For !pfmemalloc_match() case we don't load freelist so that
+ * we don't make further mismatched allocations easier.
+ */
+ goto return_single;
+
+retry_load_page:
+
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
+ if (unlikely(c->page)) {
+ void *flush_freelist = c->freelist;
+ struct page *flush_page = c->page;
+
+ c->page = NULL;
+ c->freelist = NULL;
+ c->tid = next_tid(c->tid);
+
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+
+ deactivate_slab(s, flush_page, flush_freelist);
- deactivate_slab(s, page, get_freepointer(s, freelist), c);
+ stat(s, CPUSLAB_FLUSH);
+
+ goto retry_load_page;
+ }
+ c->page = page;
+
+ goto load_freelist;
+
+return_single:
+
+ deactivate_slab(s, page, get_freepointer(s, freelist));
return freelist;
}
/*
- * Another one that disabled interrupt and compensates for possible
- * cpu changes by refetching the per cpu area pointer.
+ * A wrapper for ___slab_alloc() for contexts where preemption is not yet
+ * disabled. Compensates for possible cpu changes by refetching the per cpu area
+ * pointer.
*/
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c)
{
void *p;
- unsigned long flags;
- local_irq_save(flags);
-#ifdef CONFIG_PREEMPTION
+#ifdef CONFIG_PREEMPT_COUNT
/*
* We may have been preempted and rescheduled on a different
- * cpu before disabling interrupts. Need to reload cpu area
+ * cpu before disabling preemption. Need to reload cpu area
* pointer.
*/
- c = this_cpu_ptr(s->cpu_slab);
+ c = slub_get_cpu_ptr(s->cpu_slab);
#endif
p = ___slab_alloc(s, gfpflags, node, addr, c);
- local_irq_restore(flags);
+#ifdef CONFIG_PREEMPT_COUNT
+ slub_put_cpu_ptr(s->cpu_slab);
+#endif
return p;
}
@@ -2847,15 +3045,14 @@ redo:
* reading from one cpu area. That does not matter as long
* as we end up on the original cpu again when doing the cmpxchg.
*
- * We should guarantee that tid and kmem_cache are retrieved on
- * the same cpu. It could be different if CONFIG_PREEMPTION so we need
- * to check if it is matched or not.
+ * We must guarantee that tid and kmem_cache_cpu are retrieved on the
+ * same cpu. We read first the kmem_cache_cpu pointer and use it to read
+ * the tid. If we are preempted and switched to another cpu between the
+ * two reads, it's OK as the two are still associated with the same cpu
+ * and cmpxchg later will validate the cpu.
*/
- do {
- tid = this_cpu_read(s->cpu_slab->tid);
- c = raw_cpu_ptr(s->cpu_slab);
- } while (IS_ENABLED(CONFIG_PREEMPTION) &&
- unlikely(tid != READ_ONCE(c->tid)));
+ c = raw_cpu_ptr(s->cpu_slab);
+ tid = READ_ONCE(c->tid);
/*
* Irqless object alloc/free algorithm used here depends on sequence
@@ -2876,7 +3073,15 @@ redo:
object = c->freelist;
page = c->page;
- if (unlikely(!object || !page || !node_match(page, node))) {
+ /*
+ * We cannot use the lockless fastpath on PREEMPT_RT because if a
+ * slowpath has taken the local_lock_irqsave(), it is not protected
+ * against a fast path operation in an irq handler. So we need to take
+ * the slow path which uses local_lock. It is still relatively fast if
+ * there is a suitable cpu freelist.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) ||
+ unlikely(!object || !page || !node_match(page, node))) {
object = __slab_alloc(s, gfpflags, node, addr, c);
} else {
void *next_object = get_freepointer_safe(s, object);
@@ -3129,16 +3334,14 @@ redo:
* data is retrieved via this pointer. If we are on the same cpu
* during the cmpxchg then the free will succeed.
*/
- do {
- tid = this_cpu_read(s->cpu_slab->tid);
- c = raw_cpu_ptr(s->cpu_slab);
- } while (IS_ENABLED(CONFIG_PREEMPTION) &&
- unlikely(tid != READ_ONCE(c->tid)));
+ c = raw_cpu_ptr(s->cpu_slab);
+ tid = READ_ONCE(c->tid);
/* Same with comment on barrier() in slab_alloc_node() */
barrier();
if (likely(page == c->page)) {
+#ifndef CONFIG_PREEMPT_RT
void **freelist = READ_ONCE(c->freelist);
set_freepointer(s, tail_obj, freelist);
@@ -3151,6 +3354,32 @@ redo:
note_cmpxchg_failure("slab_free", s, tid);
goto redo;
}
+#else /* CONFIG_PREEMPT_RT */
+ /*
+ * We cannot use the lockless fastpath on PREEMPT_RT because if
+ * a slowpath has taken the local_lock_irqsave(), it is not
+ * protected against a fast path operation in an irq handler. So
+ * we need to take the local_lock. We shouldn't simply defer to
+ * __slab_free() as that wouldn't use the cpu freelist at all.
+ */
+ unsigned long flags;
+ void **freelist;
+
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
+ c = this_cpu_ptr(s->cpu_slab);
+ if (unlikely(page != c->page)) {
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ goto redo;
+ }
+ tid = c->tid;
+ freelist = c->freelist;
+
+ set_freepointer(s, tail_obj, freelist);
+ c->freelist = head;
+ c->tid = next_tid(tid);
+
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+#endif
stat(s, FREE_FASTPATH);
} else
__slab_free(s, page, head, tail_obj, cnt, addr);
@@ -3320,8 +3549,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
* IRQs, which protects against PREEMPT and interrupts
* handlers invoking normal fastpath.
*/
- local_irq_disable();
- c = this_cpu_ptr(s->cpu_slab);
+ c = slub_get_cpu_ptr(s->cpu_slab);
+ local_lock_irq(&s->cpu_slab->lock);
for (i = 0; i < size; i++) {
void *object = kfence_alloc(s, s->object_size, flags);
@@ -3342,6 +3571,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
*/
c->tid = next_tid(c->tid);
+ local_unlock_irq(&s->cpu_slab->lock);
+
/*
* Invoking slow path likely have side-effect
* of re-populating per CPU c->freelist
@@ -3354,6 +3585,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
c = this_cpu_ptr(s->cpu_slab);
maybe_wipe_obj_freeptr(s, p[i]);
+ local_lock_irq(&s->cpu_slab->lock);
+
continue; /* goto for-loop */
}
c->freelist = get_freepointer(s, object);
@@ -3361,7 +3594,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
maybe_wipe_obj_freeptr(s, p[i]);
}
c->tid = next_tid(c->tid);
- local_irq_enable();
+ local_unlock_irq(&s->cpu_slab->lock);
+ slub_put_cpu_ptr(s->cpu_slab);
/*
* memcg and kmem_cache debug support and memory initialization.
@@ -3371,7 +3605,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
slab_want_init_on_alloc(flags, s));
return i;
error:
- local_irq_enable();
+ local_unlock_irq(&s->cpu_slab->lock);
slab_post_alloc_hook(s, objcg, flags, i, p, false);
__kmem_cache_free_bulk(s, i, p);
return 0;
@@ -3887,9 +4121,10 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
void *addr = page_address(page);
unsigned long *map;
void *p;
+ unsigned long flags;
slab_err(s, page, text, s->name);
- slab_lock(page);
+ slab_lock(page, &flags);
map = get_map(s, page);
for_each_object(p, s, addr, page->objects) {
@@ -3900,7 +4135,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
}
}
put_map(map);
- slab_unlock(page);
+ slab_unlock(page, &flags);
#endif
}
@@ -4611,33 +4846,33 @@ static int count_total(struct page *page)
#endif
#ifdef CONFIG_SLUB_DEBUG
-static void validate_slab(struct kmem_cache *s, struct page *page)
+static void validate_slab(struct kmem_cache *s, struct page *page,
+ unsigned long *obj_map)
{
void *p;
void *addr = page_address(page);
- unsigned long *map;
+ unsigned long flags;
- slab_lock(page);
+ slab_lock(page, &flags);
if (!check_slab(s, page) || !on_freelist(s, page, NULL))
goto unlock;
/* Now we know that a valid freelist exists */
- map = get_map(s, page);
+ __fill_map(obj_map, s, page);
for_each_object(p, s, addr, page->objects) {
- u8 val = test_bit(__obj_to_index(s, addr, p), map) ?
+ u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ?
SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
if (!check_object(s, page, p, val))
break;
}
- put_map(map);
unlock:
- slab_unlock(page);
+ slab_unlock(page, &flags);
}
static int validate_slab_node(struct kmem_cache *s,
- struct kmem_cache_node *n)
+ struct kmem_cache_node *n, unsigned long *obj_map)
{
unsigned long count = 0;
struct page *page;
@@ -4646,7 +4881,7 @@ static int validate_slab_node(struct kmem_cache *s,
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, slab_list) {
- validate_slab(s, page);
+ validate_slab(s, page, obj_map);
count++;
}
if (count != n->nr_partial)
@@ -4657,7 +4892,7 @@ static int validate_slab_node(struct kmem_cache *s,
goto out;
list_for_each_entry(page, &n->full, slab_list) {
- validate_slab(s, page);
+ validate_slab(s, page, obj_map);
count++;
}
if (count != atomic_long_read(&n->nr_slabs))
@@ -4674,10 +4909,17 @@ static long validate_slab_cache(struct kmem_cache *s)
int node;
unsigned long count = 0;
struct kmem_cache_node *n;
+ unsigned long *obj_map;
+
+ obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
+ if (!obj_map)
+ return -ENOMEM;
flush_all(s);
for_each_kmem_cache_node(s, node, n)
- count += validate_slab_node(s, n);
+ count += validate_slab_node(s, n, obj_map);
+
+ bitmap_free(obj_map);
return count;
}
@@ -4808,17 +5050,17 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
}
static void process_slab(struct loc_track *t, struct kmem_cache *s,
- struct page *page, enum track_item alloc)
+ struct page *page, enum track_item alloc,
+ unsigned long *obj_map)
{
void *addr = page_address(page);
void *p;
- unsigned long *map;
- map = get_map(s, page);
+ __fill_map(obj_map, s, page);
+
for_each_object(p, s, addr, page->objects)
- if (!test_bit(__obj_to_index(s, addr, p), map))
+ if (!test_bit(__obj_to_index(s, addr, p), obj_map))
add_location(t, s, get_track(s, p, alloc));
- put_map(map);
}
static int list_locations(struct kmem_cache *s, char *buf,
@@ -4829,13 +5071,17 @@ static int list_locations(struct kmem_cache *s, char *buf,
struct loc_track t = { 0, 0, NULL };
int node;
struct kmem_cache_node *n;
+ unsigned long *obj_map;
+
+ obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
+ if (!obj_map)
+ return sysfs_emit(buf, "Out of memory\n");
if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
GFP_KERNEL)) {
+ bitmap_free(obj_map);
return sysfs_emit(buf, "Out of memory\n");
}
- /* Push back cpu slabs */
- flush_all(s);
for_each_kmem_cache_node(s, node, n) {
unsigned long flags;
@@ -4846,12 +5092,14 @@ static int list_locations(struct kmem_cache *s, char *buf,
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, slab_list)
- process_slab(&t, s, page, alloc);
+ process_slab(&t, s, page, alloc, obj_map);
list_for_each_entry(page, &n->full, slab_list)
- process_slab(&t, s, page, alloc);
+ process_slab(&t, s, page, alloc, obj_map);
spin_unlock_irqrestore(&n->list_lock, flags);
}
+ bitmap_free(obj_map);
+
for (i = 0; i < t.count; i++) {
struct location *l = &t.loc[i];