// SPDX-License-Identifier: GPL-2.0
/*
 * SLUB: A slab allocator that limits cache line use instead of queuing
 * objects in per cpu and per node lists.
 *
 * The allocator synchronizes using per slab locks or atomic operations
 * and only uses a centralized lock to manage a pool of partial slabs.
 *
 * (C) 2007 SGI, Christoph Lameter
 * (C) 2011 Linux Foundation, Christoph Lameter
 */

#include <linux/mm.h>
#include <linux/swap.h> /* struct reclaim_state */
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
#include <linux/swab.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kasan.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/kfence.h>
#include <linux/memory.h>
#include <linux/math64.h>
#include <linux/fault-inject.h>
#include <linux/stacktrace.h>
#include <linux/prefetch.h>
#include <linux/memcontrol.h>
#include <linux/random.h>
#include <kunit/test.h>

#include <linux/debugfs.h>
#include <trace/events/kmem.h>

#include "internal.h"

/*
 * Lock order:
 *   1. slab_mutex (Global Mutex)
 *   2. node->list_lock (Spinlock)
 *   3. kmem_cache->cpu_slab->lock (Local lock)
 *   4. slab_lock(page) (Only on some arches or for debugging)
 *   5. object_map_lock (Only for debugging)
 *
 *   slab_mutex
 *
 *   The role of the slab_mutex is to protect the list of all the slabs
 *   and to synchronize major metadata changes to slab cache structures.
 *   Also synchronizes memory hotplug callbacks.
 *
 *   slab_lock
 *
 *   The slab_lock is a wrapper around the page lock, thus it is a bit
 *   spinlock.
 *
 *   The slab_lock is only used for debugging and on arches that do not
 *   have the ability to do a cmpxchg_double. It only protects:
 *	A. page->freelist	-> List of free objects in a page
 *	B. page->inuse		-> Number of objects in use
 *	C. page->objects	-> Number of objects in page
 *	D. page->frozen		-> frozen state
 *
 *   Frozen slabs
 *
 *   If a slab is frozen then it is exempt from list management. It is not
 *   on any list except the per cpu partial list. The processor that froze the
 *   slab is the one who can perform list operations on the page. Other
 *   processors may put objects onto the freelist but the processor that
 *   froze the slab is the only one that can retrieve the objects from the
 *   page's freelist.
 *
 *   list_lock
 *
 *   The list_lock protects the partial and full list on each node and
 *   the partial slab counter. If taken then no new slabs may be added or
 *   removed from the lists nor may the number of partial slabs be modified.
 *   (Note that the total number of slabs is an atomic value that may be
 *   modified without taking the list lock).
 *
 *   The list_lock is a centralized lock and thus we avoid taking it as
 *   much as possible. As long as SLUB does not have to handle partial
 *   slabs, operations can continue without any centralized lock. F.e.
 *   allocating a long series of objects that fill up slabs does not require
 *   the list lock.
 *
 *   cpu_slab->lock local lock
 *
 *   This lock protects slowpath manipulation of all kmem_cache_cpu fields
 *   except the stat counters. This is a percpu structure manipulated only by
 *   the local cpu, so the lock protects against being preempted or interrupted
 *   by an irq. Fast path operations rely on lockless operations instead.
 *   On PREEMPT_RT, the local lock does not actually disable irqs (and thus
 *   prevent the lockless operations), so fastpath operations also need to take
 *   the lock and are no longer lockless.
 *
 *   lockless fastpaths
 *
 *   The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
 *   are fully lockless when satisfied from the percpu slab (and when
 *   cmpxchg_double is possible to use, otherwise slab_lock is taken).
 *   They also don't disable preemption or migration or irqs. They rely on
 *   the transaction id (tid) field to detect being preempted or moved to
 *   another cpu.
 *
 *   irq, preemption, migration considerations
 *
 *   Interrupts are disabled as part of list_lock or local_lock operations, or
 *   around the slab_lock operation, in order to make the slab allocator safe
 *   to use in the context of an irq.
 *
 *   In addition, preemption (or migration on PREEMPT_RT) is disabled in the
 *   allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
 *   local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
 *   doesn't have to be revalidated in each section protected by the local lock.
 *
 * SLUB assigns one slab for allocation to each processor.
 * Allocations only occur from these slabs called cpu slabs.
 *
 * Slabs with free elements are kept on a partial list and during regular
 * operations no list for full slabs is used. If an object in a full slab is
 * freed then the slab will show up again on the partial lists.
 * We track full slabs for debugging purposes though because otherwise we
 * cannot scan all objects.
 *
 * Slabs are freed when they become empty. Teardown and setup is
 * minimal so we rely on the page allocator's per cpu caches for
 * fast frees and allocs.
 *
 * page->frozen		The slab is frozen and exempt from list processing.
 * 			This means that the slab is dedicated to a purpose
 * 			such as satisfying allocations for a specific
 * 			processor. Objects may be freed in the slab while
 * 			it is frozen but slab_free will then skip the usual
 * 			list operations. It is up to the processor holding
 * 			the slab to integrate the slab into the slab lists
 * 			when the slab is no longer needed.
 *
 * 			One use of this flag is to mark slabs that are
 * 			used for allocations. Then such a slab becomes a cpu
 * 			slab. The cpu slab may be equipped with an additional
 * 			freelist that allows lockless access to
 * 			free objects in addition to the regular freelist
 * 			that requires the slab lock.
 *
 * SLAB_DEBUG_FLAGS	Slab requires special handling due to debug
 * 			options set. This moves slab handling out of
 * 			the fast path and disables lockless freelists.
 */
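
/*
 * Editor's illustration (not part of upstream slub.c): the shape of a
 * typical slow path that walks a node's partial list under list_lock, as
 * described in the lock ordering rules above. take_this_slab() is a
 * hypothetical placeholder for whatever policy the caller applies; the
 * real users of this pattern are functions such as get_partial_node().
 *
 *	struct kmem_cache_node *n = get_node(s, node);
 *	struct page *page, *tmp;
 *	unsigned long flags;
 *
 *	spin_lock_irqsave(&n->list_lock, flags);
 *	list_for_each_entry_safe(page, tmp, &n->partial, slab_list) {
 *		if (take_this_slab(s, page))		// hypothetical policy
 *			remove_partial(n, page);	// legal only under list_lock
 *	}
 *	spin_unlock_irqrestore(&n->list_lock, flags);
 */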

/*
 * We could simply use migrate_disable()/enable() but as long as it's a
 * function call even on !PREEMPT_RT, use inline preempt_disable() there.
 */
#ifndef CONFIG_PREEMPT_RT
#define slub_get_cpu_ptr(var)	get_cpu_ptr(var)
#define slub_put_cpu_ptr(var)	put_cpu_ptr(var)
#else
#define slub_get_cpu_ptr(var)		\
({					\
	migrate_disable();		\
	this_cpu_ptr(var);		\
})
#define slub_put_cpu_ptr(var)		\
do {					\
	(void)(var);			\
	migrate_enable();		\
} while (0)
#endif
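
/*
 * Editor's illustration (not part of upstream slub.c): how these wrappers
 * are meant to be used. They pin the task to a cpu (preemption off on
 * !PREEMPT_RT, migration off on PREEMPT_RT) so that the kmem_cache_cpu
 * pointer obtained below stays valid until the matching put:
 *
 *	struct kmem_cache_cpu *c;
 *
 *	c = slub_get_cpu_ptr(s->cpu_slab);
 *	...	operate on *c, e.g. under local_lock_irqsave(&s->cpu_slab->lock, ...)
 *	slub_put_cpu_ptr(s->cpu_slab);
 */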

#ifdef CONFIG_SLUB_DEBUG
#ifdef CONFIG_SLUB_DEBUG_ON
DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
#else
DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
#endif		/* CONFIG_SLUB_DEBUG */

static inline bool kmem_cache_debug(struct kmem_cache *s)
{
	return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
}

void *fixup_red_left(struct kmem_cache *s, void *p)
{
	if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
		p += s->red_left_pad;

	return p;
}

static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
	return !kmem_cache_debug(s);
#else
	return false;
#endif
}

/*
 * Issues still to be resolved:
 *
 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 *
 * - Variable sizing of the per node arrays
 */

/* Enable to log cmpxchg failures */
#undef SLUB_DEBUG_CMPXCHG

/*
 * Minimum number of partial slabs. These will be left on the partial
 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 */
#define MIN_PARTIAL 5

/*
 * Maximum number of desirable partial slabs.
 * The existence of more partial slabs makes kmem_cache_shrink
 * sort the partial list by the number of objects in use.
 */
#define MAX_PARTIAL 10

#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
				SLAB_POISON | SLAB_STORE_USER)

/*
 * These debug flags cannot use CMPXCHG because there might be consistency
 * issues when checking or reading debug information
 */
#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
				SLAB_TRACE)


/*
 * Debugging flags that require metadata to be stored in the slab.  These get
 * disabled when slub_debug=O is used and a cache's min order increases with
 * metadata.
 */
#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)

#define OO_SHIFT	16
#define OO_MASK		((1 << OO_SHIFT) - 1)
#define MAX_OBJS_PER_PAGE	32767 /* since page.objects is u15 */

/* Internal SLUB flags */
/* Poison object */
#define __OBJECT_POISON		((slab_flags_t __force)0x80000000U)
/* Use cmpxchg_double */
#define __CMPXCHG_DOUBLE	((slab_flags_t __force)0x40000000U)

/*
 * Tracking user of a slab.
 */
#define TRACK_ADDRS_COUNT 16
struct track {
	unsigned long addr;	/* Called from address */
#ifdef CONFIG_STACKTRACE
	unsigned long addrs[TRACK_ADDRS_COUNT];	/* Called from address */
#endif
	int cpu;		/* Was running on cpu */
	int pid;		/* Pid context */
	unsigned long when;	/* When did the operation occur */
};

enum track_item { TRACK_ALLOC, TRACK_FREE };

#ifdef CONFIG_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
							{ return 0; }
#endif

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
static void debugfs_slab_add(struct kmem_cache *);
#else
static inline void debugfs_slab_add(struct kmem_cache *s) { }
#endif

static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
	/*
	 * The rmw is racy on a preemptible kernel but this is acceptable, so
	 * avoid this_cpu_add()'s irq-disable overhead.
	 */
	raw_cpu_inc(s->cpu_slab->stat[si]);
#endif
}
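
/*
 * Editor's note (illustrative): callers simply do stat(s, ALLOC_FASTPATH),
 * stat(s, FREE_SLOWPATH), etc. at the relevant points. With CONFIG_SLUB_STATS
 * the per-cpu counters are summed and exposed through sysfs under
 * /sys/kernel/slab/<cache>/, one file per enum stat_item; without it the
 * calls compile away to nothing.
 */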

/*
 * Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
 * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily
 * differ during memory hotplug/hotremove operations.
 * Protected by slab_mutex.
 */
static nodemask_t slab_nodes;

/********************************************************************
 * 			Core slab cache functions
 *******************************************************************/

/*
 * Returns freelist pointer (ptr). With hardening, this is obfuscated
 * with an XOR of the address where the pointer is held and a per-cache
 * random number.
 */
static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
				 unsigned long ptr_addr)
{
#ifdef CONFIG_SLAB_FREELIST_HARDENED
	/*
	 * When CONFIG_KASAN_SW/HW_TAGS is enabled, ptr_addr might be tagged.
	 * Normally, this doesn't cause any issues, as both set_freepointer()
	 * and get_freepointer() are called with a pointer with the same tag.
	 * However, there are some issues with CONFIG_SLUB_DEBUG code. For
	 * example, when __free_slub() iterates over objects in a cache, it
	 * passes untagged pointers to check_object(). check_object() in turn
	 * calls get_freepointer() with an untagged pointer, which causes the
	 * freepointer to be restored incorrectly.
	 */
	return (void *)((unsigned long)ptr ^ s->random ^
			swab((unsigned long)kasan_reset_tag((void *)ptr_addr)));
#else
	return ptr;
#endif
}

/* Returns the freelist pointer recorded at location ptr_addr. */
static inline void *freelist_dereference(const struct kmem_cache *s,
					 void *ptr_addr)
{
	return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr),
			    (unsigned long)ptr_addr);
}

static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
	object = kasan_reset_tag(object);
	return freelist_dereference(s, object + s->offset);
}

static void prefetch_freepointer(const struct kmem_cache *s, void *object)
{
	prefetch(object + s->offset);
}

static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
	unsigned long freepointer_addr;
	void *p;

	if (!debug_pagealloc_enabled_static())
		return get_freepointer(s, object);

	object = kasan_reset_tag(object);
	freepointer_addr = (unsigned long)object + s->offset;
	copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p));
	return freelist_ptr(s, p, freepointer_addr);
}

static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
	unsigned long freeptr_addr = (unsigned long)object + s->offset;

#ifdef CONFIG_SLAB_FREELIST_HARDENED
	BUG_ON(object == fp); /* naive detection of double free or corruption */
#endif

	freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
	*(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
}
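
/*
 * Editor's note (illustrative): with CONFIG_SLAB_FREELIST_HARDENED the
 * obfuscation above is an involution for a fixed slot address, i.e.
 * (x ^ r ^ a) ^ r ^ a == x, which is why a store through set_freepointer()
 * followed by a load through get_freepointer() recovers the original
 * pointer:
 *
 *	void **slot = (void **)(object + s->offset);
 *
 *	*slot = freelist_ptr(s, next, (unsigned long)slot);	// store, encoded
 *	decoded = freelist_ptr(s, *slot, (unsigned long)slot);	// load, decoded
 *	// decoded == next
 */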

/* Loop over all objects in a slab */
#define for_each_object(__p, __s, __addr, __objects) \
	for (__p = fixup_red_left(__s, __addr); \
		__p < (__addr) + (__objects) * (__s)->size; \
		__p += (__s)->size)

static inline unsigned int order_objects(unsigned int order, unsigned int size)
{
	return ((unsigned int)PAGE_SIZE << order) / size;
}

static inline struct kmem_cache_order_objects oo_make(unsigned int order,
		unsigned int size)
{
	struct kmem_cache_order_objects x = {
		(order << OO_SHIFT) + order_objects(order, size)
	};

	return x;
}

static inline unsigned int oo_order(struct kmem_cache_order_objects x)
{
	return x.x >> OO_SHIFT;
}

static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
{
	return x.x & OO_MASK;
}
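
/*
 * Editor's note (worked example, assuming PAGE_SIZE == 4096): a cache whose
 * objects are 256 bytes packed into order-1 slabs gives
 * order_objects(1, 256) == (4096 << 1) / 256 == 32, so oo_make(1, 256)
 * stores x == (1 << OO_SHIFT) + 32. oo_order() then recovers 1 and
 * oo_objects() recovers 32.
 */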

/*
 * Per slab locking using the pagelock
 */
static __always_inline void __slab_lock(struct page *page)
{
	VM_BUG_ON_PAGE(PageTail(page), page);
	bit_spin_lock(PG_locked, &page->flags);
}

static __always_inline void __slab_unlock(struct page *page)
{
	VM_BUG_ON_PAGE(PageTail(page), page);
	__bit_spin_unlock(PG_locked, &page->flags);
}

static __always_inline void slab_lock(struct page *page, unsigned long *flags)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		local_irq_save(*flags);
	__slab_lock(page);
}

static __always_inline void slab_unlock(struct page *page, unsigned long *flags)
{
	__slab_unlock(page);
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		local_irq_restore(*flags);
}

/*
 * Interrupts must be disabled (for the fallback code to work right), typically
 * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different
 * so we disable interrupts as part of slab_[un]lock().
 */
static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
		void *freelist_old, unsigned long counters_old,
		void *freelist_new, unsigned long counters_new,
		const char *n)
{
	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
		lockdep_assert_irqs_disabled();
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
	if (s->flags & __CMPXCHG_DOUBLE) {
		if (cmpxchg_double(&page->freelist, &page->counters,
				   freelist_old, counters_old,
				   freelist_new, counters_new))
			return true;
	} else
#endif
	{
		/* init to 0 to prevent spurious warnings */
		unsigned long flags = 0;

		slab_lock(page, &flags);
		if (page->freelist == freelist_old &&
					page->counters == counters_old) {
			page->freelist = freelist_new;
			page->counters = counters_new;
			slab_unlock(page, &flags);
			return true;
		}
		slab_unlock(page, &flags);
	}

	cpu_relax();
	stat(s, CMPXCHG_DOUBLE_FAIL);

#ifdef SLUB_DEBUG_CMPXCHG
	pr_info("%s %s: cmpxchg double redo ", n, s->name);
#endif

	return false;
}

static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
		void *freelist_old, unsigned long counters_old,
		void *freelist_new, unsigned long counters_new,
		const char *n)
{
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
	if (s->flags & __CMPXCHG_DOUBLE) {
		if (cmpxchg_double(&page->freelist, &page->counters,
				   freelist_old, counters_old,
				   freelist_new, counters_new))
			return true;
	} else
#endif
	{
		unsigned long flags;

		local_irq_save(flags);
		__slab_lock(page);
		if (page->freelist == freelist_old &&
					page->counters == counters_old) {
			page->freelist = freelist_new;
			page->counters = counters_new;
			__slab_unlock(page);
			local_irq_restore(flags);
			return true;
		}
		__slab_unlock(page);
		local_irq_restore(flags);
	}

	cpu_relax();
	stat(s, CMPXCHG_DOUBLE_FAIL);

#ifdef SLUB_DEBUG_CMPXCHG
	pr_info("%s %s: cmpxchg double redo ", n, s->name);
#endif

	return false;
}
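
/*
 * Editor's illustration (not part of upstream slub.c): both helpers above
 * return false when the (freelist, counters) pair changed under the caller,
 * so callers build a read/modify/retry loop around them, roughly:
 *
 *	do {
 *		old.freelist = page->freelist;
 *		old.counters = page->counters;
 *		new.counters = old.counters;
 *		...				// compute new freelist/counters
 *	} while (!cmpxchg_double_slab(s, page,
 *			old.freelist, old.counters,
 *			new.freelist, new.counters,
 *			"some caller"));
 *
 * where old/new hold a freelist pointer plus the counters word, as in
 * __slab_free() and get_freelist().
 */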

#ifdef CONFIG_SLUB_DEBUG
static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
static DEFINE_RAW_SPINLOCK(object_map_lock);

static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
		       struct page *page)
{
	void *addr = page_address(page);
	void *p;

	bitmap_zero(obj_map, page->objects);

	for (p = page->freelist; p; p = get_freepointer(s, p))
		set_bit(__obj_to_index(s, addr, p), obj_map);
}

#if IS_ENABLED(CONFIG_KUNIT)
static bool slab_add_kunit_errors(void)
{
	struct kunit_resource *resource;

	if (likely(!current->kunit_test))
		return false;

	resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
	if (!resource)
		return false;

	(*(int *)resource->data)++;
	kunit_put_resource(resource);
	return true;
}
#else
static inline bool slab_add_kunit_errors(void) { return false; }
#endif

/*
 * Determine a map of objects in use on a page.
 *
 * Node listlock must be held to guarantee that the page does
 * not vanish from under us.
 */
static unsigned long *get_map(struct kmem_cache *s, struct page *page)
	__acquires(&object_map_lock)
{
	VM_BUG_ON(!irqs_disabled());

	raw_spin_lock(&object_map_lock);

	__fill_map(object_map, s, page);

	return object_map;
}

static void put_map(unsigned long *map) __releases(&object_map_lock)
{
	VM_BUG_ON(map != object_map);
	raw_spin_unlock(&object_map_lock);
}
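
/*
 * Editor's illustration (not part of upstream slub.c): the map produced by
 * get_map() flags every free object, so debug code that wants to visit only
 * allocated objects does, roughly:
 *
 *	unsigned long *map = get_map(s, page);
 *	void *addr = page_address(page);
 *	void *p;
 *
 *	for_each_object(p, s, addr, page->objects)
 *		if (!test_bit(__obj_to_index(s, addr, p), map))
 *			check_one_object(s, page, p);	// hypothetical consumer
 *	put_map(map);
 *
 * validate_slab(), for example, walks objects against such a map.
 */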

static inline unsigned int size_from_object(struct kmem_cache *s)
{
	if (s->flags & SLAB_RED_ZONE)
		return s->size - s->red_left_pad;

	return s->size;
}

static inline void *restore_red_left(struct kmem_cache *s, void *p)
{
	if (s->flags & SLAB_RED_ZONE)
		p -= s->red_left_pad;

	return p;
}
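
/*
 * Editor's note (illustrative): restore_red_left() is the inverse of
 * fixup_red_left() above. With SLAB_RED_ZONE each slot begins with
 * s->red_left_pad bytes of left red zone, so e.g. with red_left_pad == 16
 * an object pointer p handed around by the allocator corresponds to the
 * slot that starts at p - 16; check_valid_pointer() relies on this when it
 * verifies that (p - base) is a multiple of s->size.
 */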

/*
 * Debug settings:
 */
#if defined(CONFIG_SLUB_DEBUG_ON)
static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
#else
static slab_flags_t slub_debug;
#endif

static char *slub_debug_string;
static int disable_higher_order_debug;
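
/*
 * Editor's note (illustrative): slub_debug_string holds the "slub_debug"
 * boot parameter. A few example forms, per Documentation/vm/slub.rst:
 *
 *	slub_debug=FZP			sanity checks, red zoning and
 *					poisoning for all caches
 *	slub_debug=U,kmalloc-64		user tracking for one cache only
 *	slub_debug=O			disable debugging for caches that
 *					would need a higher page order
 */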

/*
 * slub is about to manipulate internal object metadata.  This memory lies
 * outside the range of the allocated object, so accessing it would normally
 * be reported by kasan as a bounds error.  metadata_access_enable() is used
 * to tell kasan that these accesses are OK.
 */
static inline void metadata_access_enable(void)
{
	kasan_disable_current();
}

static inline void metadata_access_disable(void)
{
	kasan_enable_current();
}

/*
 * Object debugging
 */

/* Verify that a pointer has an address that is valid within a slab page */
static inline int check_valid_pointer(struct kmem_cache *s,
				struct page *page, void *object)
{
	void *base;

	if (!object)
		return 1;

	base = page_address(page);
	object = kasan_reset_tag(object);
	object = restore_red_left(s, object);
	if (object < base || object >= base + page->objects * s->size ||
		(object - base) % s->size) {
		return 0;
	}

	return 1;
}

static void print_section(char *level, char *text, u8 *addr,
			  unsigned int length)
{
	metadata_access_enable();
	print_hex_dump(level, text, DUMP_PREFIX_ADDRESS,
			16, 1, kasan_reset_tag((void *)addr), length, 1);
	metadata_access_disable();
}

/*
 * See comment in calculate_sizes().
 */
static inline bool freeptr_outside_object(struct kmem_cache *s)
{
	return s->offset >= s->inuse;
}

/*
 * Return offset of the end of info block which is inuse + free pointer if
 * not overlapping with object.
 */
static inline unsigned int get_info_end(struct kmem_cache *s)
{
	if (freeptr_outside_object(s))
		return s->inuse + sizeof(void *);
	else
		return s->inuse;
}
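
/*
 * Editor's note (worked example, 64-bit, illustrative): for a poisoned cache
 * the free pointer cannot live inside the object, so calculate_sizes() places
 * it right after the object area and s->offset == s->inuse. With
 * s->inuse == 64 that makes freeptr_outside_object() true and
 * get_info_end() == 64 + sizeof(void *) == 72, which is where the
 * struct track pair for SLAB_STORE_USER begins (see get_track() below).
 */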

static struct track *get_track(struct kmem_cache *s, void *object,
	enum track_item alloc)
{
	struct track *p;

	p = object + get_info_end(s);

	return kasan_reset_tag(p + alloc);
}

static void set_track(struct kmem_cache *s, void *object,
			enum track_item alloc, unsigned long addr)
{
	struct track *p = get_track(s, object, alloc);

	if (addr) {
#ifdef CONFIG_STACKTRACE
		unsigned int nr_entries;

		metadata_access_enable();
		nr_entries = stack_trace_save(kasan_reset_tag(p->addrs),
					      TRACK_ADDRS_COUNT, 3);
		metadata_access_disable();

		if (nr_entries < TRACK_ADDRS_COUNT)
			p->addrs[nr_entries] = 0;
#endif
		p->addr = addr;
		p->cpu = smp_processor_id();
		p->pid = current->pid;
		p->when = jiffies;
	} else {
		memset(p, 0, sizeof(struct track));
	}
}

static void init_tracking(struct kmem_cache *s, void *object)
{
	if (!(s->flags & SLAB_STORE_USER))
		return;

	set_track(s, object, TRACK_FREE, 0UL);
	set_track(s, object, TRACK_ALLOC, 0UL);
}

static void print_track(const char *s, struct track *t, unsigned long pr_time)
{
	if (!t->addr)
		return;

	pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
	       s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
#ifdef CONFIG_STACKTRACE
	{
		int i;
		for (i = 0; i < TRACK_ADDRS_COUNT; i++)
			if (t->addrs[i])
				pr_err("\t%pS\n", (void *)t->addrs[i]);
			else
				break;
	}
#endif
}

void print_tracking(struct kmem_cache *s, void *object)
{
	unsigned long pr_time = jiffies;
	if (!(s->flags & SLAB_STORE_USER))
		return;

	print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time);
	print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
}

static void print_page_info(struct page *page)
{
	pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n",
	       page, page->objects, page->inuse, page->freelist,
	       page->flags, &page->flags);

}

static void slab_bug(struct kmem_cache *s, char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	pr_err("=============================================================================\n");
	pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
	pr_err("-----------------------------------------------------------------------------\n\n");
	va_end(args);
}

__printf(2, 3)
static void slab_fix(struct kmem_cache *s, char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	if (slab_add_kunit_errors())
		return;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	pr_err("FIX %s: %pV\n", s->name, &vaf);
	va_end(args);
}

static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
			       void **freelist, void *nextfree)
{
	if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
	    !check_valid_pointer(s, page, nextfree) && freelist) {
		object_err(s, page, *freelist, "Freechain corrupt");
		*freelist = NULL;
		slab_fix(s, "Isolate corrupted freechain");
		return true;
	}

	return false;
}

static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
{
	unsigned int off;	/* Offset of last byte */
	u8 *addr = page_address(page);

	print_tracking(s, p);

	print_page_info(page);

	pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n",
	       p, p - addr, get_freepointer(s, p));

	if (s->flags & SLAB_RED_ZONE)
		print_section(KERN_ERR, "Redzone  ", p - s->red_left_pad,
			      s->red_left_pad);
	else if (p > addr + 16)
		print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);

	print_section(KERN_ERR,         "Object   ", p,
		      min_t(unsigned int, s->object_size, PAGE_SIZE));
	if (s->flags & SLAB_RED_ZONE)
		print_section(KERN_ERR, "Redzone  ", p + s->object_size,
			s->inuse - s->object_size);

	off = get_info_end(s);

	if (s->flags & SLAB_STORE_USER)
		off += 2 * sizeof(struct track);

	off += kasan_metadata_size(s);

	if (off != size_from_object(s))
		/* Beginning of the filler is the free pointer */
		print_section(KERN_ERR, "Padding  ", p + off,
			      size_from_object(s) - off);

	dump_stack();
}

void object_err(struct kmem_cache *s, struct page *page,
			u8 *object, char *reason)
{
	if (slab_add_kunit_errors())
		return;

	slab_bug(s, "%s", reason);
	print_trailer(s, page, object);
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
			const char *fmt, ...)
{
	va_list args;
	char buf[100];

	if (slab_add_kunit_errors())
		return;

	va_start(args, fmt);
	vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);
	slab_bug(s, "%s", buf);
	print_page_info(page);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

static void init_object(struct kmem_cache *s, void *object, u8 val)
{
	u8 *p = kasan_reset_tag(object);

	if (s->flags & SLAB_RED_ZONE)
		memset(p - s->red_left_pad, val, s->red_left_pad);

	if (s->flags & __OBJECT_POISON) {
		memset(p, POISON_FREE, s->object_size - 1);
		p[s->object_size - 1] = POISON_END;
	}

	if (s->flags & SLAB_RED_ZONE)
		memset(p + s->object_size, val, s->inuse - s->object_size);
}

static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
						void *from, void *to)
{
	slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
	memset(from, data, to - from);
}

static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
			u8 *object, char *what,
			u8 *start, unsigned int value, unsigned int bytes)
{
	u8 *fault;
	u8 *end;
	u8 *addr = page_address(page);

	metadata_access_enable();
	fault = memchr_inv(kasan_reset_tag(start), value, bytes);
	metadata_access_disable();
	if (!fault)
		return 1;

	end = start + bytes;
	while (end > fault && end[-1] == value)
		end--;

	if (slab_add_kunit_errors())
		goto skip_bug_print;

	slab_bug(s, "%s overwritten", what);
	pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
					fault, end - 1, fault - addr,
					fault[0], value);
	print_trailer(s, page, object);
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);

skip_bug_print:
	restore_bytes(s, what, value, fault, end);
	return 0;
}

/*
 * Object layout:
 *
 * object address
 * 	Bytes of the object to be managed.
 * 	If the freepointer may overlay the object then the free
 *	pointer is at the middle of the object.
 *
 * 	Poisoning uses 0x6b (POISON_FREE) and the last byte is
 * 	0xa5 (POISON_END)
 *
 * object + s->object_size
 * 	Padding to reach word boundary. This is also used for Redzoning.
 * 	Padding is extended by another word if Redzoning is enabled and
 * 	object_size == inuse.
 *
 * 	We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 * 	0xcc (RED_ACTIVE) for objects in use.
 *
 * object + s->inuse
 * 	Meta data starts here.
 *
 * 	A. Free pointer (if we cannot overwrite object on free)
 * 	B. Tracking data for SLAB_STORE_USER
 *	C. Padding to reach required alignment boundary or at minimum
 * 		one word if debugging is on to be able to detect writes
 * 		before the word boundary.
 *
 *	Padding is done using 0x5a (POISON_INUSE)
 *
 * object + s->size
 * 	Nothing is used beyond s->size.
 *
 * If slabcaches are merged then the object_size and inuse boundaries are mostly
 * ignored. And therefore no slab options that rely on these boundaries
 * may be used with merged slabcaches.
 */

static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
{
	unsigned long off = get_info_end(s);	/* The end of info */

	if (s->flags & SLAB_STORE_USER)
		/* We also have user information there */
		off += 2 * sizeof(struct track);

	off += kasan_metadata_size(s);

	if (size_from_object(s) == off)
		return 1;

	return check_bytes_and_report(s, page, p, "Object padding",
			p + off, POISON_INUSE, size_from_object(s) - off);
}

/* Check the pad bytes at the end of a slab page */
static int slab_pad_check(struct kmem_cache *s, struct page *page)
{
	u8 *start;
	u8 *fault;
	u8 *end;
	u8 *pad;
	int length;
	int remainder;

	if (!(s->flags & SLAB_POISON))
		return 1;

	start = page_address(page);
	length = page_size(page);
	end = start + length;
	remainder = length % s->size;
	if (!remainder)
		return 1;

	pad = end - remainder;
	metadata_access_enable();
	fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder);
	metadata_access_disable();
	if (!fault)
		return 1;
	while (end > fault && end[-1] == POISON_INUSE)
		end--;

	slab_err(s, page, "Padding overwritten. 0x%p-0x%p @offset=%tu",
			fault, end - 1, fault - start);
	print_section(KERN_ERR, "Padding ", pad, remainder);

	restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
	return 0;
}

static int check_object(struct kmem_cache *s, struct page *page,
					void *object, u8 val)