summaryrefslogtreecommitdiffstats
path: root/mm/swap_slots.c
blob: 63a7b4563a57ee3367b8963b1219323a4d3b28c9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
// SPDX-License-Identifier: GPL-2.0
/*
 * Manage cache of swap slots to be used for and returned from
 * swap.
 *
 * Copyright(c) 2016 Intel Corporation.
 *
 * Author: Tim Chen <tim.c.chen@linux.intel.com>
 *
 * We allocate the swap slots from the global pool and put
 * it into local per cpu caches.  This has the advantage
 * of no needing to acquire the swap_info lock every time
 * we need a new slot.
 *
 * There is also opportunity to simply return the slot
 * to local caches without needing to acquire swap_info
 * lock.  We do not reuse the returned slots directly but
 * move them back to the global pool in a batch.  This
 * allows the slots to coaellesce and reduce fragmentation.
 *
 * The swap entry allocated is marked with SWAP_HAS_CACHE
 * flag in map_count that prevents it from being allocated
 * again from the global pool.
 *
 * The swap slots cache is protected by a mutex instead of
 * a spin lock as when we search for slots with scan_swap_map,
 * we can possibly sleep.
 */

#include <linux/swap_slots.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/vmalloc.h>
#include <linux/mutex.h>
#include <linux/mm.h>

static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
static bool	swap_slot_cache_active;
bool	swap_slot_cache_enabled;
static bool	swap_slot_cache_initialized;
static DEFINE_MUTEX(swap_slots_cache_mutex);
/* Serialize swap slots cache enable/disable operations */
static DEFINE_MUTEX(swap_slots_cache_enable_mutex);

static void __drain_swap_slots_cache(unsigned int type);
static void deactivate_swap_slots_cache(void);
static void reactivate_swap_slots_cache(void);

#define use_swap_slot_cache (swap_slot_cache_active && \
		swap_slot_cache_enabled && swap_slot_cache_initialized)
#define SLOTS_CACHE 0x1
#define SLOTS_CACHE_RET 0x2

static void deactivate_swap_slots_cache(void)
{
	mutex_lock(&swap_slots_cache_mutex);
	swap_slot_cache_active = false;
	__drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
	mutex_unlock(&swap_slots_cache_mutex);
}

static void reactivate_swap_slots_cache(void)
{
	mutex_lock(&swap_slots_cache_mutex);
	swap_slot_cache_active = true;
	mutex_unlock(&swap_slots_cache_mutex);
}

/* Must not be called with cpu hot plug lock */
void disable_swap_slots_cache_lock(void)
{
	mutex_lock(&swap_slots_cache_enable_mutex);
	swap_slot_cache_enabled = false;
	if (swap_slot_cache_initialized) {
		/* serialize with cpu hotplug operations */
		get_online_cpus();
		__drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
		put_online_cpus();
	}
}

static void __reenable_swap_slots_cache(void)
{
	swap_slot_cache_enabled = has_usable_swap();
}

void reenable_swap_slots_cache_unlock(void)
{
	__reenable_swap_slots_cache();
	mutex_unlock(&swap_slots_cache_enable_mutex);
}

static bool check_cache_active(void)
{
	long pages;

	if (!swap_slot_cache_enabled || !swap_slot_cache_initialized)
		return false;

	pages = get_nr_swap_pages();
	if (!swap_slot_cache_active) {
		if (pages > num_online_cpus() *
		    THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE)
			reactivate_swap_slots_cache();
		goto out;
	}

	/* if global pool of slot caches too low, deactivate cache */
	if (pages < num_online_cpus() * THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE)
		deactivate_swap_slots_cache();
out:
	return swap_slot_cache_active;
}

static int alloc_swap_slot_cache(unsigned int cpu)
{
	struct swap_slots_cache *cache;
	swp_entry_t *slots, *slots_ret;

	/*
	 * Do allocation outside swap_slots_cache_mutex
	 * as kvzalloc could trigger reclaim and get_swap_page,
	 * which can lock swap_slots_cache_mutex.
	 */
	slots = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t),
			 GFP_KERNEL);
	if (!slots)
		return -ENOMEM;

	slots_ret = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t),
			     GFP_KERNEL);
	if (!slots_ret) {
		kvfree(slots);
		return -ENOMEM;
	}

	mutex_lock(&swap_slots_cache_mutex);
	cache = &per_cpu(swp_slots, cpu);
	if (cache->slots || cache->slots_ret)
		/* cache already allocated */
		goto out;
	if (!cache->lock_initialized) {
		mutex_init(&cache->alloc_lock);
		spin_lock_init(&cache->free_lock);
		cache->lock_initialized = true;
	}
	cache->nr = 0;
	cache->cur = 0;
	cache->n_ret = 0;
	/*
	 * We initialized alloc_lock and free_lock earlier.  We use
	 * !cache->slots or !cache->slots_ret to know if it is safe to acquire
	 * the corresponding lock and use the cache.  Memory barrier below
	 * ensures the assumption.
	 */
	mb();
	cache->slots = slots;
	slots = NULL;
	cache->slots_ret = slots_ret;
	slots_ret = NULL;
out:
	mutex_unlock(&swap_slots_cache_mutex);
	if (slots)
		kvfree(slots);
	if (slots_ret)
		kvfree(slots_ret);
	return 0;
}

static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
				  bool free_slots)
{
	struct swap_slots_cache *cache;
	swp_entry_t *slots = NULL;

	cache = &per_cpu(swp_slots, cpu);
	if ((type & SLOTS_CACHE) && cache->slots) {
		mutex_lock(&cache->alloc_lock);
		swapcache_free_entries(cache->slots + cache->cur, cache->nr);
		cache->cur = 0;
		cache->nr = 0;
		if (free_slots && cache->slots) {
			kvfree(cache->slots);
			cache->slots = NULL;
		}
		mutex_unlock(&cache->alloc_lock);
	}
	if ((type & SLOTS_CACHE_RET) && cache->slots_ret) {
		spin_lock_irq(&cache->free_lock);
		swapcache_free_entries(cache->slots_ret, cache->n_ret);
		cache->n_ret = 0;
		if (free_slots && cache->slots_ret) {
			slots = cache->slots_ret;
			cache->slots_ret = NULL;
		}
		spin_unlock_irq(&cache->free_lock);
		if (slots)
			kvfree(slots);
	}
}

static void __drain_swap_slots_cache(unsigned int type)
{
	unsigned int cpu;

	/*
	 * This function is called during
	 *	1) swapoff, when we have to make sure no
	 *	   left over slots are in cache when we remove
	 *	   a swap device;
	 *      2) disabling of swap slot cache, when we run low
	 *	   on swap slots when allocating memory and need
	 *	   to return swap slots to global pool.
	 *
	 * We cannot acquire cpu hot plug lock here as
	 * this function can be invoked in the cpu
	 * hot plug path:
	 * cpu_up -> lock cpu_hotplug -> cpu hotplug state callback
	 *   -> memory allocation -> direct reclaim -> get_swap_page
	 *   -> drain_swap_slots_cache
	 *
	 * Hence the loop over current online cpu below could miss cpu that
	 * is being brought online but not yet marked as online.
	 * That is okay as we do not schedule and run anything on a
	 * cpu before it has been marked online. Hence, we will not
	 * fill any swap slots in slots cache of such cpu.
	 * There are no slots on such cpu that need to be drained.
	 */
	for_each_online_cpu(cpu)
		drain_slots_cache_cpu(cpu, type, false);
}

static int free_slot_cache(unsigned int cpu)
{
	mutex_lock(&swap_slots_cache_mutex);
	drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true);
	mutex_unlock(&swap_slots_cache_mutex);
	return 0;
}

int enable_swap_slots_cache(void)
{
	int ret = 0;

	mutex_lock(&swap_slots_cache_enable_mutex);
	if (swap_slot_cache_initialized) {
		__reenable_swap_slots_cache();
		goto out_unlock;
	}

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
				alloc_swap_slot_cache, free_slot_cache);
	if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating "
			       "without swap slots cache.\n", __func__))
		goto out_unlock;

	swap_slot_cache_initialized = true;
	__reenable_swap_slots_cache();
out_unlock:
	mutex_unlock(&swap_slots_cache_enable_mutex);
	return 0;
}

/* called with swap slot cache's alloc lock held */
static int refill_swap_slots_cache(struct swap_slots_cache *cache)
{
	if (!use_swap_slot_cache || cache->nr)
		return 0;

	cache->cur = 0;
	if (swap_slot_cache_active)
		cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
					   cache->slots, 1);

	return cache->nr;
}

int free_swap_slot(swp_entry_t entry)
{
	struct swap_slots_cache *cache;

	cache = raw_cpu_ptr(&swp_slots);
	if (likely(use_swap_slot_cache && cache->slots_ret)) {
		spin_lock_irq(&cache->free_lock);
		/* Swap slots cache may be deactivated before acquiring lock */
		if (!use_swap_slot_cache || !cache->slots_ret) {
			spin_unlock_irq(&cache->free_lock);
			goto direct_free;
		}
		if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) {
			/*
			 * Return slots to global pool.
			 * The current swap_map value is SWAP_HAS_CACHE.
			 * Set it to 0 to indicate it is available for
			 * allocation in global pool
			 */
			swapcache_free_entries(cache->slots_ret, cache->n_ret);
			cache->n_ret = 0;
		}
		cache->slots_ret[cache->n_ret++] = entry;
		spin_unlock_irq(&cache->free_lock);
	} else {
direct_free:
		swapcache_free_entries(&entry, 1);
	}

	return 0;
}

swp_entry_t get_swap_page(struct page *page)
{
	swp_entry_t entry, *pentry;
	struct swap_slots_cache *cache;

	entry.val = 0;

	if (PageTransHuge(page)) {
		if (IS_ENABLED(CONFIG_THP_SWAP))
			get_swap_pages(1, &entry, HPAGE_PMD_NR);
		goto out;
	}

	/*
	 * Preemption is allowed here, because we may sleep
	 * in refill_swap_slots_cache().  But it is safe, because
	 * accesses to the per-CPU data structure are protected by the
	 * mutex cache->alloc_lock.
	 *
	 * The alloc path here does not touch cache->slots_ret
	 * so cache->free_lock is not taken.
	 */
	cache = raw_cpu_ptr(&swp_slots);

	if (likely(check_cache_active() && cache->slots)) {
		mutex_lock(&cache->alloc_lock);
		if (cache->slots) {
repeat:
			if (cache->nr) {
				pentry = &cache->slots[cache->cur++];
				entry = *pentry;
				pentry->val = 0;
				cache->nr--;
			} else {
				if (refill_swap_slots_cache(cache))
					goto repeat;
			}
		}
		mutex_unlock(&cache->alloc_lock);
		if (entry.val)
			goto out;
	}

	get_swap_pages(1, &entry, 1);
out:
	if (mem_cgroup_try_charge_swap(page, entry)) {
		put_swap_page(page, entry);
		entry.val = 0;
	}
	return entry;
}
="n">ret ? : err; out_unlock_reading: mutex_unlock(&reading_mutex); out_put: put_rng(rng); goto out; } static const struct file_operations rng_chrdev_ops = { .owner = THIS_MODULE, .open = rng_dev_open, .read = rng_dev_read, .llseek = noop_llseek, }; static const struct attribute_group *rng_dev_groups[]; static struct miscdevice rng_miscdev = { .minor = HWRNG_MINOR, .name = RNG_MODULE_NAME, .nodename = "hwrng", .fops = &rng_chrdev_ops, .groups = rng_dev_groups, }; static int enable_best_rng(void) { int ret = -ENODEV; BUG_ON(!mutex_is_locked(&rng_mutex)); /* rng_list is sorted by quality, use the best (=first) one */ if (!list_empty(&rng_list)) { struct hwrng *new_rng; new_rng = list_entry(rng_list.next, struct hwrng, list); ret = ((new_rng == current_rng) ? 0 : set_current_rng(new_rng)); if (!ret) cur_rng_set_by_user = 0; } else { drop_current_rng(); cur_rng_set_by_user = 0; ret = 0; } return ret; } static ssize_t hwrng_attr_current_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { int err = -ENODEV; struct hwrng *rng; err = mutex_lock_interruptible(&rng_mutex); if (err) return -ERESTARTSYS; if (sysfs_streq(buf, "")) { err = enable_best_rng(); } else { list_for_each_entry(rng, &rng_list, list) { if (sysfs_streq(rng->name, buf)) { cur_rng_set_by_user = 1; err = set_current_rng(rng); break; } } } mutex_unlock(&rng_mutex); return err ? : len; } static ssize_t hwrng_attr_current_show(struct device *dev, struct device_attribute *attr, char *buf) { ssize_t ret; struct hwrng *rng; rng = get_current_rng(); if (IS_ERR(rng)) return PTR_ERR(rng); ret = snprintf(buf, PAGE_SIZE, "%s\n", rng ? rng->name : "none"); put_rng(rng); return ret; } static ssize_t hwrng_attr_available_show(struct device *dev, struct device_attribute *attr, char *buf) { int err; struct hwrng *rng; err = mutex_lock_interruptible(&rng_mutex); if (err) return -ERESTARTSYS; buf[0] = '\0'; list_for_each_entry(rng, &rng_list, list) { strlcat(buf, rng->name, PAGE_SIZE); strlcat(buf, " ", PAGE_SIZE); } strlcat(buf, "\n", PAGE_SIZE); mutex_unlock(&rng_mutex); return strlen(buf); } static ssize_t hwrng_attr_selected_show(struct device *dev, struct device_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE, "%d\n", cur_rng_set_by_user); } static DEVICE_ATTR(rng_current, S_IRUGO | S_IWUSR, hwrng_attr_current_show, hwrng_attr_current_store); static DEVICE_ATTR(rng_available, S_IRUGO, hwrng_attr_available_show, NULL); static DEVICE_ATTR(rng_selected, S_IRUGO, hwrng_attr_selected_show, NULL); static struct attribute *rng_dev_attrs[] = { &dev_attr_rng_current.attr, &dev_attr_rng_available.attr, &dev_attr_rng_selected.attr, NULL }; ATTRIBUTE_GROUPS(rng_dev); static void __exit unregister_miscdev(void) { misc_deregister(&rng_miscdev); } static int __init register_miscdev(void) { return misc_register(&rng_miscdev); } static int hwrng_fillfn(void *unused) { long rc; while (!kthread_should_stop()) { struct hwrng *rng; rng = get_current_rng(); if (IS_ERR(rng) || !rng) break; mutex_lock(&reading_mutex); rc = rng_get_data(rng, rng_fillbuf, rng_buffer_size(), 1); mutex_unlock(&reading_mutex); put_rng(rng); if (rc <= 0) { pr_warn("hwrng: no data available\n"); msleep_interruptible(10000); continue; } /* Outside lock, sure, but y'know: randomness. */ add_hwgenerator_randomness((void *)rng_fillbuf, rc, rc * current_quality * 8 >> 10); } hwrng_fill = NULL; return 0; } static void start_khwrngd(void) { hwrng_fill = kthread_run(hwrng_fillfn, NULL, "hwrng"); if (IS_ERR(hwrng_fill)) { pr_err("hwrng_fill thread creation failed\n"); hwrng_fill = NULL; } } int hwrng_register(struct hwrng *rng) { int err = -EINVAL; struct hwrng *old_rng, *tmp; struct list_head *rng_list_ptr; if (!rng->name || (!rng->data_read && !rng->read)) goto out; mutex_lock(&rng_mutex); /* Must not register two RNGs with the same name. */ err = -EEXIST; list_for_each_entry(tmp, &rng_list, list) { if (strcmp(tmp->name, rng->name) == 0) goto out_unlock; } init_completion(&rng->cleanup_done); complete(&rng->cleanup_done); /* rng_list is sorted by decreasing quality */ list_for_each(rng_list_ptr, &rng_list) { tmp = list_entry(rng_list_ptr, struct hwrng, list); if (tmp->quality < rng->quality) break; } list_add_tail(&rng->list, rng_list_ptr); old_rng = current_rng; err = 0; if (!old_rng || (!cur_rng_set_by_user && rng->quality > old_rng->quality)) { /* * Set new rng as current as the new rng source * provides better entropy quality and was not * chosen by userspace. */ err = set_current_rng(rng); if (err) goto out_unlock; } if (old_rng && !rng->init) { /* * Use a new device's input to add some randomness to * the system. If this rng device isn't going to be * used right away, its init function hasn't been * called yet; so only use the randomness from devices * that don't need an init callback. */ add_early_randomness(rng); } out_unlock: mutex_unlock(&rng_mutex); out: return err; } EXPORT_SYMBOL_GPL(hwrng_register); void hwrng_unregister(struct hwrng *rng) { int err; mutex_lock(&rng_mutex); list_del(&rng->list); if (current_rng == rng) { err = enable_best_rng(); if (err) { drop_current_rng(); cur_rng_set_by_user = 0; } } if (list_empty(&rng_list)) { mutex_unlock(&rng_mutex); if (hwrng_fill) kthread_stop(hwrng_fill); } else mutex_unlock(&rng_mutex); wait_for_completion(&rng->cleanup_done); } EXPORT_SYMBOL_GPL(hwrng_unregister); static void devm_hwrng_release(struct device *dev, void *res) { hwrng_unregister(*(struct hwrng **)res); } static int devm_hwrng_match(struct device *dev, void *res, void *data) { struct hwrng **r = res; if (WARN_ON(!r || !*r)) return 0; return *r == data; } int devm_hwrng_register(struct device *dev, struct hwrng *rng) { struct hwrng **ptr; int error; ptr = devres_alloc(devm_hwrng_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) return -ENOMEM; error = hwrng_register(rng); if (error) { devres_free(ptr); return error; } *ptr = rng; devres_add(dev, ptr); return 0; } EXPORT_SYMBOL_GPL(devm_hwrng_register); void devm_hwrng_unregister(struct device *dev, struct hwrng *rng) { devres_release(dev, devm_hwrng_release, devm_hwrng_match, rng); } EXPORT_SYMBOL_GPL(devm_hwrng_unregister); static int __init hwrng_modinit(void) { int ret = -ENOMEM; /* kmalloc makes this safe for virt_to_page() in virtio_rng.c */ rng_buffer = kmalloc(rng_buffer_size(), GFP_KERNEL); if (!rng_buffer) return -ENOMEM; rng_fillbuf = kmalloc(rng_buffer_size(), GFP_KERNEL); if (!rng_fillbuf) { kfree(rng_buffer); return -ENOMEM; } ret = register_miscdev(); if (ret) { kfree(rng_fillbuf); kfree(rng_buffer); } return ret; } static void __exit hwrng_modexit(void) { mutex_lock(&rng_mutex); BUG_ON(current_rng); kfree(rng_buffer); kfree(rng_fillbuf); mutex_unlock(&rng_mutex); unregister_miscdev(); } module_init(hwrng_modinit); module_exit(hwrng_modexit); MODULE_DESCRIPTION("H/W Random Number Generator (RNG) driver"); MODULE_LICENSE("GPL");