Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c  275
1 file changed, 128 insertions(+), 147 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6f442058c064..51507e514636 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -485,7 +485,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
if (lru_gen_enabled()) {
if (soft_limit_excess(memcg))
- lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec);
+ lru_gen_soft_reclaim(memcg, nid);
return;
}
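The call-site change above is purely an interface tweak; assuming the helper
(defined outside this file and not shown in this diff) now does the per-node
lruvec lookup itself, the before/after at call sites is simply:

	/* before: the caller resolved the lruvec */
	lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec);

	/* after: pass (memcg, nid) and let the helper derive the lruvec */
	lru_gen_soft_reclaim(memcg, nid);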
@@ -639,7 +639,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
}
}
-static void do_flush_stats(bool atomic)
+static void do_flush_stats(void)
{
/*
* We always flush the entire tree, so concurrent flushers can just
@@ -652,30 +652,16 @@ static void do_flush_stats(bool atomic)
WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
- if (atomic)
- cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
- else
- cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
+ cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
atomic_set(&stats_flush_threshold, 0);
atomic_set(&stats_flush_ongoing, 0);
}
-static bool should_flush_stats(void)
-{
- return atomic_read(&stats_flush_threshold) > num_online_cpus();
-}
-
void mem_cgroup_flush_stats(void)
{
- if (should_flush_stats())
- do_flush_stats(false);
-}
-
-void mem_cgroup_flush_stats_atomic(void)
-{
- if (should_flush_stats())
- do_flush_stats(true);
+ if (atomic_read(&stats_flush_threshold) > num_online_cpus())
+ do_flush_stats();
}
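With the atomic variant gone, callers that used mem_cgroup_flush_stats_atomic()
(for example the writeback path further down in this diff) simply call
mem_cgroup_flush_stats(), and the flushing interface reduces to one threshold
check feeding one flush routine. Assembled from the hunk above:

	void mem_cgroup_flush_stats(void)
	{
		/* flush only after enough per-CPU updates have accumulated */
		if (atomic_read(&stats_flush_threshold) > num_online_cpus())
			do_flush_stats();
	}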
void mem_cgroup_flush_stats_ratelimited(void)
@@ -690,7 +676,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
* Always flush here so that flushing in latency-sensitive paths is
* as cheap as possible.
*/
- do_flush_stats(false);
+ do_flush_stats();
queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
}
@@ -1273,13 +1259,13 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
*
* This function iterates over tasks attached to @memcg or to any of its
* descendants and calls @fn for each task. If @fn returns a non-zero
- * value, the function breaks the iteration loop and returns the value.
- * Otherwise, it will iterate over all tasks and return 0.
+ * value, the function breaks the iteration loop. Otherwise, it will iterate
+ * over all tasks and return 0.
*
* This function must not be called for the root memory cgroup.
*/
-int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
- int (*fn)(struct task_struct *, void *), void *arg)
+void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+ int (*fn)(struct task_struct *, void *), void *arg)
{
struct mem_cgroup *iter;
int ret = 0;
@@ -1299,7 +1285,6 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
break;
}
}
- return ret;
}
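No caller consumed the old return value, so call sites only lose any unused
assignment. A hedged sketch of a typical user (the OOM evaluation walk; the
callback and context names are assumptions, they are not part of this diff):

	/* walk every task in the hierarchy; a non-zero return from the
	 * callback now only stops the iteration early */
	mem_cgroup_scan_tasks(memcg, oom_evaluate_task, &oc);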
#ifdef CONFIG_DEBUG_VM
@@ -1580,13 +1565,10 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
}
-static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
+static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
- struct seq_buf s;
int i;
- seq_buf_init(&s, buf, bufsize);
-
/*
* Provide statistics on the state of the memory subsystem as
* well as cumulative event counters that show past behavior.
@@ -1603,21 +1585,21 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
u64 size;
size = memcg_page_state_output(memcg, memory_stats[i].idx);
- seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
+ seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size);
if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
size += memcg_page_state_output(memcg,
NR_SLAB_RECLAIMABLE_B);
- seq_buf_printf(&s, "slab %llu\n", size);
+ seq_buf_printf(s, "slab %llu\n", size);
}
}
/* Accumulated memory events */
- seq_buf_printf(&s, "pgscan %lu\n",
+ seq_buf_printf(s, "pgscan %lu\n",
memcg_events(memcg, PGSCAN_KSWAPD) +
memcg_events(memcg, PGSCAN_DIRECT) +
memcg_events(memcg, PGSCAN_KHUGEPAGED));
- seq_buf_printf(&s, "pgsteal %lu\n",
+ seq_buf_printf(s, "pgsteal %lu\n",
memcg_events(memcg, PGSTEAL_KSWAPD) +
memcg_events(memcg, PGSTEAL_DIRECT) +
memcg_events(memcg, PGSTEAL_KHUGEPAGED));
@@ -1627,13 +1609,24 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
memcg_vm_event_stat[i] == PGPGOUT)
continue;
- seq_buf_printf(&s, "%s %lu\n",
+ seq_buf_printf(s, "%s %lu\n",
vm_event_name(memcg_vm_event_stat[i]),
memcg_events(memcg, memcg_vm_event_stat[i]));
}
/* The above should easily fit into one page */
- WARN_ON_ONCE(seq_buf_has_overflowed(&s));
+ WARN_ON_ONCE(seq_buf_has_overflowed(s));
+}
+
+static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
+
+static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
+{
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ memcg_stat_format(memcg, s);
+ else
+ memcg1_stat_format(memcg, s);
+ WARN_ON_ONCE(seq_buf_has_overflowed(s));
}
#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -1671,6 +1664,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
/* Use static buffer, for the caller is holding oom_lock. */
static char buf[PAGE_SIZE];
+ struct seq_buf s;
lockdep_assert_held(&oom_lock);
@@ -1693,8 +1687,9 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
pr_info("Memory cgroup stats for ");
pr_cont_cgroup_path(memcg->css.cgroup);
pr_cont(":");
- memory_stat_format(memcg, buf, sizeof(buf));
- pr_info("%s", buf);
+ seq_buf_init(&s, buf, sizeof(buf));
+ memory_stat_format(memcg, &s);
+ seq_buf_do_printk(&s, KERN_INFO);
}
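The resulting pattern (initialize a seq_buf over a caller-owned buffer, let
memory_stat_format() fill it, hand it to seq_buf_do_printk()) replaces
printing the preformatted page in one go: seq_buf_do_printk() emits the buffer
through printk line by line, so a stats dump larger than a single printk
record is not at risk of truncation. Condensed from the hunk above:

	static char buf[PAGE_SIZE];
	struct seq_buf s;

	seq_buf_init(&s, buf, sizeof(buf));
	memory_stat_format(memcg, &s);
	seq_buf_do_printk(&s, KERN_INFO);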
/*
@@ -2028,26 +2023,12 @@ bool mem_cgroup_oom_synchronize(bool handle)
if (locked)
mem_cgroup_oom_notify(memcg);
- if (locked && !READ_ONCE(memcg->oom_kill_disable)) {
- mem_cgroup_unmark_under_oom(memcg);
- finish_wait(&memcg_oom_waitq, &owait.wait);
- mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
- current->memcg_oom_order);
- } else {
- schedule();
- mem_cgroup_unmark_under_oom(memcg);
- finish_wait(&memcg_oom_waitq, &owait.wait);
- }
+ schedule();
+ mem_cgroup_unmark_under_oom(memcg);
+ finish_wait(&memcg_oom_waitq, &owait.wait);
- if (locked) {
+ if (locked)
mem_cgroup_oom_unlock(memcg);
- /*
- * There is no guarantee that an OOM-lock contender
- * sees the wakeups triggered by the OOM kill
- * uncharges. Wake any sleepers explicitly.
- */
- memcg_oom_recover(memcg);
- }
cleanup:
current->memcg_in_oom = NULL;
css_put(&memcg->css);
@@ -2166,17 +2147,12 @@ again:
* When charge migration first begins, we can have multiple
* critical sections holding the fast-path RCU lock and one
* holding the slowpath move_lock. Track the task who has the
- * move_lock for unlock_page_memcg().
+ * move_lock for folio_memcg_unlock().
*/
memcg->move_lock_task = current;
memcg->move_lock_flags = flags;
}
-void lock_page_memcg(struct page *page)
-{
- folio_memcg_lock(page_folio(page));
-}
-
static void __folio_memcg_unlock(struct mem_cgroup *memcg)
{
if (memcg && memcg->move_lock_task == current) {
@@ -2204,11 +2180,6 @@ void folio_memcg_unlock(struct folio *folio)
__folio_memcg_unlock(folio_memcg(folio));
}
-void unlock_page_memcg(struct page *page)
-{
- folio_memcg_unlock(page_folio(page));
-}
-
struct memcg_stock_pcp {
local_lock_t stock_lock;
struct mem_cgroup *cached; /* this is never the root cgroup */
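With the lock_page_memcg()/unlock_page_memcg() wrappers above removed, any
remaining page-based caller open-codes the folio conversion exactly as the
deleted wrappers did:

	struct folio *folio = page_folio(page);

	folio_memcg_lock(folio);
	/* ... update memcg-charged state of the folio ... */
	folio_memcg_unlock(folio);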
@@ -2275,7 +2246,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
- if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
+ if (memcg == READ_ONCE(stock->cached) && stock->nr_pages >= nr_pages) {
stock->nr_pages -= nr_pages;
ret = true;
}
@@ -2290,7 +2261,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
*/
static void drain_stock(struct memcg_stock_pcp *stock)
{
- struct mem_cgroup *old = stock->cached;
+ struct mem_cgroup *old = READ_ONCE(stock->cached);
if (!old)
return;
@@ -2303,7 +2274,7 @@ static void drain_stock(struct memcg_stock_pcp *stock)
}
css_put(&old->css);
- stock->cached = NULL;
+ WRITE_ONCE(stock->cached, NULL);
}
static void drain_local_stock(struct work_struct *dummy)
@@ -2338,10 +2309,10 @@ static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
struct memcg_stock_pcp *stock;
stock = this_cpu_ptr(&memcg_stock);
- if (stock->cached != memcg) { /* reset if necessary */
+ if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
drain_stock(stock);
css_get(&memcg->css);
- stock->cached = memcg;
+ WRITE_ONCE(stock->cached, memcg);
}
stock->nr_pages += nr_pages;
@@ -2383,7 +2354,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
bool flush = false;
rcu_read_lock();
- memcg = stock->cached;
+ memcg = READ_ONCE(stock->cached);
if (memcg && stock->nr_pages &&
mem_cgroup_is_descendant(memcg, root_memcg))
flush = true;
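The READ_ONCE()/WRITE_ONCE() annotations (applied to stock->cached here and to
stock->cached_objcg in the hunks below) matter because drain_all_stock() peeks
at every other CPU's cached pointer under RCU without taking that CPU's
stock_lock, while the owning CPU updates it under the local lock. Marking both
sides keeps the racy read well-defined. Schematically, assembled from the
hunks in this diff:

	/* owning CPU, under memcg_stock.stock_lock */
	WRITE_ONCE(stock->cached, memcg);

	/* remote, lockless sampling in drain_all_stock() */
	rcu_read_lock();
	memcg = READ_ONCE(stock->cached);
	if (memcg && stock->nr_pages &&
	    mem_cgroup_is_descendant(memcg, root_memcg))
		flush = true;
	rcu_read_unlock();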
@@ -2884,7 +2855,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
*
* - the page lock
* - LRU isolation
- * - lock_page_memcg()
+ * - folio_memcg_lock()
* - exclusive reference
* - mem_cgroup_trylock_pages()
*/
@@ -3208,12 +3179,12 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
* accumulating over a page of vmstat data or when pgdat or idx
* changes.
*/
- if (stock->cached_objcg != objcg) {
+ if (READ_ONCE(stock->cached_objcg) != objcg) {
old = drain_obj_stock(stock);
obj_cgroup_get(objcg);
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
- stock->cached_objcg = objcg;
+ WRITE_ONCE(stock->cached_objcg, objcg);
stock->cached_pgdat = pgdat;
} else if (stock->cached_pgdat != pgdat) {
/* Flush the existing cached vmstat data */
@@ -3267,7 +3238,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
- if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
+ if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
stock->nr_bytes -= nr_bytes;
ret = true;
}
@@ -3279,7 +3250,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
{
- struct obj_cgroup *old = stock->cached_objcg;
+ struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);
if (!old)
return NULL;
@@ -3332,7 +3303,7 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
stock->cached_pgdat = NULL;
}
- stock->cached_objcg = NULL;
+ WRITE_ONCE(stock->cached_objcg, NULL);
/*
* The `old' objects need to be released by the caller via
* obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
@@ -3343,10 +3314,11 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg)
{
+ struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
struct mem_cgroup *memcg;
- if (stock->cached_objcg) {
- memcg = obj_cgroup_memcg(stock->cached_objcg);
+ if (objcg) {
+ memcg = obj_cgroup_memcg(objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
return true;
}
@@ -3365,10 +3337,10 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
- if (stock->cached_objcg != objcg) { /* reset if necessary */
+ if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
old = drain_obj_stock(stock);
obj_cgroup_get(objcg);
- stock->cached_objcg = objcg;
+ WRITE_ONCE(stock->cached_objcg, objcg);
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
allow_uncharge = true; /* Allow uncharge when objcg changes */
@@ -3699,27 +3671,13 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
if (mem_cgroup_is_root(memcg)) {
/*
- * We can reach here from irq context through:
- * uncharge_batch()
- * |--memcg_check_events()
- * |--mem_cgroup_threshold()
- * |--__mem_cgroup_threshold()
- * |--mem_cgroup_usage
- *
- * rstat flushing is an expensive operation that should not be
- * done from irq context; use stale stats in this case.
- * Arguably, usage threshold events are not reliable on the root
- * memcg anyway since its usage is ill-defined.
- *
- * Additionally, other call paths through memcg_check_events()
- * disable irqs, so make sure we are flushing stats atomically.
+ * Approximate root's usage from global state. This isn't
+ * perfect, but the root usage was always an approximation.
*/
- if (in_task())
- mem_cgroup_flush_stats_atomic();
- val = memcg_page_state(memcg, NR_FILE_PAGES) +
- memcg_page_state(memcg, NR_ANON_MAPPED);
+ val = global_node_page_state(NR_FILE_PAGES) +
+ global_node_page_state(NR_ANON_MAPPED);
if (swap)
- val += memcg_page_state(memcg, MEMCG_SWAP);
+ val += total_swap_pages - get_nr_swap_pages();
} else {
if (!swap)
val = page_counter_read(&memcg->memory);
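The replacement reads global VM counters instead of memcg rstat state, so the
root usage estimate needs no stats flush at all; the swap term is simply "swap
currently in use":

	val = global_node_page_state(NR_FILE_PAGES) +
	      global_node_page_state(NR_ANON_MAPPED);
	if (swap)
		/* swap in use = total swap space minus swap still free */
		val += total_swap_pages - get_nr_swap_pages();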
@@ -4135,9 +4093,8 @@ static const unsigned int memcg1_events[] = {
PGMAJFAULT,
};
-static int memcg_stat_show(struct seq_file *m, void *v)
+static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
- struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
unsigned long memory, memsw;
struct mem_cgroup *mi;
unsigned int i;
@@ -4152,18 +4109,18 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
nr = memcg_page_state_local(memcg, memcg1_stats[i]);
- seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
+ seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i],
nr * memcg_page_state_unit(memcg1_stats[i]));
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
- seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
- memcg_events_local(memcg, memcg1_events[i]));
+ seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]),
+ memcg_events_local(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "%s %lu\n", lru_list_name(i),
- memcg_page_state_local(memcg, NR_LRU_BASE + i) *
- PAGE_SIZE);
+ seq_buf_printf(s, "%s %lu\n", lru_list_name(i),
+ memcg_page_state_local(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
/* Hierarchical information */
memory = memsw = PAGE_COUNTER_MAX;
@@ -4171,11 +4128,11 @@ static int memcg_stat_show(struct seq_file *m, void *v)
memory = min(memory, READ_ONCE(mi->memory.max));
memsw = min(memsw, READ_ONCE(mi->memsw.max));
}
- seq_printf(m, "hierarchical_memory_limit %llu\n",
- (u64)memory * PAGE_SIZE);
+ seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
+ (u64)memory * PAGE_SIZE);
if (do_memsw_account())
- seq_printf(m, "hierarchical_memsw_limit %llu\n",
- (u64)memsw * PAGE_SIZE);
+ seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
+ (u64)memsw * PAGE_SIZE);
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
unsigned long nr;
@@ -4183,19 +4140,19 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
nr = memcg_page_state(memcg, memcg1_stats[i]);
- seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
+ seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
(u64)nr * memcg_page_state_unit(memcg1_stats[i]));
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
- seq_printf(m, "total_%s %llu\n",
- vm_event_name(memcg1_events[i]),
- (u64)memcg_events(memcg, memcg1_events[i]));
+ seq_buf_printf(s, "total_%s %llu\n",
+ vm_event_name(memcg1_events[i]),
+ (u64)memcg_events(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "total_%s %llu\n", lru_list_name(i),
- (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
- PAGE_SIZE);
+ seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i),
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
#ifdef CONFIG_DEBUG_VM
{
@@ -4210,12 +4167,10 @@ static int memcg_stat_show(struct seq_file *m, void *v)
anon_cost += mz->lruvec.anon_cost;
file_cost += mz->lruvec.file_cost;
}
- seq_printf(m, "anon_cost %lu\n", anon_cost);
- seq_printf(m, "file_cost %lu\n", file_cost);
+ seq_buf_printf(s, "anon_cost %lu\n", anon_cost);
+ seq_buf_printf(s, "file_cost %lu\n", file_cost);
}
#endif
-
- return 0;
}
static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
@@ -4648,11 +4603,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- /*
- * wb_writeback() takes a spinlock and calls
- * wb_over_bg_thresh()->mem_cgroup_wb_stats(). Do not sleep.
- */
- mem_cgroup_flush_stats_atomic();
+ mem_cgroup_flush_stats();
*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
@@ -5059,6 +5010,8 @@ static int mem_cgroup_slab_show(struct seq_file *m, void *p)
}
#endif
+static int memory_stat_show(struct seq_file *m, void *v);
+
static struct cftype mem_cgroup_legacy_files[] = {
{
.name = "usage_in_bytes",
@@ -5091,7 +5044,7 @@ static struct cftype mem_cgroup_legacy_files[] = {
},
{
.name = "stat",
- .seq_show = memcg_stat_show,
+ .seq_show = memory_stat_show,
},
{
.name = "force_empty",
@@ -5392,7 +5345,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
memcg->deferred_split_queue.split_queue_len = 0;
#endif
- idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
lru_gen_init_memcg(memcg);
return memcg;
fail:
@@ -5464,14 +5416,27 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (alloc_shrinker_info(memcg))
goto offline_kmem;
+ if (unlikely(mem_cgroup_is_root(memcg)))
+ queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
+ FLUSH_TIME);
+ lru_gen_online_memcg(memcg);
+
/* Online state pins memcg ID, memcg ID pins CSS */
refcount_set(&memcg->id.ref, 1);
css_get(css);
- if (unlikely(mem_cgroup_is_root(memcg)))
- queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
- 2UL*HZ);
- lru_gen_online_memcg(memcg);
+ /*
+ * Ensure mem_cgroup_from_id() works once we're fully online.
+ *
+ * We could do this earlier and require callers to filter with
+ * css_tryget_online(). But right now there are no users that
+ * need earlier access, and the workingset code relies on the
+ * cgroup tree linkage (mem_cgroup_get_nr_swap_pages()). So
+ * publish it here at the end of onlining. This matches the
+ * regular ID destruction during offlining.
+ */
+ idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
+
return 0;
offline_kmem:
memcg_offline_kmem(memcg);
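The ordering comment is about ID lookups: mem_cgroup_from_id() is, assuming
the usual implementation (it is not part of this diff), a plain IDR lookup, so
publishing the pointer only at the very end of onlining means a successful
lookup can never hand back a half-initialized memcg:

	/* sketch of the lookup this ordering protects (assumed, not shown here) */
	struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
	{
		WARN_ON_ONCE(!rcu_read_lock_held());
		return idr_find(&mem_cgroup_idr, id);
	}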
@@ -5871,7 +5836,7 @@ static int mem_cgroup_move_account(struct page *page,
* with (un)charging, migration, LRU putback, or anything else
* that would rely on a stable page's memory cgroup.
*
- * Note that lock_page_memcg is a memcg lock, not a page lock,
+ * Note that folio_memcg_lock is a memcg lock, not a page lock,
* to save space. As soon as we switch page's memory cgroup to a
* new memcg that isn't locked, the above state can change
* concurrently again. Make sure we're truly done with it.
@@ -6063,11 +6028,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
return 0;
}
- if (pmd_trans_unstable(pmd))
- return 0;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!pte)
+ return 0;
for (; addr != end; pte++, addr += PAGE_SIZE)
- if (get_mctgt_type(vma, addr, *pte, NULL))
+ if (get_mctgt_type(vma, addr, ptep_get(pte), NULL))
mc.precharge++; /* increment precharge temporarily */
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
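Two conversions meet in this hunk: pte_offset_map_lock() may now fail,
returning NULL when the pmd is no longer a page table (which subsumes the old
pmd_trans_unstable() check), and pte values are read through the ptep_get()
accessor rather than by plain dereference. The same walker pattern appears
again in mem_cgroup_move_charge_pte_range() below:

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!pte)	/* pmd changed under us: nothing to scan here */
		return 0;
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		pte_t ptent = ptep_get(pte);	/* single read via the accessor */
		/* ... examine ptent ... */
	}
	pte_unmap_unlock(pte - 1, ptl);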
@@ -6077,6 +6042,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
static const struct mm_walk_ops precharge_walk_ops = {
.pmd_entry = mem_cgroup_count_precharge_pte_range,
+ .walk_lock = PGWALK_RDLOCK,
};
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
@@ -6283,12 +6249,12 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
return 0;
}
- if (pmd_trans_unstable(pmd))
- return 0;
retry:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!pte)
+ return 0;
for (; addr != end; addr += PAGE_SIZE) {
- pte_t ptent = *(pte++);
+ pte_t ptent = ptep_get(pte++);
bool device = false;
swp_entry_t ent;
@@ -6356,13 +6322,14 @@ put: /* get_mctgt_type() gets & locks the page */
static const struct mm_walk_ops charge_walk_ops = {
.pmd_entry = mem_cgroup_move_charge_pte_range,
+ .walk_lock = PGWALK_RDLOCK,
};
static void mem_cgroup_move_charge(void)
{
lru_add_drain_all();
/*
- * Signal lock_page_memcg() to take the memcg's move_lock
+ * Signal folio_memcg_lock() to take the memcg's move_lock
* while we're moving its pages to another memcg. Then wait
* for already started RCU-only updates to finish.
*/
@@ -6640,10 +6607,12 @@ static int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ struct seq_buf s;
if (!buf)
return -ENOMEM;
- memory_stat_format(memcg, buf, PAGE_SIZE);
+ seq_buf_init(&s, buf, PAGE_SIZE);
+ memory_stat_format(memcg, &s);
seq_puts(m, buf);
kfree(buf);
return 0;
@@ -6902,7 +6871,7 @@ static unsigned long effective_protection(unsigned long usage,
protected = min(usage, setting);
/*
* If all cgroups at this level combined claim and use more
- * protection then what the parent affords them, distribute
+ * protection than what the parent affords them, distribute
* shares in proportion to utilization.
*
* We are using actual utilization rather than the statically
@@ -7427,8 +7396,7 @@ static int __init mem_cgroup_init(void)
for_each_node(node) {
struct mem_cgroup_tree_per_node *rtpn;
- rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
- node_online(node) ? node : NUMA_NO_NODE);
+ rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);
rtpn->rb_root = RB_ROOT;
rtpn->rb_rightmost = NULL;
@@ -7662,6 +7630,14 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}
+static u64 swap_peak_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ return (u64)memcg->swap.watermark * PAGE_SIZE;
+}
+
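swap.peak only reports the page_counter high-water mark; the watermark itself
is maintained by the generic page_counter charge paths, which (an assumption
about code outside this diff) update it roughly as:

	/* in page_counter charging, for each counter walked up the hierarchy */
	if (new > READ_ONCE(c->watermark))
		WRITE_ONCE(c->watermark, new);

Under cgroup v2 the new knob presumably surfaces as memory.swap.peak alongside
memory.swap.current and memory.swap.max.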
static int swap_high_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
@@ -7741,6 +7717,11 @@ static struct cftype swap_files[] = {
.write = swap_max_write,
},
{
+ .name = "swap.peak",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = swap_peak_read,
+ },
+ {
.name = "swap.events",
.flags = CFTYPE_NOT_ON_ROOT,
.file_offset = offsetof(struct mem_cgroup, swap_events_file),