From 7dc531576e4189fe0c481b17eab1a81adbbe80c8 Mon Sep 17 00:00:00 2001 From: Binder Makin Date: Wed, 30 Jul 2014 16:58:51 +0800 Subject: [PATCH 1/2] cgroups: Resource controller for open files. lkml https://lkml.org/lkml/2014/7/2/640 upstream Add a resource controller for limiting the number of open file handles. This allows us to catch misbehaving processes and return EMFILE instead of ENOMEM for kernel memory limits. Signed-off-by: Binder Makin Signed-off-by: He Zhe Signed-off-by: Bruce Ashfield --- fs/Makefile | 1 + fs/file.c | 44 ++++++++ fs/filescontrol.c | 247 ++++++++++++++++++++++++++++++++++++++++++ include/linux/cgroup_subsys.h | 5 + include/linux/fdtable.h | 3 + include/linux/filescontrol.h | 32 ++++++ init/Kconfig | 7 ++ 7 files changed, 339 insertions(+) create mode 100644 fs/filescontrol.c create mode 100644 include/linux/filescontrol.h diff --git a/fs/Makefile b/fs/Makefile index ebfe2ee74c41..18eaee081f4b 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_COREDUMP) += coredump.o obj-$(CONFIG_SYSCTL) += drop_caches.o obj-$(CONFIG_FHANDLE) += fhandle.o +obj-$(CONFIG_CGROUP_FILES) += filescontrol.o obj-y += quota/ diff --git a/fs/file.c b/fs/file.c index eb56a13dab3e..4b84070bf8f4 100644 --- a/fs/file.c +++ b/fs/file.c @@ -22,6 +22,7 @@ #include #include #include +#include int sysctl_nr_open __read_mostly = 1024*1024; int sysctl_nr_open_min = BITS_PER_LONG; @@ -264,6 +265,9 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; new_fdt->fd = &newf->fd_array[0]; +#ifdef CONFIG_CGROUP_FILES + files_cgroup_assign(newf); +#endif spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); @@ -340,9 +344,28 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) rcu_assign_pointer(newf->fdt, new_fdt); +#ifdef CONFIG_CGROUP_FILES + if (!files_cgroup_alloc_fd(newf, files_cgroup_count_fds(newf))) + return newf; + +/* could not get enough FD resources. Need to clean up. */ + new_fds = new_fdt->fd; + for (i = open_files; i != 0; i--) { + struct file *f = *new_fds++; + if (f) + fput(f); + } + if (new_fdt != &newf->fdtab) + __free_fdtable(new_fdt); + *errorp = -EMFILE; +#else return newf; +#endif out_release: +#ifdef CONFIG_CGROUP_FILES + files_cgroup_remove(newf); +#endif kmem_cache_free(files_cachep, newf); out: return NULL; @@ -368,6 +391,9 @@ static struct fdtable *close_files(struct files_struct * files) if (set & 1) { struct file * file = xchg(&fdt->fd[i], NULL); if (file) { +#ifdef CONFIG_CGROUP_FILES + files_cgroup_unalloc_fd(files, 1); +#endif filp_close(file, files); cond_resched(); } @@ -486,6 +512,13 @@ repeat: if (error) goto repeat; +#ifdef CONFIG_CGROUP_FILES + if (files_cgroup_alloc_fd(files, 1)) { + error = -EMFILE; + goto out; + } +#endif + if (start <= files->next_fd) files->next_fd = fd + 1; @@ -522,6 +555,10 @@ EXPORT_SYMBOL(get_unused_fd_flags); static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); +#ifdef CONFIG_CGROUP_FILES + if (test_bit(fd, fdt->open_fds)) + files_cgroup_unalloc_fd(files, 1); +#endif __clear_open_fd(fd, fdt); if (fd < files->next_fd) files->next_fd = fd; @@ -780,6 +817,13 @@ static int do_dup2(struct files_struct *files, tofree = fdt->fd[fd]; if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; + +#ifdef CONFIG_CGROUP_FILES + if (!tofree) + if (!files_cgroup_alloc_fd(files, 1)) + goto Ebusy; +#endif + get_file(file); rcu_assign_pointer(fdt->fd[fd], file); __set_open_fd(fd, fdt); diff --git a/fs/filescontrol.c b/fs/filescontrol.c new file mode 100644 index 000000000000..c532d02eb023 --- /dev/null +++ b/fs/filescontrol.c @@ -0,0 +1,247 @@ +/* filescontrol.c - Cgroup controller for open file handles. + * + * Copyright 2014 Google Inc. + * Author: Brian Makin + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct cgroup_subsys files_cgrp_subsys __read_mostly; +EXPORT_SYMBOL(files_cgrp_subsys); + +struct files_cgroup { + struct cgroup_subsys_state css; + struct res_counter open_handles; +}; + +static inline struct files_cgroup *css_fcg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct files_cgroup, css) : NULL; +} + +static inline struct res_counter * +css_res_open_handles(struct cgroup_subsys_state *css) +{ + return &css_fcg(css)->open_handles; +} + +static inline struct files_cgroup * +files_cgroup_from_files(struct files_struct *files) +{ + return files->files_cgroup; +} + +static struct cgroup_subsys_state * +files_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct files_cgroup *fcg; + + fcg = kzalloc(sizeof(*fcg), GFP_KERNEL); + if (!fcg) + goto out; + + if (!parent_css) { + res_counter_init(&fcg->open_handles, NULL); + res_counter_set_limit(&fcg->open_handles, get_max_files()); + } else { + struct files_cgroup *parent_fcg = css_fcg(parent_css); + res_counter_init(&fcg->open_handles, &parent_fcg->open_handles); + res_counter_set_limit(&fcg->open_handles, + res_counter_read_u64(&parent_fcg->open_handles, + RES_LIMIT)); + } + return &fcg->css; + +out: + return ERR_PTR(-ENOMEM); +} + +static void files_cgroup_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_fcg(css)); +} + +u64 files_cgroup_count_fds(struct files_struct *files) +{ + int i; + struct fdtable *fdt; + int retval = 0; + + fdt = files_fdtable(files); + for (i = 0; i < DIV_ROUND_UP(fdt->max_fds, BITS_PER_LONG); i++) + retval += hweight64((__u64)fdt->open_fds[i]); + return retval; +} + +static u64 files_in_taskset(struct cgroup_taskset *tset) +{ + struct task_struct *task; + u64 files = 0; + cgroup_taskset_for_each(task, tset) { + if (!thread_group_leader(task)) + continue; + + task_lock(task); + files += files_cgroup_count_fds(task->files); + task_unlock(task); + } + return files; +} + +/* + * If attaching this cgroup would overcommit the resource then deny + * the attach. + */ +static int files_cgroup_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + u64 files = files_in_taskset(tset); + if (res_counter_margin(css_res_open_handles(css)) < files) + return -ENOMEM; + return 0; +} + +/* + * If resource counts have gone up between can_attach and attach then + * this may overcommit resources. In that case just deny further allocation + * until the resource usage drops. + */ +static void files_cgroup_attach(struct cgroup_subsys_state *to_css, + struct cgroup_taskset *tset) +{ + u64 num_files; + struct task_struct *task = cgroup_taskset_first(tset); + struct cgroup_subsys_state *from_css; + struct res_counter *from_res; + struct res_counter *to_res = css_res_open_handles(to_css); + struct res_counter *fail_res; + struct files_struct *files; + + task_lock(task); + files = task->files; + if (!files) { + task_unlock(task); + return; + } + + from_css = &files_cgroup_from_files(files)->css; + from_res = css_res_open_handles(from_css); + + spin_lock(&files->file_lock); + num_files = files_cgroup_count_fds(files); + res_counter_uncharge(from_res, num_files); + css_put(from_css); + + if (res_counter_charge(to_res, num_files, &fail_res)) + pr_err("Open files limit overcommited\n"); + css_get(to_css); + + task->files->files_cgroup = css_fcg(to_css); + spin_unlock(&files->file_lock); + task_unlock(task); +} + +int files_cgroup_alloc_fd(struct files_struct *files, u64 n) +{ + struct res_counter *fail_res; + struct files_cgroup *files_cgroup = files_cgroup_from_files(files); + + if (res_counter_charge(&files_cgroup->open_handles, n, &fail_res)) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL(files_cgroup_alloc_fd); + +void files_cgroup_unalloc_fd(struct files_struct *files, u64 n) +{ + struct files_cgroup *files_cgroup = files_cgroup_from_files(files); + + res_counter_uncharge(&files_cgroup->open_handles, n); +} +EXPORT_SYMBOL(files_cgroup_unalloc_fd); + +static u64 files_limit_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct files_cgroup *fcg = css_fcg(css); + return res_counter_read_u64(&fcg->open_handles, RES_LIMIT); +} + +static int files_limit_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 value) +{ + struct files_cgroup *fcg = css_fcg(css); + return res_counter_set_limit(&fcg->open_handles, value); +} + +static u64 files_usage_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct files_cgroup *fcg = css_fcg(css); + return res_counter_read_u64(&fcg->open_handles, RES_USAGE); +} + +static struct cftype files[] = { + { + .name = "limit", + .read_u64 = files_limit_read, + .write_u64 = files_limit_write, + }, + { + .name = "usage", + .read_u64 = files_usage_read, + }, + { } +}; + +struct cgroup_subsys files_cgrp_subsys = { + .name = "files", + .css_alloc = files_cgroup_css_alloc, + .css_free = files_cgroup_css_free, + .can_attach = files_cgroup_can_attach, + .attach = files_cgroup_attach, + .base_cftypes = files, +}; + +void files_cgroup_assign(struct files_struct *files) +{ + struct task_struct *tsk = current; + struct cgroup_subsys_state *css; + + task_lock(tsk); + css = task_css(tsk, files_cgrp_id); + css_get(css); + files->files_cgroup = container_of(css, struct files_cgroup, css); + task_unlock(tsk); +} + +void files_cgroup_remove(struct files_struct *files) +{ + struct task_struct *tsk = current; + struct files_cgroup *fcg; + + task_lock(tsk); + spin_lock(&files->file_lock); + fcg = files_cgroup_from_files(files); + css_put(&fcg->css); + spin_unlock(&files->file_lock); + task_unlock(tsk); +} diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 7b99d717411d..e6c9a40f29e3 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -50,6 +50,11 @@ SUBSYS(net_prio) #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_HUGETLB) SUBSYS(hugetlb) #endif + +#if IS_ENABLED(CONFIG_CGROUP_FILES) +SUBSYS(files) +#endif + /* * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. */ diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 70e8e21c0a30..fee8e45e7425 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -57,6 +57,9 @@ struct files_struct { unsigned long close_on_exec_init[1]; unsigned long open_fds_init[1]; struct file __rcu * fd_array[NR_OPEN_DEFAULT]; +#ifdef CONFIG_CGROUP_FILES + struct files_cgroup *files_cgroup; +#endif }; struct file_operations; diff --git a/include/linux/filescontrol.h b/include/linux/filescontrol.h new file mode 100644 index 000000000000..e39ed2aea623 --- /dev/null +++ b/include/linux/filescontrol.h @@ -0,0 +1,32 @@ +/* filescontrol.h - Files Controller + * + * Copyright 2014 Google Inc. + * Author: Brian Makin + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _LINUX_FILESCONTROL_H +#define _LINUX_FILESCONTROL_H + +#include + +#ifdef CONFIG_CGROUP_FILES + +extern int files_cgroup_alloc_fd(struct files_struct *files, u64 n); +extern void files_cgroup_unalloc_fd(struct files_struct *files, u64 n); +extern u64 files_cgroup_count_fds(struct files_struct *files); + +void files_cgroup_assign(struct files_struct *files); +void files_cgroup_remove(struct files_struct *files); + +#endif /* CONFIG_CGROUP_FILES */ +#endif /* _LINUX_FILESCONTROL_H */ diff --git a/init/Kconfig b/init/Kconfig index a4b420996c6b..a8c839203375 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1075,6 +1075,13 @@ config DEBUG_BLK_CGROUP Enable some debugging help. Currently it exports additional stat files in a cgroup which can be useful for debugging. +config CGROUP_FILES + bool "Files Resource Controller for Control Groups" + default n + help + Provides a cgroup resource controller that limits number of open + file handles within a cgroup. + endif # CGROUPS config CHECKPOINT_RESTORE -- 1.8.1.2