diff options
Diffstat (limited to 'ipc')
-rw-r--r-- | ipc/ipc_sysctl.c | 205 | ||||
-rw-r--r-- | ipc/mq_sysctl.c | 123 | ||||
-rw-r--r-- | ipc/mqueue.c | 118 | ||||
-rw-r--r-- | ipc/msg.c | 60 | ||||
-rw-r--r-- | ipc/msgutil.c | 2 | ||||
-rw-r--r-- | ipc/namespace.c | 59 | ||||
-rw-r--r-- | ipc/sem.c | 163 | ||||
-rw-r--r-- | ipc/shm.c | 310 | ||||
-rw-r--r-- | ipc/util.c | 109 | ||||
-rw-r--r-- | ipc/util.h | 10 |
10 files changed, 728 insertions, 431 deletions
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 3f312bf2b116..8c62e443f78b 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -10,46 +10,20 @@ #include <linux/nsproxy.h> #include <linux/sysctl.h> #include <linux/uaccess.h> +#include <linux/capability.h> #include <linux/ipc_namespace.h> #include <linux/msg.h> +#include <linux/slab.h> #include "util.h" -static void *get_ipc(struct ctl_table *table) -{ - char *which = table->data; - struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; - which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns; - return which; -} - -#ifdef CONFIG_PROC_SYSCTL -static int proc_ipc_dointvec(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table ipc_table; - - memcpy(&ipc_table, table, sizeof(ipc_table)); - ipc_table.data = get_ipc(table); - - return proc_dointvec(&ipc_table, write, buffer, lenp, ppos); -} - -static int proc_ipc_dointvec_minmax(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table ipc_table; - - memcpy(&ipc_table, table, sizeof(ipc_table)); - ipc_table.data = get_ipc(table); - - return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); -} - static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct ipc_namespace *ns = current->nsproxy->ipc_ns; - int err = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos); + struct ipc_namespace *ns = + container_of(table->data, struct ipc_namespace, shm_rmid_forced); + int err; + + err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (err < 0) return err; @@ -58,17 +32,6 @@ static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, return err; } -static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table ipc_table; - memcpy(&ipc_table, table, sizeof(ipc_table)); - ipc_table.data = get_ipc(table); - - return proc_doulongvec_minmax(&ipc_table, write, buffer, - lenp, ppos); -} - static int proc_ipc_auto_msgmni(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -87,14 +50,15 @@ static int proc_ipc_auto_msgmni(struct ctl_table *table, int write, static int proc_ipc_sem_dointvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { + struct ipc_namespace *ns = + container_of(table->data, struct ipc_namespace, sem_ctls); int ret, semmni; - struct ipc_namespace *ns = current->nsproxy->ipc_ns; semmni = ns->sem_ctls[3]; - ret = proc_ipc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret) - ret = sem_check_semmni(current->nsproxy->ipc_ns); + ret = sem_check_semmni(ns); /* * Reset the semmni value if an error happens. @@ -104,40 +68,31 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write, return ret; } -#else -#define proc_ipc_doulongvec_minmax NULL -#define proc_ipc_dointvec NULL -#define proc_ipc_dointvec_minmax NULL -#define proc_ipc_dointvec_minmax_orphans NULL -#define proc_ipc_auto_msgmni NULL -#define proc_ipc_sem_dointvec NULL -#endif - int ipc_mni = IPCMNI; int ipc_mni_shift = IPCMNI_SHIFT; int ipc_min_cycle = RADIX_TREE_MAP_SIZE; -static struct ctl_table ipc_kern_table[] = { +static struct ctl_table ipc_sysctls[] = { { .procname = "shmmax", .data = &init_ipc_ns.shm_ctlmax, .maxlen = sizeof(init_ipc_ns.shm_ctlmax), .mode = 0644, - .proc_handler = proc_ipc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "shmall", .data = &init_ipc_ns.shm_ctlall, .maxlen = sizeof(init_ipc_ns.shm_ctlall), .mode = 0644, - .proc_handler = proc_ipc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "shmmni", .data = &init_ipc_ns.shm_ctlmni, .maxlen = sizeof(init_ipc_ns.shm_ctlmni), .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &ipc_mni, }, @@ -155,7 +110,7 @@ static struct ctl_table ipc_kern_table[] = { .data = &init_ipc_ns.msg_ctlmax, .maxlen = sizeof(init_ipc_ns.msg_ctlmax), .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, @@ -164,7 +119,7 @@ static struct ctl_table ipc_kern_table[] = { .data = &init_ipc_ns.msg_ctlmni, .maxlen = sizeof(init_ipc_ns.msg_ctlmni), .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &ipc_mni, }, @@ -182,7 +137,7 @@ static struct ctl_table ipc_kern_table[] = { .data = &init_ipc_ns.msg_ctlmnb, .maxlen = sizeof(init_ipc_ns.msg_ctlmnb), .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, @@ -198,8 +153,8 @@ static struct ctl_table ipc_kern_table[] = { .procname = "sem_next_id", .data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id), - .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, + .mode = 0444, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, @@ -207,8 +162,8 @@ static struct ctl_table ipc_kern_table[] = { .procname = "msg_next_id", .data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id), - .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, + .mode = 0444, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, @@ -216,8 +171,8 @@ static struct ctl_table ipc_kern_table[] = { .procname = "shm_next_id", .data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id), - .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, + .mode = 0444, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, @@ -225,18 +180,114 @@ static struct ctl_table ipc_kern_table[] = { {} }; -static struct ctl_table ipc_root_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = ipc_kern_table, - }, - {} +static struct ctl_table_set *set_lookup(struct ctl_table_root *root) +{ + return ¤t->nsproxy->ipc_ns->ipc_set; +} + +static int set_is_seen(struct ctl_table_set *set) +{ + return ¤t->nsproxy->ipc_ns->ipc_set == set; +} + +static int ipc_permissions(struct ctl_table_header *head, struct ctl_table *table) +{ + int mode = table->mode; + +#ifdef CONFIG_CHECKPOINT_RESTORE + struct ipc_namespace *ns = current->nsproxy->ipc_ns; + + if (((table->data == &ns->ids[IPC_SEM_IDS].next_id) || + (table->data == &ns->ids[IPC_MSG_IDS].next_id) || + (table->data == &ns->ids[IPC_SHM_IDS].next_id)) && + checkpoint_restore_ns_capable(ns->user_ns)) + mode = 0666; +#endif + return mode; +} + +static struct ctl_table_root set_root = { + .lookup = set_lookup, + .permissions = ipc_permissions, }; +bool setup_ipc_sysctls(struct ipc_namespace *ns) +{ + struct ctl_table *tbl; + + setup_sysctl_set(&ns->ipc_set, &set_root, set_is_seen); + + tbl = kmemdup(ipc_sysctls, sizeof(ipc_sysctls), GFP_KERNEL); + if (tbl) { + int i; + + for (i = 0; i < ARRAY_SIZE(ipc_sysctls); i++) { + if (tbl[i].data == &init_ipc_ns.shm_ctlmax) + tbl[i].data = &ns->shm_ctlmax; + + else if (tbl[i].data == &init_ipc_ns.shm_ctlall) + tbl[i].data = &ns->shm_ctlall; + + else if (tbl[i].data == &init_ipc_ns.shm_ctlmni) + tbl[i].data = &ns->shm_ctlmni; + + else if (tbl[i].data == &init_ipc_ns.shm_rmid_forced) + tbl[i].data = &ns->shm_rmid_forced; + + else if (tbl[i].data == &init_ipc_ns.msg_ctlmax) + tbl[i].data = &ns->msg_ctlmax; + + else if (tbl[i].data == &init_ipc_ns.msg_ctlmni) + tbl[i].data = &ns->msg_ctlmni; + + else if (tbl[i].data == &init_ipc_ns.msg_ctlmnb) + tbl[i].data = &ns->msg_ctlmnb; + + else if (tbl[i].data == &init_ipc_ns.sem_ctls) + tbl[i].data = &ns->sem_ctls; +#ifdef CONFIG_CHECKPOINT_RESTORE + else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id) + tbl[i].data = &ns->ids[IPC_SEM_IDS].next_id; + + else if (tbl[i].data == &init_ipc_ns.ids[IPC_MSG_IDS].next_id) + tbl[i].data = &ns->ids[IPC_MSG_IDS].next_id; + + else if (tbl[i].data == &init_ipc_ns.ids[IPC_SHM_IDS].next_id) + tbl[i].data = &ns->ids[IPC_SHM_IDS].next_id; +#endif + else + tbl[i].data = NULL; + } + + ns->ipc_sysctls = __register_sysctl_table(&ns->ipc_set, + "kernel", tbl, + ARRAY_SIZE(ipc_sysctls)); + } + if (!ns->ipc_sysctls) { + kfree(tbl); + retire_sysctl_set(&ns->ipc_set); + return false; + } + + return true; +} + +void retire_ipc_sysctls(struct ipc_namespace *ns) +{ + struct ctl_table *tbl; + + tbl = ns->ipc_sysctls->ctl_table_arg; + unregister_sysctl_table(ns->ipc_sysctls); + retire_sysctl_set(&ns->ipc_set); + kfree(tbl); +} + static int __init ipc_sysctl_init(void) { - register_sysctl_table(ipc_root_table); + if (!setup_ipc_sysctls(&init_ipc_ns)) { + pr_warn("ipc sysctl registration failed\n"); + return -ENOMEM; + } return 0; } diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c index 72a92a08c848..ebb5ed81c151 100644 --- a/ipc/mq_sysctl.c +++ b/ipc/mq_sysctl.c @@ -9,39 +9,9 @@ #include <linux/ipc_namespace.h> #include <linux/sysctl.h> -#ifdef CONFIG_PROC_SYSCTL -static void *get_mq(struct ctl_table *table) -{ - char *which = table->data; - struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; - which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns; - return which; -} - -static int proc_mq_dointvec(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table mq_table; - memcpy(&mq_table, table, sizeof(mq_table)); - mq_table.data = get_mq(table); - - return proc_dointvec(&mq_table, write, buffer, lenp, ppos); -} - -static int proc_mq_dointvec_minmax(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table mq_table; - memcpy(&mq_table, table, sizeof(mq_table)); - mq_table.data = get_mq(table); - - return proc_dointvec_minmax(&mq_table, write, buffer, - lenp, ppos); -} -#else -#define proc_mq_dointvec NULL -#define proc_mq_dointvec_minmax NULL -#endif +#include <linux/stat.h> +#include <linux/capability.h> +#include <linux/slab.h> static int msg_max_limit_min = MIN_MSGMAX; static int msg_max_limit_max = HARD_MSGMAX; @@ -55,14 +25,14 @@ static struct ctl_table mq_sysctls[] = { .data = &init_ipc_ns.mq_queues_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_mq_dointvec, + .proc_handler = proc_dointvec, }, { .procname = "msg_max", .data = &init_ipc_ns.mq_msg_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_mq_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = &msg_max_limit_min, .extra2 = &msg_max_limit_max, }, @@ -71,7 +41,7 @@ static struct ctl_table mq_sysctls[] = { .data = &init_ipc_ns.mq_msgsize_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_mq_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = &msg_maxsize_limit_min, .extra2 = &msg_maxsize_limit_max, }, @@ -80,7 +50,7 @@ static struct ctl_table mq_sysctls[] = { .data = &init_ipc_ns.mq_msg_default, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_mq_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = &msg_max_limit_min, .extra2 = &msg_max_limit_max, }, @@ -89,32 +59,75 @@ static struct ctl_table mq_sysctls[] = { .data = &init_ipc_ns.mq_msgsize_default, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_mq_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = &msg_maxsize_limit_min, .extra2 = &msg_maxsize_limit_max, }, {} }; -static struct ctl_table mq_sysctl_dir[] = { - { - .procname = "mqueue", - .mode = 0555, - .child = mq_sysctls, - }, - {} -}; +static struct ctl_table_set *set_lookup(struct ctl_table_root *root) +{ + return ¤t->nsproxy->ipc_ns->mq_set; +} -static struct ctl_table mq_sysctl_root[] = { - { - .procname = "fs", - .mode = 0555, - .child = mq_sysctl_dir, - }, - {} +static int set_is_seen(struct ctl_table_set *set) +{ + return ¤t->nsproxy->ipc_ns->mq_set == set; +} + +static struct ctl_table_root set_root = { + .lookup = set_lookup, }; -struct ctl_table_header *mq_register_sysctl_table(void) +bool setup_mq_sysctls(struct ipc_namespace *ns) { - return register_sysctl_table(mq_sysctl_root); + struct ctl_table *tbl; + + setup_sysctl_set(&ns->mq_set, &set_root, set_is_seen); + + tbl = kmemdup(mq_sysctls, sizeof(mq_sysctls), GFP_KERNEL); + if (tbl) { + int i; + + for (i = 0; i < ARRAY_SIZE(mq_sysctls); i++) { + if (tbl[i].data == &init_ipc_ns.mq_queues_max) + tbl[i].data = &ns->mq_queues_max; + + else if (tbl[i].data == &init_ipc_ns.mq_msg_max) + tbl[i].data = &ns->mq_msg_max; + + else if (tbl[i].data == &init_ipc_ns.mq_msgsize_max) + tbl[i].data = &ns->mq_msgsize_max; + + else if (tbl[i].data == &init_ipc_ns.mq_msg_default) + tbl[i].data = &ns->mq_msg_default; + + else if (tbl[i].data == &init_ipc_ns.mq_msgsize_default) + tbl[i].data = &ns->mq_msgsize_default; + else + tbl[i].data = NULL; + } + + ns->mq_sysctls = __register_sysctl_table(&ns->mq_set, + "fs/mqueue", tbl, + ARRAY_SIZE(mq_sysctls)); + } + if (!ns->mq_sysctls) { + kfree(tbl); + retire_sysctl_set(&ns->mq_set); + return false; + } + + return true; +} + +void retire_mq_sysctls(struct ipc_namespace *ns) +{ + struct ctl_table *tbl; + + tbl = ns->mq_sysctls->ctl_table_arg; + unregister_sysctl_table(ns->mq_sysctls); + retire_sysctl_set(&ns->mq_set); + kfree(tbl); } diff --git a/ipc/mqueue.c b/ipc/mqueue.c index beff0cfcd1e8..5eea4dc0509e 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -45,6 +45,7 @@ struct mqueue_fs_context { struct ipc_namespace *ipc_ns; + bool newns; /* Set if newly created ipc namespace */ }; #define MQUEUE_MAGIC 0x19800202 @@ -144,7 +145,7 @@ struct mqueue_inode_info { struct pid *notify_owner; u32 notify_self_exec_id; struct user_namespace *notify_user_ns; - struct user_struct *user; /* user who created, for accounting */ + struct ucounts *ucounts; /* user who created, for accounting */ struct sock *notify_sock; struct sk_buff *notify_cookie; @@ -163,8 +164,6 @@ static void remove_notification(struct mqueue_inode_info *info); static struct kmem_cache *mqueue_inode_cachep; -static struct ctl_table_header *mq_sysctl_table; - static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode) { return container_of(inode, struct mqueue_inode_info, vfs_inode); @@ -292,7 +291,6 @@ static struct inode *mqueue_get_inode(struct super_block *sb, struct ipc_namespace *ipc_ns, umode_t mode, struct mq_attr *attr) { - struct user_struct *u = current_user(); struct inode *inode; int ret = -ENOMEM; @@ -304,7 +302,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_mtime = inode->i_ctime = inode->i_atime = current_time(inode); + simple_inode_init_ts(inode); if (S_ISREG(mode)) { struct mqueue_inode_info *info; @@ -321,7 +319,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, info->notify_owner = NULL; info->notify_user_ns = NULL; info->qsize = 0; - info->user = NULL; /* set when all is ok */ + info->ucounts = NULL; /* set when all is ok */ info->msg_tree = RB_ROOT; info->msg_tree_rightmost = NULL; info->node_cache = NULL; @@ -371,19 +369,23 @@ static struct inode *mqueue_get_inode(struct super_block *sb, if (mq_bytes + mq_treesize < mq_bytes) goto out_inode; mq_bytes += mq_treesize; - spin_lock(&mq_lock); - if (u->mq_bytes + mq_bytes < u->mq_bytes || - u->mq_bytes + mq_bytes > rlimit(RLIMIT_MSGQUEUE)) { + info->ucounts = get_ucounts(current_ucounts()); + if (info->ucounts) { + long msgqueue; + + spin_lock(&mq_lock); + msgqueue = inc_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); + if (msgqueue == LONG_MAX || msgqueue > rlimit(RLIMIT_MSGQUEUE)) { + dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); + spin_unlock(&mq_lock); + put_ucounts(info->ucounts); + info->ucounts = NULL; + /* mqueue_evict_inode() releases info->messages */ + ret = -EMFILE; + goto out_inode; + } spin_unlock(&mq_lock); - /* mqueue_evict_inode() releases info->messages */ - ret = -EMFILE; - goto out_inode; } - u->mq_bytes += mq_bytes; - spin_unlock(&mq_lock); - - /* all is ok */ - info->user = get_uid(u); } else if (S_ISDIR(mode)) { inc_nlink(inode); /* Some things misbehave if size == 0 on a directory */ @@ -424,6 +426,14 @@ static int mqueue_get_tree(struct fs_context *fc) { struct mqueue_fs_context *ctx = fc->fs_private; + /* + * With a newly created ipc namespace, we don't need to do a search + * for an ipc namespace match, but we still need to set s_fs_info. + */ + if (ctx->newns) { + fc->s_fs_info = ctx->ipc_ns; + return get_tree_nodev(fc, mqueue_fill_super); + } return get_tree_keyed(fc, mqueue_fill_super, ctx->ipc_ns); } @@ -451,6 +461,10 @@ static int mqueue_init_fs_context(struct fs_context *fc) return 0; } +/* + * mq_init_ns() is currently the only caller of mq_create_mount(). + * So the ns parameter is always a newly created ipc namespace. + */ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns) { struct mqueue_fs_context *ctx; @@ -462,6 +476,7 @@ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns) return ERR_CAST(fc); ctx = fc->fs_private; + ctx->newns = true; put_ipc_ns(ctx->ipc_ns); ctx->ipc_ns = get_ipc_ns(ns); put_user_ns(fc->user_ns); @@ -474,7 +489,7 @@ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns) static void init_once(void *foo) { - struct mqueue_inode_info *p = (struct mqueue_inode_info *) foo; + struct mqueue_inode_info *p = foo; inode_init_once(&p->vfs_inode); } @@ -483,7 +498,7 @@ static struct inode *mqueue_alloc_inode(struct super_block *sb) { struct mqueue_inode_info *ei; - ei = kmem_cache_alloc(mqueue_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, mqueue_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; @@ -497,7 +512,6 @@ static void mqueue_free_inode(struct inode *inode) static void mqueue_evict_inode(struct inode *inode) { struct mqueue_inode_info *info; - struct user_struct *user; struct ipc_namespace *ipc_ns; struct msg_msg *msg, *nmsg; LIST_HEAD(tmp_msg); @@ -520,8 +534,7 @@ static void mqueue_evict_inode(struct inode *inode) free_msg(msg); } - user = info->user; - if (user) { + if (info->ucounts) { unsigned long mq_bytes, mq_treesize; /* Total amount of bytes accounted for the mqueue */ @@ -533,7 +546,7 @@ static void mqueue_evict_inode(struct inode *inode) info->attr.mq_msgsize); spin_lock(&mq_lock); - user->mq_bytes -= mq_bytes; + dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); /* * get_ns_from_inode() ensures that the * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns @@ -543,7 +556,8 @@ static void mqueue_evict_inode(struct inode *inode) if (ipc_ns) ipc_ns->mq_queues_count--; spin_unlock(&mq_lock); - free_uid(user); + put_ucounts(info->ucounts); + info->ucounts = NULL; } if (ipc_ns) put_ipc_ns(ipc_ns); @@ -582,7 +596,7 @@ static int mqueue_create_attr(struct dentry *dentry, umode_t mode, void *arg) put_ipc_ns(ipc_ns); dir->i_size += DIRENT_SIZE; - dir->i_ctime = dir->i_mtime = dir->i_atime = current_time(dir); + simple_inode_init_ts(dir); d_instantiate(dentry, inode); dget(dentry); @@ -594,8 +608,8 @@ out_unlock: return error; } -static int mqueue_create(struct inode *dir, struct dentry *dentry, - umode_t mode, bool excl) +static int mqueue_create(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { return mqueue_create_attr(dentry, mode, NULL); } @@ -604,7 +618,7 @@ static int mqueue_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - dir->i_ctime = dir->i_mtime = dir->i_atime = current_time(dir); + simple_inode_init_ts(dir); dir->i_size -= DIRENT_SIZE; drop_nlink(inode); dput(dentry); @@ -621,7 +635,8 @@ static int mqueue_unlink(struct inode *dir, struct dentry *dentry) static ssize_t mqueue_read_file(struct file *filp, char __user *u_data, size_t count, loff_t *off) { - struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp)); + struct inode *inode = file_inode(filp); + struct mqueue_inode_info *info = MQUEUE_I(inode); char buffer[FILENT_SIZE]; ssize_t ret; @@ -642,7 +657,7 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data, if (ret <= 0) return ret; - file_inode(filp)->i_atime = file_inode(filp)->i_ctime = current_time(file_inode(filp)); + inode_set_atime_to_ts(inode, inode_set_ctime_current(inode)); return ret; } @@ -873,7 +888,7 @@ static int prepare_open(struct dentry *dentry, int oflag, int ro, if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY)) return -EINVAL; acc = oflag2acc[oflag & O_ACCMODE]; - return inode_permission(d_inode(dentry), acc); + return inode_permission(&nop_mnt_idmap, d_inode(dentry), acc); } static int do_mq_open(const char __user *u_name, int oflag, umode_t mode, @@ -965,14 +980,14 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) err = -ENOENT; } else { ihold(inode); - err = vfs_unlink(d_inode(dentry->d_parent), dentry, NULL); + err = vfs_unlink(&nop_mnt_idmap, d_inode(dentry->d_parent), + dentry, NULL); } dput(dentry); out_unlock: inode_unlock(d_inode(mnt->mnt_root)); - if (inode) - iput(inode); + iput(inode); mnt_drop_write(mnt); out_name: putname(name); @@ -1003,12 +1018,14 @@ static inline void __pipelined_op(struct wake_q_head *wake_q, struct mqueue_inode_info *info, struct ext_wait_queue *this) { + struct task_struct *task; + list_del(&this->list); - get_task_struct(this->task); + task = get_task_struct(this->task); /* see MQ_BARRIER for purpose/pairing */ smp_store_release(&this->state, STATE_READY); - wake_q_add_safe(wake_q, this->task); + wake_q_add_safe(wake_q, task); } /* pipelined_send() - send a message directly to the task waiting in @@ -1146,8 +1163,7 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr, goto out_unlock; __do_notify(info); } - inode->i_atime = inode->i_mtime = inode->i_ctime = - current_time(inode); + simple_inode_init_ts(inode); } out_unlock: spin_unlock(&info->lock); @@ -1241,8 +1257,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, msg_ptr = msg_get(info); - inode->i_atime = inode->i_mtime = inode->i_ctime = - current_time(inode); + simple_inode_init_ts(inode); /* There is now free space in queue. */ pipelined_receive(&wake_q, info); @@ -1380,7 +1395,8 @@ retry: if (notification == NULL) { if (info->notify_owner == task_tgid(current)) { remove_notification(info); - inode->i_atime = inode->i_ctime = current_time(inode); + inode_set_atime_to_ts(inode, + inode_set_ctime_current(inode)); } } else if (info->notify_owner != NULL) { ret = -EBUSY; @@ -1406,7 +1422,7 @@ retry: info->notify_owner = get_pid(task_tgid(current)); info->notify_user_ns = get_user_ns(current_user_ns()); - inode->i_atime = inode->i_ctime = current_time(inode); + inode_set_atime_to_ts(inode, inode_set_ctime_current(inode)); } spin_unlock(&info->lock); out_fput: @@ -1469,7 +1485,7 @@ static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old) f.file->f_flags &= ~O_NONBLOCK; spin_unlock(&f.file->f_lock); - inode->i_atime = inode->i_ctime = current_time(inode); + inode_set_atime_to_ts(inode, inode_set_ctime_current(inode)); } spin_unlock(&info->lock); @@ -1693,11 +1709,6 @@ void mq_clear_sbinfo(struct ipc_namespace *ns) ns->mq_mnt->mnt_sb->s_fs_info = NULL; } -void mq_put_mnt(struct ipc_namespace *ns) -{ - kern_unmount(ns->mq_mnt); -} - static int __init init_mqueue_fs(void) { int error; @@ -1708,8 +1719,11 @@ static int __init init_mqueue_fs(void) if (mqueue_inode_cachep == NULL) return -ENOMEM; - /* ignore failures - they are not fatal */ - mq_sysctl_table = mq_register_sysctl_table(); + if (!setup_mq_sysctls(&init_ipc_ns)) { + pr_warn("sysctl registration failed\n"); + error = -ENOMEM; + goto out_kmem; + } error = register_filesystem(&mqueue_fs_type); if (error) @@ -1726,8 +1740,8 @@ static int __init init_mqueue_fs(void) out_filesystem: unregister_filesystem(&mqueue_fs_type); out_sysctl: - if (mq_sysctl_table) - unregister_sysctl_table(mq_sysctl_table); + retire_mq_sysctls(&init_ipc_ns); +out_kmem: kmem_cache_destroy(mqueue_inode_cachep); return error; } diff --git a/ipc/msg.c b/ipc/msg.c index acd1bc7af55a..fd08b3cb36d7 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -39,6 +39,7 @@ #include <linux/nsproxy.h> #include <linux/ipc_namespace.h> #include <linux/rhashtable.h> +#include <linux/percpu_counter.h> #include <asm/current.h> #include <linux/uaccess.h> @@ -130,7 +131,7 @@ static void msg_rcu_free(struct rcu_head *head) struct msg_queue *msq = container_of(p, struct msg_queue, q_perm); security_msg_queue_free(&msq->q_perm); - kvfree(msq); + kfree(msq); } /** @@ -147,7 +148,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) key_t key = params->key; int msgflg = params->flg; - msq = kvmalloc(sizeof(*msq), GFP_KERNEL); + msq = kmalloc(sizeof(*msq), GFP_KERNEL_ACCOUNT); if (unlikely(!msq)) return -ENOMEM; @@ -157,7 +158,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) msq->q_perm.security = NULL; retval = security_msg_queue_alloc(&msq->q_perm); if (retval) { - kvfree(msq); + kfree(msq); return retval; } @@ -251,11 +252,13 @@ static void expunge_all(struct msg_queue *msq, int res, struct msg_receiver *msr, *t; list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) { - get_task_struct(msr->r_tsk); + struct task_struct *r_tsk; + + r_tsk = get_task_struct(msr->r_tsk); /* see MSG_BARRIER for purpose/pairing */ smp_store_release(&msr->r_msg, ERR_PTR(res)); - wake_q_add_safe(wake_q, msr->r_tsk); + wake_q_add_safe(wake_q, r_tsk); } } @@ -283,10 +286,10 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) rcu_read_unlock(); list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) { - atomic_dec(&ns->msg_hdrs); + percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); free_msg(msg); } - atomic_sub(msq->q_cbytes, &ns->msg_bytes); + percpu_counter_sub_local(&ns->percpu_msg_bytes, msq->q_cbytes); ipc_update_pid(&msq->q_lspid, NULL); ipc_update_pid(&msq->q_lrpid, NULL); ipc_rcu_putref(&msq->q_perm, msg_rcu_free); @@ -493,17 +496,22 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid, msginfo->msgssz = MSGSSZ; msginfo->msgseg = MSGSEG; down_read(&msg_ids(ns).rwsem); - if (cmd == MSG_INFO) { + if (cmd == MSG_INFO) msginfo->msgpool = msg_ids(ns).in_use; - msginfo->msgmap = atomic_read(&ns->msg_hdrs); - msginfo->msgtql = atomic_read(&ns->msg_bytes); + max_idx = ipc_get_maxidx(&msg_ids(ns)); + up_read(&msg_ids(ns).rwsem); + if (cmd == MSG_INFO) { + msginfo->msgmap = min_t(int, + percpu_counter_sum(&ns->percpu_msg_hdrs), + INT_MAX); + msginfo->msgtql = min_t(int, + percpu_counter_sum(&ns->percpu_msg_bytes), + INT_MAX); } else { msginfo->msgmap = MSGMAP; msginfo->msgpool = MSGPOOL; msginfo->msgtql = MSGTQL; } - max_idx = ipc_get_maxidx(&msg_ids(ns)); - up_read(&msg_ids(ns).rwsem); return (max_idx < 0) ? 0 : max_idx; } @@ -933,8 +941,8 @@ static long do_msgsnd(int msqid, long mtype, void __user *mtext, list_add_tail(&msg->m_list, &msq->q_messages); msq->q_cbytes += msgsz; msq->q_qnum++; - atomic_add(msgsz, &ns->msg_bytes); - atomic_inc(&ns->msg_hdrs); + percpu_counter_add_local(&ns->percpu_msg_bytes, msgsz); + percpu_counter_add_local(&ns->percpu_msg_hdrs, 1); } err = 0; @@ -1157,8 +1165,8 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in msq->q_rtime = ktime_get_real_seconds(); ipc_update_pid(&msq->q_lrpid, task_tgid(current)); msq->q_cbytes -= msg->m_ts; - atomic_sub(msg->m_ts, &ns->msg_bytes); - atomic_dec(&ns->msg_hdrs); + percpu_counter_sub_local(&ns->percpu_msg_bytes, msg->m_ts); + percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); ss_wakeup(msq, &wake_q, false); goto out_unlock0; @@ -1295,15 +1303,27 @@ COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp, } #endif -void msg_init_ns(struct ipc_namespace *ns) +int msg_init_ns(struct ipc_namespace *ns) { + int ret; + ns->msg_ctlmax = MSGMAX; ns->msg_ctlmnb = MSGMNB; ns->msg_ctlmni = MSGMNI; - atomic_set(&ns->msg_bytes, 0); - atomic_set(&ns->msg_hdrs, 0); + ret = percpu_counter_init(&ns->percpu_msg_bytes, 0, GFP_KERNEL); + if (ret) + goto fail_msg_bytes; + ret = percpu_counter_init(&ns->percpu_msg_hdrs, 0, GFP_KERNEL); + if (ret) + goto fail_msg_hdrs; ipc_init_ids(&ns->ids[IPC_MSG_IDS]); + return 0; + +fail_msg_hdrs: + percpu_counter_destroy(&ns->percpu_msg_bytes); +fail_msg_bytes: + return ret; } #ifdef CONFIG_IPC_NS @@ -1312,6 +1332,8 @@ void msg_exit_ns(struct ipc_namespace *ns) free_ipcs(ns, &msg_ids(ns), freeque); idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht); + percpu_counter_destroy(&ns->percpu_msg_bytes); + percpu_counter_destroy(&ns->percpu_msg_hdrs); } #endif diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 3149b4a379de..d0a0e877cadd 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -26,7 +26,7 @@ DEFINE_SPINLOCK(mq_lock); * and not CONFIG_IPC_NS. */ struct ipc_namespace init_ipc_ns = { - .count = REFCOUNT_INIT(1), + .ns.count = REFCOUNT_INIT(1), .user_ns = &init_user_ns, .ns.inum = PROC_IPC_INIT_INO, #ifdef CONFIG_IPC_NS diff --git a/ipc/namespace.c b/ipc/namespace.c index 24e7b45320f7..6ecc30effd3e 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -19,6 +19,12 @@ #include "util.h" +/* + * The work queue is used to avoid the cost of synchronize_rcu in kern_unmount. + */ +static void free_ipc(struct work_struct *unused); +static DECLARE_WORK(free_ipc_work, free_ipc); + static struct ucounts *inc_ipc_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_IPC_NAMESPACES); @@ -37,12 +43,21 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, int err; err = -ENOSPC; + again: ucounts = inc_ipc_namespaces(user_ns); - if (!ucounts) + if (!ucounts) { + /* + * IPC namespaces are freed asynchronously, by free_ipc_work. + * If frees were pending, flush_work will wait, and + * return true. Fail the allocation if no frees are pending. + */ + if (flush_work(&free_ipc_work)) + goto again; goto fail; + } err = -ENOMEM; - ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL); + ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL_ACCOUNT); if (ns == NULL) goto fail_dec; @@ -51,7 +66,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, goto fail_free; ns->ns.ops = &ipcns_operations; - refcount_set(&ns->count, 1); + refcount_set(&ns->ns.count, 1); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; @@ -59,12 +74,25 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (err) goto fail_put; + err = -ENOMEM; + if (!setup_mq_sysctls(ns)) + goto fail_put; + + if (!setup_ipc_sysctls(ns)) + goto fail_mq; + + err = msg_init_ns(ns); + if (err) + goto fail_put; + sem_init_ns(ns); - msg_init_ns(ns); shm_init_ns(ns); return ns; +fail_mq: + retire_mq_sysctls(ns); + fail_put: put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); @@ -117,14 +145,18 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, static void free_ipc_ns(struct ipc_namespace *ns) { - /* mq_put_mnt() waits for a grace period as kern_unmount() - * uses synchronize_rcu(). + /* + * Caller needs to wait for an RCU grace period to have passed + * after making the mount point inaccessible to new accesses. */ - mq_put_mnt(ns); + mntput(ns->mq_mnt); sem_exit_ns(ns); msg_exit_ns(ns); shm_exit_ns(ns); + retire_mq_sysctls(ns); + retire_ipc_sysctls(ns); + dec_ipc_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); @@ -138,15 +170,16 @@ static void free_ipc(struct work_struct *unused) struct ipc_namespace *n, *t; llist_for_each_entry_safe(n, t, node, mnt_llist) + mnt_make_shortterm(n->mq_mnt); + + /* Wait for any last users to have gone away. */ + synchronize_rcu(); + + llist_for_each_entry_safe(n, t, node, mnt_llist) free_ipc_ns(n); } /* - * The work queue is used to avoid the cost of synchronize_rcu in kern_unmount. - */ -static DECLARE_WORK(free_ipc_work, free_ipc); - -/* * put_ipc_ns - drop a reference to an ipc namespace. * @ns: the namespace to put * @@ -164,7 +197,7 @@ static DECLARE_WORK(free_ipc_work, free_ipc); */ void put_ipc_ns(struct ipc_namespace *ns) { - if (refcount_dec_and_lock(&ns->count, &mq_lock)) { + if (refcount_dec_and_lock(&ns->ns.count, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); diff --git a/ipc/sem.c b/ipc/sem.c index f6c30a85dadf..a39cdc7bf88f 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -36,7 +36,7 @@ * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO. * - undo adjustments at process exit are limited to 0..SEMVMX. * - namespace are supported. - * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtine by writing + * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing * to /proc/sys/kernel/sem. * - statistics about the usage are reported in /proc/sysvipc/sem. * @@ -152,7 +152,7 @@ struct sem_undo { struct list_head list_id; /* per semaphore array list: * all undos for one array */ int semid; /* semaphore set identifier */ - short *semadj; /* array of adjustments */ + short semadj[]; /* array of adjustments */ /* one per semaphore */ }; @@ -217,6 +217,8 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); * this smp_load_acquire(), this is guaranteed because the smp_load_acquire() * is inside a spin_lock() and after a write from 0 to non-zero a * spin_lock()+spin_unlock() is done. + * To prevent the compiler/cpu temporarily writing 0 to use_global_lock, + * READ_ONCE()/WRITE_ONCE() is used. * * 2) queue.status: (SEM_BARRIER_2) * Initialization is done while holding sem_lock(), so no further barrier is @@ -224,7 +226,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); * Setting it to a result code is a RELEASE, this is ensured by both a * smp_store_release() (for case a) and while holding sem_lock() * (for case b). - * The AQUIRE when reading the result code without holding sem_lock() is + * The ACQUIRE when reading the result code without holding sem_lock() is * achieved by using READ_ONCE() + smp_acquire__after_ctrl_dep(). * (case a above). * Reading the result code while holding sem_lock() needs no further barriers, @@ -342,10 +344,10 @@ static void complexmode_enter(struct sem_array *sma) * Nothing to do, just reset the * counter until we return to simple mode. */ - sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS; + WRITE_ONCE(sma->use_global_lock, USE_GLOBAL_LOCK_HYSTERESIS); return; } - sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS; + WRITE_ONCE(sma->use_global_lock, USE_GLOBAL_LOCK_HYSTERESIS); for (i = 0; i < sma->sem_nsems; i++) { sem = &sma->sems[i]; @@ -371,7 +373,8 @@ static void complexmode_tryleave(struct sem_array *sma) /* See SEM_BARRIER_1 for purpose/pairing */ smp_store_release(&sma->use_global_lock, 0); } else { - sma->use_global_lock--; + WRITE_ONCE(sma->use_global_lock, + sma->use_global_lock-1); } } @@ -412,7 +415,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, * Initial check for use_global_lock. Just an optimization, * no locking, no memory barrier. */ - if (!sma->use_global_lock) { + if (!READ_ONCE(sma->use_global_lock)) { /* * It appears that no complex operation is around. * Acquire the per-semaphore lock. @@ -511,7 +514,7 @@ static struct sem_array *sem_alloc(size_t nsems) if (nsems > (INT_MAX - sizeof(*sma)) / sizeof(sma->sems[0])) return NULL; - sma = kvzalloc(struct_size(sma, sems, nsems), GFP_KERNEL); + sma = kvzalloc(struct_size(sma, sems, nsems), GFP_KERNEL_ACCOUNT); if (unlikely(!sma)) return NULL; @@ -763,7 +766,6 @@ static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) for (sop = sops; sop < sops + nsops; sop++) { curr = &sma->sems[sop->sem_num]; sem_op = sop->sem_op; - result = curr->semval; if (sop->sem_flg & SEM_UNDO) { int undo = un->semadj[sop->sem_num] - sem_op; @@ -784,12 +786,14 @@ would_block: static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error, struct wake_q_head *wake_q) { - get_task_struct(q->sleeper); + struct task_struct *sleeper; + + sleeper = get_task_struct(q->sleeper); - /* see SEM_BARRIER_2 for purpuse/pairing */ + /* see SEM_BARRIER_2 for purpose/pairing */ smp_store_release(&q->status, error); - wake_q_add_safe(wake_q, q->sleeper); + wake_q_add_safe(wake_q, sleeper); } static void unlink_queue(struct sem_array *sma, struct sem_queue *q) @@ -821,7 +825,7 @@ static inline int check_restart(struct sem_array *sma, struct sem_queue *q) /* It is impossible that someone waits for the new value: * - complex operations always restart. - * - wait-for-zero are handled seperately. + * - wait-for-zero are handled separately. * - q is a previously sleeping simple operation that * altered the array. It must be a decrement, because * simple increments never sleep. @@ -1046,7 +1050,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop * - No complex ops, thus all sleeping ops are * decrease. * - if we decreased the value, then any sleeping - * semaphore ops wont be able to run: If the + * semaphore ops won't be able to run: If the * previous value was too small, then the new * value will be too small, too. */ @@ -1152,7 +1156,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) un->semid = -1; list_del_rcu(&un->list_proc); spin_unlock(&un->ulp->lock); - kfree_rcu(un, rcu); + kvfree_rcu(un, rcu); } /* Wake up all pending processes and let them fail with EIDRM. */ @@ -1425,7 +1429,6 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, if (err) goto out_rcu_wakeup; - err = -EACCES; switch (cmd) { case GETALL: { @@ -1850,7 +1853,7 @@ static inline int get_undo_list(struct sem_undo_list **undo_listp) undo_list = current->sysvsem.undo_list; if (!undo_list) { - undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL); + undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL_ACCOUNT); if (undo_list == NULL) return -ENOMEM; spin_lock_init(&undo_list->lock); @@ -1935,7 +1938,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) rcu_read_unlock(); /* step 2: allocate new undo structure */ - new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL); + new = kvzalloc(struct_size(new, semadj, nsems), GFP_KERNEL_ACCOUNT); if (!new) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return ERR_PTR(-ENOMEM); @@ -1947,7 +1950,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) if (!ipc_valid_object(&sma->sem_perm)) { sem_unlock(sma, -1); rcu_read_unlock(); - kfree(new); + kvfree(new); un = ERR_PTR(-EIDRM); goto out; } @@ -1958,11 +1961,11 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) */ un = lookup_undo(ulp, semid); if (un) { - kfree(new); + spin_unlock(&ulp->lock); + kvfree(new); goto success; } /* step 5: initialize & link new undo structure */ - new->semadj = (short *) &new[1]; new->ulp = ulp; new->semid = semid; assert_spin_locked(&ulp->lock); @@ -1970,54 +1973,42 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) ipc_assert_locked_object(&sma->sem_perm); list_add(&new->list_id, &sma->list_id); un = new; - -success: spin_unlock(&ulp->lock); +success: sem_unlock(sma, -1); out: return un; } -static long do_semtimedop(int semid, struct sembuf __user *tsops, - unsigned nsops, const struct timespec64 *timeout) +long __do_semtimedop(int semid, struct sembuf *sops, + unsigned nsops, const struct timespec64 *timeout, + struct ipc_namespace *ns) { int error = -EINVAL; struct sem_array *sma; - struct sembuf fast_sops[SEMOPM_FAST]; - struct sembuf *sops = fast_sops, *sop; + struct sembuf *sop; struct sem_undo *un; int max, locknum; bool undos = false, alter = false, dupsop = false; struct sem_queue queue; - unsigned long dup = 0, jiffies_left = 0; - struct ipc_namespace *ns; - - ns = current->nsproxy->ipc_ns; + unsigned long dup = 0; + ktime_t expires, *exp = NULL; + bool timed_out = false; if (nsops < 1 || semid < 0) return -EINVAL; if (nsops > ns->sc_semopm) return -E2BIG; - if (nsops > SEMOPM_FAST) { - sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL); - if (sops == NULL) - return -ENOMEM; - } - - if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) { - error = -EFAULT; - goto out_free; - } if (timeout) { - if (timeout->tv_sec < 0 || timeout->tv_nsec < 0 || - timeout->tv_nsec >= 1000000000L) { - error = -EINVAL; - goto out_free; - } - jiffies_left = timespec64_to_jiffies(timeout); + if (!timespec64_valid(timeout)) + return -EINVAL; + expires = ktime_add_safe(ktime_get(), + timespec64_to_ktime(*timeout)); + exp = &expires; } + max = 0; for (sop = sops; sop < sops + nsops; sop++) { unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG); @@ -2046,7 +2037,7 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, un = find_alloc_undo(ns, semid); if (IS_ERR(un)) { error = PTR_ERR(un); - goto out_free; + goto out; } } else { un = NULL; @@ -2057,25 +2048,25 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, if (IS_ERR(sma)) { rcu_read_unlock(); error = PTR_ERR(sma); - goto out_free; + goto out; } error = -EFBIG; if (max >= sma->sem_nsems) { rcu_read_unlock(); - goto out_free; + goto out; } error = -EACCES; if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) { rcu_read_unlock(); - goto out_free; + goto out; } error = security_sem_semop(&sma->sem_perm, sops, nsops, alter); if (error) { rcu_read_unlock(); - goto out_free; + goto out; } error = -EIDRM; @@ -2089,7 +2080,7 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, * entangled here and why it's RMID race safe on comments at sem_lock() */ if (!ipc_valid_object(&sma->sem_perm)) - goto out_unlock_free; + goto out_unlock; /* * semid identifiers are not unique - find_alloc_undo may have * allocated an undo structure, it was invalidated by an RMID @@ -2098,7 +2089,7 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, * "un" itself is guaranteed by rcu. */ if (un && un->semid == -1) - goto out_unlock_free; + goto out_unlock; queue.sops = sops; queue.nsops = nsops; @@ -2108,7 +2099,7 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, queue.dupsop = dupsop; error = perform_atomic_semop(sma, &queue); - if (error == 0) { /* non-blocking succesfull path */ + if (error == 0) { /* non-blocking successful path */ DEFINE_WAKE_Q(wake_q); /* @@ -2124,10 +2115,10 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, rcu_read_unlock(); wake_up_q(&wake_q); - goto out_free; + goto out; } if (error < 0) /* non-blocking error path */ - goto out_unlock_free; + goto out_unlock; /* * We need to sleep on this operation, so we put the current @@ -2172,10 +2163,8 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, sem_unlock(sma, locknum); rcu_read_unlock(); - if (timeout) - jiffies_left = schedule_timeout(jiffies_left); - else - schedule(); + timed_out = !schedule_hrtimeout_range(exp, + current->timer_slack_ns, HRTIMER_MODE_ABS); /* * fastpath: the semop has completed, either successfully or @@ -2188,18 +2177,19 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, * scenarios where we were awakened externally, during the * window between wake_q_add() and wake_up_q(). */ + rcu_read_lock(); error = READ_ONCE(queue.status); if (error != -EINTR) { /* see SEM_BARRIER_2 for purpose/pairing */ smp_acquire__after_ctrl_dep(); - goto out_free; + rcu_read_unlock(); + goto out; } - rcu_read_lock(); locknum = sem_lock(sma, sops, nsops); if (!ipc_valid_object(&sma->sem_perm)) - goto out_unlock_free; + goto out_unlock; /* * No necessity for any barrier: We are protect by sem_lock() @@ -2211,24 +2201,56 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, * Leave without unlink_queue(), but with sem_unlock(). */ if (error != -EINTR) - goto out_unlock_free; + goto out_unlock; /* * If an interrupt occurred we have to clean up the queue. */ - if (timeout && jiffies_left == 0) + if (timed_out) error = -EAGAIN; } while (error == -EINTR && !signal_pending(current)); /* spurious */ unlink_queue(sma, &queue); -out_unlock_free: +out_unlock: sem_unlock(sma, locknum); rcu_read_unlock(); +out: + return error; +} + +static long do_semtimedop(int semid, struct sembuf __user *tsops, + unsigned nsops, const struct timespec64 *timeout) +{ + struct sembuf fast_sops[SEMOPM_FAST]; + struct sembuf *sops = fast_sops; + struct ipc_namespace *ns; + int ret; + + ns = current->nsproxy->ipc_ns; + if (nsops > ns->sc_semopm) + return -E2BIG; + if (nsops < 1) + return -EINVAL; + + if (nsops > SEMOPM_FAST) { + sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL); + if (sops == NULL) + return -ENOMEM; + } + + if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) { + ret = -EFAULT; + goto out_free; + } + + ret = __do_semtimedop(semid, sops, nsops, timeout, ns); + out_free: if (sops != fast_sops) kvfree(sops); - return error; + + return ret; } long ksys_semtimedop(int semid, struct sembuf __user *tsops, @@ -2418,7 +2440,7 @@ void exit_sem(struct task_struct *tsk) rcu_read_unlock(); wake_up_q(&wake_q); - kfree_rcu(un, rcu); + kvfree_rcu(un, rcu); } kfree(ulp); } @@ -2433,7 +2455,8 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) /* * The proc interface isn't aware of sem_lock(), it calls - * ipc_lock_object() directly (in sysvipc_find_ipc). + * ipc_lock_object(), i.e. spin_lock(&sma->sem_perm.lock). + * (in sysvipc_find_ipc) * In order to stay compatible with sem_lock(), we must * enter / leave complex_mode. */ diff --git a/ipc/shm.c b/ipc/shm.c index e25c7c6106bc..a89f001a8bf0 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -29,6 +29,7 @@ #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/shm.h> +#include <uapi/linux/shm.h> #include <linux/init.h> #include <linux/file.h> #include <linux/mman.h> @@ -60,11 +61,20 @@ struct shmid_kernel /* private to the kernel */ time64_t shm_ctim; struct pid *shm_cprid; struct pid *shm_lprid; - struct user_struct *mlock_user; + struct ucounts *mlock_ucounts; - /* The task created the shm object. NULL if the task is dead. */ + /* + * The task created the shm object, for + * task_lock(shp->shm_creator) + */ struct task_struct *shm_creator; - struct list_head shm_clist; /* list by creator */ + + /* + * List by creator. task_lock(->shm_creator) required for read/write. + * If list_empty(), then the creator is dead already. + */ + struct list_head shm_clist; + struct ipc_namespace *ns; } __randomize_layout; /* shm_mode upper byte flags */ @@ -115,6 +125,7 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) struct shmid_kernel *shp; shp = container_of(ipcp, struct shmid_kernel, shm_perm); + WARN_ON(ns != shp->ns); if (shp->shm_nattch) { shp->shm_perm.mode |= SHM_DEST; @@ -222,20 +233,51 @@ static void shm_rcu_free(struct rcu_head *head) struct shmid_kernel *shp = container_of(ptr, struct shmid_kernel, shm_perm); security_shm_free(&shp->shm_perm); - kvfree(shp); + kfree(shp); +} + +/* + * It has to be called with shp locked. + * It must be called before ipc_rmid() + */ +static inline void shm_clist_rm(struct shmid_kernel *shp) +{ + struct task_struct *creator; + + /* ensure that shm_creator does not disappear */ + rcu_read_lock(); + + /* + * A concurrent exit_shm may do a list_del_init() as well. + * Just do nothing if exit_shm already did the work + */ + if (!list_empty(&shp->shm_clist)) { + /* + * shp->shm_creator is guaranteed to be valid *only* + * if shp->shm_clist is not empty. + */ + creator = shp->shm_creator; + + task_lock(creator); + /* + * list_del_init() is a nop if the entry was already removed + * from the list. + */ + list_del_init(&shp->shm_clist); + task_unlock(creator); + } + rcu_read_unlock(); } -static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) +static inline void shm_rmid(struct shmid_kernel *s) { - list_del(&s->shm_clist); - ipc_rmid(&shm_ids(ns), &s->shm_perm); + shm_clist_rm(s); + ipc_rmid(&shm_ids(s->ns), &s->shm_perm); } -static int __shm_open(struct vm_area_struct *vma) +static int __shm_open(struct shm_file_data *sfd) { - struct file *file = vma->vm_file; - struct shm_file_data *sfd = shm_file_data(file); struct shmid_kernel *shp; shp = shm_lock(sfd->ns, sfd->id); @@ -259,7 +301,15 @@ static int __shm_open(struct vm_area_struct *vma) /* This is called by fork, once for every shm attach. */ static void shm_open(struct vm_area_struct *vma) { - int err = __shm_open(vma); + struct file *file = vma->vm_file; + struct shm_file_data *sfd = shm_file_data(file); + int err; + + /* Always call underlying open if present */ + if (sfd->vm_ops->open) + sfd->vm_ops->open(vma); + + err = __shm_open(sfd); /* * We raced in the idr lookup or with shm_destroy(). * Either way, the ID is busted. @@ -283,13 +333,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) shm_file = shp->shm_file; shp->shm_file = NULL; ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; - shm_rmid(ns, shp); + shm_rmid(shp); shm_unlock(shp); if (!is_file_hugepages(shm_file)) - shmem_lock(shm_file, 0, shp->mlock_user); - else if (shp->mlock_user) - user_shm_unlock(i_size_read(file_inode(shm_file)), - shp->mlock_user); + shmem_lock(shm_file, 0, shp->mlock_ucounts); fput(shm_file); ipc_update_pid(&shp->shm_cprid, NULL); ipc_update_pid(&shp->shm_lprid, NULL); @@ -306,10 +353,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) * * 2) sysctl kernel.shm_rmid_forced is set to 1. */ -static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) +static bool shm_may_destroy(struct shmid_kernel *shp) { return (shp->shm_nattch == 0) && - (ns->shm_rmid_forced || + (shp->ns->shm_rmid_forced || (shp->shm_perm.mode & SHM_DEST)); } @@ -319,10 +366,8 @@ static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) * The descriptor has already been removed from the current->mm->mmap list * and will later be kfree()d. */ -static void shm_close(struct vm_area_struct *vma) +static void __shm_close(struct shm_file_data *sfd) { - struct file *file = vma->vm_file; - struct shm_file_data *sfd = shm_file_data(file); struct shmid_kernel *shp; struct ipc_namespace *ns = sfd->ns; @@ -340,7 +385,7 @@ static void shm_close(struct vm_area_struct *vma) ipc_update_pid(&shp->shm_lprid, task_tgid(current)); shp->shm_dtim = ktime_get_real_seconds(); shp->shm_nattch--; - if (shm_may_destroy(ns, shp)) + if (shm_may_destroy(shp)) shm_destroy(ns, shp); else shm_unlock(shp); @@ -348,6 +393,18 @@ done: up_write(&shm_ids(ns).rwsem); } +static void shm_close(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct shm_file_data *sfd = shm_file_data(file); + + /* Always call underlying close if present */ + if (sfd->vm_ops->close) + sfd->vm_ops->close(vma); + + __shm_close(sfd); +} + /* Called with ns->shm_ids(ns).rwsem locked */ static int shm_try_destroy_orphaned(int id, void *p, void *data) { @@ -361,10 +418,10 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data) * * As shp->* are changed under rwsem, it's safe to skip shp locking. */ - if (shp->shm_creator != NULL) + if (!list_empty(&shp->shm_clist)) return 0; - if (shm_may_destroy(ns, shp)) { + if (shm_may_destroy(shp)) { shm_lock_by_ptr(shp); shm_destroy(ns, shp); } @@ -382,48 +439,97 @@ void shm_destroy_orphaned(struct ipc_namespace *ns) /* Locking assumes this will only be called with task == current */ void exit_shm(struct task_struct *task) { - struct ipc_namespace *ns = task->nsproxy->ipc_ns; - struct shmid_kernel *shp, *n; + for (;;) { + struct shmid_kernel *shp; + struct ipc_namespace *ns; - if (list_empty(&task->sysvshm.shm_clist)) - return; + task_lock(task); + + if (list_empty(&task->sysvshm.shm_clist)) { + task_unlock(task); + break; + } + + shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel, + shm_clist); - /* - * If kernel.shm_rmid_forced is not set then only keep track of - * which shmids are orphaned, so that a later set of the sysctl - * can clean them up. - */ - if (!ns->shm_rmid_forced) { - down_read(&shm_ids(ns).rwsem); - list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist) - shp->shm_creator = NULL; /* - * Only under read lock but we are only called on current - * so no entry on the list will be shared. + * 1) Get pointer to the ipc namespace. It is worth to say + * that this pointer is guaranteed to be valid because + * shp lifetime is always shorter than namespace lifetime + * in which shp lives. + * We taken task_lock it means that shp won't be freed. */ - list_del(&task->sysvshm.shm_clist); - up_read(&shm_ids(ns).rwsem); - return; - } + ns = shp->ns; - /* - * Destroy all already created segments, that were not yet mapped, - * and mark any mapped as orphan to cover the sysctl toggling. - * Destroy is skipped if shm_may_destroy() returns false. - */ - down_write(&shm_ids(ns).rwsem); - list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) { - shp->shm_creator = NULL; + /* + * 2) If kernel.shm_rmid_forced is not set then only keep track of + * which shmids are orphaned, so that a later set of the sysctl + * can clean them up. + */ + if (!ns->shm_rmid_forced) + goto unlink_continue; - if (shm_may_destroy(ns, shp)) { - shm_lock_by_ptr(shp); - shm_destroy(ns, shp); + /* + * 3) get a reference to the namespace. + * The refcount could be already 0. If it is 0, then + * the shm objects will be free by free_ipc_work(). + */ + ns = get_ipc_ns_not_zero(ns); + if (!ns) { +unlink_continue: + list_del_init(&shp->shm_clist); + task_unlock(task); + continue; } - } - /* Remove the list head from any segments still attached. */ - list_del(&task->sysvshm.shm_clist); - up_write(&shm_ids(ns).rwsem); + /* + * 4) get a reference to shp. + * This cannot fail: shm_clist_rm() is called before + * ipc_rmid(), thus the refcount cannot be 0. + */ + WARN_ON(!ipc_rcu_getref(&shp->shm_perm)); + + /* + * 5) unlink the shm segment from the list of segments + * created by current. + * This must be done last. After unlinking, + * only the refcounts obtained above prevent IPC_RMID + * from destroying the segment or the namespace. + */ + list_del_init(&shp->shm_clist); + + task_unlock(task); + + /* + * 6) we have all references + * Thus lock & if needed destroy shp. + */ + down_write(&shm_ids(ns).rwsem); + shm_lock_by_ptr(shp); + /* + * rcu_read_lock was implicitly taken in shm_lock_by_ptr, it's + * safe to call ipc_rcu_putref here + */ + ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); + + if (ipc_valid_object(&shp->shm_perm)) { + if (shm_may_destroy(shp)) + shm_destroy(ns, shp); + else + shm_unlock(shp); + } else { + /* + * Someone else deleted the shp from namespace + * idr/kht while we have waited. + * Just unlock and continue. + */ + shm_unlock(shp); + } + + up_write(&shm_ids(ns).rwsem); + put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */ + } } static vm_fault_t shm_fault(struct vm_fault *vmf) @@ -434,13 +540,13 @@ static vm_fault_t shm_fault(struct vm_fault *vmf) return sfd->vm_ops->fault(vmf); } -static int shm_split(struct vm_area_struct *vma, unsigned long addr) +static int shm_may_split(struct vm_area_struct *vma, unsigned long addr) { struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); - if (sfd->vm_ops->split) - return sfd->vm_ops->split(vma, addr); + if (sfd->vm_ops->may_split) + return sfd->vm_ops->may_split(vma, addr); return 0; } @@ -457,30 +563,25 @@ static unsigned long shm_pagesize(struct vm_area_struct *vma) } #ifdef CONFIG_NUMA -static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new) +static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { - struct file *file = vma->vm_file; - struct shm_file_data *sfd = shm_file_data(file); + struct shm_file_data *sfd = shm_file_data(vma->vm_file); int err = 0; if (sfd->vm_ops->set_policy) - err = sfd->vm_ops->set_policy(vma, new); + err = sfd->vm_ops->set_policy(vma, mpol); return err; } static struct mempolicy *shm_get_policy(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, pgoff_t *ilx) { - struct file *file = vma->vm_file; - struct shm_file_data *sfd = shm_file_data(file); - struct mempolicy *pol = NULL; + struct shm_file_data *sfd = shm_file_data(vma->vm_file); + struct mempolicy *mpol = vma->vm_policy; if (sfd->vm_ops->get_policy) - pol = sfd->vm_ops->get_policy(vma, addr); - else if (vma->vm_policy) - pol = vma->vm_policy; - - return pol; + mpol = sfd->vm_ops->get_policy(vma, addr, ilx); + return mpol; } #endif @@ -494,13 +595,13 @@ static int shm_mmap(struct file *file, struct vm_area_struct *vma) * IPC ID that was removed, and possibly even reused by another shm * segment already. Propagate this case as an error to caller. */ - ret = __shm_open(vma); + ret = __shm_open(sfd); if (ret) return ret; ret = call_mmap(sfd->file, vma); if (ret) { - shm_close(vma); + __shm_close(sfd); return ret; } sfd->vm_ops = vma->vm_ops; @@ -582,7 +683,7 @@ static const struct vm_operations_struct shm_vm_ops = { .open = shm_open, /* callback for a new vm-area open */ .close = shm_close, /* callback for when the vm-area is released */ .fault = shm_fault, - .split = shm_split, + .may_split = shm_may_split, .pagesize = shm_pagesize, #if defined(CONFIG_NUMA) .set_policy = shm_set_policy, @@ -619,18 +720,18 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) ns->shm_tot + numpages > ns->shm_ctlall) return -ENOSPC; - shp = kvmalloc(sizeof(*shp), GFP_KERNEL); + shp = kmalloc(sizeof(*shp), GFP_KERNEL_ACCOUNT); if (unlikely(!shp)) return -ENOMEM; shp->shm_perm.key = key; shp->shm_perm.mode = (shmflg & S_IRWXUGO); - shp->mlock_user = NULL; + shp->mlock_ucounts = NULL; shp->shm_perm.security = NULL; error = security_shm_alloc(&shp->shm_perm); if (error) { - kvfree(shp); + kfree(shp); return error; } @@ -650,8 +751,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) if (shmflg & SHM_NORESERVE) acctflag = VM_NORESERVE; file = hugetlb_file_setup(name, hugesize, acctflag, - &shp->mlock_user, HUGETLB_SHMFS_INODE, - (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); + HUGETLB_SHMFS_INODE, (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); } else { /* * Do not allow no accounting for OVERCOMMIT_NEVER, even @@ -680,7 +780,11 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) if (error < 0) goto no_id; + shp->ns = ns; + + task_lock(current); list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist); + task_unlock(current); /* * shmid gets reported as "inode#" in /proc/pid/maps. @@ -698,8 +802,6 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) no_id: ipc_update_pid(&shp->shm_cprid, NULL); ipc_update_pid(&shp->shm_lprid, NULL); - if (is_file_hugepages(file) && shp->mlock_user) - user_shm_unlock(size, shp->mlock_user); fput(file); ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); return error; @@ -1105,12 +1207,12 @@ static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd) goto out_unlock0; if (cmd == SHM_LOCK) { - struct user_struct *user = current_user(); + struct ucounts *ucounts = current_ucounts(); - err = shmem_lock(shm_file, 1, user); + err = shmem_lock(shm_file, 1, ucounts); if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { shp->shm_perm.mode |= SHM_LOCKED; - shp->mlock_user = user; + shp->mlock_ucounts = ucounts; } goto out_unlock0; } @@ -1118,9 +1220,9 @@ static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd) /* SHM_UNLOCK */ if (!(shp->shm_perm.mode & SHM_LOCKED)) goto out_unlock0; - shmem_lock(shm_file, 0, shp->mlock_user); + shmem_lock(shm_file, 0, shp->mlock_ucounts); shp->shm_perm.mode &= ~SHM_LOCKED; - shp->mlock_user = NULL; + shp->mlock_ucounts = NULL; get_file(shm_file); ipc_unlock_object(&shp->shm_perm); rcu_read_unlock(); @@ -1556,7 +1658,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, goto invalid; } - addr = do_mmap(file, addr, size, prot, flags, 0, &populate, NULL); + addr = do_mmap(file, addr, size, prot, flags, 0, 0, &populate, NULL); *raddr = addr; err = 0; if (IS_ERR_VALUE(addr)) @@ -1573,7 +1675,8 @@ out_nattch: down_write(&shm_ids(ns).rwsem); shp = shm_lock(ns, shmid); shp->shm_nattch--; - if (shm_may_destroy(ns, shp)) + + if (shm_may_destroy(shp)) shm_destroy(ns, shp); else shm_unlock(shp); @@ -1630,7 +1733,7 @@ long ksys_shmdt(char __user *shmaddr) #ifdef CONFIG_MMU loff_t size = 0; struct file *file; - struct vm_area_struct *next; + VMA_ITERATOR(vmi, mm, addr); #endif if (addr & ~PAGE_MASK) @@ -1660,12 +1763,9 @@ long ksys_shmdt(char __user *shmaddr) * match the usual checks anyway. So assume all vma's are * above the starting address given. */ - vma = find_vma(mm, addr); #ifdef CONFIG_MMU - while (vma) { - next = vma->vm_next; - + for_each_vma(vmi, vma) { /* * Check if the starting address would match, i.e. it's * a fragment created by mprotect() and/or munmap(), or it @@ -1682,7 +1782,8 @@ long ksys_shmdt(char __user *shmaddr) */ file = vma->vm_file; size = i_size_read(file_inode(vma->vm_file)); - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); + do_vma_munmap(&vmi, vma, vma->vm_start, vma->vm_end, + NULL, false); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next @@ -1690,10 +1791,9 @@ long ksys_shmdt(char __user *shmaddr) * searching for matching vma's. */ retval = 0; - vma = next; + vma = vma_next(&vmi); break; } - vma = next; } /* @@ -1703,17 +1803,19 @@ long ksys_shmdt(char __user *shmaddr) */ size = PAGE_ALIGN(size); while (vma && (loff_t)(vma->vm_end - addr) <= size) { - next = vma->vm_next; - /* finding a matching vma now does not alter retval */ if ((vma->vm_ops == &shm_vm_ops) && ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && - (vma->vm_file == file)) - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); - vma = next; + (vma->vm_file == file)) { + do_vma_munmap(&vmi, vma, vma->vm_start, vma->vm_end, + NULL, false); + } + + vma = vma_next(&vmi); } #else /* CONFIG_MMU */ + vma = vma_lookup(mm, addr); /* under NOMMU conditions, the exact address to be destroyed must be * given */ diff --git a/ipc/util.c b/ipc/util.c index cfa0045e748d..05cb9de66735 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -64,6 +64,7 @@ #include <linux/memory.h> #include <linux/ipc_namespace.h> #include <linux/rhashtable.h> +#include <linux/log2.h> #include <asm/unistd.h> @@ -446,8 +447,43 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { if (ipcp->key != IPC_PRIVATE) - rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode, - ipc_kht_params); + WARN_ON_ONCE(rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode, + ipc_kht_params)); +} + +/** + * ipc_search_maxidx - search for the highest assigned index + * @ids: ipc identifier set + * @limit: known upper limit for highest assigned index + * + * The function determines the highest assigned index in @ids. It is intended + * to be called when ids->max_idx needs to be updated. + * Updating ids->max_idx is necessary when the current highest index ipc + * object is deleted. + * If no ipc object is allocated, then -1 is returned. + * + * ipc_ids.rwsem needs to be held by the caller. + */ +static int ipc_search_maxidx(struct ipc_ids *ids, int limit) +{ + int tmpidx; + int i; + int retval; + + i = ilog2(limit+1); + + retval = 0; + for (; i >= 0; i--) { + tmpidx = retval | (1<<i); + /* + * "0" is a possible index value, thus search using + * e.g. 15,7,3,1,0 instead of 16,8,4,2,1. + */ + tmpidx = tmpidx-1; + if (idr_get_next(&ids->ipcs_idr, &tmpidx)) + retval |= (1<<i); + } + return retval - 1; } /** @@ -462,17 +498,15 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { int idx = ipcid_to_idx(ipcp->id); - idr_remove(&ids->ipcs_idr, idx); + WARN_ON_ONCE(idr_remove(&ids->ipcs_idr, idx) != ipcp); ipc_kht_remove(ids, ipcp); ids->in_use--; ipcp->deleted = true; if (unlikely(idx == ids->max_idx)) { - do { - idx--; - if (idx == -1) - break; - } while (!idr_find(&ids->ipcs_idr, idx)); + idx = ids->max_idx-1; + if (idx >= 0) + idx = ipc_search_maxidx(ids, idx); ids->max_idx = idx; } } @@ -748,36 +782,37 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *s) return iter->pid_ns; } -/* - * This routine locks the ipc structure found at least at position pos. +/** + * sysvipc_find_ipc - Find and lock the ipc structure based on seq pos + * @ids: ipc identifier set + * @pos: expected position + * + * The function finds an ipc structure, based on the sequence file + * position @pos. If there is no ipc structure at position @pos, then + * the successor is selected. + * If a structure is found, then it is locked (both rcu_read_lock() and + * ipc_lock_object()) and @pos is set to the position needed to locate + * the found ipc structure. + * If nothing is found (i.e. EOF), @pos is not modified. + * + * The function returns the found ipc structure, or NULL at EOF. */ -static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, - loff_t *new_pos) +static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t *pos) { + int tmpidx; struct kern_ipc_perm *ipc; - int total, id; - total = 0; - for (id = 0; id < pos && total < ids->in_use; id++) { - ipc = idr_find(&ids->ipcs_idr, id); - if (ipc != NULL) - total++; - } + /* convert from position to idr index -> "-1" */ + tmpidx = *pos - 1; - ipc = NULL; - if (total >= ids->in_use) - goto out; + ipc = idr_get_next(&ids->ipcs_idr, &tmpidx); + if (ipc != NULL) { + rcu_read_lock(); + ipc_lock_object(ipc); - for (; pos < ipc_mni; pos++) { - ipc = idr_find(&ids->ipcs_idr, pos); - if (ipc != NULL) { - rcu_read_lock(); - ipc_lock_object(ipc); - break; - } + /* convert from idr index to position -> "+1" */ + *pos = tmpidx + 1; } -out: - *new_pos = pos + 1; return ipc; } @@ -791,11 +826,13 @@ static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos) if (ipc && ipc != SEQ_START_TOKEN) ipc_unlock(ipc); - return sysvipc_find_ipc(&iter->ns->ids[iface->ids], *pos, pos); + /* Next -> search for *pos+1 */ + (*pos)++; + return sysvipc_find_ipc(&iter->ns->ids[iface->ids], pos); } /* - * File positions: pos 0 -> header, pos n -> ipc id = n - 1. + * File positions: pos 0 -> header, pos n -> ipc idx = n - 1. * SeqFile iterator: iterator value locked ipc pointer or SEQ_TOKEN_START. */ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) @@ -820,8 +857,8 @@ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) if (*pos == 0) return SEQ_START_TOKEN; - /* Find the (pos-1)th ipc */ - return sysvipc_find_ipc(ids, *pos - 1, pos); + /* Otherwise return the correct ipc structure */ + return sysvipc_find_ipc(ids, pos); } static void sysvipc_proc_stop(struct seq_file *s, void *it) @@ -868,7 +905,7 @@ static int sysvipc_proc_open(struct inode *inode, struct file *file) if (!iter) return -ENOMEM; - iter->iface = PDE_DATA(inode); + iter->iface = pde_data(inode); iter->ns = get_ipc_ns(current->nsproxy->ipc_ns); iter->pid_ns = get_pid_ns(task_active_pid_ns(current)); diff --git a/ipc/util.h b/ipc/util.h index 5766c61aed0e..a55d6cebe6d3 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -14,6 +14,7 @@ #include <linux/unistd.h> #include <linux/err.h> #include <linux/ipc_namespace.h> +#include <linux/pid.h> /* * The IPC ID contains 2 separate numbers - index and sequence number. @@ -56,15 +57,13 @@ struct pid_namespace; #ifdef CONFIG_POSIX_MQUEUE extern void mq_clear_sbinfo(struct ipc_namespace *ns); -extern void mq_put_mnt(struct ipc_namespace *ns); #else static inline void mq_clear_sbinfo(struct ipc_namespace *ns) { } -static inline void mq_put_mnt(struct ipc_namespace *ns) { } #endif #ifdef CONFIG_SYSVIPC void sem_init_ns(struct ipc_namespace *ns); -void msg_init_ns(struct ipc_namespace *ns); +int msg_init_ns(struct ipc_namespace *ns); void shm_init_ns(struct ipc_namespace *ns); void sem_exit_ns(struct ipc_namespace *ns); @@ -72,7 +71,7 @@ void msg_exit_ns(struct ipc_namespace *ns); void shm_exit_ns(struct ipc_namespace *ns); #else static inline void sem_init_ns(struct ipc_namespace *ns) { } -static inline void msg_init_ns(struct ipc_namespace *ns) { } +static inline int msg_init_ns(struct ipc_namespace *ns) { return 0; } static inline void shm_init_ns(struct ipc_namespace *ns) { } static inline void sem_exit_ns(struct ipc_namespace *ns) { } @@ -145,6 +144,9 @@ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg); * ipc_get_maxidx - get the highest assigned index * @ids: ipc identifier set * + * The function returns the highest assigned index for @ids. The function + * doesn't scan the idr tree, it uses a cached value. + * * Called with ipc_ids.rwsem held for reading. */ static inline int ipc_get_maxidx(struct ipc_ids *ids) |