aboutsummaryrefslogtreecommitdiffstats
path: root/fs/io_uring.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--fs/io_uring.c475
1 files changed, 370 insertions, 105 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index d2bb2ae9551f..8e9c58fa7636 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -78,6 +78,7 @@
#include <linux/fs_struct.h>
#include <linux/splice.h>
#include <linux/task_work.h>
+#include <linux/io_uring.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -264,7 +265,16 @@ struct io_ring_ctx {
/* IO offload */
struct io_wq *io_wq;
struct task_struct *sqo_thread; /* if using sq thread polling */
- struct mm_struct *sqo_mm;
+
+ /*
+ * For SQPOLL usage - we hold a reference to the parent task, so we
+ * have access to the ->files
+ */
+ struct task_struct *sqo_task;
+
+ /* Only used for accounting purposes */
+ struct mm_struct *mm_account;
+
wait_queue_head_t sqo_wait;
/*
@@ -274,8 +284,6 @@ struct io_ring_ctx {
*/
struct fixed_file_data *file_data;
unsigned nr_user_files;
- int ring_fd;
- struct file *ring_file;
/* if used, fixed mapped user buffers */
unsigned nr_user_bufs;
@@ -541,7 +549,6 @@ enum {
REQ_F_NO_FILE_TABLE_BIT,
REQ_F_QUEUE_TIMEOUT_BIT,
REQ_F_WORK_INITIALIZED_BIT,
- REQ_F_TASK_PINNED_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -599,8 +606,6 @@ enum {
REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT),
/* io_wq_work is initialized */
REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
- /* req->task is refcounted */
- REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT),
};
struct async_poll {
@@ -915,21 +920,6 @@ struct sock *io_uring_get_socket(struct file *file)
}
EXPORT_SYMBOL(io_uring_get_socket);
-static void io_get_req_task(struct io_kiocb *req)
-{
- if (req->flags & REQ_F_TASK_PINNED)
- return;
- get_task_struct(req->task);
- req->flags |= REQ_F_TASK_PINNED;
-}
-
-/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */
-static void __io_put_req_task(struct io_kiocb *req)
-{
- if (req->flags & REQ_F_TASK_PINNED)
- put_task_struct(req->task);
-}
-
static void io_file_put_work(struct work_struct *work);
/*
@@ -1141,14 +1131,34 @@ static void io_kill_timeout(struct io_kiocb *req)
}
}
-static void io_kill_timeouts(struct io_ring_ctx *ctx)
+static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!tsk || req->task == tsk)
+ return true;
+ if ((ctx->flags & IORING_SETUP_SQPOLL) && req->task == ctx->sqo_thread)
+ return true;
+ return false;
+}
+
+/*
+ * Returns true if we found and killed one or more timeouts
+ */
+static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk)
{
struct io_kiocb *req, *tmp;
+ int canceled = 0;
spin_lock_irq(&ctx->completion_lock);
- list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
- io_kill_timeout(req);
+ list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list) {
+ if (io_task_match(req, tsk)) {
+ io_kill_timeout(req);
+ canceled++;
+ }
+ }
spin_unlock_irq(&ctx->completion_lock);
+ return canceled != 0;
}
static void __io_queue_deferred(struct io_ring_ctx *ctx)
@@ -1229,12 +1239,24 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
eventfd_signal(ctx->cq_ev_fd, 1);
}
+static inline bool io_match_files(struct io_kiocb *req,
+ struct files_struct *files)
+{
+ if (!files)
+ return true;
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ return req->work.files == files;
+ return false;
+}
+
/* Returns true if there are no backlogged entries after the flush */
-static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
+static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
+ struct task_struct *tsk,
+ struct files_struct *files)
{
struct io_rings *rings = ctx->rings;
+ struct io_kiocb *req, *tmp;
struct io_uring_cqe *cqe;
- struct io_kiocb *req;
unsigned long flags;
LIST_HEAD(list);
@@ -1253,7 +1275,12 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
ctx->cq_overflow_flushed = 1;
cqe = NULL;
- while (!list_empty(&ctx->cq_overflow_list)) {
+ list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, list) {
+ if (tsk && req->task != tsk)
+ continue;
+ if (!io_match_files(req, files))
+ continue;
+
cqe = io_get_cqring(ctx);
if (!cqe && !force)
break;
@@ -1307,7 +1334,12 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, cflags);
- } else if (ctx->cq_overflow_flushed) {
+ } else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) {
+ /*
+ * If we're in ring overflow flush mode, or in task cancel mode,
+ * then we cannot store the request for later flushing, we need
+ * to drop it on the floor.
+ */
WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow));
} else {
@@ -1412,15 +1444,35 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file,
fput(file);
}
+static void io_req_drop_files(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->inflight_lock, flags);
+ list_del(&req->inflight_entry);
+ if (waitqueue_active(&ctx->inflight_wait))
+ wake_up(&ctx->inflight_wait);
+ spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+ req->flags &= ~REQ_F_INFLIGHT;
+ put_files_struct(req->work.files);
+ put_nsproxy(req->work.nsproxy);
+ req->work.files = NULL;
+}
+
static void __io_req_aux_free(struct io_kiocb *req)
{
+ struct io_uring_task *tctx = req->task->io_uring;
if (req->flags & REQ_F_NEED_CLEANUP)
io_cleanup_req(req);
kfree(req->io);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
- __io_put_req_task(req);
+ atomic_long_inc(&tctx->req_complete);
+ if (tctx->in_idle)
+ wake_up(&tctx->wait);
+ put_task_struct(req->task);
io_req_work_drop_env(req);
}
@@ -1428,16 +1480,8 @@ static void __io_free_req(struct io_kiocb *req)
{
__io_req_aux_free(req);
- if (req->flags & REQ_F_INFLIGHT) {
- struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->inflight_lock, flags);
- list_del(&req->inflight_entry);
- if (waitqueue_active(&ctx->inflight_wait))
- wake_up(&ctx->inflight_wait);
- spin_unlock_irqrestore(&ctx->inflight_lock, flags);
- }
+ if (req->flags & REQ_F_INFLIGHT)
+ io_req_drop_files(req);
percpu_ref_put(&req->ctx->refs);
if (likely(!io_is_fallback_req(req)))
@@ -1717,7 +1761,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
if (noflush && !list_empty(&ctx->cq_overflow_list))
return -1U;
- io_cqring_overflow_flush(ctx, false);
+ io_cqring_overflow_flush(ctx, false, NULL, NULL);
}
/* See comment at the top of this file */
@@ -1738,7 +1782,7 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req))
return false;
- if (req->file || req->io)
+ if (req->file || req->io || req->task)
rb->need_iter++;
rb->reqs[rb->to_free++] = req;
@@ -1762,6 +1806,12 @@ static int io_put_kbuf(struct io_kiocb *req)
static inline bool io_run_task_work(void)
{
+ /*
+ * Not safe to run on exiting task, and the task_work handling will
+ * not add work to such a task.
+ */
+ if (unlikely(current->flags & PF_EXITING))
+ return false;
if (current->task_works) {
__set_current_state(TASK_RUNNING);
task_work_run();
@@ -3492,8 +3542,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EBADF;
req->close.fd = READ_ONCE(sqe->fd);
- if ((req->file && req->file->f_op == &io_uring_fops) ||
- req->close.fd == req->ctx->ring_fd)
+ if ((req->file && req->file->f_op == &io_uring_fops))
return -EBADF;
req->close.put_file = NULL;
@@ -4397,9 +4446,10 @@ static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
{
if (io_op_defs[req->opcode].needs_mm && !current->mm) {
if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) ||
- !mmget_not_zero(ctx->sqo_mm)))
+ !ctx->sqo_task->mm ||
+ !mmget_not_zero(ctx->sqo_task->mm)))
return -EFAULT;
- kthread_use_mm(ctx->sqo_mm);
+ kthread_use_mm(ctx->sqo_task->mm);
}
return 0;
@@ -4550,7 +4600,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
if (req->flags & REQ_F_WORK_INITIALIZED)
memcpy(&apoll->work, &req->work, sizeof(req->work));
- io_get_req_task(req);
req->apoll = apoll;
INIT_HLIST_NODE(&req->hash_node);
@@ -4635,7 +4684,10 @@ static bool io_poll_remove_one(struct io_kiocb *req)
return do_complete;
}
-static void io_poll_remove_all(struct io_ring_ctx *ctx)
+/*
+ * Returns true if we found and killed one or more poll requests
+ */
+static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
{
struct hlist_node *tmp;
struct io_kiocb *req;
@@ -4646,13 +4698,17 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
struct hlist_head *list;
list = &ctx->cancel_hash[i];
- hlist_for_each_entry_safe(req, tmp, list, hash_node)
- posted += io_poll_remove_one(req);
+ hlist_for_each_entry_safe(req, tmp, list, hash_node) {
+ if (io_task_match(req, tsk))
+ posted += io_poll_remove_one(req);
+ }
}
spin_unlock_irq(&ctx->completion_lock);
if (posted)
io_cqring_ev_posted(ctx);
+
+ return posted != 0;
}
static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
@@ -4738,8 +4794,6 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
events = READ_ONCE(sqe->poll_events);
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
-
- io_get_req_task(req);
return 0;
}
@@ -5626,32 +5680,20 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
static int io_grab_files(struct io_kiocb *req)
{
- int ret = -EBADF;
struct io_ring_ctx *ctx = req->ctx;
if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE))
return 0;
- if (!ctx->ring_file)
- return -EBADF;
- rcu_read_lock();
+ req->work.files = get_files_struct(current);
+ get_nsproxy(current->nsproxy);
+ req->work.nsproxy = current->nsproxy;
+ req->flags |= REQ_F_INFLIGHT;
+
spin_lock_irq(&ctx->inflight_lock);
- /*
- * We use the f_ops->flush() handler to ensure that we can flush
- * out work accessing these files if the fd is closed. Check if
- * the fd has changed since we started down this path, and disallow
- * this operation if it has.
- */
- if (fcheck(ctx->ring_fd) == ctx->ring_file) {
- list_add(&req->inflight_entry, &ctx->inflight_list);
- req->flags |= REQ_F_INFLIGHT;
- req->work.files = current->files;
- ret = 0;
- }
+ list_add(&req->inflight_entry, &ctx->inflight_list);
spin_unlock_irq(&ctx->inflight_lock);
- rcu_read_unlock();
-
- return ret;
+ return 0;
}
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
@@ -6021,6 +6063,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
/* one is dropped after submission, the other at completion */
refcount_set(&req->refs, 2);
req->task = current;
+ get_task_struct(req->task);
+ atomic_long_inc(&req->task->io_uring->req_issue);
req->result = 0;
if (unlikely(req->opcode >= IORING_OP_LAST))
@@ -6056,8 +6100,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
return io_req_set_file(state, req, READ_ONCE(sqe->fd));
}
-static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
- struct file *ring_file, int ring_fd)
+static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
{
struct io_submit_state state, *statep = NULL;
struct io_kiocb *link = NULL;
@@ -6066,7 +6109,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
/* if we have a backlog and couldn't flush it all, return BUSY */
if (test_bit(0, &ctx->sq_check_overflow)) {
if (!list_empty(&ctx->cq_overflow_list) &&
- !io_cqring_overflow_flush(ctx, false))
+ !io_cqring_overflow_flush(ctx, false, NULL, NULL))
return -EBUSY;
}
@@ -6081,9 +6124,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
statep = &state;
}
- ctx->ring_fd = ring_fd;
- ctx->ring_file = ring_file;
-
for (i = 0; i < nr; i++) {
const struct io_uring_sqe *sqe;
struct io_kiocb *req;
@@ -6244,7 +6284,7 @@ static int io_sq_thread(void *data)
mutex_lock(&ctx->uring_lock);
if (likely(!percpu_ref_is_dying(&ctx->refs)))
- ret = io_submit_sqes(ctx, to_submit, NULL, -1);
+ ret = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
timeout = jiffies + ctx->sq_thread_idle;
}
@@ -7073,14 +7113,38 @@ out_fput:
return ret;
}
+static int io_uring_alloc_task_context(struct task_struct *task)
+{
+ struct io_uring_task *tctx;
+
+ tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
+ if (unlikely(!tctx))
+ return -ENOMEM;
+
+ xa_init(&tctx->xa);
+ init_waitqueue_head(&tctx->wait);
+ tctx->last = NULL;
+ tctx->in_idle = 0;
+ atomic_long_set(&tctx->req_issue, 0);
+ atomic_long_set(&tctx->req_complete, 0);
+ task->io_uring = tctx;
+ return 0;
+}
+
+void __io_uring_free(struct task_struct *tsk)
+{
+ struct io_uring_task *tctx = tsk->io_uring;
+
+ WARN_ON_ONCE(!xa_empty(&tctx->xa));
+ kfree(tctx);
+ tsk->io_uring = NULL;
+}
+
static int io_sq_offload_start(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
int ret;
- mmgrab(current->mm);
- ctx->sqo_mm = current->mm;
-
if (ctx->flags & IORING_SETUP_SQPOLL) {
ret = -EPERM;
if (!capable(CAP_SYS_ADMIN))
@@ -7111,6 +7175,9 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
ctx->sqo_thread = NULL;
goto err;
}
+ ret = io_uring_alloc_task_context(ctx->sqo_thread);
+ if (ret)
+ goto err;
wake_up_process(ctx->sqo_thread);
} else if (p->flags & IORING_SETUP_SQ_AFF) {
/* Can't have SQ_AFF without SQPOLL */
@@ -7125,8 +7192,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
return 0;
err:
io_finish_async(ctx);
- mmdrop(ctx->sqo_mm);
- ctx->sqo_mm = NULL;
return ret;
}
@@ -7456,8 +7521,12 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_finish_async(ctx);
- if (ctx->sqo_mm)
- mmdrop(ctx->sqo_mm);
+ if (ctx->sqo_task) {
+ put_task_struct(ctx->sqo_task);
+ ctx->sqo_task = NULL;
+ mmdrop(ctx->mm_account);
+ ctx->mm_account = NULL;
+ }
io_iopoll_reap_events(ctx);
io_sqe_buffer_unregister(ctx);
@@ -7528,7 +7597,7 @@ static void io_ring_exit_work(struct work_struct *work)
ctx = container_of(work, struct io_ring_ctx, exit_work);
if (ctx->rings)
- io_cqring_overflow_flush(ctx, true);
+ io_cqring_overflow_flush(ctx, true, NULL, NULL);
/*
* If we're doing polled IO and end up having requests being
@@ -7539,7 +7608,7 @@ static void io_ring_exit_work(struct work_struct *work)
while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) {
io_iopoll_reap_events(ctx);
if (ctx->rings)
- io_cqring_overflow_flush(ctx, true);
+ io_cqring_overflow_flush(ctx, true, NULL, NULL);
}
io_ring_ctx_free(ctx);
}
@@ -7550,8 +7619,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
percpu_ref_kill(&ctx->refs);
mutex_unlock(&ctx->uring_lock);
- io_kill_timeouts(ctx);
- io_poll_remove_all(ctx);
+ io_kill_timeouts(ctx, NULL);
+ io_poll_remove_all(ctx, NULL);
if (ctx->io_wq)
io_wq_cancel_all(ctx->io_wq);
@@ -7559,7 +7628,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
io_iopoll_reap_events(ctx);
/* if we failed setting up the ctx, we might not have any rings */
if (ctx->rings)
- io_cqring_overflow_flush(ctx, true);
+ io_cqring_overflow_flush(ctx, true, NULL, NULL);
idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
/*
@@ -7588,7 +7657,7 @@ static bool io_wq_files_match(struct io_wq_work *work, void *data)
{
struct files_struct *files = data;
- return work->files == files;
+ return !files || work->files == files;
}
/*
@@ -7609,12 +7678,6 @@ static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req)
return false;
}
-static inline bool io_match_files(struct io_kiocb *req,
- struct files_struct *files)
-{
- return (req->flags & REQ_F_WORK_INITIALIZED) && req->work.files == files;
-}
-
static bool io_match_link_files(struct io_kiocb *req,
struct files_struct *files)
{
@@ -7729,11 +7792,14 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
}
}
-static void io_uring_cancel_files(struct io_ring_ctx *ctx,
+/*
+ * Returns true if we found and killed one or more files pinning requests
+ */
+static bool io_uring_cancel_files(struct io_ring_ctx *ctx,
struct files_struct *files)
{
if (list_empty_careful(&ctx->inflight_list))
- return;
+ return false;
io_cancel_defer_files(ctx, files);
/* cancel all at once, should be faster than doing it one by one*/
@@ -7745,7 +7811,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
spin_lock_irq(&ctx->inflight_lock);
list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
- if (req->work.files != files)
+ if (files && req->work.files != files)
continue;
/* req is being completed, ignore */
if (!refcount_inc_not_zero(&req->refs))
@@ -7791,9 +7857,13 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
io_put_req(cancel_req);
}
+ /* cancellations _may_ trigger task work */
+ io_run_task_work();
schedule();
finish_wait(&ctx->inflight_wait, &wait);
}
+
+ return true;
}
static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
@@ -7801,21 +7871,198 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct task_struct *task = data;
- return req->task == task;
+ return io_task_match(req, task);
+}
+
+static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
+ struct task_struct *task,
+ struct files_struct *files)
+{
+ bool ret;
+
+ ret = io_uring_cancel_files(ctx, files);
+ if (!files) {
+ enum io_wq_cancel cret;
+
+ cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true);
+ if (cret != IO_WQ_CANCEL_NOTFOUND)
+ ret = true;
+
+ /* SQPOLL thread does its own polling */
+ if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
+ if (!list_empty_careful(&ctx->poll_list)) {
+ io_iopoll_reap_events(ctx);
+ ret = true;
+ }
+ }
+
+ ret |= io_poll_remove_all(ctx, task);
+ ret |= io_kill_timeouts(ctx, task);
+ }
+
+ return ret;
+}
+
+/*
+ * We need to iteratively cancel requests, in case a request has dependent
+ * hard links. These persist even for failure of cancelations, hence keep
+ * looping until none are found.
+ */
+static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
+ struct files_struct *files)
+{
+ struct task_struct *task = current;
+
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ task = ctx->sqo_thread;
+
+ io_cqring_overflow_flush(ctx, true, task, files);
+
+ while (__io_uring_cancel_task_requests(ctx, task, files)) {
+ io_run_task_work();
+ cond_resched();
+ }
+}
+
+/*
+ * Note that this task has used io_uring. We use it for cancelation purposes.
+ */
+static int io_uring_add_task_file(struct file *file)
+{
+ struct io_uring_task *tctx = current->io_uring;
+
+ if (unlikely(!tctx)) {
+ int ret;
+
+ ret = io_uring_alloc_task_context(current);
+ if (unlikely(ret))
+ return ret;
+ tctx = current->io_uring;
+ }
+ if (tctx->last != file) {
+ void *old = xa_load(&tctx->xa, (unsigned long)file);
+
+ if (!old) {
+ get_file(file);
+ xa_store(&tctx->xa, (unsigned long)file, file, GFP_KERNEL);
+ }
+ tctx->last = file;
+ }
+
+ return 0;
+}
+
+/*
+ * Remove this io_uring_file -> task mapping.
+ */
+static void io_uring_del_task_file(struct file *file)
+{
+ struct io_uring_task *tctx = current->io_uring;
+
+ if (tctx->last == file)
+ tctx->last = NULL;
+ file = xa_erase(&tctx->xa, (unsigned long)file);
+ if (file)
+ fput(file);
+}
+
+static void __io_uring_attempt_task_drop(struct file *file)
+{
+ struct file *old = xa_load(&current->io_uring->xa, (unsigned long)file);
+
+ if (old == file)
+ io_uring_del_task_file(file);
+}
+
+/*
+ * Drop task note for this file if we're the only ones that hold it after
+ * pending fput()
+ */
+static void io_uring_attempt_task_drop(struct file *file, bool exiting)
+{
+ if (!current->io_uring)
+ return;
+ /*
+ * fput() is pending, will be 2 if the only other ref is our potential
+ * task file note. If the task is exiting, drop regardless of count.
+ */
+ if (!exiting && atomic_long_read(&file->f_count) != 2)
+ return;
+
+ __io_uring_attempt_task_drop(file);
+}
+
+void __io_uring_files_cancel(struct files_struct *files)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ struct file *file;
+ unsigned long index;
+
+ /* make sure overflow events are dropped */
+ tctx->in_idle = true;
+
+ xa_for_each(&tctx->xa, index, file) {
+ struct io_ring_ctx *ctx = file->private_data;
+
+ io_uring_cancel_task_requests(ctx, files);
+ if (files)
+ io_uring_del_task_file(file);
+ }
+}
+
+static inline bool io_uring_task_idle(struct io_uring_task *tctx)
+{
+ return atomic_long_read(&tctx->req_issue) ==
+ atomic_long_read(&tctx->req_complete);
+}
+
+/*
+ * Find any io_uring fd that this task has registered or done IO on, and cancel
+ * requests.
+ */
+void __io_uring_task_cancel(void)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ DEFINE_WAIT(wait);
+ long completions;
+
+ /* make sure overflow events are dropped */
+ tctx->in_idle = true;
+
+ while (!io_uring_task_idle(tctx)) {
+ /* read completions before cancelations */
+ completions = atomic_long_read(&tctx->req_complete);
+ __io_uring_files_cancel(NULL);
+
+ prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
+
+ /*
+ * If we've seen completions, retry. This avoids a race where
+ * a completion comes in before we did prepare_to_wait().
+ */
+ if (completions != atomic_long_read(&tctx->req_complete))
+ continue;
+ if (io_uring_task_idle(tctx))
+ break;
+ schedule();
+ }
+
+ finish_wait(&tctx->wait, &wait);
+ tctx->in_idle = false;
}
static int io_uring_flush(struct file *file, void *data)
{
struct io_ring_ctx *ctx = file->private_data;
- io_uring_cancel_files(ctx, data);
-
/*
* If the task is going away, cancel work it may have pending
*/
if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
- io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true);
+ data = NULL;
+ io_uring_cancel_task_requests(ctx, data);
+ io_uring_attempt_task_drop(file, !data);
return 0;
}
@@ -7924,13 +8171,16 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
ret = 0;
if (ctx->flags & IORING_SETUP_SQPOLL) {
if (!list_empty_careful(&ctx->cq_overflow_list))
- io_cqring_overflow_flush(ctx, false);
+ io_cqring_overflow_flush(ctx, false, NULL, NULL);
if (flags & IORING_ENTER_SQ_WAKEUP)
wake_up(&ctx->sqo_wait);
submitted = to_submit;
} else if (to_submit) {
+ ret = io_uring_add_task_file(f.file);
+ if (unlikely(ret))
+ goto out;
mutex_lock(&ctx->uring_lock);
- submitted = io_submit_sqes(ctx, to_submit, f.file, fd);
+ submitted = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
if (submitted != to_submit)
@@ -8142,6 +8392,7 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx)
file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
O_RDWR | O_CLOEXEC);
if (IS_ERR(file)) {
+err_fd:
put_unused_fd(ret);
ret = PTR_ERR(file);
goto err;
@@ -8150,6 +8401,10 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx)
#if defined(CONFIG_UNIX)
ctx->ring_sock->file = file;
#endif
+ if (unlikely(io_uring_add_task_file(file))) {
+ file = ERR_PTR(-ENOMEM);
+ goto err_fd;
+ }
fd_install(ret, file);
return ret;
err:
@@ -8228,6 +8483,16 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
ctx->user = user;
ctx->creds = get_current_cred();
+ ctx->sqo_task = get_task_struct(current);
+ /*
+ * This is just grabbed for accounting purposes. When a process exits,
+ * the mm is exited and dropped before the files, hence we need to hang
+ * on to this mm purely for the purposes of being able to unaccount
+ * memory (locked/pinned vm). It's not used for anything else.
+ */
+ mmgrab(current->mm);
+ ctx->mm_account = current->mm;
+
ret = io_allocate_scq_urings(ctx, p);
if (ret)
goto err;