summaryrefslogtreecommitdiffstats
path: root/fs/io_uring.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--fs/io_uring.c301
1 files changed, 162 insertions, 139 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5190bfb6a665..c687f57fb651 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -357,7 +357,6 @@ struct io_timeout_data {
struct hrtimer timer;
struct timespec64 ts;
enum hrtimer_mode mode;
- u32 seq_offset;
};
struct io_accept {
@@ -385,7 +384,7 @@ struct io_timeout {
struct file *file;
u64 addr;
int flags;
- unsigned count;
+ u32 count;
};
struct io_rw {
@@ -508,6 +507,7 @@ enum {
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
+ REQ_F_LINK_HEAD_BIT,
REQ_F_LINK_NEXT_BIT,
REQ_F_FAIL_LINK_BIT,
REQ_F_INFLIGHT_BIT,
@@ -543,6 +543,8 @@ enum {
/* IOSQE_BUFFER_SELECT */
REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
+ /* head of a link */
+ REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT),
/* already grabbed next link */
REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
/* fail rest of links */
@@ -955,8 +957,8 @@ static inline bool __req_need_defer(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
- + atomic_read(&ctx->cached_cq_overflow);
+ return req->sequence != ctx->cached_cq_tail
+ + atomic_read(&ctx->cached_cq_overflow);
}
static inline bool req_need_defer(struct io_kiocb *req)
@@ -1437,7 +1439,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
if (ret != -1) {
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(ctx);
- req->flags &= ~REQ_F_LINK;
+ req->flags &= ~REQ_F_LINK_HEAD;
io_put_req(req);
return true;
}
@@ -1473,7 +1475,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
list_del_init(&req->link_list);
if (!list_empty(&nxt->link_list))
- nxt->flags |= REQ_F_LINK;
+ nxt->flags |= REQ_F_LINK_HEAD;
*nxtptr = nxt;
break;
}
@@ -1484,7 +1486,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
}
/*
- * Called if REQ_F_LINK is set, and we fail the head request
+ * Called if REQ_F_LINK_HEAD is set, and we fail the head request
*/
static void io_fail_links(struct io_kiocb *req)
{
@@ -1517,7 +1519,7 @@ static void io_fail_links(struct io_kiocb *req)
static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
{
- if (likely(!(req->flags & REQ_F_LINK)))
+ if (likely(!(req->flags & REQ_F_LINK_HEAD)))
return;
/*
@@ -1669,7 +1671,7 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
{
- if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req))
+ if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req))
return false;
if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
@@ -2562,7 +2564,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
req->result = 0;
io_size = ret;
- if (req->flags & REQ_F_LINK)
+ if (req->flags & REQ_F_LINK_HEAD)
req->result = io_size;
/*
@@ -2653,7 +2655,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
req->result = 0;
io_size = ret;
- if (req->flags & REQ_F_LINK)
+ if (req->flags & REQ_F_LINK_HEAD)
req->result = io_size;
/*
@@ -2760,7 +2762,7 @@ static bool io_splice_punt(struct file *file)
return false;
if (!io_file_supports_async(file))
return true;
- return !(file->f_mode & O_NONBLOCK);
+ return !(file->f_flags & O_NONBLOCK);
}
static int io_splice(struct io_kiocb *req, bool force_nonblock)
@@ -4153,25 +4155,62 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
return 1;
}
+static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
+ __acquires(&req->ctx->completion_lock)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!req->result && !READ_ONCE(poll->canceled)) {
+ struct poll_table_struct pt = { ._key = poll->events };
+
+ req->result = vfs_poll(req->file, &pt) & poll->events;
+ }
+
+ spin_lock_irq(&ctx->completion_lock);
+ if (!req->result && !READ_ONCE(poll->canceled)) {
+ add_wait_queue(poll->head, &poll->wait);
+ return true;
+ }
+
+ return false;
+}
+
static void io_async_task_func(struct callback_head *cb)
{
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
struct async_poll *apoll = req->apoll;
struct io_ring_ctx *ctx = req->ctx;
+ bool canceled;
trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
- WARN_ON_ONCE(!list_empty(&req->apoll->poll.wait.entry));
+ if (io_poll_rewait(req, &apoll->poll)) {
+ spin_unlock_irq(&ctx->completion_lock);
+ return;
+ }
- if (hash_hashed(&req->hash_node)) {
- spin_lock_irq(&ctx->completion_lock);
+ if (hash_hashed(&req->hash_node))
hash_del(&req->hash_node);
- spin_unlock_irq(&ctx->completion_lock);
+
+ canceled = READ_ONCE(apoll->poll.canceled);
+ if (canceled) {
+ io_cqring_fill_event(req, -ECANCELED);
+ io_commit_cqring(ctx);
}
+ spin_unlock_irq(&ctx->completion_lock);
+
/* restore ->work in case we need to retry again */
memcpy(&req->work, &apoll->work, sizeof(req->work));
+ if (canceled) {
+ kfree(apoll);
+ io_cqring_ev_posted(ctx);
+ req_set_fail_links(req);
+ io_double_put_req(req);
+ return;
+ }
+
__set_current_state(TASK_RUNNING);
mutex_lock(&ctx->uring_lock);
__io_queue_sqe(req, NULL);
@@ -4315,11 +4354,13 @@ static bool __io_poll_remove_one(struct io_kiocb *req,
static bool io_poll_remove_one(struct io_kiocb *req)
{
+ struct async_poll *apoll = NULL;
bool do_complete;
if (req->opcode == IORING_OP_POLL_ADD) {
do_complete = __io_poll_remove_one(req, &req->poll);
} else {
+ apoll = req->apoll;
/* non-poll requests have submit ref still */
do_complete = __io_poll_remove_one(req, &req->apoll->poll);
if (do_complete)
@@ -4328,6 +4369,14 @@ static bool io_poll_remove_one(struct io_kiocb *req)
hash_del(&req->hash_node);
+ if (do_complete && apoll) {
+ /*
+ * restore ->work because we need to call io_req_work_drop_env.
+ */
+ memcpy(&req->work, &apoll->work, sizeof(req->work));
+ kfree(apoll);
+ }
+
if (do_complete) {
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(req->ctx);
@@ -4342,7 +4391,7 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
{
struct hlist_node *tmp;
struct io_kiocb *req;
- int i;
+ int posted = 0, i;
spin_lock_irq(&ctx->completion_lock);
for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
@@ -4350,11 +4399,12 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
list = &ctx->cancel_hash[i];
hlist_for_each_entry_safe(req, tmp, list, hash_node)
- io_poll_remove_one(req);
+ posted += io_poll_remove_one(req);
}
spin_unlock_irq(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
+ if (posted)
+ io_cqring_ev_posted(ctx);
}
static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
@@ -4423,18 +4473,11 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
struct io_ring_ctx *ctx = req->ctx;
struct io_poll_iocb *poll = &req->poll;
- if (!req->result && !READ_ONCE(poll->canceled)) {
- struct poll_table_struct pt = { ._key = poll->events };
-
- req->result = vfs_poll(req->file, &pt) & poll->events;
- }
-
- spin_lock_irq(&ctx->completion_lock);
- if (!req->result && !READ_ONCE(poll->canceled)) {
- add_wait_queue(poll->head, &poll->wait);
+ if (io_poll_rewait(req, poll)) {
spin_unlock_irq(&ctx->completion_lock);
return;
}
+
hash_del(&req->hash_node);
io_poll_complete(req, req->result, 0);
req->flags |= REQ_F_COMP_LOCKED;
@@ -4665,11 +4708,12 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
static int io_timeout(struct io_kiocb *req)
{
- unsigned count;
struct io_ring_ctx *ctx = req->ctx;
struct io_timeout_data *data;
struct list_head *entry;
unsigned span = 0;
+ u32 count = req->timeout.count;
+ u32 seq = req->sequence;
data = &req->io->timeout;
@@ -4678,7 +4722,6 @@ static int io_timeout(struct io_kiocb *req)
* timeout event to be satisfied. If it isn't set, then this is
* a pure timeout request, sequence isn't used.
*/
- count = req->timeout.count;
if (!count) {
req->flags |= REQ_F_TIMEOUT_NOSEQ;
spin_lock_irq(&ctx->completion_lock);
@@ -4686,8 +4729,7 @@ static int io_timeout(struct io_kiocb *req)
goto add;
}
- req->sequence = ctx->cached_sq_head + count - 1;
- data->seq_offset = count;
+ req->sequence = seq + count;
/*
* Insertion sort, ensuring the first entry in the list is always
@@ -4696,26 +4738,26 @@ static int io_timeout(struct io_kiocb *req)
spin_lock_irq(&ctx->completion_lock);
list_for_each_prev(entry, &ctx->timeout_list) {
struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
- unsigned nxt_sq_head;
+ unsigned nxt_seq;
long long tmp, tmp_nxt;
- u32 nxt_offset = nxt->io->timeout.seq_offset;
+ u32 nxt_offset = nxt->timeout.count;
if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
continue;
/*
- * Since cached_sq_head + count - 1 can overflow, use type long
+ * Since seq + count can overflow, use type long
* long to store it.
*/
- tmp = (long long)ctx->cached_sq_head + count - 1;
- nxt_sq_head = nxt->sequence - nxt_offset + 1;
- tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
+ tmp = (long long)seq + count;
+ nxt_seq = nxt->sequence - nxt_offset;
+ tmp_nxt = (long long)nxt_seq + nxt_offset;
/*
* cached_sq_head may overflow, and it will never overflow twice
* once there is some timeout req still be valid.
*/
- if (ctx->cached_sq_head < nxt_sq_head)
+ if (seq < nxt_seq)
tmp += UINT_MAX;
if (tmp > tmp_nxt)
@@ -5476,7 +5518,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
struct io_kiocb *nxt;
- if (!(req->flags & REQ_F_LINK))
+ if (!(req->flags & REQ_F_LINK_HEAD))
return NULL;
/* for polled retry, if flag is set, we already went through here */
if (req->flags & REQ_F_POLLED)
@@ -5604,54 +5646,11 @@ static inline void io_queue_link_head(struct io_kiocb *req)
io_queue_sqe(req, NULL);
}
-#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
- IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
- IOSQE_BUFFER_SELECT)
-
-static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_submit_state *state, struct io_kiocb **link)
{
struct io_ring_ctx *ctx = req->ctx;
- unsigned int sqe_flags;
- int ret, id, fd;
-
- sqe_flags = READ_ONCE(sqe->flags);
-
- /* enforce forwards compatibility on users */
- if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
- ret = -EINVAL;
- goto err_req;
- }
-
- if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
- !io_op_defs[req->opcode].buffer_select) {
- ret = -EOPNOTSUPP;
- goto err_req;
- }
-
- id = READ_ONCE(sqe->personality);
- if (id) {
- req->work.creds = idr_find(&ctx->personality_idr, id);
- if (unlikely(!req->work.creds)) {
- ret = -EINVAL;
- goto err_req;
- }
- get_cred(req->work.creds);
- }
-
- /* same numerical values with corresponding REQ_F_*, safe to copy */
- req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
- IOSQE_ASYNC | IOSQE_FIXED_FILE |
- IOSQE_BUFFER_SELECT);
-
- fd = READ_ONCE(sqe->fd);
- ret = io_req_set_file(state, req, fd, sqe_flags);
- if (unlikely(ret)) {
-err_req:
- io_cqring_add_event(req, ret);
- io_double_put_req(req);
- return false;
- }
+ int ret;
/*
* If we already have a head request, queue this one for async
@@ -5670,42 +5669,39 @@ err_req:
* next after the link request. The last one is done via
* drain_next flag to persist the effect across calls.
*/
- if (sqe_flags & IOSQE_IO_DRAIN) {
+ if (req->flags & REQ_F_IO_DRAIN) {
head->flags |= REQ_F_IO_DRAIN;
ctx->drain_next = 1;
}
- if (io_alloc_async_ctx(req)) {
- ret = -EAGAIN;
- goto err_req;
- }
+ if (io_alloc_async_ctx(req))
+ return -EAGAIN;
ret = io_req_defer_prep(req, sqe);
if (ret) {
/* fail even hard links since we don't submit */
head->flags |= REQ_F_FAIL_LINK;
- goto err_req;
+ return ret;
}
trace_io_uring_link(ctx, req, head);
list_add_tail(&req->link_list, &head->link_list);
/* last request of a link, enqueue the link */
- if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) {
+ if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
io_queue_link_head(head);
*link = NULL;
}
} else {
if (unlikely(ctx->drain_next)) {
req->flags |= REQ_F_IO_DRAIN;
- req->ctx->drain_next = 0;
+ ctx->drain_next = 0;
}
- if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
- req->flags |= REQ_F_LINK;
+ if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+ req->flags |= REQ_F_LINK_HEAD;
INIT_LIST_HEAD(&req->link_list);
- if (io_alloc_async_ctx(req)) {
- ret = -EAGAIN;
- goto err_req;
- }
+ if (io_alloc_async_ctx(req))
+ return -EAGAIN;
+
ret = io_req_defer_prep(req, sqe);
if (ret)
req->flags |= REQ_F_FAIL_LINK;
@@ -5715,7 +5711,7 @@ err_req:
}
}
- return true;
+ return 0;
}
/*
@@ -5789,15 +5785,23 @@ static inline void io_consume_sqe(struct io_ring_ctx *ctx)
ctx->cached_sq_head++;
}
-static void io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
+ IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
+ IOSQE_BUFFER_SELECT)
+
+static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ const struct io_uring_sqe *sqe,
+ struct io_submit_state *state, bool async)
{
+ unsigned int sqe_flags;
+ int id, fd;
+
/*
* All io need record the previous position, if LINK vs DARIN,
* it can be used to mark the position of the first IO in the
* link list.
*/
- req->sequence = ctx->cached_sq_head;
+ req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
req->opcode = READ_ONCE(sqe->opcode);
req->user_data = READ_ONCE(sqe->user_data);
req->io = NULL;
@@ -5808,17 +5812,50 @@ static void io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
refcount_set(&req->refs, 2);
req->task = NULL;
req->result = 0;
+ req->needs_fixed_file = async;
INIT_IO_WORK(&req->work, io_wq_submit_work);
+
+ if (unlikely(req->opcode >= IORING_OP_LAST))
+ return -EINVAL;
+
+ if (io_op_defs[req->opcode].needs_mm && !current->mm) {
+ if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
+ return -EFAULT;
+ use_mm(ctx->sqo_mm);
+ }
+
+ sqe_flags = READ_ONCE(sqe->flags);
+ /* enforce forwards compatibility on users */
+ if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
+ return -EINVAL;
+
+ if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+ !io_op_defs[req->opcode].buffer_select)
+ return -EOPNOTSUPP;
+
+ id = READ_ONCE(sqe->personality);
+ if (id) {
+ req->work.creds = idr_find(&ctx->personality_idr, id);
+ if (unlikely(!req->work.creds))
+ return -EINVAL;
+ get_cred(req->work.creds);
+ }
+
+ /* same numerical values with corresponding REQ_F_*, safe to copy */
+ req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
+ IOSQE_ASYNC | IOSQE_FIXED_FILE |
+ IOSQE_BUFFER_SELECT | IOSQE_IO_LINK);
+
+ fd = READ_ONCE(sqe->fd);
+ return io_req_set_file(state, req, fd, sqe_flags);
}
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
- struct file *ring_file, int ring_fd,
- struct mm_struct **mm, bool async)
+ struct file *ring_file, int ring_fd, bool async)
{
struct io_submit_state state, *statep = NULL;
struct io_kiocb *link = NULL;
int i, submitted = 0;
- bool mm_fault = false;
/* if we have a backlog and couldn't flush it all, return BUSY */
if (test_bit(0, &ctx->sq_check_overflow)) {
@@ -5858,34 +5895,23 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
break;
}
- io_init_req(ctx, req, sqe);
+ err = io_init_req(ctx, req, sqe, statep, async);
io_consume_sqe(ctx);
/* will complete beyond this point, count as submitted */
submitted++;
- if (unlikely(req->opcode >= IORING_OP_LAST)) {
- err = -EINVAL;
+ if (unlikely(err)) {
fail_req:
io_cqring_add_event(req, err);
io_double_put_req(req);
break;
}
- if (io_op_defs[req->opcode].needs_mm && !*mm) {
- mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
- if (unlikely(mm_fault)) {
- err = -EFAULT;
- goto fail_req;
- }
- use_mm(ctx->sqo_mm);
- *mm = ctx->sqo_mm;
- }
-
- req->needs_fixed_file = async;
trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
true, async);
- if (!io_submit_sqe(req, sqe, statep, &link))
- break;
+ err = io_submit_sqe(req, sqe, statep, &link);
+ if (err)
+ goto fail_req;
}
if (unlikely(submitted != nr)) {
@@ -5904,10 +5930,19 @@ fail_req:
return submitted;
}
+static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (mm) {
+ unuse_mm(mm);
+ mmput(mm);
+ }
+}
+
static int io_sq_thread(void *data)
{
struct io_ring_ctx *ctx = data;
- struct mm_struct *cur_mm = NULL;
const struct cred *old_cred;
mm_segment_t old_fs;
DEFINE_WAIT(wait);
@@ -5948,11 +5983,7 @@ static int io_sq_thread(void *data)
* adding ourselves to the waitqueue, as the unuse/drop
* may sleep.
*/
- if (cur_mm) {
- unuse_mm(cur_mm);
- mmput(cur_mm);
- cur_mm = NULL;
- }
+ io_sq_thread_drop_mm(ctx);
/*
* We're polling. If we're within the defined idle
@@ -6016,7 +6047,7 @@ static int io_sq_thread(void *data)
}
mutex_lock(&ctx->uring_lock);
- ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
+ ret = io_submit_sqes(ctx, to_submit, NULL, -1, true);
mutex_unlock(&ctx->uring_lock);
timeout = jiffies + ctx->sq_thread_idle;
}
@@ -6025,10 +6056,7 @@ static int io_sq_thread(void *data)
task_work_run();
set_fs(old_fs);
- if (cur_mm) {
- unuse_mm(cur_mm);
- mmput(cur_mm);
- }
+ io_sq_thread_drop_mm(ctx);
revert_creds(old_cred);
kthread_parkme();
@@ -7509,13 +7537,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
wake_up(&ctx->sqo_wait);
submitted = to_submit;
} else if (to_submit) {
- struct mm_struct *cur_mm;
-
mutex_lock(&ctx->uring_lock);
- /* already have mm, so io_submit_sqes() won't try to grab it */
- cur_mm = ctx->sqo_mm;
- submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
- &cur_mm, false);
+ submitted = io_submit_sqes(ctx, to_submit, f.file, fd, false);
mutex_unlock(&ctx->uring_lock);
if (submitted != to_submit)