aboutsummaryrefslogtreecommitdiffstats
path: root/io_uring/io_uring.c
diff options
context:
space:
mode:
Diffstat (limited to 'io_uring/io_uring.c')
-rw-r--r--io_uring/io_uring.c121
1 files changed, 54 insertions, 67 deletions
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index cd9a137ad6ce..adf944bb5a2f 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1176,12 +1176,11 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts)
static unsigned int handle_tw_list(struct llist_node *node,
struct io_ring_ctx **ctx,
- struct io_tw_state *ts,
- struct llist_node *last)
+ struct io_tw_state *ts)
{
unsigned int count = 0;
- while (node && node != last) {
+ do {
struct llist_node *next = node->next;
struct io_kiocb *req = container_of(node, struct io_kiocb,
io_task_work.node);
@@ -1205,7 +1204,7 @@ static unsigned int handle_tw_list(struct llist_node *node,
*ctx = NULL;
cond_resched();
}
- }
+ } while (node);
return count;
}
@@ -1224,22 +1223,6 @@ static inline struct llist_node *io_llist_xchg(struct llist_head *head,
return xchg(&head->first, new);
}
-/**
- * io_llist_cmpxchg - possibly swap all entries in a lock-less list
- * @head: the head of lock-less list to delete all entries
- * @old: expected old value of the first entry of the list
- * @new: new entry as the head of the list
- *
- * perform a cmpxchg on the first entry of the list.
- */
-
-static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head,
- struct llist_node *old,
- struct llist_node *new)
-{
- return cmpxchg(&head->first, old, new);
-}
-
static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync)
{
struct llist_node *node = llist_del_all(&tctx->task_list);
@@ -1274,9 +1257,7 @@ void tctx_task_work(struct callback_head *cb)
struct io_ring_ctx *ctx = NULL;
struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
task_work);
- struct llist_node fake = {};
struct llist_node *node;
- unsigned int loops = 0;
unsigned int count = 0;
if (unlikely(current->flags & PF_EXITING)) {
@@ -1284,21 +1265,9 @@ void tctx_task_work(struct callback_head *cb)
return;
}
- do {
- loops++;
- node = io_llist_xchg(&tctx->task_list, &fake);
- count += handle_tw_list(node, &ctx, &ts, &fake);
-
- /* skip expensive cmpxchg if there are items in the list */
- if (READ_ONCE(tctx->task_list.first) != &fake)
- continue;
- if (ts.locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) {
- io_submit_flush_completions(ctx);
- if (READ_ONCE(tctx->task_list.first) != &fake)
- continue;
- }
- node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
- } while (node != &fake);
+ node = llist_del_all(&tctx->task_list);
+ if (node)
+ count = handle_tw_list(node, &ctx, &ts);
ctx_flush_and_put(ctx, &ts);
@@ -1306,7 +1275,7 @@ void tctx_task_work(struct callback_head *cb)
if (unlikely(atomic_read(&tctx->in_cancel)))
io_uring_drop_tctx_refs(current);
- trace_io_uring_task_work_run(tctx, count, loops);
+ trace_io_uring_task_work_run(tctx, count, 1);
}
static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
@@ -1420,7 +1389,20 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
}
}
-static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts)
+static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
+ int min_events)
+{
+ if (llist_empty(&ctx->work_llist))
+ return false;
+ if (events < min_events)
+ return true;
+ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+ atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+ return false;
+}
+
+static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts,
+ int min_events)
{
struct llist_node *node;
unsigned int loops = 0;
@@ -1449,18 +1431,20 @@ again:
}
loops++;
- if (!llist_empty(&ctx->work_llist))
+ if (io_run_local_work_continue(ctx, ret, min_events))
goto again;
if (ts->locked) {
io_submit_flush_completions(ctx);
- if (!llist_empty(&ctx->work_llist))
+ if (io_run_local_work_continue(ctx, ret, min_events))
goto again;
}
+
trace_io_uring_local_work_run(ctx, ret, loops);
return ret;
}
-static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
+static inline int io_run_local_work_locked(struct io_ring_ctx *ctx,
+ int min_events)
{
struct io_tw_state ts = { .locked = true, };
int ret;
@@ -1468,20 +1452,20 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
if (llist_empty(&ctx->work_llist))
return 0;
- ret = __io_run_local_work(ctx, &ts);
+ ret = __io_run_local_work(ctx, &ts, min_events);
/* shouldn't happen! */
if (WARN_ON_ONCE(!ts.locked))
mutex_lock(&ctx->uring_lock);
return ret;
}
-static int io_run_local_work(struct io_ring_ctx *ctx)
+static int io_run_local_work(struct io_ring_ctx *ctx, int min_events)
{
struct io_tw_state ts = {};
int ret;
ts.locked = mutex_trylock(&ctx->uring_lock);
- ret = __io_run_local_work(ctx, &ts);
+ ret = __io_run_local_work(ctx, &ts, min_events);
if (ts.locked)
mutex_unlock(&ctx->uring_lock);
@@ -1677,7 +1661,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
io_task_work_pending(ctx)) {
u32 tail = ctx->cached_cq_tail;
- (void) io_run_local_work_locked(ctx);
+ (void) io_run_local_work_locked(ctx, min);
if (task_work_pending(current) ||
wq_list_empty(&ctx->iopoll_list)) {
@@ -2520,7 +2504,7 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx)
{
if (!llist_empty(&ctx->work_llist)) {
__set_current_state(TASK_RUNNING);
- if (io_run_local_work(ctx) > 0)
+ if (io_run_local_work(ctx, INT_MAX) > 0)
return 0;
}
if (io_run_task_work() > 0)
@@ -2543,7 +2527,7 @@ static bool current_pending_io(void)
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq)
{
- int io_wait, ret;
+ int ret;
if (unlikely(READ_ONCE(ctx->check_cq)))
return 1;
@@ -2561,7 +2545,6 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
* can take into account that the task is waiting for IO - turns out
* to be important for low QD IO.
*/
- io_wait = current->in_iowait;
if (current_pending_io())
current->in_iowait = 1;
ret = 0;
@@ -2569,7 +2552,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
schedule();
else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
ret = -ETIME;
- current->in_iowait = io_wait;
+ current->in_iowait = 0;
return ret;
}
@@ -2588,7 +2571,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
if (!io_allowed_run_tw(ctx))
return -EEXIST;
if (!llist_empty(&ctx->work_llist))
- io_run_local_work(ctx);
+ io_run_local_work(ctx, min_events);
io_run_task_work();
io_cqring_overflow_flush(ctx);
/* if user messes with these they will just get an early return */
@@ -2626,11 +2609,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
trace_io_uring_cqring_wait(ctx, min_events);
do {
+ int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
unsigned long check_cq;
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
- int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
-
atomic_set(&ctx->cq_wait_nr, nr_wait);
set_current_state(TASK_INTERRUPTIBLE);
} else {
@@ -2649,7 +2631,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
*/
io_run_task_work();
if (!llist_empty(&ctx->work_llist))
- io_run_local_work(ctx);
+ io_run_local_work(ctx, nr_wait);
/*
* Non-local task_work will be run on exit to userspace, but
@@ -2720,7 +2702,7 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
struct page **page_array;
unsigned int nr_pages;
void *page_addr;
- int ret, i;
+ int ret, i, pinned;
*npages = 0;
@@ -2734,12 +2716,12 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
if (!page_array)
return ERR_PTR(-ENOMEM);
- ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
- page_array);
- if (ret != nr_pages) {
-err:
- io_pages_free(&page_array, ret > 0 ? ret : 0);
- return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT);
+
+ pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+ page_array);
+ if (pinned != nr_pages) {
+ ret = (pinned < 0) ? pinned : -EFAULT;
+ goto free_pages;
}
page_addr = page_address(page_array[0]);
@@ -2753,7 +2735,7 @@ err:
* didn't support this feature.
*/
if (PageHighMem(page_array[i]))
- goto err;
+ goto free_pages;
/*
* No support for discontig pages for now, should either be a
@@ -2762,13 +2744,17 @@ err:
* just fail them with EINVAL.
*/
if (page_address(page_array[i]) != page_addr)
- goto err;
+ goto free_pages;
page_addr += PAGE_SIZE;
}
*pages = page_array;
*npages = nr_pages;
return page_to_virt(page_array[0]);
+
+free_pages:
+ io_pages_free(&page_array, pinned > 0 ? pinned : 0);
+ return ERR_PTR(ret);
}
static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
@@ -2790,14 +2776,15 @@ static void io_rings_free(struct io_ring_ctx *ctx)
if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
io_mem_free(ctx->rings);
io_mem_free(ctx->sq_sqes);
- ctx->rings = NULL;
- ctx->sq_sqes = NULL;
} else {
io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
ctx->n_ring_pages = 0;
io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
ctx->n_sqe_pages = 0;
}
+
+ ctx->rings = NULL;
+ ctx->sq_sqes = NULL;
}
void *io_mem_alloc(size_t size)
@@ -3304,7 +3291,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
io_allowed_defer_tw_run(ctx))
- ret |= io_run_local_work(ctx) > 0;
+ ret |= io_run_local_work(ctx, INT_MAX) > 0;
ret |= io_cancel_defer_files(ctx, task, cancel_all);
mutex_lock(&ctx->uring_lock);
ret |= io_poll_remove_all(ctx, task, cancel_all);
@@ -3666,7 +3653,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
* it should handle ownership problems if any.
*/
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
- (void)io_run_local_work_locked(ctx);
+ (void)io_run_local_work_locked(ctx, min_complete);
}
mutex_unlock(&ctx->uring_lock);
}