diff options
Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r-- | fs/eventpoll.c | 94 |
1 files changed, 67 insertions, 27 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 339453ac834c..8c0e94183186 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1803,6 +1803,25 @@ static inline struct timespec64 ep_set_mstimeout(long ms) return timespec64_add_safe(now, ts); } +/* + * autoremove_wake_function, but remove even on failure to wake up, because we + * know that default_wake_function/ttwu will only fail if the thread is already + * woken, and in that case the ep_poll loop will remove the entry anyways, not + * try to reuse it. + */ +static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, + unsigned int mode, int sync, void *key) +{ + int ret = default_wake_function(wq_entry, mode, sync, key); + + /* + * Pairs with list_empty_careful in ep_poll, and ensures future loop + * iterations see the cause of this wakeup. + */ + list_del_init_careful(&wq_entry->entry); + return ret; +} + /** * ep_poll - Retrieves ready events, and delivers them to the caller supplied * event buffer. @@ -1880,56 +1899,77 @@ fetch_events: * normal wakeup path no need to call __remove_wait_queue() * explicitly, thus ep->lock is not taken, which halts the * event delivery. + * + * In fact, we now use an even more aggressive function that + * unconditionally removes, because we don't reuse the wait + * entry between loop iterations. This lets us also avoid the + * performance issue if a process is killed, causing all of its + * threads to wake up without being removed normally. */ init_wait(&wait); + wait.func = ep_autoremove_wake_function; write_lock_irq(&ep->lock); - __add_wait_queue_exclusive(&ep->wq, &wait); - write_unlock_irq(&ep->lock); - /* - * We don't want to sleep if the ep_poll_callback() sends us - * a wakeup in between. That's why we set the task state - * to TASK_INTERRUPTIBLE before doing the checks. + * Barrierless variant, waitqueue_active() is called under + * the same lock on wakeup ep_poll_callback() side, so it + * is safe to avoid an explicit barrier. */ - set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_INTERRUPTIBLE); + /* - * Always short-circuit for fatal signals to allow - * threads to make a timely exit without the chance of - * finding more events available and fetching - * repeatedly. + * Do the final check under the lock. ep_scan_ready_list() + * plays with two lists (->rdllist and ->ovflist) and there + * is always a race when both lists are empty for short + * period of time although events are pending, so lock is + * important. */ - if (fatal_signal_pending(current)) { - res = -EINTR; - break; - } - eavail = ep_events_available(ep); - if (eavail) - break; - if (signal_pending(current)) { - res = -EINTR; - break; + if (!eavail) { + if (signal_pending(current)) + res = -EINTR; + else + __add_wait_queue_exclusive(&ep->wq, &wait); } + write_unlock_irq(&ep->lock); - if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) { - timed_out = 1; - break; - } + if (!eavail && !res) + timed_out = !schedule_hrtimeout_range(to, slack, + HRTIMER_MODE_ABS); - /* We were woken up, thus go and try to harvest some events */ + /* + * We were woken up, thus go and try to harvest some events. + * If timed out and still on the wait queue, recheck eavail + * carefully under lock, below. + */ eavail = 1; - } while (0); __set_current_state(TASK_RUNNING); if (!list_empty_careful(&wait.entry)) { write_lock_irq(&ep->lock); + /* + * If the thread timed out and is not on the wait queue, it + * means that the thread was woken up after its timeout expired + * before it could reacquire the lock. Thus, when wait.entry is + * empty, it needs to harvest events. + */ + if (timed_out) + eavail = list_empty(&wait.entry); __remove_wait_queue(&ep->wq, &wait); write_unlock_irq(&ep->lock); } send_events: + if (fatal_signal_pending(current)) { + /* + * Always short-circuit for fatal signals to allow + * threads to make a timely exit without the chance of + * finding more events available and fetching + * repeatedly. + */ + res = -EINTR; + } /* * Try to transfer events to user space. In case we get 0 events and * there's still timeout left over, we go trying again in search of |