Merge tag 'io_uring-6.1-2022-11-25' of git://git.kernel.dk/linux
Pull io_uring fixes from Jens Axboe:
- A few poll related fixes. One fixing a race condition between poll
cancelation and trigger, and one making the overflow handling a bit
more robust (Lin, Pavel)
- Fix an fput() for error handling in the direct file table (Lin)
- Fix for a regression introduced in this cycle, where we don't always
get TIF_NOTIFY_SIGNAL cleared appropriately (me)
* tag 'io_uring-6.1-2022-11-25' of git://git.kernel.dk/linux:
io_uring: clear TIF_NOTIFY_SIGNAL if set and task_work not available
io_uring/poll: fix poll_refs race with cancelation
io_uring/filetable: fix file reference underflow
io_uring: make poll refs more robust
io_uring: cmpxchg for poll arm refs release
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index 7b47325..68dfc69 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -101,8 +101,6 @@
err:
if (needs_switch)
io_rsrc_node_switch(ctx, ctx->file_data);
- if (ret)
- fput(file);
return ret;
}
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index cef5ff9..50bc3af 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -238,9 +238,14 @@
static inline int io_run_task_work(void)
{
+ /*
+ * Always check-and-clear the task_work notification signal. With how
+ * signaling works for task_work, we can find it set with nothing to
+ * run. We need to clear it for that case, like get_signal() does.
+ */
+ if (test_thread_flag(TIF_NOTIFY_SIGNAL))
+ clear_notify_signal();
if (task_work_pending(current)) {
- if (test_thread_flag(TIF_NOTIFY_SIGNAL))
- clear_notify_signal();
__set_current_state(TASK_RUNNING);
task_work_run();
return 1;
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 055632e..d9bf176 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -40,7 +40,14 @@
};
#define IO_POLL_CANCEL_FLAG BIT(31)
-#define IO_POLL_REF_MASK GENMASK(30, 0)
+#define IO_POLL_RETRY_FLAG BIT(30)
+#define IO_POLL_REF_MASK GENMASK(29, 0)
+
+/*
+ * We usually have 1-2 refs taken, 128 is more than enough and we want to
+ * maximise the margin between this amount and the moment when it overflows.
+ */
+#define IO_POLL_REF_BIAS 128
#define IO_WQE_F_DOUBLE 1
@@ -58,6 +65,21 @@
return priv & IO_WQE_F_DOUBLE;
}
+static bool io_poll_get_ownership_slowpath(struct io_kiocb *req)
+{
+ int v;
+
+ /*
+ * poll_refs are already elevated and we don't have much hope for
+ * grabbing the ownership. Instead of incrementing set a retry flag
+ * to notify the loop that there might have been some change.
+ */
+ v = atomic_fetch_or(IO_POLL_RETRY_FLAG, &req->poll_refs);
+ if (v & IO_POLL_REF_MASK)
+ return false;
+ return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
+}
+
/*
* If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
* bump it and acquire ownership. It's disallowed to modify requests while not
@@ -66,6 +88,8 @@
*/
static inline bool io_poll_get_ownership(struct io_kiocb *req)
{
+ if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
+ return io_poll_get_ownership_slowpath(req);
return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}
@@ -235,6 +259,16 @@
*/
if ((v & IO_POLL_REF_MASK) != 1)
req->cqe.res = 0;
+ if (v & IO_POLL_RETRY_FLAG) {
+ req->cqe.res = 0;
+ /*
+ * We won't find new events that came in between
+ * vfs_poll and the ref put unless we clear the flag
+ * in advance.
+ */
+ atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs);
+ v &= ~IO_POLL_RETRY_FLAG;
+ }
/* the mask was stashed in __io_poll_execute */
if (!req->cqe.res) {
@@ -274,7 +308,8 @@
* Release all references, retry if someone tried to restart
* task_work while we were executing it.
*/
- } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));
+ } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs) &
+ IO_POLL_REF_MASK);
return IOU_POLL_NO_ACTION;
}
@@ -518,7 +553,6 @@
unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
- int v;
INIT_HLIST_NODE(&req->hash_node);
req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
@@ -586,11 +620,10 @@
if (ipt->owning) {
/*
- * Release ownership. If someone tried to queue a tw while it was
- * locked, kick it off for them.
+ * Try to release ownership. If we see a change of state, e.g.
+ * poll was waken up, queue up a tw, it'll deal with it.
*/
- v = atomic_dec_return(&req->poll_refs);
- if (unlikely(v & IO_POLL_REF_MASK))
+ if (atomic_cmpxchg(&req->poll_refs, 1, 0) != 1)
__io_poll_execute(req, 0);
}
return 0;