| // SPDX-License-Identifier: GPL-2.0 |
| /* |
| * Shared application/kernel submission and completion ring pairs, for |
| * supporting fast/efficient IO. |
| * |
| * A note on the read/write ordering memory barriers that are matched between |
| * the application and kernel side. |
| * |
| * After the application reads the CQ ring tail, it must use an |
| * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses |
| * before writing the tail (using smp_load_acquire to read the tail will |
| * do). It also needs a smp_mb() before updating CQ head (ordering the |
| * entry load(s) with the head store), pairing with an implicit barrier |
| * through a control-dependency in io_get_cqe (smp_store_release to |
| * store head will do). Failure to do so could lead to reading invalid |
| * CQ entries. |
| * |
| * Likewise, the application must use an appropriate smp_wmb() before |
| * writing the SQ tail (ordering SQ entry stores with the tail store), |
| * which pairs with smp_load_acquire in io_get_sqring (smp_store_release |
| * to store the tail will do). And it needs a barrier ordering the SQ |
| * head load before writing new SQ entries (smp_load_acquire to read |
| * head will do). |
| * |
| * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application |
| * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after* |
| * updating the SQ tail; a full memory barrier smp_mb() is needed |
| * between. |
| * |
| * Also see the examples in the liburing library: |
| * |
| * git://git.kernel.dk/liburing |
| * |
| * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens |
| * on data shared between the kernel and application. This is done both |
| * for ordering purposes and to ensure that once a value is loaded from |
| * data that the application could potentially modify, it remains stable. |
| * |
| * Copyright (C) 2018-2019 Jens Axboe |
| * Copyright (c) 2018-2019 Christoph Hellwig |
| */ |
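| |
| /* |
| * Illustrative sketch of the application side of the CQ protocol described |
| * above; not kernel code and not a definitive implementation (a real |
| * application would normally use liburing). The cq_head/cq_tail/cq_mask/cqes |
| * names are hypothetical stand-ins for the mmap'ed ring fields, and |
| * load_acquire()/store_release() for whatever the application uses to get |
| * acquire/release semantics (e.g. C11 atomics): |
| * |
| *	unsigned head = *cq_head; |
| *	unsigned tail = load_acquire(cq_tail);	// pairs with the kernel's tail store |
| * |
| *	while (head != tail) { |
| *		struct io_uring_cqe *cqe = &cqes[head & *cq_mask]; |
| *		handle_cqe(cqe); |
| *		head++; |
| *	} |
| *	store_release(cq_head, head);		// orders CQE loads before the head store |
| */ |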
| #include <linux/kernel.h> |
| #include <linux/init.h> |
| #include <linux/errno.h> |
| #include <linux/syscalls.h> |
| #include <linux/compat.h> |
| #include <net/compat.h> |
| #include <linux/refcount.h> |
| #include <linux/uio.h> |
| #include <linux/bits.h> |
| |
| #include <linux/sched/signal.h> |
| #include <linux/fs.h> |
| #include <linux/file.h> |
| #include <linux/fdtable.h> |
| #include <linux/mm.h> |
| #include <linux/mman.h> |
| #include <linux/percpu.h> |
| #include <linux/slab.h> |
| #include <linux/blk-mq.h> |
| #include <linux/bvec.h> |
| #include <linux/net.h> |
| #include <net/sock.h> |
| #include <net/af_unix.h> |
| #include <net/scm.h> |
| #include <linux/anon_inodes.h> |
| #include <linux/sched/mm.h> |
| #include <linux/uaccess.h> |
| #include <linux/nospec.h> |
| #include <linux/sizes.h> |
| #include <linux/hugetlb.h> |
| #include <linux/highmem.h> |
| #include <linux/namei.h> |
| #include <linux/fsnotify.h> |
| #include <linux/fadvise.h> |
| #include <linux/eventpoll.h> |
| #include <linux/splice.h> |
| #include <linux/task_work.h> |
| #include <linux/pagemap.h> |
| #include <linux/io_uring.h> |
| #include <linux/audit.h> |
| #include <linux/security.h> |
| #include <linux/xattr.h> |
| |
| #define CREATE_TRACE_POINTS |
| #include <trace/events/io_uring.h> |
| |
| #include <uapi/linux/io_uring.h> |
| |
| #include "internal.h" |
| #include "io-wq.h" |
| |
| #define IORING_MAX_ENTRIES 32768 |
| #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) |
| #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 |
| |
| /* only define max */ |
| #define IORING_MAX_FIXED_FILES (1U << 20) |
| #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ |
| IORING_REGISTER_LAST + IORING_OP_LAST) |
| |
| #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) |
| #define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) |
| #define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) |
| |
| #define IORING_MAX_REG_BUFFERS (1U << 14) |
| |
| #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ |
| IOSQE_IO_HARDLINK | IOSQE_ASYNC) |
| |
| #define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ |
| IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) |
| |
| #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ |
| REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ |
| REQ_F_ASYNC_DATA) |
| |
| #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ |
| IO_REQ_CLEAN_FLAGS) |
| |
| #define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) |
| |
| #define IO_TCTX_REFS_CACHE_NR (1U << 10) |
| |
| struct io_uring { |
| u32 head ____cacheline_aligned_in_smp; |
| u32 tail ____cacheline_aligned_in_smp; |
| }; |
| |
| /* |
| * This data is shared with the application through the mmap at offsets |
| * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING. |
| * |
| * The offsets to the member fields are published through struct |
| * io_sqring_offsets when calling io_uring_setup. |
| */ |
| struct io_rings { |
| /* |
| * Head and tail offsets into the ring; the offsets need to be |
| * masked to get valid indices. |
| * |
| * The kernel controls head of the sq ring and the tail of the cq ring, |
| * and the application controls tail of the sq ring and the head of the |
| * cq ring. |
| */ |
| struct io_uring sq, cq; |
| /* |
| * Bitmasks to apply to head and tail offsets (constant, equals |
| * ring_entries - 1) |
| */ |
| u32 sq_ring_mask, cq_ring_mask; |
| /* Ring sizes (constant, power of 2) */ |
| u32 sq_ring_entries, cq_ring_entries; |
| /* |
| * Number of invalid entries dropped by the kernel due to an |
| * invalid index stored in the array. |
| * |
| * Written by the kernel, shouldn't be modified by the |
| * application (i.e. get number of "new events" by comparing to |
| * cached value). |
| * |
| * After the application has read a new SQ head value, this |
| * counter includes all submissions that were dropped while |
| * reaching that new SQ head (and possibly more). |
| */ |
| u32 sq_dropped; |
| /* |
| * Runtime SQ flags |
| * |
| * Written by the kernel, shouldn't be modified by the |
| * application. |
| * |
| * The application needs a full memory barrier before checking |
| * for IORING_SQ_NEED_WAKEUP after updating the sq tail. |
| */ |
| atomic_t sq_flags; |
| /* |
| * Runtime CQ flags |
| * |
| * Written by the application, shouldn't be modified by the |
| * kernel. |
| */ |
| u32 cq_flags; |
| /* |
| * Number of completion events lost because the queue was full; |
| * this should be avoided by the application by making sure |
| * there are not more requests pending than there is space in |
| * the completion queue. |
| * |
| * Written by the kernel, shouldn't be modified by the |
| * application (i.e. get number of "new events" by comparing to |
| * cached value). |
| * |
| * As completion events come in out of order, this counter is not |
| * ordered with any other data. |
| */ |
| u32 cq_overflow; |
| /* |
| * Ring buffer of completion events. |
| * |
| * The kernel writes completion events fresh every time they are |
| * produced, so the application is allowed to modify pending |
| * entries. |
| */ |
| struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; |
| }; |
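| |
| /* |
| * Sketch of how an application typically maps this structure (normally |
| * done by liburing); sq_off is the struct io_sqring_offsets returned by |
| * io_uring_setup(), ring_fd the ring file descriptor: |
| * |
| *	void *sq_ring = mmap(NULL, sq_off.array + sq_entries * sizeof(__u32), |
| *			     PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, |
| *			     ring_fd, IORING_OFF_SQ_RING); |
| *	__u32 *sq_tail = (__u32 *)((char *)sq_ring + sq_off.tail); |
| */ |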
| |
| struct io_mapped_ubuf { |
| u64 ubuf; |
| u64 ubuf_end; |
| unsigned int nr_bvecs; |
| unsigned long acct_pages; |
| struct bio_vec bvec[]; |
| }; |
| |
| struct io_ring_ctx; |
| |
| struct io_overflow_cqe { |
| struct list_head list; |
| struct io_uring_cqe cqe; |
| }; |
| |
| /* |
| * FFS_SCM is only available on 64-bit archs; for 32-bit we just define it as 0 |
| * and define IO_URING_SCM_ALL. In that case we use SCM for all files, as we |
| * can't safely always dereference the file once the task has exited and ring |
| * cleanup is done. If a file is tracked and part of SCM, then unix gc on |
| * process exit may reap it before __io_sqe_files_unregister() is run. |
| */ |
| #define FFS_NOWAIT 0x1UL |
| #define FFS_ISREG 0x2UL |
| #if defined(CONFIG_64BIT) |
| #define FFS_SCM 0x4UL |
| #else |
| #define IO_URING_SCM_ALL |
| #define FFS_SCM 0x0UL |
| #endif |
| #define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) |
| |
| struct io_fixed_file { |
| /* file * with additional FFS_* flags */ |
| unsigned long file_ptr; |
| }; |
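| |
| /* |
| * A minimal sketch of the encoding implied by FFS_MASK above, for |
| * illustration only (the helpers this file actually uses may differ): |
| * a struct file * is at least 8-byte aligned, so the low bits are free |
| * to carry the FFS_* flags: |
| * |
| *	slot->file_ptr = (unsigned long)file | FFS_ISREG;	// tag a regular file |
| *	struct file *f = (struct file *)(slot->file_ptr & FFS_MASK); |
| *	bool isreg = slot->file_ptr & FFS_ISREG; |
| */ |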
| |
| struct io_rsrc_put { |
| struct list_head list; |
| u64 tag; |
| union { |
| void *rsrc; |
| struct file *file; |
| struct io_mapped_ubuf *buf; |
| }; |
| }; |
| |
| struct io_file_table { |
| struct io_fixed_file *files; |
| unsigned long *bitmap; |
| unsigned int alloc_hint; |
| }; |
| |
| struct io_rsrc_node { |
| struct percpu_ref refs; |
| struct list_head node; |
| struct list_head rsrc_list; |
| struct io_rsrc_data *rsrc_data; |
| struct llist_node llist; |
| bool done; |
| }; |
| |
| typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); |
| |
| struct io_rsrc_data { |
| struct io_ring_ctx *ctx; |
| |
| u64 **tags; |
| unsigned int nr; |
| rsrc_put_fn *do_put; |
| atomic_t refs; |
| struct completion done; |
| bool quiesce; |
| }; |
| |
| #define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) |
| struct io_buffer_list { |
| /* |
| * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, |
| * then these are classic provided buffers and ->buf_list is used. |
| */ |
| union { |
| struct list_head buf_list; |
| struct { |
| struct page **buf_pages; |
| struct io_uring_buf_ring *buf_ring; |
| }; |
| }; |
| __u16 bgid; |
| |
| /* below is for ring provided buffers */ |
| __u16 buf_nr_pages; |
| __u16 nr_entries; |
| __u16 head; |
| __u16 mask; |
| }; |
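| |
| /* |
| * For ring provided buffers, nr_entries is a power of two and mask is |
| * nr_entries - 1, so conceptually the next buffer is picked as (sketch |
| * only; the real selection also deals with the page layout of buf_ring): |
| * |
| *	struct io_uring_buf *buf = &bl->buf_ring->bufs[bl->head & bl->mask]; |
| */ |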
| |
| struct io_buffer { |
| struct list_head list; |
| __u64 addr; |
| __u32 len; |
| __u16 bid; |
| __u16 bgid; |
| }; |
| |
| struct io_restriction { |
| DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); |
| DECLARE_BITMAP(sqe_op, IORING_OP_LAST); |
| u8 sqe_flags_allowed; |
| u8 sqe_flags_required; |
| bool registered; |
| }; |
| |
| enum { |
| IO_SQ_THREAD_SHOULD_STOP = 0, |
| IO_SQ_THREAD_SHOULD_PARK, |
| }; |
| |
| struct io_sq_data { |
| refcount_t refs; |
| atomic_t park_pending; |
| struct mutex lock; |
| |
| /* ctx's that are using this sqd */ |
| struct list_head ctx_list; |
| |
| struct task_struct *thread; |
| struct wait_queue_head wait; |
| |
| unsigned sq_thread_idle; |
| int sq_cpu; |
| pid_t task_pid; |
| pid_t task_tgid; |
| |
| unsigned long state; |
| struct completion exited; |
| }; |
| |
| #define IO_COMPL_BATCH 32 |
| #define IO_REQ_CACHE_SIZE 32 |
| #define IO_REQ_ALLOC_BATCH 8 |
| |
| struct io_submit_link { |
| struct io_kiocb *head; |
| struct io_kiocb *last; |
| }; |
| |
| struct io_submit_state { |
| /* inline/task_work completion list, under ->uring_lock */ |
| struct io_wq_work_node free_list; |
| /* batch completion logic */ |
| struct io_wq_work_list compl_reqs; |
| struct io_submit_link link; |
| |
| bool plug_started; |
| bool need_plug; |
| bool flush_cqes; |
| unsigned short submit_nr; |
| struct blk_plug plug; |
| }; |
| |
| struct io_ev_fd { |
| struct eventfd_ctx *cq_ev_fd; |
| unsigned int eventfd_async: 1; |
| struct rcu_head rcu; |
| }; |
| |
| #define BGID_ARRAY 64 |
| |
| struct io_ring_ctx { |
| /* const or read-mostly hot data */ |
| struct { |
| struct percpu_ref refs; |
| |
| struct io_rings *rings; |
| unsigned int flags; |
| enum task_work_notify_mode notify_method; |
| unsigned int compat: 1; |
| unsigned int drain_next: 1; |
| unsigned int restricted: 1; |
| unsigned int off_timeout_used: 1; |
| unsigned int drain_active: 1; |
| unsigned int drain_disabled: 1; |
| unsigned int has_evfd: 1; |
| unsigned int syscall_iopoll: 1; |
| } ____cacheline_aligned_in_smp; |
| |
| /* submission data */ |
| struct { |
| struct mutex uring_lock; |
| |
| /* |
| * Ring buffer of indices into array of io_uring_sqe, which is |
| * mmapped by the application using the IORING_OFF_SQES offset. |
| * |
| * This indirection could e.g. be used to assign fixed |
| * io_uring_sqe entries to operations and only submit them to |
| * the queue when needed. |
| * |
| * The kernel modifies neither the indices array nor the entries |
| * array. |
| */ |
| u32 *sq_array; |
| struct io_uring_sqe *sq_sqes; |
| unsigned cached_sq_head; |
| unsigned sq_entries; |
| struct list_head defer_list; |
| |
| /* |
| * Fixed resources fast path, should be accessed only under |
| * uring_lock, and updated through io_uring_register(2) |
| */ |
| struct io_rsrc_node *rsrc_node; |
| int rsrc_cached_refs; |
| atomic_t cancel_seq; |
| struct io_file_table file_table; |
| unsigned nr_user_files; |
| unsigned nr_user_bufs; |
| struct io_mapped_ubuf **user_bufs; |
| |
| struct io_submit_state submit_state; |
| |
| struct io_buffer_list *io_bl; |
| struct xarray io_bl_xa; |
| struct list_head io_buffers_cache; |
| |
| struct list_head timeout_list; |
| struct list_head ltimeout_list; |
| struct list_head cq_overflow_list; |
| struct list_head apoll_cache; |
| struct xarray personalities; |
| u32 pers_next; |
| unsigned sq_thread_idle; |
| } ____cacheline_aligned_in_smp; |
| |
| /* IRQ completion list, under ->completion_lock */ |
| struct io_wq_work_list locked_free_list; |
| unsigned int locked_free_nr; |
| |
| const struct cred *sq_creds; /* cred used for __io_sq_thread() */ |
| struct io_sq_data *sq_data; /* if using sq thread polling */ |
| |
| struct wait_queue_head sqo_sq_wait; |
| struct list_head sqd_list; |
| |
| unsigned long check_cq; |
| |
| struct { |
| /* |
| * We cache a range of free CQEs we can use; once exhausted, it |
| * should go through a slower range setup, see __io_get_cqe(). |
| */ |
| struct io_uring_cqe *cqe_cached; |
| struct io_uring_cqe *cqe_sentinel; |
| |
| unsigned cached_cq_tail; |
| unsigned cq_entries; |
| struct io_ev_fd __rcu *io_ev_fd; |
| struct wait_queue_head cq_wait; |
| unsigned cq_extra; |
| atomic_t cq_timeouts; |
| unsigned cq_last_tm_flush; |
| } ____cacheline_aligned_in_smp; |
| |
| struct { |
| spinlock_t completion_lock; |
| |
| spinlock_t timeout_lock; |
| |
| /* |
| * ->iopoll_list is protected by the ctx->uring_lock for |
| * io_uring instances that don't use IORING_SETUP_SQPOLL. |
| * For SQPOLL, only the single threaded io_sq_thread() will |
| * manipulate the list, hence no extra locking is needed there. |
| */ |
| struct io_wq_work_list iopoll_list; |
| struct hlist_head *cancel_hash; |
| unsigned cancel_hash_bits; |
| bool poll_multi_queue; |
| |
| struct list_head io_buffers_comp; |
| } ____cacheline_aligned_in_smp; |
| |
| struct io_restriction restrictions; |
| |
| /* slow path rsrc auxiliary data, used by update/register */ |
| struct { |
| struct io_rsrc_node *rsrc_backup_node; |
| struct io_mapped_ubuf *dummy_ubuf; |
| struct io_rsrc_data *file_data; |
| struct io_rsrc_data *buf_data; |
| |
| struct delayed_work rsrc_put_work; |
| struct llist_head rsrc_put_llist; |
| struct list_head rsrc_ref_list; |
| spinlock_t rsrc_ref_lock; |
| |
| struct list_head io_buffers_pages; |
| }; |
| |
| /* Keep this last, we don't need it for the fast path */ |
| struct { |
| #if defined(CONFIG_UNIX) |
| struct socket *ring_sock; |
| #endif |
| /* hashed buffered write serialization */ |
| struct io_wq_hash *hash_map; |
| |
| /* Only used for accounting purposes */ |
| struct user_struct *user; |
| struct mm_struct *mm_account; |
| |
| /* ctx exit and cancelation */ |
| struct llist_head fallback_llist; |
| struct delayed_work fallback_work; |
| struct work_struct exit_work; |
| struct list_head tctx_list; |
| struct completion ref_comp; |
| u32 iowq_limits[2]; |
| bool iowq_limits_set; |
| }; |
| }; |
| |
| /* |
| * Arbitrary limit, can be raised if need be |
| */ |
| #define IO_RINGFD_REG_MAX 16 |
| |
| struct io_uring_task { |
| /* submission side */ |
| int cached_refs; |
| struct xarray xa; |
| struct wait_queue_head wait; |
| const struct io_ring_ctx *last; |
| struct io_wq *io_wq; |
| struct percpu_counter inflight; |
| atomic_t inflight_tracked; |
| atomic_t in_idle; |
| |
| spinlock_t task_lock; |
| struct io_wq_work_list task_list; |
| struct io_wq_work_list prio_task_list; |
| struct callback_head task_work; |
| struct file **registered_rings; |
| bool task_running; |
| }; |
| |
| /* |
| * First field must be the file pointer in all the |
| * iocb unions! See also 'struct kiocb' in <linux/fs.h> |
| */ |
| struct io_poll_iocb { |
| struct file *file; |
| struct wait_queue_head *head; |
| __poll_t events; |
| struct wait_queue_entry wait; |
| }; |
| |
| struct io_poll_update { |
| struct file *file; |
| u64 old_user_data; |
| u64 new_user_data; |
| __poll_t events; |
| bool update_events; |
| bool update_user_data; |
| }; |
| |
| struct io_close { |
| struct file *file; |
| int fd; |
| u32 file_slot; |
| }; |
| |
| struct io_timeout_data { |
| struct io_kiocb *req; |
| struct hrtimer timer; |
| struct timespec64 ts; |
| enum hrtimer_mode mode; |
| u32 flags; |
| }; |
| |
| struct io_accept { |
| struct file *file; |
| struct sockaddr __user *addr; |
| int __user *addr_len; |
| int flags; |
| u32 file_slot; |
| unsigned long nofile; |
| }; |
| |
| struct io_socket { |
| struct file *file; |
| int domain; |
| int type; |
| int protocol; |
| int flags; |
| u32 file_slot; |
| unsigned long nofile; |
| }; |
| |
| struct io_sync { |
| struct file *file; |
| loff_t len; |
| loff_t off; |
| int flags; |
| int mode; |
| }; |
| |
| struct io_cancel { |
| struct file *file; |
| u64 addr; |
| u32 flags; |
| s32 fd; |
| }; |
| |
| struct io_timeout { |
| struct file *file; |
| u32 off; |
| u32 target_seq; |
| struct list_head list; |
| /* head of the link, used by linked timeouts only */ |
| struct io_kiocb *head; |
| /* for linked completions */ |
| struct io_kiocb *prev; |
| }; |
| |
| struct io_timeout_rem { |
| struct file *file; |
| u64 addr; |
| |
| /* timeout update */ |
| struct timespec64 ts; |
| u32 flags; |
| bool ltimeout; |
| }; |
| |
| struct io_rw { |
| /* NOTE: kiocb has the file as the first member, so don't do it here */ |
| struct kiocb kiocb; |
| u64 addr; |
| u32 len; |
| rwf_t flags; |
| }; |
| |
| struct io_connect { |
| struct file *file; |
| struct sockaddr __user *addr; |
| int addr_len; |
| }; |
| |
| struct io_sr_msg { |
| struct file *file; |
| union { |
| struct compat_msghdr __user *umsg_compat; |
| struct user_msghdr __user *umsg; |
| void __user *buf; |
| }; |
| int msg_flags; |
| size_t len; |
| size_t done_io; |
| unsigned int flags; |
| }; |
| |
| struct io_open { |
| struct file *file; |
| int dfd; |
| u32 file_slot; |
| struct filename *filename; |
| struct open_how how; |
| unsigned long nofile; |
| }; |
| |
| struct io_rsrc_update { |
| struct file *file; |
| u64 arg; |
| u32 nr_args; |
| u32 offset; |
| }; |
| |
| struct io_fadvise { |
| struct file *file; |
| u64 offset; |
| u32 len; |
| u32 advice; |
| }; |
| |
| struct io_madvise { |
| struct file *file; |
| u64 addr; |
| u32 len; |
| u32 advice; |
| }; |
| |
| struct io_epoll { |
| struct file *file; |
| int epfd; |
| int op; |
| int fd; |
| struct epoll_event event; |
| }; |
| |
| struct io_splice { |
| struct file *file_out; |
| loff_t off_out; |
| loff_t off_in; |
| u64 len; |
| int splice_fd_in; |
| unsigned int flags; |
| }; |
| |
| struct io_provide_buf { |
| struct file *file; |
| __u64 addr; |
| __u32 len; |
| __u32 bgid; |
| __u16 nbufs; |
| __u16 bid; |
| }; |
| |
| struct io_statx { |
| struct file *file; |
| int dfd; |
| unsigned int mask; |
| unsigned int flags; |
| struct filename *filename; |
| struct statx __user *buffer; |
| }; |
| |
| struct io_shutdown { |
| struct file *file; |
| int how; |
| }; |
| |
| struct io_rename { |
| struct file *file; |
| int old_dfd; |
| int new_dfd; |
| struct filename *oldpath; |
| struct filename *newpath; |
| int flags; |
| }; |
| |
| struct io_unlink { |
| struct file *file; |
| int dfd; |
| int flags; |
| struct filename *filename; |
| }; |
| |
| struct io_mkdir { |
| struct file *file; |
| int dfd; |
| umode_t mode; |
| struct filename *filename; |
| }; |
| |
| struct io_symlink { |
| struct file *file; |
| int new_dfd; |
| struct filename *oldpath; |
| struct filename *newpath; |
| }; |
| |
| struct io_hardlink { |
| struct file *file; |
| int old_dfd; |
| int new_dfd; |
| struct filename *oldpath; |
| struct filename *newpath; |
| int flags; |
| }; |
| |
| struct io_msg { |
| struct file *file; |
| u64 user_data; |
| u32 len; |
| }; |
| |
| struct io_async_connect { |
| struct sockaddr_storage address; |
| }; |
| |
| struct io_async_msghdr { |
| struct iovec fast_iov[UIO_FASTIOV]; |
| /* points to an allocated iov, if NULL we use fast_iov instead */ |
| struct iovec *free_iov; |
| struct sockaddr __user *uaddr; |
| struct msghdr msg; |
| struct sockaddr_storage addr; |
| }; |
| |
| struct io_rw_state { |
| struct iov_iter iter; |
| struct iov_iter_state iter_state; |
| struct iovec fast_iov[UIO_FASTIOV]; |
| }; |
| |
| struct io_async_rw { |
| struct io_rw_state s; |
| const struct iovec *free_iovec; |
| size_t bytes_done; |
| struct wait_page_queue wpq; |
| }; |
| |
| struct io_xattr { |
| struct file *file; |
| struct xattr_ctx ctx; |
| struct filename *filename; |
| }; |
| |
| enum { |
| REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, |
| REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, |
| REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, |
| REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, |
| REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, |
| REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, |
| REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT, |
| |
| /* first byte is taken by user flags, shift it to not overlap */ |
| REQ_F_FAIL_BIT = 8, |
| REQ_F_INFLIGHT_BIT, |
| REQ_F_CUR_POS_BIT, |
| REQ_F_NOWAIT_BIT, |
| REQ_F_LINK_TIMEOUT_BIT, |
| REQ_F_NEED_CLEANUP_BIT, |
| REQ_F_POLLED_BIT, |
| REQ_F_BUFFER_SELECTED_BIT, |
| REQ_F_BUFFER_RING_BIT, |
| REQ_F_COMPLETE_INLINE_BIT, |
| REQ_F_REISSUE_BIT, |
| REQ_F_CREDS_BIT, |
| REQ_F_REFCOUNT_BIT, |
| REQ_F_ARM_LTIMEOUT_BIT, |
| REQ_F_ASYNC_DATA_BIT, |
| REQ_F_SKIP_LINK_CQES_BIT, |
| REQ_F_SINGLE_POLL_BIT, |
| REQ_F_DOUBLE_POLL_BIT, |
| REQ_F_PARTIAL_IO_BIT, |
| REQ_F_CQE32_INIT_BIT, |
| REQ_F_APOLL_MULTISHOT_BIT, |
| /* keep async read/write and isreg together and in order */ |
| REQ_F_SUPPORT_NOWAIT_BIT, |
| REQ_F_ISREG_BIT, |
| |
| /* not a real bit, just to check we're not overflowing the space */ |
| __REQ_F_LAST_BIT, |
| }; |
| |
| enum { |
| /* ctx owns file */ |
| REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), |
| /* drain existing IO first */ |
| REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), |
| /* linked sqes */ |
| REQ_F_LINK = BIT(REQ_F_LINK_BIT), |
| /* doesn't sever on completion < 0 */ |
| REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), |
| /* IOSQE_ASYNC */ |
| REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), |
| /* IOSQE_BUFFER_SELECT */ |
| REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), |
| /* IOSQE_CQE_SKIP_SUCCESS */ |
| REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), |
| |
| /* fail rest of links */ |
| REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), |
| /* on inflight list; should be cancelled and reliably waited on at exit */ |
| REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), |
| /* read/write uses file position */ |
| REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), |
| /* must not punt to workers */ |
| REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), |
| /* has or had linked timeout */ |
| REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), |
| /* needs cleanup */ |
| REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), |
| /* already went through poll handler */ |
| REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), |
| /* buffer already selected */ |
| REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), |
| /* buffer selected from ring, needs commit */ |
| REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), |
| /* completion is deferred through io_comp_state */ |
| REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), |
| /* caller should reissue async */ |
| REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), |
| /* supports async reads/writes */ |
| REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), |
| /* regular file */ |
| REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), |
| /* has creds assigned */ |
| REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), |
| /* skip refcounting if not set */ |
| REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), |
| /* there is a linked timeout that has to be armed */ |
| REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), |
| /* ->async_data allocated */ |
| REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), |
| /* don't post CQEs while failing linked requests */ |
| REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), |
| /* single poll may be active */ |
| REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), |
| /* double poll may be active */ |
| REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), |
| /* request has already done partial IO */ |
| REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), |
| /* fast poll multishot mode */ |
| REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), |
| /* ->extra1 and ->extra2 are initialised */ |
| REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), |
| }; |
| |
| struct async_poll { |
| struct io_poll_iocb poll; |
| struct io_poll_iocb *double_poll; |
| }; |
| |
| typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); |
| |
| struct io_task_work { |
| union { |
| struct io_wq_work_node node; |
| struct llist_node fallback_node; |
| }; |
| io_req_tw_func_t func; |
| }; |
| |
| enum { |
| IORING_RSRC_FILE = 0, |
| IORING_RSRC_BUFFER = 1, |
| }; |
| |
| struct io_cqe { |
| __u64 user_data; |
| __s32 res; |
| /* fd initially, then cflags for completion */ |
| union { |
| __u32 flags; |
| int fd; |
| }; |
| }; |
| |
| enum { |
| IO_CHECK_CQ_OVERFLOW_BIT, |
| IO_CHECK_CQ_DROPPED_BIT, |
| }; |
| |
| /* |
| * NOTE! Each of the iocb union members has the file pointer |
| * as the first entry in their struct definition. So you can |
| * access the file pointer through any of the sub-structs, |
| * or directly as just 'file' in this struct. |
| */ |
| struct io_kiocb { |
| union { |
| struct file *file; |
| struct io_rw rw; |
| struct io_poll_iocb poll; |
| struct io_poll_update poll_update; |
| struct io_accept accept; |
| struct io_sync sync; |
| struct io_cancel cancel; |
| struct io_timeout timeout; |
| struct io_timeout_rem timeout_rem; |
| struct io_connect connect; |
| struct io_sr_msg sr_msg; |
| struct io_open open; |
| struct io_close close; |
| struct io_rsrc_update rsrc_update; |
| struct io_fadvise fadvise; |
| struct io_madvise madvise; |
| struct io_epoll epoll; |
| struct io_splice splice; |
| struct io_provide_buf pbuf; |
| struct io_statx statx; |
| struct io_shutdown shutdown; |
| struct io_rename rename; |
| struct io_unlink unlink; |
| struct io_mkdir mkdir; |
| struct io_symlink symlink; |
| struct io_hardlink hardlink; |
| struct io_msg msg; |
| struct io_xattr xattr; |
| struct io_socket sock; |
| struct io_uring_cmd uring_cmd; |
| }; |
| |
| u8 opcode; |
| /* polled IO has completed */ |
| u8 iopoll_completed; |
| /* |
| * Can be either a fixed buffer index, or used with provided buffers. |
| * For the latter, before issue it points to the buffer group ID, |
| * and after selection it points to the buffer ID itself. |
| */ |
| u16 buf_index; |
| unsigned int flags; |
| |
| struct io_cqe cqe; |
| |
| struct io_ring_ctx *ctx; |
| struct task_struct *task; |
| |
| struct io_rsrc_node *rsrc_node; |
| |
| union { |
| /* store used ubuf, so we can prevent reloading */ |
| struct io_mapped_ubuf *imu; |
| |
| /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ |
| struct io_buffer *kbuf; |
| |
| /* |
| * stores the buffer list a ring provided buffer was selected from, |
| * valid IFF REQ_F_BUFFER_RING is set. |
| */ |
| struct io_buffer_list *buf_list; |
| }; |
| |
| union { |
| /* used by request caches, completion batching and iopoll */ |
| struct io_wq_work_node comp_list; |
| /* cache ->apoll->events */ |
| __poll_t apoll_events; |
| }; |
| atomic_t refs; |
| atomic_t poll_refs; |
| struct io_task_work io_task_work; |
| /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ |
| union { |
| struct hlist_node hash_node; |
| struct { |
| u64 extra1; |
| u64 extra2; |
| }; |
| }; |
| /* internal polling, see IORING_FEAT_FAST_POLL */ |
| struct async_poll *apoll; |
| /* opcode allocated if it needs to store data for async defer */ |
| void *async_data; |
| /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ |
| struct io_kiocb *link; |
| /* custom credentials, valid IFF REQ_F_CREDS is set */ |
| const struct cred *creds; |
| struct io_wq_work work; |
| }; |
| |
| struct io_tctx_node { |
| struct list_head ctx_node; |
| struct task_struct *task; |
| struct io_ring_ctx *ctx; |
| }; |
| |
| struct io_defer_entry { |
| struct list_head list; |
| struct io_kiocb *req; |
| u32 seq; |
| }; |
| |
| struct io_cancel_data { |
| struct io_ring_ctx *ctx; |
| union { |
| u64 data; |
| struct file *file; |
| }; |
| u32 flags; |
| int seq; |
| }; |
| |
| /* |
| * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into |
| * the following sqe if SQE128 is used. |
| */ |
| #define uring_cmd_pdu_size(is_sqe128) \ |
| ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \ |
| offsetof(struct io_uring_sqe, cmd)) |
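| |
| /* |
| * For example, with 64-byte SQEs and cmd starting at offset 48 (the current |
| * UAPI layout), this works out to 16 bytes of inline payload for a normal |
| * SQE, or 80 bytes when IORING_SETUP_SQE128 is enabled. |
| */ |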
| |
| struct io_op_def { |
| /* needs req->file assigned */ |
| unsigned needs_file : 1; |
| /* should use a blk_plug when submitting */ |
| unsigned plug : 1; |
| /* hash wq insertion if file is a regular file */ |
| unsigned hash_reg_file : 1; |
| /* unbound wq insertion if file is a non-regular file */ |
| unsigned unbound_nonreg_file : 1; |
| /* set if opcode supports polled "wait" */ |
| unsigned pollin : 1; |
| unsigned pollout : 1; |
| unsigned poll_exclusive : 1; |
| /* op supports buffer selection */ |
| unsigned buffer_select : 1; |
| /* do async prep if the request is going to be punted */ |
| unsigned needs_async_setup : 1; |
| /* opcode is not supported by this kernel */ |
| unsigned not_supported : 1; |
| /* skip auditing */ |
| unsigned audit_skip : 1; |
| /* supports ioprio */ |
| unsigned ioprio : 1; |
| /* supports iopoll */ |
| unsigned iopoll : 1; |
| /* size of async data needed, if any */ |
| unsigned short async_size; |
| }; |
| |
| static const struct io_op_def io_op_defs[] = { |
| [IORING_OP_NOP] = { |
| .audit_skip = 1, |
| .iopoll = 1, |
| }, |
| [IORING_OP_READV] = { |
| .needs_file = 1, |
| .unbound_nonreg_file = 1, |
| .pollin = 1, |
| .buffer_select = 1, |
| .needs_async_setup = 1, |
| .plug = 1, |
| .audit_skip = 1, |
| .ioprio = 1, |
| .iopoll = 1, |
| .async_size = sizeof(struct io_async_rw), |
| }, |
| [IORING_OP_WRITEV] = { |
| .needs_file = 1, |
| .hash_reg_file = 1, |
| .unbound_nonreg_file = 1, |
| .pollout = 1, |
| .needs_async_setup = 1, |
| .plug = 1, |
| .audit_skip = 1, |
| .ioprio = 1, |
| .iopoll = 1, |
| .async_size = sizeof(struct io_async_rw), |
| }, |
| [IORING_OP_FSYNC] = { |
| .needs_file = 1, |
| .audit_skip = 1, |
| }, |
| [IORING_OP_READ_FIXED] = { |
| .needs_file = 1, |
| .unbound_nonreg_file = 1, |
| .pollin = 1, |
| .plug = 1, |
| .audit_skip = 1, |
| .ioprio = 1, |
| .iopoll = 1, |
| .async_size = sizeof(struct io_async_rw), |
| }, |
| [IORING_OP_WRITE_FIXED] = { |
| .needs_file = 1, |
| .hash_reg_file = 1, |
| .unbound_nonreg_file = 1, |
| .pollout = 1, |
| .plug = 1, |
| .audit_skip = 1, |
| .ioprio = 1, |
| .iopoll = 1, |
| .async_size = sizeof(struct io_async_rw), |
| }, |
| [IORING_OP_POLL_ADD] = { |
| .needs_file = 1, |
| .unbound_nonreg_file = 1, |
| .audit_skip = 1, |
| }, |
| [IORING_OP_POLL_REMOVE] = { |
| .audit_skip = 1, |
| }, |
| [IORING_OP_SYNC_FILE_RANGE] = { |
| .needs_file = 1, |
| .audit_skip = 1, |
| }, |
| [IORING_OP_SENDMSG] = { |
| .needs_file = 1, |
| .unbound_nonreg_file = 1, |
| .pollout = 1, |
| .needs_async_setup = 1, |
| .ioprio = 1, |
| .async_size = sizeof(struct io_async_msghdr), |
| }, |
| [IORING_OP_RECVMSG] = { |
| .needs_file = 1, |
| .unbound_nonreg_file = 1, |
| .pollin = 1, |
| .buffer_select = 1, |
| .needs_async_setup = 1, |
| .ioprio = 1, |
| .async_size = sizeof(struct io_async_msghdr), |
| }, |
| [IORING_OP_TIMEOUT] = { |
| .audit_skip = 1, |
| .async_size = sizeof(struct io_timeout_data), |
| }, |
| [IORING_OP_TIMEOUT_REMOVE] = { |
| /* used by timeout updates' prep() */ |
| .audit_skip = 1, |
| }, |
| [IORING_OP_ACCEPT] = { |
| .needs_file = 1, |
| .unbound_nonreg_file = 1, |
| .pollin = 1, |
| .poll_exclusive = 1, |
| .ioprio = 1, /* used for flags */ |
| }, |
| [IORING_OP_ASYNC_CANCEL] = { |
| .audit_skip = 1, |
| }, |
| [IORING_OP_LINK_TIMEOUT] = { |
| .audit_skip = 1, |
| .async_size = sizeof(struct io_timeout_data), |
| }, |
| [IORING_OP_CONNECT] = { |
| .needs_file = 1, |
| .unbound_nonreg_file = 1, |
| .pollout = 1, |
| .needs_async_setup = 1, |
| .async_size = sizeof(struct io_async_connect), |
| }, |
| [IORING_OP_FALLOCATE] = { |
| .needs_file = 1, |
| }, |
| [IORING_OP_OPENAT] = {}, |
| [IORING_OP_CLOSE] = {}, |
| [IORING_OP_FILES_UPDATE] = { |
| .audit_skip = 1, |
| .iopoll = 1, |
| }, |
| [IORING_OP_STATX] = { |
| .audit_skip = 1, |
| }, |
| [IORING_OP_READ] = { |
| .needs_file = 1, |
| .unbound_nonreg_file = 1, |
| .pollin = 1, |
| .buffer_select = 1, |
| .plug = 1, |
| .audit_skip = 1, |
| .ioprio = 1, |
| .iopoll = 1, |
| .async_size = sizeof(struct io_async_rw), |
| }, |
| [IORING_OP_WRITE] = { |
| .needs_file = 1, |
| .hash_reg_file = 1, |
| .unbound_nonreg_file = 1, |
| .pollout = 1, |
| .plug = 1, |
| .audit_skip = 1, |
| .ioprio = 1, |
| .iopoll = 1, |
| .async_size = sizeof(struct io_async_rw), |
| }, |
| [IORING_OP_FADVISE] = { |
| .needs_file = 1, |
| .audit_skip = 1, |
| }, |
| [IORING_OP_MADVISE] = {}, |
| [IORING_OP_SEND] = { |
| .needs_file = 1, |
| .unbound_nonreg_file = 1, |
| .pollout = 1, |
| .audit_skip = 1, |
| .ioprio = 1, |
| }, |
| [IORING_OP_RECV] = { |
| .needs_file = 1, |
| .unbound_nonreg_file = 1, |
| .pollin = 1, |
| .buffer_select = 1, |
| .audit_skip = 1, |
| .ioprio = 1, |
| }, |
| [IORING_OP_OPENAT2] = { |
| }, |
| [IORING_OP_EPOLL_CTL] = { |
| .unbound_nonreg_file = 1, |
| .audit_skip = 1, |
| }, |
| [IORING_OP_SPLICE] = { |
| .needs_file = 1, |
| .hash_reg_file = 1, |
| .unbound_nonreg_file = 1, |
| .audit_skip = 1, |
| }, |
| [IORING_OP_PROVIDE_BUFFERS] = { |
| .audit_skip = 1, |
| .iopoll = 1, |
| }, |
| [IORING_OP_REMOVE_BUFFERS] = { |
| .audit_skip = 1, |
| .iopoll = 1, |
| }, |
| [IORING_OP_TEE] = { |
| .needs_file = 1, |
| .hash_reg_file = 1, |
| .unbound_nonreg_file = 1, |
| .audit_skip = 1, |
| }, |
| [IORING_OP_SHUTDOWN] = { |
| .needs_file = 1, |
| }, |
| [IORING_OP_RENAMEAT] = {}, |
| [IORING_OP_UNLINKAT] = {}, |
| [IORING_OP_MKDIRAT] = {}, |
| [IORING_OP_SYMLINKAT] = {}, |
| [IORING_OP_LINKAT] = {}, |
| [IORING_OP_MSG_RING] = { |
| .needs_file = 1, |
| .iopoll = 1, |
| }, |
| [IORING_OP_FSETXATTR] = { |
| .needs_file = 1 |
| }, |
| [IORING_OP_SETXATTR] = {}, |
| [IORING_OP_FGETXATTR] = { |
| .needs_file = 1 |
| }, |
| [IORING_OP_GETXATTR] = {}, |
| [IORING_OP_SOCKET] = { |
| .audit_skip = 1, |
| }, |
| [IORING_OP_URING_CMD] = { |
| .needs_file = 1, |
| .plug = 1, |
| .needs_async_setup = 1, |
| .async_size = uring_cmd_pdu_size(1), |
| }, |
| }; |
| |
| /* requests with any of those set should undergo io_disarm_next() */ |
| #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) |
| #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) |
| |
| static bool io_disarm_next(struct io_kiocb *req); |
| static void io_uring_del_tctx_node(unsigned long index); |
| static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, |
| struct task_struct *task, |
| bool cancel_all); |
| static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); |
| |
| static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags); |
| static void io_dismantle_req(struct io_kiocb *req); |
| static void io_queue_linked_timeout(struct io_kiocb *req); |
| static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, |
| struct io_uring_rsrc_update2 *up, |
| unsigned nr_args); |
| static void io_clean_op(struct io_kiocb *req); |
| static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, |
| unsigned issue_flags); |
| static struct file *io_file_get_normal(struct io_kiocb *req, int fd); |
| static void io_queue_sqe(struct io_kiocb *req); |
| static void io_rsrc_put_work(struct work_struct *work); |
| |
| static void io_req_task_queue(struct io_kiocb *req); |
| static void __io_submit_flush_completions(struct io_ring_ctx *ctx); |
| static int io_req_prep_async(struct io_kiocb *req); |
| |
| static int io_install_fixed_file(struct io_kiocb *req, struct file *file, |
| unsigned int issue_flags, u32 slot_index); |
| static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, |
| unsigned int offset); |
| static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); |
| |
| static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); |
| static void io_eventfd_signal(struct io_ring_ctx *ctx); |
| static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); |
| |
| static struct kmem_cache *req_cachep; |
| |
| static const struct file_operations io_uring_fops; |
| |
| const char *io_uring_get_opcode(u8 opcode) |
| { |
| switch ((enum io_uring_op)opcode) { |
| case IORING_OP_NOP: |
| return "NOP"; |
| case IORING_OP_READV: |
| return "READV"; |
| case IORING_OP_WRITEV: |
| return "WRITEV"; |
| case IORING_OP_FSYNC: |
| return "FSYNC"; |
| case IORING_OP_READ_FIXED: |
| return "READ_FIXED"; |
| case IORING_OP_WRITE_FIXED: |
| return "WRITE_FIXED"; |
| case IORING_OP_POLL_ADD: |
| return "POLL_ADD"; |
| case IORING_OP_POLL_REMOVE: |
| return "POLL_REMOVE"; |
| case IORING_OP_SYNC_FILE_RANGE: |
| return "SYNC_FILE_RANGE"; |
| case IORING_OP_SENDMSG: |
| return "SENDMSG"; |
| case IORING_OP_RECVMSG: |
| return "RECVMSG"; |
| case IORING_OP_TIMEOUT: |
| return "TIMEOUT"; |
| case IORING_OP_TIMEOUT_REMOVE: |
| return "TIMEOUT_REMOVE"; |
| case IORING_OP_ACCEPT: |
| return "ACCEPT"; |
| case IORING_OP_ASYNC_CANCEL: |
| return "ASYNC_CANCEL"; |
| case IORING_OP_LINK_TIMEOUT: |
| return "LINK_TIMEOUT"; |
| case IORING_OP_CONNECT: |
| return "CONNECT"; |
| case IORING_OP_FALLOCATE: |
| return "FALLOCATE"; |
| case IORING_OP_OPENAT: |
| return "OPENAT"; |
| case IORING_OP_CLOSE: |
| return "CLOSE"; |
| case IORING_OP_FILES_UPDATE: |
| return "FILES_UPDATE"; |
| case IORING_OP_STATX: |
| return "STATX"; |
| case IORING_OP_READ: |
| return "READ"; |
| case IORING_OP_WRITE: |
| return "WRITE"; |
| case IORING_OP_FADVISE: |
| return "FADVISE"; |
| case IORING_OP_MADVISE: |
| return "MADVISE"; |
| case IORING_OP_SEND: |
| return "SEND"; |
| case IORING_OP_RECV: |
| return "RECV"; |
| case IORING_OP_OPENAT2: |
| return "OPENAT2"; |
| case IORING_OP_EPOLL_CTL: |
| return "EPOLL_CTL"; |
| case IORING_OP_SPLICE: |
| return "SPLICE"; |
| case IORING_OP_PROVIDE_BUFFERS: |
| return "PROVIDE_BUFFERS"; |
| case IORING_OP_REMOVE_BUFFERS: |
| return "REMOVE_BUFFERS"; |
| case IORING_OP_TEE: |
| return "TEE"; |
| case IORING_OP_SHUTDOWN: |
| return "SHUTDOWN"; |
| case IORING_OP_RENAMEAT: |
| return "RENAMEAT"; |
| case IORING_OP_UNLINKAT: |
| return "UNLINKAT"; |
| case IORING_OP_MKDIRAT: |
| return "MKDIRAT"; |
| case IORING_OP_SYMLINKAT: |
| return "SYMLINKAT"; |
| case IORING_OP_LINKAT: |
| return "LINKAT"; |
| case IORING_OP_MSG_RING: |
| return "MSG_RING"; |
| case IORING_OP_FSETXATTR: |
| return "FSETXATTR"; |
| case IORING_OP_SETXATTR: |
| return "SETXATTR"; |
| case IORING_OP_FGETXATTR: |
| return "FGETXATTR"; |
| case IORING_OP_GETXATTR: |
| return "GETXATTR"; |
| case IORING_OP_SOCKET: |
| return "SOCKET"; |
| case IORING_OP_URING_CMD: |
| return "URING_CMD"; |
| case IORING_OP_LAST: |
| return "INVALID"; |
| } |
| return "INVALID"; |
| } |
| |
| struct sock *io_uring_get_socket(struct file *file) |
| { |
| #if defined(CONFIG_UNIX) |
| if (file->f_op == &io_uring_fops) { |
| struct io_ring_ctx *ctx = file->private_data; |
| |
| return ctx->ring_sock->sk; |
| } |
| #endif |
| return NULL; |
| } |
| EXPORT_SYMBOL(io_uring_get_socket); |
| |
| #if defined(CONFIG_UNIX) |
| static inline bool io_file_need_scm(struct file *filp) |
| { |
| #if defined(IO_URING_SCM_ALL) |
| return true; |
| #else |
| return !!unix_get_socket(filp); |
| #endif |
| } |
| #else |
| static inline bool io_file_need_scm(struct file *filp) |
| { |
| return false; |
| } |
| #endif |
| |
| static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags) |
| { |
| lockdep_assert_held(&ctx->uring_lock); |
| if (issue_flags & IO_URING_F_UNLOCKED) |
| mutex_unlock(&ctx->uring_lock); |
| } |
| |
| static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags) |
| { |
| /* |
| * "Normal" inline submissions always hold the uring_lock, since we |
| * grab it from the system call. Same is true for the SQPOLL offload. |
| * The only exception is when we've detached the request and issue it |
| * from an async worker thread; grab the lock for that case. |
| */ |
| if (issue_flags & IO_URING_F_UNLOCKED) |
| mutex_lock(&ctx->uring_lock); |
| lockdep_assert_held(&ctx->uring_lock); |
| } |
| |
| static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) |
| { |
| if (!*locked) { |
| mutex_lock(&ctx->uring_lock); |
| *locked = true; |
| } |
| } |
| |
| #define io_for_each_link(pos, head) \ |
| for (pos = (head); pos; pos = pos->link) |
| |
| /* |
| * Shamelessly stolen from the mm implementation of page reference checking, |
| * see commit f958d7b528b1 for details. |
| */ |
| #define req_ref_zero_or_close_to_overflow(req) \ |
| ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) |
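| |
| /* |
| * Illustrative values: 0 gives 127 <= 127 (caught), 1 gives 128 <= 127 |
| * (fine), and 0xffffffff (i.e. -1) gives 126 <= 127 (caught), so the check |
| * trips both on zero and on refcounts that have gone negative or are close |
| * to overflowing. |
| */ |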
| |
| static inline bool req_ref_inc_not_zero(struct io_kiocb *req) |
| { |
| WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); |
| return atomic_inc_not_zero(&req->refs); |
| } |
| |
| static inline bool req_ref_put_and_test(struct io_kiocb *req) |
| { |
| if (likely(!(req->flags & REQ_F_REFCOUNT))) |
| return true; |
| |
| WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); |
| return atomic_dec_and_test(&req->refs); |
| } |
| |
| static inline void req_ref_get(struct io_kiocb *req) |
| { |
| WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); |
| WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); |
| atomic_inc(&req->refs); |
| } |
| |
| static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) |
| { |
| if (!wq_list_empty(&ctx->submit_state.compl_reqs)) |
| __io_submit_flush_completions(ctx); |
| } |
| |
| static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) |
| { |
| if (!(req->flags & REQ_F_REFCOUNT)) { |
| req->flags |= REQ_F_REFCOUNT; |
| atomic_set(&req->refs, nr); |
| } |
| } |
| |
| static inline void io_req_set_refcount(struct io_kiocb *req) |
| { |
| __io_req_set_refcount(req, 1); |
| } |
| |
| #define IO_RSRC_REF_BATCH 100 |
| |
| static void io_rsrc_put_node(struct io_rsrc_node *node, int nr) |
| { |
| percpu_ref_put_many(&node->refs, nr); |
| } |
| |
| static inline void io_req_put_rsrc_locked(struct io_kiocb *req, |
| struct io_ring_ctx *ctx) |
| __must_hold(&ctx->uring_lock) |
| { |
| struct io_rsrc_node *node = req->rsrc_node; |
| |
| if (node) { |
| if (node == ctx->rsrc_node) |
| ctx->rsrc_cached_refs++; |
| else |
| io_rsrc_put_node(node, 1); |
| } |
| } |
| |
| static inline void io_req_put_rsrc(struct io_kiocb *req) |
| { |
| if (req->rsrc_node) |
| io_rsrc_put_node(req->rsrc_node, 1); |
| } |
| |
| static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx) |
| __must_hold(&ctx->uring_lock) |
| { |
| if (ctx->rsrc_cached_refs) { |
| io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); |
| ctx->rsrc_cached_refs = 0; |
| } |
| } |
| |
| static void io_rsrc_refs_refill(struct io_ring_ctx *ctx) |
| __must_hold(&ctx->uring_lock) |
| { |
| ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; |
| percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH); |
| } |
| |
| static inline void io_req_set_rsrc_node(struct io_kiocb *req, |
| struct io_ring_ctx *ctx, |
| unsigned int issue_flags) |
| { |
| if (!req->rsrc_node) { |
| req->rsrc_node = ctx->rsrc_node; |
| |
| if (!(issue_flags & IO_URING_F_UNLOCKED)) { |
| lockdep_assert_held(&ctx->uring_lock); |
| ctx->rsrc_cached_refs--; |
| if (unlikely(ctx->rsrc_cached_refs < 0)) |
| io_rsrc_refs_refill(ctx); |
| } else { |
| percpu_ref_get(&req->rsrc_node->refs); |
| } |
| } |
| } |
| |
| static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) |
| { |
| if (req->flags & REQ_F_BUFFER_RING) { |
| if (req->buf_list) |
| req->buf_list->head++; |
| req->flags &= ~REQ_F_BUFFER_RING; |
| } else { |
| list_add(&req->kbuf->list, list); |
| req->flags &= ~REQ_F_BUFFER_SELECTED; |
| } |
| |
| return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); |
| } |
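| |
| /* |
| * The returned cflags encode the consumed buffer ID in the upper bits; on |
| * the application side it is recovered roughly as follows (sketch, normally |
| * done via liburing): |
| * |
| *	if (cqe->flags & IORING_CQE_F_BUFFER) |
| *		bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT; |
| */ |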
| |
| static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) |
| { |
| lockdep_assert_held(&req->ctx->completion_lock); |
| |
| if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) |
| return 0; |
| return __io_put_kbuf(req, &req->ctx->io_buffers_comp); |
| } |
| |
| static inline unsigned int io_put_kbuf(struct io_kiocb *req, |
| unsigned issue_flags) |
| { |
| unsigned int cflags; |
| |
| if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) |
| return 0; |
| |
| /* |
| * We can add this buffer back to two lists: |
| * |
| * 1) The io_buffers_cache list. This one is protected by the |
| * ctx->uring_lock. If we already hold this lock, add back to this |
| * list as we can grab it from issue as well. |
| * 2) The io_buffers_comp list. This one is protected by the |
| * ctx->completion_lock. |
| * |
| * We migrate buffers from the comp_list to the issue cache list |
| * when we need one. |
| */ |
| if (req->flags & REQ_F_BUFFER_RING) { |
| /* no buffers to recycle for this case */ |
| cflags = __io_put_kbuf(req, NULL); |
| } else if (issue_flags & IO_URING_F_UNLOCKED) { |
| struct io_ring_ctx *ctx = req->ctx; |
| |
| spin_lock(&ctx->completion_lock); |
| cflags = __io_put_kbuf(req, &ctx->io_buffers_comp); |
| spin_unlock(&ctx->completion_lock); |
| } else { |
| lockdep_assert_held(&req->ctx->uring_lock); |
| |
| cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache); |
| } |
| |
| return cflags; |
| } |
| |
| static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, |
| unsigned int bgid) |
| { |
| if (ctx->io_bl && bgid < BGID_ARRAY) |
| return &ctx->io_bl[bgid]; |
| |
| return xa_load(&ctx->io_bl_xa, bgid); |
| } |
| |
| static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) |
| { |
| struct io_ring_ctx *ctx = req->ctx; |
| struct io_buffer_list *bl; |
| struct io_buffer *buf; |
| |
| if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) |
| return; |
| /* |
| * For legacy provided buffer mode, don't recycle if we already did |
| * IO to this buffer. For ring-mapped provided buffer mode, we should |
| * increment ring->head to explicitly monopolize the buffer to avoid |
| * multiple use. |
| */ |
| if ((req->flags & REQ_F_BUFFER_SELECTED) && |
| (req->flags & REQ_F_PARTIAL_IO)) |
| return; |
| |
| /* |
| * READV uses fields in `struct io_rw` (len/addr) to stash the selected |
| * buffer data. However if that buffer is recycled the original request |
| * data stored in addr is lost. Therefore forbid recycling for now. |
| */ |
| if (req->opcode == IORING_OP_READV) |
| return; |
| |
| /* |
| * We don't need to recycle for REQ_F_BUFFER_RING; we can just clear |
| * the flag and hence ensure that bl->head doesn't get incremented. |
| * If partial IO has already been done, the buffer has been consumed, |
| * so commit it (advance bl->head) and hang on to it instead. |
| */ |
| if (req->flags & REQ_F_BUFFER_RING) { |
| if (req->buf_list) { |
| if (req->flags & REQ_F_PARTIAL_IO) { |
| req->buf_list->head++; |
| req->buf_list = NULL; |
| } else { |
| req->buf_index = req->buf_list->bgid; |
| req->flags &= ~REQ_F_BUFFER_RING; |
| } |
| } |
| return; |
| } |
| |
| io_ring_submit_lock(ctx, issue_flags); |
| |
| buf = req->kbuf; |
| bl = io_buffer_get_list(ctx, buf->bgid); |
| list_add(&buf->list, &bl->buf_list); |
| req->flags &= ~REQ_F_BUFFER_SELECTED; |
| req->buf_index = buf->bgid; |
| |
| io_ring_submit_unlock(ctx, issue_flags); |
| } |
| |
| static bool io_match_task(struct io_kiocb *head, struct task_struct *task, |
| bool cancel_all) |
| __must_hold(&req->ctx->timeout_lock) |
| { |
| struct io_kiocb *req; |
| |
| if (task && head->task != task) |
| return false; |
| if (cancel_all) |
| return true; |
| |
| io_for_each_link(req, head) { |
| if (req->flags & REQ_F_INFLIGHT) |
| return true; |
| } |
| return false; |
| } |
| |
| static bool io_match_linked(struct io_kiocb *head) |
| { |
| struct io_kiocb *req; |
| |
| io_for_each_link(req, head) { |
| if (req->flags & REQ_F_INFLIGHT) |
| return true; |
| } |
| return false; |
| } |
| |
| /* |
| * As io_match_task() but protected against racing with linked timeouts. |
| * User must not hold timeout_lock. |
| */ |
| static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, |
| bool cancel_all) |
| { |
| bool matched; |
| |
| if (task && head->task != task) |
| return false; |
| if (cancel_all) |
| return true; |
| |
| if (head->flags & REQ_F_LINK_TIMEOUT) { |
| struct io_ring_ctx *ctx = head->ctx; |
| |
| /* protect against races with linked timeouts */ |
| spin_lock_irq(&ctx->timeout_lock); |
| matched = io_match_linked(head); |
| spin_unlock_irq(&ctx->timeout_lock); |
| } else { |
| matched = io_match_linked(head); |
| } |
| return matched; |
| } |
| |
| static inline bool req_has_async_data(struct io_kiocb *req) |
| { |
| return req->flags & REQ_F_ASYNC_DATA; |
| } |
| |
| static inline void req_set_fail(struct io_kiocb *req) |
| { |
| req->flags |= REQ_F_FAIL; |
| if (req->flags & REQ_F_CQE_SKIP) { |
| req->flags &= ~REQ_F_CQE_SKIP; |
| req->flags |= REQ_F_SKIP_LINK_CQES; |
| } |
| } |
| |
| static inline void req_fail_link_node(struct io_kiocb *req, int res) |
| { |
| req_set_fail(req); |
| req->cqe.res = res; |
| } |
| |
| static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) |
| { |
| wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); |
| } |
| |
| static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) |
| { |
| struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); |
| |
| complete(&ctx->ref_comp); |
| } |
| |
| static inline bool io_is_timeout_noseq(struct io_kiocb *req) |
| { |
| return !req->timeout.off; |
| } |
| |
| static __cold void io_fallback_req_func(struct work_struct *work) |
| { |
| struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, |
| fallback_work.work); |
| struct llist_node *node = llist_del_all(&ctx->fallback_llist); |
| struct io_kiocb *req, *tmp; |
| bool locked = false; |
| |
| percpu_ref_get(&ctx->refs); |
| llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) |
| req->io_task_work.func(req, &locked); |
| |
| if (locked) { |
| io_submit_flush_completions(ctx); |
| mutex_unlock(&ctx->uring_lock); |
| } |
| percpu_ref_put(&ctx->refs); |
| } |
| |
| static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) |
| { |
| struct io_ring_ctx *ctx; |
| int hash_bits; |
| |
| ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); |
| if (!ctx) |
| return NULL; |
| |
| xa_init(&ctx->io_bl_xa); |
| |
| /* |
| * Use 5 bits less than the max cq entries; that should give us around |
| * 32 entries per hash list if totally full and uniformly spread. |
| */ |
| hash_bits = ilog2(p->cq_entries); |
| hash_bits -= 5; |
| if (hash_bits <= 0) |
| hash_bits = 1; |
| ctx->cancel_hash_bits = hash_bits; |
| ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), |
| GFP_KERNEL); |
| if (!ctx->cancel_hash) |
| goto err; |
| __hash_init(ctx->cancel_hash, 1U << hash_bits); |
| |
| ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); |
| if (!ctx->dummy_ubuf) |
| goto err; |
| /* set an invalid range, so io_import_fixed() fails on it */ |
| ctx->dummy_ubuf->ubuf = -1UL; |
| |
| if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, |
| PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) |
| goto err; |
| |
| ctx->flags = p->flags; |
| init_waitqueue_head(&ctx->sqo_sq_wait); |
| INIT_LIST_HEAD(&ctx->sqd_list); |
| INIT_LIST_HEAD(&ctx->cq_overflow_list); |
| INIT_LIST_HEAD(&ctx->io_buffers_cache); |
| INIT_LIST_HEAD(&ctx->apoll_cache); |
| init_completion(&ctx->ref_comp); |
| xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); |
| mutex_init(&ctx->uring_lock); |
| init_waitqueue_head(&ctx->cq_wait); |
| spin_lock_init(&ctx->completion_lock); |
| spin_lock_init(&ctx->timeout_lock); |
| INIT_WQ_LIST(&ctx->iopoll_list); |
| INIT_LIST_HEAD(&ctx->io_buffers_pages); |
| INIT_LIST_HEAD(&ctx->io_buffers_comp); |
| INIT_LIST_HEAD(&ctx->defer_list); |
| INIT_LIST_HEAD(&ctx->timeout_list); |
| INIT_LIST_HEAD(&ctx->ltimeout_list); |
| spin_lock_init(&ctx->rsrc_ref_lock); |
| INIT_LIST_HEAD(&ctx->rsrc_ref_list); |
| INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); |
| init_llist_head(&ctx->rsrc_put_llist); |
| INIT_LIST_HEAD(&ctx->tctx_list); |
| ctx->submit_state.free_list.next = NULL; |
| INIT_WQ_LIST(&ctx->locked_free_list); |
| INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); |
| INIT_WQ_LIST(&ctx->submit_state.compl_reqs); |
| return ctx; |
| err: |
| kfree(ctx->dummy_ubuf); |
| kfree(ctx->cancel_hash); |
| kfree(ctx->io_bl); |
| xa_destroy(&ctx->io_bl_xa); |
| kfree(ctx); |
| return NULL; |
| } |
| |
| static void io_account_cq_overflow(struct io_ring_ctx *ctx) |
| { |
| struct io_rings *r = ctx->rings; |
| |
| WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); |
| ctx->cq_extra--; |
| } |
| |
| static bool req_need_defer(struct io_kiocb *req, u32 seq) |
| { |
| if (unlikely(req->flags & REQ_F_IO_DRAIN)) { |
| struct io_ring_ctx *ctx = req->ctx; |
| |
| return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; |
| } |
| |
| return false; |
| } |
| |
| static inline bool io_req_ffs_set(struct io_kiocb *req) |
| { |
| return req->flags & REQ_F_FIXED_FILE; |
| } |
| |
| static inline void io_req_track_inflight(struct io_kiocb *req) |
| { |
| if (!(req->flags & REQ_F_INFLIGHT)) { |
| req->flags |= REQ_F_INFLIGHT; |
| atomic_inc(&req->task->io_uring->inflight_tracked); |
| } |
| } |
| |
| static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) |
| { |
| if (WARN_ON_ONCE(!req->link)) |
| return NULL; |
| |
| req->flags &= ~REQ_F_ARM_LTIMEOUT; |
| req->flags |= REQ_F_LINK_TIMEOUT; |
| |
| /* linked timeouts should have two refs once prep'ed */ |
| io_req_set_refcount(req); |
| __io_req_set_refcount(req->link, 2); |
| return req->link; |
| } |
| |
| static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) |
| { |
| if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) |
| return NULL; |
| return __io_prep_linked_timeout(req); |
| } |
| |
| static noinline void __io_arm_ltimeout(struct io_kiocb *req) |
| { |
| io_queue_linked_timeout(__io_prep_linked_timeout(req)); |
| } |
| |
| static inline void io_arm_ltimeout(struct io_kiocb *req) |
| { |
| if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) |
| __io_arm_ltimeout(req); |
| } |
| |
| static void io_prep_async_work(struct io_kiocb *req) |
| { |
| const struct io_op_def *def = &io_op_defs[req->opcode]; |
| struct io_ring_ctx *ctx = req->ctx; |
| |
| if (!(req->flags & REQ_F_CREDS)) { |
| req->flags |= REQ_F_CREDS; |
| req->creds = get_current_cred(); |
| } |
| |
| req->work.list.next = NULL; |
| req->work.flags = 0; |
| req->work.cancel_seq = atomic_read(&ctx->cancel_seq); |
| if (req->flags & REQ_F_FORCE_ASYNC) |
| req->work.flags |= IO_WQ_WORK_CONCURRENT; |
| |
| if (req->flags & REQ_F_ISREG) { |
| if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) |
| io_wq_hash_work(&req->work, file_inode(req->file)); |
| } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { |
| if (def->unbound_nonreg_file) |
| req->work.flags |= IO_WQ_WORK_UNBOUND; |
| } |
| } |
| |
| static void io_prep_async_link(struct io_kiocb *req) |
| { |
| struct io_kiocb *cur; |
| |
| if (req->flags & REQ_F_LINK_TIMEOUT) { |
| struct io_ring_ctx *ctx = req->ctx; |
| |
| spin_lock_irq(&ctx->timeout_lock); |
| io_for_each_link(cur, req) |
| io_prep_async_work(cur); |
| spin_unlock_irq(&ctx->timeout_lock); |
| } else { |
| io_for_each_link(cur, req) |
| io_prep_async_work(cur); |
| } |
| } |
| |
| static inline void io_req_add_compl_list(struct io_kiocb *req) |
| { |
| struct io_submit_state *state = &req->ctx->submit_state; |
| |
| if (!(req->flags & REQ_F_CQE_SKIP)) |
| state->flush_cqes = true; |
| wq_list_add_tail(&req->comp_list, &state->compl_reqs); |
| } |
| |
| static void io_queue_iowq(struct io_kiocb *req, bool *dont_use) |
| { |
| struct io_kiocb *link = io_prep_linked_timeout(req); |
| struct io_uring_task *tctx = req->task->io_uring; |
| |
| BUG_ON(!tctx); |
| BUG_ON(!tctx->io_wq); |
| |
| /* init ->work of the whole link before punting */ |
| io_prep_async_link(req); |
| |
| /* |
| * Not expected to happen, but if we do have a bug where this _can_ |
| * happen, catch it here and ensure the request is marked as |
| * canceled. That will make io-wq go through the usual work cancel |
| * procedure rather than attempt to run this request (or create a new |
| * worker for it). |
| */ |
| if (WARN_ON_ONCE(!same_thread_group(req->task, current))) |
| req->work.flags |= IO_WQ_WORK_CANCEL; |
| |
| trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data, |
| req->opcode, req->flags, &req->work, |
| io_wq_is_hashed(&req->work)); |
| io_wq_enqueue(tctx->io_wq, &req->work); |
| if (link) |
| io_queue_linked_timeout(link); |
| } |
| |
| static void io_kill_timeout(struct io_kiocb *req, int status) |
| __must_hold(&req->ctx->completion_lock) |
| __must_hold(&req->ctx->timeout_lock) |
| { |
| struct io_timeout_data *io = req->async_data; |
| |
| if (hrtimer_try_to_cancel(&io->timer) != -1) { |
| if (status) |
| req_set_fail(req); |
| atomic_set(&req->ctx->cq_timeouts, |
| atomic_read(&req->ctx->cq_timeouts) + 1); |
| list_del_init(&req->timeout.list); |
| io_req_tw_post_queue(req, status, 0); |
| } |
| } |
| |
| static __cold void io_queue_deferred(struct io_ring_ctx *ctx) |
| { |
| while (!list_empty(&ctx->defer_list)) { |
| struct io_defer_entry *de = list_first_entry(&ctx->defer_list, |
| struct io_defer_entry, list); |
| |
| if (req_need_defer(de->req, de->seq)) |
| break; |
| list_del_init(&de->list); |
| io_req_task_queue(de->req); |
| kfree(de); |
| } |
| } |
| |
| static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) |
| __must_hold(&ctx->completion_lock) |
| { |
| u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); |
| struct io_kiocb *req, *tmp; |
| |
| spin_lock_irq(&ctx->timeout_lock); |
| list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { |
| u32 events_needed, events_got; |
| |
| if (io_is_timeout_noseq(req)) |
| break; |
| |
| /* |
| * Since seq can easily wrap around over time, subtract |
| * the last seq at which timeouts were flushed before comparing. |
| * Assuming not more than 2^31-1 events have happened since, |
| * these subtractions won't have wrapped, so we can check if |
| * target is in [last_seq, current_seq] by comparing the two. |
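| * |
| * For example, with cq_last_tm_flush == 0xfffffff0, target_seq == 0x10 |
| * and seq == 0x8: events_needed == 0x20 and events_got == 0x18, so the |
| * timeout has not expired yet even though the raw sequence values have |
| * wrapped past zero. |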
| */ |
| events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush; |
| events_got = seq - ctx->cq_last_tm_flush; |
| if (events_got < events_needed) |
| break; |
| |
| io_kill_timeout(req, 0); |
| } |
| ctx->cq_last_tm_flush = seq; |
| spin_unlock_irq(&ctx->timeout_lock); |
| } |
| |
| static inline void io_commit_cqring(struct io_ring_ctx *ctx) |
| { |
| /* order cqe stores with ring update */ |
| smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); |
| } |
| |
| static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) |
| { |
| if (ctx->off_timeout_used || ctx->drain_active) { |
| spin_lock(&ctx->completion_lock); |
| if (ctx->off_timeout_used) |
| io_flush_timeouts(ctx); |
| if (ctx->drain_active) |
| io_queue_deferred(ctx); |
| io_commit_cqring(ctx); |
| spin_unlock(&ctx->completion_lock); |
| } |
| if (ctx->has_evfd) |
| io_eventfd_signal(ctx); |
| } |
| |
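| /* |
| * The SQ ring is full when all sq_entries slots are in flight, e.g. |
| * sq.tail == 19, cached_sq_head == 11 and sq_entries == 8. Unsigned |
| * subtraction keeps the check correct when the 32-bit counters wrap. |
| */ |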
| static inline bool io_sqring_full(struct io_ring_ctx *ctx) |
| { |
| struct io_rings *r = ctx->rings; |
| |
| return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries; |
| } |
| |
| static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) |
| { |
| return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); |
| } |
| |
| /* |
| * writes to the cq entry need to come after reading head; the |
| * control dependency is enough as we're using WRITE_ONCE to |
| * fill the cq entry |
| */ |
| static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) |
| { |
| struct io_rings *rings = ctx->rings; |
| unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); |
| unsigned int shift = 0; |
| unsigned int free, queued, len; |
| |
| if (ctx->flags & IORING_SETUP_CQE32) |
| shift = 1; |
| |
| /* userspace may cheat by modifying the tail; be safe and do min */ |
| queued = min(__io_cqring_events(ctx), ctx->cq_entries); |
| free = ctx->cq_entries - queued; |
| /* we need a contiguous range, limit based on the current array offset */ |
| len = min(free, ctx->cq_entries - off); |
| if (!len) |
| return NULL; |
| |
| ctx->cached_cq_tail++; |
| ctx->cqe_cached = &rings->cqes[off]; |
| ctx->cqe_sentinel = ctx->cqe_cached + len; |
| ctx->cqe_cached++; |
| return &rings->cqes[off << shift]; |
| } |
| |
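| /* |
| * Fast path: hand out CQEs from the contiguous range cached by |
| * __io_get_cqe() until the sentinel is reached. For IORING_SETUP_CQE32 |
| * rings each logical CQE spans two struct io_uring_cqe slots, so the |
| * cached offset is applied a second time to land on the 32-byte entry. |
| */ |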
| static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) |
| { |
| if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { |
| struct io_uring_cqe *cqe = ctx->cqe_cached; |
| |
| if (ctx->flags & IORING_SETUP_CQE32) { |
| unsigned int off = ctx->cqe_cached - ctx->rings->cqes; |
| |
| cqe += off; |
| } |
| |
| ctx->cached_cq_tail++; |
| ctx->cqe_cached++; |
| return cqe; |
| } |
| |
| return __io_get_cqe(ctx); |
| } |
| |
| static void io_eventfd_signal(struct io_ring_ctx *ctx) |
| { |
| struct io_ev_fd *ev_fd; |
| |
| rcu_read_lock(); |
| /* |
| * rcu_dereference ctx->io_ev_fd once and use it both for checking |
| * and for eventfd_signal |
| */ |
| ev_fd = rcu_dereference(ctx->io_ev_fd); |
| |
| /* |
| * Check again if ev_fd exists in case an io_eventfd_unregister call |
| * completed between the NULL check of ctx->io_ev_fd at the start of |
| * the function and rcu_read_lock. |
| */ |
| if (unlikely(!ev_fd)) |
| goto out; |
| if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) |
| goto out; |
| |
| if (!ev_fd->eventfd_async || io_wq_current_is_worker()) |
| eventfd_signal(ev_fd->cq_ev_fd, 1); |
| out: |
| rcu_read_unlock(); |
| } |
| |
| static inline void io_cqring_wake(struct io_ring_ctx *ctx) |
| { |
| /* |
| * wake_up_all() may seem excessive, but io_wake_function() and |
| * io_should_wake() handle the termination of the loop and only |
| * wake as many waiters as we need to. |
| */ |
| if (wq_has_sleeper(&ctx->cq_wait)) |
| wake_up_all(&ctx->cq_wait); |
| } |
| |
| /* |
| * This should only get called when at least one event has been posted. |
| * Some applications rely on the eventfd notification count only changing |
| * IFF a new CQE has been added to the CQ ring. There's no dependency on |
| * a 1:1 relationship between how many times this function is called (and |
| * hence the eventfd count) and the number of CQEs posted to the CQ ring. |
| */ |
| static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) |
| { |
| if (unlikely(ctx->off_timeout_used || ctx->drain_active || |
| ctx->has_evfd)) |
| __io_commit_cqring_flush(ctx); |
| |
| io_cqring_wake(ctx); |
| } |
| |
| static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) |
| { |
| if (unlikely(ctx->off_timeout_used || ctx->drain_active || |
| ctx->has_evfd)) |
| __io_commit_cqring_flush(ctx); |
| |
| if (ctx->flags & IORING_SETUP_SQPOLL) |
| io_cqring_wake(ctx); |
| } |
| |
| /* Returns true if there are no backlogged entries after the flush */ |
| static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) |
| { |
| bool all_flushed, posted; |
| size_t cqe_size = sizeof(struct io_uring_cqe); |
| |
| if (!force && __io_cqring_events(ctx) == ctx->cq_entries) |
| return false; |
| |
| if (ctx->flags & IORING_SETUP_CQE32) |
| cqe_size <<= 1; |
| |
| posted = false; |
| spin_lock(&ctx->completion_lock); |
| while (!list_empty(&ctx->cq_overflow_list)) { |
| struct io_uring_cqe *cqe = io_get_cqe(ctx); |
| struct io_overflow_cqe *ocqe; |
| |
| if (!cqe && !force) |
| break; |
| ocqe = list_first_entry(&ctx->cq_overflow_list, |
| struct io_overflow_cqe, list); |
| if (cqe) |
| memcpy(cqe, &ocqe->cqe, cqe_size); |
| else |
| io_account_cq_overflow(ctx); |
| |
| posted = true; |
| list_del(&ocqe->list); |
| kfree(ocqe); |
| } |
| |
| all_flushed = list_empty(&ctx->cq_overflow_list); |
| if (all_flushed) { |
| clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); |
| atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); |
| } |
| |
| io_commit_cqring(ctx); |
| spin_unlock(&ctx->completion_lock); |
| if (posted) |
| io_cqring_ev_posted(ctx); |
| return all_flushed; |
| } |
| |
| static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) |
| { |
| bool ret = true; |
| |
| if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { |
| /* iopoll syncs against uring_lock, not completion_lock */ |
| if (ctx->flags & IORING_SETUP_IOPOLL) |
| mutex_lock(&ctx->uring_lock); |
| ret = __io_cqring_overflow_flush(ctx, false); |
| if (ctx->flags & IORING_SETUP_IOPOLL) |
| mutex_unlock(&ctx->uring_lock); |
| } |
| |
| return ret; |
| } |
| |
| static void __io_put_task(struct task_struct *task, int nr) |
| { |
| struct io_uring_task *tctx = task->io_uring; |
| |
| percpu_counter_sub(&tctx->inflight, nr); |
| if (unlikely(atomic_read(&tctx->in_idle))) |
| wake_up(&tctx->wait); |
| put_task_struct_many(task, nr); |
| } |
| |
| /* must be called shortly after putting a request */ |
| static inline void io_put_task(struct task_struct *task, int nr) |
| { |
| if (likely(task == current)) |
| task->io_uring->cached_refs += nr; |
| else |
| __io_put_task(task, nr); |
| } |
| |
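| /* |
| * Top the per-task request reference cache back up to |
| * IO_TCTX_REFS_CACHE_NR. Called once ->cached_refs has gone negative, |
| * so e.g. cached_refs == -3 yields refill == IO_TCTX_REFS_CACHE_NR + 3 |
| * and leaves the cache at exactly IO_TCTX_REFS_CACHE_NR. |
| */ |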
| static void io_task_refs_refill(struct io_uring_task *tctx) |
| { |
| unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; |
| |
| percpu_counter_add(&tctx->inflight, refill); |
| refcount_add(refill, ¤t->usage); |
| tctx->cached_refs += refill; |
| } |
| |
| static inline void io_get_task_refs(int nr) |
| { |
| struct io_uring_task *tctx = current->io_uring; |
| |
| tctx->cached_refs -= nr; |
| if (unlikely(tctx->cached_refs < 0)) |
| io_task_refs_refill(tctx); |
| } |
| |
| static __cold void io_uring_drop_tctx_refs(struct task_struct *task) |
| { |
| struct io_uring_task *tctx = task->io_uring; |
| unsigned int refs = tctx->cached_refs; |
| |
| if (refs) { |
| tctx->cached_refs = 0; |
| percpu_counter_sub(&tctx->inflight, refs); |
| put_task_struct_many(task, refs); |
| } |
| } |
| |
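| /* |
| * Called when a CQE can't be posted to the CQ ring directly: stash it on |
| * ->cq_overflow_list and flag the ring so userspace and the flush path |
| * know overflow entries are pending. If the overflow entry can't be |
| * allocated, the CQE is dropped and only accounted for. |
| */ |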
| static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, |
| s32 res, u32 cflags, u64 extra1, |
| u64 extra2) |
| { |
| struct io_overflow_cqe *ocqe; |
| size_t ocq_size = sizeof(struct io_overflow_cqe); |
| bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); |
| |
| if (is_cqe32) |
| ocq_size += sizeof(struct io_uring_cqe); |
| |
| ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); |
| trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); |
| if (!ocqe) { |
| /* |
| * If we're in ring overflow flush mode, or in task cancel mode, |
| * or cannot allocate an overflow entry, then we need to drop it |
| * on the floor. |
| */ |
| io_account_cq_overflow(ctx); |
| set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); |
| return false; |
| } |
| if (list_empty(&ctx->cq_overflow_list)) { |
| set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); |
| atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); |
| |
| } |
| ocqe->cqe.user_data = user_data; |
| ocqe->cqe.res = res; |
| ocqe->cqe.flags = cflags; |
| if (is_cqe32) { |
| ocqe->cqe.big_cqe[0] = extra1; |
| ocqe->cqe.big_cqe[1] = extra2; |
| } |
| list_add_tail(&ocqe->list, &ctx->cq_overflow_list); |
| return true; |
| } |
| |
| static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, |
| struct io_kiocb *req) |
| { |
| struct io_uring_cqe *cqe; |
| |
| if (!(ctx->flags & IORING_SETUP_CQE32)) { |
| trace_io_uring_complete(req->ctx, req, req->cqe.user_data, |
| req->cqe.res, req->cqe.flags, 0, 0); |
| |
| /* |
| * If we can't get a cq entry, userspace overflowed the |
| * submission (by quite a lot). Increment the overflow count in |
| * the ring. |
| */ |
| cqe = io_get_cqe(ctx); |
| if (likely(cqe)) { |
| memcpy(cqe, &req->cqe, sizeof(*cqe)); |
| return true; |
| } |
| |
| return io_cqring_event_overflow(ctx, req->cqe.user_data, |
| req->cqe.res, req->cqe.flags, |
| 0, 0); |
| } else { |
| u64 extra1 = 0, extra2 = 0; |
| |
| if (req->flags & REQ_F_CQE32_INIT) { |
| extra1 = req->extra1; |
| extra2 = req->extra2; |
| } |
| |
| trace_io_uring_complete(req->ctx, req, req->cqe.user_data, |
| req->cqe.res, req->cqe.flags, extra1, extra2); |
| |
| /* |
| * If we can't get a cq entry, userspace overflowed the |
| * submission (by quite a lot). Increment the overflow count in |
| * the ring. |
| */ |
| cqe = io_get_cqe(ctx); |
| if (likely(cqe)) { |
| memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); |
| WRITE_ONCE(cqe->big_cqe[0], extra1); |
| WRITE_ONCE(cqe->big_cqe[1], extra2); |
| return true; |
| } |
| |
| return io_cqring_event_overflow(ctx, req->cqe.user_data, |
| req->cqe.res, req->cqe.flags, |
| extra1, extra2); |
| } |
| } |
| |
| static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, |
| s32 res, u32 cflags) |
| { |
| struct io_uring_cqe *cqe; |
| |
| ctx->cq_extra++; |
| trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); |
| |
| /* |
| * If we can't get a cq entry, userspace overflowed the |
| * submission (by quite a lot). Increment the overflow count in |
| * the ring. |
| */ |
| cqe = io_get_cqe(ctx); |
| if (likely(cqe)) { |
| WRITE_ONCE(cqe->user_data, user_data); |
| WRITE_ONCE(cqe->res, res); |
| WRITE_ONCE(cqe->flags, cflags); |
| |
| if (ctx->flags & IORING_SETUP_CQE32) { |
| WRITE_ONCE(cqe->big_cqe[0], 0); |
| WRITE_ONCE(cqe->big_cqe[1], 0); |
| } |
| return true; |
| } |
| return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); |
| } |
| |
| static void __io_req_complete_put(struct io_kiocb *req) |
| { |
| /* |
| * If we're the last reference to this request, add it to our locked |
| * free_list cache. |
| */ |
| if (req_ref_put_and_test(req)) { |
| struct io_ring_ctx *ctx = req->ctx; |
| |
| if (req->flags & IO_REQ_LINK_FLAGS) { |
| if (req->flags & IO_DISARM_MASK) |
| io_disarm_next(req); |
| if (req->link) { |
| io_req_task_queue(req->link); |
| req->link = NULL; |
| } |
| } |
| io_req_put_rsrc(req); |
| /* |
| * Selected buffer deallocation in io_clean_op() assumes that |
| * we don't hold ->completion_lock. Clean them here to avoid |
| * deadlocks. |
| */ |
| io_put_kbuf_comp(req); |
| io_dismantle_req(req); |
| io_put_task(req->task, 1); |
| wq_list_add_head(&req->comp_list, &ctx->locked_free_list); |
| ctx->locked_free_nr++; |
| } |
| } |
| |
| static void __io_req_complete_post(struct io_kiocb *req, s32 res, |
| u32 cflags) |
| { |
| if (!(req->flags & REQ_F_CQE_SKIP)) { |
| req->cqe.res = res; |
| req->cqe.flags = cflags; |
| __io_fill_cqe_req(req->ctx, req); |
| } |
| __io_req_complete_put(req); |
| } |
| |
| static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags) |
| { |
| struct io_ring_ctx *ctx = req->ctx; |
| |
| spin_lock(&ctx->completion_lock); |
| __io_req_complete_post(req, res, cflags); |
| io_commit_cqring(ctx); |
| spin_unlock(&ctx->completion_lock); |
| io_cqring_ev_posted(ctx); |
| } |
| |
| static inline void io_req_complete_state(struct io_kiocb *req, s32 res, |
| u32 cflags) |
| { |
| req->cqe.res = res; |
| req->cqe.flags = cflags; |
| req->flags |= REQ_F_COMPLETE_INLINE; |
| } |
| |
| static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, |
| s32 res, u32 cflags) |
| { |
| if (issue_flags & IO_URING_F_COMPLETE_DEFER) |
| io_req_complete_state(req, res, cflags); |
| else |
| io_req_complete_post(req, res, cflags); |
| } |
| |
| static inline void io_req_complete(struct io_kiocb *req, s32 res) |
| { |
| if (res < 0) |
| req_set_fail(req); |
| __io_req_complete(req, 0, res, 0); |
| } |
| |
| static void io_req_complete_failed(struct io_kiocb *req, s32 res) |
| { |
| req_set_fail(req); |
| io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); |
| } |
| |
| /* |
| * Don't initialise the fields below on every allocation, but do that in |
| * advance and keep them valid across allocations. |
| */ |
| static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) |
| { |
| req->ctx = ctx; |
| req->link = NULL; |
| req->async_data = NULL; |
| /* not necessary, but safer to zero */ |
| req->cqe.res = 0; |
| } |
| |
| static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, |
| struct io_submit_state *state) |
| { |
| spin_lock(&ctx->completion_lock); |
| wq_list_splice(&ctx->locked_free_list, &state->free_list); |
| ctx->locked_free_nr = 0; |
| spin_unlock(&ctx->completion_lock); |
| } |
| |
| static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) |
| { |
| return !ctx->submit_state.free_list.next; |
| } |
| |
| /* |
| * A request might get retired back into the request caches even before opcode |
| * handlers and io_issue_sqe() are done with it, e.g. inline completion path. |
| * Because of that, io_alloc_req() should be called only under ->uring_lock |
| * and with extra caution to not get a request that is still being worked on. |
| */ |
| static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) |
| __must_hold(&ctx->uring_lock) |
| { |
| gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; |
| void *reqs[IO_REQ_ALLOC_BATCH]; |
| int ret, i; |
| |
| /* |
| * If we have more than a batch's worth of requests in our IRQ side |
| * locked cache, grab the lock and move them over to our submission |
| * side cache. |
| */ |
| if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) { |
| io_flush_cached_locked_reqs(ctx, &ctx->submit_state); |
| if (!io_req_cache_empty(ctx)) |
| return true; |
| } |
| |
| ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); |
| |
| /* |
| * Bulk alloc is all-or-nothing. If we fail to get a batch, |
| * retry single alloc to be on the safe side. |
| */ |
| if (unlikely(ret <= 0)) { |
| reqs[0] = kmem_cache_alloc(req_cachep, gfp); |
| if (!reqs[0]) |
| return false; |
| ret = 1; |
| } |
| |
| percpu_ref_get_many(&ctx->refs, ret); |
| for (i = 0; i < ret; i++) { |
| struct io_kiocb *req = reqs[i]; |
| |
| io_preinit_req(req, ctx); |
| io_req_add_to_cache(req, ctx); |
| } |
| return true; |
| } |
| |
| static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) |
| { |
| if (unlikely(io_req_cache_empty(ctx))) |
| return __io_alloc_req_refill(ctx); |
| return true; |
| } |
| |
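| /* |
| * Pop a request off the submission-side cache. The caller must have made |
| * sure the cache is non-empty, e.g. via io_alloc_req_refill(). |
| */ |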
| static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) |
| { |
| struct io_wq_work_node *node; |
| |
| node = wq_stack_extract(&ctx->submit_state.free_list); |
| return container_of(node, struct io_kiocb, comp_list); |
| } |
| |
| static inline void io_put_file(struct file *file) |
| { |
| if (file) |
| fput(file); |
| } |
| |
| static inline void io_dismantle_req(struct io_kiocb *req) |
| { |
| unsigned int flags = req->flags; |
| |
| if (unlikely(flags & IO_REQ_CLEAN_FLAGS)) |
| io_clean_op(req); |
| if (!(flags & REQ_F_FIXED_FILE)) |
| io_put_file(req->file); |
| } |
| |
| static __cold void io_free_req(struct io_kiocb *req) |
| { |
| struct io_ring_ctx *ctx = req->ctx; |
| |
| io_req_put_rsrc(req); |
| io_dismantle_req(req); |
| io_put_task(req->task, 1); |
| |
| spin_lock(&ctx->completion_lock); |
| wq_list_add_head(&req->comp_list, &ctx->locked_free_list); |
| ctx->locked_free_nr++; |
| spin_unlock(&ctx->completion_lock); |
| } |
| |
| static inline void io_remove_next_linked(struct io_kiocb *req) |
| { |
| struct io_kiocb *nxt = req->link; |
| |
| req->link = nxt->link; |
| nxt->link = NULL; |
| } |
| |
| static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) |
| __must_hold(&req->ctx->completion_lock) |
| __must_hold(&req->ctx->timeout_lock) |
| { |
| struct io_kiocb *link = req->link; |
| |
| if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { |
| struct io_timeout_data *io = link->async_data; |
| |
| io_remove_next_linked(req); |
| link->timeout.head = NULL; |
| if (hrtimer_try_to_cancel(&io->timer) != -1) { |
| list_del(&link->timeout.list); |
| return link; |
| } |
| } |
| return NULL; |
| } |
| |
| static void io_fail_links(struct io_kiocb *req) |
| __must_hold(&req->ctx->completion_lock) |
| { |
| struct io_kiocb *nxt, *link = req->link; |
| bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; |
| |
| req->link = NULL; |
| while (link) { |
| long res = -ECANCELED; |
| |
| if (link->flags & REQ_F_FAIL) |
| res = link->cqe.res; |
| |
| nxt = link->link; |
| link->link = NULL; |
| |
| trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data, |
| req->opcode, link); |
| |
| if (ignore_cqes) |
| link->flags |= REQ_F_CQE_SKIP; |
| else |
| link->flags &= ~REQ_F_CQE_SKIP; |
| __io_req_complete_post(link, res, 0); |
| link = nxt; |
| } |
| } |
| |
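| /* |
| * Disarm a linked timeout, whether it's still pending arming |
| * (REQ_F_ARM_LTIMEOUT) or already armed (REQ_F_LINK_TIMEOUT), and if the |
| * request failed and isn't a hardlink, fail the rest of its link chain. |
| * Returns true if any completions were posted or queued, so the caller |
| * knows to commit the CQ ring and wake waiters. |
| */ |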
| static bool io_disarm_next(struct io_kiocb *req) |
| __must_hold(&req->ctx->completion_lock) |
| { |
| struct io_kiocb *link = NULL; |
| bool posted = false; |
| |
| if (req->flags & REQ_F_ARM_LTIMEOUT) { |
| link = req->link; |
| req->flags &= ~REQ_F_ARM_LTIMEOUT; |
| if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { |
| io_remove_next_linked(req); |
| io_req_tw_post_queue(link, -ECANCELED, 0); |
| posted = true; |
| } |
| } else if (req->flags & REQ_F_LINK_TIMEOUT) { |
| struct io_ring_ctx *ctx = req->ctx; |
| |
| spin_lock_irq(&ctx->timeout_lock); |
| link = io_disarm_linked_timeout(req); |
| spin_unlock_irq(&ctx->timeout_lock); |
| if (link) { |
| posted = true; |
| io_req_tw_post_queue(link, -ECANCELED, 0); |
| } |
| } |
| if (unlikely((req->flags & REQ_F_FAIL) && |
| !(req->flags & REQ_F_HARDLINK))) { |
| posted |= (req->link != NULL); |
| io_fail_links(req); |
| } |
| return posted; |
| } |
| |
| static void __io_req_find_next_prep(struct io_kiocb *req) |
| { |
| struct io_ring_ctx *ctx = req->ctx; |
| bool posted; |
| |
| spin_lock(&ctx->completion_lock); |
| posted = io_disarm_next(req); |
| io_commit_cqring(ctx); |
| spin_unlock(&ctx->completion_lock); |
| if (posted) |
| io_cqring_ev_posted(ctx); |
| } |
| |
| static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) |
| { |
| struct io_kiocb *nxt; |
| |
| /* |
| * If LINK is set, we have dependent requests in this chain. If we |
| * didn't fail this request, queue the first one up, moving any other |
| * dependencies to the next request. In case of failure, fail the rest |
| * of the chain. |
| */ |
| if (unlikely(req->flags & IO_DISARM_MASK)) |
| __io_req_find_next_prep(req); |
| nxt = req->link; |
| req->link = NULL; |
| return nxt; |
| } |
| |
| static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) |
| { |
| if (!ctx) |
| return; |
| if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) |
| atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); |
| if (*locked) { |
| io_submit_flush_completions(ctx); |
| mutex_unlock(&ctx->uring_lock); |
| *locked = false; |
| } |
| percpu_ref_put(&ctx->refs); |
| } |
| |
| static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx) |
| { |
| io_commit_cqring(ctx); |
| spin_unlock(&ctx->completion_lock); |
| io_cqring_ev_posted(ctx); |
| } |
| |
| static void handle_prev_tw_list(struct io_wq_work_node *node, |
| struct io_ring_ctx **ctx, bool *uring_locked) |
| { |
| if (*ctx && !*uring_locked) |
| spin_lock(&(*ctx)->completion_lock); |
| |
| do { |
| struct io_wq_work_node *next = node->next; |
| struct io_kiocb *req = container_of(node, struct io_kiocb, |
| io_task_work.node); |
| |
| prefetch(container_of(next, struct io_kiocb, io_task_work.node)); |
| |
| if (req->ctx != *ctx) { |
| if (unlikely(!*uring_locked && *ctx)) |
| ctx_commit_and_unlock(*ctx); |
| |
| ctx_flush_and_put(*ctx, uring_locked); |
| *ctx = req->ctx; |
| /* if not contended, grab the lock to improve batching */ |
| *uring_locked = mutex_trylock(&(*ctx)->uring_lock); |
| percpu_ref_get(&(*ctx)->refs); |
| if (unlikely(!*uring_locked)) |
| spin_lock(&(*ctx)->completion_lock); |
| } |
| if (likely(*uring_locked)) |
| req->io_task_work.func(req, uring_locked); |
| else |
| __io_req_complete_post(req, req->cqe.res, |
| io_put_kbuf_comp(req)); |
| node = next; |
| } while (node); |
| |
| if (unlikely(!*uring_locked)) |
| ctx_commit_and_unlock(*ctx); |
| } |
| |
| static void handle_tw_list(struct io_wq_work_node *node, |
| struct io_ring_ctx **ctx, bool *locked) |
| { |
| do { |
| struct io_wq_work_node *next = node->next; |
| struct io_kiocb *req = container_of(node, struct io_kiocb, |
| io_task_work.node); |
| |
| prefetch(container_of(next, struct io_kiocb, io_task_work.node)); |
| |
| if (req->ctx != *ctx) { |
| ctx_flush_and_put(*ctx, locked); |
| *ctx = req->ctx; |
| /* if not contended, grab the lock to improve batching */ |
| *locked = mutex_trylock(&(*ctx)->uring_lock); |
| percpu_ref_get(&(*ctx)->refs); |
| } |
| req->io_task_work.func(req, locked); |
| node = next; |
| } while (node); |
| } |
| |
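| /* |
| * Run all task_work queued for this task. Priority entries (used for |
| * SQPOLL completions) are handled first and may be completed directly |
| * under ->completion_lock when ->uring_lock is contended; regular |
| * entries always run their callback, with ->uring_lock taken |
| * opportunistically to batch completions. |
| */ |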
| static void tctx_task_work(struct callback_head *cb) |
| { |
| bool uring_locked = false; |
| struct io_ring_ctx *ctx = NULL; |
| struct io_uring_task *tctx = container_of(cb, struct io_uring_task, |
| task_work); |
| |
| while (1) { |
| struct io_wq_work_node *node1, *node2; |
| |
| spin_lock_irq(&tctx->task_lock); |
| node1 = tctx->prio_task_list.first; |
| node2 = tctx->task_list.first; |
| INIT_WQ_LIST(&tctx->task_list); |
| INIT_WQ_LIST(&tctx->prio_task_list); |
| if (!node2 && !node1) |
| tctx->task_running = false; |
| spin_unlock_irq(&tctx->task_lock); |
| if (!node2 && !node1) |
| break; |
| |
| if (node1) |
| handle_prev_tw_list(node1, &ctx, &uring_locked); |
| if (node2) |
| handle_tw_list(node2, &ctx, &uring_locked); |
| cond_resched(); |
| |
| if (data_race(!tctx->task_list.first) && |
| data_race(!tctx->prio_task_list.first) && uring_locked) |
| io_submit_flush_completions(ctx); |
| } |
| |
| ctx_flush_and_put(ctx, &uring_locked); |
| |
| /* relaxed read is enough as only the task itself sets ->in_idle */ |
| if (unlikely(atomic_read(&tctx->in_idle))) |
| io_uring_drop_tctx_refs(current); |
| } |
| |
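| /* |
| * Queue @req on the given tctx list and notify the task unless task_work |
| * is already pending. If task_work_add() fails (the task is exiting), |
| * pull everything back off the lists and punt the requests to the |
| * per-ctx fallback work instead. |
| */ |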
| static void __io_req_task_work_add(struct io_kiocb *req, |
| struct io_uring_task *tctx, |
| struct io_wq_work_list *list) |
| { |
| struct io_ring_ctx *ctx = req->ctx; |
| struct io_wq_work_node *node; |
| unsigned long flags; |
| bool running; |
| |
| spin_lock_irqsave(&tctx->task_lock, flags); |
| wq_list_add_tail(&req->io_task_work.node, list); |
| running = tctx->task_running; |
| if (!running) |
| tctx->task_running = true; |
| spin_unlock_irqrestore(&tctx->task_lock, flags); |
| |
| /* task_work already pending, we're done */ |
| if (running) |
| return; |
| |
| if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) |
| atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); |
| |
| if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) |
| return; |
| |
| spin_lock_irqsave(&tctx->task_lock, flags); |
| tctx->task_running = false; |
| node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list); |
| spin_unlock_irqrestore(&tctx->task_lock, flags); |
| |
| while (node) { |
| req = container_of(node, struct io_kiocb, io_task_work.node); |
| node = node->next; |
| if (llist_add(&req->io_task_work.fallback_node, |
| &req->ctx->fallback_llist)) |
| schedule_delayed_work(&req->ctx->fallback_work, 1); |
| } |
| } |
| |
| static void io_req_task_work_add(struct io_kiocb *req) |
| { |
| struct io_uring_task *tctx = req->task->io_uring; |
| |
| __io_req_task_work_add(req, tctx, &tctx->task_list); |
| } |
| |
| static void io_req_task_prio_work_add(struct io_kiocb *req) |
| { |
| struct io_uring_task *tctx = req->task->io_uring; |
| |
| if (req->ctx->flags & IORING_SETUP_SQPOLL) |
| __io_req_task_work_add(req, tctx, &tctx->prio_task_list); |
| else |
| __io_req_task_work_add(req, tctx, &tctx->task_list); |
| } |
| |
| static void io_req_tw_post(struct io_kiocb *req, bool *locked) |
| { |
| io_req_complete_post(req, req->cqe.res, req->cqe.flags); |
| } |
| |
| static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) |
| { |
| req->cqe.res = res; |
| req->cqe.flags = cflags; |
| req->io_task_work.func = io_req_tw_post; |
| io_req_task_work_add(req); |
| } |
| |
| static void io_req_task_cancel(struct io_kiocb *req, bool *locked) |
| { |
| /* not needed for normal modes, but SQPOLL depends on it */ |
| io_tw_lock(req->ctx, locked); |
| io_req_complete_failed(req, req->cqe.res); |
| } |
| |
| static void io_req_task_submit(struct io_kiocb *req, bool *locked) |
| { |
| io_tw_lock(req->ctx, locked); |
| /* req->task == current here, checking PF_EXITING is safe */ |
| if (likely(!(req->task->flags & PF_EXITING))) |
| io_queue_sqe(req); |
| else |
| io_req_complete_failed(req, -EFAULT); |
| } |
| |
| static void io_req_task_queue_fail(struct io_kiocb *req, int ret) |
| { |
| req->cqe.res = ret; |
| req->io_task_work.func = io_req_task_cancel; |
| io_req_task_work_add(req); |
| } |
| |
| static void io_req_task_queue(struct io_kiocb *req) |
| { |
| req->io_task_work.func = io_req_task_submit; |
| io_req_task_work_add(req); |
| } |
| |
| static void io_req_task_queue_reissue(struct io_kiocb *req) |
| { |
| req->io_task_work.func = io_queue_iowq; |
| io_req_task_work_add(req); |
| } |
| |
| static void io_queue_next(struct io_kiocb *req) |
| { |
| struct io_kiocb *nxt = io_req_find_next(req); |
| |
| if (nxt) |
| io_req_task_queue(nxt); |
| } |
| |
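| /* |
| * Batched freeing of completed requests: recycle apoll entries into the |
| * ctx cache, queue up any link dependents, drop file and rsrc references, |
| * and return the requests to the allocation cache while batching the |
| * task reference puts per task. |
| */ |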
| static void io_free_batch_list(struct io_ring_ctx *ctx, |
| struct io_wq_work_node *node) |
| __must_hold(&ctx->uring_lock) |
| { |
| struct task_struct *task = NULL; |
| int task_refs = 0; |
| |
| do { |
| struct io_kiocb *req = container_of(node, struct io_kiocb, |
| comp_list); |
| |
| if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { |
| if (req->flags & REQ_F_REFCOUNT) { |
| node = req->comp_list.next; |
| if (!req_ref_put_and_test(req)) |
| continue; |
| } |
| if ((req->flags & REQ_F_POLLED) && req->apoll) { |
| struct async_poll *apoll = req->apoll; |
| |
| if (apoll->double_poll) |
| kfree(apoll->double_poll); |
| list_add(&apoll->poll.wait.entry, |
| &ctx->apoll_cache); |
| req->flags &= ~REQ_F_POLLED; |
| } |
| if (req->flags & IO_REQ_LINK_FLAGS) |
| io_queue_next(req); |
| if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) |
| io_clean_op(req); |
| } |
| if (!(req->flags & REQ_F_FIXED_FILE)) |
| io_put_file(req->file); |
| |
| io_req_put_rsrc_locked(req, ctx); |
| |
| if (req->task != task) { |
| if (task) |
| io_put_task(task, task_refs); |
| task = req->task; |
| task_refs = 0; |
| } |
| task_refs++; |
| node = req->comp_list.next; |
| io_req_add_to_cache(req, ctx); |
| } while (node); |
| |
| if (task) |
| io_put_task(task, task_refs); |
| } |
| |
| static void __io_submit_flush_completions(struct io_ring_ctx *ctx) |
| __must_hold(&ctx->uring_lock) |
| { |
| struct io_wq_work_node *node, *prev; |
| struct io_submit_state *state = &ctx->submit_state; |
| |
| if (state->flush_cqes) { |
| spin_lock(&ctx->completion_lock); |
| wq_list_for_each(node, prev, &state->compl_reqs) { |
| struct io_kiocb *req = container_of(node, struct io_kiocb, |
| comp_list); |
| |
| if (!(req->flags & REQ_F_CQE_SKIP)) |
| __io_fill_cqe_req(ctx, req); |
| } |
| |
| io_commit_cqring(ctx); |
| spin_unlock(&ctx->completion_lock); |
| io_cqring_ev_posted(ctx); |
| state->flush_cqes = false; |
| } |
| |
| io_free_batch_list(ctx, state->compl_reqs.first); |
| INIT_WQ_LIST(&state->compl_reqs); |
| } |
| |
| /* |
| * Drop a reference to the request; if it was the last reference, return the |
| * next request in the chain (if there is one). |
| */ |
| static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) |
| { |
| struct io_kiocb *nxt = NULL; |
| |
| if (req_ref_put_and_test(req)) { |
| if (unlikely(req->flags & IO_REQ_LINK_FLAGS)) |
| nxt = io_req_find_next(req); |
| io_free_req(req); |
| } |
| return nxt; |
| } |
| |
| static inline void io_put_req(struct io_kiocb *req) |
| { |
| if (req_ref_put_and_test(req)) { |
| io_queue_next(req); |
| io_free_req(req); |
| } |
| } |
| |
| static unsigned io_cqring_events(struct io_ring_ctx *ctx) |
| { |
| /* See comment at the top of this file */ |
| smp_rmb(); |
|