| // SPDX-License-Identifier: GPL-2.0 |
| /* |
| * Shared application/kernel submission and completion ring pairs, for |
| * supporting fast/efficient IO. |
| * |
| * A note on the read/write ordering memory barriers that are matched between |
| * the application and kernel side. |
| * |
| * After the application reads the CQ ring tail, it must use an |
| * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses |
| * before writing the tail (using smp_load_acquire to read the tail will |
| * do). It also needs a smp_mb() before updating CQ head (ordering the |
| * entry load(s) with the head store), pairing with an implicit barrier |
| * through a control-dependency in io_get_cqring (smp_store_release to |
| * store head will do). Failure to do so could lead to reading invalid |
| * CQ entries. |
| * |
| * Likewise, the application must use an appropriate smp_wmb() before |
| * writing the SQ tail (ordering SQ entry stores with the tail store), |
| * which pairs with smp_load_acquire in io_get_sqring (smp_store_release |
| * to store the tail will do). And it needs a barrier ordering the SQ |
| * head load before writing new SQ entries (smp_load_acquire to read |
| * head will do). |
| * |
| * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application |
| * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after* |
| * updating the SQ tail; a full memory barrier smp_mb() is needed |
| * between. |
| * |
| * Also see the examples in the liburing library: |
| * |
| * git://git.kernel.dk/liburing |
| * |
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load on data
 * shared between the kernel and application. This is done both for
 * ordering purposes and to ensure that once a value is loaded from data
 * that the application could potentially modify, it remains stable.
| * |
| * Copyright (C) 2018-2019 Jens Axboe |
| * Copyright (c) 2018-2019 Christoph Hellwig |
| */ |
| #include <linux/kernel.h> |
| #include <linux/init.h> |
| #include <linux/errno.h> |
| #include <linux/syscalls.h> |
| #include <linux/compat.h> |
| #include <net/compat.h> |
| #include <linux/refcount.h> |
| #include <linux/uio.h> |
| #include <linux/bits.h> |
| |
| #include <linux/sched/signal.h> |
| #include <linux/fs.h> |
| #include <linux/file.h> |
| #include <linux/fdtable.h> |
| #include <linux/mm.h> |
| #include <linux/mman.h> |
| #include <linux/percpu.h> |
| #include <linux/slab.h> |
| #include <linux/kthread.h> |
| #include <linux/blkdev.h> |
| #include <linux/bvec.h> |
| #include <linux/net.h> |
| #include <net/sock.h> |
| #include <net/af_unix.h> |
| #include <net/scm.h> |
| #include <linux/anon_inodes.h> |
| #include <linux/sched/mm.h> |
| #include <linux/uaccess.h> |
| #include <linux/nospec.h> |
| #include <linux/sizes.h> |
| #include <linux/hugetlb.h> |
| #include <linux/highmem.h> |
| #include <linux/namei.h> |
| #include <linux/fsnotify.h> |
| #include <linux/fadvise.h> |
| #include <linux/eventpoll.h> |
| #include <linux/fs_struct.h> |
| #include <linux/splice.h> |
| #include <linux/task_work.h> |
| |
| #define CREATE_TRACE_POINTS |
| #include <trace/events/io_uring.h> |
| |
| #include <uapi/linux/io_uring.h> |
| |
| #include "internal.h" |
| #include "io-wq.h" |
| |
/* Ring size limits; the CQ limit is defined as twice the base entry limit. */
#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)

/*
 * Fixed-file table geometry.
 *
 * Shift of 9 is 512 entries, or exactly one page on 64-bit archs.
 * IORING_FILE_TABLE_MASK is the low-bit mask matching that table size, and
 * IORING_MAX_FIXED_FILES works out to 64 tables of 512 entries each.
 */
#define IORING_FILE_TABLE_SHIFT	9
#define IORING_MAX_FILES_TABLE	(1U << IORING_FILE_TABLE_SHIFT)
#define IORING_FILE_TABLE_MASK	(IORING_MAX_FILES_TABLE - 1)
#define IORING_MAX_FIXED_FILES	(64 * IORING_MAX_FILES_TABLE)
| |
/*
 * One head/tail index pair of a ring. Each index carries the
 * ____cacheline_aligned_in_smp attribute, so head and tail are aligned
 * separately.
 */
struct io_uring {
	u32 head ____cacheline_aligned_in_smp;
	u32 tail ____cacheline_aligned_in_smp;
};
| |
/*
 * Ring state shared with user space.
 *
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 */
struct io_rings {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The kernel controls head of the sq ring and the tail of the cq ring,
	 * and the application controls tail of the sq ring and the head of the
	 * cq ring.
	 */
	struct io_uring		sq, cq;
	/*
	 * Bitmasks to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32			sq_ring_mask, cq_ring_mask;
	/* Ring sizes (constant, power of 2) */
	u32			sq_ring_entries, cq_ring_entries;
	/*
	 * Number of invalid entries dropped by the kernel due to
	 * invalid index stored in array
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * After a new SQ head value was read by the application this
	 * counter includes all submissions that were dropped reaching
	 * the new SQ head (and possibly more).
	 */
	u32			sq_dropped;
	/*
	 * Runtime SQ flags
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application.
	 *
	 * The application needs a full memory barrier before checking
	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
	 */
	u32			sq_flags;
	/*
	 * Runtime CQ flags
	 *
	 * Written by the application, shouldn't be modified by the
	 * kernel. The kernel reads it, e.g. to test
	 * IORING_CQ_EVENTFD_DISABLED before signalling the eventfd.
	 */
	u32			cq_flags;
	/*
	 * Number of completion events lost because the queue was full;
	 * this should be avoided by the application by making sure
	 * there are not more requests pending than there is space in
	 * the completion queue.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * As completion events come in out of order this counter is not
	 * ordered with any other data.
	 */
	u32			cq_overflow;
	/*
	 * Ring buffer of completion events.
	 *
	 * The kernel writes completion events fresh every time they are
	 * produced, so the application is allowed to modify pending
	 * entries.
	 */
	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
};
| |
/*
 * Descriptor for one fixed mapped user buffer; io_ring_ctx.user_bufs points
 * at an array of these.
 * NOTE(review): the per-field meanings below are inferred from the names;
 * confirm against the buffer registration code.
 */
struct io_mapped_ubuf {
	u64		ubuf;		/* presumably the user-space start address */
	size_t		len;		/* presumably the buffer length in bytes */
	struct bio_vec	*bvec;		/* bio_vec array for the buffer */
	unsigned int	nr_bvecs;	/* presumably the number of bvec entries */
};
| |
/*
 * One table of fixed-file pointers; fixed_file_data.table points at these.
 * NOTE(review): presumably each table holds IORING_MAX_FILES_TABLE
 * entries; confirm against the registration code.
 */
struct fixed_file_table {
	struct file		**files;
};
| |
/*
 * Reference node for the fixed-file set, linking a percpu_ref to its owning
 * fixed_file_data.
 * NOTE(review): list/llist usage is inferred from the names here and in
 * fixed_file_data / io_ring_ctx (ref_list, file_put_llist); confirm.
 */
struct fixed_file_ref_node {
	struct percpu_ref		refs;
	struct list_head		node;		/* presumably on fixed_file_data.ref_list */
	struct list_head		file_list;
	struct fixed_file_data		*file_data;	/* owning file set */
	struct llist_node		llist;		/* presumably queued on ctx->file_put_llist */
};
| |
/*
 * State of the fixed (registered) file set of a ring; reachable through
 * io_ring_ctx.file_data.
 * NOTE(review): the locking and lifetime rules for these fields are not
 * visible in this part of the file; confirm before relying on them.
 */
struct fixed_file_data {
	struct fixed_file_table		*table;		/* array of file-pointer tables */
	struct io_ring_ctx		*ctx;		/* owning ring context */

	struct percpu_ref		*cur_refs;
	struct percpu_ref		refs;
	struct completion		done;
	struct list_head		ref_list;
	spinlock_t			lock;
};
| |
/*
 * One buffer entry, linkable on a list.
 * NOTE(review): presumably a provided buffer for buffer selection (see
 * io_ring_ctx.io_buffer_idr and struct io_provide_buf); confirm.
 */
struct io_buffer {
	struct list_head list;
	__u64 addr;	/* presumably the user address of the buffer */
	__s32 len;	/* presumably the buffer length */
	__u16 bid;	/* presumably the buffer ID */
};
| |
/*
 * Per-ring kernel context, allocated and initialised by io_ring_ctx_alloc().
 * Related members are grouped into anonymous structs that each carry
 * ____cacheline_aligned_in_smp.
 */
struct io_ring_ctx {
	struct {
		/* released through io_ring_ctx_ref_free(), which completes ref_comp */
		struct percpu_ref	refs;
	} ____cacheline_aligned_in_smp;

	struct {
		/* copy of io_uring_params.flags (IORING_SETUP_*) */
		unsigned int		flags;
		unsigned int		compat: 1;
		unsigned int		account_mem: 1;
		/* set by a forced overflow flush; see io_cqring_overflow_flush() */
		unsigned int		cq_overflow_flushed: 1;
		unsigned int		drain_next: 1;
		/* when set, the eventfd is only signalled from io-wq workers */
		unsigned int		eventfd_async: 1;

		/*
		 * Ring buffer of indices into array of io_uring_sqe, which is
		 * mmapped by the application using the IORING_OFF_SQES offset.
		 *
		 * This indirection could e.g. be used to assign fixed
		 * io_uring_sqe entries to operations and only submit them to
		 * the queue when needed.
		 *
		 * The kernel modifies neither the indices array nor the entries
		 * array.
		 */
		u32			*sq_array;
		unsigned		cached_sq_head;
		unsigned		sq_entries;
		unsigned		sq_mask;
		unsigned		sq_thread_idle;
		unsigned		cached_sq_dropped;
		/* kernel-side copy of rings->cq_overflow */
		atomic_t		cached_cq_overflow;
		unsigned long		sq_check_overflow;

		struct list_head	defer_list;
		struct list_head	timeout_list;
		struct list_head	cq_overflow_list;

		wait_queue_head_t	inflight_wait;
		struct io_uring_sqe	*sq_sqes;
	} ____cacheline_aligned_in_smp;

	/* ring memory shared with the application */
	struct io_rings	*rings;

	/* IO offload */
	struct io_wq		*io_wq;
	struct task_struct	*sqo_thread;	/* if using sq thread polling */
	struct mm_struct	*sqo_mm;
	wait_queue_head_t	sqo_wait;

	/*
	 * If used, fixed file set. Writers must ensure that ->refs is dead,
	 * readers must ensure that ->refs is alive as long as the file* is
	 * used. Only updated through io_uring_register(2).
	 */
	struct fixed_file_data	*file_data;
	unsigned		nr_user_files;
	int			ring_fd;
	struct file		*ring_file;

	/* if used, fixed mapped user buffers */
	unsigned		nr_user_bufs;
	struct io_mapped_ubuf	*user_bufs;

	struct user_struct	*user;

	const struct cred	*creds;

	struct completion	ref_comp;
	struct completion	sq_thread_comp;

	/* if all else fails... */
	struct io_kiocb		*fallback_req;

#if defined(CONFIG_UNIX)
	/* socket returned by io_uring_get_socket() */
	struct socket		*ring_sock;
#endif

	struct idr		io_buffer_idr;

	struct idr		personality_idr;

	struct {
		/* kernel-private CQ tail; published to rings->cq.tail on commit */
		unsigned		cached_cq_tail;
		unsigned		cq_entries;
		unsigned		cq_mask;
		atomic_t		cq_timeouts;
		unsigned long		cq_check_overflow;
		struct wait_queue_head	cq_wait;
		struct fasync_struct	*cq_fasync;
		struct eventfd_ctx	*cq_ev_fd;
	} ____cacheline_aligned_in_smp;

	struct {
		struct mutex		uring_lock;
		wait_queue_head_t	wait;
	} ____cacheline_aligned_in_smp;

	struct {
		spinlock_t		completion_lock;

		/*
		 * ->poll_list is protected by the ctx->uring_lock for
		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
		 * For SQPOLL, only the single threaded io_sq_thread() will
		 * manipulate the list, hence no extra locking is needed there.
		 */
		struct list_head	poll_list;
		struct hlist_head	*cancel_hash;
		unsigned		cancel_hash_bits;
		bool			poll_multi_file;

		spinlock_t		inflight_lock;
		struct list_head	inflight_list;
	} ____cacheline_aligned_in_smp;

	/* initialised in io_ring_ctx_alloc() with io_file_put_work() as handler */
	struct delayed_work		file_put_work;
	struct llist_head		file_put_llist;

	struct work_struct		exit_work;
};
| |
/*
 * First field must be the file pointer in all the
 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 *
 * Poll request state: overlaid on io_kiocb as member 'poll' and embedded in
 * struct async_poll.
 */
struct io_poll_iocb {
	struct file			*file;
	union {
		struct wait_queue_head	*head;
		u64			addr;
	};
	__poll_t			events;
	bool				done;
	bool				canceled;
	struct wait_queue_entry		wait;
};
| |
/*
 * Request data overlaid on io_kiocb as member 'close'; 'file' stays first.
 * NOTE(review): presumably used by IORING_OP_CLOSE; confirm.
 */
struct io_close {
	struct file			*file;
	struct file			*put_file;
	int				fd;
};
| |
/*
 * Timer state for a timeout request. Stored in io_async_ctx as member
 * 'timeout'; io_kill_timeout() cancels 'timer' through req->io->timeout.
 */
struct io_timeout_data {
	struct io_kiocb			*req;	/* request this timer belongs to */
	struct hrtimer			timer;
	struct timespec64		ts;
	enum hrtimer_mode		mode;
};
| |
/*
 * Request data overlaid on io_kiocb as member 'accept'; 'file' stays first.
 * NOTE(review): presumably used by IORING_OP_ACCEPT; confirm.
 */
struct io_accept {
	struct file			*file;
	struct sockaddr __user		*addr;
	int __user			*addr_len;
	int				flags;
	unsigned long			nofile;
};
| |
/*
 * Request data overlaid on io_kiocb as member 'sync'; 'file' stays first.
 * NOTE(review): presumably shared by the fsync/sync_file_range/fallocate
 * style opcodes; confirm against the prep handlers.
 */
struct io_sync {
	struct file			*file;
	loff_t				len;
	loff_t				off;
	int				flags;
	int				mode;
};
| |
/*
 * Request data overlaid on io_kiocb as member 'cancel'; 'file' stays first.
 * NOTE(review): 'addr' presumably identifies the request to cancel; confirm.
 */
struct io_cancel {
	struct file			*file;
	u64				addr;
};
| |
/*
 * Request data overlaid on io_kiocb as member 'timeout'; 'file' stays first.
 * io_flush_timeouts() compares 'target_seq' against
 * cached_cq_tail - cq_timeouts to decide when a timeout has been reached.
 */
struct io_timeout {
	struct file			*file;
	u64				addr;
	int				flags;
	u32				off;
	u32				target_seq;
};
| |
/*
 * Read/write request data, overlaid on io_kiocb as member 'rw'. The embedded
 * kiocb supplies the leading file pointer.
 */
struct io_rw {
	/* NOTE: kiocb has the file as the first member, so don't do it here */
	struct kiocb			kiocb;
	u64				addr;
	u64				len;
};
| |
/*
 * Request data overlaid on io_kiocb as member 'connect'; 'file' stays first.
 * NOTE(review): presumably used by IORING_OP_CONNECT; confirm.
 */
struct io_connect {
	struct file			*file;
	struct sockaddr __user		*addr;
	int				addr_len;
};
| |
/*
 * Request data overlaid on io_kiocb as member 'sr_msg'; 'file' stays first.
 * NOTE(review): presumably shared by the send/recv and sendmsg/recvmsg
 * opcodes, with the union holding either a msghdr or a plain buffer;
 * confirm against the prep handlers.
 */
struct io_sr_msg {
	struct file			*file;
	union {
		struct user_msghdr __user *msg;
		void __user		*buf;
	};
	int				msg_flags;
	int				bgid;
	size_t				len;
	struct io_buffer		*kbuf;
};
| |
/*
 * Request data overlaid on io_kiocb as member 'open'; 'file' stays first.
 * NOTE(review): presumably shared by IORING_OP_OPENAT and OPENAT2; confirm.
 */
struct io_open {
	struct file			*file;
	int				dfd;
	struct filename			*filename;
	struct open_how			how;
	unsigned long			nofile;
};
| |
/*
 * Request data overlaid on io_kiocb as member 'files_update'; 'file' stays
 * first.
 * NOTE(review): presumably used by IORING_OP_FILES_UPDATE; confirm.
 */
struct io_files_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};
| |
/*
 * Request data overlaid on io_kiocb as member 'fadvise'; 'file' stays first.
 * NOTE(review): presumably used by IORING_OP_FADVISE; confirm.
 */
struct io_fadvise {
	struct file			*file;
	u64				offset;
	u32				len;
	u32				advice;
};
| |
/*
 * Request data overlaid on io_kiocb as member 'madvise'; 'file' stays first.
 * NOTE(review): presumably used by IORING_OP_MADVISE; confirm.
 */
struct io_madvise {
	struct file			*file;
	u64				addr;
	u32				len;
	u32				advice;
};
| |
/*
 * Request data overlaid on io_kiocb as member 'epoll'; 'file' stays first.
 * NOTE(review): presumably used by IORING_OP_EPOLL_CTL; confirm.
 */
struct io_epoll {
	struct file			*file;
	int				epfd;
	int				op;
	int				fd;
	struct epoll_event		event;
};
| |
/*
 * Request data overlaid on io_kiocb as member 'splice'. 'file_out' is the
 * first member and therefore aliases io_kiocb.file.
 * NOTE(review): presumably shared by IORING_OP_SPLICE and IORING_OP_TEE;
 * confirm.
 */
struct io_splice {
	struct file			*file_out;
	struct file			*file_in;
	loff_t				off_out;
	loff_t				off_in;
	u64				len;
	unsigned int			flags;
};
| |
/*
 * Request data overlaid on io_kiocb as member 'pbuf'; 'file' stays first.
 * NOTE(review): presumably shared by IORING_OP_PROVIDE_BUFFERS and
 * IORING_OP_REMOVE_BUFFERS; confirm.
 */
struct io_provide_buf {
	struct file			*file;
	__u64				addr;
	__s32				len;
	__u32				bgid;
	__u16				nbufs;
	__u16				bid;
};
| |
/*
 * Request data overlaid on io_kiocb as member 'statx'; 'file' stays first.
 * NOTE(review): presumably used by IORING_OP_STATX; confirm.
 */
struct io_statx {
	struct file			*file;
	int				dfd;
	unsigned int			mask;
	unsigned int			flags;
	const char __user		*filename;
	struct statx __user		*buffer;
};
| |
/*
 * Async context for connect requests (io_async_ctx member 'connect').
 * NOTE(review): presumably a kernel copy of the user-supplied address;
 * confirm.
 */
struct io_async_connect {
	struct sockaddr_storage		address;
};
| |
/*
 * Async context for message-based requests (io_async_ctx member 'msg').
 * NOTE(review): 'iov' presumably points at either fast_iov or a separately
 * allocated array; confirm against the sendmsg/recvmsg code.
 */
struct io_async_msghdr {
	struct iovec			fast_iov[UIO_FASTIOV];
	struct iovec			*iov;
	struct sockaddr __user		*uaddr;
	struct msghdr			msg;
	struct sockaddr_storage		addr;
};
| |
/*
 * Async context for read/write requests (io_async_ctx member 'rw').
 * NOTE(review): 'iov' presumably points at either fast_iov or a separately
 * allocated array; confirm against the read/write code.
 */
struct io_async_rw {
	struct iovec			fast_iov[UIO_FASTIOV];
	struct iovec			*iov;
	ssize_t				nr_segs;
	ssize_t				size;
};
| |
/*
 * Per-request async context, referenced through io_kiocb.io. Only one union
 * member is meaningful for a given request.
 */
struct io_async_ctx {
	union {
		struct io_async_rw	rw;
		struct io_async_msghdr	msg;
		struct io_async_connect	connect;
		struct io_timeout_data	timeout;
	};
};
| |
/*
 * Bit numbers for io_kiocb.flags. The first group reuses the user-visible
 * IOSQE_*_BIT numbers; the remaining bits are kernel-internal and are
 * numbered consecutively after them. The REQ_F_* masks in the following
 * enum are built from these with BIT().
 */
enum {
	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,
	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT,
	REQ_F_LINK_BIT		= IOSQE_IO_LINK_BIT,
	REQ_F_HARDLINK_BIT	= IOSQE_IO_HARDLINK_BIT,
	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,

	REQ_F_LINK_HEAD_BIT,
	REQ_F_LINK_NEXT_BIT,
	REQ_F_FAIL_LINK_BIT,
	REQ_F_INFLIGHT_BIT,
	REQ_F_CUR_POS_BIT,
	REQ_F_NOWAIT_BIT,
	REQ_F_LINK_TIMEOUT_BIT,
	REQ_F_TIMEOUT_BIT,
	REQ_F_ISREG_BIT,
	REQ_F_MUST_PUNT_BIT,
	REQ_F_TIMEOUT_NOSEQ_BIT,
	REQ_F_COMP_LOCKED_BIT,
	REQ_F_NEED_CLEANUP_BIT,
	REQ_F_OVERFLOW_BIT,
	REQ_F_POLLED_BIT,
	REQ_F_BUFFER_SELECTED_BIT,
	REQ_F_NO_FILE_TABLE_BIT,
	REQ_F_QUEUE_TIMEOUT_BIT,
	REQ_F_WORK_INITIALIZED_BIT,

	/* not a real bit, just to check we're not overflowing the space */
	__REQ_F_LAST_BIT,
};
| |
/* Flag masks for io_kiocb.flags, one per REQ_F_*_BIT number. */
enum {
	/* ctx owns file */
	REQ_F_FIXED_FILE	= BIT(REQ_F_FIXED_FILE_BIT),
	/* drain existing IO first */
	REQ_F_IO_DRAIN		= BIT(REQ_F_IO_DRAIN_BIT),
	/* linked sqes */
	REQ_F_LINK		= BIT(REQ_F_LINK_BIT),
	/* doesn't sever on completion < 0 */
	REQ_F_HARDLINK		= BIT(REQ_F_HARDLINK_BIT),
	/* IOSQE_ASYNC */
	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),
	/* IOSQE_BUFFER_SELECT */
	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),

	/* head of a link */
	REQ_F_LINK_HEAD		= BIT(REQ_F_LINK_HEAD_BIT),
	/* already grabbed next link */
	REQ_F_LINK_NEXT		= BIT(REQ_F_LINK_NEXT_BIT),
	/* fail rest of links */
	REQ_F_FAIL_LINK		= BIT(REQ_F_FAIL_LINK_BIT),
	/* on inflight list */
	REQ_F_INFLIGHT		= BIT(REQ_F_INFLIGHT_BIT),
	/* read/write uses file position */
	REQ_F_CUR_POS		= BIT(REQ_F_CUR_POS_BIT),
	/* must not punt to workers */
	REQ_F_NOWAIT		= BIT(REQ_F_NOWAIT_BIT),
	/* has linked timeout */
	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
	/* timeout request */
	REQ_F_TIMEOUT		= BIT(REQ_F_TIMEOUT_BIT),
	/* regular file */
	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
	/* must be punted even for NONBLOCK */
	REQ_F_MUST_PUNT		= BIT(REQ_F_MUST_PUNT_BIT),
	/* no timeout sequence */
	REQ_F_TIMEOUT_NOSEQ	= BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
	/* completion under lock */
	REQ_F_COMP_LOCKED	= BIT(REQ_F_COMP_LOCKED_BIT),
	/* needs cleanup */
	REQ_F_NEED_CLEANUP	= BIT(REQ_F_NEED_CLEANUP_BIT),
	/* in overflow list */
	REQ_F_OVERFLOW		= BIT(REQ_F_OVERFLOW_BIT),
	/* already went through poll handler */
	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),
	/* buffer already selected */
	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
	/* doesn't need file table for this request */
	REQ_F_NO_FILE_TABLE	= BIT(REQ_F_NO_FILE_TABLE_BIT),
	/* needs to queue linked timeout */
	REQ_F_QUEUE_TIMEOUT	= BIT(REQ_F_QUEUE_TIMEOUT_BIT),
	/* io_wq_work is initialized (set by io_req_init_async()) */
	REQ_F_WORK_INITIALIZED	= BIT(REQ_F_WORK_INITIALIZED_BIT),
};
| |
/*
 * State for an async armed poll handler, referenced through io_kiocb.apoll.
 * NOTE(review): 'work' presumably holds the saved io_wq_work that the poll
 * path restores (see the comment in struct io_kiocb); confirm.
 */
struct async_poll {
	struct io_poll_iocb	poll;
	struct io_wq_work	work;
};
| |
/*
 * In-kernel representation of one io_uring request.
 *
 * NOTE! Each of the iocb union members has the file pointer
 * as the first entry in their struct definition. So you can
 * access the file pointer through any of the sub-structs,
 * or directly as just 'file' in this struct.
 */
struct io_kiocb {
	union {
		struct file		*file;
		struct io_rw		rw;
		struct io_poll_iocb	poll;
		struct io_accept	accept;
		struct io_sync		sync;
		struct io_cancel	cancel;
		struct io_timeout	timeout;
		struct io_connect	connect;
		struct io_sr_msg	sr_msg;
		struct io_open		open;
		struct io_close		close;
		struct io_files_update	files_update;
		struct io_fadvise	fadvise;
		struct io_madvise	madvise;
		struct io_epoll		epoll;
		struct io_splice	splice;
		struct io_provide_buf	pbuf;
		struct io_statx		statx;
	};

	/* async context; see struct io_async_ctx */
	struct io_async_ctx		*io;
	/* written to cqe->flags when an overflowed completion is flushed */
	int				cflags;
	/* index into io_op_defs[] */
	u8				opcode;
	/* polled IO has completed */
	u8				iopoll_completed;

	u16				buf_index;

	struct io_ring_ctx	*ctx;
	/* links the request on ctx lists (defer, timeout, cq_overflow) */
	struct list_head	list;
	/* REQ_F_* flags */
	unsigned int		flags;
	refcount_t		refs;
	struct task_struct	*task;
	unsigned long		fsize;
	u64			user_data;
	u32			result;
	/* compared against the CQ tail by __req_need_defer() */
	u32			sequence;

	struct list_head	link_list;

	struct list_head	inflight_entry;

	struct percpu_ref	*fixed_file_refs;

	union {
		/*
		 * Only commands that never go async can use the below fields,
		 * obviously. Right now only IORING_OP_POLL_ADD uses them, and
		 * async armed poll handlers for regular commands. The latter
		 * restore the work, if needed.
		 */
		struct {
			struct callback_head	task_work;
			struct hlist_node	hash_node;
			struct async_poll	*apoll;
		};
		/* must be set up via io_req_init_async() before first use */
		struct io_wq_work	work;
	};
};
| |
/*
 * Batching constants. IO_IOPOLL_BATCH also sizes the request allocation
 * cache in struct io_submit_state.
 * NOTE(review): IO_PLUG_THRESHOLD is presumably the minimum submission count
 * at which block plugging is used; confirm at its use site.
 */
#define IO_PLUG_THRESHOLD		2
#define IO_IOPOLL_BATCH			8
| |
/*
 * Transient state carried across one submission batch: a block plug, a
 * small cache of preallocated requests, and a file reference cache.
 * NOTE(review): the exact accounting rules for has_refs/used_refs/ios_left
 * are not visible here; confirm in the submission path.
 */
struct io_submit_state {
	struct blk_plug		plug;

	/*
	 * io_kiocb alloc cache
	 */
	void			*reqs[IO_IOPOLL_BATCH];
	unsigned int		free_reqs;

	/*
	 * File reference cache
	 */
	struct file		*file;
	unsigned int		fd;
	unsigned int		has_refs;
	unsigned int		used_refs;
	unsigned int		ios_left;
};
| |
/*
 * Static per-opcode properties; one entry per opcode lives in io_op_defs[].
 * Every member is a single-bit flag.
 */
struct io_op_def {
	/* needs req->io allocated for deferral/async */
	unsigned		async_ctx : 1;
	/* needs current->mm setup, does mm access */
	unsigned		needs_mm : 1;
	/* needs req->file assigned */
	unsigned		needs_file : 1;
	/* don't fail if file grab fails */
	unsigned		needs_file_no_error : 1;
	/* hash wq insertion if file is a regular file */
	unsigned		hash_reg_file : 1;
	/* unbound wq insertion if file is a non-regular file */
	unsigned		unbound_nonreg_file : 1;
	/* opcode is not supported by this kernel */
	unsigned		not_supported : 1;
	/* needs file table */
	unsigned		file_table : 1;
	/* needs ->fs */
	unsigned		needs_fs : 1;
	/* set if opcode supports polled "wait" */
	unsigned		pollin : 1;
	unsigned		pollout : 1;
	/* op supports buffer selection */
	unsigned		buffer_select : 1;
};
| |
/*
 * Property table indexed by opcode (IORING_OP_*); looked up as
 * io_op_defs[req->opcode]. Entries with an empty initializer set none of
 * the io_op_def flags.
 */
static const struct io_op_def io_op_defs[] = {
	[IORING_OP_NOP] = {},
	[IORING_OP_READV] = {
		.async_ctx		= 1,
		.needs_mm		= 1,
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
	},
	[IORING_OP_WRITEV] = {
		.async_ctx		= 1,
		.needs_mm		= 1,
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
	},
	[IORING_OP_FSYNC] = {
		.needs_file		= 1,
	},
	[IORING_OP_READ_FIXED] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
	},
	[IORING_OP_WRITE_FIXED] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
	},
	[IORING_OP_POLL_ADD] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_POLL_REMOVE] = {},
	[IORING_OP_SYNC_FILE_RANGE] = {
		.needs_file		= 1,
	},
	[IORING_OP_SENDMSG] = {
		.async_ctx		= 1,
		.needs_mm		= 1,
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.needs_fs		= 1,
		.pollout		= 1,
	},
	[IORING_OP_RECVMSG] = {
		.async_ctx		= 1,
		.needs_mm		= 1,
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.needs_fs		= 1,
		.pollin			= 1,
		.buffer_select		= 1,
	},
	[IORING_OP_TIMEOUT] = {
		.async_ctx		= 1,
		.needs_mm		= 1,
	},
	[IORING_OP_TIMEOUT_REMOVE] = {},
	[IORING_OP_ACCEPT] = {
		.needs_mm		= 1,
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.file_table		= 1,
		.pollin			= 1,
	},
	[IORING_OP_ASYNC_CANCEL] = {},
	[IORING_OP_LINK_TIMEOUT] = {
		.async_ctx		= 1,
		.needs_mm		= 1,
	},
	[IORING_OP_CONNECT] = {
		.async_ctx		= 1,
		.needs_mm		= 1,
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
	},
	[IORING_OP_FALLOCATE] = {
		.needs_file		= 1,
	},
	[IORING_OP_OPENAT] = {
		.file_table		= 1,
		.needs_fs		= 1,
	},
	[IORING_OP_CLOSE] = {
		.needs_file		= 1,
		.needs_file_no_error	= 1,
		.file_table		= 1,
	},
	[IORING_OP_FILES_UPDATE] = {
		.needs_mm		= 1,
		.file_table		= 1,
	},
	[IORING_OP_STATX] = {
		.needs_mm		= 1,
		.needs_fs		= 1,
		.file_table		= 1,
	},
	[IORING_OP_READ] = {
		.needs_mm		= 1,
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
	},
	/*
	 * NOTE(review): unlike IORING_OP_WRITEV and IORING_OP_WRITE_FIXED,
	 * this entry does not set hash_reg_file -- confirm that is intended.
	 */
	[IORING_OP_WRITE] = {
		.needs_mm		= 1,
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
	},
	[IORING_OP_FADVISE] = {
		.needs_file		= 1,
	},
	[IORING_OP_MADVISE] = {
		.needs_mm		= 1,
	},
	[IORING_OP_SEND] = {
		.needs_mm		= 1,
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
	},
	[IORING_OP_RECV] = {
		.needs_mm		= 1,
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
	},
	[IORING_OP_OPENAT2] = {
		.file_table		= 1,
		.needs_fs		= 1,
	},
	[IORING_OP_EPOLL_CTL] = {
		.unbound_nonreg_file	= 1,
		.file_table		= 1,
	},
	[IORING_OP_SPLICE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_PROVIDE_BUFFERS] = {},
	[IORING_OP_REMOVE_BUFFERS] = {},
	[IORING_OP_TEE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
	},
};
| |
/*
 * Forward declarations for helpers that are referenced before their
 * definitions later in this file.
 */
static void io_wq_submit_work(struct io_wq_work **workptr);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void io_put_req(struct io_kiocb *req);
static void __io_double_put_req(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_files_update *ip,
				 unsigned nr_args);
static int io_grab_files(struct io_kiocb *req);
static void io_cleanup_req(struct io_kiocb *req);
static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
		       int fd, struct file **out_file, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req,
			   const struct io_uring_sqe *sqe);

/* slab cache for struct io_kiocb; ctx->fallback_req is allocated from it */
static struct kmem_cache *req_cachep;

/* file operations of the ring fd; used to recognise io_uring files */
static const struct file_operations io_uring_fops;
| |
| struct sock *io_uring_get_socket(struct file *file) |
| { |
| #if defined(CONFIG_UNIX) |
| if (file->f_op == &io_uring_fops) { |
| struct io_ring_ctx *ctx = file->private_data; |
| |
| return ctx->ring_sock->sk; |
| } |
| #endif |
| return NULL; |
| } |
| EXPORT_SYMBOL(io_uring_get_socket); |
| |
/* Forward declaration: io_ring_ctx_alloc() installs this as the file_put_work handler. */
static void io_file_put_work(struct work_struct *work);
| |
| /* |
| * Note: must call io_req_init_async() for the first time you |
| * touch any members of io_wq_work. |
| */ |
| static inline void io_req_init_async(struct io_kiocb *req) |
| { |
| if (req->flags & REQ_F_WORK_INITIALIZED) |
| return; |
| |
| memset(&req->work, 0, sizeof(req->work)); |
| req->flags |= REQ_F_WORK_INITIALIZED; |
| } |
| |
| static inline bool io_async_submit(struct io_ring_ctx *ctx) |
| { |
| return ctx->flags & IORING_SETUP_SQPOLL; |
| } |
| |
| static void io_ring_ctx_ref_free(struct percpu_ref *ref) |
| { |
| struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); |
| |
| complete(&ctx->ref_comp); |
| } |
| |
| static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) |
| { |
| struct io_ring_ctx *ctx; |
| int hash_bits; |
| |
| ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); |
| if (!ctx) |
| return NULL; |
| |
| ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL); |
| if (!ctx->fallback_req) |
| goto err; |
| |
| /* |
| * Use 5 bits less than the max cq entries, that should give us around |
| * 32 entries per hash list if totally full and uniformly spread. |
| */ |
| hash_bits = ilog2(p->cq_entries); |
| hash_bits -= 5; |
| if (hash_bits <= 0) |
| hash_bits = 1; |
| ctx->cancel_hash_bits = hash_bits; |
| ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), |
| GFP_KERNEL); |
| if (!ctx->cancel_hash) |
| goto err; |
| __hash_init(ctx->cancel_hash, 1U << hash_bits); |
| |
| if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, |
| PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) |
| goto err; |
| |
| ctx->flags = p->flags; |
| init_waitqueue_head(&ctx->sqo_wait); |
| init_waitqueue_head(&ctx->cq_wait); |
| INIT_LIST_HEAD(&ctx->cq_overflow_list); |
| init_completion(&ctx->ref_comp); |
| init_completion(&ctx->sq_thread_comp); |
| idr_init(&ctx->io_buffer_idr); |
| idr_init(&ctx->personality_idr); |
| mutex_init(&ctx->uring_lock); |
| init_waitqueue_head(&ctx->wait); |
| spin_lock_init(&ctx->completion_lock); |
| INIT_LIST_HEAD(&ctx->poll_list); |
| INIT_LIST_HEAD(&ctx->defer_list); |
| INIT_LIST_HEAD(&ctx->timeout_list); |
| init_waitqueue_head(&ctx->inflight_wait); |
| spin_lock_init(&ctx->inflight_lock); |
| INIT_LIST_HEAD(&ctx->inflight_list); |
| INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work); |
| init_llist_head(&ctx->file_put_llist); |
| return ctx; |
| err: |
| if (ctx->fallback_req) |
| kmem_cache_free(req_cachep, ctx->fallback_req); |
| kfree(ctx->cancel_hash); |
| kfree(ctx); |
| return NULL; |
| } |
| |
| static inline bool __req_need_defer(struct io_kiocb *req) |
| { |
| struct io_ring_ctx *ctx = req->ctx; |
| |
| return req->sequence != ctx->cached_cq_tail |
| + atomic_read(&ctx->cached_cq_overflow); |
| } |
| |
| static inline bool req_need_defer(struct io_kiocb *req) |
| { |
| if (unlikely(req->flags & REQ_F_IO_DRAIN)) |
| return __req_need_defer(req); |
| |
| return false; |
| } |
| |
| static void __io_commit_cqring(struct io_ring_ctx *ctx) |
| { |
| struct io_rings *rings = ctx->rings; |
| |
| /* order cqe stores with ring update */ |
| smp_store_release(&rings->cq.tail, ctx->cached_cq_tail); |
| |
| if (wq_has_sleeper(&ctx->cq_wait)) { |
| wake_up_interruptible(&ctx->cq_wait); |
| kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); |
| } |
| } |
| |
| static inline void io_req_work_grab_env(struct io_kiocb *req, |
| const struct io_op_def *def) |
| { |
| if (!req->work.mm && def->needs_mm) { |
| mmgrab(current->mm); |
| req->work.mm = current->mm; |
| } |
| if (!req->work.creds) |
| req->work.creds = get_current_cred(); |
| if (!req->work.fs && def->needs_fs) { |
| spin_lock(¤t->fs->lock); |
| if (!current->fs->in_exec) { |
| req->work.fs = current->fs; |
| req->work.fs->users++; |
| } else { |
| req->work.flags |= IO_WQ_WORK_CANCEL; |
| } |
| spin_unlock(¤t->fs->lock); |
| } |
| if (!req->work.task_pid) |
| req->work.task_pid = task_pid_vnr(current); |
| } |
| |
| static inline void io_req_work_drop_env(struct io_kiocb *req) |
| { |
| if (!(req->flags & REQ_F_WORK_INITIALIZED)) |
| return; |
| |
| if (req->work.mm) { |
| mmdrop(req->work.mm); |
| req->work.mm = NULL; |
| } |
| if (req->work.creds) { |
| put_cred(req->work.creds); |
| req->work.creds = NULL; |
| } |
| if (req->work.fs) { |
| struct fs_struct *fs = req->work.fs; |
| |
| spin_lock(&req->work.fs->lock); |
| if (--fs->users) |
| fs = NULL; |
| spin_unlock(&req->work.fs->lock); |
| if (fs) |
| free_fs_struct(fs); |
| } |
| } |
| |
| static inline void io_prep_async_work(struct io_kiocb *req, |
| struct io_kiocb **link) |
| { |
| const struct io_op_def *def = &io_op_defs[req->opcode]; |
| |
| if (req->flags & REQ_F_ISREG) { |
| if (def->hash_reg_file) |
| io_wq_hash_work(&req->work, file_inode(req->file)); |
| } else { |
| if (def->unbound_nonreg_file) |
| req->work.flags |= IO_WQ_WORK_UNBOUND; |
| } |
| |
| io_req_work_grab_env(req, def); |
| |
| *link = io_prep_linked_timeout(req); |
| } |
| |
| static inline void io_queue_async_work(struct io_kiocb *req) |
| { |
| struct io_ring_ctx *ctx = req->ctx; |
| struct io_kiocb *link; |
| |
| io_prep_async_work(req, &link); |
| |
| trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, |
| &req->work, req->flags); |
| io_wq_enqueue(ctx->io_wq, &req->work); |
| |
| if (link) |
| io_queue_linked_timeout(link); |
| } |
| |
| static void io_kill_timeout(struct io_kiocb *req) |
| { |
| int ret; |
| |
| ret = hrtimer_try_to_cancel(&req->io->timeout.timer); |
| if (ret != -1) { |
| atomic_inc(&req->ctx->cq_timeouts); |
| list_del_init(&req->list); |
| req->flags |= REQ_F_COMP_LOCKED; |
| io_cqring_fill_event(req, 0); |
| io_put_req(req); |
| } |
| } |
| |
| static void io_kill_timeouts(struct io_ring_ctx *ctx) |
| { |
| struct io_kiocb *req, *tmp; |
| |
| spin_lock_irq(&ctx->completion_lock); |
| list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list) |
| io_kill_timeout(req); |
| spin_unlock_irq(&ctx->completion_lock); |
| } |
| |
| static void __io_queue_deferred(struct io_ring_ctx *ctx) |
| { |
| do { |
| struct io_kiocb *req = list_first_entry(&ctx->defer_list, |
| struct io_kiocb, list); |
| |
| if (req_need_defer(req)) |
| break; |
| list_del_init(&req->list); |
| io_queue_async_work(req); |
| } while (!list_empty(&ctx->defer_list)); |
| } |
| |
| static void io_flush_timeouts(struct io_ring_ctx *ctx) |
| { |
| while (!list_empty(&ctx->timeout_list)) { |
| struct io_kiocb *req = list_first_entry(&ctx->timeout_list, |
| struct io_kiocb, list); |
| |
| if (req->flags & REQ_F_TIMEOUT_NOSEQ) |
| break; |
| if (req->timeout.target_seq != ctx->cached_cq_tail |
| - atomic_read(&ctx->cq_timeouts)) |
| break; |
| |
| list_del_init(&req->list); |
| io_kill_timeout(req); |
| } |
| } |
| |
| static void io_commit_cqring(struct io_ring_ctx *ctx) |
| { |
| io_flush_timeouts(ctx); |
| __io_commit_cqring(ctx); |
| |
| if (unlikely(!list_empty(&ctx->defer_list))) |
| __io_queue_deferred(ctx); |
| } |
| |
| static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) |
| { |
| struct io_rings *rings = ctx->rings; |
| unsigned tail; |
| |
| tail = ctx->cached_cq_tail; |
| /* |
| * writes to the cq entry need to come after reading head; the |
| * control dependency is enough as we're using WRITE_ONCE to |
| * fill the cq entry |
| */ |
| if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries) |
| return NULL; |
| |
| ctx->cached_cq_tail++; |
| return &rings->cqes[tail & ctx->cq_mask]; |
| } |
| |
| static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) |
| { |
| if (!ctx->cq_ev_fd) |
| return false; |
| if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) |
| return false; |
| if (!ctx->eventfd_async) |
| return true; |
| return io_wq_current_is_worker(); |
| } |
| |
| static void io_cqring_ev_posted(struct io_ring_ctx *ctx) |
| { |
| if (waitqueue_active(&ctx->wait)) |
| wake_up(&ctx->wait); |
| if (waitqueue_active(&ctx->sqo_wait)) |
| wake_up(&ctx->sqo_wait); |
| if (io_should_trigger_evfd(ctx)) |
| eventfd_signal(ctx->cq_ev_fd, 1); |
| } |
| |
| /* Returns true if there are no backlogged entries after the flush */ |
| static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) |
| { |
| struct io_rings *rings = ctx->rings; |
| struct io_uring_cqe *cqe; |
| struct io_kiocb *req; |
| unsigned long flags; |
| LIST_HEAD(list); |
| |
| if (!force) { |
| if (list_empty_careful(&ctx->cq_overflow_list)) |
| return true; |
| if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) == |
| rings->cq_ring_entries)) |
| return false; |
| } |
| |
| spin_lock_irqsave(&ctx->completion_lock, flags); |
| |
| /* if force is set, the ring is going away. always drop after that */ |
| if (force) |
| ctx->cq_overflow_flushed = 1; |
| |
| cqe = NULL; |
| while (!list_empty(&ctx->cq_overflow_list)) { |
| cqe = io_get_cqring(ctx); |
| if (!cqe && !force) |
| break; |
| |
| req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb, |
| list); |
| list_move(&req->list, &list); |
| req->flags &= ~REQ_F_OVERFLOW; |
| if (cqe) { |
| WRITE_ONCE(cqe->user_data, req->user_data); |
| WRITE_ONCE(cqe->res, req->result); |
| WRITE_ONCE(cqe->flags, req->cflags); |
| } else { |
| WRITE_ONCE(ctx->rings->cq_overflow, |
| atomic_inc_return(&ctx->cached_cq_overflow)); |
| } |
| } |
| |
| io_commit_cqring(ctx); |
| if (cqe) { |
| clear_bit(0, &ctx->sq_check_overflow); |
| clear_bit(0, &ctx->cq_check_overflow); |
| } |
| spin_unlock_irqrestore(&ctx->completion_lock, flags); |
| io_cqring_ev_posted(ctx); |
| |
| while (!list_empty(&list)) { |
| req = list_first_entry(&list, struct io_kiocb, list); |
| list_del(&req->list); |
| io_put_req(req); |
| } |
| |
| return cqe != NULL; |
| } |
| |
| static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) |
| { |
| struct io_ring_ctx *ctx = req->ctx; |
| struct io_uring_cqe *cqe; |
| |
| trace_io_uring_complete(ctx, req->user_data, res); |
| |
| /* |
| * If we can't get a cq entry, userspace overflowed the |
| * submission (by quite a lot). Increment the overflow count in |
| * the ring. |
| */ |
| cqe = io_get_cqring(ctx); |
| if (likely(cqe)) { |
| WRITE_ONCE(cqe->user_data, req->user_data); |
| WRITE_ONCE(cqe->res, res); |
| WRITE_ONCE(cqe->flags, cflags); |
| } else if (ctx->cq_overflow_flushed) { |
| WRITE_ONCE(ctx->rings->cq_overflow, |
| atomic_inc_return(&ctx->cached_cq_overflow)); |
| } else { |
| if (list_empty(&ctx->cq_overflow_list)) { |
| set_bit(0, &ctx->sq_check_overflow); |
| set_bit(0, &ctx->cq_check_overflow); |
| } |
| req->flags |= REQ_F_OVERFLOW; |
| refcount_inc(&req->refs); |
| req->result = res; |
| req->cflags = cflags; |
| list_add_tail(&req->list, &ctx->cq_overflow_list); |
| } |
| } |
| |
/* Post a completion for @req with result @res and no cqe flags. */
static void io_cqring_fill_event(struct io_kiocb *req, long res)
{
	const long no_cflags = 0;

	__io_cqring_fill_event(req, res, no_cflags);
}
| |
| static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags) |
| { |
| struct io_ring_ctx *ctx = req->ctx; |
| unsigned long flags; |
| |
| spin_lock_irqsave(&ctx->completion_lock, flags); |
| __io_cqring_fill_event(req, res, cflags); |
| io_commit_cqring(ctx); |
| spin_unlock_irqrestore(&ctx->completion_lock, flags); |
| |
| io_cqring_ev_posted(ctx); |
| } |
| |
/* Locked fill + commit + notify for @req with result @res, no cqe flags. */
static void io_cqring_add_event(struct io_kiocb *req, long res)
{
	const long no_cflags = 0;

	__io_cqring_add_event(req, res, no_cflags);
}
| |
| static inline bool io_is_fallback_req(struct io_kiocb *req) |
| { |
| return req == (struct io_kiocb *) |
| ((unsigned long) req->ctx->fallback_req & ~1UL); |
| } |
| |
| static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx) |
| { |
| struct io_kiocb *req; |
| |
| req = ctx->fallback_req; |
| if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req)) |
| return req; |
| |
| return NULL; |
| } |
| |
| static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, |
| struct io_submit_state *state) |
| { |
| gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; |
| struct io_kiocb *req; |
| |
| if (!state) { |
| req = kmem_cache_alloc(req_cachep, gfp); |
| if (unlikely(!req)) |
| goto fallback; |
| } else if (!state->free_reqs) { |
| size_t sz; |
| int ret; |
| |
| sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs)); |
| ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs); |
| |
| /* |
| * Bulk alloc is all-or-nothing. If we fail to get a batch, |
| * retry single alloc to be on the safe side. |
| */ |
| if (unlikely(ret <= 0)) { |
| state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); |
| if (!state->reqs[0]) |
| goto fallback; |
| ret = 1; |
| } |
| state->free_reqs = ret - 1; |
| req = state->reqs[ret - 1]; |
| } else { |
| state->free_reqs--; |
| req = state->reqs[state->free_reqs]; |
| } |
| |
| return req; |
| fallback: |
| return io_get_fallback_req(ctx); |
| } |
| |
| static inline void io_put_file(struct io_kiocb *req, struct file *file, |
| bool fixed) |
| { |
| if (fixed) |
| percpu_ref_put(req->fixed_file_refs); |
| else |
| fput(file); |
| } |
| |
| static void __io_req_aux_free(struct io_kiocb *req) |
| { |
| if (req->flags & REQ_F_NEED_CLEANUP) |
| io_cleanup_req(req); |
| |
| kfree(req->io); |
| if (req->file) |
| io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); |
| if (req->task) |
| put_task_struct(req->task); |
| |
| io_req_work_drop_env(req); |
| } |
| |
| static void __io_free_req(struct io_kiocb *req) |
| { |
| __io_req_aux_free(req); |
| |
| if (req->flags & REQ_F_INFLIGHT) { |
| struct io_ring_ctx *ctx = req->ctx; |
| unsigned long flags; |
| |
| spin_lock_irqsave(&ctx->inflight_lock, flags); |
| list_del(&req->inflight_entry); |
| if (waitqueue_active(&ctx->inflight_wait)) |
| wake_up(&ctx->inflight_wait); |
| spin_unlock_irqrestore(&ctx->inflight_lock, flags); |
| } |
| |
| percpu_ref_put(&req->ctx->refs); |
| if (likely(!io_is_fallback_req(req))) |
| kmem_cache_free(req_cachep, req); |
| else |
| clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req); |
| } |
| |
| struct req_batch { |
| void *reqs[IO_IOPOLL_BATCH]; |
| int to_free; |
| int need_iter; |
| }; |
| |
| static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) |
| { |
| if (!rb->to_free) |
| return; |
| if (rb->need_iter) { |
| int i, inflight = 0; |
| unsigned long flags; |
| |
| for (i = 0; i < rb->to_free; i++) { |
| struct io_kiocb *req = rb->reqs[i]; |
| |
| if (req->flags & REQ_F_INFLIGHT) |
| inflight++; |
| __io_req_aux_free(req); |
| } |
| if (!inflight) |
| goto do_free; |
| |
| spin_lock_irqsave(&ctx->inflight_lock, flags); |
| for (i = 0; i < rb->to_free; i++) { |
| struct io_kiocb *req = rb->reqs[i]; |
| |
| if (req->flags & REQ_F_INFLIGHT) { |
| list_del(&req->inflight_entry); |
| if (!--inflight) |
| break; |
| } |
| } |
| spin_unlock_irqrestore(&ctx->inflight_lock, flags); |
| |
| if (waitqueue_active(&ctx->inflight_wait)) |
| wake_up(&ctx->inflight_wait); |
| } |
| do_free: |
| kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); |
| percpu_ref_put_many(&ctx->refs, rb->to_free); |
| rb->to_free = rb->need_iter = 0; |
| } |
| |
| static bool io_link_cancel_timeout(struct io_kiocb *req) |
| { |
| struct io_ring_ctx *ctx = req->ctx; |
| int ret; |
| |
| ret = hrtimer_try_to_cancel(&req->io->timeout.timer); |
| if (ret != -1) { |
| io_cqring_fill_event(req, -ECANCELED); |
| io_commit_cqring(ctx); |
| req->flags &= ~REQ_F_LINK_HEAD; |
| io_put_req(req); |
| return true; |
| } |
| |
| return false; |
| } |
| |
| static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) |
| { |
| struct io_ring_ctx *ctx = req->ctx; |
| bool wake_ev = false; |
| |
| /* Already got next link */ |
| if (req->flags & REQ_F_LINK_NEXT) |
| return; |
| |
| /* |
| * The list should never be empty when we are called here. But could |
| * potentially happen if the chain is messed up, check to be on the |
| * safe side. |
| */ |
| while (!list_empty(&req->link_list)) { |
| struct io_kiocb *nxt = list_first_entry(&req->link_list, |
| struct io_kiocb, link_list); |
| |
| if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) && |
| (nxt->flags & REQ_F_TIMEOUT))) { |
| list_del_init(&nxt->link_list); |
| wake_ev |= io_link_cancel_timeout(nxt); |
| req->flags &= ~REQ_F_LINK_TIMEOUT; |
| continue; |
| } |
| |
| list_del_init(&req->link_list); |
| if (!list_empty(&nxt->link_list)) |
| nxt->flags |= REQ_F_LINK_HEAD; |
| *nxtptr = nxt; |
| break; |
| } |
| |
| req->flags |= REQ_F_LINK_NEXT; |
| if (wake_ev) |
| io_cqring_ev_posted(ctx); |
| } |
| |
| /* |
| * Called if REQ_F_LINK_HEAD is set, and we fail the head request |
| */ |
| static void io_fail_links(struct io_kiocb *req) |
| { |
| struct io_ring_ctx *ctx = req->ctx; |
| unsigned long flags; |
| |
| spin_lock_irqsave(&ctx->completion_lock, flags); |
| |
| while (!list_empty(&req->link_list)) { |
| struct io_kiocb *link = list_first_entry(&req->link_list, |
| struct io_kiocb, link_list); |
| |
| list_del_init(&link->link_list); |
| trace_io_uring_fail_link(req, link); |
| |
| if ((req->flags & REQ_F_LINK_TIMEOUT) && |
| link->opcode == IORING_OP_LINK_TIMEOUT) { |
| io_link_cancel_timeout(link); |
| } else { |
| io_cqring_fill_event(link, -ECANCELED); |
| __io_double_put_req(link); |
| } |
| req->flags &= ~REQ_F_LINK_TIMEOUT; |
| } |
| |
| io_commit_cqring(ctx); |
| spin_unlock_irqrestore(&ctx->completion_lock, flags); |
| io_cqring_ev_posted(ctx); |
| } |
| |
| static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) |
| { |
| if (likely(!(req->flags & REQ_F_LINK_HEAD))) |
| return; |
| |
| /* |
| * If LINK is set, we have dependent requests in this chain. If we |
| * didn't fail this request, queue the first one up, moving any other |
| * dependencies to the next request. In case of failure, fail the rest |
| * of the chain. |
| */ |
| if (req->flags & REQ_F_FAIL_LINK) { |
| io_fail_links(req); |
| } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) == |
| REQ_F_LINK_TIMEOUT) { |
| struct io_ring_ctx *ctx = req->ctx; |
| unsigned long flags; |
| |
| /* |
| * If this is a timeout link, we could be racing with the |
| * timeout timer. Grab the completion lock for this case to |
| * protect against that. |
| */ |
| spin_lock_irqsave(&ctx->completion_lock, flags); |
| io_req_link_next(req, nxt); |
| spin_unlock_irqrestore(&ctx->completion_lock, flags); |
| } else { |
| io_req_link_next(req, nxt); |
| } |
| } |
| |
| static void io_free_req(struct io_kiocb *req) |
| { |
| struct io_kiocb *nxt = NULL; |
| |
| io_req_find_next(req, &nxt); |
| __io_free_req(req); |
| |
| if (nxt) |
| io_queue_async_work(nxt); |
| } |
| |
| static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) |
| { |
| struct io_kiocb *link; |
| const struct io_op_def *def = &io_op_defs[nxt->opcode]; |
| |
| if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file) |
| io_wq_hash_work(&nxt->work, file_inode(nxt->file)); |
| |
| *workptr = &nxt->work; |
| link = io_prep_linked_timeout(nxt); |
| if (link) |
| nxt->flags |= REQ_F_QUEUE_TIMEOUT; |
| } |
| |
| /* |
| * Drop reference to request, return next in chain (if there is one) if this |
| * was the last reference to this request. |
| */ |
| __attribute__((nonnull)) |
| static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr) |
| { |
| if (refcount_dec_and_test(&req->refs)) { |
| io_req_find_next(req, nxtptr); |
| __io_free_req(req); |
| } |
| } |
| |
| static void io_put_req(struct io_kiocb *req) |
| { |
| if (refcount_dec_and_test(&req->refs)) |
| io_free_req(req); |
| } |
| |
| static void io_steal_work(struct io_kiocb *req, |
| struct io_wq_work **workptr) |
| { |
| /* |
| * It's in an io-wq worker, so there always should be at least |
| * one reference, which will be dropped in io_put_work() just |
| * after the current handler returns. |
| * |
| * It also means, that if the counter dropped to 1, then there is |
| * no asynchronous users left, so it's safe to steal the next work. |
| */ |
| if (refcount_read(&req->refs) == 1) { |
| struct io_kiocb *nxt = NULL; |
| |
| io_req_find_next(req, &nxt); |
| if (nxt) |
| io_wq_assign_next(workptr, nxt); |
| } |
| } |
| |
| /* |
| * Must only be used if we don't need to care about links, usually from |
| * within the completion handling itself. |
| */ |
| static void __io_double_put_req(struct io_kiocb *req) |
| { |
| /* drop both submit and complete references */ |
| if (refcount_sub_and_test(2, &req->refs)) |
| __io_free_req(req); |
| } |
| |
| static void io_double_put_req(struct io_kiocb *req) |
| { |
| /* drop both submit and complete references */ |
| if (refcount_sub_and_test(2, &req->refs)) |
| io_free_req(req); |
| } |
| |
| static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) |
| { |
| struct io_rings *rings = ctx->rings; |
| |
| if (test_bit(0, &ctx->cq_check_overflow)) { |
| /* |
| * noflush == true is from the waitqueue handler, just ensure |
| * we wake up the task, and the next invocation will flush the |
| * entries. We cannot safely to it from here. |
| */ |
| if (noflush && !list_empty(&ctx->cq_overflow_list)) |
| return -1U; |
| |
| io_cqring_overflow_flush(ctx, false); |
| } |
| |
| /* See comment at the top of this file */ |
| smp_rmb(); |
| return ctx->cached_cq_tail - READ_ONCE(rings->cq.head); |
| } |
| |
| static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) |
| { |
| struct io_rings *rings = ctx->rings; |
| |
| /* make sure SQ entry isn't read before tail */ |
| return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; |
| } |
| |
| static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) |
| { |
| if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req)) |
| return false; |
| |
| if (req->file || req->io) |
| rb->need_iter++; |
| |
| rb->reqs[rb->to_free++] = req; |
| if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) |
| io_free_req_many(req->ctx, rb); |
| return true; |
| } |
| |
| static int io_put_kbuf(struct io_kiocb *req) |
| { |
| struct io_buffer *kbuf; |
| int cflags; |
| |
| kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; |
| cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; |
| cflags |= IORING_CQE_F_BUFFER; |
| req->rw.addr = 0; |
| kfree(kbuf); |
| return cflags; |
| } |
| |
| /* |
| * Find and free completed poll iocbs |
| */ |
| static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, |
| struct list_head *done) |
| { |
| struct req_batch rb; |
| struct io_kiocb *req; |
| |
| rb.to_free = rb.need_iter = 0; |
| while (!list_empty(done)) { |
| int cflags = 0; |
| |
| req = list_first_entry(done, struct io_kiocb, list); |
| list_del(&req->list); |
| |
| if (req->flags & REQ_F_BUFFER_SELECTED) |
| cflags = io_put_kbuf(req); |
| |
| __io_cqring_fill_event(req, req->result, cflags); |
| (*nr_events)++; |
| |
| if (refcount_dec_and_test(&req->refs) && |
| !io_req_multi_free(&rb, req)) |
| io_free_req(req); |
| } |
| |
| io_commit_cqring(ctx); |
| if (ctx->flags & IORING_SETUP_SQPOLL) |
| io_cqring_ev_posted(ctx); |
| io_free_req_many(ctx, &rb); |
| } |
| |
| static void io_iopoll_queue(struct list_head *again) |
| { |
| struct io_kiocb *req; |
| |
| do { |
| req = list_first_entry(again, struct io_kiocb, list); |
| list_del(&req->list); |
| refcount_inc(&req->refs); |
| io_queue_async_work(req); |
| } while (!list_empty(again)); |
| } |
| |
| static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, |
| long min) |
| { |
| struct io_kiocb *req, *tmp; |
| LIST_HEAD(done); |
| LIST_HEAD(again); |
| bool spin; |
| int ret; |
| |
| /* |
| * Only spin for completions if we don't have multiple devices hanging |
| * off our complete list, and we're under the requested amount. |
| */ |
| spin = !ctx->poll_multi_file && *nr_events < min; |
| |
| ret = 0; |
| list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) { |
| struct kiocb *kiocb = &req->rw.kiocb; |
| |
| /* |
| * Move completed and retryable entries to our local lists. |
| * If we find a request that requires polling, break out |
| * and complete those lists first, if we have entries there. |
| */ |
| if (READ_ONCE(req->iopoll_completed)) { |
| list_move_tail(&req->list, &done); |
| continue; |
| } |
| if (!list_empty(&done)) |
| break; |
| |
| if (req->result == -EAGAIN) { |
| list_move_tail(&req->list, &again); |
| continue; |
| } |
| if (!list_empty(&again)) |
| break; |
| |
| ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); |
| if (ret < 0) |
| break; |
| |
| if (ret && spin) |
| spin = false; |
| ret = 0; |
| } |
| |
| if (!list_empty(&done)) |
| io_iopoll_complete(ctx, nr_events, &done); |
| |
| if (!list_empty(&again)) |
| io_iopoll_queue(&again); |
| |
| return ret; |
| } |
| |
| /* |
| * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a |
| * non-spinning poll check - we'll still enter the driver poll loop, but only |
| * as a non-spinning completion check. |
| */ |
| static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, |
| long min) |
| { |
| while (!list_empty(&ctx->poll_list) && !need_resched()) { |
| int ret; |
| |
| ret = io_do_iopoll(ctx, nr_events, min); |
| if (ret < 0) |
| return ret; |
| if (!min || *nr_events >= min) |
| return 0; |
| } |
| |
| return 1; |
| } |
| |
| /* |
| * We can't just wait for polled events to come to us, we have to actively |
| * find and complete them. |
| */ |
| static void io_iopoll_reap_events(struct io_ring_ctx *ctx) |
| { |
| if (!(ctx->flags & IORING_SETUP_IOPOLL)) |
| return; |
| |
| mutex_lock(&ctx->uring_lock); |
| while (!list_empty(&ctx->poll_list)) { |
| unsigned int nr_events = 0; |
| |
| io_iopoll_getevents(ctx, &nr_events, 1); |
| |
| /* |
| * Ensure we allow local-to-the-cpu processing to take place, |
| * in this case we need to ensure that we reap all events. |
| */ |
| cond_resched(); |
| } |
| mutex_unlock(&ctx->uring_lock); |
| } |
| |
| static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, |
| long min) |
| { |
| int iters = 0, ret = 0; |
| |
| /* |
| * We disallow the app entering submit/complete with polling, but we |
| * still need to lock the ring to prevent racing with polled issue |
| * that got punted to a workqueue. |
| */ |
| mutex_lock(&ctx->uring_lock); |
| do { |
| int tmin = 0; |
| |
| /* |
| * Don't enter poll loop if we already have events pending. |
| * If we do, we can potentially be spinning for commands that |
| * already triggered a CQE (eg in error). |
| */ |
| if (io_cqring_events(ctx, false)) |
| break; |
| |
| /* |
| * If a submit got punted to a workqueue, we can have the |
| * application entering polling for a command before it gets |
| * issued. That app will hold the uring_lock for the duration |
| * of the poll right here, so we need to take a breather every |
| * now and then to ensure that the issue has a chance to add |
| * the poll to the issued list. Otherwise we can spin here |
| * forever, while the workqueue is stuck trying to acquire the |
| * very same mutex. |
| */ |
| if (!(++iters & 7)) { |
| mutex_unlock(&ctx->uring_lock); |
| mutex_lock(&ctx->uring_lock); |
| } |
| |
| if (*nr_events < min) |
| tmin = min - *nr_events; |
| |
| ret = io_iopoll_getevents(ctx, nr_events, tmin); |
| if (ret <= 0) |
| break; |
| ret = 0; |
| } while (min && !*nr_events && !need_resched()); |
| |
| mutex_unlock(&ctx->uring_lock); |
| return ret; |
| } |
| |
| static void kiocb_end_write(struct io_kiocb *req) |
| { |
| /* |
| * Tell lockdep we inherited freeze protection from submission |
| * thread. |
| */ |
| if (req->flags & REQ_F_ISREG) { |
| struct inode *inode = file_inode(req->file); |
| |
| __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); |
| } |
| file_end_write(req->file); |
| } |
| |
| static inline void req_set_fail_links(struct io_kiocb *req) |
| { |
| if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) |
| req->flags |= REQ_F_FAIL_LINK; |
| } |
| |
| static void io_complete_rw_common(struct kiocb *kiocb, long res) |
| { |
| struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); |
| int cflags = 0; |
| |
| if (kiocb->ki_flags & IOCB_WRITE) |
| kiocb_end_write(req); |
| |
| if (res != req->result) |
| req_set_fail_links(req); |
| if (req->flags & REQ_F_BUFFER_SELECTED) |
| cflags = io_put_kbuf(req); |
| __io_cqring_add_event(req, res, cflags); |
| } |
| |
| static void io_complete_rw(struct kiocb *kiocb, long res, long res2) |
| { |
| struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); |
| |
| io_complete_rw_common(kiocb, res); |
| io_put_req(req); |
| } |
| |
| static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) |
| { |
| struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); |
| |
| if (kiocb->ki_flags & IOCB_WRITE) |
| kiocb_end_write(req); |
| |
| if (res != req->result) |
| req_set_fail_links(req); |
| req->result = res; |
| if (res != -EAGAIN) |
| WRITE_ONCE(req->iopoll_completed, 1); |
| } |
| |
| /* |
| * After the iocb has been issued, it's safe to be found on the poll list. |
| * Adding the kiocb to the list AFTER submission ensures that we don't |
| * find it from a io_iopoll_getevents() thread before the issuer is done |
| * accessing the kiocb cookie. |
| */ |
| static void io_iopoll_req_issued(struct io_kiocb *req) |
| { |
| struct io_ring_ctx *ctx = req->ctx; |
| |
| /* |
| * Track whether we have multiple files in our lists. This will impact |
| * how we do polling eventually, not spinning if we're on potentially |
| * different devices. |
| */ |
| if (list_empty(&ctx->poll_list)) { |
| ctx->poll_multi_file = false; |
| } else if (!ctx->poll_multi_file) { |
| struct io_kiocb *list_req; |
| |
| list_req = list_first_entry(&ctx->poll_list, struct io_kiocb, |
| list); |
| if (list_req->file != req->file) |
| ctx->poll_multi_file = true; |
| } |
| |
| /* |
| * For fast devices, IO may have already completed. If it has, add |
| * it to the front so we find it first. |
| */ |
| if (READ_ONCE(req->iopoll_completed)) |
| list_add(&req->list, &ctx->poll_list); |
| else |
| list_add_tail(&req->list, &ctx->poll_list); |
| |
| if ((ctx->flags & IORING_SETUP_SQPOLL) && |
| wq_has_sleeper(&ctx->sqo_wait)) |
| wake_up(&ctx->sqo_wait); |
| } |
| |
| static void __io_state_file_put(struct io_submit_state *state) |
| { |
| int diff = state->has_refs - state->used_refs; |
| |
| if (diff) |
| fput_many(state->file, diff); |
| state->file = NULL; |
| } |
| |
| static inline void io_state_file_put(struct io_submit_state *state) |
| { |
| if (state->file) |
| __io_state_file_put(state); |
| } |
| |
| /* |
| * Get as many references to a file as we have IOs left in this submission, |
| * assuming most submissions are for one file, or at least that each file |
| * has more than one submission. |
| */ |
| static struct file *__io_file_get(struct io_submit_state *state, int fd) |
| { |
| if (!state) |
| return fget(fd); |
| |
| if (state->file) { |
| if (state->fd == fd) { |
| state->used_refs++; |
| state->ios_left--; |
| return state->file; |
| } |
| __io_state_file_put(state); |
| } |
| state->file = fget_many(fd, state->ios_left); |
| if (!state->file) |
| return NULL; |
| |
| state->fd = fd; |
| state->has_refs = state->ios_left; |
| state->used_refs = 1; |
| state->ios_left--; |
| return state->file; |
| } |
| |
| /* |
| * If we tracked the file through the SCM inflight mechanism, we could support |
| * any file. For now, just ensure that anything potentially problematic is done |
| * inline. |
| */ |
| static bool io_file_supports_async(struct file *file, int rw) |
| { |
| umode_t mode = file_inode(file)->i_mode; |
| |
| if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode)) |
| return true; |
| if (S_ISREG(mode) && file->f_op != &io_uring_fops) |
| return true; |
| |
| /* any ->read/write should understand O_NONBLOCK */ |
| if (file->f_flags & O_NONBLOCK) |
| return true; |
| |
| if (!(file->f_mode & FMODE_NOWAIT)) |
| return false; |
| |
| if (rw == READ) |
| return file->f_op->read_iter != NULL; |
| |
| return file->f_op->write_iter != NULL; |
| } |
| |
| static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, |
| bool force_nonblock) |
| { |
| struct io_ring_ctx *ctx = req->ctx; |
| struct kiocb *kiocb = &req->rw.kiocb; |
| unsigned ioprio; |
| int ret; |
| |
| if (S_ISREG(file_inode(req->file)->i_mode)) |
| req->flags |= REQ_F_ISREG; |
| |
| kiocb->ki_pos = READ_ONCE(sqe->off); |
| if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) { |
| req->flags |= REQ_F_CUR_POS; |
| kiocb->ki_pos = req->file->f_pos; |
| } |
| kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); |
| kiocb->ki_flags = iocb_flags(kiocb->ki_filp); |
| ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); |
| if (unlikely(ret)) |
| return ret; |
| |
| ioprio = READ_ONCE(sqe->ioprio); |
| if (ioprio) { |
| ret = ioprio_check_cap(ioprio); |
| if (ret) |
| return ret; |
| |
| kiocb->ki_ioprio = ioprio; |
| } else |
| kiocb->ki_ioprio = get_current_ioprio(); |
| |
| /* don't allow async punt if RWF_NOWAIT was requested */ |
| if (kiocb->ki_flags & IOCB_NOWAIT) |
| req->flags |= REQ_F_NOWAIT; |
| |
| if (force_nonblock) |
| kiocb->ki_flags |= IOCB_NOWAIT; |
| |
| if (ctx->flags & IORING_SETUP_IOPOLL) { |
| if (!(kiocb->ki_flags & IOCB_DIRECT) || |
| !kiocb->ki_filp->f_op->iopoll) |
| return -EOPNOTSUPP; |
| |
| kiocb->ki_flags |= IOCB_HIPRI; |
| kiocb->ki_complete = io_complete_rw_iopoll; |
| req->result = 0; |
| req->iopoll_completed = 0; |
| } else { |
| if (kiocb->ki_flags & IOCB_HIPRI) |
| return -EINVAL; |
| kiocb->ki_complete = io_complete_rw; |
| } |
| |
| req->rw.addr = READ_ONCE(sqe->addr); |
| req->rw.len = READ_ONCE(sqe->len); |
| req->buf_index = READ_ONCE(sqe->buf_index); |
| return 0; |
| } |
| |
| static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) |
| { |
| switch (ret) { |
| case -EIOCBQUEUED: |
| break; |
| case -ERESTARTSYS: |
| case -ERESTARTNOINTR: |
| case -ERESTARTNOHAND: |
| case -ERESTART_RESTARTBLOCK: |
| /* |
| * We can't just restart the syscall, since previously |
| * submitted sqes may already be in progress. Just fail this |
| * IO with EINTR. |
| */ |
| ret = -EINTR; |
| /* fall through */ |
| default: |
| kiocb->ki_complete(kiocb, ret, 0); |
| } |
| } |
| |
| static void kiocb_done(struct kiocb *kiocb, ssize_t ret) |
| { |
| struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); |
| |
| if (req->flags & REQ_F_CUR_POS) |
| req->file->f_pos = kiocb->ki_pos; |
| if (ret >= 0 && kiocb->ki_complete == io_complete_rw) |
| io_complete_rw(kiocb, ret, 0); |
| else |
| io_rw_done(kiocb, ret); |
| } |
| |
| static ssize_t io_import_fixed(struct io_kiocb *req, int rw, |
| struct iov_iter *iter) |
| { |
| struct io_ring_ctx *ctx = req->ctx; |
| size_t len = req->rw.len; |
| struct io_mapped_ubuf *imu; |
| u16 index, buf_index; |
| size_t offset; |
| u64 buf_addr; |
| |
| /* attempt to use fixed buffers without having provided iovecs */ |
| if (unlikely(!ctx->user_bufs)) |
| return -EFAULT; |
| |
| buf_index = req->buf_index; |
| if (unlikely(buf_index >= ctx->nr_user_bufs)) |
| return -EFAULT; |
| |
| index = array_index_nospec(buf_index, ctx->nr_user_bufs); |
| imu = &ctx->user_bufs[index]; |
| buf_addr = req->rw.addr; |
| |
| /* overflow */ |
| if (buf_addr + len < buf_addr) |
| return -EFAULT; |
| /* not inside the mapped region */ |
| if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len) |
| return -EFAULT; |
| |
| /* |
| * May not be a start of buffer, set size appropriately |
| * and advance us to the beginning. |
| */ |
| offset = buf_addr - imu->ubuf; |
| iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len); |
| |
| if (offset) { |
| /* |
| * Don't use iov_iter_advance() here, as it's really slow for |
| * using the latter parts of a big fixed buffer - it iterates |
| * over each segment manually. We can cheat a bit here, because |
| * we know that: |
| * |
| * 1) it's a BVEC iter, we set it up |
| * 2) all bvecs are PAGE_SIZE in size, except potentially the |
| * first and last bvec |
| * |
| * So just find our index, and adjust the iterator afterwards. |
| * If the offset is within the first bvec (or the whole first |
| * bvec, just use iov_iter_advance(). This makes it easier |
| * since we can just skip the first segment, which may not |
| * be PAGE_SIZE aligned. |
| */ |
| const struct bio_vec *bvec = imu->bvec; |
| |
| if (offset <= bvec->bv_len) { |
| iov_iter_advance(iter, offset); |
| } else { |
| unsigned long seg_skip; |
| |
| /* skip first vec */ |
| offset -= bvec->bv_len; |
| seg_skip = 1 + (offset >> PAGE_SHIFT); |
| |
| iter->bvec = bvec + seg_skip; |
| iter->nr_segs -= seg_skip; |
| iter->count -= bvec->bv_len + offset; |
| iter->iov_offset = offset & ~PAGE_MASK; |
| } |
| } |
| |
| return len; |
| } |
| |
| static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) |
| { |
| if (needs_lock) |
| mutex_unlock(&ctx->uring_lock); |
| } |
| |
| static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) |
| { |
| /* |
| * "Normal" inline submissions always hold the uring_lock, since we |
| * grab it from the system call. Same is true for the SQPOLL offload. |
| * The only exception is when we've detached the request and issue it |
| * from an async worker thread, grab the lock for that case. |
| */ |
| if (needs_lock) |
| mutex_lock(&ctx->uring_lock); |
| } |
| |
| static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, |
| int bgid, struct io_buffer *kbuf, |
| bool needs_lock) |
| { |
| struct io_buffer *head; |
| |
| if (req->flags & REQ_F_BUFFER_SELECTED) |
| return kbuf; |
| |
| io_ring_submit_lock(req->ctx, needs_lock); |
| |
| lockdep_assert_held(&req->ctx->uring_lock); |
| |
| head = idr_find(&req->ctx->io_buffer_idr, bgid); |
| if (head) { |
| if (!list_empty(&head->list)) { |
| kbuf = list_last_entry(&head->list, struct io_buffer, |
| list); |
| list_del(&kbuf->list); |
| } else { |
| kbuf = head; |
| idr_remove(&req->ctx->io_buffer_idr, bgid); |
| } |
| if (*len > kbuf->len) |
| *len = kbuf->len; |
| } else { |
| kbuf = ERR_PTR(-ENOBUFS); |
| } |
| |
| io_ring_submit_unlock(req->ctx, needs_lock); |
| |
| return kbuf; |
| } |
| |
| static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, |
| bool needs_lock) |
| { |
| struct io_buffer *kbuf; |
| u16 bgid; |
| |
| kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; |
| bgid = req->buf_index; |
| kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock); |
| if (IS_ERR(kbuf)) |
| return kbuf; |
| req->rw.addr = (u64) (unsigned long) kbuf; |
| req->flags |= REQ_F_BUFFER_SELECTED; |
| return u64_to_user_ptr(kbuf->addr); |
| } |
| |
#ifdef CONFIG_COMPAT
/*
 * Compat variant of the single-iovec buffer select: read the length from
 * the user's compat_iovec, select a buffer of at most that size and fill
 * in iov[0] with it.
 *
 * Returns 0 on success or a negative error code.
 */
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
				bool needs_lock)
{
	struct compat_iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
	compat_ssize_t clen;
	void __user *buf;
	ssize_t len;

	if (!access_ok(uiov, sizeof(*uiov)))
		return -EFAULT;
	if (__get_user(clen, &uiov->iov_len))
		return -EFAULT;
	if (clen < 0)
		return -EINVAL;

	len = clen;
	buf = io_rw_buffer_select(req, &len, needs_lock);
	if (IS_ERR(buf))
		return PTR_ERR(buf);

	iov[0].iov_base = buf;
	iov[0].iov_len = (compat_size_t) len;
	return 0;
}
#endif
| |
| static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, |
| bool needs_lock) |
| { |
| struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); |
| void __user *buf; |
| ssize_t len; |
| |
| if (copy_from_user(iov, uiov, sizeof(*uiov))) |
| return -EFAULT; |
| |
| len = iov[0].iov_len; |
| if (len < 0) |
| return -EINVAL; |
| buf = io_rw_buffer_select(req, &len, needs_lock); |
| if (IS_ERR(buf)) |
| return PTR_ERR(buf); |
| iov[0].iov_base = buf; |
| iov[0].iov_len = len; |
| return 0; |
| } |
| |
| static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, |
| bool needs_lock) |
| { |
| if (req->flags & REQ_F_BUFFER_SELECTED) { |
| struct io_buffer *kbuf; |
| |
| kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; |
| iov[0].iov_base = u64_to_user_ptr(kbuf->addr); |
| iov[0].iov_len = kbuf->len; |
| return 0; |
| } |
| if (!req->rw.len) |
| return 0; |
| else if (req->rw.len > 1) |
| return -EINVAL; |
| |
| #ifdef CONFIG_COMPAT |
| if (req->ctx->compat) |
| return io_compat_import(req, iov, needs_lock); |
| #endif |
| |
| return __io_iov_buffer_select(req, iov, needs_lock); |
| } |
| |
| static ssize_t io_import_iovec(int rw, struct io_kiocb *req, |
| struct iovec **iovec, struct iov_iter *iter, |
| bool needs_lock) |
| { |
| void __user *buf = u64_to_user_ptr(req->rw.addr); |
| size_t sqe_len = req->rw.len; |
| ssize_t ret; |
| u8 opcode; |
| |
| opcode = req->opcode; |
| if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { |
| *iovec = NULL; |
| return io_import_fixed(req, rw, iter); |
| } |
| |
| /* buffer index only valid with fixed read/write, or buffer select */ |
| if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)) |
| return -EINVAL; |
| |
| if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { |
| if (req->flags & REQ_F_BUFFER_SELECT) { |
| buf = io_rw_buffer_select(req, &sqe_len, needs_lock); |
| if (IS_ERR(buf)) { |
| *iovec = NULL; |
| return PTR_ERR(buf); |
| } |
| req->rw.len = sqe_len; |
| } |
| |
| ret = import_single_range(rw, buf, sqe_len, *iovec, iter); |
| *iovec = NULL; |
| return ret < 0 ? ret : sqe_len; |
| } |
| |
| if (req->io) { |
| struct io_async_rw *iorw = &req->io->rw; |
| |
| *iovec = iorw->iov; |
| iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size); |
| if (iorw->iov == iorw->fast_iov) |
| *iovec = NULL; |
| return iorw->size; |
| } |
| |
| if (req->flags & REQ_F_BUFFER_SELECT) { |
| ret = io_iov_buffer_select(req, *iovec, needs_lock); |
| if (!ret) { |
| ret = (*iovec)->iov_len; |
| iov_iter_init(iter, rw, *iovec, 1, ret); |
| } |
| *iovec = NULL; |
| return ret; |
| } |
| |
| #ifdef CONFIG_COMPAT |
| if (req->ctx->compat) |
| return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV, |
| iovec, iter); |
| #endif |
| |
| return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); |
| } |
| |
| /* |
| * For files that don't have ->read_iter() and ->write_iter(), handle them |
| * by looping over ->read() or ->write() manually. |
| */ |
| static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, |
| struct iov_iter *iter) |
| { |
| ssize_t ret = 0; |
| |
| /* |
| * Don't support polled IO through this interface, and we can't |
| * support non-blocking either. For the latter, this just causes |
| * the kiocb to be handled from an async context. |
| */ |
| if (kiocb->ki_flags & IOCB_HIPRI) |
| return -EOPNOTSUPP; |
| if (kiocb->ki_flags & IOCB_NOWAIT) |
| return -EAGAIN; |
| |
| while (iov_iter_count(iter)) { |
| struct iovec iovec; |
| ssize_t nr; |
| |
| if (!iov_iter_is_bvec(iter)) { |
| iovec = iov_iter_iovec(iter); |
| } else { |
| /* fixed buffers import bvec */ |
| iovec.iov_base = kmap(iter->bvec->bv_page) |
| + iter->iov_offset; |
| iovec.iov_len = min(iter->count, |
| iter->bvec->bv_len - iter->iov_offset); |
| } |
| |
| if (rw == READ) { |
| nr = file->f_op->read(file, iovec.iov_base, |
| iovec.iov_len, &kiocb->ki_pos); |
| } else { |
| nr = file->f_op->write(file, iovec.iov_base, |
| iovec.iov_len, &kiocb->ki_pos); |
| } |
| |
| if (iov_iter_is_bvec(iter)) |
| kunmap(iter->bvec->bv_page); |
| |
| if (nr < 0) { |
| if (!ret) |
| ret = nr; |
| break; |
| } |
| ret += nr; |
| if (nr != iovec.iov_len) |
| break; |
| iov_iter_advance(iter, nr); |
| } |
| |
| return ret; |
| } |
| |
| static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, |
| struct iovec *iovec, struct iovec *fast_iov, |
| struct iov_iter *iter) |
| { |
| req->io->rw.nr_segs = iter->nr_segs; |
| req->io->rw.size = io_size; |
| req->io->rw.iov = iovec; |
| if (!req->io->rw.iov) { |
| req->io->rw.iov = req->io->rw.fast_iov; |
| if (req->io->rw.iov != fast_iov) |
| memcpy(req->io->rw.iov, fast_iov, |
| sizeof(struct iovec) * iter->nr_segs); |
| } else { |
| req->flags |= REQ_F_NEED_CLEANUP; |
| } |
| } |
| |
| static inline int __io_alloc_async_ctx(struct io_kiocb *req) |
| { |
| req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); |
| return req->io == NULL; |
| } |
| |
| static int io_alloc_async_ctx(struct io_kiocb *req) |
| { |
| if (!io_op_defs[req->opcode].async_ctx) |
| return 0; |
| |
| return __io_alloc_async_ctx(req); |
| } |
| |
| static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, |
| struct iovec *iovec, struct iovec *fast_iov, |
| struct iov_iter *iter) |
| { |
| if (!io_op_defs[req->opcode].async_ctx) |
| return 0; |
| if (!req->io) { |
| if (__io_alloc_async_ctx(req)) |
| return -ENOMEM; |
| |
| io_req_map_rw(req, io_size, iovec, fast_iov, iter); |
| } |
| return 0; |
| } |
| |
| static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, |
| bool force_nonblock) |
| { |
| struct io_async_ctx *io; |
| struct iov_iter iter; |
| ssize_t ret; |
| |
| ret = io_prep_rw(req, sqe, force_nonblock); |
| if (ret) |
| return ret; |
| |
| if (unlikely(!(req->file->f_mode & FMODE_READ))) |
| return -EBADF; |
| |
| /* either don't need iovec imported or already have it */ |
| if (!req->io || req->flags & REQ_F_NEED_CLEANUP) |
| return 0; |
| |
| io = req->io; |
| io->rw.iov = io->rw.fast_iov; |
| req->io = NULL; |
| ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock); |
| req->io = io; |
| if (ret < 0) |
| return ret; |
| |
| io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); |
| return 0; |
| } |
| |
| static int io_read(struct io_kiocb *req, bool force_nonblock) |
| { |
| struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; |
| struct kiocb *kiocb = &req->rw.kiocb; |
| struct iov_iter iter; |
| size_t iov_count; |
| ssize_t io_size, ret; |
| |
| ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock); |
| if (ret < 0) |
| return ret; |
| |
| /* Ensure we clear previously set non-block flag */ |
| if (!force_nonblock) |
| kiocb->ki_flags &= ~IOCB_NOWAIT; |
| |
| req->result = 0; |
| io_size = ret; |
| if (req->flags & REQ_F_LINK_HEAD) |
| req->result = io_size; |
| |
| /* |
| * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so |
| * we know to async punt it even if it was opened O_NONBLOCK |
| */ |
| if (force_nonblock && !io_file_supports_async(req->file, READ)) |
| goto copy_iov; |
| |
| iov_count = iov_iter_count(&iter); |
| ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); |
| if (!ret) { |
| ssize_t ret2; |
| |
| if (req->file->f_op->read_iter) |
| ret2 = call_read_iter(req->file, kiocb, &iter); |
| else |
| ret2 = loop_rw_iter(READ, req->file, kiocb, &iter); |
| |
| /* Catch -EAGAIN return for forced non-blocking submission */ |
| if (!force_nonblock || ret2 != -EAGAIN) { |
| kiocb_done(kiocb, ret2); |
| } else { |
| copy_iov: |
| ret = io_setup_async_rw(req, io_size, iovec, |
| inline_vecs, &iter); |
| if (ret) |
| goto out_free; |
| /* any defer here is final, must blocking retry */ |
| if (!(req->flags & REQ_F_NOWAIT) && |
| !file_can_poll(req->file)) |
| req->flags |= REQ_F_MUST_PUNT; |
| return -EAGAIN; |
| } |
| } |
| out_free: |
| kfree(iovec); |
| req->flags &= ~REQ_F_NEED_CLEANUP; |
| return ret; |
| } |
| |
| static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, |
| bool force_nonblock) |
| { |
| struct io_async_ctx *io; |
| struct iov_iter iter; |
| ssize_t ret; |
| |
| ret = io_prep_rw(req, sqe, force_nonblock); |
| if (ret) |
| return ret; |
| |
| if (unlikely(!(req->file->f_mode & FMODE_WRITE))) |
| return -EBADF; |
| |
| req->fsize = rlimit(RLIMIT_FSIZE); |
| |
| /* either don't need iovec imported or already have it */ |
| if (!req->io || req->flags & REQ_F_NEED_CLEANUP) |
| return 0; |
| |
| io = req->io; |
| io->rw.iov = io->rw.fast_iov; |
| req->io = NULL; |
| ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock); |
| req->io = io; |
| if (ret < 0) |
| return ret; |
| |
| io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); |
| return 0; |
| } |
| |
| static int io_write(struct io_kiocb *req, bool force_nonblock) |
| { |
| struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; |
| struct kiocb *kiocb = &req->rw.kiocb; |
| struct iov_iter iter; |
| size_t iov_count; |
| ssize_t ret, io_size; |
| |
| ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock); |
| if (ret < 0) |
| return ret; |
| |
| /* Ensure we clear previously set non-block flag */ |
| if (!force_nonblock) |
| req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; |
| |
| req->result = 0; |
| io_size = ret; |
| if (req->flags & REQ_F_LINK_HEAD) |
| req->result = io_size; |
| |
| /* |
| * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so |
| * we know to async punt it even if it was opened O_NONBLOCK |
| */ |
| if (force_nonblock && !io_file_supports_async(req->file, WRITE)) |
| goto copy_iov; |
| |
| /* file path doesn't support NOWAIT for non-direct_IO */ |
| if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && |
| (req->flags & REQ_F_ISREG)) |
| goto copy_iov; |
| |
| iov_count = iov_iter_count(&iter); |
| ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); |
| if (!ret) { |
| ssize_t ret2; |
| |
| /* |
| * Open-code file_start_write here to grab freeze protection, |
| * which will be released by another thread in |
| * io_complete_rw(). Fool lockdep by telling it the lock got |
| * released so that it doesn't complain about the held lock when |
| * we return to userspace. |
| */ |
| if (req->flags & REQ_F_ISREG) { |
| __sb_start_write(file_inode(req->file)->i_sb, |
| SB_FREEZE_WRITE, true); |
| __sb_writers_release(file_inode(req->file)->i_sb, |
| SB_FREEZE_WRITE); |
| } |
| kiocb->ki_flags |= IOCB_WRITE; |
| |
| if (!force_nonblock) |
| current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; |
| |
| if (req->file->f_op->write_iter) |
| ret2 = call_write_iter(req->file, kiocb, &iter); |
| else |
| ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); |
| |
| if (!force_nonblock) |
| current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; |
| |
| /* |
| * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just |
| * retry them without IOCB_NOWAIT. |
| */ |
| if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) |
| ret2 = -EAGAIN; |
| if (!force_nonblock || ret2 != -EAGAIN) { |
| kiocb_done(kiocb, ret2); |
| } else { |
| copy_iov: |
| ret = io_setup_async_rw(req, io_size, iovec, |
| inline_vecs, &iter); |
| if (ret) |
| goto out_free; |
| /* any defer here is final, must blocking retry */ |
| if (!(req->flags & REQ_F_NOWAIT) && |
| !file_can_poll(req->file)) |
| req->flags |= REQ_F_MUST_PUNT; |
| return -EAGAIN; |
| } |
| } |
| out_free: |
| req->flags &= ~REQ_F_NEED_CLEANUP; |
| kfree(iovec); |
| return ret; |
| } |
| |
| static int __io_splice_prep(struct io_kiocb *req, |
| const struct io_uring_sqe *sqe) |
| { |
| struct io_splice* sp = &req->splice; |
| unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; |
| int ret; |
| |
| if (req->flags & REQ_F_NEED_CLEANUP) |
| return 0; |
| if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) |
| return -EINVAL; |
| |
| sp->file_in = NULL; |
| sp->len = READ_ONCE(sqe->len); |
| sp->flags = READ_ONCE(sqe->splice_flags); |
| |
| if (unlikely(sp->flags & ~valid_flags)) |
| return -EINVAL; |
| |
| ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in, |
| (sp->flags & SPLICE_F_FD_IN_FIXED)); |
| if (ret) |
| return ret; |
| req->flags |= REQ_F_NEED_CLEANUP; |
| |
| if (!S_ISREG(file_inode(sp->file_in)->i_mode)) { |
| /* |
| * Splice operation will be punted aync, and here need to |
| * modify io_wq_work.flags, so initialize io_wq_work firstly. |
| */ |
| io_req_init_async(req); |
| req->work.flags |= IO_WQ_WORK_UNBOUND; |
| } |
| |
| return 0; |
| } |
| |
| static int io_tee_prep(struct io_kiocb *req, |
| const struct io_uring_sqe *sqe) |
| { |
| if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off)) |
| return -EINVAL; |
| return __io_splice_prep(req, sqe); |
| } |
| |
| static int io_tee(struct io_kiocb *req, bool force_nonblock) |
| { |
| struct io_splice *sp = &req->splice; |
| struct file *in = sp->file_in; |
| struct file *out = sp->file_out; |
| unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; |
| long ret = 0; |
| |
| if (force_nonblock) |
| return -EAGAIN; |
| if (sp->len) |
| ret = do_tee(in, out, sp->len, flags); |
| |
| io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); |
| req->flags &= ~REQ_F_NEED_CLEANUP; |
| |
| io_cqring_add_event(req, ret); |
| if (ret != sp->len) |
| req_set_fail_links(req); |
| io_put_req(req); |
| return 0; |
| } |
| |
| static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) |
| { |
| struct io_splice* sp = &req->splice; |
| |
| sp->off_in = READ_ONCE(sqe->splice_off_in); |
| sp->off_out = READ_ONCE(sqe->off); |
| return __io_splice_prep(req, sqe); |
| } |
| |
| static int io_splice(struct io_kiocb *req, bool force_nonblock) |
| { |
| struct io_splice *sp = &req->splice; |
| struct file *in = sp->file_in; |
| struct file *out = sp->file_out; |
| unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; |
| loff_t *poff_in, *poff_out; |
| long ret = 0; |
| |
| if (force_nonblock) |
| return -EAGAIN; |
| |
| poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; |
| poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; |
| |
| if (sp->len) |
| ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); |
| |
| io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); |
| req->flags &= ~REQ_F_NEED_CLEANUP; |
| |
| io_cqring_add_event(req, ret); |
| if (ret != sp->len) |
| req_set_fail_links(req); |
| io_put_req(req); |
| return 0; |
| } |
| |
| /* |
| * IORING_OP_NOP just posts a completion event, nothing else. |
| */ |
| static int io_nop(struct io_kiocb *req) |
| { |
| struct io_ring_ctx *ctx = req->ctx; |
| |
| if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) |
| return -EINVAL; |
| |
| io_cqring_add_event(req, 0); |
| io_put_req(req); |
| return 0; |
| } |
| |
| static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) |
| { |
| struct io_ring_ctx *ctx = req->ctx; |
| |
| if (!req->file) |
| return -EBADF; |
| |
| if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) |
| return -EINVAL; |
| if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) |
| return -EINVAL; |
| |
| req->sync.flags = READ_ONCE(sqe->fsync_flags); |
| if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC)) |
| return -EINVAL; |
| |
| req->sync.off = READ_ONCE(sqe->off); |
| req->sync.len = READ_ONCE(sqe->len); |
| return 0; |
| } |
| |
| static int io_fsync(struct io_kiocb *req, bool force_nonblock) |
| { |
| loff_t end = req->sync.off + req->sync.len; |
| int ret; |
| |
| /* fsync always requires a blocking context */ |
| if (force_nonblock) |
| return -EAGAIN; |
| |
| ret = vfs_fsync_range(req->file, req->sync.off, |
| end > 0 ? end : LLONG_MAX, |
| req->sync.flags & IORING_FSYNC_DATASYNC); |
| if (ret < 0) |
| req_set_fail_links(req); |
| io_cqring_add_event(req, ret); |
| io_put_req(req); |
| return 0; |
| } |
| |
| static int io_fallocate_prep(struct io_kiocb *req, |
| const struct io_uring_sqe *sqe) |
| { |
| if (sqe->ioprio || sqe->buf_index || sqe->rw_flags) |
| return -EINVAL; |
| if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) |
| return -EINVAL; |
| |
| req->sync.off = READ_ONCE(sqe->off); |
| req->sync.len = READ_ONCE(sqe->addr); |
| req->sync.mode = READ_ONCE(sqe->len); |
| req->fsize = rlimit(RLIMIT_FSIZE); |
| return 0; |
| } |
| |
| static int io_fallocate(struct io_kiocb *req, bool force_nonblock) |
| { |
| int ret; |
| |
| /* fallocate always requiring blocking context */ |
| if (force_nonblock) |
| return -EAGAIN; |
| |
| current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; |
| ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, |
| req->sync.len); |
| current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; |
| if (ret < 0) |
| req_set_fail_links(req); |
| io_cqring_add_event(req, ret); |
| io_put_req(req); |
| return 0; |
| } |
| |
| static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) |
| { |
| const char __user *fname; |
| int ret; |
| |
| if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) |
| return -EINVAL; |
| if (unlikely(sqe->ioprio || sqe->buf_index)) |
| return -EINVAL; |
| if (unlikely(req->flags & REQ_F_FIXED_FILE)) |
| return -EBADF; |
| |
| /* open.how should be already initialised */ |
| if (!(req->open.how.flags & O_PATH) && force_o_largefile()) |
| req->open.how.flags |= O_LARGEFILE; |
| |
| req->open.dfd = READ_ONCE(sqe->fd); |
| fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); |
| req->open.filename = getname(fname); |
| if (IS_ERR(req->open.filename)) { |
| ret = PTR_ERR(req->open.filename); |
| req->open.filename = NULL; |
| return ret; |
| } |
| req->open.nofile = rlimit(RLIMIT_NOFILE); |
| req->flags |= REQ_F_NEED_CLEANUP; |
| return 0; |
| } |
| |
| static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) |
| { |
| u64 flags, mode; |
| |
| if (req->flags & REQ_F_NEED_CLEANUP) |
| return 0; |
| mode = READ_ONCE(sqe->len); |
| flags = READ_ONCE(sqe->open_flags); |
| req->open.how = build_open_how(flags, mode); |
| return __io_openat_prep(req, sqe); |
| } |
| |
| static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) |
| { |
| struct open_how __user *how; |
| size_t len; |
| int ret; |
| |
| if (req->flags & REQ_F_NEED_CLEANUP) |
| return 0; |
| how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); |
| len = READ_ONCE(sqe->len); |
| if (len < OPEN_HOW_SIZE_VER0) |
| return -EINVAL; |
| |
| ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how, |
| len); |
| if (ret) |
| return ret; |
| |
| return __io_openat_prep(req, sqe); |
| } |
| |
| static int io_openat2(struct io_kiocb *req, bool force_nonblock) |
| { |
| struct open_flags op; |
| struct file *file; |
| int ret; |
| |
| if (force_nonblock) |
| return -EAGAIN; |
| |
| ret = build_open_flags(&req->open.how, &op); |
| if (ret) |
| goto err; |
| |
| ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); |
| if (ret < 0) |
| goto err; |
| |
| file = do_filp_open(req->open.dfd, req->open.filename, &op); |
| if (IS_ERR(file)) { |
| put_unused_fd(ret); |
| ret = PTR_ERR(file); |
| } else { |
| fsnotify_open(file); |
| fd_install(ret, file); |
| } |
| err: |
| putname(req->open.filename); |
| req->flags &= ~REQ_F_NEED_CLEANUP; |
| if (ret < 0) |
| req_set_fail_links(req); |
| io_cqring_add_event(req, ret); |
| io_put_req(req); |
| return 0; |
| } |
| |
| static int io_openat(struct io_kiocb *req, bool force_nonblock) |
| { |
| return io_openat2(req, force_nonblock); |
| } |
| |
| static int io_remove_buffers_prep(struct io_kiocb *req, |
| const struct io_uring_sqe *sqe) |
| { |
| struct io_provide_buf *p = &req->pbuf; |
| u64 tmp; |
| |
| if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off) |
| return -EINVAL; |
| |
| tmp = READ_ONCE(sqe->fd); |
| if (!tmp || tmp > USHRT_MAX) |
| return -EINVAL; |
| |
| memset(p, 0, sizeof(*p)); |
| p->nbufs = tmp; |
| p->bgid = READ_ONCE(sqe->buf_group); |
| return 0; |
| } |
| |
| static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, |
| int bgid, unsigned nbufs) |
| { |
| unsigned i = 0; |
| |
| /* shouldn't happen */ |
| if (!nbufs) |
| return 0; |
| |
| /* the head kbuf is the list itself */ |
| while (!list_empty(&buf->list)) { |
| struct io_buffer *nxt; |
| |
| nxt = list_first_entry(&buf->list, struct io_buffer, list); |
| list_del(&nxt->list); |
|