diff options
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r-- | fs/io_uring.c | 5585 |
1 files changed, 3618 insertions, 1967 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c index 1b2517291b78..ce69bd9b0838 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -44,6 +44,7 @@ #include <linux/errno.h> #include <linux/syscalls.h> #include <linux/compat.h> +#include <net/compat.h> #include <linux/refcount.h> #include <linux/uio.h> #include <linux/bits.h> @@ -54,7 +55,6 @@ #include <linux/fdtable.h> #include <linux/mm.h> #include <linux/mman.h> -#include <linux/mmu_context.h> #include <linux/percpu.h> #include <linux/slab.h> #include <linux/kthread.h> @@ -76,6 +76,9 @@ #include <linux/fadvise.h> #include <linux/eventpoll.h> #include <linux/fs_struct.h> +#include <linux/splice.h> +#include <linux/task_work.h> +#include <linux/pagemap.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -139,7 +142,7 @@ struct io_rings { */ u32 sq_dropped; /* - * Runtime flags + * Runtime SQ flags * * Written by the kernel, shouldn't be modified by the * application. @@ -149,6 +152,13 @@ struct io_rings { */ u32 sq_flags; /* + * Runtime CQ flags + * + * Written by the application, shouldn't be modified by the + * kernel. 
+ */ + u32 cq_flags; + /* * Number of completion events lost because the queue was full; * this should be avoided by the application by making sure * there are not more requests pending than there is space in @@ -183,14 +193,30 @@ struct fixed_file_table { struct file **files; }; +struct fixed_file_ref_node { + struct percpu_ref refs; + struct list_head node; + struct list_head file_list; + struct fixed_file_data *file_data; + struct llist_node llist; +}; + struct fixed_file_data { struct fixed_file_table *table; struct io_ring_ctx *ctx; + struct percpu_ref *cur_refs; struct percpu_ref refs; - struct llist_head put_llist; - struct work_struct ref_work; struct completion done; + struct list_head ref_list; + spinlock_t lock; +}; + +struct io_buffer { + struct list_head list; + __u64 addr; + __s32 len; + __u16 bid; }; struct io_ring_ctx { @@ -201,7 +227,7 @@ struct io_ring_ctx { struct { unsigned int flags; unsigned int compat: 1; - unsigned int account_mem: 1; + unsigned int limit_mem: 1; unsigned int cq_overflow_flushed: 1; unsigned int drain_next: 1; unsigned int eventfd_async: 1; @@ -260,8 +286,8 @@ struct io_ring_ctx { const struct cred *creds; - /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */ - struct completion *completions; + struct completion ref_comp; + struct completion sq_thread_comp; /* if all else fails... */ struct io_kiocb *fallback_req; @@ -270,6 +296,8 @@ struct io_ring_ctx { struct socket *ring_sock; #endif + struct idr io_buffer_idr; + struct idr personality_idr; struct { @@ -290,15 +318,14 @@ struct io_ring_ctx { struct { spinlock_t completion_lock; - struct llist_head poll_llist; /* - * ->poll_list is protected by the ctx->uring_lock for + * ->iopoll_list is protected by the ctx->uring_lock for * io_uring instances that don't use IORING_SETUP_SQPOLL. * For SQPOLL, only the single threaded io_sq_thread() will * manipulate the list, hence no extra locking is needed there. 
*/ - struct list_head poll_list; + struct list_head iopoll_list; struct hlist_head *cancel_hash; unsigned cancel_hash_bits; bool poll_multi_file; @@ -306,6 +333,11 @@ struct io_ring_ctx { spinlock_t inflight_lock; struct list_head inflight_list; } ____cacheline_aligned_in_smp; + + struct delayed_work file_put_work; + struct llist_head file_put_llist; + + struct work_struct exit_work; }; /* @@ -335,7 +367,6 @@ struct io_timeout_data { struct hrtimer timer; struct timespec64 ts; enum hrtimer_mode mode; - u32 seq_offset; }; struct io_accept { @@ -343,6 +374,7 @@ struct io_accept { struct sockaddr __user *addr; int __user *addr_len; int flags; + unsigned long nofile; }; struct io_sync { @@ -362,7 +394,9 @@ struct io_timeout { struct file *file; u64 addr; int flags; - unsigned count; + u32 off; + u32 target_seq; + struct list_head list; }; struct io_rw { @@ -381,22 +415,21 @@ struct io_connect { struct io_sr_msg { struct file *file; union { - struct user_msghdr __user *msg; + struct user_msghdr __user *umsg; void __user *buf; }; int msg_flags; + int bgid; size_t len; + struct io_buffer *kbuf; }; struct io_open { struct file *file; int dfd; - union { - unsigned mask; - }; struct filename *filename; - struct statx __user *buffer; struct open_how how; + unsigned long nofile; }; struct io_files_update { @@ -428,6 +461,39 @@ struct io_epoll { struct epoll_event event; }; +struct io_splice { + struct file *file_out; + struct file *file_in; + loff_t off_out; + loff_t off_in; + u64 len; + unsigned int flags; +}; + +struct io_provide_buf { + struct file *file; + __u64 addr; + __s32 len; + __u32 bgid; + __u16 nbufs; + __u16 bid; +}; + +struct io_statx { + struct file *file; + int dfd; + unsigned int mask; + unsigned int flags; + const char __user *filename; + struct statx __user *buffer; +}; + +struct io_completion { + struct file *file; + struct list_head list; + int cflags; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -442,9 +508,10 @@ struct 
io_async_msghdr { struct io_async_rw { struct iovec fast_iov[UIO_FASTIOV]; - struct iovec *iov; - ssize_t nr_segs; - ssize_t size; + const struct iovec *free_iovec; + struct iov_iter iter; + size_t bytes_done; + struct wait_page_queue wpq; }; struct io_async_ctx { @@ -462,21 +529,25 @@ enum { REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, + REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, - REQ_F_LINK_NEXT_BIT, + REQ_F_LINK_HEAD_BIT, REQ_F_FAIL_LINK_BIT, REQ_F_INFLIGHT_BIT, REQ_F_CUR_POS_BIT, REQ_F_NOWAIT_BIT, - REQ_F_IOPOLL_COMPLETED_BIT, REQ_F_LINK_TIMEOUT_BIT, - REQ_F_TIMEOUT_BIT, REQ_F_ISREG_BIT, - REQ_F_MUST_PUNT_BIT, - REQ_F_TIMEOUT_NOSEQ_BIT, REQ_F_COMP_LOCKED_BIT, REQ_F_NEED_CLEANUP_BIT, - REQ_F_OVERFLOW_BIT, + REQ_F_POLLED_BIT, + REQ_F_BUFFER_SELECTED_BIT, + REQ_F_NO_FILE_TABLE_BIT, + REQ_F_WORK_INITIALIZED_BIT, + REQ_F_TASK_PINNED_BIT, + + /* not a real bit, just to check we're not overflowing the space */ + __REQ_F_LAST_BIT, }; enum { @@ -490,9 +561,11 @@ enum { REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), /* IOSQE_ASYNC */ REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), + /* IOSQE_BUFFER_SELECT */ + REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), - /* already grabbed next link */ - REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT), + /* head of a link */ + REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT), /* fail rest of links */ REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), /* on inflight list */ @@ -501,24 +574,29 @@ enum { REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), /* must not punt to workers */ REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), - /* polled IO has completed */ - REQ_F_IOPOLL_COMPLETED = BIT(REQ_F_IOPOLL_COMPLETED_BIT), /* has linked timeout */ REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), - /* timeout request */ - REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), - /* must be punted even for NONBLOCK */ - REQ_F_MUST_PUNT = 
BIT(REQ_F_MUST_PUNT_BIT), - /* no timeout sequence */ - REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT), /* completion under lock */ REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), /* needs cleanup */ REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), - /* in overflow list */ - REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT), + /* already went through poll handler */ + REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), + /* buffer already selected */ + REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), + /* doesn't need file table for this request */ + REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), + /* io_wq_work is initialized */ + REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), + /* req->task is refcounted */ + REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT), +}; + +struct async_poll { + struct io_poll_iocb poll; + struct io_poll_iocb *double_poll; }; /* @@ -544,37 +622,57 @@ struct io_kiocb { struct io_fadvise fadvise; struct io_madvise madvise; struct io_epoll epoll; + struct io_splice splice; + struct io_provide_buf pbuf; + struct io_statx statx; + /* use only after cleaning per-op data, see io_clean_op() */ + struct io_completion compl; }; struct io_async_ctx *io; - /* - * llist_node is only used for poll deferred completions - */ - struct llist_node llist_node; - bool in_async; - bool needs_fixed_file; u8 opcode; + /* polled IO has completed */ + u8 iopoll_completed; - struct io_ring_ctx *ctx; - union { - struct list_head list; - struct hlist_node hash_node; - }; - struct list_head link_list; - unsigned int flags; - refcount_t refs; - u64 user_data; - u32 result; - u32 sequence; + u16 buf_index; + u32 result; + + struct io_ring_ctx *ctx; + unsigned int flags; + refcount_t refs; + struct task_struct *task; + u64 user_data; - struct list_head inflight_entry; + struct list_head link_list; + + /* + * 1. used with ctx->iopoll_list with reads/writes + * 2. 
to track reqs with ->files (see io_op_def::file_table) + */ + struct list_head inflight_entry; + + struct percpu_ref *fixed_file_refs; + struct callback_head task_work; + /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ + struct hlist_node hash_node; + struct async_poll *apoll; + struct io_wq_work work; +}; - struct io_wq_work work; +struct io_defer_entry { + struct list_head list; + struct io_kiocb *req; + u32 seq; }; -#define IO_PLUG_THRESHOLD 2 #define IO_IOPOLL_BATCH 8 +struct io_comp_state { + unsigned int nr; + struct list_head list; + struct io_ring_ctx *ctx; +}; + struct io_submit_state { struct blk_plug plug; @@ -585,12 +683,16 @@ struct io_submit_state { unsigned int free_reqs; /* + * Batch completion logic + */ + struct io_comp_state comp; + + /* * File reference cache */ struct file *file; unsigned int fd; unsigned int has_refs; - unsigned int used_refs; unsigned int ios_left; }; @@ -601,8 +703,8 @@ struct io_op_def { unsigned needs_mm : 1; /* needs req->file assigned */ unsigned needs_file : 1; - /* needs req->file assigned IFF fd is >= 0 */ - unsigned fd_non_neg : 1; + /* don't fail if file grab fails */ + unsigned needs_file_no_error : 1; /* hash wq insertion if file is a regular file */ unsigned hash_reg_file : 1; /* unbound wq insertion if file is a non-regular file */ @@ -613,6 +715,12 @@ struct io_op_def { unsigned file_table : 1; /* needs ->fs */ unsigned needs_fs : 1; + /* set if opcode supports polled "wait" */ + unsigned pollin : 1; + unsigned pollout : 1; + /* op supports buffer selection */ + unsigned buffer_select : 1; + unsigned needs_fsize : 1; }; static const struct io_op_def io_op_defs[] = { @@ -622,6 +730,8 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, }, [IORING_OP_WRITEV] = { .async_ctx = 1, @@ -629,6 +739,8 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .hash_reg_file = 1, 
.unbound_nonreg_file = 1, + .pollout = 1, + .needs_fsize = 1, }, [IORING_OP_FSYNC] = { .needs_file = 1, @@ -636,11 +748,14 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_READ_FIXED] = { .needs_file = 1, .unbound_nonreg_file = 1, + .pollin = 1, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, + .needs_fsize = 1, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -656,6 +771,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .needs_fs = 1, + .pollout = 1, }, [IORING_OP_RECVMSG] = { .async_ctx = 1, @@ -663,6 +779,8 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .needs_fs = 1, + .pollin = 1, + .buffer_select = 1, }, [IORING_OP_TIMEOUT] = { .async_ctx = 1, @@ -674,6 +792,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .file_table = 1, + .pollin = 1, }, [IORING_OP_ASYNC_CANCEL] = {}, [IORING_OP_LINK_TIMEOUT] = { @@ -685,18 +804,19 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_FALLOCATE] = { .needs_file = 1, + .needs_fsize = 1, }, [IORING_OP_OPENAT] = { - .needs_file = 1, - .fd_non_neg = 1, .file_table = 1, .needs_fs = 1, }, [IORING_OP_CLOSE] = { .needs_file = 1, + .needs_file_no_error = 1, .file_table = 1, }, [IORING_OP_FILES_UPDATE] = { @@ -705,19 +825,22 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_STATX] = { .needs_mm = 1, - .needs_file = 1, - .fd_non_neg = 1, .needs_fs = 1, + .file_table = 1, }, [IORING_OP_READ] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, }, [IORING_OP_WRITE] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, + .needs_fsize = 1, }, [IORING_OP_FADVISE] = { .needs_file = 1, @@ -729,15 +852,16 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, 
.needs_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_RECV] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, }, [IORING_OP_OPENAT2] = { - .needs_file = 1, - .fd_non_neg = 1, .file_table = 1, .needs_fs = 1, }, @@ -745,20 +869,52 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .file_table = 1, }, + [IORING_OP_SPLICE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_PROVIDE_BUFFERS] = {}, + [IORING_OP_REMOVE_BUFFERS] = {}, + [IORING_OP_TEE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + }, +}; + +enum io_mem_account { + ACCT_LOCKED, + ACCT_PINNED, }; -static void io_wq_submit_work(struct io_wq_work **workptr); +static void __io_complete_rw(struct io_kiocb *req, long res, long res2, + struct io_comp_state *cs); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); +static void io_double_put_req(struct io_kiocb *req); static void __io_double_put_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); +static void __io_queue_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, unsigned nr_args); -static int io_grab_files(struct io_kiocb *req); -static void io_ring_file_ref_flush(struct fixed_file_data *data); -static void io_cleanup_req(struct io_kiocb *req); +static int io_prep_work_files(struct io_kiocb *req); +static void __io_clean_op(struct io_kiocb *req); +static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, + int fd, struct file **out_file, bool fixed); +static void __io_queue_sqe(struct io_kiocb *req, + const struct io_uring_sqe *sqe, + struct io_comp_state *cs); +static void io_file_put_work(struct work_struct *work); + +static ssize_t 
io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct iov_iter *iter, + bool needs_lock); +static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, + const struct iovec *fast_iov, + struct iov_iter *iter, bool force); static struct kmem_cache *req_cachep; @@ -777,11 +933,92 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); +static void io_get_req_task(struct io_kiocb *req) +{ + if (req->flags & REQ_F_TASK_PINNED) + return; + get_task_struct(req->task); + req->flags |= REQ_F_TASK_PINNED; +} + +static inline void io_clean_op(struct io_kiocb *req) +{ + if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | + REQ_F_INFLIGHT)) + __io_clean_op(req); +} + +/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */ +static void __io_put_req_task(struct io_kiocb *req) +{ + if (req->flags & REQ_F_TASK_PINNED) + put_task_struct(req->task); +} + +static void io_sq_thread_drop_mm(void) +{ + struct mm_struct *mm = current->mm; + + if (mm) { + kthread_unuse_mm(mm); + mmput(mm); + } +} + +static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) +{ + if (!current->mm) { + if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) || + !mmget_not_zero(ctx->sqo_mm))) + return -EFAULT; + kthread_use_mm(ctx->sqo_mm); + } + + return 0; +} + +static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if (!io_op_defs[req->opcode].needs_mm) + return 0; + return __io_sq_thread_acquire_mm(ctx); +} + +static inline void req_set_fail_links(struct io_kiocb *req) +{ + if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; +} + +/* + * Note: must call io_req_init_async() for the first time you + * touch any members of io_wq_work. 
+ */ +static inline void io_req_init_async(struct io_kiocb *req) +{ + if (req->flags & REQ_F_WORK_INITIALIZED) + return; + + memset(&req->work, 0, sizeof(req->work)); + req->flags |= REQ_F_WORK_INITIALIZED; +} + +static inline bool io_async_submit(struct io_ring_ctx *ctx) +{ + return ctx->flags & IORING_SETUP_SQPOLL; +} + static void io_ring_ctx_ref_free(struct percpu_ref *ref) { struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); - complete(&ctx->completions[0]); + complete(&ctx->ref_comp); +} + +static inline bool io_is_timeout_noseq(struct io_kiocb *req) +{ + return !req->timeout.off; } static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) @@ -797,10 +1034,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) if (!ctx->fallback_req) goto err; - ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL); - if (!ctx->completions) - goto err; - /* * Use 5 bits less than the max cq entries, that should give us around * 32 entries per hash list if totally full and uniformly spread. 
@@ -821,75 +1054,43 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) goto err; ctx->flags = p->flags; + init_waitqueue_head(&ctx->sqo_wait); init_waitqueue_head(&ctx->cq_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); - init_completion(&ctx->completions[0]); - init_completion(&ctx->completions[1]); + init_completion(&ctx->ref_comp); + init_completion(&ctx->sq_thread_comp); + idr_init(&ctx->io_buffer_idr); idr_init(&ctx->personality_idr); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); - init_llist_head(&ctx->poll_llist); - INIT_LIST_HEAD(&ctx->poll_list); + INIT_LIST_HEAD(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); init_waitqueue_head(&ctx->inflight_wait); spin_lock_init(&ctx->inflight_lock); INIT_LIST_HEAD(&ctx->inflight_list); + INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work); + init_llist_head(&ctx->file_put_llist); return ctx; err: if (ctx->fallback_req) kmem_cache_free(req_cachep, ctx->fallback_req); - kfree(ctx->completions); kfree(ctx->cancel_hash); kfree(ctx); return NULL; } -static inline bool __req_need_defer(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - - return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped - + atomic_read(&ctx->cached_cq_overflow); -} - -static inline bool req_need_defer(struct io_kiocb *req) -{ - if (unlikely(req->flags & REQ_F_IO_DRAIN)) - return __req_need_defer(req); - - return false; -} - -static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) -{ - struct io_kiocb *req; - - req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list); - if (req && !req_need_defer(req)) { - list_del_init(&req->list); - return req; - } - - return NULL; -} - -static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx) +static bool req_need_defer(struct io_kiocb *req, u32 seq) { - struct io_kiocb *req; + if (unlikely(req->flags & REQ_F_IO_DRAIN)) { + 
struct io_ring_ctx *ctx = req->ctx; - req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list); - if (req) { - if (req->flags & REQ_F_TIMEOUT_NOSEQ) - return NULL; - if (!__req_need_defer(req)) { - list_del_init(&req->list); - return req; - } + return seq != ctx->cached_cq_tail + + atomic_read(&ctx->cached_cq_overflow); } - return NULL; + return false; } static void __io_commit_cqring(struct io_ring_ctx *ctx) @@ -905,31 +1106,17 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static inline void io_req_work_grab_env(struct io_kiocb *req, - const struct io_op_def *def) +/* + * Returns true if we need to defer file table putting. This can only happen + * from the error path with REQ_F_COMP_LOCKED set. + */ +static bool io_req_clean_work(struct io_kiocb *req) { - if (!req->work.mm && def->needs_mm) { - mmgrab(current->mm); - req->work.mm = current->mm; - } - if (!req->work.creds) - req->work.creds = get_current_cred(); - if (!req->work.fs && def->needs_fs) { - spin_lock(&current->fs->lock); - if (!current->fs->in_exec) { - req->work.fs = current->fs; - req->work.fs->users++; - } else { - req->work.flags |= IO_WQ_WORK_CANCEL; - } - spin_unlock(&current->fs->lock); - } - if (!req->work.task_pid) - req->work.task_pid = task_pid_vnr(current); -} + if (!(req->flags & REQ_F_WORK_INITIALIZED)) + return false; + + req->flags &= ~REQ_F_WORK_INITIALIZED; -static inline void io_req_work_drop_env(struct io_kiocb *req) -{ if (req->work.mm) { mmdrop(req->work.mm); req->work.mm = NULL; @@ -941,51 +1128,84 @@ static inline void io_req_work_drop_env(struct io_kiocb *req) if (req->work.fs) { struct fs_struct *fs = req->work.fs; + if (req->flags & REQ_F_COMP_LOCKED) + return true; + spin_lock(&req->work.fs->lock); if (--fs->users) fs = NULL; spin_unlock(&req->work.fs->lock); if (fs) free_fs_struct(fs); + req->work.fs = NULL; } + + return false; } -static inline bool io_prep_async_work(struct io_kiocb *req, - struct io_kiocb **link) +static void 
io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; - bool do_hashed = false; + + io_req_init_async(req); if (req->flags & REQ_F_ISREG) { - if (def->hash_reg_file) - do_hashed = true; + if (def->hash_reg_file || (req->ctx->flags & IORING_SETUP_IOPOLL)) + io_wq_hash_work(&req->work, file_inode(req->file)); } else { if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } + if (!req->work.mm && def->needs_mm) { + mmgrab(current->mm); + req->work.mm = current->mm; + } + if (!req->work.creds) + req->work.creds = get_current_cred(); + if (!req->work.fs && def->needs_fs) { + spin_lock(&current->fs->lock); + if (!current->fs->in_exec) { + req->work.fs = current->fs; + req->work.fs->users++; + } else { + req->work.flags |= IO_WQ_WORK_CANCEL; + } + spin_unlock(&current->fs->lock); + } + if (def->needs_fsize) + req->work.fsize = rlimit(RLIMIT_FSIZE); + else + req->work.fsize = RLIM_INFINITY; +} - io_req_work_grab_env(req, def); +static void io_prep_async_link(struct io_kiocb *req) +{ + struct io_kiocb *cur; - *link = io_prep_linked_timeout(req); - return do_hashed; + io_prep_async_work(req); + if (req->flags & REQ_F_LINK_HEAD) + list_for_each_entry(cur, &req->link_list, link_list) + io_prep_async_work(cur); } -static inline void io_queue_async_work(struct io_kiocb *req) +static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *link; - bool do_hashed; + struct io_kiocb *link = io_prep_linked_timeout(req); - do_hashed = io_prep_async_work(req, &link); + trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, + &req->work, req->flags); + io_wq_enqueue(ctx->io_wq, &req->work); + return link; +} - trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work, - req->flags); - if (!do_hashed) { - io_wq_enqueue(ctx->io_wq, &req->work); - } else { - io_wq_enqueue_hashed(ctx->io_wq, &req->work, - file_inode(req->file)); - } +static void 
io_queue_async_work(struct io_kiocb *req) +{ + struct io_kiocb *link; + + /* init ->work of the whole link before punting */ + io_prep_async_link(req); + link = __io_queue_async_work(req); if (link) io_queue_linked_timeout(link); @@ -997,8 +1217,9 @@ static void io_kill_timeout(struct io_kiocb *req) ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { - atomic_inc(&req->ctx->cq_timeouts); - list_del_init(&req->list); + atomic_set(&req->ctx->cq_timeouts, + atomic_read(&req->ctx->cq_timeouts) + 1); + list_del_init(&req->timeout.list); req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, 0); io_put_req(req); @@ -1010,22 +1231,57 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx) struct io_kiocb *req, *tmp; spin_lock_irq(&ctx->completion_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list) + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) io_kill_timeout(req); spin_unlock_irq(&ctx->completion_lock); } -static void io_commit_cqring(struct io_ring_ctx *ctx) +static void __io_queue_deferred(struct io_ring_ctx *ctx) { - struct io_kiocb *req; + do { + struct io_defer_entry *de = list_first_entry(&ctx->defer_list, + struct io_defer_entry, list); + struct io_kiocb *link; + + if (req_need_defer(de->req, de->seq)) + break; + list_del_init(&de->list); + /* punt-init is done before queueing for defer */ + link = __io_queue_async_work(de->req); + if (link) { + __io_queue_linked_timeout(link); + /* drop submission reference */ + link->flags |= REQ_F_COMP_LOCKED; + io_put_req(link); + } + kfree(de); + } while (!list_empty(&ctx->defer_list)); +} + +static void io_flush_timeouts(struct io_ring_ctx *ctx) +{ + while (!list_empty(&ctx->timeout_list)) { + struct io_kiocb *req = list_first_entry(&ctx->timeout_list, + struct io_kiocb, timeout.list); + + if (io_is_timeout_noseq(req)) + break; + if (req->timeout.target_seq != ctx->cached_cq_tail + - atomic_read(&ctx->cq_timeouts)) + break; - while ((req = 
io_get_timeout_req(ctx)) != NULL) + list_del_init(&req->timeout.list); io_kill_timeout(req); + } +} +static void io_commit_cqring(struct io_ring_ctx *ctx) +{ + io_flush_timeouts(ctx); __io_commit_cqring(ctx); - while ((req = io_get_deferred_req(ctx)) != NULL) - io_queue_async_work(req); + if (unlikely(!list_empty(&ctx->defer_list))) + __io_queue_deferred(ctx); } static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) @@ -1050,24 +1306,30 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) { if (!ctx->cq_ev_fd) return false; + if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) + return false; if (!ctx->eventfd_async) return true; - return io_wq_current_is_worker() || in_interrupt(); + return io_wq_current_is_worker(); } -static void __io_cqring_ev_posted(struct io_ring_ctx *ctx, bool trigger_ev) +static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); if (waitqueue_active(&ctx->sqo_wait)) wake_up(&ctx->sqo_wait); - if (trigger_ev) + if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); } -static void io_cqring_ev_posted(struct io_ring_ctx *ctx) +static void io_cqring_mark_overflow(struct io_ring_ctx *ctx) { - __io_cqring_ev_posted(ctx, io_should_trigger_evfd(ctx)); + if (list_empty(&ctx->cq_overflow_list)) { + clear_bit(0, &ctx->sq_check_overflow); + clear_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; + } } /* Returns true if there are no backlogged entries after the flush */ @@ -1100,13 +1362,12 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) break; req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb, - list); - list_move(&req->list, &list); - req->flags &= ~REQ_F_OVERFLOW; + compl.list); + list_move(&req->compl.list, &list); if (cqe) { WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, req->result); - WRITE_ONCE(cqe->flags, 0); + WRITE_ONCE(cqe->flags, 
req->compl.cflags); } else { WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); @@ -1114,23 +1375,21 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) } io_commit_cqring(ctx); - if (cqe) { - clear_bit(0, &ctx->sq_check_overflow); - clear_bit(0, &ctx->cq_check_overflow); - } + io_cqring_mark_overflow(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); while (!list_empty(&list)) { - req = list_first_entry(&list, struct io_kiocb, list); - list_del(&req->list); + req = list_first_entry(&list, struct io_kiocb, compl.list); + list_del(&req->compl.list); io_put_req(req); } return cqe != NULL; } -static void io_cqring_fill_event(struct io_kiocb *req, long res) +static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) { struct io_ring_ctx *ctx = req->ctx; struct io_uring_cqe *cqe; @@ -1146,7 +1405,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) if (likely(cqe)) { WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, res); - WRITE_ONCE(cqe->flags, 0); + WRITE_ONCE(cqe->flags, cflags); } else if (ctx->cq_overflow_flushed) { WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); @@ -1154,27 +1413,82 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) if (list_empty(&ctx->cq_overflow_list)) { set_bit(0, &ctx->sq_check_overflow); set_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; } - req->flags |= REQ_F_OVERFLOW; - refcount_inc(&req->refs); + io_clean_op(req); req->result = res; - list_add_tail(&req->list, &ctx->cq_overflow_list); + req->compl.cflags = cflags; + refcount_inc(&req->refs); + list_add_tail(&req->compl.list, &ctx->cq_overflow_list); } } -static void io_cqring_add_event(struct io_kiocb *req, long res) +static void io_cqring_fill_event(struct io_kiocb *req, long res) +{ + __io_cqring_fill_event(req, res, 0); +} + +static void 
io_cqring_add_event(struct io_kiocb *req, long res, long cflags) { struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); - io_cqring_fill_event(req, res); + __io_cqring_fill_event(req, res, cflags); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); } +static void io_submit_flush_completions(struct io_comp_state *cs) +{ + struct io_ring_ctx *ctx = cs->ctx; + + spin_lock_irq(&ctx->completion_lock); + while (!list_empty(&cs->list)) { + struct io_kiocb *req; + + req = list_first_entry(&cs->list, struct io_kiocb, compl.list); + list_del(&req->compl.list); + __io_cqring_fill_event(req, req->result, req->compl.cflags); + if (!(req->flags & REQ_F_LINK_HEAD)) { + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); + } else { + spin_unlock_irq(&ctx->completion_lock); + io_put_req(req); + spin_lock_irq(&ctx->completion_lock); + } + } + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + cs->nr = 0; +} + +static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags, + struct io_comp_state *cs) +{ + if (!cs) { + io_cqring_add_event(req, res, cflags); + io_put_req(req); + } else { + io_clean_op(req); + req->result = res; + req->compl.cflags = cflags; + list_add_tail(&req->compl.list, &cs->list); + if (++cs->nr >= 32) + io_submit_flush_completions(cs); + } +} + +static void io_req_complete(struct io_kiocb *req, long res) +{ + __io_req_complete(req, res, 0, NULL); +} + static inline bool io_is_fallback_req(struct io_kiocb *req) { return req == (struct io_kiocb *) @@ -1186,23 +1500,19 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx) struct io_kiocb *req; req = ctx->fallback_req; - if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req)) + if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req)) return req; return NULL; } -static struct io_kiocb *io_get_req(struct 
io_ring_ctx *ctx, - struct io_submit_state *state) +static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, + struct io_submit_state *state) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct io_kiocb *req; - if (!state) { - req = kmem_cache_alloc(req_cachep, gfp); - if (unlikely(!req)) - goto fallback; - } else if (!state->free_reqs) { + if (!state->free_reqs) { size_t sz; int ret; @@ -1226,121 +1536,75 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, req = state->reqs[state->free_reqs]; } -got_it: - req->io = NULL; - req->file = NULL; - req->ctx = ctx; - req->flags = 0; - /* one is dropped after submission, the other at completion */ - refcount_set(&req->refs, 2); - req->result = 0; - INIT_IO_WORK(&req->work, io_wq_submit_work); return req; fallback: - req = io_get_fallback_req(ctx); - if (req) - goto got_it; - percpu_ref_put(&ctx->refs); - return NULL; + return io_get_fallback_req(ctx); } -static void __io_req_do_free(struct io_kiocb *req) +static inline void io_put_file(struct io_kiocb *req, struct file *file, + bool fixed) { - if (likely(!io_is_fallback_req(req))) - kmem_cache_free(req_cachep, req); + if (fixed) + percpu_ref_put(req->fixed_file_refs); else - clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req); + fput(file); } -static void __io_req_aux_free(struct io_kiocb *req) +static bool io_dismantle_req(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - - if (req->flags & REQ_F_NEED_CLEANUP) - io_cleanup_req(req); + io_clean_op(req); - kfree(req->io); - if (req->file) { - if (req->flags & REQ_F_FIXED_FILE) - percpu_ref_put(&ctx->file_data->refs); - else - fput(req->file); - } + if (req->io) + kfree(req->io); + if (req->file) + io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - io_req_work_drop_env(req); + return io_req_clean_work(req); } -static void __io_free_req(struct io_kiocb *req) +static void __io_free_req_finish(struct io_kiocb *req) { - __io_req_aux_free(req); - - if (req->flags & REQ_F_INFLIGHT) 
{ - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->inflight_lock, flags); - list_del(&req->inflight_entry); - if (waitqueue_active(&ctx->inflight_wait)) - wake_up(&ctx->inflight_wait); - spin_unlock_irqrestore(&ctx->inflight_lock, flags); - } + struct io_ring_ctx *ctx = req->ctx; - percpu_ref_put(&req->ctx->refs); - __io_req_do_free(req); + __io_put_req_task(req); + if (likely(!io_is_fallback_req(req))) + kmem_cache_free(req_cachep, req); + else + clear_bit_unlock(0, (unsigned long *) &ctx->fallback_req); + percpu_ref_put(&ctx->refs); } -struct req_batch { - void *reqs[IO_IOPOLL_BATCH]; - int to_free; - int need_iter; -}; - -static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) +static void io_req_task_file_table_put(struct callback_head *cb) { - int fixed_refs = rb->to_free; - - if (!rb->to_free) - return; - if (rb->need_iter) { - int i, inflight = 0; - unsigned long flags; + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct fs_struct *fs = req->work.fs; - fixed_refs = 0; - for (i = 0; i < rb->to_free; i++) { - struct io_kiocb *req = rb->reqs[i]; + spin_lock(&req->work.fs->lock); + if (--fs->users) + fs = NULL; + spin_unlock(&req->work.fs->lock); + if (fs) + free_fs_struct(fs); + req->work.fs = NULL; + __io_free_req_finish(req); +} - if (req->flags & REQ_F_FIXED_FILE) { - req->file = NULL; - fixed_refs++; - } - if (req->flags & REQ_F_INFLIGHT) - inflight++; - __io_req_aux_free(req); - } - if (!inflight) - goto do_free; +static void __io_free_req(struct io_kiocb *req) +{ + if (!io_dismantle_req(req)) { + __io_free_req_finish(req); + } else { + int ret; - spin_lock_irqsave(&ctx->inflight_lock, flags); - for (i = 0; i < rb->to_free; i++) { - struct io_kiocb *req = rb->reqs[i]; + init_task_work(&req->task_work, io_req_task_file_table_put); + ret = task_work_add(req->task, &req->task_work, TWA_RESUME); + if (unlikely(ret)) { + struct task_struct *tsk; - if (req->flags & 
REQ_F_INFLIGHT) { - list_del(&req->inflight_entry); - if (!--inflight) - break; - } + tsk = io_wq_get_task(req->ctx->io_wq); + task_work_add(tsk, &req->task_work, 0); } - spin_unlock_irqrestore(&ctx->inflight_lock, flags); - - if (waitqueue_active(&ctx->inflight_wait)) - wake_up(&ctx->inflight_wait); } -do_free: - kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); - if (fixed_refs) - percpu_ref_put_many(&ctx->file_data->refs, fixed_refs); - percpu_ref_put_many(&ctx->refs, rb->to_free); - rb->to_free = rb->need_iter = 0; } static bool io_link_cancel_timeout(struct io_kiocb *req) @@ -1352,7 +1616,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) if (ret != -1) { io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(ctx); - req->flags &= ~REQ_F_LINK; + req->flags &= ~REQ_F_LINK_HEAD; io_put_req(req); return true; } @@ -1360,53 +1624,68 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) return false; } -static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) +static bool __io_kill_linked_timeout(struct io_kiocb *req) +{ + struct io_kiocb *link; + bool wake_ev; + + if (list_empty(&req->link_list)) + return false; + link = list_first_entry(&req->link_list, struct io_kiocb, link_list); + if (link->opcode != IORING_OP_LINK_TIMEOUT) + return false; + + list_del_init(&link->link_list); + link->flags |= REQ_F_COMP_LOCKED; + wake_ev = io_link_cancel_timeout(link); + req->flags &= ~REQ_F_LINK_TIMEOUT; + return wake_ev; +} + +static void io_kill_linked_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - bool wake_ev = false; + bool wake_ev; - /* Already got next link */ - if (req->flags & REQ_F_LINK_NEXT) - return; + if (!(req->flags & REQ_F_COMP_LOCKED)) { + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); + wake_ev = __io_kill_linked_timeout(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else { + wake_ev = __io_kill_linked_timeout(req); + } + + if (wake_ev) + 
io_cqring_ev_posted(ctx); +} + +static struct io_kiocb *io_req_link_next(struct io_kiocb *req) +{ + struct io_kiocb *nxt; /* * The list should never be empty when we are called here. But could * potentially happen if the chain is messed up, check to be on the * safe side. */ - while (!list_empty(&req->link_list)) { - struct io_kiocb *nxt = list_first_entry(&req->link_list, - struct io_kiocb, link_list); - - if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) && - (nxt->flags & REQ_F_TIMEOUT))) { - list_del_init(&nxt->link_list); - wake_ev |= io_link_cancel_timeout(nxt); - req->flags &= ~REQ_F_LINK_TIMEOUT; - continue; - } - - list_del_init(&req->link_list); - if (!list_empty(&nxt->link_list)) - nxt->flags |= REQ_F_LINK; - *nxtptr = nxt; - break; - } + if (unlikely(list_empty(&req->link_list))) + return NULL; - req->flags |= REQ_F_LINK_NEXT; - if (wake_ev) - io_cqring_ev_posted(ctx); + nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list); + list_del_init(&req->link_list); + if (!list_empty(&nxt->link_list)) + nxt->flags |= REQ_F_LINK_HEAD; + return nxt; } /* - * Called if REQ_F_LINK is set, and we fail the head request + * Called if REQ_F_LINK_HEAD is set, and we fail the head request */ -static void io_fail_links(struct io_kiocb *req) +static void __io_fail_links(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&req->link_list)) { struct io_kiocb *link = list_first_entry(&req->link_list, @@ -1415,25 +1694,38 @@ static void io_fail_links(struct io_kiocb *req) list_del_init(&link->link_list); trace_io_uring_fail_link(req, link); - if ((req->flags & REQ_F_LINK_TIMEOUT) && - link->opcode == IORING_OP_LINK_TIMEOUT) { - io_link_cancel_timeout(link); - } else { - io_cqring_fill_event(link, -ECANCELED); - __io_double_put_req(link); - } + io_cqring_fill_event(link, -ECANCELED); + link->flags |= REQ_F_COMP_LOCKED; + __io_double_put_req(link); req->flags &= 
~REQ_F_LINK_TIMEOUT; } io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); } -static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) +static void io_fail_links(struct io_kiocb *req) { - if (likely(!(req->flags & REQ_F_LINK))) - return; + struct io_ring_ctx *ctx = req->ctx; + + if (!(req->flags & REQ_F_COMP_LOCKED)) { + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); + __io_fail_links(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else { + __io_fail_links(req); + } + + io_cqring_ev_posted(ctx); +} + +static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) +{ + req->flags &= ~REQ_F_LINK_HEAD; + if (req->flags & REQ_F_LINK_TIMEOUT) + io_kill_linked_timeout(req); /* * If LINK is set, we have dependent requests in this chain. If we @@ -1441,48 +1733,191 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (req->flags & REQ_F_FAIL_LINK) { - io_fail_links(req); - } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) == - REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; + if (likely(!(req->flags & REQ_F_FAIL_LINK))) + return io_req_link_next(req); + io_fail_links(req); + return NULL; +} - /* - * If this is a timeout link, we could be racing with the - * timeout timer. Grab the completion lock for this case to - * protect against that. 
- */ - spin_lock_irqsave(&ctx->completion_lock, flags); - io_req_link_next(req, nxt); - spin_unlock_irqrestore(&ctx->completion_lock, flags); +static struct io_kiocb *io_req_find_next(struct io_kiocb *req) +{ + if (likely(!(req->flags & REQ_F_LINK_HEAD))) + return NULL; + return __io_req_find_next(req); +} + +static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb, + bool twa_signal_ok) +{ + struct task_struct *tsk = req->task; + struct io_ring_ctx *ctx = req->ctx; + int ret, notify; + + /* + * SQPOLL kernel thread doesn't need notification, just a wakeup. For + * all other cases, use TWA_SIGNAL unconditionally to ensure we're + * processing task_work. There's no reliable way to tell if TWA_RESUME + * will do the job. + */ + notify = 0; + if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok) + notify = TWA_SIGNAL; + + ret = task_work_add(tsk, cb, notify); + if (!ret) + wake_up_process(tsk); + + return ret; +} + +static void __io_req_task_cancel(struct io_kiocb *req, int error) +{ + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->completion_lock); + io_cqring_fill_event(req, error); + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + req_set_fail_links(req); + io_double_put_req(req); +} + +static void io_req_task_cancel(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + + __io_req_task_cancel(req, -ECANCELED); +} + +static void __io_req_task_submit(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (!__io_sq_thread_acquire_mm(ctx)) { + mutex_lock(&ctx->uring_lock); + __io_queue_sqe(req, NULL, NULL); + mutex_unlock(&ctx->uring_lock); } else { - io_req_link_next(req, nxt); + __io_req_task_cancel(req, -EFAULT); } } -static void io_free_req(struct io_kiocb *req) +static void io_req_task_submit(struct callback_head *cb) { - struct io_kiocb *nxt = NULL; + struct io_kiocb *req = container_of(cb, struct io_kiocb, 
task_work); + struct io_ring_ctx *ctx = req->ctx; - io_req_find_next(req, &nxt); - __io_free_req(req); + __io_req_task_submit(req); + percpu_ref_put(&ctx->refs); +} + +static void io_req_task_queue(struct io_kiocb *req) +{ + int ret; + + init_task_work(&req->task_work, io_req_task_submit); + percpu_ref_get(&req->ctx->refs); + + ret = io_req_task_work_add(req, &req->task_work, true); + if (unlikely(ret)) { + struct task_struct *tsk; + + init_task_work(&req->task_work, io_req_task_cancel); + tsk = io_wq_get_task(req->ctx->io_wq); + task_work_add(tsk, &req->task_work, 0); + wake_up_process(tsk); + } +} + +static void io_queue_next(struct io_kiocb *req) +{ + struct io_kiocb *nxt = io_req_find_next(req); if (nxt) - io_queue_async_work(nxt); + io_req_task_queue(nxt); +} + +static void io_free_req(struct io_kiocb *req) +{ + io_queue_next(req); + __io_free_req(req); +} + +struct req_batch { + void *reqs[IO_IOPOLL_BATCH]; + int to_free; + + struct task_struct *task; + int task_refs; +}; + +static inline void io_init_req_batch(struct req_batch *rb) +{ + rb->to_free = 0; + rb->task_refs = 0; + rb->task = NULL; +} + +static void __io_req_free_batch_flush(struct io_ring_ctx *ctx, + struct req_batch *rb) +{ + kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); + percpu_ref_put_many(&ctx->refs, rb->to_free); + rb->to_free = 0; +} + +static void io_req_free_batch_finish(struct io_ring_ctx *ctx, + struct req_batch *rb) +{ + if (rb->to_free) + __io_req_free_batch_flush(ctx, rb); + if (rb->task) { + put_task_struct_many(rb->task, rb->task_refs); + rb->task = NULL; + } +} + +static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) +{ + if (unlikely(io_is_fallback_req(req))) { + io_free_req(req); + return; + } + if (req->flags & REQ_F_LINK_HEAD) + io_queue_next(req); + + if (req->flags & REQ_F_TASK_PINNED) { + if (req->task != rb->task) { + if (rb->task) + put_task_struct_many(rb->task, rb->task_refs); + rb->task = req->task; + rb->task_refs = 0; + } + 
rb->task_refs++; + req->flags &= ~REQ_F_TASK_PINNED; + } + + WARN_ON_ONCE(io_dismantle_req(req)); + rb->reqs[rb->to_free++] = req; + if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) + __io_req_free_batch_flush(req->ctx, rb); } /* * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. */ -__attribute__((nonnull)) -static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr) +static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) { + struct io_kiocb *nxt = NULL; + if (refcount_dec_and_test(&req->refs)) { - io_req_find_next(req, nxtptr); + nxt = io_req_find_next(req); __io_free_req(req); } + return nxt; } static void io_put_req(struct io_kiocb *req) @@ -1491,6 +1926,22 @@ static void io_put_req(struct io_kiocb *req) io_free_req(req); } +static struct io_wq_work *io_steal_work(struct io_kiocb *req) +{ + struct io_kiocb *nxt; + + /* + * A ref is owned by io-wq in which context we're. So, if that's the + * last one, it's safe to steal next work. False negatives are Ok, + * it just will be re-punted async in io_put_work() + */ + if (refcount_read(&req->refs) != 1) + return NULL; + + nxt = io_req_find_next(req); + return nxt ? &nxt->work : NULL; +} + /* * Must only be used if we don't need to care about links, usually from * within the completion handling itself. 
@@ -1538,18 +1989,45 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } -static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) +static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf) { - if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req)) - return false; + unsigned int cflags; - if (!(req->flags & REQ_F_FIXED_FILE) || req->io) - rb->need_iter++; + cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; + cflags |= IORING_CQE_F_BUFFER; + req->flags &= ~REQ_F_BUFFER_SELECTED; + kfree(kbuf); + return cflags; +} - rb->reqs[rb->to_free++] = req; - if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) - io_free_req_many(req->ctx, rb); - return true; +static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) +{ + struct io_buffer *kbuf; + + kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; + return io_put_kbuf(req, kbuf); +} + +static inline bool io_run_task_work(void) +{ + if (current->task_works) { + __set_current_state(TASK_RUNNING); + task_work_run(); + return true; + } + + return false; +} + +static void io_iopoll_queue(struct list_head *again) +{ + struct io_kiocb *req; + + do { + req = list_first_entry(again, struct io_kiocb, inflight_entry); + list_del(&req->inflight_entry); + __io_complete_rw(req, -EAGAIN, 0, NULL); + } while (!list_empty(again)); } /* @@ -1560,22 +2038,41 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, { struct req_batch rb; struct io_kiocb *req; + LIST_HEAD(again); + + /* order with ->result store in io_complete_rw_iopoll() */ + smp_rmb(); - rb.to_free = rb.need_iter = 0; + io_init_req_batch(&rb); while (!list_empty(done)) { - req = list_first_entry(done, struct io_kiocb, list); - list_del(&req->list); + int cflags = 0; - io_cqring_fill_event(req, req->result); + req = list_first_entry(done, struct io_kiocb, inflight_entry); + if (READ_ONCE(req->result) == -EAGAIN) 
{ + req->result = 0; + req->iopoll_completed = 0; + list_move_tail(&req->inflight_entry, &again); + continue; + } + list_del(&req->inflight_entry); + + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_rw_kbuf(req); + + __io_cqring_fill_event(req, req->result, cflags); (*nr_events)++; - if (refcount_dec_and_test(&req->refs) && - !io_req_multi_free(&rb, req)) - io_free_req(req); + if (refcount_dec_and_test(&req->refs)) + io_req_free_batch(&rb, req); } io_commit_cqring(ctx); - io_free_req_many(ctx, &rb); + if (ctx->flags & IORING_SETUP_SQPOLL) + io_cqring_ev_posted(ctx); + io_req_free_batch_finish(ctx, &rb); + + if (!list_empty(&again)) + io_iopoll_queue(&again); } static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, @@ -1593,16 +2090,16 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, spin = !ctx->poll_multi_file && *nr_events < min; ret = 0; - list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) { + list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) { struct kiocb *kiocb = &req->rw.kiocb; /* - * Move completed entries to our local list. If we find a - * request that requires polling, break out and complete - * the done list first, if we have entries there. + * Move completed and retryable entries to our local lists. + * If we find a request that requires polling, break out + * and complete those lists first, if we have entries there. 
*/ - if (req->flags & REQ_F_IOPOLL_COMPLETED) { - list_move_tail(&req->list, &done); + if (READ_ONCE(req->iopoll_completed)) { + list_move_tail(&req->inflight_entry, &done); continue; } if (!list_empty(&done)) @@ -1612,6 +2109,10 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, if (ret < 0) break; + /* iopoll may have completed current req */ + if (READ_ONCE(req->iopoll_completed)) + list_move_tail(&req->inflight_entry, &done); + if (ret && spin) spin = false; ret = 0; @@ -1631,13 +2132,13 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, long min) { - while (!list_empty(&ctx->poll_list) && !need_resched()) { + while (!list_empty(&ctx->iopoll_list) && !need_resched()) { int ret; ret = io_do_iopoll(ctx, nr_events, min); if (ret < 0) return ret; - if (!min || *nr_events >= min) + if (*nr_events >= min) return 0; } @@ -1648,29 +2149,37 @@ static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, * We can't just wait for polled events to come to us, we have to actively * find and complete them. */ -static void io_iopoll_reap_events(struct io_ring_ctx *ctx) +static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_IOPOLL)) return; mutex_lock(&ctx->uring_lock); - while (!list_empty(&ctx->poll_list)) { + while (!list_empty(&ctx->iopoll_list)) { unsigned int nr_events = 0; - io_iopoll_getevents(ctx, &nr_events, 1); + io_do_iopoll(ctx, &nr_events, 0); + /* let it sleep and repeat later if can't complete a request */ + if (nr_events == 0) + break; /* * Ensure we allow local-to-the-cpu processing to take place, * in this case we need to ensure that we reap all events. + * Also let task_work, etc. 
to progress by releasing the mutex */ - cond_resched(); + if (need_resched()) { + mutex_unlock(&ctx->uring_lock); + cond_resched(); + mutex_lock(&ctx->uring_lock); + } } mutex_unlock(&ctx->uring_lock); } -static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, - long min) +static int io_iopoll_check(struct io_ring_ctx *ctx, long min) { + unsigned int nr_events = 0; int iters = 0, ret = 0; /* @@ -1680,8 +2189,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, */ mutex_lock(&ctx->uring_lock); do { - int tmin = 0; - /* * Don't enter poll loop if we already have events pending. * If we do, we can potentially be spinning for commands that @@ -1702,17 +2209,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, */ if (!(++iters & 7)) { mutex_unlock(&ctx->uring_lock); + io_run_task_work(); mutex_lock(&ctx->uring_lock); } - if (*nr_events < min) - tmin = min - *nr_events; - - ret = io_iopoll_getevents(ctx, nr_events, tmin); + ret = io_iopoll_getevents(ctx, &nr_events, min); if (ret <= 0) break; ret = 0; - } while (min && !*nr_events && !need_resched()); + } while (min && !nr_events && !need_resched()); mutex_unlock(&ctx->uring_lock); return ret; @@ -1732,41 +2237,98 @@ static void kiocb_end_write(struct io_kiocb *req) file_end_write(req->file); } -static inline void req_set_fail_links(struct io_kiocb *req) -{ - if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; -} - -static void io_complete_rw_common(struct kiocb *kiocb, long res) +static void io_complete_rw_common(struct kiocb *kiocb, long res, + struct io_comp_state *cs) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + int cflags = 0; if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); if (res != req->result) req_set_fail_links(req); - io_cqring_add_event(req, res); + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_rw_kbuf(req); + __io_req_complete(req, res, cflags, 
cs); } -static void io_complete_rw(struct kiocb *kiocb, long res, long res2) +#ifdef CONFIG_BLOCK +static bool io_resubmit_prep(struct io_kiocb *req, int error) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + ssize_t ret = -ECANCELED; + struct iov_iter iter; + int rw; - io_complete_rw_common(kiocb, res); - io_put_req(req); + if (error) { + ret = error; + goto end_req; + } + + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: + rw = READ; + break; + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: + rw = WRITE; + break; + default: + printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n", + req->opcode); + goto end_req; + } + + ret = io_import_iovec(rw, req, &iovec, &iter, false); + if (ret < 0) + goto end_req; + ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false); + if (!ret) + return true; + kfree(iovec); +end_req: + req_set_fail_links(req); + io_req_complete(req, ret); + return false; } +#endif -static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res) +static bool io_rw_reissue(struct io_kiocb *req, long res) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - struct io_kiocb *nxt = NULL; +#ifdef CONFIG_BLOCK + int ret; - io_complete_rw_common(kiocb, res); - io_put_req_find_next(req, &nxt); + if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) + return false; - return nxt; + ret = io_sq_thread_acquire_mm(req->ctx, req); + + if (io_resubmit_prep(req, ret)) { + refcount_inc(&req->refs); + io_queue_async_work(req); + return true; + } + +#endif + return false; +} + +static void __io_complete_rw(struct io_kiocb *req, long res, long res2, + struct io_comp_state *cs) +{ + if (!io_rw_reissue(req, res)) + io_complete_rw_common(&req->rw.kiocb, res, cs); +} + +static void io_complete_rw(struct kiocb *kiocb, long res, long res2) 
+{ + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + + __io_complete_rw(req, res, res2, NULL); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) @@ -1776,11 +2338,13 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - if (res != req->result) + if (res != -EAGAIN && res != req->result) req_set_fail_links(req); - req->result = res; - if (res != -EAGAIN) - req->flags |= REQ_F_IOPOLL_COMPLETED; + + WRITE_ONCE(req->result, res); + /* order with io_poll_complete() checking ->result */ + smp_wmb(); + WRITE_ONCE(req->iopoll_completed, 1); } /* @@ -1798,13 +2362,13 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * how we do polling eventually, not spinning if we're on potentially * different devices. */ - if (list_empty(&ctx->poll_list)) { + if (list_empty(&ctx->iopoll_list)) { ctx->poll_multi_file = false; } else if (!ctx->poll_multi_file) { struct io_kiocb *list_req; - list_req = list_first_entry(&ctx->poll_list, struct io_kiocb, - list); + list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, + inflight_entry); if (list_req->file != req->file) ctx->poll_multi_file = true; } @@ -1813,25 +2377,27 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * For fast devices, IO may have already completed. If it has, add * it to the front so we find it first. 
*/ - if (req->flags & REQ_F_IOPOLL_COMPLETED) - list_add(&req->list, &ctx->poll_list); + if (READ_ONCE(req->iopoll_completed)) + list_add(&req->inflight_entry, &ctx->iopoll_list); else - list_add_tail(&req->list, &ctx->poll_list); + list_add_tail(&req->inflight_entry, &ctx->iopoll_list); if ((ctx->flags & IORING_SETUP_SQPOLL) && wq_has_sleeper(&ctx->sqo_wait)) wake_up(&ctx->sqo_wait); } -static void io_file_put(struct io_submit_state *state) +static void __io_state_file_put(struct io_submit_state *state) { - if (state->file) { - int diff = state->has_refs - state->used_refs; + if (state->has_refs) + fput_many(state->file, state->has_refs); + state->file = NULL; +} - if (diff) - fput_many(state->file, diff); - state->file = NULL; - } +static inline void io_state_file_put(struct io_submit_state *state) +{ + if (state->file) + __io_state_file_put(state); } /* @@ -1839,45 +2405,72 @@ static void io_file_put(struct io_submit_state *state) * assuming most submissions are for one file, or at least that each file * has more than one submission. */ -static struct file *io_file_get(struct io_submit_state *state, int fd) +static struct file *__io_file_get(struct io_submit_state *state, int fd) { if (!state) return fget(fd); if (state->file) { if (state->fd == fd) { - state->used_refs++; + state->has_refs--; state->ios_left--; return state->file; } - io_file_put(state); + __io_state_file_put(state); } state->file = fget_many(fd, state->ios_left); if (!state->file) return NULL; state->fd = fd; - state->has_refs = state->ios_left; - state->used_refs = 1; state->ios_left--; + state->has_refs = state->ios_left; return state->file; } +static bool io_bdev_nowait(struct block_device *bdev) +{ +#ifdef CONFIG_BLOCK + return !bdev || queue_is_mq(bdev_get_queue(bdev)); +#else + return true; +#endif +} + /* * If we tracked the file through the SCM inflight mechanism, we could support * any file. For now, just ensure that anything potentially problematic is done * inline. 
*/ -static bool io_file_supports_async(struct file *file) +static bool io_file_supports_async(struct file *file, int rw) { umode_t mode = file_inode(file)->i_mode; - if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode)) + if (S_ISBLK(mode)) { + if (io_bdev_nowait(file->f_inode->i_bdev)) + return true; + return false; + } + if (S_ISCHR(mode) || S_ISSOCK(mode)) return true; - if (S_ISREG(mode) && file->f_op != &io_uring_fops) + if (S_ISREG(mode)) { + if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) && + file->f_op != &io_uring_fops) + return true; + return false; + } + + /* any ->read/write should understand O_NONBLOCK */ + if (file->f_flags & O_NONBLOCK) return true; - return false; + if (!(file->f_mode & FMODE_NOWAIT)) + return false; + + if (rw == READ) + return file->f_op->read_iter != NULL; + + return file->f_op->write_iter != NULL; } static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, @@ -1913,10 +2506,12 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_ioprio = get_current_ioprio(); /* don't allow async punt if RWF_NOWAIT was requested */ - if ((kiocb->ki_flags & IOCB_NOWAIT) || - (req->file->f_flags & O_NONBLOCK)) + if (kiocb->ki_flags & IOCB_NOWAIT) req->flags |= REQ_F_NOWAIT; + if (kiocb->ki_flags & IOCB_DIRECT) + io_get_req_task(req); + if (force_nonblock) kiocb->ki_flags |= IOCB_NOWAIT; @@ -1927,7 +2522,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; - req->result = 0; + req->iopoll_completed = 0; + io_get_req_task(req); } else { if (kiocb->ki_flags & IOCB_HIPRI) return -EINVAL; @@ -1936,9 +2532,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->rw.addr = READ_ONCE(sqe->addr); req->rw.len = READ_ONCE(sqe->len); - /* we own ->private, reuse it for the buffer index */ - req->rw.kiocb.private = (void *) (unsigned long) - READ_ONCE(sqe->buf_index); + 
req->buf_index = READ_ONCE(sqe->buf_index); return 0; } @@ -1957,21 +2551,29 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) * IO with EINTR. */ ret = -EINTR; - /* fall through */ + fallthrough; default: kiocb->ki_complete(kiocb, ret, 0); } } -static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, - bool in_async) +static void kiocb_done(struct kiocb *kiocb, ssize_t ret, + struct io_comp_state *cs) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + /* add previously done IO, if any */ + if (req->io && req->io->rw.bytes_done > 0) { + if (ret < 0) + ret = req->io->rw.bytes_done; + else + ret += req->io->rw.bytes_done; + } + if (req->flags & REQ_F_CUR_POS) req->file->f_pos = kiocb->ki_pos; - if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw) - *nxt = __io_complete_rw(kiocb, ret); + if (ret >= 0 && kiocb->ki_complete == io_complete_rw) + __io_complete_rw(req, ret, 0, cs); else io_rw_done(kiocb, ret); } @@ -1982,7 +2584,7 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw, struct io_ring_ctx *ctx = req->ctx; size_t len = req->rw.len; struct io_mapped_ubuf *imu; - unsigned index, buf_index; + u16 index, buf_index; size_t offset; u64 buf_addr; @@ -1990,7 +2592,7 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw, if (unlikely(!ctx->user_bufs)) return -EFAULT; - buf_index = (unsigned long) req->rw.kiocb.private; + buf_index = req->buf_index; if (unlikely(buf_index >= ctx->nr_user_bufs)) return -EFAULT; @@ -2050,11 +2652,153 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw, return len; } -static ssize_t io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct iov_iter *iter) +static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) +{ + if (needs_lock) + mutex_unlock(&ctx->uring_lock); +} + +static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) +{ + /* + * "Normal" inline submissions always hold 
the uring_lock, since we + * grab it from the system call. Same is true for the SQPOLL offload. + * The only exception is when we've detached the request and issue it + * from an async worker thread, grab the lock for that case. + */ + if (needs_lock) + mutex_lock(&ctx->uring_lock); +} + +static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, + int bgid, struct io_buffer *kbuf, + bool needs_lock) +{ + struct io_buffer *head; + + if (req->flags & REQ_F_BUFFER_SELECTED) + return kbuf; + + io_ring_submit_lock(req->ctx, needs_lock); + + lockdep_assert_held(&req->ctx->uring_lock); + + head = idr_find(&req->ctx->io_buffer_idr, bgid); + if (head) { + if (!list_empty(&head->list)) { + kbuf = list_last_entry(&head->list, struct io_buffer, + list); + list_del(&kbuf->list); + } else { + kbuf = head; + idr_remove(&req->ctx->io_buffer_idr, bgid); + } + if (*len > kbuf->len) + *len = kbuf->len; + } else { + kbuf = ERR_PTR(-ENOBUFS); + } + + io_ring_submit_unlock(req->ctx, needs_lock); + + return kbuf; +} + +static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, + bool needs_lock) +{ + struct io_buffer *kbuf; + u16 bgid; + + kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; + bgid = req->buf_index; + kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock); + if (IS_ERR(kbuf)) + return kbuf; + req->rw.addr = (u64) (unsigned long) kbuf; + req->flags |= REQ_F_BUFFER_SELECTED; + return u64_to_user_ptr(kbuf->addr); +} + +#ifdef CONFIG_COMPAT +static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, + bool needs_lock) +{ + struct compat_iovec __user *uiov; + compat_ssize_t clen; + void __user *buf; + ssize_t len; + + uiov = u64_to_user_ptr(req->rw.addr); + if (!access_ok(uiov, sizeof(*uiov))) + return -EFAULT; + if (__get_user(clen, &uiov->iov_len)) + return -EFAULT; + if (clen < 0) + return -EINVAL; + + len = clen; + buf = io_rw_buffer_select(req, &len, needs_lock); + if (IS_ERR(buf)) + return PTR_ERR(buf); + 
iov[0].iov_base = buf; + iov[0].iov_len = (compat_size_t) len; + return 0; +} +#endif + +static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, + bool needs_lock) +{ + struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); + void __user *buf; + ssize_t len; + + if (copy_from_user(iov, uiov, sizeof(*uiov))) + return -EFAULT; + + len = iov[0].iov_len; + if (len < 0) + return -EINVAL; + buf = io_rw_buffer_select(req, &len, needs_lock); + if (IS_ERR(buf)) + return PTR_ERR(buf); + iov[0].iov_base = buf; + iov[0].iov_len = len; + return 0; +} + +static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, + bool needs_lock) +{ + if (req->flags & REQ_F_BUFFER_SELECTED) { + struct io_buffer *kbuf; + + kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; + iov[0].iov_base = u64_to_user_ptr(kbuf->addr); + iov[0].iov_len = kbuf->len; + return 0; + } + if (!req->rw.len) + return 0; + else if (req->rw.len > 1) + return -EINVAL; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + return io_compat_import(req, iov, needs_lock); +#endif + + return __io_iov_buffer_select(req, iov, needs_lock); +} + +static ssize_t __io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct iov_iter *iter, + bool needs_lock) { void __user *buf = u64_to_user_ptr(req->rw.addr); size_t sqe_len = req->rw.len; + ssize_t ret; u8 opcode; opcode = req->opcode; @@ -2063,25 +2807,31 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, return io_import_fixed(req, rw, iter); } - /* buffer index only valid with fixed read/write */ - if (req->rw.kiocb.private) + /* buffer index only valid with fixed read/write, or buffer select */ + if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { - ssize_t ret; + if (req->flags & REQ_F_BUFFER_SELECT) { + buf = io_rw_buffer_select(req, &sqe_len, needs_lock); + if (IS_ERR(buf)) + return PTR_ERR(buf); + req->rw.len = 
sqe_len; + } + ret = import_single_range(rw, buf, sqe_len, *iovec, iter); *iovec = NULL; return ret < 0 ? ret : sqe_len; } - if (req->io) { - struct io_async_rw *iorw = &req->io->rw; - - *iovec = iorw->iov; - iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size); - if (iorw->iov == iorw->fast_iov) - *iovec = NULL; - return iorw->size; + if (req->flags & REQ_F_BUFFER_SELECT) { + ret = io_iov_buffer_select(req, *iovec, needs_lock); + if (!ret) { + ret = (*iovec)->iov_len; + iov_iter_init(iter, rw, *iovec, 1, ret); + } + *iovec = NULL; + return ret; } #ifdef CONFIG_COMPAT @@ -2093,6 +2843,21 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); } +static ssize_t io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct iov_iter *iter, + bool needs_lock) +{ + if (!req->io) + return __io_import_iovec(rw, req, iovec, iter, needs_lock); + *iovec = NULL; + return iov_iter_count(&req->io->rw.iter); +} + +static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) +{ + return kiocb->ki_filp->f_mode & FMODE_STREAM ? NULL : &kiocb->ki_pos; +} + /* * For files that don't have ->read_iter() and ->write_iter(), handle them * by looping over ->read() or ->write() manually. 
@@ -2128,10 +2893,10 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, if (rw == READ) { nr = file->f_op->read(file, iovec.iov_base, - iovec.iov_len, &kiocb->ki_pos); + iovec.iov_len, io_kiocb_ppos(kiocb)); } else { nr = file->f_op->write(file, iovec.iov_base, - iovec.iov_len, &kiocb->ki_pos); + iovec.iov_len, io_kiocb_ppos(kiocb)); } if (iov_iter_is_bvec(iter)) @@ -2151,50 +2916,82 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, return ret; } -static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, - struct iovec *iovec, struct iovec *fast_iov, - struct iov_iter *iter) +static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, + const struct iovec *fast_iov, struct iov_iter *iter) { - req->io->rw.nr_segs = iter->nr_segs; - req->io->rw.size = io_size; - req->io->rw.iov = iovec; - if (!req->io->rw.iov) { - req->io->rw.iov = req->io->rw.fast_iov; - memcpy(req->io->rw.iov, fast_iov, - sizeof(struct iovec) * iter->nr_segs); + struct io_async_rw *rw = &req->io->rw; + + memcpy(&rw->iter, iter, sizeof(*iter)); + rw->free_iovec = NULL; + rw->bytes_done = 0; + /* can only be fixed buffers, no need to do anything */ + if (iter->type == ITER_BVEC) + return; + if (!iovec) { + unsigned iov_off = 0; + + rw->iter.iov = rw->fast_iov; + if (iter->iov != fast_iov) { + iov_off = iter->iov - fast_iov; + rw->iter.iov += iov_off; + } + if (rw->fast_iov != fast_iov) + memcpy(rw->fast_iov + iov_off, fast_iov + iov_off, + sizeof(struct iovec) * iter->nr_segs); } else { + rw->free_iovec = iovec; req->flags |= REQ_F_NEED_CLEANUP; } } -static int io_alloc_async_ctx(struct io_kiocb *req) +static inline int __io_alloc_async_ctx(struct io_kiocb *req) { - if (!io_op_defs[req->opcode].async_ctx) - return 0; req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); return req->io == NULL; } -static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, - struct iovec *iovec, struct iovec *fast_iov, - struct iov_iter 
*iter) +static int io_alloc_async_ctx(struct io_kiocb *req) { if (!io_op_defs[req->opcode].async_ctx) return 0; + + return __io_alloc_async_ctx(req); +} + +static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, + const struct iovec *fast_iov, + struct iov_iter *iter, bool force) +{ + if (!force && !io_op_defs[req->opcode].async_ctx) + return 0; if (!req->io) { - if (io_alloc_async_ctx(req)) + if (__io_alloc_async_ctx(req)) return -ENOMEM; - io_req_map_rw(req, io_size, iovec, fast_iov, iter); + io_req_map_rw(req, iovec, fast_iov, iter); } return 0; } +static inline int io_rw_prep_async(struct io_kiocb *req, int rw, + bool force_nonblock) +{ + struct io_async_rw *iorw = &req->io->rw; + ssize_t ret; + + iorw->iter.iov = iorw->fast_iov; + ret = __io_import_iovec(rw, req, (struct iovec **) &iorw->iter.iov, + &iorw->iter, !force_nonblock); + if (unlikely(ret < 0)) + return ret; + + io_req_map_rw(req, iorw->iter.iov, iorw->fast_iov, &iorw->iter); + return 0; +} + static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool force_nonblock) { - struct io_async_ctx *io; - struct iov_iter iter; ssize_t ret; ret = io_prep_rw(req, sqe, force_nonblock); @@ -2207,83 +3004,210 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, /* either don't need iovec imported or already have it */ if (!req->io || req->flags & REQ_F_NEED_CLEANUP) return 0; + return io_rw_prep_async(req, READ, force_nonblock); +} - io = req->io; - io->rw.iov = io->rw.fast_iov; - req->io = NULL; - ret = io_import_iovec(READ, req, &io->rw.iov, &iter); - req->io = io; - if (ret < 0) - return ret; +/* + * This is our waitqueue callback handler, registered through lock_page_async() + * when we initially tried to do the IO with the iocb armed our waitqueue. + * This gets called when the page is unlocked, and we generally expect that to + * happen when the page IO is completed and the page is now uptodate. 
This will + * queue a task_work based retry of the operation, attempting to copy the data + * again. If the latter fails because the page was NOT uptodate, then we will + * do a thread based blocking retry of the operation. That's the unexpected + * slow path. + */ +static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, + int sync, void *arg) +{ + struct wait_page_queue *wpq; + struct io_kiocb *req = wait->private; + struct wait_page_key *key = arg; + int ret; - io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); - return 0; + wpq = container_of(wait, struct wait_page_queue, wait); + + if (!wake_page_match(wpq, key)) + return 0; + + list_del_init(&wait->entry); + + init_task_work(&req->task_work, io_req_task_submit); + percpu_ref_get(&req->ctx->refs); + + /* submit ref gets dropped, acquire a new one */ + refcount_inc(&req->refs); + ret = io_req_task_work_add(req, &req->task_work, true); + if (unlikely(ret)) { + struct task_struct *tsk; + + /* queue just for cancelation */ + init_task_work(&req->task_work, io_req_task_cancel); + tsk = io_wq_get_task(req->ctx->io_wq); + task_work_add(tsk, &req->task_work, 0); + wake_up_process(tsk); + } + return 1; +} + +/* + * This controls whether a given IO request should be armed for async page + * based retry. If we return false here, the request is handed to the async + * worker threads for retry. If we're doing buffered reads on a regular file, + * we prepare a private wait_page_queue entry and retry the operation. This + * will either succeed because the page is now uptodate and unlocked, or it + * will register a callback when the page is unlocked at IO completion. Through + * that callback, io_uring uses task_work to setup a retry of the operation. + * That retry will attempt the buffered read again. The retry will generally + * succeed, or in rare cases where it fails, we then fall back to using the + * async worker threads for a blocking retry. 
+ */ +static bool io_rw_should_retry(struct io_kiocb *req) +{ + struct wait_page_queue *wait = &req->io->rw.wpq; + struct kiocb *kiocb = &req->rw.kiocb; + + /* never retry for NOWAIT, we just complete with -EAGAIN */ + if (req->flags & REQ_F_NOWAIT) + return false; + + /* Only for buffered IO */ + if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) + return false; + + /* + * just use poll if we can, and don't attempt if the fs doesn't + * support callback based unlocks + */ + if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) + return false; + + wait->wait.func = io_async_buf_func; + wait->wait.private = req; + wait->wait.flags = 0; + INIT_LIST_HEAD(&wait->wait.entry); + kiocb->ki_flags |= IOCB_WAITQ; + kiocb->ki_waitq = wait; + + io_get_req_task(req); + return true; } -static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) +{ + if (req->file->f_op->read_iter) + return call_read_iter(req->file, &req->rw.kiocb, iter); + else if (req->file->f_op->read) + return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter); + else + return -EINVAL; +} + +static int io_read(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; - struct iov_iter iter; + struct iov_iter __iter, *iter = &__iter; + ssize_t io_size, ret, ret2; size_t iov_count; - ssize_t io_size, ret; - ret = io_import_iovec(READ, req, &iovec, &iter); + if (req->io) + iter = &req->io->rw.iter; + + ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); if (ret < 0) return ret; + iov_count = iov_iter_count(iter); + io_size = ret; + req->result = io_size; + ret = 0; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) - req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; + kiocb->ki_flags &= ~IOCB_NOWAIT; - req->result = 0; - io_size = ret; - if (req->flags & 
REQ_F_LINK) - req->result = io_size; - - /* - * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so - * we know to async punt it even if it was opened O_NONBLOCK - */ - if (force_nonblock && !io_file_supports_async(req->file)) { - req->flags |= REQ_F_MUST_PUNT; + /* If the file doesn't support async, just async punt */ + if (force_nonblock && !io_file_supports_async(req->file, READ)) goto copy_iov; - } - iov_count = iov_iter_count(&iter); - ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); + ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count); + if (unlikely(ret)) + goto out_free; + + ret = io_iter_do_read(req, iter); + if (!ret) { - ssize_t ret2; + goto done; + } else if (ret == -EIOCBQUEUED) { + ret = 0; + goto out_free; + } else if (ret == -EAGAIN) { + /* IOPOLL retry should happen for io-wq threads */ + if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) + goto done; + /* some cases will consume bytes even on error returns */ + iov_iter_revert(iter, iov_count - iov_iter_count(iter)); + ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); + if (ret) + goto out_free; + return -EAGAIN; + } else if (ret < 0) { + /* make sure -ERESTARTSYS -> -EINTR is done */ + goto done; + } - if (req->file->f_op->read_iter) - ret2 = call_read_iter(req->file, kiocb, &iter); - else - ret2 = loop_rw_iter(READ, req->file, kiocb, &iter); + /* read it all, or we did blocking attempt. no retry. 
*/ + if (!iov_iter_count(iter) || !force_nonblock || + (req->file->f_flags & O_NONBLOCK)) + goto done; - /* Catch -EAGAIN return for forced non-blocking submission */ - if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, nxt, req->in_async); - } else { + io_size -= ret; copy_iov: - ret = io_setup_async_rw(req, io_size, iovec, - inline_vecs, &iter); - if (ret) - goto out_free; - return -EAGAIN; - } + ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true); + if (ret2) { + ret = ret2; + goto out_free; + } + /* it's copied and will be cleaned with ->io */ + iovec = NULL; + /* now use our persistent iterator, if we aren't already */ + iter = &req->io->rw.iter; +retry: + req->io->rw.bytes_done += ret; + /* if we can retry, do so with the callbacks armed */ + if (!io_rw_should_retry(req)) { + kiocb->ki_flags &= ~IOCB_WAITQ; + return -EAGAIN; } + + /* + * Now retry read with the IOCB_WAITQ parts set in the iocb. If we + * get -EIOCBQUEUED, then we'll get a notification when the desired + * page gets unlocked. We can also get a partial read here, and if we + * do, then just retry at the new offset. + */ + ret = io_iter_do_read(req, iter); + if (ret == -EIOCBQUEUED) { + ret = 0; + goto out_free; + } else if (ret > 0 && ret < io_size) { + /* we got some bytes, but not all. retry. 
*/ + goto retry; + } +done: + kiocb_done(kiocb, ret, cs); + ret = 0; out_free: - kfree(iovec); - req->flags &= ~REQ_F_NEED_CLEANUP; + /* it's reportedly faster than delegating the null check to kfree() */ + if (iovec) + kfree(iovec); return ret; } static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool force_nonblock) { - struct io_async_ctx *io; - struct iov_iter iter; ssize_t ret; ret = io_prep_rw(req, sqe, force_nonblock); @@ -2296,114 +3220,207 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, /* either don't need iovec imported or already have it */ if (!req->io || req->flags & REQ_F_NEED_CLEANUP) return 0; - - io = req->io; - io->rw.iov = io->rw.fast_iov; - req->io = NULL; - ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter); - req->io = io; - if (ret < 0) - return ret; - - io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); - return 0; + return io_rw_prep_async(req, WRITE, force_nonblock); } -static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_write(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; - struct iov_iter iter; + struct iov_iter __iter, *iter = &__iter; size_t iov_count; - ssize_t ret, io_size; + ssize_t ret, ret2, io_size; + + if (req->io) + iter = &req->io->rw.iter; - ret = io_import_iovec(WRITE, req, &iovec, &iter); + ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); if (ret < 0) return ret; + iov_count = iov_iter_count(iter); + io_size = ret; + req->result = io_size; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; - req->result = 0; - io_size = ret; - if (req->flags & REQ_F_LINK) - req->result = io_size; - - /* - * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so - * we know to async punt it even if it was 
opened O_NONBLOCK - */ - if (force_nonblock && !io_file_supports_async(req->file)) { - req->flags |= REQ_F_MUST_PUNT; + /* If the file doesn't support async, just async punt */ + if (force_nonblock && !io_file_supports_async(req->file, WRITE)) goto copy_iov; - } /* file path doesn't support NOWAIT for non-direct_IO */ if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && (req->flags & REQ_F_ISREG)) goto copy_iov; - iov_count = iov_iter_count(&iter); - ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); - if (!ret) { - ssize_t ret2; + ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count); + if (unlikely(ret)) + goto out_free; - /* - * Open-code file_start_write here to grab freeze protection, - * which will be released by another thread in - * io_complete_rw(). Fool lockdep by telling it the lock got - * released so that it doesn't complain about the held lock when - * we return to userspace. - */ - if (req->flags & REQ_F_ISREG) { - __sb_start_write(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE, true); - __sb_writers_release(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE); - } - kiocb->ki_flags |= IOCB_WRITE; + /* + * Open-code file_start_write here to grab freeze protection, + * which will be released by another thread in + * io_complete_rw(). Fool lockdep by telling it the lock got + * released so that it doesn't complain about the held lock when + * we return to userspace. + */ + if (req->flags & REQ_F_ISREG) { + __sb_start_write(file_inode(req->file)->i_sb, + SB_FREEZE_WRITE, true); + __sb_writers_release(file_inode(req->file)->i_sb, + SB_FREEZE_WRITE); + } + kiocb->ki_flags |= IOCB_WRITE; - if (req->file->f_op->write_iter) - ret2 = call_write_iter(req->file, kiocb, &iter); - else - ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); - /* - * Raw bdev writes will -EOPNOTSUPP for IOCB_NOWAIT. Just - * retry them without IOCB_NOWAIT. 
- */ - if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) - ret2 = -EAGAIN; - if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, nxt, req->in_async); - } else { + if (req->file->f_op->write_iter) + ret2 = call_write_iter(req->file, kiocb, iter); + else if (req->file->f_op->write) + ret2 = loop_rw_iter(WRITE, req->file, kiocb, iter); + else + ret2 = -EINVAL; + + /* + * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just + * retry them without IOCB_NOWAIT. + */ + if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) + ret2 = -EAGAIN; + if (!force_nonblock || ret2 != -EAGAIN) { + /* IOPOLL retry should happen for io-wq threads */ + if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN) + goto copy_iov; + kiocb_done(kiocb, ret2, cs); + } else { copy_iov: - ret = io_setup_async_rw(req, io_size, iovec, - inline_vecs, &iter); - if (ret) - goto out_free; + /* some cases will consume bytes even on error returns */ + iov_iter_revert(iter, iov_count - iov_iter_count(iter)); + ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); + if (!ret) return -EAGAIN; - } } out_free: - req->flags &= ~REQ_F_NEED_CLEANUP; - kfree(iovec); + /* it's reportedly faster than delegating the null check to kfree() */ + if (iovec) + kfree(iovec); return ret; } +static int __io_splice_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_splice* sp = &req->splice; + unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; + int ret; + + if (req->flags & REQ_F_NEED_CLEANUP) + return 0; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sp->file_in = NULL; + sp->len = READ_ONCE(sqe->len); + sp->flags = READ_ONCE(sqe->splice_flags); + + if (unlikely(sp->flags & ~valid_flags)) + return -EINVAL; + + ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in, + (sp->flags & SPLICE_F_FD_IN_FIXED)); + if (ret) + return ret; + req->flags |= REQ_F_NEED_CLEANUP; + + if 
(!S_ISREG(file_inode(sp->file_in)->i_mode)) { + /* + * Splice operation will be punted aync, and here need to + * modify io_wq_work.flags, so initialize io_wq_work firstly. + */ + io_req_init_async(req); + req->work.flags |= IO_WQ_WORK_UNBOUND; + } + + return 0; +} + +static int io_tee_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off)) + return -EINVAL; + return __io_splice_prep(req, sqe); +} + +static int io_tee(struct io_kiocb *req, bool force_nonblock) +{ + struct io_splice *sp = &req->splice; + struct file *in = sp->file_in; + struct file *out = sp->file_out; + unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; + long ret = 0; + + if (force_nonblock) + return -EAGAIN; + if (sp->len) + ret = do_tee(in, out, sp->len, flags); + + io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); + req->flags &= ~REQ_F_NEED_CLEANUP; + + if (ret != sp->len) + req_set_fail_links(req); + io_req_complete(req, ret); + return 0; +} + +static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_splice* sp = &req->splice; + + sp->off_in = READ_ONCE(sqe->splice_off_in); + sp->off_out = READ_ONCE(sqe->off); + return __io_splice_prep(req, sqe); +} + +static int io_splice(struct io_kiocb *req, bool force_nonblock) +{ + struct io_splice *sp = &req->splice; + struct file *in = sp->file_in; + struct file *out = sp->file_out; + unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; + loff_t *poff_in, *poff_out; + long ret = 0; + + if (force_nonblock) + return -EAGAIN; + + poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; + poff_out = (sp->off_out == -1) ? 
NULL : &sp->off_out; + + if (sp->len) + ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); + + io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); + req->flags &= ~REQ_F_NEED_CLEANUP; + + if (ret != sp->len) + req_set_fail_links(req); + io_req_complete(req, ret); + return 0; +} + /* * IORING_OP_NOP just posts a completion event, nothing else. */ -static int io_nop(struct io_kiocb *req) +static int io_nop(struct io_kiocb *req, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - io_cqring_add_event(req, 0); - io_put_req(req); + __io_req_complete(req, 0, 0, cs); return 0; } @@ -2428,104 +3445,31 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static bool io_req_cancelled(struct io_kiocb *req) -{ - if (req->work.flags & IO_WQ_WORK_CANCEL) { - req_set_fail_links(req); - io_cqring_add_event(req, -ECANCELED); - io_put_req(req); - return true; - } - - return false; -} - -static void io_link_work_cb(struct io_wq_work **workptr) -{ - struct io_wq_work *work = *workptr; - struct io_kiocb *link = work->data; - - io_queue_linked_timeout(link); - work->func = io_wq_submit_work; -} - -static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) -{ - struct io_kiocb *link; - - io_prep_async_work(nxt, &link); - *workptr = &nxt->work; - if (link) { - nxt->work.flags |= IO_WQ_WORK_CB; - nxt->work.func = io_link_work_cb; - nxt->work.data = link; - } -} - -static void io_fsync_finish(struct io_wq_work **workptr) +static int io_fsync(struct io_kiocb *req, bool force_nonblock) { - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); loff_t end = req->sync.off + req->sync.len; - struct io_kiocb *nxt = NULL; int ret; - if (io_req_cancelled(req)) - return; + /* fsync always requires a blocking context */ + if (force_nonblock) + return -EAGAIN; ret = vfs_fsync_range(req->file, req->sync.off, end > 0 ? 
end : LLONG_MAX, req->sync.flags & IORING_FSYNC_DATASYNC); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req_find_next(req, &nxt); - if (nxt) - io_wq_assign_next(workptr, nxt); -} - -static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) -{ - struct io_wq_work *work, *old_work; - - /* fsync always requires a blocking context */ - if (force_nonblock) { - io_put_req(req); - req->work.func = io_fsync_finish; - return -EAGAIN; - } - - work = old_work = &req->work; - io_fsync_finish(&work); - if (work && work != old_work) - *nxt = container_of(work, struct io_kiocb, work); + io_req_complete(req, ret); return 0; } -static void io_fallocate_finish(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct io_kiocb *nxt = NULL; - int ret; - - if (io_req_cancelled(req)) - return; - - ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, - req->sync.len); - if (ret < 0) - req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req_find_next(req, &nxt); - if (nxt) - io_wq_assign_next(workptr, nxt); -} - static int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (sqe->ioprio || sqe->buf_index || sqe->rw_flags) return -EINVAL; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; req->sync.off = READ_ONCE(sqe->off); req->sync.len = READ_ONCE(sqe->addr); @@ -2533,73 +3477,72 @@ static int io_fallocate_prep(struct io_kiocb *req, return 0; } -static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_fallocate(struct io_kiocb *req, bool force_nonblock) { - struct io_wq_work *work, *old_work; + int ret; /* fallocate always requiring blocking context */ - if (force_nonblock) { - io_put_req(req); - req->work.func = io_fallocate_finish; + if (force_nonblock) return -EAGAIN; - } - - work = old_work = &req->work; - io_fallocate_finish(&work); - if (work 
&& work != old_work) - *nxt = container_of(work, struct io_kiocb, work); - + ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, + req->sync.len); + if (ret < 0) + req_set_fail_links(req); + io_req_complete(req, ret); return 0; } -static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { const char __user *fname; int ret; - if (sqe->ioprio || sqe->buf_index) + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + return -EINVAL; + if (unlikely(sqe->ioprio || sqe->buf_index)) return -EINVAL; - if (sqe->flags & IOSQE_FIXED_FILE) + if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; - if (req->flags & REQ_F_NEED_CLEANUP) - return 0; + + /* open.how should be already initialised */ + if (!(req->open.how.flags & O_PATH) && force_o_largefile()) + req->open.how.flags |= O_LARGEFILE; req->open.dfd = READ_ONCE(sqe->fd); - req->open.how.mode = READ_ONCE(sqe->len); fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - req->open.how.flags = READ_ONCE(sqe->open_flags); - req->open.filename = getname(fname); if (IS_ERR(req->open.filename)) { ret = PTR_ERR(req->open.filename); req->open.filename = NULL; return ret; } - + req->open.nofile = rlimit(RLIMIT_NOFILE); req->flags |= REQ_F_NEED_CLEANUP; return 0; } +static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + u64 flags, mode; + + if (req->flags & REQ_F_NEED_CLEANUP) + return 0; + mode = READ_ONCE(sqe->len); + flags = READ_ONCE(sqe->open_flags); + req->open.how = build_open_how(flags, mode); + return __io_openat_prep(req, sqe); +} + static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct open_how __user *how; - const char __user *fname; size_t len; int ret; - if (sqe->ioprio || sqe->buf_index) - return -EINVAL; - if (sqe->flags & IOSQE_FIXED_FILE) - return -EBADF; if (req->flags & REQ_F_NEED_CLEANUP) return 0; - - 
req->open.dfd = READ_ONCE(sqe->fd); - fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); len = READ_ONCE(sqe->len); - if (len < OPEN_HOW_SIZE_VER0) return -EINVAL; @@ -2608,22 +3551,10 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (ret) return ret; - if (!(req->open.how.flags & O_PATH) && force_o_largefile()) - req->open.how.flags |= O_LARGEFILE; - - req->open.filename = getname(fname); - if (IS_ERR(req->open.filename)) { - ret = PTR_ERR(req->open.filename); - req->open.filename = NULL; - return ret; - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; + return __io_openat_prep(req, sqe); } -static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_openat2(struct io_kiocb *req, bool force_nonblock) { struct open_flags op; struct file *file; @@ -2636,7 +3567,7 @@ static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt, if (ret) goto err; - ret = get_unused_fd_flags(req->open.how.flags); + ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); if (ret < 0) goto err; @@ -2653,16 +3584,170 @@ err: req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + io_req_complete(req, ret); return 0; } -static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_openat(struct io_kiocb *req, bool force_nonblock) { - req->open.how = build_open_how(req->open.how.flags, req->open.how.mode); - return io_openat2(req, nxt, force_nonblock); + return io_openat2(req, force_nonblock); +} + +static int io_remove_buffers_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_provide_buf *p = &req->pbuf; + u64 tmp; + + if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off) + return -EINVAL; + + tmp = READ_ONCE(sqe->fd); + if (!tmp || tmp > USHRT_MAX) + return -EINVAL; + + 
memset(p, 0, sizeof(*p)); + p->nbufs = tmp; + p->bgid = READ_ONCE(sqe->buf_group); + return 0; +} + +static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, + int bgid, unsigned nbufs) +{ + unsigned i = 0; + + /* shouldn't happen */ + if (!nbufs) + return 0; + + /* the head kbuf is the list itself */ + while (!list_empty(&buf->list)) { + struct io_buffer *nxt; + + nxt = list_first_entry(&buf->list, struct io_buffer, list); + list_del(&nxt->list); + kfree(nxt); + if (++i == nbufs) + return i; + } + i++; + kfree(buf); + idr_remove(&ctx->io_buffer_idr, bgid); + + return i; +} + +static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) +{ + struct io_provide_buf *p = &req->pbuf; + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer *head; + int ret = 0; + + io_ring_submit_lock(ctx, !force_nonblock); + + lockdep_assert_held(&ctx->uring_lock); + + ret = -ENOENT; + head = idr_find(&ctx->io_buffer_idr, p->bgid); + if (head) + ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); + + /* pairs with the io_ring_submit_lock() at the top of this function */ + io_ring_submit_unlock(ctx, !force_nonblock); + if (ret < 0) + req_set_fail_links(req); + __io_req_complete(req, ret, 0, cs); + return 0; +} + +static int io_provide_buffers_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_provide_buf *p = &req->pbuf; + u64 tmp; + + if (sqe->ioprio || sqe->rw_flags) + return -EINVAL; + + tmp = READ_ONCE(sqe->fd); + if (!tmp || tmp > USHRT_MAX) + return -E2BIG; + p->nbufs = tmp; + p->addr = READ_ONCE(sqe->addr); + p->len = READ_ONCE(sqe->len); + + if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs))) + return -EFAULT; + + p->bgid = READ_ONCE(sqe->buf_group); + tmp = READ_ONCE(sqe->off); + if (tmp > USHRT_MAX) + return -E2BIG; + p->bid = tmp; + return 0; +} + +static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) +{ + struct io_buffer *buf; + u64 addr = pbuf->addr; + int i, bid = pbuf->bid; + + for (i = 0; i < pbuf->nbufs; i++) { +
buf = kmalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) + break; + + buf->addr = addr; + buf->len = pbuf->len; + buf->bid = bid; + addr += pbuf->len; + bid++; + if (!*head) { + INIT_LIST_HEAD(&buf->list); + *head = buf; + } else { + list_add_tail(&buf->list, &(*head)->list); + } + } + + return i ? i : -ENOMEM; +} + +static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) +{ + struct io_provide_buf *p = &req->pbuf; + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer *head, *list; + int ret = 0; + + io_ring_submit_lock(ctx, !force_nonblock); + + lockdep_assert_held(&ctx->uring_lock); + + list = head = idr_find(&ctx->io_buffer_idr, p->bgid); + + ret = io_add_buffers(p, &head); + if (ret < 0) + goto out; + + if (!list) { + ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1, + GFP_KERNEL); + if (ret < 0) { + __io_remove_buffers(ctx, head, p->bgid, -1U); + goto out; + } + } +out: + io_ring_submit_unlock(ctx, !force_nonblock); + if (ret < 0) + req_set_fail_links(req); + __io_req_complete(req, ret, 0, cs); + return 0; } static int io_epoll_ctl_prep(struct io_kiocb *req, @@ -2671,6 +3756,8 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, #if defined(CONFIG_EPOLL) if (sqe->ioprio || sqe->buf_index) return -EINVAL; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; req->epoll.epfd = READ_ONCE(sqe->fd); req->epoll.op = READ_ONCE(sqe->len); @@ -2690,8 +3777,8 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, #endif } -static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { #if defined(CONFIG_EPOLL) struct io_epoll *ie = &req->epoll; @@ -2703,8 +3790,7 @@ static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt, if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + __io_req_complete(req, ret, 0, 
cs); return 0; #else return -EOPNOTSUPP; @@ -2716,6 +3802,8 @@ static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) if (sqe->ioprio || sqe->buf_index || sqe->off) return -EINVAL; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; req->madvise.addr = READ_ONCE(sqe->addr); req->madvise.len = READ_ONCE(sqe->len); @@ -2726,8 +3814,7 @@ static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) #endif } -static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_madvise(struct io_kiocb *req, bool force_nonblock) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) struct io_madvise *ma = &req->madvise; @@ -2739,8 +3826,7 @@ static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt, ret = do_madvise(ma->addr, ma->len, ma->advice); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + io_req_complete(req, ret); return 0; #else return -EOPNOTSUPP; @@ -2751,6 +3837,8 @@ static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (sqe->ioprio || sqe->buf_index || sqe->addr) return -EINVAL; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; req->fadvise.offset = READ_ONCE(sqe->off); req->fadvise.len = READ_ONCE(sqe->len); @@ -2758,8 +3846,7 @@ static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_fadvise(struct io_kiocb *req, bool force_nonblock) { struct io_fadvise *fa = &req->fadvise; int ret; @@ -2778,83 +3865,46 @@ static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt, ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req_find_next(req, 
nxt); + io_req_complete(req, ret); return 0; } static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - const char __user *fname; - unsigned lookup_flags; - int ret; - + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; if (sqe->ioprio || sqe->buf_index) return -EINVAL; - if (sqe->flags & IOSQE_FIXED_FILE) + if (req->flags & REQ_F_FIXED_FILE) return -EBADF; - if (req->flags & REQ_F_NEED_CLEANUP) - return 0; - req->open.dfd = READ_ONCE(sqe->fd); - req->open.mask = READ_ONCE(sqe->len); - fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - req->open.how.flags = READ_ONCE(sqe->statx_flags); + req->statx.dfd = READ_ONCE(sqe->fd); + req->statx.mask = READ_ONCE(sqe->len); + req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr)); + req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + req->statx.flags = READ_ONCE(sqe->statx_flags); - if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags)) - return -EINVAL; - - req->open.filename = getname_flags(fname, lookup_flags, NULL); - if (IS_ERR(req->open.filename)) { - ret = PTR_ERR(req->open.filename); - req->open.filename = NULL; - return ret; - } - - req->flags |= REQ_F_NEED_CLEANUP; return 0; } -static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_statx(struct io_kiocb *req, bool force_nonblock) { - struct io_open *ctx = &req->open; - unsigned lookup_flags; - struct path path; - struct kstat stat; + struct io_statx *ctx = &req->statx; int ret; - if (force_nonblock) + if (force_nonblock) { + /* only need file table for an actual valid fd */ + if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD) + req->flags |= REQ_F_NO_FILE_TABLE; return -EAGAIN; + } - if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags)) - return -EINVAL; - -retry: - /* filename_lookup() drops it, keep a reference */ - ctx->filename->refcnt++; - - ret = filename_lookup(ctx->dfd, 
ctx->filename, lookup_flags, &path, - NULL); - if (ret) - goto err; + ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask, + ctx->buffer); - ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags); - path_put(&path); - if (retry_estale(ret, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - if (!ret) - ret = cp_statx(&stat, ctx->buffer); -err: - putname(ctx->filename); - req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + io_req_complete(req, ret); return 0; } @@ -2862,76 +3912,58 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { /* * If we queue this for async, it must not be cancellable. That would - * leave the 'file' in an undeterminate state. + * leave the 'file' in an undeterminate state, and here need to modify + * io_wq_work.flags, so initialize io_wq_work firstly. */ + io_req_init_async(req); req->work.flags |= IO_WQ_WORK_NO_CANCEL; + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + return -EINVAL; if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) return -EINVAL; - if (sqe->flags & IOSQE_FIXED_FILE) + if (req->flags & REQ_F_FIXED_FILE) return -EBADF; req->close.fd = READ_ONCE(sqe->fd); - if (req->file->f_op == &io_uring_fops || + if ((req->file && req->file->f_op == &io_uring_fops) || req->close.fd == req->ctx->ring_fd) return -EBADF; + req->close.put_file = NULL; return 0; } -/* only called when __close_fd_get_file() is done */ -static void __io_close_finish(struct io_kiocb *req, struct io_kiocb **nxt) -{ - int ret; - - ret = filp_close(req->close.put_file, req->work.files); - if (ret < 0) - req_set_fail_links(req); - io_cqring_add_event(req, ret); - fput(req->close.put_file); - io_put_req_find_next(req, nxt); -} - -static void io_close_finish(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct 
io_kiocb, work); - struct io_kiocb *nxt = NULL; - - /* not cancellable, don't do io_req_cancelled() */ - __io_close_finish(req, &nxt); - if (nxt) - io_wq_assign_next(workptr, nxt); -} - -static int io_close(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_close(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { + struct io_close *close = &req->close; int ret; - req->close.put_file = NULL; - ret = __close_fd_get_file(req->close.fd, &req->close.put_file); - if (ret < 0) - return ret; + /* might be already done during nonblock submission */ + if (!close->put_file) { + ret = __close_fd_get_file(close->fd, &close->put_file); + if (ret < 0) + return (ret == -ENOENT) ? -EBADF : ret; + } /* if the file has a flush method, be safe and punt to async */ - if (req->close.put_file->f_op->flush && !io_wq_current_is_worker()) - goto eagain; + if (close->put_file->f_op->flush && force_nonblock) { + /* was never set, but play safe */ + req->flags &= ~REQ_F_NOWAIT; + /* avoid grabbing files - we don't need the files */ + req->flags |= REQ_F_NO_FILE_TABLE; + return -EAGAIN; + } - /* - * No ->flush(), safely close from here and just punt the - * fput() to async context. 
- */ - __io_close_finish(req, nxt); - return 0; -eagain: - req->work.func = io_close_finish; - /* - * Do manual async queue here to avoid grabbing files - we don't - * need the files, and it'll cause io_close_finish() to close - * the file again and cause a double CQE entry for this request - */ - io_queue_async_work(req); + /* No ->flush() or already async, safely close from here */ + ret = filp_close(close->put_file, req->work.files); + if (ret < 0) + req_set_fail_links(req); + fput(close->put_file); + close->put_file = NULL; + __io_req_complete(req, ret, 0, cs); return 0; } @@ -2953,53 +3985,58 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static void io_sync_file_range_finish(struct io_wq_work **workptr) +static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) { - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct io_kiocb *nxt = NULL; int ret; - if (io_req_cancelled(req)) - return; + /* sync_file_range always requires a blocking context */ + if (force_nonblock) + return -EAGAIN; ret = sync_file_range(req->file, req->sync.off, req->sync.len, req->sync.flags); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req_find_next(req, &nxt); - if (nxt) - io_wq_assign_next(workptr, nxt); + io_req_complete(req, ret); + return 0; } -static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +#if defined(CONFIG_NET) +static int io_setup_async_msg(struct io_kiocb *req, + struct io_async_msghdr *kmsg) { - struct io_wq_work *work, *old_work; - - /* sync_file_range always requires a blocking context */ - if (force_nonblock) { - io_put_req(req); - req->work.func = io_sync_file_range_finish; + if (req->io) return -EAGAIN; + if (io_alloc_async_ctx(req)) { + if (kmsg->iov != kmsg->fast_iov) + kfree(kmsg->iov); + return -ENOMEM; } + req->flags |= REQ_F_NEED_CLEANUP; + memcpy(&req->io->msg, kmsg, sizeof(*kmsg)); + return 
-EAGAIN; +} - work = old_work = &req->work; - io_sync_file_range_finish(&work); - if (work && work != old_work) - *nxt = container_of(work, struct io_kiocb, work); - return 0; +static int io_sendmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + iomsg->iov = iomsg->fast_iov; + iomsg->msg.msg_name = &iomsg->addr; + return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg, + req->sr_msg.msg_flags, &iomsg->iov); } static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { -#if defined(CONFIG_NET) struct io_sr_msg *sr = &req->sr_msg; struct io_async_ctx *io = req->io; int ret; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + sr->msg_flags = READ_ONCE(sqe->msg_flags); - sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); #ifdef CONFIG_COMPAT @@ -3013,151 +4050,224 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->flags & REQ_F_NEED_CLEANUP) return 0; - io->msg.iov = io->msg.fast_iov; - ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, - &io->msg.iov); + ret = io_sendmsg_copy_hdr(req, &io->msg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; -#else - return -EOPNOTSUPP; -#endif } -static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { -#if defined(CONFIG_NET) - struct io_async_msghdr *kmsg = NULL; + struct io_async_msghdr iomsg, *kmsg; struct socket *sock; + unsigned flags; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_async_ctx io; - unsigned flags; - - if (req->io) { - kmsg = &req->io->msg; - kmsg->msg.msg_name = &req->io->msg.addr; - /* if iov is set, it's allocated already */ - if (!kmsg->iov) - kmsg->iov = kmsg->fast_iov; 
- kmsg->msg.msg_iter.iov = kmsg->iov; - } else { - struct io_sr_msg *sr = &req->sr_msg; + if (unlikely(!sock)) + return ret; - kmsg = &io.msg; - kmsg->msg.msg_name = &io.msg.addr; + if (req->io) { + kmsg = &req->io->msg; + kmsg->msg.msg_name = &req->io->msg.addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; + } else { + ret = io_sendmsg_copy_hdr(req, &iomsg); + if (ret) + return ret; + kmsg = &iomsg; + } - io.msg.iov = io.msg.fast_iov; - ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg, - sr->msg_flags, &io.msg.iov); - if (ret) - return ret; - } + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; - - ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); - if (force_nonblock && ret == -EAGAIN) { - if (req->io) - return -EAGAIN; - if (io_alloc_async_ctx(req)) { - if (kmsg->iov != kmsg->fast_iov) - kfree(kmsg->iov); - return -ENOMEM; - } - req->flags |= REQ_F_NEED_CLEANUP; - memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); - return -EAGAIN; - } - if (ret == -ERESTARTSYS) - ret = -EINTR; - } + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); + if (force_nonblock && ret == -EAGAIN) + return io_setup_async_msg(req, kmsg); + if (ret == -ERESTARTSYS) + ret = -EINTR; - if (kmsg && kmsg->iov != kmsg->fast_iov) + if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; - io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req_find_next(req, nxt); + __io_req_complete(req, ret, 0, cs); return 0; -#else - return -EOPNOTSUPP; -#endif } -static int io_send(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_send(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { 
-#if defined(CONFIG_NET) + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + struct iovec iov; struct socket *sock; + unsigned flags; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_sr_msg *sr = &req->sr_msg; - struct msghdr msg; - struct iovec iov; - unsigned flags; + if (unlikely(!sock)) + return ret; - ret = import_single_range(WRITE, sr->buf, sr->len, &iov, - &msg.msg_iter); - if (ret) - return ret; + ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); + if (unlikely(ret)) + return ret;; - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; - msg.msg_flags = flags; - ret = sock_sendmsg(sock, &msg); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - } + msg.msg_flags = flags; + ret = sock_sendmsg(sock, &msg); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; - io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req_find_next(req, nxt); + __io_req_complete(req, ret, 0, cs); return 0; -#else - return -EOPNOTSUPP; +} + +static int __io_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + struct io_sr_msg *sr = &req->sr_msg; + struct iovec __user *uiov; + size_t iov_len; + int ret; + + ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, + &iomsg->uaddr, &uiov, &iov_len); + if (ret) + return ret; + + if (req->flags & REQ_F_BUFFER_SELECT) { + if (iov_len > 1) + 
return -EINVAL; + if (copy_from_user(iomsg->iov, uiov, sizeof(*uiov))) + return -EFAULT; + sr->len = iomsg->iov[0].iov_len; + iov_iter_init(&iomsg->msg.msg_iter, READ, iomsg->iov, 1, + sr->len); + iomsg->iov = NULL; + } else { + ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV, + &iomsg->iov, &iomsg->msg.msg_iter); + if (ret > 0) + ret = 0; + } + + return ret; +} + +#ifdef CONFIG_COMPAT +static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + struct compat_msghdr __user *msg_compat; + struct io_sr_msg *sr = &req->sr_msg; + struct compat_iovec __user *uiov; + compat_uptr_t ptr; + compat_size_t len; + int ret; + + msg_compat = (struct compat_msghdr __user *) sr->umsg; + ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr, + &ptr, &len); + if (ret) + return ret; + + uiov = compat_ptr(ptr); + if (req->flags & REQ_F_BUFFER_SELECT) { + compat_ssize_t clen; + + if (len > 1) + return -EINVAL; + if (!access_ok(uiov, sizeof(*uiov))) + return -EFAULT; + if (__get_user(clen, &uiov->iov_len)) + return -EFAULT; + if (clen < 0) + return -EINVAL; + sr->len = iomsg->iov[0].iov_len; + iomsg->iov = NULL; + } else { + ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV, + &iomsg->iov, + &iomsg->msg.msg_iter); + if (ret < 0) + return ret; + } + + return 0; +} #endif + +static int io_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + iomsg->msg.msg_name = &iomsg->addr; + iomsg->iov = iomsg->fast_iov; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + return __io_compat_recvmsg_copy_hdr(req, iomsg); +#endif + + return __io_recvmsg_copy_hdr(req, iomsg); +} + +static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, + bool needs_lock) +{ + struct io_sr_msg *sr = &req->sr_msg; + struct io_buffer *kbuf; + + kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock); + if (IS_ERR(kbuf)) + return kbuf; + + sr->kbuf = kbuf; + req->flags |= REQ_F_BUFFER_SELECTED; + return kbuf; +} + 
+static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) +{ + return io_put_kbuf(req, req->sr_msg.kbuf); } static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { -#if defined(CONFIG_NET) struct io_sr_msg *sr = &req->sr_msg; struct io_async_ctx *io = req->io; int ret; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + sr->msg_flags = READ_ONCE(sqe->msg_flags); - sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); + sr->bgid = READ_ONCE(sqe->buf_group); #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -3170,147 +4280,128 @@ static int io_recvmsg_prep(struct io_kiocb *req, if (req->flags & REQ_F_NEED_CLEANUP) return 0; - io->msg.iov = io->msg.fast_iov; - ret = recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, - &io->msg.uaddr, &io->msg.iov); + ret = io_recvmsg_copy_hdr(req, &io->msg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; -#else - return -EOPNOTSUPP; -#endif } -static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { -#if defined(CONFIG_NET) - struct io_async_msghdr *kmsg = NULL; + struct io_async_msghdr iomsg, *kmsg; struct socket *sock; - int ret; - - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; + struct io_buffer *kbuf; + unsigned flags; + int ret, cflags = 0; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_async_ctx io; - unsigned flags; - - if (req->io) { - kmsg = &req->io->msg; - kmsg->msg.msg_name = &req->io->msg.addr; - /* if iov is set, it's allocated already */ - if (!kmsg->iov) - kmsg->iov = kmsg->fast_iov; - kmsg->msg.msg_iter.iov = kmsg->iov; - } else { - struct io_sr_msg *sr = &req->sr_msg; - - kmsg = &io.msg; - kmsg->msg.msg_name = &io.msg.addr; + if (unlikely(!sock)) + return ret; - io.msg.iov = io.msg.fast_iov; - ret = 
recvmsg_copy_msghdr(&io.msg.msg, sr->msg, - sr->msg_flags, &io.msg.uaddr, - &io.msg.iov); - if (ret) - return ret; - } + if (req->io) { + kmsg = &req->io->msg; + kmsg->msg.msg_name = &req->io->msg.addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; + } else { + ret = io_recvmsg_copy_hdr(req, &iomsg); + if (ret) + return ret; + kmsg = &iomsg; + } - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; - - ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg, - kmsg->uaddr, flags); - if (force_nonblock && ret == -EAGAIN) { - if (req->io) - return -EAGAIN; - if (io_alloc_async_ctx(req)) { - if (kmsg->iov != kmsg->fast_iov) - kfree(kmsg->iov); - return -ENOMEM; - } - memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); - req->flags |= REQ_F_NEED_CLEANUP; - return -EAGAIN; - } - if (ret == -ERESTARTSYS) - ret = -EINTR; + if (req->flags & REQ_F_BUFFER_SELECT) { + kbuf = io_recv_buffer_select(req, !force_nonblock); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); + iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov, + 1, req->sr_msg.len); } - if (kmsg && kmsg->iov != kmsg->fast_iov) + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, + kmsg->uaddr, flags); + if (force_nonblock && ret == -EAGAIN) + return io_setup_async_msg(req, kmsg); + if (ret == -ERESTARTSYS) + ret = -EINTR; + + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_recv_kbuf(req); + if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; - io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req_find_next(req, nxt); + __io_req_complete(req, ret, cflags, cs); return 0; 
-#else - return -EOPNOTSUPP; -#endif } -static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_recv(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { -#if defined(CONFIG_NET) + struct io_buffer *kbuf; + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + void __user *buf = sr->buf; struct socket *sock; - int ret; - - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; + struct iovec iov; + unsigned flags; + int ret, cflags = 0; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_sr_msg *sr = &req->sr_msg; - struct msghdr msg; - struct iovec iov; - unsigned flags; - - ret = import_single_range(READ, sr->buf, sr->len, &iov, - &msg.msg_iter); - if (ret) - return ret; + if (unlikely(!sock)) + return ret; - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - msg.msg_iocb = NULL; - msg.msg_flags = 0; - - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; - - ret = sock_recvmsg(sock, &msg, flags); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; + if (req->flags & REQ_F_BUFFER_SELECT) { + kbuf = io_recv_buffer_select(req, !force_nonblock); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + buf = u64_to_user_ptr(kbuf->addr); } - io_cqring_add_event(req, ret); + ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); + if (unlikely(ret)) + goto out_free; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + msg.msg_iocb = NULL; + msg.msg_flags = 0; + + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = sock_recvmsg(sock, &msg, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = 
-EINTR; +out_free: + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_recv_kbuf(req); if (ret < 0) req_set_fail_links(req); - io_put_req_find_next(req, nxt); + __io_req_complete(req, ret, cflags, cs); return 0; -#else - return -EOPNOTSUPP; -#endif } - static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { -#if defined(CONFIG_NET) struct io_accept *accept = &req->accept; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) @@ -3321,68 +4412,36 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); + accept->nofile = rlimit(RLIMIT_NOFILE); return 0; -#else - return -EOPNOTSUPP; -#endif } -#if defined(CONFIG_NET) -static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_accept(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_accept *accept = &req->accept; - unsigned file_flags; + unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; int ret; - file_flags = force_nonblock ? 
O_NONBLOCK : 0; + if (req->file->f_flags & O_NONBLOCK) + req->flags |= REQ_F_NOWAIT; + ret = __sys_accept4_file(req->file, file_flags, accept->addr, - accept->addr_len, accept->flags); + accept->addr_len, accept->flags, + accept->nofile); if (ret == -EAGAIN && force_nonblock) return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret < 0) + if (ret < 0) { + if (ret == -ERESTARTSYS) + ret = -EINTR; req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); - return 0; -} - -static void io_accept_finish(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct io_kiocb *nxt = NULL; - - if (io_req_cancelled(req)) - return; - __io_accept(req, &nxt, false); - if (nxt) - io_wq_assign_next(workptr, nxt); -} -#endif - -static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) -{ -#if defined(CONFIG_NET) - int ret; - - ret = __io_accept(req, nxt, force_nonblock); - if (ret == -EAGAIN && force_nonblock) { - req->work.func = io_accept_finish; - io_put_req(req); - return -EAGAIN; } + __io_req_complete(req, ret, 0, cs); return 0; -#else - return -EOPNOTSUPP; -#endif } static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { -#if defined(CONFIG_NET) struct io_connect *conn = &req->connect; struct io_async_ctx *io = req->io; @@ -3399,15 +4458,11 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return move_addr_to_kernel(conn->addr, conn->addr_len, &io->connect.address); -#else - return -EOPNOTSUPP; -#endif } -static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_connect(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { -#if defined(CONFIG_NET) struct io_async_ctx __io, *io; unsigned file_flags; int ret; @@ -3442,287 +4497,584 @@ static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt, out: if (ret < 0) 
req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + __io_req_complete(req, ret, 0, cs); return 0; -#else +} +#else /* !CONFIG_NET */ +static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ return -EOPNOTSUPP; -#endif } -static void io_poll_remove_one(struct io_kiocb *req) +static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { - struct io_poll_iocb *poll = &req->poll; + return -EOPNOTSUPP; +} - spin_lock(&poll->head->lock); - WRITE_ONCE(poll->canceled, true); - if (!list_empty(&poll->wait.entry)) { - list_del_init(&poll->wait.entry); - io_queue_async_work(req); - } - spin_unlock(&poll->head->lock); - hash_del(&req->hash_node); +static int io_send(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) +{ + return -EOPNOTSUPP; } -static void io_poll_remove_all(struct io_ring_ctx *ctx) +static int io_recvmsg_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { - struct hlist_node *tmp; + return -EOPNOTSUPP; +} + +static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) +{ + return -EOPNOTSUPP; +} + +static int io_recv(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) +{ + return -EOPNOTSUPP; +} + +static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return -EOPNOTSUPP; +} + +static int io_accept(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) +{ + return -EOPNOTSUPP; +} + +static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return -EOPNOTSUPP; +} + +static int io_connect(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_NET */ + +struct io_poll_table { + struct poll_table_struct pt; struct io_kiocb *req; - int i; + int error; +}; - spin_lock_irq(&ctx->completion_lock); - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { 
- struct hlist_head *list; +static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, + __poll_t mask, task_work_func_t func) +{ + bool twa_signal_ok; + int ret; - list = &ctx->cancel_hash[i]; - hlist_for_each_entry_safe(req, tmp, list, hash_node) - io_poll_remove_one(req); + /* for instances that support it check for an event match first: */ + if (mask && !(mask & poll->events)) + return 0; + + trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); + + list_del_init(&poll->wait.entry); + + req->result = mask; + init_task_work(&req->task_work, func); + percpu_ref_get(&req->ctx->refs); + + /* + * If we using the signalfd wait_queue_head for this wakeup, then + * it's not safe to use TWA_SIGNAL as we could be recursing on the + * tsk->sighand->siglock on doing the wakeup. Should not be needed + * either, as the normal wakeup will suffice. + */ + twa_signal_ok = (poll->head != &req->task->sighand->signalfd_wqh); + + /* + * If this fails, then the task is exiting. When a task exits, the + * work gets canceled, so just cancel this request as well instead + * of executing it. We can't safely execute it anyway, as we may not + * have the needed state needed for it anyway. 
+ */ + ret = io_req_task_work_add(req, &req->task_work, twa_signal_ok); + if (unlikely(ret)) { + struct task_struct *tsk; + + WRITE_ONCE(poll->canceled, true); + tsk = io_wq_get_task(req->ctx->io_wq); + task_work_add(tsk, &req->task_work, 0); + wake_up_process(tsk); } - spin_unlock_irq(&ctx->completion_lock); + return 1; } -static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) +static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) + __acquires(&req->ctx->completion_lock) { - struct hlist_head *list; - struct io_kiocb *req; + struct io_ring_ctx *ctx = req->ctx; - list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; - hlist_for_each_entry(req, list, hash_node) { - if (sqe_addr == req->user_data) { - io_poll_remove_one(req); - return 0; - } + if (!req->result && !READ_ONCE(poll->canceled)) { + struct poll_table_struct pt = { ._key = poll->events }; + + req->result = vfs_poll(req->file, &pt) & poll->events; } - return -ENOENT; + spin_lock_irq(&ctx->completion_lock); + if (!req->result && !READ_ONCE(poll->canceled)) { + add_wait_queue(poll->head, &poll->wait); + return true; + } + + return false; } -static int io_poll_remove_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) +static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) { - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || - sqe->poll_events) - return -EINVAL; + /* pure poll stashes this in ->io, poll driven retry elsewhere */ + if (req->opcode == IORING_OP_POLL_ADD) + return (struct io_poll_iocb *) req->io; + return req->apoll->double_poll; +} - req->poll.addr = READ_ONCE(sqe->addr); - return 0; +static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) +{ + if (req->opcode == IORING_OP_POLL_ADD) + return &req->poll; + return &req->apoll->poll; } -/* - * Find a running poll command that matches one specified in sqe->addr, - * and remove it 
if found. - */ -static int io_poll_remove(struct io_kiocb *req) +static void io_poll_remove_double(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - u64 addr; - int ret; + struct io_poll_iocb *poll = io_poll_get_double(req); - addr = req->poll.addr; - spin_lock_irq(&ctx->completion_lock); - ret = io_poll_cancel(ctx, addr); - spin_unlock_irq(&ctx->completion_lock); + lockdep_assert_held(&req->ctx->completion_lock); - io_cqring_add_event(req, ret); - if (ret < 0) - req_set_fail_links(req); - io_put_req(req); - return 0; + if (poll && poll->head) { + struct wait_queue_head *head = poll->head; + + spin_lock(&head->lock); + list_del_init(&poll->wait.entry); + if (poll->wait.private) + refcount_dec(&req->refs); + poll->head = NULL; + spin_unlock(&head->lock); + } } static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) { struct io_ring_ctx *ctx = req->ctx; + io_poll_remove_double(req); req->poll.done = true; - if (error) - io_cqring_fill_event(req, error); - else - io_cqring_fill_event(req, mangle_poll(mask)); + io_cqring_fill_event(req, error ? error : mangle_poll(mask)); io_commit_cqring(ctx); } -static void io_poll_complete_work(struct io_wq_work **workptr) +static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) { - struct io_wq_work *work = *workptr; - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_poll_iocb *poll = &req->poll; - struct poll_table_struct pt = { ._key = poll->events }; struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *nxt = NULL; - __poll_t mask = 0; - int ret = 0; - if (work->flags & IO_WQ_WORK_CANCEL) { - WRITE_ONCE(poll->canceled, true); - ret = -ECANCELED; - } else if (READ_ONCE(poll->canceled)) { - ret = -ECANCELED; - } - - if (ret != -ECANCELED) - mask = vfs_poll(poll->file, &pt) & poll->events; - - /* - * Note that ->ki_cancel callers also delete iocb from active_reqs after - * calling ->ki_cancel. 
We need the ctx_lock roundtrip here to - * synchronize with them. In the cancellation case the list_del_init - * itself is not actually needed, but harmless so we keep it in to - * avoid further branches in the fast path. - */ - spin_lock_irq(&ctx->completion_lock); - if (!mask && ret != -ECANCELED) { - add_wait_queue(poll->head, &poll->wait); + if (io_poll_rewait(req, &req->poll)) { spin_unlock_irq(&ctx->completion_lock); return; } + hash_del(&req->hash_node); - io_poll_complete(req, mask, ret); + io_poll_complete(req, req->result, 0); + req->flags |= REQ_F_COMP_LOCKED; + *nxt = io_put_req_find_next(req); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); +} - if (ret < 0) - req_set_fail_links(req); - io_put_req_find_next(req, &nxt); +static void io_poll_task_func(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *nxt = NULL; + + io_poll_task_handler(req, &nxt); if (nxt) - io_wq_assign_next(workptr, nxt); + __io_req_task_submit(nxt); + percpu_ref_put(&ctx->refs); } -static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes) +static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, + int sync, void *key) { - struct io_kiocb *req, *tmp; - struct req_batch rb; + struct io_kiocb *req = wait->private; + struct io_poll_iocb *poll = io_poll_get_single(req); + __poll_t mask = key_to_poll(key); - rb.to_free = rb.need_iter = 0; - spin_lock_irq(&ctx->completion_lock); - llist_for_each_entry_safe(req, tmp, nodes, llist_node) { - hash_del(&req->hash_node); - io_poll_complete(req, req->result, 0); + /* for instances that support it check for an event match first: */ + if (mask && !(mask & poll->events)) + return 0; - if (refcount_dec_and_test(&req->refs) && - !io_req_multi_free(&rb, req)) { - req->flags |= REQ_F_COMP_LOCKED; - io_free_req(req); + if (poll && poll->head) { + bool done; + + spin_lock(&poll->head->lock); + 
done = list_empty(&poll->wait.entry); + if (!done) + list_del_init(&poll->wait.entry); + /* make sure double remove sees this as being gone */ + wait->private = NULL; + spin_unlock(&poll->head->lock); + if (!done) + __io_async_wake(req, poll, mask, io_poll_task_func); + } + refcount_dec(&req->refs); + return 1; +} + +static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, + wait_queue_func_t wake_func) +{ + poll->head = NULL; + poll->done = false; + poll->canceled = false; + poll->events = events; + INIT_LIST_HEAD(&poll->wait.entry); + init_waitqueue_func_entry(&poll->wait, wake_func); +} + +static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, + struct wait_queue_head *head, + struct io_poll_iocb **poll_ptr) +{ + struct io_kiocb *req = pt->req; + + /* + * If poll->head is already set, it's because the file being polled + * uses multiple waitqueues for poll handling (eg one for read, one + * for write). Setup a separate io_poll_iocb if this happens. 
+ */ + if (unlikely(poll->head)) { + /* already have a 2nd entry, fail a third attempt */ + if (*poll_ptr) { + pt->error = -EINVAL; + return; + } + poll = kmalloc(sizeof(*poll), GFP_ATOMIC); + if (!poll) { + pt->error = -ENOMEM; + return; } + io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake); + refcount_inc(&req->refs); + poll->wait.private = req; + *poll_ptr = poll; } - spin_unlock_irq(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - io_free_req_many(ctx, &rb); + pt->error = 0; + poll->head = head; + + if (poll->events & EPOLLEXCLUSIVE) + add_wait_queue_exclusive(head, &poll->wait); + else + add_wait_queue(head, &poll->wait); } -static void io_poll_flush(struct io_wq_work **workptr) +static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, + struct poll_table_struct *p) { - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct llist_node *nodes; + struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + struct async_poll *apoll = pt->req->apoll; - nodes = llist_del_all(&req->ctx->poll_llist); - if (nodes) - __io_poll_flush(req->ctx, nodes); + __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); } -static void io_poll_trigger_evfd(struct io_wq_work **workptr) +static void io_async_task_func(struct callback_head *cb) { - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct async_poll *apoll = req->apoll; + struct io_ring_ctx *ctx = req->ctx; - eventfd_signal(req->ctx->cq_ev_fd, 1); - io_put_req(req); + trace_io_uring_task_run(req->ctx, req->opcode, req->user_data); + + if (io_poll_rewait(req, &apoll->poll)) { + spin_unlock_irq(&ctx->completion_lock); + percpu_ref_put(&ctx->refs); + return; + } + + /* If req is still hashed, it cannot have been canceled. Don't check. 
*/ + if (hash_hashed(&req->hash_node)) + hash_del(&req->hash_node); + + io_poll_remove_double(req); + spin_unlock_irq(&ctx->completion_lock); + + if (!READ_ONCE(apoll->poll.canceled)) + __io_req_task_submit(req); + else + __io_req_task_cancel(req, -ECANCELED); + + percpu_ref_put(&ctx->refs); + kfree(apoll->double_poll); + kfree(apoll); } -static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, +static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { - struct io_poll_iocb *poll = wait->private; - struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); + struct io_kiocb *req = wait->private; + struct io_poll_iocb *poll = &req->apoll->poll; + + trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data, + key_to_poll(key)); + + return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func); +} + +static void io_poll_req_insert(struct io_kiocb *req) +{ struct io_ring_ctx *ctx = req->ctx; - __poll_t mask = key_to_poll(key); + struct hlist_head *list; - /* for instances that support it check for an event match first: */ - if (mask && !(mask & poll->events)) - return 0; + list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; + hlist_add_head(&req->hash_node, list); +} - list_del_init(&poll->wait.entry); +static __poll_t __io_arm_poll_handler(struct io_kiocb *req, + struct io_poll_iocb *poll, + struct io_poll_table *ipt, __poll_t mask, + wait_queue_func_t wake_func) + __acquires(&ctx->completion_lock) +{ + struct io_ring_ctx *ctx = req->ctx; + bool cancel = false; - /* - * Run completion inline if we can. We're using trylock here because - * we are violating the completion_lock -> poll wq lock ordering. - * If we have a link timeout we're going to need the completion_lock - * for finalizing the request, mark us as having grabbed that already. 
- */ - if (mask) { - unsigned long flags; + io_init_poll_iocb(poll, mask, wake_func); + poll->file = req->file; + poll->wait.private = req; - if (llist_empty(&ctx->poll_llist) && - spin_trylock_irqsave(&ctx->completion_lock, flags)) { - bool trigger_ev; + ipt->pt._key = mask; + ipt->req = req; + ipt->error = -EINVAL; - hash_del(&req->hash_node); - io_poll_complete(req, mask, 0); + mask = vfs_poll(req->file, &ipt->pt) & poll->events; - trigger_ev = io_should_trigger_evfd(ctx); - if (trigger_ev && eventfd_signal_count()) { - trigger_ev = false; - req->work.func = io_poll_trigger_evfd; - } else { - req->flags |= REQ_F_COMP_LOCKED; - io_put_req(req); - req = NULL; - } - spin_unlock_irqrestore(&ctx->completion_lock, flags); - __io_cqring_ev_posted(ctx, trigger_ev); - } else { - req->result = mask; - req->llist_node.next = NULL; - /* if the list wasn't empty, we're done */ - if (!llist_add(&req->llist_node, &ctx->poll_llist)) - req = NULL; - else - req->work.func = io_poll_flush; + spin_lock_irq(&ctx->completion_lock); + if (likely(poll->head)) { + spin_lock(&poll->head->lock); + if (unlikely(list_empty(&poll->wait.entry))) { + if (ipt->error) + cancel = true; + ipt->error = 0; + mask = 0; } + if (mask || ipt->error) + list_del_init(&poll->wait.entry); + else if (cancel) + WRITE_ONCE(poll->canceled, true); + else if (!poll->done) /* actually waiting for an event */ + io_poll_req_insert(req); + spin_unlock(&poll->head->lock); } - if (req) - io_queue_async_work(req); - return 1; + return mask; } -struct io_poll_table { - struct poll_table_struct pt; +static bool io_arm_poll_handler(struct io_kiocb *req) +{ + const struct io_op_def *def = &io_op_defs[req->opcode]; + struct io_ring_ctx *ctx = req->ctx; + struct async_poll *apoll; + struct io_poll_table ipt; + __poll_t mask, ret; + int rw; + + if (!req->file || !file_can_poll(req->file)) + return false; + if (req->flags & REQ_F_POLLED) + return false; + if (def->pollin) + rw = READ; + else if (def->pollout) + rw = WRITE; + 
else + return false; + /* if we can't nonblock try, then no point in arming a poll handler */ + if (!io_file_supports_async(req->file, rw)) + return false; + + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); + if (unlikely(!apoll)) + return false; + apoll->double_poll = NULL; + + req->flags |= REQ_F_POLLED; + io_get_req_task(req); + req->apoll = apoll; + INIT_HLIST_NODE(&req->hash_node); + + mask = 0; + if (def->pollin) + mask |= POLLIN | POLLRDNORM; + if (def->pollout) + mask |= POLLOUT | POLLWRNORM; + mask |= POLLERR | POLLPRI; + + ipt.pt._qproc = io_async_queue_proc; + + ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, + io_async_wake); + if (ret || ipt.error) { + io_poll_remove_double(req); + spin_unlock_irq(&ctx->completion_lock); + kfree(apoll->double_poll); + kfree(apoll); + return false; + } + spin_unlock_irq(&ctx->completion_lock); + trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask, + apoll->poll.events); + return true; +} + +static bool __io_poll_remove_one(struct io_kiocb *req, + struct io_poll_iocb *poll) +{ + bool do_complete = false; + + spin_lock(&poll->head->lock); + WRITE_ONCE(poll->canceled, true); + if (!list_empty(&poll->wait.entry)) { + list_del_init(&poll->wait.entry); + do_complete = true; + } + spin_unlock(&poll->head->lock); + hash_del(&req->hash_node); + return do_complete; +} + +static bool io_poll_remove_one(struct io_kiocb *req) +{ + bool do_complete; + + io_poll_remove_double(req); + + if (req->opcode == IORING_OP_POLL_ADD) { + do_complete = __io_poll_remove_one(req, &req->poll); + } else { + struct async_poll *apoll = req->apoll; + + /* non-poll requests have submit ref still */ + do_complete = __io_poll_remove_one(req, &apoll->poll); + if (do_complete) { + io_put_req(req); + kfree(apoll->double_poll); + kfree(apoll); + } + } + + if (do_complete) { + io_cqring_fill_event(req, -ECANCELED); + io_commit_cqring(req->ctx); + req->flags |= REQ_F_COMP_LOCKED; + req_set_fail_links(req); + io_put_req(req); + } + + return 
do_complete; +} + +static void io_poll_remove_all(struct io_ring_ctx *ctx) +{ + struct hlist_node *tmp; struct io_kiocb *req; - int error; -}; + int posted = 0, i; -static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, - struct poll_table_struct *p) + spin_lock_irq(&ctx->completion_lock); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry_safe(req, tmp, list, hash_node) + posted += io_poll_remove_one(req); + } + spin_unlock_irq(&ctx->completion_lock); + + if (posted) + io_cqring_ev_posted(ctx); +} + +static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) { - struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + struct hlist_head *list; + struct io_kiocb *req; - if (unlikely(pt->req->poll.head)) { - pt->error = -EINVAL; - return; + list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; + hlist_for_each_entry(req, list, hash_node) { + if (sqe_addr != req->user_data) + continue; + if (io_poll_remove_one(req)) + return 0; + return -EALREADY; } - pt->error = 0; - pt->req->poll.head = head; - add_wait_queue(head, &pt->req->poll.wait); + return -ENOENT; } -static void io_poll_req_insert(struct io_kiocb *req) +static int io_poll_remove_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || + sqe->poll_events) + return -EINVAL; + + req->poll.addr = READ_ONCE(sqe->addr); + return 0; +} + +/* + * Find a running poll command that matches one specified in sqe->addr, + * and remove it if found. 
+ */ +static int io_poll_remove(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct hlist_head *list; + u64 addr; + int ret; - list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; - hlist_add_head(&req->hash_node, list); + addr = req->poll.addr; + spin_lock_irq(&ctx->completion_lock); + ret = io_poll_cancel(ctx, addr); + spin_unlock_irq(&ctx->completion_lock); + + if (ret < 0) + req_set_fail_links(req); + io_req_complete(req, ret); + return 0; +} + +static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + void *key) +{ + struct io_kiocb *req = wait->private; + struct io_poll_iocb *poll = &req->poll; + + return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func); +} + +static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, + struct poll_table_struct *p) +{ + struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + + __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->io); } static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll_iocb *poll = &req->poll; - u16 events; + u32 events; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -3731,57 +5083,30 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (!poll->file) return -EBADF; - events = READ_ONCE(sqe->poll_events); - poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; + events = READ_ONCE(sqe->poll32_events); +#ifdef __BIG_ENDIAN + events = swahw32(events); +#endif + poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP | + (events & EPOLLEXCLUSIVE); + + io_get_req_task(req); return 0; } -static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt) +static int io_poll_add(struct io_kiocb *req) { struct io_poll_iocb *poll = &req->poll; struct io_ring_ctx *ctx = req->ctx; struct io_poll_table ipt; - bool cancel = false; __poll_t mask; - 
INIT_IO_WORK(&req->work, io_poll_complete_work); INIT_HLIST_NODE(&req->hash_node); - - poll->head = NULL; - poll->done = false; - poll->canceled = false; - ipt.pt._qproc = io_poll_queue_proc; - ipt.pt._key = poll->events; - ipt.req = req; - ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ - - /* initialized the list so that we can do list_empty checks */ - INIT_LIST_HEAD(&poll->wait.entry); - init_waitqueue_func_entry(&poll->wait, io_poll_wake); - poll->wait.private = poll; - - INIT_LIST_HEAD(&req->list); - mask = vfs_poll(poll->file, &ipt.pt) & poll->events; + mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, + io_poll_wake); - spin_lock_irq(&ctx->completion_lock); - if (likely(poll->head)) { - spin_lock(&poll->head->lock); - if (unlikely(list_empty(&poll->wait.entry))) { - if (ipt.error) - cancel = true; - ipt.error = 0; - mask = 0; - } - if (mask || ipt.error) - list_del_init(&poll->wait.entry); - else if (cancel) - WRITE_ONCE(poll->canceled, true); - else if (!poll->done) /* actually waiting for an event */ - io_poll_req_insert(req); - spin_unlock(&poll->head->lock); - } if (mask) { /* no async, we'd stolen it */ ipt.error = 0; io_poll_complete(req, mask, 0); @@ -3790,7 +5115,7 @@ static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt) if (mask) { io_cqring_ev_posted(ctx); - io_put_req_find_next(req, nxt); + io_put_req(req); } return ipt.error; } @@ -3803,27 +5128,16 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - atomic_inc(&ctx->cq_timeouts); - spin_lock_irqsave(&ctx->completion_lock, flags); + atomic_set(&req->ctx->cq_timeouts, + atomic_read(&req->ctx->cq_timeouts) + 1); + /* * We could be racing with timeout deletion. If the list is empty, * then timeout lookup already found it and will be handling it. 
*/ - if (!list_empty(&req->list)) { - struct io_kiocb *prev; - - /* - * Adjust the reqs sequence before the current one because it - * will consume a slot in the cq_ring and the cq_tail - * pointer will be increased, otherwise other timeout reqs may - * return in advance without waiting for enough wait_nr. - */ - prev = req; - list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list) - prev->sequence++; - list_del_init(&req->list); - } + if (!list_empty(&req->timeout.list)) + list_del_init(&req->timeout.list); io_cqring_fill_event(req, -ETIME); io_commit_cqring(ctx); @@ -3835,14 +5149,30 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +static int __io_timeout_cancel(struct io_kiocb *req) +{ + int ret; + + list_del_init(&req->timeout.list); + + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); + if (ret == -1) + return -EALREADY; + + req_set_fail_links(req); + req->flags |= REQ_F_COMP_LOCKED; + io_cqring_fill_event(req, -ECANCELED); + io_put_req(req); + return 0; +} + static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) { struct io_kiocb *req; int ret = -ENOENT; - list_for_each_entry(req, &ctx->timeout_list, list) { + list_for_each_entry(req, &ctx->timeout_list, timeout.list) { if (user_data == req->user_data) { - list_del_init(&req->list); ret = 0; break; } @@ -3851,14 +5181,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) if (ret == -ENOENT) return ret; - ret = hrtimer_try_to_cancel(&req->io->timeout.timer); - if (ret == -1) - return -EALREADY; - - req_set_fail_links(req); - io_cqring_fill_event(req, -ECANCELED); - io_put_req(req); - return 0; + return __io_timeout_cancel(req); } static int io_timeout_remove_prep(struct io_kiocb *req, @@ -3866,7 +5189,9 @@ static int io_timeout_remove_prep(struct io_kiocb *req, { if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len) + if 
(unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) + return -EINVAL; + if (sqe->ioprio || sqe->buf_index || sqe->len) return -EINVAL; req->timeout.addr = READ_ONCE(sqe->addr); @@ -3903,25 +5228,25 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, { struct io_timeout_data *data; unsigned flags; + u32 off = READ_ONCE(sqe->off); if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->buf_index || sqe->len != 1) return -EINVAL; - if (sqe->off && is_timeout_link) + if (off && is_timeout_link) return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); if (flags & ~IORING_TIMEOUT_ABS) return -EINVAL; - req->timeout.count = READ_ONCE(sqe->off); + req->timeout.off = off; if (!req->io && io_alloc_async_ctx(req)) return -ENOMEM; data = &req->io->timeout; data->req = req; - req->flags |= REQ_F_TIMEOUT; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; @@ -3937,72 +5262,42 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, static int io_timeout(struct io_kiocb *req) { - unsigned count; struct io_ring_ctx *ctx = req->ctx; - struct io_timeout_data *data; + struct io_timeout_data *data = &req->io->timeout; struct list_head *entry; - unsigned span = 0; + u32 tail, off = req->timeout.off; - data = &req->io->timeout; + spin_lock_irq(&ctx->completion_lock); /* * sqe->off holds how many events that need to occur for this * timeout event to be satisfied. If it isn't set, then this is * a pure timeout request, sequence isn't used. 
*/ - count = req->timeout.count; - if (!count) { - req->flags |= REQ_F_TIMEOUT_NOSEQ; - spin_lock_irq(&ctx->completion_lock); + if (io_is_timeout_noseq(req)) { entry = ctx->timeout_list.prev; goto add; } - req->sequence = ctx->cached_sq_head + count - 1; - data->seq_offset = count; + tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + req->timeout.target_seq = tail + off; /* * Insertion sort, ensuring the first entry in the list is always * the one we need first. */ - spin_lock_irq(&ctx->completion_lock); list_for_each_prev(entry, &ctx->timeout_list) { - struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); - unsigned nxt_sq_head; - long long tmp, tmp_nxt; - u32 nxt_offset = nxt->io->timeout.seq_offset; + struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, + timeout.list); - if (nxt->flags & REQ_F_TIMEOUT_NOSEQ) + if (io_is_timeout_noseq(nxt)) continue; - - /* - * Since cached_sq_head + count - 1 can overflow, use type long - * long to store it. - */ - tmp = (long long)ctx->cached_sq_head + count - 1; - nxt_sq_head = nxt->sequence - nxt_offset + 1; - tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1; - - /* - * cached_sq_head may overflow, and it will never overflow twice - * once there is some timeout req still be valid. - */ - if (ctx->cached_sq_head < nxt_sq_head) - tmp += UINT_MAX; - - if (tmp > tmp_nxt) + /* nxt.seq is behind @tail, otherwise would've been completed */ + if (off >= nxt->timeout.target_seq - tail) break; - - /* - * Sequence of reqs after the insert one and itself should - * be adjusted because each timeout req consumes a slot. 
- */ - span++; - nxt->sequence++; } - req->sequence -= span; add: - list_add(&req->list, entry); + list_add(&req->timeout.list, entry); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->completion_lock); @@ -4021,7 +5316,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr) enum io_wq_cancel cancel_ret; int ret = 0; - cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr); + cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; @@ -4039,7 +5334,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr) static void io_async_find_and_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req, __u64 sqe_addr, - struct io_kiocb **nxt, int success_ret) + int success_ret) { unsigned long flags; int ret; @@ -4065,7 +5360,7 @@ done: if (ret < 0) req_set_fail_links(req); - io_put_req_find_next(req, nxt); + io_put_req(req); } static int io_async_cancel_prep(struct io_kiocb *req, @@ -4073,26 +5368,29 @@ static int io_async_cancel_prep(struct io_kiocb *req, { if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->flags || sqe->ioprio || sqe->off || sqe->len || - sqe->cancel_flags) + if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags) return -EINVAL; req->cancel.addr = READ_ONCE(sqe->addr); return 0; } -static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) +static int io_async_cancel(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0); + io_async_find_and_cancel(ctx, req, req->cancel.addr, 0); return 0; } static int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->flags || sqe->ioprio || sqe->rw_flags) + if (unlikely(req->flags & 
(REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) + return -EINVAL; + if (sqe->ioprio || sqe->rw_flags) return -EINVAL; req->files_update.offset = READ_ONCE(sqe->off); @@ -4103,7 +5401,8 @@ static int io_files_update_prep(struct io_kiocb *req, return 0; } -static int io_files_update(struct io_kiocb *req, bool force_nonblock) +static int io_files_update(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; struct io_uring_files_update up; @@ -4121,8 +5420,7 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -4131,13 +5429,14 @@ static int io_req_defer_prep(struct io_kiocb *req, { ssize_t ret = 0; - if (io_op_defs[req->opcode].file_table) { - ret = io_grab_files(req); - if (unlikely(ret)) - return ret; - } + if (!sqe) + return 0; - io_req_work_grab_env(req, &io_op_defs[req->opcode]); + if (io_alloc_async_ctx(req)) + return -EAGAIN; + ret = io_prep_work_files(req); + if (unlikely(ret)) + return ret; switch (req->opcode) { case IORING_OP_NOP: @@ -4217,6 +5516,18 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_EPOLL_CTL: ret = io_epoll_ctl_prep(req, sqe); break; + case IORING_OP_SPLICE: + ret = io_splice_prep(req, sqe); + break; + case IORING_OP_PROVIDE_BUFFERS: + ret = io_provide_buffers_prep(req, sqe); + break; + case IORING_OP_REMOVE_BUFFERS: + ret = io_remove_buffers_prep(req, sqe); + break; + case IORING_OP_TEE: + ret = io_tee_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -4227,72 +5538,129 @@ static int io_req_defer_prep(struct io_kiocb *req, return ret; } +static u32 io_get_sequence(struct io_kiocb *req) +{ + struct io_kiocb *pos; + struct io_ring_ctx *ctx = req->ctx; + u32 total_submitted, nr_reqs = 1; + + if (req->flags & REQ_F_LINK_HEAD) + list_for_each_entry(pos, 
&req->link_list, link_list) + nr_reqs++; + + total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped; + return total_submitted - nr_reqs; +} + static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; + struct io_defer_entry *de; int ret; + u32 seq; /* Still need defer if there is pending req in defer list. */ - if (!req_need_defer(req) && list_empty(&ctx->defer_list)) + if (likely(list_empty_careful(&ctx->defer_list) && + !(req->flags & REQ_F_IO_DRAIN))) return 0; - if (!req->io && io_alloc_async_ctx(req)) - return -EAGAIN; + seq = io_get_sequence(req); + /* Still a chance to pass the sequence check */ + if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) + return 0; - ret = io_req_defer_prep(req, sqe); - if (ret < 0) - return ret; + if (!req->io) { + ret = io_req_defer_prep(req, sqe); + if (ret) + return ret; + } + io_prep_async_link(req); + de = kmalloc(sizeof(*de), GFP_KERNEL); + if (!de) + return -ENOMEM; spin_lock_irq(&ctx->completion_lock); - if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { + if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); - return 0; + kfree(de); + io_queue_async_work(req); + return -EIOCBQUEUED; } trace_io_uring_defer(ctx, req, req->user_data); - list_add_tail(&req->list, &ctx->defer_list); + de->req = req; + de->seq = seq; + list_add_tail(&de->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); return -EIOCBQUEUED; } -static void io_cleanup_req(struct io_kiocb *req) +static void __io_clean_op(struct io_kiocb *req) { struct io_async_ctx *io = req->io; - switch (req->opcode) { - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - if (io->rw.iov != io->rw.fast_iov) - kfree(io->rw.iov); - break; - case IORING_OP_SENDMSG: - case IORING_OP_RECVMSG: - if (io->msg.iov != 
io->msg.fast_iov) - kfree(io->msg.iov); - break; - case IORING_OP_OPENAT: - case IORING_OP_OPENAT2: - case IORING_OP_STATX: - putname(req->open.filename); - break; + if (req->flags & REQ_F_BUFFER_SELECTED) { + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: + kfree((void *)(unsigned long)req->rw.addr); + break; + case IORING_OP_RECVMSG: + case IORING_OP_RECV: + kfree(req->sr_msg.kbuf); + break; + } + req->flags &= ~REQ_F_BUFFER_SELECTED; + } + + if (req->flags & REQ_F_NEED_CLEANUP) { + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: + if (io->rw.free_iovec) + kfree(io->rw.free_iovec); + break; + case IORING_OP_RECVMSG: + case IORING_OP_SENDMSG: + if (io->msg.iov != io->msg.fast_iov) + kfree(io->msg.iov); + break; + case IORING_OP_SPLICE: + case IORING_OP_TEE: + io_put_file(req, req->splice.file_in, + (req->splice.flags & SPLICE_F_FD_IN_FIXED)); + break; + } + req->flags &= ~REQ_F_NEED_CLEANUP; } - req->flags &= ~REQ_F_NEED_CLEANUP; + if (req->flags & REQ_F_INFLIGHT) { + struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->inflight_lock, flags); + list_del(&req->inflight_entry); + if (waitqueue_active(&ctx->inflight_wait)) + wake_up(&ctx->inflight_wait); + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + req->flags &= ~REQ_F_INFLIGHT; + } } static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) + bool force_nonblock, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; int ret; switch (req->opcode) { case IORING_OP_NOP: - ret = io_nop(req); + ret = io_nop(req, cs); break; case IORING_OP_READV: case IORING_OP_READ_FIXED: @@ -4302,7 +5670,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_read(req, nxt, 
force_nonblock); + ret = io_read(req, force_nonblock, cs); break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: @@ -4312,7 +5680,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_write(req, nxt, force_nonblock); + ret = io_write(req, force_nonblock, cs); break; case IORING_OP_FSYNC: if (sqe) { @@ -4320,7 +5688,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_fsync(req, nxt, force_nonblock); + ret = io_fsync(req, force_nonblock); break; case IORING_OP_POLL_ADD: if (sqe) { @@ -4328,7 +5696,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_poll_add(req, nxt); + ret = io_poll_add(req); break; case IORING_OP_POLL_REMOVE: if (sqe) { @@ -4344,7 +5712,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_sync_file_range(req, nxt, force_nonblock); + ret = io_sync_file_range(req, force_nonblock); break; case IORING_OP_SENDMSG: case IORING_OP_SEND: @@ -4354,9 +5722,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; } if (req->opcode == IORING_OP_SENDMSG) - ret = io_sendmsg(req, nxt, force_nonblock); + ret = io_sendmsg(req, force_nonblock, cs); else - ret = io_send(req, nxt, force_nonblock); + ret = io_send(req, force_nonblock, cs); break; case IORING_OP_RECVMSG: case IORING_OP_RECV: @@ -4366,9 +5734,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; } if (req->opcode == IORING_OP_RECVMSG) - ret = io_recvmsg(req, nxt, force_nonblock); + ret = io_recvmsg(req, force_nonblock, cs); else - ret = io_recv(req, nxt, force_nonblock); + ret = io_recv(req, force_nonblock, cs); break; case IORING_OP_TIMEOUT: if (sqe) { @@ -4392,7 +5760,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = 
io_accept(req, nxt, force_nonblock); + ret = io_accept(req, force_nonblock, cs); break; case IORING_OP_CONNECT: if (sqe) { @@ -4400,7 +5768,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_connect(req, nxt, force_nonblock); + ret = io_connect(req, force_nonblock, cs); break; case IORING_OP_ASYNC_CANCEL: if (sqe) { @@ -4408,7 +5776,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_async_cancel(req, nxt); + ret = io_async_cancel(req); break; case IORING_OP_FALLOCATE: if (sqe) { @@ -4416,7 +5784,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_fallocate(req, nxt, force_nonblock); + ret = io_fallocate(req, force_nonblock); break; case IORING_OP_OPENAT: if (sqe) { @@ -4424,7 +5792,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_openat(req, nxt, force_nonblock); + ret = io_openat(req, force_nonblock); break; case IORING_OP_CLOSE: if (sqe) { @@ -4432,7 +5800,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_close(req, nxt, force_nonblock); + ret = io_close(req, force_nonblock, cs); break; case IORING_OP_FILES_UPDATE: if (sqe) { @@ -4440,7 +5808,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_files_update(req, force_nonblock); + ret = io_files_update(req, force_nonblock, cs); break; case IORING_OP_STATX: if (sqe) { @@ -4448,7 +5816,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_statx(req, nxt, force_nonblock); + ret = io_statx(req, force_nonblock); break; case IORING_OP_FADVISE: if (sqe) { @@ -4456,7 +5824,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_fadvise(req, nxt, 
force_nonblock); + ret = io_fadvise(req, force_nonblock); break; case IORING_OP_MADVISE: if (sqe) { @@ -4464,7 +5832,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_madvise(req, nxt, force_nonblock); + ret = io_madvise(req, force_nonblock); break; case IORING_OP_OPENAT2: if (sqe) { @@ -4472,7 +5840,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_openat2(req, nxt, force_nonblock); + ret = io_openat2(req, force_nonblock); break; case IORING_OP_EPOLL_CTL: if (sqe) { @@ -4480,7 +5848,39 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_epoll_ctl(req, nxt, force_nonblock); + ret = io_epoll_ctl(req, force_nonblock, cs); + break; + case IORING_OP_SPLICE: + if (sqe) { + ret = io_splice_prep(req, sqe); + if (ret < 0) + break; + } + ret = io_splice(req, force_nonblock); + break; + case IORING_OP_PROVIDE_BUFFERS: + if (sqe) { + ret = io_provide_buffers_prep(req, sqe); + if (ret) + break; + } + ret = io_provide_buffers(req, force_nonblock, cs); + break; + case IORING_OP_REMOVE_BUFFERS: + if (sqe) { + ret = io_remove_buffers_prep(req, sqe); + if (ret) + break; + } + ret = io_remove_buffers(req, force_nonblock, cs); + break; + case IORING_OP_TEE: + if (sqe) { + ret = io_tee_prep(req, sqe); + if (ret < 0) + break; + } + ret = io_tee(req, force_nonblock); break; default: ret = -EINVAL; @@ -4490,12 +5890,10 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) return ret; - if (ctx->flags & IORING_SETUP_IOPOLL) { + /* If the op doesn't have a file, we're not polling for it */ + if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) { const bool in_async = io_wq_current_is_worker(); - if (req->result == -EAGAIN) - return -EAGAIN; - /* workqueue context doesn't hold uring_lock, grab it now */ if (in_async) mutex_lock(&ctx->uring_lock); @@ -4509,13 +5907,16 @@ static 
int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static void io_wq_submit_work(struct io_wq_work **workptr) +static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work) { - struct io_wq_work *work = *workptr; struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_kiocb *nxt = NULL; + struct io_kiocb *timeout; int ret = 0; + timeout = io_prep_linked_timeout(req); + if (timeout) + io_queue_linked_timeout(timeout); + /* if NO_CANCEL is set, we must still run the work */ if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) == IO_WQ_WORK_CANCEL) { @@ -4523,9 +5924,8 @@ static void io_wq_submit_work(struct io_wq_work **workptr) } if (!ret) { - req->in_async = true; do { - ret = io_issue_sqe(req, NULL, &nxt, false); + ret = io_issue_sqe(req, NULL, false, NULL); /* * We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -4537,27 +5937,12 @@ static void io_wq_submit_work(struct io_wq_work **workptr) } while (1); } - /* drop submission reference */ - io_put_req(req); - if (ret) { req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); } - /* if a dependent link is ready, pass it back */ - if (!ret && nxt) - io_wq_assign_next(workptr, nxt); -} - -static int io_req_needs_file(struct io_kiocb *req, int fd) -{ - if (!io_op_defs[req->opcode].needs_file) - return 0; - if ((fd == -1 || fd == AT_FDCWD) && io_op_defs[req->opcode].fd_non_neg) - return 0; - return 1; + return io_steal_work(req); } static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, @@ -4566,42 +5951,47 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, struct fixed_file_table *table; table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT]; - return table->files[index & IORING_FILE_TABLE_MASK];; + return table->files[index & IORING_FILE_TABLE_MASK]; } -static int io_req_set_file(struct 
io_submit_state *state, struct io_kiocb *req, - const struct io_uring_sqe *sqe) +static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, + int fd, struct file **out_file, bool fixed) { struct io_ring_ctx *ctx = req->ctx; - unsigned flags; - int fd; - - flags = READ_ONCE(sqe->flags); - fd = READ_ONCE(sqe->fd); - - if (!io_req_needs_file(req, fd)) - return 0; + struct file *file; - if (flags & IOSQE_FIXED_FILE) { + if (fixed) { if (unlikely(!ctx->file_data || (unsigned) fd >= ctx->nr_user_files)) return -EBADF; fd = array_index_nospec(fd, ctx->nr_user_files); - req->file = io_file_from_index(ctx, fd); - if (!req->file) - return -EBADF; - req->flags |= REQ_F_FIXED_FILE; - percpu_ref_get(&ctx->file_data->refs); + file = io_file_from_index(ctx, fd); + if (file) { + req->fixed_file_refs = ctx->file_data->cur_refs; + percpu_ref_get(req->fixed_file_refs); + } } else { - if (req->needs_fixed_file) - return -EBADF; trace_io_uring_file_get(ctx, fd); - req->file = io_file_get(state, fd); - if (unlikely(!req->file)) - return -EBADF; + file = __io_file_get(state, fd); } - return 0; + if (file || io_op_defs[req->opcode].needs_file_no_error) { + *out_file = file; + return 0; + } + return -EBADF; +} + +static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, + int fd) +{ + bool fixed; + + fixed = (req->flags & REQ_F_FIXED_FILE) != 0; + if (unlikely(!fixed && io_async_submit(req->ctx))) + return -EBADF; + + return io_file_get(state, req, fd, &req->file, fixed); } static int io_grab_files(struct io_kiocb *req) @@ -4609,7 +5999,9 @@ static int io_grab_files(struct io_kiocb *req) int ret = -EBADF; struct io_ring_ctx *ctx = req->ctx; - if (req->work.files) + io_req_init_async(req); + + if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE)) return 0; if (!ctx->ring_file) return -EBADF; @@ -4634,6 +6026,13 @@ static int io_grab_files(struct io_kiocb *req) return ret; } +static inline int io_prep_work_files(struct io_kiocb *req) +{ + if 
(!io_op_defs[req->opcode].file_table) + return 0; + return io_grab_files(req); +} + static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, @@ -4663,25 +6062,20 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) if (prev) { req_set_fail_links(prev); - io_async_find_and_cancel(ctx, req, prev->user_data, NULL, - -ETIME); + io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); io_put_req(prev); } else { - io_cqring_add_event(req, -ETIME); - io_put_req(req); + io_req_complete(req, -ETIME); } return HRTIMER_NORESTART; } -static void io_queue_linked_timeout(struct io_kiocb *req) +static void __io_queue_linked_timeout(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - /* * If the list is now empty, then our linked request finished before * we got a chance to setup the timer */ - spin_lock_irq(&ctx->completion_lock); if (!list_empty(&req->link_list)) { struct io_timeout_data *data = &req->io->timeout; @@ -4689,6 +6083,14 @@ static void io_queue_linked_timeout(struct io_kiocb *req) hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); } +} + +static void io_queue_linked_timeout(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->completion_lock); + __io_queue_linked_timeout(req); spin_unlock_irq(&ctx->completion_lock); /* drop submission reference */ @@ -4699,7 +6101,9 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) { struct io_kiocb *nxt; - if (!(req->flags & REQ_F_LINK)) + if (!(req->flags & REQ_F_LINK_HEAD)) + return NULL; + if (req->flags & REQ_F_LINK_TIMEOUT) return NULL; nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, @@ -4711,17 +6115,19 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) return nxt; } -static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static void __io_queue_sqe(struct io_kiocb *req, const struct 
io_uring_sqe *sqe, + struct io_comp_state *cs) { struct io_kiocb *linked_timeout; - struct io_kiocb *nxt = NULL; + struct io_kiocb *nxt; const struct cred *old_creds = NULL; int ret; again: linked_timeout = io_prep_linked_timeout(req); - if (req->work.creds && req->work.creds != current_cred()) { + if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds && + req->work.creds != current_cred()) { if (old_creds) revert_creds(old_creds); if (old_creds == req->work.creds) @@ -4730,60 +6136,59 @@ again: old_creds = override_creds(req->work.creds); } - ret = io_issue_sqe(req, sqe, &nxt, true); + ret = io_issue_sqe(req, sqe, true, cs); /* * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ - if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || - (req->flags & REQ_F_MUST_PUNT))) { + if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { + if (!io_arm_poll_handler(req)) { punt: - if (io_op_defs[req->opcode].file_table) { - ret = io_grab_files(req); - if (ret) + ret = io_prep_work_files(req); + if (unlikely(ret)) goto err; + /* + * Queued up for async execution, worker will release + * submit reference when the iocb is actually submitted. + */ + io_queue_async_work(req); } - /* - * Queued up for async execution, worker will release - * submit reference when the iocb is actually submitted. 
- */ - io_queue_async_work(req); - goto done_req; - } - -err: - /* drop submission reference */ - io_put_req_find_next(req, &nxt); - - if (linked_timeout) { - if (!ret) + if (linked_timeout) io_queue_linked_timeout(linked_timeout); - else - io_put_req(linked_timeout); + goto exit; } - /* and drop final reference, if we failed */ - if (ret) { - io_cqring_add_event(req, ret); + if (unlikely(ret)) { +err: + /* un-prep timeout, so it'll be killed as any other linked */ + req->flags &= ~REQ_F_LINK_TIMEOUT; req_set_fail_links(req); io_put_req(req); + io_req_complete(req, ret); + goto exit; } -done_req: + + /* drop submission reference */ + nxt = io_put_req_find_next(req); + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); + if (nxt) { req = nxt; - nxt = NULL; if (req->flags & REQ_F_FORCE_ASYNC) goto punt; goto again; } +exit: if (old_creds) revert_creds(old_creds); } -static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_comp_state *cs) { int ret; @@ -4791,73 +6196,44 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (ret) { if (ret != -EIOCBQUEUED) { fail_req: - io_cqring_add_event(req, ret); req_set_fail_links(req); - io_double_put_req(req); + io_put_req(req); + io_req_complete(req, ret); } } else if (req->flags & REQ_F_FORCE_ASYNC) { - ret = io_req_defer_prep(req, sqe); - if (unlikely(ret < 0)) - goto fail_req; + if (!req->io) { + ret = io_req_defer_prep(req, sqe); + if (unlikely(ret)) + goto fail_req; + } + /* * Never try inline submit of IOSQE_ASYNC is set, go straight * to async execution. 
*/ + io_req_init_async(req); req->work.flags |= IO_WQ_WORK_CONCURRENT; io_queue_async_work(req); } else { - __io_queue_sqe(req, sqe); + __io_queue_sqe(req, sqe, cs); } } -static inline void io_queue_link_head(struct io_kiocb *req) +static inline void io_queue_link_head(struct io_kiocb *req, + struct io_comp_state *cs) { if (unlikely(req->flags & REQ_F_FAIL_LINK)) { - io_cqring_add_event(req, -ECANCELED); - io_double_put_req(req); + io_put_req(req); + io_req_complete(req, -ECANCELED); } else - io_queue_sqe(req, NULL); + io_queue_sqe(req, NULL, cs); } -#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ - IOSQE_IO_HARDLINK | IOSQE_ASYNC) - -static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_submit_state *state, struct io_kiocb **link) +static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **link, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; - unsigned int sqe_flags; - int ret, id; - - sqe_flags = READ_ONCE(sqe->flags); - - /* enforce forwards compatibility on users */ - if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) { - ret = -EINVAL; - goto err_req; - } - - id = READ_ONCE(sqe->personality); - if (id) { - req->work.creds = idr_find(&ctx->personality_idr, id); - if (unlikely(!req->work.creds)) { - ret = -EINVAL; - goto err_req; - } - get_cred(req->work.creds); - } - - /* same numerical values with corresponding REQ_F_*, safe to copy */ - req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK| - IOSQE_ASYNC); - - ret = io_req_set_file(state, req, sqe); - if (unlikely(ret)) { -err_req: - io_cqring_add_event(req, ret); - io_double_put_req(req); - return false; - } + int ret; /* * If we already have a head request, queue this one for async @@ -4876,47 +6252,44 @@ err_req: * next after the link request. The last one is done via * drain_next flag to persist the effect across calls. 
*/ - if (sqe_flags & IOSQE_IO_DRAIN) { + if (req->flags & REQ_F_IO_DRAIN) { head->flags |= REQ_F_IO_DRAIN; ctx->drain_next = 1; } - if (io_alloc_async_ctx(req)) { - ret = -EAGAIN; - goto err_req; - } - ret = io_req_defer_prep(req, sqe); - if (ret) { + if (unlikely(ret)) { /* fail even hard links since we don't submit */ head->flags |= REQ_F_FAIL_LINK; - goto err_req; + return ret; } trace_io_uring_link(ctx, req, head); + io_get_req_task(req); list_add_tail(&req->link_list, &head->link_list); /* last request of a link, enqueue the link */ - if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) { - io_queue_link_head(head); + if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { + io_queue_link_head(head, cs); *link = NULL; } } else { if (unlikely(ctx->drain_next)) { req->flags |= REQ_F_IO_DRAIN; - req->ctx->drain_next = 0; + ctx->drain_next = 0; } - if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { - req->flags |= REQ_F_LINK; + if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { + req->flags |= REQ_F_LINK_HEAD; INIT_LIST_HEAD(&req->link_list); + ret = io_req_defer_prep(req, sqe); - if (ret) + if (unlikely(ret)) req->flags |= REQ_F_FAIL_LINK; *link = req; } else { - io_queue_sqe(req, sqe); + io_queue_sqe(req, sqe, cs); } } - return true; + return 0; } /* @@ -4924,8 +6297,10 @@ err_req: */ static void io_submit_state_end(struct io_submit_state *state) { + if (!list_empty(&state->comp.list)) + io_submit_flush_completions(&state->comp); blk_finish_plug(&state->plug); - io_file_put(state); + io_state_file_put(state); if (state->free_reqs) kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); } @@ -4934,9 +6309,15 @@ static void io_submit_state_end(struct io_submit_state *state) * Start submission side cache. 
*/ static void io_submit_state_start(struct io_submit_state *state, - unsigned int max_ios) + struct io_ring_ctx *ctx, unsigned int max_ios) { blk_start_plug(&state->plug); +#ifdef CONFIG_BLOCK + state->plug.nowait = true; +#endif + state->comp.nr = 0; + INIT_LIST_HEAD(&state->comp.list); + state->comp.ctx = ctx; state->free_reqs = 0; state->file = NULL; state->ios_left = max_ios; @@ -4962,8 +6343,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) * used, it's important that those reads are done through READ_ONCE() to * prevent a re-load down the line. */ -static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe **sqe_ptr) +static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) { u32 *sq_array = ctx->sq_array; unsigned head; @@ -4977,35 +6357,81 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, * though the application is the one updating it. */ head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]); - if (likely(head < ctx->sq_entries)) { - /* - * All io need record the previous position, if LINK vs DARIN, - * it can be used to mark the position of the first IO in the - * link list. 
- */ - req->sequence = ctx->cached_sq_head; - *sqe_ptr = &ctx->sq_sqes[head]; - req->opcode = READ_ONCE((*sqe_ptr)->opcode); - req->user_data = READ_ONCE((*sqe_ptr)->user_data); - ctx->cached_sq_head++; - return true; - } + if (likely(head < ctx->sq_entries)) + return &ctx->sq_sqes[head]; /* drop invalid entries */ - ctx->cached_sq_head++; ctx->cached_sq_dropped++; WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped); - return false; + return NULL; +} + +static inline void io_consume_sqe(struct io_ring_ctx *ctx) +{ + ctx->cached_sq_head++; +} + +#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ + IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ + IOSQE_BUFFER_SELECT) + +static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, + const struct io_uring_sqe *sqe, + struct io_submit_state *state) +{ + unsigned int sqe_flags; + int id; + + req->opcode = READ_ONCE(sqe->opcode); + req->user_data = READ_ONCE(sqe->user_data); + req->io = NULL; + req->file = NULL; + req->ctx = ctx; + req->flags = 0; + /* one is dropped after submission, the other at completion */ + refcount_set(&req->refs, 2); + req->task = current; + req->result = 0; + + if (unlikely(req->opcode >= IORING_OP_LAST)) + return -EINVAL; + + if (unlikely(io_sq_thread_acquire_mm(ctx, req))) + return -EFAULT; + + sqe_flags = READ_ONCE(sqe->flags); + /* enforce forwards compatibility on users */ + if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) + return -EINVAL; + + if ((sqe_flags & IOSQE_BUFFER_SELECT) && + !io_op_defs[req->opcode].buffer_select) + return -EOPNOTSUPP; + + id = READ_ONCE(sqe->personality); + if (id) { + io_req_init_async(req); + req->work.creds = idr_find(&ctx->personality_idr, id); + if (unlikely(!req->work.creds)) + return -EINVAL; + get_cred(req->work.creds); + } + + /* same numerical values with corresponding REQ_F_*, safe to copy */ + req->flags |= sqe_flags; + + if (!io_op_defs[req->opcode].needs_file) + return 0; + + return io_req_set_file(state, req, 
READ_ONCE(sqe->fd)); } static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, - struct file *ring_file, int ring_fd, - struct mm_struct **mm, bool async) + struct file *ring_file, int ring_fd) { - struct io_submit_state state, *statep = NULL; + struct io_submit_state state; struct io_kiocb *link = NULL; int i, submitted = 0; - bool mm_fault = false; /* if we have a backlog and couldn't flush it all, return BUSY */ if (test_bit(0, &ctx->sq_check_overflow)) { @@ -5020,10 +6446,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; - if (nr > IO_PLUG_THRESHOLD) { - io_submit_state_start(&state, nr); - statep = &state; - } + io_submit_state_start(&state, ctx, nr); ctx->ring_fd = ring_fd; ctx->ring_file = ring_file; @@ -5033,44 +6456,35 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, struct io_kiocb *req; int err; - req = io_get_req(ctx, statep); + sqe = io_get_sqe(ctx); + if (unlikely(!sqe)) { + io_consume_sqe(ctx); + break; + } + req = io_alloc_req(ctx, &state); if (unlikely(!req)) { if (!submitted) submitted = -EAGAIN; break; } - if (!io_get_sqring(ctx, req, &sqe)) { - __io_req_do_free(req); - break; - } + err = io_init_req(ctx, req, sqe, &state); + io_consume_sqe(ctx); /* will complete beyond this point, count as submitted */ submitted++; - if (unlikely(req->opcode >= IORING_OP_LAST)) { - err = -EINVAL; + if (unlikely(err)) { fail_req: - io_cqring_add_event(req, err); - io_double_put_req(req); + io_put_req(req); + io_req_complete(req, err); break; } - if (io_op_defs[req->opcode].needs_mm && !*mm) { - mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); - if (unlikely(mm_fault)) { - err = -EFAULT; - goto fail_req; - } - use_mm(ctx->sqo_mm); - *mm = ctx->sqo_mm; - } - - req->in_async = async; - req->needs_fixed_file = async; trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, - true, async); - if (!io_submit_sqe(req, sqe, statep, &link)) - break; 
+ true, io_async_submit(ctx)); + err = io_submit_sqe(req, sqe, &link, &state.comp); + if (err) + goto fail_req; } if (unlikely(submitted != nr)) { @@ -5079,9 +6493,8 @@ fail_req: percpu_ref_put_many(&ctx->refs, nr - ref_used); } if (link) - io_queue_link_head(link); - if (statep) - io_submit_state_end(&state); + io_queue_link_head(link, &state.comp); + io_submit_state_end(&state); /* Commit SQ ring head once we've consumed and submitted all SQEs */ io_commit_sqring(ctx); @@ -5089,32 +6502,43 @@ fail_req: return submitted; } +static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) +{ + /* Tell userspace we may need a wakeup call */ + spin_lock_irq(&ctx->completion_lock); + ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; + spin_unlock_irq(&ctx->completion_lock); +} + +static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) +{ + spin_lock_irq(&ctx->completion_lock); + ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + spin_unlock_irq(&ctx->completion_lock); +} + static int io_sq_thread(void *data) { struct io_ring_ctx *ctx = data; - struct mm_struct *cur_mm = NULL; const struct cred *old_cred; - mm_segment_t old_fs; DEFINE_WAIT(wait); unsigned long timeout; int ret = 0; - complete(&ctx->completions[1]); + complete(&ctx->sq_thread_comp); - old_fs = get_fs(); - set_fs(USER_DS); old_cred = override_creds(ctx->creds); timeout = jiffies + ctx->sq_thread_idle; while (!kthread_should_park()) { unsigned int to_submit; - if (!list_empty(&ctx->poll_list)) { + if (!list_empty(&ctx->iopoll_list)) { unsigned nr_events = 0; mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->poll_list)) - io_iopoll_getevents(ctx, &nr_events, 0); + if (!list_empty(&ctx->iopoll_list) && !need_resched()) + io_do_iopoll(ctx, &nr_events, 0); else timeout = jiffies + ctx->sq_thread_idle; mutex_unlock(&ctx->uring_lock); @@ -5126,18 +6550,14 @@ static int io_sq_thread(void *data) * If submit got -EBUSY, flag us as needing the application * to enter the kernel to reap and flush 
events. */ - if (!to_submit || ret == -EBUSY) { + if (!to_submit || ret == -EBUSY || need_resched()) { /* * Drop cur_mm before scheduling, we can't hold it for * long periods (or over schedule()). Do this before * adding ourselves to the waitqueue, as the unuse/drop * may sleep. */ - if (cur_mm) { - unuse_mm(cur_mm); - mmput(cur_mm); - cur_mm = NULL; - } + io_sq_thread_drop_mm(); /* * We're polling. If we're within the defined idle @@ -5146,9 +6566,10 @@ static int io_sq_thread(void *data) * more IO, we should wait for the application to * reap events and wake us up. */ - if (!list_empty(&ctx->poll_list) || + if (!list_empty(&ctx->iopoll_list) || need_resched() || (!time_after(jiffies, timeout) && ret != -EBUSY && !percpu_ref_is_dying(&ctx->refs))) { + io_run_task_work(); cond_resched(); continue; } @@ -5158,21 +6579,18 @@ static int io_sq_thread(void *data) /* * While doing polled IO, before going to sleep, we need - * to check if there are new reqs added to poll_list, it - * is because reqs may have been punted to io worker and - * will be added to poll_list later, hence check the - * poll_list again. + * to check if there are new reqs added to iopoll_list, + * it is because reqs may have been punted to io worker + * and will be added to iopoll_list later, hence check + * the iopoll_list again. 
*/ if ((ctx->flags & IORING_SETUP_IOPOLL) && - !list_empty_careful(&ctx->poll_list)) { + !list_empty_careful(&ctx->iopoll_list)) { finish_wait(&ctx->sqo_wait, &wait); continue; } - /* Tell userspace we may need a wakeup call */ - ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; - /* make sure to read SQ tail after writing flags */ - smp_mb(); + io_ring_set_wakeup_flag(ctx); to_submit = io_sqring_entries(ctx); if (!to_submit || ret == -EBUSY) { @@ -5180,30 +6598,35 @@ static int io_sq_thread(void *data) finish_wait(&ctx->sqo_wait, &wait); break; } + if (io_run_task_work()) { + finish_wait(&ctx->sqo_wait, &wait); + io_ring_clear_wakeup_flag(ctx); + continue; + } if (signal_pending(current)) flush_signals(current); schedule(); finish_wait(&ctx->sqo_wait, &wait); - ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + io_ring_clear_wakeup_flag(ctx); + ret = 0; continue; } finish_wait(&ctx->sqo_wait, &wait); - ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + io_ring_clear_wakeup_flag(ctx); } mutex_lock(&ctx->uring_lock); - ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true); + if (likely(!percpu_ref_is_dying(&ctx->refs))) + ret = io_submit_sqes(ctx, to_submit, NULL, -1); mutex_unlock(&ctx->uring_lock); timeout = jiffies + ctx->sq_thread_idle; } - set_fs(old_fs); - if (cur_mm) { - unuse_mm(cur_mm); - mmput(cur_mm); - } + io_run_task_work(); + + io_sq_thread_drop_mm(); revert_creds(old_cred); kthread_parkme(); @@ -5263,8 +6686,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, struct io_rings *rings = ctx->rings; int ret = 0; - if (io_cqring_events(ctx, false) >= min_events) - return 0; + do { + if (io_cqring_events(ctx, false) >= min_events) + return 0; + if (!io_run_task_work()) + break; + } while (1); if (sig) { #ifdef CONFIG_COMPAT @@ -5284,13 +6711,23 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, do { prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE); - if (io_should_wake(&iowq, false)) - break; - 
schedule(); + /* make sure we run task_work before checking for signals */ + if (io_run_task_work()) + continue; if (signal_pending(current)) { + if (current->jobctl & JOBCTL_TASK_WORK) { + spin_lock_irq(&current->sighand->siglock); + current->jobctl &= ~JOBCTL_TASK_WORK; + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); + continue; + } ret = -EINTR; break; } + if (io_should_wake(&iowq, false)) + break; + schedule(); } while (1); finish_wait(&ctx->wait, &iowq.wq); @@ -5330,43 +6767,36 @@ static void io_file_ref_kill(struct percpu_ref *ref) complete(&data->done); } -static void io_file_ref_exit_and_free(struct work_struct *work) -{ - struct fixed_file_data *data; - - data = container_of(work, struct fixed_file_data, ref_work); - - /* - * Ensure any percpu-ref atomic switch callback has run, it could have - * been in progress when the files were being unregistered. Once - * that's done, we can safely exit and free the ref and containing - * data structure. - */ - rcu_barrier(); - percpu_ref_exit(&data->refs); - kfree(data); -} - static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { struct fixed_file_data *data = ctx->file_data; + struct fixed_file_ref_node *ref_node = NULL; unsigned nr_tables, i; if (!data) return -ENXIO; - percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill); - flush_work(&data->ref_work); + spin_lock(&data->lock); + if (!list_empty(&data->ref_list)) + ref_node = list_first_entry(&data->ref_list, + struct fixed_file_ref_node, node); + spin_unlock(&data->lock); + if (ref_node) + percpu_ref_kill(&ref_node->refs); + + percpu_ref_kill(&data->refs); + + /* wait for all refs nodes to complete */ + flush_delayed_work(&ctx->file_put_work); wait_for_completion(&data->done); - io_ring_file_ref_flush(data); __io_sqe_files_unregister(ctx); nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); for (i = 0; i < nr_tables; i++) kfree(data->table[i].files); kfree(data->table); - INIT_WORK(&data->ref_work, 
io_file_ref_exit_and_free); - queue_work(system_wq, &data->ref_work); + percpu_ref_exit(&data->refs); + kfree(data); ctx->file_data = NULL; ctx->nr_user_files = 0; return 0; @@ -5375,7 +6805,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) static void io_sq_thread_stop(struct io_ring_ctx *ctx) { if (ctx->sqo_thread) { - wait_for_completion(&ctx->completions[1]); + wait_for_completion(&ctx->sq_thread_comp); /* * The park is a bit of a work-around, without it we get * warning spews on shutdown with SQPOLL set and affinity @@ -5410,13 +6840,6 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) struct sk_buff *skb; int i, nr_files; - if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { - unsigned long inflight = ctx->user->unix_inflight + nr; - - if (inflight > task_rlimit(current, RLIMIT_NOFILE)) - return -EMFILE; - } - fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); if (!fpl) return -ENOMEM; @@ -5591,50 +7014,93 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file) } struct io_file_put { - struct llist_node llist; + struct list_head list; struct file *file; - struct completion *done; }; -static void io_ring_file_ref_flush(struct fixed_file_data *data) +static void __io_file_put_work(struct fixed_file_ref_node *ref_node) { + struct fixed_file_data *file_data = ref_node->file_data; + struct io_ring_ctx *ctx = file_data->ctx; struct io_file_put *pfile, *tmp; - struct llist_node *node; - while ((node = llist_del_all(&data->put_llist)) != NULL) { - llist_for_each_entry_safe(pfile, tmp, node, llist) { - io_ring_file_put(data->ctx, pfile->file); - if (pfile->done) - complete(pfile->done); - else - kfree(pfile); - } + list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) { + list_del(&pfile->list); + io_ring_file_put(ctx, pfile->file); + kfree(pfile); } + + spin_lock(&file_data->lock); + list_del(&ref_node->node); + spin_unlock(&file_data->lock); + + percpu_ref_exit(&ref_node->refs); + 
kfree(ref_node); + percpu_ref_put(&file_data->refs); } -static void io_ring_file_ref_switch(struct work_struct *work) +static void io_file_put_work(struct work_struct *work) { - struct fixed_file_data *data; + struct io_ring_ctx *ctx; + struct llist_node *node; + + ctx = container_of(work, struct io_ring_ctx, file_put_work.work); + node = llist_del_all(&ctx->file_put_llist); - data = container_of(work, struct fixed_file_data, ref_work); - io_ring_file_ref_flush(data); - percpu_ref_switch_to_percpu(&data->refs); + while (node) { + struct fixed_file_ref_node *ref_node; + struct llist_node *next = node->next; + + ref_node = llist_entry(node, struct fixed_file_ref_node, llist); + __io_file_put_work(ref_node); + node = next; + } } static void io_file_data_ref_zero(struct percpu_ref *ref) { - struct fixed_file_data *data; + struct fixed_file_ref_node *ref_node; + struct io_ring_ctx *ctx; + bool first_add; + int delay = HZ; - data = container_of(ref, struct fixed_file_data, refs); + ref_node = container_of(ref, struct fixed_file_ref_node, refs); + ctx = ref_node->file_data->ctx; - /* - * We can't safely switch from inside this context, punt to wq. If - * the table ref is going away, the table is being unregistered. - * Don't queue up the async work for that case, the caller will - * handle it. 
- */ - if (!percpu_ref_is_dying(&data->refs)) - queue_work(system_wq, &data->ref_work); + if (percpu_ref_is_dying(&ctx->file_data->refs)) + delay = 0; + + first_add = llist_add(&ref_node->llist, &ctx->file_put_llist); + if (!delay) + mod_delayed_work(system_wq, &ctx->file_put_work, 0); + else if (first_add) + queue_delayed_work(system_wq, &ctx->file_put_work, delay); +} + +static struct fixed_file_ref_node *alloc_fixed_file_ref_node( + struct io_ring_ctx *ctx) +{ + struct fixed_file_ref_node *ref_node; + + ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); + if (!ref_node) + return ERR_PTR(-ENOMEM); + + if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero, + 0, GFP_KERNEL)) { + kfree(ref_node); + return ERR_PTR(-ENOMEM); + } + INIT_LIST_HEAD(&ref_node->node); + INIT_LIST_HEAD(&ref_node->file_list); + ref_node->file_data = ctx->file_data; + return ref_node; +} + +static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node) +{ + percpu_ref_exit(&ref_node->refs); + kfree(ref_node); } static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, @@ -5645,6 +7111,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, struct file *file; int fd, ret = 0; unsigned i; + struct fixed_file_ref_node *ref_node; if (ctx->file_data) return -EBUSY; @@ -5658,6 +7125,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -ENOMEM; ctx->file_data->ctx = ctx; init_completion(&ctx->file_data->done); + INIT_LIST_HEAD(&ctx->file_data->ref_list); + spin_lock_init(&ctx->file_data->lock); nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); ctx->file_data->table = kcalloc(nr_tables, @@ -5669,15 +7138,13 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -ENOMEM; } - if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero, + if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { 
kfree(ctx->file_data->table); kfree(ctx->file_data); ctx->file_data = NULL; return -ENOMEM; } - ctx->file_data->put_llist.first = NULL; - INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch); if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) { percpu_ref_exit(&ctx->file_data->refs); @@ -5732,6 +7199,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, for (i = 0; i < nr_tables; i++) kfree(ctx->file_data->table[i].files); + percpu_ref_exit(&ctx->file_data->refs); kfree(ctx->file_data->table); kfree(ctx->file_data); ctx->file_data = NULL; @@ -5740,9 +7208,22 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, } ret = io_sqe_files_scm(ctx); - if (ret) + if (ret) { io_sqe_files_unregister(ctx); + return ret; + } + ref_node = alloc_fixed_file_ref_node(ctx); + if (IS_ERR(ref_node)) { + io_sqe_files_unregister(ctx); + return PTR_ERR(ref_node); + } + + ctx->file_data->cur_refs = &ref_node->refs; + spin_lock(&ctx->file_data->lock); + list_add(&ref_node->node, &ctx->file_data->ref_list); + spin_unlock(&ctx->file_data->lock); + percpu_ref_get(&ctx->file_data->refs); return ret; } @@ -5789,46 +7270,22 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } -static void io_atomic_switch(struct percpu_ref *ref) +static int io_queue_file_removal(struct fixed_file_data *data, + struct file *file) { - struct fixed_file_data *data; + struct io_file_put *pfile; + struct percpu_ref *refs = data->cur_refs; + struct fixed_file_ref_node *ref_node; - /* - * Juggle reference to ensure we hit zero, if needed, so we can - * switch back to percpu mode - */ - data = container_of(ref, struct fixed_file_data, refs); - percpu_ref_put(&data->refs); - percpu_ref_get(&data->refs); -} - -static bool io_queue_file_removal(struct fixed_file_data *data, - struct file *file) -{ - struct io_file_put *pfile, pfile_stack; - DECLARE_COMPLETION_ONSTACK(done); - - /* - * If we fail allocating the struct we need 
for doing async reomval - * of this file, just punt to sync and wait for it. - */ pfile = kzalloc(sizeof(*pfile), GFP_KERNEL); - if (!pfile) { - pfile = &pfile_stack; - pfile->done = &done; - } + if (!pfile) + return -ENOMEM; + ref_node = container_of(refs, struct fixed_file_ref_node, refs); pfile->file = file; - llist_add(&pfile->llist, &data->put_llist); + list_add(&pfile->list, &ref_node->file_list); - if (pfile == &pfile_stack) { - percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch); - wait_for_completion(&done); - flush_work(&data->ref_work); - return false; - } - - return true; + return 0; } static int __io_sqe_files_update(struct io_ring_ctx *ctx, @@ -5836,17 +7293,22 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, unsigned nr_args) { struct fixed_file_data *data = ctx->file_data; - bool ref_switch = false; + struct fixed_file_ref_node *ref_node; struct file *file; __s32 __user *fds; int fd, i, err; __u32 done; + bool needs_switch = false; if (check_add_overflow(up->offset, nr_args, &done)) return -EOVERFLOW; if (done > ctx->nr_user_files) return -EINVAL; + ref_node = alloc_fixed_file_ref_node(ctx); + if (IS_ERR(ref_node)) + return PTR_ERR(ref_node); + done = 0; fds = u64_to_user_ptr(up->fds); while (nr_args) { @@ -5863,9 +7325,11 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, index = i & IORING_FILE_TABLE_MASK; if (table->files[index]) { file = io_file_from_index(ctx, index); + err = io_queue_file_removal(data, file); + if (err) + break; table->files[index] = NULL; - if (io_queue_file_removal(data, file)) - ref_switch = true; + needs_switch = true; } if (fd != -1) { file = fget(fd); @@ -5888,19 +7352,29 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, } table->files[index] = file; err = io_sqe_file_register(ctx, file, i); - if (err) + if (err) { + fput(file); break; + } } nr_args--; done++; up->offset++; } - if (ref_switch) - percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch); + if (needs_switch) { + 
percpu_ref_kill(data->cur_refs); + spin_lock(&data->lock); + list_add(&ref_node->node, &data->ref_list); + data->cur_refs = &ref_node->refs; + spin_unlock(&data->lock); + percpu_ref_get(&ctx->file_data->refs); + } else + destroy_fixed_file_ref_node(ref_node); return done ? done : err; } + static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { @@ -5918,20 +7392,14 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, return __io_sqe_files_update(ctx, &up, nr_args); } -static void io_put_work(struct io_wq_work *work) +static void io_free_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + /* Consider that io_steal_work() relies on this ref */ io_put_req(req); } -static void io_get_work(struct io_wq_work *work) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - - refcount_inc(&req->refs); -} - static int io_init_wq_offload(struct io_ring_ctx *ctx, struct io_uring_params *p) { @@ -5942,8 +7410,8 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx, int ret = 0; data.user = ctx->user; - data.get_work = io_get_work; - data.put_work = io_put_work; + data.free_work = io_free_work; + data.do_work = io_wq_submit_work; if (!(p->flags & IORING_SETUP_ATTACH_WQ)) { /* Do QD, or 4 * CPUS, whatever is smallest */ @@ -5984,10 +7452,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, { int ret; - init_waitqueue_head(&ctx->sqo_wait); - mmgrab(current->mm); - ctx->sqo_mm = current->mm; - if (ctx->flags & IORING_SETUP_SQPOLL) { ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) @@ -6032,17 +7496,17 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, return 0; err: io_finish_async(ctx); - mmdrop(ctx->sqo_mm); - ctx->sqo_mm = NULL; return ret; } -static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) +static inline void __io_unaccount_mem(struct user_struct *user, + unsigned long nr_pages) { atomic_long_sub(nr_pages, 
&user->locked_vm); } -static int io_account_mem(struct user_struct *user, unsigned long nr_pages) +static inline int __io_account_mem(struct user_struct *user, + unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -6060,6 +7524,41 @@ static int io_account_mem(struct user_struct *user, unsigned long nr_pages) return 0; } +static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, + enum io_mem_account acct) +{ + if (ctx->limit_mem) + __io_unaccount_mem(ctx->user, nr_pages); + + if (ctx->sqo_mm) { + if (acct == ACCT_LOCKED) + ctx->sqo_mm->locked_vm -= nr_pages; + else if (acct == ACCT_PINNED) + atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm); + } +} + +static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, + enum io_mem_account acct) +{ + int ret; + + if (ctx->limit_mem) { + ret = __io_account_mem(ctx->user, nr_pages); + if (ret) + return ret; + } + + if (ctx->sqo_mm) { + if (acct == ACCT_LOCKED) + ctx->sqo_mm->locked_vm += nr_pages; + else if (acct == ACCT_PINNED) + atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm); + } + + return 0; +} + static void io_mem_free(void *ptr) { struct page *page; @@ -6096,6 +7595,9 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, return SIZE_MAX; #endif + if (sq_offset) + *sq_offset = off; + sq_array_size = array_size(sizeof(u32), sq_entries); if (sq_array_size == SIZE_MAX) return SIZE_MAX; @@ -6103,9 +7605,6 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, if (check_add_overflow(off, sq_array_size, &off)) return SIZE_MAX; - if (sq_offset) - *sq_offset = off; - return off; } @@ -6134,8 +7633,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) for (j = 0; j < imu->nr_bvecs; j++) unpin_user_page(imu->bvec[j].bv_page); - if (ctx->account_mem) - io_unaccount_mem(ctx->user, imu->nr_bvecs); + io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED); kvfree(imu->bvec); imu->nr_bvecs = 0; } @@ -6218,16 +7716,14 @@ static 
int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, start = ubuf >> PAGE_SHIFT; nr_pages = end - start; - if (ctx->account_mem) { - ret = io_account_mem(ctx->user, nr_pages); - if (ret) - goto err; - } + ret = io_account_mem(ctx, nr_pages, ACCT_PINNED); + if (ret) + goto err; ret = 0; if (!pages || nr_pages > got_pages) { - kfree(vmas); - kfree(pages); + kvfree(vmas); + kvfree(pages); pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); vmas = kvmalloc_array(nr_pages, @@ -6235,8 +7731,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, GFP_KERNEL); if (!pages || !vmas) { ret = -ENOMEM; - if (ctx->account_mem) - io_unaccount_mem(ctx->user, nr_pages); + io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); goto err; } got_pages = nr_pages; @@ -6246,13 +7741,12 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, GFP_KERNEL); ret = -ENOMEM; if (!imu->bvec) { - if (ctx->account_mem) - io_unaccount_mem(ctx->user, nr_pages); + io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); goto err; } ret = 0; - down_read(&current->mm->mmap_sem); + mmap_read_lock(current->mm); pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages, vmas); @@ -6270,7 +7764,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, } else { ret = pret < 0 ? 
pret : -EFAULT; } - up_read(&current->mm->mmap_sem); + mmap_read_unlock(current->mm); if (ret) { /* * if we did partial map, or found file backed vmas, @@ -6278,8 +7772,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, */ if (pret > 0) unpin_user_pages(pages, pret); - if (ctx->account_mem) - io_unaccount_mem(ctx->user, nr_pages); + io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); kvfree(imu->bvec); goto err; } @@ -6345,16 +7838,33 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) return -ENXIO; } +static int __io_destroy_buffers(int id, void *p, void *data) +{ + struct io_ring_ctx *ctx = data; + struct io_buffer *buf = p; + + __io_remove_buffers(ctx, buf, id, -1U); + return 0; +} + +static void io_destroy_buffers(struct io_ring_ctx *ctx) +{ + idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx); + idr_destroy(&ctx->io_buffer_idr); +} + static void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_finish_async(ctx); - if (ctx->sqo_mm) + io_sqe_buffer_unregister(ctx); + if (ctx->sqo_mm) { mmdrop(ctx->sqo_mm); + ctx->sqo_mm = NULL; + } - io_iopoll_reap_events(ctx); - io_sqe_buffer_unregister(ctx); io_sqe_files_unregister(ctx); io_eventfd_unregister(ctx); + io_destroy_buffers(ctx); idr_destroy(&ctx->personality_idr); #if defined(CONFIG_UNIX) @@ -6368,12 +7878,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_mem_free(ctx->sq_sqes); percpu_ref_exit(&ctx->refs); - if (ctx->account_mem) - io_unaccount_mem(ctx->user, - ring_pages(ctx->sq_entries, ctx->cq_entries)); free_uid(ctx->user); put_cred(ctx->creds); - kfree(ctx->completions); kfree(ctx->cancel_hash); kmem_cache_free(req_cachep, ctx->fallback_req); kfree(ctx); @@ -6417,35 +7923,59 @@ static int io_remove_personalities(int id, void *p, void *data) return 0; } +static void io_ring_exit_work(struct work_struct *work) +{ + struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, + exit_work); + + /* + * If we're doing polled IO and end up having requests being + 
* submitted async (out-of-line), then completions can come in while + * we're waiting for refs to drop. We need to reap these manually, + * as nobody else will be looking for them. + */ + do { + if (ctx->rings) + io_cqring_overflow_flush(ctx, true); + io_iopoll_try_reap_events(ctx); + } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); + io_ring_ctx_free(ctx); +} + static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); percpu_ref_kill(&ctx->refs); mutex_unlock(&ctx->uring_lock); - /* - * Wait for sq thread to idle, if we have one. It won't spin on new - * work after we've killed the ctx ref above. This is important to do - * before we cancel existing commands, as the thread could otherwise - * be queueing new work post that. If that's work we need to cancel, - * it could cause shutdown to hang. - */ - while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait)) - cpu_relax(); - io_kill_timeouts(ctx); io_poll_remove_all(ctx); if (ctx->io_wq) io_wq_cancel_all(ctx->io_wq); - io_iopoll_reap_events(ctx); /* if we failed setting up the ctx, we might not have any rings */ if (ctx->rings) io_cqring_overflow_flush(ctx, true); + io_iopoll_try_reap_events(ctx); idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); - wait_for_completion(&ctx->completions[0]); - io_ring_ctx_free(ctx); + + /* + * Do this upfront, so we won't have a grace period where the ring + * is closed but resources aren't reaped yet. This can cause + * spurious failure in setting up a new ring. + */ + io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries), + ACCT_LOCKED); + + INIT_WORK(&ctx->exit_work, io_ring_exit_work); + /* + * Use system_unbound_wq to avoid spawning tons of event kworkers + * if we're exiting a ton of rings at the same time. It just adds + * noise and overhead, there's no discernable change in runtime + * over using system_wq. 
+ */ + queue_work(system_unbound_wq, &ctx->exit_work); } static int io_uring_release(struct inode *inode, struct file *file) @@ -6457,14 +7987,117 @@ static int io_uring_release(struct inode *inode, struct file *file) return 0; } +static bool io_wq_files_match(struct io_wq_work *work, void *data) +{ + struct files_struct *files = data; + + return work->files == files; +} + +/* + * Returns true if 'preq' is the link parent of 'req' + */ +static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) +{ + struct io_kiocb *link; + + if (!(preq->flags & REQ_F_LINK_HEAD)) + return false; + + list_for_each_entry(link, &preq->link_list, link_list) { + if (link == req) + return true; + } + + return false; +} + +/* + * We're looking to cancel 'req' because it's holding on to our files, but + * 'req' could be a link to another request. See if it is, and cancel that + * parent request if so. + */ +static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req) +{ + struct hlist_node *tmp; + struct io_kiocb *preq; + bool found = false; + int i; + + spin_lock_irq(&ctx->completion_lock); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry_safe(preq, tmp, list, hash_node) { + found = io_match_link(preq, req); + if (found) { + io_poll_remove_one(preq); + break; + } + } + } + spin_unlock_irq(&ctx->completion_lock); + return found; +} + +static bool io_timeout_remove_link(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + struct io_kiocb *preq; + bool found = false; + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry(preq, &ctx->timeout_list, timeout.list) { + found = io_match_link(preq, req); + if (found) { + __io_timeout_cancel(preq); + break; + } + } + spin_unlock_irq(&ctx->completion_lock); + return found; +} + +static bool io_cancel_link_cb(struct io_wq_work *work, void *data) +{ + return io_match_link(container_of(work, struct io_kiocb, work), data); +} 
+ +static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) +{ + enum io_wq_cancel cret; + + /* cancel this particular work, if it's running */ + cret = io_wq_cancel_work(ctx->io_wq, &req->work); + if (cret != IO_WQ_CANCEL_NOTFOUND) + return; + + /* find links that hold this pending, cancel those */ + cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true); + if (cret != IO_WQ_CANCEL_NOTFOUND) + return; + + /* if we have a poll link holding this pending, cancel that */ + if (io_poll_remove_link(ctx, req)) + return; + + /* final option, timeout link is holding this req pending */ + io_timeout_remove_link(ctx, req); +} + static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { - struct io_kiocb *req; - DEFINE_WAIT(wait); + if (list_empty_careful(&ctx->inflight_list)) + return; + + /* cancel all at once, should be faster than doing it one by one*/ + io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true); while (!list_empty_careful(&ctx->inflight_list)) { - struct io_kiocb *cancel_req = NULL; + struct io_kiocb *cancel_req = NULL, *req; + DEFINE_WAIT(wait); spin_lock_irq(&ctx->inflight_lock); list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { @@ -6484,35 +8117,20 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, /* We need to keep going until we don't find a matching req */ if (!cancel_req) break; - - if (cancel_req->flags & REQ_F_OVERFLOW) { - spin_lock_irq(&ctx->completion_lock); - list_del(&cancel_req->list); - cancel_req->flags &= ~REQ_F_OVERFLOW; - if (list_empty(&ctx->cq_overflow_list)) { - clear_bit(0, &ctx->sq_check_overflow); - clear_bit(0, &ctx->cq_check_overflow); - } - spin_unlock_irq(&ctx->completion_lock); - - WRITE_ONCE(ctx->rings->cq_overflow, - atomic_inc_return(&ctx->cached_cq_overflow)); - - /* - * Put inflight ref and overflow ref. If that's - * all we had, then we're done with this request. 
- */ - if (refcount_sub_and_test(2, &cancel_req->refs)) { - io_put_req(cancel_req); - continue; - } - } - - io_wq_cancel_work(ctx->io_wq, &cancel_req->work); + /* cancel this request, or head link requests */ + io_attempt_cancel(ctx, cancel_req); io_put_req(cancel_req); schedule(); + finish_wait(&ctx->inflight_wait, &wait); } - finish_wait(&ctx->inflight_wait, &wait); +} + +static bool io_cancel_task_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct task_struct *task = data; + + return req->task == task; } static int io_uring_flush(struct file *file, void *data) @@ -6525,7 +8143,7 @@ static int io_uring_flush(struct file *file, void *data) * If the task is going away, cancel work it may have pending */ if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) - io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current)); + io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true); return 0; } @@ -6609,6 +8227,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, int submitted = 0; struct fd f; + io_run_task_work(); + if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) return -EINVAL; @@ -6638,25 +8258,25 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, wake_up(&ctx->sqo_wait); submitted = to_submit; } else if (to_submit) { - struct mm_struct *cur_mm; - mutex_lock(&ctx->uring_lock); - /* already have mm, so io_submit_sqes() won't try to grab it */ - cur_mm = ctx->sqo_mm; - submitted = io_submit_sqes(ctx, to_submit, f.file, fd, - &cur_mm, false); + submitted = io_submit_sqes(ctx, to_submit, f.file, fd); mutex_unlock(&ctx->uring_lock); if (submitted != to_submit) goto out; } if (flags & IORING_ENTER_GETEVENTS) { - unsigned nr_events = 0; - min_complete = min(min_complete, ctx->cq_entries); - if (ctx->flags & IORING_SETUP_IOPOLL) { - ret = io_iopoll_check(ctx, &nr_events, min_complete); + /* + * When SETUP_IOPOLL and SETUP_SQPOLL are both 
enabled, user + * space applications don't need to do io completion events + * polling again, they can rely on io_sq_thread to do polling + * work, which can reduce cpu usage and uring_lock contention. + */ + if (ctx->flags & IORING_SETUP_IOPOLL && + !(ctx->flags & IORING_SETUP_SQPOLL)) { + ret = io_iopoll_check(ctx, min_complete); } else { ret = io_cqring_wait(ctx, min_complete, sig, sigsz); } @@ -6731,6 +8351,17 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, "Personalities:\n"); idr_for_each(&ctx->personality_idr, io_uring_show_cred, m); } + seq_printf(m, "PollList:\n"); + spin_lock_irq(&ctx->completion_lock); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list = &ctx->cancel_hash[i]; + struct io_kiocb *req; + + hlist_for_each_entry(req, list, hash_node) + seq_printf(m, " op=%d, task_works=%d\n", req->opcode, + req->task->task_works != NULL); + } + spin_unlock_irq(&ctx->completion_lock); mutex_unlock(&ctx->uring_lock); } @@ -6766,6 +8397,10 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_rings *rings; size_t size, sq_array_offset; + /* make sure these are sane, as we already accounted them */ + ctx->sq_entries = p->sq_entries; + ctx->cq_entries = p->cq_entries; + size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); if (size == SIZE_MAX) return -EOVERFLOW; @@ -6782,8 +8417,6 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, rings->cq_ring_entries = p->cq_entries; ctx->sq_mask = rings->sq_ring_mask; ctx->cq_mask = rings->cq_ring_mask; - ctx->sq_entries = rings->sq_ring_entries; - ctx->cq_entries = rings->cq_ring_entries; size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) { @@ -6845,11 +8478,12 @@ err: return ret; } -static int io_uring_create(unsigned entries, struct io_uring_params *p) +static int io_uring_create(unsigned entries, struct io_uring_params *p, + struct io_uring_params __user *params) { 
struct user_struct *user = NULL; struct io_ring_ctx *ctx; - bool account_mem; + bool limit_mem; int ret; if (!entries) @@ -6888,10 +8522,10 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) } user = get_uid(current_user()); - account_mem = !capable(CAP_IPC_LOCK); + limit_mem = !capable(CAP_IPC_LOCK); - if (account_mem) { - ret = io_account_mem(user, + if (limit_mem) { + ret = __io_account_mem(user, ring_pages(p->sq_entries, p->cq_entries)); if (ret) { free_uid(user); @@ -6901,17 +8535,29 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) ctx = io_ring_ctx_alloc(p); if (!ctx) { - if (account_mem) - io_unaccount_mem(user, ring_pages(p->sq_entries, + if (limit_mem) + __io_unaccount_mem(user, ring_pages(p->sq_entries, p->cq_entries)); free_uid(user); return -ENOMEM; } ctx->compat = in_compat_syscall(); - ctx->account_mem = account_mem; ctx->user = user; ctx->creds = get_current_cred(); + mmgrab(current->mm); + ctx->sqo_mm = current->mm; + + /* + * Account memory _before_ installing the file descriptor. Once + * the descriptor is installed, it can get closed at any time. Also + * do this before hitting the general error path, as ring freeing + * will un-account as well. 
+ */ + io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries), + ACCT_LOCKED); + ctx->limit_mem = limit_mem; + ret = io_allocate_scq_urings(ctx, p); if (ret) goto err; @@ -6936,6 +8582,17 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); p->cq_off.cqes = offsetof(struct io_rings, cqes); + p->cq_off.flags = offsetof(struct io_rings, cq_flags); + + p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | + IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | + IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | + IORING_FEAT_POLL_32BITS; + + if (copy_to_user(params, p, sizeof(*p))) { + ret = -EFAULT; + goto err; + } /* * Install ring fd as the very last thing, so we don't risk someone @@ -6945,9 +8602,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) if (ret < 0) goto err; - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | - IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | - IORING_FEAT_CUR_PERSONALITY; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: @@ -6963,7 +8617,6 @@ err: static long io_uring_setup(u32 entries, struct io_uring_params __user *params) { struct io_uring_params p; - long ret; int i; if (copy_from_user(&p, params, sizeof(p))) @@ -6978,14 +8631,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ)) return -EINVAL; - ret = io_uring_create(entries, &p); - if (ret < 0) - return ret; - - if (copy_to_user(params, &p, sizeof(p))) - return -EFAULT; - - return ret; + return io_uring_create(entries, &p, params); } SYSCALL_DEFINE2(io_uring_setup, u32, entries, @@ -7099,7 +8745,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, * after we've killed the percpu ref. 
*/ mutex_unlock(&ctx->uring_lock); - ret = wait_for_completion_interruptible(&ctx->completions[0]); + ret = wait_for_completion_interruptible(&ctx->ref_comp); mutex_lock(&ctx->uring_lock); if (ret) { percpu_ref_resurrect(&ctx->refs); @@ -7176,7 +8822,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, /* bring the ctx back to life */ percpu_ref_reinit(&ctx->refs); out: - reinit_completion(&ctx->completions[0]); + reinit_completion(&ctx->ref_comp); } return ret; } @@ -7225,12 +8871,14 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(8, __u64, off); BUILD_BUG_SQE_ELEM(8, __u64, addr2); BUILD_BUG_SQE_ELEM(16, __u64, addr); + BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in); BUILD_BUG_SQE_ELEM(24, __u32, len); BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags); BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags); - BUILD_BUG_SQE_ELEM(28, __u16, poll_events); + BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events); + BUILD_BUG_SQE_ELEM(28, __u32, poll32_events); BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags); BUILD_BUG_SQE_ELEM(28, __u32, msg_flags); BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags); @@ -7239,11 +8887,14 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(28, __u32, open_flags); BUILD_BUG_SQE_ELEM(28, __u32, statx_flags); BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice); + BUILD_BUG_SQE_ELEM(28, __u32, splice_flags); BUILD_BUG_SQE_ELEM(32, __u64, user_data); BUILD_BUG_SQE_ELEM(40, __u16, buf_index); BUILD_BUG_SQE_ELEM(42, __u16, personality); + BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); + BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int)); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); return 0; }; |