diff options
Diffstat (limited to 'fs/aio.c')
-rw-r--r-- | fs/aio.c | 374 |
1 files changed, 186 insertions, 188 deletions
@@ -46,6 +46,7 @@ #include <asm/kmap_types.h> #include <linux/uaccess.h> +#include <linux/nospec.h> #include "internal.h" @@ -162,9 +163,13 @@ struct kioctx { unsigned id; }; +/* + * First field must be the file pointer in all the + * iocb unions! See also 'struct kiocb' in <linux/fs.h> + */ struct fsync_iocb { - struct work_struct work; struct file *file; + struct work_struct work; bool datasync; }; @@ -172,14 +177,21 @@ struct poll_iocb { struct file *file; struct wait_queue_head *head; __poll_t events; - bool woken; + bool done; bool cancelled; struct wait_queue_entry wait; struct work_struct work; }; +/* + * NOTE! Each of the iocb union members has the file pointer + * as the first entry in their struct definition. So you can + * access the file pointer through any of the sub-structs, + * or directly as just 'ki_filp' in this struct. + */ struct aio_kiocb { union { + struct file *ki_filp; struct kiocb rw; struct fsync_iocb fsync; struct poll_iocb poll; @@ -188,8 +200,7 @@ struct aio_kiocb { struct kioctx *ki_ctx; kiocb_cancel_fn *ki_cancel; - struct iocb __user *ki_user_iocb; /* user's aiocb */ - __u64 ki_user_data; /* user's data for completion */ + struct io_event ki_res; struct list_head ki_list; /* the aio core uses this * for cancellation */ @@ -912,7 +923,7 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr) local_irq_restore(flags); } -static bool get_reqs_available(struct kioctx *ctx) +static bool __get_reqs_available(struct kioctx *ctx) { struct kioctx_cpu *kcpu; bool ret = false; @@ -1004,32 +1015,35 @@ static void user_refill_reqs_available(struct kioctx *ctx) spin_unlock_irq(&ctx->completion_lock); } +static bool get_reqs_available(struct kioctx *ctx) +{ + if (__get_reqs_available(ctx)) + return true; + user_refill_reqs_available(ctx); + return __get_reqs_available(ctx); +} + /* aio_get_req * Allocate a slot for an aio request. * Returns NULL if no requests are free. + * + * The refcount is initialized to 2 - one for the async op completion, + * one for the synchronous code that does this. */ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) { struct aio_kiocb *req; - if (!get_reqs_available(ctx)) { - user_refill_reqs_available(ctx); - if (!get_reqs_available(ctx)) - return NULL; - } - - req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); + req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); if (unlikely(!req)) - goto out_put; + return NULL; percpu_ref_get(&ctx->reqs); - INIT_LIST_HEAD(&req->ki_list); - refcount_set(&req->ki_refcnt, 0); req->ki_ctx = ctx; + INIT_LIST_HEAD(&req->ki_list); + refcount_set(&req->ki_refcnt, 2); + req->ki_eventfd = NULL; return req; -out_put: - put_reqs_available(ctx, 1); - return NULL; } static struct kioctx *lookup_ioctx(unsigned long ctx_id) @@ -1049,6 +1063,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) if (!table || id >= table->nr) goto out; + id = array_index_nospec(id, table->nr); ctx = rcu_dereference(table->table[id]); if (ctx && ctx->user_id == ctx_id) { if (percpu_ref_tryget_live(&ctx->users)) @@ -1059,19 +1074,18 @@ out: return ret; } -static inline void iocb_put(struct aio_kiocb *iocb) +static inline void iocb_destroy(struct aio_kiocb *iocb) { - if (refcount_read(&iocb->ki_refcnt) == 0 || - refcount_dec_and_test(&iocb->ki_refcnt)) { - percpu_ref_put(&iocb->ki_ctx->reqs); - kmem_cache_free(kiocb_cachep, iocb); - } + if (iocb->ki_filp) + fput(iocb->ki_filp); + percpu_ref_put(&iocb->ki_ctx->reqs); + kmem_cache_free(kiocb_cachep, iocb); } /* aio_complete * Called when the io request on the given iocb is complete. */ -static void aio_complete(struct aio_kiocb *iocb, long res, long res2) +static void aio_complete(struct aio_kiocb *iocb) { struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; @@ -1095,17 +1109,14 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); event = ev_page + pos % AIO_EVENTS_PER_PAGE; - event->obj = (u64)(unsigned long)iocb->ki_user_iocb; - event->data = iocb->ki_user_data; - event->res = res; - event->res2 = res2; + *event = iocb->ki_res; kunmap_atomic(ev_page); flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); - pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", - ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data, - res, res2); + pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb, + (void __user *)(unsigned long)iocb->ki_res.obj, + iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2); /* after flagging the request as done, we * must never even look at it again @@ -1147,7 +1158,14 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); - iocb_put(iocb); +} + +static inline void iocb_put(struct aio_kiocb *iocb) +{ + if (refcount_dec_and_test(&iocb->ki_refcnt)) { + aio_complete(iocb); + iocb_destroy(iocb); + } } /* aio_read_events_ring @@ -1421,18 +1439,17 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) file_end_write(kiocb->ki_filp); } - fput(kiocb->ki_filp); - aio_complete(iocb, res, res2); + iocb->ki_res.res = res; + iocb->ki_res.res2 = res2; + iocb_put(iocb); } -static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) +static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) { int ret; - req->ki_filp = fget(iocb->aio_fildes); - if (unlikely(!req->ki_filp)) - return -EBADF; req->ki_complete = aio_complete_rw; + req->private = NULL; req->ki_pos = iocb->aio_offset; req->ki_flags = iocb_flags(req->ki_filp); if (iocb->aio_flags & IOCB_FLAG_RESFD) @@ -1447,7 +1464,6 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) ret = ioprio_check_cap(iocb->aio_reqprio); if (ret) { pr_debug("aio ioprio check cap error: %d\n", ret); - fput(req->ki_filp); return ret; } @@ -1457,11 +1473,13 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); if (unlikely(ret)) - fput(req->ki_filp); - return ret; + return ret; + + req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */ + return 0; } -static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec, +static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec, bool vectored, bool compat, struct iov_iter *iter) { void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf; @@ -1496,12 +1514,12 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret) ret = -EINTR; /*FALLTHRU*/ default: - aio_complete_rw(req, ret, 0); + req->ki_complete(req, ret, 0); } } -static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored, - bool compat) +static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb, + bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; @@ -1512,29 +1530,24 @@ static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored, if (ret) return ret; file = req->ki_filp; - - ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_READ))) - goto out_fput; + return -EBADF; ret = -EINVAL; if (unlikely(!file->f_op->read_iter)) - goto out_fput; + return -EINVAL; ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter); if (ret) - goto out_fput; + return ret; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) aio_rw_done(req, call_read_iter(file, req, &iter)); kfree(iovec); -out_fput: - if (unlikely(ret)) - fput(file); return ret; } -static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, - bool compat) +static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb, + bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; @@ -1546,16 +1559,14 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, return ret; file = req->ki_filp; - ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_WRITE))) - goto out_fput; - ret = -EINVAL; + return -EBADF; if (unlikely(!file->f_op->write_iter)) - goto out_fput; + return -EINVAL; ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter); if (ret) - goto out_fput; + return ret; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) { /* @@ -1573,35 +1584,26 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, aio_rw_done(req, call_write_iter(file, req, &iter)); } kfree(iovec); -out_fput: - if (unlikely(ret)) - fput(file); return ret; } static void aio_fsync_work(struct work_struct *work) { - struct fsync_iocb *req = container_of(work, struct fsync_iocb, work); - int ret; + struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); - ret = vfs_fsync(req->file, req->datasync); - fput(req->file); - aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0); + iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); + iocb_put(iocb); } -static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync) +static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, + bool datasync) { if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)) return -EINVAL; - req->file = fget(iocb->aio_fildes); - if (unlikely(!req->file)) - return -EBADF; - if (unlikely(!req->file->f_op->fsync)) { - fput(req->file); + if (unlikely(!req->file->f_op->fsync)) return -EINVAL; - } req->datasync = datasync; INIT_WORK(&req->work, aio_fsync_work); @@ -1609,14 +1611,6 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync) return 0; } -static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) -{ - struct file *file = iocb->poll.file; - - aio_complete(iocb, mangle_poll(mask), 0); - fput(file); -} - static void aio_poll_complete_work(struct work_struct *work) { struct poll_iocb *req = container_of(work, struct poll_iocb, work); @@ -1642,9 +1636,11 @@ static void aio_poll_complete_work(struct work_struct *work) return; } list_del_init(&iocb->ki_list); + iocb->ki_res.res = mangle_poll(mask); + req->done = true; spin_unlock_irq(&ctx->ctx_lock); - aio_poll_complete(iocb, mask); + iocb_put(iocb); } /* assumes we are called with irqs disabled */ @@ -1670,27 +1666,29 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct poll_iocb *req = container_of(wait, struct poll_iocb, wait); struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); __poll_t mask = key_to_poll(key); - - req->woken = true; + unsigned long flags; /* for instances that support it check for an event match first: */ - if (mask) { - if (!(mask & req->events)) - return 0; + if (mask && !(mask & req->events)) + return 0; - /* try to complete the iocb inline if we can: */ - if (spin_trylock(&iocb->ki_ctx->ctx_lock)) { - list_del(&iocb->ki_list); - spin_unlock(&iocb->ki_ctx->ctx_lock); + list_del_init(&req->wait.entry); - list_del_init(&req->wait.entry); - aio_poll_complete(iocb, mask); - return 1; - } + if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { + /* + * Try to complete the iocb inline if we can. Use + * irqsave/irqrestore because not all filesystems (e.g. fuse) + * call this function with IRQs disabled and because IRQs + * have to be disabled before ctx_lock is obtained. + */ + list_del(&iocb->ki_list); + iocb->ki_res.res = mangle_poll(mask); + req->done = true; + spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags); + iocb_put(iocb); + } else { + schedule_work(&req->work); } - - list_del_init(&req->wait.entry); - schedule_work(&req->work); return 1; } @@ -1717,11 +1715,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head, add_wait_queue(head, &pt->iocb->poll.wait); } -static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb) +static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) { struct kioctx *ctx = aiocb->ki_ctx; struct poll_iocb *req = &aiocb->poll; struct aio_poll_table apt; + bool cancel = false; __poll_t mask; /* reject any unknown events outside the normal event mask. */ @@ -1733,9 +1732,10 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb) INIT_WORK(&req->work, aio_poll_complete_work); req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; - req->file = fget(iocb->aio_fildes); - if (unlikely(!req->file)) - return -EBADF; + + req->head = NULL; + req->done = false; + req->cancelled = false; apt.pt._qproc = aio_poll_queue_proc; apt.pt._key = req->events; @@ -1746,83 +1746,79 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb) INIT_LIST_HEAD(&req->wait.entry); init_waitqueue_func_entry(&req->wait, aio_poll_wake); - /* one for removal from waitqueue, one for this function */ - refcount_set(&aiocb->ki_refcnt, 2); - mask = vfs_poll(req->file, &apt.pt) & req->events; - if (unlikely(!req->head)) { - /* we did not manage to set up a waitqueue, done */ - goto out; - } - spin_lock_irq(&ctx->ctx_lock); - spin_lock(&req->head->lock); - if (req->woken) { - /* wake_up context handles the rest */ - mask = 0; + if (likely(req->head)) { + spin_lock(&req->head->lock); + if (unlikely(list_empty(&req->wait.entry))) { + if (apt.error) + cancel = true; + apt.error = 0; + mask = 0; + } + if (mask || apt.error) { + list_del_init(&req->wait.entry); + } else if (cancel) { + WRITE_ONCE(req->cancelled, true); + } else if (!req->done) { /* actually waiting for an event */ + list_add_tail(&aiocb->ki_list, &ctx->active_reqs); + aiocb->ki_cancel = aio_poll_cancel; + } + spin_unlock(&req->head->lock); + } + if (mask) { /* no async, we'd stolen it */ + aiocb->ki_res.res = mangle_poll(mask); apt.error = 0; - } else if (mask || apt.error) { - /* if we get an error or a mask we are done */ - WARN_ON_ONCE(list_empty(&req->wait.entry)); - list_del_init(&req->wait.entry); - } else { - /* actually waiting for an event */ - list_add_tail(&aiocb->ki_list, &ctx->active_reqs); - aiocb->ki_cancel = aio_poll_cancel; } - spin_unlock(&req->head->lock); spin_unlock_irq(&ctx->ctx_lock); - -out: - if (unlikely(apt.error)) { - fput(req->file); - return apt.error; - } - if (mask) - aio_poll_complete(aiocb, mask); - iocb_put(aiocb); - return 0; + iocb_put(aiocb); + return apt.error; } -static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - bool compat) +static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, + struct iocb __user *user_iocb, bool compat) { struct aio_kiocb *req; - struct iocb iocb; ssize_t ret; - if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) - return -EFAULT; - /* enforce forwards compatibility on users */ - if (unlikely(iocb.aio_reserved2)) { + if (unlikely(iocb->aio_reserved2)) { pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } /* prevent overflows */ if (unlikely( - (iocb.aio_buf != (unsigned long)iocb.aio_buf) || - (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) || - ((ssize_t)iocb.aio_nbytes < 0) + (iocb->aio_buf != (unsigned long)iocb->aio_buf) || + (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || + ((ssize_t)iocb->aio_nbytes < 0) )) { pr_debug("EINVAL: overflow check\n"); return -EINVAL; } + if (!get_reqs_available(ctx)) + return -EAGAIN; + + ret = -EAGAIN; req = aio_get_req(ctx); if (unlikely(!req)) - return -EAGAIN; + goto out_put_reqs_available; + + req->ki_filp = fget(iocb->aio_fildes); + ret = -EBADF; + if (unlikely(!req->ki_filp)) + goto out_put_req; - if (iocb.aio_flags & IOCB_FLAG_RESFD) { + if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an * instance of the file* now. The file descriptor must be * an eventfd() fd, and will be signaled for each completed * event using the eventfd_signal() function. */ - req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd); + req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd); if (IS_ERR(req->ki_eventfd)) { ret = PTR_ERR(req->ki_eventfd); req->ki_eventfd = NULL; @@ -1836,54 +1832,70 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, goto out_put_req; } - req->ki_user_iocb = user_iocb; - req->ki_user_data = iocb.aio_data; + req->ki_res.obj = (u64)(unsigned long)user_iocb; + req->ki_res.data = iocb->aio_data; + req->ki_res.res = 0; + req->ki_res.res2 = 0; - switch (iocb.aio_lio_opcode) { + switch (iocb->aio_lio_opcode) { case IOCB_CMD_PREAD: - ret = aio_read(&req->rw, &iocb, false, compat); + ret = aio_read(&req->rw, iocb, false, compat); break; case IOCB_CMD_PWRITE: - ret = aio_write(&req->rw, &iocb, false, compat); + ret = aio_write(&req->rw, iocb, false, compat); break; case IOCB_CMD_PREADV: - ret = aio_read(&req->rw, &iocb, true, compat); + ret = aio_read(&req->rw, iocb, true, compat); break; case IOCB_CMD_PWRITEV: - ret = aio_write(&req->rw, &iocb, true, compat); + ret = aio_write(&req->rw, iocb, true, compat); break; case IOCB_CMD_FSYNC: - ret = aio_fsync(&req->fsync, &iocb, false); + ret = aio_fsync(&req->fsync, iocb, false); break; case IOCB_CMD_FDSYNC: - ret = aio_fsync(&req->fsync, &iocb, true); + ret = aio_fsync(&req->fsync, iocb, true); break; case IOCB_CMD_POLL: - ret = aio_poll(req, &iocb); + ret = aio_poll(req, iocb); break; default: - pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode); + pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode); ret = -EINVAL; break; } + /* Done with the synchronous reference */ + iocb_put(req); + /* * If ret is 0, we'd either done aio_complete() ourselves or have * arranged for that to be done asynchronously. Anything non-zero * means that we need to destroy req ourselves. */ - if (ret) - goto out_put_req; - return 0; + if (!ret) + return 0; + out_put_req: - put_reqs_available(ctx, 1); - percpu_ref_put(&ctx->reqs); if (req->ki_eventfd) eventfd_ctx_put(req->ki_eventfd); - kmem_cache_free(kiocb_cachep, req); + iocb_destroy(req); +out_put_reqs_available: + put_reqs_available(ctx, 1); return ret; } +static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, + bool compat) +{ + struct iocb iocb; + + if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) + return -EFAULT; + + return __io_submit_one(ctx, &iocb, user_iocb, compat); +} + /* sys_io_submit: * Queue the nr iocbs pointed to by iocbpp for processing. Returns * the number of iocbs queued. May return -EINVAL if the aio_context @@ -1976,24 +1988,6 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, } #endif -/* lookup_kiocb - * Finds a given iocb for cancellation. - */ -static struct aio_kiocb * -lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb) -{ - struct aio_kiocb *kiocb; - - assert_spin_locked(&ctx->ctx_lock); - - /* TODO: use a hash or array, this sucks. */ - list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { - if (kiocb->ki_user_iocb == iocb) - return kiocb; - } - return NULL; -} - /* sys_io_cancel: * Attempts to cancel an iocb previously passed to io_submit. If * the operation is successfully cancelled, the resulting event is @@ -2011,6 +2005,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct aio_kiocb *kiocb; int ret = -EINVAL; u32 key; + u64 obj = (u64)(unsigned long)iocb; if (unlikely(get_user(key, &iocb->aio_key))) return -EFAULT; @@ -2022,10 +2017,13 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - kiocb = lookup_kiocb(ctx, iocb); - if (kiocb) { - ret = kiocb->ki_cancel(&kiocb->rw); - list_del_init(&kiocb->ki_list); + /* TODO: use a hash or array, this sucks. */ + list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { + if (kiocb->ki_res.obj == obj) { + ret = kiocb->ki_cancel(&kiocb->rw); + list_del_init(&kiocb->ki_list); + break; + } } spin_unlock_irq(&ctx->ctx_lock); |