diff options
author | Stefan Agner <stefan.agner@toradex.com> | 2019-06-18 14:29:35 +0200 |
---|---|---|
committer | Stefan Agner <stefan.agner@toradex.com> | 2019-06-18 14:29:35 +0200 |
commit | d15d0b7a9f89cf5a905ad6802eb23100c8063939 (patch) | |
tree | 84caa660dc73842efa29e116dcb3e32aa81a5cf0 /fs | |
parent | e9dcc568b2e968af848bbdb4267ba6cde5457b9e (diff) | |
parent | 858848641fbecd42437e36adc9291b0ce5db379e (diff) |
Merge tag 'v4.19.50-rt22' into toradex_4.19.y-rt
Linux 4.19.50-rt22
Diffstat (limited to 'fs')
201 files changed, 3393 insertions, 1652 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 89bac3d2f05b..619128b55837 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -61,6 +61,8 @@ enum { Opt_cache_loose, Opt_fscache, Opt_mmap, /* Access options */ Opt_access, Opt_posixacl, + /* Lock timeout option */ + Opt_locktimeout, /* Error token */ Opt_err }; @@ -80,6 +82,7 @@ static const match_table_t tokens = { {Opt_cachetag, "cachetag=%s"}, {Opt_access, "access=%s"}, {Opt_posixacl, "posixacl"}, + {Opt_locktimeout, "locktimeout=%u"}, {Opt_err, NULL} }; @@ -187,6 +190,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) #ifdef CONFIG_9P_FSCACHE v9ses->cachetag = NULL; #endif + v9ses->session_lock_timeout = P9_LOCK_TIMEOUT; if (!opts) return 0; @@ -359,6 +363,23 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) #endif break; + case Opt_locktimeout: + r = match_int(&args[0], &option); + if (r < 0) { + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } + if (option < 1) { + p9_debug(P9_DEBUG_ERROR, + "locktimeout must be a greater than zero integer.\n"); + ret = -EINVAL; + continue; + } + v9ses->session_lock_timeout = (long)option * HZ; + break; + default: continue; } diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 982e017acadb..129e5243a6bf 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -116,6 +116,7 @@ struct v9fs_session_info { struct p9_client *clnt; /* 9p client */ struct list_head slist; /* list of sessions registered with v9fs */ struct rw_semaphore rename_sem; + long session_lock_timeout; /* retry interval for blocking locks */ }; /* cache_validity flags */ diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 5a0db6dec8d1..aaee1e6584e6 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -40,6 +40,9 @@ */ #define P9_LOCK_TIMEOUT (30*HZ) +/* flags for v9fs_stat2inode() & v9fs_stat2inode_dotl() */ +#define V9FS_STAT2INODE_KEEP_ISIZE 1 + extern struct file_system_type v9fs_fs_type; extern const struct address_space_operations v9fs_addr_operations; extern const struct file_operations v9fs_file_operations; @@ -61,8 +64,10 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, struct inode *inode, umode_t mode, dev_t); void v9fs_evict_inode(struct inode *inode); ino_t v9fs_qid2ino(struct p9_qid *qid); -void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); -void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *); +void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, + struct super_block *sb, unsigned int flags); +void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, + unsigned int flags); int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); @@ -83,4 +88,18 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode) } int v9fs_open_to_dotl_flags(int flags); + +static inline void v9fs_i_size_write(struct inode *inode, loff_t i_size) +{ + /* + * 32-bit need the lock, concurrent updates could break the + * sequences and make i_size_read() loop forever. + * 64-bit updates are atomic and can skip the locking. + */ + if (sizeof(i_size) > sizeof(long)) + spin_lock(&inode->i_lock); + i_size_write(inode, i_size); + if (sizeof(i_size) > sizeof(long)) + spin_unlock(&inode->i_lock); +} #endif diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 48db9a9f13f9..cb6c4031af55 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -105,7 +105,6 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) int err = 0; struct p9_fid *fid; int buflen; - int reclen = 0; struct p9_rdir *rdir; struct kvec kvec; @@ -138,11 +137,10 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) while (rdir->head < rdir->tail) { err = p9stat_read(fid->clnt, rdir->buf + rdir->head, rdir->tail - rdir->head, &st); - if (err) { + if (err <= 0) { p9_debug(P9_DEBUG_VFS, "returned %d\n", err); return -EIO; } - reclen = st.size+2; over = !dir_emit(ctx, st.name, strlen(st.name), v9fs_qid2ino(&st.qid), dt_type(&st)); @@ -150,8 +148,8 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) if (over) return 0; - rdir->head += reclen; - ctx->pos += reclen; + rdir->head += err; + ctx->pos += err; } } } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index ab3d5f5dbb00..05454a7e22dc 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -154,6 +154,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) uint8_t status = P9_LOCK_ERROR; int res = 0; unsigned char fl_type; + struct v9fs_session_info *v9ses; fid = filp->private_data; BUG_ON(fid == NULL); @@ -189,6 +190,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) if (IS_SETLKW(cmd)) flock.flags = P9_LOCK_FLAGS_BLOCK; + v9ses = v9fs_inode2v9ses(file_inode(filp)); + /* * if its a blocked request and we get P9_LOCK_BLOCKED as the status * for lock request, keep on trying @@ -202,7 +205,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) break; if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd)) break; - if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0) + if (schedule_timeout_interruptible(v9ses->session_lock_timeout) + != 0) break; /* * p9_client_lock_dotl overwrites flock.client_id with the @@ -442,7 +446,11 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) i_size = i_size_read(inode); if (iocb->ki_pos > i_size) { inode_add_bytes(inode, iocb->ki_pos - i_size); - i_size_write(inode, iocb->ki_pos); + /* + * Need to serialize against i_size_write() in + * v9fs_stat2inode() + */ + v9fs_i_size_write(inode, iocb->ki_pos); } return retval; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 85ff859d3af5..72b779bc0942 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -538,7 +538,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, if (retval) goto error; - v9fs_stat2inode(st, inode, sb); + v9fs_stat2inode(st, inode, sb, 0); v9fs_cache_inode_get_cookie(inode); unlock_new_inode(inode); return inode; @@ -1092,7 +1092,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat, if (IS_ERR(st)) return PTR_ERR(st); - v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb); + v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0); generic_fillattr(d_inode(dentry), stat); p9stat_free(st); @@ -1170,12 +1170,13 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) * @stat: Plan 9 metadata (mistat) structure * @inode: inode to populate * @sb: superblock of filesystem + * @flags: control flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE) * */ void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, - struct super_block *sb) + struct super_block *sb, unsigned int flags) { umode_t mode; char ext[32]; @@ -1216,10 +1217,11 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, mode = p9mode2perm(v9ses, stat); mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; - i_size_write(inode, stat->length); + if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) + v9fs_i_size_write(inode, stat->length); /* not real number of blocks, but 512 byte ones ... */ - inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; + inode->i_blocks = (stat->length + 512 - 1) >> 9; v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; } @@ -1416,9 +1418,9 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) { int umode; dev_t rdev; - loff_t i_size; struct p9_wstat *st; struct v9fs_session_info *v9ses; + unsigned int flags; v9ses = v9fs_inode2v9ses(inode); st = p9_client_stat(fid); @@ -1431,16 +1433,13 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) goto out; - spin_lock(&inode->i_lock); /* * We don't want to refresh inode->i_size, * because we may have cached data */ - i_size = inode->i_size; - v9fs_stat2inode(st, inode, inode->i_sb); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - inode->i_size = i_size; - spin_unlock(&inode->i_lock); + flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ? + V9FS_STAT2INODE_KEEP_ISIZE : 0; + v9fs_stat2inode(st, inode, inode->i_sb, flags); out: p9stat_free(st); kfree(st); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 4823e1c46999..a950a927a626 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -143,7 +143,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, if (retval) goto error; - v9fs_stat2inode_dotl(st, inode); + v9fs_stat2inode_dotl(st, inode, 0); v9fs_cache_inode_get_cookie(inode); retval = v9fs_get_acl(inode, fid); if (retval) @@ -496,7 +496,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat, if (IS_ERR(st)) return PTR_ERR(st); - v9fs_stat2inode_dotl(st, d_inode(dentry)); + v9fs_stat2inode_dotl(st, d_inode(dentry), 0); generic_fillattr(d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; @@ -607,11 +607,13 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) * v9fs_stat2inode_dotl - populate an inode structure with stat info * @stat: stat structure * @inode: inode to populate + * @flags: ctrl flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE) * */ void -v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) +v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, + unsigned int flags) { umode_t mode; struct v9fs_inode *v9inode = V9FS_I(inode); @@ -631,7 +633,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; - i_size_write(inode, stat->st_size); + if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) + v9fs_i_size_write(inode, stat->st_size); inode->i_blocks = stat->st_blocks; } else { if (stat->st_result_mask & P9_STATS_ATIME) { @@ -661,8 +664,9 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) } if (stat->st_result_mask & P9_STATS_RDEV) inode->i_rdev = new_decode_dev(stat->st_rdev); - if (stat->st_result_mask & P9_STATS_SIZE) - i_size_write(inode, stat->st_size); + if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) && + stat->st_result_mask & P9_STATS_SIZE) + v9fs_i_size_write(inode, stat->st_size); if (stat->st_result_mask & P9_STATS_BLOCKS) inode->i_blocks = stat->st_blocks; } @@ -928,9 +932,9 @@ v9fs_vfs_get_link_dotl(struct dentry *dentry, int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) { - loff_t i_size; struct p9_stat_dotl *st; struct v9fs_session_info *v9ses; + unsigned int flags; v9ses = v9fs_inode2v9ses(inode); st = p9_client_getattr_dotl(fid, P9_STATS_ALL); @@ -942,16 +946,13 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) goto out; - spin_lock(&inode->i_lock); /* * We don't want to refresh inode->i_size, * because we may have cached data */ - i_size = inode->i_size; - v9fs_stat2inode_dotl(st, inode); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - inode->i_size = i_size; - spin_unlock(&inode->i_lock); + flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ? + V9FS_STAT2INODE_KEEP_ISIZE : 0; + v9fs_stat2inode_dotl(st, inode, flags); out: kfree(st); return 0; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 48ce50484e80..eeab9953af89 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -172,7 +172,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto release_sb; } d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode_dotl(st, d_inode(root)); + v9fs_stat2inode_dotl(st, d_inode(root), 0); kfree(st); } else { struct p9_wstat *st = NULL; @@ -183,7 +183,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, } d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode(st, d_inode(root), sb); + v9fs_stat2inode(st, d_inode(root), sb, 0); p9stat_free(st); kfree(st); diff --git a/fs/afs/flock.c b/fs/afs/flock.c index dc62d15a964b..1bb300ef362b 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -208,7 +208,7 @@ again: /* The new front of the queue now owns the state variables. */ next = list_entry(vnode->pending_locks.next, struct file_lock, fl_u.afs.link); - vnode->lock_key = afs_file_key(next->fl_file); + vnode->lock_key = key_get(afs_file_key(next->fl_file)); vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB; goto again; @@ -413,7 +413,7 @@ static void afs_dequeue_lock(struct afs_vnode *vnode, struct file_lock *fl) /* The new front of the queue now owns the state variables. */ next = list_entry(vnode->pending_locks.next, struct file_lock, fl_u.afs.link); - vnode->lock_key = afs_file_key(next->fl_file); + vnode->lock_key = key_get(afs_file_key(next->fl_file)); vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB; afs_lock_may_be_available(vnode); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 071075d775a9..0726e40db0f8 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -411,7 +411,6 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { valid = true; } else { - vnode->cb_s_break = vnode->cb_interest->server->cb_s_break; vnode->cb_v_break = vnode->volume->cb_v_break; valid = false; } @@ -543,6 +542,8 @@ void afs_evict_inode(struct inode *inode) #endif afs_put_permits(rcu_access_pointer(vnode->permit_cache)); + key_put(vnode->lock_key); + vnode->lock_key = NULL; _leave(""); } diff --git a/fs/afs/write.c b/fs/afs/write.c index 19c04caf3c01..e00461a6de9a 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -253,6 +253,7 @@ static void afs_kill_pages(struct address_space *mapping, first = page->index + 1; lock_page(page); generic_error_remove_page(mapping, page); + unlock_page(page); } __pagevec_release(&pv); @@ -46,6 +46,7 @@ #include <asm/kmap_types.h> #include <linux/uaccess.h> +#include <linux/nospec.h> #include "internal.h" @@ -162,9 +163,13 @@ struct kioctx { unsigned id; }; +/* + * First field must be the file pointer in all the + * iocb unions! See also 'struct kiocb' in <linux/fs.h> + */ struct fsync_iocb { - struct work_struct work; struct file *file; + struct work_struct work; bool datasync; }; @@ -172,14 +177,21 @@ struct poll_iocb { struct file *file; struct wait_queue_head *head; __poll_t events; - bool woken; + bool done; bool cancelled; struct wait_queue_entry wait; struct work_struct work; }; +/* + * NOTE! Each of the iocb union members has the file pointer + * as the first entry in their struct definition. So you can + * access the file pointer through any of the sub-structs, + * or directly as just 'ki_filp' in this struct. + */ struct aio_kiocb { union { + struct file *ki_filp; struct kiocb rw; struct fsync_iocb fsync; struct poll_iocb poll; @@ -188,8 +200,7 @@ struct aio_kiocb { struct kioctx *ki_ctx; kiocb_cancel_fn *ki_cancel; - struct iocb __user *ki_user_iocb; /* user's aiocb */ - __u64 ki_user_data; /* user's data for completion */ + struct io_event ki_res; struct list_head ki_list; /* the aio core uses this * for cancellation */ @@ -912,7 +923,7 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr) local_irq_restore(flags); } -static bool get_reqs_available(struct kioctx *ctx) +static bool __get_reqs_available(struct kioctx *ctx) { struct kioctx_cpu *kcpu; bool ret = false; @@ -1004,32 +1015,35 @@ static void user_refill_reqs_available(struct kioctx *ctx) spin_unlock_irq(&ctx->completion_lock); } +static bool get_reqs_available(struct kioctx *ctx) +{ + if (__get_reqs_available(ctx)) + return true; + user_refill_reqs_available(ctx); + return __get_reqs_available(ctx); +} + /* aio_get_req * Allocate a slot for an aio request. * Returns NULL if no requests are free. + * + * The refcount is initialized to 2 - one for the async op completion, + * one for the synchronous code that does this. */ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) { struct aio_kiocb *req; - if (!get_reqs_available(ctx)) { - user_refill_reqs_available(ctx); - if (!get_reqs_available(ctx)) - return NULL; - } - - req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); + req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); if (unlikely(!req)) - goto out_put; + return NULL; percpu_ref_get(&ctx->reqs); - INIT_LIST_HEAD(&req->ki_list); - refcount_set(&req->ki_refcnt, 0); req->ki_ctx = ctx; + INIT_LIST_HEAD(&req->ki_list); + refcount_set(&req->ki_refcnt, 2); + req->ki_eventfd = NULL; return req; -out_put: - put_reqs_available(ctx, 1); - return NULL; } static struct kioctx *lookup_ioctx(unsigned long ctx_id) @@ -1049,6 +1063,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) if (!table || id >= table->nr) goto out; + id = array_index_nospec(id, table->nr); ctx = rcu_dereference(table->table[id]); if (ctx && ctx->user_id == ctx_id) { if (percpu_ref_tryget_live(&ctx->users)) @@ -1059,19 +1074,18 @@ out: return ret; } -static inline void iocb_put(struct aio_kiocb *iocb) +static inline void iocb_destroy(struct aio_kiocb *iocb) { - if (refcount_read(&iocb->ki_refcnt) == 0 || - refcount_dec_and_test(&iocb->ki_refcnt)) { - percpu_ref_put(&iocb->ki_ctx->reqs); - kmem_cache_free(kiocb_cachep, iocb); - } + if (iocb->ki_filp) + fput(iocb->ki_filp); + percpu_ref_put(&iocb->ki_ctx->reqs); + kmem_cache_free(kiocb_cachep, iocb); } /* aio_complete * Called when the io request on the given iocb is complete. */ -static void aio_complete(struct aio_kiocb *iocb, long res, long res2) +static void aio_complete(struct aio_kiocb *iocb) { struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; @@ -1095,17 +1109,14 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); event = ev_page + pos % AIO_EVENTS_PER_PAGE; - event->obj = (u64)(unsigned long)iocb->ki_user_iocb; - event->data = iocb->ki_user_data; - event->res = res; - event->res2 = res2; + *event = iocb->ki_res; kunmap_atomic(ev_page); flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); - pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", - ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data, - res, res2); + pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb, + (void __user *)(unsigned long)iocb->ki_res.obj, + iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2); /* after flagging the request as done, we * must never even look at it again @@ -1147,7 +1158,14 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); - iocb_put(iocb); +} + +static inline void iocb_put(struct aio_kiocb *iocb) +{ + if (refcount_dec_and_test(&iocb->ki_refcnt)) { + aio_complete(iocb); + iocb_destroy(iocb); + } } /* aio_read_events_ring @@ -1421,18 +1439,17 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) file_end_write(kiocb->ki_filp); } - fput(kiocb->ki_filp); - aio_complete(iocb, res, res2); + iocb->ki_res.res = res; + iocb->ki_res.res2 = res2; + iocb_put(iocb); } -static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) +static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) { int ret; - req->ki_filp = fget(iocb->aio_fildes); - if (unlikely(!req->ki_filp)) - return -EBADF; req->ki_complete = aio_complete_rw; + req->private = NULL; req->ki_pos = iocb->aio_offset; req->ki_flags = iocb_flags(req->ki_filp); if (iocb->aio_flags & IOCB_FLAG_RESFD) @@ -1447,7 +1464,6 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) ret = ioprio_check_cap(iocb->aio_reqprio); if (ret) { pr_debug("aio ioprio check cap error: %d\n", ret); - fput(req->ki_filp); return ret; } @@ -1457,11 +1473,13 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); if (unlikely(ret)) - fput(req->ki_filp); - return ret; + return ret; + + req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */ + return 0; } -static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec, +static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec, bool vectored, bool compat, struct iov_iter *iter) { void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf; @@ -1496,12 +1514,12 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret) ret = -EINTR; /*FALLTHRU*/ default: - aio_complete_rw(req, ret, 0); + req->ki_complete(req, ret, 0); } } -static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored, - bool compat) +static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb, + bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; @@ -1512,29 +1530,24 @@ static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored, if (ret) return ret; file = req->ki_filp; - - ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_READ))) - goto out_fput; + return -EBADF; ret = -EINVAL; if (unlikely(!file->f_op->read_iter)) - goto out_fput; + return -EINVAL; ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter); if (ret) - goto out_fput; + return ret; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) aio_rw_done(req, call_read_iter(file, req, &iter)); kfree(iovec); -out_fput: - if (unlikely(ret)) - fput(file); return ret; } -static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, - bool compat) +static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb, + bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; @@ -1546,16 +1559,14 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, return ret; file = req->ki_filp; - ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_WRITE))) - goto out_fput; - ret = -EINVAL; + return -EBADF; if (unlikely(!file->f_op->write_iter)) - goto out_fput; + return -EINVAL; ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter); if (ret) - goto out_fput; + return ret; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) { /* @@ -1573,35 +1584,26 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, aio_rw_done(req, call_write_iter(file, req, &iter)); } kfree(iovec); -out_fput: - if (unlikely(ret)) - fput(file); return ret; } static void aio_fsync_work(struct work_struct *work) { - struct fsync_iocb *req = container_of(work, struct fsync_iocb, work); - int ret; + struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); - ret = vfs_fsync(req->file, req->datasync); - fput(req->file); - aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0); + iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); + iocb_put(iocb); } -static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync) +static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, + bool datasync) { if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)) return -EINVAL; - req->file = fget(iocb->aio_fildes); - if (unlikely(!req->file)) - return -EBADF; - if (unlikely(!req->file->f_op->fsync)) { - fput(req->file); + if (unlikely(!req->file->f_op->fsync)) return -EINVAL; - } req->datasync = datasync; INIT_WORK(&req->work, aio_fsync_work); @@ -1609,14 +1611,6 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync) return 0; } -static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) -{ - struct file *file = iocb->poll.file; - - aio_complete(iocb, mangle_poll(mask), 0); - fput(file); -} - static void aio_poll_complete_work(struct work_struct *work) { struct poll_iocb *req = container_of(work, struct poll_iocb, work); @@ -1642,9 +1636,11 @@ static void aio_poll_complete_work(struct work_struct *work) return; } list_del_init(&iocb->ki_list); + iocb->ki_res.res = mangle_poll(mask); + req->done = true; spin_unlock_irq(&ctx->ctx_lock); - aio_poll_complete(iocb, mask); + iocb_put(iocb); } /* assumes we are called with irqs disabled */ @@ -1670,27 +1666,29 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct poll_iocb *req = container_of(wait, struct poll_iocb, wait); struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); __poll_t mask = key_to_poll(key); - - req->woken = true; + unsigned long flags; /* for instances that support it check for an event match first: */ - if (mask) { - if (!(mask & req->events)) - return 0; + if (mask && !(mask & req->events)) + return 0; - /* try to complete the iocb inline if we can: */ - if (spin_trylock(&iocb->ki_ctx->ctx_lock)) { - list_del(&iocb->ki_list); - spin_unlock(&iocb->ki_ctx->ctx_lock); + list_del_init(&req->wait.entry); - list_del_init(&req->wait.entry); - aio_poll_complete(iocb, mask); - return 1; - } + if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { + /* + * Try to complete the iocb inline if we can. Use + * irqsave/irqrestore because not all filesystems (e.g. fuse) + * call this function with IRQs disabled and because IRQs + * have to be disabled before ctx_lock is obtained. + */ + list_del(&iocb->ki_list); + iocb->ki_res.res = mangle_poll(mask); + req->done = true; + spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags); + iocb_put(iocb); + } else { + schedule_work(&req->work); } - - list_del_init(&req->wait.entry); - schedule_work(&req->work); return 1; } @@ -1717,11 +1715,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head, add_wait_queue(head, &pt->iocb->poll.wait); } -static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb) +static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) { struct kioctx *ctx = aiocb->ki_ctx; struct poll_iocb *req = &aiocb->poll; struct aio_poll_table apt; + bool cancel = false; __poll_t mask; /* reject any unknown events outside the normal event mask. */ @@ -1733,9 +1732,10 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb) INIT_WORK(&req->work, aio_poll_complete_work); req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; - req->file = fget(iocb->aio_fildes); - if (unlikely(!req->file)) - return -EBADF; + + req->head = NULL; + req->done = false; + req->cancelled = false; apt.pt._qproc = aio_poll_queue_proc; apt.pt._key = req->events; @@ -1746,83 +1746,79 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb) INIT_LIST_HEAD(&req->wait.entry); init_waitqueue_func_entry(&req->wait, aio_poll_wake); - /* one for removal from waitqueue, one for this function */ - refcount_set(&aiocb->ki_refcnt, 2); - mask = vfs_poll(req->file, &apt.pt) & req->events; - if (unlikely(!req->head)) { - /* we did not manage to set up a waitqueue, done */ - goto out; - } - spin_lock_irq(&ctx->ctx_lock); - spin_lock(&req->head->lock); - if (req->woken) { - /* wake_up context handles the rest */ - mask = 0; + if (likely(req->head)) { + spin_lock(&req->head->lock); + if (unlikely(list_empty(&req->wait.entry))) { + if (apt.error) + cancel = true; + apt.error = 0; + mask = 0; + } + if (mask || apt.error) { + list_del_init(&req->wait.entry); + } else if (cancel) { + WRITE_ONCE(req->cancelled, true); + } else if (!req->done) { /* actually waiting for an event */ + list_add_tail(&aiocb->ki_list, &ctx->active_reqs); + aiocb->ki_cancel = aio_poll_cancel; + } + spin_unlock(&req->head->lock); + } + if (mask) { /* no async, we'd stolen it */ + aiocb->ki_res.res = mangle_poll(mask); apt.error = 0; - } else if (mask || apt.error) { - /* if we get an error or a mask we are done */ - WARN_ON_ONCE(list_empty(&req->wait.entry)); - list_del_init(&req->wait.entry); - } else { - /* actually waiting for an event */ - list_add_tail(&aiocb->ki_list, &ctx->active_reqs); - aiocb->ki_cancel = aio_poll_cancel; } - spin_unlock(&req->head->lock); spin_unlock_irq(&ctx->ctx_lock); - -out: - if (unlikely(apt.error)) { - fput(req->file); - return apt.error; - } - if (mask) - aio_poll_complete(aiocb, mask); - iocb_put(aiocb); - return 0; + iocb_put(aiocb); + return apt.error; } -static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - bool compat) +static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, + struct iocb __user *user_iocb, bool compat) { struct aio_kiocb *req; - struct iocb iocb; ssize_t ret; - if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) - return -EFAULT; - /* enforce forwards compatibility on users */ - if (unlikely(iocb.aio_reserved2)) { + if (unlikely(iocb->aio_reserved2)) { pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } /* prevent overflows */ if (unlikely( - (iocb.aio_buf != (unsigned long)iocb.aio_buf) || - (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) || - ((ssize_t)iocb.aio_nbytes < 0) + (iocb->aio_buf != (unsigned long)iocb->aio_buf) || + (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || + ((ssize_t)iocb->aio_nbytes < 0) )) { pr_debug("EINVAL: overflow check\n"); return -EINVAL; } + if (!get_reqs_available(ctx)) + return -EAGAIN; + + ret = -EAGAIN; req = aio_get_req(ctx); if (unlikely(!req)) - return -EAGAIN; + goto out_put_reqs_available; + + req->ki_filp = fget(iocb->aio_fildes); + ret = -EBADF; + if (unlikely(!req->ki_filp)) + goto out_put_req; - if (iocb.aio_flags & IOCB_FLAG_RESFD) { + if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an * instance of the file* now. The file descriptor must be * an eventfd() fd, and will be signaled for each completed * event using the eventfd_signal() function. */ - req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd); + req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd); if (IS_ERR(req->ki_eventfd)) { ret = PTR_ERR(req->ki_eventfd); req->ki_eventfd = NULL; @@ -1836,54 +1832,70 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, goto out_put_req; } - req->ki_user_iocb = user_iocb; - req->ki_user_data = iocb.aio_data; + req->ki_res.obj = (u64)(unsigned long)user_iocb; + req->ki_res.data = iocb->aio_data; + req->ki_res.res = 0; + req->ki_res.res2 = 0; - switch (iocb.aio_lio_opcode) { + switch (iocb->aio_lio_opcode) { case IOCB_CMD_PREAD: - ret = aio_read(&req->rw, &iocb, false, compat); + ret = aio_read(&req->rw, iocb, false, compat); break; case IOCB_CMD_PWRITE: - ret = aio_write(&req->rw, &iocb, false, compat); + ret = aio_write(&req->rw, iocb, false, compat); break; case IOCB_CMD_PREADV: - ret = aio_read(&req->rw, &iocb, true, compat); + ret = aio_read(&req->rw, iocb, true, compat); break; case IOCB_CMD_PWRITEV: - ret = aio_write(&req->rw, &iocb, true, compat); + ret = aio_write(&req->rw, iocb, true, compat); break; case IOCB_CMD_FSYNC: - ret = aio_fsync(&req->fsync, &iocb, false); + ret = aio_fsync(&req->fsync, iocb, false); break; case IOCB_CMD_FDSYNC: - ret = aio_fsync(&req->fsync, &iocb, true); + ret = aio_fsync(&req->fsync, iocb, true); break; case IOCB_CMD_POLL: - ret = aio_poll(req, &iocb); + ret = aio_poll(req, iocb); break; default: - pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode); + pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode); ret = -EINVAL; break; } + /* Done with the synchronous reference */ + iocb_put(req); + /* * If ret is 0, we'd either done aio_complete() ourselves or have * arranged for that to be done asynchronously. Anything non-zero * means that we need to destroy req ourselves. */ - if (ret) - goto out_put_req; - return 0; + if (!ret) + return 0; + out_put_req: - put_reqs_available(ctx, 1); - percpu_ref_put(&ctx->reqs); if (req->ki_eventfd) eventfd_ctx_put(req->ki_eventfd); - kmem_cache_free(kiocb_cachep, req); + iocb_destroy(req); +out_put_reqs_available: + put_reqs_available(ctx, 1); return ret; } +static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, + bool compat) +{ + struct iocb iocb; + + if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) + return -EFAULT; + + return __io_submit_one(ctx, &iocb, user_iocb, compat); +} + /* sys_io_submit: * Queue the nr iocbs pointed to by iocbpp for processing. Returns * the number of iocbs queued. May return -EINVAL if the aio_context @@ -1976,24 +1988,6 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, } #endif -/* lookup_kiocb - * Finds a given iocb for cancellation. - */ -static struct aio_kiocb * -lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb) -{ - struct aio_kiocb *kiocb; - - assert_spin_locked(&ctx->ctx_lock); - - /* TODO: use a hash or array, this sucks. */ - list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { - if (kiocb->ki_user_iocb == iocb) - return kiocb; - } - return NULL; -} - /* sys_io_cancel: * Attempts to cancel an iocb previously passed to io_submit. If * the operation is successfully cancelled, the resulting event is @@ -2011,6 +2005,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct aio_kiocb *kiocb; int ret = -EINVAL; u32 key; + u64 obj = (u64)(unsigned long)iocb; if (unlikely(get_user(key, &iocb->aio_key))) return -EFAULT; @@ -2022,10 +2017,13 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - kiocb = lookup_kiocb(ctx, iocb); - if (kiocb) { - ret = kiocb->ki_cancel(&kiocb->rw); - list_del_init(&kiocb->ki_list); + /* TODO: use a hash or array, this sucks. */ + list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { + if (kiocb->ki_res.obj == obj) { + ret = kiocb->ki_cancel(&kiocb->rw); + list_del_init(&kiocb->ki_list); + break; + } } spin_unlock_irq(&ctx->ctx_lock); diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index 27134058cf55..354b7147cead 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -597,7 +597,6 @@ int autofs_expire_run(struct super_block *sb, pkt.len = dentry->d_name.len; memcpy(pkt.name, dentry->d_name.name, pkt.len); pkt.name[pkt.len] = '\0'; - dput(dentry); if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire))) ret = -EFAULT; @@ -610,6 +609,8 @@ int autofs_expire_run(struct super_block *sb, complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); + dput(dentry); + return ret; } diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 846c052569dd..3c14a8e45ffb 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -255,8 +255,10 @@ int autofs_fill_super(struct super_block *s, void *data, int silent) } root_inode = autofs_get_inode(s, S_IFDIR | 0755); root = d_make_root(root_inode); - if (!root) + if (!root) { + ret = -ENOMEM; goto fail_ino; + } pipe = NULL; root->d_fsdata = ino; diff --git a/fs/block_dev.c b/fs/block_dev.c index 38b8ce05cbc7..1c25dae083a8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -104,6 +104,20 @@ void invalidate_bdev(struct block_device *bdev) } EXPORT_SYMBOL(invalidate_bdev); +static void set_init_blocksize(struct block_device *bdev) +{ + unsigned bsize = bdev_logical_block_size(bdev); + loff_t size = i_size_read(bdev->bd_inode); + + while (bsize < PAGE_SIZE) { + if (size & bsize) + break; + bsize <<= 1; + } + bdev->bd_block_size = bsize; + bdev->bd_inode->i_blkbits = blksize_bits(bsize); +} + int set_blocksize(struct block_device *bdev, int size) { /* Size must be a power of two, and between 512 and PAGE_SIZE */ @@ -282,10 +296,10 @@ static void blkdev_bio_end_io(struct bio *bio) struct blkdev_dio *dio = bio->bi_private; bool should_dirty = dio->should_dirty; - if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) { - if (bio->bi_status && !dio->bio.bi_status) - dio->bio.bi_status = bio->bi_status; - } else { + if (bio->bi_status && !dio->bio.bi_status) + dio->bio.bi_status = bio->bi_status; + + if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) { if (!dio->is_sync) { struct kiocb *iocb = dio->iocb; ssize_t ret; @@ -1408,18 +1422,9 @@ EXPORT_SYMBOL(check_disk_change); void bd_set_size(struct block_device *bdev, loff_t size) { - unsigned bsize = bdev_logical_block_size(bdev); - inode_lock(bdev->bd_inode); i_size_write(bdev->bd_inode, size); inode_unlock(bdev->bd_inode); - while (bsize < PAGE_SIZE) { - if (size & bsize) - break; - bsize <<= 1; - } - bdev->bd_block_size = bsize; - bdev->bd_inode->i_blkbits = blksize_bits(bsize); } EXPORT_SYMBOL(bd_set_size); @@ -1496,8 +1501,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) + if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + set_init_blocksize(bdev); + } /* * If the device is invalidated, rescan partition @@ -1532,6 +1539,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); + set_init_blocksize(bdev); } if (bdev->bd_bdi == &noop_backing_dev_info) diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 3b66c957ea6f..5810463dc6d2 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -9,6 +9,7 @@ #include <linux/posix_acl_xattr.h> #include <linux/posix_acl.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/slab.h> #include "ctree.h" @@ -72,8 +73,16 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, } if (acl) { + unsigned int nofs_flag; + size = posix_acl_xattr_size(acl->a_count); + /* + * We're holding a transaction handle, so use a NOFS memory + * allocation context to avoid deadlock if reclaim happens. + */ + nofs_flag = memalloc_nofs_save(); value = kmalloc(size, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); if (!value) { ret = -ENOMEM; goto out; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ae750b1574a2..ac6c383d6314 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -710,7 +710,7 @@ out: * read tree blocks and add keys where required. */ static int add_missing_keys(struct btrfs_fs_info *fs_info, - struct preftrees *preftrees) + struct preftrees *preftrees, bool lock) { struct prelim_ref *ref; struct extent_buffer *eb; @@ -735,12 +735,14 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, free_extent_buffer(eb); return -EIO; } - btrfs_tree_read_lock(eb); + if (lock) + btrfs_tree_read_lock(eb); if (btrfs_header_level(eb) == 0) btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0); else btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0); - btrfs_tree_read_unlock(eb); + if (lock) + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL); cond_resched(); @@ -1225,7 +1227,7 @@ again: btrfs_release_path(path); - ret = add_missing_keys(fs_info, &preftrees); + ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0); if (ret) goto out; @@ -1286,11 +1288,14 @@ again: ret = -EIO; goto out; } - btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + if (!path->skip_locking) { + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + } ret = find_extent_in_eb(eb, bytenr, *extent_item_pos, &eie, ignore_offset); - btrfs_tree_read_unlock_blocking(eb); + if (!path->skip_locking) + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); if (ret < 0) goto out; @@ -1452,8 +1457,8 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, * callers (such as fiemap) which want to know whether the extent is * shared but do not need a ref count. * - * This attempts to allocate a transaction in order to account for - * delayed refs, but continues on even when the alloc fails. + * This attempts to attach to the running transaction in order to account for + * delayed refs, but continues on even when no running transaction exists. * * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. */ @@ -1476,13 +1481,16 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) tmp = ulist_alloc(GFP_NOFS); roots = ulist_alloc(GFP_NOFS); if (!tmp || !roots) { - ulist_free(tmp); - ulist_free(roots); - return -ENOMEM; + ret = -ENOMEM; + goto out; } - trans = btrfs_join_transaction(root); + trans = btrfs_attach_transaction(root); if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) { + ret = PTR_ERR(trans); + goto out; + } trans = NULL; down_read(&fs_info->commit_root_sem); } else { @@ -1515,6 +1523,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) } else { up_read(&fs_info->commit_root_sem); } +out: ulist_free(tmp); ulist_free(roots); return ret; @@ -1904,13 +1913,19 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, extent_item_objectid); if (!search_commit_root) { - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); + trans = btrfs_attach_transaction(fs_info->extent_root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT && + PTR_ERR(trans) != -EROFS) + return PTR_ERR(trans); + trans = NULL; + } + } + + if (trans) btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); - } else { + else down_read(&fs_info->commit_root_sem); - } ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, tree_mod_seq_elem.seq, &refs, @@ -1943,7 +1958,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, free_leaf_list(refs); out: - if (!search_commit_root) { + if (trans) { btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); btrfs_end_transaction(trans); } else { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 1343ac57b438..45f5cf9cd203 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -29,6 +29,7 @@ enum { BTRFS_INODE_IN_DELALLOC_LIST, BTRFS_INODE_READDIO_NEED_LOCK, BTRFS_INODE_HAS_PROPS, + BTRFS_INODE_SNAPSHOT_FLUSH, }; /* in memory btrfs inode */ @@ -147,6 +148,12 @@ struct btrfs_inode { u64 last_unlink_trans; /* + * Track the transaction id of the last transaction used to create a + * hard link for the inode. This is used by the log tree (fsync). + */ + u64 last_link_trans; + + /* * Number of bytes outstanding that are going to need csums. This is * used in ENOSPC accounting. */ diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 089b46c4d97f..79ac1ebabaf7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1003,6 +1003,48 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, return 0; } +static struct extent_buffer *alloc_tree_block_no_bg_flush( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent_start, + const struct btrfs_disk_key *disk_key, + int level, + u64 hint, + u64 empty_size) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *ret; + + /* + * If we are COWing a node/leaf from the extent, chunk, device or free + * space trees, make sure that we do not finish block group creation of + * pending block groups. We do this to avoid a deadlock. + * COWing can result in allocation of a new chunk, and flushing pending + * block groups (btrfs_create_pending_block_groups()) can be triggered + * when finishing allocation of a new chunk. Creation of a pending block + * group modifies the extent, chunk, device and free space trees, + * therefore we could deadlock with ourselves since we are holding a + * lock on an extent buffer that btrfs_create_pending_block_groups() may + * try to COW later. + * For similar reasons, we also need to delay flushing pending block + * groups when splitting a leaf or node, from one of those trees, since + * we are holding a write lock on it and its parent or when inserting a + * new root node for one of those trees. + */ + if (root == fs_info->extent_root || + root == fs_info->chunk_root || + root == fs_info->dev_root || + root == fs_info->free_space_root) + trans->can_flush_pending_bgs = false; + + ret = btrfs_alloc_tree_block(trans, root, parent_start, + root->root_key.objectid, disk_key, level, + hint, empty_size); + trans->can_flush_pending_bgs = true; + + return ret; +} + /* * does the dirty work in cow of a single block. The parent block (if * supplied) is updated to point to the new cow copy. The new buffer is marked @@ -1050,26 +1092,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent) parent_start = parent->start; - /* - * If we are COWing a node/leaf from the extent, chunk or device trees, - * make sure that we do not finish block group creation of pending block - * groups. We do this to avoid a deadlock. - * COWing can result in allocation of a new chunk, and flushing pending - * block groups (btrfs_create_pending_block_groups()) can be triggered - * when finishing allocation of a new chunk. Creation of a pending block - * group modifies the extent, chunk and device trees, therefore we could - * deadlock with ourselves since we are holding a lock on an extent - * buffer that btrfs_create_pending_block_groups() may try to COW later. - */ - if (root == fs_info->extent_root || - root == fs_info->chunk_root || - root == fs_info->dev_root) - trans->can_flush_pending_bgs = false; - - cow = btrfs_alloc_tree_block(trans, root, parent_start, - root->root_key.objectid, &disk_key, level, - search_start, empty_size); - trans->can_flush_pending_bgs = true; + cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key, + level, search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -2412,6 +2436,16 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (tmp) { /* first we do an atomic uptodate check */ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { + /* + * Do extra check for first_key, eb can be stale due to + * being cached, read from scrub, or have multiple + * parents (shared tree blocks). + */ + if (btrfs_verify_level_key(fs_info, tmp, + parent_level - 1, &first_key, gen)) { + free_extent_buffer(tmp); + return -EUCLEAN; + } *eb_ret = tmp; return 0; } @@ -2624,14 +2658,27 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, root_lock = BTRFS_READ_LOCK; if (p->search_commit_root) { - /* The commit roots are read only so we always do read locks */ - if (p->need_commit_sem) + /* + * The commit roots are read only so we always do read locks, + * and we always must hold the commit_root_sem when doing + * searches on them, the only exception is send where we don't + * want to block transaction commits for a long time, so + * we need to clone the commit root in order to avoid races + * with transaction commits that create a snapshot of one of + * the roots used by a send operation. + */ + if (p->need_commit_sem) { down_read(&fs_info->commit_root_sem); - b = root->commit_root; - extent_buffer_get(b); - level = btrfs_header_level(b); - if (p->need_commit_sem) + b = btrfs_clone_extent_buffer(root->commit_root); up_read(&fs_info->commit_root_sem); + if (!b) + return ERR_PTR(-ENOMEM); + + } else { + b = root->commit_root; + extent_buffer_get(b); + } + level = btrfs_header_level(b); /* * Ensure that all callers have set skip_locking when * p->search_commit_root = 1. @@ -2757,6 +2804,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, again: prev_cmp = -1; b = btrfs_search_slot_get_root(root, p, write_lock_level); + if (IS_ERR(b)) { + ret = PTR_ERR(b); + goto done; + } while (b) { level = btrfs_header_level(b); @@ -3364,8 +3415,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, else btrfs_node_key(lower, &lower_key, 0); - c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, - &lower_key, level, root->node->start, 0); + c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level, + root->node->start, 0); if (IS_ERR(c)) return PTR_ERR(c); @@ -3494,8 +3545,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, mid = (c_nritems + 1) / 2; btrfs_node_key(c, &disk_key, mid); - split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, - &disk_key, level, c->start, 0); + split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level, + c->start, 0); if (IS_ERR(split)) return PTR_ERR(split); @@ -4279,8 +4330,8 @@ again: else btrfs_item_key(l, &disk_key, mid); - right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, - &disk_key, 0, l->start, 0); + right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0, + l->start, 0); if (IS_ERR(right)) return PTR_ERR(right); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2cddfe7806a4..82682da5a40d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3155,7 +3155,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct inode *inode, u64 new_size, u32 min_type); -int btrfs_start_delalloc_inodes(struct btrfs_root *root); +int btrfs_start_delalloc_snapshot(struct btrfs_root *root); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 981434764bb9..8fed470bb7e1 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -800,39 +800,58 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; btrfs_dev_replace_write_unlock(dev_replace); - goto leave; + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; + btrfs_dev_replace_write_unlock(dev_replace); + btrfs_scrub_cancel(fs_info); + /* btrfs_dev_replace_finishing() will handle the cleanup part */ + btrfs_info_in_rcu(fs_info, + "dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + /* + * Scrub doing the replace isn't running so we need to do the + * cleanup step of btrfs_dev_replace_finishing() here + */ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; dev_replace->tgtdev = NULL; dev_replace->srcdev = NULL; - break; - } - dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; - dev_replace->time_stopped = ktime_get_real_seconds(); - dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_write_unlock(dev_replace); - btrfs_scrub_cancel(fs_info); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; + dev_replace->time_stopped = ktime_get_real_seconds(); + dev_replace->item_needs_writeback = 1; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); - return PTR_ERR(trans); - } - ret = btrfs_commit_transaction(trans); - WARN_ON(ret); + btrfs_dev_replace_write_unlock(dev_replace); - btrfs_info_in_rcu(fs_info, - "dev_replace from %s (devid %llu) to %s canceled", - btrfs_dev_name(src_device), src_device->devid, - btrfs_dev_name(tgt_device)); + btrfs_scrub_cancel(fs_info); - if (tgt_device) - btrfs_destroy_dev_replace_tgtdev(tgt_device); + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans); + WARN_ON(ret); + + btrfs_info_in_rcu(fs_info, + "suspended dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(tgt_device); + break; + default: + result = -EINVAL; + } -leave: mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return result; } @@ -887,6 +906,8 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) "cannot continue dev_replace, tgtdev is missing"); btrfs_info(fs_info, "you may cancel the operation after 'mount -o degraded'"); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; btrfs_dev_replace_write_unlock(dev_replace); return 0; } @@ -898,6 +919,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) * dev-replace to start anyway. */ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + btrfs_dev_replace_write_lock(dev_replace); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + btrfs_dev_replace_write_unlock(dev_replace); btrfs_info(fs_info, "cannot resume dev-replace, other exclusive operation running"); return 0; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 834a3f5ef642..b2dc613ebed2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -17,6 +17,7 @@ #include <linux/semaphore.h> #include <linux/error-injection.h> #include <linux/crc32c.h> +#include <linux/sched/mm.h> #include <asm/unaligned.h> #include "ctree.h" #include "disk-io.h" @@ -407,9 +408,9 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, return ret; } -static int verify_level_key(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid) +int btrfs_verify_level_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int level, + struct btrfs_key *first_key, u64 parent_transid) { int found_level; struct btrfs_key found_key; @@ -486,8 +487,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, if (verify_parent_transid(io_tree, eb, parent_transid, 0)) ret = -EIO; - else if (verify_level_key(fs_info, eb, level, - first_key, parent_transid)) + else if (btrfs_verify_level_key(fs_info, eb, level, + first_key, parent_transid)) ret = -EUCLEAN; else break; @@ -994,13 +995,18 @@ void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr) { struct extent_buffer *buf = NULL; struct inode *btree_inode = fs_info->btree_inode; + int ret; buf = btrfs_find_create_tree_block(fs_info, bytenr); if (IS_ERR(buf)) return; - read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, WAIT_NONE, 0); - free_extent_buffer(buf); + + ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, + WAIT_NONE, 0); + if (ret < 0) + free_extent_buffer_stale(buf); + else + free_extent_buffer(buf); } int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, @@ -1020,12 +1026,12 @@ int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK, mirror_num); if (ret) { - free_extent_buffer(buf); + free_extent_buffer_stale(buf); return ret; } if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { - free_extent_buffer(buf); + free_extent_buffer_stale(buf); return -EIO; } else if (extent_buffer_uptodate(buf)) { *eb = buf; @@ -1079,7 +1085,7 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid, level, first_key); if (ret) { - free_extent_buffer(buf); + free_extent_buffer_stale(buf); return ERR_PTR(ret); } return buf; @@ -1236,10 +1242,17 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *root; struct btrfs_key key; + unsigned int nofs_flag; int ret = 0; uuid_le uuid = NULL_UUID_LE; + /* + * We're holding a transaction handle, so use a NOFS memory allocation + * context to avoid deadlock if reclaim happens. + */ + nofs_flag = memalloc_nofs_save(); root = btrfs_alloc_root(fs_info, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); if (!root) return ERR_PTR(-ENOMEM); @@ -1656,9 +1669,8 @@ static int cleaner_kthread(void *arg) struct btrfs_root *root = arg; struct btrfs_fs_info *fs_info = root->fs_info; int again; - struct btrfs_trans_handle *trans; - do { + while (1) { again = 0; /* Make the cleaner go to sleep early. */ @@ -1707,42 +1719,16 @@ static int cleaner_kthread(void *arg) */ btrfs_delete_unused_bgs(fs_info); sleep: + if (kthread_should_park()) + kthread_parkme(); + if (kthread_should_stop()) + return 0; if (!again) { set_current_state(TASK_INTERRUPTIBLE); - if (!kthread_should_stop()) - schedule(); + schedule(); __set_current_state(TASK_RUNNING); } - } while (!kthread_should_stop()); - - /* - * Transaction kthread is stopped before us and wakes us up. - * However we might have started a new transaction and COWed some - * tree blocks when deleting unused block groups for example. So - * make sure we commit the transaction we started to have a clean - * shutdown when evicting the btree inode - if it has dirty pages - * when we do the final iput() on it, eviction will trigger a - * writeback for it which will fail with null pointer dereferences - * since work queues and other resources were already released and - * destroyed by the time the iput/eviction/writeback is made. - */ - trans = btrfs_attach_transaction(root); - if (IS_ERR(trans)) { - if (PTR_ERR(trans) != -ENOENT) - btrfs_err(fs_info, - "cleaner transaction attach returned %ld", - PTR_ERR(trans)); - } else { - int ret; - - ret = btrfs_commit_transaction(trans); - if (ret) - btrfs_err(fs_info, - "cleaner open transaction commit returned %d", - ret); } - - return 0; } static int transaction_kthread(void *arg) @@ -3923,6 +3909,13 @@ void close_ctree(struct btrfs_fs_info *fs_info) int ret; set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); + /* + * We don't want the cleaner to start new transactions, add more delayed + * iputs, etc. while we're closing. We can't use kthread_stop() yet + * because that frees the task_struct, and the transaction kthread might + * still try to wake up the cleaner. + */ + kthread_park(fs_info->cleaner_kthread); /* wait for the qgroup rescan worker to stop */ btrfs_qgroup_wait_for_completion(fs_info, false); @@ -3950,9 +3943,8 @@ void close_ctree(struct btrfs_fs_info *fs_info) if (!sb_rdonly(fs_info->sb)) { /* - * If the cleaner thread is stopped and there are - * block groups queued for removal, the deletion will be - * skipped when we quit the cleaner thread. + * The cleaner kthread is stopped, so do one final pass over + * unused block groups. */ btrfs_delete_unused_bgs(fs_info); @@ -4176,6 +4168,14 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->ordered_root_lock); } spin_unlock(&fs_info->ordered_root_lock); + + /* + * We need this here because if we've been flipped read-only we won't + * get sync() from the umount, so we need to make sure any ordered + * extents that haven't had their dirty pages IO start writeout yet + * actually get run and error out properly. + */ + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 4cccba22640f..7a4a60f26dbf 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -39,6 +39,9 @@ static inline u64 btrfs_sb_offset(int mirror) struct btrfs_device; struct btrfs_fs_devices; +int btrfs_verify_level_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int level, + struct btrfs_key *first_key, u64 parent_transid); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 parent_transid, int level, struct btrfs_key *first_key); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 51e41e53d4ae..0cc800d22a08 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3911,8 +3911,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) info->space_info_kobj, "%s", alloc_name(space_info->flags)); if (ret) { - percpu_counter_destroy(&space_info->total_bytes_pinned); - kfree(space_info); + kobject_put(&space_info->kobj); return ret; } @@ -5872,7 +5871,7 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, * * This is overestimating in most cases. */ - qgroup_rsv_size = outstanding_extents * fs_info->nodesize; + qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; spin_lock(&block_rsv->lock); block_rsv->size = reserve_size; @@ -8911,6 +8910,10 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out_free; } + err = btrfs_run_delayed_items(trans); + if (err) + goto out_end_trans; + if (block_rsv) trans->block_rsv = block_rsv; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4dd6faab02bb..90b0a6eff535 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3002,11 +3002,11 @@ static int __do_readpage(struct extent_io_tree *tree, */ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && prev_em_start && *prev_em_start != (u64)-1 && - *prev_em_start != em->orig_start) + *prev_em_start != em->start) force_bio_submit = true; if (prev_em_start) - *prev_em_start = em->orig_start; + *prev_em_start = em->start; free_extent_map(em); em = NULL; @@ -3928,12 +3928,25 @@ static int extent_write_cache_pages(struct address_space *mapping, range_whole = 1; scanned = 1; } - if (wbc->sync_mode == WB_SYNC_ALL) + + /* + * We do the tagged writepage as long as the snapshot flush bit is set + * and we are the first one who do the filemap_flush() on this inode. + * + * The nr_to_write == LONG_MAX is needed to make sure other flushers do + * not race in and drop the bit. + */ + if (range_whole && wbc->nr_to_write == LONG_MAX && + test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH, + &BTRFS_I(inode)->runtime_flags)) + wbc->tagged_writepages = 1; + + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; retry: - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !nr_to_write_done && (index <= end) && diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ca4902c66dc4..e24c0a69ff5d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2059,6 +2059,18 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) u64 len; /* + * If the inode needs a full sync, make sure we use a full range to + * avoid log tree corruption, due to hole detection racing with ordered + * extent completion for adjacent ranges, and assertion failures during + * hole detection. + */ + if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags)) { + start = 0; + end = LLONG_MAX; + } + + /* * The range length can be represented by u64, we have to do the typecasts * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync() */ @@ -2565,10 +2577,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state); - if (ret) { - inode_unlock(inode); + if (ret) goto out_only_mutex; - } path = btrfs_alloc_path(); if (!path) { @@ -3151,6 +3161,7 @@ static long btrfs_fallocate(struct file *file, int mode, ret = btrfs_qgroup_reserve_data(inode, &data_reserved, cur_offset, last_byte - cur_offset); if (ret < 0) { + cur_offset = last_byte; free_extent_map(em); break; } @@ -3200,7 +3211,7 @@ out: /* Let go of our reservation. */ if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE)) btrfs_free_reserved_data_space(inode, data_reserved, - alloc_start, alloc_end - cur_offset); + cur_offset, alloc_end - cur_offset); extent_changeset_free(data_reserved); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7158b5b77c9d..c1cd3fe2b295 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1373,7 +1373,8 @@ next_slot: * Do the same check as in btrfs_cross_ref_exist but * without the unnecessary search. */ - if (btrfs_file_extent_generation(leaf, fi) <= + if (!nolock && + btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out_check; if (extent_type == BTRFS_FILE_EXTENT_REG && !force) @@ -3150,9 +3151,6 @@ out: /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); - /* Try to release some metadata so we don't get an OOM but don't wait */ - btrfs_btree_balance_dirty_nodelay(fs_info); - return ret; } @@ -3688,6 +3686,21 @@ cache_index: * inode is not a directory, logging its parent unnecessarily. */ BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + /* + * Similar reasoning for last_link_trans, needs to be set otherwise + * for a case like the following: + * + * mkdir A + * touch foo + * ln foo A/bar + * echo 2 > /proc/sys/vm/drop_caches + * fsync foo + * <power failure> + * + * Would result in link bar and directory A not existing after the power + * failure. + */ + BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans; path->slots[0]++; if (inode->i_nlink != 1 || @@ -6413,8 +6426,18 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + name_len * 2); inode_inc_iversion(&parent_inode->vfs_inode); - parent_inode->vfs_inode.i_mtime = parent_inode->vfs_inode.i_ctime = - current_time(&parent_inode->vfs_inode); + /* + * If we are replaying a log tree, we do not want to update the mtime + * and ctime of the parent directory with the current time, since the + * log replay procedure is responsible for setting them to their correct + * values (the ones it had when the fsync was done). + */ + if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) { + struct timespec64 now = current_time(&parent_inode->vfs_inode); + + parent_inode->vfs_inode.i_mtime = now; + parent_inode->vfs_inode.i_ctime = now; + } ret = btrfs_update_inode(trans, root, &parent_inode->vfs_inode); if (ret) btrfs_abort_transaction(trans, ret); @@ -6427,14 +6450,19 @@ fail_dir_item: err = btrfs_del_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, &local_index, name, name_len); - + if (err) + btrfs_abort_transaction(trans, err); } else if (add_backref) { u64 local_index; int err; err = btrfs_del_inode_ref(trans, root, name, name_len, ino, parent_ino, &local_index); + if (err) + btrfs_abort_transaction(trans, err); } + + /* Return the original error code */ return ret; } @@ -6646,6 +6674,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (err) goto fail; } + BTRFS_I(inode)->last_link_trans = trans->transid; d_instantiate(dentry, inode); ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent, true, NULL); @@ -9174,6 +9203,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->index_cnt = (u64)-1; ei->dir_index = 0; ei->last_unlink_trans = 0; + ei->last_link_trans = 0; ei->last_log_commit = 0; spin_lock_init(&ei->lock); @@ -9985,7 +10015,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -static int start_delalloc_inodes(struct btrfs_root *root, int nr) +static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) { struct btrfs_inode *binode; struct inode *inode; @@ -10013,6 +10043,9 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr) } spin_unlock(&root->delalloc_lock); + if (snapshot) + set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, + &binode->runtime_flags); work = btrfs_alloc_delalloc_work(inode); if (!work) { iput(inode); @@ -10046,7 +10079,7 @@ out: return ret; } -int btrfs_start_delalloc_inodes(struct btrfs_root *root) +int btrfs_start_delalloc_snapshot(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -10054,7 +10087,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root) if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - ret = start_delalloc_inodes(root, -1); + ret = start_delalloc_inodes(root, -1, true); if (ret > 0) ret = 0; return ret; @@ -10083,7 +10116,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, nr); + ret = start_delalloc_inodes(root, nr, false); btrfs_put_fs_root(root); if (ret < 0) goto out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c9152155fcbf..0eb333c62fe4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -496,6 +496,16 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + /* + * If the fs is mounted with nologreplay, which requires it to be + * mounted in RO mode as well, we can not allow discard on free space + * inside block groups, because log trees refer to extents that are not + * pinned in a block group's free space cache (pinning the extents is + * precisely the first phase of replaying a log tree). + */ + if (btrfs_test_opt(fs_info, NOLOGREPLAY)) + return -EROFS; + rcu_read_lock(); list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, dev_list) { @@ -778,7 +788,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, wait_event(root->subv_writers->wait, percpu_counter_sum(&root->subv_writers->counter) == 0); - ret = btrfs_start_delalloc_inodes(root); + ret = btrfs_start_delalloc_snapshot(root); if (ret) goto dec_and_free; diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index dc6140013ae8..61d22a56c0ba 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -366,11 +366,11 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, static int prop_compression_validate(const char *value, size_t len) { - if (!strncmp("lzo", value, len)) + if (!strncmp("lzo", value, 3)) return 0; - else if (!strncmp("zlib", value, len)) + else if (!strncmp("zlib", value, 4)) return 0; - else if (!strncmp("zstd", value, len)) + else if (!strncmp("zstd", value, 4)) return 0; return -EINVAL; @@ -396,7 +396,7 @@ static int prop_compression_apply(struct inode *inode, btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); } else if (!strncmp("zlib", value, 4)) { type = BTRFS_COMPRESS_ZLIB; - } else if (!strncmp("zstd", value, len)) { + } else if (!strncmp("zstd", value, 4)) { type = BTRFS_COMPRESS_ZSTD; btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); } else { diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index ff434663d65b..e46e83e87600 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1013,16 +1013,22 @@ out_add_root: btrfs_abort_transaction(trans, ret); goto out_free_path; } - spin_lock(&fs_info->qgroup_lock); - fs_info->quota_root = quota_root; - set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - spin_unlock(&fs_info->qgroup_lock); ret = btrfs_commit_transaction(trans); trans = NULL; if (ret) goto out_free_path; + /* + * Set quota enabled flag after committing the transaction, to avoid + * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot + * creation. + */ + spin_lock(&fs_info->qgroup_lock); + fs_info->quota_root = quota_root; + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + spin_unlock(&fs_info->qgroup_lock); + ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info); @@ -2421,16 +2427,15 @@ out: /* * Two limits to commit transaction in advance. * - * For RATIO, it will be 1/RATIO of the remaining limit - * (excluding data and prealloc meta) as threshold. + * For RATIO, it will be 1/RATIO of the remaining limit as threshold. * For SIZE, it will be in byte unit as threshold. */ -#define QGROUP_PERTRANS_RATIO 32 -#define QGROUP_PERTRANS_SIZE SZ_32M +#define QGROUP_FREE_RATIO 32 +#define QGROUP_FREE_SIZE SZ_32M static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, const struct btrfs_qgroup *qg, u64 num_bytes) { - u64 limit; + u64 free; u64 threshold; if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && @@ -2449,20 +2454,21 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, */ if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER | BTRFS_QGROUP_LIMIT_MAX_EXCL))) { - if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) - limit = qg->max_excl; - else - limit = qg->max_rfer; - threshold = (limit - qg->rsv.values[BTRFS_QGROUP_RSV_DATA] - - qg->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC]) / - QGROUP_PERTRANS_RATIO; - threshold = min_t(u64, threshold, QGROUP_PERTRANS_SIZE); + if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { + free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl; + threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO, + QGROUP_FREE_SIZE); + } else { + free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer; + threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO, + QGROUP_FREE_SIZE); + } /* * Use transaction_kthread to commit transaction, so we no * longer need to bother nested transaction nor lock context. */ - if (qg->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > threshold) + if (free < threshold) btrfs_commit_transaction_locksafe(fs_info); } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index df41d7049936..927f9f3daddb 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2429,8 +2429,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, bitmap_clear(rbio->dbitmap, pagenr, 1); kunmap(p); - for (stripe = 0; stripe < rbio->real_stripes; stripe++) + for (stripe = 0; stripe < nr_data; stripe++) kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + kunmap(p_page); } __free_page(p_page); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0526b6c473c7..5d57ed629345 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4289,27 +4289,36 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) mutex_lock(&fs_info->cleaner_mutex); ret = relocate_block_group(rc); mutex_unlock(&fs_info->cleaner_mutex); - if (ret < 0) { + if (ret < 0) err = ret; - goto out; - } - - if (rc->extents_found == 0) - break; - - btrfs_info(fs_info, "found %llu extents", rc->extents_found); + /* + * We may have gotten ENOSPC after we already dirtied some + * extents. If writeout happens while we're relocating a + * different block group we could end up hitting the + * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in + * btrfs_reloc_cow_block. Make sure we write everything out + * properly so we don't trip over this problem, and then break + * out of the loop if we hit an error. + */ if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { ret = btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); - if (ret) { + if (ret) err = ret; - goto out; - } invalidate_mapping_pages(rc->data_inode->i_mapping, 0, -1); rc->stage = UPDATE_DATA_PTRS; } + + if (err < 0) + goto out; + + if (rc->extents_found == 0) + break; + + btrfs_info(fs_info, "found %llu extents", rc->extents_found); + } WARN_ON(rc->block_group->pinned > 0); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 65bda0682928..3228d3b3084a 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -132,16 +132,17 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root return -ENOMEM; ret = btrfs_search_slot(trans, root, key, path, 0, 1); - if (ret < 0) { - btrfs_abort_transaction(trans, ret); + if (ret < 0) goto out; - } - if (ret != 0) { - btrfs_print_leaf(path->nodes[0]); - btrfs_crit(fs_info, "unable to update root key %llu %u %llu", - key->objectid, key->type, key->offset); - BUG_ON(1); + if (ret > 0) { + btrfs_crit(fs_info, + "unable to find root key (%llu %u %llu) in tree %llu", + key->objectid, key->type, key->offset, + root->root_key.objectid); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + goto out; } l = path->nodes[0]; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 84cb6e5ef36c..258392b75048 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5021,6 +5021,12 @@ static int send_hole(struct send_ctx *sctx, u64 end) if (offset >= sctx->cur_inode_size) return 0; + /* + * Don't go beyond the inode's i_size due to prealloc extents that start + * after the i_size. + */ + end = min_t(u64, end, sctx->cur_inode_size); + if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, end - offset); @@ -6583,6 +6589,38 @@ commit_trans: return btrfs_commit_transaction(trans); } +/* + * Make sure any existing dellaloc is flushed for any root used by a send + * operation so that we do not miss any data and we do not race with writeback + * finishing and changing a tree while send is using the tree. This could + * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and + * a send operation then uses the subvolume. + * After flushing delalloc ensure_commit_roots_uptodate() must be called. + */ +static int flush_delalloc_roots(struct send_ctx *sctx) +{ + struct btrfs_root *root = sctx->parent_root; + int ret; + int i; + + if (root) { + ret = btrfs_start_delalloc_snapshot(root); + if (ret) + return ret; + btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); + } + + for (i = 0; i < sctx->clone_roots_cnt; i++) { + root = sctx->clone_roots[i].root; + ret = btrfs_start_delalloc_snapshot(root); + if (ret) + return ret; + btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); + } + + return 0; +} + static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) { spin_lock(&root->root_item_lock); @@ -6807,6 +6845,10 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) NULL); sort_clone_roots = 1; + ret = flush_delalloc_roots(sctx); + if (ret) + goto out; + ret = ensure_commit_roots_uptodate(sctx); if (ret) goto out; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8ad145820ea8..8888337a95b6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1677,6 +1677,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, flags | SB_RDONLY, device_name, data); if (IS_ERR(mnt_root)) { root = ERR_CAST(mnt_root); + kfree(subvol_name); goto out; } @@ -1686,12 +1687,14 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (error < 0) { root = ERR_PTR(error); mntput(mnt_root); + kfree(subvol_name); goto out; } } } if (IS_ERR(mnt_root)) { root = ERR_CAST(mnt_root); + kfree(subvol_name); goto out; } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 3717c864ba23..aefb0169d46d 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -811,7 +811,12 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, fs_devs->fsid_kobj.kset = btrfs_kset; error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, parent, "%pU", fs_devs->fsid); - return error; + if (error) { + kobject_put(&fs_devs->fsid_kobj); + return error; + } + + return 0; } int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 16ecb76fa53c..0d5840d20efc 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3038,6 +3038,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, log->log_transid = root->log_transid; root->log_start_pid = 0; /* + * Update or create log root item under the root's log_mutex to prevent + * races with concurrent log syncs that can lead to failure to update + * log root item because it was not created yet. + */ + ret = update_log_root(trans, log); + /* * IO has been started, blocks of the log tree have WRITTEN flag set * in their headers. new modifications of the log will be written to * new positions. so it's safe to allow log writers to go in. @@ -3056,8 +3062,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); - ret = update_log_root(trans, log); - mutex_lock(&log_root_tree->log_mutex); if (atomic_dec_and_test(&log_root_tree->log_writers)) { /* atomic_dec_and_test implies a barrier */ @@ -3532,9 +3536,16 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - /* find the first key from this transaction again */ + /* + * Find the first key from this transaction again. See the note for + * log_new_dir_dentries, if we're logging a directory recursively we + * won't be holding its i_mutex, which means we can modify the directory + * while we're logging it. If we remove an entry between our first + * search and this search we'll not find the key again and can just + * bail. + */ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); - if (WARN_ON(ret != 0)) + if (ret != 0) goto done; /* @@ -4114,6 +4125,7 @@ fill_holes: *last_extent, 0, 0, len, 0, len, 0, 0, 0); + *last_extent += len; } } } @@ -4504,6 +4516,19 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); *size_ret = btrfs_inode_size(path->nodes[0], item); + /* + * If the in-memory inode's i_size is smaller then the inode + * size stored in the btree, return the inode's i_size, so + * that we get a correct inode size after replaying the log + * when before a power failure we had a shrinking truncate + * followed by addition of a new name (rename / new hard link). + * Otherwise return the inode size from the btree, to avoid + * data loss when replaying a log due to previously doing a + * write that expands the inode's size and logging a new name + * immediately after. + */ + if (*size_ret > inode->vfs_inode.i_size) + *size_ret = inode->vfs_inode.i_size; } btrfs_release_path(path); @@ -4665,15 +4690,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, extent) == - BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(leaf, extent); - ASSERT(len == i_size || - (len == fs_info->sectorsize && - btrfs_file_extent_compression(leaf, extent) != - BTRFS_COMPRESS_NONE) || - (len < i_size && i_size < fs_info->sectorsize)); + BTRFS_FILE_EXTENT_INLINE) return 0; - } len = btrfs_file_extent_num_bytes(leaf, extent); /* Last extent goes beyond i_size, no need to log a hole. */ @@ -5294,7 +5312,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, { int ret = 0; struct dentry *old_parent = NULL; - struct btrfs_inode *orig_inode = inode; /* * for regular files, if its inode is already on disk, we don't @@ -5314,16 +5331,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, } while (1) { - /* - * If we are logging a directory then we start with our inode, - * not our parent's inode, so we need to skip setting the - * logged_trans so that further down in the log code we don't - * think this inode has already been logged. - */ - if (inode != orig_inode) - inode->logged_trans = trans->transid; - smp_mb(); - if (btrfs_must_commit_transaction(trans, inode)) { ret = 1; break; @@ -5781,6 +5788,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } + /* + * If a new hard link was added to the inode in the current transaction + * and its link count is now greater than 1, we need to fallback to a + * transaction commit, otherwise we can end up not logging all its new + * parents for all the hard links. Here just from the dentry used to + * fsync, we can not visit the ancestor inodes for all the other hard + * links to figure out if any is new, so we fallback to a transaction + * commit (instead of adding a lot of complexity of scanning a btree, + * since this scenario is not a common use case). + */ + if (inode->vfs_inode.i_nlink > 1 && + inode->last_link_trans > last_committed) { + ret = -EMLINK; + goto end_trans; + } + while (1) { if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) break; @@ -6036,7 +6059,6 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, * if this directory was already logged any new * names for this file/dir will get recorded */ - smp_mb(); if (dir->logged_trans == trans->transid) return; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4405e430da6..207f4e87445d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -850,6 +850,35 @@ static noinline struct btrfs_device *device_list_add(const char *path, return ERR_PTR(-EEXIST); } + /* + * We are going to replace the device path for a given devid, + * make sure it's the same device if the device is mounted + */ + if (device->bdev) { + struct block_device *path_bdev; + + path_bdev = lookup_bdev(path); + if (IS_ERR(path_bdev)) { + mutex_unlock(&fs_devices->device_list_mutex); + return ERR_CAST(path_bdev); + } + + if (device->bdev != path_bdev) { + bdput(path_bdev); + mutex_unlock(&fs_devices->device_list_mutex); + btrfs_warn_in_rcu(device->fs_info, + "duplicate device fsid:devid for %pU:%llu old:%s new:%s", + disk_super->fsid, devid, + rcu_str_deref(device->name), path); + return ERR_PTR(-EEXIST); + } + bdput(path_bdev); + btrfs_info_in_rcu(device->fs_info, + "device fsid %pU devid %llu moved old:%s new:%s", + disk_super->fsid, devid, + rcu_str_deref(device->name), path); + } + name = rcu_string_strdup(path, GFP_NOFS); if (!name) { mutex_unlock(&fs_devices->device_list_mutex); @@ -3712,6 +3741,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, int ret; u64 num_devices; unsigned seq; + bool reducing_integrity; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || @@ -3796,24 +3826,30 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, !(bctl->sys.target & allowed)) || ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && (fs_info->avail_metadata_alloc_bits & allowed) && - !(bctl->meta.target & allowed))) { - if (bctl->flags & BTRFS_BALANCE_FORCE) { - btrfs_info(fs_info, - "balance: force reducing metadata integrity"); - } else { - btrfs_err(fs_info, - "balance: reduces metadata integrity, use --force if you want this"); - ret = -EINVAL; - goto out; - } - } + !(bctl->meta.target & allowed))) + reducing_integrity = true; + else + reducing_integrity = false; + + /* if we're not converting, the target field is uninitialized */ + meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->meta.target : fs_info->avail_metadata_alloc_bits; + data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->data.target : fs_info->avail_data_alloc_bits; } while (read_seqretry(&fs_info->profiles_lock, seq)); - /* if we're not converting, the target field is uninitialized */ - meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? - bctl->meta.target : fs_info->avail_metadata_alloc_bits; - data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? - bctl->data.target : fs_info->avail_data_alloc_bits; + if (reducing_integrity) { + if (bctl->flags & BTRFS_BALANCE_FORCE) { + btrfs_info(fs_info, + "balance: force reducing metadata integrity"); + } else { + btrfs_err(fs_info, + "balance: reduces metadata integrity, use --force if you want this"); + ret = -EINVAL; + goto out; + } + } + if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { int meta_index = btrfs_bg_flags_to_raid_index(meta_target); @@ -4761,19 +4797,17 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, /* * Use the number of data stripes to figure out how big this chunk * is really going to be in terms of logical address space, - * and compare that answer with the max chunk size + * and compare that answer with the max chunk size. If it's higher, + * we try to reduce stripe_size. */ if (stripe_size * data_stripes > max_chunk_size) { - stripe_size = div_u64(max_chunk_size, data_stripes); - - /* bump the answer up to a 16MB boundary */ - stripe_size = round_up(stripe_size, SZ_16M); - /* - * But don't go higher than the limits we found while searching - * for free extents + * Reduce stripe_size, round it up to a 16MB boundary again and + * then use it, unless it ends up being even bigger than the + * previous value we had already. */ - stripe_size = min(devices_info[ndevs - 1].max_avail, + stripe_size = min(round_up(div_u64(max_chunk_size, + data_stripes), SZ_16M), stripe_size); } @@ -6017,7 +6051,7 @@ static void btrfs_end_bio(struct bio *bio) if (bio_op(bio) == REQ_OP_WRITE) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - else + else if (!(bio->bi_opf & REQ_RAHEAD)) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); if (bio->bi_opf & REQ_PREFLUSH) @@ -6391,10 +6425,10 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, } if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || - (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || + (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) || (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || - (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) || + (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != 1)) { btrfs_err(fs_info, @@ -7467,6 +7501,8 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) struct btrfs_path *path; struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; + u64 prev_devid = 0; + u64 prev_dev_ext_end = 0; int ret = 0; key.objectid = 1; @@ -7511,10 +7547,22 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext); physical_len = btrfs_dev_extent_length(leaf, dext); + /* Check if this dev extent overlaps with the previous one */ + if (devid == prev_devid && physical_offset < prev_dev_ext_end) { + btrfs_err(fs_info, +"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", + devid, physical_offset, prev_dev_ext_end); + ret = -EUCLEAN; + goto out; + } + ret = verify_one_dev_extent(fs_info, chunk_offset, devid, physical_offset, physical_len); if (ret < 0) goto out; + prev_devid = devid; + prev_dev_ext_end = physical_offset + physical_len; + ret = btrfs_next_item(root, path); if (ret < 0) goto out; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index ea78c3d6dcfc..f141b45ce349 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -11,6 +11,7 @@ #include <linux/security.h> #include <linux/posix_acl_xattr.h> #include <linux/iversion.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" @@ -422,9 +423,15 @@ static int btrfs_initxattrs(struct inode *inode, { const struct xattr *xattr; struct btrfs_trans_handle *trans = fs_info; + unsigned int nofs_flag; char *name; int err = 0; + /* + * We're holding a transaction handle, so use a NOFS memory allocation + * context to avoid deadlock if reclaim happens. + */ + nofs_flag = memalloc_nofs_save(); for (xattr = xattr_array; xattr->name != NULL; xattr++) { name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(xattr->name) + 1, GFP_KERNEL); @@ -440,6 +447,7 @@ static int btrfs_initxattrs(struct inode *inode, if (err < 0) break; } + memalloc_nofs_restore(nofs_flag); return err; } diff --git a/fs/buffer.c b/fs/buffer.c index baa915ac13cd..a5b3a456dbff 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -200,6 +200,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) struct buffer_head *head; struct page *page; int all_mapped = 1; + static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1); index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED); @@ -227,15 +228,15 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) * file io on the block device and getblk. It gets dealt with * elsewhere, don't buffer_error if we had some unmapped buffers */ - if (all_mapped) { - printk("__find_get_block_slow() failed. " - "block=%llu, b_blocknr=%llu\n", - (unsigned long long)block, - (unsigned long long)bh->b_blocknr); - printk("b_state=0x%08lx, b_size=%zu\n", - bh->b_state, bh->b_size); - printk("device %pg blocksize: %d\n", bdev, - 1 << bd_inode->i_blkbits); + ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE); + if (all_mapped && __ratelimit(&last_warned)) { + printk("__find_get_block_slow() failed. block=%llu, " + "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, " + "device %pg blocksize: %d\n", + (unsigned long long)block, + (unsigned long long)bh->b_blocknr, + bh->b_state, bh->b_size, bdev, + 1 << bd_inode->i_blkbits); } out_unlock: spin_unlock(&bd_mapping->private_lock); @@ -3018,6 +3019,13 @@ void guard_bio_eod(int op, struct bio *bio) /* Uhhuh. We've got a bio that straddles the device size! */ truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9); + /* + * The bio contains more than one segment which spans EOD, just return + * and let IO layer turn it into an EIO + */ + if (truncated_bytes > bvec->bv_len) + return; + /* Truncate the bio.. */ bio->bi_iter.bi_size -= truncated_bytes; bvec->bv_len -= truncated_bytes; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index dd7dfdd2ba13..c7542e8dd096 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1030,6 +1030,8 @@ static void drop_inode_snap_realm(struct ceph_inode_info *ci) list_del_init(&ci->i_snap_realm_item); ci->i_snap_realm_counter++; ci->i_snap_realm = NULL; + if (realm->ino == ci->i_vino.ino) + realm->inode = NULL; spin_unlock(&realm->inodes_with_caps_lock); ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc, realm); @@ -3566,7 +3568,6 @@ retry: tcap->cap_id = t_cap_id; tcap->seq = t_seq - 1; tcap->issue_seq = t_seq - 1; - tcap->mseq = t_mseq; tcap->issued |= issued; tcap->implemented |= issued; if (cap == ci->i_auth_cap) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 82928cea0209..7f3f64ba464f 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1470,6 +1470,7 @@ void ceph_dentry_lru_del(struct dentry *dn) unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) { struct ceph_inode_info *dci = ceph_inode(dir); + unsigned hash; switch (dci->i_dir_layout.dl_dir_hash) { case 0: /* for backward compat */ @@ -1477,8 +1478,11 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) return dn->d_name.hash; default: - return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, + spin_lock(&dn->d_lock); + hash = ceph_str_hash(dci->i_dir_layout.dl_dir_hash, dn->d_name.name, dn->d_name.len); + spin_unlock(&dn->d_lock); + return hash; } } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 4055ab4d5c52..3e518c2ae2bf 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -524,6 +524,7 @@ static void ceph_i_callback(struct rcu_head *head) struct inode *inode = container_of(head, struct inode, i_rcu); struct ceph_inode_info *ci = ceph_inode(inode); + kfree(ci->i_symlink); kmem_cache_free(ceph_inode_cachep, ci); } @@ -561,7 +562,6 @@ void ceph_destroy_inode(struct inode *inode) ceph_put_snap_realm(mdsc, realm); } - kfree(ci->i_symlink); while ((n = rb_first(&ci->i_fragtree)) != NULL) { frag = rb_entry(n, struct ceph_inode_frag, node); rb_erase(n, &ci->i_fragtree); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index bc43c822426a..bfcf11c70bfa 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1290,6 +1290,15 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); ci->i_prealloc_cap_flush = NULL; } + + if (drop && + ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } } spin_unlock(&ci->i_ceph_lock); while (!list_empty(&to_remove)) { @@ -1945,10 +1954,39 @@ retry: return path; } +/* Duplicate the dentry->d_name.name safely */ +static int clone_dentry_name(struct dentry *dentry, const char **ppath, + int *ppathlen) +{ + u32 len; + char *name; + +retry: + len = READ_ONCE(dentry->d_name.len); + name = kmalloc(len + 1, GFP_NOFS); + if (!name) + return -ENOMEM; + + spin_lock(&dentry->d_lock); + if (dentry->d_name.len != len) { + spin_unlock(&dentry->d_lock); + kfree(name); + goto retry; + } + memcpy(name, dentry->d_name.name, len); + spin_unlock(&dentry->d_lock); + + name[len] = '\0'; + *ppath = name; + *ppathlen = len; + return 0; +} + static int build_dentry_path(struct dentry *dentry, struct inode *dir, const char **ppath, int *ppathlen, u64 *pino, - int *pfreepath) + bool *pfreepath, bool parent_locked) { + int ret; char *path; rcu_read_lock(); @@ -1957,8 +1995,15 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, if (dir && ceph_snap(dir) == CEPH_NOSNAP) { *pino = ceph_ino(dir); rcu_read_unlock(); - *ppath = dentry->d_name.name; - *ppathlen = dentry->d_name.len; + if (parent_locked) { + *ppath = dentry->d_name.name; + *ppathlen = dentry->d_name.len; + } else { + ret = clone_dentry_name(dentry, ppath, ppathlen); + if (ret) + return ret; + *pfreepath = true; + } return 0; } rcu_read_unlock(); @@ -1966,13 +2011,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, if (IS_ERR(path)) return PTR_ERR(path); *ppath = path; - *pfreepath = 1; + *pfreepath = true; return 0; } static int build_inode_path(struct inode *inode, const char **ppath, int *ppathlen, u64 *pino, - int *pfreepath) + bool *pfreepath) { struct dentry *dentry; char *path; @@ -1988,7 +2033,7 @@ static int build_inode_path(struct inode *inode, if (IS_ERR(path)) return PTR_ERR(path); *ppath = path; - *pfreepath = 1; + *pfreepath = true; return 0; } @@ -1999,7 +2044,7 @@ static int build_inode_path(struct inode *inode, static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, struct inode *rdiri, const char *rpath, u64 rino, const char **ppath, int *pathlen, - u64 *ino, int *freepath) + u64 *ino, bool *freepath, bool parent_locked) { int r = 0; @@ -2009,7 +2054,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, ceph_snap(rinode)); } else if (rdentry) { r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, - freepath); + freepath, parent_locked); dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); } else if (rpath || rino) { @@ -2035,7 +2080,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, const char *path2 = NULL; u64 ino1 = 0, ino2 = 0; int pathlen1 = 0, pathlen2 = 0; - int freepath1 = 0, freepath2 = 0; + bool freepath1 = false, freepath2 = false; int len; u16 releases; void *p, *end; @@ -2043,16 +2088,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ret = set_request_path_attr(req->r_inode, req->r_dentry, req->r_parent, req->r_path1, req->r_ino1.ino, - &path1, &pathlen1, &ino1, &freepath1); + &path1, &pathlen1, &ino1, &freepath1, + test_bit(CEPH_MDS_R_PARENT_LOCKED, + &req->r_req_flags)); if (ret < 0) { msg = ERR_PTR(ret); goto out; } + /* If r_old_dentry is set, then assume that its parent is locked */ ret = set_request_path_attr(NULL, req->r_old_dentry, req->r_old_dentry_dir, req->r_path2, req->r_ino2.ino, - &path2, &pathlen2, &ino2, &freepath2); + &path2, &pathlen2, &ino2, &freepath2, true); if (ret < 0) { msg = ERR_PTR(ret); goto out_free1; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 041c27ea8de1..1f46b02f7314 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -568,7 +568,12 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) old_snapc = NULL; update_snapc: - if (ci->i_head_snapc) { + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ci->i_head_snapc = NULL; + } else { ci->i_head_snapc = ceph_get_snap_context(new_snapc); dout(" new snapc is %p\n", new_snapc); } @@ -616,7 +621,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->size); spin_lock(&mdsc->snap_flush_lock); - list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); + if (list_empty(&ci->i_snap_flush_item)) + list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); spin_unlock(&mdsc->snap_flush_lock); return 1; /* caller may want to ceph_flush_snaps */ } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index eab1359d0553..c5cf46e43f2e 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -819,6 +819,12 @@ static void ceph_umount_begin(struct super_block *sb) return; } +static int ceph_remount(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + return 0; +} + static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .destroy_inode = ceph_destroy_inode, @@ -826,6 +832,7 @@ static const struct super_operations ceph_super_ops = { .drop_inode = ceph_drop_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, + .remount_fs = ceph_remount, .show_options = ceph_show_options, .statfs = ceph_statfs, .umount_begin = ceph_umount_begin, diff --git a/fs/char_dev.c b/fs/char_dev.c index a279c58fe360..8a63cfa29005 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -159,6 +159,12 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, ret = -EBUSY; goto out; } + + if (new_min < old_min && new_max > old_max) { + ret = -EBUSY; + goto out; + } + } cd->next = *cp; diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index abcd78e332fe..85dadb93c992 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -133,7 +133,7 @@ config CIFS_XATTR config CIFS_POSIX bool "CIFS POSIX Extensions" - depends on CIFS_XATTR + depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR help Enabling this option will cause the cifs client to attempt to negotiate a newer dialect with servers, such as Samba 3.0.5 diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 6b61df117fd4..563e2f6268c3 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -271,9 +271,9 @@ static void dump_referral(const struct dfs_info3_param *ref) { cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name); cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name); - cifs_dbg(FYI, "DFS: fl: %hd, srv_type: %hd\n", + cifs_dbg(FYI, "DFS: fl: %d, srv_type: %d\n", ref->flags, ref->server_type); - cifs_dbg(FYI, "DFS: ref_flags: %hd, path_consumed: %hd\n", + cifs_dbg(FYI, "DFS: ref_flags: %d, path_consumed: %d\n", ref->ref_flag, ref->path_consumed); } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 9dcaed031843..6f227cc781e5 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -235,6 +235,8 @@ struct smb_version_operations { int * (*get_credits_field)(struct TCP_Server_Info *, const int); unsigned int (*get_credits)(struct mid_q_entry *); __u64 (*get_next_mid)(struct TCP_Server_Info *); + void (*revert_current_mid)(struct TCP_Server_Info *server, + const unsigned int val); /* data offset from read response message */ unsigned int (*read_data_offset)(char *); /* @@ -756,6 +758,22 @@ get_next_mid(struct TCP_Server_Info *server) return cpu_to_le16(mid); } +static inline void +revert_current_mid(struct TCP_Server_Info *server, const unsigned int val) +{ + if (server->ops->revert_current_mid) + server->ops->revert_current_mid(server, val); +} + +static inline void +revert_current_mid_from_hdr(struct TCP_Server_Info *server, + const struct smb2_sync_hdr *shdr) +{ + unsigned int num = le16_to_cpu(shdr->CreditCharge); + + return revert_current_mid(server, num > 0 ? num : 1); +} + static inline __u16 get_mid(const struct smb_hdr *smb) { @@ -1245,6 +1263,7 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file) } struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file); +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr); void cifsFileInfo_put(struct cifsFileInfo *cifs_file); #define CIFS_CACHE_READ_FLG 1 @@ -1391,6 +1410,7 @@ struct mid_q_entry { struct kref refcount; struct TCP_Server_Info *server; /* server corresponding to this mid */ __u64 mid; /* multiplex id */ + __u16 credits; /* number of credits consumed by this mid */ __u32 pid; /* process id */ __u32 sequence_number; /* for CIFS signing */ unsigned long when_alloc; /* when mid was created */ @@ -1744,6 +1764,7 @@ GLOBAL_EXTERN spinlock_t gidsidlock; #endif /* CONFIG_CIFS_ACL */ void cifs_oplock_break(struct work_struct *work); +void cifs_queue_oplock_break(struct cifsFileInfo *cfile); extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 5657b79dbc99..269471c8f42b 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1458,18 +1458,26 @@ cifs_discard_remaining_data(struct TCP_Server_Info *server) } static int -cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +__cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid, + bool malformed) { int length; - struct cifs_readdata *rdata = mid->callback_data; length = cifs_discard_remaining_data(server); - dequeue_mid(mid, rdata->result); + dequeue_mid(mid, malformed); mid->resp_buf = server->smallbuf; server->smallbuf = NULL; return length; } +static int +cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + struct cifs_readdata *rdata = mid->callback_data; + + return __cifs_readv_discard(server, mid, rdata->result); +} + int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) { @@ -1511,12 +1519,23 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) return -1; } + /* set up first two iov for signature check and to get credits */ + rdata->iov[0].iov_base = buf; + rdata->iov[0].iov_len = 4; + rdata->iov[1].iov_base = buf + 4; + rdata->iov[1].iov_len = server->total_read - 4; + cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", + rdata->iov[0].iov_base, rdata->iov[0].iov_len); + cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n", + rdata->iov[1].iov_base, rdata->iov[1].iov_len); + /* Was the SMB read successful? */ rdata->result = server->ops->map_error(buf, false); if (rdata->result != 0) { cifs_dbg(FYI, "%s: server returned error %d\n", __func__, rdata->result); - return cifs_readv_discard(server, mid); + /* normal error on read response */ + return __cifs_readv_discard(server, mid, false); } /* Is there enough to get to the rest of the READ_RSP header? */ @@ -1560,14 +1579,6 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) server->total_read += length; } - /* set up first iov for signature check */ - rdata->iov[0].iov_base = buf; - rdata->iov[0].iov_len = 4; - rdata->iov[1].iov_base = buf + 4; - rdata->iov[1].iov_len = server->total_read - 4; - cifs_dbg(FYI, "0: iov_base=%p iov_len=%u\n", - rdata->iov[0].iov_base, server->total_read); - /* how much data is in the response? */ #ifdef CONFIG_CIFS_SMB_DIRECT use_rdma_mr = rdata->mr; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 52d71b64c0c6..f31339db45fd 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -50,6 +50,7 @@ #include "cifs_unicode.h" #include "cifs_debug.h" #include "cifs_fs_sb.h" +#include "dns_resolve.h" #include "ntlmssp.h" #include "nterr.h" #include "rfc1002pdu.h" @@ -318,6 +319,53 @@ static int cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, const char *devname, bool is_smb3); /* + * Resolve hostname and set ip addr in tcp ses. Useful for hostnames that may + * get their ip addresses changed at some point. + * + * This should be called with server->srv_mutex held. + */ +#ifdef CONFIG_CIFS_DFS_UPCALL +static int reconn_set_ipaddr(struct TCP_Server_Info *server) +{ + int rc; + int len; + char *unc, *ipaddr = NULL; + + if (!server->hostname) + return -EINVAL; + + len = strlen(server->hostname) + 3; + + unc = kmalloc(len, GFP_KERNEL); + if (!unc) { + cifs_dbg(FYI, "%s: failed to create UNC path\n", __func__); + return -ENOMEM; + } + snprintf(unc, len, "\\\\%s", server->hostname); + + rc = dns_resolve_server_name_to_ip(unc, &ipaddr); + kfree(unc); + + if (rc < 0) { + cifs_dbg(FYI, "%s: failed to resolve server part of %s to IP: %d\n", + __func__, server->hostname, rc); + return rc; + } + + rc = cifs_convert_address((struct sockaddr *)&server->dstaddr, ipaddr, + strlen(ipaddr)); + kfree(ipaddr); + + return !rc ? -1 : 0; +} +#else +static inline int reconn_set_ipaddr(struct TCP_Server_Info *server) +{ + return 0; +} +#endif + +/* * cifs tcp session reconnection * * mark tcp session as reconnecting so temporarily locked @@ -417,6 +465,11 @@ cifs_reconnect(struct TCP_Server_Info *server) rc = generic_ip_connect(server); if (rc) { cifs_dbg(FYI, "reconnect error %d\n", rc); + rc = reconn_set_ipaddr(server); + if (rc) { + cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n", + __func__, rc); + } mutex_unlock(&server->srv_mutex); msleep(3000); } else { @@ -533,6 +586,21 @@ server_unresponsive(struct TCP_Server_Info *server) return false; } +static inline bool +zero_credits(struct TCP_Server_Info *server) +{ + int val; + + spin_lock(&server->req_lock); + val = server->credits + server->echo_credits + server->oplock_credits; + if (server->in_flight == 0 && val == 0) { + spin_unlock(&server->req_lock); + return true; + } + spin_unlock(&server->req_lock); + return false; +} + static int cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) { @@ -545,6 +613,12 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) for (total_read = 0; msg_data_left(smb_msg); total_read += length) { try_to_freeze(); + /* reconnect if no credits and no requests in flight */ + if (zero_credits(server)) { + cifs_reconnect(server); + return -ECONNABORTED; + } + if (server_unresponsive(server)) return -ECONNABORTED; if (cifs_rdma_enabled(server) && server->smbd_conn) @@ -1273,6 +1347,11 @@ cifs_parse_devname(const char *devname, struct smb_vol *vol) const char *delims = "/\\"; size_t len; + if (unlikely(!devname || !*devname)) { + cifs_dbg(VFS, "Device name not specified.\n"); + return -EINVAL; + } + /* make sure we have a valid UNC double delimiter prefix */ len = strspn(devname, delims); if (len != 2) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8d41ca7bfcf1..23cee91ed442 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -358,13 +358,31 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file) return cifs_file; } -/* - * Release a reference on the file private data. This may involve closing - * the filehandle out on the server. Must be called without holding - * tcon->open_file_lock and cifs_file->file_info_lock. +/** + * cifsFileInfo_put - release a reference of file priv data + * + * Always potentially wait for oplock handler. See _cifsFileInfo_put(). */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { + _cifsFileInfo_put(cifs_file, true); +} + +/** + * _cifsFileInfo_put - release a reference of file priv data + * + * This may involve closing the filehandle @cifs_file out on the + * server. Must be called without holding tcon->open_file_lock and + * cifs_file->file_info_lock. + * + * If @wait_for_oplock_handler is true and we are releasing the last + * reference, wait for any running oplock break handler of the file + * and cancel any pending one. If calling this function from the + * oplock break handler, you need to pass false. + * + */ +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) +{ struct inode *inode = d_inode(cifs_file->dentry); struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); struct TCP_Server_Info *server = tcon->ses->server; @@ -411,7 +429,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) spin_unlock(&tcon->open_file_lock); - oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break); + oplock_break_cancelled = wait_oplock_handler ? + cancel_work_sync(&cifs_file->oplock_break) : false; if (!tcon->need_reconnect && !cifs_file->invalidHandle) { struct TCP_Server_Info *server = tcon->ses->server; @@ -1120,14 +1139,18 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) /* * Accessing maxBuf is racy with cifs_reconnect - need to store value - * and check it for zero before using. + * and check it before using. */ max_buf = tcon->ses->server->maxBuf; - if (!max_buf) { + if (max_buf < (sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE))) { free_xid(xid); return -EINVAL; } + BUILD_BUG_ON(sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE) > + PAGE_SIZE); + max_buf = min_t(unsigned int, max_buf - sizeof(struct smb_hdr), + PAGE_SIZE); max_num = (max_buf - sizeof(struct smb_hdr)) / sizeof(LOCKING_ANDX_RANGE); buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); @@ -1460,12 +1483,16 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, /* * Accessing maxBuf is racy with cifs_reconnect - need to store value - * and check it for zero before using. + * and check it before using. */ max_buf = tcon->ses->server->maxBuf; - if (!max_buf) + if (max_buf < (sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE))) return -EINVAL; + BUILD_BUG_ON(sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE) > + PAGE_SIZE); + max_buf = min_t(unsigned int, max_buf - sizeof(struct smb_hdr), + PAGE_SIZE); max_num = (max_buf - sizeof(struct smb_hdr)) / sizeof(LOCKING_ANDX_RANGE); buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); @@ -1623,8 +1650,20 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, rc = server->ops->mand_unlock_range(cfile, flock, xid); out: - if (flock->fl_flags & FL_POSIX && !rc) + if (flock->fl_flags & FL_POSIX) { + /* + * If this is a request to remove all locks because we + * are closing the file, it doesn't matter if the + * unlocking failed as both cifs.ko and the SMB server + * remove the lock on file close + */ + if (rc) { + cifs_dbg(VFS, "%s failed rc=%d\n", __func__, rc); + if (!(flock->fl_flags & FL_CLOSE)) + return rc; + } rc = locks_lock_file_wait(file, flock); + } return rc; } @@ -2863,14 +2902,16 @@ cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from) * these pages but not on the region from pos to ppos+len-1. */ written = cifs_user_writev(iocb, from); - if (written > 0 && CIFS_CACHE_READ(cinode)) { + if (CIFS_CACHE_READ(cinode)) { /* - * Windows 7 server can delay breaking level2 oplock if a write - * request comes - break it on the client to prevent reading - * an old data. + * We have read level caching and we have just sent a write + * request to the server thus making data in the cache stale. + * Zap the cache and set oplock/lease level to NONE to avoid + * reading stale data from the cache. All subsequent read + * operations will read new data from the server. */ cifs_zap_mapping(inode); - cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n", + cifs_dbg(FYI, "Set Oplock/Lease to NONE for inode=%p after write\n", inode); cinode->oplock = 0; } @@ -2947,7 +2988,9 @@ cifs_read_allocate_pages(struct cifs_readdata *rdata, unsigned int nr_pages) } if (rc) { - for (i = 0; i < nr_pages; i++) { + unsigned int nr_page_failed = i; + + for (i = 0; i < nr_page_failed; i++) { put_page(rdata->pages[i]); rdata->pages[i] = NULL; } @@ -4148,6 +4191,7 @@ void cifs_oplock_break(struct work_struct *work) cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } + _cifsFileInfo_put(cfile, false /* do not wait for ourself */); cifs_done_oplock_break(cinode); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 020f49c15b30..1fadd314ae7f 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -780,43 +780,50 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } else if ((rc == -EACCES) && backup_cred(cifs_sb) && (strcmp(server->vals->version_string, SMB1_VERSION_STRING) == 0)) { - /* - * For SMB2 and later the backup intent flag is already - * sent if needed on open and there is no path based - * FindFirst operation to use to retry with - */ + /* + * For SMB2 and later the backup intent flag is already + * sent if needed on open and there is no path based + * FindFirst operation to use to retry with + */ - srchinf = kzalloc(sizeof(struct cifs_search_info), - GFP_KERNEL); - if (srchinf == NULL) { - rc = -ENOMEM; - goto cgii_exit; - } + srchinf = kzalloc(sizeof(struct cifs_search_info), + GFP_KERNEL); + if (srchinf == NULL) { + rc = -ENOMEM; + goto cgii_exit; + } - srchinf->endOfSearch = false; + srchinf->endOfSearch = false; + if (tcon->unix_ext) + srchinf->info_level = SMB_FIND_FILE_UNIX; + else if ((tcon->ses->capabilities & + tcon->ses->server->vals->cap_nt_find) == 0) + srchinf->info_level = SMB_FIND_FILE_INFO_STANDARD; + else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) srchinf->info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO; + else /* no srvino useful for fallback to some netapp */ + srchinf->info_level = SMB_FIND_FILE_DIRECTORY_INFO; - srchflgs = CIFS_SEARCH_CLOSE_ALWAYS | - CIFS_SEARCH_CLOSE_AT_END | - CIFS_SEARCH_BACKUP_SEARCH; + srchflgs = CIFS_SEARCH_CLOSE_ALWAYS | + CIFS_SEARCH_CLOSE_AT_END | + CIFS_SEARCH_BACKUP_SEARCH; - rc = CIFSFindFirst(xid, tcon, full_path, - cifs_sb, NULL, srchflgs, srchinf, false); - if (!rc) { - data = - (FILE_ALL_INFO *)srchinf->srch_entries_start; + rc = CIFSFindFirst(xid, tcon, full_path, + cifs_sb, NULL, srchflgs, srchinf, false); + if (!rc) { + data = (FILE_ALL_INFO *)srchinf->srch_entries_start; - cifs_dir_info_to_fattr(&fattr, - (FILE_DIRECTORY_INFO *)data, cifs_sb); - fattr.cf_uniqueid = le64_to_cpu( - ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId); - validinum = true; + cifs_dir_info_to_fattr(&fattr, + (FILE_DIRECTORY_INFO *)data, cifs_sb); + fattr.cf_uniqueid = le64_to_cpu( + ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId); + validinum = true; - cifs_buf_release(srchinf->ntwrk_buf_start); - } - kfree(srchinf); - if (rc) - goto cgii_exit; + cifs_buf_release(srchinf->ntwrk_buf_start); + } + kfree(srchinf); + if (rc) + goto cgii_exit; } else goto cgii_exit; @@ -1728,6 +1735,10 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, if (rc == 0 || rc != -EBUSY) goto do_rename_exit; + /* Don't fall back to using SMB on SMB 2+ mount */ + if (server->vals->protocol_id != 0) + goto do_rename_exit; + /* open-file renames don't work across directories */ if (to_dentry->d_parent != from_dentry->d_parent) goto do_rename_exit; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 6926685e513c..facc94e159a1 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -490,8 +490,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &pCifsInode->flags); - queue_work(cifsoplockd_wq, - &netfile->oplock_break); + cifs_queue_oplock_break(netfile); netfile->oplock_break_cancelled = false; spin_unlock(&tcon->open_file_lock); @@ -588,6 +587,28 @@ void cifs_put_writer(struct cifsInodeInfo *cinode) spin_unlock(&cinode->writers_lock); } +/** + * cifs_queue_oplock_break - queue the oplock break handler for cfile + * + * This function is called from the demultiplex thread when it + * receives an oplock break for @cfile. + * + * Assumes the tcon->open_file_lock is held. + * Assumes cfile->file_info_lock is NOT held. + */ +void cifs_queue_oplock_break(struct cifsFileInfo *cfile) +{ + /* + * Bump the handle refcount now while we hold the + * open_file_lock to enforce the validity of it for the oplock + * break handler. The matching put is done at the end of the + * handler. + */ + cifsFileInfo_get(cfile); + + queue_work(cifsoplockd_wq, &cfile->oplock_break); +} + void cifs_done_oplock_break(struct cifsInodeInfo *cinode) { clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index a7be6a2f4700..33f7723fb83e 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -655,7 +655,14 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, /* scan and find it */ int i; char *cur_ent; - char *end_of_smb = cfile->srch_inf.ntwrk_buf_start + + char *end_of_smb; + + if (cfile->srch_inf.ntwrk_buf_start == NULL) { + cifs_dbg(VFS, "ntwrk_buf_start is NULL during readdir\n"); + return -EIO; + } + + end_of_smb = cfile->srch_inf.ntwrk_buf_start + server->ops->calc_smb_size( cfile->srch_inf.ntwrk_buf_start, server); diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 378151e09e91..47db8eb6cbcf 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -308,7 +308,7 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) remaining = tgt_total_cnt - total_in_tgt; if (remaining < 0) { - cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%hu\n", + cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%u\n", tgt_total_cnt, total_in_tgt); return -EPROTO; } diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 4ed10dd086e6..b204e84b87fb 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -122,12 +122,14 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, /* * Accessing maxBuf is racy with cifs_reconnect - need to store value - * and check it for zero before using. + * and check it before using. */ max_buf = tcon->ses->server->maxBuf; - if (!max_buf) + if (max_buf < sizeof(struct smb2_lock_element)) return -EINVAL; + BUILD_BUG_ON(sizeof(struct smb2_lock_element) > PAGE_SIZE); + max_buf = min_t(unsigned int, max_buf, PAGE_SIZE); max_num = max_buf / sizeof(struct smb2_lock_element); buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL); if (!buf) @@ -264,6 +266,8 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) return -EINVAL; } + BUILD_BUG_ON(sizeof(struct smb2_lock_element) > PAGE_SIZE); + max_buf = min_t(unsigned int, max_buf, PAGE_SIZE); max_num = max_buf / sizeof(struct smb2_lock_element); buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL); if (!buf) { diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 20a2d304c603..18814f1d67d9 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -379,8 +379,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_NONEXISTENT_EA_ENTRY, -EIO, "STATUS_NONEXISTENT_EA_ENTRY"}, {STATUS_NO_EAS_ON_FILE, -ENODATA, "STATUS_NO_EAS_ON_FILE"}, {STATUS_EA_CORRUPT_ERROR, -EIO, "STATUS_EA_CORRUPT_ERROR"}, - {STATUS_FILE_LOCK_CONFLICT, -EIO, "STATUS_FILE_LOCK_CONFLICT"}, - {STATUS_LOCK_NOT_GRANTED, -EIO, "STATUS_LOCK_NOT_GRANTED"}, + {STATUS_FILE_LOCK_CONFLICT, -EACCES, "STATUS_FILE_LOCK_CONFLICT"}, + {STATUS_LOCK_NOT_GRANTED, -EACCES, "STATUS_LOCK_NOT_GRANTED"}, {STATUS_DELETE_PENDING, -ENOENT, "STATUS_DELETE_PENDING"}, {STATUS_CTL_FILE_NOT_SUPPORTED, -ENOSYS, "STATUS_CTL_FILE_NOT_SUPPORTED"}, @@ -1036,7 +1036,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_UNFINISHED_CONTEXT_DELETED, -EIO, "STATUS_UNFINISHED_CONTEXT_DELETED"}, {STATUS_NO_TGT_REPLY, -EIO, "STATUS_NO_TGT_REPLY"}, - {STATUS_OBJECTID_NOT_FOUND, -EIO, "STATUS_OBJECTID_NOT_FOUND"}, + /* Note that ENOATTTR and ENODATA are the same errno */ + {STATUS_OBJECTID_NOT_FOUND, -ENODATA, "STATUS_OBJECTID_NOT_FOUND"}, {STATUS_NO_IP_ADDRESSES, -EIO, "STATUS_NO_IP_ADDRESSES"}, {STATUS_WRONG_CREDENTIAL_HANDLE, -EIO, "STATUS_WRONG_CREDENTIAL_HANDLE"}, diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 6a9c47541c53..0a7ed2e3ad4f 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -517,7 +517,6 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, __u8 lease_state; struct list_head *tmp; struct cifsFileInfo *cfile; - struct TCP_Server_Info *server = tcon->ses->server; struct cifs_pending_open *open; struct cifsInodeInfo *cinode; int ack_req = le32_to_cpu(rsp->Flags & @@ -537,14 +536,26 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, cifs_dbg(FYI, "lease key match, lease break 0x%x\n", le32_to_cpu(rsp->NewLeaseState)); - server->ops->set_oplock_level(cinode, lease_state, 0, NULL); - if (ack_req) cfile->oplock_break_cancelled = false; else cfile->oplock_break_cancelled = true; - queue_work(cifsoplockd_wq, &cfile->oplock_break); + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); + + /* + * Set or clear flags depending on the lease state being READ. + * HANDLE caching flag should be added when the client starts + * to defer closing remote file handles with HANDLE leases. + */ + if (lease_state & SMB2_LEASE_READ_CACHING_HE) + set_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); + else + clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); + + cifs_queue_oplock_break(cfile); kfree(lw); return true; } @@ -648,6 +659,13 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK) return false; + if (rsp->sync_hdr.CreditRequest) { + spin_lock(&server->req_lock); + server->credits += le16_to_cpu(rsp->sync_hdr.CreditRequest); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + } + if (rsp->StructureSize != smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) { if (le16_to_cpu(rsp->StructureSize) == 44) @@ -701,8 +719,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags); spin_unlock(&cfile->file_info_lock); - queue_work(cifsoplockd_wq, - &cfile->oplock_break); + + cifs_queue_oplock_break(cfile); spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 812da3e56a22..0ccf8f9b63a2 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -34,6 +34,7 @@ #include "cifs_ioctl.h" #include "smbdirect.h" +/* Change credits for different ops and return the total number of credits */ static int change_conf(struct TCP_Server_Info *server) { @@ -41,17 +42,15 @@ change_conf(struct TCP_Server_Info *server) server->oplock_credits = server->echo_credits = 0; switch (server->credits) { case 0: - return -1; + return 0; case 1: server->echoes = false; server->oplocks = false; - cifs_dbg(VFS, "disabling echoes and oplocks\n"); break; case 2: server->echoes = true; server->oplocks = false; server->echo_credits = 1; - cifs_dbg(FYI, "disabling oplocks\n"); break; default: server->echoes = true; @@ -64,14 +63,15 @@ change_conf(struct TCP_Server_Info *server) server->echo_credits = 1; } server->credits -= server->echo_credits + server->oplock_credits; - return 0; + return server->credits + server->echo_credits + server->oplock_credits; } static void smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, const int optype) { - int *val, rc = 0; + int *val, rc = -1; + spin_lock(&server->req_lock); val = server->ops->get_credits_field(server, optype); *val += add; @@ -95,8 +95,26 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, } spin_unlock(&server->req_lock); wake_up(&server->request_q); - if (rc) - cifs_reconnect(server); + + if (server->tcpStatus == CifsNeedReconnect) + return; + + switch (rc) { + case -1: + /* change_conf hasn't been executed */ + break; + case 0: + cifs_dbg(VFS, "Possible client or server bug - zero credits\n"); + break; + case 1: + cifs_dbg(VFS, "disabling echoes and oplocks\n"); + break; + case 2: + cifs_dbg(FYI, "disabling oplocks\n"); + break; + default: + cifs_dbg(FYI, "add %u credits total=%d\n", add, rc); + } } static void @@ -154,14 +172,14 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, scredits = server->credits; /* can deadlock with reopen */ - if (scredits == 1) { + if (scredits <= 8) { *num = SMB2_MAX_BUFFER_SIZE; *credits = 0; break; } - /* leave one credit for a possible reopen */ - scredits--; + /* leave some credits for reopen and other ops */ + scredits -= 8; *num = min_t(unsigned int, size, scredits * SMB2_MAX_BUFFER_SIZE); @@ -186,6 +204,15 @@ smb2_get_next_mid(struct TCP_Server_Info *server) return mid; } +static void +smb2_revert_current_mid(struct TCP_Server_Info *server, const unsigned int val) +{ + spin_lock(&GlobalMid_Lock); + if (server->CurrentMid >= val) + server->CurrentMid -= val; + spin_unlock(&GlobalMid_Lock); +} + static struct mid_q_entry * smb2_find_mid(struct TCP_Server_Info *server, char *buf) { @@ -1879,6 +1906,8 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_iov, &resp_buftype); + if (!rc) + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); if (!rc || !err_iov.iov_base) { rc = -ENOENT; goto free_path; @@ -2283,6 +2312,15 @@ smb2_downgrade_oplock(struct TCP_Server_Info *server, } static void +smb21_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + server->ops->set_oplock_level(cinode, + set_level2 ? SMB2_LEASE_READ_CACHING_HE : + 0, 0, NULL); +} + +static void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int epoch, bool *purge_cache) { @@ -2310,26 +2348,28 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int epoch, bool *purge_cache) { char message[5] = {0}; + unsigned int new_oplock = 0; oplock &= 0xFF; if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) return; - cinode->oplock = 0; if (oplock & SMB2_LEASE_READ_CACHING_HE) { - cinode->oplock |= CIFS_CACHE_READ_FLG; + new_oplock |= CIFS_CACHE_READ_FLG; strcat(message, "R"); } if (oplock & SMB2_LEASE_HANDLE_CACHING_HE) { - cinode->oplock |= CIFS_CACHE_HANDLE_FLG; + new_oplock |= CIFS_CACHE_HANDLE_FLG; strcat(message, "H"); } if (oplock & SMB2_LEASE_WRITE_CACHING_HE) { - cinode->oplock |= CIFS_CACHE_WRITE_FLG; + new_oplock |= CIFS_CACHE_WRITE_FLG; strcat(message, "W"); } - if (!cinode->oplock) - strcat(message, "None"); + if (!new_oplock) + strncpy(message, "None", sizeof(message)); + + cinode->oplock = new_oplock; cifs_dbg(FYI, "%s Lease granted on inode %p\n", message, &cinode->vfs_inode); } @@ -2901,11 +2941,23 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, server->ops->is_status_pending(buf, server, 0)) return -1; - rdata->result = server->ops->map_error(buf, false); + /* set up first two iov to get credits */ + rdata->iov[0].iov_base = buf; + rdata->iov[0].iov_len = 4; + rdata->iov[1].iov_base = buf + 4; + rdata->iov[1].iov_len = + min_t(unsigned int, buf_len, server->vals->read_rsp_size) - 4; + cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", + rdata->iov[0].iov_base, rdata->iov[0].iov_len); + cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n", + rdata->iov[1].iov_base, rdata->iov[1].iov_len); + + rdata->result = server->ops->map_error(buf, true); if (rdata->result != 0) { cifs_dbg(FYI, "%s: server returned error %d\n", __func__, rdata->result); - dequeue_mid(mid, rdata->result); + /* normal error on read response */ + dequeue_mid(mid, false); return 0; } @@ -2978,14 +3030,6 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - /* set up first iov for signature check */ - rdata->iov[0].iov_base = buf; - rdata->iov[0].iov_len = 4; - rdata->iov[1].iov_base = buf + 4; - rdata->iov[1].iov_len = server->vals->read_rsp_size - 4; - cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", - rdata->iov[0].iov_base, server->vals->read_rsp_size); - length = rdata->copy_into_pages(server, rdata, &iter); kfree(bvec); @@ -3184,8 +3228,10 @@ smb3_receive_transform(struct TCP_Server_Info *server, } /* TODO: add support for compounds containing READ. */ - if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server)) + if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server)) { + *num_mids = 1; return receive_encrypted_read(server, &mids[0]); + } return receive_encrypted_standard(server, mids, bufs, num_mids); } @@ -3223,6 +3269,7 @@ struct smb_version_operations smb20_operations = { .get_credits = smb2_get_credits, .wait_mtu_credits = cifs_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, + .revert_current_mid = smb2_revert_current_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, .map_error = map_smb2_to_linux_error, @@ -3317,6 +3364,7 @@ struct smb_version_operations smb21_operations = { .get_credits = smb2_get_credits, .wait_mtu_credits = smb2_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, + .revert_current_mid = smb2_revert_current_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, .map_error = map_smb2_to_linux_error, @@ -3327,7 +3375,7 @@ struct smb_version_operations smb21_operations = { .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, .handle_cancelled_mid = smb2_handle_cancelled_mid, - .downgrade_oplock = smb2_downgrade_oplock, + .downgrade_oplock = smb21_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -3412,6 +3460,7 @@ struct smb_version_operations smb30_operations = { .get_credits = smb2_get_credits, .wait_mtu_credits = smb2_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, + .revert_current_mid = smb2_revert_current_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, .map_error = map_smb2_to_linux_error, @@ -3423,7 +3472,7 @@ struct smb_version_operations smb30_operations = { .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, .handle_cancelled_mid = smb2_handle_cancelled_mid, - .downgrade_oplock = smb2_downgrade_oplock, + .downgrade_oplock = smb21_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -3516,6 +3565,7 @@ struct smb_version_operations smb311_operations = { .get_credits = smb2_get_credits, .wait_mtu_credits = smb2_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, + .revert_current_mid = smb2_revert_current_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, .map_error = map_smb2_to_linux_error, @@ -3527,7 +3577,7 @@ struct smb_version_operations smb311_operations = { .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, .handle_cancelled_mid = smb2_handle_cancelled_mid, - .downgrade_oplock = smb2_downgrade_oplock, + .downgrade_oplock = smb21_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index f54d07bda067..c181f1621e1a 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -881,8 +881,15 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */, (char *)pneg_inbuf, inbuflen, (char **)&pneg_rsp, &rsplen); - - if (rc != 0) { + if (rc == -EOPNOTSUPP) { + /* + * Old Windows versions or Netapp SMB server can return + * not supported error. Client should accept it. + */ + cifs_dbg(VFS, "Server does not support validate negotiate\n"); + rc = 0; + goto out_free_inbuf; + } else if (rc != 0) { cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc); rc = -EIO; goto out_free_inbuf; @@ -1500,9 +1507,16 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, iov[1].iov_base = unc_path; iov[1].iov_len = unc_path_len; - /* 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 */ + /* + * 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 + * unless it is guest or anonymous user. See MS-SMB2 3.2.5.3.1 + * (Samba servers don't always set the flag so also check if null user) + */ if ((ses->server->dialect == SMB311_PROT_ID) && - !smb3_encryption_required(tcon)) + !smb3_encryption_required(tcon) && + !(ses->session_flags & + (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL)) && + ((ses->user_name != NULL) || (ses->sectype == Kerberos))) req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; memset(&rqst, 0, sizeof(struct smb_rqst)); @@ -2243,10 +2257,12 @@ SMB2_open_free(struct smb_rqst *rqst) { int i; - cifs_small_buf_release(rqst->rq_iov[0].iov_base); - for (i = 1; i < rqst->rq_nvec; i++) - if (rqst->rq_iov[i].iov_base != smb2_padding) - kfree(rqst->rq_iov[i].iov_base); + if (rqst && rqst->rq_iov) { + cifs_small_buf_release(rqst->rq_iov[0].iov_base); + for (i = 1; i < rqst->rq_nvec; i++) + if (rqst->rq_iov[i].iov_base != smb2_padding) + kfree(rqst->rq_iov[i].iov_base); + } } int @@ -2535,7 +2551,8 @@ SMB2_close_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, void SMB2_close_free(struct smb_rqst *rqst) { - cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ + if (rqst && rqst->rq_iov) + cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ } int @@ -2685,7 +2702,8 @@ SMB2_query_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, void SMB2_query_info_free(struct smb_rqst *rqst) { - cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ + if (rqst && rqst->rq_iov) + cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ } static int @@ -2814,9 +2832,10 @@ smb2_echo_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->callback_data; struct smb2_echo_rsp *rsp = (struct smb2_echo_rsp *)mid->resp_buf; - unsigned int credits_received = 1; + unsigned int credits_received = 0; - if (mid->mid_state == MID_RESPONSE_RECEIVED) + if (mid->mid_state == MID_RESPONSE_RECEIVED + || mid->mid_state == MID_RESPONSE_MALFORMED) credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest); DeleteMidQEntry(mid); @@ -3073,7 +3092,7 @@ smb2_readv_callback(struct mid_q_entry *mid) struct TCP_Server_Info *server = tcon->ses->server; struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)rdata->iov[0].iov_base; - unsigned int credits_received = 1; + unsigned int credits_received = 0; struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 2, .rq_pages = rdata->pages, @@ -3112,6 +3131,9 @@ smb2_readv_callback(struct mid_q_entry *mid) task_io_account_read(rdata->got_bytes); cifs_stats_bytes_read(tcon, rdata->got_bytes); break; + case MID_RESPONSE_MALFORMED: + credits_received = le16_to_cpu(shdr->CreditRequest); + /* fall through */ default: if (rdata->result != -ENODATA) rdata->result = -EIO; @@ -3127,8 +3149,17 @@ smb2_readv_callback(struct mid_q_entry *mid) rdata->mr = NULL; } #endif - if (rdata->result) + if (rdata->result && rdata->result != -ENODATA) { cifs_stats_fail_inc(tcon, SMB2_READ_HE); + trace_smb3_read_err(0 /* xid */, + rdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, rdata->offset, + rdata->bytes, rdata->result); + } else + trace_smb3_read_done(0 /* xid */, + rdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, + rdata->offset, rdata->got_bytes); queue_work(cifsiod_wq, &rdata->work); DeleteMidQEntry(mid); @@ -3185,12 +3216,14 @@ smb2_async_readv(struct cifs_readdata *rdata) if (rdata->credits) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, SMB2_MAX_BUFFER_SIZE)); - shdr->CreditRequest = shdr->CreditCharge; + shdr->CreditRequest = + cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1); spin_lock(&server->req_lock); server->credits += rdata->credits - le16_to_cpu(shdr->CreditCharge); spin_unlock(&server->req_lock); wake_up(&server->request_q); + rdata->credits = le16_to_cpu(shdr->CreditCharge); flags |= CIFS_HAS_CREDITS; } @@ -3201,13 +3234,11 @@ smb2_async_readv(struct cifs_readdata *rdata) if (rc) { kref_put(&rdata->refcount, cifs_readdata_release); cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); - trace_smb3_read_err(rc, 0 /* xid */, io_parms.persistent_fid, - io_parms.tcon->tid, io_parms.tcon->ses->Suid, - io_parms.offset, io_parms.length); - } else - trace_smb3_read_done(0 /* xid */, io_parms.persistent_fid, - io_parms.tcon->tid, io_parms.tcon->ses->Suid, - io_parms.offset, io_parms.length); + trace_smb3_read_err(0 /* xid */, io_parms.persistent_fid, + io_parms.tcon->tid, + io_parms.tcon->ses->Suid, + io_parms.offset, io_parms.length, rc); + } cifs_small_buf_release(buf); return rc; @@ -3243,25 +3274,27 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, rqst.rq_nvec = 1; rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(req); - rsp = (struct smb2_read_rsp *)rsp_iov.iov_base; if (rc) { if (rc != -ENODATA) { cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); cifs_dbg(VFS, "Send error in read = %d\n", rc); + trace_smb3_read_err(xid, req->PersistentFileId, + io_parms->tcon->tid, ses->Suid, + io_parms->offset, io_parms->length, + rc); } - trace_smb3_read_err(rc, xid, req->PersistentFileId, - io_parms->tcon->tid, ses->Suid, - io_parms->offset, io_parms->length); free_rsp_buf(resp_buftype, rsp_iov.iov_base); + cifs_small_buf_release(req); return rc == -ENODATA ? 0 : rc; } else trace_smb3_read_done(xid, req->PersistentFileId, io_parms->tcon->tid, ses->Suid, io_parms->offset, io_parms->length); + cifs_small_buf_release(req); + *nbytes = le32_to_cpu(rsp->DataLength); if ((*nbytes > CIFS_MAX_MSGSIZE) || (*nbytes > io_parms->length)) { @@ -3295,7 +3328,7 @@ smb2_writev_callback(struct mid_q_entry *mid) struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); unsigned int written; struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf; - unsigned int credits_received = 1; + unsigned int credits_received = 0; switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: @@ -3323,6 +3356,9 @@ smb2_writev_callback(struct mid_q_entry *mid) case MID_RETRY_NEEDED: wdata->result = -EAGAIN; break; + case MID_RESPONSE_MALFORMED: + credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest); + /* fall through */ default: wdata->result = -EIO; break; @@ -3340,8 +3376,17 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->mr = NULL; } #endif - if (wdata->result) + if (wdata->result) { cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); + trace_smb3_write_err(0 /* no xid */, + wdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, wdata->offset, + wdata->bytes, wdata->result); + } else + trace_smb3_write_done(0 /* no xid */, + wdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, + wdata->offset, wdata->bytes); queue_work(cifsiod_wq, &wdata->work); DeleteMidQEntry(mid); @@ -3462,12 +3507,14 @@ smb2_async_writev(struct cifs_writedata *wdata, if (wdata->credits) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, SMB2_MAX_BUFFER_SIZE)); - shdr->CreditRequest = shdr->CreditCharge; + shdr->CreditRequest = + cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1); spin_lock(&server->req_lock); server->credits += wdata->credits - le16_to_cpu(shdr->CreditCharge); spin_unlock(&server->req_lock); wake_up(&server->request_q); + wdata->credits = le16_to_cpu(shdr->CreditCharge); flags |= CIFS_HAS_CREDITS; } @@ -3481,10 +3528,7 @@ smb2_async_writev(struct cifs_writedata *wdata, wdata->bytes, rc); kref_put(&wdata->refcount, release); cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); - } else - trace_smb3_write_done(0 /* no xid */, req->PersistentFileId, - tcon->tid, tcon->ses->Suid, wdata->offset, - wdata->bytes); + } async_writev_out: cifs_small_buf_release(req); @@ -3549,7 +3593,6 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, rc = cifs_send_recv(xid, io_parms->tcon->ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(req); rsp = (struct smb2_write_rsp *)rsp_iov.iov_base; if (rc) { @@ -3567,6 +3610,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, io_parms->offset, *nbytes); } + cifs_small_buf_release(req); free_rsp_buf(resp_buftype, rsp); return rc; } @@ -3710,8 +3754,8 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { srch_inf->endOfSearch = true; rc = 0; - } - cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); + } else + cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); goto qdir_exit; } diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 8fb7887f2b3d..437257d1116f 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -84,8 +84,8 @@ #define NUMBER_OF_SMB2_COMMANDS 0x0013 -/* 4 len + 52 transform hdr + 64 hdr + 56 create rsp */ -#define MAX_SMB2_HDR_SIZE 0x00b0 +/* 52 transform hdr + 64 hdr + 88 create rsp */ +#define MAX_SMB2_HDR_SIZE 204 #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) #define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 7b351c65ee46..63264db78b89 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -576,6 +576,7 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, struct TCP_Server_Info *server) { struct mid_q_entry *temp; + unsigned int credits = le16_to_cpu(shdr->CreditCharge); if (server == NULL) { cifs_dbg(VFS, "Null TCP session in smb2_mid_entry_alloc\n"); @@ -586,6 +587,7 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, memset(temp, 0, sizeof(struct mid_q_entry)); kref_init(&temp->refcount); temp->mid = le64_to_cpu(shdr->MessageId); + temp->credits = credits > 0 ? credits : 1; temp->pid = current->pid; temp->command = shdr->Command; /* Always LE */ temp->when_alloc = jiffies; @@ -674,13 +676,18 @@ smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) smb2_seq_num_into_buf(ses->server, shdr); rc = smb2_get_mid_entry(ses, shdr, &mid); - if (rc) + if (rc) { + revert_current_mid_from_hdr(ses->server, shdr); return ERR_PTR(rc); + } + rc = smb2_sign_rqst(rqst, ses->server); if (rc) { + revert_current_mid_from_hdr(ses->server, shdr); cifs_delete_mid(mid); return ERR_PTR(rc); } + return mid; } @@ -695,11 +702,14 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) smb2_seq_num_into_buf(server, shdr); mid = smb2_mid_entry_alloc(shdr, server); - if (mid == NULL) + if (mid == NULL) { + revert_current_mid_from_hdr(server, shdr); return ERR_PTR(-ENOMEM); + } rc = smb2_sign_rqst(rqst, server); if (rc) { + revert_current_mid_from_hdr(server, shdr); DeleteMidQEntry(mid); return ERR_PTR(rc); } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 333729cf46cd..f2938bd95c40 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -378,7 +378,7 @@ smbd_done: if (rc < 0 && rc != -EINTR) cifs_dbg(VFS, "Error %d sending data on socket to server\n", rc); - else + else if (rc > 0) rc = 0; return rc; @@ -638,6 +638,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, cifs_in_send_dec(server); if (rc < 0) { + revert_current_mid(server, mid->credits); server->sequence_number -= 2; cifs_delete_mid(mid); } @@ -786,7 +787,8 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, int i, j, rc = 0; int timeout, optype; struct mid_q_entry *midQ[MAX_COMPOUND]; - unsigned int credits = 0; + bool cancelled_mid[MAX_COMPOUND] = {false}; + unsigned int credits[MAX_COMPOUND] = {0}; char *buf; timeout = flags & CIFS_TIMEOUT_MASK; @@ -804,13 +806,31 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, return -ENOENT; /* - * Ensure that we do not send more than 50 overlapping requests - * to the same server. We may make this configurable later or - * use ses->maxReq. + * Ensure we obtain 1 credit per request in the compound chain. + * It can be optimized further by waiting for all the credits + * at once but this can wait long enough if we don't have enough + * credits due to some heavy operations in progress or the server + * not granting us much, so a fallback to the current approach is + * needed anyway. */ - rc = wait_for_free_request(ses->server, timeout, optype); - if (rc) - return rc; + for (i = 0; i < num_rqst; i++) { + rc = wait_for_free_request(ses->server, timeout, optype); + if (rc) { + /* + * We haven't sent an SMB packet to the server yet but + * we already obtained credits for i requests in the + * compound chain - need to return those credits back + * for future use. Note that we need to call add_credits + * multiple times to match the way we obtained credits + * in the first place and to account for in flight + * requests correctly. + */ + for (j = 0; j < i; j++) + add_credits(ses->server, 1, optype); + return rc; + } + credits[i] = 1; + } /* * Make sure that we sign in the same order that we send on this socket @@ -823,11 +843,14 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, for (i = 0; i < num_rqst; i++) { midQ[i] = ses->server->ops->setup_request(ses, &rqst[i]); if (IS_ERR(midQ[i])) { + revert_current_mid(ses->server, i); for (j = 0; j < i; j++) cifs_delete_mid(midQ[j]); mutex_unlock(&ses->server->srv_mutex); + /* Update # of requests on wire to server */ - add_credits(ses->server, 1, optype); + for (j = 0; j < num_rqst; j++) + add_credits(ses->server, credits[j], optype); return PTR_ERR(midQ[i]); } @@ -846,8 +869,10 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, for (i = 0; i < num_rqst; i++) cifs_save_when_sent(midQ[i]); - if (rc < 0) + if (rc < 0) { + revert_current_mid(ses->server, num_rqst); ses->server->sequence_number -= 2; + } mutex_unlock(&ses->server->srv_mutex); @@ -874,19 +899,16 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) { midQ[i]->mid_flags |= MID_WAIT_CANCELLED; midQ[i]->callback = DeleteMidQEntry; - spin_unlock(&GlobalMid_Lock); - add_credits(ses->server, 1, optype); - return rc; + cancelled_mid[i] = true; } spin_unlock(&GlobalMid_Lock); } } for (i = 0; i < num_rqst; i++) - if (midQ[i]->resp_buf) - credits += ses->server->ops->get_credits(midQ[i]); - if (!credits) - credits = 1; + if (!cancelled_mid[i] && midQ[i]->resp_buf + && (midQ[i]->mid_state == MID_RESPONSE_RECEIVED)) + credits[i] = ses->server->ops->get_credits(midQ[i]); for (i = 0; i < num_rqst; i++) { if (rc < 0) @@ -894,8 +916,9 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, rc = cifs_sync_mid_result(midQ[i], ses->server); if (rc != 0) { - add_credits(ses->server, credits, optype); - return rc; + /* mark this mid as cancelled to not free it below */ + cancelled_mid[i] = true; + goto out; } if (!midQ[i]->resp_buf || @@ -942,9 +965,11 @@ out: * This is prevented above by using a noop callback that will not * wake this thread except for the very last PDU. */ - for (i = 0; i < num_rqst; i++) - cifs_delete_mid(midQ[i]); - add_credits(ses->server, credits, optype); + for (i = 0; i < num_rqst; i++) { + if (!cancelled_mid[i]) + cifs_delete_mid(midQ[i]); + add_credits(ses->server, credits[i], optype); + } return rc; } @@ -229,8 +229,8 @@ static void put_unlocked_mapping_entry(struct address_space *mapping, * * Must be called with the i_pages lock held. */ -static void *__get_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void ***slotp, bool (*wait_fn)(void)) +static void *get_unlocked_mapping_entry(struct address_space *mapping, + pgoff_t index, void ***slotp) { void *entry, **slot; struct wait_exceptional_entry_queue ewait; @@ -240,8 +240,6 @@ static void *__get_unlocked_mapping_entry(struct address_space *mapping, ewait.wait.func = wake_exceptional_entry_func; for (;;) { - bool revalidate; - entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); if (!entry || @@ -256,30 +254,37 @@ static void *__get_unlocked_mapping_entry(struct address_space *mapping, prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); xa_unlock_irq(&mapping->i_pages); - revalidate = wait_fn(); + schedule(); finish_wait(wq, &ewait.wait); xa_lock_irq(&mapping->i_pages); - if (revalidate) { - put_unlocked_mapping_entry(mapping, index, entry); - return ERR_PTR(-EAGAIN); - } } } -static bool entry_wait(void) +/* + * The only thing keeping the address space around is the i_pages lock + * (it's cycled in clear_inode() after removing the entries from i_pages) + * After we call xas_unlock_irq(), we cannot touch xas->xa. + */ +static void wait_entry_unlocked(struct address_space *mapping, pgoff_t index, + void ***slotp, void *entry) { - schedule(); + struct wait_exceptional_entry_queue ewait; + wait_queue_head_t *wq; + + init_wait(&ewait.wait); + ewait.wait.func = wake_exceptional_entry_func; + + wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); /* - * Never return an ERR_PTR() from - * __get_unlocked_mapping_entry(), just keep looping. + * Unlike get_unlocked_entry() there is no guarantee that this + * path ever successfully retrieves an unlocked entry before an + * inode dies. Perform a non-exclusive wait in case this path + * never successfully performs its own wake up. */ - return false; -} - -static void *get_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void ***slotp) -{ - return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait); + prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); + xa_unlock_irq(&mapping->i_pages); + schedule(); + finish_wait(wq, &ewait.wait); } static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index) @@ -398,19 +403,6 @@ static struct page *dax_busy_page(void *entry) return NULL; } -static bool entry_wait_revalidate(void) -{ - rcu_read_unlock(); - schedule(); - rcu_read_lock(); - - /* - * Tell __get_unlocked_mapping_entry() to take a break, we need - * to revalidate page->mapping after dropping locks - */ - return true; -} - bool dax_lock_mapping_entry(struct page *page) { pgoff_t index; @@ -446,14 +438,15 @@ bool dax_lock_mapping_entry(struct page *page) } index = page->index; - entry = __get_unlocked_mapping_entry(mapping, index, &slot, - entry_wait_revalidate); + entry = __radix_tree_lookup(&mapping->i_pages, index, + NULL, &slot); if (!entry) { xa_unlock_irq(&mapping->i_pages); break; - } else if (IS_ERR(entry)) { - xa_unlock_irq(&mapping->i_pages); - WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN); + } else if (slot_locked(mapping, slot)) { + rcu_read_unlock(); + wait_entry_unlocked(mapping, index, &slot, entry); + rcu_read_lock(); continue; } lock_slot(mapping, slot); @@ -1667,8 +1660,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, } trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry); - result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn, - write); + result = vmf_insert_pfn_pmd(vmf, pfn, write); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: @@ -1782,8 +1774,7 @@ static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, break; #ifdef CONFIG_FS_DAX_PMD case PE_SIZE_PMD: - ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, - pfn, true); + ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE); break; #endif default: diff --git a/fs/dcache.c b/fs/dcache.c index 02566a784657..b2a00f3ff7df 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -344,7 +344,7 @@ static void dentry_free(struct dentry *dentry) } } /* if dentry was never visible to RCU, immediate free is OK */ - if (!(dentry->d_flags & DCACHE_RCUACCESS)) + if (dentry->d_flags & DCACHE_NORCU) __d_free(&dentry->d_u.d_rcu); else call_rcu(&dentry->d_u.d_rcu, __d_free); @@ -1202,15 +1202,11 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, */ void shrink_dcache_sb(struct super_block *sb) { - long freed; - do { LIST_HEAD(dispose); - freed = list_lru_walk(&sb->s_dentry_lru, + list_lru_walk(&sb->s_dentry_lru, dentry_lru_isolate_shrink, &dispose, 1024); - - this_cpu_sub(nr_dentry_unused, freed); shrink_dentry_list(&dispose); } while (list_lru_count(&sb->s_dentry_lru) > 0); } @@ -1698,7 +1694,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) struct dentry *dentry = __d_alloc(parent->d_sb, name); if (!dentry) return NULL; - dentry->d_flags |= DCACHE_RCUACCESS; spin_lock(&parent->d_lock); /* * don't need child lock because it is not subject @@ -1723,7 +1718,7 @@ struct dentry *d_alloc_cursor(struct dentry * parent) { struct dentry *dentry = d_alloc_anon(parent->d_sb); if (dentry) { - dentry->d_flags |= DCACHE_RCUACCESS | DCACHE_DENTRY_CURSOR; + dentry->d_flags |= DCACHE_DENTRY_CURSOR; dentry->d_parent = dget(parent); } return dentry; @@ -1736,10 +1731,17 @@ struct dentry *d_alloc_cursor(struct dentry * parent) * * For a filesystem that just pins its dentries in memory and never * performs lookups at all, return an unhashed IS_ROOT dentry. + * This is used for pipes, sockets et.al. - the stuff that should + * never be anyone's children or parents. Unlike all other + * dentries, these will not have RCU delay between dropping the + * last reference and freeing them. */ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) { - return __d_alloc(sb, name); + struct dentry *dentry = __d_alloc(sb, name); + if (likely(dentry)) + dentry->d_flags |= DCACHE_NORCU; + return dentry; } EXPORT_SYMBOL(d_alloc_pseudo); @@ -1903,12 +1905,10 @@ struct dentry *d_make_root(struct inode *root_inode) if (root_inode) { res = d_alloc_anon(root_inode->i_sb); - if (res) { - res->d_flags |= DCACHE_RCUACCESS; + if (res) d_instantiate(res, root_inode); - } else { + else iput(root_inode); - } } return res; } @@ -2778,9 +2778,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target, copy_name(dentry, target); target->d_hash.pprev = NULL; dentry->d_parent->d_lockref.count++; - if (dentry == old_parent) - dentry->d_flags |= DCACHE_RCUACCESS; - else + if (dentry != old_parent) /* wasn't IS_ROOT */ WARN_ON(!--old_parent->d_lockref.count); } else { target->d_parent = old_parent; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 13b01351dd1c..e5126fad57c5 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -163,19 +163,24 @@ static int debugfs_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void debugfs_evict_inode(struct inode *inode) +static void debugfs_i_callback(struct rcu_head *head) { - truncate_inode_pages_final(&inode->i_data); - clear_inode(inode); + struct inode *inode = container_of(head, struct inode, i_rcu); if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); + free_inode_nonrcu(inode); +} + +static void debugfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, debugfs_i_callback); } static const struct super_operations debugfs_super_operations = { .statfs = simple_statfs, .remount_fs = debugfs_remount, .show_options = debugfs_show_options, - .evict_inode = debugfs_evict_inode, + .destroy_inode = debugfs_destroy_inode, }; static void debugfs_release_dentry(struct dentry *dentry) @@ -787,6 +792,13 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *dentry = NULL, *trap; struct name_snapshot old_name; + if (IS_ERR(old_dir)) + return old_dir; + if (IS_ERR(new_dir)) + return new_dir; + if (IS_ERR_OR_NULL(old_dentry)) + return old_dentry; + trap = lock_rename(new_dir, old_dir); /* Source or destination directories don't exist? */ if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir)) diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c53814539070..553a3f3300ae 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -455,6 +455,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_blocksize_bits = 10; s->s_magic = DEVPTS_SUPER_MAGIC; s->s_op = &devpts_sops; + s->s_d_op = &simple_dentry_operations; s->s_time_gran = 1; error = -ENOMEM; diff --git a/fs/direct-io.c b/fs/direct-io.c index 199146036093..1abb7634b2d5 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -679,6 +679,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, unsigned long fs_count; /* Number of filesystem-sized blocks */ int create; unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor; + loff_t i_size; /* * If there was a memory error and we've overwritten all the @@ -708,8 +709,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, */ create = dio->op == REQ_OP_WRITE; if (dio->flags & DIO_SKIP_HOLES) { - if (fs_startblk <= ((i_size_read(dio->inode) - 1) >> - i_blkbits)) + i_size = i_size_read(dio->inode); + if (i_size && fs_startblk <= (i_size - 1) >> i_blkbits) create = 0; } diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 562fa8c3edff..47ee66d70109 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -292,6 +292,8 @@ void dlm_callback_suspend(struct dlm_ls *ls) flush_workqueue(ls->ls_callback_wq); } +#define MAX_CB_QUEUE 25 + void dlm_callback_resume(struct dlm_ls *ls) { struct dlm_lkb *lkb, *safe; @@ -302,15 +304,23 @@ void dlm_callback_resume(struct dlm_ls *ls) if (!ls->ls_callback_wq) return; +more: mutex_lock(&ls->ls_cb_mutex); list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) { list_del_init(&lkb->lkb_cb_list); queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); count++; + if (count == MAX_CB_QUEUE) + break; } mutex_unlock(&ls->ls_cb_mutex); if (count) log_rinfo(ls, "dlm_callback_resume %d", count); + if (count == MAX_CB_QUEUE) { + count = 0; + cond_resched(); + goto more; + } } diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index cc91963683de..a928ba008d7d 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1209,6 +1209,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) if (rv < 0) { log_error(ls, "create_lkb idr error %d", rv); + dlm_free_lkb(lkb); return rv; } @@ -4179,6 +4180,7 @@ static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms) (unsigned long long)lkb->lkb_recover_seq, ms->m_header.h_nodeid, ms->m_lkid); error = -ENOENT; + dlm_put_lkb(lkb); goto fail; } @@ -4232,6 +4234,7 @@ static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) lkb->lkb_id, lkb->lkb_remid, ms->m_header.h_nodeid, ms->m_lkid); error = -ENOENT; + dlm_put_lkb(lkb); goto fail; } @@ -5792,20 +5795,20 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, goto out; } } - - /* After ua is attached to lkb it will be freed by dlm_free_lkb(). - When DLM_IFL_USER is set, the dlm knows that this is a userspace - lock and that lkb_astparam is the dlm_user_args structure. */ - error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs, fake_astfn, ua, fake_bastfn, &args); - lkb->lkb_flags |= DLM_IFL_USER; - if (error) { + kfree(ua->lksb.sb_lvbptr); + ua->lksb.sb_lvbptr = NULL; + kfree(ua); __put_lkb(ls, lkb); goto out; } + /* After ua is attached to lkb it will be freed by dlm_free_lkb(). + When DLM_IFL_USER is set, the dlm knows that this is a userspace + lock and that lkb_astparam is the dlm_user_args structure. */ + lkb->lkb_flags |= DLM_IFL_USER; error = request_lock(ls, lkb, name, namelen, &args); switch (error) { diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 5ba94be006ee..6a1529e478f3 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -680,11 +680,11 @@ static int new_lockspace(const char *name, const char *cluster, kfree(ls->ls_recover_buf); out_lkbidr: idr_destroy(&ls->ls_lkbidr); + out_rsbtbl: for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) { if (ls->ls_remove_names[i]) kfree(ls->ls_remove_names[i]); } - out_rsbtbl: vfree(ls->ls_rsbtbl); out_lsfree: if (do_unreg) diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 82377017130f..d31b6c72b476 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -21,8 +21,13 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); + /* + * We must skip inodes in unusual state. We may also skip + * inodes without pages but we deliberately won't in case + * we need to reschedule to avoid softlockups. + */ if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || - (inode->i_mapping->nrpages == 0)) { + (inode->i_mapping->nrpages == 0 && !need_resched())) { spin_unlock(&inode->i_lock); continue; } @@ -30,6 +35,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); + cond_resched(); invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 37e0e7ca2501..a41120a34e6d 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1154,7 +1154,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v * semantics). All the events that happen during that period of time are * chained in ep->ovflist and requeued later on. */ - if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) { + if (ep->ovflist != EP_UNACTIVE_PTR) { if (epi->next == EP_UNACTIVE_PTR) { epi->next = ep->ovflist; ep->ovflist = epi; diff --git a/fs/exec.c b/fs/exec.c index d08cc100c129..352c1a6fa6a9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -929,7 +929,7 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size, bytes = kernel_read(file, *buf + pos, i_size - pos, &pos); if (bytes < 0) { ret = bytes; - goto out; + goto out_free; } if (bytes == 0) diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 0c38e31ec938..364e647d87c0 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -761,7 +761,8 @@ static loff_t ext2_max_size(int bits) { loff_t res = EXT2_NDIR_BLOCKS; int meta_blocks; - loff_t upper_limit; + unsigned int upper_limit; + unsigned int ppb = 1 << (bits-2); /* This is calculated to be the largest file size for a * dense, file such that the total number of @@ -775,24 +776,34 @@ static loff_t ext2_max_size(int bits) /* total blocks in file system block size */ upper_limit >>= (bits - 9); + /* Compute how many blocks we can address by block tree */ + res += 1LL << (bits-2); + res += 1LL << (2*(bits-2)); + res += 1LL << (3*(bits-2)); + /* Does block tree limit file size? */ + if (res < upper_limit) + goto check_lfs; + res = upper_limit; + /* How many metadata blocks are needed for addressing upper_limit? */ + upper_limit -= EXT2_NDIR_BLOCKS; /* indirect blocks */ meta_blocks = 1; + upper_limit -= ppb; /* double indirect blocks */ - meta_blocks += 1 + (1LL << (bits-2)); - /* tripple indirect blocks */ - meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); - - upper_limit -= meta_blocks; - upper_limit <<= bits; - - res += 1LL << (bits-2); - res += 1LL << (2*(bits-2)); - res += 1LL << (3*(bits-2)); + if (upper_limit < ppb * ppb) { + meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb); + res -= meta_blocks; + goto check_lfs; + } + meta_blocks += 1 + ppb; + upper_limit -= ppb * ppb; + /* tripple indirect blocks for the rest */ + meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb) + + DIV_ROUND_UP(upper_limit, ppb*ppb); + res -= meta_blocks; +check_lfs: res <<= bits; - if (res > upper_limit) - res = upper_limit; - if (res > MAX_LFS_FILESIZE) res = MAX_LFS_FILESIZE; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5cfb1e2f6a5b..1ee51d3a978a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -435,6 +435,9 @@ struct flex_groups { /* Flags that are appropriate for non-directories/regular files. */ #define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) +/* The only flags that should be swapped */ +#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) + /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) { @@ -1667,6 +1670,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ #define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 +extern void ext4_update_dynamic_rev(struct super_block *sb); + #define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ @@ -1675,6 +1680,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ + ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_compat |= \ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ } \ @@ -1692,6 +1698,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ + ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ } \ @@ -1709,6 +1716,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ + ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_incompat |= \ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ } \ @@ -2459,8 +2467,19 @@ int do_journal_get_write_access(handle_t *handle, #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 -extern struct inode *ext4_iget(struct super_block *, unsigned long); -extern struct inode *ext4_iget_normal(struct super_block *, unsigned long); +typedef enum { + EXT4_IGET_NORMAL = 0, + EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ + EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */ +} ext4_iget_flags; + +extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + ext4_iget_flags flags, const char *function, + unsigned int line); + +#define ext4_iget(sb, ino, flags) \ + __ext4_iget((sb), (ino), (flags), __func__, __LINE__) + extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct dentry *, struct iattr *); extern int ext4_getattr(const struct path *, struct kstat *, u32, unsigned int); @@ -2542,6 +2561,8 @@ extern int ext4_group_extend(struct super_block *sb, extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ +extern struct buffer_head *ext4_sb_bread(struct super_block *sb, + sector_t block, int op_flags); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); extern void ext4_superblock_csum_set(struct super_block *sb); @@ -2663,7 +2684,6 @@ do { \ #endif -extern void ext4_update_dynamic_rev(struct super_block *sb); extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, __u32 compat); extern int ext4_update_rocompat_feature(handle_t *handle, diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 15b6dd733780..df908ef79cce 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -384,7 +384,7 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle, { struct ext4_inode_info *ei = EXT4_I(inode); - if (ext4_handle_valid(handle)) { + if (ext4_handle_valid(handle) && !is_handle_aborted(handle)) { ei->i_sync_tid = handle->h_transaction->t_tid; if (datasync) ei->i_datasync_tid = handle->h_transaction->t_tid; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 72a361d5ef74..45aea792d22a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1035,6 +1035,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, __le32 border; ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ int err = 0; + size_t ext_size = 0; /* make decision: where to split? */ /* FIXME: now decision is simplest: at current extent */ @@ -1126,6 +1127,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, le16_add_cpu(&neh->eh_entries, m); } + /* zero out unused area in the extent block */ + ext_size = sizeof(struct ext4_extent_header) + + sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries); + memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -1205,6 +1210,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, sizeof(struct ext4_extent_idx) * m); le16_add_cpu(&neh->eh_entries, m); } + /* zero out unused area in the extent block */ + ext_size = sizeof(struct ext4_extent_header) + + (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries)); + memset(bh->b_data + ext_size, 0, + inode->i_sb->s_blocksize - ext_size); ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -1270,6 +1280,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock, goal = 0; struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; int err = 0; + size_t ext_size = 0; /* Try to prepend new index to old one */ if (ext_depth(inode)) @@ -1295,9 +1306,11 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, goto out; } + ext_size = sizeof(EXT4_I(inode)->i_data); /* move top-level index/leaf into new block */ - memmove(bh->b_data, EXT4_I(inode)->i_data, - sizeof(EXT4_I(inode)->i_data)); + memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size); + /* zero out unused area in the extent block */ + memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); /* set size of new block */ neh = ext_block_hdr(bh); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 69d65d49837b..2c5baa5e8291 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -125,7 +125,7 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) struct super_block *sb = inode->i_sb; int blockmask = sb->s_blocksize - 1; - if (pos >= i_size_read(inode)) + if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize)) return 0; if ((pos | iov_iter_alignment(from)) & blockmask) @@ -264,6 +264,13 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } ret = __generic_file_write_iter(iocb, from); + /* + * Unaligned direct AIO must be the only IO in flight. Otherwise + * overlapping aligned IO after unaligned might result in data + * corruption. + */ + if (ret == -EIOCBQUEUED && unaligned_aio) + ext4_unwritten_wait(inode); inode_unlock(inode); if (ret > 0) diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 26a7fe5c4fd3..5508baa11bb6 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -159,6 +159,9 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = err; } out: + err = file_check_and_advance_wb_err(file); + if (ret == 0) + ret = err; trace_ext4_sync_file_exit(inode, ret); return ret; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 2addcb8730e1..091a18a51c99 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1225,7 +1225,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) if (!ext4_test_bit(bit, bitmap_bh->b_data)) goto bad_orphan; - inode = ext4_iget(sb, ino); + inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error(sb, "couldn't read orphan inode %lu (err %d)", diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index bf7fa1507e81..e1801b288847 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1219,6 +1219,7 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode, ext4_lblk_t offsets[4], offsets2[4]; Indirect chain[4], chain2[4]; Indirect *partial, *partial2; + Indirect *p = NULL, *p2 = NULL; ext4_lblk_t max_block; __le32 nr = 0, nr2 = 0; int n = 0, n2 = 0; @@ -1260,7 +1261,7 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode, } - partial = ext4_find_shared(inode, n, offsets, chain, &nr); + partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); if (nr) { if (partial == chain) { /* Shared branch grows from the inode */ @@ -1285,13 +1286,11 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode, partial->p + 1, (__le32 *)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); partial--; } end_range: - partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); + partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); if (nr2) { if (partial2 == chain2) { /* @@ -1321,16 +1320,14 @@ end_range: (__le32 *)partial2->bh->b_data, partial2->p, (chain2+n2-1) - partial2); - BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); partial2--; } goto do_indirects; } /* Punch happened within the same level (n == n2) */ - partial = ext4_find_shared(inode, n, offsets, chain, &nr); - partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); + partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); + partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); /* Free top, but only if partial2 isn't its subtree. */ if (nr) { @@ -1387,11 +1384,7 @@ end_range: partial->p + 1, partial2->p, (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); - return 0; + goto cleanup; } /* @@ -1406,8 +1399,6 @@ end_range: partial->p + 1, (__le32 *)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); partial--; } if (partial2 > chain2 && depth2 <= depth) { @@ -1415,11 +1406,21 @@ end_range: (__le32 *)partial2->bh->b_data, partial2->p, (chain2+n2-1) - partial2); - BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); partial2--; } } + +cleanup: + while (p && p > chain) { + BUFFER_TRACE(p->bh, "call brelse"); + brelse(p->bh); + p--; + } + while (p2 && p2 > chain2) { + BUFFER_TRACE(p2->bh, "call brelse"); + brelse(p2->bh); + p2--; + } return 0; do_indirects: @@ -1427,7 +1428,7 @@ do_indirects: switch (offsets[0]) { default: if (++n >= n2) - return 0; + break; nr = i_data[EXT4_IND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); @@ -1435,7 +1436,7 @@ do_indirects: } case EXT4_IND_BLOCK: if (++n >= n2) - return 0; + break; nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); @@ -1443,7 +1444,7 @@ do_indirects: } case EXT4_DIND_BLOCK: if (++n >= n2) - return 0; + break; nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); @@ -1452,5 +1453,5 @@ do_indirects: case EXT4_TIND_BLOCK: ; } - return 0; + goto cleanup; } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 9c4bac18cc6c..56f6e1782d5f 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -705,8 +705,11 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, if (!PageUptodate(page)) { ret = ext4_read_inline_page(inode, page); - if (ret < 0) + if (ret < 0) { + unlock_page(page); + put_page(page); goto out_up_read; + } } ret = 1; @@ -1887,12 +1890,12 @@ int ext4_inline_data_fiemap(struct inode *inode, physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; physical += offsetof(struct ext4_inode, i_block); - if (physical) - error = fiemap_fill_next_extent(fieinfo, start, physical, - inline_len, flags); brelse(iloc.bh); out: up_read(&EXT4_I(inode)->xattr_sem); + if (physical) + error = fiemap_fill_next_extent(fieinfo, start, physical, + inline_len, flags); return (error < 0 ? error : 0); } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 244531d3065a..05dc5a4ba481 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2748,7 +2748,8 @@ static int ext4_writepages(struct address_space *mapping, * We may need to convert up to one extent per block in * the page and we may dirty the inode. */ - rsv_blocks = 1 + (PAGE_SIZE >> inode->i_blkbits); + rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, + PAGE_SIZE >> inode->i_blkbits); } /* @@ -4786,7 +4787,9 @@ static inline u64 ext4_inode_peek_iversion(const struct inode *inode) return inode_peek_iversion(inode); } -struct inode *ext4_iget(struct super_block *sb, unsigned long ino) +struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + ext4_iget_flags flags, const char *function, + unsigned int line) { struct ext4_iloc iloc; struct ext4_inode *raw_inode; @@ -4800,6 +4803,18 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) gid_t i_gid; projid_t i_projid; + if ((!(flags & EXT4_IGET_SPECIAL) && + (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)) || + (ino < EXT4_ROOT_INO) || + (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) { + if (flags & EXT4_IGET_HANDLE) + return ERR_PTR(-ESTALE); + __ext4_error(sb, function, line, + "inode #%lu: comm %s: iget: illegal inode #", + ino, current->comm); + return ERR_PTR(-EFSCORRUPTED); + } + inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); @@ -4815,18 +4830,26 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) raw_inode = ext4_raw_inode(&iloc); if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) { - EXT4_ERROR_INODE(inode, "root inode unallocated"); + ext4_error_inode(inode, function, line, 0, + "iget: root inode unallocated"); ret = -EFSCORRUPTED; goto bad_inode; } + if ((flags & EXT4_IGET_HANDLE) && + (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { + ret = -ESTALE; + goto bad_inode; + } + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb) || (ei->i_extra_isize & 3)) { - EXT4_ERROR_INODE(inode, - "bad extra_isize %u (inode size %u)", + ext4_error_inode(inode, function, line, 0, + "iget: bad extra_isize %u " + "(inode size %u)", ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); ret = -EFSCORRUPTED; @@ -4848,7 +4871,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { - EXT4_ERROR_INODE(inode, "checksum invalid"); + ext4_error_inode(inode, function, line, 0, + "iget: checksum invalid"); ret = -EFSBADCRC; goto bad_inode; } @@ -4905,7 +4929,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); if ((size = i_size_read(inode)) < 0) { - EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); + ext4_error_inode(inode, function, line, 0, + "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; goto bad_inode; } @@ -4981,7 +5006,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = 0; if (ei->i_file_acl && !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { - EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", + ext4_error_inode(inode, function, line, 0, + "iget: bad extended attribute block %llu", ei->i_file_acl); ret = -EFSCORRUPTED; goto bad_inode; @@ -5009,8 +5035,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } else if (S_ISLNK(inode->i_mode)) { /* VFS does not allow setting these so must be corruption */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - EXT4_ERROR_INODE(inode, - "immutable or append flags not allowed on symlinks"); + ext4_error_inode(inode, function, line, 0, + "iget: immutable or append flags " + "not allowed on symlinks"); ret = -EFSCORRUPTED; goto bad_inode; } @@ -5040,7 +5067,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) make_bad_inode(inode); } else { ret = -EFSCORRUPTED; - EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); + ext4_error_inode(inode, function, line, 0, + "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } brelse(iloc.bh); @@ -5054,13 +5082,6 @@ bad_inode: return ERR_PTR(ret); } -struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino) -{ - if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) - return ERR_PTR(-EFSCORRUPTED); - return ext4_iget(sb, ino); -} - static int ext4_inode_blocks_set(handle_t *handle, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) @@ -5299,7 +5320,6 @@ static int ext4_do_update_inode(handle_t *handle, err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); if (err) goto out_brelse; - ext4_update_dynamic_rev(sb); ext4_set_feature_large_file(sb); ext4_handle_sync(handle); err = ext4_handle_dirty_super(handle, sb); @@ -5349,9 +5369,13 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) || + sb_rdonly(inode->i_sb)) return 0; + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + if (EXT4_SB(inode->i_sb)->s_journal) { if (ext4_journal_current_handle()) { jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); @@ -5367,7 +5391,8 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; - err = ext4_force_commit(inode->i_sb); + err = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal, + EXT4_I(inode)->i_sync_tid); } else { struct ext4_iloc iloc; @@ -5571,25 +5596,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) up_write(&EXT4_I(inode)->i_data_sem); ext4_journal_stop(handle); if (error) { - if (orphan) + if (orphan && inode->i_nlink) ext4_orphan_del(NULL, inode); goto err_out; } } - if (!shrink) + if (!shrink) { pagecache_isize_extended(inode, oldsize, inode->i_size); - - /* - * Blocks are going to be removed from the inode. Wait - * for dio in flight. Temporarily disable - * dioread_nolock to prevent livelock. - */ - if (orphan) { - if (!ext4_should_journal_data(inode)) { - inode_dio_wait(inode); - } else - ext4_wait_for_tail_page_commit(inode); + } else { + /* + * Blocks are going to be removed from the inode. Wait + * for dio in flight. + */ + inode_dio_wait(inode); } + if (orphan && ext4_should_journal_data(inode)) + ext4_wait_for_tail_page_commit(inode); down_write(&EXT4_I(inode)->i_mmap_sem); rc = ext4_break_layouts(inode); @@ -5945,7 +5967,7 @@ int ext4_expand_extra_isize(struct inode *inode, ext4_write_lock_xattr(inode, &no_expand); - BUFFER_TRACE(iloc.bh, "get_write_access"); + BUFFER_TRACE(iloc->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, iloc->bh); if (error) { brelse(iloc->bh); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0edee31913d1..53d57cdf3c4d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -63,18 +63,20 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) loff_t isize; struct ext4_inode_info *ei1; struct ext4_inode_info *ei2; + unsigned long tmp; ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); swap(inode1->i_version, inode2->i_version); - swap(inode1->i_blocks, inode2->i_blocks); - swap(inode1->i_bytes, inode2->i_bytes); swap(inode1->i_atime, inode2->i_atime); swap(inode1->i_mtime, inode2->i_mtime); memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); - swap(ei1->i_flags, ei2->i_flags); + tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP; + ei1->i_flags = (ei2->i_flags & EXT4_FL_SHOULD_SWAP) | + (ei1->i_flags & ~EXT4_FL_SHOULD_SWAP); + ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP); swap(ei1->i_disksize, ei2->i_disksize); ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); @@ -115,28 +117,41 @@ static long swap_inode_boot_loader(struct super_block *sb, int err; struct inode *inode_bl; struct ext4_inode_info *ei_bl; + qsize_t size, size_bl, diff; + blkcnt_t blocks; + unsigned short bytes; - if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || - IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || - ext4_has_inline_data(inode)) - return -EINVAL; - - if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || - !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) - return -EPERM; - - inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); + inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL); if (IS_ERR(inode_bl)) return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); - filemap_flush(inode->i_mapping); - filemap_flush(inode_bl->i_mapping); - /* Protect orig inodes against a truncate and make sure, * that only 1 swap_inode_boot_loader is running. */ lock_two_nondirectories(inode, inode_bl); + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || + IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || + ext4_has_inline_data(inode)) { + err = -EINVAL; + goto journal_err_out; + } + + if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || + !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto journal_err_out; + } + + down_write(&EXT4_I(inode)->i_mmap_sem); + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto err_out; + + err = filemap_write_and_wait(inode_bl->i_mapping); + if (err) + goto err_out; + /* Wait for all existing dio workers */ inode_dio_wait(inode); inode_dio_wait(inode_bl); @@ -147,7 +162,7 @@ static long swap_inode_boot_loader(struct super_block *sb, handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); if (IS_ERR(handle)) { err = -EINVAL; - goto journal_err_out; + goto err_out; } /* Protect extent tree against block allocations via delalloc */ @@ -170,6 +185,13 @@ static long swap_inode_boot_loader(struct super_block *sb, memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); } + err = dquot_initialize(inode); + if (err) + goto err_out1; + + size = (qsize_t)(inode->i_blocks) * (1 << 9) + inode->i_bytes; + size_bl = (qsize_t)(inode_bl->i_blocks) * (1 << 9) + inode_bl->i_bytes; + diff = size - size_bl; swap_inode_data(inode, inode_bl); inode->i_ctime = inode_bl->i_ctime = current_time(inode); @@ -183,27 +205,51 @@ static long swap_inode_boot_loader(struct super_block *sb, err = ext4_mark_inode_dirty(handle, inode); if (err < 0) { + /* No need to update quota information. */ ext4_warning(inode->i_sb, "couldn't mark inode #%lu dirty (err %d)", inode->i_ino, err); /* Revert all changes: */ swap_inode_data(inode, inode_bl); ext4_mark_inode_dirty(handle, inode); - } else { - err = ext4_mark_inode_dirty(handle, inode_bl); - if (err < 0) { - ext4_warning(inode_bl->i_sb, - "couldn't mark inode #%lu dirty (err %d)", - inode_bl->i_ino, err); - /* Revert all changes: */ - swap_inode_data(inode, inode_bl); - ext4_mark_inode_dirty(handle, inode); - ext4_mark_inode_dirty(handle, inode_bl); - } + goto err_out1; } + + blocks = inode_bl->i_blocks; + bytes = inode_bl->i_bytes; + inode_bl->i_blocks = inode->i_blocks; + inode_bl->i_bytes = inode->i_bytes; + err = ext4_mark_inode_dirty(handle, inode_bl); + if (err < 0) { + /* No need to update quota information. */ + ext4_warning(inode_bl->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode_bl->i_ino, err); + goto revert; + } + + /* Bootloader inode should not be counted into quota information. */ + if (diff > 0) + dquot_free_space(inode, diff); + else + err = dquot_alloc_space(inode, -1 * diff); + + if (err < 0) { +revert: + /* Revert all changes: */ + inode_bl->i_blocks = blocks; + inode_bl->i_bytes = bytes; + swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); + ext4_mark_inode_dirty(handle, inode_bl); + } + +err_out1: ext4_journal_stop(handle); ext4_double_up_write_data_sem(inode, inode_bl); +err_out: + up_write(&EXT4_I(inode)->i_mmap_sem); journal_err_out: unlock_two_nondirectories(inode, inode_bl); iput(inode_bl); @@ -931,7 +977,7 @@ mext_out: if (err == 0) err = err2; mnt_drop_write_file(filp); - if (!err && (o_group > EXT4_SB(sb)->s_groups_count) && + if (!err && (o_group < EXT4_SB(sb)->s_groups_count) && ext4_has_group_desc_csum(sb) && test_opt(sb, INIT_INODE_TABLE)) err = ext4_register_li_request(sb, o_group); @@ -953,6 +999,13 @@ resizefs_out: if (!blk_queue_discard(q)) return -EOPNOTSUPP; + /* + * We haven't replayed the journal, so we cannot use our + * block-bitmap-guided storage zapping commands. + */ + if (test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) + return -EROFS; + if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e29fce2fbf25..cc229f3357f7 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1539,7 +1539,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block, ex->fe_len += 1 << order; } - if (ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))) { + if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) { /* Should never happen! (but apparently sometimes does?!?) */ WARN_ON(1); ext4_error(e4b->bd_sb, "corruption or bug in mb_find_extent " diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 61a9d1927817..a98bfca9c463 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -116,9 +116,9 @@ static int update_ind_extent_range(handle_t *handle, struct inode *inode, int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - bh = sb_bread(inode->i_sb, pblock); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, pblock, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { @@ -145,9 +145,9 @@ static int update_dind_extent_range(handle_t *handle, struct inode *inode, int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - bh = sb_bread(inode->i_sb, pblock); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, pblock, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { @@ -175,9 +175,9 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode, int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - bh = sb_bread(inode->i_sb, pblock); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, pblock, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { @@ -224,9 +224,9 @@ static int free_dind_blocks(handle_t *handle, struct buffer_head *bh; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); tmp_idata = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { @@ -254,9 +254,9 @@ static int free_tind_blocks(handle_t *handle, struct buffer_head *bh; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); tmp_idata = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { @@ -382,9 +382,9 @@ static int free_ext_idx(handle_t *handle, struct inode *inode, struct ext4_extent_header *eh; block = ext4_idx_pblock(ix); - bh = sb_bread(inode->i_sb, block); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, block, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); eh = (struct ext4_extent_header *)bh->b_data; if (eh->eh_depth != 0) { diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index ffa25753e929..4c5aa5df6573 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -871,12 +871,15 @@ static void dx_release(struct dx_frame *frames) { struct dx_root_info *info; int i; + unsigned int indirect_levels; if (frames[0].bh == NULL) return; info = &((struct dx_root *)frames[0].bh->b_data)->info; - for (i = 0; i <= info->indirect_levels; i++) { + /* save local copy, "info" may be freed after brelse() */ + indirect_levels = info->indirect_levels; + for (i = 0; i <= indirect_levels; i++) { if (frames[i].bh == NULL) break; brelse(frames[i].bh); @@ -1571,7 +1574,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi dentry); return ERR_PTR(-EFSCORRUPTED); } - inode = ext4_iget_normal(dir->i_sb, ino); + inode = ext4_iget(dir->i_sb, ino, EXT4_IGET_NORMAL); if (inode == ERR_PTR(-ESTALE)) { EXT4_ERROR_INODE(dir, "deleted inode referenced: %u", @@ -1613,7 +1616,7 @@ struct dentry *ext4_get_parent(struct dentry *child) return ERR_PTR(-EFSCORRUPTED); } - return d_obtain_alias(ext4_iget_normal(child->d_sb, ino)); + return d_obtain_alias(ext4_iget(child->d_sb, ino, EXT4_IGET_NORMAL)); } /* diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index a5efee34415f..4d5c0fc9d23a 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -127,10 +127,12 @@ static int verify_group_input(struct super_block *sb, else if (free_blocks_count < 0) ext4_warning(sb, "Bad blocks count %u", input->blocks_count); - else if (!(bh = sb_bread(sb, end - 1))) + else if (IS_ERR(bh = ext4_sb_bread(sb, end - 1, 0))) { + err = PTR_ERR(bh); + bh = NULL; ext4_warning(sb, "Cannot read last block (%llu)", end - 1); - else if (outside(input->block_bitmap, start, end)) + } else if (outside(input->block_bitmap, start, end)) ext4_warning(sb, "Block bitmap not in group (block %llu)", (unsigned long long)input->block_bitmap); else if (outside(input->inode_bitmap, start, end)) @@ -781,11 +783,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, struct ext4_super_block *es = EXT4_SB(sb)->s_es; unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; - struct buffer_head **o_group_desc, **n_group_desc; - struct buffer_head *dind; - struct buffer_head *gdb_bh; + struct buffer_head **o_group_desc, **n_group_desc = NULL; + struct buffer_head *dind = NULL; + struct buffer_head *gdb_bh = NULL; int gdbackups; - struct ext4_iloc iloc; + struct ext4_iloc iloc = { .bh = NULL }; __le32 *data; int err; @@ -794,21 +796,22 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", gdb_num); - gdb_bh = sb_bread(sb, gdblock); - if (!gdb_bh) - return -EIO; + gdb_bh = ext4_sb_bread(sb, gdblock, 0); + if (IS_ERR(gdb_bh)) + return PTR_ERR(gdb_bh); gdbackups = verify_reserved_gdb(sb, group, gdb_bh); if (gdbackups < 0) { err = gdbackups; - goto exit_bh; + goto errout; } data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; - dind = sb_bread(sb, le32_to_cpu(*data)); - if (!dind) { - err = -EIO; - goto exit_bh; + dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0); + if (IS_ERR(dind)) { + err = PTR_ERR(dind); + dind = NULL; + goto errout; } data = (__le32 *)dind->b_data; @@ -816,18 +819,18 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, ext4_warning(sb, "new group %u GDT block %llu not reserved", group, gdblock); err = -EINVAL; - goto exit_dind; + goto errout; } BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); if (unlikely(err)) - goto exit_dind; + goto errout; BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) - goto exit_dind; + goto errout; BUFFER_TRACE(dind, "get_write_access"); err = ext4_journal_get_write_access(handle, dind); @@ -837,7 +840,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, /* ext4_reserve_inode_write() gets a reference on the iloc */ err = ext4_reserve_inode_write(handle, inode, &iloc); if (unlikely(err)) - goto exit_dind; + goto errout; n_group_desc = ext4_kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), @@ -846,7 +849,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = -ENOMEM; ext4_warning(sb, "not enough memory for %lu groups", gdb_num + 1); - goto exit_inode; + goto errout; } /* @@ -862,7 +865,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = ext4_handle_dirty_metadata(handle, NULL, dind); if (unlikely(err)) { ext4_std_error(sb, err); - goto exit_inode; + goto errout; } inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> (9 - EXT4_SB(sb)->s_cluster_bits); @@ -872,7 +875,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (unlikely(err)) { ext4_std_error(sb, err); iloc.bh = NULL; - goto exit_inode; + goto errout; } brelse(dind); @@ -888,15 +891,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = ext4_handle_dirty_super(handle, sb); if (err) ext4_std_error(sb, err); - return err; - -exit_inode: +errout: kvfree(n_group_desc); brelse(iloc.bh); -exit_dind: brelse(dind); -exit_bh: brelse(gdb_bh); ext4_debug("leaving with error %d\n", err); @@ -916,9 +915,9 @@ static int add_new_gdb_meta_bg(struct super_block *sb, gdblock = ext4_meta_bg_first_block_no(sb, group) + ext4_bg_has_super(sb, group); - gdb_bh = sb_bread(sb, gdblock); - if (!gdb_bh) - return -EIO; + gdb_bh = ext4_sb_bread(sb, gdblock, 0); + if (IS_ERR(gdb_bh)) + return PTR_ERR(gdb_bh); n_group_desc = ext4_kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), GFP_NOFS); @@ -934,11 +933,18 @@ static int add_new_gdb_meta_bg(struct super_block *sb, memcpy(n_group_desc, o_group_desc, EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); n_group_desc[gdb_num] = gdb_bh; + + BUFFER_TRACE(gdb_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gdb_bh); + if (err) { + kvfree(n_group_desc); + brelse(gdb_bh); + return err; + } + EXT4_SB(sb)->s_group_desc = n_group_desc; EXT4_SB(sb)->s_gdb_count++; kvfree(o_group_desc); - BUFFER_TRACE(gdb_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gdb_bh); return err; } @@ -975,9 +981,10 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, return -ENOMEM; data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; - dind = sb_bread(sb, le32_to_cpu(*data)); - if (!dind) { - err = -EIO; + dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0); + if (IS_ERR(dind)) { + err = PTR_ERR(dind); + dind = NULL; goto exit_free; } @@ -996,9 +1003,10 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, err = -EINVAL; goto exit_bh; } - primary[res] = sb_bread(sb, blk); - if (!primary[res]) { - err = -EIO; + primary[res] = ext4_sb_bread(sb, blk, 0); + if (IS_ERR(primary[res])) { + err = PTR_ERR(primary[res]); + primary[res] = NULL; goto exit_bh; } gdbackups = verify_reserved_gdb(sb, group, primary[res]); @@ -1631,13 +1639,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) } if (reserved_gdb || gdb_off == 0) { - if (ext4_has_feature_resize_inode(sb) || + if (!ext4_has_feature_resize_inode(sb) || !le16_to_cpu(es->s_reserved_gdt_blocks)) { ext4_warning(sb, "No reserved GDT blocks, can't resize"); return -EPERM; } - inode = ext4_iget(sb, EXT4_RESIZE_INO); + inode = ext4_iget(sb, EXT4_RESIZE_INO, EXT4_IGET_SPECIAL); if (IS_ERR(inode)) { ext4_warning(sb, "Error opening resize inode"); return PTR_ERR(inode); @@ -1960,12 +1968,14 @@ retry: le16_to_cpu(es->s_reserved_gdt_blocks); n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); n_blocks_count = (ext4_fsblk_t)n_group * - EXT4_BLOCKS_PER_GROUP(sb); + EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(es->s_first_data_block); n_group--; /* set to last group number */ } if (!resize_inode) - resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); + resize_inode = ext4_iget(sb, EXT4_RESIZE_INO, + EXT4_IGET_SPECIAL); if (IS_ERR(resize_inode)) { ext4_warning(sb, "Error opening resize inode"); return PTR_ERR(resize_inode); @@ -2071,6 +2081,10 @@ out: free_flex_gd(flex_gd); if (resize_inode != NULL) iput(resize_inode); - ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count); + if (err) + ext4_warning(sb, "error (%d) occurred during " + "file system resize", err); + ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", + ext4_blocks_count(es)); return err; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8a149df1c6a1..a270391228af 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -140,6 +140,29 @@ MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) +/* + * This works like sb_bread() except it uses ERR_PTR for error + * returns. Currently with sb_bread it's impossible to distinguish + * between ENOMEM and EIO situations (since both result in a NULL + * return. + */ +struct buffer_head * +ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags) +{ + struct buffer_head *bh = sb_getblk(sb, block); + + if (bh == NULL) + return ERR_PTR(-ENOMEM); + if (buffer_uptodate(bh)) + return bh; + ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + put_bh(bh); + return ERR_PTR(-EIO); +} + static int ext4_verify_csum_type(struct super_block *sb, struct ext4_super_block *es) { @@ -407,6 +430,12 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) spin_unlock(&sbi->s_md_lock); } +static bool system_going_down(void) +{ + return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF + || system_state == SYSTEM_RESTART; +} + /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * @@ -437,7 +466,12 @@ static void ext4_handle_error(struct super_block *sb) if (journal) jbd2_journal_abort(journal, -EIO); } - if (test_opt(sb, ERRORS_RO)) { + /* + * We force ERRORS_RO behavior when system is rebooting. Otherwise we + * could panic during 'reboot -f' as the underlying device got already + * disabled. + */ + if (test_opt(sb, ERRORS_RO) || system_going_down()) { ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* * Make sure updated value of ->s_mount_flags will be visible @@ -445,8 +479,7 @@ static void ext4_handle_error(struct super_block *sb) */ smp_wmb(); sb->s_flags |= SB_RDONLY; - } - if (test_opt(sb, ERRORS_PANIC)) { + } else if (test_opt(sb, ERRORS_PANIC)) { if (EXT4_SB(sb)->s_journal && !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) return; @@ -665,7 +698,7 @@ void __ext4_abort(struct super_block *sb, const char *function, jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); save_error_info(sb, function, line); } - if (test_opt(sb, ERRORS_PANIC)) { + if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { if (EXT4_SB(sb)->s_journal && !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) return; @@ -1150,20 +1183,11 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb, { struct inode *inode; - if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) - return ERR_PTR(-ESTALE); - if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) - return ERR_PTR(-ESTALE); - - /* iget isn't really right if the inode is currently unallocated!! - * - * ext4_read_inode will return a bad_inode if the inode had been - * deleted, so we should be safe. - * + /* * Currently we don't know the generation for parent directory, so * a generation of 0 means "accept any" */ - inode = ext4_iget_normal(sb, ino); + inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { @@ -1188,6 +1212,16 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, ext4_nfs_get_inode); } +static int ext4_nfs_commit_metadata(struct inode *inode) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL + }; + + trace_ext4_nfs_commit_metadata(inode); + return ext4_write_inode(inode, &wbc); +} + /* * Try to release metadata pages (indirect blocks, directories) which are * mapped via the block device. Since these pages could have journal heads @@ -1392,6 +1426,7 @@ static const struct export_operations ext4_export_ops = { .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, .get_parent = ext4_get_parent, + .commit_metadata = ext4_nfs_commit_metadata, }; enum { @@ -2224,7 +2259,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); ext4_update_tstamp(es, s_mtime); - ext4_update_dynamic_rev(sb); if (sbi->s_journal) ext4_set_feature_journal_needs_recovery(sb); @@ -3479,6 +3513,37 @@ int ext4_calculate_overhead(struct super_block *sb) return 0; } +static void ext4_clamp_want_extra_isize(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + + /* determine the minimum size of new large inodes, if present */ + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE && + sbi->s_want_extra_isize == 0) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + if (ext4_has_feature_extra_isize(sb)) { + if (sbi->s_want_extra_isize < + le16_to_cpu(es->s_want_extra_isize)) + sbi->s_want_extra_isize = + le16_to_cpu(es->s_want_extra_isize); + if (sbi->s_want_extra_isize < + le16_to_cpu(es->s_min_extra_isize)) + sbi->s_want_extra_isize = + le16_to_cpu(es->s_min_extra_isize); + } + } + /* Check if enough inode space is available */ + if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > + sbi->s_inode_size) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + ext4_msg(sb, KERN_INFO, + "required extra inode space not available"); + } +} + static void ext4_set_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; @@ -4204,7 +4269,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "data=, fs mounted w/o journal"); goto failed_mount_wq; } - sbi->s_def_mount_opt &= EXT4_MOUNT_JOURNAL_CHECKSUM; + sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); sbi->s_journal = NULL; @@ -4327,7 +4392,7 @@ no_journal: * so we can safely mount the rest of the filesystem now. */ - root = ext4_iget(sb, EXT4_ROOT_INO); + root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL); if (IS_ERR(root)) { ext4_msg(sb, KERN_ERR, "get root inode failed"); ret = PTR_ERR(root); @@ -4353,30 +4418,7 @@ no_journal: } else if (ret) goto failed_mount4a; - /* determine the minimum size of new large inodes, if present */ - if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE && - sbi->s_want_extra_isize == 0) { - sbi->s_want_extra_isize = sizeof(struct ext4_inode) - - EXT4_GOOD_OLD_INODE_SIZE; - if (ext4_has_feature_extra_isize(sb)) { - if (sbi->s_want_extra_isize < - le16_to_cpu(es->s_want_extra_isize)) - sbi->s_want_extra_isize = - le16_to_cpu(es->s_want_extra_isize); - if (sbi->s_want_extra_isize < - le16_to_cpu(es->s_min_extra_isize)) - sbi->s_want_extra_isize = - le16_to_cpu(es->s_min_extra_isize); - } - } - /* Check if enough inode space is available */ - if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > - sbi->s_inode_size) { - sbi->s_want_extra_isize = sizeof(struct ext4_inode) - - EXT4_GOOD_OLD_INODE_SIZE; - ext4_msg(sb, KERN_INFO, "required extra inode space not" - "available"); - } + ext4_clamp_want_extra_isize(sb); ext4_set_resv_clusters(sb); @@ -4597,7 +4639,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, * happen if we iget() an unused inode, as the subsequent iput() * will try to delete it. */ - journal_inode = ext4_iget(sb, journal_inum); + journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL); if (IS_ERR(journal_inode)) { ext4_msg(sb, KERN_ERR, "no journal found"); return NULL; @@ -4879,7 +4921,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) ext4_superblock_csum_set(sb); if (sync) lock_buffer(sbh); - if (buffer_write_io_error(sbh)) { + if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { /* * Oh, dear. A previous attempt to write the * superblock failed. This could happen because the @@ -5162,6 +5204,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + ext4_clamp_want_extra_isize(sb); + if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "changing journal_checksum " @@ -5679,7 +5723,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, if (!qf_inums[type]) return -EPERM; - qf_inode = ext4_iget(sb, qf_inums[type]); + qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL); if (IS_ERR(qf_inode)) { ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]); return PTR_ERR(qf_inode); @@ -5689,9 +5733,9 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, qf_inode->i_flags |= S_NOQUOTA; lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA); err = dquot_enable(qf_inode, type, format_id, flags); - iput(qf_inode); if (err) lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL); + iput(qf_inode); return err; } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 4380c8630539..f73fc90e5daa 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -384,7 +384,7 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, struct inode *inode; int err; - inode = ext4_iget(parent->i_sb, ea_ino); + inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error(parent->i_sb, @@ -522,14 +522,13 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); - error = -ENODATA; if (!EXT4_I(inode)->i_file_acl) - goto cleanup; + return -ENODATA; ea_idebug(inode, "reading block %llu", (unsigned long long)EXT4_I(inode)->i_file_acl); - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) - goto cleanup; + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) + return PTR_ERR(bh); ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); error = ext4_xattr_check_block(inode, bh); @@ -696,26 +695,23 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); - error = 0; if (!EXT4_I(inode)->i_file_acl) - goto cleanup; + return 0; ea_idebug(inode, "reading block %llu", (unsigned long long)EXT4_I(inode)->i_file_acl); - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - error = -EIO; - if (!bh) - goto cleanup; + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) + return PTR_ERR(bh); ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); error = ext4_xattr_check_block(inode, bh); if (error) goto cleanup; ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh); - error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); - + error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, + buffer_size); cleanup: brelse(bh); - return error; } @@ -830,9 +826,10 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) } if (EXT4_I(inode)->i_file_acl) { - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) { - ret = -EIO; + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); + bh = NULL; goto out; } @@ -1490,7 +1487,8 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, } while (ce) { - ea_inode = ext4_iget(inode->i_sb, ce->e_value); + ea_inode = ext4_iget(inode->i_sb, ce->e_value, + EXT4_IGET_NORMAL); if (!IS_ERR(ea_inode) && !is_bad_inode(ea_inode) && (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) && @@ -1702,7 +1700,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, /* No failures allowed past this point. */ - if (!s->not_found && here->e_value_size && here->e_value_offs) { + if (!s->not_found && here->e_value_size && !here->e_value_inum) { /* Remove the old value. */ void *first_val = s->base + min_offs; size_t offs = le16_to_cpu(here->e_value_offs); @@ -1825,16 +1823,15 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, if (EXT4_I(inode)->i_file_acl) { /* The inode already has an extended attribute block. */ - bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl); - error = -EIO; - if (!bs->bh) - goto cleanup; + bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bs->bh)) + return PTR_ERR(bs->bh); ea_bdebug(bs->bh, "b_count=%d, refcount=%d", atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); error = ext4_xattr_check_block(inode, bs->bh); if (error) - goto cleanup; + return error; /* Find the named attribute. */ bs->s.base = BHDR(bs->bh); bs->s.first = BFIRST(bs->bh); @@ -1843,13 +1840,10 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, error = xattr_find_entry(inode, &bs->s.here, bs->s.end, i->name_index, i->name, 1); if (error && error != -ENODATA) - goto cleanup; + return error; bs->s.not_found = error; } - error = 0; - -cleanup: - return error; + return 0; } static int @@ -2278,9 +2272,9 @@ static struct buffer_head *ext4_xattr_get_block(struct inode *inode) if (!EXT4_I(inode)->i_file_acl) return NULL; - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) - return ERR_PTR(-EIO); + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) + return bh; error = ext4_xattr_check_block(inode, bh); if (error) { brelse(bh); @@ -2733,7 +2727,7 @@ retry: base = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; min_offs = end - base; - total_ino = sizeof(struct ext4_xattr_ibody_header); + total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32); error = xattr_check_inode(inode, header, end); if (error) @@ -2750,10 +2744,11 @@ retry: if (EXT4_I(inode)->i_file_acl) { struct buffer_head *bh; - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - error = -EIO; - if (!bh) + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) { + error = PTR_ERR(bh); goto cleanup; + } error = ext4_xattr_check_block(inode, bh); if (error) { brelse(bh); @@ -2907,11 +2902,13 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, } if (EXT4_I(inode)->i_file_acl) { - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) { - EXT4_ERROR_INODE(inode, "block %llu read error", - EXT4_I(inode)->i_file_acl); - error = -EIO; + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) { + error = PTR_ERR(bh); + if (error == -EIO) + EXT4_ERROR_INODE(inode, "block %llu read error", + EXT4_I(inode)->i_file_acl); + bh = NULL; goto cleanup; } error = ext4_xattr_check_block(inode, bh); @@ -3064,8 +3061,11 @@ ext4_xattr_block_cache_find(struct inode *inode, while (ce) { struct buffer_head *bh; - bh = sb_bread(inode->i_sb, ce->e_value); - if (!bh) { + bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO); + if (IS_ERR(bh)) { + if (PTR_ERR(bh) == -ENOMEM) + return NULL; + bh = NULL; EXT4_ERROR_INODE(inode, "block %lu read error", (unsigned long)ce->e_value); } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 111824199a88..b9fe937a3c70 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -352,12 +352,14 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode, return PTR_ERR(p); clone = f2fs_acl_clone(p, GFP_NOFS); - if (!clone) - goto no_mem; + if (!clone) { + ret = -ENOMEM; + goto release_acl; + } ret = f2fs_acl_create_masq(clone, mode); if (ret < 0) - goto no_mem_clone; + goto release_clone; if (ret == 0) posix_acl_release(clone); @@ -371,11 +373,11 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode, return 0; -no_mem_clone: +release_clone: posix_acl_release(clone); -no_mem: +release_acl: posix_acl_release(p); - return -ENOMEM; + return ret; } int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 11f28342f641..4d02e76b648a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -197,12 +197,14 @@ struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, struct block_device *bdev = sbi->sb->s_bdev; int i; - for (i = 0; i < sbi->s_ndevs; i++) { - if (FDEV(i).start_blk <= blk_addr && - FDEV(i).end_blk >= blk_addr) { - blk_addr -= FDEV(i).start_blk; - bdev = FDEV(i).bdev; - break; + if (f2fs_is_multi_device(sbi)) { + for (i = 0; i < sbi->s_ndevs; i++) { + if (FDEV(i).start_blk <= blk_addr && + FDEV(i).end_blk >= blk_addr) { + blk_addr -= FDEV(i).start_blk; + bdev = FDEV(i).bdev; + break; + } } } if (bio) { @@ -216,6 +218,9 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) { int i; + if (!f2fs_is_multi_device(sbi)) + return 0; + for (i = 0; i < sbi->s_ndevs; i++) if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr) return i; @@ -2259,6 +2264,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, bool locked = false; struct extent_info ei = {0,0,0}; int err = 0; + int flag; /* * we already allocated all the blocks, so we don't need to get @@ -2268,9 +2274,15 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, !is_inode_flag_set(inode, FI_NO_PREALLOC)) return 0; + /* f2fs_lock_op avoids race between write CP and convert_inline_page */ + if (f2fs_has_inline_data(inode) && pos + len > MAX_INLINE_DATA(inode)) + flag = F2FS_GET_BLOCK_DEFAULT; + else + flag = F2FS_GET_BLOCK_PRE_AIO; + if (f2fs_has_inline_data(inode) || (pos & PAGE_MASK) >= i_size_read(inode)) { - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + __do_map_lock(sbi, flag, true); locked = true; } restart: @@ -2308,6 +2320,7 @@ restart: f2fs_put_dnode(&dn); __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO); locked = true; goto restart; } @@ -2321,7 +2334,7 @@ out: f2fs_put_dnode(&dn); unlock_out: if (locked) - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + __do_map_lock(sbi, flag, false); return err; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 214a968962a1..ebe649d9793c 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -190,8 +190,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry); si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); - if (f2fs_discard_en(sbi)) - si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); si->base_mem += SIT_VBLOCK_MAP_SIZE; if (sbi->segs_per_sec > 1) si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ecb735142276..a4b6eacf22ea 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -450,7 +450,6 @@ struct f2fs_flush_device { /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 -#define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); static inline int get_inline_xattr_addrs(struct inode *inode); #define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ @@ -1337,6 +1336,17 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) } #endif +/* + * Test if the mounted volume is a multi-device volume. + * - For a single regular disk volume, sbi->s_ndevs is 0. + * - For a single zoned disk volume, sbi->s_ndevs is 1. + * - For a multi-device volume, sbi->s_ndevs is always 2 or more. + */ +static inline bool f2fs_is_multi_device(struct f2fs_sb_info *sbi) +{ + return sbi->s_ndevs > 1; +} + /* For write statistics. Suppose sector size is 512 bytes, * and the return value is in kbytes. s is of struct f2fs_sb_info. */ @@ -2613,10 +2623,19 @@ static inline bool is_dot_dotdot(const struct qstr *str) static inline bool f2fs_may_extent_tree(struct inode *inode) { - if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) || + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, EXTENT_CACHE) || is_inode_flag_set(inode, FI_NO_EXTENT)) return false; + /* + * for recovered files during mount do not create extents + * if shrinker is not registered. + */ + if (list_empty(&sbi->s_list)) + return false; + return S_ISREG(inode->i_mode); } @@ -3401,11 +3420,20 @@ static inline int get_blkz_type(struct f2fs_sb_info *sbi, } #endif -static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) +static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi) { - struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + return f2fs_sb_has_blkzoned(sbi->sb); +} - return blk_queue_discard(q) || f2fs_sb_has_blkzoned(sbi->sb); +static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi) +{ + return blk_queue_discard(bdev_get_queue(sbi->sb->s_bdev)); +} + +static inline bool f2fs_realtime_discard_enable(struct f2fs_sb_info *sbi) +{ + return (test_opt(sbi, DISCARD) && f2fs_hw_support_discard(sbi)) || + f2fs_hw_should_discard(sbi); } static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) @@ -3438,7 +3466,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, int rw) { return (f2fs_post_read_required(inode) || (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || - F2FS_I_SB(inode)->s_ndevs); + f2fs_is_multi_device(F2FS_I_SB(inode))); } #ifdef CONFIG_F2FS_FAULT_INJECTION diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5474aaa274b9..8d1eb8dec605 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -220,6 +220,9 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, trace_f2fs_sync_file_enter(inode); + if (S_ISDIR(inode->i_mode)) + goto go_write; + /* if fdatasync is triggered, let's do in-place-update */ if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) set_inode_flag(inode, FI_NEED_IPU); @@ -767,7 +770,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int err; - bool size_changed = false; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; @@ -827,8 +829,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) down_write(&F2FS_I(inode)->i_sem); F2FS_I(inode)->last_disk_size = i_size_read(inode); up_write(&F2FS_I(inode)->i_sem); - - size_changed = true; } __setattr_copy(inode, attr); @@ -842,7 +842,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } /* file size may changed here */ - f2fs_mark_inode_dirty_sync(inode, size_changed); + f2fs_mark_inode_dirty_sync(inode, true); /* inode change will produce dirty node pages flushed by checkpoint */ f2fs_balance_fs(F2FS_I_SB(inode), true); @@ -1733,10 +1733,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (!get_dirty_pages(inode)) - goto skip_flush; - - f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, + /* + * Should wait end_io to count F2FS_WB_CP_DATA correctly by + * f2fs_is_atomic_file. + */ + if (get_dirty_pages(inode)) + f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); @@ -1744,7 +1746,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } -skip_flush: + set_inode_flag(inode, FI_ATOMIC_FILE); clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -1978,7 +1980,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!f2fs_hw_support_discard(F2FS_SB(sb))) return -EOPNOTSUPP; if (copy_from_user(&range, (struct fstrim_range __user *)arg, @@ -2537,7 +2539,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) sizeof(range))) return -EFAULT; - if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num || + if (!f2fs_is_multi_device(sbi) || sbi->s_ndevs - 1 <= range.dev_num || sbi->segs_per_sec != 1) { f2fs_msg(sbi->sb, KERN_WARNING, "Can't flush %u in %d for segs_per_sec %u != 1\n", diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 5c8d00422237..d44b57a363ff 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1256,7 +1256,7 @@ void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES; /* give warm/cold data area from slower device */ - if (sbi->s_ndevs && sbi->segs_per_sec == 1) + if (f2fs_is_multi_device(sbi) && sbi->segs_per_sec == 1) SIT_I(sbi)->last_victim[ALLOC_NEXT] = GET_SEGNO(sbi, FDEV(0).end_blk) + 1; } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 115dc219344b..92703efde36e 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -661,6 +661,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (IS_ERR(ipage)) return PTR_ERR(ipage); + /* + * f2fs_readdir was protected by inode.i_rwsem, it is safe to access + * ipage without page's lock held. + */ + unlock_page(ipage); + inline_dentry = inline_data_addr(inode, ipage); make_dentry_ptr_inline(inode, &d, inline_dentry); @@ -669,7 +675,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (!err) ctx->pos = d.max; - f2fs_put_page(ipage, 1); + f2fs_put_page(ipage, 0); return err < 0 ? err : 0; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 42ea42acb487..19a0d83aae65 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -827,6 +827,7 @@ static int truncate_node(struct dnode_of_data *dn) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info ni; int err; + pgoff_t index; err = f2fs_get_node_info(sbi, dn->nid, &ni); if (err) @@ -846,10 +847,11 @@ static int truncate_node(struct dnode_of_data *dn) clear_node_page_dirty(dn->node_page); set_sbi_flag(sbi, SBI_IS_DIRTY); + index = dn->node_page->index; f2fs_put_page(dn->node_page, 1); invalidate_mapping_pages(NODE_MAPPING(sbi), - dn->node_page->index, dn->node_page->index); + index, index); dn->node_page = NULL; trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 9a8579fb3a30..ae0e5f2e67b4 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -99,8 +99,12 @@ err_out: return ERR_PTR(err); } -static void del_fsync_inode(struct fsync_inode_entry *entry) +static void del_fsync_inode(struct fsync_inode_entry *entry, int drop) { + if (drop) { + /* inode should not be recovered, drop it */ + f2fs_inode_synced(entry->inode); + } iput(entry->inode); list_del(&entry->list); kmem_cache_free(fsync_entry_slab, entry); @@ -321,12 +325,12 @@ next: return err; } -static void destroy_fsync_dnodes(struct list_head *head) +static void destroy_fsync_dnodes(struct list_head *head, int drop) { struct fsync_inode_entry *entry, *tmp; list_for_each_entry_safe(entry, tmp, head, list) - del_fsync_inode(entry); + del_fsync_inode(entry, drop); } static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, @@ -561,7 +565,7 @@ out: } static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, - struct list_head *dir_list) + struct list_head *tmp_inode_list, struct list_head *dir_list) { struct curseg_info *curseg; struct page *page = NULL; @@ -615,7 +619,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, } if (entry->blkaddr == blkaddr) - del_fsync_inode(entry); + list_move_tail(&entry->list, tmp_inode_list); next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); @@ -628,7 +632,7 @@ next: int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { - struct list_head inode_list; + struct list_head inode_list, tmp_inode_list; struct list_head dir_list; int err; int ret = 0; @@ -659,6 +663,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) } INIT_LIST_HEAD(&inode_list); + INIT_LIST_HEAD(&tmp_inode_list); INIT_LIST_HEAD(&dir_list); /* prevent checkpoint */ @@ -677,11 +682,16 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) need_writecp = true; /* step #2: recover data */ - err = recover_data(sbi, &inode_list, &dir_list); + err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); + else { + /* restore s_flags to let iput() trash data */ + sbi->sb->s_flags = s_flags; + } skip: - destroy_fsync_dnodes(&inode_list); + destroy_fsync_dnodes(&inode_list, err); + destroy_fsync_dnodes(&tmp_inode_list, err); /* truncate meta pages to be used by the recovery */ truncate_inode_pages_range(META_MAPPING(sbi), @@ -690,13 +700,13 @@ skip: if (err) { truncate_inode_pages_final(NODE_MAPPING(sbi)); truncate_inode_pages_final(META_MAPPING(sbi)); + } else { + clear_sbi_flag(sbi, SBI_POR_DOING); } - - clear_sbi_flag(sbi, SBI_POR_DOING); mutex_unlock(&sbi->cp_mutex); /* let's drop all the directory inodes for clean checkpoint */ - destroy_fsync_dnodes(&dir_list); + destroy_fsync_dnodes(&dir_list, err); if (need_writecp) { set_sbi_flag(sbi, SBI_IS_RECOVERED); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 30779aaa9dba..03fa2c4d3d79 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -216,7 +216,8 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) } static int __revoke_inmem_pages(struct inode *inode, - struct list_head *head, bool drop, bool recover) + struct list_head *head, bool drop, bool recover, + bool trylock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inmem_pages *cur, *tmp; @@ -228,7 +229,16 @@ static int __revoke_inmem_pages(struct inode *inode, if (drop) trace_f2fs_commit_inmem_page(page, INMEM_DROP); - lock_page(page); + if (trylock) { + /* + * to avoid deadlock in between page lock and + * inmem_lock. + */ + if (!trylock_page(page)) + continue; + } else { + lock_page(page); + } f2fs_wait_on_page_writeback(page, DATA, true); @@ -317,13 +327,19 @@ void f2fs_drop_inmem_pages(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - mutex_lock(&fi->inmem_lock); - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - mutex_unlock(&fi->inmem_lock); + while (!list_empty(&fi->inmem_pages)) { + mutex_lock(&fi->inmem_lock); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, true); + + if (list_empty(&fi->inmem_pages)) { + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + } + mutex_unlock(&fi->inmem_lock); + } clear_inode_flag(inode, FI_ATOMIC_FILE); fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; @@ -427,12 +443,15 @@ retry: * recovery or rewrite & commit last transaction. For other * error number, revoking was done by filesystem itself. */ - err = __revoke_inmem_pages(inode, &revoke_list, false, true); + err = __revoke_inmem_pages(inode, &revoke_list, + false, true, false); /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, false); } else { - __revoke_inmem_pages(inode, &revoke_list, false, false); + __revoke_inmem_pages(inode, &revoke_list, + false, false, false); } return err; @@ -555,7 +574,7 @@ static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino) int ret = 0; int i; - if (!sbi->s_ndevs) + if (!f2fs_is_multi_device(sbi)) return __submit_flush_wait(sbi, sbi->sb->s_bdev); for (i = 0; i < sbi->s_ndevs; i++) { @@ -621,7 +640,8 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) return ret; } - if (atomic_inc_return(&fcc->issing_flush) == 1 || sbi->s_ndevs > 1) { + if (atomic_inc_return(&fcc->issing_flush) == 1 || + f2fs_is_multi_device(sbi)) { ret = submit_flush_wait(sbi, ino); atomic_dec(&fcc->issing_flush); @@ -727,7 +747,7 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) { int ret = 0, i; - if (!sbi->s_ndevs) + if (!f2fs_is_multi_device(sbi)) return 0; for (i = 1; i < sbi->s_ndevs; i++) { @@ -1270,7 +1290,7 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, trace_f2fs_queue_discard(bdev, blkstart, blklen); - if (sbi->s_ndevs) { + if (f2fs_is_multi_device(sbi)) { int devi = f2fs_target_device_index(sbi, blkstart); blkstart -= FDEV(devi).start_blk; @@ -1619,7 +1639,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, block_t lblkstart = blkstart; int devi = 0; - if (sbi->s_ndevs) { + if (f2fs_is_multi_device(sbi)) { devi = f2fs_target_device_index(sbi, blkstart); blkstart -= FDEV(devi).start_blk; } @@ -1725,11 +1745,11 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; - if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) + if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi)) return false; if (!force) { - if (!test_opt(sbi, DISCARD) || !se->valid_blocks || + if (!f2fs_realtime_discard_enable(sbi) || !se->valid_blocks || SM_I(sbi)->dcc_info->nr_discards >= SM_I(sbi)->dcc_info->max_discards) return false; @@ -1835,7 +1855,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, dirty_i->nr_dirty[PRE]--; } - if (!test_opt(sbi, DISCARD)) + if (!f2fs_realtime_discard_enable(sbi)) continue; if (force && start >= cpc->trim_start && @@ -2025,8 +2045,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) del = 0; } - if (f2fs_discard_en(sbi) && - !f2fs_test_and_set_bit(offset, se->discard_map)) + if (!f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; /* don't overwrite by SSR to keep node chain */ @@ -2054,8 +2073,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) del = 0; } - if (f2fs_discard_en(sbi) && - f2fs_test_and_clear_bit(offset, se->discard_map)) + if (f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) @@ -2671,7 +2689,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) * discard option. User configuration looks like using runtime discard * or periodic fstrim instead of it. */ - if (test_opt(sbi, DISCARD)) + if (f2fs_realtime_discard_enable(sbi)) goto out; start_block = START_BLOCK(sbi, start_segno); @@ -2954,7 +2972,7 @@ static void update_device_state(struct f2fs_io_info *fio) struct f2fs_sb_info *sbi = fio->sbi; unsigned int devidx; - if (!sbi->s_ndevs) + if (!f2fs_is_multi_device(sbi)) return; devidx = f2fs_target_device_index(sbi, fio->new_blkaddr); @@ -3762,13 +3780,11 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; #endif - if (f2fs_discard_en(sbi)) { - sit_i->sentries[start].discard_map - = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, - GFP_KERNEL); - if (!sit_i->sentries[start].discard_map) - return -ENOMEM; - } + sit_i->sentries[start].discard_map + = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, + GFP_KERNEL); + if (!sit_i->sentries[start].discard_map) + return -ENOMEM; } sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -3916,18 +3932,16 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) total_node_blocks += se->valid_blocks; /* build discard map only one time */ - if (f2fs_discard_en(sbi)) { - if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { - memset(se->discard_map, 0xff, - SIT_VBLOCK_MAP_SIZE); - } else { - memcpy(se->discard_map, - se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += - sbi->blocks_per_seg - - se->valid_blocks; - } + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + se->valid_blocks; } if (sbi->segs_per_sec > 1) @@ -3965,16 +3979,13 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (IS_NODESEG(se->type)) total_node_blocks += se->valid_blocks; - if (f2fs_discard_en(sbi)) { - if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { - memset(se->discard_map, 0xff, - SIT_VBLOCK_MAP_SIZE); - } else { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks; - sbi->discard_blks -= se->valid_blocks; - } + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks; + sbi->discard_blks -= se->valid_blocks; } if (sbi->segs_per_sec > 1) { diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 36cfd816c160..29042e6d5126 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -138,6 +138,6 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi)); spin_lock(&f2fs_list_lock); - list_del(&sbi->s_list); + list_del_init(&sbi->s_list); spin_unlock(&f2fs_list_lock); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 287c9fe9fff9..2264f27fd26d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -360,7 +360,6 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) static int parse_options(struct super_block *sb, char *options) { struct f2fs_sb_info *sbi = F2FS_SB(sb); - struct request_queue *q; substring_t args[MAX_OPT_ARGS]; char *p, *name; int arg = 0; @@ -415,14 +414,7 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; break; case Opt_discard: - q = bdev_get_queue(sb->s_bdev); - if (blk_queue_discard(q)) { - set_opt(sbi, DISCARD); - } else if (!f2fs_sb_has_blkzoned(sb)) { - f2fs_msg(sb, KERN_WARNING, - "mounting with \"discard\" option, but " - "the device does not support discard"); - } + set_opt(sbi, DISCARD); break; case Opt_nodiscard: if (f2fs_sb_has_blkzoned(sb)) { @@ -822,12 +814,13 @@ static int parse_options(struct super_block *sb, char *options) "set with inline_xattr option"); return -EINVAL; } - if (!F2FS_OPTION(sbi).inline_xattr_size || - F2FS_OPTION(sbi).inline_xattr_size >= - DEF_ADDRS_PER_INODE - - F2FS_TOTAL_EXTRA_ATTR_SIZE - - DEF_INLINE_RESERVED_SIZE - - DEF_MIN_INLINE_SIZE) { + if (F2FS_OPTION(sbi).inline_xattr_size < + sizeof(struct f2fs_xattr_header) / sizeof(__le32) || + F2FS_OPTION(sbi).inline_xattr_size > + DEF_ADDRS_PER_INODE - + F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - + DEF_INLINE_RESERVED_SIZE - + MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) { f2fs_msg(sb, KERN_ERR, "inline xattr size is out of range"); return -EINVAL; @@ -1032,16 +1025,14 @@ static void f2fs_put_super(struct super_block *sb) /* be sure to wait for any on-going discard commands */ dropped = f2fs_wait_discard_bios(sbi); - if (f2fs_discard_en(sbi) && !sbi->discard_blks && !dropped) { + if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) && + !sbi->discard_blks && !dropped) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; f2fs_write_checkpoint(sbi, &cpc); } - /* f2fs_write_checkpoint can update stat informaion */ - f2fs_destroy_stats(sbi); - /* * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. @@ -1061,6 +1052,12 @@ static void f2fs_put_super(struct super_block *sb) iput(sbi->node_inode); iput(sbi->meta_inode); + /* + * iput() can update stat information, if f2fs_write_checkpoint() + * above failed with error. + */ + f2fs_destroy_stats(sbi); + /* destroy f2fs internal modules */ f2fs_destroy_node_manager(sbi); f2fs_destroy_segment_manager(sbi); @@ -1399,8 +1396,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, NOHEAP); sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (blk_queue_discard(bdev_get_queue(sbi->sb->s_bdev))) - set_opt(sbi, DISCARD); + set_opt(sbi, DISCARD); if (f2fs_sb_has_blkzoned(sbi->sb)) set_opt_mode(sbi, F2FS_MOUNT_LFS); else @@ -1889,6 +1885,19 @@ void f2fs_quota_off_umount(struct super_block *sb) } } +static void f2fs_truncate_quota_inode_pages(struct super_block *sb) +{ + struct quota_info *dqopt = sb_dqopt(sb); + int type; + + for (type = 0; type < MAXQUOTAS; type++) { + if (!dqopt->files[type]) + continue; + f2fs_inode_synced(dqopt->files[type]); + } +} + + static int f2fs_get_projid(struct inode *inode, kprojid_t *projid) { *projid = F2FS_I(inode)->i_projid; @@ -2267,10 +2276,10 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } - if (segment_count > (le32_to_cpu(raw_super->block_count) >> 9)) { + if (segment_count > (le64_to_cpu(raw_super->block_count) >> 9)) { f2fs_msg(sb, KERN_INFO, - "Wrong segment_count / block_count (%u > %u)", - segment_count, le32_to_cpu(raw_super->block_count)); + "Wrong segment_count / block_count (%u > %llu)", + segment_count, le64_to_cpu(raw_super->block_count)); return 1; } @@ -2333,7 +2342,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int segment_count_main; unsigned int cp_pack_start_sum, cp_payload; block_t user_block_count; - int i; + int i, j; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -2374,11 +2383,43 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) return 1; + for (j = i + 1; j < NR_CURSEG_NODE_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) == + le32_to_cpu(ckpt->cur_node_segno[j])) { + f2fs_msg(sbi->sb, KERN_ERR, + "Node segment (%u, %u) has the same " + "segno: %u", i, j, + le32_to_cpu(ckpt->cur_node_segno[i])); + return 1; + } + } } for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) return 1; + for (j = i + 1; j < NR_CURSEG_DATA_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) == + le32_to_cpu(ckpt->cur_data_segno[j])) { + f2fs_msg(sbi->sb, KERN_ERR, + "Data segment (%u, %u) has the same " + "segno: %u", i, j, + le32_to_cpu(ckpt->cur_data_segno[i])); + return 1; + } + } + } + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + for (j = i; j < NR_CURSEG_DATA_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) == + le32_to_cpu(ckpt->cur_data_segno[j])) { + f2fs_msg(sbi->sb, KERN_ERR, + "Data segment (%u) and Data segment (%u)" + " has the same segno: %u", i, j, + le32_to_cpu(ckpt->cur_node_segno[i])); + return 1; + } + } } sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); @@ -2980,30 +3021,30 @@ try_onemore: f2fs_build_gc_manager(sbi); + err = f2fs_build_stats(sbi); + if (err) + goto free_nm; + /* get an inode for node space */ sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi)); if (IS_ERR(sbi->node_inode)) { f2fs_msg(sb, KERN_ERR, "Failed to read node inode"); err = PTR_ERR(sbi->node_inode); - goto free_nm; + goto free_stats; } - err = f2fs_build_stats(sbi); - if (err) - goto free_node_inode; - /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); if (IS_ERR(root)) { f2fs_msg(sb, KERN_ERR, "Failed to read root inode"); err = PTR_ERR(root); - goto free_stats; + goto free_node_inode; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size || !root->i_nlink) { iput(root); err = -EINVAL; - goto free_stats; + goto free_node_inode; } sb->s_root = d_make_root(root); /* allocate root dentry */ @@ -3103,10 +3144,10 @@ skip_recovery: free_meta: #ifdef CONFIG_QUOTA + f2fs_truncate_quota_inode_pages(sb); if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) f2fs_quota_off_umount(sbi->sb); #endif - f2fs_sync_inode_meta(sbi); /* * Some dirty meta pages can be produced by f2fs_recover_orphan_inodes() * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() @@ -3121,12 +3162,12 @@ free_sysfs: free_root_inode: dput(sb->s_root); sb->s_root = NULL; -free_stats: - f2fs_destroy_stats(sbi); free_node_inode: f2fs_release_ino_entry(sbi, true); truncate_inode_pages_final(NODE_MAPPING(sbi)); iput(sbi->node_inode); +free_stats: + f2fs_destroy_stats(sbi); free_nm: f2fs_destroy_node_manager(sbi); free_sm: diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 81c0e5337443..98887187af4c 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -273,10 +273,16 @@ out: return count; } - *ui = t; - if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) - f2fs_reset_iostat(sbi); + if (!strcmp(a->attr.name, "iostat_enable")) { + sbi->iostat_enable = !!t; + if (!sbi->iostat_enable) + f2fs_reset_iostat(sbi); + return count; + } + + *ui = (unsigned int)t; + return count; } diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index a1fcd00bbb2b..8ac1851a21c0 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -17,7 +17,7 @@ #include "trace.h" static RADIX_TREE(pids, GFP_ATOMIC); -static struct mutex pids_lock; +static spinlock_t pids_lock; static struct last_io_info last_io; static inline void __print_last_io(void) @@ -61,23 +61,29 @@ void f2fs_trace_pid(struct page *page) set_page_private(page, (unsigned long)pid); +retry: if (radix_tree_preload(GFP_NOFS)) return; - mutex_lock(&pids_lock); + spin_lock(&pids_lock); p = radix_tree_lookup(&pids, pid); if (p == current) goto out; if (p) radix_tree_delete(&pids, pid); - f2fs_radix_tree_insert(&pids, pid, current); + if (radix_tree_insert(&pids, pid, current)) { + spin_unlock(&pids_lock); + radix_tree_preload_end(); + cond_resched(); + goto retry; + } trace_printk("%3x:%3x %4x %-16s\n", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), pid, current->comm); out: - mutex_unlock(&pids_lock); + spin_unlock(&pids_lock); radix_tree_preload_end(); } @@ -122,7 +128,7 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) void f2fs_build_trace_ios(void) { - mutex_init(&pids_lock); + spin_lock_init(&pids_lock); } #define PIDVEC_SIZE 128 @@ -150,7 +156,7 @@ void f2fs_destroy_trace_ios(void) pid_t next_pid = 0; unsigned int found; - mutex_lock(&pids_lock); + spin_lock(&pids_lock); while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { unsigned idx; @@ -158,5 +164,5 @@ void f2fs_destroy_trace_ios(void) for (idx = 0; idx < found; idx++) radix_tree_delete(&pids, pid[idx]); } - mutex_unlock(&pids_lock); + spin_unlock(&pids_lock); } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 77a010e625f5..409a637f7a92 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -227,11 +227,11 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, { struct f2fs_xattr_entry *entry; unsigned int inline_size = inline_xattr_size(inode); + void *max_addr = base_addr + inline_size; list_for_each_xattr(entry, base_addr) { - if ((void *)entry + sizeof(__u32) > base_addr + inline_size || - (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) > - base_addr + inline_size) { + if ((void *)entry + sizeof(__u32) > max_addr || + (void *)XATTR_NEXT_ENTRY(entry) > max_addr) { *last_addr = entry; return NULL; } @@ -242,6 +242,13 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, if (!memcmp(entry->e_name, name, len)) break; } + + /* inline xattr header or entry across max inline xattr size */ + if (IS_XATTR_LAST_ENTRY(entry) && + (void *)entry + sizeof(__u32) > max_addr) { + *last_addr = entry; + return NULL; + } return entry; } @@ -291,7 +298,7 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr) static int lookup_all_xattrs(struct inode *inode, struct page *ipage, unsigned int index, unsigned int len, const char *name, struct f2fs_xattr_entry **xe, - void **base_addr) + void **base_addr, int *base_size) { void *cur_addr, *txattr_addr, *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; @@ -302,8 +309,8 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, if (!size && !inline_size) return -ENODATA; - txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), - inline_size + size + XATTR_PADDING_SIZE, GFP_NOFS); + *base_size = inline_size + size + XATTR_PADDING_SIZE; + txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), *base_size, GFP_NOFS); if (!txattr_addr) return -ENOMEM; @@ -315,8 +322,10 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, *xe = __find_inline_xattr(inode, txattr_addr, &last_addr, index, len, name); - if (*xe) + if (*xe) { + *base_size = inline_size; goto check; + } } /* read from xattr node block */ @@ -477,6 +486,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, int error = 0; unsigned int size, len; void *base_addr = NULL; + int base_size; if (name == NULL) return -EINVAL; @@ -487,7 +497,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, down_read(&F2FS_I(inode)->i_xattr_sem); error = lookup_all_xattrs(inode, ipage, index, len, name, - &entry, &base_addr); + &entry, &base_addr, &base_size); up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -501,6 +511,11 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (buffer) { char *pval = entry->e_name + entry->e_name_len; + + if (base_size - (pval - (char *)base_addr) < size) { + error = -ERANGE; + goto out; + } memcpy(buffer, pval, size); } error = size; diff --git a/fs/file.c b/fs/file.c index 7ffd6e9d103d..780d29e58847 100644 --- a/fs/file.c +++ b/fs/file.c @@ -457,6 +457,7 @@ struct files_struct init_files = { .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), + .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), }; static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 471d863958bc..9544e2f8b79f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -331,11 +331,22 @@ struct inode_switch_wbs_context { struct work_struct work; }; +static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) +{ + down_write(&bdi->wb_switch_rwsem); +} + +static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) +{ + up_write(&bdi->wb_switch_rwsem); +} + static void inode_switch_wbs_work_fn(struct work_struct *work) { struct inode_switch_wbs_context *isw = container_of(work, struct inode_switch_wbs_context, work); struct inode *inode = isw->inode; + struct backing_dev_info *bdi = inode_to_bdi(inode); struct address_space *mapping = inode->i_mapping; struct bdi_writeback *old_wb = inode->i_wb; struct bdi_writeback *new_wb = isw->new_wb; @@ -344,6 +355,12 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) void **slot; /* + * If @inode switches cgwb membership while sync_inodes_sb() is + * being issued, sync_inodes_sb() might miss it. Synchronize. + */ + down_read(&bdi->wb_switch_rwsem); + + /* * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions * between unlocked_inode_to_wb_begin/end() are guaranteed to be @@ -435,6 +452,8 @@ skip_switch: spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); + up_read(&bdi->wb_switch_rwsem); + if (switched) { wb_wakeup(new_wb); wb_put(old_wb); @@ -475,9 +494,18 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (inode->i_state & I_WB_SWITCH) return; + /* + * Avoid starting new switches while sync_inodes_sb() is in + * progress. Otherwise, if the down_write protected issue path + * blocks heavily, we might end up starting a large number of + * switches which will block on the rwsem. + */ + if (!down_read_trylock(&bdi->wb_switch_rwsem)) + return; + isw = kzalloc(sizeof(*isw), GFP_ATOMIC); if (!isw) - return; + goto out_unlock; /* find and pin the new wb */ rcu_read_lock(); @@ -502,8 +530,6 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) isw->inode = inode; - atomic_inc(&isw_nr_in_flight); - /* * In addition to synchronizing among switchers, I_WB_SWITCH tells * the RCU protected stat update paths to grab the i_page @@ -511,12 +537,17 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) * Let's continue after I_WB_SWITCH is guaranteed to be visible. */ call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); - return; + + atomic_inc(&isw_nr_in_flight); + + goto out_unlock; out_free: if (isw->new_wb) wb_put(isw->new_wb); kfree(isw); +out_unlock: + up_read(&bdi->wb_switch_rwsem); } /** @@ -878,7 +909,11 @@ restart: void cgroup_writeback_umount(void) { if (atomic_read(&isw_nr_in_flight)) { - synchronize_rcu(); + /* + * Use rcu_barrier() to wait for all pending callbacks to + * ensure that all in-flight wb switches are in the workqueue. + */ + rcu_barrier(); flush_workqueue(isw_wq); } } @@ -894,6 +929,9 @@ fs_initcall(cgroup_writeback_init); #else /* CONFIG_CGROUP_WRITEBACK */ +static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } +static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } + static struct bdi_writeback * locked_inode_to_wb_and_lock_list(struct inode *inode) __releases(&inode->i_lock) @@ -2420,8 +2458,11 @@ void sync_inodes_sb(struct super_block *sb) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); + /* protect against inode wb switch, see inode_switch_wbs_work_fn() */ + bdi_down_write_wb_switch_rwsem(bdi); bdi_split_work_to_wbs(bdi, &work, false); wb_wait_for_completion(bdi, &done); + bdi_up_write_wb_switch_rwsem(bdi); wait_sb_inodes(sb); } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index bf0da0382c9e..249de20f752a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1698,7 +1698,6 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, req->in.h.nodeid = outarg->nodeid; req->in.numargs = 2; req->in.argpages = 1; - req->page_descs[0].offset = offset; req->end = fuse_retrieve_end; index = outarg->offset >> PAGE_SHIFT; @@ -1713,6 +1712,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, this_num = min_t(unsigned, num, PAGE_SIZE - offset); req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].offset = offset; req->page_descs[req->num_pages].length = this_num; req->num_pages++; @@ -1989,10 +1989,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len; ret = -EINVAL; - if (rem < len) { - pipe_unlock(pipe); - goto out; - } + if (rem < len) + goto out_free; rem = len; while (rem) { @@ -2010,7 +2008,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; } else { - pipe_buf_get(pipe, ibuf); + if (!pipe_buf_get(pipe, ibuf)) + goto out_free; + *obuf = *ibuf; obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->len = rem; @@ -2032,10 +2032,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, ret = fuse_dev_do_write(fud, &cs, len); + pipe_lock(pipe); +out_free: for (idx = 0; idx < nbuf; idx++) pipe_buf_release(pipe, &bufs[idx]); + pipe_unlock(pipe); -out: kvfree(bufs); return ret; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 304900009320..58324a93e3c0 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1439,7 +1439,7 @@ static int fuse_dir_open(struct inode *inode, struct file *file) static int fuse_dir_release(struct inode *inode, struct file *file) { - fuse_release_common(file, FUSE_RELEASEDIR); + fuse_release_common(file, true); return 0; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a0ffed34b85d..9a22aa580fe7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -87,12 +87,12 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) iput(req->misc.release.inode); } -static void fuse_file_put(struct fuse_file *ff, bool sync) +static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) { if (refcount_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; - if (ff->fc->no_open) { + if (ff->fc->no_open && !isdir) { /* * Drop the release request when client does not * implement 'open' @@ -179,7 +179,9 @@ void fuse_finish_open(struct inode *inode, struct file *file) file->f_op = &fuse_direct_io_file_operations; if (!(ff->open_flags & FOPEN_KEEP_CACHE)) invalidate_inode_pages2(inode->i_mapping); - if (ff->open_flags & FOPEN_NONSEEKABLE) + if (ff->open_flags & FOPEN_STREAM) + stream_open(inode, file); + else if (ff->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -245,10 +247,11 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) req->in.args[0].value = inarg; } -void fuse_release_common(struct file *file, int opcode) +void fuse_release_common(struct file *file, bool isdir) { struct fuse_file *ff = file->private_data; struct fuse_req *req = ff->reserved_req; + int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; fuse_prepare_release(ff, file->f_flags, opcode); @@ -270,7 +273,7 @@ void fuse_release_common(struct file *file, int opcode) * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. */ - fuse_file_put(ff, ff->fc->destroy_req != NULL); + fuse_file_put(ff, ff->fc->destroy_req != NULL, isdir); } static int fuse_open(struct inode *inode, struct file *file) @@ -286,7 +289,7 @@ static int fuse_release(struct inode *inode, struct file *file) if (fc->writeback_cache) write_inode_now(inode, 1); - fuse_release_common(file, FUSE_RELEASE); + fuse_release_common(file, false); /* return value is ignored by VFS */ return 0; @@ -300,7 +303,7 @@ void fuse_sync_release(struct fuse_file *ff, int flags) * iput(NULL) is a no-op and since the refcount is 1 and everything's * synchronous, we are fine with not doing igrab() here" */ - fuse_file_put(ff, true); + fuse_file_put(ff, true, false); } EXPORT_SYMBOL_GPL(fuse_sync_release); @@ -805,7 +808,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) put_page(page); } if (req->ff) - fuse_file_put(req->ff, false); + fuse_file_put(req->ff, false, false); } static void fuse_send_readpages(struct fuse_req *req, struct file *file) @@ -1459,7 +1462,7 @@ static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) __free_page(req->pages[i]); if (req->ff) - fuse_file_put(req->ff, false); + fuse_file_put(req->ff, false, false); } static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) @@ -1525,7 +1528,7 @@ __acquires(fc->lock) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - size_t crop = i_size_read(inode); + loff_t crop = i_size_read(inode); struct fuse_req *req; while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { @@ -1616,7 +1619,7 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) ff = __fuse_write_file_get(fc, fi); err = fuse_flush_times(inode, ff); if (ff) - fuse_file_put(ff, 0); + fuse_file_put(ff, false, false); return err; } @@ -1777,7 +1780,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, spin_unlock(&fc->lock); dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(page, NR_WRITEBACK_TEMP); + dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); fuse_writepage_free(fc, new_req); fuse_request_free(new_req); @@ -1930,7 +1933,7 @@ static int fuse_writepages(struct address_space *mapping, err = 0; } if (data.ff) - fuse_file_put(data.ff, false); + fuse_file_put(data.ff, false, false); kfree(data.orig_pages); out: @@ -2974,6 +2977,13 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, } } + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + length > i_size_read(inode)) { + err = inode_newsize_ok(inode, offset + length); + if (err) + goto out; + } + if (!(mode & FALLOC_FL_KEEP_SIZE)) set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index f78e9614bb5f..cec8b8e74969 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -749,7 +749,7 @@ void fuse_sync_release(struct fuse_file *ff, int flags); /** * Send RELEASE or RELEASEDIR request */ -void fuse_release_common(struct file *file, int opcode); +void fuse_release_common(struct file *file, bool isdir); /** * Send FSYNC or FSYNCDIR request diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 4614ee25f621..ccdd8c821abd 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -107,7 +107,7 @@ static int glock_wake_function(wait_queue_entry_t *wait, unsigned int mode, static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name) { - u32 hash = jhash2((u32 *)name, sizeof(*name) / 4, 0); + u32 hash = jhash2((u32 *)name, ht_parms.key_len / 4, 0); return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS); } @@ -140,6 +140,7 @@ void gfs2_glock_free(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + BUG_ON(atomic_read(&gl->gl_revokes)); rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); smp_mb(); wake_up_glock(gl); @@ -183,15 +184,19 @@ static int demote_ok(const struct gfs2_glock *gl) void gfs2_glock_add_to_lru(struct gfs2_glock *gl) { + if (!(gl->gl_ops->go_flags & GLOF_LRU)) + return; + spin_lock(&lru_lock); - if (!list_empty(&gl->gl_lru)) - list_del_init(&gl->gl_lru); - else + list_del(&gl->gl_lru); + list_add_tail(&gl->gl_lru, &lru_list); + + if (!test_bit(GLF_LRU, &gl->gl_flags)) { + set_bit(GLF_LRU, &gl->gl_flags); atomic_inc(&lru_count); + } - list_add_tail(&gl->gl_lru, &lru_list); - set_bit(GLF_LRU, &gl->gl_flags); spin_unlock(&lru_lock); } @@ -201,7 +206,7 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) return; spin_lock(&lru_lock); - if (!list_empty(&gl->gl_lru)) { + if (test_bit(GLF_LRU, &gl->gl_flags)) { list_del_init(&gl->gl_lru); atomic_dec(&lru_count); clear_bit(GLF_LRU, &gl->gl_flags); @@ -1158,8 +1163,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh) !test_bit(GLF_DEMOTE, &gl->gl_flags)) fast_path = 1; } - if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl) && - (glops->go_flags & GLOF_LRU)) + if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl)) gfs2_glock_add_to_lru(gl); trace_gfs2_glock_queue(gh, 0); @@ -1455,6 +1459,7 @@ __acquires(&lru_lock) if (!spin_trylock(&gl->gl_lockref.lock)) { add_back_to_lru: list_add(&gl->gl_lru, &lru_list); + set_bit(GLF_LRU, &gl->gl_flags); atomic_inc(&lru_count); continue; } @@ -1462,7 +1467,6 @@ add_back_to_lru: spin_unlock(&gl->gl_lockref.lock); goto add_back_to_lru; } - clear_bit(GLF_LRU, &gl->gl_flags); gl->gl_lockref.count++; if (demote_ok(gl)) handle_callback(gl, LM_ST_UNLOCKED, 0, false); @@ -1497,6 +1501,7 @@ static long gfs2_scan_glock_lru(int nr) if (!test_bit(GLF_LOCK, &gl->gl_flags)) { list_move(&gl->gl_lru, &dispose); atomic_dec(&lru_count); + clear_bit(GLF_LRU, &gl->gl_flags); freed++; continue; } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 648f0ca1ad57..998051c4aea7 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -744,17 +744,19 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, the gfs2 structures. */ if (default_acl) { error = __gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + if (error) + goto fail_gunlock3; posix_acl_release(default_acl); + default_acl = NULL; } if (acl) { - if (!error) - error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS); + if (error) + goto fail_gunlock3; posix_acl_release(acl); + acl = NULL; } - if (error) - goto fail_gunlock3; - error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name, &gfs2_initxattrs, NULL); if (error) @@ -789,10 +791,8 @@ fail_free_inode: } gfs2_rsqa_delete(ip, NULL); fail_free_acls: - if (default_acl) - posix_acl_release(default_acl); - if (acl) - posix_acl_release(acl); + posix_acl_release(default_acl); + posix_acl_release(acl); fail_gunlock: gfs2_dir_no_add(&da); gfs2_glock_dq_uninit(ghs); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index ac7caa267ed6..62edf8f5615f 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -31,9 +31,10 @@ * @delta is the difference between the current rtt sample and the * running average srtt. We add 1/8 of that to the srtt in order to * update the current srtt estimate. The variance estimate is a bit - * more complicated. We subtract the abs value of the @delta from - * the current variance estimate and add 1/4 of that to the running - * total. + * more complicated. We subtract the current variance estimate from + * the abs value of the @delta and add 1/4 of that to the running + * total. That's equivalent to 3/4 of the current variance + * estimate plus 1/4 of the abs of @delta. * * Note that the index points at the array entry containing the smoothed * mean value, and the variance is always in the following entry @@ -49,7 +50,7 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index, s64 delta = sample - s->stats[index]; s->stats[index] += (delta >> 3); index++; - s->stats[index] += ((abs(delta) - s->stats[index]) >> 2); + s->stats[index] += (s64)(abs(delta) - s->stats[index]) >> 2; } /** diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index ee20ea42e7b5..cd85092723de 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -604,7 +604,8 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) bd->bd_bh = NULL; bd->bd_ops = &gfs2_revoke_lops; sdp->sd_log_num_revoke++; - atomic_inc(&gl->gl_revokes); + if (atomic_inc_return(&gl->gl_revokes) == 1) + gfs2_glock_hold(gl); set_bit(GLF_LFLUSH, &gl->gl_flags); list_add(&bd->bd_list, &sdp->sd_log_le_revoke); } diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index f2567f958d00..8f99b395d7bf 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -662,8 +662,10 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) bd = list_entry(head->next, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); gl = bd->bd_gl; - atomic_dec(&gl->gl_revokes); - clear_bit(GLF_LFLUSH, &gl->gl_flags); + if (atomic_dec_return(&gl->gl_revokes) == 0) { + clear_bit(GLF_LFLUSH, &gl->gl_flags); + gfs2_glock_queue_put(gl); + } kmem_cache_free(gfs2_bufdata_cachep, bd); } } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 32920a10100e..7a24f91af29e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -426,9 +426,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, u32 hash; index = page->index; - hash = hugetlb_fault_mutex_hash(h, current->mm, - &pseudo_vma, - mapping, index, 0); + hash = hugetlb_fault_mutex_hash(h, mapping, index, 0); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* @@ -625,8 +623,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, addr = index * hpage_size; /* mutex taken here, fault path and hole punch */ - hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, - index, addr); + hash = hugetlb_fault_mutex_hash(h, mapping, index, addr); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ @@ -741,11 +738,17 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev) { struct inode *inode; - struct resv_map *resv_map; + struct resv_map *resv_map = NULL; - resv_map = resv_map_alloc(); - if (!resv_map) - return NULL; + /* + * Reserve maps are only needed for inodes that can have associated + * page allocations. + */ + if (S_ISREG(mode) || S_ISLNK(mode)) { + resv_map = resv_map_alloc(); + if (!resv_map) + return NULL; + } inode = new_inode(sb); if (inode) { @@ -780,8 +783,10 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, break; } lockdep_annotate_inode_mutex_key(inode); - } else - kref_put(&resv_map->refs, resv_map_release); + } else { + if (resv_map) + kref_put(&resv_map->refs, resv_map_release); + } return inode; } @@ -859,6 +864,18 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, rc = migrate_huge_page_move_mapping(mapping, newpage, page); if (rc != MIGRATEPAGE_SUCCESS) return rc; + + /* + * page_private is subpool pointer in hugetlb pages. Transfer to + * new page. PagePrivate is not associated with page_private for + * hugetlb pages and can not be set here as only page_huge_active + * pages can be migrated. + */ + if (page_private(page)) { + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + } + if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); else diff --git a/fs/inode.c b/fs/inode.c index 3f778e44ad5e..97f11df6ca6a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -730,11 +730,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item, return LRU_REMOVED; } - /* - * Recently referenced inodes and inodes with many attached pages - * get one more pass. - */ - if (inode->i_state & I_REFERENCED || inode->i_data.nrpages > 1) { + /* recently referenced inodes get one more pass */ + if (inode->i_state & I_REFERENCED) { inode->i_state &= ~I_REFERENCED; spin_unlock(&inode->i_lock); return LRU_ROTATE; diff --git a/fs/internal.h b/fs/internal.h index d410186bc369..d109665b9e50 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -80,9 +80,7 @@ extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); -extern int __mnt_want_write(struct vfsmount *); extern int __mnt_want_write_file(struct file *); -extern void __mnt_drop_write(struct vfsmount *); extern void __mnt_drop_write_file(struct file *); /* diff --git a/fs/iomap.c b/fs/iomap.c index ec15cf2ec696..fac45206418a 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -117,6 +117,12 @@ iomap_page_create(struct inode *inode, struct page *page) atomic_set(&iop->read_count, 0); atomic_set(&iop->write_count, 0); bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE); + + /* + * migrate_page_move_mapping() assumes that pages with private data have + * their count elevated by 1. + */ + get_page(page); set_page_private(page, (unsigned long)iop); SetPagePrivate(page); return iop; @@ -133,6 +139,7 @@ iomap_page_release(struct page *page) WARN_ON_ONCE(atomic_read(&iop->write_count)); ClearPagePrivate(page); set_page_private(page, 0); + put_page(page); kfree(iop); } @@ -488,16 +495,29 @@ done: } EXPORT_SYMBOL_GPL(iomap_readpages); +/* + * iomap_is_partially_uptodate checks whether blocks within a page are + * uptodate or not. + * + * Returns true if all blocks which correspond to a file portion + * we want to read within the page are uptodate. + */ int iomap_is_partially_uptodate(struct page *page, unsigned long from, unsigned long count) { struct iomap_page *iop = to_iomap_page(page); struct inode *inode = page->mapping->host; - unsigned first = from >> inode->i_blkbits; - unsigned last = (from + count - 1) >> inode->i_blkbits; + unsigned len, first, last; unsigned i; + /* Limit range to one page */ + len = min_t(unsigned, PAGE_SIZE - from, count); + + /* First and last blocks in range within page */ + first = from >> inode->i_blkbits; + last = (from + len - 1) >> inode->i_blkbits; + if (iop) { for (i = first; i <= last; i++) if (!test_bit(i, iop->uptodate)) @@ -552,8 +572,10 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage, if (page_has_private(page)) { ClearPagePrivate(page); + get_page(newpage); set_page_private(newpage, page_private(page)); set_page_private(page, 0); + put_page(page); SetPagePrivate(newpage); } @@ -1765,6 +1787,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, loff_t pos = iocb->ki_pos, start = pos; loff_t end = iocb->ki_pos + count - 1, ret = 0; unsigned int flags = IOMAP_DIRECT; + bool wait_for_completion = is_sync_kiocb(iocb); struct blk_plug plug; struct iomap_dio *dio; @@ -1784,7 +1807,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->end_io = end_io; dio->error = 0; dio->flags = 0; - dio->wait_for_completion = is_sync_kiocb(iocb); dio->submit.iter = iter; dio->submit.waiter = current; @@ -1839,7 +1861,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio_warn_stale_pagecache(iocb->ki_filp); ret = 0; - if (iov_iter_rw(iter) == WRITE && !dio->wait_for_completion && + if (iov_iter_rw(iter) == WRITE && !wait_for_completion && !inode->i_sb->s_dio_done_wq) { ret = sb_init_dio_done_wq(inode->i_sb); if (ret < 0) @@ -1855,7 +1877,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret <= 0) { /* magic error code to fall back to buffered I/O */ if (ret == -ENOTBLK) { - dio->wait_for_completion = true; + wait_for_completion = true; ret = 0; } break; @@ -1877,8 +1899,24 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (dio->flags & IOMAP_DIO_WRITE_FUA) dio->flags &= ~IOMAP_DIO_NEED_SYNC; + /* + * We are about to drop our additional submission reference, which + * might be the last reference to the dio. There are three three + * different ways we can progress here: + * + * (a) If this is the last reference we will always complete and free + * the dio ourselves. + * (b) If this is not the last reference, and we serve an asynchronous + * iocb, we must never touch the dio after the decrement, the + * I/O completion handler will complete and free it. + * (c) If this is not the last reference, but we serve a synchronous + * iocb, the I/O completion handler will wake us up on the drop + * of the final reference, and we will complete and free it here + * after we got woken by the I/O completion handler. + */ + dio->wait_for_completion = wait_for_completion; if (!atomic_dec_and_test(&dio->ref)) { - if (!dio->wait_for_completion) + if (!wait_for_completion) return -EIOCBQUEUED; for (;;) { @@ -1895,9 +1933,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, __set_current_state(TASK_RUNNING); } - ret = iomap_dio_complete(dio); - - return ret; + return iomap_dio_complete(dio); out_free_dio: kfree(dio); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 150cc030b4d7..65ea0355a4f6 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -691,9 +691,11 @@ void jbd2_journal_commit_transaction(journal_t *journal) the last tag we set up. */ tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); - - jbd2_descriptor_block_csum_set(journal, descriptor); start_journal_io: + if (descriptor) + jbd2_descriptor_block_csum_set(journal, + descriptor); + for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; /* diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 8ef6b6daaa7a..e9cf88f0bc29 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1356,16 +1356,23 @@ static int journal_reset(journal_t *journal) return jbd2_journal_start_thread(journal); } +/* + * This function expects that the caller will have locked the journal + * buffer head, and will return with it unlocked + */ static int jbd2_write_superblock(journal_t *journal, int write_flags) { struct buffer_head *bh = journal->j_sb_buffer; journal_superblock_t *sb = journal->j_superblock; int ret; + /* Buffer got discarded which means block device got invalidated */ + if (!buffer_mapped(bh)) + return -EIO; + trace_jbd2_write_superblock(journal, write_flags); if (!(journal->j_flags & JBD2_BARRIER)) write_flags &= ~(REQ_FUA | REQ_PREFLUSH); - lock_buffer(bh); if (buffer_write_io_error(bh)) { /* * Oh, dear. A previous attempt to write the journal @@ -1424,6 +1431,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", tail_block, tail_tid); + lock_buffer(journal->j_sb_buffer); sb->s_sequence = cpu_to_be32(tail_tid); sb->s_start = cpu_to_be32(tail_block); @@ -1454,18 +1462,17 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op) journal_superblock_t *sb = journal->j_superblock; BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); - read_lock(&journal->j_state_lock); - /* Is it already empty? */ - if (sb->s_start == 0) { - read_unlock(&journal->j_state_lock); + lock_buffer(journal->j_sb_buffer); + if (sb->s_start == 0) { /* Is it already empty? */ + unlock_buffer(journal->j_sb_buffer); return; } + jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n", journal->j_tail_sequence); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); sb->s_start = cpu_to_be32(0); - read_unlock(&journal->j_state_lock); jbd2_write_superblock(journal, write_op); @@ -1488,9 +1495,8 @@ void jbd2_journal_update_sb_errno(journal_t *journal) journal_superblock_t *sb = journal->j_superblock; int errcode; - read_lock(&journal->j_state_lock); + lock_buffer(journal->j_sb_buffer); errcode = journal->j_errno; - read_unlock(&journal->j_state_lock); if (errcode == -ESHUTDOWN) errcode = 0; jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode); @@ -1894,28 +1900,27 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, sb = journal->j_superblock; + /* Load the checksum driver if necessary */ + if ((journal->j_chksum_driver == NULL) && + INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { + journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(journal->j_chksum_driver)) { + printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); + journal->j_chksum_driver = NULL; + return 0; + } + /* Precompute checksum seed for all metadata */ + journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + sizeof(sb->s_uuid)); + } + + lock_buffer(journal->j_sb_buffer); + /* If enabling v3 checksums, update superblock */ if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { sb->s_checksum_type = JBD2_CRC32C_CHKSUM; sb->s_feature_compat &= ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); - - /* Load the checksum driver */ - if (journal->j_chksum_driver == NULL) { - journal->j_chksum_driver = crypto_alloc_shash("crc32c", - 0, 0); - if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD2: Cannot load crc32c " - "driver.\n"); - journal->j_chksum_driver = NULL; - return 0; - } - - /* Precompute checksum seed for all metadata */ - journal->j_csum_seed = jbd2_chksum(journal, ~0, - sb->s_uuid, - sizeof(sb->s_uuid)); - } } /* If enabling v1 checksums, downgrade superblock */ @@ -1927,6 +1932,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, sb->s_feature_compat |= cpu_to_be32(compat); sb->s_feature_ro_compat |= cpu_to_be32(ro); sb->s_feature_incompat |= cpu_to_be32(incompat); + unlock_buffer(journal->j_sb_buffer); return 1; #undef COMPAT_FEATURE_ON @@ -2383,22 +2389,19 @@ static struct kmem_cache *jbd2_journal_head_cache; static atomic_t nr_journal_heads = ATOMIC_INIT(0); #endif -static int jbd2_journal_init_journal_head_cache(void) +static int __init jbd2_journal_init_journal_head_cache(void) { - int retval; - - J_ASSERT(jbd2_journal_head_cache == NULL); + J_ASSERT(!jbd2_journal_head_cache); jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", sizeof(struct journal_head), 0, /* offset */ SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU, NULL); /* ctor */ - retval = 0; if (!jbd2_journal_head_cache) { - retval = -ENOMEM; printk(KERN_EMERG "JBD2: no memory for journal_head cache\n"); + return -ENOMEM; } - return retval; + return 0; } static void jbd2_journal_destroy_journal_head_cache(void) @@ -2644,28 +2647,38 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void) struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; +static int __init jbd2_journal_init_inode_cache(void) +{ + J_ASSERT(!jbd2_inode_cache); + jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0); + if (!jbd2_inode_cache) { + pr_emerg("JBD2: failed to create inode cache\n"); + return -ENOMEM; + } + return 0; +} + static int __init jbd2_journal_init_handle_cache(void) { + J_ASSERT(!jbd2_handle_cache); jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); - if (jbd2_handle_cache == NULL) { + if (!jbd2_handle_cache) { printk(KERN_EMERG "JBD2: failed to create handle cache\n"); return -ENOMEM; } - jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0); - if (jbd2_inode_cache == NULL) { - printk(KERN_EMERG "JBD2: failed to create inode cache\n"); - kmem_cache_destroy(jbd2_handle_cache); - return -ENOMEM; - } return 0; } +static void jbd2_journal_destroy_inode_cache(void) +{ + kmem_cache_destroy(jbd2_inode_cache); + jbd2_inode_cache = NULL; +} + static void jbd2_journal_destroy_handle_cache(void) { kmem_cache_destroy(jbd2_handle_cache); jbd2_handle_cache = NULL; - kmem_cache_destroy(jbd2_inode_cache); - jbd2_inode_cache = NULL; } /* @@ -2676,21 +2689,27 @@ static int __init journal_init_caches(void) { int ret; - ret = jbd2_journal_init_revoke_caches(); + ret = jbd2_journal_init_revoke_record_cache(); + if (ret == 0) + ret = jbd2_journal_init_revoke_table_cache(); if (ret == 0) ret = jbd2_journal_init_journal_head_cache(); if (ret == 0) ret = jbd2_journal_init_handle_cache(); if (ret == 0) + ret = jbd2_journal_init_inode_cache(); + if (ret == 0) ret = jbd2_journal_init_transaction_cache(); return ret; } static void jbd2_journal_destroy_caches(void) { - jbd2_journal_destroy_revoke_caches(); + jbd2_journal_destroy_revoke_record_cache(); + jbd2_journal_destroy_revoke_table_cache(); jbd2_journal_destroy_journal_head_cache(); jbd2_journal_destroy_handle_cache(); + jbd2_journal_destroy_inode_cache(); jbd2_journal_destroy_transaction_cache(); jbd2_journal_destroy_slabs(); } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index a1143e57a718..69b9bc329964 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -178,33 +178,41 @@ static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, return NULL; } -void jbd2_journal_destroy_revoke_caches(void) +void jbd2_journal_destroy_revoke_record_cache(void) { kmem_cache_destroy(jbd2_revoke_record_cache); jbd2_revoke_record_cache = NULL; +} + +void jbd2_journal_destroy_revoke_table_cache(void) +{ kmem_cache_destroy(jbd2_revoke_table_cache); jbd2_revoke_table_cache = NULL; } -int __init jbd2_journal_init_revoke_caches(void) +int __init jbd2_journal_init_revoke_record_cache(void) { J_ASSERT(!jbd2_revoke_record_cache); - J_ASSERT(!jbd2_revoke_table_cache); - jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s, SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY); - if (!jbd2_revoke_record_cache) - goto record_cache_failure; + if (!jbd2_revoke_record_cache) { + pr_emerg("JBD2: failed to create revoke_record cache\n"); + return -ENOMEM; + } + return 0; +} + +int __init jbd2_journal_init_revoke_table_cache(void) +{ + J_ASSERT(!jbd2_revoke_table_cache); jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s, SLAB_TEMPORARY); - if (!jbd2_revoke_table_cache) - goto table_cache_failure; - return 0; -table_cache_failure: - jbd2_journal_destroy_revoke_caches(); -record_cache_failure: + if (!jbd2_revoke_table_cache) { + pr_emerg("JBD2: failed to create revoke_table cache\n"); return -ENOMEM; + } + return 0; } static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index c0b66a7a795b..e20a6703531f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -42,9 +42,11 @@ int __init jbd2_journal_init_transaction_cache(void) 0, SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, NULL); - if (transaction_cache) - return 0; - return -ENOMEM; + if (!transaction_cache) { + pr_emerg("JBD2: failed to create transaction cache\n"); + return -ENOMEM; + } + return 0; } void jbd2_journal_destroy_transaction_cache(void) @@ -1219,11 +1221,12 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) struct journal_head *jh; char *committed_data = NULL; - JBUFFER_TRACE(jh, "entry"); if (jbd2_write_access_granted(handle, bh, true)) return 0; jh = jbd2_journal_add_journal_head(bh); + JBUFFER_TRACE(jh, "entry"); + /* * Do this first --- it can drop the journal lock, so we want to * make sure that obtaining the committed_data is done @@ -1334,15 +1337,17 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) if (is_handle_aborted(handle)) return -EROFS; - if (!buffer_jbd(bh)) { - ret = -EUCLEAN; - goto out; - } + if (!buffer_jbd(bh)) + return -EUCLEAN; + /* * We don't grab jh reference here since the buffer must be part * of the running transaction. */ jh = bh2jh(bh); + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); + /* * This and the following assertions are unreliable since we may see jh * in inconsistent state unless we grab bh_state lock. But this is @@ -1376,9 +1381,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) } journal = transaction->t_journal; - jbd_debug(5, "journal_head %p\n", jh); - JBUFFER_TRACE(jh, "entry"); - jbd_lock_bh_state(bh); if (jh->b_modified == 0) { @@ -1576,14 +1578,21 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) /* However, if the buffer is still owned by a prior * (committing) transaction, we can't drop it yet... */ JBUFFER_TRACE(jh, "belongs to older transaction"); - /* ... but we CAN drop it from the new transaction if we - * have also modified it since the original commit. */ + /* ... but we CAN drop it from the new transaction through + * marking the buffer as freed and set j_next_transaction to + * the new transaction, so that not only the commit code + * knows it should clear dirty bits when it is done with the + * buffer, but also the buffer can be checkpointed only + * after the new transaction commits. */ - if (jh->b_next_transaction) { - J_ASSERT(jh->b_next_transaction == transaction); + set_buffer_freed(bh); + + if (!jh->b_next_transaction) { spin_lock(&journal->j_list_lock); - jh->b_next_transaction = NULL; + jh->b_next_transaction = transaction; spin_unlock(&journal->j_list_lock); + } else { + J_ASSERT(jh->b_next_transaction == transaction); /* * only drop a reference if this transaction modified diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 389ea53ea487..bccfc40b3a74 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -1414,11 +1414,6 @@ void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f) jffs2_kill_fragtree(&f->fragtree, deleted?c:NULL); - if (f->target) { - kfree(f->target); - f->target = NULL; - } - fds = f->dents; while(fds) { fd = fds; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 902a7dd10e5c..05d892c79339 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -47,7 +47,10 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb) static void jffs2_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + + kfree(f->target); + kmem_cache_free(jffs2_inode_cachep, f); } static void jffs2_destroy_inode(struct inode *inode) @@ -101,7 +104,8 @@ static int jffs2_sync_fs(struct super_block *sb, int wait) struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); #ifdef CONFIG_JFFS2_FS_WRITEBUFFER - cancel_delayed_work_sync(&c->wbuf_dwork); + if (jffs2_is_writebuffered(c)) + cancel_delayed_work_sync(&c->wbuf_dwork); #endif mutex_lock(&c->alloc_sem); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 4ca0b5c18192..853a69e493f5 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -650,11 +650,10 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, kn->id.generation = gen; /* - * set ino first. This barrier is paired with atomic_inc_not_zero in + * set ino first. This RELEASE is paired with atomic_inc_not_zero in * kernfs_find_and_get_node_by_ino */ - smp_mb__before_atomic(); - atomic_set(&kn->count, 1); + atomic_set_release(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); RB_CLEAR_NODE(&kn->rb); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index ff2716f9322e..0b22c39dad47 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -196,8 +196,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, return dentry; knparent = find_next_ancestor(kn, NULL); - if (WARN_ON(!knparent)) + if (WARN_ON(!knparent)) { + dput(dentry); return ERR_PTR(-EINVAL); + } do { struct dentry *dtmp; @@ -206,8 +208,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, if (kn == knparent) return dentry; kntmp = find_next_ancestor(kn, knparent); - if (WARN_ON(!kntmp)) + if (WARN_ON(!kntmp)) { + dput(dentry); return ERR_PTR(-EINVAL); + } dtmp = lookup_one_len_unlocked(kntmp->name, dentry, strlen(kntmp->name)); dput(dentry); diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index d20b92f271c2..0a67dd4250e9 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -442,7 +442,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) fl->fl_start = req->a_res.lock.fl.fl_start; fl->fl_end = req->a_res.lock.fl.fl_end; fl->fl_type = req->a_res.lock.fl.fl_type; - fl->fl_pid = 0; + fl->fl_pid = -req->a_res.lock.fl.fl_pid; break; default: status = nlm_stat_to_errno(req->a_res.status); diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 93fb7cf0b92b..f0b5c987d6ae 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -290,12 +290,11 @@ void nlmclnt_release_host(struct nlm_host *host) WARN_ON_ONCE(host->h_server); - if (refcount_dec_and_test(&host->h_count)) { + if (refcount_dec_and_mutex_lock(&host->h_count, &nlm_host_mutex)) { WARN_ON_ONCE(!list_empty(&host->h_lockowners)); WARN_ON_ONCE(!list_empty(&host->h_granted)); WARN_ON_ONCE(!list_empty(&host->h_reclaim)); - mutex_lock(&nlm_host_mutex); nlm_destroy_host_locked(host); mutex_unlock(&nlm_host_mutex); } diff --git a/fs/namei.c b/fs/namei.c index d3ae2a84a271..2a8c41bc227f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3701,8 +3701,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) if (error) return error; - if ((S_ISCHR(mode) || S_ISBLK(mode)) && - !ns_capable(dentry->d_sb->s_user_ns, CAP_MKNOD)) + if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 96d5f8135eb9..c092661147b3 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -290,6 +290,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat struct nfs_client *clp; const struct sockaddr *sap = data->addr; struct nfs_net *nn = net_generic(data->net, nfs_net_id); + int error; again: list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { @@ -302,9 +303,11 @@ again: if (clp->cl_cons_state > NFS_CS_READY) { refcount_inc(&clp->cl_count); spin_unlock(&nn->nfs_client_lock); - nfs_wait_client_init_complete(clp); + error = nfs_wait_client_init_complete(clp); nfs_put_client(clp); spin_lock(&nn->nfs_client_lock); + if (error < 0) + return ERR_PTR(error); goto again; } @@ -413,6 +416,8 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) clp = nfs_match_client(cl_init); if (clp) { spin_unlock(&nn->nfs_client_lock); + if (IS_ERR(clp)) + return clp; if (new) new->rpc_ops->free_client(new); return nfs_found_client(cl_init, clp); @@ -459,7 +464,7 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, case XPRT_TRANSPORT_RDMA: if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_TCP_RETRANS; - if (timeo == NFS_UNSPEC_TIMEO || to->to_retries == 0) + if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0) to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_TCP_TIMEOUT) to->to_initval = NFS_MAX_TCP_TIMEOUT; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index aa12c3063bae..33824a0a57bf 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -98,8 +98,11 @@ struct nfs_direct_req { struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */ struct work_struct work; int flags; + /* for write */ #define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ #define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ + /* for read */ +#define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */ struct nfs_writeverf verf; /* unstable write verifier */ }; @@ -412,7 +415,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) struct nfs_page *req = nfs_list_entry(hdr->pages.next); struct page *page = req->wb_page; - if (!PageCompound(page) && bytes < hdr->good_bytes) + if (!PageCompound(page) && bytes < hdr->good_bytes && + (dreq->flags == NFS_ODIRECT_SHOULD_DIRTY)) set_page_dirty(page); bytes += req->wb_bytes; nfs_list_remove_request(req); @@ -587,6 +591,9 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; + if (iter_is_iovec(iter)) + dreq->flags = NFS_ODIRECT_SHOULD_DIRTY; + nfs_start_io_direct(inode); NFS_I(inode)->read_io += count; diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index d175724ff566..2478a69da0f0 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -904,7 +904,7 @@ fl_pnfs_update_layout(struct inode *ino, status = filelayout_check_deviceid(lo, fl, gfp_flags); if (status) { pnfs_put_lseg(lseg); - lseg = ERR_PTR(status); + lseg = NULL; } out: return lseg; diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index fed06fd9998d..94f98e190e63 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -329,9 +329,6 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, }; ssize_t err, err2; - if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY)) - return -EOPNOTSUPP; - src_lock = nfs_get_lock_context(nfs_file_open_context(src)); if (IS_ERR(src_lock)) return PTR_ERR(src_lock); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 4288a6ecaf75..134858507268 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -133,15 +133,11 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t count, unsigned int flags) { - ssize_t ret; - + if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY)) + return -EOPNOTSUPP; if (file_inode(file_in) == file_inode(file_out)) - return -EINVAL; -retry: - ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); - if (ret == -EAGAIN) - goto retry; - return ret; + return -EOPNOTSUPP; + return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); } static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 3f23b6840547..bf34ddaa2ad7 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -44,6 +44,7 @@ #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> +#include <keys/request_key_auth-type.h> #include <linux/module.h> #include "internal.h" @@ -59,7 +60,7 @@ static struct key_type key_type_id_resolver_legacy; struct idmap_legacy_upcalldata { struct rpc_pipe_msg pipe_msg; struct idmap_msg idmap_msg; - struct key_construction *key_cons; + struct key *authkey; struct idmap *idmap; }; @@ -384,7 +385,7 @@ static const match_table_t nfs_idmap_tokens = { { Opt_find_err, NULL } }; -static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *); +static int nfs_idmap_legacy_upcall(struct key *, void *); static ssize_t idmap_pipe_downcall(struct file *, const char __user *, size_t); static void idmap_release_pipe(struct inode *); @@ -549,11 +550,12 @@ nfs_idmap_prepare_pipe_upcall(struct idmap *idmap, static void nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret) { - struct key_construction *cons = idmap->idmap_upcall_data->key_cons; + struct key *authkey = idmap->idmap_upcall_data->authkey; kfree(idmap->idmap_upcall_data); idmap->idmap_upcall_data = NULL; - complete_request_key(cons, ret); + complete_request_key(authkey, ret); + key_put(authkey); } static void @@ -563,15 +565,14 @@ nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret) nfs_idmap_complete_pipe_upcall_locked(idmap, ret); } -static int nfs_idmap_legacy_upcall(struct key_construction *cons, - const char *op, - void *aux) +static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux) { struct idmap_legacy_upcalldata *data; + struct request_key_auth *rka = get_request_key_auth(authkey); struct rpc_pipe_msg *msg; struct idmap_msg *im; struct idmap *idmap = (struct idmap *)aux; - struct key *key = cons->key; + struct key *key = rka->target_key; int ret = -ENOKEY; if (!aux) @@ -586,7 +587,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, msg = &data->pipe_msg; im = &data->idmap_msg; data->idmap = idmap; - data->key_cons = cons; + data->authkey = key_get(authkey); ret = nfs_idmap_prepare_message(key->description, idmap, im, msg); if (ret < 0) @@ -604,7 +605,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, out2: kfree(data); out1: - complete_request_key(cons, ret); + complete_request_key(authkey, ret); return ret; } @@ -651,9 +652,10 @@ out: static ssize_t idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { + struct request_key_auth *rka; struct rpc_inode *rpci = RPC_I(file_inode(filp)); struct idmap *idmap = (struct idmap *)rpci->private; - struct key_construction *cons; + struct key *authkey; struct idmap_msg im; size_t namelen_in; int ret = -ENOKEY; @@ -665,7 +667,8 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) if (idmap->idmap_upcall_data == NULL) goto out_noupcall; - cons = idmap->idmap_upcall_data->key_cons; + authkey = idmap->idmap_upcall_data->authkey; + rka = get_request_key_auth(authkey); if (mlen != sizeof(im)) { ret = -ENOSPC; @@ -690,9 +693,9 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) ret = nfs_idmap_read_and_verify_message(&im, &idmap->idmap_upcall_data->idmap_msg, - cons->key, cons->authkey); + rka->target_key, authkey); if (ret >= 0) { - key_set_timeout(cons->key, nfs_idmap_cache_timeout); + key_set_timeout(rka->target_key, nfs_idmap_cache_timeout); ret = mlen; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 77f0a23dc3e4..42850fb5944b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -947,6 +947,13 @@ nfs4_sequence_process_interrupted(struct nfs_client *client, #endif /* !CONFIG_NFS_V4_1 */ +static void nfs41_sequence_res_init(struct nfs4_sequence_res *res) +{ + res->sr_timestamp = jiffies; + res->sr_status_flags = 0; + res->sr_status = 1; +} + static void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -958,10 +965,6 @@ void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args, args->sa_slot = slot; res->sr_slot = slot; - res->sr_timestamp = jiffies; - res->sr_status_flags = 0; - res->sr_status = 1; - } int nfs4_setup_sequence(struct nfs_client *client, @@ -1007,6 +1010,7 @@ int nfs4_setup_sequence(struct nfs_client *client, trace_nfs4_setup_sequence(session, args); out_start: + nfs41_sequence_res_init(res); rpc_call_start(task); return 0; @@ -2905,7 +2909,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, } out: - nfs4_sequence_free_slot(&opendata->o_res.seq_res); + if (!opendata->cancelled) + nfs4_sequence_free_slot(&opendata->o_res.seq_res); return ret; } @@ -6845,7 +6850,6 @@ struct nfs4_lock_waiter { struct task_struct *task; struct inode *inode; struct nfs_lowner *owner; - bool notified; }; static int @@ -6867,13 +6871,13 @@ nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, vo /* Make sure it's for the right inode */ if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh)) return 0; - - waiter->notified = true; } /* override "private" so we can use default_wake_function */ wait->private = waiter->task; - ret = autoremove_wake_function(wait, mode, flags, key); + ret = woken_wake_function(wait, mode, flags, key); + if (ret) + list_del_init(&wait->entry); wait->private = waiter; return ret; } @@ -6882,7 +6886,6 @@ static int nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { int status = -ERESTARTSYS; - unsigned long flags; struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner; struct nfs_server *server = NFS_SERVER(state->inode); struct nfs_client *clp = server->nfs_client; @@ -6892,8 +6895,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) .s_dev = server->s_dev }; struct nfs4_lock_waiter waiter = { .task = current, .inode = state->inode, - .owner = &owner, - .notified = false }; + .owner = &owner}; wait_queue_entry_t wait; /* Don't bother with waitqueue if we don't expect a callback */ @@ -6903,27 +6905,22 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) init_wait(&wait); wait.private = &waiter; wait.func = nfs4_wake_lock_waiter; - add_wait_queue(q, &wait); while(!signalled()) { - waiter.notified = false; + add_wait_queue(q, &wait); status = nfs4_proc_setlk(state, cmd, request); - if ((status != -EAGAIN) || IS_SETLK(cmd)) + if ((status != -EAGAIN) || IS_SETLK(cmd)) { + finish_wait(q, &wait); break; - - status = -ERESTARTSYS; - spin_lock_irqsave(&q->lock, flags); - if (waiter.notified) { - spin_unlock_irqrestore(&q->lock, flags); - continue; } - set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irqrestore(&q->lock, flags); - freezable_schedule_timeout(NFS4_LOCK_MAXTIMEOUT); + status = -ERESTARTSYS; + freezer_do_not_count(); + wait_woken(&wait, TASK_INTERRUPTIBLE, NFS4_LOCK_MAXTIMEOUT); + freezer_count(); + finish_wait(q, &wait); } - finish_wait(q, &wait); return status; } #else /* !CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 1698dd2ca20b..f10952680bd9 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -159,6 +159,10 @@ int nfs40_discover_server_trunking(struct nfs_client *clp, /* Sustain the lease, even if it's empty. If the clientid4 * goes stale it's of no use for trunking discovery. */ nfs4_schedule_state_renewal(*result); + + /* If the client state need to recover, do it. */ + if (clp->cl_state) + nfs4_schedule_state_manager(clp); } out: return status; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 3dbd15b47c27..0ec6bce3dd69 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -989,6 +989,17 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) } } +static void +nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) +{ + LIST_HEAD(head); + + nfs_list_remove_request(req); + nfs_list_add_request(req, &head); + desc->pg_completion_ops->error_cleanup(&head); +} + /** * nfs_pageio_add_request - Attempt to coalesce a request into a page list. * @desc: destination io descriptor @@ -1026,10 +1037,8 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, nfs_page_group_unlock(req); desc->pg_moreio = 1; nfs_pageio_doio(desc); - if (desc->pg_error < 0) - return 0; - if (mirror->pg_recoalesce) - return 0; + if (desc->pg_error < 0 || mirror->pg_recoalesce) + goto out_cleanup_subreq; /* retry add_request for this subreq */ nfs_page_group_lock(req); continue; @@ -1062,6 +1071,10 @@ err_ptr: desc->pg_error = PTR_ERR(subreq); nfs_page_group_unlock(req); return 0; +out_cleanup_subreq: + if (req != subreq) + nfs_pageio_cleanup_request(desc, subreq); + return 0; } static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) @@ -1080,7 +1093,6 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) struct nfs_page *req; req = list_first_entry(&head, struct nfs_page, wb_list); - nfs_list_remove_request(req); if (__nfs_pageio_add_request(desc, req)) continue; if (desc->pg_error < 0) { @@ -1169,11 +1181,14 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (nfs_pgio_has_mirroring(desc)) desc->pg_mirror_idx = midx; if (!nfs_pageio_add_request_mirror(desc, dupreq)) - goto out_failed; + goto out_cleanup_subreq; } return 1; +out_cleanup_subreq: + if (req != dupreq) + nfs_pageio_cleanup_request(desc, dupreq); out_failed: /* remember fatal errors */ if (nfs_error_is_fatal(desc->pg_error)) @@ -1199,7 +1214,7 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, desc->pg_mirror_idx = mirror_idx; for (;;) { nfs_pageio_doio(desc); - if (!mirror->pg_recoalesce) + if (desc->pg_error < 0 || !mirror->pg_recoalesce) break; if (!nfs_do_recoalesce(desc)) break; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ac4b2f005778..6df9b85caf20 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1906,6 +1906,11 @@ static int nfs_parse_devname(const char *dev_name, size_t len; char *end; + if (unlikely(!dev_name || !*dev_name)) { + dfprintk(MOUNT, "NFS: device name not specified\n"); + return -EINVAL; + } + /* Is the host name protected with square brakcets? */ if (*dev_name == '[') { end = strchr(++dev_name, ']'); @@ -2047,7 +2052,8 @@ static int nfs23_validate_mount_data(void *options, memcpy(sap, &data->addr, sizeof(data->addr)); args->nfs_server.addrlen = sizeof(data->addr); args->nfs_server.port = ntohs(data->addr.sin_port); - if (!nfs_verify_server_address(sap)) + if (sap->sa_family != AF_INET || + !nfs_verify_server_address(sap)) goto out_no_address; if (!(data->flags & NFS_MOUNT_TCP)) @@ -2409,8 +2415,7 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n goto Ebusy; if (a->acdirmax != b->acdirmax) goto Ebusy; - if (b->auth_info.flavor_len > 0 && - clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) + if (clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) goto Ebusy; return 1; Ebusy: diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 586726a590d8..51d0b7913c04 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -238,9 +238,9 @@ out: } /* A writeback failed: mark the page as bad, and invalidate the page cache */ -static void nfs_set_pageerror(struct page *page) +static void nfs_set_pageerror(struct address_space *mapping) { - nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); + nfs_zap_mapping(mapping->host, mapping); } /* @@ -621,11 +621,12 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, nfs_set_page_writeback(page); WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); - ret = 0; + ret = req->wb_context->error; /* If there is a fatal error that covers this write, just exit */ - if (nfs_error_is_fatal_on_server(req->wb_context->error)) + if (nfs_error_is_fatal_on_server(ret)) goto out_launder; + ret = 0; if (!nfs_pageio_add_request(pgio, req)) { ret = pgio->pg_error; /* @@ -635,9 +636,9 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, nfs_context_set_write_error(req->wb_context, ret); if (nfs_error_is_fatal_on_server(ret)) goto out_launder; - } + } else + ret = -EAGAIN; nfs_redirty_request(req); - ret = -EAGAIN; } else nfs_add_stats(page_file_mapping(page)->host, NFSIOS_WRITEPAGES, 1); @@ -993,7 +994,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) nfs_list_remove_request(req); if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes < bytes)) { - nfs_set_pageerror(req->wb_page); + nfs_set_pageerror(page_file_mapping(req->wb_page)); nfs_context_set_write_error(req->wb_context, hdr->error); goto remove_req; } @@ -1329,7 +1330,8 @@ int nfs_updatepage(struct file *file, struct page *page, unsigned int offset, unsigned int count) { struct nfs_open_context *ctx = nfs_file_open_context(file); - struct inode *inode = page_file_mapping(page)->host; + struct address_space *mapping = page_file_mapping(page); + struct inode *inode = mapping->host; int status = 0; nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); @@ -1347,7 +1349,7 @@ int nfs_updatepage(struct file *file, struct page *page, status = nfs_writepage_setup(ctx, page, offset, count); if (status < 0) - nfs_set_pageerror(page); + nfs_set_pageerror(mapping); else __set_page_dirty_nobuffers(page); out: diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 9eb8086ea841..c9cf46e0c040 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -463,8 +463,19 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) &resp->common, nfs3svc_encode_entry); memcpy(resp->verf, argp->verf, 8); resp->count = resp->buffer - argp->buffer; - if (resp->offset) - xdr_encode_hyper(resp->offset, argp->cookie); + if (resp->offset) { + loff_t offset = argp->cookie; + + if (unlikely(resp->offset1)) { + /* we ended up with offset on a page boundary */ + *resp->offset = htonl(offset >> 32); + *resp->offset1 = htonl(offset & 0xffffffff); + resp->offset1 = NULL; + } else { + xdr_encode_hyper(resp->offset, offset); + } + resp->offset = NULL; + } RETURN_STATUS(nfserr); } @@ -533,6 +544,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) } else { xdr_encode_hyper(resp->offset, offset); } + resp->offset = NULL; } RETURN_STATUS(nfserr); diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 9b973f4f7d01..83919116d5cb 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -921,6 +921,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, } else { xdr_encode_hyper(cd->offset, offset64); } + cd->offset = NULL; } /* diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 601bf33c26a0..ebbb0285addb 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -926,8 +926,9 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) cb->cb_seq_status = 1; cb->cb_status = 0; if (minorversion) { - if (!nfsd41_cb_get_slot(clp, task)) + if (!cb->cb_holds_slot && !nfsd41_cb_get_slot(clp, task)) return; + cb->cb_holds_slot = true; } rpc_call_start(task); } @@ -954,6 +955,9 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback return true; } + if (!cb->cb_holds_slot) + goto need_restart; + switch (cb->cb_seq_status) { case 0: /* @@ -992,6 +996,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback cb->cb_seq_status); } + cb->cb_holds_slot = false; clear_bit(0, &clp->cl_cb_slot_busy); rpc_wake_up_next(&clp->cl_cb_waitq); dprintk("%s: freed slot, new seqid=%d\n", __func__, @@ -1199,6 +1204,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_seq_status = 1; cb->cb_status = 0; cb->cb_need_restart = false; + cb->cb_holds_slot = false; } void nfsd4_run_cb(struct nfsd4_callback *cb) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 9d6b4f0f1a25..f35aa9f88b5e 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1015,8 +1015,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nvecs = svc_fill_write_vector(rqstp, write->wr_pagelist, &write->wr_head, write->wr_buflen); - if (!nvecs) - return nfserr_io; WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9c6d1d57b598..bec75600e692 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1514,16 +1514,16 @@ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca) { u32 slotsize = slot_bytes(ca); u32 num = ca->maxreqs; - int avail; + unsigned long avail, total_avail; spin_lock(&nfsd_drc_lock); - avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, - nfsd_drc_max_mem - nfsd_drc_mem_used); + total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used; + avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail); /* * Never use more than a third of the remaining memory, * unless it's the only way to give this client a slot: */ - avail = clamp_t(int, avail, slotsize, avail/3); + avail = clamp_t(int, avail, slotsize, total_avail/3); num = min_t(int, num, avail / slotsize); nfsd_drc_mem_used += num * slotsize; spin_unlock(&nfsd_drc_lock); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 7fb9f7c667b1..cb69660d0779 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1126,6 +1126,8 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) case 'Y': case 'y': case '1': + if (!nn->nfsd_serv) + return -EBUSY; nfsd4_end_grace(nn); break; default: @@ -1237,8 +1239,8 @@ static __net_init int nfsd_init_net(struct net *net) retval = nfsd_idmap_init(net); if (retval) goto out_idmap_error; - nn->nfsd4_lease = 45; /* default lease time */ - nn->nfsd4_grace = 45; + nn->nfsd4_lease = 90; /* default lease time */ + nn->nfsd4_grace = 90; nn->somebody_reclaimed = false; nn->clverifier_counter = prandom_u32(); nn->clientid_counter = prandom_u32(); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 0b15dac7e609..0f07ad6dc1ef 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -70,6 +70,7 @@ struct nfsd4_callback { int cb_seq_status; int cb_status; bool cb_need_restart; + bool cb_holds_slot; }; struct nfsd4_callback_ops { diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ac6978d3208c..97a51690338e 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -519,8 +519,10 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); if (!fsn_mark) return -ENOENT; - else if (create) - return -EEXIST; + else if (create) { + ret = -EEXIST; + goto out; + } i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); @@ -548,6 +550,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, /* return the wd */ ret = i_mark->wd; +out: /* match the get from fsnotify_find_mark() */ fsnotify_put_mark(fsn_mark); @@ -724,8 +727,10 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, return -EBADF; /* IN_MASK_ADD and IN_MASK_CREATE don't make sense together */ - if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) - return -EINVAL; + if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) { + ret = -EINVAL; + goto fput_and_out; + } /* verify that this is indeed an inotify instance */ if (unlikely(f.file->f_op != &inotify_fops)) { diff --git a/fs/nsfs.c b/fs/nsfs.c index 60702d677bd4..30d150a4f0c6 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -85,13 +85,12 @@ slow: inode->i_fop = &ns_file_operations; inode->i_private = ns; - dentry = d_alloc_pseudo(mnt->mnt_sb, &empty_name); + dentry = d_alloc_anon(mnt->mnt_sb); if (!dentry) { iput(inode); return ERR_PTR(-ENOMEM); } d_instantiate(dentry, inode); - dentry->d_flags |= DCACHE_RCUACCESS; dentry->d_fsdata = (void *)ns->ops; d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry); if (d) { diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 99ee093182cb..cc9b32b9db7c 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-y := -Ifs/ocfs2 +ccflags-y := -I$(src) obj-$(CONFIG_OCFS2_FS) += \ ocfs2.o \ diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 1d098c3c00e0..9f8250df99f1 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -152,7 +152,6 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, #endif } - clear_buffer_uptodate(bh); get_bh(bh); /* for end_buffer_read_sync() */ bh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ, 0, bh); @@ -306,7 +305,6 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, continue; } - clear_buffer_uptodate(bh); get_bh(bh); /* for end_buffer_read_sync() */ if (validate) set_buffer_needs_validate(bh); diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 0e4166cc23a0..4ac775e32240 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -621,13 +621,15 @@ static void o2nm_node_group_drop_item(struct config_group *group, struct o2nm_node *node = to_o2nm_node(item); struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent); - o2net_disconnect_node(node); + if (cluster->cl_nodes[node->nd_num] == node) { + o2net_disconnect_node(node); - if (cluster->cl_has_local && - (cluster->cl_local_node == node->nd_num)) { - cluster->cl_has_local = 0; - cluster->cl_local_node = O2NM_INVALID_NODE_NUM; - o2net_stop_listening(node); + if (cluster->cl_has_local && + (cluster->cl_local_node == node->nd_num)) { + cluster->cl_has_local = 0; + cluster->cl_local_node = O2NM_INVALID_NODE_NUM; + o2net_stop_listening(node); + } } /* XXX call into net to stop this node from trading messages */ diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile index bd1aab1f49a4..ef2854422a6e 100644 --- a/fs/ocfs2/dlm/Makefile +++ b/fs/ocfs2/dlm/Makefile @@ -1,4 +1,4 @@ -ccflags-y := -Ifs/ocfs2 +ccflags-y := -I$(src)/.. obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile index eed3db8c5b49..33431a0296a3 100644 --- a/fs/ocfs2/dlmfs/Makefile +++ b/fs/ocfs2/dlmfs/Makefile @@ -1,4 +1,4 @@ -ccflags-y := -Ifs/ocfs2 +ccflags-y := -I$(src)/.. obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 4bf8d5854b27..af2888d23de3 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -148,16 +148,24 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) u64 blkno; struct dentry *parent; struct inode *dir = d_inode(child); + int set; trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno); + status = ocfs2_nfs_sync_lock(OCFS2_SB(dir->i_sb), 1); + if (status < 0) { + mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status); + parent = ERR_PTR(status); + goto bail; + } + status = ocfs2_inode_lock(dir, NULL, 0); if (status < 0) { if (status != -ENOENT) mlog_errno(status); parent = ERR_PTR(status); - goto bail; + goto unlock_nfs_sync; } status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno); @@ -166,11 +174,31 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) goto bail_unlock; } + status = ocfs2_test_inode_bit(OCFS2_SB(dir->i_sb), blkno, &set); + if (status < 0) { + if (status == -EINVAL) { + status = -ESTALE; + } else + mlog(ML_ERROR, "test inode bit failed %d\n", status); + parent = ERR_PTR(status); + goto bail_unlock; + } + + trace_ocfs2_get_dentry_test_bit(status, set); + if (!set) { + status = -ESTALE; + parent = ERR_PTR(status); + goto bail_unlock; + } + parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0)); bail_unlock: ocfs2_inode_unlock(dir, 0); +unlock_nfs_sync: + ocfs2_nfs_sync_unlock(OCFS2_SB(dir->i_sb), 1); + bail: trace_ocfs2_get_parent_end(parent); diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 7642b6712c39..30208233f65b 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -345,13 +345,18 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) if (num_used || alloc->id1.bitmap1.i_used || alloc->id1.bitmap1.i_total - || la->la_bm_off) - mlog(ML_ERROR, "Local alloc hasn't been recovered!\n" + || la->la_bm_off) { + mlog(ML_ERROR, "inconsistent detected, clean journal with" + " unrecovered local alloc, please run fsck.ocfs2!\n" "found = %u, set = %u, taken = %u, off = %u\n", num_used, le32_to_cpu(alloc->id1.bitmap1.i_used), le32_to_cpu(alloc->id1.bitmap1.i_total), OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); + status = -EINVAL; + goto bail; + } + osb->local_alloc_bh = alloc_bh; osb->local_alloc_state = OCFS2_LA_ENABLED; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7a5ee145c733..fc197e599e8c 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4716,22 +4716,23 @@ out: /* Lock an inode and grab a bh pointing to the inode. */ static int ocfs2_reflink_inodes_lock(struct inode *s_inode, - struct buffer_head **bh1, + struct buffer_head **bh_s, struct inode *t_inode, - struct buffer_head **bh2) + struct buffer_head **bh_t) { - struct inode *inode1; - struct inode *inode2; + struct inode *inode1 = s_inode; + struct inode *inode2 = t_inode; struct ocfs2_inode_info *oi1; struct ocfs2_inode_info *oi2; + struct buffer_head *bh1 = NULL; + struct buffer_head *bh2 = NULL; bool same_inode = (s_inode == t_inode); + bool need_swap = (inode1->i_ino > inode2->i_ino); int status; /* First grab the VFS and rw locks. */ lock_two_nondirectories(s_inode, t_inode); - inode1 = s_inode; - inode2 = t_inode; - if (inode1->i_ino > inode2->i_ino) + if (need_swap) swap(inode1, inode2); status = ocfs2_rw_lock(inode1, 1); @@ -4754,17 +4755,13 @@ static int ocfs2_reflink_inodes_lock(struct inode *s_inode, trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, (unsigned long long)oi2->ip_blkno); - if (*bh1) - *bh1 = NULL; - if (*bh2) - *bh2 = NULL; - /* We always want to lock the one with the lower lockid first. */ if (oi1->ip_blkno > oi2->ip_blkno) mlog_errno(-ENOLCK); /* lock id1 */ - status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET); + status = ocfs2_inode_lock_nested(inode1, &bh1, 1, + OI_LS_REFLINK_TARGET); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -4773,15 +4770,25 @@ static int ocfs2_reflink_inodes_lock(struct inode *s_inode, /* lock id2 */ if (!same_inode) { - status = ocfs2_inode_lock_nested(inode2, bh2, 1, + status = ocfs2_inode_lock_nested(inode2, &bh2, 1, OI_LS_REFLINK_TARGET); if (status < 0) { if (status != -ENOENT) mlog_errno(status); goto out_cl1; } - } else - *bh2 = *bh1; + } else { + bh2 = bh1; + } + + /* + * If we swapped inode order above, we have to swap the buffer heads + * before passing them back to the caller. + */ + if (need_swap) + swap(bh1, bh2); + *bh_s = bh1; + *bh_t = bh2; trace_ocfs2_double_lock_end( (unsigned long long)oi1->ip_blkno, @@ -4791,8 +4798,7 @@ static int ocfs2_reflink_inodes_lock(struct inode *s_inode, out_cl1: ocfs2_inode_unlock(inode1, 1); - brelse(*bh1); - *bh1 = NULL; + brelse(bh1); out_rw2: ocfs2_rw_unlock(inode2, 1); out_i2: diff --git a/fs/open.c b/fs/open.c index 0285ce7dbd51..a00350018a47 100644 --- a/fs/open.c +++ b/fs/open.c @@ -733,6 +733,12 @@ static int do_dentry_open(struct file *f, return 0; } + /* Any file opened for execve()/uselib() has to be a regular file. */ + if (unlikely(f->f_flags & FMODE_EXEC && !S_ISREG(inode->i_mode))) { + error = -EACCES; + goto cleanup_file; + } + if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { error = get_write_access(inode); if (unlikely(error)) @@ -1209,3 +1215,21 @@ int nonseekable_open(struct inode *inode, struct file *filp) } EXPORT_SYMBOL(nonseekable_open); + +/* + * stream_open is used by subsystems that want stream-like file descriptors. + * Such file descriptors are not seekable and don't have notion of position + * (file.f_pos is always 0). Contrary to file descriptors of other regular + * files, .read() and .write() can run simultaneously. + * + * stream_open never fails and is marked to return int so that it could be + * directly used as file_operations.open . + */ +int stream_open(struct inode *inode, struct file *filp) +{ + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS); + filp->f_mode |= FMODE_STREAM; + return 0; +} + +EXPORT_SYMBOL(stream_open); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 1cc797a08a5b..ffc73600216b 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -501,6 +501,24 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) { int err; + /* + * Copy up data first and then xattrs. Writing data after + * xattrs will remove security.capability xattr automatically. + */ + if (S_ISREG(c->stat.mode) && !c->metacopy) { + struct path upperpath, datapath; + + ovl_path_upper(c->dentry, &upperpath); + if (WARN_ON(upperpath.dentry != NULL)) + return -EIO; + upperpath.dentry = temp; + + ovl_path_lowerdata(c->dentry, &datapath); + err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); + if (err) + return err; + } + err = ovl_copy_xattr(c->lowerpath.dentry, temp); if (err) return err; @@ -518,19 +536,6 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) return err; } - if (S_ISREG(c->stat.mode) && !c->metacopy) { - struct path upperpath, datapath; - - ovl_path_upper(c->dentry, &upperpath); - BUG_ON(upperpath.dentry != NULL); - upperpath.dentry = temp; - - ovl_path_lowerdata(c->dentry, &datapath); - err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); - if (err) - return err; - } - if (c->metacopy) { err = ovl_check_setxattr(c->dentry, temp, OVL_XATTR_METACOPY, NULL, 0, -EOPNOTSUPP); @@ -706,6 +711,8 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) { struct path upperpath, datapath; int err; + char *capability = NULL; + ssize_t uninitialized_var(cap_size); ovl_path_upper(c->dentry, &upperpath); if (WARN_ON(upperpath.dentry == NULL)) @@ -715,15 +722,37 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) if (WARN_ON(datapath.dentry == NULL)) return -EIO; + if (c->stat.size) { + err = cap_size = ovl_getxattr(upperpath.dentry, XATTR_NAME_CAPS, + &capability, 0); + if (err < 0 && err != -ENODATA) + goto out; + } + err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); if (err) - return err; + goto out_free; + + /* + * Writing to upper file will clear security.capability xattr. We + * don't want that to happen for normal copy-up operation. + */ + if (capability) { + err = ovl_do_setxattr(upperpath.dentry, XATTR_NAME_CAPS, + capability, cap_size, 0); + if (err) + goto out_free; + } + err = vfs_removexattr(upperpath.dentry, OVL_XATTR_METACOPY); if (err) - return err; + goto out_free; ovl_set_upperdata(d_inode(c->dentry)); +out_free: + kfree(capability); +out: return err; } @@ -849,14 +878,14 @@ static bool ovl_open_need_copy_up(struct dentry *dentry, int flags) return true; } -int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags) +int ovl_maybe_copy_up(struct dentry *dentry, int flags) { int err = 0; - if (ovl_open_need_copy_up(dentry, file_flags)) { + if (ovl_open_need_copy_up(dentry, flags)) { err = ovl_want_write(dentry); if (!err) { - err = ovl_copy_up_flags(dentry, file_flags); + err = ovl_copy_up_flags(dentry, flags); ovl_drop_write(dentry); } } diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 3bbde0a9f48f..336f04da80ed 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -260,7 +260,7 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode, * hashed directory inode aliases. */ inode = ovl_get_inode(dentry->d_sb, &oip); - if (WARN_ON(IS_ERR(inode))) + if (IS_ERR(inode)) return PTR_ERR(inode); } else { WARN_ON(ovl_inode_real(inode) != d_inode(newdentry)); @@ -652,6 +652,18 @@ static int ovl_symlink(struct inode *dir, struct dentry *dentry, return ovl_create_object(dentry, S_IFLNK, 0, link); } +static int ovl_set_link_redirect(struct dentry *dentry) +{ + const struct cred *old_cred; + int err; + + old_cred = ovl_override_creds(dentry->d_sb); + err = ovl_set_redirect(dentry, false); + revert_creds(old_cred); + + return err; +} + static int ovl_link(struct dentry *old, struct inode *newdir, struct dentry *new) { @@ -672,7 +684,7 @@ static int ovl_link(struct dentry *old, struct inode *newdir, goto out_drop_write; if (ovl_is_metacopy_dentry(old)) { - err = ovl_set_redirect(old, false); + err = ovl_set_link_redirect(old); if (err) goto out_drop_write; } diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 8fa37cd7818a..54e5d17d7f3e 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -754,9 +754,8 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, goto out; } - /* Otherwise, get a connected non-upper dir or disconnected non-dir */ - if (d_is_dir(origin.dentry) && - (origin.dentry->d_flags & DCACHE_DISCONNECTED)) { + /* Find origin.dentry again with ovl_acceptable() layer check */ + if (d_is_dir(origin.dentry)) { dput(origin.dentry); origin.dentry = NULL; err = ovl_check_origin_fh(ofs, fh, true, NULL, &stack); @@ -769,6 +768,7 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, goto out_err; } + /* Get a connected non-upper dir or disconnected non-dir */ dentry = ovl_get_dentry(sb, NULL, &origin, index); out: diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 986313da0c88..0c810f20f778 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -116,11 +116,10 @@ static int ovl_real_fdget(const struct file *file, struct fd *real) static int ovl_open(struct inode *inode, struct file *file) { - struct dentry *dentry = file_dentry(file); struct file *realfile; int err; - err = ovl_open_maybe_copy_up(dentry, file->f_flags); + err = ovl_maybe_copy_up(file_dentry(file), file->f_flags); if (err) return err; @@ -390,7 +389,7 @@ static long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (ret) return ret; - ret = ovl_copy_up_with_data(file_dentry(file)); + ret = ovl_maybe_copy_up(file_dentry(file), O_WRONLY); if (!ret) { ret = ovl_real_ioctl(file, cmd, arg); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 3b7ed5d2279c..b48273e846ad 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -832,7 +832,7 @@ struct inode *ovl_get_inode(struct super_block *sb, int fsid = bylower ? oip->lowerpath->layer->fsid : 0; bool is_dir, metacopy = false; unsigned long ino = 0; - int err = -ENOMEM; + int err = oip->newinode ? -EEXIST : -ENOMEM; if (!realinode) realinode = d_inode(lowerdentry); @@ -917,6 +917,7 @@ out: return inode; out_err: + pr_warn_ratelimited("overlayfs: failed to get inode (%i)\n", err); inode = ERR_PTR(err); goto out; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index a3c0d9584312..80fb66426760 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -277,6 +277,8 @@ int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); int ovl_check_metacopy_xattr(struct dentry *dentry); bool ovl_is_metacopy_dentry(struct dentry *dentry); char *ovl_get_redirect_xattr(struct dentry *dentry, int padding); +ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value, + size_t padding); static inline bool ovl_is_impuredir(struct dentry *dentry) { @@ -409,7 +411,7 @@ extern const struct file_operations ovl_file_operations; int ovl_copy_up(struct dentry *dentry); int ovl_copy_up_with_data(struct dentry *dentry); int ovl_copy_up_flags(struct dentry *dentry, int flags); -int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags); +int ovl_maybe_copy_up(struct dentry *dentry, int flags); int ovl_copy_xattr(struct dentry *old, struct dentry *new); int ovl_set_attr(struct dentry *upper, struct kstat *stat); struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index ace4fe4c39a9..c9a2e3c6d537 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -867,28 +867,49 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry) return (oe->numlower > 1); } -char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) +ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value, + size_t padding) { - int res; - char *s, *next, *buf = NULL; + ssize_t res; + char *buf = NULL; - res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, NULL, 0); + res = vfs_getxattr(dentry, name, NULL, 0); if (res < 0) { if (res == -ENODATA || res == -EOPNOTSUPP) - return NULL; + return -ENODATA; goto fail; } - buf = kzalloc(res + padding + 1, GFP_KERNEL); - if (!buf) - return ERR_PTR(-ENOMEM); + if (res != 0) { + buf = kzalloc(res + padding, GFP_KERNEL); + if (!buf) + return -ENOMEM; - if (res == 0) - goto invalid; + res = vfs_getxattr(dentry, name, buf, res); + if (res < 0) + goto fail; + } + *value = buf; + + return res; + +fail: + pr_warn_ratelimited("overlayfs: failed to get xattr %s: err=%zi)\n", + name, res); + kfree(buf); + return res; +} - res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, buf, res); +char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) +{ + int res; + char *s, *next, *buf = NULL; + + res = ovl_getxattr(dentry, OVL_XATTR_REDIRECT, &buf, padding + 1); + if (res == -ENODATA) + return NULL; if (res < 0) - goto fail; + return ERR_PTR(res); if (res == 0) goto invalid; @@ -904,15 +925,9 @@ char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) } return buf; - -err_free: - kfree(buf); - return ERR_PTR(res); -fail: - pr_warn_ratelimited("overlayfs: failed to get redirect (%i)\n", res); - goto err_free; invalid: pr_warn_ratelimited("overlayfs: invalid redirect (%s)\n", buf); res = -EINVAL; - goto err_free; + kfree(buf); + return ERR_PTR(res); } diff --git a/fs/pipe.c b/fs/pipe.c index bdc5d3c0977d..2a297bce381f 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -189,9 +189,9 @@ EXPORT_SYMBOL(generic_pipe_buf_steal); * in the tee() system call, when we duplicate the buffers in one * pipe into another. */ -void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) +bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - get_page(buf->page); + return try_get_page(buf->page); } EXPORT_SYMBOL(generic_pipe_buf_get); @@ -234,6 +234,14 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { .get = generic_pipe_buf_get, }; +static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = { + .can_merge = 0, + .confirm = generic_pipe_buf_confirm, + .release = anon_pipe_buf_release, + .steal = anon_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + static const struct pipe_buf_operations packet_pipe_buf_ops = { .can_merge = 0, .confirm = generic_pipe_buf_confirm, @@ -242,6 +250,12 @@ static const struct pipe_buf_operations packet_pipe_buf_ops = { .get = generic_pipe_buf_get, }; +void pipe_buf_mark_unmergeable(struct pipe_buffer *buf) +{ + if (buf->ops == &anon_pipe_buf_ops) + buf->ops = &anon_pipe_buf_nomerge_ops; +} + static ssize_t pipe_read(struct kiocb *iocb, struct iov_iter *to) { diff --git a/fs/proc/base.c b/fs/proc/base.c index 38b67a3eab17..bf9476600c73 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1084,10 +1084,6 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) task_lock(p); if (!p->vfork_done && process_shares_mm(p, mm)) { - pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n", - task_pid_nr(p), p->comm, - p->signal->oom_score_adj, oom_adj, - task_pid_nr(task), task->comm); p->signal->oom_score_adj = oom_adj; if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) p->signal->oom_score_adj_min = (short)oom_adj; @@ -2546,6 +2542,11 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, rcu_read_unlock(); return -EACCES; } + /* Prevent changes to overridden credentials. */ + if (current_cred() != current_real_cred()) { + rcu_read_unlock(); + return -EBUSY; + } rcu_read_unlock(); if (count > PAGE_SIZE) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 8ae109429a88..e39bac94dead 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -256,7 +256,7 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); - d_set_d_op(dentry, &proc_misc_dentry_ops); + d_set_d_op(dentry, de->proc_dops); return d_splice_alias(inode, dentry); } read_unlock(&proc_subdir_lock); @@ -429,6 +429,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, INIT_LIST_HEAD(&ent->pde_openers); proc_set_user(ent, (*parent)->uid, (*parent)->gid); + ent->proc_dops = &proc_misc_dentry_ops; + out: return ent; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 5185d7f6a51e..95b14196f284 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -44,6 +44,7 @@ struct proc_dir_entry { struct completion *pde_unload_completion; const struct inode_operations *proc_iops; const struct file_operations *proc_fops; + const struct dentry_operations *proc_dops; union { const struct seq_operations *seq_ops; int (*single_show)(struct seq_file *, void *); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index d297fe4472a9..d0137e3e585e 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -54,6 +54,28 @@ static LIST_HEAD(kclist_head); static DECLARE_RWSEM(kclist_lock); static int kcore_need_update = 1; +/* + * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error + * Same as oldmem_pfn_is_ram in vmcore + */ +static int (*mem_pfn_is_ram)(unsigned long pfn); + +int __init register_mem_pfn_is_ram(int (*fn)(unsigned long pfn)) +{ + if (mem_pfn_is_ram) + return -EBUSY; + mem_pfn_is_ram = fn; + return 0; +} + +static int pfn_is_ram(unsigned long pfn) +{ + if (mem_pfn_is_ram) + return mem_pfn_is_ram(pfn); + else + return 1; +} + /* This doesn't grab kclist_lock, so it should only be used at init time. */ void __init kclist_add(struct kcore_list *new, void *addr, size_t size, int type) @@ -465,6 +487,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) goto out; } m = NULL; /* skip the list anchor */ + } else if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) { + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } } else if (m->type == KCORE_VMALLOC) { vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index d5e0fcb3439e..a7b12435519e 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -38,6 +38,22 @@ static struct net *get_proc_net(const struct inode *inode) return maybe_get_net(PDE_NET(PDE(inode))); } +static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + return 0; +} + +static const struct dentry_operations proc_net_dentry_ops = { + .d_revalidate = proc_net_d_revalidate, + .d_delete = always_delete_dentry, +}; + +static void pde_force_lookup(struct proc_dir_entry *pde) +{ + /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ + pde->proc_dops = &proc_net_dentry_ops; +} + static int seq_open_net(struct inode *inode, struct file *file) { unsigned int state_size = PDE(inode)->state_size; @@ -90,6 +106,7 @@ struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_seq_fops; p->seq_ops = ops; p->state_size = state_size; @@ -133,6 +150,7 @@ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_seq_fops; p->seq_ops = ops; p->state_size = state_size; @@ -181,6 +199,7 @@ struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_single_fops; p->single_show = show; return proc_register(parent, p); @@ -223,6 +242,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_single_fops; p->single_show = show; p->write = write; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 5ec2105487a0..31f25ff3999f 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -464,7 +464,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode = new_inode(sb); if (!inode) - goto out; + return ERR_PTR(-ENOMEM); inode->i_ino = get_next_ino(); @@ -474,8 +474,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, if (unlikely(head->unregistering)) { spin_unlock(&sysctl_lock); iput(inode); - inode = NULL; - goto out; + return ERR_PTR(-ENOENT); } ei->sysctl = head; ei->sysctl_entry = table; @@ -500,7 +499,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, if (root->set_ownership) root->set_ownership(head, table, &inode->i_uid, &inode->i_gid); -out: return inode; } @@ -549,10 +547,11 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, goto out; } - err = ERR_PTR(-ENOMEM); inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); - if (!inode) + if (IS_ERR(inode)) { + err = ERR_CAST(inode); goto out; + } d_set_d_op(dentry, &proc_sys_dentry_operations); err = d_splice_alias(inode, dentry); @@ -685,7 +684,7 @@ static bool proc_sys_fill_cache(struct file *file, if (d_in_lookup(child)) { struct dentry *res; inode = proc_sys_make_inode(dir->d_sb, head, table); - if (!inode) { + if (IS_ERR(inode)) { d_lookup_done(child); dput(child); return false; @@ -1627,8 +1626,11 @@ static void drop_sysctl_table(struct ctl_table_header *header) if (--header->nreg) return; - put_links(header); - start_unregistering(header); + if (parent) { + put_links(header); + start_unregistering(header); + } + if (!--header->count) kfree_rcu(header, rcu); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a027473561c6..c5819baee35c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -423,7 +423,7 @@ struct mem_size_stats { }; static void smaps_account(struct mem_size_stats *mss, struct page *page, - bool compound, bool young, bool dirty) + bool compound, bool young, bool dirty, bool locked) { int i, nr = compound ? 1 << compound_order(page) : 1; unsigned long size = nr * PAGE_SIZE; @@ -450,24 +450,31 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, else mss->private_clean += size; mss->pss += (u64)size << PSS_SHIFT; + if (locked) + mss->pss_locked += (u64)size << PSS_SHIFT; return; } for (i = 0; i < nr; i++, page++) { int mapcount = page_mapcount(page); + unsigned long pss = (PAGE_SIZE << PSS_SHIFT); if (mapcount >= 2) { if (dirty || PageDirty(page)) mss->shared_dirty += PAGE_SIZE; else mss->shared_clean += PAGE_SIZE; - mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; + mss->pss += pss / mapcount; + if (locked) + mss->pss_locked += pss / mapcount; } else { if (dirty || PageDirty(page)) mss->private_dirty += PAGE_SIZE; else mss->private_clean += PAGE_SIZE; - mss->pss += PAGE_SIZE << PSS_SHIFT; + mss->pss += pss; + if (locked) + mss->pss_locked += pss; } } } @@ -490,6 +497,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; + bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; if (pte_present(*pte)) { @@ -532,7 +540,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (!page) return; - smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte)); + smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -541,6 +549,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; + bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page; /* FOLL_DUMP will return -EFAULT on huge zero page */ @@ -555,7 +564,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, /* pass */; else VM_BUG_ON_PAGE(1, page); - smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); + smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked); } #else static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, @@ -737,11 +746,8 @@ static void smap_gather_stats(struct vm_area_struct *vma, } } #endif - /* mmap_sem is held in m_start */ walk_page_vma(vma, &smaps_walk); - if (vma->vm_flags & VM_LOCKED) - mss->pss_locked += mss->pss; } #define SEQ_PUT_DEC(str, val) \ @@ -1132,6 +1138,24 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = -EINTR; goto out_mm; } + /* + * Avoid to modify vma->vm_flags + * without locked ops while the + * coredump reads the vm_flags. + */ + if (!mmget_still_valid(mm)) { + /* + * Silently return "count" + * like if get_task_mm() + * failed. FIXME: should this + * function have returned + * -ESRCH if get_task_mm() + * failed like if + * get_proc_task() fails? + */ + up_write(&mm->mmap_sem); + goto out_mm; + } for (vma = mm->mmap; vma; vma = vma->vm_next) { vma->vm_flags &= ~VM_SOFTDIRTY; vma_set_page_prot(vma); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 5fcb845b9fec..8cf2218b46a7 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -482,12 +482,10 @@ static struct file_system_type pstore_fs_type = { .kill_sb = pstore_kill_sb, }; -static int __init init_pstore_fs(void) +int __init pstore_init_fs(void) { int err; - pstore_choose_compression(); - /* Create a convenient mount point for people to access pstore */ err = sysfs_create_mount_point(fs_kobj, "pstore"); if (err) @@ -500,14 +498,9 @@ static int __init init_pstore_fs(void) out: return err; } -module_init(init_pstore_fs) -static void __exit exit_pstore_fs(void) +void __exit pstore_exit_fs(void) { unregister_filesystem(&pstore_fs_type); sysfs_remove_mount_point(fs_kobj, "pstore"); } -module_exit(exit_pstore_fs) - -MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>"); -MODULE_LICENSE("GPL"); diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index fb767e28aeb2..7062ea4bc57c 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -37,7 +37,8 @@ extern bool pstore_is_mounted(void); extern void pstore_record_init(struct pstore_record *record, struct pstore_info *psi); -/* Called during module_init() */ -extern void __init pstore_choose_compression(void); +/* Called during pstore init/exit. */ +int __init pstore_init_fs(void); +void __exit pstore_exit_fs(void); #endif diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 15e99d5a681d..4bae3f4fe829 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -124,26 +124,27 @@ static const char *get_reason_str(enum kmsg_dump_reason reason) } } -bool pstore_cannot_block_path(enum kmsg_dump_reason reason) +/* + * Should pstore_dump() wait for a concurrent pstore_dump()? If + * not, the current pstore_dump() will report a failure to dump + * and return. + */ +static bool pstore_cannot_wait(enum kmsg_dump_reason reason) { - /* - * In case of NMI path, pstore shouldn't be blocked - * regardless of reason. - */ + /* In NMI path, pstore shouldn't block regardless of reason. */ if (in_nmi()) return true; switch (reason) { /* In panic case, other cpus are stopped by smp_send_stop(). */ case KMSG_DUMP_PANIC: - /* Emergency restart shouldn't be blocked by spin lock. */ + /* Emergency restart shouldn't be blocked. */ case KMSG_DUMP_EMERG: return true; default: return false; } } -EXPORT_SYMBOL_GPL(pstore_cannot_block_path); #if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS) static int zbufsize_deflate(size_t size) @@ -274,37 +275,59 @@ static int pstore_decompress(void *in, void *out, static void allocate_buf_for_compression(void) { + struct crypto_comp *ctx; + int size; + char *buf; + + /* Skip if not built-in or compression backend not selected yet. */ if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !zbackend) return; + /* Skip if no pstore backend yet or compression init already done. */ + if (!psinfo || tfm) + return; + if (!crypto_has_comp(zbackend->name, 0, 0)) { - pr_err("No %s compression\n", zbackend->name); + pr_err("Unknown compression: %s\n", zbackend->name); return; } - big_oops_buf_sz = zbackend->zbufsize(psinfo->bufsize); - if (big_oops_buf_sz <= 0) + size = zbackend->zbufsize(psinfo->bufsize); + if (size <= 0) { + pr_err("Invalid compression size for %s: %d\n", + zbackend->name, size); return; + } - big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL); - if (!big_oops_buf) { - pr_err("allocate compression buffer error!\n"); + buf = kmalloc(size, GFP_KERNEL); + if (!buf) { + pr_err("Failed %d byte compression buffer allocation for: %s\n", + size, zbackend->name); return; } - tfm = crypto_alloc_comp(zbackend->name, 0, 0); - if (IS_ERR_OR_NULL(tfm)) { - kfree(big_oops_buf); - big_oops_buf = NULL; - pr_err("crypto_alloc_comp() failed!\n"); + ctx = crypto_alloc_comp(zbackend->name, 0, 0); + if (IS_ERR_OR_NULL(ctx)) { + kfree(buf); + pr_err("crypto_alloc_comp('%s') failed: %ld\n", zbackend->name, + PTR_ERR(ctx)); return; } + + /* A non-NULL big_oops_buf indicates compression is available. */ + tfm = ctx; + big_oops_buf_sz = size; + big_oops_buf = buf; + + pr_info("Using compression: %s\n", zbackend->name); } static void free_buf_for_compression(void) { - if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && !IS_ERR_OR_NULL(tfm)) + if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) { crypto_free_comp(tfm); + tfm = NULL; + } kfree(big_oops_buf); big_oops_buf = NULL; big_oops_buf_sz = 0; @@ -358,23 +381,23 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned long total = 0; const char *why; unsigned int part = 1; - unsigned long flags = 0; - int is_locked; int ret; why = get_reason_str(reason); - if (pstore_cannot_block_path(reason)) { - is_locked = spin_trylock_irqsave(&psinfo->buf_lock, flags); - if (!is_locked) { - pr_err("pstore dump routine blocked in %s path, may corrupt error record\n" - , in_nmi() ? "NMI" : why); + if (down_trylock(&psinfo->buf_lock)) { + /* Failed to acquire lock: give up if we cannot wait. */ + if (pstore_cannot_wait(reason)) { + pr_err("dump skipped in %s path: may corrupt error record\n", + in_nmi() ? "NMI" : why); + return; + } + if (down_interruptible(&psinfo->buf_lock)) { + pr_err("could not grab semaphore?!\n"); return; } - } else { - spin_lock_irqsave(&psinfo->buf_lock, flags); - is_locked = 1; } + oopscount++; while (total < kmsg_bytes) { char *dst; @@ -391,7 +414,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, record.part = part; record.buf = psinfo->buf; - if (big_oops_buf && is_locked) { + if (big_oops_buf) { dst = big_oops_buf; dst_size = big_oops_buf_sz; } else { @@ -409,7 +432,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, dst_size, &dump_size)) break; - if (big_oops_buf && is_locked) { + if (big_oops_buf) { zipped_len = pstore_compress(dst, psinfo->buf, header_size + dump_size, psinfo->bufsize); @@ -432,8 +455,8 @@ static void pstore_dump(struct kmsg_dumper *dumper, total += record.size; part++; } - if (is_locked) - spin_unlock_irqrestore(&psinfo->buf_lock, flags); + + up(&psinfo->buf_lock); } static struct kmsg_dumper pstore_dumper = { @@ -456,31 +479,14 @@ static void pstore_unregister_kmsg(void) #ifdef CONFIG_PSTORE_CONSOLE static void pstore_console_write(struct console *con, const char *s, unsigned c) { - const char *e = s + c; + struct pstore_record record; - while (s < e) { - struct pstore_record record; - unsigned long flags; - - pstore_record_init(&record, psinfo); - record.type = PSTORE_TYPE_CONSOLE; + pstore_record_init(&record, psinfo); + record.type = PSTORE_TYPE_CONSOLE; - if (c > psinfo->bufsize) - c = psinfo->bufsize; - - if (oops_in_progress) { - if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) - break; - } else { - spin_lock_irqsave(&psinfo->buf_lock, flags); - } - record.buf = (char *)s; - record.size = c; - psinfo->write(&record); - spin_unlock_irqrestore(&psinfo->buf_lock, flags); - s += c; - c = e - s; - } + record.buf = (char *)s; + record.size = c; + psinfo->write(&record); } static struct console pstore_console = { @@ -569,6 +575,7 @@ int pstore_register(struct pstore_info *psi) psi->write_user = pstore_write_user_compat; psinfo = psi; mutex_init(&psinfo->read_mutex); + sema_init(&psinfo->buf_lock, 1); spin_unlock(&pstore_lock); if (owner && !try_module_get(owner)) { @@ -576,7 +583,8 @@ int pstore_register(struct pstore_info *psi) return -EINVAL; } - allocate_buf_for_compression(); + if (psi->flags & PSTORE_FLAGS_DMESG) + allocate_buf_for_compression(); if (pstore_is_mounted()) pstore_get_records(0); @@ -774,14 +782,43 @@ void __init pstore_choose_compression(void) for (step = zbackends; step->name; step++) { if (!strcmp(compress, step->name)) { zbackend = step; - pr_info("using %s compression\n", zbackend->name); return; } } } +static int __init pstore_init(void) +{ + int ret; + + pstore_choose_compression(); + + /* + * Check if any pstore backends registered earlier but did not + * initialize compression because crypto was not ready. If so, + * initialize compression now. + */ + allocate_buf_for_compression(); + + ret = pstore_init_fs(); + if (ret) + return ret; + + return 0; +} +late_initcall(pstore_init); + +static void __exit pstore_exit(void) +{ + pstore_exit_fs(); +} +module_exit(pstore_exit) + module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "Pstore compression to use"); module_param(backend, charp, 0444); MODULE_PARM_DESC(backend, "Pstore backend to use"); + +MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>"); +MODULE_LICENSE("GPL"); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 03cd59375abe..316c16463b20 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -713,18 +713,15 @@ static int ramoops_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct ramoops_platform_data *pdata = dev->platform_data; + struct ramoops_platform_data pdata_local; struct ramoops_context *cxt = &oops_cxt; size_t dump_mem_sz; phys_addr_t paddr; int err = -EINVAL; if (dev_of_node(dev) && !pdata) { - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); - if (!pdata) { - pr_err("cannot allocate platform data buffer\n"); - err = -ENOMEM; - goto fail_out; - } + pdata = &pdata_local; + memset(pdata, 0, sizeof(*pdata)); err = ramoops_parse_dt(pdev, pdata); if (err < 0) @@ -806,27 +803,36 @@ static int ramoops_probe(struct platform_device *pdev) cxt->pstore.data = cxt; /* - * Since bufsize is only used for dmesg crash dumps, it - * must match the size of the dprz record (after PRZ header - * and ECC bytes have been accounted for). + * Prepare frontend flags based on which areas are initialized. + * For ramoops_init_przs() cases, the "max count" variable tells + * if there are regions present. For ramoops_init_prz() cases, + * the single region size is how to check. */ - cxt->pstore.bufsize = cxt->dprzs[0]->buffer_size; - cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL); - if (!cxt->pstore.buf) { - pr_err("cannot allocate pstore crash dump buffer\n"); - err = -ENOMEM; - goto fail_clear; - } - spin_lock_init(&cxt->pstore.buf_lock); - - cxt->pstore.flags = PSTORE_FLAGS_DMESG; + cxt->pstore.flags = 0; + if (cxt->max_dump_cnt) + cxt->pstore.flags |= PSTORE_FLAGS_DMESG; if (cxt->console_size) cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE; - if (cxt->ftrace_size) + if (cxt->max_ftrace_cnt) cxt->pstore.flags |= PSTORE_FLAGS_FTRACE; if (cxt->pmsg_size) cxt->pstore.flags |= PSTORE_FLAGS_PMSG; + /* + * Since bufsize is only used for dmesg crash dumps, it + * must match the size of the dprz record (after PRZ header + * and ECC bytes have been accounted for). + */ + if (cxt->pstore.flags & PSTORE_FLAGS_DMESG) { + cxt->pstore.bufsize = cxt->dprzs[0]->buffer_size; + cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL); + if (!cxt->pstore.buf) { + pr_err("cannot allocate pstore crash dump buffer\n"); + err = -ENOMEM; + goto fail_clear; + } + } + err = pstore_register(&cxt->pstore); if (err) { pr_err("registering with pstore failed\n"); @@ -959,7 +965,7 @@ static int __init ramoops_init(void) return ret; } -late_initcall(ramoops_init); +postcore_initcall(ramoops_init); static void __exit ramoops_exit(void) { diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 0792595ebcfb..3c777ec80d47 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -496,6 +496,11 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, sig ^= PERSISTENT_RAM_SIG; if (prz->buffer->sig == sig) { + if (buffer_size(prz) == 0) { + pr_debug("found existing empty buffer\n"); + return 0; + } + if (buffer_size(prz) > prz->buffer_size || buffer_start(prz) > buffer_size(prz)) pr_info("found existing invalid buffer, size %zu, start %zu\n", diff --git a/fs/quota/quota.c b/fs/quota/quota.c index f0cbf58ad4da..fd5dd806f1b9 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -791,7 +791,8 @@ static int quotactl_cmd_write(int cmd) /* Return true if quotactl command is manipulating quota on/off state */ static bool quotactl_cmd_onoff(int cmd) { - return (cmd == Q_QUOTAON) || (cmd == Q_QUOTAOFF); + return (cmd == Q_QUOTAON) || (cmd == Q_QUOTAOFF) || + (cmd == Q_XQUOTAON) || (cmd == Q_XQUOTAOFF); } /* diff --git a/fs/read_write.c b/fs/read_write.c index 8a2737f0d61d..85fd7a8ee29e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -560,12 +560,13 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ static inline loff_t file_pos_read(struct file *file) { - return file->f_pos; + return file->f_mode & FMODE_STREAM ? 0 : file->f_pos; } static inline void file_pos_write(struct file *file, loff_t pos) { - file->f_pos = pos; + if ((file->f_mode & FMODE_STREAM) == 0) + file->f_pos = pos; } ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) @@ -1241,6 +1242,9 @@ COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct compat_iovec __user *,vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { + if (pos == -1) + return do_compat_readv(fd, vec, vlen, flags); + return do_compat_preadv64(fd, vec, vlen, pos, flags); } #endif @@ -1347,6 +1351,9 @@ COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct compat_iovec __user *,vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { + if (pos == -1) + return do_compat_writev(fd, vec, vlen, flags); + return do_compat_pwritev64(fd, vec, vlen, pos, flags); } #endif diff --git a/fs/splice.c b/fs/splice.c index b3daa971f597..485e409ef841 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -333,8 +333,8 @@ const struct pipe_buf_operations default_pipe_buf_ops = { .get = generic_pipe_buf_get, }; -static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { return 1; } @@ -1584,7 +1584,11 @@ retry: * Get a reference to this pipe buffer, * so we can copy the contents over. */ - pipe_buf_get(ipipe, ibuf); + if (!pipe_buf_get(ipipe, ibuf)) { + if (ret == 0) + ret = -EFAULT; + break; + } *obuf = *ibuf; /* @@ -1593,6 +1597,8 @@ retry: */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + pipe_buf_mark_unmergeable(obuf); + obuf->len = len; opipe->nrbufs++; ibuf->offset += obuf->len; @@ -1656,7 +1662,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, * Get a reference to this pipe buffer, * so we can copy the contents over. */ - pipe_buf_get(ipipe, ibuf); + if (!pipe_buf_get(ipipe, ibuf)) { + if (ret == 0) + ret = -EFAULT; + break; + } obuf = opipe->bufs + nbuf; *obuf = *ibuf; @@ -1667,6 +1677,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + pipe_buf_mark_unmergeable(obuf); + if (obuf->len > len) obuf->len = len; diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 4844538eb926..c6f9b2225387 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -210,6 +210,38 @@ static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r) } /** + * inode_still_linked - check whether inode in question will be re-linked. + * @c: UBIFS file-system description object + * @rino: replay entry to test + * + * O_TMPFILE files can be re-linked, this means link count goes from 0 to 1. + * This case needs special care, otherwise all references to the inode will + * be removed upon the first replay entry of an inode with link count 0 + * is found. + */ +static bool inode_still_linked(struct ubifs_info *c, struct replay_entry *rino) +{ + struct replay_entry *r; + + ubifs_assert(c, rino->deletion); + ubifs_assert(c, key_type(c, &rino->key) == UBIFS_INO_KEY); + + /* + * Find the most recent entry for the inode behind @rino and check + * whether it is a deletion. + */ + list_for_each_entry_reverse(r, &c->replay_list, list) { + ubifs_assert(c, r->sqnum >= rino->sqnum); + if (key_inum(c, &r->key) == key_inum(c, &rino->key)) + return r->deletion == 0; + + } + + ubifs_assert(c, 0); + return false; +} + +/** * apply_replay_entry - apply a replay entry to the TNC. * @c: UBIFS file-system description object * @r: replay entry to apply @@ -236,6 +268,11 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) { ino_t inum = key_inum(c, &r->key); + if (inode_still_linked(c, r)) { + err = 0; + break; + } + err = ubifs_tnc_remove_ino(c, inum); break; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 5df554a9f9c9..ae796e10f68b 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1357,6 +1357,12 @@ reread: iinfo->i_alloc_type = le16_to_cpu(fe->icbTag.flags) & ICBTAG_FLAG_AD_MASK; + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_SHORT && + iinfo->i_alloc_type != ICBTAG_FLAG_AD_LONG && + iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + ret = -EIO; + goto out; + } iinfo->i_unique = 0; iinfo->i_lenEAttr = 0; iinfo->i_lenExtents = 0; diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index b647f0bd150c..94220ba85628 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -260,6 +260,9 @@ void udf_truncate_extents(struct inode *inode) epos.block = eloc; epos.bh = udf_tread(sb, udf_get_lb_pblock(sb, &eloc, 0)); + /* Error reading indirect block? */ + if (!epos.bh) + return; if (elen) indirect_ext_len = (elen + sb->s_blocksize - 1) >> diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 1fd3011ea623..7fd4802222b8 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -229,7 +229,7 @@ ufs_get_inode_gid(struct super_block *sb, struct ufs_inode *inode) case UFS_UID_44BSD: return fs32_to_cpu(sb, inode->ui_u3.ui_44.ui_gid); case UFS_UID_EFT: - if (inode->ui_u1.oldids.ui_suid == 0xFFFF) + if (inode->ui_u1.oldids.ui_sgid == 0xFFFF) return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_gid); /* Fall through */ default: diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index cd58939dc977..aaca81b5e119 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -630,6 +630,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, /* the various vma->vm_userfaultfd_ctx still points to it */ down_write(&mm->mmap_sem); + /* no task can run (and in turn coredump) yet */ + VM_WARN_ON(!mmget_still_valid(mm)); for (vma = mm->mmap; vma; vma = vma->vm_next) if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; @@ -736,10 +738,18 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, struct userfaultfd_ctx *ctx; ctx = vma->vm_userfaultfd_ctx.ctx; - if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) { + + if (!ctx) + return; + + if (ctx->features & UFFD_FEATURE_EVENT_REMAP) { vm_ctx->ctx = ctx; userfaultfd_ctx_get(ctx); WRITE_ONCE(ctx->mmap_changing, true); + } else { + /* Drop uffd context if remap feature not enabled */ + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); } } @@ -876,6 +886,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file) * taking the mmap_sem for writing. */ down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto skip_mm; prev = NULL; for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); @@ -898,6 +910,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } +skip_mm: up_write(&mm->mmap_sem); mmput(mm); wakeup: @@ -1326,6 +1339,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; @@ -1513,6 +1528,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; @@ -1566,7 +1583,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, cond_resched(); BUG_ON(!vma_can_userfault(vma)); - WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); /* * Nothing to do: this vma is already registered into this @@ -1575,6 +1591,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, if (!vma->vm_userfaultfd_ctx.ctx) goto skip; + WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); + if (vma->vm_start > start) start = vma->vm_start; vma_end = min(end, vma->vm_end); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 6fc5425b1474..2652d00842d6 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -243,7 +243,7 @@ xfs_attr3_leaf_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_attr_leafblock *leaf = bp->b_addr; struct xfs_attr_leaf_entry *entries; - uint16_t end; + uint32_t end; /* must be 32bit - see below */ int i; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); @@ -293,6 +293,11 @@ xfs_attr3_leaf_verify( /* * Quickly check the freemap information. Attribute data has to be * aligned to 4-byte boundaries, and likewise for the free space. + * + * Note that for 64k block size filesystems, the freemap entries cannot + * overflow as they are only be16 fields. However, when checking end + * pointer of the freemap, we have to be careful to detect overflows and + * so use uint32_t for those checks. */ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { if (ichdr.freemap[i].base > mp->m_attr_geo->blksize) @@ -303,7 +308,9 @@ xfs_attr3_leaf_verify( return __this_address; if (ichdr.freemap[i].size & 0x3) return __this_address; - end = ichdr.freemap[i].base + ichdr.freemap[i].size; + + /* be care of 16 bit overflows here */ + end = (uint32_t)ichdr.freemap[i].base + ichdr.freemap[i].size; if (end < ichdr.freemap[i].base) return __this_address; if (end > mp->m_attr_geo->blksize) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index a47670332326..3a496ffe6551 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1683,10 +1683,13 @@ xfs_bmap_add_extent_delay_real( case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Filling in all of a previously delayed allocation extent. - * The right neighbor is contiguous, the left is not. + * The right neighbor is contiguous, the left is not. Take care + * with delay -> unwritten extent allocation here because the + * delalloc record we are overwriting is always written. */ PREV.br_startblock = new->br_startblock; PREV.br_blockcount += RIGHT.br_blockcount; + PREV.br_state = new->br_state; xfs_iext_next(ifp, &bma->icur); xfs_iext_remove(bma->ip, &bma->icur, state); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 34c6d7bd4d18..bbdae2b4559f 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -330,7 +330,7 @@ xfs_btree_sblock_verify_crc( if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn))) - return __this_address; + return false; return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 49f5f5896a43..b697866946d2 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -449,6 +449,7 @@ xfs_map_blocks( } wpc->imap = imap; + xfs_trim_extent_eof(&wpc->imap, ip); trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap); return 0; allocate_blocks: @@ -459,6 +460,7 @@ allocate_blocks: ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF || imap.br_startoff + imap.br_blockcount <= cow_fsb); wpc->imap = imap; + xfs_trim_extent_eof(&wpc->imap, ip); trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap); return 0; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 6de8d90041ff..211b06e4702e 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1175,9 +1175,9 @@ xfs_free_file_space( * page could be mmap'd and iomap_zero_range doesn't do that for us. * Writeback of the eof page will do this, albeit clumsily. */ - if (offset + len >= XFS_ISIZE(ip) && ((offset + len) & PAGE_MASK)) { + if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) { error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - (offset + len) & ~PAGE_MASK, LLONG_MAX); + round_down(offset + len, PAGE_SIZE), LLONG_MAX); } return error; @@ -1824,6 +1824,12 @@ xfs_swap_extents( if (error) goto out_unlock; + if (xfs_inode_has_cow_data(tip)) { + error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true); + if (error) + return error; + } + /* * Extent "swapping" with rmap requires a permanent reservation and * a block reservation because it's really just a remap operation diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 12d8455bfbb2..010db5f8fb00 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1233,9 +1233,23 @@ xfs_buf_iodone( } /* - * Requeue a failed buffer for writeback + * Requeue a failed buffer for writeback. * - * Return true if the buffer has been re-queued properly, false otherwise + * We clear the log item failed state here as well, but we have to be careful + * about reference counts because the only active reference counts on the buffer + * may be the failed log items. Hence if we clear the log item failed state + * before queuing the buffer for IO we can release all active references to + * the buffer and free it, leading to use after free problems in + * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which + * order we process them in - the buffer is locked, and we own the buffer list + * so nothing on them is going to change while we are performing this action. + * + * Hence we can safely queue the buffer for IO before we clear the failed log + * item state, therefore always having an active reference to the buffer and + * avoiding the transient zero-reference state that leads to use-after-free. + * + * Return true if the buffer was added to the buffer list, false if it was + * already on the buffer list. */ bool xfs_buf_resubmit_failed_buffers( @@ -1243,16 +1257,16 @@ xfs_buf_resubmit_failed_buffers( struct list_head *buffer_list) { struct xfs_log_item *lip; + bool ret; + + ret = xfs_buf_delwri_queue(bp, buffer_list); /* - * Clear XFS_LI_FAILED flag from all items before resubmit - * - * XFS_LI_FAILED set/clear is protected by ail_lock, caller this + * XFS_LI_FAILED set/clear is protected by ail_lock, caller of this * function already have it acquired */ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) xfs_clear_li_failed(lip); - /* Add this buffer back to the delayed write list */ - return xfs_buf_delwri_queue(bp, buffer_list); + return ret; } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0ef5ece5634c..bad90479ade2 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1616,7 +1616,7 @@ xfs_ioc_getbmap( error = 0; out_free_buf: kmem_free(buf); - return 0; + return error; } struct getfsmap_info { diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 73a1d77ec187..3091e4bc04ef 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -40,7 +40,7 @@ xfs_fill_statvfs_from_dquot( statp->f_files = limit; statp->f_ffree = (statp->f_files > dqp->q_res_icount) ? - (statp->f_ffree - dqp->q_res_icount) : 0; + (statp->f_files - dqp->q_res_icount) : 0; } } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 42ea7bab9144..7088f44c0c59 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -302,6 +302,7 @@ xfs_reflink_reserve_cow( if (error) return error; + xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); trace_xfs_reflink_cow_alloc(ip, &got); return 0; } diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 4e4423153071..740ac9674848 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -119,7 +119,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v) int j; seq_printf(m, "qm"); - for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++) + for (j = XFSSTAT_END_REFCOUNT; j < XFSSTAT_END_XQMSTAT; j++) seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j)); seq_putc(m, '\n'); return 0; |