From b7ec40d7845bffca8bb3af2ea3f192d6257bbe21 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 12 Mar 2009 20:12:45 -0400 Subject: Btrfs: reduce stalls during transaction commit To avoid deadlocks and reduce latencies during some critical operations, some transaction writers are allowed to jump into the running transaction and make it run a little longer, while others sit around and wait for the commit to finish. This is a bit unfair, especially when the callers that jump in do a bunch of IO that makes all the others procs on the box wait. This commit reduces the stalls this produces by pre-reading file extent pointers during btrfs_finish_ordered_io before the transaction is joined. It also tunes the drop_snapshot code to politely wait for transactions that have started writing out their delayed refs to finish. This avoids new delayed refs being flooded into the queue while we're trying to close off the transaction. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7d4f948bc22a..13a17477c4f4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1502,6 +1502,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) struct btrfs_trans_handle *trans; struct btrfs_ordered_extent *ordered_extent; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_path *path; int compressed = 0; int ret; @@ -1509,6 +1510,23 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) if (!ret) return 0; + /* + * before we join the transaction, try to do some of our IO. + * This will limit the amount of IO that we have to do with + * the transaction running. We're unlikely to need to do any + * IO if the file extents are new, the disk_i_size checks + * covers the most common case. + */ + if (start < BTRFS_I(inode)->disk_i_size) { + path = btrfs_alloc_path(); + if (path) { + ret = btrfs_lookup_file_extent(NULL, root, path, + inode->i_ino, + start, 0); + btrfs_free_path(path); + } + } + trans = btrfs_join_transaction(root, 1); ordered_extent = btrfs_lookup_ordered_extent(inode, start); -- cgit v1.2.3 From 7f366cfecfc126731f8ac505d72026d691dac79a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 12 Mar 2009 20:12:45 -0400 Subject: Btrfs: reduce stack in cow_file_range The fs/btrfs/inode.c code to run delayed allocation during writout needed some stack usage optimization. This is the first pass, it does the check for compression earlier on, which allows us to do the common (no compression) case higher up in the call chain. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 13a17477c4f4..c427011dc453 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -204,7 +204,7 @@ fail: * does the checks required to make sure the data is small enough * to fit as an inline extent. */ -static int cow_file_range_inline(struct btrfs_trans_handle *trans, +static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, size_t compressed_size, @@ -854,11 +854,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, u64 cur_end; int limit = 10 * 1024 * 1042; - if (!btrfs_test_opt(root, COMPRESS)) { - return cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); - } - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC, 1, 0, GFP_NOFS); while (start < end) { @@ -935,7 +930,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root, * If no cow copies or snapshots exist, we write directly to the existing * blocks on disk */ -static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, +static noinline int run_delalloc_nocow(struct inode *inode, + struct page *locked_page, u64 start, u64 end, int *page_started, int force, unsigned long *nr_written) { @@ -1133,6 +1129,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, unsigned long *nr_written) { int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; if (btrfs_test_flag(inode, NODATACOW)) ret = run_delalloc_nocow(inode, locked_page, start, end, @@ -1140,10 +1137,12 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, else if (btrfs_test_flag(inode, PREALLOC)) ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); + else if (!btrfs_test_opt(root, COMPRESS)) + ret = cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); else ret = cow_file_range_async(inode, locked_page, start, end, page_started, nr_written); - return ret; } -- cgit v1.2.3 From b9473439d3e84d9fc1a0a83faca69cc1b7566341 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 13 Mar 2009 11:00:37 -0400 Subject: Btrfs: leave btree locks spinning more often btrfs_mark_buffer dirty would set dirty bits in the extent_io tree for the buffers it was dirtying. This may require a kmalloc and it was not atomic. So, anyone who called btrfs_mark_buffer_dirty had to set any btree locks they were holding to blocking first. This commit changes dirty tracking for extent buffers to just use a flag in the extent buffer. Now that we have one and only one extent buffer per page, this can be safely done without losing dirty bits along the way. This also introduces a path->leave_spinning flag that callers of btrfs_search_slot can use to indicate they will properly deal with a path returned where all the locks are spinning instead of blocking. Many of the btree search callers now expect spinning paths, resulting in better btree concurrency overall. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c427011dc453..b83a45dc717e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -134,6 +134,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; + path->leave_spinning = 1; btrfs_set_trans_block_group(trans, inode); key.objectid = inode->i_ino; @@ -167,9 +168,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_CACHE_SIZE); - kaddr = kmap(cpage); + kaddr = kmap_atomic(cpage, KM_USER0); write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap(cpage); + kunmap_atomic(kaddr, KM_USER0); i++; ptr += cur_size; @@ -1452,6 +1453,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); + path->leave_spinning = 1; ret = btrfs_drop_extents(trans, root, inode, file_pos, file_pos + num_bytes, file_pos, &hint); BUG_ON(ret); @@ -1474,6 +1476,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, fi, compression); btrfs_set_file_extent_encryption(leaf, fi, encryption); btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); + + btrfs_unlock_up_safe(path, 1); + btrfs_set_lock_blocking(leaf); + btrfs_mark_buffer_dirty(leaf); inode_add_bytes(inode, num_bytes); @@ -1486,8 +1492,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, root->root_key.objectid, trans->transid, inode->i_ino, &ins); BUG_ON(ret); - btrfs_free_path(path); + return 0; } @@ -2118,6 +2124,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); + path->leave_spinning = 1; ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, 1); if (ret) { @@ -2164,6 +2171,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto err; } + path->leave_spinning = 1; di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, name, name_len, -1); if (IS_ERR(di)) { @@ -2515,6 +2523,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, key.type = (u8)-1; search_again: + path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto error; @@ -2661,6 +2670,7 @@ delete: break; } if (found_extent) { + btrfs_set_path_blocking(path); ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, leaf->start, root_owner, @@ -3466,6 +3476,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, sizes[0] = sizeof(struct btrfs_inode_item); sizes[1] = name_len + sizeof(*ref); + path->leave_spinning = 1; ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); if (ret != 0) goto fail; -- cgit v1.2.3 From 5d13a98f3bf5afc1113f7db184c627a44659bc29 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 13 Mar 2009 11:41:46 -0400 Subject: Btrfs: readahead checksums during btrfs_finish_ordered_io This reads in blocks in the checksum btree before starting the transaction in btrfs_finish_ordered_io. It makes it much more likely we'll be able to do operations inside the transaction without needing any btree reads, which limits transaction latencies overall. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b83a45dc717e..9b4faac50c18 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1497,6 +1497,30 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, return 0; } +/* + * helper function for btrfs_finish_ordered_io, this + * just reads in some of the csum leaves to prime them into ram + * before we start the transaction. It limits the amount of btree + * reads required while inside the transaction. + */ +static noinline void reada_csum(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_ordered_extent *ordered_extent) +{ + struct btrfs_ordered_sum *sum; + u64 bytenr; + + sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum, + list); + bytenr = sum->sums[0].bytenr; + + /* + * we don't care about the results, the point of this search is + * just to get the btree leaves into ram + */ + btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0); +} + /* as ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. @@ -1505,7 +1529,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - struct btrfs_ordered_extent *ordered_extent; + struct btrfs_ordered_extent *ordered_extent = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_path *path; int compressed = 0; @@ -1528,13 +1552,20 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino, start, 0); + ordered_extent = btrfs_lookup_ordered_extent(inode, + start); + if (!list_empty(&ordered_extent->list)) { + btrfs_release_path(root, path); + reada_csum(root, path, ordered_extent); + } btrfs_free_path(path); } } trans = btrfs_join_transaction(root, 1); - ordered_extent = btrfs_lookup_ordered_extent(inode, start); + if (!ordered_extent) + ordered_extent = btrfs_lookup_ordered_extent(inode, start); BUG_ON(!ordered_extent); if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) goto nocow; -- cgit v1.2.3 From 12fcfd22fe5bf4fe74710232098bc101af497995 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 24 Mar 2009 10:24:20 -0400 Subject: Btrfs: tree logging unlink/rename fixes The tree logging code allows individual files or directories to be logged without including operations on other files and directories in the FS. It tries to commit the minimal set of changes to disk in order to fsync the single file or directory that was sent to fsync or O_SYNC. The tree logging code was allowing files and directories to be unlinked if they were part of a rename operation where only one directory in the rename was in the fsync log. This patch adds a few new rules to the tree logging. 1) on rename or unlink, if the inode being unlinked isn't in the fsync log, we must force a full commit before doing an fsync of the directory where the unlink was done. The commit isn't done during the unlink, but it is forced the next time we try to log the parent directory. Solution: record transid of last unlink/rename per directory when the directory wasn't already logged. For renames this is only done when renaming to a different directory. mkdir foo/some_dir normal commit rename foo/some_dir foo2/some_dir mkdir foo/some_dir fsync foo/some_dir/some_file The fsync above will unlink the original some_dir without recording it in its new location (foo2). After a crash, some_dir will be gone unless the fsync of some_file forces a full commit 2) we must log any new names for any file or dir that is in the fsync log. This way we make sure not to lose files that are unlinked during the same transaction. 2a) we must log any new names for any file or dir during rename when the directory they are being removed from was logged. 2a is actually the more important variant. Without the extra logging a crash might unlink the old name without recreating the new one 3) after a crash, we must go through any directories with a link count of zero and redo the rm -rf mkdir f1/foo normal commit rm -rf f1/foo fsync(f1) The directory f1 was fully removed from the FS, but fsync was never called on f1, only its parent dir. After a crash the rm -rf must be replayed. This must be able to recurse down the entire directory tree. The inode link count fixup code takes care of the ugly details. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9b4faac50c18..bffd79faffb5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2246,8 +2246,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, dir->i_ino); BUG_ON(ret != 0 && ret != -ENOENT); - if (ret != -ENOENT) - BTRFS_I(dir)->log_dirty_trans = trans->transid; ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); @@ -2280,6 +2278,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, dir); + + btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); + ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, dentry->d_name.name, dentry->d_name.len); @@ -3042,7 +3043,7 @@ static noinline void init_btrfs_i(struct inode *inode) bi->disk_i_size = 0; bi->flags = 0; bi->index_cnt = (u64)-1; - bi->log_dirty_trans = 0; + bi->last_unlink_trans = 0; extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping, GFP_NOFS); @@ -3786,6 +3787,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, drop_inode = 1; nr = trans->blocks_used; + + btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); btrfs_end_transaction_throttle(trans, root); fail: if (drop_inode) { @@ -4666,6 +4669,15 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, trans = btrfs_start_transaction(root, 1); + /* + * this is an ugly little race, but the rename is required to make + * sure that if we crash, the inode is either at the old name + * or the new one. pinning the log transaction lets us make sure + * we don't allow a log commit to come in after we unlink the + * name but before we add the new name back in. + */ + btrfs_pin_log_trans(root); + btrfs_set_trans_block_group(trans, new_dir); btrfs_inc_nlink(old_dentry->d_inode); @@ -4673,6 +4685,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dir->i_ctime = new_dir->i_mtime = ctime; old_inode->i_ctime = ctime; + if (old_dentry->d_parent != new_dentry->d_parent) + btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); + ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, old_dentry->d_name.name, old_dentry->d_name.len); @@ -4704,7 +4719,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret) goto out_fail; + btrfs_log_new_name(trans, old_inode, old_dir, + new_dentry->d_parent); out_fail: + + /* this btrfs_end_log_trans just allows the current + * log-sub transaction to complete + */ + btrfs_end_log_trans(root); btrfs_end_transaction_throttle(trans, root); out_unlock: return ret; -- cgit v1.2.3 From 5a3f23d515a2ebf0c750db80579ca57b28cbce6d Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 31 Mar 2009 13:27:11 -0400 Subject: Btrfs: add extra flushing for renames and truncates Renames and truncates are both common ways to replace old data with new data. The filesystem can make an effort to make sure the new data is on disk before actually replacing the old data. This is especially important for rename, which many application use as though it were atomic for both the data and the metadata involved. The current btrfs code will happily replace a file that is fully on disk with one that was just created and still has pending IO. If we crash after transaction commit but before the IO is done, we'll end up replacing a good file with a zero length file. The solution used here is to create a list of inodes that need special ordering and force them to disk before the commit is done. This is similar to the ext3 style data=ordering, except it is only done on selected files. Btrfs is able to get away with this because it does not wait on commits very often, even for fsync (which use a sub-commit). For renames, we order the file when it wasn't already on disk and when it is replacing an existing file. Larger files are sent to filemap_flush right away (before the transaction handle is opened). For truncates, we order if the file goes from non-zero size down to zero size. This is a little different, because at the time of the truncate the file has no dirty bytes to order. But, we flag the inode so that it is added to the ordered list on close (via release method). We also immediately add it to the ordered list of the current transaction so that we can try to flush down any writes the application sneaks in before commit. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 74 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bffd79faffb5..1cff528d5b51 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2907,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; - if (S_ISREG(inode->i_mode) && - attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { - err = btrfs_cont_expand(inode, attr->ia_size); - if (err) - return err; + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { + if (attr->ia_size > inode->i_size) { + err = btrfs_cont_expand(inode, attr->ia_size); + if (err) + return err; + } else if (inode->i_size > 0 && + attr->ia_size == 0) { + + /* we're truncating a file that used to have good + * data down to zero. Make sure it gets into + * the ordered flush list so that any new writes + * get down to disk quickly. + */ + BTRFS_I(inode)->ordered_data_close = 1; + } } err = inode_setattr(inode, attr); @@ -3050,6 +3060,7 @@ static noinline void init_btrfs_i(struct inode *inode) extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, inode->i_mapping, GFP_NOFS); INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); + INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); mutex_init(&BTRFS_I(inode)->extent_mutex); mutex_init(&BTRFS_I(inode)->log_mutex); @@ -4419,6 +4430,8 @@ again: } ClearPageChecked(page); set_page_dirty(page); + + BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: @@ -4444,6 +4457,27 @@ static void btrfs_truncate(struct inode *inode) btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); trans = btrfs_start_transaction(root, 1); + + /* + * setattr is responsible for setting the ordered_data_close flag, + * but that is only tested during the last file release. That + * could happen well after the next commit, leaving a great big + * window where new writes may get lost if someone chooses to write + * to this file after truncating to zero + * + * The inode doesn't have any dirty data here, and so if we commit + * this is a noop. If someone immediately starts writing to the inode + * it is very likely we'll catch some of their writes in this + * transaction, and the commit will find this file on the ordered + * data list with good things to send down. + * + * This is a best effort solution, there is still a window where + * using truncate to replace the contents of the file will + * end up with a zero length file after a crash. + */ + if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) + btrfs_add_ordered_operation(trans, root, inode); + btrfs_set_trans_block_group(trans, inode); btrfs_i_size_write(inode, inode->i_size); @@ -4520,12 +4554,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->i_acl = BTRFS_ACL_NOT_CACHED; ei->i_default_acl = BTRFS_ACL_NOT_CACHED; INIT_LIST_HEAD(&ei->i_orphan); + INIT_LIST_HEAD(&ei->ordered_operations); return &ei->vfs_inode; } void btrfs_destroy_inode(struct inode *inode) { struct btrfs_ordered_extent *ordered; + struct btrfs_root *root = BTRFS_I(inode)->root; + WARN_ON(!list_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); @@ -4536,13 +4573,24 @@ void btrfs_destroy_inode(struct inode *inode) BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) posix_acl_release(BTRFS_I(inode)->i_default_acl); - spin_lock(&BTRFS_I(inode)->root->list_lock); + /* + * Make sure we're properly removed from the ordered operation + * lists. + */ + smp_mb(); + if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { + spin_lock(&root->fs_info->ordered_extent_lock); + list_del_init(&BTRFS_I(inode)->ordered_operations); + spin_unlock(&root->fs_info->ordered_extent_lock); + } + + spin_lock(&root->list_lock); if (!list_empty(&BTRFS_I(inode)->i_orphan)) { printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" " list\n", inode->i_ino); dump_stack(); } - spin_unlock(&BTRFS_I(inode)->root->list_lock); + spin_unlock(&root->list_lock); while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); @@ -4667,8 +4715,27 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret) goto out_unlock; + /* + * we're using rename to replace one file with another. + * and the replacement file is large. Start IO on it now so + * we don't add too much work to the end of the transaction + */ + if (new_inode && old_inode && S_ISREG(old_inode->i_mode) && + new_inode->i_size && + old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + filemap_flush(old_inode->i_mapping); + trans = btrfs_start_transaction(root, 1); + /* + * make sure the inode gets flushed if it is replacing + * something. + */ + if (new_inode && new_inode->i_size && + old_inode && S_ISREG(old_inode->i_mode)) { + btrfs_add_ordered_operation(trans, root, old_inode); + } + /* * this is an ugly little race, but the rename is required to make * sure that if we crash, the inode is either at the old name -- cgit v1.2.3 From c2ec175c39f62949438354f603f4aa170846aabb Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 31 Mar 2009 15:23:21 -0700 Subject: mm: page_mkwrite change prototype to match fault Change the page_mkwrite prototype to take a struct vm_fault, and return VM_FAULT_xxx flags. There should be no functional change. This makes it possible to return much more detailed error information to the VM (and also can provide more information eg. virtual_address to the driver, which might be important in some special cases). This is required for a subsequent fix. And will also make it easier to merge page_mkwrite() with fault() in future. Signed-off-by: Nick Piggin Cc: Chris Mason Cc: Trond Myklebust Cc: Miklos Szeredi Cc: Steven Whitehouse Cc: Mark Fasheh Cc: Joel Becker Cc: Artem Bityutskiy Cc: Felix Blyakher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7d4f948bc22a..ec5423790bbb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4292,8 +4292,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. */ -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = fdentry(vma->vm_file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -4362,6 +4363,8 @@ again: out_unlock: unlock_page(page); out: + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } -- cgit v1.2.3 From 56a76f8275c379ed73c8a43cfa1dfa2f5e9cfa19 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 31 Mar 2009 15:23:23 -0700 Subject: fs: fix page_mkwrite error cases in core code and btrfs page_mkwrite is called with neither the page lock nor the ptl held. This means a page can be concurrently truncated or invalidated out from underneath it. Callers are supposed to prevent truncate races themselves, however previously the only thing they can do in case they hit one is to raise a SIGBUS. A sigbus is wrong for the case that the page has been invalidated or truncated within i_size (eg. hole punched). Callers may also have to perform memory allocations in this path, where again, SIGBUS would be wrong. The previous patch ("mm: page_mkwrite change prototype to match fault") made it possible to properly specify errors. Convert the generic buffer.c code and btrfs to return sane error values (in the case of page removed from pagecache, VM_FAULT_NOPAGE will cause the fault handler to exit without doing anything, and the fault will be retried properly). This fixes core code, and converts btrfs as a template/example. All other filesystems defining their own page_mkwrite should be fixed in a similar manner. Acked-by: Chris Mason Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/inode.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ec5423790bbb..17e608c4dc70 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4307,10 +4307,15 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_end; ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); - if (ret) + if (ret) { + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else /* -ENOSPC, -EIO, etc */ + ret = VM_FAULT_SIGBUS; goto out; + } - ret = -EINVAL; + ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ again: lock_page(page); size = i_size_read(inode); @@ -4363,8 +4368,6 @@ again: out_unlock: unlock_page(page); out: - if (ret) - ret = VM_FAULT_SIGBUS; return ret; } -- cgit v1.2.3 From 09771430f3b46ee27c69daa7ecad82007568e834 Mon Sep 17 00:00:00 2001 From: Shen Feng Date: Thu, 2 Apr 2009 16:46:06 -0400 Subject: Btrfs: free inode struct when btrfs_new_inode fails btrfs_new_inode doesn't call iput to free the inode when it fails. Signed-off-by: Shen Feng Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1cff528d5b51..0ecb3fa75dc3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3481,8 +3481,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (dir) { ret = btrfs_set_inode_index(dir, index); - if (ret) + if (ret) { + iput(inode); return ERR_PTR(ret); + } } /* * index_cnt is ignored for everything but a dir, @@ -3565,6 +3567,7 @@ fail: if (dir) BTRFS_I(dir)->index_cnt--; btrfs_free_path(path); + iput(inode); return ERR_PTR(ret); } -- cgit v1.2.3