From 8ca78f3eda4bf1799e8c4ba02035623fd7a347df Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 27 Jun 2012 15:05:48 +0200 Subject: Btrfs: avoid waiting for delayed refs when we must not We track two conditions to decide if we should sleep while waiting for more delayed refs, the number of delayed refs (num_refs) and the first entry in the list of blockers (first_seq). When we suspect staleness, we save num_refs and do one more cycle. If nothing changes, we then save first_seq for later comparison and do wait_event. We ought to save first_seq the very same moment we're saving num_refs. Otherwise we cannot be sure that nothing has changed and we might start waiting when we shouldn't, which could lead to starvation. Signed-off-by: Jan Schmidt --- fs/btrfs/extent-tree.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4b5a1e1bdefb..6e1d36702ff7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2347,12 +2347,10 @@ next: return count; } - static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, - unsigned long num_refs) + unsigned long num_refs, + struct list_head *first_seq) { - struct list_head *first_seq = delayed_refs->seq_head.next; - spin_unlock(&delayed_refs->lock); pr_debug("waiting for more refs (num %ld, first %p)\n", num_refs, first_seq); @@ -2381,6 +2379,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct list_head cluster; + struct list_head *first_seq = NULL; int ret; u64 delayed_start; int run_all = count == (unsigned long)-1; @@ -2436,8 +2435,10 @@ again: */ consider_waiting = 1; num_refs = delayed_refs->num_entries; + first_seq = root->fs_info->tree_mod_seq_list.next; } else { - wait_for_more_refs(delayed_refs, num_refs); + wait_for_more_refs(delayed_refs, + num_refs, first_seq); /* * after waiting, things have changed. we * dropped the lock and someone else might have -- cgit v1.2.3 From 097b8a7c9e48e2cb50fd0eb9315791921beaf484 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 21 Jun 2012 11:08:04 +0200 Subject: Btrfs: join tree mod log code with the code holding back delayed refs We've got two mechanisms both required for reliable backref resolving (tree mod log and holding back delayed refs). You cannot make use of one without the other. So instead of requiring the user of this mechanism to setup both correctly, we join them into a single interface. Additionally, we stop inserting non-blockers into fs_info->tree_mod_seq_list as we did before, which was of no value. Signed-off-by: Jan Schmidt --- fs/btrfs/extent-tree.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6e1d36702ff7..94ce79f76e5f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2217,6 +2217,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *locked_ref = NULL; struct btrfs_delayed_extent_op *extent_op; + struct btrfs_fs_info *fs_info = root->fs_info; int ret; int count = 0; int must_insert_reserved = 0; @@ -2255,7 +2256,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref = select_delayed_ref(locked_ref); if (ref && ref->seq && - btrfs_check_delayed_seq(delayed_refs, ref->seq)) { + btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { /* * there are still refs with lower seq numbers in the * process of being added. Don't run this ref yet. @@ -2337,7 +2338,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } next: - do_chunk_alloc(trans, root->fs_info->extent_root, + do_chunk_alloc(trans, fs_info->extent_root, 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), CHUNK_ALLOC_NO_FORCE); @@ -2347,18 +2348,19 @@ next: return count; } -static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, +static void wait_for_more_refs(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, unsigned long num_refs, struct list_head *first_seq) { spin_unlock(&delayed_refs->lock); pr_debug("waiting for more refs (num %ld, first %p)\n", num_refs, first_seq); - wait_event(delayed_refs->seq_wait, + wait_event(fs_info->tree_mod_seq_wait, num_refs != delayed_refs->num_entries || - delayed_refs->seq_head.next != first_seq); + fs_info->tree_mod_seq_list.next != first_seq); pr_debug("done waiting for more refs (num %ld, first %p)\n", - delayed_refs->num_entries, delayed_refs->seq_head.next); + delayed_refs->num_entries, fs_info->tree_mod_seq_list.next); spin_lock(&delayed_refs->lock); } @@ -2403,6 +2405,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, again: consider_waiting = 0; spin_lock(&delayed_refs->lock); + if (count == 0) { count = delayed_refs->num_entries * 2; run_most = 1; @@ -2437,7 +2440,7 @@ again: num_refs = delayed_refs->num_entries; first_seq = root->fs_info->tree_mod_seq_list.next; } else { - wait_for_more_refs(delayed_refs, + wait_for_more_refs(root->fs_info, delayed_refs, num_refs, first_seq); /* * after waiting, things have changed. we @@ -5190,8 +5193,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, rb_erase(&head->node.rb_node, &delayed_refs->root); delayed_refs->num_entries--; - if (waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); + if (waitqueue_active(&root->fs_info->tree_mod_seq_wait)) + wake_up(&root->fs_info->tree_mod_seq_wait); /* * we don't take a ref on the node because we're removing it from the -- cgit v1.2.3 From 709c0486b9fe9586736b108b7233bbce0300cfa5 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Mon, 12 Sep 2011 12:22:57 +0200 Subject: Btrfs: Test code to change the order of delayed-ref processing Normally delayed refs get processed in ascending bytenr order. This correlates in most cases to the order added. To expose dependencies on this order, we start to process the tree in the middle instead of the beginning. This code is only effective when SCRAMBLE_DELAYED_REFS is defined. Signed-off-by: Arne Jansen --- fs/btrfs/extent-tree.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 94ce79f76e5f..b13f1fbc3733 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -34,6 +34,8 @@ #include "locking.h" #include "free-space-cache.h" +#undef SCRAMBLE_DELAYED_REFS + /* * control flags for do_chunk_alloc's force field * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk @@ -2364,6 +2366,49 @@ static void wait_for_more_refs(struct btrfs_fs_info *fs_info, spin_lock(&delayed_refs->lock); } +#ifdef SCRAMBLE_DELAYED_REFS +/* + * Normally delayed refs get processed in ascending bytenr order. This + * correlates in most cases to the order added. To expose dependencies on this + * order, we start to process the tree in the middle instead of the beginning + */ +static u64 find_middle(struct rb_root *root) +{ + struct rb_node *n = root->rb_node; + struct btrfs_delayed_ref_node *entry; + int alt = 1; + u64 middle; + u64 first = 0, last = 0; + + n = rb_first(root); + if (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + first = entry->bytenr; + } + n = rb_last(root); + if (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + last = entry->bytenr; + } + n = root->rb_node; + + while (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + WARN_ON(!entry->in_tree); + + middle = entry->bytenr; + + if (alt) + n = n->rb_left; + else + n = n->rb_right; + + alt = 1 - alt; + } + return middle; +} +#endif + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -2406,6 +2451,10 @@ again: consider_waiting = 0; spin_lock(&delayed_refs->lock); +#ifdef SCRAMBLE_DELAYED_REFS + delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); +#endif + if (count == 0) { count = delayed_refs->num_entries * 2; run_most = 1; -- cgit v1.2.3 From bed92eae26ccf280d1a2168b7509447b56675a27 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Thu, 28 Jun 2012 18:03:02 +0200 Subject: Btrfs: qgroup implementation and prototypes Signed-off-by: Arne Jansen Signed-off-by: Jan Schmidt --- fs/btrfs/extent-tree.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b13f1fbc3733..1a63b830846d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2409,6 +2409,40 @@ static u64 find_middle(struct rb_root *root) } #endif +int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct qgroup_update *qgroup_update; + int ret = 0; + + if (list_empty(&trans->qgroup_ref_list) != + !trans->delayed_ref_elem.seq) { + /* list without seq or seq without list */ + printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n", + list_empty(&trans->qgroup_ref_list) ? "" : " not", + trans->delayed_ref_elem.seq); + BUG(); + } + + if (!trans->delayed_ref_elem.seq) + return 0; + + while (!list_empty(&trans->qgroup_ref_list)) { + qgroup_update = list_first_entry(&trans->qgroup_ref_list, + struct qgroup_update, list); + list_del(&qgroup_update->list); + if (!ret) + ret = btrfs_qgroup_account_ref( + trans, fs_info, qgroup_update->node, + qgroup_update->extent_op); + kfree(qgroup_update); + } + + btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem); + + return ret; +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be -- cgit v1.2.3 From edf39272db4810282360f7362d43ade1d524c913 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 28 Jun 2012 18:04:55 +0200 Subject: Btrfs: call the qgroup accounting functions Signed-off-by: Jan Schmidt --- fs/btrfs/extent-tree.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1a63b830846d..c08337a83ace 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2479,6 +2479,8 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), CHUNK_ALLOC_NO_FORCE); + btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); + delayed_refs = &trans->transaction->delayed_refs; INIT_LIST_HEAD(&cluster); again: @@ -2588,6 +2590,7 @@ again: } out: spin_unlock(&delayed_refs->lock); + assert_qgroups_uptodate(trans); return 0; } -- cgit v1.2.3 From c556723794b3487a79de1ecd6354975b1389f5ff Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Wed, 14 Sep 2011 15:44:05 +0200 Subject: Btrfs: hooks to reserve qgroup space Like block reserves, reserve a small piece of space on each transaction start and for delalloc. These are the hooks that can actually return EDQUOT to the user. The amount of space reserved is tracked in the transaction handle. Signed-off-by: Arne Jansen --- fs/btrfs/extent-tree.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c08337a83ace..2ce16f97730a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4565,6 +4565,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) csum_bytes = BTRFS_I(inode)->csum_bytes; spin_unlock(&BTRFS_I(inode)->lock); + if (root->fs_info->quota_enabled) { + ret = btrfs_qgroup_reserve(root, num_bytes + + nr_extents * root->leafsize); + if (ret) + return ret; + } + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (ret) { u64 to_free = 0; @@ -4643,6 +4650,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_free, 0); + if (root->fs_info->quota_enabled) { + btrfs_qgroup_free(root, num_bytes + + dropped * root->leafsize); + } + btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, to_free); } -- cgit v1.2.3 From 96c3f4331a8c1cd0a58307e4ac7e73e09d7dab23 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 21 Jun 2012 14:05:49 -0400 Subject: Btrfs: flush delayed inodes if we're short on space Those crazy gentoo guys have been complaining about ENOSPC errors on their portage volumes. This is because doing things like untar tends to create lots of new files which will soak up all the reservation space in the delayed inodes. Usually this gets papered over by the fact that we will try and commit the transaction, however if this happens in the wrong spot or we choose not to commit the transaction you will be screwed. So add the ability to expclitly flush delayed inodes to free up space. Please test this out guys to make sure it works since as usual I cannot reproduce. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 97 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 62 insertions(+), 35 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6e1d36702ff7..3cde907a25a5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3728,6 +3728,60 @@ commit: return btrfs_commit_transaction(trans, root); } +enum flush_state { + FLUSH_DELALLOC = 1, + FLUSH_DELALLOC_WAIT = 2, + FLUSH_DELAYED_ITEMS_NR = 3, + FLUSH_DELAYED_ITEMS = 4, + COMMIT_TRANS = 5, +}; + +static int flush_space(struct btrfs_root *root, + struct btrfs_space_info *space_info, u64 num_bytes, + u64 orig_bytes, int state) +{ + struct btrfs_trans_handle *trans; + int nr; + int ret; + + switch (state) { + case FLUSH_DELALLOC: + case FLUSH_DELALLOC_WAIT: + ret = shrink_delalloc(root, num_bytes, + state == FLUSH_DELALLOC_WAIT); + if (ret > 0) + ret = 0; + break; + case FLUSH_DELAYED_ITEMS_NR: + case FLUSH_DELAYED_ITEMS: + if (state == FLUSH_DELAYED_ITEMS_NR) { + u64 bytes = btrfs_calc_trans_metadata_size(root, 1); + + nr = (int)div64_u64(num_bytes, bytes); + if (!nr) + nr = 1; + nr *= 2; + } else { + nr = -1; + } + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = btrfs_run_delayed_items_nr(trans, root, nr); + btrfs_end_transaction(trans, root); + break; + case COMMIT_TRANS: + ret = may_commit_transaction(root, space_info, orig_bytes, 0); + break; + default: + ret = -ENOSPC; + break; + } + + return ret; +} /** * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space * @root - the root we're allocating for @@ -3749,11 +3803,10 @@ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_space_info *space_info = block_rsv->space_info; u64 used; u64 num_bytes = orig_bytes; - int retries = 0; + int flush_state = FLUSH_DELALLOC; int ret = 0; - bool committed = false; bool flushing = false; - bool wait_ordered = false; + bool committed = false; again: ret = 0; @@ -3812,9 +3865,8 @@ again: * amount plus the amount of bytes that we need for this * reservation. */ - wait_ordered = true; num_bytes = used - space_info->total_bytes + - (orig_bytes * (retries + 1)); + (orig_bytes * 2); } if (ret) { @@ -3867,8 +3919,6 @@ again: trace_btrfs_space_reservation(root->fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; - } else { - wait_ordered = true; } } @@ -3887,36 +3937,13 @@ again: if (!ret || !flush) goto out; - /* - * We do synchronous shrinking since we don't actually unreserve - * metadata until after the IO is completed. - */ - ret = shrink_delalloc(root, num_bytes, wait_ordered); - if (ret < 0) - goto out; - - ret = 0; - - /* - * So if we were overcommitted it's possible that somebody else flushed - * out enough space and we simply didn't have enough space to reclaim, - * so go back around and try again. - */ - if (retries < 2) { - wait_ordered = true; - retries++; + ret = flush_space(root, space_info, num_bytes, orig_bytes, + flush_state); + flush_state++; + if (!ret) goto again; - } - - ret = -ENOSPC; - if (committed) - goto out; - - ret = may_commit_transaction(root, space_info, orig_bytes, 0); - if (!ret) { - committed = true; + else if (flush_state <= COMMIT_TRANS) goto again; - } out: if (flushing) { -- cgit v1.2.3 From 0e721106923be82f651dd0ee504742a8a3eb089f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Jun 2012 16:13:18 -0400 Subject: Btrfs: change how we indicate we're adding csums There is weird logic I had to put in place to make sure that when we were adding csums that we'd used the delalloc block rsv instead of the global block rsv. Part of this meant that we had to free up our transaction reservation before we ran the delayed refs since csum deletion happens during the delayed ref work. The problem with this is that when we release a reservation we will add it to the global reserve if it is not full in order to keep us going along longer before we have to force a transaction commit. By releasing our reservation before we run delayed refs we don't get the opportunity to drain down the global reserve for the work we did, so we won't refill it as often. This isn't a problem per-se, it just results in us possibly committing transactions more and more often, and in rare cases could cause those WARN_ON()'s to pop in use_block_rsv because we ran out of space in our block rsv. This also helps us by holding onto space while the delayed refs run so we don't end up with as many people trying to do things at the same time, which again will help us not force commits or hit the use_block_rsv warnings. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3cde907a25a5..ec0328bb86db 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3961,7 +3961,10 @@ static struct btrfs_block_rsv *get_block_rsv( { struct btrfs_block_rsv *block_rsv = NULL; - if (root->ref_cows || root == root->fs_info->csum_root) + if (root->ref_cows) + block_rsv = trans->block_rsv; + + if (root == root->fs_info->csum_root && trans->adding_csums) block_rsv = trans->block_rsv; if (!block_rsv) @@ -4313,6 +4316,9 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + if (!trans->block_rsv) + return; + if (!trans->bytes_reserved) return; -- cgit v1.2.3 From f4c738c2e7bc6d696b0d60155df7ea01684962b6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 2 Jul 2012 17:10:51 -0400 Subject: Btrfs: rework shrink_delalloc So shrink_delalloc has grown all sorts of cruft over the years thanks to many reworkings of how we track enospc. What happens now as we fill up the disk is we will loop for freaking ever hoping to reclaim a arbitrary amount of space of metadata, this was from when everybody flushed at the same time. Now we only have people flushing one at a time. So instead of trying to reclaim a huge amount of space, just try to flush a decent chunk of space, and stop looping as soon as we have enough free space to satisfy our reservation. This makes xfstests 224 go much faster. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 81 +++++++++++++++----------------------------------- 1 file changed, 24 insertions(+), 57 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ec0328bb86db..5e552f9cc5be 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3586,89 +3586,58 @@ out: /* * shrink metadata reservation for delalloc */ -static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, - bool wait_ordered) +static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, + bool wait_ordered) { struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; - u64 reserved; + u64 delalloc_bytes; u64 max_reclaim; - u64 reclaimed = 0; long time_left; unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; int loops = 0; - unsigned long progress; trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; smp_mb(); - reserved = space_info->bytes_may_use; - progress = space_info->reservation_progress; - - if (reserved == 0) - return 0; - - smp_mb(); - if (root->fs_info->delalloc_bytes == 0) { + delalloc_bytes = root->fs_info->delalloc_bytes; + if (delalloc_bytes == 0) { if (trans) - return 0; + return; btrfs_wait_ordered_extents(root, 0, 0); - return 0; + return; } - max_reclaim = min(reserved, to_reclaim); - nr_pages = max_t(unsigned long, nr_pages, - max_reclaim >> PAGE_CACHE_SHIFT); - while (loops < 1024) { - /* have the flusher threads jump in and do some IO */ - smp_mb(); - nr_pages = min_t(unsigned long, nr_pages, - root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); + while (delalloc_bytes && loops < 3) { + max_reclaim = min(delalloc_bytes, to_reclaim); + nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, - WB_REASON_FS_FREE_SPACE); + WB_REASON_FS_FREE_SPACE); spin_lock(&space_info->lock); - if (reserved > space_info->bytes_may_use) - reclaimed += reserved - space_info->bytes_may_use; - reserved = space_info->bytes_may_use; + if (space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use + orig <= + space_info->total_bytes) { + spin_unlock(&space_info->lock); + break; + } spin_unlock(&space_info->lock); loops++; - - if (reserved == 0 || reclaimed >= max_reclaim) - break; - - if (trans && trans->transaction->blocked) - return -EAGAIN; - if (wait_ordered && !trans) { btrfs_wait_ordered_extents(root, 0, 0); } else { - time_left = schedule_timeout_interruptible(1); - - /* We were interrupted, exit */ + time_left = schedule_timeout_killable(1); if (time_left) break; } - - /* we've kicked the IO a few times, if anything has been freed, - * exit. There is no sense in looping here for a long time - * when we really need to commit the transaction, or there are - * just too many writers without enough free space - */ - - if (loops > 3) { - smp_mb(); - if (progress != space_info->reservation_progress) - break; - } - + smp_mb(); + delalloc_bytes = root->fs_info->delalloc_bytes; } - - return reclaimed >= to_reclaim; } /** @@ -3742,15 +3711,13 @@ static int flush_space(struct btrfs_root *root, { struct btrfs_trans_handle *trans; int nr; - int ret; + int ret = 0; switch (state) { case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - ret = shrink_delalloc(root, num_bytes, - state == FLUSH_DELALLOC_WAIT); - if (ret > 0) - ret = 0; + shrink_delalloc(root, num_bytes, orig_bytes, + state == FLUSH_DELALLOC_WAIT); break; case FLUSH_DELAYED_ITEMS_NR: case FLUSH_DELAYED_ITEMS: -- cgit v1.2.3 From 83eea1f1bacd5dc7b44dcf84f5fdca54fdea5453 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 10 Jul 2012 05:28:39 -0600 Subject: Btrfs: kill root from btrfs_is_free_space_inode Since root can be fetched via BTRFS_I macro directly, we can save an args for btrfs_is_free_space_inode(). Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5e552f9cc5be..d1ebd2a06116 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4444,7 +4444,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) int ret; /* Need to be holding the i_mutex here if we aren't free space cache */ - if (btrfs_is_free_space_inode(root, inode)) + if (btrfs_is_free_space_inode(inode)) flush = 0; if (flush && btrfs_transaction_in_commit(root->fs_info)) -- cgit v1.2.3 From 067893842341e7b7487062367ecfaa46c97505e0 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 6 Jul 2012 03:31:33 -0600 Subject: Btrfs: do not abort transaction in prealloc case During disk balance, we prealloc new file extent for file data relocation, but we may fail in 'no available space' case, and it leads to flipping btrfs into readonly. It is not necessary to bail out and abort transaction since we do have several ways to rescue ourselves from ENOSPC case. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d1ebd2a06116..67bd12a52369 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5748,7 +5748,11 @@ loop: ret = do_chunk_alloc(trans, root, num_bytes + 2 * 1024 * 1024, data, CHUNK_ALLOC_LIMITED); - if (ret < 0) { + /* + * Do not bail out on ENOSPC since we + * can do more things. + */ + if (ret < 0 && ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); goto out; -- cgit v1.2.3 From cf7c1ef6e1fe05864369f59dd516e816b11de7d0 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 6 Jul 2012 03:31:34 -0600 Subject: Btrfs: fix a bug of writting free space cache during balance Here is the whole story: 1) A free space cache consists of two parts: o free space cache inode, which is special becase it's stored in root tree. o free space info, which is stored as the above inode's file data. But we only build up another new inode and does not flush its free space info onto disk when we _clear and setup_ free space cache, and this ends up with that the block group cache's cache_state remains DC_SETUP instead of DC_WRITTEN. And holding DC_SETUP means that we will not truncate this free space cache inode, which means the disk offset of its file extent will remain _unchanged_ at least until next transaction finishes committing itself. 2) We can set a block group readonly when we relocate the block group. However, if the readonly block group covers the disk offset where our free space cache inode is going to write, it will force the free space cache inode into cow_file_range() and it'll end up hitting a BUG_ON. 3) Due to the above analysis, we fix this bug by adding the missing dirty flag. 4) However, it's not over, there is still another case, nospace_cache. With nospace_cache, we do not want to set dirty flag, instead we just truncate free space cache inode and bail out with setting cache state DC_WRITTEN. We can benifit from it since it saves us another 'pre-allocation' part which usually costs a lot. Signed-off-by: Liu Bo Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 67bd12a52369..3ca26d84cce5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2903,8 +2903,13 @@ again: } spin_lock(&block_group->lock); - if (block_group->cached != BTRFS_CACHE_FINISHED) { - /* We're not cached, don't bother trying to write stuff out */ + if (block_group->cached != BTRFS_CACHE_FINISHED || + !btrfs_test_opt(root, SPACE_CACHE)) { + /* + * don't bother trying to write stuff out _if_ + * a) we're not cached, + * b) we're with nospace_cache mount option. + */ dcs = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); goto out_put; @@ -7614,8 +7619,21 @@ int btrfs_read_block_groups(struct btrfs_root *root) INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); - if (need_clear) + if (need_clear) { + /* + * When we mount with old space cache, we need to + * set BTRFS_DC_CLEAR and set dirty flag. + * + * a) Setting 'BTRFS_DC_CLEAR' makes sure that we + * truncate the old free space cache inode and + * setup a new one. + * b) Setting 'dirty flag' makes sure that we flush + * the new space cache info onto disk. + */ cache->disk_cache_state = BTRFS_DC_CLEAR; + if (btrfs_test_opt(root, SPACE_CACHE)) + cache->dirty = 1; + } read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), -- cgit v1.2.3 From 799ffc3c31de57d10a4b9abcfbfeea8771acc976 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 6 Jul 2012 03:31:35 -0600 Subject: Btrfs: add ro notification to dump_space_info Block group has ro attributes, make dump_space_info show it. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3ca26d84cce5..7843542484c9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5825,13 +5825,13 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, again: list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); - printk(KERN_INFO "block group %llu has %llu bytes, %llu used " - "%llu pinned %llu reserved\n", + printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n", (unsigned long long)cache->key.objectid, (unsigned long long)cache->key.offset, (unsigned long long)btrfs_block_group_used(&cache->item), (unsigned long long)cache->pinned, - (unsigned long long)cache->reserved); + (unsigned long long)cache->reserved, + cache->ro ? "[readonly]" : ""); btrfs_dump_free_space(cache, bytes); spin_unlock(&cache->lock); } -- cgit v1.2.3 From b4d7c3c9456a311a45bc1ef8944b5ba5b176244f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 9 Jul 2012 20:21:07 -0600 Subject: Btrfs: kill free_space pointer from inode structure Inodes always allocate free space with BTRFS_BLOCK_GROUP_DATA type, which means every inode has the same BTRFS_I(inode)->free_space pointer. This shrinks struct btrfs_inode by 4 bytes (or 8 bytes on 64 bits). Signed-off-by: Li Zefan --- fs/btrfs/extent-tree.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7843542484c9..6621ed72f3c3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3139,6 +3139,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, init_waitqueue_head(&found->wait); *space_info = found; list_add_rcu(&found->list, &info->space_info); + if (flags & BTRFS_BLOCK_GROUP_DATA) + info->data_sinfo = found; return 0; } @@ -3268,12 +3270,6 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) return get_alloc_profile(root, flags); } -void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) -{ - BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, - BTRFS_BLOCK_GROUP_DATA); -} - /* * This will check the space that the inode allocates from to make sure we have * enough space for bytes. @@ -3282,6 +3278,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) { struct btrfs_space_info *data_sinfo; struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = root->fs_info; u64 used; int ret = 0, committed = 0, alloc_chunk = 1; @@ -3294,7 +3291,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) committed = 1; } - data_sinfo = BTRFS_I(inode)->space_info; + data_sinfo = fs_info->data_sinfo; if (!data_sinfo) goto alloc; @@ -3335,10 +3332,9 @@ alloc: goto commit_trans; } - if (!data_sinfo) { - btrfs_set_inode_space_info(root, inode); - data_sinfo = BTRFS_I(inode)->space_info; - } + if (!data_sinfo) + data_sinfo = fs_info->data_sinfo; + goto again; } @@ -3385,7 +3381,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) /* make sure bytes are sectorsize aligned */ bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); - data_sinfo = BTRFS_I(inode)->space_info; + data_sinfo = root->fs_info->data_sinfo; spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", -- cgit v1.2.3 From df57dbe6bf73cc44305d81c24982a11da49b1f79 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 23 Jul 2012 05:50:03 -0600 Subject: Btrfs: make btrfs's allocation smoothly with preallocation For backref walking, we've introduce delayed ref's sequence. However, it changes our preallocation behavior. The story is that when we preallocate an extent and then mark it written piece by piece, the ideal case should be that we don't need to COW the extent, which is why we use 'preallocate'. But we may not make use of preallocation, since when we check for cross refs on the extent, we may have two ref entries which have the same content except the sequence value, and we recognize them as cross refs and do COW to allocate another extent. So we end up with several pieces of space instead of an whole extent. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6621ed72f3c3..71b2d1c7da69 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2581,8 +2581,10 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, node = rb_prev(node); if (node) { + int seq = ref->seq; + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr == bytenr) + if (ref->bytenr == bytenr && ref->seq == seq) goto out_unlock; } -- cgit v1.2.3 From cd1cfc49153ba2bef247e500d8bd4d135193ece9 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 25 Jul 2012 16:03:32 -0400 Subject: Btrfs: add a barrier before a waitqueue_active check We were missing wakeups on the delayed ref waitqueue due to races on waitqueue_active. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 44f06201f376..4e1b153b7c47 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5294,6 +5294,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, rb_erase(&head->node.rb_node, &delayed_refs->root); delayed_refs->num_entries--; + smp_mb(); if (waitqueue_active(&root->fs_info->tree_mod_seq_wait)) wake_up(&root->fs_info->tree_mod_seq_wait); -- cgit v1.2.3