From a4abeea41adfa3c143c289045f4625dfaeba2212 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Mon, 11 Apr 2011 17:25:13 -0400
Subject: Btrfs: kill trans_mutex

We use trans_mutex for lots of things; here's the basic list:

1) To serialize trans_handles joining the currently running transaction
2) To make sure that no new trans handles are started while we are committing
3) To protect the dead_roots list and the transaction lists

Serializing trans_handles joining the running transaction is not too hard, but
it can get bogged down in acquiring a reference to the transaction.  So replace
the trans_mutex with a trans_lock spinlock and use it to do the following:

1) Protect fs_info->running_transaction.  All a trans handle has to do is check
   this, take a reference of the transaction, and keep going.

2) Protect fs_info->trans_list.  This doesn't get used too much; it just holds
   the current transactions, which will usually be at most the currently
   committing transaction and the currently running transaction.

3) Protect the dead roots list.  This is only ever processed by splicing the
   list, so it is relatively simple.

4) Protect the fs_info->reloc_ctl stuff.  This is very lightweight and was
   using the trans_mutex before, so this is a pretty straightforward change.

5) Protect fs_info->trans_no_join.  Because we don't hold the trans_lock over
   the entirety of the commit, we need a way to block new people from creating
   a new transaction while we're committing.  So we set trans_no_join, and in
   join_transaction we test to see if it is set; if it is, we do a
   wait_on_commit.

6) Make the transaction use count atomic so we don't need to take locks to
   modify it when we're dropping references.

7) Add a commit_lock to the transaction to make sure multiple people trying to
   commit the same transaction don't race and commit at the same time.

8) Make open_ioctl_trans an atomic so we don't have to take any locks for
   ioctl trans.

I have tested this with xfstests, but obviously it is a pretty hairy change, so
lots of testing is greatly appreciated.
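
To make the new locking rules concrete, here is a minimal user-space model of
the join path described above.  This is an illustration, not the in-kernel
code: the field names mirror the ones this patch touches, but the
pthread/stdatomic plumbing, the calloc/free lifetime handling, and the NULL
return for the "a commit is running" case are stand-ins for the real btrfs
helpers (which wait on the commit instead).

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct transaction {
	atomic_int use_count;		/* 6) refcount, dropped with no lock held */
	pthread_mutex_t commit_lock;	/* 7) serializes committers of this trans */
};

struct fs_info {
	pthread_spinlock_t trans_lock;		 /* replaces the old trans_mutex */
	struct transaction *running_transaction; /* 1) protected by trans_lock */
	int trans_no_join;			 /* 5) set while a commit runs */
	atomic_int open_ioctl_trans;		 /* 8) needs no lock at all */
};

/* Join the currently running transaction, or install a new one. */
static struct transaction *join_transaction(struct fs_info *fs)
{
	struct transaction *cur;

	pthread_spin_lock(&fs->trans_lock);
	if (fs->trans_no_join) {
		/* a commit owns the transaction; real code waits on the commit */
		pthread_spin_unlock(&fs->trans_lock);
		return NULL;
	}
	cur = fs->running_transaction;
	if (cur) {
		/* all we do under the lock is grab a reference */
		atomic_fetch_add(&cur->use_count, 1);
		pthread_spin_unlock(&fs->trans_lock);
		return cur;
	}
	pthread_spin_unlock(&fs->trans_lock);

	/* no running transaction: allocate one outside the lock ... */
	cur = calloc(1, sizeof(*cur));
	atomic_init(&cur->use_count, 2);	/* one ref for us, one for fs_info */
	pthread_mutex_init(&cur->commit_lock, NULL);

	/* ... then recheck under the lock before installing it */
	pthread_spin_lock(&fs->trans_lock);
	if (fs->running_transaction) {
		struct transaction *raced = fs->running_transaction;

		/* real code also rechecks trans_no_join here */
		atomic_fetch_add(&raced->use_count, 1);
		pthread_spin_unlock(&fs->trans_lock);
		free(cur);
		return raced;
	}
	fs->running_transaction = cur;
	pthread_spin_unlock(&fs->trans_lock);
	return cur;
}

/* 6) dropping a reference needs no lock */
static void put_transaction(struct transaction *t)
{
	if (atomic_fetch_sub(&t->use_count, 1) == 1)
		free(t);
}

The property that matters is that nothing sleeps while trans_lock is held: a
joiner only checks trans_no_join, bumps use_count, and drops the lock.
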
Thanks,

Signed-off-by: Josef Bacik
---
 fs/btrfs/ctree.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8f4b81de3ae2..522a39b0033d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -919,7 +919,6 @@ struct btrfs_fs_info {
 	 * is required instead of the faster short fsync log commits
 	 */
 	u64 last_trans_log_full_commit;
-	u64 open_ioctl_trans;
 	unsigned long mount_opt:20;
 	unsigned long compress_type:4;
 	u64 max_inline;
@@ -936,7 +935,6 @@ struct btrfs_fs_info {
 	struct super_block *sb;
 	struct inode *btree_inode;
 	struct backing_dev_info bdi;
-	struct mutex trans_mutex;
 	struct mutex tree_log_mutex;
 	struct mutex transaction_kthread_mutex;
 	struct mutex cleaner_mutex;
@@ -957,6 +955,7 @@ struct btrfs_fs_info {
 	struct rw_semaphore subvol_sem;
 	struct srcu_struct subvol_srcu;
 
+	spinlock_t trans_lock;
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
@@ -969,6 +968,7 @@ struct btrfs_fs_info {
 	atomic_t async_submit_draining;
 	atomic_t nr_async_bios;
 	atomic_t async_delalloc_pages;
+	atomic_t open_ioctl_trans;
 
 	/*
 	 * this is used by the balancing code to wait for all the pending
@@ -1032,6 +1032,7 @@ struct btrfs_fs_info {
 	int closing;
 	int log_root_recovering;
 	int enospc_unlink;
+	int trans_no_join;
 
 	u64 total_pinned;
 
@@ -1053,7 +1054,6 @@ struct btrfs_fs_info {
 	struct reloc_control *reloc_ctl;
 
 	spinlock_t delalloc_lock;
-	spinlock_t new_trans_lock;
 	u64 delalloc_bytes;
 
 	/* data_alloc_cluster is only used in ssd mode */
-- cgit v1.2.3


From fcb80c2affd63237cff5b34cba5756be7c976a5a Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Tue, 3 May 2011 10:40:22 -0400
Subject: Btrfs: fix how we do space reservation for truncate

The ceph guys keep running into problems where we still have space reserved in
our orphan block rsv when we go to free it.  This is because they take
snapshots a lot, so their truncates use a bunch of space, and when we go to do
things like update the inode we have to steal reservation space in order to
make the reservation happen.  Truncate can use as much space as it feels like,
but we still have to hold space for removing the orphan item and updating the
inode, which will definitely always happen.  So in order to fix this we need to
split the reservations up.  With this patch we have:

1) The orphan block reserve, which only holds the space for deleting our
   orphan item when everything is over.

2) The truncate block reserve, which gets allocated and used specifically for
   the space that the truncate will use, on a per-truncate basis.

3) The transaction will always have 1 item's worth of space reserved so we can
   update the inode normally.

Hopefully this will make the ceph problem go away.
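
Only the new prototype lands in this header.  A rough sketch of how the
truncate path is expected to put the three reserves together follows; it is a
simplified illustration, not the verbatim fs/btrfs/inode.c change, and the
function name is made up for this sketch.

/*
 * Illustrative only: roughly how a truncate could use the new per-truncate
 * reserve.  Error handling, the orphan item removal, and the final inode
 * update are trimmed.
 */
static int truncate_space_example(struct btrfs_root *root, struct inode *inode)
{
	struct btrfs_block_rsv *rsv;
	struct btrfs_trans_handle *trans;
	int ret;

	/* 2) dedicated reserve for this truncate, separate from the orphan rsv */
	rsv = btrfs_alloc_block_rsv(root);
	if (!rsv)
		return -ENOMEM;

	/* 3) one item reserved so updating the inode never has to steal space */
	trans = btrfs_start_transaction(root, 1);

	while (1) {
		/* top up the dedicated reserve for this pass of the truncate */
		ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
		if (ret)
			break;

		trans->block_rsv = rsv;
		ret = btrfs_truncate_inode_items(trans, root, inode,
						 inode->i_size,
						 BTRFS_EXTENT_DATA_KEY);
		if (ret != -EAGAIN)
			break;

		/* the transaction's own reservation covers the inode update */
		trans->block_rsv = &root->fs_info->trans_block_rsv;
		ret = btrfs_update_inode(trans, root, inode);
		if (ret)
			break;

		btrfs_end_transaction(trans, root);
		trans = btrfs_start_transaction(root, 1);
	}

	btrfs_end_transaction(trans, root);
	btrfs_free_block_rsv(root, rsv);
	return ret;
}
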
Thanks,

Signed-off-by: Josef Bacik
---
 fs/btrfs/ctree.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 522a39b0033d..f31aed7fedd9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2224,6 +2224,9 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 void btrfs_block_rsv_release(struct btrfs_root *root,
 			     struct btrfs_block_rsv *block_rsv,
 			     u64 num_bytes);
+int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    struct btrfs_block_rsv *rsv);
 int btrfs_set_block_group_ro(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
 int btrfs_set_block_group_rw(struct btrfs_root *root,
-- cgit v1.2.3


From d82a6f1d7e8b61ed5996334d0db66651bb43641d Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Wed, 11 May 2011 15:26:06 -0400
Subject: Btrfs: kill BTRFS_I(inode)->block_group

Originally this was going to be used as a way to give hints to the allocator,
but frankly we can get much better hints elsewhere, and it's not used for
anything useful.  In addition to being useless, when we initialize an inode we
try to find a free-ish block group to set as the inode's block group, and with
a completely full 40GB fs this takes _forever_, so I imagine with, say, a 1TB
fs this is just unbearable.  So just axe the thing altogether; we don't need
it, it saves us 8 bytes in the inode, and it saves us 500 microseconds per
inode lookup in my testcase.

Thanks,

Signed-off-by: Josef Bacik
---
 fs/btrfs/ctree.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f31aed7fedd9..0f8c489bcc02 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2512,8 +2512,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *new_root,
-			     u64 new_dirid, u64 alloc_hint);
+			     struct btrfs_root *new_root, u64 new_dirid);
 
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size,
 			 struct bio *bio, unsigned long bio_flags);
-- cgit v1.2.3


From 4b9465cb9e3859186eefa1ca3b990a5849386320 Mon Sep 17 00:00:00 2001
From: Chris Mason
Date: Fri, 3 Jun 2011 09:36:29 -0400
Subject: Btrfs: add mount -o inode_cache

This makes the inode map cache default to off until we fix the overflow
problem that occurs when the free space crcs don't fit inside a single page.

Signed-off-by: Chris Mason
---
 fs/btrfs/ctree.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8f98c2005715..4958ef5417d6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1340,6 +1340,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
 #define BTRFS_MOUNT_ENOSPC_DEBUG	 (1 << 15)
 #define BTRFS_MOUNT_AUTO_DEFRAG		(1 << 16)
+#define BTRFS_MOUNT_INODE_MAP_CACHE	(1 << 17)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
-- cgit v1.2.3


From 7841cb2898f66a73062c64d0ef5733dde7279e46 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Tue, 31 May 2011 18:07:27 +0200
Subject: btrfs: add helper for fs_info->closing

Wrap the check of the filesystem 'closing' flag in a helper and fix a few
missing memory barriers.
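
The new helper replaces open-coded reads of fs_info->closing.  An illustrative
call site (not one of this patch's hunks) inside a long-running worker loop
would look like this; do_background_pass() is a hypothetical placeholder for
whatever work the loop does.

	/* stop background work once close_ctree() has started tearing down */
	while (!btrfs_fs_closing(fs_info)) {
		/*
		 * the flag is read through the helper, which supplies the
		 * memory barrier, instead of being tested directly
		 */
		do_background_pass(fs_info);
	}
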
Signed-off-by: David Sterba
---
 fs/btrfs/ctree.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4958ef5417d6..8490ee063709 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2354,6 +2354,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct extent_buffer *node,
 			struct extent_buffer *parent);
+static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
+{
+	/*
+	 * Get synced with close_ctree()
+	 */
+	smp_mb();
+	return fs_info->closing;
+}
+
 /* root-item.c */
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
 			struct btrfs_path *path,
-- cgit v1.2.3


From 7585717f304f5ed005cc4ad933a69aab3efbd136 Mon Sep 17 00:00:00 2001
From: Chris Mason
Date: Mon, 13 Jun 2011 20:00:16 -0400
Subject: Btrfs: fix relocation races

The recent commit to get rid of our trans_mutex introduced some races with
block group relocation.  The problem is that relocation needs to do some
record keeping about each root, and it was relying on the transaction mutex to
coordinate things in subtle ways.  This fix adds a mutex just for the
relocation code and makes sure it doesn't have a big impact on normal
operations.  The race is really fixed in btrfs_record_root_in_trans, which is
where we step back and wait for the relocation code to finish its accounting
setup (a rough sketch of that check appears at the end of this log).

Signed-off-by: Chris Mason
---
 fs/btrfs/ctree.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8490ee063709..a2c91a102b72 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -967,6 +967,12 @@ struct btrfs_fs_info {
 	struct srcu_struct subvol_srcu;
 
 	spinlock_t trans_lock;
+	/*
+	 * the reloc mutex goes with the trans lock, it is taken
+	 * during commit to protect us from the relocation code
+	 */
+	struct mutex reloc_mutex;
+
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
@@ -1172,6 +1178,14 @@ struct btrfs_root {
 	u32 type;
 
 	u64 highest_objectid;
+
+	/* btrfs_record_root_in_trans is a multi-step process,
+	 * and it can race with the balancing code.  But the
+	 * race is very small, and only the first time the root
+	 * is added to each transaction.  So in_trans_setup
+	 * is used to tell us when more checks are required
+	 */
+	unsigned long in_trans_setup;
+
 	int ref_cows;
 	int track_dirty;
 	int in_radix;
-- cgit v1.2.3


From 9fe6a50fb764f508dd2de47a66e62e51388791fb Mon Sep 17 00:00:00 2001
From: Maarten Lankhorst
Date: Thu, 16 Jun 2011 09:04:57 +0000
Subject: btrfs: Remove unused sysfs code

Removes code that is no longer used.  The sysfs file itself is kept, because
the btrfs developers expressed interest in adding new entries to sysfs later.

Signed-off-by: Maarten Lankhorst
Signed-off-by: Chris Mason
---
 fs/btrfs/ctree.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a2c91a102b72..8e948ec1ee6b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1195,7 +1195,6 @@ struct btrfs_root {
 	struct btrfs_key defrag_max;
 	int defrag_running;
 	char *name;
-	int in_sysfs;
 
 	/* the dirty list is only used by non-reference counted roots */
 	struct list_head dirty_list;
-- cgit v1.2.3
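
For the relocation-races commit above, the header only shows the new
reloc_mutex and in_trans_setup fields; the check they enable lives in
btrfs_record_root_in_trans.  A rough sketch of its shape, pieced together from
the commit description rather than copied from fs/btrfs/transaction.c, is
below; record_root_in_trans() stands for the internal helper that does the
actual per-root accounting.

int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	if (!root->ref_cows)
		return 0;

	/*
	 * Fast path: the root is already in this transaction and its
	 * multi-step setup has finished.  The barrier pairs with one in
	 * the setup path so we never see last_trans updated while
	 * in_trans_setup is still set.
	 */
	smp_rmb();
	if (root->last_trans == trans->transid && !root->in_trans_setup)
		return 0;

	/*
	 * Slow path: take the relocation mutex so we can't race with the
	 * balancing code while we record the root in the transaction.
	 */
	mutex_lock(&root->fs_info->reloc_mutex);
	record_root_in_trans(trans, root);
	mutex_unlock(&root->fs_info->reloc_mutex);

	return 0;
}
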