summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/async-thread.c64
-rw-r--r--fs/btrfs/async-thread.h2
-rw-r--r--fs/btrfs/backref.c18
-rw-r--r--fs/btrfs/ctree.c75
-rw-r--r--fs/btrfs/ctree.h6
-rw-r--r--fs/btrfs/delayed-ref.c11
-rw-r--r--fs/btrfs/dev-replace.c29
-rw-r--r--fs/btrfs/disk-io.c27
-rw-r--r--fs/btrfs/extent-tree.c9
-rw-r--r--fs/btrfs/extent_io.c14
-rw-r--r--fs/btrfs/extent_map.c11
-rw-r--r--fs/btrfs/file.c19
-rw-r--r--fs/btrfs/free-space-cache.c6
-rw-r--r--fs/btrfs/inode-map.c1
-rw-r--r--fs/btrfs/inode.c14
-rw-r--r--fs/btrfs/ioctl.c20
-rw-r--r--fs/btrfs/ordered-data.c7
-rw-r--r--fs/btrfs/qgroup.c43
-rw-r--r--fs/btrfs/raid56.c3
-rw-r--r--fs/btrfs/reada.c15
-rw-r--r--fs/btrfs/relocation.c40
-rw-r--r--fs/btrfs/root-tree.c4
-rw-r--r--fs/btrfs/send.c102
-rw-r--r--fs/btrfs/super.c12
-rw-r--r--fs/btrfs/sysfs.c7
-rw-r--r--fs/btrfs/tests/btrfs-tests.c1
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c6
-rw-r--r--fs/btrfs/tests/qgroup-tests.c4
-rw-r--r--fs/btrfs/transaction.c8
-rw-r--r--fs/btrfs/tree-log.c57
-rw-r--r--fs/btrfs/uuid-tree.c2
-rw-r--r--fs/btrfs/volumes.c5
-rw-r--r--fs/btrfs/volumes.h6
33 files changed, 410 insertions, 238 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index ff0b0be92d61..5456937836b8 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -265,16 +265,17 @@ out:
}
}
-static void run_ordered_work(struct __btrfs_workqueue *wq)
+static void run_ordered_work(struct __btrfs_workqueue *wq,
+ struct btrfs_work *self)
{
struct list_head *list = &wq->ordered_list;
struct btrfs_work *work;
spinlock_t *lock = &wq->list_lock;
unsigned long flags;
+ void *wtag;
+ bool free_self = false;
while (1) {
- void *wtag;
-
spin_lock_irqsave(lock, flags);
if (list_empty(list))
break;
@@ -300,16 +301,47 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
list_del(&work->ordered_list);
spin_unlock_irqrestore(lock, flags);
- /*
- * We don't want to call the ordered free functions with the
- * lock held though. Save the work as tag for the trace event,
- * because the callback could free the structure.
- */
- wtag = work;
- work->ordered_free(work);
- trace_btrfs_all_work_done(wq->fs_info, wtag);
+ if (work == self) {
+ /*
+ * This is the work item that the worker is currently
+ * executing.
+ *
+ * The kernel workqueue code guarantees non-reentrancy
+ * of work items. I.e., if a work item with the same
+ * address and work function is queued twice, the second
+ * execution is blocked until the first one finishes. A
+ * work item may be freed and recycled with the same
+ * work function; the workqueue code assumes that the
+ * original work item cannot depend on the recycled work
+ * item in that case (see find_worker_executing_work()).
+ *
+ * Note that the work of one Btrfs filesystem may depend
+ * on the work of another Btrfs filesystem via, e.g., a
+ * loop device. Therefore, we must not allow the current
+ * work item to be recycled until we are really done,
+ * otherwise we break the above assumption and can
+ * deadlock.
+ */
+ free_self = true;
+ } else {
+ /*
+ * We don't want to call the ordered free functions with
+ * the lock held though. Save the work as tag for the
+ * trace event, because the callback could free the
+ * structure.
+ */
+ wtag = work;
+ work->ordered_free(work);
+ trace_btrfs_all_work_done(wq->fs_info, wtag);
+ }
}
spin_unlock_irqrestore(lock, flags);
+
+ if (free_self) {
+ wtag = self;
+ self->ordered_free(self);
+ trace_btrfs_all_work_done(wq->fs_info, wtag);
+ }
}
static void normal_work_helper(struct btrfs_work *work)
@@ -337,7 +369,7 @@ static void normal_work_helper(struct btrfs_work *work)
work->func(work);
if (need_order) {
set_bit(WORK_DONE_BIT, &work->flags);
- run_ordered_work(wq);
+ run_ordered_work(wq, work);
}
if (!need_order)
trace_btrfs_all_work_done(wq->fs_info, wtag);
@@ -415,3 +447,11 @@ void btrfs_set_work_high_priority(struct btrfs_work *work)
{
set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
}
+
+void btrfs_flush_workqueue(struct btrfs_workqueue *wq)
+{
+ if (wq->high)
+ flush_workqueue(wq->high->normal_wq);
+
+ flush_workqueue(wq->normal->normal_wq);
+}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1f9597355c9d..a0f6986806a4 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -85,4 +85,6 @@ void btrfs_set_work_high_priority(struct btrfs_work *work);
struct btrfs_fs_info *btrfs_work_owner(struct btrfs_work *work);
struct btrfs_fs_info *btrfs_workqueue_owner(struct __btrfs_workqueue *wq);
bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq);
+void btrfs_flush_workqueue(struct btrfs_workqueue *wq);
+
#endif
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 85dc7ab8f89e..2973d256bb44 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -2018,13 +2018,19 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
extent_item_objectid);
if (!search_commit_root) {
- trans = btrfs_join_transaction(fs_info->extent_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ trans = btrfs_attach_transaction(fs_info->extent_root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) != -ENOENT &&
+ PTR_ERR(trans) != -EROFS)
+ return PTR_ERR(trans);
+ trans = NULL;
+ }
+ }
+
+ if (trans)
btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
- } else {
+ else
down_read(&fs_info->commit_root_sem);
- }
ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
tree_mod_seq_elem.seq, &refs,
@@ -2056,7 +2062,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
free_leaf_list(refs);
out:
- if (!search_commit_root) {
+ if (trans) {
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
btrfs_end_transaction(trans, fs_info->extent_root);
} else {
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c94d3390cbfc..b5ebb43b1824 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -331,26 +331,6 @@ struct tree_mod_elem {
struct tree_mod_root old_root;
};
-static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
-{
- read_lock(&fs_info->tree_mod_log_lock);
-}
-
-static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
-{
- read_unlock(&fs_info->tree_mod_log_lock);
-}
-
-static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
-{
- write_lock(&fs_info->tree_mod_log_lock);
-}
-
-static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
-{
- write_unlock(&fs_info->tree_mod_log_lock);
-}
-
/*
* Pull a new tree mod seq number for our operation.
*/
@@ -370,14 +350,12 @@ static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem)
{
- tree_mod_log_write_lock(fs_info);
- spin_lock(&fs_info->tree_mod_seq_lock);
+ write_lock(&fs_info->tree_mod_log_lock);
if (!elem->seq) {
elem->seq = btrfs_inc_tree_mod_seq(fs_info);
list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
}
- spin_unlock(&fs_info->tree_mod_seq_lock);
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&fs_info->tree_mod_log_lock);
return elem->seq;
}
@@ -396,7 +374,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
if (!seq_putting)
return;
- spin_lock(&fs_info->tree_mod_seq_lock);
+ write_lock(&fs_info->tree_mod_log_lock);
list_del(&elem->list);
elem->seq = 0;
@@ -407,29 +385,27 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
* blocker with lower sequence number exists, we
* cannot remove anything from the log
*/
- spin_unlock(&fs_info->tree_mod_seq_lock);
+ write_unlock(&fs_info->tree_mod_log_lock);
return;
}
min_seq = cur_elem->seq;
}
}
- spin_unlock(&fs_info->tree_mod_seq_lock);
/*
* anything that's lower than the lowest existing (read: blocked)
* sequence number can be removed from the tree.
*/
- tree_mod_log_write_lock(fs_info);
tm_root = &fs_info->tree_mod_log;
for (node = rb_first(tm_root); node; node = next) {
next = rb_next(node);
tm = container_of(node, struct tree_mod_elem, node);
- if (tm->seq > min_seq)
+ if (tm->seq >= min_seq)
continue;
rb_erase(node, tm_root);
kfree(tm);
}
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&fs_info->tree_mod_log_lock);
}
/*
@@ -440,7 +416,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
* for root replace operations, or the logical address of the affected
* block for all other operations.
*
- * Note: must be called with write lock (tree_mod_log_write_lock).
+ * Note: must be called with write lock for fs_info::tree_mod_log_lock.
*/
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
@@ -480,7 +456,7 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
* Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
* returns zero with the tree_mod_log_lock acquired. The caller must hold
* this until all tree mod log insertions are recorded in the rb tree and then
- * call tree_mod_log_write_unlock() to release.
+ * write unlock fs_info::tree_mod_log_lock.
*/
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb) {
@@ -490,9 +466,9 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
if (eb && btrfs_header_level(eb) == 0)
return 1;
- tree_mod_log_write_lock(fs_info);
+ write_lock(&fs_info->tree_mod_log_lock);
if (list_empty(&(fs_info)->tree_mod_seq_list)) {
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&fs_info->tree_mod_log_lock);
return 1;
}
@@ -556,7 +532,7 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
}
ret = __tree_mod_log_insert(fs_info, tm);
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
if (ret)
kfree(tm);
@@ -620,7 +596,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
ret = __tree_mod_log_insert(fs_info, tm);
if (ret)
goto free_tms;
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
kfree(tm_list);
return 0;
@@ -631,7 +607,7 @@ free_tms:
kfree(tm_list[i]);
}
if (locked)
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
kfree(tm_list);
kfree(tm);
@@ -712,7 +688,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
if (!ret)
ret = __tree_mod_log_insert(fs_info, tm);
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&fs_info->tree_mod_log_lock);
if (ret)
goto free_tms;
kfree(tm_list);
@@ -739,7 +715,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
struct tree_mod_elem *cur = NULL;
struct tree_mod_elem *found = NULL;
- tree_mod_log_read_lock(fs_info);
+ read_lock(&fs_info->tree_mod_log_lock);
tm_root = &fs_info->tree_mod_log;
node = tm_root->rb_node;
while (node) {
@@ -767,7 +743,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
break;
}
}
- tree_mod_log_read_unlock(fs_info);
+ read_unlock(&fs_info->tree_mod_log_lock);
return found;
}
@@ -848,7 +824,7 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
goto free_tms;
}
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&fs_info->tree_mod_log_lock);
kfree(tm_list);
return 0;
@@ -860,7 +836,7 @@ free_tms:
kfree(tm_list[i]);
}
if (locked)
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&fs_info->tree_mod_log_lock);
kfree(tm_list);
return ret;
@@ -920,7 +896,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
goto free_tms;
ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
if (ret)
goto free_tms;
kfree(tm_list);
@@ -1271,7 +1247,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
unsigned long p_size = sizeof(struct btrfs_key_ptr);
n = btrfs_header_nritems(eb);
- tree_mod_log_read_lock(fs_info);
+ read_lock(&fs_info->tree_mod_log_lock);
while (tm && tm->seq >= time_seq) {
/*
* all the operations are recorded with the operator used for
@@ -1326,7 +1302,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
if (tm->logical != first_tm->logical)
break;
}
- tree_mod_log_read_unlock(fs_info);
+ read_unlock(&fs_info->tree_mod_log_lock);
btrfs_set_header_nritems(eb, n);
}
@@ -1406,6 +1382,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
struct tree_mod_elem *tm;
struct extent_buffer *eb = NULL;
struct extent_buffer *eb_root;
+ u64 eb_root_owner = 0;
struct extent_buffer *old;
struct tree_mod_root *old_root = NULL;
u64 old_generation = 0;
@@ -1439,6 +1416,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
free_extent_buffer(old);
}
} else if (old_root) {
+ eb_root_owner = btrfs_header_owner(eb_root);
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
eb = alloc_dummy_extent_buffer(root->fs_info, logical,
@@ -1457,7 +1435,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
if (old_root) {
btrfs_set_header_bytenr(eb, eb->start);
btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
- btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
+ btrfs_set_header_owner(eb, eb_root_owner);
btrfs_set_header_level(eb, old_root->level);
btrfs_set_header_generation(eb, old_generation);
}
@@ -2971,6 +2949,10 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
again:
b = get_old_root(root, time_seq);
+ if (!b) {
+ ret = -EIO;
+ goto done;
+ }
level = btrfs_header_level(b);
p->locks[level] = BTRFS_READ_LOCK;
@@ -5465,6 +5447,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
advance_left = advance_right = 0;
while (1) {
+ cond_resched();
if (advance_left && !left_end_reached) {
ret = tree_advance(left_root, left_path, &left_level,
left_root_level,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a423c36bcd72..2bc37d03d407 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -851,14 +851,12 @@ struct btrfs_fs_info {
struct list_head delayed_iputs;
struct mutex cleaner_delayed_iput_mutex;
- /* this protects tree_mod_seq_list */
- spinlock_t tree_mod_seq_lock;
atomic64_t tree_mod_seq;
- struct list_head tree_mod_seq_list;
- /* this protects tree_mod_log */
+ /* this protects tree_mod_log and tree_mod_seq_list */
rwlock_t tree_mod_log_lock;
struct rb_root tree_mod_log;
+ struct list_head tree_mod_seq_list;
atomic_t nr_async_submits;
atomic_t async_submit_draining;
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 8d93854a4b4f..c1ca4ce11e69 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -193,8 +193,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
ref->in_tree = 0;
btrfs_put_delayed_ref(ref);
atomic_dec(&delayed_refs->num_entries);
- if (trans->delayed_ref_updates)
- trans->delayed_ref_updates--;
}
static bool merge_ref(struct btrfs_trans_handle *trans,
@@ -281,7 +279,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
if (head->is_data)
return;
- spin_lock(&fs_info->tree_mod_seq_lock);
+ read_lock(&fs_info->tree_mod_log_lock);
if (!list_empty(&fs_info->tree_mod_seq_list)) {
struct seq_list *elem;
@@ -289,7 +287,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
struct seq_list, list);
seq = elem->seq;
}
- spin_unlock(&fs_info->tree_mod_seq_lock);
+ read_unlock(&fs_info->tree_mod_log_lock);
ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
list);
@@ -317,7 +315,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem;
int ret = 0;
- spin_lock(&fs_info->tree_mod_seq_lock);
+ read_lock(&fs_info->tree_mod_log_lock);
if (!list_empty(&fs_info->tree_mod_seq_list)) {
elem = list_first_entry(&fs_info->tree_mod_seq_list,
struct seq_list, list);
@@ -331,7 +329,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
}
}
- spin_unlock(&fs_info->tree_mod_seq_lock);
+ read_unlock(&fs_info->tree_mod_log_lock);
return ret;
}
@@ -445,7 +443,6 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
add_tail:
list_add_tail(&ref->list, &href->ref_list);
atomic_inc(&root->num_entries);
- trans->delayed_ref_updates++;
spin_unlock(&href->lock);
return ret;
}
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index fb973cc0af66..395b07764269 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -511,18 +511,27 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
}
btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1);
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
- return PTR_ERR(trans);
+ while (1) {
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans, root);
+ WARN_ON(ret);
+ mutex_lock(&uuid_mutex);
+ /* keep away write_all_supers() during the finishing procedure */
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&root->fs_info->chunk_mutex);
+ if (src_device->has_pending_chunks) {
+ mutex_unlock(&root->fs_info->chunk_mutex);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&uuid_mutex);
+ } else {
+ break;
+ }
}
- ret = btrfs_commit_transaction(trans, root);
- WARN_ON(ret);
- mutex_lock(&uuid_mutex);
- /* keep away write_all_supers() during the finishing procedure */
- mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
- mutex_lock(&root->fs_info->chunk_mutex);
btrfs_dev_replace_lock(dev_replace, 1);
dev_replace->replace_state =
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9d3352fe8dc9..1de017051928 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1712,8 +1712,8 @@ static void end_workqueue_fn(struct btrfs_work *work)
bio->bi_error = end_io_wq->error;
bio->bi_private = end_io_wq->private;
bio->bi_end_io = end_io_wq->end_io;
- kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
bio_endio(bio);
+ kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
}
static int cleaner_kthread(void *arg)
@@ -2104,7 +2104,7 @@ static void free_root_extent_buffers(struct btrfs_root *root)
}
/* helper to cleanup tree roots */
-static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
+static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
{
free_root_extent_buffers(info->tree_root);
@@ -2113,7 +2113,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
free_root_extent_buffers(info->csum_root);
free_root_extent_buffers(info->quota_root);
free_root_extent_buffers(info->uuid_root);
- if (chunk_root)
+ if (free_chunk_root)
free_root_extent_buffers(info->chunk_root);
free_root_extent_buffers(info->free_space_root);
}
@@ -2519,7 +2519,6 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->free_chunk_lock);
- spin_lock_init(&fs_info->tree_mod_seq_lock);
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->qgroup_op_lock);
spin_lock_init(&fs_info->buffer_lock);
@@ -2980,6 +2979,7 @@ retry_root_backup:
/* do not make disk changes in broken FS or nologreplay is given */
if (btrfs_super_log_root(disk_super) != 0 &&
!btrfs_test_opt(tree_root->fs_info, NOLOGREPLAY)) {
+ btrfs_info(fs_info, "start tree-log replay");
ret = btrfs_replay_log(fs_info, fs_devices);
if (ret) {
err = ret;
@@ -3136,7 +3136,7 @@ fail_block_groups:
btrfs_free_block_groups(fs_info);
fail_tree_roots:
- free_root_pointers(fs_info, 1);
+ free_root_pointers(fs_info, true);
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
fail_sb_buffer:
@@ -3165,7 +3165,7 @@ recovery_tree_root:
if (!btrfs_test_opt(tree_root->fs_info, USEBACKUPROOT))
goto fail_tree_roots;
- free_root_pointers(fs_info, 0);
+ free_root_pointers(fs_info, false);
/* don't use the log in recovery mode, it won't be valid */
btrfs_set_super_log_root(disk_super, 0);
@@ -3825,6 +3825,19 @@ void close_ctree(struct btrfs_root *root)
*/
btrfs_delete_unused_bgs(root->fs_info);
+ /*
+ * There might be existing delayed inode workers still running
+ * and holding an empty delayed inode item. We must wait for
+ * them to complete first because they can create a transaction.
+ * This happens when someone calls btrfs_balance_delayed_items()
+ * and then a transaction commit runs the same delayed nodes
+ * before any delayed worker has done something with the nodes.
+ * We must wait for any worker here and not at transaction
+ * commit time since that could cause a deadlock.
+ * This is a very rare case.
+ */
+ btrfs_flush_workqueue(fs_info->delayed_workers);
+
ret = btrfs_commit_super(root);
if (ret)
btrfs_err(fs_info, "commit super ret %d", ret);
@@ -3862,7 +3875,7 @@ void close_ctree(struct btrfs_root *root)
btrfs_stop_all_workers(fs_info);
clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
- free_root_pointers(fs_info, 1);
+ free_root_pointers(fs_info, true);
iput(fs_info->btree_inode);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7938c48c72ff..538f378eea52 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7571,6 +7571,14 @@ search:
*/
if ((flags & extra) && !(block_group->flags & extra))
goto loop;
+
+ /*
+ * This block group has different flags than we want.
+ * It's possible that we have MIXED_GROUP flag but no
+ * block group is mixed. Just skip such block group.
+ */
+ btrfs_release_block_group(block_group, delalloc);
+ continue;
}
have_block_group:
@@ -10317,6 +10325,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
btrfs_err(info,
"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
cache->key.objectid);
+ btrfs_put_block_group(cache);
ret = -EINVAL;
goto error;
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4d901200be13..1372d3e5d90b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4060,6 +4060,14 @@ retry:
*/
scanned = 1;
index = 0;
+
+ /*
+ * If we're looping we could run into a page that is locked by a
+ * writer and that writer could be waiting on writeback for a
+ * page in our current bio, and thus deadlock, so flush the
+ * write bio here.
+ */
+ flush_write_bio(data);
goto retry;
}
@@ -4994,12 +5002,14 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
eb = alloc_dummy_extent_buffer(fs_info, start, nodesize);
if (!eb)
- return NULL;
+ return ERR_PTR(-ENOMEM);
eb->fs_info = fs_info;
again:
ret = radix_tree_preload(GFP_NOFS);
- if (ret)
+ if (ret) {
+ exists = ERR_PTR(ret);
goto free_eb;
+ }
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
start >> PAGE_SHIFT, eb);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 26f9ac719d20..4f59b4089eb0 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -227,6 +227,17 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
struct extent_map *merge = NULL;
struct rb_node *rb;
+ /*
+ * We can't modify an extent map that is in the tree and that is being
+ * used by another task, as it can cause that other task to see it in
+ * inconsistent state during the merging. We always have 1 reference for
+ * the tree and 1 for this task (which is unpinning the extent map or
+ * clearing the logging flag), so anything > 2 means it's being used by
+ * other tasks too.
+ */
+ if (atomic_read(&em->refs) > 2)
+ return;
+
if (em->start != 0) {
rb = rb_prev(&em->rb_node);
if (rb)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 437544846e4e..03661b744eaf 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1555,6 +1555,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
break;
}
+ only_release_metadata = false;
sector_offset = pos & (root->sectorsize - 1);
reserve_bytes = round_up(write_bytes + sector_offset,
root->sectorsize);
@@ -1704,7 +1705,6 @@ again:
set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
lockend, EXTENT_NORESERVE, NULL,
NULL, GFP_NOFS);
- only_release_metadata = false;
}
btrfs_drop_pages(pages, num_pages);
@@ -1952,6 +1952,18 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
u64 len;
/*
+ * If the inode needs a full sync, make sure we use a full range to
+ * avoid log tree corruption, due to hole detection racing with ordered
+ * extent completion for adjacent ranges, and assertion failures during
+ * hole detection.
+ */
+ if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags)) {
+ start = 0;
+ end = LLONG_MAX;
+ }
+
+ /*
* The range length can be represented by u64, we have to do the typecasts
* to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync()
*/
@@ -2634,6 +2646,11 @@ out_only_mutex:
* for detecting, at fsync time, if the inode isn't yet in the
* log tree or it's there but not up to date.
*/
+ struct timespec now = current_time(inode);
+
+ inode_inc_iversion(inode);
+ inode->i_mtime = now;
+ inode->i_ctime = now;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 69a3c11af9d4..a84a1ceb260a 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -391,6 +391,12 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode
if (uptodate && !PageUptodate(page)) {
btrfs_readpage(NULL, page);
lock_page(page);
+ if (page->mapping != inode->i_mapping) {
+ btrfs_err(BTRFS_I(inode)->root->fs_info,
+ "free space cache page truncated");
+ io_ctl_drop_pages(io_ctl);
+ return -EIO;
+ }
if (!PageUptodate(page)) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
"error reading free space cache");
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d27014b8bf72..075b59516c8c 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -159,6 +159,7 @@ static void start_caching(struct btrfs_root *root)
spin_lock(&root->ino_cache_lock);
root->ino_cache_state = BTRFS_CACHE_FINISHED;
spin_unlock(&root->ino_cache_lock);
+ wake_up(&root->ino_cache_wait);
return;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a609d2017a63..f56610ad64bd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5576,7 +5576,6 @@ static void inode_tree_del(struct inode *inode)
spin_unlock(&root->inode_lock);
if (empty && btrfs_root_refs(&root->root_item) == 0) {
- synchronize_srcu(&root->fs_info->subvol_srcu);
spin_lock(&root->inode_lock);
empty = RB_EMPTY_ROOT(&root->inode_tree);
spin_unlock(&root->inode_lock);
@@ -9596,9 +9595,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
return -EXDEV;
/* close the race window with snapshot create/destroy ioctl */
- if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
- down_read(&root->fs_info->subvol_sem);
- if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
+ new_ino == BTRFS_FIRST_FREE_OBJECTID)
down_read(&dest->fs_info->subvol_sem);
/*
@@ -9615,6 +9613,9 @@ static int btrfs_rename_exchange(struct inode *old_dir,
goto out_notrans;
}
+ if (dest != root)
+ btrfs_record_root_in_trans(trans, dest);
+
/*
* We need to find a free sequence number both in the source and
* in the destination directory for the exchange.
@@ -9781,9 +9782,8 @@ out_fail:
ret2 = btrfs_end_transaction(trans, root);
ret = ret ? ret : ret2;
out_notrans:
- if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
- up_read(&dest->fs_info->subvol_sem);
- if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+ if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
+ old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&root->fs_info->subvol_sem);
return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 242584a0d3b5..eefe103c65da 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -385,6 +385,16 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ /*
+ * If the fs is mounted with nologreplay, which requires it to be
+ * mounted in RO mode as well, we can not allow discard on free space
+ * inside block groups, because log trees refer to extents that are not
+ * pinned in a block group's free space cache (pinning the extents is
+ * precisely the first phase of replaying a log tree).
+ */
+ if (btrfs_test_opt(fs_info, NOLOGREPLAY))
+ return -EROFS;
+
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
dev_list) {
@@ -600,12 +610,18 @@ static noinline int create_subvol(struct inode *dir,
btrfs_i_size_write(dir, dir->i_size + namelen * 2);
ret = btrfs_update_inode(trans, root, dir);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
objectid, root->root_key.objectid,
btrfs_ino(dir), index, name, namelen);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
root_item->uuid, BTRFS_UUID_KEY_SUBVOL,
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b2d1e95de7be..7dc2284017fa 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -837,10 +837,15 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
}
btrfs_start_ordered_extent(inode, ordered, 1);
end = ordered->file_offset;
+ /*
+ * If the ordered extent had an error save the error but don't
+ * exit without waiting first for all other ordered extents in
+ * the range to complete.
+ */
if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
ret = -EIO;
btrfs_put_ordered_extent(ordered);
- if (ret || end == 0 || end == start)
+ if (end == 0 || end == start)
break;
end--;
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index f25233093d68..0355e6d9e21c 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -759,10 +759,10 @@ out:
return ret;
}
-static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_root *root)
+static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *quota_root = fs_info->quota_root;
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *l;
@@ -778,7 +778,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
if (ret > 0)
ret = -ENOENT;
@@ -1863,7 +1863,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
spin_unlock(&fs_info->qgroup_lock);
- ret = update_qgroup_status_item(trans, fs_info, quota_root);
+ ret = update_qgroup_status_item(trans);
if (ret)
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@@ -2380,9 +2380,6 @@ out:
btrfs_free_path(path);
mutex_lock(&fs_info->qgroup_rescan_lock);
- if (!btrfs_fs_closing(fs_info))
- fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
-
if (err > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@@ -2398,16 +2395,30 @@ out:
trans = btrfs_start_transaction(fs_info->quota_root, 1);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
+ trans = NULL;
btrfs_err(fs_info,
"fail to start transaction for status update: %d\n",
err);
- goto done;
}
- ret = update_qgroup_status_item(trans, fs_info, fs_info->quota_root);
- if (ret < 0) {
- err = ret;
- btrfs_err(fs_info, "fail to update qgroup status: %d", err);
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ if (!btrfs_fs_closing(fs_info))
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ if (trans) {
+ ret = update_qgroup_status_item(trans);
+ if (ret < 0) {
+ err = ret;
+ btrfs_err(fs_info, "fail to update qgroup status: %d",
+ err);
+ }
}
+ fs_info->qgroup_rescan_running = false;
+ complete_all(&fs_info->qgroup_rescan_completion);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ if (!trans)
+ return;
+
btrfs_end_transaction(trans, fs_info->quota_root);
if (btrfs_fs_closing(fs_info)) {
@@ -2418,12 +2429,6 @@ out:
} else {
btrfs_err(fs_info, "qgroup scan failed with %d", err);
}
-
-done:
- mutex_lock(&fs_info->qgroup_rescan_lock);
- fs_info->qgroup_rescan_running = false;
- mutex_unlock(&fs_info->qgroup_rescan_lock);
- complete_all(&fs_info->qgroup_rescan_completion);
}
/*
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index af6a776fa18c..5aa07de5750e 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -2395,8 +2395,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
bitmap_clear(rbio->dbitmap, pagenr, 1);
kunmap(p);
- for (stripe = 0; stripe < rbio->real_stripes; stripe++)
+ for (stripe = 0; stripe < nr_data; stripe++)
kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+ kunmap(p_page);
}
__free_page(p_page);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 75bab76739be..0d1565d71231 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -734,21 +734,19 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
static void reada_start_machine_worker(struct btrfs_work *work)
{
struct reada_machine_work *rmw;
- struct btrfs_fs_info *fs_info;
int old_ioprio;
rmw = container_of(work, struct reada_machine_work, work);
- fs_info = rmw->fs_info;
-
- kfree(rmw);
old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
task_nice_ioprio(current));
set_task_ioprio(current, BTRFS_IOPRIO_READA);
- __reada_start_machine(fs_info);
+ __reada_start_machine(rmw->fs_info);
set_task_ioprio(current, old_ioprio);
- atomic_dec(&fs_info->reada_works_cnt);
+ atomic_dec(&rmw->fs_info->reada_works_cnt);
+
+ kfree(rmw);
}
static void __reada_start_machine(struct btrfs_fs_info *fs_info)
@@ -759,6 +757,7 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info)
u64 total = 0;
int i;
+again:
do {
enqueued = 0;
mutex_lock(&fs_devices->device_list_mutex);
@@ -771,6 +770,10 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info)
mutex_unlock(&fs_devices->device_list_mutex);
total += enqueued;
} while (enqueued && total < 10000);
+ if (fs_devices->seed) {
+ fs_devices = fs_devices->seed;
+ goto again;
+ }
if (enqueued == 0)
return;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b0c3a6afe664..1003b983a8d7 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -537,8 +537,8 @@ static int should_ignore_root(struct btrfs_root *root)
if (!reloc_root)
return 0;
- if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
- root->fs_info->running_transaction->transid - 1)
+ if (btrfs_header_generation(reloc_root->commit_root) ==
+ root->fs_info->running_transaction->transid)
return 0;
/*
* if there is reloc tree and it was created in previous
@@ -1185,7 +1185,7 @@ out:
free_backref_node(cache, lower);
}
- free_backref_node(cache, node);
+ remove_backref_node(cache, node);
return ERR_PTR(err);
}
ASSERT(!node || !node->detached);
@@ -1296,7 +1296,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
if (!node)
return -ENOMEM;
- node->bytenr = root->node->start;
+ node->bytenr = root->commit_root->start;
node->data = root;
spin_lock(&rc->reloc_root_tree.lock);
@@ -1328,10 +1328,11 @@ static void __del_reloc_root(struct btrfs_root *root)
if (rc && root->node) {
spin_lock(&rc->reloc_root_tree.lock);
rb_node = tree_search(&rc->reloc_root_tree.rb_root,
- root->node->start);
+ root->commit_root->start);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
+ RB_CLEAR_NODE(&node->rb_node);
}
spin_unlock(&rc->reloc_root_tree.lock);
if (!node)
@@ -1349,7 +1350,7 @@ static void __del_reloc_root(struct btrfs_root *root)
* helper to update the 'address of tree root -> reloc tree'
* mapping
*/
-static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
+static int __update_reloc_root(struct btrfs_root *root)
{
struct rb_node *rb_node;
struct mapping_node *node = NULL;
@@ -1357,7 +1358,7 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
spin_lock(&rc->reloc_root_tree.lock);
rb_node = tree_search(&rc->reloc_root_tree.rb_root,
- root->node->start);
+ root->commit_root->start);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
@@ -1369,7 +1370,7 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
BUG_ON((struct btrfs_root *)node->data != root);
spin_lock(&rc->reloc_root_tree.lock);
- node->bytenr = new_bytenr;
+ node->bytenr = root->node->start;
rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
@@ -1519,6 +1520,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
}
if (reloc_root->commit_root != reloc_root->node) {
+ __update_reloc_root(reloc_root);
btrfs_set_root_node(root_item, reloc_root->node);
free_extent_buffer(reloc_root->commit_root);
reloc_root->commit_root = btrfs_root_node(reloc_root);
@@ -2457,7 +2459,21 @@ out:
free_reloc_roots(&reloc_roots);
}
- BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
+ /*
+ * We used to have
+ *
+ * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
+ *
+ * here, but it's wrong. If we fail to start the transaction in
+ * prepare_to_merge() we will have only 0 ref reloc roots, none of which
+ * have actually been removed from the reloc_root_tree rb tree. This is
+ * fine because we're bailing here, and we hold a reference on the root
+ * for the list that holds it, so these roots will be cleaned up when we
+ * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root
+ * will be cleaned up on unmount.
+ *
+ * The remaining nodes will be cleaned up by free_reloc_control.
+ */
}
static void free_block_list(struct rb_root *blocks)
@@ -4587,6 +4603,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
reloc_root->root_key.offset);
if (IS_ERR(fs_root)) {
err = PTR_ERR(fs_root);
+ list_add_tail(&reloc_root->root_list, &reloc_roots);
goto out_free;
}
@@ -4702,11 +4719,6 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
- if (buf == root->node)
- __update_reloc_root(root, cow->start);
- }
-
level = btrfs_header_level(buf);
if (btrfs_header_generation(buf) <=
btrfs_root_last_snapshot(&root->root_item))
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index edae751e870c..307b8baaf0e9 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -144,10 +144,8 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
return -ENOMEM;
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
- if (ret < 0) {
- btrfs_abort_transaction(trans, ret);
+ if (ret < 0)
goto out;
- }
if (ret != 0) {
btrfs_print_leaf(root, path->nodes[0]);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a45f26ac5da7..edfc7ba38b33 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -37,6 +37,14 @@
#include "compression.h"
/*
+ * Maximum number of references an extent can have in order for us to attempt to
+ * issue clone operations instead of write operations. This currently exists to
+ * avoid hitting limitations of the backreference walking code (taking a lot of
+ * time and using too much memory for extents with large number of references).
+ */
+#define SEND_MAX_EXTENT_REFS 64
+
+/*
* A fs_path is a helper to dynamically build path names with unknown size.
* It reallocates the internal buffer on demand.
* It allows fast adding of path elements on the right side (normal path) and
@@ -1327,6 +1335,7 @@ static int find_extent_clone(struct send_ctx *sctx,
struct clone_root *cur_clone_root;
struct btrfs_key found_key;
struct btrfs_path *tmp_path;
+ struct btrfs_extent_item *ei;
int compressed;
u32 i;
@@ -1376,7 +1385,6 @@ static int find_extent_clone(struct send_ctx *sctx,
ret = extent_from_logical(fs_info, disk_byte, tmp_path,
&found_key, &flags);
up_read(&fs_info->commit_root_sem);
- btrfs_release_path(tmp_path);
if (ret < 0)
goto out;
@@ -1385,6 +1393,21 @@ static int find_extent_clone(struct send_ctx *sctx,
goto out;
}
+ ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0],
+ struct btrfs_extent_item);
+ /*
+ * Backreference walking (iterate_extent_inodes() below) is currently
+ * too expensive when an extent has a large number of references, both
+ * in time spent and used memory. So for now just fallback to write
+ * operations instead of clone operations when an extent has more than
+ * a certain amount of references.
+ */
+ if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) {
+ ret = -ENOENT;
+ goto out;
+ }
+ btrfs_release_path(tmp_path);
+
/*
* Setup the clone roots.
*/
@@ -5835,68 +5858,21 @@ static int changed_extent(struct send_ctx *sctx,
{
int ret = 0;
- if (sctx->cur_ino != sctx->cmp_key->objectid) {
-
- if (result == BTRFS_COMPARE_TREE_CHANGED) {
- struct extent_buffer *leaf_l;
- struct extent_buffer *leaf_r;
- struct btrfs_file_extent_item *ei_l;
- struct btrfs_file_extent_item *ei_r;
-
- leaf_l = sctx->left_path->nodes[0];
- leaf_r = sctx->right_path->nodes[0];
- ei_l = btrfs_item_ptr(leaf_l,
- sctx->left_path->slots[0],
- struct btrfs_file_extent_item);
- ei_r = btrfs_item_ptr(leaf_r,
- sctx->right_path->slots[0],
- struct btrfs_file_extent_item);
-
- /*
- * We may have found an extent item that has changed
- * only its disk_bytenr field and the corresponding
- * inode item was not updated. This case happens due to
- * very specific timings during relocation when a leaf
- * that contains file extent items is COWed while
- * relocation is ongoing and its in the stage where it
- * updates data pointers. So when this happens we can
- * safely ignore it since we know it's the same extent,
- * but just at different logical and physical locations
- * (when an extent is fully replaced with a new one, we
- * know the generation number must have changed too,
- * since snapshot creation implies committing the current
- * transaction, and the inode item must have been updated
- * as well).
- * This replacement of the disk_bytenr happens at
- * relocation.c:replace_file_extents() through
- * relocation.c:btrfs_reloc_cow_block().
- */
- if (btrfs_file_extent_generation(leaf_l, ei_l) ==
- btrfs_file_extent_generation(leaf_r, ei_r) &&
- btrfs_file_extent_ram_bytes(leaf_l, ei_l) ==
- btrfs_file_extent_ram_bytes(leaf_r, ei_r) &&
- btrfs_file_extent_compression(leaf_l, ei_l) ==
- btrfs_file_extent_compression(leaf_r, ei_r) &&
- btrfs_file_extent_encryption(leaf_l, ei_l) ==
- btrfs_file_extent_encryption(leaf_r, ei_r) &&
- btrfs_file_extent_other_encoding(leaf_l, ei_l) ==
- btrfs_file_extent_other_encoding(leaf_r, ei_r) &&
- btrfs_file_extent_type(leaf_l, ei_l) ==
- btrfs_file_extent_type(leaf_r, ei_r) &&
- btrfs_file_extent_disk_bytenr(leaf_l, ei_l) !=
- btrfs_file_extent_disk_bytenr(leaf_r, ei_r) &&
- btrfs_file_extent_disk_num_bytes(leaf_l, ei_l) ==
- btrfs_file_extent_disk_num_bytes(leaf_r, ei_r) &&
- btrfs_file_extent_offset(leaf_l, ei_l) ==
- btrfs_file_extent_offset(leaf_r, ei_r) &&
- btrfs_file_extent_num_bytes(leaf_l, ei_l) ==
- btrfs_file_extent_num_bytes(leaf_r, ei_r))
- return 0;
- }
-
- inconsistent_snapshot_error(sctx, result, "extent");
- return -EIO;
- }
+ /*
+ * We have found an extent item that changed without the inode item
+ * having changed. This can happen either after relocation (where the
+ * disk_bytenr of an extent item is replaced at
+ * relocation.c:replace_file_extents()) or after deduplication into a
+ * file in both the parent and send snapshots (where an extent item can
+ * get modified or replaced with a new one). Note that deduplication
+ * updates the inode item, but it only changes the iversion (sequence
+ * field in the inode item) of the inode, so if a file is deduplicated
+ * the same amount of times in both the parent and send snapshots, its
+ * iversion becames the same in both snapshots, whence the inode item is
+ * the same on both snapshots.
+ */
+ if (sctx->cur_ino != sctx->cmp_key->objectid)
+ return 0;
if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
if (result != BTRFS_COMPARE_TREE_DELETED)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a7b69deb6d70..9286603a6a98 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1809,6 +1809,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
}
if (btrfs_super_log_root(fs_info->super_copy) != 0) {
+ btrfs_warn(fs_info,
+ "mount required to replay tree-log, cannot remount read-write");
ret = -EINVAL;
goto restore;
}
@@ -2164,7 +2166,15 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
*/
thresh = 4 * 1024 * 1024;
- if (!mixed && total_free_meta - thresh < block_rsv->size)
+ /*
+ * We only want to claim there's no available space if we can no longer
+ * allocate chunks for our metadata profile and our global reserve will
+ * not fit in the free metadata space. If we aren't ->full then we
+ * still can allocate chunks and thus are fine using the currently
+ * calculated f_bavail.
+ */
+ if (!mixed && block_rsv->space_info->full &&
+ total_free_meta - thresh < block_rsv->size)
buf->f_bavail = 0;
buf->f_type = BTRFS_SUPER_MAGIC;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 1f157fba8940..510cad48e519 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -751,7 +751,12 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
fs_devs->fsid_kobj.kset = btrfs_kset;
error = kobject_init_and_add(&fs_devs->fsid_kobj,
&btrfs_ktype, parent, "%pU", fs_devs->fsid);
- return error;
+ if (error) {
+ kobject_put(&fs_devs->fsid_kobj);
+ return error;
+ }
+
+ return 0;
}
int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index bf62ad919a95..9edc2674b8a7 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -112,7 +112,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
spin_lock_init(&fs_info->qgroup_op_lock);
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
- spin_lock_init(&fs_info->tree_mod_seq_lock);
mutex_init(&fs_info->qgroup_ioctl_lock);
mutex_init(&fs_info->qgroup_rescan_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index a724d9a79bd2..5e3b875d87e2 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -476,9 +476,9 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
root->node = alloc_test_extent_buffer(root->fs_info,
nodesize, nodesize);
- if (!root->node) {
- test_msg("Couldn't allocate dummy buffer\n");
- ret = -ENOMEM;
+ if (IS_ERR(root->node)) {
+ test_msg("couldn't allocate dummy buffer\n");
+ ret = PTR_ERR(root->node);
goto out;
}
btrfs_set_header_level(root->node, 0);
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 9c6666692341..e0aa6b9786fa 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -488,9 +488,9 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
*/
root->node = alloc_test_extent_buffer(root->fs_info, nodesize,
nodesize);
- if (!root->node) {
+ if (IS_ERR(root->node)) {
test_msg("Couldn't allocate dummy buffer\n");
- ret = -ENOMEM;
+ ret = PTR_ERR(root->node);
goto out;
}
btrfs_set_header_level(root->node, 0);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index fd6c74662e9a..31df020634cd 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1917,6 +1917,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_transaction *prev_trans = NULL;
int ret;
+ /*
+ * Some places just start a transaction to commit it. We need to make
+ * sure that if this commit fails that the abort code actually marks the
+ * transaction as failed, so set trans->dirty to make the abort code do
+ * the right thing.
+ */
+ trans->dirty = true;
+
/* Stop the commit early if ->aborted is set */
if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
ret = cur_trans->aborted;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 47d11a30bee7..f79682937faf 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2827,6 +2827,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
log->log_transid = root->log_transid;
root->log_start_pid = 0;
/*
+ * Update or create log root item under the root's log_mutex to prevent
+ * races with concurrent log syncs that can lead to failure to update
+ * log root item because it was not created yet.
+ */
+ ret = update_log_root(trans, log);
+ /*
* IO has been started, blocks of the log tree have WRITTEN flag set
* in their headers. new modifications of the log will be written to
* new positions. so it's safe to allow log writers to go in.
@@ -2845,8 +2851,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&log_root_tree->log_mutex);
- ret = update_log_root(trans, log);
-
mutex_lock(&log_root_tree->log_mutex);
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
/*
@@ -3343,9 +3347,16 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- /* find the first key from this transaction again */
+ /*
+ * Find the first key from this transaction again. See the note for
+ * log_new_dir_dentries, if we're logging a directory recursively we
+ * won't be holding its i_mutex, which means we can modify the directory
+ * while we're logging it. If we remove an entry between our first
+ * search and this search we'll not find the key again and can just
+ * bail.
+ */
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
- if (WARN_ON(ret != 0))
+ if (ret != 0)
goto done;
/*
@@ -4432,13 +4443,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, extent) ==
- BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_inline_len(leaf,
- path->slots[0],
- extent);
- ASSERT(len == i_size);
+ BTRFS_FILE_EXTENT_INLINE)
return 0;
- }
len = btrfs_file_extent_num_bytes(leaf, extent);
/* Last extent goes beyond i_size, no need to log a hole. */
@@ -4835,7 +4841,7 @@ again:
err = btrfs_log_inode(trans, root, other_inode,
LOG_OTHER_INODE,
0, LLONG_MAX, ctx);
- iput(other_inode);
+ btrfs_add_delayed_iput(other_inode);
if (err)
goto out_unlock;
else
@@ -5253,7 +5259,7 @@ process_leaf:
}
if (btrfs_inode_in_log(di_inode, trans->transid)) {
- iput(di_inode);
+ btrfs_add_delayed_iput(di_inode);
break;
}
@@ -5265,7 +5271,7 @@ process_leaf:
if (!ret &&
btrfs_must_commit_transaction(trans, di_inode))
ret = 1;
- iput(di_inode);
+ btrfs_add_delayed_iput(di_inode);
if (ret)
goto next_dir_inode;
if (ctx->log_new_dentries) {
@@ -5411,7 +5417,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (!ret && ctx && ctx->log_new_dentries)
ret = log_new_dir_dentries(trans, root,
dir_inode, ctx);
- iput(dir_inode);
+ btrfs_add_delayed_iput(dir_inode);
if (ret)
goto out;
}
@@ -5691,9 +5697,28 @@ again:
wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
if (IS_ERR(wc.replay_dest)) {
ret = PTR_ERR(wc.replay_dest);
+
+ /*
+ * We didn't find the subvol, likely because it was
+ * deleted. This is ok, simply skip this log and go to
+ * the next one.
+ *
+ * We need to exclude the root because we can't have
+ * other log replays overwriting this log as we'll read
+ * it back in a few more times. This will keep our
+ * block from being modified, and we'll just bail for
+ * each subsequent pass.
+ */
+ if (ret == -ENOENT)
+ ret = btrfs_pin_extent_for_log_replay(fs_info->extent_root,
+ log->node->start,
+ log->node->len);
free_extent_buffer(log->node);
free_extent_buffer(log->commit_root);
kfree(log);
+
+ if (!ret)
+ goto next;
btrfs_handle_fs_error(fs_info, ret,
"Couldn't read target root for tree log recovery.");
goto error;
@@ -5725,7 +5750,6 @@ again:
&root->highest_objectid);
}
- key.offset = found_key.offset - 1;
wc.replay_dest->log_root = NULL;
free_extent_buffer(log->node);
free_extent_buffer(log->commit_root);
@@ -5733,9 +5757,10 @@ again:
if (ret)
goto error;
-
+next:
if (found_key.offset == 0)
break;
+ key.offset = found_key.offset - 1;
}
btrfs_release_path(path);
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 83bb2f2aa83c..ee1c76cf8886 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -335,6 +335,8 @@ again_search_slot:
}
if (ret < 0 && ret != -ENOENT)
goto out;
+ key.offset++;
+ goto again_search_slot;
}
item_size -= sizeof(subid_le);
offset += sizeof(subid_le);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c063ac57c30e..70aa22a8a9cc 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4876,6 +4876,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
for (i = 0; i < map->num_stripes; i++) {
num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
+ map->stripes[i].dev->has_pending_chunks = true;
}
spin_lock(&extent_root->fs_info->free_chunk_lock);
@@ -5071,8 +5072,7 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map)
if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_DUP)) {
+ BTRFS_BLOCK_GROUP_RAID5)) {
max_errors = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
max_errors = 2;
@@ -7250,6 +7250,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
for (i = 0; i < map->num_stripes; i++) {
dev = map->stripes[i].dev;
dev->commit_bytes_used = dev->bytes_used;
+ dev->has_pending_chunks = false;
}
}
unlock_chunks(root);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 9c09aa29d6bd..96c1b847def6 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -62,6 +62,11 @@ struct btrfs_device {
spinlock_t io_lock ____cacheline_aligned;
int running_pending;
+ /* When true means this device has pending chunk alloc in
+ * current transaction. Protected by chunk_mutex.
+ */
+ bool has_pending_chunks;
+
/* regular prio bios */
struct btrfs_pending_bios pending_bios;
/* WRITE_SYNC bios */
@@ -307,7 +312,6 @@ struct btrfs_bio {
u64 map_type; /* get from map_lookup->type */
bio_end_io_t *end_io;
struct bio *orig_bio;
- unsigned long flags;
void *private;
atomic_t error;
int max_errors;