// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include #include "ctree.h" #include "disk-io.h" #include "print-tree.h" #include "transaction.h" #include "locking.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* * When auto defrag is enabled we queue up these defrag structs to remember * which inodes need defragging passes. */ struct inode_defrag { struct rb_node rb_node; /* Inode number */ u64 ino; /* * Transid where the defrag was added, we search for extents newer than * this. */ u64 transid; /* Root objectid */ u64 root; /* * The extent size threshold for autodefrag. * * This value is different for compressed/non-compressed extents, thus * needs to be passed from higher layer. * (aka, inode_should_defrag()) */ u32 extent_thresh; }; static int __compare_inode_defrag(struct inode_defrag *defrag1, struct inode_defrag *defrag2) { if (defrag1->root > defrag2->root) return 1; else if (defrag1->root < defrag2->root) return -1; else if (defrag1->ino > defrag2->ino) return 1; else if (defrag1->ino < defrag2->ino) return -1; else return 0; } /* * Pop a record for an inode into the defrag tree. The lock must be held * already. * * If you're inserting a record for an older transid than an existing record, * the transid already in the tree is lowered. * * If an existing record is found the defrag item you pass in is freed. */ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, struct inode_defrag *defrag) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct inode_defrag *entry; struct rb_node **p; struct rb_node *parent = NULL; int ret; p = &fs_info->defrag_inodes.rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct inode_defrag, rb_node); ret = __compare_inode_defrag(defrag, entry); if (ret < 0) p = &parent->rb_left; else if (ret > 0) p = &parent->rb_right; else { /* * If we're reinserting an entry for an old defrag run, * make sure to lower the transid of our existing * record. */ if (defrag->transid < entry->transid) entry->transid = defrag->transid; entry->extent_thresh = min(defrag->extent_thresh, entry->extent_thresh); return -EEXIST; } } set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); rb_link_node(&defrag->rb_node, parent, p); rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes); return 0; } static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) { if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) return 0; if (btrfs_fs_closing(fs_info)) return 0; return 1; } /* * Insert a defrag record for this inode if auto defrag is enabled. */ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u32 extent_thresh) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct inode_defrag *defrag; u64 transid; int ret; if (!__need_auto_defrag(fs_info)) return 0; if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) return 0; if (trans) transid = trans->transid; else transid = inode->root->last_trans; defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); if (!defrag) return -ENOMEM; defrag->ino = btrfs_ino(inode); defrag->transid = transid; defrag->root = root->root_key.objectid; defrag->extent_thresh = extent_thresh; spin_lock(&fs_info->defrag_inodes_lock); if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) { /* * If we set IN_DEFRAG flag and evict the inode from memory, * and then re-read this inode, this new inode doesn't have * IN_DEFRAG flag. At the case, we may find the existed defrag. */ ret = __btrfs_add_inode_defrag(inode, defrag); if (ret) kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } else { kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } spin_unlock(&fs_info->defrag_inodes_lock); return 0; } /* * Pick the defragable inode that we want, if it doesn't exist, we will get the * next one. */ static struct inode_defrag *btrfs_pick_defrag_inode( struct btrfs_fs_info *fs_info, u64 root, u64 ino) { struct inode_defrag *entry = NULL; struct inode_defrag tmp; struct rb_node *p; struct rb_node *parent = NULL; int ret; tmp.ino = ino; tmp.root = root; spin_lock(&fs_info->defrag_inodes_lock); p = fs_info->defrag_inodes.rb_node; while (p) { parent = p; entry = rb_entry(parent, struct inode_defrag, rb_node); ret = __compare_inode_defrag(&tmp, entry); if (ret < 0) p = parent->rb_left; else if (ret > 0) p = parent->rb_right; else goto out; } if (parent && __compare_inode_defrag(&tmp, entry) > 0) { parent = rb_next(parent); if (parent) entry = rb_entry(parent, struct inode_defrag, rb_node); else entry = NULL; } out: if (entry) rb_erase(parent, &fs_info->defrag_inodes); spin_unlock(&fs_info->defrag_inodes_lock); return entry; } void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) { struct inode_defrag *defrag; struct rb_node *node; spin_lock(&fs_info->defrag_inodes_lock); node = rb_first(&fs_info->defrag_inodes); while (node) { rb_erase(node, &fs_info->defrag_inodes); defrag = rb_entry(node, struct inode_defrag, rb_node); kmem_cache_free(btrfs_inode_defrag_cachep, defrag); cond_resched_lock(&fs_info->defrag_inodes_lock); node = rb_first(&fs_info->defrag_inodes); } spin_unlock(&fs_info->defrag_inodes_lock); } #define BTRFS_DEFRAG_BATCH 1024 static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct inode_defrag *defrag) { struct btrfs_root *inode_root; struct inode *inode; struct btrfs_ioctl_defrag_range_args range; int ret = 0; u64 cur = 0; again: if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) goto cleanup; if (!__need_auto_defrag(fs_info)) goto cleanup; /* Get the inode */ inode_root = btrfs_get_fs_root(fs_info, defrag->root, true); if (IS_ERR(inode_root)) { ret = PTR_ERR(inode_root); goto cleanup; } inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root); btrfs_put_root(inode_root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto cleanup; } if (cur >= i_size_read(inode)) { iput(inode); goto cleanup; } /* Do a chunk of defrag */ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); memset(&range, 0, sizeof(range)); range.len = (u64)-1; range.start = cur; range.extent_thresh = defrag->extent_thresh; sb_start_write(fs_info->sb); ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, BTRFS_DEFRAG_BATCH); sb_end_write(fs_info->sb); iput(inode); if (ret < 0) goto cleanup; cur = max(cur + fs_info->sectorsize, range.start); goto again; cleanup: kmem_cache_free(btrfs_inode_defrag_cachep, defrag); return ret; } /* * Run through the list of inodes in the FS that need defragging. */ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) { struct inode_defrag *defrag; u64 first_ino = 0; u64 root_objectid = 0; atomic_inc(&fs_info->defrag_running); while (1) { /* Pause the auto defragger. */ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) break; if (!__need_auto_defrag(fs_info)) break; /* find an inode to defrag */ defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino); if (!defrag) { if (root_objectid || first_ino) { root_objectid = 0; first_ino = 0; continue; } else { break; } } first_ino = defrag->ino + 1; root_objectid = defrag->root; __btrfs_run_defrag_inode(fs_info, defrag); } atomic_dec(&fs_info->defrag_running); /* * During unmount, we use the transaction_wait queue to wait for the * defragger to stop. */ wake_up(&fs_info->transaction_wait); return 0; } /* * Defrag all the leaves in a given btree. * Read all the leaves and try to get key order to * better reflect disk order */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_path *path = NULL; struct btrfs_key key; int ret = 0; int wret; int level; int next_key_ret = 0; u64 last_ret = 0; if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) goto out; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } level = btrfs_header_level(root->node); if (level == 0) goto out; if (root->defrag_progress.objectid == 0) { struct extent_buffer *root_node; u32 nritems; root_node = btrfs_lock_root_node(root); nritems = btrfs_header_nritems(root_node); root->defrag_max.objectid = 0; /* from above we know this is not a leaf */ btrfs_node_key_to_cpu(root_node, &root->defrag_max, nritems - 1); btrfs_tree_unlock(root_node); free_extent_buffer(root_node); memset(&key, 0, sizeof(key)); } else { memcpy(&key, &root->defrag_progress, sizeof(key)); } path->keep_locks = 1; ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); if (ret < 0) goto out; if (ret > 0) { ret = 0; goto out; } btrfs_release_path(path); /* * We don't need a lock on a leaf. btrfs_realloc_node() will lock all * leafs from path->nodes[1], so set lowest_level to 1 to avoid later * a deadlock (attempting to write lock an already write locked leaf). */ path->lowest_level = 1; wret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (wret < 0) { ret = wret; goto out; } if (!path->nodes[1]) { ret = 0; goto out; } /* * The node at level 1 must always be locked when our path has * keep_locks set and lowest_level is 1, regardless of the value of * path->slots[1]. */ BUG_ON(path->locks[1] == 0); ret = btrfs_realloc_node(trans, root, path->nodes[1], 0, &last_ret, &root->defrag_progress); if (ret) { WARN_ON(ret == -EAGAIN); goto out; } /* * Now that we reallocated the node we can find the next key. Note that * btrfs_find_next_key() can release our path and do another search * without COWing, this is because even with path->keep_locks = 1, * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a * node when path->slots[node_level - 1] does not point to the last * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore * we search for the next key after reallocating our node. */ path->slots[1] = btrfs_header_nritems(path->nodes[1]); next_key_ret = btrfs_find_next_key(root, path, &key, 1, BTRFS_OLDEST_GENERATION); if (next_key_ret == 0) { memcpy(&root->defrag_progress, &key, sizeof(key)); ret = -EAGAIN; } out: btrfs_free_path(path); if (ret == -EAGAIN) { if (root->defrag_max.objectid > root->defrag_progress.objectid) goto done; if (root->defrag_max.type > root->defrag_progress.type) goto done; if (root->defrag_max.offset > root->defrag_progress.offset) goto done; ret = 0; } done: if (ret != -EAGAIN) memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); return ret; } void __cold btrfs_auto_defrag_exit(void) { kmem_cache_destroy(btrfs_inode_defrag_cachep); } int __init btrfs_auto_defrag_init(void) { btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", sizeof(struct inode_defrag), 0, SLAB_MEM_SPREAD, NULL); if (!btrfs_inode_defrag_cachep) return -ENOMEM; return 0; }