From b4e005e9589dcf89a5fe53ffbb4adf007d44fffb Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Mon, 5 Aug 2019 14:31:53 -0400
Subject: [PATCH 01/48] btrfs: add a force_chunk_alloc to space_info's sysfs

In testing various things such as the btrfsck patch to detect over
allocation of chunks, empty block group deletion, and balance, I've had
various ways to force chunk allocations for debug purposes. Add a sysfs
file to enable forcing of chunk allocation for the owning space info, in
order to enable us to add test cases in the future that exercise these
various features more easily.

[HH: rebased for 5.14]
Signed-off-by: Josef Bacik
---
 fs/btrfs/sysfs.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 25a6f58..71627d4 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -92,6 +92,7 @@ static struct btrfs_feature_attr btrfs_attr_features_##_name = { \
 
 static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
 static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj);
+static inline struct kobject *get_btrfs_kobj(struct kobject *kobj);
 
 static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a)
 {
@@ -699,6 +700,58 @@ static struct kobj_type btrfs_raid_ktype = {
 	.default_groups = raid_groups,
 };
 
+static ssize_t btrfs_space_info_force_chunk_alloc_show(struct kobject *kobj,
+						       struct kobj_attribute *a,
+						       char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "0\n");
+}
+
+static ssize_t btrfs_space_info_force_chunk_alloc(struct kobject *kobj,
+						  struct kobj_attribute *a,
+						  const char *buf, size_t len)
+{
+	struct btrfs_space_info *space_info = to_space_info(kobj);
+	struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj));
+	struct btrfs_trans_handle *trans;
+	unsigned long val;
+	int ret;
+
+	if (!fs_info) {
+		printk(KERN_ERR "couldn't get fs_info\n");
+		return -EPERM;
+	}
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (sb_rdonly(fs_info->sb))
+		return -EROFS;
+
+	ret = kstrtoul(buf, 10, &val);
+	if (ret)
+		return ret;
+
+	/*
+	 * We don't really care, but if we echo 0 > force it seems silly to do
+	 * anything.
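+	 * Any positive value requests exactly one forced chunk allocation
+	 * for this space_info, e.g. "echo 1 > force_chunk_alloc".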
+	 */
+	if (val == 0)
+		return -EINVAL;
+
+	trans = btrfs_start_transaction(fs_info->extent_root, 0);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+	ret = btrfs_force_chunk_alloc(trans, space_info->flags);
+	btrfs_end_transaction(trans);
+	if (ret == 1)
+		return len;
+	return -ENOSPC;
+}
+BTRFS_ATTR_RW(space_info, force_chunk_alloc,
+	      btrfs_space_info_force_chunk_alloc_show,
+	      btrfs_space_info_force_chunk_alloc);
+
 #define SPACE_INFO_ATTR(field)						\
 static ssize_t btrfs_space_info_show_##field(struct kobject *kobj,	\
 					     struct kobj_attribute *a,	\
@@ -736,6 +789,7 @@ static struct attribute *space_info_attrs[] = {
 	BTRFS_ATTR_PTR(space_info, bytes_zone_unusable),
 	BTRFS_ATTR_PTR(space_info, disk_used),
 	BTRFS_ATTR_PTR(space_info, disk_total),
+	BTRFS_ATTR_PTR(space_info, force_chunk_alloc),
 	NULL,
 };
 ATTRIBUTE_GROUPS(space_info);
@@ -1103,6 +1157,16 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
 	return to_fs_devs(kobj)->fs_info;
 }
 
+static inline struct kobject *get_btrfs_kobj(struct kobject *kobj)
+{
+	while (kobj) {
+		if (kobj->ktype == &btrfs_ktype)
+			return kobj;
+		kobj = kobj->parent;
+	}
+	return NULL;
+}
+
 #define NUM_FEATURE_BITS 64
 #define BTRFS_FEATURE_NAME_MAX 13
 static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX];
-- 
2.34.1.75.gabe6bb3905

From 0d16e1dde07cac163f1571f58e28c58ee859a333 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 31 Aug 2021 15:30:31 +0100
Subject: [PATCH 02/48] btrfs: check if a log tree exists at inode_logged()

In case an inode was never logged since it was loaded from disk and was
modified in the current transaction (its ->last_trans matches the ID of
the current transaction), inode_logged() returns true even if there's no
existing log tree. In this case we can simply check if a log tree exists
and return false if it does not. This avoids a caller of inode_logged()
doing some unnecessary, but harmless, work.

For btrfs_log_new_name() it avoids logging an inode in case it was never
logged since it was loaded from disk and there is currently no log tree
for the inode's root. For the remaining callers of inode_logged(),
btrfs_del_dir_entries_in_log() and btrfs_del_inode_ref_in_log(), it has
no effect since they already check if a log tree exists through their
calls to join_running_log_trans().

So just add a check to inode_logged() to verify if a log tree exists, and
return false if it does not.

This patch is part of a patch set comprised of the following patches:

  btrfs: check if a log tree exists at inode_logged()
  btrfs: remove no longer needed checks for NULL log context
  btrfs: do not log new dentries when logging that a new name exists
  btrfs: always update the logged transaction when logging new names
  btrfs: avoid expensive search when dropping inode items from log
  btrfs: add helper to truncate inode items when logging inode
  btrfs: avoid expensive search when truncating inode items from the log
  btrfs: avoid search for logged i_size when logging inode if possible
  btrfs: avoid attempt to drop extents when logging inode for the first time
  btrfs: do not commit delayed inode when logging a file in full sync mode

This is patch 1/10 and test results are listed in the change log of the
last patch in the set.
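As an illustration of the case this helps, consider the following sketch
of a userspace sequence (hypothetical paths, not part of this patch): the
inode is dirtied in the current transaction but never fsynced, so when the
link operation ends up in btrfs_log_new_name(), no log tree exists yet and
the new check makes inode_logged() return false right away.

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          int fd = open("/mnt/btrfs/foo", O_CREAT | O_WRONLY, 0644);

          if (fd < 0) {
                  perror("open");
                  return 1;
          }
          /* Dirty the inode in the current transaction, but don't fsync. */
          if (write(fd, "hello", 5) != 5)
                  perror("write");
          close(fd);

          /*
           * This calls btrfs_log_new_name(); with this patch,
           * inode_logged() bails out early since no log tree exists.
           */
          if (link("/mnt/btrfs/foo", "/mnt/btrfs/bar") != 0)
                  perror("link");

          return 0;
  }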
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b79da89..21ebc12 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3456,6 +3456,9 @@ static bool inode_logged(struct btrfs_trans_handle *trans, if (inode->logged_trans == trans->transid) return true; + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) + return false; + /* * The inode's logged_trans is always 0 when we load it (because it is * not persisted in the inode item or elsewhere). So if it is 0, the -- 2.34.1.75.gabe6bb3905 From 6578d79955278b4e27060183b67a7923862b0071 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 31 Aug 2021 15:30:32 +0100 Subject: [PATCH 03/48] btrfs: remove no longer needed checks for NULL log context Since commit 75b463d2b47aef ("btrfs: do not commit logs and transactions during link and rename operations"), we always pass a non-NULL log context to btrfs_log_inode_parent() and therefore to all the functions that it calls. So remove the checks we have all over the place that test for a NULL log context, making the code shorter and easier to read, as well as reducing the size of the generated code. This patch is part of a patch set comprised of the following patches: btrfs: check if a log tree exists at inode_logged() btrfs: remove no longer needed checks for NULL log context btrfs: do not log new dentries when logging that a new name exists btrfs: always update the logged transaction when logging new names btrfs: avoid expensive search when dropping inode items from log btrfs: add helper to truncate inode items when logging inode btrfs: avoid expensive search when truncating inode items from the log btrfs: avoid search for logged i_size when logging inode if possible btrfs: avoid attempt to drop extents when logging inode for the first time btrfs: do not commit delayed inode when logging a file in full sync mode This is patch 2/10 and test results are listed in the change log of the last patch in the set. 
Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/tree-log.c | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 21ebc12..3cf4433 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -207,7 +207,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 	}
 
 	atomic_inc(&root->log_writers);
-	if (ctx && !ctx->logging_new_name) {
+	if (!ctx->logging_new_name) {
 		int index = root->log_transid % 2;
 		list_add_tail(&ctx->list, &root->log_ctxs[index]);
 		ctx->log_transid = root->log_transid;
@@ -3043,9 +3043,6 @@ static void wait_for_writer(struct btrfs_root *root)
 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
 					struct btrfs_log_ctx *ctx)
 {
-	if (!ctx)
-		return;
-
 	mutex_lock(&root->log_mutex);
 	list_del_init(&ctx->list);
 	mutex_unlock(&root->log_mutex);
@@ -3786,8 +3783,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 			 */
 			di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
 			btrfs_dir_item_key_to_cpu(src, di, &tmp);
-			if (ctx &&
-			    (btrfs_dir_transid(src, di) == trans->transid ||
+			if ((btrfs_dir_transid(src, di) == trans->transid ||
 			     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
 			    tmp.type != BTRFS_ROOT_ITEM_KEY)
 				ctx->log_new_dentries = true;
@@ -5246,7 +5242,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
 						&other_ino, &other_parent);
 			if (ret < 0) {
 				return ret;
-			} else if (ret > 0 && ctx &&
+			} else if (ret > 0 &&
 				   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
 				if (ins_nr > 0) {
 					ins_nr++;
@@ -5587,8 +5583,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	 * So keep it simple for this case and just don't flag the ancestors as
 	 * logged.
 	 */
-	if (!ctx ||
-	    !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
+	if (!(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
 	      &inode->vfs_inode != ctx->inode)) {
 		spin_lock(&inode->lock);
 		inode->logged_trans = trans->transid;
@@ -5941,11 +5936,10 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
 			continue;
 		}
 
-		if (ctx)
-			ctx->log_new_dentries = false;
+		ctx->log_new_dentries = false;
 		ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
 				      LOG_INODE_ALL, ctx);
-		if (!ret && ctx && ctx->log_new_dentries)
+		if (!ret && ctx->log_new_dentries)
 			ret = log_new_dir_dentries(trans, root,
 						   BTRFS_I(dir_inode), ctx);
 		btrfs_add_delayed_iput(dir_inode);
@@ -6206,7 +6200,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 		goto end_trans;
 	}
 
-	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
+	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
 		log_dentries = true;
 
 	/*
-- 
2.34.1.75.gabe6bb3905

From fc90c0512000926566de7b365c9edf1fb9091acf Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 31 Aug 2021 15:30:33 +0100
Subject: [PATCH 04/48] btrfs: do not log new dentries when logging that a new name exists

When logging a new name for an inode, due to a link or rename operation,
we don't need to log all new dentries of the parent directories and their
subdirectories. We only want to log the names of the inode and that any
new parent directories exist. So in this case don't trigger logging of
the new dentries, that is only needed when doing an explicit fsync on a
directory or on a file which requires logging its parent directories.

This avoids unnecessary work and reduces contention on the extent buffers
of a log tree.
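To make the scenario concrete, here is a hypothetical sketch (made-up
paths, illustrative only): the rename at the end logs the new name for
"foo" and that "dir" exists, but with this patch it no longer also logs
the unrelated new dentries inside "dir".

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/stat.h>
  #include <unistd.h>

  int main(void)
  {
          char name[64];
          int fd, i;

          mkdir("/mnt/btrfs/dir", 0755);
          /* Create many new dentries in the directory, all unlogged. */
          for (i = 0; i < 100; i++) {
                  snprintf(name, sizeof(name), "/mnt/btrfs/dir/f%d", i);
                  fd = open(name, O_CREAT | O_WRONLY, 0644);
                  close(fd);
          }

          fd = open("/mnt/btrfs/foo", O_CREAT | O_WRONLY, 0644);
          fsync(fd);      /* foo is now in the log tree */
          close(fd);

          /*
           * Logs foo's new name and that "dir" exists; the 100 unrelated
           * dentries above are no longer logged as part of this rename.
           */
          rename("/mnt/btrfs/foo", "/mnt/btrfs/dir/foo");
          return 0;
  }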
This patch is part of a patch set comprised of the following patches: btrfs: check if a log tree exists at inode_logged() btrfs: remove no longer needed checks for NULL log context btrfs: do not log new dentries when logging that a new name exists btrfs: always update the logged transaction when logging new names btrfs: avoid expensive search when dropping inode items from log btrfs: add helper to truncate inode items when logging inode btrfs: avoid expensive search when truncating inode items from the log btrfs: avoid search for logged i_size when logging inode if possible btrfs: avoid attempt to drop extents when logging inode for the first time btrfs: do not commit delayed inode when logging a file in full sync mode This is patch 3/10 and test results are listed in the change log of the last patch in the set. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3cf4433..ba6a753 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5716,6 +5716,14 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_dir_list *dir_elem; int ret = 0; + /* + * If we are logging a new name, as part of a link or rename operation, + * don't bother logging new dentries, as we just want to log the names + * of an inode and that any new parents exist. + */ + if (ctx->logging_new_name) + return 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; -- 2.34.1.75.gabe6bb3905 From 44148ad99712c4daf6e04cee991f9e7a59eed0da Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 31 Aug 2021 15:30:34 +0100 Subject: [PATCH 05/48] btrfs: always update the logged transaction when logging new names When we are logging a new name for an inode, due to a link or rename operation, if the inode has ancestor inodes that are new, created in the current transaction, we need to log that these inodes exist. To ensure that a subsequent explicit fsync on one of these ancestor inodes does sync the log, we don't set the logged_trans field of these inodes. This was done in commit 75b463d2b47aef ("btrfs: do not commit logs and transactions during link and rename operations"), to avoid syncing a log after a rename or link operation. In order to allow for future changes to do some optimizations, change this behaviour to always update the logged_trans of any logged inode and don't update the last_log_commit of the inode if we are logging that it exists. This accomplishes that same objective with simpler logic, allowing for some optimizations in the next patches. So just do that simplification. This patch is part of a patch set comprised of the following patches: btrfs: check if a log tree exists at inode_logged() btrfs: remove no longer needed checks for NULL log context btrfs: do not log new dentries when logging that a new name exists btrfs: always update the logged transaction when logging new names btrfs: avoid expensive search when dropping inode items from log btrfs: add helper to truncate inode items when logging inode btrfs: avoid expensive search when truncating inode items from the log btrfs: avoid search for logged i_size when logging inode if possible btrfs: avoid attempt to drop extents when logging inode for the first time btrfs: do not commit delayed inode when logging a file in full sync mode This is patch 4/10 and test results are listed in the change log of the last patch in the set. 
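For context, the fsync fast path decides whether an inode is already fully
in the log with a test roughly like the sketch below (paraphrased and
simplified from btrfs_inode_in_log(); the real check also compares against
the root's last committed log transaction). It shows why unconditionally
setting logged_trans is safe as long as last_log_commit is not updated for
LOG_INODE_EXISTS: fsync only skips logging when last_log_commit has caught
up with last_sub_trans.

  /* Paraphrased, simplified sketch - not the exact upstream function. */
  static bool inode_in_log(struct btrfs_inode *inode, u64 generation)
  {
          bool ret = false;

          spin_lock(&inode->lock);
          if (inode->logged_trans == generation &&
              inode->last_sub_trans <= inode->last_log_commit)
                  ret = true;
          spin_unlock(&inode->lock);
          return ret;
  }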
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 73 +++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 39 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ba6a753..8b90eae 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5573,47 +5573,42 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } } + spin_lock(&inode->lock); + inode->logged_trans = trans->transid; /* - * If we are logging that an ancestor inode exists as part of logging a - * new name from a link or rename operation, don't mark the inode as - * logged - otherwise if an explicit fsync is made against an ancestor, - * the fsync considers the inode in the log and doesn't sync the log, - * resulting in the ancestor missing after a power failure unless the - * log was synced as part of an fsync against any other unrelated inode. - * So keep it simple for this case and just don't flag the ancestors as - * logged. + * Don't update last_log_commit if we logged that an inode exists. + * We do this for three reasons: + * + * 1) We might have had buffered writes to this inode that were + * flushed and had their ordered extents completed in this + * transaction, but we did not previously log the inode with + * LOG_INODE_ALL. Later the inode was evicted and after that + * it was loaded again and this LOG_INODE_EXISTS log operation + * happened. We must make sure that if an explicit fsync against + * the inode is performed later, it logs the new extents, an + * updated inode item, etc, and syncs the log. The same logic + * applies to direct IO writes instead of buffered writes. + * + * 2) When we log the inode with LOG_INODE_EXISTS, its inode item + * is logged with an i_size of 0 or whatever value was logged + * before. If later the i_size of the inode is increased by a + * truncate operation, the log is synced through an fsync of + * some other inode and then finally an explicit fsync against + * this inode is made, we must make sure this fsync logs the + * inode with the new i_size, the hole between old i_size and + * the new i_size, and syncs the log. + * + * 3) If we are logging that an ancestor inode exists as part of + * logging a new name from a link or rename operation, don't update + * its last_log_commit - otherwise if an explicit fsync is made + * against an ancestor, the fsync considers the inode in the log + * and doesn't sync the log, resulting in the ancestor missing after + * a power failure unless the log was synced as part of an fsync + * against any other unrelated inode. */ - if (!(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name && - &inode->vfs_inode != ctx->inode)) { - spin_lock(&inode->lock); - inode->logged_trans = trans->transid; - /* - * Don't update last_log_commit if we logged that an inode exists. - * We do this for two reasons: - * - * 1) We might have had buffered writes to this inode that were - * flushed and had their ordered extents completed in this - * transaction, but we did not previously log the inode with - * LOG_INODE_ALL. Later the inode was evicted and after that - * it was loaded again and this LOG_INODE_EXISTS log operation - * happened. We must make sure that if an explicit fsync against - * the inode is performed later, it logs the new extents, an - * updated inode item, etc, and syncs the log. The same logic - * applies to direct IO writes instead of buffered writes. 
- * - * 2) When we log the inode with LOG_INODE_EXISTS, its inode item - * is logged with an i_size of 0 or whatever value was logged - * before. If later the i_size of the inode is increased by a - * truncate operation, the log is synced through an fsync of - * some other inode and then finally an explicit fsync against - * this inode is made, we must make sure this fsync logs the - * inode with the new i_size, the hole between old i_size and - * the new i_size, and syncs the log. - */ - if (inode_only != LOG_INODE_EXISTS) - inode->last_log_commit = inode->last_sub_trans; - spin_unlock(&inode->lock); - } + if (inode_only != LOG_INODE_EXISTS) + inode->last_log_commit = inode->last_sub_trans; + spin_unlock(&inode->lock); out_unlock: mutex_unlock(&inode->log_mutex); -- 2.34.1.75.gabe6bb3905 From bbac7613a5fa25c2e6fe697e9c168d506287773e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 31 Aug 2021 15:30:35 +0100 Subject: [PATCH 06/48] btrfs: avoid expensive search when dropping inode items from log Whenever we are logging a directory inode, logging that an inode exists or logging an inode that has changes in its references or xattrs, we attempt to delete items of this inode we may have previously logged (through calls to drop_objectid_items()). That attempt does a btree search for deletion, which is expensive because it always acquires write locks for extent buffers at levels 2, 1 and 0, and it balances any node that is less than half full. Acquiring the write locks can block the task if the extent buffers are already locked or block other tasks attempting to lock them, which is specially bad in case of log trees since they are small due to their short life, with a root node at a level typically not greater than level 2. If we know that we are logging the inode for the first time in the current transaction, we can skip the search. This change does that. This patch is part of a patch set comprised of the following patches: btrfs: check if a log tree exists at inode_logged() btrfs: remove no longer needed checks for NULL log context btrfs: do not log new dentries when logging that a new name exists btrfs: always update the logged transaction when logging new names btrfs: avoid expensive search when dropping inode items from log btrfs: add helper to truncate inode items when logging inode btrfs: avoid expensive search when truncating inode items from the log btrfs: avoid search for logged i_size when logging inode if possible btrfs: avoid attempt to drop extents when logging inode for the first time btrfs: do not commit delayed inode when logging a file in full sync mode This is patch 5/10 and test results are listed in the change log of the last patch in the set. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8b90eae..9f85e7b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3885,17 +3885,21 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, * This cannot be run for file data extents because it does not * free the extents they point to. 
*/ -static int drop_objectid_items(struct btrfs_trans_handle *trans, +static int drop_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, - u64 objectid, int max_key_type) + struct btrfs_inode *inode, + int max_key_type) { int ret; struct btrfs_key key; struct btrfs_key found_key; int start_slot; - key.objectid = objectid; + if (!inode_logged(trans, inode)) + return 0; + + key.objectid = btrfs_ino(inode); key.type = max_key_type; key.offset = (u64)-1; @@ -3912,7 +3916,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - if (found_key.objectid != objectid) + if (found_key.objectid != key.objectid) break; found_key.offset = 0; @@ -5446,7 +5450,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); if (inode_only == LOG_INODE_EXISTS) max_key_type = BTRFS_XATTR_ITEM_KEY; - ret = drop_objectid_items(trans, log, path, ino, max_key_type); + ret = drop_inode_items(trans, log, path, inode, max_key_type); } else { if (inode_only == LOG_INODE_EXISTS) { /* @@ -5470,8 +5474,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, &inode->runtime_flags)) { if (inode_only == LOG_INODE_EXISTS) { max_key.type = BTRFS_XATTR_ITEM_KEY; - ret = drop_objectid_items(trans, log, path, ino, - max_key.type); + ret = drop_inode_items(trans, log, path, inode, + max_key.type); } else { clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); @@ -5490,8 +5494,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (inode_only == LOG_INODE_ALL) fast_search = true; max_key.type = BTRFS_XATTR_ITEM_KEY; - ret = drop_objectid_items(trans, log, path, ino, - max_key.type); + ret = drop_inode_items(trans, log, path, inode, + max_key.type); } else { if (inode_only == LOG_INODE_ALL) fast_search = true; -- 2.34.1.75.gabe6bb3905 From ad6255024cf5957880c91ca17a7706bef6b21be1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 31 Aug 2021 15:30:36 +0100 Subject: [PATCH 07/48] btrfs: add helper to truncate inode items when logging inode Move the call to btrfs_truncate_inode_items(), and the surrounding retry loop, into a local helper function. This avoids some repetition and avoids making the next change a bit awkward due to a bit of too much indentation. This patch is part of a patch set comprised of the following patches: btrfs: check if a log tree exists at inode_logged() btrfs: remove no longer needed checks for NULL log context btrfs: do not log new dentries when logging that a new name exists btrfs: always update the logged transaction when logging new names btrfs: avoid expensive search when dropping inode items from log btrfs: add helper to truncate inode items when logging inode btrfs: avoid expensive search when truncating inode items from the log btrfs: avoid search for logged i_size when logging inode if possible btrfs: avoid attempt to drop extents when logging inode for the first time btrfs: do not commit delayed inode when logging a file in full sync mode This is patch 6/10 and test results are listed in the change log of the last patch in the set. 
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9f85e7b..44bed47 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3941,6 +3941,21 @@ static int drop_inode_items(struct btrfs_trans_handle *trans, return ret; } +static int truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log_root, + struct btrfs_inode *inode, + u64 new_size, u32 min_type) +{ + int ret; + + do { + ret = btrfs_truncate_inode_items(trans, log_root, inode, + new_size, min_type, NULL); + } while (ret == -EAGAIN); + + return ret; +} + static void fill_inode_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf, struct btrfs_inode_item *item, @@ -4529,13 +4544,9 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, * Avoid logging extent items logged in past fsync calls * and leading to duplicate keys in the log tree. */ - do { - ret = btrfs_truncate_inode_items(trans, - root->log_root, - inode, truncate_offset, - BTRFS_EXTENT_DATA_KEY, - NULL); - } while (ret == -EAGAIN); + ret = truncate_inode_items(trans, root->log_root, inode, + truncate_offset, + BTRFS_EXTENT_DATA_KEY); if (ret) goto out; dropped_extents = true; @@ -5481,12 +5492,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, &inode->runtime_flags); clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); - while(1) { - ret = btrfs_truncate_inode_items(trans, - log, inode, 0, 0, NULL); - if (ret != -EAGAIN) - break; - } + ret = truncate_inode_items(trans, log, inode, 0, 0); } } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags) || -- 2.34.1.75.gabe6bb3905 From 3ccadf96b4f3597f9f6951a592a63703f110a539 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 31 Aug 2021 15:30:37 +0100 Subject: [PATCH 08/48] btrfs: avoid expensive search when truncating inode items from the log Whenever we are logging a file inode in full sync mode we call btrfs_truncate_inode_items() to delete items of the inode we may have previously logged. That results in doing a btree search for deletion, which is expensive because it always acquires write locks for extent buffers at levels 2, 1 and 0, and it balances any node that is less than half full. Acquiring the write locks can block the task if the extent buffers are already locked by another task or block other tasks attempting to lock them, which is specially bad in case of log trees since they are small due to their short life, with a root node at a level typically not greater than level 2. If we know that we are logging the inode for the first time in the current transaction, we can skip the call to btrfs_truncate_inode_items(), avoiding the deletion search. This change does that. 
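Condensed, the idea is the following sketch (not the exact control flow of
btrfs_log_inode(), which has more branches around this):

  if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                         &inode->runtime_flags)) {
          /*
           * Only delete previously logged items if the inode was logged
           * before in this transaction - otherwise the log tree holds
           * nothing for this inode and the deletion search is pure cost.
           */
          if (inode_logged(trans, inode))
                  ret = truncate_inode_items(trans, log, inode, 0, 0);
  }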
This patch is part of a patch set comprised of the following patches:

  btrfs: check if a log tree exists at inode_logged()
  btrfs: remove no longer needed checks for NULL log context
  btrfs: do not log new dentries when logging that a new name exists
  btrfs: always update the logged transaction when logging new names
  btrfs: avoid expensive search when dropping inode items from log
  btrfs: add helper to truncate inode items when logging inode
  btrfs: avoid expensive search when truncating inode items from the log
  btrfs: avoid search for logged i_size when logging inode if possible
  btrfs: avoid attempt to drop extents when logging inode for the first time
  btrfs: do not commit delayed inode when logging a file in full sync mode

This is patch 7/10 and test results are listed in the change log of the
last patch in the set.

Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/tree-log.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 44bed47..6f50907 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5492,7 +5492,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 				  &inode->runtime_flags);
 			clear_bit(BTRFS_INODE_COPY_EVERYTHING,
 				  &inode->runtime_flags);
-			ret = truncate_inode_items(trans, log, inode, 0, 0);
+			if (inode_logged(trans, inode))
+				ret = truncate_inode_items(trans, log,
+							   inode, 0, 0);
 		}
 	} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
 				      &inode->runtime_flags) ||
-- 
2.34.1.75.gabe6bb3905

From 76dd7a9648882c898c45a2a29ad67e218c60c172 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 31 Aug 2021 15:30:39 +0100
Subject: [PATCH 09/48] btrfs: avoid attempt to drop extents when logging inode for the first time

When logging an extent, in the fast fsync path, we always attempt to drop
or trim any existing extents with a range that matches or overlaps the
range of the extent we are about to log. We do that through a call to
btrfs_drop_extents().

However this is not needed when we are logging the inode for the first
time in the current transaction, since we have no inode items of the
inode in the log tree. Calling btrfs_drop_extents() does a deletion search
on the log tree, which is expensive when we have concurrent tasks
accessing the log tree because a deletion search always acquires a write
lock on the extent buffers at levels 2, 1 and 0, adding significant lock
contention, especially taking into account that the height of a log tree
rarely (if ever) goes beyond 2 or 3, due to its short life.

So skip the call to btrfs_drop_extents() when the inode was not previously
logged in the current transaction.

This patch is part of a patch set comprised of the following patches:

  btrfs: check if a log tree exists at inode_logged()
  btrfs: remove no longer needed checks for NULL log context
  btrfs: do not log new dentries when logging that a new name exists
  btrfs: always update the logged transaction when logging new names
  btrfs: avoid expensive search when dropping inode items from log
  btrfs: add helper to truncate inode items when logging inode
  btrfs: avoid expensive search when truncating inode items from the log
  btrfs: avoid search for logged i_size when logging inode if possible
  btrfs: avoid attempt to drop extents when logging inode for the first time
  btrfs: do not commit delayed inode when logging a file in full sync mode

This is patch 9/10 and test results are listed in the change log of the
last patch in the set.
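A hypothetical sequence showing when the skip applies (made-up path,
sketch only, and assuming both fsyncs land in the same transaction): the
first fsync finds the inode not yet logged and can skip
btrfs_drop_extents(), while the second fsync still performs the drop
because the extent may have been logged by the first one.

  #include <fcntl.h>
  #include <unistd.h>

  int main(void)
  {
          int fd = open("/mnt/btrfs/foo", O_CREAT | O_WRONLY, 0644);

          pwrite(fd, "a", 1, 0);
          fsync(fd);  /* first log of the inode: btrfs_drop_extents() skipped */
          pwrite(fd, "b", 1, 0);
          fsync(fd);  /* inode already logged: the drop/trim still runs */
          close(fd);
          return 0;
  }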
Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/tree-log.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6f50907..f0b71d3 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4379,14 +4379,25 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	if (ret)
 		return ret;
 
-	drop_args.path = path;
-	drop_args.start = em->start;
-	drop_args.end = em->start + em->len;
-	drop_args.replace_extent = true;
-	drop_args.extent_item_size = sizeof(*fi);
-	ret = btrfs_drop_extents(trans, log, inode, &drop_args);
-	if (ret)
-		return ret;
+	/*
+	 * If this is the first time we are logging the inode in the current
+	 * transaction, we can avoid btrfs_drop_extents(), which is expensive
+	 * because it does a deletion search, which always acquires write locks
+	 * for extent buffers at levels 2, 1 and 0. This not only wastes time
+	 * but also adds significant contention in a log tree, since log trees
+	 * are small, with a root at level 2 or 3 at most, due to their short
+	 * life span.
+	 */
+	if (inode_logged(trans, inode)) {
+		drop_args.path = path;
+		drop_args.start = em->start;
+		drop_args.end = em->start + em->len;
+		drop_args.replace_extent = true;
+		drop_args.extent_item_size = sizeof(*fi);
+		ret = btrfs_drop_extents(trans, log, inode, &drop_args);
+		if (ret)
+			return ret;
+	}
 
 	if (!drop_args.extent_inserted) {
 		key.objectid = btrfs_ino(inode);
-- 
2.34.1.75.gabe6bb3905

From af838e3d1e00a00aaac4386212f7fd8496da0d68 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 31 Aug 2021 15:30:38 +0100
Subject: [PATCH 10/48] btrfs: avoid search for logged i_size when logging inode if possible

If we are logging that an inode exists and the inode was not logged
before, we can avoid searching in the log tree for the inode item since
we know it does not exist. The search wastes time and adds more lock
contention on the extent buffers of the log tree when there are other
tasks that are logging other inodes.

This patch is part of a patch set comprised of the following patches:

  btrfs: check if a log tree exists at inode_logged()
  btrfs: remove no longer needed checks for NULL log context
  btrfs: do not log new dentries when logging that a new name exists
  btrfs: always update the logged transaction when logging new names
  btrfs: avoid expensive search when dropping inode items from log
  btrfs: add helper to truncate inode items when logging inode
  btrfs: avoid expensive search when truncating inode items from the log
  btrfs: avoid search for logged i_size when logging inode if possible
  btrfs: avoid attempt to drop extents when logging inode for the first time
  btrfs: do not commit delayed inode when logging a file in full sync mode

This is patch 8/10 and test results are listed in the change log of the
last patch in the set.

Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/tree-log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f0b71d3..f976001 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5474,7 +5474,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 		max_key_type = BTRFS_XATTR_ITEM_KEY;
 		ret = drop_inode_items(trans, log, path, inode, max_key_type);
 	} else {
-		if (inode_only == LOG_INODE_EXISTS) {
+		if (inode_only == LOG_INODE_EXISTS && inode_logged(trans, inode)) {
 			/*
 			 * Make sure the new inode item we write to the log has
 			 * the same isize as the current one (if it exists).
-- 2.34.1.75.gabe6bb3905 From 067d066a2c5f0ad43c82e1d8927977205fb41210 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 31 Aug 2021 15:30:40 +0100 Subject: [PATCH 11/48] btrfs: do not commit delayed inode when logging a file in full sync mode When logging a regular file in full sync mode, we are currently committing its delayed inode item. This is to ensure that we never miss copying the inode item, with its most up to date data, into the log tree. However that is not necessary since commit e4545de5b035 ("Btrfs: fix fsync data loss after append write"), because even if we don't find the leaf with the inode item when looking for leaves that changed in the current transaction, we end up logging the inode item later using the in-memory content. In case we find the leaf containing the inode item, we already end up using the in-memory inode for filling the inode item in the log tree, and not the inode item that is in the fs/subvolume tree, as it might be not up to date (copy_items() -> fill_inode_item()). So don't commit the delayed inode item, which brings a couple of benefits: 1) Avoid writing the inode item to the fs/subvolume btree, saving time and reducing lock contention on the btree; 2) In case no other item for the inode was changed, added or deleted in the same leaf where the inode item is located, we ended up copying all the items in that leaf to the log tree - it's harmless from a functional point of view, but it wastes time and log tree space. This patch is part of a patch set comprised of the following patches: btrfs: check if a log tree exists at inode_logged() btrfs: remove no longer needed checks for NULL log context btrfs: do not log new dentries when logging that a new name exists btrfs: always update the logged transaction when logging new names btrfs: avoid expensive search when dropping inode items from log btrfs: add helper to truncate inode items when logging inode btrfs: avoid expensive search when truncating inode items from the log btrfs: avoid search for logged i_size when logging inode if possible btrfs: avoid attempt to drop extents when logging inode for the first time btrfs: do not commit delayed inode when logging a file in full sync mode This is patch 10/10 and the following test results compare a branch with the whole patch set applied versus a branch without any of the patches applied. 
The following script was used to test dbench with 8 and 16 jobs on a
machine with 12 cores, 64G of RAM, a NVME device and using a non-debug
kernel config (Debian's default):

  $ cat test.sh
  #!/bin/bash

  if [ $# -ne 1 ]; then
      echo "Use $0 NUM_JOBS"
      exit 1
  fi

  NUM_JOBS=$1

  DEV=/dev/nvme0n1
  MNT=/mnt/nvme0n1
  MOUNT_OPTIONS="-o ssd"
  MKFS_OPTIONS="-m single -d single"

  echo "performance" | \
      tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

  mkfs.btrfs -f $MKFS_OPTIONS $DEV
  mount $MOUNT_OPTIONS $DEV $MNT

  dbench -D $MNT -t 120 $NUM_JOBS

  umount $MNT

The results were the following:

8 jobs, before patchset:

 Operation      Count    AvgLat    MaxLat
 ----------------------------------------
 NTCreateX    4113896     0.009   238.665
 Close        3021699     0.001     0.590
 Rename        174215     0.082   238.733
 Unlink        830977     0.049   238.642
 Deltree           96     2.232     8.022
 Mkdir             48     0.003     0.005
 Qpathinfo    3729013     0.005     2.672
 Qfileinfo     653206     0.001     0.152
 Qfsinfo       683866     0.002     0.526
 Sfileinfo     335055     0.004     1.571
 Find         1441800     0.016     4.288
 WriteX       2049644     0.010     3.982
 ReadX        6449786     0.003     0.969
 LockX          13400     0.002     0.043
 UnlockX        13400     0.001     0.075
 Flush         288349     2.521   245.516

Throughput 1075.73 MB/sec  8 clients  8 procs  max_latency=245.520 ms

8 jobs, after patchset:

 Operation      Count    AvgLat    MaxLat
 ----------------------------------------
 NTCreateX    4154282     0.009   156.675
 Close        3051450     0.001     0.843
 Rename        175912     0.072     4.444
 Unlink        839067     0.048    66.050
 Deltree           96     2.131     5.979
 Mkdir             48     0.002     0.004
 Qpathinfo    3765575     0.005     3.079
 Qfileinfo     659582     0.001     0.099
 Qfsinfo       690474     0.002     0.155
 Sfileinfo     338366     0.004     1.419
 Find         1455816     0.016     3.423
 WriteX       2069538     0.010     4.328
 ReadX        6512429     0.003     0.840
 LockX          13530     0.002     0.078
 UnlockX        13530     0.001     0.051
 Flush         291158     2.500   163.468

Throughput 1105.45 MB/sec  8 clients  8 procs  max_latency=163.474 ms

+2.7% throughput, -40.1% max latency

16 jobs, before patchset:

 Operation      Count    AvgLat    MaxLat
 ----------------------------------------
 NTCreateX    5457602     0.033   337.098
 Close        4008979     0.002     2.018
 Rename        231051     0.323   254.054
 Unlink       1102209     0.202   337.243
 Deltree          160     6.521    31.720
 Mkdir             80     0.003     0.007
 Qpathinfo    4946147     0.014     6.988
 Qfileinfo     867440     0.001     1.642
 Qfsinfo       907081     0.003     1.821
 Sfileinfo     444433     0.005     2.053
 Find         1912506     0.067     7.854
 WriteX       2724852     0.018     7.428
 ReadX        8553883     0.003     2.059
 LockX          17770     0.003     0.350
 UnlockX        17770     0.002     0.627
 Flush         382533     2.810   353.691

Throughput 1413.09 MB/sec  16 clients  16 procs  max_latency=353.696 ms

16 jobs, after patchset:

 Operation      Count    AvgLat    MaxLat
 ----------------------------------------
 NTCreateX    5393156     0.034   303.181
 Close        3961986     0.002     1.502
 Rename        228359     0.320   253.379
 Unlink       1088920     0.206   303.409
 Deltree          160     6.419    30.088
 Mkdir             80     0.003     0.004
 Qpathinfo    4887967     0.015     7.722
 Qfileinfo     857408     0.001     1.651
 Qfsinfo       896343     0.002     2.147
 Sfileinfo     439317     0.005     4.298
 Find         1890018     0.073     8.347
 WriteX       2693356     0.018     6.373
 ReadX        8453485     0.003     3.836
 LockX          17562     0.003     0.486
 UnlockX        17562     0.002     0.635
 Flush         378023     2.802   315.904

Throughput 1454.46 MB/sec  16 clients  16 procs  max_latency=315.910 ms

+2.9% throughput, -11.3% max latency

Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/tree-log.c | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f976001..381d88a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5418,22 +5418,11 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	 * Only run delayed items if we are a directory. We want to make sure
 	 * all directory indexes hit the fs/subvolume tree so we can find them
 	 * and figure out which index ranges have to be logged.
-	 *
-	 * Otherwise commit the delayed inode only if the full sync flag is set,
-	 * as we want to make sure an up to date version is in the subvolume
-	 * tree so copy_inode_items_to_log() / copy_items() can find it and copy
-	 * it to the log tree. For a non full sync, we always log the inode item
-	 * based on the in-memory struct btrfs_inode which is always up to date.
 	 */
-	if (S_ISDIR(inode->vfs_inode.i_mode))
-		ret = btrfs_commit_inode_delayed_items(trans, inode);
-	else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
-		ret = btrfs_commit_inode_delayed_inode(inode);
-
-	if (ret) {
-		btrfs_free_path(path);
-		btrfs_free_path(dst_path);
-		return ret;
+	if (S_ISDIR(inode->vfs_inode.i_mode)) {
+		err = btrfs_commit_inode_delayed_items(trans, inode);
+		if (err)
+			goto out;
 	}
 
 	if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
@@ -5634,7 +5623,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	spin_unlock(&inode->lock);
 out_unlock:
 	mutex_unlock(&inode->log_mutex);
-
+out:
 	btrfs_free_path(path);
 	btrfs_free_path(dst_path);
 	return err;
-- 
2.34.1.75.gabe6bb3905

From de8736f9d78a33a01ebcae832c1e232576e22cfd Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Thu, 16 Sep 2021 11:32:10 +0100
Subject: [PATCH 12/48] btrfs: remove root argument from btrfs_log_inode() and its callees

The root argument passed to btrfs_log_inode() is unnecessary, as it is
always the root of the inode we are going to log. This root also gets
unnecessarily propagated to several functions called by btrfs_log_inode(),
and all of them take the inode as an argument as well. So just remove the
root argument from these functions and have them get the root from the
inode where needed.

This patch is part of a patchset comprised of the following 5 patches:

  btrfs: remove root argument from btrfs_log_inode() and its callees
  btrfs: remove redundant log root assignment from log_dir_items()
  btrfs: factor out the copying loop of dir items from log_dir_items()
  btrfs: insert items in batches when logging a directory when possible
  btrfs: keep track of the last logged keys when logging a directory

This is patch 1/5. The change log of the last patch (5/5) has performance
results.
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 52 +++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 381d88a..b075477 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -94,7 +94,7 @@ enum { }; static int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_inode *inode, int inode_only, struct btrfs_log_ctx *ctx); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, @@ -3642,13 +3642,14 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, * to replay anything deleted before the fsync */ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, int key_type, struct btrfs_log_ctx *ctx, u64 min_offset, u64 *last_offset_ret) { struct btrfs_key min_key; + struct btrfs_root *root = inode->root; struct btrfs_root *log = root->log_root; struct extent_buffer *src; int err = 0; @@ -3849,7 +3850,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * key logged by this transaction. */ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, struct btrfs_log_ctx *ctx) @@ -3863,7 +3864,7 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, min_key = 0; max_key = 0; while (1) { - ret = log_dir_items(trans, root, inode, path, dst_path, key_type, + ret = log_dir_items(trans, inode, path, dst_path, key_type, ctx, min_key, &max_key); if (ret) return ret; @@ -4360,13 +4361,13 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, } static int log_one_extent(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, struct btrfs_root *root, + struct btrfs_inode *inode, const struct extent_map *em, struct btrfs_path *path, struct btrfs_log_ctx *ctx) { struct btrfs_drop_extents_args drop_args = { 0 }; - struct btrfs_root *log = root->log_root; + struct btrfs_root *log = inode->root->log_root; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; struct btrfs_map_token token; @@ -4584,7 +4585,6 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, } static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_log_ctx *ctx) @@ -4649,7 +4649,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, write_unlock(&tree->lock); - ret = log_one_extent(trans, inode, root, em, path, ctx); + ret = log_one_extent(trans, inode, em, path, ctx); write_lock(&tree->lock); clear_em_logging(tree, em); free_extent_map(em); @@ -4738,11 +4738,11 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, * with a journal, ext3/4, xfs, f2fs, etc). 
*/ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path) { + struct btrfs_root *root = inode->root; int ret; struct btrfs_key key; const u64 ino = btrfs_ino(inode); @@ -4816,10 +4816,10 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, * truncate operation that changes the inode's size. */ static int btrfs_log_holes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; const u64 ino = btrfs_ino(inode); @@ -5096,7 +5096,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) { ret = PTR_ERR(inode); } else { - ret = btrfs_log_inode(trans, root, + ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE_ALL, ctx); @@ -5156,8 +5156,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * well because during a rename we pin the log and update the * log with the new name before we unpin it. */ - ret = btrfs_log_inode(trans, root, BTRFS_I(inode), - LOG_OTHER_INODE, ctx); + ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx); if (ret) { btrfs_add_delayed_iput(inode); continue; @@ -5368,7 +5367,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, * This handles both files and directories. */ static int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_inode *inode, int inode_only, struct btrfs_log_ctx *ctx) { @@ -5376,7 +5375,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_path *dst_path; struct btrfs_key min_key; struct btrfs_key max_key; - struct btrfs_root *log = root->log_root; + struct btrfs_root *log = inode->root->log_root; int err = 0; int ret = 0; bool fast_search = false; @@ -5526,14 +5525,14 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); + err = btrfs_log_all_xattrs(trans, inode, path, dst_path); if (err) goto out_unlock; xattrs_logged = true; if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_holes(trans, root, inode, path); + err = btrfs_log_holes(trans, inode, path); if (err) goto out_unlock; } @@ -5553,16 +5552,14 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * BTRFS_INODE_COPY_EVERYTHING set. 
*/ if (!xattrs_logged && inode->logged_trans < trans->transid) { - err = btrfs_log_all_xattrs(trans, root, inode, path, - dst_path); + err = btrfs_log_all_xattrs(trans, inode, path, dst_path); if (err) goto out_unlock; btrfs_release_path(path); } } if (fast_search) { - ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - ctx); + ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx); if (ret) { err = ret; goto out_unlock; @@ -5577,8 +5574,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { - ret = log_directory_changes(trans, root, inode, path, dst_path, - ctx); + ret = log_directory_changes(trans, inode, path, dst_path, ctx); if (ret) { err = ret; goto out_unlock; @@ -5807,7 +5803,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, ctx->log_new_dentries = false; if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) log_mode = LOG_INODE_ALL; - ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), + ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); btrfs_add_delayed_iput(di_inode); if (ret) @@ -5952,7 +5948,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, } ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), + ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), LOG_INODE_ALL, ctx); if (!ret && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, root, @@ -6000,7 +5996,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->generation >= trans->transid && need_log_inode(trans, BTRFS_I(inode))) - ret = btrfs_log_inode(trans, root, BTRFS_I(inode), + ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); btrfs_add_delayed_iput(inode); if (ret) @@ -6055,7 +6051,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, if (inode->generation >= trans->transid && need_log_inode(trans, inode)) { - ret = btrfs_log_inode(trans, root, inode, + ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); if (ret) break; @@ -6198,7 +6194,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - ret = btrfs_log_inode(trans, root, inode, inode_only, ctx); + ret = btrfs_log_inode(trans, inode, inode_only, ctx); if (ret) goto end_trans; -- 2.34.1.75.gabe6bb3905 From b1e633c39c53a11f4f472e549d2a302242b06e06 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 16 Sep 2021 11:32:11 +0100 Subject: [PATCH 13/48] btrfs: remove redundant log root assignment from log_dir_items() At log_dir_items() we are assigning the exact same value to the local variable 'log', once when it's declared and once again shortly after. Remove the later assignment as it's pointless. This patch is part of a patchset comprised of the following 5 patches: btrfs: remove root argument from btrfs_log_inode() and its callees btrfs: remove redundant log root assignment from log_dir_items() btrfs: factor out the copying loop of dir items from log_dir_items() btrfs: insert items in batches when logging a directory when possible btrfs: keep track of the last logged keys when logging a directory This is patch 2/5. The change log of the last patch (5/5) has performance results. 
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b075477..7e15113 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3660,8 +3660,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, u64 last_offset = (u64)-1; u64 ino = btrfs_ino(inode); - log = root->log_root; - min_key.objectid = ino; min_key.type = key_type; min_key.offset = min_offset; -- 2.34.1.75.gabe6bb3905 From d480d52446cdcd67f3391c731b88753aeec667a8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 16 Sep 2021 11:32:12 +0100 Subject: [PATCH 14/48] btrfs: factor out the copying loop of dir items from log_dir_items() In preparation for the next change, move the loop that processes a leaf and copies its directory items to the log, into a separate helper function. This makes the next change simpler and it also helps making log_dir_items() a bit shorter (specially after the next change). This patch is part of a patchset comprised of the following 5 patches: btrfs: remove root argument from btrfs_log_inode() and its callees btrfs: remove redundant log root assignment from log_dir_items() btrfs: factor out the copying loop of dir items from log_dir_items() btrfs: insert items in batches when logging a directory when possible btrfs: keep track of the last logged keys when logging a directory This is patch 3/5. The change log of the last patch (5/5) has performance results. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 135 ++++++++++++++++++++++++-------------------- 1 file changed, 75 insertions(+), 60 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7e15113..5376fbc 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3636,6 +3636,66 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, return 0; } +static int process_dir_items_leaf(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path, + int key_type, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *log = inode->root->log_root; + struct extent_buffer *src = path->nodes[0]; + const int nritems = btrfs_header_nritems(src); + const u64 ino = btrfs_ino(inode); + int i; + + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_key key; + struct btrfs_dir_item *di; + int ret; + + btrfs_item_key_to_cpu(src, &key, i); + + if (key.objectid != ino || key.type != key_type) + return 1; + + ret = overwrite_item(trans, log, dst_path, src, i, &key); + if (ret < 0) + return ret; + + /* + * We must make sure that when we log a directory entry, the + * corresponding inode, after log replay, has a matching link + * count. For example: + * + * touch foo + * mkdir mydir + * sync + * ln foo mydir/bar + * xfs_io -c "fsync" mydir + * + * + * + * Would result in a fsync log that when replayed, our file inode + * would have a link count of 1, but we get two directory entries + * pointing to the same inode. After removing one of the names, + * it would not be possible to remove the other name, which + * resulted always in stale file handle errors, and would not be + * possible to rmdir the parent directory, since its i_size could + * never be decremented to the value BTRFS_EMPTY_DIR_SIZE, + * resulting in -ENOTEMPTY errors. 
+ */ + di = btrfs_item_ptr(src, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(src, di, &key); + if ((btrfs_dir_transid(src, di) == trans->transid || + btrfs_dir_type(src, di) == BTRFS_FT_DIR) && + key.type != BTRFS_ROOT_ITEM_KEY) + ctx->log_new_dentries = true; + } + + return 0; +} + /* * log all the items included in the current transaction for a given * directory. This also creates the range items in the log tree required @@ -3651,11 +3711,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_root *root = inode->root; struct btrfs_root *log = root->log_root; - struct extent_buffer *src; int err = 0; int ret; - int i; - int nritems; u64 first_offset = min_offset; u64 last_offset = (u64)-1; u64 ino = btrfs_ino(inode); @@ -3733,61 +3790,14 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * from our directory */ while (1) { - struct btrfs_key tmp; - src = path->nodes[0]; - nritems = btrfs_header_nritems(src); - for (i = path->slots[0]; i < nritems; i++) { - struct btrfs_dir_item *di; - - btrfs_item_key_to_cpu(src, &min_key, i); - - if (min_key.objectid != ino || min_key.type != key_type) - goto done; - - if (need_resched()) { - btrfs_release_path(path); - cond_resched(); - goto search; - } - - ret = overwrite_item(trans, log, dst_path, src, i, - &min_key); - if (ret) { + ret = process_dir_items_leaf(trans, inode, path, dst_path, + key_type, ctx); + if (ret != 0) { + if (ret < 0) err = ret; - goto done; - } - - /* - * We must make sure that when we log a directory entry, - * the corresponding inode, after log replay, has a - * matching link count. For example: - * - * touch foo - * mkdir mydir - * sync - * ln foo mydir/bar - * xfs_io -c "fsync" mydir - * - * - * - * Would result in a fsync log that when replayed, our - * file inode would have a link count of 1, but we get - * two directory entries pointing to the same inode. - * After removing one of the names, it would not be - * possible to remove the other name, which resulted - * always in stale file handle errors, and would not - * be possible to rmdir the parent directory, since - * its i_size could never decrement to the value - * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. 
- */ - di = btrfs_item_ptr(src, i, struct btrfs_dir_item); - btrfs_dir_item_key_to_cpu(src, di, &tmp); - if ((btrfs_dir_transid(src, di) == trans->transid || - btrfs_dir_type(src, di) == BTRFS_FT_DIR) && - tmp.type != BTRFS_ROOT_ITEM_KEY) - ctx->log_new_dentries = true; + goto done; } - path->slots[0] = nritems; + path->slots[0] = btrfs_header_nritems(path->nodes[0]); /* * look ahead to the next item and see if it is also @@ -3801,21 +3811,26 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, err = ret; goto done; } - btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); - if (tmp.objectid != ino || tmp.type != key_type) { + btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); + if (min_key.objectid != ino || min_key.type != key_type) { last_offset = (u64)-1; goto done; } if (btrfs_header_generation(path->nodes[0]) != trans->transid) { ret = overwrite_item(trans, log, dst_path, path->nodes[0], path->slots[0], - &tmp); + &min_key); if (ret) err = ret; else - last_offset = tmp.offset; + last_offset = min_key.offset; goto done; } + if (need_resched()) { + btrfs_release_path(path); + cond_resched(); + goto search; + } } done: btrfs_release_path(path); -- 2.34.1.75.gabe6bb3905 From 1dcaae99aed37b0e4037207c864364d39f727114 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 16 Sep 2021 11:32:13 +0100 Subject: [PATCH 15/48] btrfs: insert items in batches when logging a directory when possible When logging a directory, we scan its directory items from the subvolume tree and then copy them one by one into the log tree. This is not efficient since we generally are able to insert several items in a batch, using a single btree operation for adding several items at once. The reason we copy items one by one is that we must check if each item was previously logged in the current transaction, and if it was we either overwrite it or skip it in case its content did not change in the subvolume tree (this can happen only for dir item keys, but not for dir index keys), and doing such a check makes it a bit cumbersome to attempt batch insertions. However, opportunities for doing batch insertions arise very frequently, and they always happen when: 1) Logging the directory for the first time in the current transaction, as none of the items exist in the log tree yet; 2) Logging new dir index keys, because the offset for new dir index keys comes from a monotonically increasing counter. This means if we keep adding dentries to a directory, through creation of new files and sub-directories or by adding new links or renaming from some other directory into the one we are logging, all the new dir index keys have a new offset that is greater than the offset of any previously logged index keys, so we can insert them in batches into the log tree. For dir item keys, since their offset depends on the result of a hash function against the dentry's name, unless the directory is being logged for the first time in the current transaction, the chance of being able to insert the items in the log using batches is pretty much random and not predictable, as it depends on the names of the dentries, but still happens often enough. So change directory logging to keep track of consecutive directory items that don't exist yet in the log and batch insert them. 
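A minimal, self-contained userspace sketch of that batching pattern follows (all the names here - item_in_log(), insert_batch(), overwrite_one() - are hypothetical stand-ins, not btrfs functions): consecutive not-yet-logged items accumulate into a run, and hitting an already logged item overwrites it and then flushes the pending run. The ordering mirrors the patch: the overwrite happens before the flush, because the real code is still holding a btree path for the existing item at that point.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the log tree operations. */
static bool item_in_log(int key)
{
	return key % 7 == 0; /* pretend multiples of 7 were logged before */
}

static void insert_batch(const int *keys, int start, int count)
{
	printf("bulk insert of %d item(s), first key %d\n", count, keys[start]);
}

static void overwrite_one(int key)
{
	printf("overwrite already logged item with key %d\n", key);
}

/* Accumulate runs of not-yet-logged items and insert each run at once. */
static void process_leaf(const int *keys, int nritems)
{
	int batch_start = 0;
	int batch_size = 0;

	for (int i = 0; i < nritems; i++) {
		if (!item_in_log(keys[i])) {
			if (batch_size == 0)
				batch_start = i;
			batch_size++;
			continue;
		}
		/* An existing item ends the current run: overwrite, then flush. */
		overwrite_one(keys[i]);
		if (batch_size > 0) {
			insert_batch(keys, batch_start, batch_size);
			batch_size = 0;
		}
	}
	if (batch_size > 0)
		insert_batch(keys, batch_start, batch_size);
}

int main(void)
{
	const int keys[] = { 1, 2, 3, 7, 4, 5, 14, 6 };

	process_leaf(keys, (int)(sizeof(keys) / sizeof(keys[0])));
	return 0;
}
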
This patch is part of a patchset comprised of the following 5 patches: btrfs: remove root argument from btrfs_log_inode() and its callees btrfs: remove redundant log root assignment from log_dir_items() btrfs: factor out the copying loop of dir items from log_dir_items() btrfs: insert items in batches when logging a directory when possible btrfs: keep track of the last logged keys when logging a directory This is patch 4/5. The change log of the last patch (5/5) has performance results. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 217 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 180 insertions(+), 37 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5376fbc..8a1bb6a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -368,25 +368,11 @@ static int process_one_buffer(struct btrfs_root *log, return ret; } -/* - * Item overwrite used by replay and tree logging. eb, slot and key all refer - * to the src data we are copying out. - * - * root is the tree we are copying into, and path is a scratch - * path for use in this function (it should be released on entry and - * will be released on exit). - * - * If the key is already in the destination tree the existing item is - * overwritten. If the existing item isn't big enough, it is extended. - * If it is too large, it is truncated. - * - * If the key isn't in the destination yet, a new item is inserted. - */ -static noinline int overwrite_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static int do_overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) { int ret; u32 item_size; @@ -403,10 +389,22 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, item_size = btrfs_item_size_nr(eb, slot); src_ptr = btrfs_item_ptr_offset(eb, slot); - /* look for the key in the destination tree */ - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) - return ret; + /* Our caller must have done a search for the key for us. */ + ASSERT(path->nodes[0] != NULL); + + /* + * And the slot must point to the exact key or the slot where the key + * should be at (the first item with a key greater than 'key') + */ + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + struct btrfs_key found_key; + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + ret = btrfs_comp_cpu_keys(&found_key, key); + ASSERT(ret >= 0); + } else { + ret = 1; + } if (ret == 0) { char *src_copy; @@ -584,6 +582,36 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, return 0; } +/* + * Item overwrite used by replay and tree logging. eb, slot and key all refer + * to the src data we are copying out. + * + * root is the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and + * will be released on exit). + * + * If the key is already in the destination tree the existing item is + * overwritten. If the existing item isn't big enough, it is extended. + * If it is too large, it is truncated. + * + * If the key isn't in the destination yet, a new item is inserted. 
+ */ +static int overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + + /* Look for the key in the destination tree. */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + return ret; + + return do_overwrite_item(trans, root, path, eb, slot, key); +} + /* * simple helper to read an inode off the disk from a given root * This can only be called for subvolume roots and not for the log @@ -3636,6 +3664,68 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, return 0; } +static int flush_dir_items_batch(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct extent_buffer *src, + struct btrfs_path *dst_path, + int start_slot, + int count) +{ + char *ins_data = NULL; + struct btrfs_key *ins_keys; + u32 *ins_sizes; + struct extent_buffer *dst; + struct btrfs_key key; + u32 item_size; + int ret; + int i; + + ASSERT(count > 0); + + if (count == 1) { + btrfs_item_key_to_cpu(src, &key, start_slot); + item_size = btrfs_item_size_nr(src, start_slot); + ins_keys = &key; + ins_sizes = &item_size; + } else { + ins_data = kmalloc(count * sizeof(u32) + + count * sizeof(struct btrfs_key), GFP_NOFS); + if (!ins_data) + return -ENOMEM; + + ins_sizes = (u32 *)ins_data; + ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32)); + + for (i = 0; i < count; i++) { + const int slot = start_slot + i; + + btrfs_item_key_to_cpu(src, &ins_keys[i], slot); + ins_sizes[i] = btrfs_item_size_nr(src, slot); + } + } + + ret = btrfs_insert_empty_items(trans, log, dst_path, ins_keys, ins_sizes, + count); + if (ret) + goto out; + + dst = dst_path->nodes[0]; + for (i = 0; i < count; i++) { + unsigned long src_offset; + unsigned long dst_offset; + + dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0]); + src_offset = btrfs_item_ptr_offset(src, start_slot + i); + copy_extent_buffer(dst, src, dst_offset, src_offset, ins_sizes[i]); + dst_path->slots[0]++; + } + btrfs_release_path(dst_path); +out: + kfree(ins_data); + + return ret; +} + static int process_dir_items_leaf(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, @@ -3647,21 +3737,22 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, struct extent_buffer *src = path->nodes[0]; const int nritems = btrfs_header_nritems(src); const u64 ino = btrfs_ino(inode); + const bool inode_logged_before = inode_logged(trans, inode); + bool last_found = false; + int batch_start = 0; + int batch_size = 0; int i; for (i = path->slots[0]; i < nritems; i++) { struct btrfs_key key; - struct btrfs_dir_item *di; int ret; btrfs_item_key_to_cpu(src, &key, i); - if (key.objectid != ino || key.type != key_type) - return 1; - - ret = overwrite_item(trans, log, dst_path, src, i, &key); - if (ret < 0) - return ret; + if (key.objectid != ino || key.type != key_type) { + last_found = true; + break; + } /* * We must make sure that when we log a directory entry, the @@ -3685,15 +3776,67 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, * never be decremented to the value BTRFS_EMPTY_DIR_SIZE, * resulting in -ENOTEMPTY errors. 
 */ - di = btrfs_item_ptr(src, i, struct btrfs_dir_item); - btrfs_dir_item_key_to_cpu(src, di, &key); - if ((btrfs_dir_transid(src, di) == trans->transid || - btrfs_dir_type(src, di) == BTRFS_FT_DIR) && - key.type != BTRFS_ROOT_ITEM_KEY) - ctx->log_new_dentries = true; + if (!ctx->log_new_dentries) { + struct btrfs_dir_item *di; + struct btrfs_key di_key; + + di = btrfs_item_ptr(src, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(src, di, &di_key); + if ((btrfs_dir_transid(src, di) == trans->transid || + btrfs_dir_type(src, di) == BTRFS_FT_DIR) && + di_key.type != BTRFS_ROOT_ITEM_KEY) + ctx->log_new_dentries = true; + } + + if (!inode_logged_before) + goto add_to_batch; + /* + * Check if the key was already logged before. If not we can add + * it to a batch for bulk insertion. + */ + ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0); + if (ret < 0) { + return ret; + } else if (ret > 0) { + btrfs_release_path(dst_path); + goto add_to_batch; + } + + /* + * Item exists in the log. Overwrite the item in the log if it + * has different content or do nothing if it has exactly the same + * content. And then flush the current batch if any - do it after + * overwriting the current item, or we would deadlock otherwise, + * since we are holding a path for the existing item. + */ + ret = do_overwrite_item(trans, log, dst_path, src, i, &key); + if (ret < 0) + return ret; + + if (batch_size > 0) { + ret = flush_dir_items_batch(trans, log, src, dst_path, + batch_start, batch_size); + if (ret < 0) + return ret; + batch_size = 0; + } + continue; +add_to_batch: + if (batch_size == 0) + batch_start = i; + batch_size++; } - return 0; + if (batch_size > 0) { + int ret; + + ret = flush_dir_items_batch(trans, log, src, dst_path, + batch_start, batch_size); + if (ret < 0) + return ret; + } + + return last_found ? 1 : 0; } /* -- 2.34.1.75.gabe6bb3905 From b8bf23be96d623393792f7cb91bf210be1ed6a14 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 16 Sep 2021 11:32:14 +0100 Subject: [PATCH 16/48] btrfs: keep track of the last logged keys when logging a directory After the first time we log a directory in the current transaction, for each directory item in a changed leaf of the subvolume tree, we have to check if we previously logged the item, in order to overwrite it in case its data changed or skip it in case its data hasn't changed. Checking if we have logged each item before not only wastes time, but it also adds lock contention on the log tree. So in order to minimize the number of times we do such checks, keep track of the offset of the last key we logged for a directory and, the next time we log the directory, skip the checks for any new keys that have an offset greater than the offset we have previously saved. This is especially effective for index keys, because the offset for these keys comes from a monotonically increasing counter. This patch is part of a patchset comprised of the following 5 patches: btrfs: remove root argument from btrfs_log_inode() and its callees btrfs: remove redundant log root assignment from log_dir_items() btrfs: factor out the copying loop of dir items from log_dir_items() btrfs: insert items in batches when logging a directory when possible btrfs: keep track of the last logged keys when logging a directory This is patch 5/5. 
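A minimal userspace model of the skip check this patch adds (logged_before() is a hypothetical stand-in for the expensive search in the log tree): any key whose offset is beyond the last logged offset cannot be in the log, so it can go straight into the insertion batch without a search.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for a log tree search (slow, takes tree locks). */
static bool logged_before(uint64_t offset)
{
	printf("  expensive log tree search for offset %llu\n",
	       (unsigned long long)offset);
	return offset <= 100; /* pretend offsets up to 100 were logged */
}

int main(void)
{
	/* Offset of the last key logged for this directory (tracked state). */
	uint64_t last_logged = 100;
	const uint64_t offsets[] = { 90, 95, 101, 150, 200 };

	for (size_t i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++) {
		const uint64_t off = offsets[i];

		/* New offsets skip the search entirely (short-circuit). */
		if (off > last_logged || !logged_before(off))
			printf("add offset %llu to insertion batch\n",
			       (unsigned long long)off);
		else
			printf("offset %llu already logged, overwrite it\n",
			       (unsigned long long)off);

		if (off > last_logged)
			last_logged = off;
	}
	return 0;
}
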
The following test was used on a non-debug kernel to measure the impact it has on a directory fsync: $ cat test-dir-fsync.sh #!/bin/bash DEV=/dev/nvme0n1 MNT=/mnt/nvme0n1 NUM_NEW_FILES=100000 NUM_FILE_DELETES=1000 mkfs.btrfs -f $DEV mount -o ssd $DEV $MNT mkdir $MNT/testdir for ((i = 1; i <= $NUM_NEW_FILES; i++)); do echo -n > $MNT/testdir/file_$i done # fsync the directory, this will log the new dir items and the inodes # they point to, because these are new inodes. start=$(date +%s%N) xfs_io -c "fsync" $MNT/testdir end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "dir fsync took $dur ms after adding $NUM_NEW_FILES files" # sync to force transaction commit and wipeout the log. sync del_inc=$(( $NUM_NEW_FILES / $NUM_FILE_DELETES )) for ((i = 1; i <= $NUM_NEW_FILES; i += $del_inc)); do rm -f $MNT/testdir/file_$i done # fsync the directory, this will only log dir items, there are no # dentries pointing to new inodes. start=$(date +%s%N) xfs_io -c "fsync" $MNT/testdir end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "dir fsync took $dur ms after deleting $NUM_FILE_DELETES files" umount $MNT Test results with NUM_NEW_FILES set to 100 000 and 1 000 000: **** before patchset, 100 000 files, 1000 deletes **** dir fsync took 848 ms after adding 100000 files dir fsync took 175 ms after deleting 1000 files **** after patchset, 100 000 files, 1000 deletes **** dir fsync took 758 ms after adding 100000 files (-11.2%) dir fsync took 63 ms after deleting 1000 files (-94.1%) **** before patchset, 1 000 000 files, 1000 deletes **** dir fsync took 9945 ms after adding 1000000 files dir fsync took 473 ms after deleting 1000 files **** after patchset, 1 000 000 files, 1000 deletes **** dir fsync took 8677 ms after adding 1000000 files (-13.6%) dir fsync took 146 ms after deleting 1000 files (-105.6%) Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 39 ++++++++++++++++++++++++++++----------- fs/btrfs/inode.c | 6 ++++-- fs/btrfs/tree-log.c | 41 +++++++++++++++++++++++++++++++++++++++++ fs/btrfs/tree-log.h | 2 ++ 4 files changed, 75 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 76ee145..602b426 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -138,17 +138,34 @@ struct btrfs_inode { /* a local copy of root's last_log_commit */ int last_log_commit; - /* total number of bytes pending delalloc, used by stat to calc the - * real block usage of the file - */ - u64 delalloc_bytes; - - /* - * Total number of bytes pending delalloc that fall within a file - * range that is either a hole or beyond EOF (and no prealloc extent - * exists in the range). This is always <= delalloc_bytes. - */ - u64 new_delalloc_bytes; + union { + /* + * Total number of bytes pending delalloc, used by stat to + * calculate the real block usage of the file. This is used + * only for files. + */ + u64 delalloc_bytes; + /* + * The offset of the last dir item key that was logged. + * This is used only for directories. + */ + u64 last_dir_item_offset; + }; + + union { + /* + * Total number of bytes pending delalloc that fall within a file + * range that is either a hole or beyond EOF (and no prealloc extent + * exists in the range). This is always <= delalloc_bytes and this + * is used only for files. + */ + u64 new_delalloc_bytes; + /* + * The offset of the last dir index key that was logged. + * This is used only for directories. 
+ */ + u64 last_dir_index_offset; + }; /* * total number of bytes pending defrag, used by stat to check whether diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4af74b6..a2f2a45 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9158,8 +9158,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode) WARN_ON(inode->block_rsv.reserved); WARN_ON(inode->block_rsv.size); WARN_ON(inode->outstanding_extents); - WARN_ON(inode->delalloc_bytes); - WARN_ON(inode->new_delalloc_bytes); + if (!S_ISDIR(vfs_inode->i_mode)) { + WARN_ON(inode->delalloc_bytes); + WARN_ON(inode->new_delalloc_bytes); + } WARN_ON(inode->csum_bytes); WARN_ON(inode->defrag_bytes); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8a1bb6a..24ebbed 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3738,11 +3738,17 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, const int nritems = btrfs_header_nritems(src); const u64 ino = btrfs_ino(inode); const bool inode_logged_before = inode_logged(trans, inode); + u64 last_logged_key_offset; bool last_found = false; int batch_start = 0; int batch_size = 0; int i; + if (key_type == BTRFS_DIR_ITEM_KEY) + last_logged_key_offset = inode->last_dir_item_offset; + else + last_logged_key_offset = inode->last_dir_index_offset; + for (i = path->slots[0]; i < nritems; i++) { struct btrfs_key key; int ret; @@ -3754,6 +3760,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, break; } + ctx->last_dir_item_offset = key.offset; /* * We must make sure that when we log a directory entry, the * corresponding inode, after log replay, has a matching link @@ -3790,6 +3797,15 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, if (!inode_logged_before) goto add_to_batch; + + /* + * If we were logged before and have logged dir items, we can skip + * checking if any item with a key offset larger than the last one + * we logged is in the log tree, saving time and avoiding adding + * contention on the log tree. + */ + if (key.offset > last_logged_key_offset) + goto add_to_batch; /* * Check if the key was already logged before. If not we can add * it to a batch for bulk insertion. @@ -4016,9 +4032,31 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, int ret; int key_type = BTRFS_DIR_ITEM_KEY; + /* + * If this is the first time we are being logged in the current + * transaction, or we were logged before but the inode was evicted and + * reloaded later, in which case its logged_trans is 0, reset the values + * of the last logged key offsets. Note that we don't use the helper + * function inode_logged() here - that is because the function returns + * true after an inode eviction, assuming the worst case as it can not + * know for sure if the inode was logged before. So we can not skip key + * searches in the case the inode was evicted, because it may not have + * been logged in this transaction and may have been logged in a past + * transaction, so we need to reset the last dir item and index offsets + * to (u64)-1. 
+ */ + if (inode->logged_trans != trans->transid) { + inode->last_dir_item_offset = (u64)-1; + inode->last_dir_index_offset = (u64)-1; + } again: min_key = 0; max_key = 0; + if (key_type == BTRFS_DIR_ITEM_KEY) + ctx->last_dir_item_offset = inode->last_dir_item_offset; + else + ctx->last_dir_item_offset = inode->last_dir_index_offset; + while (1) { ret = log_dir_items(trans, inode, path, dst_path, key_type, ctx, min_key, &max_key); @@ -4030,8 +4068,11 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, } if (key_type == BTRFS_DIR_ITEM_KEY) { + inode->last_dir_item_offset = ctx->last_dir_item_offset; key_type = BTRFS_DIR_INDEX_KEY; goto again; + } else { + inode->last_dir_index_offset = ctx->last_dir_item_offset; } return 0; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 731bd9c..3ce6bdb 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -17,6 +17,8 @@ struct btrfs_log_ctx { int log_transid; bool log_new_dentries; bool logging_new_name; + /* Tracks the last logged dir item/index key offset. */ + u64 last_dir_item_offset; struct inode *inode; struct list_head list; /* Only used for fast fsyncs. */ -- 2.34.1.75.gabe6bb3905 From 1789a3b61ce8d9ee8b922847841edf76f3d0ab3d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 24 Sep 2021 12:28:13 +0100 Subject: [PATCH 17/48] btrfs: loop only once over data sizes array when inserting an item batch When inserting a batch of items into a btree, we end up looping over the data sizes array 3 times: 1) Once in the caller of btrfs_insert_empty_items(), when it populates the array with the data sizes for each item; 2) Once at btrfs_insert_empty_items() to sum the elements of the data sizes array and compute the total data size; 3) And then once again at setup_items_for_insert(), where we do exactly the same as what we do at btrfs_insert_empty_items(), to compute the total data size. That is not bad for small arrays, but when the arrays have hundreds of elements, the time spent on looping is not negligible. For example when doing batch inserts of delayed items for dir index items or when logging a directory, it's common to have 200 to 260 dir index items in a single batch when using a leaf size of 16K and using file names between 8 and 12 characters. For a 64K leaf size, multiply that by 4. Taking into account that during directory logging or when flushing delayed dir index items we can have many of those large batches, the time spent on the looping adds up quickly. It's also more important to avoid it at setup_items_for_insert(), since we are holding a write lock on a leaf and, in some cases, on upper nodes of the btree, which causes us to block other tasks that want to access the leaf and nodes for longer than necessary. So change the code so that setup_items_for_insert() and btrfs_insert_empty_items() no longer compute the total data size, and instead rely on the caller to supply it. This makes us loop over the array only once, where we can both populate the data size array and compute the total data size, taking advantage of spatial and temporal locality. To make this more manageable, use a structure to contain all the relevant details for a batch of items (keys array, data sizes array, total data size, number of items), and use it as an argument for btrfs_insert_empty_items() and setup_items_for_insert(). 
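A simplified userspace model of that change follows (struct item_batch and the 25-byte stand-in for the item header size are illustrative, not the real btrfs definitions; the real struct btrfs_item_batch also carries the keys array): the caller accumulates the total while filling the sizes array, so the insertion helper does a single addition instead of walking the array again while holding tree locks.

#include <stdint.h>
#include <stdio.h>

/* Illustrative counterpart of struct btrfs_item_batch. */
struct item_batch {
	const uint32_t *data_sizes;
	uint32_t total_data_size; /* precomputed by the caller */
	int nr;
};

static void insert_empty_items(const struct item_batch *batch)
{
	/* No loop here: the caller already summed the data sizes. */
	const uint32_t total = batch->total_data_size +
			       (uint32_t)batch->nr * 25; /* 25 ~ item header */

	printf("need %u bytes of leaf space for %d items\n", total, batch->nr);
}

int main(void)
{
	uint32_t sizes[4];
	struct item_batch batch = { .data_sizes = sizes, .nr = 4 };

	/* Single pass: populate the array and accumulate the total. */
	batch.total_data_size = 0;
	for (int i = 0; i < batch.nr; i++) {
		sizes[i] = 32 + (uint32_t)i; /* pretend item data sizes */
		batch.total_data_size += sizes[i];
	}
	insert_empty_items(&batch);
	return 0;
}
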
This patch is part of a small patchset that is comprised of the following patches: btrfs: loop only once over data sizes array when inserting an item batch btrfs: unexport setup_items_for_insert() btrfs: use single bulk copy operations when logging directories This is patch 1/3 and performance results, and the specific tests, are included in the changelog of patch 3/3. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 57 ++++++++++++---------------- fs/btrfs/ctree.h | 57 +++++++++++++++++++++++++--- fs/btrfs/delayed-inode.c | 41 ++++++++++---------- fs/btrfs/file.c | 3 +- fs/btrfs/inode.c | 8 +++- fs/btrfs/tests/extent-buffer-tests.c | 2 +- fs/btrfs/tests/inode-tests.c | 4 +- fs/btrfs/tree-log.c | 31 ++++++++++----- 8 files changed, 131 insertions(+), 72 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 84627cb..bf4d150 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3605,7 +3605,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, return ret; path->slots[0]++; - setup_items_for_insert(root, path, new_key, &item_size, 1); + setup_item_for_insert(root, path, new_key, item_size); leaf = path->nodes[0]; memcpy_extent_buffer(leaf, btrfs_item_ptr_offset(leaf, path->slots[0]), @@ -3785,13 +3785,10 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) * * @root: root we are inserting items to * @path: points to the leaf/slot where we are going to insert new items - * @cpu_key: array of keys for items to be inserted - * @data_size: size of the body of each item we are going to insert - * @nr: size of @cpu_key/@data_size arrays + * @batch: information about the batch of items to insert */ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, - const struct btrfs_key *cpu_key, u32 *data_size, - int nr) + const struct btrfs_item_batch *batch) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_item *item; @@ -3803,14 +3800,14 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, int slot; struct btrfs_map_token token; u32 total_size; - u32 total_data = 0; - - for (i = 0; i < nr; i++) - total_data += data_size[i]; - total_size = total_data + (nr * sizeof(struct btrfs_item)); + /* + * Before anything else, update keys in the parent and other ancestors + * if needed, then release the write locks on them, so that other tasks + * can use them while we modify the leaf. 
+ */ if (path->slots[0] == 0) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key); + btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]); fixup_low_keys(path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); @@ -3820,6 +3817,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(leaf); + total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); if (btrfs_leaf_free_space(leaf) < total_size) { btrfs_print_leaf(leaf); @@ -3849,31 +3847,32 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, item = btrfs_item_nr(i); ioff = btrfs_token_item_offset(&token, item); btrfs_set_token_item_offset(&token, item, - ioff - total_data); + ioff - batch->total_data_size); } /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr), btrfs_item_nr_offset(slot), (nritems - slot) * sizeof(struct btrfs_item)); /* shift the data */ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end - total_data, BTRFS_LEAF_DATA_OFFSET + - data_end, old_data - data_end); + data_end - batch->total_data_size, + BTRFS_LEAF_DATA_OFFSET + data_end, + old_data - data_end); data_end = old_data; } /* setup the item for the new data */ - for (i = 0; i < nr; i++) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + for (i = 0; i < batch->nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(slot + i); - data_end -= data_size[i]; + data_end -= batch->data_sizes[i]; btrfs_set_token_item_offset(&token, item, data_end); - btrfs_set_token_item_size(&token, item, data_size[i]); + btrfs_set_token_item_size(&token, item, batch->data_sizes[i]); } - btrfs_set_header_nritems(leaf, nritems + nr); + btrfs_set_header_nritems(leaf, nritems + batch->nr); btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(leaf) < 0) { @@ -3889,20 +3888,14 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const struct btrfs_key *cpu_key, u32 *data_size, - int nr) + const struct btrfs_item_batch *batch) { int ret = 0; int slot; - int i; - u32 total_size = 0; - u32 total_data = 0; - - for (i = 0; i < nr; i++) - total_data += data_size[i]; + u32 total_size; - total_size = total_data + (nr * sizeof(struct btrfs_item)); - ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); + ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1); if (ret == 0) return -EEXIST; if (ret < 0) @@ -3911,7 +3904,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, slot = path->slots[0]; BUG_ON(slot < 0); - setup_items_for_insert(root, path, cpu_key, data_size, nr); + setup_items_for_insert(root, path, batch); return 0; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ae06ad5..1ddf7d1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2892,16 +2892,56 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans, return btrfs_del_items(trans, root, path, path->slots[0], 1); } +/* + * Describes a batch of items to insert in a btree. This is used by + * btrfs_insert_empty_items() and setup_items_for_insert(). 
+ */ +struct btrfs_item_batch { + /* + * Pointer to an array containing the keys of the items to insert (in + * sorted order). + */ + const struct btrfs_key *keys; + /* Pointer to an array containing the data size for each item to insert. */ + const u32 *data_sizes; + /* + * The sum of data sizes for all items. The caller can compute this while + * setting up the data_sizes array, so it ends up being more efficient + * than having btrfs_insert_empty_items() or setup_item_for_insert() + * doing it, as it would avoid an extra loop over a potentially large + * array, and in the case of setup_item_for_insert(), we would be doing + * it while holding a write lock on a leaf and often on upper level nodes + * too, unnecessarily increasing the size of a critical section. + */ + u32 total_data_size; + /* Size of the keys and data_sizes arrays (number of items in the batch). */ + int nr; +}; + void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, - const struct btrfs_key *cpu_key, u32 *data_size, - int nr); + const struct btrfs_item_batch *batch); + +static inline void setup_item_for_insert(struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *key, + u32 data_size) +{ + struct btrfs_item_batch batch; + + batch.keys = key; + batch.data_sizes = &data_size; + batch.total_data_size = data_size; + batch.nr = 1; + + setup_items_for_insert(root, path, &batch); +} + int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const struct btrfs_key *cpu_key, u32 *data_size, - int nr); + const struct btrfs_item_batch *batch); static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -2909,7 +2949,14 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, const struct btrfs_key *key, u32 data_size) { - return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1); + struct btrfs_item_batch batch; + + batch.keys = key; + batch.data_sizes = &data_size; + batch.total_data_size = data_size; + batch.nr = 1; + + return btrfs_insert_empty_items(trans, root, path, &batch); } int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 1e08eb2..e164766 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -679,19 +679,18 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_item *first_item) { - LIST_HEAD(batch); + LIST_HEAD(item_list); struct btrfs_delayed_item *curr; struct btrfs_delayed_item *next; const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info); + struct btrfs_item_batch batch; int total_size; - int nitems; char *ins_data = NULL; - struct btrfs_key *ins_keys; - u32 *ins_sizes; int ret; - list_add_tail(&first_item->tree_list, &batch); - nitems = 1; + list_add_tail(&first_item->tree_list, &item_list); + batch.total_data_size = first_item->data_len; + batch.nr = 1; total_size = first_item->data_len + sizeof(struct btrfs_item); curr = first_item; @@ -706,39 +705,43 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, if (total_size + next_size > max_size) break; - list_add_tail(&next->tree_list, &batch); - nitems++; + list_add_tail(&next->tree_list, &item_list); + batch.nr++; total_size += next_size; + 
batch.total_data_size += next->data_len; curr = next; } - if (nitems == 1) { - ins_keys = &first_item->key; - ins_sizes = &first_item->data_len; + if (batch.nr == 1) { + batch.keys = &first_item->key; + batch.data_sizes = &first_item->data_len; } else { + struct btrfs_key *ins_keys; + u32 *ins_sizes; int i = 0; - ins_data = kmalloc(nitems * sizeof(u32) + - nitems * sizeof(struct btrfs_key), GFP_NOFS); + ins_data = kmalloc(batch.nr * sizeof(u32) + + batch.nr * sizeof(struct btrfs_key), GFP_NOFS); if (!ins_data) { ret = -ENOMEM; goto out; } ins_sizes = (u32 *)ins_data; - ins_keys = (struct btrfs_key *)(ins_data + nitems * sizeof(u32)); - list_for_each_entry(curr, &batch, tree_list) { + ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32)); + batch.keys = ins_keys; + batch.data_sizes = ins_sizes; + list_for_each_entry(curr, &item_list, tree_list) { ins_keys[i] = curr->key; ins_sizes[i] = curr->data_len; i++; } } - ret = btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes, - nitems); + ret = btrfs_insert_empty_items(trans, root, path, &batch); if (ret) goto out; - list_for_each_entry(curr, &batch, tree_list) { + list_for_each_entry(curr, &item_list, tree_list) { char *data_ptr; data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char); @@ -754,7 +757,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, */ btrfs_release_path(path); - list_for_each_entry_safe(curr, next, &batch, tree_list) { + list_for_each_entry_safe(curr, next, &item_list, tree_list) { list_del(&curr->tree_list); btrfs_delayed_item_release_metadata(root, curr); btrfs_release_delayed_item(curr); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a176236..eee5127 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1020,8 +1020,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } - setup_items_for_insert(root, path, &key, - &args->extent_item_size, 1); + setup_item_for_insert(root, path, &key, args->extent_item_size); args->extent_inserted = true; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a2f2a45..8ee7940 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6441,7 +6441,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_inode_ref *ref; struct btrfs_key key[2]; u32 sizes[2]; - int nitems = name ? 2 : 1; + struct btrfs_item_batch batch; unsigned long ptr; unsigned int nofs_flag; int ret; @@ -6533,7 +6533,11 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, goto fail; } - ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); + batch.keys = &key[0]; + batch.data_sizes = &sizes[0]; + batch.total_data_size = sizes[0] + (name ? sizes[1] : 0); + batch.nr = name ? 
2 : 1; + ret = btrfs_insert_empty_items(trans, root, path, &batch); if (ret != 0) goto fail_unlock; diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index df54cdf..c9ab65e 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -60,7 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) key.type = BTRFS_EXTENT_CSUM_KEY; key.offset = 0; - setup_items_for_insert(root, path, &key, &value_len, 1); + setup_item_for_insert(root, path, &key, value_len); item = btrfs_item_nr(0); write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), value_len); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index c9874b1..af62d05 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -33,7 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; - setup_items_for_insert(root, &path, &key, &value_len, 1); + setup_item_for_insert(root, &path, &key, value_len); fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, 1); btrfs_set_file_extent_type(leaf, fi, type); @@ -63,7 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - setup_items_for_insert(root, &path, &key, &value_len, 1); + setup_item_for_insert(root, &path, &key, value_len); } /* diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 24ebbed..07b6a83 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3672,8 +3672,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, int count) { char *ins_data = NULL; - struct btrfs_key *ins_keys; - u32 *ins_sizes; + struct btrfs_item_batch batch; struct extent_buffer *dst; struct btrfs_key key; u32 item_size; @@ -3681,13 +3680,18 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, int i; ASSERT(count > 0); + batch.nr = count; if (count == 1) { btrfs_item_key_to_cpu(src, &key, start_slot); item_size = btrfs_item_size_nr(src, start_slot); - ins_keys = &key; - ins_sizes = &item_size; + batch.keys = &key; + batch.data_sizes = &item_size; + batch.total_data_size = item_size; } else { + struct btrfs_key *ins_keys; + u32 *ins_sizes; + ins_data = kmalloc(count * sizeof(u32) + count * sizeof(struct btrfs_key), GFP_NOFS); if (!ins_data) @@ -3695,17 +3699,20 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32)); + batch.keys = ins_keys; + batch.data_sizes = ins_sizes; + batch.total_data_size = 0; for (i = 0; i < count; i++) { const int slot = start_slot + i; btrfs_item_key_to_cpu(src, &ins_keys[i], slot); ins_sizes[i] = btrfs_item_size_nr(src, slot); + batch.total_data_size += ins_sizes[i]; } } - ret = btrfs_insert_empty_items(trans, log, dst_path, ins_keys, ins_sizes, - count); + ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); if (ret) goto out; @@ -3716,7 +3723,8 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0]); src_offset = btrfs_item_ptr_offset(src, start_slot + i); - copy_extent_buffer(dst, src, dst_offset, src_offset, ins_sizes[i]); + copy_extent_buffer(dst, src, dst_offset, src_offset, + batch.data_sizes[i]); dst_path->slots[0]++; } btrfs_release_path(dst_path); @@ -4326,6 +4334,7 @@ static noinline int copy_items(struct 
btrfs_trans_handle *trans, int ret; struct btrfs_key *ins_keys; u32 *ins_sizes; + struct btrfs_item_batch batch; char *ins_data; int i; struct list_head ordered_sums; @@ -4340,13 +4349,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); + batch.keys = ins_keys; + batch.data_sizes = ins_sizes; + batch.total_data_size = 0; + batch.nr = nr; for (i = 0; i < nr; i++) { ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); + batch.total_data_size += ins_sizes[i]; btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); } - ret = btrfs_insert_empty_items(trans, log, dst_path, - ins_keys, ins_sizes, nr); + ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); if (ret) { kfree(ins_data); return ret; -- 2.34.1.75.gabe6bb3905 From 5b7bf438b33cdf2d075e9d8ffc0d2420a67c45e5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 24 Sep 2021 12:28:14 +0100 Subject: [PATCH 18/48] btrfs: unexport setup_items_for_insert() Since setup_items_for_insert() is not used anymore outside of ctree.c, make it static and remove its prototype from ctree.h. This also requires to move the definition of setup_item_for_insert() from ctree.h to ctree.c and move down btrfs_duplicate_item() so that it's defined after setup_items_for_insert(). Further, since setup_item_for_insert() is used outside ctree.c, rename it to btrfs_setup_item_for_insert(). This patch is part of a small patchset that is comprised of the following patches: btrfs: loop only once over data sizes array when inserting an item batch btrfs: unexport setup_items_for_insert() btrfs: use single bulk copy operations when logging directories This is patch 2/3 and performance results, and the specific tests, are included in the changelog of patch 3/3. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 95 +++++++++++++++++----------- fs/btrfs/ctree.h | 24 ++----- fs/btrfs/file.c | 2 +- fs/btrfs/tests/extent-buffer-tests.c | 2 +- fs/btrfs/tests/inode-tests.c | 4 +- 5 files changed, 68 insertions(+), 59 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index bf4d150..387295f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3580,40 +3580,6 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, return ret; } -/* - * This function duplicate a item, giving 'new_key' to the new item. - * It guarantees both items live in the same tree leaf and the new item - * is contiguous with the original item. - * - * This allows us to split file extent in place, keeping a lock on the - * leaf the entire time. - */ -int btrfs_duplicate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const struct btrfs_key *new_key) -{ - struct extent_buffer *leaf; - int ret; - u32 item_size; - - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ret = setup_leaf_for_split(trans, root, path, - item_size + sizeof(struct btrfs_item)); - if (ret) - return ret; - - path->slots[0]++; - setup_item_for_insert(root, path, new_key, item_size); - leaf = path->nodes[0]; - memcpy_extent_buffer(leaf, - btrfs_item_ptr_offset(leaf, path->slots[0]), - btrfs_item_ptr_offset(leaf, path->slots[0] - 1), - item_size); - return 0; -} - /* * make the item pointed to by the path smaller. 
new_size indicates * how small to make it, and from_end tells us if we just chop bytes @@ -3787,8 +3753,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) * @path: points to the leaf/slot where we are going to insert new items * @batch: information about the batch of items to insert */ -void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, - const struct btrfs_item_batch *batch) +static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, + const struct btrfs_item_batch *batch) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_item *item; @@ -3881,6 +3847,29 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, } } +/* + * Insert a new item into a leaf. + * + * @root: The root of the btree. + * @path: A path pointing to the target leaf and slot. + * @key: The key of the new item. + * @data_size: The size of the data associated with the new key. + */ +void btrfs_setup_item_for_insert(struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *key, + u32 data_size) +{ + struct btrfs_item_batch batch; + + batch.keys = key; + batch.data_sizes = &data_size; + batch.total_data_size = data_size; + batch.nr = 1; + + setup_items_for_insert(root, path, &batch); +} + /* * Given a key and some data, insert items into the tree. * This does all the path init required, making room in the tree if needed. @@ -3935,6 +3924,40 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, return ret; } +/* + * This function duplicates an item, giving 'new_key' to the new item. + * It guarantees both items live in the same tree leaf and the new item is + * contiguous with the original item. + * + * This allows us to split a file extent in place, keeping a lock on the leaf + * the entire time. + */ +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *new_key) +{ + struct extent_buffer *leaf; + int ret; + u32 item_size; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ret = setup_leaf_for_split(trans, root, path, + item_size + sizeof(struct btrfs_item)); + if (ret) + return ret; + + path->slots[0]++; + btrfs_setup_item_for_insert(root, path, new_key, item_size); + leaf = path->nodes[0]; + memcpy_extent_buffer(leaf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + btrfs_item_ptr_offset(leaf, path->slots[0] - 1), + item_size); + return 0; +} + /* * delete the pointer from a given node. * diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1ddf7d1..27815dc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2894,7 +2894,7 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans, /* * Describes a batch of items to insert in a btree. This is used by - * btrfs_insert_empty_items() and setup_items_for_insert(). + * btrfs_insert_empty_items(). 
*/ struct btrfs_item_batch { /* @@ -2918,24 +2918,10 @@ struct btrfs_item_batch { int nr; }; -void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, - const struct btrfs_item_batch *batch); - -static inline void setup_item_for_insert(struct btrfs_root *root, - struct btrfs_path *path, - const struct btrfs_key *key, - u32 data_size) -{ - struct btrfs_item_batch batch; - - batch.keys = key; - batch.data_sizes = &data_size; - batch.total_data_size = data_size; - batch.nr = 1; - - setup_items_for_insert(root, path, &batch); -} - +void btrfs_setup_item_for_insert(struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *key, + u32 data_size); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index eee5127..6c50561 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1020,7 +1020,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } - setup_item_for_insert(root, path, &key, args->extent_item_size); + btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size); args->extent_inserted = true; } diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index c9ab65e..2a95f72 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -60,7 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) key.type = BTRFS_EXTENT_CSUM_KEY; key.offset = 0; - setup_item_for_insert(root, path, &key, value_len); + btrfs_setup_item_for_insert(root, path, &key, value_len); item = btrfs_item_nr(0); write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), value_len); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index af62d05..cac89c3 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -33,7 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; - setup_item_for_insert(root, &path, &key, value_len); + btrfs_setup_item_for_insert(root, &path, &key, value_len); fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, 1); btrfs_set_file_extent_type(leaf, fi, type); @@ -63,7 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - setup_item_for_insert(root, &path, &key, value_len); + btrfs_setup_item_for_insert(root, &path, &key, value_len); } /* -- 2.34.1.75.gabe6bb3905 From a50b5c17872e0b1dbf625e9b1aac0a21ceb7cdb6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 24 Sep 2021 12:28:15 +0100 Subject: [PATCH 19/48] btrfs: use single bulk copy operations when logging directories When logging a directory and inserting a batch of directory items, we are copying the data of each item from a leaf in the fs/subvolume tree to a leaf in a log tree, separately. This is not really needed, since we are copying from a contiguous memory area into another one, so we can use a single copy operation to copy all items at once. 
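A userspace model of why a single copy is enough (toy buffer and item sizes, not a real leaf layout): item data grows from the end of the leaf towards the start, so the data of consecutive items forms one contiguous region whose lowest address is the data offset of the item in the highest slot.

#include <stdio.h>
#include <string.h>

#define LEAF_SIZE 64

int main(void)
{
	char src[LEAF_SIZE], dst1[LEAF_SIZE], dst2[LEAF_SIZE];
	/* Data offsets of three consecutive items of sizes 8, 4 and 6. */
	const unsigned int offs[] = { LEAF_SIZE - 8, LEAF_SIZE - 12, LEAF_SIZE - 18 };
	const unsigned int sizes[] = { 8, 4, 6 };
	const unsigned int total = sizes[0] + sizes[1] + sizes[2];

	for (int i = 0; i < LEAF_SIZE; i++)
		src[i] = (char)i;
	memset(dst1, 0, sizeof(dst1));
	memset(dst2, 0, sizeof(dst2));

	/* What the code did before: one copy per item. */
	for (int i = 0; i < 3; i++)
		memcpy(dst1 + offs[i], src + offs[i], sizes[i]);

	/* Single bulk copy, starting at the last (lowest) data offset. */
	memcpy(dst2 + offs[2], src + offs[2], total);

	printf("bulk copy %s the per-item copies\n",
	       memcmp(dst1, dst2, LEAF_SIZE) == 0 ? "matches" : "differs from");
	return 0;
}
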
This patch is part of a small patchset that is comprised of the following patches: btrfs: loop only once over data sizes array when inserting an item batch btrfs: unexport setup_items_for_insert() btrfs: use single bulk copy operations when logging directories This is patch 3/3. The following test was used to compare performance of a branch without the patchset versus one branch that has the whole patchset applied: $ cat dir-fsync-test.sh #!/bin/bash DEV=/dev/nvme0n1 MNT=/mnt/nvme0n1 NUM_NEW_FILES=1000000 NUM_FILE_DELETES=1000 LEAF_SIZE=16K mkfs.btrfs -f -n $LEAF_SIZE $DEV mount -o ssd $DEV $MNT mkdir $MNT/testdir for ((i = 1; i <= $NUM_NEW_FILES; i++)); do echo -n > $MNT/testdir/file_$i done # Fsync the directory, this will log the new dir items and the inodes # they point to, because these are new inodes. start=$(date +%s%N) xfs_io -c "fsync" $MNT/testdir end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "dir fsync took $dur ms after adding $NUM_NEW_FILES files" # sync to force transaction commit and wipeout the log. sync del_inc=$(( $NUM_NEW_FILES / $NUM_FILE_DELETES )) for ((i = 1; i <= $NUM_NEW_FILES; i += $del_inc)); do rm -f $MNT/testdir/file_$i done # Fsync the directory, this will only log dir items, there are no # dentries pointing to new inodes. start=$(date +%s%N) xfs_io -c "fsync" $MNT/testdir end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "dir fsync took $dur ms after deleting $NUM_FILE_DELETES files" umount $MNT The tests were run on a non-debug kernel (Debian's default kernel config) and were the following: *** with a leaf size of 16K, before patchset *** dir fsync took 8482 ms after adding 1000000 files dir fsync took 166 ms after deleting 1000 files *** with a leaf size of 16K, after patchset *** dir fsync took 8196 ms after adding 1000000 files (-3.4%) dir fsync took 143 ms after deleting 1000 files (-14.9%) *** with a leaf size of 64K, before patchset *** dir fsync took 12851 ms after adding 1000000 files dir fsync took 466 ms after deleting 1000 files *** with a leaf size of 64K, after patchset *** dir fsync took 12287 ms after adding 1000000 files (-4.5%) dir fsync took 414 ms after deleting 1000 files (-11.8%) Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 07b6a83..5e6b98e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3674,6 +3674,8 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, char *ins_data = NULL; struct btrfs_item_batch batch; struct extent_buffer *dst; + unsigned long src_offset; + unsigned long dst_offset; struct btrfs_key key; u32 item_size; int ret; @@ -3717,16 +3719,19 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, goto out; dst = dst_path->nodes[0]; - for (i = 0; i < count; i++) { - unsigned long src_offset; - unsigned long dst_offset; - - dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0]); - src_offset = btrfs_item_ptr_offset(src, start_slot + i); - copy_extent_buffer(dst, src, dst_offset, src_offset, - batch.data_sizes[i]); - dst_path->slots[0]++; - } + /* + * Copy all the items in bulk, in a single copy operation. Item data is + * organized such that it's placed at the end of a leaf and from right + * to left. 
For example, the data for the second item ends at an offset + * that matches the offset where the data for the first item starts, the + * data for the third item ends at an offset that matches the offset + * where the data of the second items starts, and so on. + * Therefore our source and destination start offsets for copy match the + * offsets of the last items (highest slots). + */ + dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1); + src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); + copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); btrfs_release_path(dst_path); out: kfree(ins_data); -- 2.34.1.75.gabe6bb3905 From 0796879ac8804981e0d2385db6f2d1baba120baf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 29 Sep 2021 11:17:12 -0400 Subject: [PATCH 20/48] btrfs: index free space entries on size Currently we index free space on offset only, because usually we have a hint from the allocator that we want to honor for locality reasons. However if we fail to use this hint we have to go back to a brute force search through the free space entries to find a large enough extent. With sufficiently fragmented free space this becomes quite expensive, as we have to linearly search all of the free space entries to find out if we have an extent that's large enough. To fix this add a cached rb tree to index based on free space entry bytes. This will allow us to quickly look up the largest chunk in the free space tree for this block group, and stop searching once we've found an entry that is too small to satisfy our allocation. We simply choose to use this tree if we're searching from the beginning of the block group, as we know we do not care about locality at that point. I wrote an allocator test that creates a 10TiB RAM-backed null block device and then fallocates random files until the file system is full. I then go through and delete all of the odd files. Then I spawn 8 threads that fallocate 64MiB files (1/2 our extent size cap) until the file system is full again. I use bcc's funclatency to measure the latency of find_free_extent. 
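Before the numbers, the core of the change can be modeled in a few lines of userspace code (the entry layout and the usable() suitability check are made up for illustration): with entries ordered by size, largest first - the order the cached rb-tree hands them out - the search can stop at the first entry that is too small, instead of walking every remaining entry.

#include <stdio.h>

struct entry {
	unsigned long long bytes;
};

/* Stand-in for further suitability checks (alignment, offset, ...). */
static int usable(const struct entry *e)
{
	return e->bytes % 4096 == 0;
}

static const struct entry *find_space(const struct entry *sorted, int n,
				      unsigned long long wanted)
{
	for (int i = 0; i < n; i++) {
		if (sorted[i].bytes < wanted)
			return NULL; /* every following entry is smaller */
		if (usable(&sorted[i]))
			return &sorted[i];
	}
	return NULL;
}

int main(void)
{
	/* Sorted by size in descending order, like the cached rb-tree. */
	const struct entry entries[] = { { 1048576 }, { 65536 }, { 4096 } };
	const struct entry *e = find_space(entries, 3, 131072);

	if (e)
		printf("found entry with %llu bytes\n", e->bytes);
	else
		printf("no entry is big enough, stopped early\n");
	return 0;
}
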
The baseline results are nsecs : count distribution 0 -> 1 : 0 | | 2 -> 3 : 0 | | 4 -> 7 : 0 | | 8 -> 15 : 0 | | 16 -> 31 : 0 | | 32 -> 63 : 0 | | 64 -> 127 : 0 | | 128 -> 255 : 0 | | 256 -> 511 : 10356 |**** | 512 -> 1023 : 58242 |************************* | 1024 -> 2047 : 74418 |******************************** | 2048 -> 4095 : 90393 |****************************************| 4096 -> 8191 : 79119 |*********************************** | 8192 -> 16383 : 35614 |*************** | 16384 -> 32767 : 13418 |***** | 32768 -> 65535 : 12811 |***** | 65536 -> 131071 : 17090 |******* | 131072 -> 262143 : 26465 |*********** | 262144 -> 524287 : 40179 |***************** | 524288 -> 1048575 : 55469 |************************ | 1048576 -> 2097151 : 48807 |********************* | 2097152 -> 4194303 : 26744 |*********** | 4194304 -> 8388607 : 35351 |*************** | 8388608 -> 16777215 : 13918 |****** | 16777216 -> 33554431 : 21 | | avg = 908079 nsecs, total: 580889071441 nsecs, count: 639690 And the patch results are nsecs : count distribution 0 -> 1 : 0 | | 2 -> 3 : 0 | | 4 -> 7 : 0 | | 8 -> 15 : 0 | | 16 -> 31 : 0 | | 32 -> 63 : 0 | | 64 -> 127 : 0 | | 128 -> 255 : 0 | | 256 -> 511 : 6883 |** | 512 -> 1023 : 54346 |********************* | 1024 -> 2047 : 79170 |******************************** | 2048 -> 4095 : 98890 |****************************************| 4096 -> 8191 : 81911 |********************************* | 8192 -> 16383 : 27075 |********** | 16384 -> 32767 : 14668 |***** | 32768 -> 65535 : 13251 |***** | 65536 -> 131071 : 15340 |****** | 131072 -> 262143 : 26715 |********** | 262144 -> 524287 : 43274 |***************** | 524288 -> 1048575 : 53870 |********************* | 1048576 -> 2097151 : 55368 |********************** | 2097152 -> 4194303 : 41036 |**************** | 4194304 -> 8388607 : 24927 |********** | 8388608 -> 16777215 : 33 | | 16777216 -> 33554431 : 9 | | avg = 623599 nsecs, total: 397259314759 nsecs, count: 637042 There's a little variation in the amount of calls done because of timing of the threads with metadata requirements, but the avg, total, and count's are relatively consistent between runs (usually within 2-5% of each other). As you can see here we have around a 30% decrease in average latency with a 30% decrease in overall time spent in find_free_extent. Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 84 +++++++++++++++++++++++++++++++++---- fs/btrfs/free-space-cache.h | 2 + 2 files changed, 79 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index da0eee7..068d9ba 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1576,6 +1576,44 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, return 0; } +static u64 free_space_info_bytes(struct btrfs_free_space *info) +{ + if (info->bitmap && info->max_extent_size) + return info->max_extent_size; + return info->bytes; +} + +/* + * This is indexed in reverse of what we generally do for rb-tree's, the largest + * chunks are left most and the smallest are rightmost. This is so that we can + * take advantage of the cached property of the cached rb-tree and simply get + * the largest free space chunk right away. 
+ */ +static void tree_insert_bytes(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + struct rb_root_cached *root = &ctl->free_space_bytes; + struct rb_node **p = &root->rb_root.rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_free_space *tmp; + bool leftmost = true; + + while (*p) { + parent_node = *p; + tmp = rb_entry(parent_node, struct btrfs_free_space, + bytes_index); + if (free_space_info_bytes(info) < free_space_info_bytes(tmp)) { + p = &(*p)->rb_right; + leftmost = false; + } else { + p = &(*p)->rb_left; + } + } + + rb_link_node(&info->bytes_index, parent_node, p); + rb_insert_color_cached(&info->bytes_index, root, leftmost); +} + /* * searches the tree for the given offset. * @@ -1704,6 +1742,7 @@ __unlink_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { rb_erase(&info->offset_index, &ctl->free_space_offset); + rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); ctl->free_extents--; if (!info->bitmap && !btrfs_free_space_trimmed(info)) { @@ -1730,6 +1769,8 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, if (ret) return ret; + tree_insert_bytes(ctl, info); + if (!info->bitmap && !btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR]++; ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; @@ -1876,7 +1917,7 @@ static inline u64 get_max_extent_size(struct btrfs_free_space *entry) /* Cache the size of the max extent in bytes */ static struct btrfs_free_space * find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, - unsigned long align, u64 *max_extent_size) + unsigned long align, u64 *max_extent_size, bool use_bytes_index) { struct btrfs_free_space *entry; struct rb_node *node; @@ -1887,15 +1928,37 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, if (!ctl->free_space_offset.rb_node) goto out; - entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); - if (!entry) - goto out; + if (use_bytes_index) { + node = rb_first_cached(&ctl->free_space_bytes); + } else { + entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), + 0, 1); + if (!entry) + goto out; + node = &entry->offset_index; + } - for (node = &entry->offset_index; node; node = rb_next(node)) { - entry = rb_entry(node, struct btrfs_free_space, offset_index); + for (; node; node = rb_next(node)) { + if (use_bytes_index) + entry = rb_entry(node, struct btrfs_free_space, + bytes_index); + else + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + + /* + * If we are using the bytes index then all subsequent entries + * in this tree are going to be < bytes, so simply set the max + * extent size and exit the loop. + * + * If we're using the offset index then we need to keep going + * through the rest of the tree. 
+ */ if (entry->bytes < *bytes) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); + if (use_bytes_index) + break; continue; } @@ -2482,6 +2545,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, info->bytes = bytes; info->trim_state = trim_state; RB_CLEAR_NODE(&info->offset_index); + RB_CLEAR_NODE(&info->bytes_index); spin_lock(&ctl->tree_lock); @@ -2785,6 +2849,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, ctl->start = block_group->start; ctl->private = block_group; ctl->op = &free_space_op; + ctl->free_space_bytes = RB_ROOT_CACHED; INIT_LIST_HEAD(&ctl->trimming_ranges); mutex_init(&ctl->cache_writeout_mutex); @@ -2850,6 +2915,7 @@ static void __btrfs_return_cluster_to_free_space( } tree_insert_offset(&ctl->free_space_offset, entry->offset, &entry->offset_index, bitmap); + tree_insert_bytes(ctl, entry); } cluster->root = RB_ROOT; spin_unlock(&cluster->lock); @@ -2951,12 +3017,14 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 align_gap = 0; u64 align_gap_len = 0; enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + bool use_bytes_index = (offset == block_group->start); ASSERT(!btrfs_is_zoned(block_group->fs_info)); spin_lock(&ctl->tree_lock); entry = find_free_space(ctl, &offset, &bytes_search, - block_group->full_stripe_len, max_extent_size); + block_group->full_stripe_len, max_extent_size, + use_bytes_index); if (!entry) goto out; @@ -3240,6 +3308,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, cluster->window_start = start * ctl->unit + entry->offset; rb_erase(&entry->offset_index, &ctl->free_space_offset); + rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 1); ASSERT(!ret); /* -EEXIST; Logic error */ @@ -3330,6 +3399,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group, continue; rb_erase(&entry->offset_index, &ctl->free_space_offset); + rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); total_size += entry->bytes; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 1f23088..dd982d2 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -22,6 +22,7 @@ enum btrfs_trim_state { struct btrfs_free_space { struct rb_node offset_index; + struct rb_node bytes_index; u64 offset; u64 bytes; u64 max_extent_size; @@ -45,6 +46,7 @@ static inline bool btrfs_free_space_trimming_bitmap( struct btrfs_free_space_ctl { spinlock_t tree_lock; struct rb_root free_space_offset; + struct rb_root_cached free_space_bytes; u64 free_space; int extents_thresh; int free_extents; -- 2.34.1.75.gabe6bb3905 From 58ad2a25c0bdd1553bb4159d970ae97f190f4e49 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 13 Oct 2021 10:12:49 +0100 Subject: [PATCH 21/48] btrfs: fix deadlock between chunk allocation and chunk btree modifications When a task is doing some modification to the chunk btree and it is not in the context of a chunk allocation or a chunk removal, it can deadlock with another task that is currently allocating a new data or metadata chunk. 
These contexts are the following:

* When relocating a system chunk, when we need to COW the extent buffers
  that belong to the chunk btree;

* When adding a new device (ioctl), where we need to add a new device item
  to the chunk btree;

* When removing a device (ioctl), where we need to remove a device item
  from the chunk btree;

* When resizing a device (ioctl), where we need to update a device item in
  the chunk btree and may need to relocate a system chunk that lies beyond
  the new device size when shrinking a device.

The problem happens due to a sequence of steps like the following:

1) Task A starts a data or metadata chunk allocation and it locks the
   chunk mutex;

2) Task B is relocating a system chunk, and when it needs to COW an extent
   buffer of the chunk btree, it has locked both that extent buffer as
   well as its parent extent buffer;

3) Since there is not enough available system space, either because none
   of the existing system block groups have enough free space or because
   the only one with enough free space is in RO mode due to the
   relocation, task B triggers a new system chunk allocation. It blocks
   when trying to acquire the chunk mutex, currently held by task A;

4) Task A enters btrfs_chunk_alloc_add_chunk_item(), in order to insert
   the new chunk item into the chunk btree and update the existing device
   items there. But in order to do that, it has to lock the extent buffer
   that task B locked at step 2, or its parent extent buffer, but task B
   is waiting on the chunk mutex, which is currently locked by task A,
   therefore resulting in a deadlock.

One example report when the deadlock happens with system chunk relocation:

INFO: task kworker/u9:5:546 blocked for more than 143 seconds.
Not tainted 5.15.0-rc3+ #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:kworker/u9:5 state:D stack:25936 pid: 546 ppid: 2 flags:0x00004000
Workqueue: events_unbound btrfs_async_reclaim_metadata_space
Call Trace:
 context_switch kernel/sched/core.c:4940 [inline]
 __schedule+0xcd9/0x2530 kernel/sched/core.c:6287
 schedule+0xd3/0x270 kernel/sched/core.c:6366
 rwsem_down_read_slowpath+0x4ee/0x9d0 kernel/locking/rwsem.c:993
 __down_read_common kernel/locking/rwsem.c:1214 [inline]
 __down_read kernel/locking/rwsem.c:1223 [inline]
 down_read_nested+0xe6/0x440 kernel/locking/rwsem.c:1590
 __btrfs_tree_read_lock+0x31/0x350 fs/btrfs/locking.c:47
 btrfs_tree_read_lock fs/btrfs/locking.c:54 [inline]
 btrfs_read_lock_root_node+0x8a/0x320 fs/btrfs/locking.c:191
 btrfs_search_slot_get_root fs/btrfs/ctree.c:1623 [inline]
 btrfs_search_slot+0x13b4/0x2140 fs/btrfs/ctree.c:1728
 btrfs_update_device+0x11f/0x500 fs/btrfs/volumes.c:2794
 btrfs_chunk_alloc_add_chunk_item+0x34d/0xea0 fs/btrfs/volumes.c:5504
 do_chunk_alloc fs/btrfs/block-group.c:3408 [inline]
 btrfs_chunk_alloc+0x84d/0xf50 fs/btrfs/block-group.c:3653
 flush_space+0x54e/0xd80 fs/btrfs/space-info.c:670
 btrfs_async_reclaim_metadata_space+0x396/0xa90 fs/btrfs/space-info.c:953
 process_one_work+0x9df/0x16d0 kernel/workqueue.c:2297
 worker_thread+0x90/0xed0 kernel/workqueue.c:2444
 kthread+0x3e5/0x4d0 kernel/kthread.c:319
 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295

INFO: task syz-executor:9107 blocked for more than 143 seconds.
Not tainted 5.15.0-rc3+ #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:syz-executor state:D stack:23200 pid: 9107 ppid: 7792 flags:0x00004004 Call Trace: context_switch kernel/sched/core.c:4940 [inline] __schedule+0xcd9/0x2530 kernel/sched/core.c:6287 schedule+0xd3/0x270 kernel/sched/core.c:6366 schedule_preempt_disabled+0xf/0x20 kernel/sched/core.c:6425 __mutex_lock_common kernel/locking/mutex.c:669 [inline] __mutex_lock+0xc96/0x1680 kernel/locking/mutex.c:729 btrfs_chunk_alloc+0x31a/0xf50 fs/btrfs/block-group.c:3631 find_free_extent_update_loop fs/btrfs/extent-tree.c:3986 [inline] find_free_extent+0x25cb/0x3a30 fs/btrfs/extent-tree.c:4335 btrfs_reserve_extent+0x1f1/0x500 fs/btrfs/extent-tree.c:4415 btrfs_alloc_tree_block+0x203/0x1120 fs/btrfs/extent-tree.c:4813 __btrfs_cow_block+0x412/0x1620 fs/btrfs/ctree.c:415 btrfs_cow_block+0x2f6/0x8c0 fs/btrfs/ctree.c:570 btrfs_search_slot+0x1094/0x2140 fs/btrfs/ctree.c:1768 relocate_tree_block fs/btrfs/relocation.c:2694 [inline] relocate_tree_blocks+0xf73/0x1770 fs/btrfs/relocation.c:2757 relocate_block_group+0x47e/0xc70 fs/btrfs/relocation.c:3673 btrfs_relocate_block_group+0x48a/0xc60 fs/btrfs/relocation.c:4070 btrfs_relocate_chunk+0x96/0x280 fs/btrfs/volumes.c:3181 __btrfs_balance fs/btrfs/volumes.c:3911 [inline] btrfs_balance+0x1f03/0x3cd0 fs/btrfs/volumes.c:4301 btrfs_ioctl_balance+0x61e/0x800 fs/btrfs/ioctl.c:4137 btrfs_ioctl+0x39ea/0x7b70 fs/btrfs/ioctl.c:4949 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:874 [inline] __se_sys_ioctl fs/ioctl.c:860 [inline] __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:860 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae So fix this by making sure that whenever we try to modify the chunk btree and we are neither in a chunk allocation context nor in a chunk remove context, we reserve system space before modifying the chunk btree. [HH: small backport for 5.14.x] Reported-by: Hao Sun Link: https://lore.kernel.org/linux-btrfs/CACkBjsax51i4mu6C0C3vJqQN3NR_iVuucoeG3U1HXjrgzn5FFQ@mail.gmail.com/ Fixes: 79bd37120b1495 ("btrfs: rework chunk allocation to avoid exhaustion of the system chunk array") Signed-off-by: Filipe Manana --- fs/btrfs/block-group.c | 145 +++++++++++++++++++++++++---------------- fs/btrfs/block-group.h | 2 + fs/btrfs/relocation.c | 4 ++ fs/btrfs/volumes.c | 15 ++++- 4 files changed, 110 insertions(+), 56 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index a53ebc5..3da5f0c 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -3387,25 +3387,6 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) goto out; } - /* - * If this is a system chunk allocation then stop right here and do not - * add the chunk item to the chunk btree. This is to prevent a deadlock - * because this system chunk allocation can be triggered while COWing - * some extent buffer of the chunk btree and while holding a lock on a - * parent extent buffer, in which case attempting to insert the chunk - * item (or update the device item) would result in a deadlock on that - * parent extent buffer. In this case defer the chunk btree updates to - * the second phase of chunk allocation and keep our reservation until - * the second phase completes. - * - * This is a rare case and can only be triggered by the very few cases - * we have where we need to touch the chunk btree outside chunk allocation - * and chunk removal. These cases are basically adding a device, removing - * a device or resizing a device. 
- */
-	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-		return 0;
-
 	ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
 	/*
 	 * Normally we are not expected to fail with -ENOSPC here, since we have
@@ -3538,14 +3519,14 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
  * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
  * the system chunk array due to concurrent allocations") provides more details.
  *
- * For allocation of system chunks, we defer the updates and insertions into the
- * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because
- * if the chunk allocation is triggered while COWing an extent buffer of the
- * chunk btree, we are holding a lock on the parent of that extent buffer and
- * doing the chunk btree updates and insertions can require locking that parent.
- * This is for the very few and rare cases where we update the chunk btree that
- * are not chunk allocation or chunk removal: adding a device, removing a device
- * or resizing a device.
+ * Allocation of system chunks does not happen through this function. A task that
+ * needs to update the chunk btree (the only btree that uses system chunks), must
+ * preallocate chunk space by calling either check_system_chunk() or
+ * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
+ * metadata chunk or when removing a chunk, while the latter is used before doing
+ * a modification to the chunk btree - use cases for the latter are adding,
+ * removing and resizing a device as well as relocation of a system chunk.
+ * See the comment below for more details.
 *
 * The reservation of system space, done through check_system_chunk(), as well
 * as all the updates and insertions into the chunk btree must be done while
@@ -3582,11 +3563,27 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
 	if (trans->allocating_chunk)
 		return -ENOSPC;
 	/*
-	 * If we are removing a chunk, don't re-enter or we would deadlock.
-	 * System space reservation and system chunk allocation is done by the
-	 * chunk remove operation (btrfs_remove_chunk()).
+	 * Allocation of system chunks can not happen through this path, as we
+	 * could end up in a deadlock if we are allocating a data or metadata
+	 * chunk and there is another task modifying the chunk btree.
+	 *
+	 * This is because while we are holding the chunk mutex, we will attempt
+	 * to add the new chunk item to the chunk btree or update an existing
+	 * device item in the chunk btree, while the other task that is modifying
+	 * the chunk btree is attempting to COW an extent buffer while holding a
+	 * lock on it and on its parent - if the COW operation triggers a system
+	 * chunk allocation, then we can deadlock because we are holding the
+	 * chunk mutex and we may need to access that extent buffer or its parent
+	 * in order to add the chunk item or update a device item.
+	 *
+	 * Tasks that want to modify the chunk tree should reserve system space
+	 * before updating the chunk btree, by calling either
+	 * btrfs_reserve_chunk_metadata() or check_system_chunk().
+	 * It's possible that after a task reserves the space, it still ends up
+	 * here - this happens in the cases described above at do_chunk_alloc().
+	 * The task will have to either retry or fail.
*/ - if (trans->removing_chunk) + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) return -ENOSPC; space_info = btrfs_find_space_info(fs_info, flags); @@ -3685,17 +3682,14 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) return num_dev; } -/* - * Reserve space in the system space for allocating or removing a chunk - */ -void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) +static void reserve_chunk_space(struct btrfs_trans_handle *trans, + u64 bytes, + u64 type) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_space_info *info; u64 left; - u64 thresh; int ret = 0; - u64 num_devs; /* * Needed because we can end up allocating a system chunk and for an @@ -3708,19 +3702,13 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) left = info->total_bytes - btrfs_space_info_used(info, true); spin_unlock(&info->lock); - num_devs = get_profile_num_devs(fs_info, type); - - /* num_devs device items to update and 1 chunk item to add or remove */ - thresh = btrfs_calc_metadata_size(fs_info, num_devs) + - btrfs_calc_insert_metadata_size(fs_info, 1); - - if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", - left, thresh, type); + left, bytes, type); btrfs_dump_space_info(fs_info, info, 0, 0); } - if (left < thresh) { + if (left < bytes) { u64 flags = btrfs_system_alloc_profile(fs_info); struct btrfs_block_group *bg; @@ -3729,21 +3717,20 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) * needing it, as we might not need to COW all nodes/leafs from * the paths we visit in the chunk tree (they were already COWed * or created in the current transaction for example). - * - * Also, if our caller is allocating a system chunk, do not - * attempt to insert the chunk item in the chunk btree, as we - * could deadlock on an extent buffer since our caller may be - * COWing an extent buffer from the chunk btree. */ bg = btrfs_alloc_chunk(trans, flags); if (IS_ERR(bg)) { ret = PTR_ERR(bg); - } else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) { + } else { /* * If we fail to add the chunk item here, we end up * trying again at phase 2 of chunk allocation, at * btrfs_create_pending_block_groups(). So ignore - * any error here. + * any error here. An ENOSPC here could happen, due to + * the cases described at do_chunk_alloc() - the system + * block group we just created was just turned into RO + * mode by a scrub for example, or a running discard + * temporarily removed its free space entries, etc. */ btrfs_chunk_alloc_add_chunk_item(trans, bg); } @@ -3752,12 +3739,60 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) if (!ret) { ret = btrfs_block_rsv_add(fs_info->chunk_root, &fs_info->chunk_block_rsv, - thresh, BTRFS_RESERVE_NO_FLUSH); + bytes, BTRFS_RESERVE_NO_FLUSH); if (!ret) - trans->chunk_bytes_reserved += thresh; + trans->chunk_bytes_reserved += bytes; } } +/* + * Reserve space in the system space for allocating or removing a chunk. + * The caller must be holding fs_info->chunk_mutex. + */ +void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + const u64 num_devs = get_profile_num_devs(fs_info, type); + u64 bytes; + + /* num_devs device items to update and 1 chunk item to add or remove. 
*/ + bytes = btrfs_calc_metadata_size(fs_info, num_devs) + + btrfs_calc_insert_metadata_size(fs_info, 1); + + reserve_chunk_space(trans, bytes, type); +} + +/* + * Reserve space in the system space, if needed, for doing a modification to the + * chunk btree. + * + * This is used in a context where we need to update the chunk btree outside + * block group allocation and removal, to avoid a deadlock with a concurrent + * task that is allocating a metadata or data block group and therefore needs to + * update the chunk btree while holding the chunk mutex. After the update to the + * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called. + * + * @trans: A transaction handle. + * @is_item_insertion: Indicate if the modification is for inserting a new item + * in the chunk btree or if it's for the deletion or update + * of an existing item. + */ +void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, + bool is_item_insertion) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + u64 bytes; + + if (is_item_insertion) + bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + else + bytes = btrfs_calc_metadata_size(fs_info, 1); + + mutex_lock(&fs_info->chunk_mutex); + reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM); + mutex_unlock(&fs_info->chunk_mutex); +} + void btrfs_put_block_group_cache(struct btrfs_fs_info *info) { struct btrfs_block_group *block_group; diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index c72a71e..37e55eb 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -289,6 +289,8 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, enum btrfs_chunk_alloc_enum force); int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type); void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type); +void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, + bool is_item_insertion); u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); int btrfs_free_block_groups(struct btrfs_fs_info *info); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d81bee6..cd39109 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2691,8 +2691,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, list_add_tail(&node->list, &rc->backref_cache.changed); } else { path->lowest_level = node->level; + if (root == root->fs_info->chunk_root) + btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, key, path, 0, 1); btrfs_release_path(path); + if (root == root->fs_info->chunk_root) + btrfs_trans_release_chunk_metadata(trans); if (ret > 0) ret = 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c34efdc..ce2bb12 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1848,8 +1848,10 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; + btrfs_reserve_chunk_metadata(trans, true); ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, &key, sizeof(*dev_item)); + btrfs_trans_release_chunk_metadata(trans); if (ret) goto out; @@ -1926,7 +1928,9 @@ static int btrfs_rm_dev_item(struct btrfs_device *device) key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; + btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + btrfs_trans_release_chunk_metadata(trans); if (ret) { if (ret > 0) ret = -ENOENT; @@ 
-2482,7 +2486,9 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) key.type = BTRFS_DEV_ITEM_KEY; while (1) { + btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + btrfs_trans_release_chunk_metadata(trans); if (ret < 0) goto error; @@ -2830,6 +2836,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_super_block *super_copy = fs_info->super_copy; u64 old_total; u64 diff; + int ret; if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return -EACCES; @@ -2858,7 +2865,11 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, &trans->transaction->dev_update_list); mutex_unlock(&fs_info->chunk_mutex); - return btrfs_update_device(trans, device); + btrfs_reserve_chunk_metadata(trans, false); + ret = btrfs_update_device(trans, device); + btrfs_trans_release_chunk_metadata(trans); + + return ret; } static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) @@ -4893,8 +4904,10 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) round_down(old_total - diff, fs_info->sectorsize)); mutex_unlock(&fs_info->chunk_mutex); + btrfs_reserve_chunk_metadata(trans, false); /* Now btrfs_update_device() will change the on-disk size. */ ret = btrfs_update_device(trans, device); + btrfs_trans_release_chunk_metadata(trans); if (ret < 0) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); -- 2.34.1.75.gabe6bb3905 From 1dcd4f7d29636597088a35f84ae08b08d233602a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 13 Oct 2021 10:12:50 +0100 Subject: [PATCH 22/48] btrfs: update comments for chunk allocation -ENOSPC cases Update the comments at btrfs_chunk_alloc() and do_chunk_alloc() that describe which cases can lead to a failure to allocate metadata and system space despite having previously reserved space. This adds one more reason that I previously forgot to mention. Signed-off-by: Filipe Manana --- fs/btrfs/block-group.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 3da5f0c..8ccd91e 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -3391,7 +3391,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) /* * Normally we are not expected to fail with -ENOSPC here, since we have * previously reserved space in the system space_info and allocated one - * new system chunk if necessary. However there are two exceptions: + * new system chunk if necessary. However there are three exceptions: * * 1) We may have enough free space in the system space_info but all the * existing system block groups have a profile which can not be used @@ -3417,7 +3417,14 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) * with enough free space got turned into RO mode by a running scrub, * and in this case we have to allocate a new one and retry. We only * need do this allocate and retry once, since we have a transaction - * handle and scrub uses the commit root to search for block groups. 
+ * handle and scrub uses the commit root to search for block groups; + * + * 3) We had one system block group with enough free space when we called + * check_system_chunk(), but after that, right before we tried to + * allocate the last extent buffer we needed, a discard operation came + * in and it temporarily removed the last free space entry from the + * block group (discard removes a free space entry, discards it, and + * then adds back the entry to the block group cache). */ if (ret == -ENOSPC) { const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info); @@ -3501,7 +3508,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) * properly, either intentionally or as a bug. One example where this is * done intentionally is fsync, as it does not reserve any transaction units * and ends up allocating a variable number of metadata extents for log - * tree extent buffers. + * tree extent buffers; + * + * 4) The task has reserved enough transaction units / metadata space, but right + * before it tries to allocate the last extent buffer it needs, a discard + * operation comes in and, temporarily, removes the last free space entry from + * the only metadata block group that had free space (discard starts by + * removing a free space entry from a block group, then does the discard + * operation and, once it's done, it adds back the free space entry to the + * block group). * * We also need this 2 phases setup when adding a device to a filesystem with * a seed device - we must create new metadata and system chunks without adding -- 2.34.1.75.gabe6bb3905 From d91c5d82947eec5c7ff91762cd1c4d0ec2879463 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 25 Oct 2021 10:56:21 +0100 Subject: [PATCH 23/48] btrfs: remove root argument from drop_one_dir_item() The root argument for drop_one_dir_item() always matches the root of the given directory inode, since each log tree is associated to one and only one subvolume/root, so remove the argument. 
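As a rough before/after sketch of a call site, with names taken from the
diff below:

	/* Before: callers passed a root that always equals dir->root. */
	ret = drop_one_dir_item(trans, root, path, dir, di);

	/* After: the root is derived inside the helper. */
	ret = drop_one_dir_item(trans, path, dir, di);

where drop_one_dir_item() now starts with:

	struct btrfs_root *root = dir->root;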
Signed-off-by: Filipe Manana --- fs/btrfs/tree-log.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5e6b98e..efa159a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -921,11 +921,11 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * item */ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, struct btrfs_inode *dir, struct btrfs_dir_item *di) { + struct btrfs_root *root = dir->root; struct inode *inode; char *name; int name_len; @@ -1221,7 +1221,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, if (IS_ERR(di)) { return PTR_ERR(di); } else if (di) { - ret = drop_one_dir_item(trans, root, path, dir, di); + ret = drop_one_dir_item(trans, path, dir, di); if (ret) return ret; } @@ -1233,7 +1233,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, if (IS_ERR(di)) { return PTR_ERR(di); } else if (di) { - ret = drop_one_dir_item(trans, root, path, dir, di); + ret = drop_one_dir_item(trans, path, dir, di); if (ret) return ret; } @@ -2050,7 +2050,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, if (!exists) goto out; - ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di); + ret = drop_one_dir_item(trans, path, BTRFS_I(dir), dst_di); if (ret) goto out; -- 2.34.1.75.gabe6bb3905 From bc582396fac9967a5ae0ec9958f379943ff01026 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 25 Oct 2021 10:56:22 +0100 Subject: [PATCH 24/48] btrfs: remove root argument from btrfs_unlink_inode() The root argument passed to btrfs_unlink_inode() and its callee, __btrfs_unlink_inode(), always matches the root of the given directory and the given inode. So remove the argument and make __btrfs_unlink_inode() use the root of the directory. 
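A sketch of the resulting call shape, taken from the diff below (the
directory and the inode being unlinked always share the same root, so it
no longer needs to be passed):

	ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
				 BTRFS_I(d_inode(dentry)),
				 dentry->d_name.name, dentry->d_name.len);

with __btrfs_unlink_inode() deriving the root internally:

	struct btrfs_root *root = dir->root;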
Signed-off-by: Filipe Manana --- fs/btrfs/ctree.h | 1 - fs/btrfs/inode.c | 25 +++++++++++-------------- fs/btrfs/tree-log.c | 14 +++++++------- 3 files changed, 18 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 27815dc..3c14d9f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3182,7 +3182,6 @@ void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len); int btrfs_add_link(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8ee7940..4004d17 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4040,11 +4040,11 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, * also drops the back refs in the inode to the directory */ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len) { + struct btrfs_root *root = dir->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; int ret = 0; @@ -4144,15 +4144,14 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, } int btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len) { int ret; - ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len); if (!ret) { drop_nlink(&inode->vfs_inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode->root, inode); } return ret; } @@ -4181,7 +4180,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) static int btrfs_unlink(struct inode *dir, struct dentry *dentry) { - struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; struct inode *inode = d_inode(dentry); int ret; @@ -4193,7 +4191,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 0); - ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), dentry->d_name.name, dentry->d_name.len); if (ret) @@ -4207,7 +4205,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) out: btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(root->fs_info); + btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); return ret; } @@ -4558,7 +4556,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); int err = 0; - struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; u64 last_unlink_trans; @@ -4583,7 +4580,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; /* now the directory is empty */ - err = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), dentry->d_name.name, dentry->d_name.len); if (!err) { @@ -4604,7 +4601,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) } out: btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(root->fs_info); 
+ btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); return err; } @@ -9462,7 +9459,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { /* src is an inode */ - ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), + ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_dentry->d_name.name, old_dentry->d_name.len); @@ -9478,7 +9475,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); } else { /* dest is an inode */ - ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), + ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_dentry->d_name.name, new_dentry->d_name.len); @@ -9753,7 +9750,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, */ btrfs_pin_log_trans(root); log_pinned = true; - ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), + ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), old_dentry->d_name.name, old_dentry->d_name.len); @@ -9773,7 +9770,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); BUG_ON(new_inode->i_nlink == 0); } else { - ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), + ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), new_dentry->d_name.name, new_dentry->d_name.len); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index efa159a..5a01b75 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -954,7 +954,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name, + ret = btrfs_unlink_inode(trans, dir, BTRFS_I(inode), name, name_len); if (ret) goto out; @@ -1119,7 +1119,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, inc_nlink(&inode->vfs_inode); btrfs_release_path(path); - ret = btrfs_unlink_inode(trans, root, dir, inode, + ret = btrfs_unlink_inode(trans, dir, inode, victim_name, victim_name_len); kfree(victim_name); if (ret) @@ -1191,7 +1191,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, inc_nlink(&inode->vfs_inode); btrfs_release_path(path); - ret = btrfs_unlink_inode(trans, root, + ret = btrfs_unlink_inode(trans, BTRFS_I(victim_parent), inode, victim_name, @@ -1353,7 +1353,7 @@ static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, kfree(name); goto out; } - ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), inode, name, namelen); kfree(name); iput(dir); @@ -1451,7 +1451,7 @@ static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root, ret = -ENOENT; goto out; } - ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode), + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(other_inode), name, namelen); if (ret) goto out; @@ -1597,7 +1597,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = btrfs_inode_ref_exists(inode, dir, key->type, name, namelen); if (ret > 0) { - ret = btrfs_unlink_inode(trans, root, + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name, namelen); @@ -2347,7 +2347,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, } inc_nlink(inode); - ret = 
btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+		ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
 				BTRFS_I(inode), name, name_len);
 		if (!ret)
 			ret = btrfs_run_delayed_items(trans);
--
2.34.1.75.gabe6bb3905

From 2c79bebf1cd148b30bacddd99db66cbbc7ccac92 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Mon, 25 Oct 2021 10:56:23 +0100
Subject: [PATCH 25/48] btrfs: remove root argument from add_link()

The root argument for tree-log.c:add_link() always matches the root of the
given directory and the given inode, so it can be eliminated.

Signed-off-by: Filipe Manana
---
 fs/btrfs/tree-log.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5a01b75..42aefc7 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1414,10 +1414,11 @@ static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
 	return ret;
 }

-static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+static int add_link(struct btrfs_trans_handle *trans,
 		    struct inode *dir, struct inode *inode, const char *name,
 		    int namelen, u64 ref_index)
 {
+	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_dir_item *dir_item;
 	struct btrfs_key key;
 	struct btrfs_path *path;
@@ -1613,7 +1614,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 			goto out;

 		/* insert our name */
-		ret = add_link(trans, root, dir, inode, name, namelen,
+		ret = add_link(trans, dir, inode, name, namelen,
 			       ref_index);
 		if (ret)
 			goto out;
--
2.34.1.75.gabe6bb3905

From eeacd883bc46a68f87336c3c477dc865e790428d Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Mon, 25 Oct 2021 10:56:24 +0100
Subject: [PATCH 26/48] btrfs: remove root argument from check_item_in_log()

The root argument passed to check_item_in_log() always matches the root of
the given directory, so it can be eliminated.

Signed-off-by: Filipe Manana
---
 fs/btrfs/tree-log.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 42aefc7..72347c4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2281,13 +2281,13 @@ static noinline int find_dir_range(struct btrfs_root *root,
 * to is unlinked
 */
 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
-				      struct btrfs_root *root,
 				      struct btrfs_root *log,
 				      struct btrfs_path *path,
 				      struct btrfs_path *log_path,
 				      struct inode *dir,
 				      struct btrfs_key *dir_key)
 {
+	struct btrfs_root *root = BTRFS_I(dir)->root;
 	int ret;
 	struct extent_buffer *eb;
 	int slot;
@@ -2561,7 +2561,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 		if (found_key.offset > range_end)
 			break;

-		ret = check_item_in_log(trans, root, log, path,
+		ret = check_item_in_log(trans, log, path,
 					log_path, dir, &found_key);
 		if (ret)
--
2.34.1.75.gabe6bb3905

From c71fbf93113dcd032a77ebb28df4e6b0dbe3d3f1 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Mon, 25 Oct 2021 17:31:53 +0100
Subject: [PATCH 27/48] btrfs: only copy dir index keys when logging a directory

Currently, when logging a directory, we copy both dir items and dir index
items from the fs/subvolume tree to the log tree. Both items have exactly
the same data (the same struct btrfs_dir_item); the difference lies in the
key values: a dir index key contains the index number of a directory
entry, while a dir item key does not. The dir item key is used for fast
lookups of an entry by name, while the dir index key is used for sorting
entries when listing a directory.
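As a rough sketch of the two key shapes (dir_ino, name, name_len and index
are placeholders for the directory's inode number, the entry's name and
its index number; btrfs_name_hash() is the name hash used for dir item
keys):

	/* Fast lookup by name: the offset is a hash of the entry's name. */
	struct btrfs_key dir_item_key = {
		.objectid = dir_ino,
		.type     = BTRFS_DIR_ITEM_KEY,
		.offset   = btrfs_name_hash(name, name_len),
	};

	/* Ordered listing: the offset is the entry's index number. */
	struct btrfs_key dir_index_key = {
		.objectid = dir_ino,
		.type     = BTRFS_DIR_INDEX_KEY,
		.offset   = index,
	};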
We can exploit that and log only the dir index items, since they contain all the information needed to correctly add, replace and delete directory entries when replaying a log tree. Logging only the dir index items is also backward and forward compatible: an unpatched kernel (without this change) can correctly replay a log tree generated by a patched kernel (with this patch), and a patched kernel can correctly replay a log tree generated by an unpatched kernel. The backward compatibility is ensured because: 1) For inserting a new dentry: a dentry is only inserted when we find a new dir index key - we can only insert if we know the dir index offset, which is encoded in the dir index key's offset; 2) For deleting dentries: during log replay, before adding or replacing dentries, we first replay dentry deletions. Whenever we find a dir item key or a dir index key in the subvolume/fs tree that is not logged in a range for which the log tree is authoritative, we do the unlink of the dentry, which removes both the existing dir item key and the dir index key. Therefore logging just dir index keys is enough to ensure dentry deletions are correctly replayed; 3) For dentry replacements: they work when we log only dir index keys and this is mostly due to a combination of 1) and 2). If we replace a dentry with name "foobar" to point from inode A to inode B, then we know the dir index key for the new dentry is different from the old one, as it has an index number (key offset) larger than the old one. This results in replaying a deletion, through replay_dir_deletes(), that causes the old dentry to be removed, both the dir item key and the dir index key, as mentioned at 2). Then when processing the new dir index key, we add the new dentry, adding both a new dir item key and a new index key pointing to inode B, as stated in 1). The forward compatibility, the ability for a patched kernel to replay a log created by an older, unpatched kernel, comes from the changes required for making sure we are able to replay a log that only contains dir index keys - we simply ignore every dir item key we find. So modify directory logging to log only dir index items, and modify the log replay process to ignore dir item keys, from log trees created by an unpatched kernel, and process only with dir index keys. This reduces the amount of logged metadata by about half, and therefore the time spent logging or fsyncing large directories (less cpu time and less IO). The following test script was used to measure this change: #!/bin/bash DEV=/dev/nvme0n1 MNT=/mnt/nvme0n1 NUM_NEW_FILES=1000000 NUM_FILE_DELETES=10000 mkfs.btrfs -f $DEV mount -o ssd $DEV $MNT mkdir $MNT/testdir for ((i = 1; i <= $NUM_NEW_FILES; i++)); do echo -n > $MNT/testdir/file_$i done start=$(date +%s%N) xfs_io -c "fsync" $MNT/testdir end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "dir fsync took $dur ms after adding $NUM_NEW_FILES files" # sync to force transaction commit and wipeout the log. 
sync del_inc=$(( $NUM_NEW_FILES / $NUM_FILE_DELETES )) for ((i = 1; i <= $NUM_NEW_FILES; i += $del_inc)); do rm -f $MNT/testdir/file_$i done start=$(date +%s%N) xfs_io -c "fsync" $MNT/testdir end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "dir fsync took $dur ms after deleting $NUM_FILE_DELETES files" echo umount $MNT The tests were run on a physical machine, with a non-debug kernel (Debian's default kernel config), for different values of $NUM_NEW_FILES and $NUM_FILE_DELETES, and the results were the following: ** Before patch, NUM_NEW_FILES = 1 000 000, NUM_DELETE_FILES = 10 000 ** dir fsync took 8412 ms after adding 1000000 files dir fsync took 500 ms after deleting 10000 files ** After patch, NUM_NEW_FILES = 1 000 000, NUM_DELETE_FILES = 10 000 ** dir fsync took 4252 ms after adding 1000000 files (-49.5%) dir fsync took 269 ms after deleting 10000 files (-46.2%) ** Before patch, NUM_NEW_FILES = 100 000, NUM_DELETE_FILES = 1 000 ** dir fsync took 745 ms after adding 100000 files dir fsync took 59 ms after deleting 1000 files ** After patch, NUM_NEW_FILES = 100 000, NUM_DELETE_FILES = 1 000 ** dir fsync took 404 ms after adding 100000 files (-45.8%) dir fsync took 31 ms after deleting 1000 files (-47.5%) ** Before patch, NUM_NEW_FILES = 10 000, NUM_DELETE_FILES = 1 000 ** dir fsync took 67 ms after adding 10000 files dir fsync took 9 ms after deleting 1000 files ** After patch, NUM_NEW_FILES = 10 000, NUM_DELETE_FILES = 1 000 ** dir fsync took 36 ms after adding 10000 files (-46.3%) dir fsync took 5 ms after deleting 1000 files (-44.4%) ** Before patch, NUM_NEW_FILES = 1 000, NUM_DELETE_FILES = 100 ** dir fsync took 9 ms after adding 1000 files dir fsync took 4 ms after deleting 100 files ** After patch, NUM_NEW_FILES = 1 000, NUM_DELETE_FILES = 100 ** dir fsync took 7 ms after adding 1000 files (-22.2%) dir fsync took 3 ms after deleting 100 files (-25.0%) Signed-off-by: Filipe Manana --- fs/btrfs/btrfs_inode.h | 18 +- fs/btrfs/tree-log.c | 395 ++++++++++++++++++----------------------- 2 files changed, 182 insertions(+), 231 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 602b426..a303b0b 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -138,19 +138,11 @@ struct btrfs_inode { /* a local copy of root's last_log_commit */ int last_log_commit; - union { - /* - * Total number of bytes pending delalloc, used by stat to - * calculate the real block usage of the file. This is used - * only for files. - */ - u64 delalloc_bytes; - /* - * The offset of the last dir item key that was logged. - * This is used only for directories. - */ - u64 last_dir_item_offset; - }; + /* + * Total number of bytes pending delalloc, used by stat to calculate the + * real block usage of the file. This is used only for files. + */ + u64 delalloc_bytes; union { /* diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 72347c4..ba0f870 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1950,6 +1950,34 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, return ret; } +static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, + struct btrfs_path *path, + struct btrfs_dir_item *dst_di, + const struct btrfs_key *log_key, + u8 log_type, + bool exists) +{ + struct btrfs_key found_key; + + btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); + /* The existing dentry points to the same inode, don't delete it. 
*/ + if (found_key.objectid == log_key->objectid && + found_key.type == log_key->type && + found_key.offset == log_key->offset && + btrfs_dir_type(path->nodes[0], dst_di) == log_type) + return 1; + + /* + * Don't drop the conflicting directory entry if the inode for the new + * entry doesn't exist. + */ + if (!exists) + return 0; + + return drop_one_dir_item(trans, path, dir, dst_di); +} + /* * take a single entry in a log directory item and replay it into * the subvolume. @@ -1975,14 +2003,17 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, { char *name; int name_len; - struct btrfs_dir_item *dst_di; - struct btrfs_key found_key; + struct btrfs_dir_item *dir_dst_di; + struct btrfs_dir_item *index_dst_di; + bool dir_dst_matches = false; + bool index_dst_matches = false; struct btrfs_key log_key; + struct btrfs_key search_key; struct inode *dir; u8 log_type; bool exists; int ret; - bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); + bool update_size = true; bool name_added = false; dir = read_one_inode(root, key->objectid); @@ -2008,76 +2039,53 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, exists = (ret == 0); ret = 0; - if (key->type == BTRFS_DIR_ITEM_KEY) { - dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, - name, name_len, 1); - } else if (key->type == BTRFS_DIR_INDEX_KEY) { - dst_di = btrfs_lookup_dir_index_item(trans, root, path, - key->objectid, - key->offset, name, - name_len, 1); - } else { - /* Corruption */ - ret = -EINVAL; + dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, + name, name_len, 1); + if (IS_ERR(dir_dst_di)) { + ret = PTR_ERR(dir_dst_di); goto out; + } else if (dir_dst_di) { + ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, + dir_dst_di, &log_key, log_type, + exists); + if (ret < 0) + goto out; + dir_dst_matches = (ret == 1); } - if (IS_ERR(dst_di)) { - ret = PTR_ERR(dst_di); + btrfs_release_path(path); + + index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, + key->objectid, key->offset, + name, name_len, 1); + if (IS_ERR(index_dst_di)) { + ret = PTR_ERR(index_dst_di); goto out; - } else if (!dst_di) { - /* we need a sequence number to insert, so we only - * do inserts for the BTRFS_DIR_INDEX_KEY types - */ - if (key->type != BTRFS_DIR_INDEX_KEY) + } else if (index_dst_di) { + ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, + index_dst_di, &log_key, + log_type, exists); + if (ret < 0) goto out; - goto insert; + index_dst_matches = (ret == 1); } - btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); - /* the existing item matches the logged item */ - if (found_key.objectid == log_key.objectid && - found_key.type == log_key.type && - found_key.offset == log_key.offset && - btrfs_dir_type(path->nodes[0], dst_di) == log_type) { + btrfs_release_path(path); + + if (dir_dst_matches && index_dst_matches) { + ret = 0; update_size = false; goto out; } - /* - * don't drop the conflicting directory entry if the inode - * for the new entry doesn't exist - */ - if (!exists) - goto out; - - ret = drop_one_dir_item(trans, path, BTRFS_I(dir), dst_di); - if (ret) - goto out; - - if (key->type == BTRFS_DIR_INDEX_KEY) - goto insert; -out: - btrfs_release_path(path); - if (!ret && update_size) { - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); - ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); - } - kfree(name); - iput(dir); - if (!ret && name_added) - ret = 1; - return ret; - -insert: /* * Check if the inode reference exists in 
the log for the given name, * inode and parent inode */ - found_key.objectid = log_key.objectid; - found_key.type = BTRFS_INODE_REF_KEY; - found_key.offset = key->objectid; - ret = backref_in_log(root->log_root, &found_key, 0, name, name_len); + search_key.objectid = log_key.objectid; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = key->objectid; + ret = backref_in_log(root->log_root, &search_key, 0, name, name_len); if (ret < 0) { goto out; } else if (ret) { @@ -2087,10 +2095,10 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, goto out; } - found_key.objectid = log_key.objectid; - found_key.type = BTRFS_INODE_EXTREF_KEY; - found_key.offset = key->objectid; - ret = backref_in_log(root->log_root, &found_key, key->objectid, name, + search_key.objectid = log_key.objectid; + search_key.type = BTRFS_INODE_EXTREF_KEY; + search_key.offset = key->objectid; + ret = backref_in_log(root->log_root, &search_key, key->objectid, name, name_len); if (ret < 0) { goto out; @@ -2109,87 +2117,76 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, name_added = true; update_size = false; ret = 0; - goto out; + +out: + if (!ret && update_size) { + btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); + ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); + } + kfree(name); + iput(dir); + if (!ret && name_added) + ret = 1; + return ret; } -/* - * find all the names in a directory item and reconcile them into - * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than - * one name in a directory item, but the same code gets used for - * both directory index types - */ +/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - int ret = 0; - u32 item_size = btrfs_item_size_nr(eb, slot); + int ret; struct btrfs_dir_item *di; - int name_len; - unsigned long ptr; - unsigned long ptr_end; - struct btrfs_path *fixup_path = NULL; - ptr = btrfs_item_ptr_offset(eb, slot); - ptr_end = ptr + item_size; - while (ptr < ptr_end) { - di = (struct btrfs_dir_item *)ptr; - name_len = btrfs_dir_name_len(eb, di); - ret = replay_one_name(trans, root, path, eb, di, key); - if (ret < 0) - break; - ptr = (unsigned long)(di + 1); - ptr += name_len; + /* We only log dir index keys, which only contain a single dir item. */ + ASSERT(key->type == BTRFS_DIR_INDEX_KEY); - /* - * If this entry refers to a non-directory (directories can not - * have a link count > 1) and it was added in the transaction - * that was not committed, make sure we fixup the link count of - * the inode it the entry points to. Otherwise something like - * the following would result in a directory pointing to an - * inode with a wrong link that does not account for this dir - * entry: - * - * mkdir testdir - * touch testdir/foo - * touch testdir/bar - * sync - * - * ln testdir/bar testdir/bar_link - * ln testdir/foo testdir/foo_link - * xfs_io -c "fsync" testdir/bar - * - * - * - * mount fs, log replay happens - * - * File foo would remain with a link count of 1 when it has two - * entries pointing to it in the directory testdir. This would - * make it impossible to ever delete the parent directory has - * it would result in stale dentries that can never be deleted. 
- */ - if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { - struct btrfs_key di_key; + di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + ret = replay_one_name(trans, root, path, eb, di, key); + if (ret < 0) + return ret; - if (!fixup_path) { - fixup_path = btrfs_alloc_path(); - if (!fixup_path) { - ret = -ENOMEM; - break; - } - } + /* + * If this entry refers to a non-directory (directories can not have a + * link count > 1) and it was added in the transaction that was not + * committed, make sure we fixup the link count of the inode the entry + * points to. Otherwise something like the following would result in a + * directory pointing to an inode with a wrong link that does not account + * for this dir entry: + * + * mkdir testdir + * touch testdir/foo + * touch testdir/bar + * sync + * + * ln testdir/bar testdir/bar_link + * ln testdir/foo testdir/foo_link + * xfs_io -c "fsync" testdir/bar + * + * + * + * mount fs, log replay happens + * + * File foo would remain with a link count of 1 when it has two entries + * pointing to it in the directory testdir. This would make it impossible + * to ever delete the parent directory has it would result in stale + * dentries that can never be deleted. + */ + if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { + struct btrfs_path *fixup_path; + struct btrfs_key di_key; - btrfs_dir_item_key_to_cpu(eb, di, &di_key); - ret = link_to_fixup_dir(trans, root, fixup_path, - di_key.objectid); - if (ret) - break; - } - ret = 0; + fixup_path = btrfs_alloc_path(); + if (!fixup_path) + return -ENOMEM; + + btrfs_dir_item_key_to_cpu(eb, di, &di_key); + ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid); + btrfs_free_path(fixup_path); } - btrfs_free_path(fixup_path); + return ret; } @@ -2743,12 +2740,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, eb, i, &key); if (ret) break; - } else if (key.type == BTRFS_DIR_ITEM_KEY) { - ret = replay_one_dir_item(wc->trans, root, path, - eb, i, &key); - if (ret) - break; } + /* + * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the + * BTRFS_DIR_INDEX_KEY items which we use to derive the + * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an + * older kernel with such keys, ignore them. + */ } btrfs_free_path(path); return ret; @@ -3551,20 +3549,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, goto out_unlock; } - di = btrfs_lookup_dir_item(trans, log, path, dir_ino, - name, name_len, -1); - if (IS_ERR(di)) { - err = PTR_ERR(di); - goto fail; - } - if (di) { - ret = btrfs_delete_one_dir_name(trans, log, path, di); - if (ret) { - err = ret; - goto fail; - } - } - btrfs_release_path(path); + /* + * We only log dir index items of a directory, so we don't need to look + * for dir item keys. 
+ */ di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, index, name, name_len, -1); if (IS_ERR(di)) { @@ -3640,7 +3628,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, - int key_type, u64 dirid, + u64 dirid, u64 first_offset, u64 last_offset) { int ret; @@ -3649,10 +3637,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, key.objectid = dirid; key.offset = first_offset; - if (key_type == BTRFS_DIR_ITEM_KEY) - key.type = BTRFS_DIR_LOG_ITEM_KEY; - else - key.type = BTRFS_DIR_LOG_INDEX_KEY; + key.type = BTRFS_DIR_LOG_INDEX_KEY; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); if (ret) return ret; @@ -3744,7 +3729,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, - int key_type, struct btrfs_log_ctx *ctx) { struct btrfs_root *log = inode->root->log_root; @@ -3752,24 +3736,18 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, const int nritems = btrfs_header_nritems(src); const u64 ino = btrfs_ino(inode); const bool inode_logged_before = inode_logged(trans, inode); - u64 last_logged_key_offset; bool last_found = false; int batch_start = 0; int batch_size = 0; int i; - if (key_type == BTRFS_DIR_ITEM_KEY) - last_logged_key_offset = inode->last_dir_item_offset; - else - last_logged_key_offset = inode->last_dir_index_offset; - for (i = path->slots[0]; i < nritems; i++) { struct btrfs_key key; int ret; btrfs_item_key_to_cpu(src, &key, i); - if (key.objectid != ino || key.type != key_type) { + if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) { last_found = true; break; } @@ -3818,7 +3796,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, * we logged is in the log tree, saving time and avoiding adding * contention on the log tree. */ - if (key.offset > last_logged_key_offset) + if (key.offset > inode->last_dir_index_offset) goto add_to_batch; /* * Check if the key was already logged before. 
If not we can add @@ -3877,7 +3855,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_path *dst_path, int key_type, + struct btrfs_path *dst_path, struct btrfs_log_ctx *ctx, u64 min_offset, u64 *last_offset_ret) { @@ -3891,7 +3869,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, u64 ino = btrfs_ino(inode); min_key.objectid = ino; - min_key.type = key_type; + min_key.type = BTRFS_DIR_INDEX_KEY; min_key.offset = min_offset; ret = btrfs_search_forward(root, &min_key, path, trans->transid); @@ -3900,9 +3878,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * we didn't find anything from this transaction, see if there * is anything at all */ - if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { + if (ret != 0 || min_key.objectid != ino || + min_key.type != BTRFS_DIR_INDEX_KEY) { min_key.objectid = ino; - min_key.type = key_type; + min_key.type = BTRFS_DIR_INDEX_KEY; min_key.offset = (u64)-1; btrfs_release_path(path); ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); @@ -3910,7 +3889,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, btrfs_release_path(path); return ret; } - ret = btrfs_previous_item(root, path, ino, key_type); + ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); /* if ret == 0 there are items for this type, * create a range to tell us the last key of this type. @@ -3921,18 +3900,18 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_key tmp; btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); - if (key_type == tmp.type) + if (tmp.type == BTRFS_DIR_INDEX_KEY) first_offset = max(min_offset, tmp.offset) + 1; } goto done; } /* go backward to find any previous key */ - ret = btrfs_previous_item(root, path, ino, key_type); + ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); if (ret == 0) { struct btrfs_key tmp; btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); - if (key_type == tmp.type) { + if (tmp.type == BTRFS_DIR_INDEX_KEY) { first_offset = tmp.offset; ret = overwrite_item(trans, log, dst_path, path->nodes[0], path->slots[0], @@ -3963,8 +3942,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * from our directory */ while (1) { - ret = process_dir_items_leaf(trans, inode, path, dst_path, - key_type, ctx); + ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx); if (ret != 0) { if (ret < 0) err = ret; @@ -3985,7 +3963,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, goto done; } btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); - if (min_key.objectid != ino || min_key.type != key_type) { + if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) { last_offset = (u64)-1; goto done; } @@ -4015,8 +3993,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * insert the log range keys to indicate where the log * is valid */ - ret = insert_dir_log_key(trans, log, path, key_type, - ino, first_offset, last_offset); + ret = insert_dir_log_key(trans, log, path, ino, first_offset, + last_offset); if (ret) err = ret; } @@ -4044,35 +4022,28 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, u64 min_key; u64 max_key; int ret; - int key_type = BTRFS_DIR_ITEM_KEY; /* * If this is the first time we are being logged in the current * 
transaction, or we were logged before but the inode was evicted and - * reloaded later, in which case its logged_trans is 0, reset the values - * of the last logged key offsets. Note that we don't use the helper + * reloaded later, in which case its logged_trans is 0, reset the value + * of the last logged key offset. Note that we don't use the helper * function inode_logged() here - that is because the function returns * true after an inode eviction, assuming the worst case as it can not * know for sure if the inode was logged before. So we can not skip key * searches in the case the inode was evicted, because it may not have * been logged in this transaction and may have been logged in a past - * transaction, so we need to reset the last dir item and index offsets - * to (u64)-1. + * transaction, so we need to reset the last dir index offset to (u64)-1. */ - if (inode->logged_trans != trans->transid) { - inode->last_dir_item_offset = (u64)-1; + if (inode->logged_trans != trans->transid) inode->last_dir_index_offset = (u64)-1; - } -again: + min_key = 0; max_key = 0; - if (key_type == BTRFS_DIR_ITEM_KEY) - ctx->last_dir_item_offset = inode->last_dir_item_offset; - else - ctx->last_dir_item_offset = inode->last_dir_index_offset; + ctx->last_dir_item_offset = inode->last_dir_index_offset; while (1) { - ret = log_dir_items(trans, inode, path, dst_path, key_type, + ret = log_dir_items(trans, inode, path, dst_path, ctx, min_key, &max_key); if (ret) return ret; @@ -4081,13 +4052,8 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, min_key = max_key + 1; } - if (key_type == BTRFS_DIR_ITEM_KEY) { - inode->last_dir_item_offset = ctx->last_dir_item_offset; - key_type = BTRFS_DIR_INDEX_KEY; - goto again; - } else { - inode->last_dir_index_offset = ctx->last_dir_item_offset; - } + inode->last_dir_index_offset = ctx->last_dir_item_offset; + return 0; } @@ -5910,18 +5876,12 @@ struct btrfs_dir_list { * link_to_fixup_dir()); * * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that - * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and - * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item + * while logging the inode's items new index items (key type + * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item * has a size that doesn't match the sum of the lengths of all the logged - * names. This does not result in a problem because if a dir_item key is - * logged but its matching dir_index key is not logged, at log replay time we - * don't use it to replay the respective name (see replay_one_name()). On the - * other hand if only the dir_index key ends up being logged, the respective - * name is added to the fs/subvol tree with both the dir_item and dir_index - * keys created (see replay_one_name()). - * The directory's inode item with a wrong i_size is not a problem as well, - * since we don't use it at log replay time to set the i_size in the inode - * item of the fs/subvol tree (see overwrite_item()). + * names - this is ok, not a problem, because at log replay time we set the + * directory's i_size to the correct value (see replay_one_name() and + * do_overwrite_item()). 
*/ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -5967,7 +5927,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, goto next_dir_inode; min_key.objectid = dir_elem->ino; - min_key.type = BTRFS_DIR_ITEM_KEY; + min_key.type = BTRFS_DIR_INDEX_KEY; min_key.offset = 0; again: btrfs_release_path(path); @@ -5992,7 +5952,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &min_key, i); if (min_key.objectid != dir_elem->ino || - min_key.type != BTRFS_DIR_ITEM_KEY) + min_key.type != BTRFS_DIR_INDEX_KEY) goto next_dir_inode; di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); @@ -6807,15 +6767,14 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * was previously logged, make sure the next log attempt on the directory * is not skipped and logs the inode again. This is because the log may * not currently be authoritative for a range including the old - * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make - * sure after a log replay we do not end up with both the new and old - * dentries around (in case the inode is a directory we would have a - * directory with two hard links and 2 inode references for different - * parents). The next log attempt of old_dir will happen at - * btrfs_log_all_parents(), called through btrfs_log_inode_parent() - * below, because we have previously set inode->last_unlink_trans to the - * current transaction ID, either here or at btrfs_record_unlink_dir() in - * case inode is a directory. + * BTRFS_DIR_INDEX_KEY key, so we want to make sure after a log replay we + * do not end up with both the new and old dentries around (in case the + * inode is a directory we would have a directory with two hard links and + * 2 inode references for different parents). The next log attempt of + * old_dir will happen at btrfs_log_all_parents(), called through + * btrfs_log_inode_parent() below, because we have previously set + * inode->last_unlink_trans to the current transaction ID, either here or + * at btrfs_record_unlink_dir() in case the inode is a directory. */ if (old_dir) old_dir->logged_trans = 0; -- 2.34.1.75.gabe6bb3905 From b5de5ae91d51e13de666d48493bacd45ae0928d4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 25 Oct 2021 10:56:26 +0100 Subject: [PATCH 28/48] btrfs: remove no longer needed logic for replaying directory deletes Now that we log only dir index keys when logging a directory, we no longer need to deal with dir item keys in the log replay code for replaying directory deletes. This is also true for the case when we replay a log tree created by a kernel that still logs dir items. So remove the remaining code of the replay of directory deletes algorithm that deals with dir item keys. 
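To make the resulting control flow concrete, here is a minimal self-contained
userspace model of the single-pass replay this leaves us with. All names in it
are illustrative stand-ins invented for this sketch, not kernel identifiers;
in the kernel the equivalent work is done by find_dir_range() and
check_item_in_log():

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for a BTRFS_DIR_LOG_INDEX_KEY item: the log is authoritative
 * for directory index offsets in [start, end]. */
struct dir_log_range {
        uint64_t start;
        uint64_t end;
};

/* Toy predicate standing in for looking up an index item in the log. */
static bool index_in_log(uint64_t index)
{
        return index == 2;      /* pretend only index 2 was logged */
}

int main(void)
{
        const struct dir_log_range range = { .start = 0, .end = 10 };
        const uint64_t fs_indexes[] = { 1, 2, 7 }; /* entries in fs tree */

        /* Single pass: no second iteration for BTRFS_DIR_ITEM_KEY anymore. */
        for (size_t i = 0; i < sizeof(fs_indexes) / sizeof(fs_indexes[0]); i++) {
                uint64_t idx = fs_indexes[i];

                if (idx < range.start || idx > range.end)
                        continue;
                if (!index_in_log(idx))
                        printf("replay delete of entry at index %llu\n",
                               (unsigned long long)idx);
        }
        return 0;
}

Before this patch the same walk had to run twice, once per key type, and had
to cope with multiple names packed into a single dir item.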
Signed-off-by: Filipe Manana --- fs/btrfs/tree-log.c | 158 ++++++++++++++------------------ include/uapi/linux/btrfs_tree.h | 4 +- 2 files changed, 72 insertions(+), 90 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ba0f870..586a88b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2203,7 +2203,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, */ static noinline int find_dir_range(struct btrfs_root *root, struct btrfs_path *path, - u64 dirid, int key_type, + u64 dirid, u64 *start_ret, u64 *end_ret) { struct btrfs_key key; @@ -2216,7 +2216,7 @@ static noinline int find_dir_range(struct btrfs_root *root, return 1; key.objectid = dirid; - key.type = key_type; + key.type = BTRFS_DIR_LOG_INDEX_KEY; key.offset = *start_ret; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -2230,7 +2230,7 @@ static noinline int find_dir_range(struct btrfs_root *root, if (ret != 0) btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.type != key_type || key.objectid != dirid) { + if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { ret = 1; goto next; } @@ -2257,7 +2257,7 @@ static noinline int find_dir_range(struct btrfs_root *root, btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.type != key_type || key.objectid != dirid) { + if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { ret = 1; goto out; } @@ -2288,95 +2288,82 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, int ret; struct extent_buffer *eb; int slot; - u32 item_size; struct btrfs_dir_item *di; - struct btrfs_dir_item *log_di; int name_len; - unsigned long ptr; - unsigned long ptr_end; char *name; - struct inode *inode; + struct inode *inode = NULL; struct btrfs_key location; -again: + /* + * Currenly we only log dir index keys. Even if we replay a log created + * by an older kernel that logged both dir index and dir item keys, all + * we need to do is process the dir index keys, we (and our caller) can + * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). 
+ */ + ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); + eb = path->nodes[0]; slot = path->slots[0]; - item_size = btrfs_item_size_nr(eb, slot); - ptr = btrfs_item_ptr_offset(eb, slot); - ptr_end = ptr + item_size; - while (ptr < ptr_end) { - di = (struct btrfs_dir_item *)ptr; - name_len = btrfs_dir_name_len(eb, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) { - ret = -ENOMEM; - goto out; - } - read_extent_buffer(eb, name, (unsigned long)(di + 1), - name_len); - log_di = NULL; - if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { - log_di = btrfs_lookup_dir_item(trans, log, log_path, - dir_key->objectid, - name, name_len, 0); - } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { - log_di = btrfs_lookup_dir_index_item(trans, log, - log_path, - dir_key->objectid, - dir_key->offset, - name, name_len, 0); - } - if (!log_di) { - btrfs_dir_item_key_to_cpu(eb, di, &location); - btrfs_release_path(path); - btrfs_release_path(log_path); - inode = read_one_inode(root, location.objectid); - if (!inode) { - kfree(name); - return -EIO; - } + di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + name_len = btrfs_dir_name_len(eb, di); + name = kmalloc(name_len, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto out; + } - ret = link_to_fixup_dir(trans, root, - path, location.objectid); - if (ret) { - kfree(name); - iput(inode); - goto out; - } + read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); - inc_nlink(inode); - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), - BTRFS_I(inode), name, name_len); - if (!ret) - ret = btrfs_run_delayed_items(trans); - kfree(name); - iput(inode); - if (ret) - goto out; + if (log) { + struct btrfs_dir_item *log_di; - /* there might still be more names under this key - * check and repeat if required - */ - ret = btrfs_search_slot(NULL, root, dir_key, path, - 0, 0); - if (ret == 0) - goto again; + log_di = btrfs_lookup_dir_index_item(trans, log, log_path, + dir_key->objectid, + dir_key->offset, + name, name_len, 0); + if (IS_ERR(log_di)) { + ret = PTR_ERR(log_di); + goto out; + } else if (log_di) { + /* The dentry exists in the log, we have nothing to do. */ ret = 0; goto out; - } else if (IS_ERR(log_di)) { - kfree(name); - return PTR_ERR(log_di); } - btrfs_release_path(log_path); - kfree(name); + } - ptr = (unsigned long)(di + 1); - ptr += name_len; + btrfs_dir_item_key_to_cpu(eb, di, &location); + btrfs_release_path(path); + btrfs_release_path(log_path); + inode = read_one_inode(root, location.objectid); + if (!inode) { + ret = -EIO; + goto out; } - ret = 0; + + ret = link_to_fixup_dir(trans, root, path, location.objectid); + if (ret) + goto out; + + inc_nlink(inode); + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name, + name_len); + if (ret) + goto out; + + ret = btrfs_run_delayed_items(trans); + if (ret) + goto out; + + /* + * Unlike dir item keys, dir index keys can only have one name (entry) in + * them, as there are no key collisions since each key has a unique offset + * (an index number), so we're done. 
+ */ out: btrfs_release_path(path); btrfs_release_path(log_path); + kfree(name); + iput(inode); return ret; } @@ -2496,7 +2483,6 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, { u64 range_start; u64 range_end; - int key_type = BTRFS_DIR_LOG_ITEM_KEY; int ret = 0; struct btrfs_key dir_key; struct btrfs_key found_key; @@ -2504,7 +2490,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct inode *dir; dir_key.objectid = dirid; - dir_key.type = BTRFS_DIR_ITEM_KEY; + dir_key.type = BTRFS_DIR_INDEX_KEY; log_path = btrfs_alloc_path(); if (!log_path) return -ENOMEM; @@ -2518,14 +2504,14 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, btrfs_free_path(log_path); return 0; } -again: + range_start = 0; range_end = 0; while (1) { if (del_all) range_end = (u64)-1; else { - ret = find_dir_range(log, path, dirid, key_type, + ret = find_dir_range(log, path, dirid, &range_start, &range_end); if (ret < 0) goto out; @@ -2552,8 +2538,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); if (found_key.objectid != dirid || - found_key.type != dir_key.type) - goto next_type; + found_key.type != dir_key.type) { + ret = 0; + goto out; + } if (found_key.offset > range_end) break; @@ -2572,15 +2560,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, break; range_start = range_end + 1; } - -next_type: ret = 0; - if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { - key_type = BTRFS_DIR_LOG_INDEX_KEY; - dir_key.type = BTRFS_DIR_INDEX_KEY; - btrfs_release_path(path); - goto again; - } out: btrfs_release_path(path); btrfs_free_path(log_path); diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index e1c4c73..5416f1f 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -146,7 +146,9 @@ /* * dir items are the name -> inode pointers in a directory. There is one - * for every name in a directory. + * for every name in a directory. BTRFS_DIR_LOG_ITEM_KEY is no longer used + * but it's still defined here for documentation purposes and to help avoid + * having its numerical value reused in the future. */ #define BTRFS_DIR_LOG_ITEM_KEY 60 #define BTRFS_DIR_LOG_INDEX_KEY 72 -- 2.34.1.75.gabe6bb3905 From e9942f02c4bb955d56dd62feb5dfb06cf0330a13 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 18 Nov 2021 10:26:15 -0500 Subject: [PATCH 29/48] btrfs: only use ->max_extent_size if it is set in the bitmap While adding self tests for my space index change I was hitting a problem where the space indexed tree wasn't returning the expected ->max_extent_size. This is because we will skip searching any entry that doesn't have ->bytes >= the amount of bytes we want. However we'll still set the max_extent_size based on that entry. The problem is if we don't search the bitmap we won't have ->max_extent_size set properly, so we can't really trust it. This doesn't really result in a problem per-se, it can just result in us not finding contiguous area that may exist. Fix the max_extent_size helper to return ->bytes if ->max_extent_size isn't set, and add a big comment explaining why we're doing this. 
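The two-entry scenario described in that comment is easy to model outside the
kernel. The following is an illustrative userspace sketch (a trimmed stand-in
struct, not the real struct btrfs_free_space) of the fixed helper:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Minimal stand-in for struct btrfs_free_space, fields trimmed. */
struct entry {
        bool bitmap;
        uint64_t bytes;
        uint64_t max_extent_size;       /* 0 means "never searched" */
};

/* Mirrors the fixed helper: trust ->max_extent_size only when it is set. */
static uint64_t get_max_extent_size(const struct entry *e)
{
        if (e->bitmap && e->max_extent_size)
                return e->max_extent_size;
        return e->bytes;
}

int main(void)
{
        /* The scenario from the comment above. */
        struct entry searched = { .bitmap = true, .bytes = 1 << 20,
                                  .max_extent_size = 4096 };
        struct entry fresh    = { .bitmap = true, .bytes = 8192,
                                  .max_extent_size = 0 };

        /* 4096: we really searched this bitmap, so 4k is trustworthy. */
        printf("%llu\n", (unsigned long long)get_max_extent_size(&searched));
        /* 8192: unsearched bitmap, fall back to ->bytes so the allocator
         * retries with a usable hint instead of a stale 4k. */
        printf("%llu\n", (unsigned long long)get_max_extent_size(&fresh));
        return 0;
}

With the one-line change, the second entry reports 8k instead of a stale 0 or
4k, which is exactly the hint the allocator needs to come back and find the
contiguous range.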
Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 068d9ba..a43857d 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1907,9 +1907,33 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, return -1; } +/* + * This is a little subtle. We *only* have ->max_extent_size set if we actually + * searched through the bitmap and figured out the largest ->max_extent_size, + * otherwise it's 0. In the case that it's 0 we don't want to tell the + * allocator the wrong thing, we want to use the actual real max_extent_size + * we've found already if it's larger, or we want to use ->bytes. + * + * This matters because find_free_space() will skip entries who's ->bytes is + * less than the required bytes. So if we didn't search down this bitmap, we + * may pick some previous entry that has a smaller ->max_extent_size than we + * have. For example, assume we have two entries, one that has + * ->max_extent_size set to 4k and ->bytes set to 1M. A second entry hasn't set + * ->max_extent_size yet, has ->bytes set to 8k and it's contiguous. We will + * call into find_free_space(), and return with max_extent_size == 4k, because + * that first bitmap entry had ->max_extent_size set, but the second one did + * not. If instead we returned 8k we'd come in searching for 8k, and find the + * 8k contiguous range. + * + * Consider the other case, we have 2 8k chunks in that second entry and still + * don't have ->max_extent_size set. We'll return 16k, and the next time the + * allocator comes in it'll fully search our second bitmap, and this time it'll + * get an uptodate value of 8k as the maximum chunk size. Then we'll get the + * right allocation the next loop through. + */ static inline u64 get_max_extent_size(struct btrfs_free_space *entry) { - if (entry->bitmap) + if (entry->bitmap && entry->max_extent_size) return entry->max_extent_size; return entry->bytes; } -- 2.34.1.75.gabe6bb3905 From f3531e23d5be36a3de6ab19d6da850533937deaf Mon Sep 17 00:00:00 2001 From: Piotr Gorski Date: Fri, 19 Nov 2021 11:47:04 +0100 Subject: [PATCH 30/48] Revert "btrfs: index free space entries on size" This reverts commit 49ec5272f1c499d7fd43b1ba735bc6cf51ea07cf. Signed-off-by: Piotr Gorski --- fs/btrfs/free-space-cache.c | 84 ++++--------------------------------- fs/btrfs/free-space-cache.h | 2 - 2 files changed, 7 insertions(+), 79 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index a43857d..6ca3a92 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1576,44 +1576,6 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, return 0; } -static u64 free_space_info_bytes(struct btrfs_free_space *info) -{ - if (info->bitmap && info->max_extent_size) - return info->max_extent_size; - return info->bytes; -} - -/* - * This is indexed in reverse of what we generally do for rb-tree's, the largest - * chunks are left most and the smallest are rightmost. This is so that we can - * take advantage of the cached property of the cached rb-tree and simply get - * the largest free space chunk right away. 
- */ -static void tree_insert_bytes(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) -{ - struct rb_root_cached *root = &ctl->free_space_bytes; - struct rb_node **p = &root->rb_root.rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_free_space *tmp; - bool leftmost = true; - - while (*p) { - parent_node = *p; - tmp = rb_entry(parent_node, struct btrfs_free_space, - bytes_index); - if (free_space_info_bytes(info) < free_space_info_bytes(tmp)) { - p = &(*p)->rb_right; - leftmost = false; - } else { - p = &(*p)->rb_left; - } - } - - rb_link_node(&info->bytes_index, parent_node, p); - rb_insert_color_cached(&info->bytes_index, root, leftmost); -} - /* * searches the tree for the given offset. * @@ -1742,7 +1704,6 @@ __unlink_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { rb_erase(&info->offset_index, &ctl->free_space_offset); - rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); ctl->free_extents--; if (!info->bitmap && !btrfs_free_space_trimmed(info)) { @@ -1769,8 +1730,6 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, if (ret) return ret; - tree_insert_bytes(ctl, info); - if (!info->bitmap && !btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR]++; ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; @@ -1941,7 +1900,7 @@ static inline u64 get_max_extent_size(struct btrfs_free_space *entry) /* Cache the size of the max extent in bytes */ static struct btrfs_free_space * find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, - unsigned long align, u64 *max_extent_size, bool use_bytes_index) + unsigned long align, u64 *max_extent_size) { struct btrfs_free_space *entry; struct rb_node *node; @@ -1952,37 +1911,15 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, if (!ctl->free_space_offset.rb_node) goto out; - if (use_bytes_index) { - node = rb_first_cached(&ctl->free_space_bytes); - } else { - entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), - 0, 1); - if (!entry) - goto out; - node = &entry->offset_index; - } - - for (; node; node = rb_next(node)) { - if (use_bytes_index) - entry = rb_entry(node, struct btrfs_free_space, - bytes_index); - else - entry = rb_entry(node, struct btrfs_free_space, - offset_index); + entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); + if (!entry) + goto out; - /* - * If we are using the bytes index then all subsequent entries - * in this tree are going to be < bytes, so simply set the max - * extent size and exit the loop. - * - * If we're using the offset index then we need to keep going - * through the rest of the tree. 
- */ + for (node = &entry->offset_index; node; node = rb_next(node)) { + entry = rb_entry(node, struct btrfs_free_space, offset_index); if (entry->bytes < *bytes) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); - if (use_bytes_index) - break; continue; } @@ -2569,7 +2506,6 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, info->bytes = bytes; info->trim_state = trim_state; RB_CLEAR_NODE(&info->offset_index); - RB_CLEAR_NODE(&info->bytes_index); spin_lock(&ctl->tree_lock); @@ -2873,7 +2809,6 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, ctl->start = block_group->start; ctl->private = block_group; ctl->op = &free_space_op; - ctl->free_space_bytes = RB_ROOT_CACHED; INIT_LIST_HEAD(&ctl->trimming_ranges); mutex_init(&ctl->cache_writeout_mutex); @@ -2939,7 +2874,6 @@ static void __btrfs_return_cluster_to_free_space( } tree_insert_offset(&ctl->free_space_offset, entry->offset, &entry->offset_index, bitmap); - tree_insert_bytes(ctl, entry); } cluster->root = RB_ROOT; spin_unlock(&cluster->lock); @@ -3041,14 +2975,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 align_gap = 0; u64 align_gap_len = 0; enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; - bool use_bytes_index = (offset == block_group->start); ASSERT(!btrfs_is_zoned(block_group->fs_info)); spin_lock(&ctl->tree_lock); entry = find_free_space(ctl, &offset, &bytes_search, - block_group->full_stripe_len, max_extent_size, - use_bytes_index); + block_group->full_stripe_len, max_extent_size); if (!entry) goto out; @@ -3332,7 +3264,6 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, cluster->window_start = start * ctl->unit + entry->offset; rb_erase(&entry->offset_index, &ctl->free_space_offset); - rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 1); ASSERT(!ret); /* -EEXIST; Logic error */ @@ -3423,7 +3354,6 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group, continue; rb_erase(&entry->offset_index, &ctl->free_space_offset); - rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); total_size += entry->bytes; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index dd982d2..1f23088 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -22,7 +22,6 @@ enum btrfs_trim_state { struct btrfs_free_space { struct rb_node offset_index; - struct rb_node bytes_index; u64 offset; u64 bytes; u64 max_extent_size; @@ -46,7 +45,6 @@ static inline bool btrfs_free_space_trimming_bitmap( struct btrfs_free_space_ctl { spinlock_t tree_lock; struct rb_root free_space_offset; - struct rb_root_cached free_space_bytes; u64 free_space; int extents_thresh; int free_extents; -- 2.34.1.75.gabe6bb3905 From e37d4b4a407f642a472beb2f17b28768d3861bc0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 18 Nov 2021 10:26:16 -0500 Subject: [PATCH 31/48] btrfs: index free space entries on size Currently we index free space on offset only, because usually we have a hint from the allocator that we want to honor for locality reasons. However if we fail to use this hint we have to go back to a brute force search through the free space entries to find a large enough extent. 
With sufficiently fragmented free space this becomes quite expensive, as
we have to linearly search all of the free space entries to find one
that is large enough.

To fix this add a cached rb tree to index based on free space entry
bytes. This will allow us to quickly look up the largest chunk in the
free space tree for this block group, and stop searching once we've
found an entry that is too small to satisfy our allocation. We simply
choose to use this tree if we're searching from the beginning of the
block group, as we know we do not care about locality at that point.

I wrote an allocator test that creates a 10TiB RAM-backed null block
device and then fallocates random files until the file system is full.
I then go through and delete all of the odd files. Then I spawn 8
threads that fallocate 64MiB files (1/2 our extent size cap) until the
file system is full again. I use bcc's funclatency to measure the
latency of find_free_extent. The baseline results are

     nsecs               : count     distribution
         0 -> 1          : 0        |                                        |
         2 -> 3          : 0        |                                        |
         4 -> 7          : 0        |                                        |
         8 -> 15         : 0        |                                        |
        16 -> 31         : 0        |                                        |
        32 -> 63         : 0        |                                        |
        64 -> 127        : 0        |                                        |
       128 -> 255        : 0        |                                        |
       256 -> 511        : 10356    |****                                    |
       512 -> 1023       : 58242    |*************************               |
      1024 -> 2047       : 74418    |********************************        |
      2048 -> 4095       : 90393    |****************************************|
      4096 -> 8191       : 79119    |***********************************     |
      8192 -> 16383      : 35614    |***************                         |
     16384 -> 32767      : 13418    |*****                                   |
     32768 -> 65535      : 12811    |*****                                   |
     65536 -> 131071     : 17090    |*******                                 |
    131072 -> 262143     : 26465    |***********                             |
    262144 -> 524287     : 40179    |*****************                       |
    524288 -> 1048575    : 55469    |************************                |
   1048576 -> 2097151    : 48807    |*********************                   |
   2097152 -> 4194303    : 26744    |***********                             |
   4194304 -> 8388607    : 35351    |***************                         |
   8388608 -> 16777215   : 13918    |******                                  |
  16777216 -> 33554431   : 21       |                                        |

avg = 908079 nsecs, total: 580889071441 nsecs, count: 639690

And the patch results are

     nsecs               : count     distribution
         0 -> 1          : 0        |                                        |
         2 -> 3          : 0        |                                        |
         4 -> 7          : 0        |                                        |
         8 -> 15         : 0        |                                        |
        16 -> 31         : 0        |                                        |
        32 -> 63         : 0        |                                        |
        64 -> 127        : 0        |                                        |
       128 -> 255        : 0        |                                        |
       256 -> 511        : 6883     |**                                      |
       512 -> 1023       : 54346    |*********************                   |
      1024 -> 2047       : 79170    |********************************        |
      2048 -> 4095       : 98890    |****************************************|
      4096 -> 8191       : 81911    |*********************************       |
      8192 -> 16383      : 27075    |**********                              |
     16384 -> 32767      : 14668    |*****                                   |
     32768 -> 65535      : 13251    |*****                                   |
     65536 -> 131071     : 15340    |******                                  |
    131072 -> 262143     : 26715    |**********                              |
    262144 -> 524287     : 43274    |*****************                       |
    524288 -> 1048575    : 53870    |*********************                   |
   1048576 -> 2097151    : 55368    |**********************                  |
   2097152 -> 4194303    : 41036    |****************                        |
   4194304 -> 8388607    : 24927    |**********                              |
   8388608 -> 16777215   : 33       |                                        |
  16777216 -> 33554431   : 9        |                                        |

avg = 623599 nsecs, total: 397259314759 nsecs, count: 637042

There's a little variation in the number of calls made because of the
timing of the threads with metadata requirements, but the avg, total,
and counts are relatively consistent between runs (usually within 2-5%
of each other). As you can see here we have around a 30% decrease in
average latency with a 30% decrease in overall time spent in
find_free_extent.
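The shape of the win is easy to see in miniature. Below is a toy userspace
model of the early-exit property the bytes index gives find_free_space(): a
largest-first linked list stands in for the cached rb-tree, and none of these
names exist in the kernel:

#include <stdint.h>
#include <stdio.h>

/* Toy entry: the list is kept sorted largest-first, the way the cached
 * rb-tree keeps the largest chunk leftmost. */
struct toy_entry {
        uint64_t bytes;
        struct toy_entry *next;         /* stands in for rb_next() */
};

/*
 * Model of the property the patch exploits: the head of the index (the
 * rb_first_cached() equivalent) is the largest chunk, so if even it is
 * too small we can fail immediately, and still report a useful
 * max_extent_size hint, without a linear scan of every entry.
 */
static struct toy_entry *find_largest(struct toy_entry *largest_first,
                                      uint64_t want, uint64_t *hint)
{
        struct toy_entry *e = largest_first;

        if (!e)
                return NULL;
        if (e->bytes < want) {
                *hint = e->bytes;       /* the best we could ever offer */
                return NULL;
        }
        return e;
}

int main(void)
{
        struct toy_entry c = { .bytes = 4096, .next = NULL };
        struct toy_entry b = { .bytes = 65536, .next = &c };
        struct toy_entry a = { .bytes = 1 << 20, .next = &b };
        uint64_t hint = 0;

        if (!find_largest(&a, 1 << 21, &hint)) /* 2MiB: nothing fits */
                printf("ENOSPC fast path, hint %llu bytes\n",
                       (unsigned long long)hint);
        return 0;
}

In the real code the search cannot always stop at the first node because of
alignment constraints and bitmap entries, which is why the patch only breaks
out early on the use_bytes_index path.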
Signed-off-by: Josef Bacik Reviewed-by: Filipe Manana --- fs/btrfs/free-space-cache.c | 198 +++++++++++++++++++++++------- fs/btrfs/free-space-cache.h | 2 + fs/btrfs/tests/free-space-tests.c | 181 +++++++++++++++++++++++++++ 3 files changed, 337 insertions(+), 44 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6ca3a92..349bea1 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1576,6 +1576,68 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, return 0; } +/* + * This is a little subtle. We *only* have ->max_extent_size set if we actually + * searched through the bitmap and figured out the largest ->max_extent_size, + * otherwise it's 0. In the case that it's 0 we don't want to tell the + * allocator the wrong thing, we want to use the actual real max_extent_size + * we've found already if it's larger, or we want to use ->bytes. + * + * This matters because find_free_space() will skip entries who's ->bytes is + * less than the required bytes. So if we didn't search down this bitmap, we + * may pick some previous entry that has a smaller ->max_extent_size than we + * have. For example, assume we have two entries, one that has + * ->max_extent_size set to 4k and ->bytes set to 1M. A second entry hasn't set + * ->max_extent_size yet, has ->bytes set to 8k and it's contiguous. We will + * call into find_free_space(), and return with max_extent_size == 4k, because + * that first bitmap entry had ->max_extent_size set, but the second one did + * not. If instead we returned 8k we'd come in searching for 8k, and find the + * 8k contiguous range. + * + * Consider the other case, we have 2 8k chunks in that second entry and still + * don't have ->max_extent_size set. We'll return 16k, and the next time the + * allocator comes in it'll fully search our second bitmap, and this time it'll + * get an uptodate value of 8k as the maximum chunk size. Then we'll get the + * right allocation the next loop through. + */ +static inline u64 get_max_extent_size(struct btrfs_free_space *entry) +{ + if (entry->bitmap && entry->max_extent_size) + return entry->max_extent_size; + return entry->bytes; +} + +/* + * This is indexed in reverse of what we generally do for rb-tree's, the largest + * chunks are left most and the smallest are rightmost. This is so that we can + * take advantage of the cached property of the cached rb-tree and simply get + * the largest free space chunk right away. + */ +static void tree_insert_bytes(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + struct rb_root_cached *root = &ctl->free_space_bytes; + struct rb_node **p = &root->rb_root.rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_free_space *tmp; + bool leftmost = true; + + while (*p) { + parent_node = *p; + tmp = rb_entry(parent_node, struct btrfs_free_space, + bytes_index); + if (get_max_extent_size(info) < get_max_extent_size(tmp)) { + p = &(*p)->rb_right; + leftmost = false; + } else { + p = &(*p)->rb_left; + } + } + + rb_link_node(&info->bytes_index, parent_node, p); + rb_insert_color_cached(&info->bytes_index, root, leftmost); +} + /* * searches the tree for the given offset. 
* @@ -1704,6 +1766,7 @@ __unlink_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { rb_erase(&info->offset_index, &ctl->free_space_offset); + rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); ctl->free_extents--; if (!info->bitmap && !btrfs_free_space_trimmed(info)) { @@ -1730,6 +1793,8 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, if (ret) return ret; + tree_insert_bytes(ctl, info); + if (!info->bitmap && !btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR]++; ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; @@ -1740,6 +1805,22 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, return ret; } +static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + ASSERT(info->bitmap); + + /* + * If our entry is empty it's because we're on a cluster and we don't + * want to re-link it into our ctl bytes index. + */ + if (RB_EMPTY_NODE(&info->bytes_index)) + return; + + rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); + tree_insert_bytes(ctl, info); +} + static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes) @@ -1758,6 +1839,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, if (info->max_extent_size > ctl->unit) info->max_extent_size = 0; + relink_bitmap_entry(ctl, info); + if (start && test_bit(start - 1, info->bitmap)) extent_delta++; @@ -1793,9 +1876,16 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, bitmap_set(info->bitmap, start, count); + /* + * We set some bytes, we have no idea what the max extent size is + * anymore. + */ + info->max_extent_size = 0; info->bytes += bytes; ctl->free_space += bytes; + relink_bitmap_entry(ctl, info); + if (start && test_bit(start - 1, info->bitmap)) extent_delta--; @@ -1863,44 +1953,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, *bytes = (u64)(max_bits) * ctl->unit; bitmap_info->max_extent_size = *bytes; + relink_bitmap_entry(ctl, bitmap_info); return -1; } -/* - * This is a little subtle. We *only* have ->max_extent_size set if we actually - * searched through the bitmap and figured out the largest ->max_extent_size, - * otherwise it's 0. In the case that it's 0 we don't want to tell the - * allocator the wrong thing, we want to use the actual real max_extent_size - * we've found already if it's larger, or we want to use ->bytes. - * - * This matters because find_free_space() will skip entries who's ->bytes is - * less than the required bytes. So if we didn't search down this bitmap, we - * may pick some previous entry that has a smaller ->max_extent_size than we - * have. For example, assume we have two entries, one that has - * ->max_extent_size set to 4k and ->bytes set to 1M. A second entry hasn't set - * ->max_extent_size yet, has ->bytes set to 8k and it's contiguous. We will - * call into find_free_space(), and return with max_extent_size == 4k, because - * that first bitmap entry had ->max_extent_size set, but the second one did - * not. If instead we returned 8k we'd come in searching for 8k, and find the - * 8k contiguous range. - * - * Consider the other case, we have 2 8k chunks in that second entry and still - * don't have ->max_extent_size set. We'll return 16k, and the next time the - * allocator comes in it'll fully search our second bitmap, and this time it'll - * get an uptodate value of 8k as the maximum chunk size. 
Then we'll get the - * right allocation the next loop through. - */ -static inline u64 get_max_extent_size(struct btrfs_free_space *entry) -{ - if (entry->bitmap && entry->max_extent_size) - return entry->max_extent_size; - return entry->bytes; -} - /* Cache the size of the max extent in bytes */ static struct btrfs_free_space * find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, - unsigned long align, u64 *max_extent_size) + unsigned long align, u64 *max_extent_size, bool use_bytes_index) { struct btrfs_free_space *entry; struct rb_node *node; @@ -1910,16 +1970,38 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, if (!ctl->free_space_offset.rb_node) goto out; +again: + if (use_bytes_index) { + node = rb_first_cached(&ctl->free_space_bytes); + } else { + entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), + 0, 1); + if (!entry) + goto out; + node = &entry->offset_index; + } - entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); - if (!entry) - goto out; + for (; node; node = rb_next(node)) { + if (use_bytes_index) + entry = rb_entry(node, struct btrfs_free_space, + bytes_index); + else + entry = rb_entry(node, struct btrfs_free_space, + offset_index); - for (node = &entry->offset_index; node; node = rb_next(node)) { - entry = rb_entry(node, struct btrfs_free_space, offset_index); + /* + * If we are using the bytes index then all subsequent entries + * in this tree are going to be < bytes, so simply set the max + * extent size and exit the loop. + * + * If we're using the offset index then we need to keep going + * through the rest of the tree. + */ if (entry->bytes < *bytes) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); + if (use_bytes_index) + break; continue; } @@ -1936,6 +2018,13 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, tmp = entry->offset; } + /* + * We don't break here if we're using the bytes index because we + * may have another entry that has the correct alignment that is + * the right size, so we don't want to miss that possibility. + * At worst this adds another loop through the logic, but if we + * broke here we could prematurely ENOSPC. + */ if (entry->bytes < *bytes + align_off) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); @@ -1943,6 +2032,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, } if (entry->bitmap) { + struct rb_node *old_next = rb_next(node); u64 size = *bytes; ret = search_bitmap(ctl, entry, &tmp, &size, true); @@ -1955,6 +2045,15 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, max(get_max_extent_size(entry), *max_extent_size); } + + /* + * The bitmap may have gotten re-arranged in the space + * index here because the max_extent_size may have been + * updated. Start from the beginning again if this + * happened. + */ + if (use_bytes_index && old_next != rb_next(node)) + goto again; continue; } @@ -2103,12 +2202,6 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, bitmap_set_bits(ctl, info, offset, bytes_to_set); - /* - * We set some bytes, we have no idea what the max extent size is - * anymore. 
- */ - info->max_extent_size = 0; - return bytes_to_set; } @@ -2506,6 +2599,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, info->bytes = bytes; info->trim_state = trim_state; RB_CLEAR_NODE(&info->offset_index); + RB_CLEAR_NODE(&info->bytes_index); spin_lock(&ctl->tree_lock); @@ -2809,6 +2903,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, ctl->start = block_group->start; ctl->private = block_group; ctl->op = &free_space_op; + ctl->free_space_bytes = RB_ROOT_CACHED; INIT_LIST_HEAD(&ctl->trimming_ranges); mutex_init(&ctl->cache_writeout_mutex); @@ -2874,6 +2969,7 @@ static void __btrfs_return_cluster_to_free_space( } tree_insert_offset(&ctl->free_space_offset, entry->offset, &entry->offset_index, bitmap); + tree_insert_bytes(ctl, entry); } cluster->root = RB_ROOT; spin_unlock(&cluster->lock); @@ -2975,12 +3071,14 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 align_gap = 0; u64 align_gap_len = 0; enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + bool use_bytes_index = (offset == block_group->start); ASSERT(!btrfs_is_zoned(block_group->fs_info)); spin_lock(&ctl->tree_lock); entry = find_free_space(ctl, &offset, &bytes_search, - block_group->full_stripe_len, max_extent_size); + block_group->full_stripe_len, max_extent_size, + use_bytes_index); if (!entry) goto out; @@ -3264,6 +3362,17 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, cluster->window_start = start * ctl->unit + entry->offset; rb_erase(&entry->offset_index, &ctl->free_space_offset); + rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); + + /* + * We need to know if we're currently on the normal space index when we + * manipulate the bitmap so that we know we need to remove and re-insert + * it into the space_index tree. Clear the bytes_index node here so the + * bitmap manipulation helpers know not to mess with the space_index + * until this bitmap entry is added back into the normal cache. 
+ */ + RB_CLEAR_NODE(&entry->bytes_index); + ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 1); ASSERT(!ret); /* -EEXIST; Logic error */ @@ -3354,6 +3463,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group, continue; rb_erase(&entry->offset_index, &ctl->free_space_offset); + rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); total_size += entry->bytes; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 1f23088..dd982d2 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -22,6 +22,7 @@ enum btrfs_trim_state { struct btrfs_free_space { struct rb_node offset_index; + struct rb_node bytes_index; u64 offset; u64 bytes; u64 max_extent_size; @@ -45,6 +46,7 @@ static inline bool btrfs_free_space_trimming_bitmap( struct btrfs_free_space_ctl { spinlock_t tree_lock; struct rb_root free_space_offset; + struct rb_root_cached free_space_bytes; u64 free_space; int extents_thresh; int free_extents; diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 8f05c1e..6f922ce 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -824,6 +824,184 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache, return 0; } +static bool bytes_index_use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + return true; +} + +static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize) +{ + const struct btrfs_free_space_op test_free_space_ops = { + .use_bitmap = bytes_index_use_bitmap, + }; + const struct btrfs_free_space_op *orig_free_space_ops; + struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; + struct btrfs_free_space *entry; + struct rb_node *n; + u64 offset, max_extent_size, bytes; + int ret, i; + + test_msg("running bytes index tests"); + + /* First just validate that it does everything in order. */ + offset = 0; + for (i = 0; i < 10; i++) { + bytes = (i + 1) * SZ_1M; + ret = test_add_free_space_entry(cache, offset, bytes, 0); + if (ret) { + test_err("couldn't add extent entry %d\n", ret); + return ret; + } + offset += bytes + sectorsize; + } + + for (n = rb_first_cached(&ctl->free_space_bytes), i = 9; n; + n = rb_next(n), i--) { + entry = rb_entry(n, struct btrfs_free_space, bytes_index); + bytes = (i + 1) * SZ_1M; + if (entry->bytes != bytes) { + test_err("invalid bytes index order, found %llu expected %llu", + entry->bytes, bytes); + return -EINVAL; + } + } + + /* Now validate bitmaps do the correct thing. */ + __btrfs_remove_free_space_cache(cache->free_space_ctl); + for (i = 0; i < 2; i++) { + offset = i * BITS_PER_BITMAP * sectorsize; + bytes = (i + 1) * SZ_1M; + ret = test_add_free_space_entry(cache, offset, bytes, 1); + if (ret) { + test_err("couldn't add bitmap entry"); + return ret; + } + } + + for (n = rb_first_cached(&ctl->free_space_bytes), i = 1; n; + n = rb_next(n), i--) { + entry = rb_entry(n, struct btrfs_free_space, bytes_index); + bytes = (i + 1) * SZ_1M; + if (entry->bytes != bytes) { + test_err("invalid bytes index order, found %llu expected %llu", + entry->bytes, bytes); + return -EINVAL; + } + } + + /* Now validate bitmaps with different ->max_extent_size. 
*/ + __btrfs_remove_free_space_cache(cache->free_space_ctl); + orig_free_space_ops = cache->free_space_ctl->op; + cache->free_space_ctl->op = &test_free_space_ops; + + ret = test_add_free_space_entry(cache, 0, sectorsize, 1); + if (ret) { + test_err("couldn't add bitmap entry"); + return ret; + } + + offset = BITS_PER_BITMAP * sectorsize; + ret = test_add_free_space_entry(cache, offset, sectorsize, 1); + if (ret) { + test_err("couldn't add bitmap_entry"); + return ret; + } + + /* + * Now set a bunch of sectorsize extents in the first entry so it's + * ->bytes is large. + */ + for (i = 2; i < 20; i += 2) { + offset = sectorsize * i; + ret = btrfs_add_free_space(cache, offset, sectorsize); + if (ret) { + test_err("error populating sparse bitmap %d", ret); + return ret; + } + } + + /* + * Now set a contiguous extent in the second bitmap so its + * ->max_extent_size is larger than the first bitmaps. + */ + offset = (BITS_PER_BITMAP * sectorsize) + sectorsize; + ret = btrfs_add_free_space(cache, offset, sectorsize); + if (ret) { + test_err("error adding contiguous extent %d", ret); + return ret; + } + + /* + * Since we don't set ->max_extent_size unless we search everything + * should be indexed on bytes. + */ + entry = rb_entry(rb_first_cached(&ctl->free_space_bytes), + struct btrfs_free_space, bytes_index); + if (entry->bytes != (10 * sectorsize)) { + test_err("error, wrong entry in the first slot in bytes_index"); + return -EINVAL; + } + + max_extent_size = 0; + offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 3, + 0, &max_extent_size); + if (offset != 0) { + test_err("found space to alloc even though we don't have enough space"); + return -EINVAL; + } + + if (max_extent_size != (2 * sectorsize)) { + test_err("got the wrong max_extent size %llu expected %llu", + max_extent_size, (unsigned long long)(2 * sectorsize)); + return -EINVAL; + } + + /* + * The search should have re-arranged the bytes index to use the + * ->max_extent_size, validate it's now what we expect it to be. + */ + entry = rb_entry(rb_first_cached(&ctl->free_space_bytes), + struct btrfs_free_space, bytes_index); + if (entry->bytes != (2 * sectorsize)) { + test_err("error, the bytes index wasn't recalculated properly"); + return -EINVAL; + } + + /* Add another sectorsize to re-arrange the tree back to ->bytes. */ + offset = (BITS_PER_BITMAP * sectorsize) - sectorsize; + ret = btrfs_add_free_space(cache, offset, sectorsize); + if (ret) { + test_err("error adding extent to the sparse entry %d", ret); + return ret; + } + + entry = rb_entry(rb_first_cached(&ctl->free_space_bytes), + struct btrfs_free_space, bytes_index); + if (entry->bytes != (11 * sectorsize)) { + test_err("error, wrong entry in the first slot in bytes_index"); + return -EINVAL; + } + + /* + * Now make sure we find our correct entry after searching that will + * result in a re-arranging of the tree. 
+ */ + max_extent_size = 0; + offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 2, + 0, &max_extent_size); + if (offset != (BITS_PER_BITMAP * sectorsize)) { + test_err("error, found %llu instead of %llu for our alloc", + offset, + (unsigned long long)(BITS_PER_BITMAP * sectorsize)); + return -EINVAL; + } + + cache->free_space_ctl->op = orig_free_space_ops; + __btrfs_remove_free_space_cache(cache->free_space_ctl); + return 0; +} + int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) { struct btrfs_fs_info *fs_info; @@ -871,6 +1049,9 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) goto out; ret = test_steal_space_from_bitmap_to_extent(cache, sectorsize); + if (ret) + goto out; + ret = test_bytes_index(cache, sectorsize); out: btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); -- 2.34.1.75.gabe6bb3905 From ab46d1b760af62da9de7104d0326a7532cceb187 Mon Sep 17 00:00:00 2001 From: Piotr Gorski Date: Fri, 19 Nov 2021 21:30:28 +0100 Subject: [PATCH 32/48] Revert "btrfs: index free space entries on size" This reverts commit d599d3b21dc57c2886666c9383778d2abef0ece0. Signed-off-by: Piotr Gorski --- fs/btrfs/free-space-cache.c | 198 +++++++----------------------- fs/btrfs/free-space-cache.h | 2 - fs/btrfs/tests/free-space-tests.c | 181 --------------------------- 3 files changed, 44 insertions(+), 337 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 349bea1..6ca3a92 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1576,68 +1576,6 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, return 0; } -/* - * This is a little subtle. We *only* have ->max_extent_size set if we actually - * searched through the bitmap and figured out the largest ->max_extent_size, - * otherwise it's 0. In the case that it's 0 we don't want to tell the - * allocator the wrong thing, we want to use the actual real max_extent_size - * we've found already if it's larger, or we want to use ->bytes. - * - * This matters because find_free_space() will skip entries who's ->bytes is - * less than the required bytes. So if we didn't search down this bitmap, we - * may pick some previous entry that has a smaller ->max_extent_size than we - * have. For example, assume we have two entries, one that has - * ->max_extent_size set to 4k and ->bytes set to 1M. A second entry hasn't set - * ->max_extent_size yet, has ->bytes set to 8k and it's contiguous. We will - * call into find_free_space(), and return with max_extent_size == 4k, because - * that first bitmap entry had ->max_extent_size set, but the second one did - * not. If instead we returned 8k we'd come in searching for 8k, and find the - * 8k contiguous range. - * - * Consider the other case, we have 2 8k chunks in that second entry and still - * don't have ->max_extent_size set. We'll return 16k, and the next time the - * allocator comes in it'll fully search our second bitmap, and this time it'll - * get an uptodate value of 8k as the maximum chunk size. Then we'll get the - * right allocation the next loop through. - */ -static inline u64 get_max_extent_size(struct btrfs_free_space *entry) -{ - if (entry->bitmap && entry->max_extent_size) - return entry->max_extent_size; - return entry->bytes; -} - -/* - * This is indexed in reverse of what we generally do for rb-tree's, the largest - * chunks are left most and the smallest are rightmost. 
This is so that we can - * take advantage of the cached property of the cached rb-tree and simply get - * the largest free space chunk right away. - */ -static void tree_insert_bytes(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) -{ - struct rb_root_cached *root = &ctl->free_space_bytes; - struct rb_node **p = &root->rb_root.rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_free_space *tmp; - bool leftmost = true; - - while (*p) { - parent_node = *p; - tmp = rb_entry(parent_node, struct btrfs_free_space, - bytes_index); - if (get_max_extent_size(info) < get_max_extent_size(tmp)) { - p = &(*p)->rb_right; - leftmost = false; - } else { - p = &(*p)->rb_left; - } - } - - rb_link_node(&info->bytes_index, parent_node, p); - rb_insert_color_cached(&info->bytes_index, root, leftmost); -} - /* * searches the tree for the given offset. * @@ -1766,7 +1704,6 @@ __unlink_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { rb_erase(&info->offset_index, &ctl->free_space_offset); - rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); ctl->free_extents--; if (!info->bitmap && !btrfs_free_space_trimmed(info)) { @@ -1793,8 +1730,6 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, if (ret) return ret; - tree_insert_bytes(ctl, info); - if (!info->bitmap && !btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR]++; ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; @@ -1805,22 +1740,6 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, return ret; } -static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) -{ - ASSERT(info->bitmap); - - /* - * If our entry is empty it's because we're on a cluster and we don't - * want to re-link it into our ctl bytes index. - */ - if (RB_EMPTY_NODE(&info->bytes_index)) - return; - - rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); - tree_insert_bytes(ctl, info); -} - static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes) @@ -1839,8 +1758,6 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, if (info->max_extent_size > ctl->unit) info->max_extent_size = 0; - relink_bitmap_entry(ctl, info); - if (start && test_bit(start - 1, info->bitmap)) extent_delta++; @@ -1876,16 +1793,9 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, bitmap_set(info->bitmap, start, count); - /* - * We set some bytes, we have no idea what the max extent size is - * anymore. - */ - info->max_extent_size = 0; info->bytes += bytes; ctl->free_space += bytes; - relink_bitmap_entry(ctl, info); - if (start && test_bit(start - 1, info->bitmap)) extent_delta--; @@ -1953,14 +1863,44 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, *bytes = (u64)(max_bits) * ctl->unit; bitmap_info->max_extent_size = *bytes; - relink_bitmap_entry(ctl, bitmap_info); return -1; } +/* + * This is a little subtle. We *only* have ->max_extent_size set if we actually + * searched through the bitmap and figured out the largest ->max_extent_size, + * otherwise it's 0. In the case that it's 0 we don't want to tell the + * allocator the wrong thing, we want to use the actual real max_extent_size + * we've found already if it's larger, or we want to use ->bytes. + * + * This matters because find_free_space() will skip entries who's ->bytes is + * less than the required bytes. 
So if we didn't search down this bitmap, we + * may pick some previous entry that has a smaller ->max_extent_size than we + * have. For example, assume we have two entries, one that has + * ->max_extent_size set to 4k and ->bytes set to 1M. A second entry hasn't set + * ->max_extent_size yet, has ->bytes set to 8k and it's contiguous. We will + * call into find_free_space(), and return with max_extent_size == 4k, because + * that first bitmap entry had ->max_extent_size set, but the second one did + * not. If instead we returned 8k we'd come in searching for 8k, and find the + * 8k contiguous range. + * + * Consider the other case, we have 2 8k chunks in that second entry and still + * don't have ->max_extent_size set. We'll return 16k, and the next time the + * allocator comes in it'll fully search our second bitmap, and this time it'll + * get an uptodate value of 8k as the maximum chunk size. Then we'll get the + * right allocation the next loop through. + */ +static inline u64 get_max_extent_size(struct btrfs_free_space *entry) +{ + if (entry->bitmap && entry->max_extent_size) + return entry->max_extent_size; + return entry->bytes; +} + /* Cache the size of the max extent in bytes */ static struct btrfs_free_space * find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, - unsigned long align, u64 *max_extent_size, bool use_bytes_index) + unsigned long align, u64 *max_extent_size) { struct btrfs_free_space *entry; struct rb_node *node; @@ -1970,38 +1910,16 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, if (!ctl->free_space_offset.rb_node) goto out; -again: - if (use_bytes_index) { - node = rb_first_cached(&ctl->free_space_bytes); - } else { - entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), - 0, 1); - if (!entry) - goto out; - node = &entry->offset_index; - } - for (; node; node = rb_next(node)) { - if (use_bytes_index) - entry = rb_entry(node, struct btrfs_free_space, - bytes_index); - else - entry = rb_entry(node, struct btrfs_free_space, - offset_index); + entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); + if (!entry) + goto out; - /* - * If we are using the bytes index then all subsequent entries - * in this tree are going to be < bytes, so simply set the max - * extent size and exit the loop. - * - * If we're using the offset index then we need to keep going - * through the rest of the tree. - */ + for (node = &entry->offset_index; node; node = rb_next(node)) { + entry = rb_entry(node, struct btrfs_free_space, offset_index); if (entry->bytes < *bytes) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); - if (use_bytes_index) - break; continue; } @@ -2018,13 +1936,6 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, tmp = entry->offset; } - /* - * We don't break here if we're using the bytes index because we - * may have another entry that has the correct alignment that is - * the right size, so we don't want to miss that possibility. - * At worst this adds another loop through the logic, but if we - * broke here we could prematurely ENOSPC. 
- */ if (entry->bytes < *bytes + align_off) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); @@ -2032,7 +1943,6 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, } if (entry->bitmap) { - struct rb_node *old_next = rb_next(node); u64 size = *bytes; ret = search_bitmap(ctl, entry, &tmp, &size, true); @@ -2045,15 +1955,6 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, max(get_max_extent_size(entry), *max_extent_size); } - - /* - * The bitmap may have gotten re-arranged in the space - * index here because the max_extent_size may have been - * updated. Start from the beginning again if this - * happened. - */ - if (use_bytes_index && old_next != rb_next(node)) - goto again; continue; } @@ -2202,6 +2103,12 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, bitmap_set_bits(ctl, info, offset, bytes_to_set); + /* + * We set some bytes, we have no idea what the max extent size is + * anymore. + */ + info->max_extent_size = 0; + return bytes_to_set; } @@ -2599,7 +2506,6 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, info->bytes = bytes; info->trim_state = trim_state; RB_CLEAR_NODE(&info->offset_index); - RB_CLEAR_NODE(&info->bytes_index); spin_lock(&ctl->tree_lock); @@ -2903,7 +2809,6 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, ctl->start = block_group->start; ctl->private = block_group; ctl->op = &free_space_op; - ctl->free_space_bytes = RB_ROOT_CACHED; INIT_LIST_HEAD(&ctl->trimming_ranges); mutex_init(&ctl->cache_writeout_mutex); @@ -2969,7 +2874,6 @@ static void __btrfs_return_cluster_to_free_space( } tree_insert_offset(&ctl->free_space_offset, entry->offset, &entry->offset_index, bitmap); - tree_insert_bytes(ctl, entry); } cluster->root = RB_ROOT; spin_unlock(&cluster->lock); @@ -3071,14 +2975,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 align_gap = 0; u64 align_gap_len = 0; enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; - bool use_bytes_index = (offset == block_group->start); ASSERT(!btrfs_is_zoned(block_group->fs_info)); spin_lock(&ctl->tree_lock); entry = find_free_space(ctl, &offset, &bytes_search, - block_group->full_stripe_len, max_extent_size, - use_bytes_index); + block_group->full_stripe_len, max_extent_size); if (!entry) goto out; @@ -3362,17 +3264,6 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, cluster->window_start = start * ctl->unit + entry->offset; rb_erase(&entry->offset_index, &ctl->free_space_offset); - rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); - - /* - * We need to know if we're currently on the normal space index when we - * manipulate the bitmap so that we know we need to remove and re-insert - * it into the space_index tree. Clear the bytes_index node here so the - * bitmap manipulation helpers know not to mess with the space_index - * until this bitmap entry is added back into the normal cache. 
- */ - RB_CLEAR_NODE(&entry->bytes_index); - ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 1); ASSERT(!ret); /* -EEXIST; Logic error */ @@ -3463,7 +3354,6 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group, continue; rb_erase(&entry->offset_index, &ctl->free_space_offset); - rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); total_size += entry->bytes; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index dd982d2..1f23088 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -22,7 +22,6 @@ enum btrfs_trim_state { struct btrfs_free_space { struct rb_node offset_index; - struct rb_node bytes_index; u64 offset; u64 bytes; u64 max_extent_size; @@ -46,7 +45,6 @@ static inline bool btrfs_free_space_trimming_bitmap( struct btrfs_free_space_ctl { spinlock_t tree_lock; struct rb_root free_space_offset; - struct rb_root_cached free_space_bytes; u64 free_space; int extents_thresh; int free_extents; diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 6f922ce..8f05c1e 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -824,184 +824,6 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache, return 0; } -static bool bytes_index_use_bitmap(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) -{ - return true; -} - -static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize) -{ - const struct btrfs_free_space_op test_free_space_ops = { - .use_bitmap = bytes_index_use_bitmap, - }; - const struct btrfs_free_space_op *orig_free_space_ops; - struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; - struct btrfs_free_space *entry; - struct rb_node *n; - u64 offset, max_extent_size, bytes; - int ret, i; - - test_msg("running bytes index tests"); - - /* First just validate that it does everything in order. */ - offset = 0; - for (i = 0; i < 10; i++) { - bytes = (i + 1) * SZ_1M; - ret = test_add_free_space_entry(cache, offset, bytes, 0); - if (ret) { - test_err("couldn't add extent entry %d\n", ret); - return ret; - } - offset += bytes + sectorsize; - } - - for (n = rb_first_cached(&ctl->free_space_bytes), i = 9; n; - n = rb_next(n), i--) { - entry = rb_entry(n, struct btrfs_free_space, bytes_index); - bytes = (i + 1) * SZ_1M; - if (entry->bytes != bytes) { - test_err("invalid bytes index order, found %llu expected %llu", - entry->bytes, bytes); - return -EINVAL; - } - } - - /* Now validate bitmaps do the correct thing. */ - __btrfs_remove_free_space_cache(cache->free_space_ctl); - for (i = 0; i < 2; i++) { - offset = i * BITS_PER_BITMAP * sectorsize; - bytes = (i + 1) * SZ_1M; - ret = test_add_free_space_entry(cache, offset, bytes, 1); - if (ret) { - test_err("couldn't add bitmap entry"); - return ret; - } - } - - for (n = rb_first_cached(&ctl->free_space_bytes), i = 1; n; - n = rb_next(n), i--) { - entry = rb_entry(n, struct btrfs_free_space, bytes_index); - bytes = (i + 1) * SZ_1M; - if (entry->bytes != bytes) { - test_err("invalid bytes index order, found %llu expected %llu", - entry->bytes, bytes); - return -EINVAL; - } - } - - /* Now validate bitmaps with different ->max_extent_size. 
*/ - __btrfs_remove_free_space_cache(cache->free_space_ctl); - orig_free_space_ops = cache->free_space_ctl->op; - cache->free_space_ctl->op = &test_free_space_ops; - - ret = test_add_free_space_entry(cache, 0, sectorsize, 1); - if (ret) { - test_err("couldn't add bitmap entry"); - return ret; - } - - offset = BITS_PER_BITMAP * sectorsize; - ret = test_add_free_space_entry(cache, offset, sectorsize, 1); - if (ret) { - test_err("couldn't add bitmap_entry"); - return ret; - } - - /* - * Now set a bunch of sectorsize extents in the first entry so it's - * ->bytes is large. - */ - for (i = 2; i < 20; i += 2) { - offset = sectorsize * i; - ret = btrfs_add_free_space(cache, offset, sectorsize); - if (ret) { - test_err("error populating sparse bitmap %d", ret); - return ret; - } - } - - /* - * Now set a contiguous extent in the second bitmap so its - * ->max_extent_size is larger than the first bitmaps. - */ - offset = (BITS_PER_BITMAP * sectorsize) + sectorsize; - ret = btrfs_add_free_space(cache, offset, sectorsize); - if (ret) { - test_err("error adding contiguous extent %d", ret); - return ret; - } - - /* - * Since we don't set ->max_extent_size unless we search everything - * should be indexed on bytes. - */ - entry = rb_entry(rb_first_cached(&ctl->free_space_bytes), - struct btrfs_free_space, bytes_index); - if (entry->bytes != (10 * sectorsize)) { - test_err("error, wrong entry in the first slot in bytes_index"); - return -EINVAL; - } - - max_extent_size = 0; - offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 3, - 0, &max_extent_size); - if (offset != 0) { - test_err("found space to alloc even though we don't have enough space"); - return -EINVAL; - } - - if (max_extent_size != (2 * sectorsize)) { - test_err("got the wrong max_extent size %llu expected %llu", - max_extent_size, (unsigned long long)(2 * sectorsize)); - return -EINVAL; - } - - /* - * The search should have re-arranged the bytes index to use the - * ->max_extent_size, validate it's now what we expect it to be. - */ - entry = rb_entry(rb_first_cached(&ctl->free_space_bytes), - struct btrfs_free_space, bytes_index); - if (entry->bytes != (2 * sectorsize)) { - test_err("error, the bytes index wasn't recalculated properly"); - return -EINVAL; - } - - /* Add another sectorsize to re-arrange the tree back to ->bytes. */ - offset = (BITS_PER_BITMAP * sectorsize) - sectorsize; - ret = btrfs_add_free_space(cache, offset, sectorsize); - if (ret) { - test_err("error adding extent to the sparse entry %d", ret); - return ret; - } - - entry = rb_entry(rb_first_cached(&ctl->free_space_bytes), - struct btrfs_free_space, bytes_index); - if (entry->bytes != (11 * sectorsize)) { - test_err("error, wrong entry in the first slot in bytes_index"); - return -EINVAL; - } - - /* - * Now make sure we find our correct entry after searching that will - * result in a re-arranging of the tree. 
- */ - max_extent_size = 0; - offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 2, - 0, &max_extent_size); - if (offset != (BITS_PER_BITMAP * sectorsize)) { - test_err("error, found %llu instead of %llu for our alloc", - offset, - (unsigned long long)(BITS_PER_BITMAP * sectorsize)); - return -EINVAL; - } - - cache->free_space_ctl->op = orig_free_space_ops; - __btrfs_remove_free_space_cache(cache->free_space_ctl); - return 0; -} - int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) { struct btrfs_fs_info *fs_info; @@ -1049,9 +871,6 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) goto out; ret = test_steal_space_from_bitmap_to_extent(cache, sectorsize); - if (ret) - goto out; - ret = test_bytes_index(cache, sectorsize); out: btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); -- 2.34.1.75.gabe6bb3905 From 90bae146e63ecb82505574348cab71267cc4d097 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 18 Nov 2021 16:33:15 -0500 Subject: [PATCH 33/48] btrfs: index free space entries on size Currently we index free space on offset only, because usually we have a hint from the allocator that we want to honor for locality reasons. However if we fail to use this hint we have to go back to a brute force search through the free space entries to find a large enough extent. With sufficiently fragmented free space this becomes quite expensive, as we have to linearly search all of the free space entries to find if we have a part that's long enough. To fix this add a cached rb tree to index based on free space entry bytes. This will allow us to quickly look up the largest chunk in the free space tree for this block group, and stop searching once we've found an entry that is too small to satisfy our allocation. We simply choose to use this tree if we're searching from the beginning of the block group, as we know we do not care about locality at that point. I wrote an allocator test that creates a 10TiB ram backed null block device and then fallocates random files until the file system is full. I then go through and delete all of the odd files. Then I spawn 8 threads that fallocate 64MiB files (1/2 our extent size cap) until the file system is full again. I use bcc's funclatency to measure the latency of find_free_extent.
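The heart of the change is easy to sketch: a second, cached rb tree orders the entries by usable size, largest leftmost, so the walk can stop as soon as an entry is too small. Below is a minimal sketch of that early exit, simplified from the real find_free_space(); bitmap_fits() is a hypothetical stand-in for the bit-level search_bitmap() pass that bitmap entries still require.

static struct btrfs_free_space *
find_largest_fit(struct btrfs_free_space_ctl *ctl, u64 bytes)
{
	struct rb_node *node;

	for (node = rb_first_cached(&ctl->free_space_bytes); node;
	     node = rb_next(node)) {
		struct btrfs_free_space *entry;

		entry = rb_entry(node, struct btrfs_free_space, bytes_index);
		/*
		 * Entries are ordered largest first, so everything after
		 * this one is at most as big: bail out instead of doing
		 * the old brute-force linear scan.
		 */
		if (get_max_extent_size(entry) < bytes)
			return NULL;
		/* Bitmap entries may still fail the contiguous-run check. */
		if (!entry->bitmap || bitmap_fits(entry, bytes))
			return entry;
	}
	return NULL;
}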
The baseline results are nsecs : count distribution 0 -> 1 : 0 | | 2 -> 3 : 0 | | 4 -> 7 : 0 | | 8 -> 15 : 0 | | 16 -> 31 : 0 | | 32 -> 63 : 0 | | 64 -> 127 : 0 | | 128 -> 255 : 0 | | 256 -> 511 : 10356 |**** | 512 -> 1023 : 58242 |************************* | 1024 -> 2047 : 74418 |******************************** | 2048 -> 4095 : 90393 |****************************************| 4096 -> 8191 : 79119 |*********************************** | 8192 -> 16383 : 35614 |*************** | 16384 -> 32767 : 13418 |***** | 32768 -> 65535 : 12811 |***** | 65536 -> 131071 : 17090 |******* | 131072 -> 262143 : 26465 |*********** | 262144 -> 524287 : 40179 |***************** | 524288 -> 1048575 : 55469 |************************ | 1048576 -> 2097151 : 48807 |********************* | 2097152 -> 4194303 : 26744 |*********** | 4194304 -> 8388607 : 35351 |*************** | 8388608 -> 16777215 : 13918 |****** | 16777216 -> 33554431 : 21 | | avg = 908079 nsecs, total: 580889071441 nsecs, count: 639690 And the patch results are nsecs : count distribution 0 -> 1 : 0 | | 2 -> 3 : 0 | | 4 -> 7 : 0 | | 8 -> 15 : 0 | | 16 -> 31 : 0 | | 32 -> 63 : 0 | | 64 -> 127 : 0 | | 128 -> 255 : 0 | | 256 -> 511 : 6883 |** | 512 -> 1023 : 54346 |********************* | 1024 -> 2047 : 79170 |******************************** | 2048 -> 4095 : 98890 |****************************************| 4096 -> 8191 : 81911 |********************************* | 8192 -> 16383 : 27075 |********** | 16384 -> 32767 : 14668 |***** | 32768 -> 65535 : 13251 |***** | 65536 -> 131071 : 15340 |****** | 131072 -> 262143 : 26715 |********** | 262144 -> 524287 : 43274 |***************** | 524288 -> 1048575 : 53870 |********************* | 1048576 -> 2097151 : 55368 |********************** | 2097152 -> 4194303 : 41036 |**************** | 4194304 -> 8388607 : 24927 |********** | 8388608 -> 16777215 : 33 | | 16777216 -> 33554431 : 9 | | avg = 623599 nsecs, total: 397259314759 nsecs, count: 637042 There's a little variation in the amount of calls done because of timing of the threads with metadata requirements, but the avg, total, and count's are relatively consistent between runs (usually within 2-5% of each other). As you can see here we have around a 30% decrease in average latency with a 30% decrease in overall time spent in find_free_extent. Signed-off-by: Josef Bacik Reviewed-by: Filipe Manana --- fs/btrfs/free-space-cache.c | 181 +++++++++++++++++++++++++++--------- fs/btrfs/free-space-cache.h | 2 + 2 files changed, 139 insertions(+), 44 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6ca3a92..6a64048 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1576,6 +1576,50 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, return 0; } +/* + * This is a little subtle. We *only* have ->max_extent_size set if we actually + * searched through the bitmap and figured out the largest ->max_extent_size, + * otherwise it's 0. In the case that it's 0 we don't want to tell the + * allocator the wrong thing, we want to use the actual real max_extent_size + * we've found already if it's larger, or we want to use ->bytes. + * + * This matters because find_free_space() will skip entries who's ->bytes is + * less than the required bytes. So if we didn't search down this bitmap, we + * may pick some previous entry that has a smaller ->max_extent_size than we + * have. For example, assume we have two entries, one that has + * ->max_extent_size set to 4k and ->bytes set to 1M. 
A second entry hasn't set + * ->max_extent_size yet, has ->bytes set to 8k and it's contiguous. We will + * call into find_free_space(), and return with max_extent_size == 4k, because + * that first bitmap entry had ->max_extent_size set, but the second one did + * not. If instead we returned 8k we'd come in searching for 8k, and find the + * 8k contiguous range. + * + * Consider the other case, we have 2 8k chunks in that second entry and still + * don't have ->max_extent_size set. We'll return 16k, and the next time the + * allocator comes in it'll fully search our second bitmap, and this time it'll + * get an uptodate value of 8k as the maximum chunk size. Then we'll get the + * right allocation the next loop through. + */ +static inline u64 get_max_extent_size(const struct btrfs_free_space *entry) +{ + if (entry->bitmap && entry->max_extent_size) + return entry->max_extent_size; + return entry->bytes; +} + +/* + * We want the largest entry to be leftmost, so this is inverted from what you'd + * normally expect. + */ +static bool entry_less(struct rb_node *node, const struct rb_node *parent) +{ + const struct btrfs_free_space *entry, *exist; + + entry = rb_entry(node, struct btrfs_free_space, bytes_index); + exist = rb_entry(parent, struct btrfs_free_space, bytes_index); + return get_max_extent_size(exist) < get_max_extent_size(entry); +} + /* * searches the tree for the given offset. * @@ -1704,6 +1748,7 @@ __unlink_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { rb_erase(&info->offset_index, &ctl->free_space_offset); + rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); ctl->free_extents--; if (!info->bitmap && !btrfs_free_space_trimmed(info)) { @@ -1730,6 +1775,8 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, if (ret) return ret; + rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); + if (!info->bitmap && !btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR]++; ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; @@ -1740,6 +1787,22 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, return ret; } +static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + ASSERT(info->bitmap); + + /* + * If our entry is empty it's because we're on a cluster and we don't + * want to re-link it into our ctl bytes index. + */ + if (RB_EMPTY_NODE(&info->bytes_index)) + return; + + rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); + rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); +} + static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes) @@ -1758,6 +1821,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, if (info->max_extent_size > ctl->unit) info->max_extent_size = 0; + relink_bitmap_entry(ctl, info); + if (start && test_bit(start - 1, info->bitmap)) extent_delta++; @@ -1793,9 +1858,16 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, bitmap_set(info->bitmap, start, count); + /* + * We set some bytes, we have no idea what the max extent size is + * anymore. 
+ */ + info->max_extent_size = 0; info->bytes += bytes; ctl->free_space += bytes; + relink_bitmap_entry(ctl, info); + if (start && test_bit(start - 1, info->bitmap)) extent_delta--; @@ -1863,44 +1935,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, *bytes = (u64)(max_bits) * ctl->unit; bitmap_info->max_extent_size = *bytes; + relink_bitmap_entry(ctl, bitmap_info); return -1; } -/* - * This is a little subtle. We *only* have ->max_extent_size set if we actually - * searched through the bitmap and figured out the largest ->max_extent_size, - * otherwise it's 0. In the case that it's 0 we don't want to tell the - * allocator the wrong thing, we want to use the actual real max_extent_size - * we've found already if it's larger, or we want to use ->bytes. - * - * This matters because find_free_space() will skip entries who's ->bytes is - * less than the required bytes. So if we didn't search down this bitmap, we - * may pick some previous entry that has a smaller ->max_extent_size than we - * have. For example, assume we have two entries, one that has - * ->max_extent_size set to 4k and ->bytes set to 1M. A second entry hasn't set - * ->max_extent_size yet, has ->bytes set to 8k and it's contiguous. We will - * call into find_free_space(), and return with max_extent_size == 4k, because - * that first bitmap entry had ->max_extent_size set, but the second one did - * not. If instead we returned 8k we'd come in searching for 8k, and find the - * 8k contiguous range. - * - * Consider the other case, we have 2 8k chunks in that second entry and still - * don't have ->max_extent_size set. We'll return 16k, and the next time the - * allocator comes in it'll fully search our second bitmap, and this time it'll - * get an uptodate value of 8k as the maximum chunk size. Then we'll get the - * right allocation the next loop through. - */ -static inline u64 get_max_extent_size(struct btrfs_free_space *entry) -{ - if (entry->bitmap && entry->max_extent_size) - return entry->max_extent_size; - return entry->bytes; -} - /* Cache the size of the max extent in bytes */ static struct btrfs_free_space * find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, - unsigned long align, u64 *max_extent_size) + unsigned long align, u64 *max_extent_size, bool use_bytes_index) { struct btrfs_free_space *entry; struct rb_node *node; @@ -1910,16 +1952,38 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, if (!ctl->free_space_offset.rb_node) goto out; +again: + if (use_bytes_index) { + node = rb_first_cached(&ctl->free_space_bytes); + } else { + entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), + 0, 1); + if (!entry) + goto out; + node = &entry->offset_index; + } - entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); - if (!entry) - goto out; + for (; node; node = rb_next(node)) { + if (use_bytes_index) + entry = rb_entry(node, struct btrfs_free_space, + bytes_index); + else + entry = rb_entry(node, struct btrfs_free_space, + offset_index); - for (node = &entry->offset_index; node; node = rb_next(node)) { - entry = rb_entry(node, struct btrfs_free_space, offset_index); + /* + * If we are using the bytes index then all subsequent entries + * in this tree are going to be < bytes, so simply set the max + * extent size and exit the loop. + * + * If we're using the offset index then we need to keep going + * through the rest of the tree. 
+ */ if (entry->bytes < *bytes) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); + if (use_bytes_index) + break; continue; } @@ -1936,6 +2000,13 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, tmp = entry->offset; } + /* + * We don't break here if we're using the bytes index because we + * may have another entry that has the correct alignment that is + * the right size, so we don't want to miss that possibility. + * At worst this adds another loop through the logic, but if we + * broke here we could prematurely ENOSPC. + */ if (entry->bytes < *bytes + align_off) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); @@ -1943,6 +2014,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, } if (entry->bitmap) { + struct rb_node *old_next = rb_next(node); u64 size = *bytes; ret = search_bitmap(ctl, entry, &tmp, &size, true); @@ -1955,6 +2027,15 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, max(get_max_extent_size(entry), *max_extent_size); } + + /* + * The bitmap may have gotten re-arranged in the space + * index here because the max_extent_size may have been + * updated. Start from the beginning again if this + * happened. + */ + if (use_bytes_index && old_next != rb_next(node)) + goto again; continue; } @@ -2103,12 +2184,6 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, bitmap_set_bits(ctl, info, offset, bytes_to_set); - /* - * We set some bytes, we have no idea what the max extent size is - * anymore. - */ - info->max_extent_size = 0; - return bytes_to_set; } @@ -2506,6 +2581,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, info->bytes = bytes; info->trim_state = trim_state; RB_CLEAR_NODE(&info->offset_index); + RB_CLEAR_NODE(&info->bytes_index); spin_lock(&ctl->tree_lock); @@ -2809,6 +2885,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, ctl->start = block_group->start; ctl->private = block_group; ctl->op = &free_space_op; + ctl->free_space_bytes = RB_ROOT_CACHED; INIT_LIST_HEAD(&ctl->trimming_ranges); mutex_init(&ctl->cache_writeout_mutex); @@ -2874,6 +2951,8 @@ static void __btrfs_return_cluster_to_free_space( } tree_insert_offset(&ctl->free_space_offset, entry->offset, &entry->offset_index, bitmap); + rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, + entry_less); } cluster->root = RB_ROOT; spin_unlock(&cluster->lock); @@ -2975,12 +3054,14 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 align_gap = 0; u64 align_gap_len = 0; enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + bool use_bytes_index = (offset == block_group->start); ASSERT(!btrfs_is_zoned(block_group->fs_info)); spin_lock(&ctl->tree_lock); entry = find_free_space(ctl, &offset, &bytes_search, - block_group->full_stripe_len, max_extent_size); + block_group->full_stripe_len, max_extent_size, + use_bytes_index); if (!entry) goto out; @@ -3264,6 +3345,17 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, cluster->window_start = start * ctl->unit + entry->offset; rb_erase(&entry->offset_index, &ctl->free_space_offset); + rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); + + /* + * We need to know if we're currently on the normal space index when we + * manipulate the bitmap so that we know we need to remove and re-insert + * it into the space_index tree. 
Clear the bytes_index node here so the + * bitmap manipulation helpers know not to mess with the space_index + * until this bitmap entry is added back into the normal cache. + */ + RB_CLEAR_NODE(&entry->bytes_index); + ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 1); ASSERT(!ret); /* -EEXIST; Logic error */ @@ -3354,6 +3446,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group, continue; rb_erase(&entry->offset_index, &ctl->free_space_offset); + rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); total_size += entry->bytes; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 1f23088..dd982d2 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -22,6 +22,7 @@ enum btrfs_trim_state { struct btrfs_free_space { struct rb_node offset_index; + struct rb_node bytes_index; u64 offset; u64 bytes; u64 max_extent_size; @@ -45,6 +46,7 @@ static inline bool btrfs_free_space_trimming_bitmap( struct btrfs_free_space_ctl { spinlock_t tree_lock; struct rb_root free_space_offset; + struct rb_root_cached free_space_bytes; u64 free_space; int extents_thresh; int free_extents; -- 2.34.1.75.gabe6bb3905 From de770f8689a774b743cb11c4250c83a9584eace2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Dec 2021 08:45:12 +0000 Subject: [PATCH 34/48] btrfs: fix invalid delayed ref after subvolume creation failure When creating a subvolume, at ioctl.c:create_subvol(), if we fail to insert the new root's root item into the root tree, we are freeing the metadata extent we reserved for the new root to prevent a metadata extent leak, as we don't abort the transaction at that point (since there is nothing at that point that is irreversible). However we allocated the metadata extent for the new root which we are creating for the new subvolume, so its delayed reference refers to the ID of this new root. But when we free the metadata extent we pass the root of the subvolume where the new subvolume is located to btrfs_free_tree_block() - this is incorrect because this will generate a delayed reference that refers to the ID of the parent subvolume's root, and not to ID of the new root. This results in a failure when running delayed references that leads to a transaction abort and a trace like the following: [ 3868.738042] RIP: 0010:__btrfs_free_extent+0x709/0x950 [btrfs] [ 3868.739857] Code: 68 0f 85 e6 fb ff (...) [ 3868.742963] RSP: 0018:ffffb0e9045cf910 EFLAGS: 00010246 [ 3868.743908] RAX: 00000000fffffffe RBX: 00000000fffffffe RCX: 0000000000000002 [ 3868.745312] RDX: 00000000fffffffe RSI: 0000000000000002 RDI: ffff90b0cd793b88 [ 3868.746643] RBP: 000000000e5d8000 R08: 0000000000000000 R09: ffff90b0cd793b88 [ 3868.747979] R10: 0000000000000002 R11: 00014ded97944d68 R12: 0000000000000000 [ 3868.749373] R13: ffff90b09afe4a28 R14: 0000000000000000 R15: ffff90b0cd793b88 [ 3868.750725] FS: 00007f281c4a8b80(0000) GS:ffff90b3ada00000(0000) knlGS:0000000000000000 [ 3868.752275] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3868.753515] CR2: 00007f281c6a5000 CR3: 0000000108a42006 CR4: 0000000000370ee0 [ 3868.754869] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 3868.756228] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 3868.757803] Call Trace: [ 3868.758281] [ 3868.758655] ? 
btrfs_merge_delayed_refs+0x178/0x1c0 [btrfs] [ 3868.759827] __btrfs_run_delayed_refs+0x2b1/0x1250 [btrfs] [ 3868.761047] btrfs_run_delayed_refs+0x86/0x210 [btrfs] [ 3868.762069] ? lock_acquired+0x19f/0x420 [ 3868.762829] btrfs_commit_transaction+0x69/0xb20 [btrfs] [ 3868.763860] ? _raw_spin_unlock+0x29/0x40 [ 3868.764614] ? btrfs_block_rsv_release+0x1c2/0x1e0 [btrfs] [ 3868.765870] create_subvol+0x1d8/0x9a0 [btrfs] [ 3868.766766] btrfs_mksubvol+0x447/0x4c0 [btrfs] [ 3868.767669] ? preempt_count_add+0x49/0xa0 [ 3868.768444] __btrfs_ioctl_snap_create+0x123/0x190 [btrfs] [ 3868.769639] ? _copy_from_user+0x66/0xa0 [ 3868.770391] btrfs_ioctl_snap_create_v2+0xbb/0x140 [btrfs] [ 3868.771495] btrfs_ioctl+0xd1e/0x35c0 [btrfs] [ 3868.772364] ? __slab_free+0x10a/0x360 [ 3868.773198] ? rcu_read_lock_sched_held+0x12/0x60 [ 3868.774121] ? lock_release+0x223/0x4a0 [ 3868.774863] ? lock_acquired+0x19f/0x420 [ 3868.775634] ? rcu_read_lock_sched_held+0x12/0x60 [ 3868.776530] ? trace_hardirqs_on+0x1b/0xe0 [ 3868.777373] ? _raw_spin_unlock_irqrestore+0x3e/0x60 [ 3868.778280] ? kmem_cache_free+0x321/0x3c0 [ 3868.779011] ? __x64_sys_ioctl+0x83/0xb0 [ 3868.779718] __x64_sys_ioctl+0x83/0xb0 [ 3868.780387] do_syscall_64+0x3b/0xc0 [ 3868.781059] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 3868.781953] RIP: 0033:0x7f281c59e957 [ 3868.782585] Code: 3c 1c 48 f7 d8 4c (...) [ 3868.785867] RSP: 002b:00007ffe1f83e2b8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [ 3868.787198] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f281c59e957 [ 3868.788450] RDX: 00007ffe1f83e2c0 RSI: 0000000050009418 RDI: 0000000000000003 [ 3868.789748] RBP: 00007ffe1f83f300 R08: 0000000000000000 R09: 00007ffe1f83fe36 [ 3868.791214] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000003 [ 3868.792468] R13: 0000000000000003 R14: 00007ffe1f83e2c0 R15: 00000000000003cc [ 3868.793765] [ 3868.794037] irq event stamp: 0 [ 3868.794548] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [ 3868.795670] hardirqs last disabled at (0): [] copy_process+0x934/0x2040 [ 3868.797086] softirqs last enabled at (0): [] copy_process+0x934/0x2040 [ 3868.798309] softirqs last disabled at (0): [<0000000000000000>] 0x0 [ 3868.799284] ---[ end trace be24c7002fe27747 ]--- [ 3868.799928] BTRFS info (device dm-0): leaf 241188864 gen 1268 total ptrs 214 free space 469 owner 2 [ 3868.801133] BTRFS info (device dm-0): refs 2 lock_owner 225627 current 225627 [ 3868.802056] item 0 key (237436928 169 0) itemoff 16250 itemsize 33 [ 3868.802863] extent refs 1 gen 1265 flags 2 [ 3868.803447] ref#0: tree block backref root 1610 (...) [ 3869.064354] item 114 key (241008640 169 0) itemoff 12488 itemsize 33 [ 3869.065421] extent refs 1 gen 1268 flags 2 [ 3869.066115] ref#0: tree block backref root 1689 (...) [ 3869.403834] BTRFS error (device dm-0): unable to find ref byte nr 241008640 parent 0 root 1622 owner 0 offset 0 [ 3869.405641] BTRFS: error (device dm-0) in __btrfs_free_extent:3076: errno=-2 No such entry [ 3869.407138] BTRFS: error (device dm-0) in btrfs_run_delayed_refs:2159: errno=-2 No such entry Fix this by passing the new subvolume's root ID to btrfs_free_tree_block(). This requires changing the root argument of btrfs_free_tree_block() from struct btrfs_root * to a u64, since at this point during the subvolume creation we have not yet created the struct btrfs_root for the new subvolume, and btrfs_free_tree_block() only needs a root ID and nothing else from a struct btrfs_root. This was triggered by test case generic/475 from fstests. 
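Put differently, the delayed ref records only a root ID, so the ID passed when freeing must be the one the block was allocated for. A reduced sketch of the mismatch, using the ref helpers touched by the diff below (the IDs mirror the trace above, where the block's backref names root 1689 while the ref lookup uses root 1622):

	struct btrfs_ref ref = { 0 };

	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, buf->start,
			       buf->len, 0);

	/* Buggy: ID of the parent subvolume's root (1622 in the trace). */
	btrfs_init_tree_ref(&ref, btrfs_header_level(buf),
			    root->root_key.objectid);

	/* Fixed: ID of the root the block was allocated for (1689). */
	btrfs_init_tree_ref(&ref, btrfs_header_level(buf), objectid);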
[HH: small backport to 5.15.x] Fixes: 67addf29004c5b ("btrfs: fix metadata extent leak after failure to create subvolume") Signed-off-by: Filipe Manana --- fs/btrfs/ctree.c | 17 +++++++++-------- fs/btrfs/ctree.h | 7 ++++++- fs/btrfs/extent-tree.c | 13 +++++++------ fs/btrfs/free-space-tree.c | 4 ++-- fs/btrfs/ioctl.c | 9 +++++---- fs/btrfs/qgroup.c | 3 ++- 6 files changed, 31 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 387295f..4d745e3 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -462,8 +462,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, BUG_ON(ret < 0); rcu_assign_pointer(root->node, cow); - btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref); + btrfs_free_tree_block(trans, btrfs_root_id(root), buf, + parent_start, last_ref); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -484,8 +484,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return ret; } } - btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref); + btrfs_free_tree_block(trans, btrfs_root_id(root), buf, + parent_start, last_ref); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -926,7 +926,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, free_extent_buffer(mid); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1); + btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); /* once for the root ptr */ free_extent_buffer_stale(mid); return 0; @@ -985,7 +985,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(right); del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); - btrfs_free_tree_block(trans, root, right, 0, 1); + btrfs_free_tree_block(trans, btrfs_root_id(root), right, + 0, 1); free_extent_buffer_stale(right); right = NULL; } else { @@ -1030,7 +1031,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(mid); del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1); + btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); free_extent_buffer_stale(mid); mid = NULL; } else { @@ -4031,7 +4032,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); atomic_inc(&leaf->refs); - btrfs_free_tree_block(trans, root, leaf, 0, 1); + btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); free_extent_buffer_stale(leaf); } /* diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3c14d9f..095cf53 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2245,6 +2245,11 @@ static inline bool btrfs_root_dead(const struct btrfs_root *root) return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; } +static inline u64 btrfs_root_id(const struct btrfs_root *root) +{ + return root->root_key.objectid; +} + /* struct btrfs_root_backup */ BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, tree_root, 64); @@ -2707,7 +2712,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, u64 empty_size, enum btrfs_lock_nesting nest); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + u64 root_id, struct extent_buffer *buf, u64 parent, int last_ref); int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 87c23c5..3ace925 100644 --- a/fs/btrfs/extent-tree.c 
+++ b/fs/btrfs/extent-tree.c @@ -3278,20 +3278,20 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, } void btrfs_free_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + u64 root_id, struct extent_buffer *buf, u64 parent, int last_ref) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_ref generic_ref = { 0 }; int ret; btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, buf->start, buf->len, parent); btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), - root->root_key.objectid); + root_id); - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + if (root_id != BTRFS_TREE_LOG_OBJECTID) { btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); BUG_ON(ret); /* -ENOMEM */ @@ -3301,7 +3301,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_block_group *cache; bool must_pin = false; - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + if (root_id != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, buf->start); if (!ret) { btrfs_redirty_list_add(trans->transaction, buf); @@ -5436,7 +5436,8 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, goto owner_mismatch; } - btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); + btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent, + wc->refs[level] == 1); out: wc->refs[level] = 0; wc->flags[level] = 0; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index a33bca9..3abec44 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1256,8 +1256,8 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_tree_lock(free_space_root->node); btrfs_clean_tree_block(free_space_root->node); btrfs_tree_unlock(free_space_root->node); - btrfs_free_tree_block(trans, free_space_root, free_space_root->node, - 0, 1); + btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), + free_space_root->node, 0, 1); btrfs_put_root(free_space_root); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cc61813..0fa8b1a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -615,11 +615,12 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, * Since we don't abort the transaction in this case, free the * tree block so that we don't leak space and leave the * filesystem in an inconsistent state (an extent item in the - * extent tree without backreferences). Also no need to have - * the tree block locked since it is not in any tree at this - * point, so no other task can find it and use it. + * extent tree with a backreference for a root that does not + * exists). Also no need to have the tree block locked since it + * is not in any tree at this point, so no other task can find + * it and use it. 
*/ - btrfs_free_tree_block(trans, root, leaf, 0, 1); + btrfs_free_tree_block(trans, objectid, leaf, 0, 1); free_extent_buffer(leaf); goto fail; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index db680f5..6c037f1 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1219,7 +1219,8 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) btrfs_tree_lock(quota_root->node); btrfs_clean_tree_block(quota_root->node); btrfs_tree_unlock(quota_root->node); - btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1); + btrfs_free_tree_block(trans, btrfs_root_id(quota_root), + quota_root->node, 0, 1); btrfs_put_root(quota_root); -- 2.34.1.75.gabe6bb3905 From 769112ff6aaf6d1ad2d974ed1f2d259eed519d35 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Dec 2021 08:45:13 +0000 Subject: [PATCH 35/48] btrfs: fix warning when freeing leaf after subvolume creation failure When creating a subvolume, at ioctl.c:create_subvol(), if we fail to insert the root item for the new subvolume into the root tree, we can trigger the following warning: [78961.741046] WARNING: CPU: 0 PID: 4079814 at fs/btrfs/extent-tree.c:3357 btrfs_free_tree_block+0x2af/0x310 [btrfs] [78961.743344] Modules linked in: [78961.749440] dm_snapshot dm_thin_pool (...) [78961.773648] CPU: 0 PID: 4079814 Comm: fsstress Not tainted 5.16.0-rc4-btrfs-next-108 #1 [78961.775198] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [78961.777266] RIP: 0010:btrfs_free_tree_block+0x2af/0x310 [btrfs] [78961.778398] Code: 17 00 48 85 (...) [78961.781067] RSP: 0018:ffffaa4001657b28 EFLAGS: 00010202 [78961.781877] RAX: 0000000000000213 RBX: ffff897f8a796910 RCX: 0000000000000000 [78961.782780] RDX: 0000000000000000 RSI: 0000000011004000 RDI: 00000000ffffffff [78961.783764] RBP: ffff8981f490e800 R08: 0000000000000001 R09: 0000000000000000 [78961.784740] R10: 0000000000000000 R11: 0000000000000001 R12: ffff897fc963fcc8 [78961.785665] R13: 0000000000000001 R14: ffff898063548000 R15: ffff898063548000 [78961.786620] FS: 00007f31283c6b80(0000) GS:ffff8982ace00000(0000) knlGS:0000000000000000 [78961.787717] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [78961.788598] CR2: 00007f31285c3000 CR3: 000000023fcc8003 CR4: 0000000000370ef0 [78961.789568] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [78961.790585] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [78961.791684] Call Trace: [78961.792082] [78961.792359] create_subvol+0x5d1/0x9a0 [btrfs] [78961.793054] btrfs_mksubvol+0x447/0x4c0 [btrfs] [78961.794009] ? preempt_count_add+0x49/0xa0 [78961.794705] __btrfs_ioctl_snap_create+0x123/0x190 [btrfs] [78961.795712] ? _copy_from_user+0x66/0xa0 [78961.796382] btrfs_ioctl_snap_create_v2+0xbb/0x140 [btrfs] [78961.797392] btrfs_ioctl+0xd1e/0x35c0 [btrfs] [78961.798172] ? __slab_free+0x10a/0x360 [78961.798820] ? rcu_read_lock_sched_held+0x12/0x60 [78961.799664] ? lock_release+0x223/0x4a0 [78961.800321] ? lock_acquired+0x19f/0x420 [78961.800992] ? rcu_read_lock_sched_held+0x12/0x60 [78961.801796] ? trace_hardirqs_on+0x1b/0xe0 [78961.802495] ? _raw_spin_unlock_irqrestore+0x3e/0x60 [78961.803358] ? kmem_cache_free+0x321/0x3c0 [78961.804071] ? __x64_sys_ioctl+0x83/0xb0 [78961.804711] __x64_sys_ioctl+0x83/0xb0 [78961.805348] do_syscall_64+0x3b/0xc0 [78961.805969] entry_SYSCALL_64_after_hwframe+0x44/0xae [78961.806830] RIP: 0033:0x7f31284bc957 [78961.807517] Code: 3c 1c 48 f7 d8 (...) 
This is because we are calling btrfs_free_tree_block() on an extent buffer that is dirty. Fix that by cleaning the extent buffer, with btrfs_clean_tree_block(), before freeing it. This was triggered by test case generic/475 from fstests. Fixes: 67addf29004c5b ("btrfs: fix metadata extent leak after failure to create subvolume") Signed-off-by: Filipe Manana --- fs/btrfs/ioctl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0fa8b1a..813b5fc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -616,10 +616,11 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, * tree block so that we don't leak space and leave the * filesystem in an inconsistent state (an extent item in the * extent tree with a backreference for a root that does not - * exists). Also no need to have the tree block locked since it - * is not in any tree at this point, so no other task can find - * it and use it. + * exists). */ + btrfs_tree_lock(leaf); + btrfs_clean_tree_block(leaf); + btrfs_tree_unlock(leaf); btrfs_free_tree_block(trans, objectid, leaf, 0, 1); free_extent_buffer(leaf); goto fail; -- 2.34.1.75.gabe6bb3905 From 8b5d5b7a13fdddb0ac98940844d8f3454980d80b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Dec 2021 08:45:14 +0000 Subject: [PATCH 36/48] btrfs: skip transaction commit after failure to create subvolume At ioctl.c:create_subvol(), when we fail to create a subvolume we always commit the transaction. In most cases this is a no-op, since all the error paths, except for one, abort the transaction - the only exception is when we fail to insert the new root item into the root tree, in that case we don't abort the transaction because we didn't do anything that is irreversible - however we end up committing the transaction which although is not a functional problem, it adds unnecessary rotation of the backup roots in the superblock and unnecessary work. So change that to commit a transaction only when no error happened, otherwise just call btrfs_end_transaction() to release our reference on the transaction. Signed-off-by: Filipe Manana --- fs/btrfs/ioctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 813b5fc..97e8289 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -516,7 +516,6 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, struct timespec64 cur_time = current_time(dir); struct inode *inode; int ret; - int err; dev_t anon_dev = 0; u64 objectid; u64 index = 0; @@ -696,9 +695,10 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(root, &block_rsv); - err = btrfs_commit_transaction(trans); - if (err && !ret) - ret = err; + if (ret) + btrfs_end_transaction(trans); + else + ret = btrfs_commit_transaction(trans); if (!ret) { inode = btrfs_lookup_dentry(dir, dentry); -- 2.34.1.75.gabe6bb3905 From cf5d57f8ff96b11c92ec68bc0aa3c2b1a3e23a4a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 14 Dec 2021 11:29:01 +0000 Subject: [PATCH 37/48] btrfs: fix missing last dir item offset update when logging directory When logging a directory, once we finish processing a leaf that is full of dir items, if we find the next leaf was not modified in the current transaction, we grab the first key of that next leaf and log it as to mark the end of a key range boundary. 
However we did not update the value of ctx->last_dir_item_offset, which tracks the offset of the last logged key. This can result in subsequent logging of the same directory in the current transaction to not realize that key was already logged, and then add it to the middle of a batch that starts with a lower key, resulting later in a leaf with one key that is duplicated and at non-consecutive slots. When that happens we get an error later when writing out the leaf, reporting that there is a pair of keys in wrong order. The report is something like the following: Dec 13 21:44:50 kernel: BTRFS critical (device dm-0): corrupt leaf: root=18446744073709551610 block=118444032 slot=21, bad key order, prev (704687 84 4146773349) current (704687 84 1063561078) Dec 13 21:44:50 kernel: BTRFS info (device dm-0): leaf 118444032 gen 91449 total ptrs 39 free space 546 owner 18446744073709551610 Dec 13 21:44:50 kernel: item 0 key (704687 1 0) itemoff 3835 itemsize 160 Dec 13 21:44:50 kernel: inode generation 35532 size 1026 mode 40755 Dec 13 21:44:50 kernel: item 1 key (704687 12 704685) itemoff 3822 itemsize 13 Dec 13 21:44:50 kernel: item 2 key (704687 24 3817753667) itemoff 3736 itemsize 86 Dec 13 21:44:50 kernel: item 3 key (704687 60 0) itemoff 3728 itemsize 8 Dec 13 21:44:50 kernel: item 4 key (704687 72 0) itemoff 3720 itemsize 8 Dec 13 21:44:50 kernel: item 5 key (704687 84 140445108) itemoff 3666 itemsize 54 Dec 13 21:44:50 kernel: dir oid 704793 type 1 Dec 13 21:44:50 kernel: item 6 key (704687 84 298800632) itemoff 3599 itemsize 67 Dec 13 21:44:50 kernel: dir oid 707849 type 2 Dec 13 21:44:50 kernel: item 7 key (704687 84 476147658) itemoff 3532 itemsize 67 Dec 13 21:44:50 kernel: dir oid 707901 type 2 Dec 13 21:44:50 kernel: item 8 key (704687 84 633818382) itemoff 3471 itemsize 61 Dec 13 21:44:50 kernel: dir oid 704694 type 2 Dec 13 21:44:50 kernel: item 9 key (704687 84 654256665) itemoff 3403 itemsize 68 Dec 13 21:44:50 kernel: dir oid 707841 type 1 Dec 13 21:44:50 kernel: item 10 key (704687 84 995843418) itemoff 3331 itemsize 72 Dec 13 21:44:50 kernel: dir oid 2167736 type 1 Dec 13 21:44:50 kernel: item 11 key (704687 84 1063561078) itemoff 3278 itemsize 53 Dec 13 21:44:50 kernel: dir oid 704799 type 2 Dec 13 21:44:50 kernel: item 12 key (704687 84 1101156010) itemoff 3225 itemsize 53 Dec 13 21:44:50 kernel: dir oid 704696 type 1 Dec 13 21:44:50 kernel: item 13 key (704687 84 2521936574) itemoff 3173 itemsize 52 Dec 13 21:44:50 kernel: dir oid 704704 type 2 Dec 13 21:44:50 kernel: item 14 key (704687 84 2618368432) itemoff 3112 itemsize 61 Dec 13 21:44:50 kernel: dir oid 704738 type 1 Dec 13 21:44:50 kernel: item 15 key (704687 84 2676316190) itemoff 3046 itemsize 66 Dec 13 21:44:50 kernel: dir oid 2167729 type 1 Dec 13 21:44:50 kernel: item 16 key (704687 84 3319104192) itemoff 2986 itemsize 60 Dec 13 21:44:50 kernel: dir oid 704745 type 2 Dec 13 21:44:50 kernel: item 17 key (704687 84 3908046265) itemoff 2929 itemsize 57 Dec 13 21:44:50 kernel: dir oid 2167734 type 1 Dec 13 21:44:50 kernel: item 18 key (704687 84 3945713089) itemoff 2857 itemsize 72 Dec 13 21:44:50 kernel: dir oid 2167730 type 1 Dec 13 21:44:50 kernel: item 19 key (704687 84 4077169308) itemoff 2795 itemsize 62 Dec 13 21:44:50 kernel: dir oid 704688 type 1 Dec 13 21:44:50 kernel: item 20 key (704687 84 4146773349) itemoff 2727 itemsize 68 Dec 13 21:44:50 kernel: dir oid 707892 type 1 Dec 13 21:44:50 kernel: item 21 key (704687 84 1063561078) itemoff 2674 itemsize 53 Dec 13 21:44:50 kernel: dir oid 704799 type 2 Dec 13 
21:44:50 kernel: item 22 key (704687 96 2) itemoff 2612 itemsize 62 Dec 13 21:44:50 kernel: item 23 key (704687 96 6) itemoff 2551 itemsize 61 Dec 13 21:44:50 kernel: item 24 key (704687 96 7) itemoff 2498 itemsize 53 Dec 13 21:44:50 kernel: item 25 key (704687 96 12) itemoff 2446 itemsize 52 Dec 13 21:44:50 kernel: item 26 key (704687 96 14) itemoff 2385 itemsize 61 Dec 13 21:44:50 kernel: item 27 key (704687 96 18) itemoff 2325 itemsize 60 Dec 13 21:44:50 kernel: item 28 key (704687 96 24) itemoff 2271 itemsize 54 Dec 13 21:44:50 kernel: item 29 key (704687 96 28) itemoff 2218 itemsize 53 Dec 13 21:44:50 kernel: item 30 key (704687 96 62) itemoff 2150 itemsize 68 Dec 13 21:44:50 kernel: item 31 key (704687 96 66) itemoff 2083 itemsize 67 Dec 13 21:44:50 kernel: item 32 key (704687 96 75) itemoff 2015 itemsize 68 Dec 13 21:44:50 kernel: item 33 key (704687 96 79) itemoff 1948 itemsize 67 Dec 13 21:44:50 kernel: item 34 key (704687 96 82) itemoff 1882 itemsize 66 Dec 13 21:44:50 kernel: item 35 key (704687 96 83) itemoff 1810 itemsize 72 Dec 13 21:44:50 kernel: item 36 key (704687 96 85) itemoff 1753 itemsize 57 Dec 13 21:44:50 kernel: item 37 key (704687 96 87) itemoff 1681 itemsize 72 Dec 13 21:44:50 kernel: item 38 key (704694 1 0) itemoff 1521 itemsize 160 Dec 13 21:44:50 kernel: inode generation 35534 size 30 mode 40755 Dec 13 21:44:50 kernel: BTRFS error (device dm-0): block=118444032 write time tree block corruption detected So fix that by adding the missing update of ctx->last_dir_item_offset with the offset of the boundary key. Reported-by: Chris Murphy Link: https://lore.kernel.org/linux-btrfs/CAJCQCtT+RSzpUjbMq+UfzNUMe1X5+1G+DnAGbHC=OZ=iRS24jg@mail.gmail.com/ Fixes: dc2872247ec0ca ("btrfs: keep track of the last logged keys when logging a directory") Signed-off-by: Filipe Manana --- fs/btrfs/tree-log.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 586a88b..d7257e7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3948,6 +3948,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, goto done; } if (btrfs_header_generation(path->nodes[0]) != trans->transid) { + ctx->last_dir_item_offset = min_key.offset; ret = overwrite_item(trans, log, dst_path, path->nodes[0], path->slots[0], &min_key); -- 2.34.1.75.gabe6bb3905 From ed6d09d5ae204c20b0dad547193f72cac40f59b4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 15 Dec 2021 12:19:58 +0000 Subject: [PATCH 38/48] btrfs: don't log unnecessary boundary keys when logging directory Before we start to log dir index keys from a leaf, we check if there is a previous index key, which normally is at the end of a leaf that was not changed in the current transaction. Then we log that key and set the start of logged range (item of type BTRFS_DIR_LOG_INDEX_KEY) to the offset of that key. This is to ensure that if there were deleted index keys between that key and the first key we are going to log, those deletions are replayed in case we need to replay to the log after a power failure. However we really don't need to log that previous key, we can just set the start of the logged range to that key's offset plus 1. This achieves the same and avoids logging one dir index key. The same logic is performed when we finish logging the index keys of a leaf and we find that the next leaf has index keys and was not changed in the current transaction. We are logging the first key of that next leaf and use its offset as the end of range we log. 
This is just to ensure that if there were deleted index keys between the last index key we logged and the first key of that next leaf, those index keys are deleted if we end up replaying the log. However that is not necessary, we can avoid logging that first index key of the next leaf and instead set the end of the logged range to match the offset of that index key minus 1. So avoid logging those index keys at the boundaries and adjust the start and end offsets of the logged ranges as described above. This patch is part of a patchset comprised of the following patches: 1/4 btrfs: don't log unnecessary boundary keys when logging directory 2/4 btrfs: put initial index value of a directory in a constant 3/4 btrfs: stop copying old dir items when logging a directory 4/4 btrfs: stop trying to log subdirectories created in past transactions Performance test results are listed in the changelog of patch 3/4. Signed-off-by: Filipe Manana --- fs/btrfs/tree-log.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d7257e7..e945bfd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3890,17 +3890,18 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); if (ret == 0) { struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); - if (tmp.type == BTRFS_DIR_INDEX_KEY) { - first_offset = tmp.offset; - ret = overwrite_item(trans, log, dst_path, - path->nodes[0], path->slots[0], - &tmp); - if (ret) { - err = ret; - goto done; - } - } + /* + * The dir index key before the first one we found that needs to + * be logged might be in a previous leaf, and there might be a + * gap between these keys, meaning that we had deletions that + * happened. So the key range item we log (key type + * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the + * previous key's offset plus 1, so that those deletes are replayed. + */ + if (tmp.type == BTRFS_DIR_INDEX_KEY) + first_offset = tmp.offset + 1; } btrfs_release_path(path); @@ -3948,14 +3949,16 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, goto done; } if (btrfs_header_generation(path->nodes[0]) != trans->transid) { - ctx->last_dir_item_offset = min_key.offset; - ret = overwrite_item(trans, log, dst_path, - path->nodes[0], path->slots[0], - &min_key); - if (ret) - err = ret; - else - last_offset = min_key.offset; + /* + * The next leaf was not changed in the current transaction + * and has at least one dir index key. + * We check for the next key because there might have been + * one or more deletions between the last key we logged and + * that next key. So the key range item we log (key type + * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's + * offset minus 1, so that those deletes are replayed. + */ + last_offset = min_key.offset - 1; goto done; } if (need_resched()) { -- 2.34.1.75.gabe6bb3905 From a1147512e703d3ef02f3f851976e151610de508e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 15 Dec 2021 12:19:59 +0000 Subject: [PATCH 39/48] btrfs: put initial index value of a directory in a constant At btrfs_set_inode_index_count() we refer twice to the number 2 as the initial index value for a directory (when it's empty), with a proper comment explaining the reason for that value. 
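For reference, the value exists because the first two readdir positions are reserved for the dot entries. A trivial sketch (is_dot_index() is hypothetical, only to illustrate the constant):

	/* f_pos 0 is "." and f_pos 1 is "..", both emitted by dir_emit_dots(). */
	#define BTRFS_DIR_START_INDEX 2

	static inline bool is_dot_index(u64 index)
	{
		return index < BTRFS_DIR_START_INDEX;
	}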
In the next patch I'll have to use that magic value in the directory logging code, so put the value in a #define at btrfs_inode.h, to avoid hardcoding the magic value again at tree-log.c. Signed-off-by: Filipe Manana --- fs/btrfs/btrfs_inode.h | 12 ++++++++++-- fs/btrfs/inode.c | 10 ++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index a303b0b..098320c 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -13,6 +13,13 @@ #include "ordered-data.h" #include "delayed-inode.h" +/* + * Since we search a directory based on f_pos (struct dir_context::pos) we have + * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so + * everybody else has to start at 2 (see btrfs_real_readdir() and dir_emit_dots()). + */ +#define BTRFS_DIR_START_INDEX 2 + /* * ordered_data_close is set by truncate when a file that used * to have good data has been truncated to zero. When it is set @@ -173,8 +180,9 @@ struct btrfs_inode { u64 disk_i_size; /* - * if this is a directory then index_cnt is the counter for the index - * number for new files that are created + * If this is a directory then index_cnt is the counter for the index + * number for new files that are created. For an empty directory, this + * must be initialized to BTRFS_DIR_START_INDEX. */ u64 index_cnt; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4004d17..188adc5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6328,14 +6328,8 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) goto out; ret = 0; - /* - * MAGIC NUMBER EXPLANATION: - * since we search a directory based on f_pos we have to start at 2 - * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody - * else has to start at 2 - */ if (path->slots[0] == 0) { - inode->index_cnt = 2; + inode->index_cnt = BTRFS_DIR_START_INDEX; goto out; } @@ -6346,7 +6340,7 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) if (found_key.objectid != btrfs_ino(inode) || found_key.type != BTRFS_DIR_INDEX_KEY) { - inode->index_cnt = 2; + inode->index_cnt = BTRFS_DIR_START_INDEX; goto out; } -- 2.34.1.75.gabe6bb3905 From 6a108f4464cfd2e71cbdba0f5b4f43d32dd97118 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 15 Dec 2021 12:20:00 +0000 Subject: [PATCH 40/48] btrfs: stop copying old dir items when logging a directory When logging a directory, we go over every leaf of the subvolume tree that was changed in the current transaction and copy all its dir index keys to the log tree. That includes copying dir index keys created in past transactions. This is done mostly for simplicity, as after logging the keys we log an item that specifies the start and end ranges of the keys we logged. That item is then used during log replay to figure out which keys need to be deleted - every key in that range that we find in the subvolume tree and is not in the log tree, needs to be deleted. Now that we log only dir index keys, and not dir item keys anymore, when we remove dentries from a directory (due to unlink and rename operations), we can get entire leaves that we changed only for deleting old dir index keys, or that have few dir index keys that are new - this is due to the fact that the offset for new index keys comes from a monotonically increasing counter. We can avoid logging dir index keys from past transactions, and in order to track the deletions, only log range items (BTRFS_DIR_LOG_INDEX_KEY key type) when we find gaps between consecutive index keys. 
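To make the gap tracking concrete, here is a simplified standalone sketch (illustrative only: log_deletion_range() and log_leaf() are invented stand-ins, and the sketch omits the final range item that the real code, in the changes to process_dir_items_leaf() below, also emits):

#include <stdio.h>

struct dir_index { unsigned long long offset; unsigned long long transid; };

/* Stand-in for inserting a BTRFS_DIR_LOG_INDEX_KEY item. */
static void log_deletion_range(unsigned long long start, unsigned long long end)
{
	printf("log range item [%llu, %llu]\n", start, end);
}

static void log_leaf(const struct dir_index *keys, int nr,
		     unsigned long long cur_transid, unsigned long long min_offset)
{
	unsigned long long last_old = min_offset - 1;

	for (int i = 0; i < nr; i++) {
		if (keys[i].transid < cur_transid) {
			/* Old entry: don't copy it, but if the offsets are not
			 * contiguous there were deletions in between, so a
			 * range item must cover the gap. */
			if (keys[i].offset > last_old + 1)
				log_deletion_range(last_old + 1, keys[i].offset - 1);
			last_old = keys[i].offset;
			continue;
		}
		printf("copy index key %llu to the log\n", keys[i].offset);
	}
}

int main(void)
{
	const struct dir_index keys[] = { { 2, 10 }, { 3, 10 }, { 7, 10 }, { 8, 12 } };

	/* Transaction 12 is current: offsets 4-6 were deleted, so a single
	 * range item [4, 6] is logged instead of copying keys 2, 3 and 7. */
	log_leaf(keys, 4, 12, 2);
	return 0;
}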
This massively reduces the amount of logged metadata when we have deleted directory entries, even if it's a small percentage of the total number of entries. The reduction comes both from logging fewer items and from the fact that, instead of logging many dir index items (struct btrfs_dir_item), which have a size of 30 bytes plus a file name, we typically log just a few range items (struct btrfs_dir_log_item), which take only 8 bytes each. Even if no entries were deleted from a directory and only new entries were added, we typically still get a reduction in the amount of logged metadata, because it's very likely the first leaf that got the new dir index entries also has several old dir index entries.

So change the logging logic to not log dir index keys created in past transactions and log a range item for every gap it finds between each pair of consecutive index keys, to ensure deletions are tracked and replayed on log replay.

This patch is part of a patchset comprised of the following patches:

1/4 btrfs: don't log unnecessary boundary keys when logging directory
2/4 btrfs: put initial index value of a directory in a constant
3/4 btrfs: stop copying old dir items when logging a directory
4/4 btrfs: stop trying to log subdirectories created in past transactions

The following test was run on a branch without this patchset and on a branch with the first three patches applied:

$ cat test.sh
#!/bin/bash

DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1

NUM_FILES=1000000
NUM_FILE_DELETES=10000

MKFS_OPTIONS="-O no-holes -R free-space-tree"
MOUNT_OPTIONS="-o ssd"

mkfs.btrfs -f $MKFS_OPTIONS $DEV
mount $MOUNT_OPTIONS $DEV $MNT

mkdir $MNT/testdir
for ((i = 1; i <= $NUM_FILES; i++)); do
    echo -n > $MNT/testdir/file_$i
done

sync

del_inc=$(( $NUM_FILES / $NUM_FILE_DELETES ))
for ((i = 1; i <= $NUM_FILES; i += $del_inc)); do
    rm -f $MNT/testdir/file_$i
done

start=$(date +%s%N)
xfs_io -c "fsync" $MNT/testdir
end=$(date +%s%N)

dur=$(( (end - start) / 1000000 ))
echo "dir fsync took $dur ms after deleting $NUM_FILE_DELETES files"
echo

umount $MNT

The test was run on a non-debug kernel (Debian's default kernel config), and the results were the following for various values of NUM_FILES and NUM_FILE_DELETES:

** before, NUM_FILES = 1 000 000, NUM_FILE_DELETES = 10 000 **

dir fsync took 585 ms after deleting 10000 files

** after, NUM_FILES = 1 000 000, NUM_FILE_DELETES = 10 000 **

dir fsync took 34 ms after deleting 10000 files (-94.2%)

** before, NUM_FILES = 100 000, NUM_FILE_DELETES = 1 000 **

dir fsync took 50 ms after deleting 1000 files

** after, NUM_FILES = 100 000, NUM_FILE_DELETES = 1 000 **

dir fsync took 7 ms after deleting 1000 files (-86.0%)

** before, NUM_FILES = 10 000, NUM_FILE_DELETES = 100 **

dir fsync took 9 ms after deleting 100 files

** after, NUM_FILES = 10 000, NUM_FILE_DELETES = 100 **

dir fsync took 5 ms after deleting 100 files (-44.4%)

Signed-off-by: Filipe Manana
---
 fs/btrfs/tree-log.c | 83 +++++++++++++++++++++++++++++----------------
 1 file changed, 53 insertions(+), 30 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e945bfd..d90282a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3709,7 +3710,8 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
 struct btrfs_inode *inode,
 struct btrfs_path *path,
 struct btrfs_path *dst_path,
- struct btrfs_log_ctx *ctx)
+ struct btrfs_log_ctx *ctx,
+ u64 *last_old_dentry_offset)
 {
 struct btrfs_root *log = inode->root->log_root;
 struct extent_buffer *src = path->nodes[0];
@@ -3722,6 +3723,7 @@ static int
process_dir_items_leaf(struct btrfs_trans_handle *trans, int i; for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_dir_item *di; struct btrfs_key key; int ret; @@ -3732,7 +3734,34 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, break; } + di = btrfs_item_ptr(src, i, struct btrfs_dir_item); ctx->last_dir_item_offset = key.offset; + + /* + * Skip ranges of items that consist only of dir item keys created + * in past transactions. However if we find a gap, we must log a + * dir index range item for that gap, so that index keys in that + * gap are deleted during log replay. + */ + if (btrfs_dir_transid(src, di) < trans->transid) { + if (key.offset > *last_old_dentry_offset + 1) { + ret = insert_dir_log_key(trans, log, dst_path, + ino, *last_old_dentry_offset + 1, + key.offset - 1); + /* + * -EEXIST should never happen because when we + * log a directory in full mode (LOG_INODE_ALL) + * we drop all BTRFS_DIR_LOG_INDEX_KEY keys from + * the log tree. + */ + ASSERT(ret != -EEXIST); + if (ret < 0) + return ret; + } + + *last_old_dentry_offset = key.offset; + continue; + } /* * We must make sure that when we log a directory entry, the * corresponding inode, after log replay, has a matching link @@ -3756,14 +3785,10 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, * resulting in -ENOTEMPTY errors. */ if (!ctx->log_new_dentries) { - struct btrfs_dir_item *di; struct btrfs_key di_key; - di = btrfs_item_ptr(src, i, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(src, di, &di_key); - if ((btrfs_dir_transid(src, di) == trans->transid || - btrfs_dir_type(src, di) == BTRFS_FT_DIR) && - di_key.type != BTRFS_ROOT_ITEM_KEY) + if (di_key.type != BTRFS_ROOT_ITEM_KEY) ctx->log_new_dentries = true; } @@ -3844,7 +3869,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; int err = 0; int ret; - u64 first_offset = min_offset; + u64 last_old_dentry_offset = min_offset - 1; u64 last_offset = (u64)-1; u64 ino = btrfs_ino(inode); @@ -3878,10 +3903,11 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, */ if (ret == 0) { struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); if (tmp.type == BTRFS_DIR_INDEX_KEY) - first_offset = max(min_offset, tmp.offset) + 1; + last_old_dentry_offset = tmp.offset; } goto done; } @@ -3901,7 +3927,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * previous key's offset plus 1, so that those deletes are replayed. 
*/ if (tmp.type == BTRFS_DIR_INDEX_KEY) - first_offset = tmp.offset + 1; + last_old_dentry_offset = tmp.offset; } btrfs_release_path(path); @@ -3923,7 +3949,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * from our directory */ while (1) { - ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx); + ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, + &last_old_dentry_offset); if (ret != 0) { if (ret < 0) err = ret; @@ -3974,13 +4001,21 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, if (err == 0) { *last_offset_ret = last_offset; /* - * insert the log range keys to indicate where the log - * is valid + * In case the leaf was changed in the current transaction but + * all its dir items are from a past transaction, the last item + * in the leaf is a dir item and there's no gap between that last + * dir item and the first one on the next leaf (which did not + * change in the current transaction), then we don't need to log + * a range, last_old_dentry_offset is == to last_offset. */ - ret = insert_dir_log_key(trans, log, path, ino, first_offset, - last_offset); - if (ret) - err = ret; + ASSERT(last_old_dentry_offset <= last_offset); + if (last_old_dentry_offset < last_offset) { + ret = insert_dir_log_key(trans, log, path, ino, + last_old_dentry_offset + 1, + last_offset); + if (ret) + err = ret; + } } return err; } @@ -4022,7 +4057,7 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, if (inode->logged_trans != trans->transid) inode->last_dir_index_offset = (u64)-1; - min_key = 0; + min_key = BTRFS_DIR_START_INDEX; max_key = 0; ctx->last_dir_item_offset = inode->last_dir_index_offset; @@ -5873,7 +5908,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_root *log = root->log_root; struct btrfs_path *path; LIST_HEAD(dir_list); struct btrfs_dir_list *dir_elem; @@ -5915,7 +5949,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, min_key.offset = 0; again: btrfs_release_path(path); - ret = btrfs_search_forward(log, &min_key, path, trans->transid); + ret = btrfs_search_forward(root, &min_key, path, trans->transid); if (ret < 0) { goto next_dir_inode; } else if (ret > 0) { @@ -5923,7 +5957,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, goto next_dir_inode; } -process_leaf: leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); for (i = path->slots[0]; i < nritems; i++) { @@ -5980,16 +6013,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, } break; } - if (i == nritems) { - ret = btrfs_next_leaf(log, path); - if (ret < 0) { - goto next_dir_inode; - } else if (ret > 0) { - ret = 0; - goto next_dir_inode; - } - goto process_leaf; - } if (min_key.offset < (u64)-1) { min_key.offset++; goto again; -- 2.34.1.75.gabe6bb3905 From 7443a4f8d53115ed2e6e756c90d98b71f1bfd92d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 15 Dec 2021 12:20:01 +0000 Subject: [PATCH 41/48] btrfs: stop trying to log subdirectories created in past transactions When logging a directory we are trying to log subdirectories that were changed in the current transaction and created in a past transaction. 
This type of behaviour was introduced by commit 2f2ff0ee5e4303 ("Btrfs: fix metadata inconsistencies after directory fsync") to fix some metadata inconsistencies that, due to numerous other changes that happened throughout the years, no longer need this behaviour.

This behaviour, besides no longer being needed, is also undesirable because:

1) It's not reliable because it's only triggered for the directories of dentries (dir items) that happen to be present on a leaf that was changed in the current transaction. If a dentry that points to a directory resides on a leaf that was not changed in the current transaction, then it's not logged, as at log_dir_items() and log_new_dir_dentries() we use btrfs_search_forward();

2) It's not required by POSIX or any standard, it's undefined territory. The only way to guarantee a subdirectory is logged is to explicitly fsync it;

Making the behaviour guaranteed would require scanning all directory items, checking which point to a directory, and then fsyncing each subdirectory that was modified in the current transaction. This could be very expensive for large directories with many subdirectories and/or large subdirectories.

So remove that obsolete logic.

Signed-off-by: Filipe Manana
---
 fs/btrfs/tree-log.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d90282a..4129898 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5974,8 +5974,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
 type = btrfs_dir_type(leaf, di);
- if (btrfs_dir_transid(leaf, di) < trans->transid &&
- type != BTRFS_FT_DIR)
+ if (btrfs_dir_transid(leaf, di) < trans->transid)
 continue;
 btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
 if (di_key.type == BTRFS_ROOT_ITEM_KEY)
--
2.34.1.75.gabe6bb3905

From 3cd72fe082a4953214d99c4862dca9c6478a5f7b Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Fri, 17 Dec 2021 12:35:07 +0000
Subject: [PATCH 42/48] btrfs: fix reserved space leak on log tree nodes after transaction abort

After the recent changes by commit c2e39305299f01 ("btrfs: clear extent buffer uptodate when we fail to write it") and its followup fix that has the subject "btrfs: check WRITE_ERR when trying to read an extent buffer" (not yet in Linus' tree), after a transaction abort we can often end up not unreserving the space that was reserved for log tree extent buffers.

This happens because if writeback for a log tree extent buffer failed, then we have cleared the EXTENT_BUFFER_UPTODATE bit from the extent buffer and we have also set the EXTENT_BUFFER_WRITE_ERR bit on it. Later on, when trying to free the log tree with free_log_tree(), which iterates over the tree, we can end up getting an -EIO error when trying to read a node or leaf, since read_extent_buffer_pages() returns -EIO if an extent buffer does not have EXTENT_BUFFER_UPTODATE set and has the EXTENT_BUFFER_WRITE_ERR bit set. Getting that -EIO means we return immediately, as we cannot iterate over the entire tree. In that case we never update the reserved space for every extent buffer in the respective block group and space_info object.
When this happens we get the following traces when unmounting the filesystem:

[174957.284509] BTRFS: error (device dm-0) in cleanup_transaction:1913: errno=-5 IO failure
[174957.286497] BTRFS: error (device dm-0) in free_log_tree:3420: errno=-5 IO failure
[174957.399379] ------------[ cut here ]------------
[174957.402497] WARNING: CPU: 2 PID: 3206883 at fs/btrfs/block-group.c:127 btrfs_put_block_group+0x77/0xb0 [btrfs]
[174957.407523] Modules linked in: btrfs overlay dm_zero (...)
[174957.424917] CPU: 2 PID: 3206883 Comm: umount Tainted: G W 5.16.0-rc5-btrfs-next-109 #1
[174957.426689] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
[174957.428716] RIP: 0010:btrfs_put_block_group+0x77/0xb0 [btrfs]
[174957.429717] Code: 21 48 8b bd (...)
[174957.432867] RSP: 0018:ffffb70d41cffdd0 EFLAGS: 00010206
[174957.433632] RAX: 0000000000000001 RBX: ffff8b09c3848000 RCX: ffff8b0758edd1c8
[174957.434689] RDX: 0000000000000001 RSI: ffffffffc0b467e7 RDI: ffff8b0758edd000
[174957.436068] RBP: ffff8b0758edd000 R08: 0000000000000000 R09: 0000000000000000
[174957.437114] R10: 0000000000000246 R11: 0000000000000000 R12: ffff8b09c3848148
[174957.438140] R13: ffff8b09c3848198 R14: ffff8b0758edd188 R15: dead000000000100
[174957.439317] FS: 00007f328fb82800(0000) GS:ffff8b0a2d200000(0000) knlGS:0000000000000000
[174957.440402] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[174957.441164] CR2: 00007fff13563e98 CR3: 0000000404f4e005 CR4: 0000000000370ee0
[174957.442117] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[174957.443076] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[174957.443948] Call Trace:
[174957.444264]
[174957.444538] btrfs_free_block_groups+0x255/0x3c0 [btrfs]
[174957.445238] close_ctree+0x301/0x357 [btrfs]
[174957.445803] ? call_rcu+0x16c/0x290
[174957.446250] generic_shutdown_super+0x74/0x120
[174957.446832] kill_anon_super+0x14/0x30
[174957.447305] btrfs_kill_super+0x12/0x20 [btrfs]
[174957.447890] deactivate_locked_super+0x31/0xa0
[174957.448440] cleanup_mnt+0x147/0x1c0
[174957.448888] task_work_run+0x5c/0xa0
[174957.449336] exit_to_user_mode_prepare+0x1e5/0x1f0
[174957.449934] syscall_exit_to_user_mode+0x16/0x40
[174957.450512] do_syscall_64+0x48/0xc0
[174957.450980] entry_SYSCALL_64_after_hwframe+0x44/0xae
[174957.451605] RIP: 0033:0x7f328fdc4a97
[174957.452059] Code: 03 0c 00 f7 (...)
[174957.454320] RSP: 002b:00007fff13564ec8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [174957.455262] RAX: 0000000000000000 RBX: 00007f328feea264 RCX: 00007f328fdc4a97 [174957.456131] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000560b8ae51dd0 [174957.457118] RBP: 0000560b8ae51ba0 R08: 0000000000000000 R09: 00007fff13563c40 [174957.458005] R10: 00007f328fe49fc0 R11: 0000000000000246 R12: 0000000000000000 [174957.459113] R13: 0000560b8ae51dd0 R14: 0000560b8ae51cb0 R15: 0000000000000000 [174957.460193] [174957.460534] irq event stamp: 0 [174957.461003] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [174957.461947] hardirqs last disabled at (0): [] copy_process+0x934/0x2040 [174957.463147] softirqs last enabled at (0): [] copy_process+0x934/0x2040 [174957.465116] softirqs last disabled at (0): [<0000000000000000>] 0x0 [174957.466323] ---[ end trace bc7ee0c490bce3af ]--- [174957.467282] ------------[ cut here ]------------ [174957.468184] WARNING: CPU: 2 PID: 3206883 at fs/btrfs/block-group.c:3976 btrfs_free_block_groups+0x330/0x3c0 [btrfs] [174957.470066] Modules linked in: btrfs overlay dm_zero (...) [174957.483137] CPU: 2 PID: 3206883 Comm: umount Tainted: G W 5.16.0-rc5-btrfs-next-109 #1 [174957.484691] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [174957.486853] RIP: 0010:btrfs_free_block_groups+0x330/0x3c0 [btrfs] [174957.488050] Code: 00 00 00 ad de (...) [174957.491479] RSP: 0018:ffffb70d41cffde0 EFLAGS: 00010206 [174957.492520] RAX: ffff8b08d79310b0 RBX: ffff8b09c3848000 RCX: 0000000000000000 [174957.493868] RDX: 0000000000000001 RSI: fffff443055ee600 RDI: ffffffffb1131846 [174957.495183] RBP: ffff8b08d79310b0 R08: 0000000000000000 R09: 0000000000000000 [174957.496580] R10: 0000000000000001 R11: 0000000000000000 R12: ffff8b08d7931000 [174957.498027] R13: ffff8b09c38492b0 R14: dead000000000122 R15: dead000000000100 [174957.499438] FS: 00007f328fb82800(0000) GS:ffff8b0a2d200000(0000) knlGS:0000000000000000 [174957.500990] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [174957.502117] CR2: 00007fff13563e98 CR3: 0000000404f4e005 CR4: 0000000000370ee0 [174957.503513] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [174957.504864] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [174957.506167] Call Trace: [174957.506654] [174957.507047] close_ctree+0x301/0x357 [btrfs] [174957.507867] ? call_rcu+0x16c/0x290 [174957.508567] generic_shutdown_super+0x74/0x120 [174957.509447] kill_anon_super+0x14/0x30 [174957.510194] btrfs_kill_super+0x12/0x20 [btrfs] [174957.511123] deactivate_locked_super+0x31/0xa0 [174957.511976] cleanup_mnt+0x147/0x1c0 [174957.512610] task_work_run+0x5c/0xa0 [174957.513309] exit_to_user_mode_prepare+0x1e5/0x1f0 [174957.514231] syscall_exit_to_user_mode+0x16/0x40 [174957.515069] do_syscall_64+0x48/0xc0 [174957.515718] entry_SYSCALL_64_after_hwframe+0x44/0xae [174957.516688] RIP: 0033:0x7f328fdc4a97 [174957.517413] Code: 03 0c 00 f7 d8 (...) 
[174957.521052] RSP: 002b:00007fff13564ec8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [174957.522514] RAX: 0000000000000000 RBX: 00007f328feea264 RCX: 00007f328fdc4a97 [174957.523950] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000560b8ae51dd0 [174957.525375] RBP: 0000560b8ae51ba0 R08: 0000000000000000 R09: 00007fff13563c40 [174957.526763] R10: 00007f328fe49fc0 R11: 0000000000000246 R12: 0000000000000000 [174957.528058] R13: 0000560b8ae51dd0 R14: 0000560b8ae51cb0 R15: 0000000000000000 [174957.529404] [174957.529843] irq event stamp: 0 [174957.530256] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [174957.531061] hardirqs last disabled at (0): [] copy_process+0x934/0x2040 [174957.532075] softirqs last enabled at (0): [] copy_process+0x934/0x2040 [174957.533083] softirqs last disabled at (0): [<0000000000000000>] 0x0 [174957.533865] ---[ end trace bc7ee0c490bce3b0 ]--- [174957.534452] BTRFS info (device dm-0): space_info 4 has 1070841856 free, is not full [174957.535404] BTRFS info (device dm-0): space_info total=1073741824, used=2785280, pinned=0, reserved=49152, may_use=0, readonly=65536 zone_unusable=0 [174957.537029] BTRFS info (device dm-0): global_block_rsv: size 0 reserved 0 [174957.537859] BTRFS info (device dm-0): trans_block_rsv: size 0 reserved 0 [174957.538697] BTRFS info (device dm-0): chunk_block_rsv: size 0 reserved 0 [174957.539552] BTRFS info (device dm-0): delayed_block_rsv: size 0 reserved 0 [174957.540403] BTRFS info (device dm-0): delayed_refs_rsv: size 0 reserved 0 This is often triggered with test cases generic/475 and generic/648 from fstests, which makes the tests fail. So fix this by iterating over the io tree that contains the ranges of all log tree metadata extents and call unaccount_log_buffer() for the range of each metadata extent. This is only called during the transaction abort path if we failed to walk over the entire log tree. Signed-off-by: Filipe Manana --- fs/btrfs/tree-log.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4129898..95414a3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3397,6 +3397,32 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, return ret; } +/* + * If when freeing a log tree we fail to iterate over the entire tree due to a + * past writeback failure, then we have not properly freed every metadata extent. + * In that case we use this function that does not iterate the log tree but it + * still adjusts the reserved bytes in the block group of each metadata extent. 
+ */ +static void unaccount_all_log_buffers(struct btrfs_root *log) +{ + struct btrfs_fs_info *fs_info = log->fs_info; + u64 start = 0; + u64 end; + + while (!find_first_extent_bit(&log->dirty_log_pages, start, &start, &end, + EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT, NULL)) { + u64 bytenr; + + for (bytenr = start; bytenr < end; bytenr += fs_info->nodesize) { + unaccount_log_buffer(fs_info, bytenr); + cond_resched(); + } + + start = end + 1; + } + +} + static void free_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *log) { @@ -3409,10 +3435,12 @@ static void free_log_tree(struct btrfs_trans_handle *trans, if (log->node) { ret = walk_log_tree(trans, log, &wc); if (ret) { - if (trans) + if (trans) { btrfs_abort_transaction(trans, ret); - else + } else { btrfs_handle_fs_error(log->fs_info, ret, NULL); + unaccount_all_log_buffers(log); + } } } -- 2.34.1.75.gabe6bb3905 From b7d2506b7d2124c4f7ac12477add6efd0c0a33c1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 5 Oct 2021 16:35:25 -0400 Subject: [PATCH 43/48] btrfs: add a BTRFS_FS_ERROR helper We have a few flags that are inconsistently used to describe the fs in different states of failure. As of 5963ffcaf383 ("btrfs: always abort the transaction if we abort a trans handle") we will always set BTRFS_FS_STATE_ERROR if we abort, so we don't have to check both ABORTED and ERROR to see if things have gone wrong. Add a helper to check BTRFS_FS_STATE_ERROR and then convert all checkers of FS_STATE_ERROR to use the helper. The TRANS_ABORTED bit check was added in af7227338135 ("Btrfs: clean up resources during umount after trans is aborted") but is not actually specific. Reviewed-by: Anand Jain Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/disk-io.c | 8 +++----- fs/btrfs/extent_io.c | 2 +- fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 6 +++--- fs/btrfs/scrub.c | 2 +- fs/btrfs/super.c | 2 +- fs/btrfs/transaction.c | 11 +++++------ fs/btrfs/tree-log.c | 2 +- 9 files changed, 19 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 095cf53..a40e47c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3607,6 +3607,9 @@ do { \ (errno), fmt, ##args); \ } while (0) +#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ + &(fs_info)->fs_state))) + __printf(5, 6) __cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e1a2621..a9a07d6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1961,8 +1961,7 @@ static int transaction_kthread(void *arg) wake_up_process(fs_info->cleaner_kthread); mutex_unlock(&fs_info->transaction_kthread_mutex); - if (unlikely(test_bit(BTRFS_FS_STATE_ERROR, - &fs_info->fs_state))) + if (BTRFS_FS_ERROR(fs_info)) btrfs_cleanup_transaction(fs_info); if (!kthread_should_stop() && (!btrfs_transaction_blocked(fs_info) || @@ -4243,7 +4242,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, drop_ref = true; spin_unlock(&fs_info->fs_roots_radix_lock); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { ASSERT(root->log_root == NULL); if (root->reloc_root) { btrfs_put_root(root->reloc_root); @@ -4394,8 +4393,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "commit super ret %d", ret); } - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) || - test_bit(BTRFS_FS_STATE_TRANS_ABORTED, 
&fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) btrfs_error_commit_super(fs_info); kthread_stop(fs_info->transaction_kthread); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 250fd3c..0e48784 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4879,7 +4879,7 @@ int btree_write_cache_pages(struct address_space *mapping, * extent io tree. Thus we don't want to submit such wild eb * if the fs already has error. */ - if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (!BTRFS_FS_ERROR(fs_info)) { ret = flush_write_bio(&epd); } else { ret = -EROFS; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6c50561..74a7ece 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2011,7 +2011,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * have opened a file as writable, we have to stop this write operation * to ensure consistency. */ - if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state)) + if (BTRFS_FS_ERROR(inode->root->fs_info)) return -EROFS; if (!(iocb->ki_flags & IOCB_DIRECT) && diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 188adc5..7745c06 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4372,7 +4372,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root) struct inode *inode; u64 objectid = 0; - if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (!BTRFS_FS_ERROR(fs_info)) WARN_ON(btrfs_root_refs(&root->root_item) != 0); spin_lock(&root->inode_lock); @@ -9982,7 +9982,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_conte }; struct btrfs_fs_info *fs_info = root->fs_info; - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) return -EROFS; return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); @@ -10001,7 +10001,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, struct list_head splice; int ret; - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) return -EROFS; INIT_LIST_HEAD(&splice); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 62f4baf..c6da282 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3956,7 +3956,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, int ret; struct btrfs_fs_info *fs_info = sctx->fs_info; - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) return -EROFS; /* Seed devices of a new filesystem has their own generation. */ diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7f91d62..a1c54a2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2006,7 +2006,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; } else { - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { btrfs_err(fs_info, "Remounting read-write after error is not allowed"); ret = -EINVAL; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 14b9fdc..1c3a118 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -283,7 +283,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->trans_lock); loop: /* The file system has been taken offline. No new transactions. 
*/ - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { spin_unlock(&fs_info->trans_lock); return -EROFS; } @@ -331,7 +331,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info, */ kfree(cur_trans); goto loop; - } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + } else if (BTRFS_FS_ERROR(fs_info)) { spin_unlock(&fs_info->trans_lock); kfree(cur_trans); return -EROFS; @@ -579,7 +579,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, bool do_chunk_alloc = false; int ret; - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) return ERR_PTR(-EROFS); if (current->journal_info) { @@ -991,8 +991,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (throttle) btrfs_run_delayed_iputs(info); - if (TRANS_ABORTED(trans) || - test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) { + if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) { wake_up_process(info->transaction_kthread); if (TRANS_ABORTED(trans)) err = trans->aborted; @@ -2155,7 +2154,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * abort to prevent writing a new superblock that reflects a * corrupt state (pointing to trees with unwritten nodes/leafs). */ - if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { ret = -EROFS; goto cleanup_transaction; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 95414a3..ddbb2d0 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3338,7 +3338,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * writing the super here would result in transid mismatches. If there * is an error here just bail. */ - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { ret = -EIO; btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); -- 2.34.1.75.gabe6bb3905 From c66c725bc7741bbd152c26a246dce210d564bf10 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 5 Oct 2021 16:35:24 -0400 Subject: [PATCH 44/48] btrfs: change error handling for btrfs_delete_*_in_log Currently we will abort the transaction if we get a random error (like -EIO) while trying to remove the directory entries from the root log during rename. However since these are simply log tree related errors, we can mark the trans as needing a full commit. Then if the error was truly catastrophic we'll hit it during the normal commit and abort as appropriate. 
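In outline, the error handling both helpers end up with looks like this (a condensed view of the hunks below, not a complete function):

	ret = btrfs_del_inode_ref(trans, log, name, name_len,
				  btrfs_ino(inode), dirid, &index);
	mutex_unlock(&inode->log_mutex);
	if (ret < 0 && ret != -ENOENT)
		/* A log-tree-only failure: make the next fsync fall back to
		 * a full transaction commit instead of aborting the whole
		 * transaction here. */
		btrfs_set_log_full_commit(trans);
	btrfs_end_log_trans(root);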
Reviewed-by: Nikolay Borisov Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/inode.c | 16 +++------------- fs/btrfs/tree-log.c | 40 ++++++++++++++-------------------------- fs/btrfs/tree-log.h | 16 ++++++++-------- 3 files changed, 25 insertions(+), 47 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7745c06..ddec13a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4104,19 +4104,9 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto err; } - ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, - dir_ino); - if (ret != 0 && ret != -ENOENT) { - btrfs_abort_transaction(trans, ret); - goto err; - } - - ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, - index); - if (ret == -ENOENT) - ret = 0; - else if (ret) - btrfs_abort_transaction(trans, ret); + btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, + dir_ino); + btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); /* * If we have a pending delayed iput we could end up with the final iput diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ddbb2d0..7e34c43 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3529,10 +3529,10 @@ static bool inode_logged(struct btrfs_trans_handle *trans, * This optimizations allows us to avoid relogging the entire inode * or the entire directory. */ -int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - struct btrfs_inode *dir, u64 index) +void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct btrfs_inode *dir, u64 index) { struct btrfs_root *log; struct btrfs_dir_item *di; @@ -3542,11 +3542,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, u64 dir_ino = btrfs_ino(dir); if (!inode_logged(trans, dir)) - return 0; + return; ret = join_running_log_trans(root); if (ret) - return 0; + return; mutex_lock(&dir->log_mutex); @@ -3584,48 +3584,36 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, btrfs_free_path(path); out_unlock: mutex_unlock(&dir->log_mutex); - if (err == -ENOSPC) { + if (err < 0) btrfs_set_log_full_commit(trans); - err = 0; - } else if (err < 0) { - btrfs_abort_transaction(trans, err); - } - btrfs_end_log_trans(root); - - return err; } /* see comments for btrfs_del_dir_entries_in_log */ -int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - struct btrfs_inode *inode, u64 dirid) +void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct btrfs_inode *inode, u64 dirid) { struct btrfs_root *log; u64 index; int ret; if (!inode_logged(trans, inode)) - return 0; + return; ret = join_running_log_trans(root); if (ret) - return 0; + return; log = root->log_root; mutex_lock(&inode->log_mutex); ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), dirid, &index); mutex_unlock(&inode->log_mutex); - if (ret == -ENOSPC) { + if (ret < 0 && ret != -ENOENT) btrfs_set_log_full_commit(trans); - ret = 0; - } else if (ret < 0 && ret != -ENOENT) - btrfs_abort_transaction(trans, ret); btrfs_end_log_trans(root); - - return ret; } /* diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 3ce6bdb..f6811c3 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -70,14 +70,14 @@ 
int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, struct btrfs_log_ctx *ctx); -int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - struct btrfs_inode *dir, u64 index); -int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - struct btrfs_inode *inode, u64 dirid); +void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct btrfs_inode *dir, u64 index); +void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct btrfs_inode *inode, u64 dirid); void btrfs_end_log_trans(struct btrfs_root *root); void btrfs_pin_log_trans(struct btrfs_root *root); void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, -- 2.34.1.75.gabe6bb3905 From 7606e75cff78cb8c2cd2b8fc8a2527968338be03 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 5 Oct 2021 16:35:23 -0400 Subject: [PATCH 45/48] btrfs: change handle_fs_error in recover_log_trees to aborts During inspection of the return path for replay I noticed that we don't actually abort the transaction if we get a failure during replay. This isn't a problem necessarily, as we properly return the error and will fail to mount. However we still leave this dangling transaction that could conceivably be committed without thinking there was an error. We were using btrfs_handle_fs_error() here, but that pre-dates the transaction abort code. Simply replace the btrfs_handle_fs_error() calls with transaction aborts, so we still know where exactly things went wrong, and add a few in some other un-handled error cases. 
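The shape of the conversion, condensed from the hunks below:

	ret = walk_log_tree(trans, log_root_tree, &wc);
	if (ret) {
		/* Was btrfs_handle_fs_error(); aborting instead records the
		 * exact failure point and poisons the transaction so it can
		 * never be committed by mistake. */
		btrfs_abort_transaction(trans, ret);
		goto error;
	}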
Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7e34c43..b137584 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -6535,8 +6535,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = walk_log_tree(trans, log_root_tree, &wc); if (ret) { - btrfs_handle_fs_error(fs_info, ret, - "Failed to pin buffers while recovering log root tree."); + btrfs_abort_transaction(trans, ret); goto error; } @@ -6549,8 +6548,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); if (ret < 0) { - btrfs_handle_fs_error(fs_info, ret, - "Couldn't find tree log root."); + btrfs_abort_transaction(trans, ret); goto error; } if (ret > 0) { @@ -6567,8 +6565,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) log = btrfs_read_tree_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); - btrfs_handle_fs_error(fs_info, ret, - "Couldn't read tree log root."); + btrfs_abort_transaction(trans, ret); goto error; } @@ -6596,8 +6593,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) if (!ret) goto next; - btrfs_handle_fs_error(fs_info, ret, - "Couldn't read target root for tree log recovery."); + btrfs_abort_transaction(trans, ret); goto error; } @@ -6605,14 +6601,15 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = btrfs_record_root_in_trans(trans, wc.replay_dest); if (ret) /* The loop needs to continue due to the root refs */ - btrfs_handle_fs_error(fs_info, ret, - "failed to record the log root in transaction"); + btrfs_abort_transaction(trans, ret); else ret = walk_log_tree(trans, log, &wc); if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { ret = fixup_inode_link_counts(trans, wc.replay_dest, path); + if (ret) + btrfs_abort_transaction(trans, ret); } if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { @@ -6629,6 +6626,8 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) * could only happen during mount. */ ret = btrfs_init_root_free_objectid(root); + if (ret) + btrfs_abort_transaction(trans, ret); } wc.replay_dest->log_root = NULL; -- 2.34.1.75.gabe6bb3905 From ada789a9d3ccbd2e9e8bcc5b96527f4592e165f8 Mon Sep 17 00:00:00 2001 From: Piotr Gorski Date: Wed, 5 Jan 2022 13:48:14 +0100 Subject: [PATCH 46/48] Revert "btrfs: fix reserved space leak on log tree nodes after transaction abort" This reverts commit eac92caa849ff3ea098e676351f07bedd1dc273d. Signed-off-by: Piotr Gorski --- fs/btrfs/tree-log.c | 32 ++------------------------------ 1 file changed, 2 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b137584..5f234c3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3397,32 +3397,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, return ret; } -/* - * If when freeing a log tree we fail to iterate over the entire tree due to a - * past writeback failure, then we have not properly freed every metadata extent. - * In that case we use this function that does not iterate the log tree but it - * still adjusts the reserved bytes in the block group of each metadata extent. 
- */
-static void unaccount_all_log_buffers(struct btrfs_root *log)
-{
- struct btrfs_fs_info *fs_info = log->fs_info;
- u64 start = 0;
- u64 end;
-
- while (!find_first_extent_bit(&log->dirty_log_pages, start, &start, &end,
- EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT, NULL)) {
- u64 bytenr;
-
- for (bytenr = start; bytenr < end; bytenr += fs_info->nodesize) {
- unaccount_log_buffer(fs_info, bytenr);
- cond_resched();
- }
-
- start = end + 1;
- }
-
-}
-
 static void free_log_tree(struct btrfs_trans_handle *trans,
 struct btrfs_root *log)
 {
@@ -3435,12 +3409,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
 if (log->node) {
 ret = walk_log_tree(trans, log, &wc);
 if (ret) {
- if (trans) {
+ if (trans)
 btrfs_abort_transaction(trans, ret);
- } else {
+ else
 btrfs_handle_fs_error(log->fs_info, ret, NULL);
- unaccount_all_log_buffers(log);
- }
 }
 }
--
2.34.1.75.gabe6bb3905

From cfb654f2e2d9b22f51c0d6e8d1c6cc78028f7e85 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 4 Jan 2022 12:54:15 +0000
Subject: [PATCH 47/48] btrfs: fix log tree cleanup after a transaction abort

After the recent changes made by commit c2e39305299f01 ("btrfs: clear extent buffer uptodate when we fail to write it") and its followup fix, commit 651740a5024117 ("btrfs: check WRITE_ERR when trying to read an extent buffer"), we can now end up not cleaning up space reservations of log tree extent buffers after a transaction abort happens, as well as not cleaning up still dirty extent buffers.

This happens because if writeback for a log tree extent buffer failed, then we have cleared the EXTENT_BUFFER_UPTODATE bit from the extent buffer and we have also set the EXTENT_BUFFER_WRITE_ERR bit on it. Later on, when trying to free the log tree with free_log_tree(), which iterates over the tree, we can end up getting an -EIO error when trying to read a node or a leaf, since read_extent_buffer_pages() returns -EIO if an extent buffer does not have EXTENT_BUFFER_UPTODATE set and has the EXTENT_BUFFER_WRITE_ERR bit set. Getting that -EIO means that we return immediately, as we cannot iterate over the entire tree. In that case we never update the reserved space for an extent buffer in the respective block group and space_info object, as well as for any other extent buffers that we have not yet iterated over.

When this happens we get the following traces when unmounting the fs:

[174957.284509] BTRFS: error (device dm-0) in cleanup_transaction:1913: errno=-5 IO failure
[174957.286497] BTRFS: error (device dm-0) in free_log_tree:3420: errno=-5 IO failure
[174957.399379] ------------[ cut here ]------------
[174957.402497] WARNING: CPU: 2 PID: 3206883 at fs/btrfs/block-group.c:127 btrfs_put_block_group+0x77/0xb0 [btrfs]
[174957.407523] Modules linked in: btrfs overlay dm_zero (...)
[174957.424917] CPU: 2 PID: 3206883 Comm: umount Tainted: G W 5.16.0-rc5-btrfs-next-109 #1
[174957.426689] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
[174957.428716] RIP: 0010:btrfs_put_block_group+0x77/0xb0 [btrfs]
[174957.429717] Code: 21 48 8b bd (...)
[174957.432867] RSP: 0018:ffffb70d41cffdd0 EFLAGS: 00010206 [174957.433632] RAX: 0000000000000001 RBX: ffff8b09c3848000 RCX: ffff8b0758edd1c8 [174957.434689] RDX: 0000000000000001 RSI: ffffffffc0b467e7 RDI: ffff8b0758edd000 [174957.436068] RBP: ffff8b0758edd000 R08: 0000000000000000 R09: 0000000000000000 [174957.437114] R10: 0000000000000246 R11: 0000000000000000 R12: ffff8b09c3848148 [174957.438140] R13: ffff8b09c3848198 R14: ffff8b0758edd188 R15: dead000000000100 [174957.439317] FS: 00007f328fb82800(0000) GS:ffff8b0a2d200000(0000) knlGS:0000000000000000 [174957.440402] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [174957.441164] CR2: 00007fff13563e98 CR3: 0000000404f4e005 CR4: 0000000000370ee0 [174957.442117] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [174957.443076] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [174957.443948] Call Trace: [174957.444264] [174957.444538] btrfs_free_block_groups+0x255/0x3c0 [btrfs] [174957.445238] close_ctree+0x301/0x357 [btrfs] [174957.445803] ? call_rcu+0x16c/0x290 [174957.446250] generic_shutdown_super+0x74/0x120 [174957.446832] kill_anon_super+0x14/0x30 [174957.447305] btrfs_kill_super+0x12/0x20 [btrfs] [174957.447890] deactivate_locked_super+0x31/0xa0 [174957.448440] cleanup_mnt+0x147/0x1c0 [174957.448888] task_work_run+0x5c/0xa0 [174957.449336] exit_to_user_mode_prepare+0x1e5/0x1f0 [174957.449934] syscall_exit_to_user_mode+0x16/0x40 [174957.450512] do_syscall_64+0x48/0xc0 [174957.450980] entry_SYSCALL_64_after_hwframe+0x44/0xae [174957.451605] RIP: 0033:0x7f328fdc4a97 [174957.452059] Code: 03 0c 00 f7 (...) [174957.454320] RSP: 002b:00007fff13564ec8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [174957.455262] RAX: 0000000000000000 RBX: 00007f328feea264 RCX: 00007f328fdc4a97 [174957.456131] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000560b8ae51dd0 [174957.457118] RBP: 0000560b8ae51ba0 R08: 0000000000000000 R09: 00007fff13563c40 [174957.458005] R10: 00007f328fe49fc0 R11: 0000000000000246 R12: 0000000000000000 [174957.459113] R13: 0000560b8ae51dd0 R14: 0000560b8ae51cb0 R15: 0000000000000000 [174957.460193] [174957.460534] irq event stamp: 0 [174957.461003] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [174957.461947] hardirqs last disabled at (0): [] copy_process+0x934/0x2040 [174957.463147] softirqs last enabled at (0): [] copy_process+0x934/0x2040 [174957.465116] softirqs last disabled at (0): [<0000000000000000>] 0x0 [174957.466323] ---[ end trace bc7ee0c490bce3af ]--- [174957.467282] ------------[ cut here ]------------ [174957.468184] WARNING: CPU: 2 PID: 3206883 at fs/btrfs/block-group.c:3976 btrfs_free_block_groups+0x330/0x3c0 [btrfs] [174957.470066] Modules linked in: btrfs overlay dm_zero (...) [174957.483137] CPU: 2 PID: 3206883 Comm: umount Tainted: G W 5.16.0-rc5-btrfs-next-109 #1 [174957.484691] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [174957.486853] RIP: 0010:btrfs_free_block_groups+0x330/0x3c0 [btrfs] [174957.488050] Code: 00 00 00 ad de (...) 
[174957.491479] RSP: 0018:ffffb70d41cffde0 EFLAGS: 00010206
[174957.492520] RAX: ffff8b08d79310b0 RBX: ffff8b09c3848000 RCX: 0000000000000000
[174957.493868] RDX: 0000000000000001 RSI: fffff443055ee600 RDI: ffffffffb1131846
[174957.495183] RBP: ffff8b08d79310b0 R08: 0000000000000000 R09: 0000000000000000
[174957.496580] R10: 0000000000000001 R11: 0000000000000000 R12: ffff8b08d7931000
[174957.498027] R13: ffff8b09c38492b0 R14: dead000000000122 R15: dead000000000100
[174957.499438] FS: 00007f328fb82800(0000) GS:ffff8b0a2d200000(0000) knlGS:0000000000000000
[174957.500990] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[174957.502117] CR2: 00007fff13563e98 CR3: 0000000404f4e005 CR4: 0000000000370ee0
[174957.503513] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[174957.504864] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[174957.506167] Call Trace:
[174957.506654]
[174957.507047] close_ctree+0x301/0x357 [btrfs]
[174957.507867] ? call_rcu+0x16c/0x290
[174957.508567] generic_shutdown_super+0x74/0x120
[174957.509447] kill_anon_super+0x14/0x30
[174957.510194] btrfs_kill_super+0x12/0x20 [btrfs]
[174957.511123] deactivate_locked_super+0x31/0xa0
[174957.511976] cleanup_mnt+0x147/0x1c0
[174957.512610] task_work_run+0x5c/0xa0
[174957.513309] exit_to_user_mode_prepare+0x1e5/0x1f0
[174957.514231] syscall_exit_to_user_mode+0x16/0x40
[174957.515069] do_syscall_64+0x48/0xc0
[174957.515718] entry_SYSCALL_64_after_hwframe+0x44/0xae
[174957.516688] RIP: 0033:0x7f328fdc4a97
[174957.517413] Code: 03 0c 00 f7 d8 (...)
[174957.521052] RSP: 002b:00007fff13564ec8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
[174957.522514] RAX: 0000000000000000 RBX: 00007f328feea264 RCX: 00007f328fdc4a97
[174957.523950] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000560b8ae51dd0
[174957.525375] RBP: 0000560b8ae51ba0 R08: 0000000000000000 R09: 00007fff13563c40
[174957.526763] R10: 00007f328fe49fc0 R11: 0000000000000246 R12: 0000000000000000
[174957.528058] R13: 0000560b8ae51dd0 R14: 0000560b8ae51cb0 R15: 0000000000000000
[174957.529404]
[174957.529843] irq event stamp: 0
[174957.530256] hardirqs last enabled at (0): [<0000000000000000>] 0x0
[174957.531061] hardirqs last disabled at (0): [] copy_process+0x934/0x2040
[174957.532075] softirqs last enabled at (0): [] copy_process+0x934/0x2040
[174957.533083] softirqs last disabled at (0): [<0000000000000000>] 0x0
[174957.533865] ---[ end trace bc7ee0c490bce3b0 ]---
[174957.534452] BTRFS info (device dm-0): space_info 4 has 1070841856 free, is not full
[174957.535404] BTRFS info (device dm-0): space_info total=1073741824, used=2785280, pinned=0, reserved=49152, may_use=0, readonly=65536 zone_unusable=0
[174957.537029] BTRFS info (device dm-0): global_block_rsv: size 0 reserved 0
[174957.537859] BTRFS info (device dm-0): trans_block_rsv: size 0 reserved 0
[174957.538697] BTRFS info (device dm-0): chunk_block_rsv: size 0 reserved 0
[174957.539552] BTRFS info (device dm-0): delayed_block_rsv: size 0 reserved 0
[174957.540403] BTRFS info (device dm-0): delayed_refs_rsv: size 0 reserved 0

This also means that in case we have log tree extent buffers that are still dirty, we can end up not cleaning them up in case we find an extent buffer with EXTENT_BUFFER_WRITE_ERR set on it, as in that case we have no way to iterate over the rest of the tree.

This issue is very often triggered with test cases generic/475 and generic/648 from fstests.
So instead of trying to iterate a log tree when freeing it after a transaction abort, iterate over the io tree that tracks the ranges of the log tree's extent buffers, and unaccount each extent buffer's range as well as clear any extent buffers that are still dirty. Also because when we trigger writeback of log tree extent buffers we clear their range from the ->dirty_log_pages io tree, we need to make sure the range is kept but with a different bit, so that we can find it if we ever run into a transaction abort - so when triggering the writeback, instead of clearing the range, convert it to the EXTENT_UPTODATE bit. As a final note, while the commits mentioned above make this issue easy to trigger with tests that use the device mapper to simulate device errors, the issue could happen before, it was just much less likely. For example after writing an extent buffer of a log tree, it could be evicted from memory due to memory pressure, and then later during transaction commit while cleaning up the log tree through walk_log_tree() we get an -EIO when trying to read the extent buffer from disk - resulting in cleanup of the remaining log tree extent buffers not being made. Signed-off-by: Filipe Manana --- fs/btrfs/ctree.c | 40 ++++++--- fs/btrfs/transaction.c | 23 +++++- fs/btrfs/tree-log.c | 180 +++++++++++++++++++++++++---------------- fs/btrfs/tree-log.h | 7 ++ 4 files changed, 166 insertions(+), 84 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4d745e3..a80474a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -15,6 +15,7 @@ #include "volumes.h" #include "qgroup.h" #include "tree-mod-log.h" +#include "tree-log.h" static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); @@ -111,6 +112,30 @@ noinline void btrfs_release_path(struct btrfs_path *p) } } +static void delete_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *eb, + u64 parent, + bool is_last_ref) +{ + const u64 root_id = btrfs_root_id(root); + + btrfs_free_tree_block(trans, root_id, eb, parent, is_last_ref); + /* + * If we are deleting a block from a log tree, then delete its range from + * the io tree that tracks the blocks. This is only to ensure that if a + * transaction abort happens, we are able to do proper cleanup of space + * reservations, because we may not be able to iterate over the log tree + * in case he had a writeback failure for a log tree node - so we rely on + * the io tree to figure out the range of each log tree block. We ignore + * any error from clear_extent_bits() because it's not common and it's + * not critical either. + */ + if (root_id == BTRFS_TREE_LOG_OBJECTID) + clear_extent_bits(&root->dirty_log_pages, eb->start, + eb->start + eb->len - 1, BTRFS_LOG_PAGES_BITS); +} + /* * safely gets a reference on the root node of a tree. 
A lock * is not taken, so a concurrent writer may put a different node @@ -462,8 +487,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, BUG_ON(ret < 0); rcu_assign_pointer(root->node, cow); - btrfs_free_tree_block(trans, btrfs_root_id(root), buf, - parent_start, last_ref); + delete_tree_block(trans, root, buf, parent_start, last_ref); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -484,8 +508,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return ret; } } - btrfs_free_tree_block(trans, btrfs_root_id(root), buf, - parent_start, last_ref); + delete_tree_block(trans, root, buf, parent_start, last_ref); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -926,7 +949,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, free_extent_buffer(mid); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); + delete_tree_block(trans, root, mid, 0, true); /* once for the root ptr */ free_extent_buffer_stale(mid); return 0; @@ -985,8 +1008,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(right); del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); - btrfs_free_tree_block(trans, btrfs_root_id(root), right, - 0, 1); + delete_tree_block(trans, root, right, 0, true); free_extent_buffer_stale(right); right = NULL; } else { @@ -1031,7 +1053,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(mid); del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); + delete_tree_block(trans, root, mid, 0, true); free_extent_buffer_stale(mid); mid = NULL; } else { @@ -4032,7 +4054,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); atomic_inc(&leaf->refs); - btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); + delete_tree_block(trans, root, leaf, 0, true); free_extent_buffer_stale(leaf); } /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 1c3a118..f4e7dd6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1075,7 +1075,8 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, * on all the pages and clear them from the dirty pages state tree */ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, - struct extent_io_tree *dirty_pages) + struct extent_io_tree *dirty_pages, + bool is_log_tree) { int err = 0; int werr = 0; @@ -1093,8 +1094,22 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, * after committing the log because the tree can be accessed * concurrently - we do it only at transaction commit time when * it's safe to do it (through extent_io_tree_release()). + * + * For a log tree, we convert the range bit so that we know + * about the range of log tree extent buffers even after they + * were written, so that if a transaction abort happens we + * know about the logical bytenr of the extents and can free + * them up, releasing reserved space in their block groups and + * in the metadata space_info. Ignore any errors in this case, + * we have no way to handle them, if they happen they are + * harmless, they only result in some warnings during unmount. 
*/ - err = clear_extent_bit(dirty_pages, start, end, + if (is_log_tree) + convert_extent_bit(dirty_pages, start, end, + EXTENT_UPTODATE, EXTENT_NEED_WAIT, + &cached_state); + else + err = clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 0, 0, &cached_state); if (err == -ENOMEM) err = 0; @@ -1118,7 +1133,7 @@ static int btrfs_wait_extents(struct btrfs_fs_info *fs_info, bool errors = false; int err; - err = __btrfs_wait_marked_extents(fs_info, dirty_pages); + err = __btrfs_wait_marked_extents(fs_info, dirty_pages, false); if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags)) errors = true; @@ -1136,7 +1151,7 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); - err = __btrfs_wait_marked_extents(fs_info, dirty_pages); + err = __btrfs_wait_marked_extents(fs_info, dirty_pages, true); if ((mark & EXTENT_DIRTY) && test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags)) errors = true; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5f234c3..48fa671 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2755,6 +2755,28 @@ static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) btrfs_put_block_group(cache); } +static int release_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct extent_buffer *eb) +{ + btrfs_tree_lock(eb); + btrfs_clean_tree_block(eb); + btrfs_wait_tree_block_writeback(eb); + btrfs_tree_unlock(eb); + + /* + * We ignore errors here on purpose. They should be rare, and when they + * do happen nothing really serious follows: we just trigger some + * warnings on unmount due to releasing reserved space twice for an + * extent buffer - and only in the transaction abort case, so it's + * very rare for this to happen.
+ */ + clear_extent_bits(&log->dirty_log_pages, eb->start, + eb->start + eb->len - 1, BTRFS_LOG_PAGES_BITS); + + return btrfs_pin_reserved_extent(trans, eb->start, eb->len); +} + static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int *level, @@ -2765,7 +2787,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, u64 ptr_gen; struct extent_buffer *next; struct extent_buffer *cur; - u32 blocksize; int ret = 0; while (*level > 0) { @@ -2782,7 +2803,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(cur, path->slots[*level]); ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); - blocksize = fs_info->nodesize; next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_header_owner(cur), @@ -2807,24 +2827,12 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return ret; } - if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); - ret = btrfs_pin_reserved_extent(trans, - bytenr, blocksize); - if (ret) { - free_extent_buffer(next); - return ret; - } - btrfs_redirty_list_add( - trans->transaction, next); - } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - clear_extent_buffer_dirty(next); - unaccount_log_buffer(fs_info, bytenr); + ret = release_tree_block(trans, root, next); + if (ret) { + free_extent_buffer(next); + return ret; } + btrfs_redirty_list_add(trans->transaction, next); } free_extent_buffer(next); continue; @@ -2853,7 +2861,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, struct btrfs_path *path, int *level, struct walk_control *wc) { - struct btrfs_fs_info *fs_info = root->fs_info; int i; int slot; int ret; @@ -2876,26 +2883,10 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, struct extent_buffer *next; next = path->nodes[*level]; - - if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); - ret = btrfs_pin_reserved_extent(trans, - path->nodes[*level]->start, - path->nodes[*level]->len); - if (ret) - return ret; - btrfs_redirty_list_add(trans->transaction, - next); - } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - clear_extent_buffer_dirty(next); - - unaccount_log_buffer(fs_info, - path->nodes[*level]->start); - } + ret = release_tree_block(trans, root, next); + if (ret) + return ret; + btrfs_redirty_list_add(trans->transaction, next); } free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; @@ -2913,13 +2904,14 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, static int walk_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct walk_control *wc) { - struct btrfs_fs_info *fs_info = log->fs_info; int ret = 0; int wret; int level; struct btrfs_path *path; int orig_level; + ASSERT(trans != NULL); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2959,22 +2951,10 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, struct extent_buffer *next; next = path->nodes[orig_level]; - - if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); - ret = btrfs_pin_reserved_extent(trans, - next->start, next->len); - if (ret) - goto out; - 
btrfs_redirty_list_add(trans->transaction, next); - } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - clear_extent_buffer_dirty(next); - unaccount_log_buffer(fs_info, next->start); - } + ret = release_tree_block(trans, log, next); + if (ret) + goto out; + btrfs_redirty_list_add(trans->transaction, next); } } @@ -3397,35 +3377,93 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, return ret; } +/* + * When cleaning up an aborted transaction we can't iterate the log tree because + * in case writeback failed for one of its extent buffers, we won't be able to + * read it - we'll get -EIO from btrfs_read_buffer(). So we instead iterate over + * the log tree's ->dirty_log_pages io tree. + */ +static void unaccount_all_log_buffers(struct btrfs_root *log) +{ + u64 start = 0; + u64 end; + + while (!find_first_extent_bit(&log->dirty_log_pages, start, &start, &end, + BTRFS_LOG_PAGES_BITS, NULL)) { + struct btrfs_fs_info *fs_info = log->fs_info; + + for (; start < end; start += fs_info->nodesize) { + struct extent_buffer *eb; + + cond_resched(); + + unaccount_log_buffer(fs_info, start); + eb = find_extent_buffer(fs_info, start); + /* + * eb can be NULL in case it was already written and + * evicted from memory. + */ + if (!eb) + continue; + + wait_on_extent_buffer_writeback(eb); + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) + clear_extent_buffer_dirty(eb); + free_extent_buffer(eb); + } + + start = end + 1; + } +} + static void free_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *log) { - int ret; - struct walk_control wc = { - .free = 1, - .process_func = process_one_buffer - }; + if (trans && log->node) { + int ret; + struct walk_control wc = { + .free = 1, + .process_func = process_one_buffer + }; - if (log->node) { ret = walk_log_tree(trans, log, &wc); if (ret) { - if (trans) - btrfs_abort_transaction(trans, ret); - else - btrfs_handle_fs_error(log->fs_info, ret, NULL); + btrfs_abort_transaction(trans, ret); + /* + * We may have failed iterating the whole tree, so we + * fall back to the transaction abort cleanup path. + */ + unaccount_all_log_buffers(log); } + } else if (!trans) { + unaccount_all_log_buffers(log); } - clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, - EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); + /* + * Clean up every state record. When walking the log tree we clean the + * range for each extent buffer as we clean it up, but that might fail + * with -ENOMEM due to extent state record splits and we ignore such + * errors during the walk - it's rare but it could happen. By clearing + * the whole range here we don't need to have such state splits and + * allocations, so this way it's guaranteed to always succeed. So we + * almost always end up here with an empty io tree, except for the + * transaction abort case (when @trans is NULL). + */ + clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, BTRFS_LOG_PAGES_BITS); extent_io_tree_release(&log->log_csum_range); btrfs_put_root(log); } /* - * free all the extents used by the tree log. This should be called - * at commit time of the full transaction + * Free all the extents used by a log tree. + * + * @trans: A transaction handle or NULL. + * @root: The parent root of a log tree. + * + * This should be called either at commit time of the full transaction, or when + * cleaning up a transaction that was aborted. In the former case @trans is not + * NULL, while in the latter case @trans must be NULL.
*/ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) { diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index f6811c3..d11a16a 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -12,6 +12,13 @@ /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 +/* + * All the bits we can set for an extent buffer's range in a log root's + * ->dirty_log_pages io tree. + */ +#define BTRFS_LOG_PAGES_BITS (EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT | \ + EXTENT_UPTODATE) + struct btrfs_log_ctx { int log_ret; int log_transid; -- 2.34.1.75.gabe6bb3905 From ac1dab8adf6ce2c05bb9693dd82dddd23bc38c75 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 4 Jan 2022 12:53:41 +0000 Subject: [PATCH 48/48] btrfs: remove write and wait of struct walk_control The ->write and ->wait fields of struct walk_control, used for log trees, are not used since 2008, more specifically since commit d0c803c4049c5c ("Btrfs: Record dirty pages tree-log pages in an extent_io tree"). So just remove them, along with the function btrfs_write_tree_block(), which is also not used anymore after removing the ->write member. Signed-off-by: Filipe Manana --- fs/btrfs/tree-log.c | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 48fa671..563053b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -269,12 +269,6 @@ void btrfs_end_log_trans(struct btrfs_root *root) } } -static int btrfs_write_tree_block(struct extent_buffer *buf) -{ - return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, - buf->start + buf->len - 1); -} - static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { filemap_fdatawait_range(buf->pages[0]->mapping, @@ -293,16 +287,6 @@ struct walk_control { */ int free; - /* should we write out the extent buffer? This is used - * while flushing the log tree to disk during a sync - */ - int write; - - /* should we wait for the extent buffer io to finish? Also used - * while flushing the log tree to disk for a sync - */ - int wait; - /* pin only walk, we record which extents on disk belong to the * log trees */ @@ -353,17 +337,15 @@ static int process_one_buffer(struct btrfs_root *log, return ret; } - if (wc->pin) + if (wc->pin) { ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, eb->len); + if (ret) + return ret; - if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { - if (wc->pin && btrfs_header_level(eb) == 0) + if (btrfs_buffer_uptodate(eb, gen, 0) && + btrfs_header_level(eb) == 0) ret = btrfs_exclude_logged_extents(eb); - if (wc->write) - btrfs_write_tree_block(eb); - if (wc->wait) - btrfs_wait_tree_block_writeback(eb); } return ret; } -- 2.34.1.75.gabe6bb3905
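Editor's aside on the ->dirty_log_pages bookkeeping used by the log tree abort-cleanup patch above: the scheme is easier to see in isolation. The following is a minimal, self-contained userspace C sketch of the idea - it is not kernel code, and all names in it (set_range(), convert_range(), unaccount_all(), BIT_DIRTY, BIT_UPTODATE) are hypothetical stand-ins for the extent io tree API. It models the three moves the patch makes: record each log tree block's range when the block is dirtied, convert the DIRTY bit to UPTODATE on writeback instead of clearing the range, and on a simulated transaction abort walk every tracked range in nodesize steps to unaccount each block.

#include <stdio.h>
#include <stdint.h>

#define NODESIZE 16384ULL
#define MAX_RANGES 64

/* Bits modeled loosely after EXTENT_DIRTY / EXTENT_UPTODATE. */
enum { BIT_DIRTY = 1 << 0, BIT_UPTODATE = 1 << 1 };
#define LOG_PAGES_BITS (BIT_DIRTY | BIT_UPTODATE)

/* One tracked range of log tree blocks; end is exclusive in this model. */
struct range {
	uint64_t start;
	uint64_t end;
	unsigned int bits;
};

static struct range ranges[MAX_RANGES];
static int nr_ranges;

/* Record a block's range when it is allocated and dirtied for the log tree. */
static void set_range(uint64_t start, uint64_t end, unsigned int bits)
{
	ranges[nr_ranges].start = start;
	ranges[nr_ranges].end = end;
	ranges[nr_ranges].bits = bits;
	nr_ranges++;
}

/*
 * On writeback completion: convert the bit instead of clearing the range,
 * so a later transaction abort can still find the block.
 */
static void convert_range(uint64_t start, unsigned int from, unsigned int to)
{
	for (int i = 0; i < nr_ranges; i++) {
		if (ranges[i].start == start && (ranges[i].bits & from))
			ranges[i].bits = (ranges[i].bits & ~from) | to;
	}
}

/*
 * Abort path: iterate every range that still has any tracked bit set and
 * "unaccount" each nodesize block in it, whether or not it was written back.
 */
static void unaccount_all(void)
{
	for (int i = 0; i < nr_ranges; i++) {
		if (!(ranges[i].bits & LOG_PAGES_BITS))
			continue;
		for (uint64_t cur = ranges[i].start; cur < ranges[i].end;
		     cur += NODESIZE)
			printf("unaccount log block at %llu\n",
			       (unsigned long long)cur);
		ranges[i].bits = 0;
	}
}

int main(void)
{
	set_range(0, NODESIZE, BIT_DIRTY);            /* block still dirty */
	set_range(NODESIZE, 2 * NODESIZE, BIT_DIRTY); /* will be written back */
	convert_range(NODESIZE, BIT_DIRTY, BIT_UPTODATE);

	/* Simulated transaction abort: both blocks are still discoverable. */
	unaccount_all();
	return 0;
}

Compiled and run, the sketch prints one "unaccount" line per block, including the block that was already written back - which is exactly the property the EXTENT_UPTODATE conversion buys in the real patch: the abort path never needs to read the tree from disk to find what to release.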