Btrfs: add device counters for detected IO and checksum errors

The goal is to detect when drives start to show an increased error rate, i.e. when they should be replaced soon. To that end, statistics counters are added that count IO errors (read, write and flush). In addition, software-detected errors such as checksum errors and corrupted blocks are counted.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
2025-06-17 20:25:19 +00:00 · 2012-05-25 16:06:08 +02:00 · 2012-05-25 16:06:08 +02:00 · 442a4f6308
commit 442a4f6308
parent d07eb91170
6 changed files with 230 additions and 24 deletions
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2557,18 +2557,19 @@ recovery_tree_root:

 static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 {
-	char b[BDEVNAME_SIZE];
-
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
+		struct btrfs_device *device = (struct btrfs_device *)
+			bh->b_private;
+
 		printk_ratelimited(KERN_WARNING "lost page write due to "
-					"I/O error on %s\n",
-				       bdevname(bh->b_bdev, b));
+				   "I/O error on %s\n", device->name);
 		/* note, we dont' set_buffer_write_io_error because we have
 		 * our own ways of dealing with the IO errors
 		 */
 		clear_buffer_uptodate(bh);
+		btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
 	}
 	unlock_buffer(bh);
 	put_bh(bh);
@@ -2683,6 +2684,7 @@ static int write_dev_supers(struct btrfs_device *device,
 			set_buffer_uptodate(bh);
 			lock_buffer(bh);
 			bh->b_end_io = btrfs_end_buffer_write_sync;
+			bh->b_private = device;
 		}

 		/*
@@ -2741,6 +2743,9 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 		}
 		if (!bio_flagged(bio, BIO_UPTODATE)) {
 			ret = -EIO;
+			if (!bio_flagged(bio, BIO_EOPNOTSUPP))
+				btrfs_dev_stat_inc_and_print(device,
+					BTRFS_DEV_STAT_FLUSH_ERRS);
 		}

 		/* drop the reference from the wait == 0 run */