From a315e68f6e8b3006c29482dbfc4d928f098c449c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 6 Mar 2017 23:04:20 +0000 Subject: Btrfs: fix invalid attempt to free reserved space on failure to cow range When attempting to COW a file range (we are starting writeback and doing COW), if we manage to reserve an extent for the range we will write into but fail after reserving it and before creating the respective ordered extent, we end up in an error path where we attempt to decrement the data space's bytes_may_use counter after we already did it while reserving the extent, leading to a warning/trace like the following: [ 847.621524] ------------[ cut here ]------------ [ 847.625441] WARNING: CPU: 5 PID: 4905 at fs/btrfs/extent-tree.c:4316 btrfs_free_reserved_data_space_noquota+0x60/0x9f [btrfs] [ 847.633704] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq i2c_piix4 ppdev psmouse tpm_tis serio_raw pcspkr parport_pc tpm_tis_core i2c_core sg [ 847.644616] CPU: 5 PID: 4905 Comm: xfs_io Not tainted 4.10.0-rc8-btrfs-next-37+ #2 [ 847.648601] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014 [ 847.648601] Call Trace: [ 847.648601] dump_stack+0x67/0x90 [ 847.648601] __warn+0xc2/0xdd [ 847.648601] warn_slowpath_null+0x1d/0x1f [ 847.648601] btrfs_free_reserved_data_space_noquota+0x60/0x9f [btrfs] [ 847.648601] btrfs_clear_bit_hook+0x140/0x258 [btrfs] [ 847.648601] clear_state_bit+0x87/0x128 [btrfs] [ 847.648601] __clear_extent_bit+0x222/0x2b7 [btrfs] [ 847.648601] clear_extent_bit+0x17/0x19 [btrfs] [ 847.648601] extent_clear_unlock_delalloc+0x3b/0x6b [btrfs] [ 847.648601] cow_file_range.isra.39+0x387/0x39a [btrfs] [ 847.648601] run_delalloc_nocow+0x4d7/0x70e [btrfs] [ 847.648601] ? arch_local_irq_save+0x9/0xc [ 847.648601] run_delalloc_range+0xa7/0x2b5 [btrfs] [ 847.648601] writepage_delalloc.isra.31+0xb9/0x15c [btrfs] [ 847.648601] __extent_writepage+0x249/0x2e8 [btrfs] [ 847.648601] extent_write_cache_pages.constprop.33+0x28b/0x36c [btrfs] [ 847.648601] ? arch_local_irq_save+0x9/0xc [ 847.648601] ? mark_lock+0x24/0x201 [ 847.648601] extent_writepages+0x4b/0x5c [btrfs] [ 847.648601] ? btrfs_writepage_start_hook+0xed/0xed [btrfs] [ 847.648601] btrfs_writepages+0x28/0x2a [btrfs] [ 847.648601] do_writepages+0x23/0x2c [ 847.648601] __filemap_fdatawrite_range+0x5a/0x61 [ 847.648601] filemap_fdatawrite_range+0x13/0x15 [ 847.648601] btrfs_fdatawrite_range+0x20/0x46 [btrfs] [ 847.648601] start_ordered_ops+0x19/0x23 [btrfs] [ 847.648601] btrfs_sync_file+0x136/0x42c [btrfs] [ 847.648601] vfs_fsync_range+0x8c/0x9e [ 847.648601] vfs_fsync+0x1c/0x1e [ 847.648601] do_fsync+0x31/0x4a [ 847.648601] SyS_fsync+0x10/0x14 [ 847.648601] entry_SYSCALL_64_fastpath+0x18/0xad [ 847.648601] RIP: 0033:0x7f5b05200800 [ 847.648601] RSP: 002b:00007ffe204f71c8 EFLAGS: 00000246 ORIG_RAX: 000000000000004a [ 847.648601] RAX: ffffffffffffffda RBX: ffffffff8109637b RCX: 00007f5b05200800 [ 847.648601] RDX: 00000000008bd0a0 RSI: 00000000008bd2e0 RDI: 0000000000000003 [ 847.648601] RBP: ffffc90001d67f98 R08: 000000000000ffff R09: 000000000000001f [ 847.648601] R10: 00000000000001f6 R11: 0000000000000246 R12: 0000000000000046 [ 847.648601] R13: ffffc90001d67f78 R14: 00007f5b054be740 R15: 00007f5b054be740 [ 847.648601] ? trace_hardirqs_off_caller+0x3f/0xaa [ 847.685787] ---[ end trace 2a4a3e15382508e8 ]--- So fix this by not attempting to decrement the data space info's bytes_may_use counter if we already reserved the extent and an error happened before creating the ordered extent. We are already correctly freeing the reserved extent if an error happens, so there's no additional measure needed. Signed-off-by: Filipe Manana Reviewed-by: Liu Bo --- fs/btrfs/extent_io.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent_io.h') diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 3e4fad4a909d..48a30d0e71fb 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -14,7 +14,7 @@ #define EXTENT_DEFRAG (1U << 6) #define EXTENT_BOUNDARY (1U << 9) #define EXTENT_NODATASUM (1U << 10) -#define EXTENT_DO_ACCOUNTING (1U << 11) +#define EXTENT_CLEAR_META_RESV (1U << 11) #define EXTENT_FIRST_DELALLOC (1U << 12) #define EXTENT_NEED_WAIT (1U << 13) #define EXTENT_DAMAGED (1U << 14) @@ -22,6 +22,8 @@ #define EXTENT_QGROUP_RESERVED (1U << 16) #define EXTENT_CLEAR_DATA_RESV (1U << 17) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ + EXTENT_CLEAR_DATA_RESV) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) /* -- cgit From a7e3b975a0f9296162b72ac6ab7fad9631a07630 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 3 Apr 2017 10:45:46 +0100 Subject: Btrfs: fix reported number of inode blocks Currently when there are buffered writes that were not yet flushed and they fall within allocated ranges of the file (that is, not in holes or beyond eof assuming there are no prealloc extents beyond eof), btrfs simply reports an incorrect number of used blocks through the stat(2) system call (or any of its variants), regardless of mount options or inode flags (compress, compress-force, nodatacow). This is because the number of blocks used that is reported is based on the current number of bytes in the vfs inode plus the number of dealloc bytes in the btrfs inode. The later covers bytes that both fall within allocated regions of the file and holes. Example scenarios where the number of reported blocks is wrong while the buffered writes are not flushed: $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt/sdc $ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1 wrote 65536/65536 bytes at offset 0 64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec) $ sync $ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1 wrote 65536/65536 bytes at offset 0 64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec) # The following should have reported 64K... $ du -h /mnt/sdc/foo1 128K /mnt/sdc/foo1 $ sync # After flushing the buffered write, it now reports the correct value. $ du -h /mnt/sdc/foo1 64K /mnt/sdc/foo1 $ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2 wrote 65536/65536 bytes at offset 0 64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec) $ sync $ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2 wrote 65536/65536 bytes at offset 65536 64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec) # The following should have reported 128K... $ du -h /mnt/sdc/foo2 192K /mnt/sdc/foo2 $ sync # After flushing the buffered write, it now reports the correct value. $ du -h /mnt/sdc/foo2 128K /mnt/sdc/foo2 So the number of used file blocks is simply incorrect, unlike in other filesystems such as ext4 and xfs for example, but only while the buffered writes are not flushed. Fix this by tracking the number of delalloc bytes that fall within holes and beyond eof of a file, and use instead this new counter when reporting the number of used blocks for an inode. Another different problem that exists is that the delalloc bytes counter is reset when writeback starts (by clearing the EXTENT_DEALLOC flag from the respective range in the inode's iotree) and the vfs inode's bytes counter is only incremented when writeback finishes (through insert_reserved_file_extent()). Therefore while writeback is ongoing we simply report a wrong number of blocks used by an inode if the write operation covers a range previously unallocated. While this change does not fix this problem, it does minimizes it a lot by shortening that time window, as the new dealloc bytes counter (new_delalloc_bytes) is only decremented when writeback finishes right before updating the vfs inode's bytes counter. Fully fixing this second problem is not trivial and will be addressed later by a different patch. Signed-off-by: Filipe Manana --- fs/btrfs/extent_io.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/extent_io.h') diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 48a30d0e71fb..d5ff51b973a4 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -21,6 +21,7 @@ #define EXTENT_NORESERVE (1U << 15) #define EXTENT_QGROUP_RESERVED (1U << 16) #define EXTENT_CLEAR_DATA_RESV (1U << 17) +#define EXTENT_DELALLOC_NEW (1U << 18) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ EXTENT_CLEAR_DATA_RESV) -- cgit