Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs

Pull btrfs updates from Chris Mason:
 "This pull is mostly cleanups and fixes:

   - The raid5/6 cleanups from Zhao Lei fixup some long standing warts
     in the code and add improvements on top of the scrubbing support
     from 3.19.

   - Josef has round one of our ENOSPC fixes coming from large btrfs
     clusters here at FB.

   - Dave Sterba continues a long series of cleanups (thanks Dave), and
     Filipe continues hammering on corner cases in fsync and others

  This all was held up a little trying to track down a use-after-free in
  btrfs raid5/6.  It's not clear yet if this is just made easier to
  trigger with this pull or if its a new bug from the raid5/6 cleanups.
  Dave Sterba is the only one to trigger it so far, but he has a
  consistent way to reproduce, so we'll get it nailed shortly"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (68 commits)
  Btrfs: don't remove extents and xattrs when logging new names
  Btrfs: fix fsync data loss after adding hard link to inode
  Btrfs: fix BUG_ON in btrfs_orphan_add() when delete unused block group
  Btrfs: account for large extents with enospc
  Btrfs: don't set and clear delalloc for O_DIRECT writes
  Btrfs: only adjust outstanding_extents when we do a short write
  btrfs: Fix out-of-space bug
  Btrfs: scrub, fix sleep in atomic context
  Btrfs: fix scheduler warning when syncing log
  Btrfs: Remove unnecessary placeholder in btrfs_err_code
  btrfs: cleanup init for list in free-space-cache
  btrfs: delete chunk allocation attemp when setting block group ro
  btrfs: clear bio reference after submit_one_bio()
  Btrfs: fix scrub race leading to use-after-free
  Btrfs: add missing cleanup on sysfs init failure
  Btrfs: fix race between transaction commit and empty block group removal
  btrfs: add more checks to btrfs_read_sys_array
  btrfs: cleanup, rename a few variables in btrfs_read_sys_array
  btrfs: add checks for sys_chunk_array sizes
  btrfs: more superblock checks, lower bounds on devices and sectorsize/nodesize
  ...
This commit is contained in:
Linus Torvalds 2015-02-19 14:36:00 -08:00
commit 2b9fb532d4
34 changed files with 1065 additions and 863 deletions

View file

@ -453,11 +453,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
insert:
btrfs_release_path(path);
/* try to insert the key into the destination tree */
path->skip_release_on_error = 1;
ret = btrfs_insert_empty_item(trans, root, path,
key, item_size);
path->skip_release_on_error = 0;
/* make sure any existing item is the correct size */
if (ret == -EEXIST) {
if (ret == -EEXIST || ret == -EOVERFLOW) {
u32 found_size;
found_size = btrfs_item_size_nr(path->nodes[0],
path->slots[0]);
@ -488,8 +490,20 @@ insert:
src_item = (struct btrfs_inode_item *)src_ptr;
dst_item = (struct btrfs_inode_item *)dst_ptr;
if (btrfs_inode_generation(eb, src_item) == 0)
if (btrfs_inode_generation(eb, src_item) == 0) {
struct extent_buffer *dst_eb = path->nodes[0];
if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
struct btrfs_map_token token;
u64 ino_size = btrfs_inode_size(eb, src_item);
btrfs_init_map_token(&token);
btrfs_set_token_inode_size(dst_eb, dst_item,
ino_size, &token);
}
goto no_copy;
}
if (overwrite_root &&
S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
@ -844,7 +858,7 @@ out:
static noinline int backref_in_log(struct btrfs_root *log,
struct btrfs_key *key,
u64 ref_objectid,
char *name, int namelen)
const char *name, int namelen)
{
struct btrfs_path *path;
struct btrfs_inode_ref *ref;
@ -1254,13 +1268,14 @@ out:
}
static int insert_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset)
struct btrfs_root *root, u64 ino)
{
int ret;
ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID,
offset, BTRFS_ORPHAN_ITEM_KEY, NULL);
if (ret > 0)
ret = btrfs_insert_orphan_item(trans, root, offset);
ret = btrfs_insert_orphan_item(trans, root, ino);
if (ret == -EEXIST)
ret = 0;
return ret;
}
@ -1287,6 +1302,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
cur_offset = 0;
while (cur_offset < item_size) {
extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
@ -1302,7 +1318,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
}
btrfs_release_path(path);
if (ret < 0)
if (ret < 0 && ret != -ENOENT)
return ret;
return nlink;
}
@ -1394,9 +1410,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
nlink = ret;
ret = count_inode_extrefs(root, inode, path);
if (ret == -ENOENT)
ret = 0;
if (ret < 0)
goto out;
@ -1556,6 +1569,30 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
return ret;
}
/*
* Return true if an inode reference exists in the log for the given name,
* inode and parent inode.
*/
static bool name_in_log_ref(struct btrfs_root *log_root,
const char *name, const int name_len,
const u64 dirid, const u64 ino)
{
struct btrfs_key search_key;
search_key.objectid = ino;
search_key.type = BTRFS_INODE_REF_KEY;
search_key.offset = dirid;
if (backref_in_log(log_root, &search_key, dirid, name, name_len))
return true;
search_key.type = BTRFS_INODE_EXTREF_KEY;
search_key.offset = btrfs_extref_hash(dirid, name, name_len);
if (backref_in_log(log_root, &search_key, dirid, name, name_len))
return true;
return false;
}
/*
* take a single entry in a log directory item and replay it into
* the subvolume.
@ -1666,10 +1703,17 @@ out:
return ret;
insert:
if (name_in_log_ref(root->log_root, name, name_len,
key->objectid, log_key.objectid)) {
/* The dentry will be added later. */
ret = 0;
update_size = false;
goto out;
}
btrfs_release_path(path);
ret = insert_one_name(trans, root, path, key->objectid, key->offset,
name, name_len, log_type, &log_key);
if (ret && ret != -ENOENT)
if (ret && ret != -ENOENT && ret != -EEXIST)
goto out;
update_size = false;
ret = 0;
@ -2164,7 +2208,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
parent = path->nodes[*level];
root_owner = btrfs_header_owner(parent);
next = btrfs_find_create_tree_block(root, bytenr, blocksize);
next = btrfs_find_create_tree_block(root, bytenr);
if (!next)
return -ENOMEM;
@ -2416,8 +2460,8 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
mutex_unlock(&root->log_mutex);
if (atomic_read(&root->log_writers))
schedule();
mutex_lock(&root->log_mutex);
finish_wait(&root->log_writer_wait, &wait);
mutex_lock(&root->log_mutex);
}
}
@ -3219,7 +3263,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
static void fill_inode_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf,
struct btrfs_inode_item *item,
struct inode *inode, int log_inode_only)
struct inode *inode, int log_inode_only,
u64 logged_isize)
{
struct btrfs_map_token token;
@ -3232,7 +3277,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
* to say 'update this inode with these values'
*/
btrfs_set_token_inode_generation(leaf, item, 0, &token);
btrfs_set_token_inode_size(leaf, item, 0, &token);
btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
} else {
btrfs_set_token_inode_generation(leaf, item,
BTRFS_I(inode)->generation,
@ -3245,19 +3290,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
btrfs_set_token_timespec_sec(leaf, &item->atime,
inode->i_atime.tv_sec, &token);
btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
btrfs_set_token_timespec_nsec(leaf, &item->atime,
inode->i_atime.tv_nsec, &token);
btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
btrfs_set_token_timespec_sec(leaf, &item->mtime,
inode->i_mtime.tv_sec, &token);
btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
btrfs_set_token_timespec_nsec(leaf, &item->mtime,
inode->i_mtime.tv_nsec, &token);
btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
btrfs_set_token_timespec_sec(leaf, &item->ctime,
inode->i_ctime.tv_sec, &token);
btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
btrfs_set_token_timespec_nsec(leaf, &item->ctime,
inode->i_ctime.tv_nsec, &token);
btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
@ -3284,7 +3329,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
return ret;
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0);
btrfs_release_path(path);
return 0;
}
@ -3293,7 +3338,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_path *dst_path,
struct btrfs_path *src_path, u64 *last_extent,
int start_slot, int nr, int inode_only)
int start_slot, int nr, int inode_only,
u64 logged_isize)
{
unsigned long src_offset;
unsigned long dst_offset;
@ -3350,7 +3396,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
dst_path->slots[0],
struct btrfs_inode_item);
fill_inode_item(trans, dst_path->nodes[0], inode_item,
inode, inode_only == LOG_INODE_EXISTS);
inode, inode_only == LOG_INODE_EXISTS,
logged_isize);
} else {
copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
src_offset, ins_sizes[i]);
@ -3902,6 +3949,33 @@ process:
return ret;
}
static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
struct btrfs_path *path, u64 *size_ret)
{
struct btrfs_key key;
int ret;
key.objectid = btrfs_ino(inode);
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
if (ret < 0) {
return ret;
} else if (ret > 0) {
*size_ret = i_size_read(inode);
} else {
struct btrfs_inode_item *item;
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
*size_ret = btrfs_inode_size(path->nodes[0], item);
}
btrfs_release_path(path);
return 0;
}
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@ -3939,6 +4013,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
bool fast_search = false;
u64 ino = btrfs_ino(inode);
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
u64 logged_isize = 0;
path = btrfs_alloc_path();
if (!path)
@ -3966,15 +4041,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
max_key.type = (u8)-1;
max_key.offset = (u64)-1;
/* Only run delayed items if we are a dir or a new file */
/*
* Only run delayed items if we are a dir or a new file.
* Otherwise commit the delayed inode only, which is needed in
* order for the log replay code to mark inodes for link count
* fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
*/
if (S_ISDIR(inode->i_mode) ||
BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
BTRFS_I(inode)->generation > root->fs_info->last_trans_committed)
ret = btrfs_commit_inode_delayed_items(trans, inode);
if (ret) {
btrfs_free_path(path);
btrfs_free_path(dst_path);
return ret;
}
else
ret = btrfs_commit_inode_delayed_inode(inode);
if (ret) {
btrfs_free_path(path);
btrfs_free_path(dst_path);
return ret;
}
mutex_lock(&BTRFS_I(inode)->log_mutex);
@ -3988,22 +4070,56 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (S_ISDIR(inode->i_mode)) {
int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
if (inode_only == LOG_INODE_EXISTS)
max_key_type = BTRFS_XATTR_ITEM_KEY;
if (inode_only == LOG_INODE_EXISTS) {
max_key_type = BTRFS_INODE_EXTREF_KEY;
max_key.type = max_key_type;
}
ret = drop_objectid_items(trans, log, path, ino, max_key_type);
} else {
if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags)) {
clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&BTRFS_I(inode)->runtime_flags);
ret = btrfs_truncate_inode_items(trans, log,
inode, 0, 0);
} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&BTRFS_I(inode)->runtime_flags) ||
if (inode_only == LOG_INODE_EXISTS) {
/*
* Make sure the new inode item we write to the log has
* the same isize as the current one (if it exists).
* This is necessary to prevent data loss after log
* replay, and also to prevent doing a wrong expanding
* truncate - for e.g. create file, write 4K into offset
* 0, fsync, write 4K into offset 4096, add hard link,
* fsync some other file (to sync log), power fail - if
* we use the inode's current i_size, after log replay
* we get a 8Kb file, with the last 4Kb extent as a hole
* (zeroes), as if an expanding truncate happened,
* instead of getting a file of 4Kb only.
*/
err = logged_inode_size(log, inode, path,
&logged_isize);
if (err)
goto out_unlock;
}
if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags)) {
if (inode_only == LOG_INODE_EXISTS) {
max_key.type = BTRFS_INODE_EXTREF_KEY;
ret = drop_objectid_items(trans, log, path, ino,
max_key.type);
} else {
clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&BTRFS_I(inode)->runtime_flags);
ret = btrfs_truncate_inode_items(trans, log,
inode, 0, 0);
}
} else if (test_bit(BTRFS_INODE_COPY_EVERYTHING,
&BTRFS_I(inode)->runtime_flags) ||
inode_only == LOG_INODE_EXISTS) {
if (inode_only == LOG_INODE_ALL)
if (inode_only == LOG_INODE_ALL) {
clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&BTRFS_I(inode)->runtime_flags);
fast_search = true;
max_key.type = BTRFS_XATTR_ITEM_KEY;
max_key.type = BTRFS_XATTR_ITEM_KEY;
} else {
max_key.type = BTRFS_INODE_EXTREF_KEY;
}
ret = drop_objectid_items(trans, log, path, ino,
max_key.type);
} else {
@ -4047,7 +4163,8 @@ again:
}
ret = copy_items(trans, inode, dst_path, path, &last_extent,
ins_start_slot, ins_nr, inode_only);
ins_start_slot, ins_nr, inode_only,
logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
@ -4071,7 +4188,7 @@ next_slot:
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path,
&last_extent, ins_start_slot,
ins_nr, inode_only);
ins_nr, inode_only, logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
@ -4092,7 +4209,8 @@ next_slot:
}
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path, &last_extent,
ins_start_slot, ins_nr, inode_only);
ins_start_slot, ins_nr, inode_only,
logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
@ -4273,6 +4391,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct dentry *old_parent = NULL;
int ret = 0;
u64 last_committed = root->fs_info->last_trans_committed;
const struct dentry * const first_parent = parent;
const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
last_committed);
sb = inode->i_sb;
@ -4328,7 +4449,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
}
inode_only = LOG_INODE_EXISTS;
while (1) {
if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
break;
@ -4337,8 +4457,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (root != BTRFS_I(inode)->root)
break;
/*
* On unlink we must make sure our immediate parent directory
* inode is fully logged. This is to prevent leaving dangling
* directory index entries and a wrong directory inode's i_size.
* Not doing so can result in a directory being impossible to
* delete after log replay (rmdir will always fail with error
* -ENOTEMPTY).
*/
if (did_unlink && parent == first_parent)
inode_only = LOG_INODE_ALL;
else
inode_only = LOG_INODE_EXISTS;
if (BTRFS_I(inode)->generation >
root->fs_info->last_trans_committed) {
root->fs_info->last_trans_committed ||
inode_only == LOG_INODE_ALL) {
ret = btrfs_log_inode(trans, root, inode, inode_only,
0, LLONG_MAX, ctx);
if (ret)