diff options
Diffstat (limited to 'fs/btrfs')
42 files changed, 11555 insertions, 4464 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index d2cf5a54a4b8..a35eb36b32fd 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,25 +1,10 @@ -ifneq ($(KERNELRELEASE),) -# kbuild part of makefile obj-$(CONFIG_BTRFS_FS) := btrfs.o -btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + +btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ - compression.o -else - -# Normal Makefile - -KERNELDIR := /lib/modules/`uname -r`/build -all: - $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules - -modules_install: - $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install -clean: - $(MAKE) -C $(KERNELDIR) M=`pwd` clean - -endif + export.o tree-log.o acl.o free-space-cache.o zlib.o \ + compression.o delayed-ref.o relocation.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 1d53b62dbba5..f128427b995b 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -29,46 +29,28 @@ #ifdef CONFIG_FS_POSIX_ACL -static void btrfs_update_cached_acl(struct inode *inode, - struct posix_acl **p_acl, - struct posix_acl *acl) -{ - spin_lock(&inode->i_lock); - if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED) - posix_acl_release(*p_acl); - *p_acl = posix_acl_dup(acl); - spin_unlock(&inode->i_lock); -} - static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) { int size; const char *name; char *value = NULL; - struct posix_acl *acl = NULL, **p_acl; + struct posix_acl *acl; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; - p_acl = &BTRFS_I(inode)->i_acl; break; case ACL_TYPE_DEFAULT: name = POSIX_ACL_XATTR_DEFAULT; - p_acl = &BTRFS_I(inode)->i_default_acl; break; default: - return ERR_PTR(-EINVAL); + BUG(); } - spin_lock(&inode->i_lock); - if (*p_acl != BTRFS_ACL_NOT_CACHED) - acl = posix_acl_dup(*p_acl); - spin_unlock(&inode->i_lock); - - if (acl) - return acl; - - size = __btrfs_getxattr(inode, name, "", 0); if (size > 0) { value = kzalloc(size, GFP_NOFS); @@ -77,12 +59,15 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) size = __btrfs_getxattr(inode, name, value, size); if (size > 0) { acl = posix_acl_from_xattr(value, size); - btrfs_update_cached_acl(inode, p_acl, acl); + set_cached_acl(inode, type, acl); } kfree(value); - } else if (size == -ENOENT) { + } else if (size == -ENOENT || size == -ENODATA || size == 0) { + /* FIXME, who returns -ENOENT? I think nobody */ acl = NULL; - btrfs_update_cached_acl(inode, p_acl, acl); + set_cached_acl(inode, type, acl); + } else { + acl = ERR_PTR(-EIO); } return acl; @@ -113,7 +98,6 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int ret, size = 0; const char *name; - struct posix_acl **p_acl; char *value = NULL; mode_t mode; @@ -133,13 +117,11 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) ret = 0; inode->i_mode = mode; name = POSIX_ACL_XATTR_ACCESS; - p_acl = &BTRFS_I(inode)->i_acl; break; case ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) return acl ? -EINVAL : 0; name = POSIX_ACL_XATTR_DEFAULT; - p_acl = &BTRFS_I(inode)->i_default_acl; break; default: return -EINVAL; @@ -164,7 +146,7 @@ out: kfree(value); if (!ret) - btrfs_update_cached_acl(inode, p_acl, acl); + set_cached_acl(inode, type, acl); return ret; } @@ -256,7 +238,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (IS_POSIXACL(dir) && acl) { @@ -343,9 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) return 0; } -int btrfs_check_acl(struct inode *inode, int mask) -{ - return 0; -} - #endif /* CONFIG_FS_POSIX_ACL */ diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index c84ca1f5259a..6e4f6c50a120 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -20,12 +20,12 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/freezer.h> -#include <linux/ftrace.h> #include "async-thread.h" #define WORK_QUEUED_BIT 0 #define WORK_DONE_BIT 1 #define WORK_ORDER_DONE_BIT 2 +#define WORK_HIGH_PRIO_BIT 3 /* * container for the kthread task pointer and the list of pending work @@ -37,6 +37,7 @@ struct btrfs_worker_thread { /* list of struct btrfs_work that are waiting for service */ struct list_head pending; + struct list_head prio_pending; /* list of worker threads from struct btrfs_workers */ struct list_head worker_list; @@ -104,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers, spin_lock_irqsave(&workers->lock, flags); - while (!list_empty(&workers->order_list)) { - work = list_entry(workers->order_list.next, - struct btrfs_work, order_list); - + while (1) { + if (!list_empty(&workers->prio_order_list)) { + work = list_entry(workers->prio_order_list.next, + struct btrfs_work, order_list); + } else if (!list_empty(&workers->order_list)) { + work = list_entry(workers->order_list.next, + struct btrfs_work, order_list); + } else { + break; + } if (!test_bit(WORK_DONE_BIT, &work->flags)) break; @@ -144,8 +151,14 @@ static int worker_loop(void *arg) do { spin_lock_irq(&worker->lock); again_locked: - while (!list_empty(&worker->pending)) { - cur = worker->pending.next; + while (1) { + if (!list_empty(&worker->prio_pending)) + cur = worker->prio_pending.next; + else if (!list_empty(&worker->pending)) + cur = worker->pending.next; + else + break; + work = list_entry(cur, struct btrfs_work, list); list_del(&work->list); clear_bit(WORK_QUEUED_BIT, &work->flags); @@ -164,7 +177,6 @@ again_locked: spin_lock_irq(&worker->lock); check_idle_worker(worker); - } if (freezing(current)) { worker->working = 0; @@ -179,7 +191,8 @@ again_locked: * jump_in? */ smp_mb(); - if (!list_empty(&worker->pending)) + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) continue; /* @@ -192,13 +205,18 @@ again_locked: */ schedule_timeout(1); smp_mb(); - if (!list_empty(&worker->pending)) + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) continue; + if (kthread_should_stop()) + break; + /* still no more work?, sleep for real */ spin_lock_irq(&worker->lock); set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&worker->pending)) + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) goto again_locked; /* @@ -208,7 +226,8 @@ again_locked: worker->working = 0; spin_unlock_irq(&worker->lock); - schedule(); + if (!kthread_should_stop()) + schedule(); } __set_current_state(TASK_RUNNING); } @@ -245,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) INIT_LIST_HEAD(&workers->worker_list); INIT_LIST_HEAD(&workers->idle_list); INIT_LIST_HEAD(&workers->order_list); + INIT_LIST_HEAD(&workers->prio_order_list); spin_lock_init(&workers->lock); workers->max_workers = max; workers->idle_thresh = 32; @@ -270,16 +290,17 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) } INIT_LIST_HEAD(&worker->pending); + INIT_LIST_HEAD(&worker->prio_pending); INIT_LIST_HEAD(&worker->worker_list); spin_lock_init(&worker->lock); atomic_set(&worker->num_pending, 0); + worker->workers = workers; worker->task = kthread_run(worker_loop, worker, "btrfs-%s-%d", workers->name, workers->num_workers + i); - worker->workers = workers; if (IS_ERR(worker->task)) { - kfree(worker); ret = PTR_ERR(worker->task); + kfree(worker); goto fail; } @@ -393,7 +414,10 @@ int btrfs_requeue_work(struct btrfs_work *work) goto out; spin_lock_irqsave(&worker->lock, flags); - list_add_tail(&work->list, &worker->pending); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) + list_add_tail(&work->list, &worker->prio_pending); + else + list_add_tail(&work->list, &worker->pending); atomic_inc(&worker->num_pending); /* by definition we're busy, take ourselves off the idle @@ -419,6 +443,11 @@ out: return 0; } +void btrfs_set_work_high_prio(struct btrfs_work *work) +{ + set_bit(WORK_HIGH_PRIO_BIT, &work->flags); +} + /* * places a struct btrfs_work into the pending queue of one of the kthreads */ @@ -435,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) worker = find_worker(workers); if (workers->ordered) { spin_lock_irqsave(&workers->lock, flags); - list_add_tail(&work->order_list, &workers->order_list); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { + list_add_tail(&work->order_list, + &workers->prio_order_list); + } else { + list_add_tail(&work->order_list, &workers->order_list); + } spin_unlock_irqrestore(&workers->lock, flags); } else { INIT_LIST_HEAD(&work->order_list); @@ -443,7 +477,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) spin_lock_irqsave(&worker->lock, flags); - list_add_tail(&work->list, &worker->pending); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) + list_add_tail(&work->list, &worker->prio_pending); + else + list_add_tail(&work->list, &worker->pending); atomic_inc(&worker->num_pending); check_busy_worker(worker); diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 31be4ed8b63e..1b511c109db6 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -85,6 +85,7 @@ struct btrfs_workers { * of work items waiting for completion */ struct list_head order_list; + struct list_head prio_order_list; /* lock for finding the next worker thread to queue on */ spinlock_t lock; @@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); int btrfs_stop_workers(struct btrfs_workers *workers); void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); int btrfs_requeue_work(struct btrfs_work *work); +void btrfs_set_work_high_prio(struct btrfs_work *work); #endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 72677ce2b74f..ea1ea0af8c0e 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -53,10 +53,6 @@ struct btrfs_inode { /* used to order data wrt metadata */ struct btrfs_ordered_inode_tree ordered_tree; - /* standard acl pointers */ - struct posix_acl *i_acl; - struct posix_acl *i_default_acl; - /* for keeping track of orphaned inodes */ struct list_head i_orphan; @@ -66,6 +62,15 @@ struct btrfs_inode { */ struct list_head delalloc_inodes; + /* + * list for tracking inodes that must be sent to disk before a + * rename or truncate commit + */ + struct list_head ordered_operations; + + /* node for the red-black tree that links inodes in subvolume root */ + struct rb_node rb_node; + /* the space_info for where this inode's data allocations are done */ struct btrfs_space_info *space_info; @@ -86,12 +91,6 @@ struct btrfs_inode { */ u64 logged_trans; - /* - * trans that last made a change that should be fully fsync'd. This - * gets reset to zero each time the inode is logged - */ - u64 log_dirty_trans; - /* total number of bytes pending delalloc, used by stat to calc the * real block usage of the file */ @@ -121,6 +120,25 @@ struct btrfs_inode { /* the start of block group preferred for allocations. */ u64 block_group; + /* the fsync log has some corner cases that mean we have to check + * directories to see if any unlinks have been done before + * the directory was logged. See tree-log.c for all the + * details + */ + u64 last_unlink_trans; + + /* + * ordered_data_close is set by truncate when a file that used + * to have good data has been truncated to zero. When it is set + * the btrfs file release call will add this inode to the + * ordered operations list so that we make sure to flush out any + * new data the application may have written before commit. + * + * yes, its silly to have a single bitflag, but we might grow more + * of these. + */ + unsigned ordered_data_close:1; + struct inode vfs_inode; }; @@ -135,5 +153,4 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size) BTRFS_I(inode)->disk_i_size = size; } - #endif diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ab07627084f1..de1e2fd32080 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -123,7 +123,7 @@ static int check_compressed_csum(struct inode *inode, u32 csum; u32 *cb_sum = &cb->sums; - if (btrfs_test_flag(inode, NODATASUM)) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) return 0; for (i = 0; i < cb->nr_pages; i++) { @@ -670,7 +670,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, */ atomic_inc(&cb->pending_bios); - if (!btrfs_test_flag(inode, NODATASUM)) { + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_lookup_bio_sums(root, inode, comp_bio, sums); } @@ -697,7 +697,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); BUG_ON(ret); - if (!btrfs_test_flag(inode, NODATASUM)) + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) btrfs_lookup_bio_sums(root, inode, comp_bio, sums); ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h deleted file mode 100644 index 6e1b3de36700..000000000000 --- a/fs/btrfs/crc32c.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef __BTRFS_CRC32C__ -#define __BTRFS_CRC32C__ -#include <linux/crc32c.h> - -/* - * this file used to do more for selecting the HW version of crc32c, - * perhaps it will one day again soon. - */ -#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length) -#endif - diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 37f31b5529aa..60a45f3a4e91 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -197,14 +197,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, u32 nritems; int ret = 0; int level; - struct btrfs_root *new_root; - - new_root = kmalloc(sizeof(*new_root), GFP_NOFS); - if (!new_root) - return -ENOMEM; - - memcpy(new_root, root, sizeof(*new_root)); - new_root->root_key.objectid = new_root_objectid; + struct btrfs_disk_key disk_key; WARN_ON(root->ref_cows && trans->transid != root->fs_info->running_transaction->transid); @@ -212,28 +205,37 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, level = btrfs_header_level(buf); nritems = btrfs_header_nritems(buf); + if (level == 0) + btrfs_item_key(buf, &disk_key, 0); + else + btrfs_node_key(buf, &disk_key, 0); - cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0, - new_root_objectid, trans->transid, - level, buf->start, 0); - if (IS_ERR(cow)) { - kfree(new_root); + cow = btrfs_alloc_free_block(trans, root, buf->len, 0, + new_root_objectid, &disk_key, level, + buf->start, 0); + if (IS_ERR(cow)) return PTR_ERR(cow); - } copy_extent_buffer(cow, buf, 0, 0, cow->len); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); - btrfs_set_header_owner(cow, new_root_objectid); - btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); + btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | + BTRFS_HEADER_FLAG_RELOC); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); + else + btrfs_set_header_owner(cow, new_root_objectid); write_extent_buffer(cow, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(cow), BTRFS_FSID_SIZE); WARN_ON(btrfs_header_generation(buf) > trans->transid); - ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL); - kfree(new_root); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); if (ret) return ret; @@ -244,6 +246,125 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, } /* + * check if the tree block can be shared by multiple trees + */ +int btrfs_block_can_be_shared(struct btrfs_root *root, + struct extent_buffer *buf) +{ + /* + * Tree blocks not in refernece counted trees and tree roots + * are never shared. If a block was allocated after the last + * snapshot and the block was not allocated by tree relocation, + * we know the block is not shared. + */ + if (root->ref_cows && + buf != root->node && buf != root->commit_root && + (btrfs_header_generation(buf) <= + btrfs_root_last_snapshot(&root->root_item) || + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + return 1; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (root->ref_cows && + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + return 1; +#endif + return 0; +} + +static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *cow) +{ + u64 refs; + u64 owner; + u64 flags; + u64 new_flags = 0; + int ret; + + /* + * Backrefs update rules: + * + * Always use full backrefs for extent pointers in tree block + * allocated by tree relocation. + * + * If a shared tree block is no longer referenced by its owner + * tree (btrfs_header_owner(buf) == root->root_key.objectid), + * use full backrefs for extent pointers in tree block. + * + * If a tree block is been relocating + * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID), + * use full backrefs for extent pointers in tree block. + * The reason for this is some operations (such as drop tree) + * are only allowed for blocks use full backrefs. + */ + + if (btrfs_block_can_be_shared(root, buf)) { + ret = btrfs_lookup_extent_info(trans, root, buf->start, + buf->len, &refs, &flags); + BUG_ON(ret); + BUG_ON(refs == 0); + } else { + refs = 1; + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; + else + flags = 0; + } + + owner = btrfs_header_owner(buf); + BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID && + !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + + if (refs > 1) { + if ((owner == root->root_key.objectid || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && + !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { + ret = btrfs_inc_ref(trans, root, buf, 1); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_dec_ref(trans, root, buf, 0); + BUG_ON(ret); + ret = btrfs_inc_ref(trans, root, cow, 1); + BUG_ON(ret); + } + new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else { + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + BUG_ON(ret); + } + if (new_flags != 0) { + ret = btrfs_set_disk_extent_flags(trans, root, + buf->start, + buf->len, + new_flags, 0); + BUG_ON(ret); + } + } else { + if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + BUG_ON(ret); + ret = btrfs_dec_ref(trans, root, buf, 1); + BUG_ON(ret); + } + clean_tree_block(trans, root, buf); + } + return 0; +} + +/* * does the dirty work in cow of a single block. The parent block (if * supplied) is updated to point to the new cow copy. The new buffer is marked * dirty and returned locked. If you modify the block it needs to be marked @@ -254,63 +375,47 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, * empty_size -- a hint that you plan on doing more cow. This is the size in * bytes the allocator should try to find free next to the block it returns. * This is just a hint and may be ignored by the allocator. - * - * prealloc_dest -- if you have already reserved a destination for the cow, - * this uses that block instead of allocating a new one. - * btrfs_alloc_reserved_extent is used to finish the allocation. */ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, - u64 search_start, u64 empty_size, - u64 prealloc_dest) + u64 search_start, u64 empty_size) { - u64 parent_start; + struct btrfs_disk_key disk_key; struct extent_buffer *cow; - u32 nritems; - int ret = 0; int level; int unlock_orig = 0; + u64 parent_start; if (*cow_ret == buf) unlock_orig = 1; btrfs_assert_tree_locked(buf); - if (parent) - parent_start = parent->start; - else - parent_start = 0; - WARN_ON(root->ref_cows && trans->transid != root->fs_info->running_transaction->transid); WARN_ON(root->ref_cows && trans->transid != root->last_trans); level = btrfs_header_level(buf); - nritems = btrfs_header_nritems(buf); - if (prealloc_dest) { - struct btrfs_key ins; + if (level == 0) + btrfs_item_key(buf, &disk_key, 0); + else + btrfs_node_key(buf, &disk_key, 0); - ins.objectid = prealloc_dest; - ins.offset = buf->len; - ins.type = BTRFS_EXTENT_ITEM_KEY; + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent) + parent_start = parent->start; + else + parent_start = 0; + } else + parent_start = 0; - ret = btrfs_alloc_reserved_extent(trans, root, parent_start, - root->root_key.objectid, - trans->transid, level, &ins); - BUG_ON(ret); - cow = btrfs_init_new_buffer(trans, root, prealloc_dest, - buf->len, level); - } else { - cow = btrfs_alloc_free_block(trans, root, buf->len, - parent_start, - root->root_key.objectid, - trans->transid, level, - search_start, empty_size); - } + cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, + root->root_key.objectid, &disk_key, + level, search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -319,83 +424,53 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, copy_extent_buffer(cow, buf, 0, 0, cow->len); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); - btrfs_set_header_owner(cow, root->root_key.objectid); - btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); + btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | + BTRFS_HEADER_FLAG_RELOC); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); + else + btrfs_set_header_owner(cow, root->root_key.objectid); write_extent_buffer(cow, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(cow), BTRFS_FSID_SIZE); - WARN_ON(btrfs_header_generation(buf) > trans->transid); - if (btrfs_header_generation(buf) != trans->transid) { - u32 nr_extents; - ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents); - if (ret) - return ret; - - ret = btrfs_cache_ref(trans, root, buf, nr_extents); - WARN_ON(ret); - } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) { - /* - * There are only two places that can drop reference to - * tree blocks owned by living reloc trees, one is here, - * the other place is btrfs_drop_subtree. In both places, - * we check reference count while tree block is locked. - * Furthermore, if reference count is one, it won't get - * increased by someone else. - */ - u32 refs; - ret = btrfs_lookup_extent_ref(trans, root, buf->start, - buf->len, &refs); - BUG_ON(ret); - if (refs == 1) { - ret = btrfs_update_ref(trans, root, buf, cow, - 0, nritems); - clean_tree_block(trans, root, buf); - } else { - ret = btrfs_inc_ref(trans, root, buf, cow, NULL); - } - BUG_ON(ret); - } else { - ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems); - if (ret) - return ret; - clean_tree_block(trans, root, buf); - } - - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start); - WARN_ON(ret); - } + update_ref_for_cow(trans, root, buf, cow); if (buf == root->node) { WARN_ON(parent && parent != buf); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + parent_start = buf->start; + else + parent_start = 0; spin_lock(&root->node_lock); root->node = cow; extent_buffer_get(cow); spin_unlock(&root->node_lock); - if (buf != root->commit_root) { - btrfs_free_extent(trans, root, buf->start, - buf->len, buf->start, - root->root_key.objectid, - btrfs_header_generation(buf), - level, 1); - } + btrfs_free_extent(trans, root, buf->start, buf->len, + parent_start, root->root_key.objectid, + level, 0); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + parent_start = parent->start; + else + parent_start = 0; + + WARN_ON(trans->transid != btrfs_header_generation(parent)); btrfs_set_node_blockptr(parent, parent_slot, cow->start); - WARN_ON(trans->transid == 0); btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); - WARN_ON(btrfs_header_generation(parent) != trans->transid); btrfs_free_extent(trans, root, buf->start, buf->len, - parent_start, btrfs_header_owner(parent), - btrfs_header_generation(parent), level, 1); + parent_start, root->root_key.objectid, + level, 0); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -405,6 +480,18 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return 0; } +static inline int should_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf) +{ + if (btrfs_header_generation(buf) == trans->transid && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && + !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + return 0; + return 1; +} + /* * cows a single block, see __btrfs_cow_block for the real work. * This version of it has extra checks so that a block isn't cow'd more than @@ -413,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret, u64 prealloc_dest) + struct extent_buffer **cow_ret) { u64 search_start; int ret; @@ -432,11 +519,8 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, WARN_ON(1); } - if (btrfs_header_generation(buf) == trans->transid && - btrfs_header_owner(buf) == root->root_key.objectid && - !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + if (!should_cow_block(trans, root, buf)) { *cow_ret = buf; - WARN_ON(prealloc_dest); return 0; } @@ -447,8 +531,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking(buf); ret = __btrfs_cow_block(trans, root, buf, parent, - parent_slot, cow_ret, search_start, 0, - prealloc_dest); + parent_slot, cow_ret, search_start, 0); return ret; } @@ -492,7 +575,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) /* * same as comp_keys only with two btrfs_key's */ -static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) +int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) { if (k1->objectid > k2->objectid) return 1; @@ -617,7 +700,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, - (end_slot - i) * blocksize), 0); + (end_slot - i) * blocksize)); if (err) { btrfs_tree_unlock(cur); free_extent_buffer(cur); @@ -868,6 +951,12 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, return -1; } +int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot) +{ + return bin_search(eb, key, level, slot); +} + /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). * NULL is returned on error. @@ -937,19 +1026,13 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BUG_ON(!child); btrfs_tree_lock(child); btrfs_set_lock_blocking(child); - ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child); BUG_ON(ret); spin_lock(&root->node_lock); root->node = child; spin_unlock(&root->node_lock); - ret = btrfs_update_extent_ref(trans, root, child->start, - mid->start, child->start, - root->root_key.objectid, - trans->transid, level - 1); - BUG_ON(ret); - add_root_to_dirty_list(root); btrfs_tree_unlock(child); @@ -960,9 +1043,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* once for the path */ free_extent_buffer(mid); ret = btrfs_free_extent(trans, root, mid->start, mid->len, - mid->start, root->root_key.objectid, - btrfs_header_generation(mid), - level, 1); + 0, root->root_key.objectid, level, 1); /* once for the root ptr */ free_extent_buffer(mid); return ret; @@ -971,6 +1052,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; + if (btrfs_header_nritems(mid) > 2) + return 0; + if (btrfs_header_nritems(mid) < 2) err_on_enospc = 1; @@ -979,7 +1063,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(left); btrfs_set_lock_blocking(left); wret = btrfs_cow_block(trans, root, left, - parent, pslot - 1, &left, 0); + parent, pslot - 1, &left); if (wret) { ret = wret; goto enospc; @@ -990,7 +1074,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(right); btrfs_set_lock_blocking(right); wret = btrfs_cow_block(trans, root, right, - parent, pslot + 1, &right, 0); + parent, pslot + 1, &right); if (wret) { ret = wret; goto enospc; @@ -1016,7 +1100,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, ret = wret; if (btrfs_header_nritems(right) == 0) { u64 bytenr = right->start; - u64 generation = btrfs_header_generation(parent); u32 blocksize = right->len; clean_tree_block(trans, root, right); @@ -1028,9 +1111,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; wret = btrfs_free_extent(trans, root, bytenr, - blocksize, parent->start, - btrfs_header_owner(parent), - generation, level, 1); + blocksize, 0, + root->root_key.objectid, + level, 0); if (wret) ret = wret; } else { @@ -1065,7 +1148,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } if (btrfs_header_nritems(mid) == 0) { /* we've managed to empty the middle node, drop it */ - u64 root_gen = btrfs_header_generation(parent); u64 bytenr = mid->start; u32 blocksize = mid->len; @@ -1077,9 +1159,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; wret = btrfs_free_extent(trans, root, bytenr, blocksize, - parent->start, - btrfs_header_owner(parent), - root_gen, level, 1); + 0, root->root_key.objectid, + level, 0); if (wret) ret = wret; } else { @@ -1171,7 +1252,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, wret = 1; } else { ret = btrfs_cow_block(trans, root, left, parent, - pslot - 1, &left, 0); + pslot - 1, &left); if (ret) wret = 1; else { @@ -1222,7 +1303,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, } else { ret = btrfs_cow_block(trans, root, right, parent, pslot + 1, - &right, 0); + &right); if (ret) wret = 1; else { @@ -1262,9 +1343,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, * readahead one full node of leaves, finding things that are close * to the block in 'slot', and triggering ra on them. */ -static noinline void reada_for_search(struct btrfs_root *root, - struct btrfs_path *path, - int level, int slot, u64 objectid) +static void reada_for_search(struct btrfs_root *root, + struct btrfs_path *path, + int level, int slot, u64 objectid) { struct extent_buffer *node; struct btrfs_disk_key disk_key; @@ -1343,12 +1424,12 @@ static noinline int reada_for_balance(struct btrfs_root *root, int ret = 0; int blocksize; - parent = path->nodes[level - 1]; + parent = path->nodes[level + 1]; if (!parent) return 0; nritems = btrfs_header_nritems(parent); - slot = path->slots[level]; + slot = path->slots[level + 1]; blocksize = btrfs_level_size(root, level); if (slot > 0) { @@ -1359,7 +1440,7 @@ static noinline int reada_for_balance(struct btrfs_root *root, block1 = 0; free_extent_buffer(eb); } - if (slot < nritems) { + if (slot + 1 < nritems) { block2 = btrfs_node_blockptr(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1); eb = btrfs_find_tree_block(root, block2, blocksize); @@ -1369,7 +1450,11 @@ static noinline int reada_for_balance(struct btrfs_root *root, } if (block1 || block2) { ret = -EAGAIN; + + /* release the whole path */ btrfs_release_path(root, path); + + /* read the blocks */ if (block1) readahead_tree_block(root, block1, blocksize, 0); if (block2) @@ -1379,7 +1464,7 @@ static noinline int reada_for_balance(struct btrfs_root *root, eb = read_tree_block(root, block1, blocksize, 0); free_extent_buffer(eb); } - if (block1) { + if (block2) { eb = read_tree_block(root, block2, blocksize, 0); free_extent_buffer(eb); } @@ -1451,7 +1536,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) { int i; - if (path->keep_locks || path->lowest_level) + if (path->keep_locks) return; for (i = level; i < BTRFS_MAX_LEVEL; i++) { @@ -1465,6 +1550,138 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) } /* + * helper function for btrfs_search_slot. The goal is to find a block + * in cache without setting the path to blocking. If we find the block + * we return zero and the path is unchanged. + * + * If we can't find the block, we set the path blocking and do some + * reada. -EAGAIN is returned and the search must be repeated. + */ +static int +read_block_for_search(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer **eb_ret, int level, int slot, + struct btrfs_key *key) +{ + u64 blocknr; + u64 gen; + u32 blocksize; + struct extent_buffer *b = *eb_ret; + struct extent_buffer *tmp; + int ret; + + blocknr = btrfs_node_blockptr(b, slot); + gen = btrfs_node_ptr_generation(b, slot); + blocksize = btrfs_level_size(root, level - 1); + + tmp = btrfs_find_tree_block(root, blocknr, blocksize); + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + /* + * we found an up to date block without sleeping, return + * right away + */ + *eb_ret = tmp; + return 0; + } + + /* + * reduce lock contention at high levels + * of the btree by dropping locks before + * we read. Don't release the lock on the current + * level because we need to walk this node to figure + * out which blocks to read. + */ + btrfs_unlock_up_safe(p, level + 1); + btrfs_set_path_blocking(p); + + if (tmp) + free_extent_buffer(tmp); + if (p->reada) + reada_for_search(root, p, level, slot, key->objectid); + + btrfs_release_path(NULL, p); + + ret = -EAGAIN; + tmp = read_tree_block(root, blocknr, blocksize, gen); + if (tmp) { + /* + * If the read above didn't mark this buffer up to date, + * it will never end up being up to date. Set ret to EIO now + * and give up so that our caller doesn't loop forever + * on our EAGAINs. + */ + if (!btrfs_buffer_uptodate(tmp, 0)) + ret = -EIO; + free_extent_buffer(tmp); + } + return ret; +} + +/* + * helper function for btrfs_search_slot. This does all of the checks + * for node-level blocks and does any balancing required based on + * the ins_len. + * + * If no extra work was required, zero is returned. If we had to + * drop the path, -EAGAIN is returned and btrfs_search_slot must + * start over + */ +static int +setup_nodes_for_search(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer *b, int level, int ins_len) +{ + int ret; + if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = split_node(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL); + + BUG_ON(sret > 0); + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + } else if (ins_len < 0 && btrfs_header_nritems(b) < + BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = balance_level(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL); + + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + if (!b) { + btrfs_release_path(NULL, p); + goto again; + } + BUG_ON(btrfs_header_nritems(b) == 1); + } + return 0; + +again: + ret = -EAGAIN; +done: + return ret; +} + +/* * look for key in the tree. path is filled in with nodes along the way * if key is found, we return zero and you can find the item in the leaf * level of the path (level 0) @@ -1482,17 +1699,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root ins_len, int cow) { struct extent_buffer *b; - struct extent_buffer *tmp; int slot; int ret; int level; - int should_reada = p->reada; int lowest_unlock = 1; - int blocksize; u8 lowest_level = 0; - u64 blocknr; - u64 gen; - struct btrfs_key prealloc_block; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); @@ -1501,13 +1712,18 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root if (ins_len < 0) lowest_unlock = 2; - prealloc_block.objectid = 0; - again: - if (p->skip_locking) - b = btrfs_root_node(root); - else - b = btrfs_lock_root_node(root); + if (p->search_commit_root) { + b = root->commit_root; + extent_buffer_get(b); + if (!p->skip_locking) + btrfs_tree_lock(b); + } else { + if (p->skip_locking) + b = btrfs_root_node(root); + else + b = btrfs_lock_root_node(root); + } while (b) { level = btrfs_header_level(b); @@ -1523,50 +1739,19 @@ again: if (cow) { int wret; - /* is a cow on this block not required */ - if (btrfs_header_generation(b) == trans->transid && - btrfs_header_owner(b) == root->root_key.objectid && - !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { - goto cow_done; - } - - /* ok, we have to cow, is our old prealloc the right - * size? - */ - if (prealloc_block.objectid && - prealloc_block.offset != b->len) { - btrfs_release_path(root, p); - btrfs_free_reserved_extent(root, - prealloc_block.objectid, - prealloc_block.offset); - prealloc_block.objectid = 0; - goto again; - } - /* - * for higher level blocks, try not to allocate blocks - * with the block and the parent locks held. + * if we don't really need to cow this block + * then we don't want to set the path blocking, + * so we test it here */ - if (level > 0 && !prealloc_block.objectid) { - u32 size = b->len; - u64 hint = b->start; - - btrfs_release_path(root, p); - ret = btrfs_reserve_extent(trans, root, - size, size, 0, - hint, (u64)-1, - &prealloc_block, 0); - BUG_ON(ret); - goto again; - } + if (!should_cow_block(trans, root, b)) + goto cow_done; btrfs_set_path_blocking(p); wret = btrfs_cow_block(trans, root, b, p->nodes[level + 1], - p->slots[level + 1], - &b, prealloc_block.objectid); - prealloc_block.objectid = 0; + p->slots[level + 1], &b); if (wret) { free_extent_buffer(b); ret = wret; @@ -1611,51 +1796,15 @@ cow_done: if (ret && slot > 0) slot -= 1; p->slots[level] = slot; - if ((p->search_for_split || ins_len > 0) && - btrfs_header_nritems(b) >= - BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { - int sret; - - sret = reada_for_balance(root, p, level); - if (sret) - goto again; - - btrfs_set_path_blocking(p); - sret = split_node(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL); - - BUG_ON(sret > 0); - if (sret) { - ret = sret; - goto done; - } - b = p->nodes[level]; - slot = p->slots[level]; - } else if (ins_len < 0 && - btrfs_header_nritems(b) < - BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { - int sret; - - sret = reada_for_balance(root, p, level); - if (sret) - goto again; - - btrfs_set_path_blocking(p); - sret = balance_level(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL); + ret = setup_nodes_for_search(trans, root, p, b, level, + ins_len); + if (ret == -EAGAIN) + goto again; + else if (ret) + goto done; + b = p->nodes[level]; + slot = p->slots[level]; - if (sret) { - ret = sret; - goto done; - } - b = p->nodes[level]; - if (!b) { - btrfs_release_path(NULL, p); - goto again; - } - slot = p->slots[level]; - BUG_ON(btrfs_header_nritems(b) == 1); - } unlock_up(p, level, lowest_unlock); /* this is only true while dropping a snapshot */ @@ -1664,44 +1813,14 @@ cow_done: goto done; } - blocknr = btrfs_node_blockptr(b, slot); - gen = btrfs_node_ptr_generation(b, slot); - blocksize = btrfs_level_size(root, level - 1); + ret = read_block_for_search(trans, root, p, + &b, level, slot, key); + if (ret == -EAGAIN) + goto again; + + if (ret == -EIO) + goto done; - tmp = btrfs_find_tree_block(root, blocknr, blocksize); - if (tmp && btrfs_buffer_uptodate(tmp, gen)) { - b = tmp; - } else { - /* - * reduce lock contention at high levels - * of the btree by dropping locks before - * we read. - */ - if (level > 0) { - btrfs_release_path(NULL, p); - if (tmp) - free_extent_buffer(tmp); - if (should_reada) - reada_for_search(root, p, - level, slot, - key->objectid); - - tmp = read_tree_block(root, blocknr, - blocksize, gen); - if (tmp) - free_extent_buffer(tmp); - goto again; - } else { - btrfs_set_path_blocking(p); - if (tmp) - free_extent_buffer(tmp); - if (should_reada) - reada_for_search(root, p, - level, slot, - key->objectid); - b = read_node_slot(root, b, slot); - } - } if (!p->skip_locking) { int lret; @@ -1742,147 +1861,13 @@ done: * we don't really know what they plan on doing with the path * from here on, so for now just mark it as blocking */ - btrfs_set_path_blocking(p); - if (prealloc_block.objectid) { - btrfs_free_reserved_extent(root, - prealloc_block.objectid, - prealloc_block.offset); - } + if (!p->leave_spinning) + btrfs_set_path_blocking(p); + if (ret < 0) + btrfs_release_path(root, p); return ret; } -int btrfs_merge_path(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_key *node_keys, - u64 *nodes, int lowest_level) -{ - struct extent_buffer *eb; - struct extent_buffer *parent; - struct btrfs_key key; - u64 bytenr; - u64 generation; - u32 blocksize; - int level; - int slot; - int key_match; - int ret; - - eb = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); - BUG_ON(ret); - - btrfs_set_lock_blocking(eb); - - parent = eb; - while (1) { - level = btrfs_header_level(parent); - if (level == 0 || level <= lowest_level) - break; - - ret = bin_search(parent, &node_keys[lowest_level], level, - &slot); - if (ret && slot > 0) - slot--; - - bytenr = btrfs_node_blockptr(parent, slot); - if (nodes[level - 1] == bytenr) - break; - - blocksize = btrfs_level_size(root, level - 1); - generation = btrfs_node_ptr_generation(parent, slot); - btrfs_node_key_to_cpu(eb, &key, slot); - key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key)); - - if (generation == trans->transid) { - eb = read_tree_block(root, bytenr, blocksize, - generation); - btrfs_tree_lock(eb); - btrfs_set_lock_blocking(eb); - } - - /* - * if node keys match and node pointer hasn't been modified - * in the running transaction, we can merge the path. for - * blocks owened by reloc trees, the node pointer check is - * skipped, this is because these blocks are fully controlled - * by the space balance code, no one else can modify them. - */ - if (!nodes[level - 1] || !key_match || - (generation == trans->transid && - btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) { - if (level == 1 || level == lowest_level + 1) { - if (generation == trans->transid) { - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - } - break; - } - - if (generation != trans->transid) { - eb = read_tree_block(root, bytenr, blocksize, - generation); - btrfs_tree_lock(eb); - btrfs_set_lock_blocking(eb); - } - - ret = btrfs_cow_block(trans, root, eb, parent, slot, - &eb, 0); - BUG_ON(ret); - - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) { - if (!nodes[level - 1]) { - nodes[level - 1] = eb->start; - memcpy(&node_keys[level - 1], &key, - sizeof(node_keys[0])); - } else { - WARN_ON(1); - } - } - - btrfs_tree_unlock(parent); - free_extent_buffer(parent); - parent = eb; - continue; - } - - btrfs_set_node_blockptr(parent, slot, nodes[level - 1]); - btrfs_set_node_ptr_generation(parent, slot, trans->transid); - btrfs_mark_buffer_dirty(parent); - - ret = btrfs_inc_extent_ref(trans, root, - nodes[level - 1], - blocksize, parent->start, - btrfs_header_owner(parent), - btrfs_header_generation(parent), - level - 1); - BUG_ON(ret); - - /* - * If the block was created in the running transaction, - * it's possible this is the last reference to it, so we - * should drop the subtree. - */ - if (generation == trans->transid) { - ret = btrfs_drop_subtree(trans, root, eb, parent); - BUG_ON(ret); - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - } else { - ret = btrfs_free_extent(trans, root, bytenr, - blocksize, parent->start, - btrfs_header_owner(parent), - btrfs_header_generation(parent), - level - 1, 1); - BUG_ON(ret); - } - break; - } - btrfs_tree_unlock(parent); - free_extent_buffer(parent); - return 0; -} - /* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. @@ -2008,9 +1993,6 @@ static int push_node_left(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(src); btrfs_mark_buffer_dirty(dst); - ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items); - BUG_ON(ret); - return ret; } @@ -2070,9 +2052,6 @@ static int balance_node_right(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(src); btrfs_mark_buffer_dirty(dst); - ret = btrfs_update_ref(trans, root, src, dst, 0, push_items); - BUG_ON(ret); - return ret; } @@ -2092,7 +2071,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, struct extent_buffer *c; struct extent_buffer *old; struct btrfs_disk_key lower_key; - int ret; BUG_ON(path->nodes[level]); BUG_ON(path->nodes[level-1] != root->node); @@ -2104,16 +2082,17 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_node_key(lower, &lower_key, 0); c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, - root->root_key.objectid, trans->transid, + root->root_key.objectid, &lower_key, level, root->node->start, 0); if (IS_ERR(c)) return PTR_ERR(c); - memset_extent_buffer(c, 0, 0, root->nodesize); + memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_nritems(c, 1); btrfs_set_header_level(c, level); btrfs_set_header_bytenr(c, c->start); btrfs_set_header_generation(c, trans->transid); + btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(c, root->root_key.objectid); write_extent_buffer(c, root->fs_info->fsid, @@ -2138,12 +2117,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, root->node = c; spin_unlock(&root->node_lock); - ret = btrfs_update_extent_ref(trans, root, lower->start, - lower->start, c->start, - root->root_key.objectid, - trans->transid, level - 1); - BUG_ON(ret); - /* the super has an extra ref to root->node */ free_extent_buffer(old); @@ -2174,8 +2147,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root BUG_ON(!path->nodes[level]); lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); - if (slot > nritems) - BUG(); + BUG_ON(slot > nritems); if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) BUG(); if (slot != nritems) { @@ -2232,20 +2204,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans, } c_nritems = btrfs_header_nritems(c); + mid = (c_nritems + 1) / 2; + btrfs_node_key(c, &disk_key, mid); - split = btrfs_alloc_free_block(trans, root, root->nodesize, - path->nodes[level + 1]->start, + split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, - trans->transid, level, c->start, 0); + &disk_key, level, c->start, 0); if (IS_ERR(split)) return PTR_ERR(split); - btrfs_set_header_flags(split, btrfs_header_flags(c)); + memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_level(split, btrfs_header_level(c)); btrfs_set_header_bytenr(split, split->start); btrfs_set_header_generation(split, trans->transid); + btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(split, root->root_key.objectid); - btrfs_set_header_flags(split, 0); write_extent_buffer(split, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(split), BTRFS_FSID_SIZE); @@ -2253,7 +2226,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - mid = (c_nritems + 1) / 2; copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), @@ -2266,16 +2238,12 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); btrfs_mark_buffer_dirty(split); - btrfs_node_key(split, &disk_key, 0); wret = insert_ptr(trans, root, path, &disk_key, split->start, path->slots[level + 1] + 1, level + 1); if (wret) ret = wret; - ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid); - BUG_ON(ret); - if (path->slots[level] >= mid) { path->slots[level] -= mid; btrfs_tree_unlock(c); @@ -2329,65 +2297,25 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root, return ret; } -/* - * push some data in the path leaf to the right, trying to free up at - * least data_size bytes. returns zero if the push worked, nonzero otherwise - * - * returns 1 if the push failed because the other node didn't have enough - * room, 0 if everything worked out and < 0 if there were major errors. - */ -static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) +static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int data_size, int empty, + struct extent_buffer *right, + int free_space, u32 left_nritems) { struct extent_buffer *left = path->nodes[0]; - struct extent_buffer *right; - struct extent_buffer *upper; + struct extent_buffer *upper = path->nodes[1]; struct btrfs_disk_key disk_key; int slot; u32 i; - int free_space; int push_space = 0; int push_items = 0; struct btrfs_item *item; - u32 left_nritems; u32 nr; u32 right_nritems; u32 data_end; u32 this_item_size; - int ret; - - slot = path->slots[1]; - if (!path->nodes[1]) - return 1; - - upper = path->nodes[1]; - if (slot >= btrfs_header_nritems(upper) - 1) - return 1; - - btrfs_assert_tree_locked(path->nodes[1]); - - right = read_node_slot(root, upper, slot + 1); - btrfs_tree_lock(right); - btrfs_set_lock_blocking(right); - - free_space = btrfs_leaf_free_space(root, right); - if (free_space < data_size) - goto out_unlock; - - /* cow and double check */ - ret = btrfs_cow_block(trans, root, right, upper, - slot + 1, &right, 0); - if (ret) - goto out_unlock; - - free_space = btrfs_leaf_free_space(root, right); - if (free_space < data_size) - goto out_unlock; - - left_nritems = btrfs_header_nritems(left); - if (left_nritems == 0) - goto out_unlock; if (empty) nr = 0; @@ -2397,6 +2325,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (path->slots[0] >= left_nritems) push_space += data_size; + slot = path->slots[1]; i = left_nritems - 1; while (i >= nr) { item = btrfs_item_nr(left, i); @@ -2499,9 +2428,6 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_mark_buffer_dirty(left); btrfs_mark_buffer_dirty(right); - ret = btrfs_update_ref(trans, root, left, right, 0, push_items); - BUG_ON(ret); - btrfs_item_key(right, &disk_key, 0); btrfs_set_node_key(upper, &disk_key, slot + 1); btrfs_mark_buffer_dirty(upper); @@ -2528,24 +2454,82 @@ out_unlock: } /* + * push some data in the path leaf to the right, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * returns 1 if the push failed because the other node didn't have enough + * room, 0 if everything worked out and < 0 if there were major errors. + */ +static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) +{ + struct extent_buffer *left = path->nodes[0]; + struct extent_buffer *right; + struct extent_buffer *upper; + int slot; + int free_space; + u32 left_nritems; + int ret; + + if (!path->nodes[1]) + return 1; + + slot = path->slots[1]; + upper = path->nodes[1]; + if (slot >= btrfs_header_nritems(upper) - 1) + return 1; + + btrfs_assert_tree_locked(path->nodes[1]); + + right = read_node_slot(root, upper, slot + 1); + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, right, upper, + slot + 1, &right); + if (ret) + goto out_unlock; + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + left_nritems = btrfs_header_nritems(left); + if (left_nritems == 0) + goto out_unlock; + + return __push_leaf_right(trans, root, path, data_size, empty, + right, free_space, left_nritems); +out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); + return 1; +} + +/* * push some data in the path leaf to the left, trying to free up at * least data_size bytes. returns zero if the push worked, nonzero otherwise */ -static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) +static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int data_size, + int empty, struct extent_buffer *left, + int free_space, int right_nritems) { struct btrfs_disk_key disk_key; struct extent_buffer *right = path->nodes[0]; - struct extent_buffer *left; int slot; int i; - int free_space; int push_space = 0; int push_items = 0; struct btrfs_item *item; u32 old_left_nritems; - u32 right_nritems; u32 nr; int ret = 0; int wret; @@ -2553,41 +2537,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root u32 old_left_item_size; slot = path->slots[1]; - if (slot == 0) - return 1; - if (!path->nodes[1]) - return 1; - - right_nritems = btrfs_header_nritems(right); - if (right_nritems == 0) - return 1; - - btrfs_assert_tree_locked(path->nodes[1]); - - left = read_node_slot(root, path->nodes[1], slot - 1); - btrfs_tree_lock(left); - btrfs_set_lock_blocking(left); - - free_space = btrfs_leaf_free_space(root, left); - if (free_space < data_size) { - ret = 1; - goto out; - } - - /* cow and double check */ - ret = btrfs_cow_block(trans, root, left, - path->nodes[1], slot - 1, &left, 0); - if (ret) { - /* we hit -ENOSPC, but it isn't fatal here */ - ret = 1; - goto out; - } - - free_space = btrfs_leaf_free_space(root, left); - if (free_space < data_size) { - ret = 1; - goto out; - } if (empty) nr = right_nritems; @@ -2723,10 +2672,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (right_nritems) btrfs_mark_buffer_dirty(right); - ret = btrfs_update_ref(trans, root, right, left, - old_left_nritems, push_items); - BUG_ON(ret); - btrfs_item_key(right, &disk_key, 0); wret = fixup_low_keys(trans, root, path, &disk_key, 1); if (wret) @@ -2755,6 +2700,151 @@ out: } /* + * push some data in the path leaf to the left, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + */ +static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) +{ + struct extent_buffer *right = path->nodes[0]; + struct extent_buffer *left; + int slot; + int free_space; + u32 right_nritems; + int ret = 0; + + slot = path->slots[1]; + if (slot == 0) + return 1; + if (!path->nodes[1]) + return 1; + + right_nritems = btrfs_header_nritems(right); + if (right_nritems == 0) + return 1; + + btrfs_assert_tree_locked(path->nodes[1]); + + left = read_node_slot(root, path->nodes[1], slot - 1); + btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, left, + path->nodes[1], slot - 1, &left); + if (ret) { + /* we hit -ENOSPC, but it isn't fatal here */ + ret = 1; + goto out; + } + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + return __push_leaf_left(trans, root, path, data_size, + empty, left, free_space, right_nritems); +out: + btrfs_tree_unlock(left); + free_extent_buffer(left); + return ret; +} + +/* + * split the path's leaf in two, making sure there is at least data_size + * available for the resulting leaf level of the path. + * + * returns 0 if all went well and < 0 on failure. + */ +static noinline int copy_for_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *l, + struct extent_buffer *right, + int slot, int mid, int nritems) +{ + int data_copy_size; + int rt_data_off; + int i; + int ret = 0; + int wret; + struct btrfs_disk_key disk_key; + + nritems = nritems - mid; + btrfs_set_header_nritems(right, nritems); + data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); + + copy_extent_buffer(right, l, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(mid), + nritems * sizeof(struct btrfs_item)); + + copy_extent_buffer(right, l, + btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - + data_copy_size, btrfs_leaf_data(l) + + leaf_data_end(root, l), data_copy_size); + + rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_end_nr(l, mid); + + for (i = 0; i < nritems; i++) { + struct btrfs_item *item = btrfs_item_nr(right, i); + u32 ioff; + + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(right, item); + btrfs_set_item_offset(right, item, ioff + rt_data_off); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + btrfs_set_header_nritems(l, mid); + ret = 0; + btrfs_item_key(right, &disk_key, 0); + wret = insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; + + btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(l); + BUG_ON(path->slots[0] != slot); + + if (mid <= slot) { + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] -= mid; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + + BUG_ON(path->slots[0] < 0); + + return ret; +} + +/* * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. * @@ -2766,19 +2856,16 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_path *path, int data_size, int extend) { + struct btrfs_disk_key disk_key; struct extent_buffer *l; u32 nritems; int mid; int slot; struct extent_buffer *right; - int data_copy_size; - int rt_data_off; - int i; int ret = 0; int wret; - int double_split; + int split; int num_doubles = 0; - struct btrfs_disk_key disk_key; /* first try to make some room by pushing left and right */ if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { @@ -2803,85 +2890,32 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, return ret; } again: - double_split = 0; + split = 1; l = path->nodes[0]; slot = path->slots[0]; nritems = btrfs_header_nritems(l); mid = (nritems + 1) / 2; - right = btrfs_alloc_free_block(trans, root, root->leafsize, - path->nodes[1]->start, - root->root_key.objectid, - trans->transid, 0, l->start, 0); - if (IS_ERR(right)) { - BUG_ON(1); - return PTR_ERR(right); - } - - memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); - btrfs_set_header_bytenr(right, right->start); - btrfs_set_header_generation(right, trans->transid); - btrfs_set_header_owner(right, root->root_key.objectid); - btrfs_set_header_level(right, 0); - write_extent_buffer(right, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(right), - BTRFS_FSID_SIZE); - - write_extent_buffer(right, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(right), - BTRFS_UUID_SIZE); if (mid <= slot) { if (nritems == 1 || leaf_space_used(l, mid, nritems - mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { if (slot >= nritems) { - btrfs_cpu_key_to_disk(&disk_key, ins_key); - btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, right->start, - path->slots[1] + 1, 1); - if (wret) - ret = wret; - - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; - path->slots[0] = 0; - path->slots[1] += 1; - btrfs_mark_buffer_dirty(right); - return ret; - } - mid = slot; - if (mid != nritems && - leaf_space_used(l, mid, nritems - mid) + - data_size > BTRFS_LEAF_DATA_SIZE(root)) { - double_split = 1; + split = 0; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + split = 2; + } } } } else { if (leaf_space_used(l, 0, mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { if (!extend && data_size && slot == 0) { - btrfs_cpu_key_to_disk(&disk_key, ins_key); - btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, - right->start, - path->slots[1], 1); - if (wret) - ret = wret; - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; - path->slots[0] = 0; - if (path->slots[1] == 0) { - wret = fixup_low_keys(trans, root, - path, &disk_key, 1); - if (wret) - ret = wret; - } - btrfs_mark_buffer_dirty(right); - return ret; + split = 0; } else if ((extend || !data_size) && slot == 0) { mid = 1; } else { @@ -2889,81 +2923,85 @@ again: if (mid != nritems && leaf_space_used(l, mid, nritems - mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { - double_split = 1; + split = 2 ; } } } } - nritems = nritems - mid; - btrfs_set_header_nritems(right, nritems); - data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); - - copy_extent_buffer(right, l, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(mid), - nritems * sizeof(struct btrfs_item)); - - copy_extent_buffer(right, l, - btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - - data_copy_size, btrfs_leaf_data(l) + - leaf_data_end(root, l), data_copy_size); - - rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - - btrfs_item_end_nr(l, mid); - - for (i = 0; i < nritems; i++) { - struct btrfs_item *item = btrfs_item_nr(right, i); - u32 ioff; - - if (!right->map_token) { - map_extent_buffer(right, (unsigned long)item, - sizeof(struct btrfs_item), - &right->map_token, &right->kaddr, - &right->map_start, &right->map_len, - KM_USER1); - } - ioff = btrfs_item_offset(right, item); - btrfs_set_item_offset(right, item, ioff + rt_data_off); - } + if (split == 0) + btrfs_cpu_key_to_disk(&disk_key, ins_key); + else + btrfs_item_key(l, &disk_key, mid); - if (right->map_token) { - unmap_extent_buffer(right, right->map_token, KM_USER1); - right->map_token = NULL; + right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + root->root_key.objectid, + &disk_key, 0, l->start, 0); + if (IS_ERR(right)) { + BUG_ON(1); + return PTR_ERR(right); } - btrfs_set_header_nritems(l, mid); - ret = 0; - btrfs_item_key(right, &disk_key, 0); - wret = insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1); - if (wret) - ret = wret; + memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(right, right->start); + btrfs_set_header_generation(right, trans->transid); + btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(right, root->root_key.objectid); + btrfs_set_header_level(right, 0); + write_extent_buffer(right, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(right), + BTRFS_FSID_SIZE); - btrfs_mark_buffer_dirty(right); - btrfs_mark_buffer_dirty(l); - BUG_ON(path->slots[0] != slot); + write_extent_buffer(right, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(right), + BTRFS_UUID_SIZE); - ret = btrfs_update_ref(trans, root, l, right, 0, nritems); - BUG_ON(ret); + if (split == 0) { + if (mid <= slot) { + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; - if (mid <= slot) { - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; - path->slots[0] -= mid; - path->slots[1] += 1; - } else { - btrfs_tree_unlock(right); - free_extent_buffer(right); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1] += 1; + } else { + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, + right->start, + path->slots[1], 1); + if (wret) + ret = wret; + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + if (path->slots[1] == 0) { + wret = fixup_low_keys(trans, root, + path, &disk_key, 1); + if (wret) + ret = wret; + } + } + btrfs_mark_buffer_dirty(right); + return ret; } - BUG_ON(path->slots[0] < 0); + ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); + BUG_ON(ret); - if (double_split) { + if (split == 2) { BUG_ON(num_doubles != 0); num_doubles++; goto again; } + return ret; } @@ -3021,26 +3059,27 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, return -EAGAIN; } + btrfs_set_path_blocking(path); ret = split_leaf(trans, root, &orig_key, path, sizeof(struct btrfs_item), 1); path->keep_locks = 0; BUG_ON(ret); + btrfs_unlock_up_safe(path, 1); + leaf = path->nodes[0]; + BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); + +split: /* * make sure any changes to the path from split_leaf leave it * in a blocking state */ btrfs_set_path_blocking(path); - leaf = path->nodes[0]; - BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); - -split: item = btrfs_item_nr(leaf, path->slots[0]); orig_offset = btrfs_item_offset(leaf, item); item_size = btrfs_item_size(leaf, item); - buf = kmalloc(item_size, GFP_NOFS); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, path->slots[0]), item_size); @@ -3359,7 +3398,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, /* figure out how many keys we can insert in here */ total_data = data_size[0]; for (i = 1; i < nr; i++) { - if (comp_cpu_keys(&found_key, cpu_key + i) <= 0) + if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0) break; total_data += data_size[i]; } @@ -3445,39 +3484,27 @@ out: } /* - * Given a key and some data, insert items into the tree. - * This does all the path init required, making room in the tree if needed. + * this is a helper for btrfs_insert_empty_items, the main goal here is + * to save stack depth by doing the bulk of the work in a function + * that doesn't call btrfs_search_slot */ -int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - int nr) +static noinline_for_stack int +setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr) { - struct extent_buffer *leaf; struct btrfs_item *item; - int ret = 0; - int slot; - int slot_orig; int i; u32 nritems; - u32 total_size = 0; - u32 total_data = 0; unsigned int data_end; struct btrfs_disk_key disk_key; + int ret; + struct extent_buffer *leaf; + int slot; - for (i = 0; i < nr; i++) - total_data += data_size[i]; - - total_size = total_data + (nr * sizeof(struct btrfs_item)); - ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); - if (ret == 0) - return -EEXIST; - if (ret < 0) - goto out; - - slot_orig = path->slots[0]; leaf = path->nodes[0]; + slot = path->slots[0]; nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(root, leaf); @@ -3489,9 +3516,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, BUG(); } - slot = path->slots[0]; - BUG_ON(slot < 0); - if (slot != nritems) { unsigned int old_data = btrfs_item_end_nr(leaf, slot); @@ -3547,21 +3571,60 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, data_end -= data_size[i]; btrfs_set_item_size(leaf, item, data_size[i]); } + btrfs_set_header_nritems(leaf, nritems + nr); - btrfs_mark_buffer_dirty(leaf); ret = 0; if (slot == 0) { + struct btrfs_disk_key disk_key; btrfs_cpu_key_to_disk(&disk_key, cpu_key); ret = fixup_low_keys(trans, root, path, &disk_key, 1); } + btrfs_unlock_up_safe(path, 1); + btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(root, leaf) < 0) { btrfs_print_leaf(root, leaf); BUG(); } + return ret; +} + +/* + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr) +{ + struct extent_buffer *leaf; + int ret = 0; + int slot; + int i; + u32 total_size = 0; + u32 total_data = 0; + + for (i = 0; i < nr; i++) + total_data += data_size[i]; + + total_size = total_data + (nr * sizeof(struct btrfs_item)); + ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + slot = path->slots[0]; + BUG_ON(slot < 0); + + ret = setup_items_for_insert(trans, root, path, cpu_key, data_size, + total_data, total_size, nr); + out: - btrfs_unlock_up_safe(path, 1); return ret; } @@ -3633,9 +3696,7 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, /* * a helper function to delete the leaf pointed to by path->slots[1] and - * path->nodes[1]. bytenr is the node block pointer, but since the callers - * already know it, it is faster to have them pass it down than to - * read it out of the node again. + * path->nodes[1]. * * This deletes the pointer in path->nodes[1] and frees the leaf * block extent. zero is returned if it all worked out, < 0 otherwise. @@ -3643,15 +3704,14 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, * The path must have already been setup for deleting the leaf, including * all the proper balancing. path->nodes[1] must be locked. */ -noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 bytenr) +static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf) { int ret; - u64 root_gen = btrfs_header_generation(path->nodes[1]); - u64 parent_start = path->nodes[1]->start; - u64 parent_owner = btrfs_header_owner(path->nodes[1]); + WARN_ON(btrfs_header_generation(leaf) != trans->transid); ret = del_ptr(trans, root, path, 1, path->slots[1]); if (ret) return ret; @@ -3662,10 +3722,8 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, */ btrfs_unlock_up_safe(path, 0); - ret = btrfs_free_extent(trans, root, bytenr, - btrfs_level_size(root, 0), - parent_start, parent_owner, - root_gen, 0, 1); + ret = btrfs_free_extent(trans, root, leaf->start, leaf->len, + 0, root->root_key.objectid, 0, 0); return ret; } /* @@ -3733,7 +3791,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (leaf == root->node) { btrfs_set_header_level(leaf, 0); } else { - ret = btrfs_del_leaf(trans, root, path, leaf->start); + ret = btrfs_del_leaf(trans, root, path, leaf); BUG_ON(ret); } } else { @@ -3749,7 +3807,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* delete the leaf if it is mostly empty */ - if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) { + if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) { /* push_leaf_left fixes the path. * make sure the path still points to our leaf * for possible call to del_ptr below @@ -3757,6 +3815,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, slot = path->slots[1]; extent_buffer_get(leaf); + btrfs_set_path_blocking(path); wret = push_leaf_left(trans, root, path, 1, 1); if (wret < 0 && wret != -ENOSPC) ret = wret; @@ -3770,8 +3829,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (btrfs_header_nritems(leaf) == 0) { path->slots[1] = slot; - ret = btrfs_del_leaf(trans, root, path, - leaf->start); + ret = btrfs_del_leaf(trans, root, path, leaf); BUG_ON(ret); free_extent_buffer(leaf); } else { @@ -4042,28 +4100,44 @@ next: int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) { int slot; - int level = 1; + int level; struct extent_buffer *c; - struct extent_buffer *next = NULL; + struct extent_buffer *next; struct btrfs_key key; u32 nritems; int ret; + int old_spinning = path->leave_spinning; + int force_blocking = 0; nritems = btrfs_header_nritems(path->nodes[0]); if (nritems == 0) return 1; - btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); + /* + * we take the blocks in an order that upsets lockdep. Using + * blocking mode is the only way around it. + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + force_blocking = 1; +#endif + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); +again: + level = 1; + next = NULL; btrfs_release_path(root, path); + path->keep_locks = 1; + + if (!force_blocking) + path->leave_spinning = 1; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); path->keep_locks = 0; if (ret < 0) return ret; - btrfs_set_path_blocking(path); nritems = btrfs_header_nritems(path->nodes[0]); /* * by releasing the path above we dropped all our locks. A balance @@ -4073,19 +4147,24 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) */ if (nritems > 0 && path->slots[0] < nritems - 1) { path->slots[0]++; + ret = 0; goto done; } while (level < BTRFS_MAX_LEVEL) { - if (!path->nodes[level]) - return 1; + if (!path->nodes[level]) { + ret = 1; + goto done; + } slot = path->slots[level] + 1; c = path->nodes[level]; if (slot >= btrfs_header_nritems(c)) { level++; - if (level == BTRFS_MAX_LEVEL) - return 1; + if (level == BTRFS_MAX_LEVEL) { + ret = 1; + goto done; + } continue; } @@ -4094,16 +4173,27 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) free_extent_buffer(next); } - /* the path was set to blocking above */ - if (level == 1 && (path->locks[1] || path->skip_locking) && - path->reada) - reada_for_search(root, path, level, slot, 0); + next = c; + ret = read_block_for_search(NULL, root, path, &next, level, + slot, &key); + if (ret == -EAGAIN) + goto again; + + if (ret < 0) { + btrfs_release_path(root, path); + goto done; + } - next = read_node_slot(root, c, slot); if (!path->skip_locking) { - btrfs_assert_tree_locked(c); - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + ret = btrfs_try_spin_lock(next); + if (!ret) { + btrfs_set_path_blocking(path); + btrfs_tree_lock(next); + if (!force_blocking) + btrfs_clear_path_blocking(path, next); + } + if (force_blocking) + btrfs_set_lock_blocking(next); } break; } @@ -4113,27 +4203,47 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) c = path->nodes[level]; if (path->locks[level]) btrfs_tree_unlock(c); + free_extent_buffer(c); path->nodes[level] = next; path->slots[level] = 0; if (!path->skip_locking) path->locks[level] = 1; + if (!level) break; - btrfs_set_path_blocking(path); - if (level == 1 && path->locks[1] && path->reada) - reada_for_search(root, path, level, slot, 0); - next = read_node_slot(root, next, 0); + ret = read_block_for_search(NULL, root, path, &next, level, + 0, &key); + if (ret == -EAGAIN) + goto again; + + if (ret < 0) { + btrfs_release_path(root, path); + goto done; + } + if (!path->skip_locking) { btrfs_assert_tree_locked(path->nodes[level]); - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + ret = btrfs_try_spin_lock(next); + if (!ret) { + btrfs_set_path_blocking(path); + btrfs_tree_lock(next); + if (!force_blocking) + btrfs_clear_path_blocking(path, next); + } + if (force_blocking) + btrfs_set_lock_blocking(next); } } + ret = 0; done: unlock_up(path, 0, 1); - return 0; + path->leave_spinning = old_spinning; + if (!old_spinning) + btrfs_set_path_blocking(path); + + return ret; } /* diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5e1d4e30e9d8..98a873838717 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -41,10 +41,17 @@ struct btrfs_ordered_sum; #define BTRFS_MAGIC "_BHRfS_M" -#define BTRFS_ACL_NOT_CACHED ((void *)-1) - #define BTRFS_MAX_LEVEL 8 +#define BTRFS_COMPAT_EXTENT_TREE_V0 + +/* + * files bigger than this get some pre-flushing when they are added + * to the ordered operations list. That way we limit the total + * work done by the commit + */ +#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024) + /* holds pointers to all of the tree roots */ #define BTRFS_ROOT_TREE_OBJECTID 1ULL @@ -136,12 +143,15 @@ static int btrfs_csum_sizes[] = { 4, 0 }; #define BTRFS_FT_MAX 9 /* - * the key defines the order in the tree, and so it also defines (optimal) - * block layout. objectid corresonds to the inode number. The flags - * tells us things about the object, and is a kind of stream selector. - * so for a given inode, keys with flags of 1 might refer to the inode - * data, flags of 2 may point to file data in the btree and flags == 3 - * may point to extents. + * The key defines the order in the tree, and so it also defines (optimal) + * block layout. + * + * objectid corresponds to the inode number. + * + * type tells us things about the object, and is a kind of stream selector. + * so for a given inode, keys with type of 1 might refer to the inode data, + * type of 2 may point to file data in the btree and type == 3 may point to + * extents. * * offset is the starting byte offset for this key in the stream. * @@ -193,7 +203,7 @@ struct btrfs_dev_item { /* * starting byte of this partition on the device, - * to allowr for stripe alignment in the future + * to allow for stripe alignment in the future */ __le64 start_offset; @@ -257,7 +267,18 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) } #define BTRFS_FSID_SIZE 16 -#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0) +#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) +#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) +#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) +#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) + +#define BTRFS_BACKREF_REV_MAX 256 +#define BTRFS_BACKREF_REV_SHIFT 56 +#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ + BTRFS_BACKREF_REV_SHIFT) + +#define BTRFS_OLD_BACKREF_REV 0 +#define BTRFS_MIXED_BACKREF_REV 1 /* * every tree block (leaf or node) starts with this header. @@ -286,7 +307,6 @@ struct btrfs_header { sizeof(struct btrfs_item) - \ sizeof(struct btrfs_file_extent_item)) -#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) /* * this is a very generous portion of the super block, giving us @@ -345,9 +365,12 @@ struct btrfs_super_block { * Compat flags that we support. If any incompat flags are set other than the * ones specified below then we will fail to mount */ -#define BTRFS_FEATURE_COMPAT_SUPP 0x0 -#define BTRFS_FEATURE_COMPAT_RO_SUPP 0x0 -#define BTRFS_FEATURE_INCOMPAT_SUPP 0x0 +#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) + +#define BTRFS_FEATURE_COMPAT_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF /* * A leaf is full of items. offset and size tell us where to find @@ -401,32 +424,75 @@ struct btrfs_path { int locks[BTRFS_MAX_LEVEL]; int reada; /* keep some upper locks as we walk down */ - int keep_locks; - int skip_locking; int lowest_level; /* * set by btrfs_split_item, tells search_slot to keep all locks * and to force calls to keep space in the nodes */ - int search_for_split; + unsigned int search_for_split:1; + unsigned int keep_locks:1; + unsigned int skip_locking:1; + unsigned int leave_spinning:1; + unsigned int search_commit_root:1; }; /* * items in the extent btree are used to record the objectid of the * owner of the block and the number of references */ + struct btrfs_extent_item { + __le64 refs; + __le64 generation; + __le64 flags; +} __attribute__ ((__packed__)); + +struct btrfs_extent_item_v0 { __le32 refs; } __attribute__ ((__packed__)); -struct btrfs_extent_ref { +#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \ + sizeof(struct btrfs_item)) + +#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0) +#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1) + +/* following flags only apply to tree blocks */ + +/* use full backrefs for extent pointers in the block */ +#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) + +struct btrfs_tree_block_info { + struct btrfs_disk_key key; + u8 level; +} __attribute__ ((__packed__)); + +struct btrfs_extent_data_ref { + __le64 root; + __le64 objectid; + __le64 offset; + __le32 count; +} __attribute__ ((__packed__)); + +struct btrfs_shared_data_ref { + __le32 count; +} __attribute__ ((__packed__)); + +struct btrfs_extent_inline_ref { + u8 type; + u64 offset; +} __attribute__ ((__packed__)); + +/* old style backrefs item */ +struct btrfs_extent_ref_v0 { __le64 root; __le64 generation; __le64 objectid; - __le32 num_refs; + __le32 count; } __attribute__ ((__packed__)); + /* dev extents record free space on individual devices. The owner * field points back to the chunk allocation mapping tree that allocated * the extent. The chunk tree uuid field is a way to double check the owner @@ -625,18 +691,35 @@ struct btrfs_space_info { struct rw_semaphore groups_sem; }; -struct btrfs_free_space { - struct rb_node bytes_index; - struct rb_node offset_index; - u64 offset; - u64 bytes; +/* + * free clusters are used to claim free space in relatively large chunks, + * allowing us to do less seeky writes. They are used for all metadata + * allocations and data allocations in ssd mode. + */ +struct btrfs_free_cluster { + spinlock_t lock; + spinlock_t refill_lock; + struct rb_root root; + + /* largest extent in this cluster */ + u64 max_size; + + /* first extent starting offset */ + u64 window_start; + + struct btrfs_block_group_cache *block_group; + /* + * when a cluster is allocated from a block group, we put the + * cluster onto a list in the block group so that it can + * be freed before the block group is freed. + */ + struct list_head block_group_list; }; struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; spinlock_t lock; - struct mutex alloc_mutex; struct mutex cache_mutex; u64 pinned; u64 reserved; @@ -648,6 +731,7 @@ struct btrfs_block_group_cache { struct btrfs_space_info *space_info; /* free space cache stuff */ + spinlock_t tree_lock; struct rb_root free_space_bytes; struct rb_root free_space_offset; @@ -659,14 +743,14 @@ struct btrfs_block_group_cache { /* usage count */ atomic_t count; -}; -struct btrfs_leaf_ref_tree { - struct rb_root root; - struct list_head list; - spinlock_t lock; + /* List of struct btrfs_free_clusters for this block group. + * Today it will only have one thing on it, but that may change + */ + struct list_head cluster_list; }; +struct reloc_control; struct btrfs_device; struct btrfs_fs_devices; struct btrfs_fs_info { @@ -688,15 +772,18 @@ struct btrfs_fs_info { struct rb_root block_group_cache_tree; struct extent_io_tree pinned_extents; - struct extent_io_tree pending_del; - struct extent_io_tree extent_ins; /* logical->physical extent mapping */ struct btrfs_mapping_tree mapping_tree; u64 generation; u64 last_trans_committed; - u64 last_trans_new_blockgroup; + + /* + * this is updated to the current trans every time a full commit + * is required instead of the faster short fsync log commits + */ + u64 last_trans_log_full_commit; u64 open_ioctl_trans; unsigned long mount_opt; u64 max_extent; @@ -717,12 +804,20 @@ struct btrfs_fs_info { struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; - struct mutex extent_ins_mutex; - struct mutex pinned_mutex; struct mutex chunk_mutex; struct mutex drop_mutex; struct mutex volume_mutex; struct mutex tree_reloc_mutex; + + /* + * this protects the ordered operations list only while we are + * processing all of the entries on it. This way we make + * sure the commit code doesn't find the list temporarily empty + * because another function happens to be doing non-waiting preflush + * before jumping into the main commit. + */ + struct mutex ordered_operations_mutex; + struct list_head trans_list; struct list_head hashers; struct list_head dead_roots; @@ -737,10 +832,29 @@ struct btrfs_fs_info { * ordered extents */ spinlock_t ordered_extent_lock; + + /* + * all of the data=ordered extents pending writeback + * these can span multiple transactions and basically include + * every dirty data page that isn't from nodatacow + */ struct list_head ordered_extents; + + /* + * all of the inodes that have delalloc bytes. It is possible for + * this list to be empty even when there is still dirty data=ordered + * extents waiting to finish IO. + */ struct list_head delalloc_inodes; /* + * special rename and truncate targets that must be on disk before + * we're allowed to commit. This is basically the ext3 style + * data=ordered list. + */ + struct list_head ordered_operations; + + /* * there is a pool of worker threads for checksumming during writes * and a pool for checksumming after reads. This is because readers * can run with FS locks held, and the writers may be waiting for @@ -767,20 +881,18 @@ struct btrfs_fs_info { struct task_struct *cleaner_kthread; int thread_pool_size; - /* tree relocation relocated fields */ - struct list_head dead_reloc_roots; - struct btrfs_leaf_ref_tree reloc_ref_tree; - struct btrfs_leaf_ref_tree shared_ref_tree; - struct kobject super_kobj; struct completion kobj_unregister; int do_barriers; int closing; int log_root_recovering; - atomic_t throttles; - atomic_t throttle_gen; u64 total_pinned; + + /* protected by the delalloc lock, used to keep from writing + * metadata until there is a nice batch + */ + u64 dirty_metadata_bytes; struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; @@ -792,11 +904,17 @@ struct btrfs_fs_info { */ struct list_head space_info; + struct reloc_control *reloc_ctl; + spinlock_t delalloc_lock; spinlock_t new_trans_lock; u64 delalloc_bytes; - u64 last_alloc; - u64 last_data_alloc; + + /* data_alloc_cluster is only used in ssd mode */ + struct btrfs_free_cluster data_alloc_cluster; + + /* all metadata allocations go through this cluster */ + struct btrfs_free_cluster meta_alloc_cluster; spinlock_t ref_cache_lock; u64 total_ref_cache_size; @@ -808,6 +926,9 @@ struct btrfs_fs_info { u64 metadata_alloc_profile; u64 system_alloc_profile; + unsigned data_chunk_allocations; + unsigned metadata_ratio; + void *bdev_holder; }; @@ -815,7 +936,6 @@ struct btrfs_fs_info { * in ram representation of the tree. extent_root is used for all allocations * and for the extent tree extent_root root. */ -struct btrfs_dirty_root; struct btrfs_root { struct extent_buffer *node; @@ -823,9 +943,6 @@ struct btrfs_root { spinlock_t node_lock; struct extent_buffer *commit_root; - struct btrfs_leaf_ref_tree *ref_tree; - struct btrfs_leaf_ref_tree ref_tree_struct; - struct btrfs_dirty_root *dirty_root; struct btrfs_root *log_root; struct btrfs_root *reloc_root; @@ -876,10 +993,15 @@ struct btrfs_root { /* the dirty list is only used by non-reference counted roots */ struct list_head dirty_list; + struct list_head root_list; + spinlock_t list_lock; - struct list_head dead_list; struct list_head orphan_list; + spinlock_t inode_lock; + /* red-black tree that keeps track of in-memory inodes */ + struct rb_root inode_tree; + /* * right now this just gets used so that a root has its own devid * for stat. It may be used for more later @@ -888,7 +1010,6 @@ struct btrfs_root { }; /* - * inode items have the data typically returned from stat and store other * info about object characteristics. There is one for every file and dir in * the FS @@ -919,7 +1040,7 @@ struct btrfs_root { #define BTRFS_EXTENT_CSUM_KEY 128 /* - * root items point to tree roots. There are typically in the root + * root items point to tree roots. They are typically in the root * tree used by the super block to find all the other trees */ #define BTRFS_ROOT_ITEM_KEY 132 @@ -942,7 +1063,16 @@ struct btrfs_root { * are used, and how many references there are to each block */ #define BTRFS_EXTENT_ITEM_KEY 168 -#define BTRFS_EXTENT_REF_KEY 180 + +#define BTRFS_TREE_BLOCK_REF_KEY 176 + +#define BTRFS_EXTENT_DATA_REF_KEY 178 + +#define BTRFS_EXTENT_REF_V0_KEY 180 + +#define BTRFS_SHARED_BLOCK_REF_KEY 182 + +#define BTRFS_SHARED_DATA_REF_KEY 184 /* * block groups give us hints into the extent allocation trees. Which @@ -966,6 +1096,10 @@ struct btrfs_root { #define BTRFS_MOUNT_SSD (1 << 3) #define BTRFS_MOUNT_DEGRADED (1 << 4) #define BTRFS_MOUNT_COMPRESS (1 << 5) +#define BTRFS_MOUNT_NOTREELOG (1 << 6) +#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) +#define BTRFS_MOUNT_SSD_SPREAD (1 << 8) +#define BTRFS_MOUNT_NOSSD (1 << 9) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -979,12 +1113,14 @@ struct btrfs_root { #define BTRFS_INODE_READONLY (1 << 2) #define BTRFS_INODE_NOCOMPRESS (1 << 3) #define BTRFS_INODE_PREALLOC (1 << 4) -#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \ - ~BTRFS_INODE_##flag) -#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \ - BTRFS_INODE_##flag) -#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \ - BTRFS_INODE_##flag) +#define BTRFS_INODE_SYNC (1 << 5) +#define BTRFS_INODE_IMMUTABLE (1 << 6) +#define BTRFS_INODE_APPEND (1 << 7) +#define BTRFS_INODE_NODUMP (1 << 8) +#define BTRFS_INODE_NOATIME (1 << 9) +#define BTRFS_INODE_DIRSYNC (1 << 10) + + /* some macros to generate set/get funcs for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: @@ -1240,24 +1376,67 @@ static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) return (u8 *)((unsigned long)dev + ptr); } -/* struct btrfs_extent_ref */ -BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64); -BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64); -BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64); -BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32); +BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); +BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); + +BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32); -BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64); -BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref, - objectid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref, - num_refs, 32); -/* struct btrfs_extent_item */ -BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32); -BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item, - refs, 32); +BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); + +static inline void btrfs_tree_block_key(struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +static inline void btrfs_set_tree_block_key(struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, + root, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, + objectid, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, + offset, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, + count, 32); + +BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, + count, 32); + +BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, + type, 8); +BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, + offset, 64); + +static inline u32 btrfs_extent_inline_ref_size(int type) +{ + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + return sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_SHARED_DATA_REF_KEY) + return sizeof(struct btrfs_shared_data_ref) + + sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_EXTENT_DATA_REF_KEY) + return sizeof(struct btrfs_extent_data_ref) + + offsetof(struct btrfs_extent_inline_ref, offset); + BUG(); + return 0; +} + +BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64); +BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0, + generation, 64); +BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64); +BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32); /* struct btrfs_node */ BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); @@ -1481,6 +1660,21 @@ static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) return (flags & flag) == flag; } +static inline int btrfs_header_backref_rev(struct extent_buffer *eb) +{ + u64 flags = btrfs_header_flags(eb); + return flags >> BTRFS_BACKREF_REV_SHIFT; +} + +static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, + int rev) +{ + u64 flags = btrfs_header_flags(eb); + flags &= ~BTRFS_BACKREF_REV_MASK; + flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; + btrfs_set_header_flags(eb, flags); +} + static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) { unsigned long ptr = offsetof(struct btrfs_header, fsid); @@ -1704,50 +1898,41 @@ static inline struct dentry *fdentry(struct file *file) } /* extent-tree.c */ +void btrfs_put_block_group(struct btrfs_block_group_cache *cache); +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long count); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); -int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u32 *refs); int btrfs_update_pinned_extents(struct btrfs_root *root, u64 bytenr, u64 num, int pin); int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 objectid, u64 bytenr); -int btrfs_extent_post_op(struct btrfs_trans_handle *trans, - struct btrfs_root *root); + struct btrfs_root *root, + u64 objectid, u64 offset, u64 bytenr); int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); struct btrfs_block_group_cache *btrfs_lookup_block_group( struct btrfs_fs_info *info, u64 bytenr); +void btrfs_put_block_group(struct btrfs_block_group_cache *cache); u64 btrfs_find_block_group(struct btrfs_root *root, u64 search_start, u64 search_hint, int owner); struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u32 blocksize, u64 parent, - u64 root_objectid, - u64 ref_generation, - int level, - u64 hint, - u64 empty_size); + struct btrfs_root *root, u32 blocksize, + u64 parent, u64 root_objectid, + struct btrfs_disk_key *key, int level, + u64 hint, u64 empty_size); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, int level); -int btrfs_alloc_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 parent, u64 min_bytes, - u64 root_objectid, u64 ref_generation, - u64 owner, u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, u64 data); -int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins); -int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins); +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, + u64 offset, struct btrfs_key *ins); +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins); int btrfs_reserve_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, @@ -1755,18 +1940,18 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, u64 search_end, struct btrfs_key *ins, u64 data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *orig_buf, struct extent_buffer *buf, - u32 *nr_extents); -int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, u32 nr_extents); -int btrfs_update_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *orig_buf, - struct extent_buffer *buf, int start_slot, int nr); + struct extent_buffer *buf, int full_backref); +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 flags, + int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin); + u64 root_objectid, u64 owner, u64 offset); + int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1774,13 +1959,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid); -int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 orig_parent, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid); + u64 root_objectid, u64 owner, u64 offset); + int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); @@ -1792,16 +1972,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 size); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start); -int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); -int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_drop_dead_reloc_roots(struct btrfs_root *root); -int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf, u64 orig_start); -int btrfs_add_dead_reloc_root(struct btrfs_root *root); -int btrfs_cleanup_reloc_trees(struct btrfs_root *root); -int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); +int btrfs_prepare_block_group_relocation(struct btrfs_root *root, + struct btrfs_block_group_cache *group); + u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); @@ -1816,13 +1989,12 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); /* ctree.c */ +int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot); +int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); -int btrfs_merge_path(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_key *node_keys, - u64 *nodes, int lowest_level); int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *new_key); @@ -1838,11 +2010,13 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret, u64 prealloc_dest); + struct extent_buffer **cow_ret); int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer **cow_ret, u64 new_root_objectid); +int btrfs_block_can_be_shared(struct btrfs_root *root, + struct extent_buffer *buf); int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u32 data_size); int btrfs_truncate_item(struct btrfs_trans_handle *trans, @@ -1869,9 +2043,6 @@ void btrfs_unlock_up_safe(struct btrfs_path *p, int level); int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr); -int btrfs_del_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 bytenr); static inline int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path) @@ -1903,8 +2074,7 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); -int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root - *root); +int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, @@ -1930,8 +2100,9 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct btrfs_root_item *item, struct btrfs_key *key); int btrfs_search_root(struct btrfs_root *root, u64 search_start, u64 *found_objectid); -int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, - struct btrfs_root *latest_root); +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); +int btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node); /* dir-item.c */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, @@ -2060,11 +2231,10 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, unsigned long btrfs_force_ra(struct address_space *mapping, struct file_ra_state *ra, struct file *file, pgoff_t offset, pgoff_t last_index); -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_delete_inode(struct inode *inode); void btrfs_put_inode(struct inode *inode); -void btrfs_read_locked_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, int wait); void btrfs_dirty_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); @@ -2072,12 +2242,8 @@ void btrfs_destroy_inode(struct inode *inode); int btrfs_init_cachep(void); void btrfs_destroy_cachep(void); long btrfs_ioctl_trans_end(struct file *file); -struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, - struct btrfs_root *root, int wait); -struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, - struct btrfs_root *root); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *is_new); + struct btrfs_root *root); int btrfs_commit_write(struct file *file, struct page *page, unsigned from, unsigned to); struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, @@ -2093,6 +2259,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +void btrfs_update_iflags(struct inode *inode); +void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); /* file.c */ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); @@ -2102,7 +2270,8 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode); extern struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 inline_limit, u64 *hint_block); + u64 start, u64 end, u64 locked_end, + u64 inline_limit, u64 *hint_block); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end); @@ -2129,25 +2298,20 @@ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); /* acl.c */ +#ifdef CONFIG_FS_POSIX_ACL int btrfs_check_acl(struct inode *inode, int mask); +#else +#define btrfs_check_acl NULL +#endif int btrfs_init_acl(struct inode *inode, struct inode *dir); int btrfs_acl_chmod(struct inode *inode); -/* free-space-cache.c */ -int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 bytenr, u64 size); -int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes); -int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, - u64 bytenr, u64 size); -int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes); -void btrfs_remove_free_space_cache(struct btrfs_block_group_cache - *block_group); -struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache - *block_group, u64 offset, - u64 bytes); -void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, - u64 bytes); -u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); +/* relocation.c */ +int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_recover_relocation(struct btrfs_root *root); +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); #endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c new file mode 100644 index 000000000000..84e6781413b1 --- /dev/null +++ b/fs/btrfs/delayed-ref.c @@ -0,0 +1,919 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/sort.h> +#include "ctree.h" +#include "delayed-ref.h" +#include "transaction.h" + +/* + * delayed back reference update tracking. For subvolume trees + * we queue up extent allocations and backref maintenance for + * delayed processing. This avoids deep call chains where we + * add extents in the middle of btrfs_search_slot, and it allows + * us to buffer up frequently modified backrefs in an rb tree instead + * of hammering updates on the extent allocation tree. + */ + +/* + * compare two delayed tree backrefs with same bytenr and type + */ +static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, + struct btrfs_delayed_tree_ref *ref1) +{ + if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { + if (ref1->root < ref2->root) + return -1; + if (ref1->root > ref2->root) + return 1; + } else { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } + return 0; +} + +/* + * compare two delayed data backrefs with same bytenr and type + */ +static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, + struct btrfs_delayed_data_ref *ref1) +{ + if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { + if (ref1->root < ref2->root) + return -1; + if (ref1->root > ref2->root) + return 1; + if (ref1->objectid < ref2->objectid) + return -1; + if (ref1->objectid > ref2->objectid) + return 1; + if (ref1->offset < ref2->offset) + return -1; + if (ref1->offset > ref2->offset) + return 1; + } else { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } + return 0; +} + +/* + * entries in the rb tree are ordered by the byte number of the extent, + * type of the delayed backrefs and content of delayed backrefs. + */ +static int comp_entry(struct btrfs_delayed_ref_node *ref2, + struct btrfs_delayed_ref_node *ref1) +{ + if (ref1->bytenr < ref2->bytenr) + return -1; + if (ref1->bytenr > ref2->bytenr) + return 1; + if (ref1->is_head && ref2->is_head) + return 0; + if (ref2->is_head) + return -1; + if (ref1->is_head) + return 1; + if (ref1->type < ref2->type) + return -1; + if (ref1->type > ref2->type) + return 1; + if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || + ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { + return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), + btrfs_delayed_node_to_tree_ref(ref1)); + } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || + ref1->type == BTRFS_SHARED_DATA_REF_KEY) { + return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), + btrfs_delayed_node_to_data_ref(ref1)); + } + BUG(); + return 0; +} + +/* + * insert a new ref into the rbtree. This returns any existing refs + * for the same (bytenr,parent) tuple, or NULL if the new node was properly + * inserted. + */ +static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_delayed_ref_node *entry; + struct btrfs_delayed_ref_node *ins; + int cmp; + + ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, + rb_node); + + cmp = comp_entry(entry, ins); + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * find an head entry based on bytenr. This returns the delayed ref + * head if it was able to find one, or NULL if nothing was in that spot + */ +static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, + u64 bytenr, + struct btrfs_delayed_ref_node **last) +{ + struct rb_node *n = root->rb_node; + struct btrfs_delayed_ref_node *entry; + int cmp; + + while (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + WARN_ON(!entry->in_tree); + if (last) + *last = entry; + + if (bytenr < entry->bytenr) + cmp = -1; + else if (bytenr > entry->bytenr) + cmp = 1; + else if (!btrfs_delayed_ref_is_head(entry)) + cmp = 1; + else + cmp = 0; + + if (cmp < 0) + n = n->rb_left; + else if (cmp > 0) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + assert_spin_locked(&delayed_refs->lock); + if (mutex_trylock(&head->mutex)) + return 0; + + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + mutex_lock(&head->mutex); + spin_lock(&delayed_refs->lock); + if (!head->node.in_tree) { + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + return -EAGAIN; + } + btrfs_put_delayed_ref(&head->node); + return 0; +} + +int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, + struct list_head *cluster, u64 start) +{ + int count = 0; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *node; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_head *head; + + delayed_refs = &trans->transaction->delayed_refs; + if (start == 0) { + node = rb_first(&delayed_refs->root); + } else { + ref = NULL; + find_ref_head(&delayed_refs->root, start, &ref); + if (ref) { + struct btrfs_delayed_ref_node *tmp; + + node = rb_prev(&ref->rb_node); + while (node) { + tmp = rb_entry(node, + struct btrfs_delayed_ref_node, + rb_node); + if (tmp->bytenr < start) + break; + ref = tmp; + node = rb_prev(&ref->rb_node); + } + node = &ref->rb_node; + } else + node = rb_first(&delayed_refs->root); + } +again: + while (node && count < 32) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + if (btrfs_delayed_ref_is_head(ref)) { + head = btrfs_delayed_node_to_head(ref); + if (list_empty(&head->cluster)) { + list_add_tail(&head->cluster, cluster); + delayed_refs->run_delayed_start = + head->node.bytenr; + count++; + + WARN_ON(delayed_refs->num_heads_ready == 0); + delayed_refs->num_heads_ready--; + } else if (count) { + /* the goal of the clustering is to find extents + * that are likely to end up in the same extent + * leaf on disk. So, we don't want them spread + * all over the tree. Stop now if we've hit + * a head that was already in use + */ + break; + } + } + node = rb_next(node); + } + if (count) { + return 0; + } else if (start) { + /* + * we've gone to the end of the rbtree without finding any + * clusters. start from the beginning and try again + */ + start = 0; + node = rb_first(&delayed_refs->root); + goto again; + } + return 1; +} + +/* + * This checks to see if there are any delayed refs in the + * btree for a given bytenr. It returns one if it finds any + * and zero otherwise. + * + * If it only finds a head node, it returns 0. + * + * The idea is to use this when deciding if you can safely delete an + * extent from the extent allocation tree. There may be a pending + * ref in the rbtree that adds or removes references, so as long as this + * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent + * allocation tree. + */ +int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr) +{ + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *prev_node; + int ret = 0; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + ref = find_ref_head(&delayed_refs->root, bytenr, NULL); + if (ref) { + prev_node = rb_prev(&ref->rb_node); + if (!prev_node) + goto out; + ref = rb_entry(prev_node, struct btrfs_delayed_ref_node, + rb_node); + if (ref->bytenr == bytenr) + ret = 1; + } +out: + spin_unlock(&delayed_refs->lock); + return ret; +} + +/* + * helper function to lookup reference count and flags of extent. + * + * the head node for delayed ref is used to store the sum of all the + * reference count modifications queued up in the rbtree. the head + * node may also store the extent flags to set. This way you can check + * to see what the reference count and extent flags would be if all of + * the delayed refs are not processed. + */ +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *refs, u64 *flags) +{ + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct extent_buffer *leaf; + struct btrfs_key key; + u32 item_size; + u64 num_refs; + u64 extent_flags; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + delayed_refs = &trans->transaction->delayed_refs; +again: + ret = btrfs_search_slot(trans, root->fs_info->extent_root, + &key, path, 0, 0); + if (ret < 0) + goto out; + + if (ret == 0) { + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (item_size >= sizeof(*ei)) { + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + num_refs = btrfs_extent_refs(leaf, ei); + extent_flags = btrfs_extent_flags(leaf, ei); + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + struct btrfs_extent_item_v0 *ei0; + BUG_ON(item_size != sizeof(*ei0)); + ei0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item_v0); + num_refs = btrfs_extent_refs_v0(leaf, ei0); + /* FIXME: this isn't correct for data */ + extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; +#else + BUG(); +#endif + } + BUG_ON(num_refs == 0); + } else { + num_refs = 0; + extent_flags = 0; + ret = 0; + } + + spin_lock(&delayed_refs->lock); + ref = find_ref_head(&delayed_refs->root, bytenr, NULL); + if (ref) { + head = btrfs_delayed_node_to_head(ref); + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&ref->refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(root->fs_info->extent_root, path); + + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(ref); + goto again; + } + if (head->extent_op && head->extent_op->update_flags) + extent_flags |= head->extent_op->flags_to_set; + else + BUG_ON(num_refs == 0); + + num_refs += ref->ref_mod; + mutex_unlock(&head->mutex); + } + WARN_ON(num_refs == 0); + if (refs) + *refs = num_refs; + if (flags) + *flags = extent_flags; +out: + spin_unlock(&delayed_refs->lock); + btrfs_free_path(path); + return ret; +} + +/* + * helper function to update an extent delayed ref in the + * rbtree. existing and update must both have the same + * bytenr and parent + * + * This may free existing if the update cancels out whatever + * operation it was doing. + */ +static noinline void +update_existing_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_node *existing, + struct btrfs_delayed_ref_node *update) +{ + if (update->action != existing->action) { + /* + * this is effectively undoing either an add or a + * drop. We decrement the ref_mod, and if it goes + * down to zero we just delete the entry without + * every changing the extent allocation tree. + */ + existing->ref_mod--; + if (existing->ref_mod == 0) { + rb_erase(&existing->rb_node, + &delayed_refs->root); + existing->in_tree = 0; + btrfs_put_delayed_ref(existing); + delayed_refs->num_entries--; + if (trans->delayed_ref_updates) + trans->delayed_ref_updates--; + } else { + WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || + existing->type == BTRFS_SHARED_BLOCK_REF_KEY); + } + } else { + WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || + existing->type == BTRFS_SHARED_BLOCK_REF_KEY); + /* + * the action on the existing ref matches + * the action on the ref we're trying to add. + * Bump the ref_mod by one so the backref that + * is eventually added/removed has the correct + * reference count + */ + existing->ref_mod += update->ref_mod; + } +} + +/* + * helper function to update the accounting in the head ref + * existing and update must have the same bytenr + */ +static noinline void +update_existing_head_ref(struct btrfs_delayed_ref_node *existing, + struct btrfs_delayed_ref_node *update) +{ + struct btrfs_delayed_ref_head *existing_ref; + struct btrfs_delayed_ref_head *ref; + + existing_ref = btrfs_delayed_node_to_head(existing); + ref = btrfs_delayed_node_to_head(update); + BUG_ON(existing_ref->is_data != ref->is_data); + + if (ref->must_insert_reserved) { + /* if the extent was freed and then + * reallocated before the delayed ref + * entries were processed, we can end up + * with an existing head ref without + * the must_insert_reserved flag set. + * Set it again here + */ + existing_ref->must_insert_reserved = ref->must_insert_reserved; + + /* + * update the num_bytes so we make sure the accounting + * is done correctly + */ + existing->num_bytes = update->num_bytes; + + } + + if (ref->extent_op) { + if (!existing_ref->extent_op) { + existing_ref->extent_op = ref->extent_op; + } else { + if (ref->extent_op->update_key) { + memcpy(&existing_ref->extent_op->key, + &ref->extent_op->key, + sizeof(ref->extent_op->key)); + existing_ref->extent_op->update_key = 1; + } + if (ref->extent_op->update_flags) { + existing_ref->extent_op->flags_to_set |= + ref->extent_op->flags_to_set; + existing_ref->extent_op->update_flags = 1; + } + kfree(ref->extent_op); + } + } + /* + * update the reference mod on the head to reflect this new operation + */ + existing->ref_mod += update->ref_mod; +} + +/* + * helper function to actually insert a head node into the rbtree. + * this does all the dirty work in terms of maintaining the correct + * overall modification count. + */ +static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, + int action, int is_data) +{ + struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_ref_head *head_ref = NULL; + struct btrfs_delayed_ref_root *delayed_refs; + int count_mod = 1; + int must_insert_reserved = 0; + + /* + * the head node stores the sum of all the mods, so dropping a ref + * should drop the sum in the head node by one. + */ + if (action == BTRFS_UPDATE_DELAYED_HEAD) + count_mod = 0; + else if (action == BTRFS_DROP_DELAYED_REF) + count_mod = -1; + + /* + * BTRFS_ADD_DELAYED_EXTENT means that we need to update + * the reserved accounting when the extent is finally added, or + * if a later modification deletes the delayed ref without ever + * inserting the extent into the extent allocation tree. + * ref->must_insert_reserved is the flag used to record + * that accounting mods are required. + * + * Once we record must_insert_reserved, switch the action to + * BTRFS_ADD_DELAYED_REF because other special casing is not required. + */ + if (action == BTRFS_ADD_DELAYED_EXTENT) + must_insert_reserved = 1; + else + must_insert_reserved = 0; + + delayed_refs = &trans->transaction->delayed_refs; + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; + ref->num_bytes = num_bytes; + ref->ref_mod = count_mod; + ref->type = 0; + ref->action = 0; + ref->is_head = 1; + ref->in_tree = 1; + + head_ref = btrfs_delayed_node_to_head(ref); + head_ref->must_insert_reserved = must_insert_reserved; + head_ref->is_data = is_data; + + INIT_LIST_HEAD(&head_ref->cluster); + mutex_init(&head_ref->mutex); + + existing = tree_insert(&delayed_refs->root, &ref->rb_node); + + if (existing) { + update_existing_head_ref(existing, ref); + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kfree(ref); + } else { + delayed_refs->num_heads++; + delayed_refs->num_heads_ready++; + delayed_refs->num_entries++; + trans->delayed_ref_updates++; + } + return 0; +} + +/* + * helper to insert a delayed tree ref into the rbtree. + */ +static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, int level, int action) +{ + struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_tree_ref *full_ref; + struct btrfs_delayed_ref_root *delayed_refs; + + if (action == BTRFS_ADD_DELAYED_EXTENT) + action = BTRFS_ADD_DELAYED_REF; + + delayed_refs = &trans->transaction->delayed_refs; + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; + ref->num_bytes = num_bytes; + ref->ref_mod = 1; + ref->action = action; + ref->is_head = 0; + ref->in_tree = 1; + + full_ref = btrfs_delayed_node_to_tree_ref(ref); + if (parent) { + full_ref->parent = parent; + ref->type = BTRFS_SHARED_BLOCK_REF_KEY; + } else { + full_ref->root = ref_root; + ref->type = BTRFS_TREE_BLOCK_REF_KEY; + } + full_ref->level = level; + + existing = tree_insert(&delayed_refs->root, &ref->rb_node); + + if (existing) { + update_existing_ref(trans, delayed_refs, existing, ref); + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kfree(ref); + } else { + delayed_refs->num_entries++; + trans->delayed_ref_updates++; + } + return 0; +} + +/* + * helper to insert a delayed data ref into the rbtree. + */ +static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, u64 owner, u64 offset, + int action) +{ + struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_data_ref *full_ref; + struct btrfs_delayed_ref_root *delayed_refs; + + if (action == BTRFS_ADD_DELAYED_EXTENT) + action = BTRFS_ADD_DELAYED_REF; + + delayed_refs = &trans->transaction->delayed_refs; + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; + ref->num_bytes = num_bytes; + ref->ref_mod = 1; + ref->action = action; + ref->is_head = 0; + ref->in_tree = 1; + + full_ref = btrfs_delayed_node_to_data_ref(ref); + if (parent) { + full_ref->parent = parent; + ref->type = BTRFS_SHARED_DATA_REF_KEY; + } else { + full_ref->root = ref_root; + ref->type = BTRFS_EXTENT_DATA_REF_KEY; + } + full_ref->objectid = owner; + full_ref->offset = offset; + + existing = tree_insert(&delayed_refs->root, &ref->rb_node); + + if (existing) { + update_existing_ref(trans, delayed_refs, existing, ref); + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kfree(ref); + } else { + delayed_refs->num_entries++; + trans->delayed_ref_updates++; + } + return 0; +} + +/* + * add a delayed tree ref. This does all of the accounting required + * to make sure the delayed ref is eventually processed before this + * transaction commits. + */ +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, int level, int action, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_delayed_tree_ref *ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + BUG_ON(extent_op && extent_op->is_data); + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) { + kfree(ref); + return -ENOMEM; + } + + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + /* + * insert both the head node and the new ref without dropping + * the spin lock + */ + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, + action, 0); + BUG_ON(ret); + + ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, level, action); + BUG_ON(ret); + spin_unlock(&delayed_refs->lock); + return 0; +} + +/* + * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. + */ +int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, + u64 owner, u64 offset, int action, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_delayed_data_ref *ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + BUG_ON(extent_op && !extent_op->is_data); + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) { + kfree(ref); + return -ENOMEM; + } + + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + /* + * insert both the head node and the new ref without dropping + * the spin lock + */ + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, + action, 1); + BUG_ON(ret); + + ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, owner, offset, action); + BUG_ON(ret); + spin_unlock(&delayed_refs->lock); + return 0; +} + +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) + return -ENOMEM; + + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, + num_bytes, BTRFS_UPDATE_DELAYED_HEAD, + extent_op->is_data); + BUG_ON(ret); + + spin_unlock(&delayed_refs->lock); + return 0; +} + +/* + * this does a simple search for the head node for a given extent. + * It must be called with the delayed ref spinlock held, and it returns + * the head node if any where found, or NULL if not. + */ +struct btrfs_delayed_ref_head * +btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) +{ + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + ref = find_ref_head(&delayed_refs->root, bytenr, NULL); + if (ref) + return btrfs_delayed_node_to_head(ref); + return NULL; +} + +/* + * add a delayed ref to the tree. This does all of the accounting required + * to make sure the delayed ref is eventually processed before this + * transaction commits. + * + * The main point of this call is to add and remove a backreference in a single + * shot, taking the lock only once, and only searching for the head node once. + * + * It is the same as doing a ref add and delete in two separate calls. + */ +#if 0 +int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 orig_parent, + u64 parent, u64 orig_ref_root, u64 ref_root, + u64 orig_ref_generation, u64 ref_generation, + u64 owner_objectid, int pin) +{ + struct btrfs_delayed_ref *ref; + struct btrfs_delayed_ref *old_ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS); + if (!old_ref) { + kfree(ref); + return -ENOMEM; + } + + /* + * the parent = 0 case comes from cases where we don't actually + * know the parent yet. It will get updated later via a add/drop + * pair. + */ + if (parent == 0) + parent = bytenr; + if (orig_parent == 0) + orig_parent = bytenr; + + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) { + kfree(ref); + kfree(old_ref); + return -ENOMEM; + } + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + /* + * insert both the head node and the new ref without dropping + * the spin lock + */ + ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes, + (u64)-1, 0, 0, 0, + BTRFS_UPDATE_DELAYED_HEAD, 0); + BUG_ON(ret); + + ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, ref_generation, + owner_objectid, BTRFS_ADD_DELAYED_REF, 0); + BUG_ON(ret); + + ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes, + orig_parent, orig_ref_root, + orig_ref_generation, owner_objectid, + BTRFS_DROP_DELAYED_REF, pin); + BUG_ON(ret); + spin_unlock(&delayed_refs->lock); + return 0; +} +#endif diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h new file mode 100644 index 000000000000..f6fc67ddad36 --- /dev/null +++ b/fs/btrfs/delayed-ref.h @@ -0,0 +1,214 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#ifndef __DELAYED_REF__ +#define __DELAYED_REF__ + +/* these are the possible values of struct btrfs_delayed_ref->action */ +#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ +#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ +#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ +#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ + +struct btrfs_delayed_ref_node { + struct rb_node rb_node; + + /* the starting bytenr of the extent */ + u64 bytenr; + + /* the size of the extent */ + u64 num_bytes; + + /* ref count on this data structure */ + atomic_t refs; + + /* + * how many refs is this entry adding or deleting. For + * head refs, this may be a negative number because it is keeping + * track of the total mods done to the reference count. + * For individual refs, this will always be a positive number + * + * It may be more than one, since it is possible for a single + * parent to have more than one ref on an extent + */ + int ref_mod; + + unsigned int action:8; + unsigned int type:8; + /* is this node still in the rbtree? */ + unsigned int is_head:1; + unsigned int in_tree:1; +}; + +struct btrfs_delayed_extent_op { + struct btrfs_disk_key key; + u64 flags_to_set; + unsigned int update_key:1; + unsigned int update_flags:1; + unsigned int is_data:1; +}; + +/* + * the head refs are used to hold a lock on a given extent, which allows us + * to make sure that only one process is running the delayed refs + * at a time for a single extent. They also store the sum of all the + * reference count modifications we've queued up. + */ +struct btrfs_delayed_ref_head { + struct btrfs_delayed_ref_node node; + + /* + * the mutex is held while running the refs, and it is also + * held when checking the sum of reference modifications. + */ + struct mutex mutex; + + struct list_head cluster; + + struct btrfs_delayed_extent_op *extent_op; + /* + * when a new extent is allocated, it is just reserved in memory + * The actual extent isn't inserted into the extent allocation tree + * until the delayed ref is processed. must_insert_reserved is + * used to flag a delayed ref so the accounting can be updated + * when a full insert is done. + * + * It is possible the extent will be freed before it is ever + * inserted into the extent allocation tree. In this case + * we need to update the in ram accounting to properly reflect + * the free has happened. + */ + unsigned int must_insert_reserved:1; + unsigned int is_data:1; +}; + +struct btrfs_delayed_tree_ref { + struct btrfs_delayed_ref_node node; + union { + u64 root; + u64 parent; + }; + int level; +}; + +struct btrfs_delayed_data_ref { + struct btrfs_delayed_ref_node node; + union { + u64 root; + u64 parent; + }; + u64 objectid; + u64 offset; +}; + +struct btrfs_delayed_ref_root { + struct rb_root root; + + /* this spin lock protects the rbtree and the entries inside */ + spinlock_t lock; + + /* how many delayed ref updates we've queued, used by the + * throttling code + */ + unsigned long num_entries; + + /* total number of head nodes in tree */ + unsigned long num_heads; + + /* total number of head nodes ready for processing */ + unsigned long num_heads_ready; + + /* + * set when the tree is flushing before a transaction commit, + * used by the throttling code to decide if new updates need + * to be run right away + */ + int flushing; + + u64 run_delayed_start; +}; + +static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) +{ + WARN_ON(atomic_read(&ref->refs) == 0); + if (atomic_dec_and_test(&ref->refs)) { + WARN_ON(ref->in_tree); + kfree(ref); + } +} + +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, int level, int action, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, + u64 owner, u64 offset, int action, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + struct btrfs_delayed_extent_op *extent_op); + +struct btrfs_delayed_ref_head * +btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); +int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *refs, u64 *flags); +int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 orig_parent, + u64 parent, u64 orig_ref_root, u64 ref_root, + u64 orig_ref_generation, u64 ref_generation, + u64 owner_objectid, int pin); +int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head); +int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, + struct list_head *cluster, u64 search_start); +/* + * a node might live in a head or a regular ref, this lets you + * test for the proper type to use. + */ +static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) +{ + return node->is_head; +} + +/* + * helper functions to cast a node into its container + */ +static inline struct btrfs_delayed_tree_ref * +btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_tree_ref, node); +} + +static inline struct btrfs_delayed_data_ref * +btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_data_ref, node); +} + +static inline struct btrfs_delayed_ref_head * +btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(!btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_ref_head, node); +} +#endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 926a0b287a7d..1d70236ba00c 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -145,7 +145,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root key.objectid = dir; btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); key.offset = btrfs_name_hash(name, name_len); + path = btrfs_alloc_path(); + path->leave_spinning = 1; + data_size = sizeof(*dir_item) + name_len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, name, name_len); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3e18175248e0..d28d29c95f7c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -26,8 +26,8 @@ #include <linux/workqueue.h> #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/crc32c.h> #include "compat.h" -#include "crc32c.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -36,12 +36,14 @@ #include "print-tree.h" #include "async-thread.h" #include "locking.h" -#include "ref-cache.h" #include "tree-log.h" +#include "free-space-cache.h" static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); +static atomic_t btrfs_bdi_num = ATOMIC_INIT(0); + /* * end_io_wq structs are used to do processing in task context when an IO is * complete. This is used during reads to verify checksums, and it is used @@ -171,7 +173,7 @@ out: u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) { - return btrfs_crc32c(seed, data, len); + return crc32c(seed, data, len); } void btrfs_csum_final(u32 crc, char *result) @@ -231,10 +233,14 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - printk(KERN_INFO "btrfs: %s checksum verify failed " - "on %llu wanted %X found %X level %d\n", - root->fs_info->sb->s_id, - buf->start, val, found, btrfs_header_level(buf)); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs: %s checksum verify " + "failed on %llu wanted %X found %X " + "level %d\n", + root->fs_info->sb->s_id, + (unsigned long long)buf->start, val, found, + btrfs_header_level(buf)); + } if (result != (char *)&inline_result) kfree(result); return 1; @@ -267,10 +273,13 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 0; goto out; } - printk("parent transid verify failed on %llu wanted %llu found %llu\n", - (unsigned long long)eb->start, - (unsigned long long)parent_transid, - (unsigned long long)btrfs_header_generation(eb)); + if (printk_ratelimit()) { + printk("parent transid verify failed on %llu wanted %llu " + "found %llu\n", + (unsigned long long)eb->start, + (unsigned long long)parent_transid, + (unsigned long long)btrfs_header_generation(eb)); + } ret = 1; clear_extent_buffer_uptodate(io_tree, eb); out: @@ -414,9 +423,12 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, found_start = btrfs_header_bytenr(eb); if (found_start != start) { - printk(KERN_INFO "btrfs bad tree block start %llu %llu\n", - (unsigned long long)found_start, - (unsigned long long)eb->start); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs bad tree block start " + "%llu %llu\n", + (unsigned long long)found_start, + (unsigned long long)eb->start); + } ret = -EIO; goto err; } @@ -428,8 +440,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, goto err; } if (check_tree_block_fsid(root, eb)) { - printk(KERN_INFO "btrfs bad fsid on block %llu\n", - (unsigned long long)eb->start); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs bad fsid on block %llu\n", + (unsigned long long)eb->start); + } ret = -EIO; goto err; } @@ -578,19 +592,12 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->bio_flags = bio_flags; atomic_inc(&fs_info->nr_async_submits); + + if (rw & (1 << BIO_RW_SYNCIO)) + btrfs_set_work_high_prio(&async->work); + btrfs_queue_worker(&fs_info->workers, &async->work); -#if 0 - int limit = btrfs_async_submit_limit(fs_info); - if (atomic_read(&fs_info->nr_async_submits) > limit) { - wait_event_timeout(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) < limit), - HZ/10); - wait_event_timeout(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_bios) < limit), - HZ/10); - } -#endif while (atomic_read(&fs_info->async_submit_draining) && atomic_read(&fs_info->nr_async_submits)) { wait_event(fs_info->async_submit_wait, @@ -655,6 +662,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 0); } + /* * kthread helpers are used to submit writes so that checksumming * can happen in parallel across all CPUs @@ -668,14 +676,31 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, static int btree_writepage(struct page *page, struct writeback_control *wbc) { struct extent_io_tree *tree; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct extent_buffer *eb; + int was_dirty; + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (!(current->flags & PF_MEMALLOC)) { + return extent_write_full_page(tree, page, + btree_get_extent, wbc); + } - if (current->flags & PF_MEMALLOC) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; + redirty_page_for_writepage(wbc, page); + eb = btrfs_find_tree_block(root, page_offset(page), + PAGE_CACHE_SIZE); + WARN_ON(!eb); + + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + if (!was_dirty) { + spin_lock(&root->fs_info->delalloc_lock); + root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE; + spin_unlock(&root->fs_info->delalloc_lock); } - return extent_write_full_page(tree, page, btree_get_extent, wbc); + free_extent_buffer(eb); + + unlock_page(page); + return 0; } static int btree_writepages(struct address_space *mapping, @@ -684,15 +709,15 @@ static int btree_writepages(struct address_space *mapping, struct extent_io_tree *tree; tree = &BTRFS_I(mapping->host)->io_tree; if (wbc->sync_mode == WB_SYNC_NONE) { + struct btrfs_root *root = BTRFS_I(mapping->host)->root; u64 num_dirty; - u64 start = 0; unsigned long thresh = 32 * 1024 * 1024; if (wbc->for_kupdate) return 0; - num_dirty = count_range_bits(tree, &start, (u64)-1, - thresh, EXTENT_DIRTY); + /* this is a bit racy, but that's ok */ + num_dirty = root->fs_info->dirty_metadata_bytes; if (num_dirty < thresh) return 0; } @@ -747,27 +772,6 @@ static void btree_invalidatepage(struct page *page, unsigned long offset) } } -#if 0 -static int btree_writepage(struct page *page, struct writeback_control *wbc) -{ - struct buffer_head *bh; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct buffer_head *head; - if (!page_has_buffers(page)) { - create_empty_buffers(page, root->fs_info->sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); - } - head = page_buffers(page); - bh = head; - do { - if (buffer_dirty(bh)) - csum_tree_block(root, bh, 0); - bh = bh->b_this_page; - } while (bh != head); - return block_write_full_page(page, btree_get_block, wbc); -} -#endif - static struct address_space_operations btree_aops = { .readpage = btree_readpage, .writepage = btree_writepage, @@ -845,8 +849,6 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, if (ret == 0) set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); - else - WARN_ON(1); return buf; } @@ -859,9 +861,17 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, root->fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); - /* ugh, clear_extent_buffer_dirty can be expensive */ - btrfs_set_lock_blocking(buf); + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { + spin_lock(&root->fs_info->delalloc_lock); + if (root->fs_info->dirty_metadata_bytes >= buf->len) + root->fs_info->dirty_metadata_bytes -= buf->len; + else + WARN_ON(1); + spin_unlock(&root->fs_info->delalloc_lock); + } + /* ugh, clear_extent_buffer_dirty needs to lock the page */ + btrfs_set_lock_blocking(buf); clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); } @@ -875,7 +885,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, { root->node = NULL; root->commit_root = NULL; - root->ref_tree = NULL; root->sectorsize = sectorsize; root->nodesize = nodesize; root->leafsize = leafsize; @@ -890,12 +899,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->last_inode_alloc = 0; root->name = NULL; root->in_sysfs = 0; + root->inode_tree.rb_node = NULL; INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->orphan_list); - INIT_LIST_HEAD(&root->dead_list); + INIT_LIST_HEAD(&root->root_list); spin_lock_init(&root->node_lock); spin_lock_init(&root->list_lock); + spin_lock_init(&root->inode_lock); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); init_waitqueue_head(&root->log_writer_wait); @@ -909,9 +920,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, extent_io_tree_init(&root->dirty_log_pages, fs_info->btree_inode->i_mapping, GFP_NOFS); - btrfs_leaf_ref_tree_init(&root->ref_tree_struct); - root->ref_tree = &root->ref_tree_struct; - memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); @@ -950,6 +958,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root, blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); + root->commit_root = btrfs_root_node(root); BUG_ON(!root->node); return 0; } @@ -1016,20 +1025,19 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, */ root->ref_cows = 0; - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, BTRFS_TREE_LOG_OBJECTID, - trans->transid, 0, 0, 0); + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); if (IS_ERR(leaf)) { kfree(root); return ERR_CAST(leaf); } + memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); root->node = leaf; - btrfs_set_header_nritems(root->node, 0); - btrfs_set_header_level(root->node, 0); - btrfs_set_header_bytenr(root->node, root->node->start); - btrfs_set_header_generation(root->node, trans->transid); - btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID); write_extent_buffer(root->node, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(root->node), @@ -1072,8 +1080,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, inode_item->nbytes = cpu_to_le64(root->leafsize); inode_item->mode = cpu_to_le32(S_IFDIR | 0755); - btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start); - btrfs_set_root_generation(&log_root->root_item, trans->transid); + btrfs_set_root_node(&log_root->root_item, log_root->node); WARN_ON(root->log_root); root->log_root = log_root; @@ -1135,6 +1142,7 @@ out: blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); + root->commit_root = btrfs_root_node(root); BUG_ON(!root->node); insert: if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { @@ -1201,7 +1209,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, } if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_find_dead_roots(fs_info->tree_root, - root->root_key.objectid, root); + root->root_key.objectid); BUG_ON(ret); btrfs_orphan_cleanup(root); } @@ -1247,11 +1255,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) int ret = 0; struct btrfs_device *device; struct backing_dev_info *bdi; -#if 0 - if ((bdi_bits & (1 << BDI_write_congested)) && - btrfs_congested_async(info, 0)) - return 1; -#endif + list_for_each_entry(device, &info->fs_devices->devices, dev_list) { if (!device->bdev) continue; @@ -1340,12 +1344,25 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) free_extent_map(em); } +/* + * If this fails, caller must call bdi_destroy() to get rid of the + * bdi again. + */ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) { - bdi_init(bdi); + int err; + + bdi->capabilities = BDI_CAP_MAP_COPY; + err = bdi_init(bdi); + if (err) + return err; + + err = bdi_register(bdi, NULL, "btrfs-%d", + atomic_inc_return(&btrfs_bdi_num)); + if (err) + return err; + bdi->ra_pages = default_backing_dev_info.ra_pages; - bdi->state = 0; - bdi->capabilities = default_backing_dev_info.capabilities; bdi->unplug_io_fn = btrfs_unplug_io_fn; bdi->unplug_io_data = info; bdi->congested_fn = btrfs_congested_fn; @@ -1387,8 +1404,6 @@ static int bio_ready_for_csum(struct bio *bio) ret = extent_range_uptodate(io_tree, start + length, start + buf_len - 1); - if (ret == 1) - return ret; return ret; } @@ -1471,12 +1486,6 @@ static int transaction_kthread(void *arg) vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); mutex_lock(&root->fs_info->transaction_kthread_mutex); - if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) { - printk(KERN_INFO "btrfs: total reference cache " - "size %llu\n", - root->fs_info->total_ref_cache_size); - } - mutex_lock(&root->fs_info->trans_mutex); cur = root->fs_info->running_transaction; if (!cur) { @@ -1493,6 +1502,7 @@ static int transaction_kthread(void *arg) mutex_unlock(&root->fs_info->trans_mutex); trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); + sleep: wake_up_process(root->fs_info->cleaner_kthread); mutex_unlock(&root->fs_info->transaction_kthread_mutex); @@ -1552,6 +1562,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->hashers); INIT_LIST_HEAD(&fs_info->delalloc_inodes); + INIT_LIST_HEAD(&fs_info->ordered_operations); spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->new_trans_lock); spin_lock_init(&fs_info->ref_cache_lock); @@ -1570,15 +1581,15 @@ struct btrfs_root *open_ctree(struct super_block *sb, atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); atomic_set(&fs_info->nr_async_bios, 0); - atomic_set(&fs_info->throttles, 0); - atomic_set(&fs_info->throttle_gen, 0); fs_info->sb = sb; fs_info->max_extent = (u64)-1; fs_info->max_inline = 8192 * 1024; - setup_bdi(fs_info, &fs_info->bdi); + if (setup_bdi(fs_info, &fs_info->bdi)) + goto fail_bdi; fs_info->btree_inode = new_inode(sb); fs_info->btree_inode->i_ino = 1; fs_info->btree_inode->i_nlink = 1; + fs_info->metadata_ratio = 8; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); @@ -1598,6 +1609,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->btree_inode->i_mapping->a_ops = &btree_aops; fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; + RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -1611,31 +1623,26 @@ struct btrfs_root *open_ctree(struct super_block *sb, extent_io_tree_init(&fs_info->pinned_extents, fs_info->btree_inode->i_mapping, GFP_NOFS); - extent_io_tree_init(&fs_info->pending_del, - fs_info->btree_inode->i_mapping, GFP_NOFS); - extent_io_tree_init(&fs_info->extent_ins, - fs_info->btree_inode->i_mapping, GFP_NOFS); fs_info->do_barriers = 1; - INIT_LIST_HEAD(&fs_info->dead_reloc_roots); - btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree); - btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree); - BTRFS_I(fs_info->btree_inode)->root = tree_root; memset(&BTRFS_I(fs_info->btree_inode)->location, 0, sizeof(struct btrfs_key)); insert_inode_hash(fs_info->btree_inode); mutex_init(&fs_info->trans_mutex); + mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->drop_mutex); - mutex_init(&fs_info->extent_ins_mutex); - mutex_init(&fs_info->pinned_mutex); mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); mutex_init(&fs_info->tree_reloc_mutex); + + btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); + btrfs_init_free_cluster(&fs_info->data_alloc_cluster); + init_waitqueue_head(&fs_info->transaction_throttle); init_waitqueue_head(&fs_info->transaction_wait); init_waitqueue_head(&fs_info->async_submit_wait); @@ -1670,17 +1677,23 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (features) { printk(KERN_ERR "BTRFS: couldn't mount because of " "unsupported optional features (%Lx).\n", - features); + (unsigned long long)features); err = -EINVAL; goto fail_iput; } + features = btrfs_super_incompat_flags(disk_super); + if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) { + features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; + btrfs_set_super_incompat_flags(disk_super, features); + } + features = btrfs_super_compat_ro_flags(disk_super) & ~BTRFS_FEATURE_COMPAT_RO_SUPP; if (!(sb->s_flags & MS_RDONLY) && features) { printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " "unsupported option features (%Lx).\n", - features); + (unsigned long long)features); err = -EINVAL; goto fail_iput; } @@ -1772,7 +1785,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (ret) { printk(KERN_WARNING "btrfs: failed to read the system " "array on %s\n", sb->s_id); - goto fail_sys_array; + goto fail_sb_buffer; } blocksize = btrfs_level_size(tree_root, @@ -1786,6 +1799,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_super_chunk_root(disk_super), blocksize, generation); BUG_ON(!chunk_root->node); + btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); + chunk_root->commit_root = btrfs_root_node(chunk_root); read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), @@ -1811,7 +1826,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, blocksize, generation); if (!tree_root->node) goto fail_chunk_root; - + btrfs_set_root_node(&tree_root->root_item, tree_root->node); + tree_root->commit_root = btrfs_root_node(tree_root); ret = find_and_setup_root(tree_root, fs_info, BTRFS_EXTENT_TREE_OBJECTID, extent_root); @@ -1821,14 +1837,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, ret = find_and_setup_root(tree_root, fs_info, BTRFS_DEV_TREE_OBJECTID, dev_root); - dev_root->track_dirty = 1; if (ret) goto fail_extent_root; + dev_root->track_dirty = 1; ret = find_and_setup_root(tree_root, fs_info, BTRFS_CSUM_TREE_OBJECTID, csum_root); if (ret) - goto fail_extent_root; + goto fail_dev_root; csum_root->track_dirty = 1; @@ -1850,6 +1866,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (IS_ERR(fs_info->transaction_kthread)) goto fail_cleaner; + if (!btrfs_test_opt(tree_root, SSD) && + !btrfs_test_opt(tree_root, NOSSD) && + !fs_info->fs_devices->rotating) { + printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD " + "mode\n"); + btrfs_set_opt(fs_info->mount_opt, SSD); + } + if (btrfs_super_log_root(disk_super) != 0) { u64 bytenr = btrfs_super_log_root(disk_super); @@ -1882,7 +1906,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, } if (!(sb->s_flags & MS_RDONLY)) { - ret = btrfs_cleanup_reloc_trees(tree_root); + ret = btrfs_recover_relocation(tree_root); BUG_ON(ret); } @@ -1893,6 +1917,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); if (!fs_info->fs_root) goto fail_trans_kthread; + return tree_root; fail_trans_kthread: @@ -1909,14 +1934,19 @@ fail_cleaner: fail_csum_root: free_extent_buffer(csum_root->node); + free_extent_buffer(csum_root->commit_root); +fail_dev_root: + free_extent_buffer(dev_root->node); + free_extent_buffer(dev_root->commit_root); fail_extent_root: free_extent_buffer(extent_root->node); + free_extent_buffer(extent_root->commit_root); fail_tree_root: free_extent_buffer(tree_root->node); + free_extent_buffer(tree_root->commit_root); fail_chunk_root: free_extent_buffer(chunk_root->node); -fail_sys_array: - free_extent_buffer(dev_root->node); + free_extent_buffer(chunk_root->commit_root); fail_sb_buffer: btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); @@ -1932,8 +1962,8 @@ fail_iput: btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); +fail_bdi: bdi_destroy(&fs_info->bdi); - fail: kfree(extent_root); kfree(tree_root); @@ -2006,6 +2036,17 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) return latest; } +/* + * this should be called twice, once with wait == 0 and + * once with wait == 1. When wait == 0 is done, all the buffer heads + * we write are pinned. + * + * They are released when wait == 1 is done. + * max_mirrors must be the same for both runs, and it indicates how + * many supers on this one device should be written. + * + * max_mirrors == 0 means to write them all. + */ static int write_dev_supers(struct btrfs_device *device, struct btrfs_super_block *sb, int do_barriers, int wait, int max_mirrors) @@ -2041,12 +2082,16 @@ static int write_dev_supers(struct btrfs_device *device, bh = __find_get_block(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); BUG_ON(!bh); - brelse(bh); wait_on_buffer(bh); - if (buffer_uptodate(bh)) { - brelse(bh); - continue; - } + if (!buffer_uptodate(bh)) + errors++; + + /* drop our reference */ + brelse(bh); + + /* drop the reference from the wait == 0 run */ + brelse(bh); + continue; } else { btrfs_set_super_bytenr(sb, bytenr); @@ -2057,12 +2102,18 @@ static int write_dev_supers(struct btrfs_device *device, BTRFS_CSUM_SIZE); btrfs_csum_final(crc, sb->csum); + /* + * one reference for us, and we leave it for the + * caller + */ bh = __getblk(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); - set_buffer_uptodate(bh); + /* one reference for submit_bh */ get_bh(bh); + + set_buffer_uptodate(bh); lock_buffer(bh); bh->b_end_io = btrfs_end_buffer_write_sync; } @@ -2074,30 +2125,24 @@ static int write_dev_supers(struct btrfs_device *device, device->name); set_buffer_uptodate(bh); device->barriers = 0; + /* one reference for submit_bh */ get_bh(bh); lock_buffer(bh); - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); } } else { - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); } - if (!ret && wait) { - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - errors++; - } else if (ret) { + if (ret) errors++; - } - if (wait) - brelse(bh); } return errors < i ? 0 : -1; } int write_all_supers(struct btrfs_root *root, int max_mirrors) { - struct list_head *head = &root->fs_info->fs_devices->devices; + struct list_head *head; struct btrfs_device *dev; struct btrfs_super_block *sb; struct btrfs_dev_item *dev_item; @@ -2112,6 +2157,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) sb = &root->fs_info->super_for_commit; dev_item = &sb->dev_item; + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + head = &root->fs_info->fs_devices->devices; list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) { total_errors++; @@ -2155,6 +2203,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) if (ret) total_errors++; } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (total_errors > max_errors) { printk(KERN_ERR "btrfs: %d errors while writing supers\n", total_errors); @@ -2174,6 +2223,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { + WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); radix_tree_delete(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid); if (root->anon_super.s_dev) { @@ -2220,10 +2270,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) ARRAY_SIZE(gang)); if (!ret) break; + + root_objectid = gang[ret - 1]->root_key.objectid + 1; for (i = 0; i < ret; i++) { root_objectid = gang[i]->root_key.objectid; ret = btrfs_find_dead_roots(fs_info->tree_root, - root_objectid, gang[i]); + root_objectid); BUG_ON(ret); btrfs_orphan_cleanup(gang[i]); } @@ -2272,27 +2324,23 @@ int close_ctree(struct btrfs_root *root) if (fs_info->delalloc_bytes) { printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", - fs_info->delalloc_bytes); + (unsigned long long)fs_info->delalloc_bytes); } if (fs_info->total_ref_cache_size) { printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", (unsigned long long)fs_info->total_ref_cache_size); } - if (fs_info->extent_root->node) - free_extent_buffer(fs_info->extent_root->node); - - if (fs_info->tree_root->node) - free_extent_buffer(fs_info->tree_root->node); - - if (root->fs_info->chunk_root->node) - free_extent_buffer(root->fs_info->chunk_root->node); - - if (root->fs_info->dev_root->node) - free_extent_buffer(root->fs_info->dev_root->node); - - if (root->fs_info->csum_root->node) - free_extent_buffer(root->fs_info->csum_root->node); + free_extent_buffer(fs_info->extent_root->node); + free_extent_buffer(fs_info->extent_root->commit_root); + free_extent_buffer(fs_info->tree_root->node); + free_extent_buffer(fs_info->tree_root->commit_root); + free_extent_buffer(root->fs_info->chunk_root->node); + free_extent_buffer(root->fs_info->chunk_root->commit_root); + free_extent_buffer(root->fs_info->dev_root->node); + free_extent_buffer(root->fs_info->dev_root->commit_root); + free_extent_buffer(root->fs_info->csum_root->node); + free_extent_buffer(root->fs_info->csum_root->commit_root); btrfs_free_block_groups(root->fs_info); @@ -2309,16 +2357,6 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); -#if 0 - while (!list_empty(&fs_info->hashers)) { - struct btrfs_hasher *hasher; - hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher, - hashers); - list_del(&hasher->hashers); - crypto_free_hash(&fs_info->hash_tfm); - kfree(hasher); - } -#endif btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -2358,8 +2396,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; u64 transid = btrfs_header_generation(buf); struct inode *btree_inode = root->fs_info->btree_inode; - - btrfs_set_lock_blocking(buf); + int was_dirty; btrfs_assert_tree_locked(buf); if (transid != root->fs_info->generation) { @@ -2370,7 +2407,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) (unsigned long long)root->fs_info->generation); WARN_ON(1); } - set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); + was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, + buf); + if (!was_dirty) { + spin_lock(&root->fs_info->delalloc_lock); + root->fs_info->dirty_metadata_bytes += buf->len; + spin_unlock(&root->fs_info->delalloc_lock); + } } void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) @@ -2379,17 +2422,14 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) * looks as though older kernels can get into trouble with * this code, they end up stuck in balance_dirty_pages forever */ - struct extent_io_tree *tree; u64 num_dirty; - u64 start = 0; unsigned long thresh = 32 * 1024 * 1024; - tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; - if (current_is_pdflush() || current->flags & PF_MEMALLOC) + if (current->flags & PF_MEMALLOC) return; - num_dirty = count_range_bits(tree, &start, (u64)-1, - thresh, EXTENT_DIRTY); + num_dirty = root->fs_info->dirty_metadata_bytes; + if (num_dirty > thresh) { balance_dirty_pages_ratelimited_nr( root->fs_info->btree_inode->i_mapping, 1); @@ -2410,6 +2450,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) int btree_lock_page_hook(struct page *page) { struct inode *inode = page->mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_buffer *eb; unsigned long len; @@ -2425,6 +2466,16 @@ int btree_lock_page_hook(struct page *page) btrfs_tree_lock(eb); btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + spin_lock(&root->fs_info->delalloc_lock); + if (root->fs_info->dirty_metadata_bytes >= eb->len) + root->fs_info->dirty_metadata_bytes -= eb->len; + else + WARN_ON(1); + spin_unlock(&root->fs_info->delalloc_lock); + } + btrfs_tree_unlock(eb); free_extent_buffer(eb); out: diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 95029db227be..c958ecbc1916 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -72,6 +72,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root, void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); +void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); int btrfs_set_buffer_uptodate(struct extent_buffer *buf); int wait_on_tree_block_writeback(struct btrfs_root *root, diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 85315d2c90de..9596b40caa4e 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -78,7 +78,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; - inode = btrfs_iget(sb, &key, root, NULL); + inode = btrfs_iget(sb, &key, root); if (IS_ERR(inode)) return (void *)inode; @@ -192,7 +192,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child) btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; - return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); + return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); } const struct export_operations btrfs_export_ops = { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fefe83ad2059..a5aca3997d42 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -23,43 +23,39 @@ #include <linux/rcupdate.h> #include "compat.h" #include "hash.h" -#include "crc32c.h" #include "ctree.h" #include "disk-io.h" #include "print-tree.h" #include "transaction.h" #include "volumes.h" #include "locking.h" -#include "ref-cache.h" +#include "free-space-cache.h" -#define PENDING_EXTENT_INSERT 0 -#define PENDING_EXTENT_DELETE 1 -#define PENDING_BACKREF_UPDATE 2 - -struct pending_extent_op { - int type; - u64 bytenr; - u64 num_bytes; - u64 parent; - u64 orig_parent; - u64 generation; - u64 orig_generation; - int level; - struct list_head list; - int del; -}; - -static int finish_current_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, int all); -static int del_pending_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, int all); -static int pin_down_bytes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int is_data); +static int update_reserved_extents(struct btrfs_root *root, + u64 bytenr, u64 num, int reserve); static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc, int mark_free); +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner_objectid, + u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_extent_op *extra_op); +static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, + struct extent_buffer *leaf, + struct btrfs_extent_item *ei); +static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, u64 owner, u64 offset, + struct btrfs_key *ins, int ref_mod); +static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, struct btrfs_disk_key *key, + int level, struct btrfs_key *ins); static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, @@ -160,7 +156,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group, u64 extent_start, extent_end, size; int ret; - mutex_lock(&info->pinned_mutex); while (start < end) { ret = find_first_extent_bit(&info->pinned_extents, start, &extent_start, &extent_end, @@ -186,7 +181,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group, ret = btrfs_add_free_space(block_group, start, size); BUG_ON(ret); } - mutex_unlock(&info->pinned_mutex); return 0; } @@ -285,8 +279,8 @@ next: block_group->key.objectid + block_group->key.offset); - remove_sb_from_cache(root, block_group); block_group->cached = 1; + remove_sb_from_cache(root, block_group); ret = 0; err: btrfs_free_path(path); @@ -307,7 +301,7 @@ btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) } /* - * return the block group that contains teh given bytenr + * return the block group that contains the given bytenr */ struct btrfs_block_group_cache *btrfs_lookup_block_group( struct btrfs_fs_info *info, @@ -320,7 +314,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( return cache; } -static inline void put_block_group(struct btrfs_block_group_cache *cache) +void btrfs_put_block_group(struct btrfs_block_group_cache *cache) { if (atomic_dec_and_test(&cache->count)) kfree(cache); @@ -393,12 +387,12 @@ again: div_factor(cache->key.offset, factor)) { group_start = cache->key.objectid; spin_unlock(&cache->lock); - put_block_group(cache); + btrfs_put_block_group(cache); goto found; } } spin_unlock(&cache->lock); - put_block_group(cache); + btrfs_put_block_group(cache); cond_resched(); } if (!wrapped) { @@ -448,442 +442,965 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) * maintenance. This is actually the same as #2, but with a slightly * different use case. * + * There are two kinds of back refs. The implicit back refs is optimized + * for pointers in non-shared tree blocks. For a given pointer in a block, + * back refs of this kind provide information about the block's owner tree + * and the pointer's key. These information allow us to find the block by + * b-tree searching. The full back refs is for pointers in tree blocks not + * referenced by their owner trees. The location of tree block is recorded + * in the back refs. Actually the full back refs is generic, and can be + * used in all cases the implicit back refs is used. The major shortcoming + * of the full back refs is its overhead. Every time a tree block gets + * COWed, we have to update back refs entry for all pointers in it. + * + * For a newly allocated tree block, we use implicit back refs for + * pointers in it. This means most tree related operations only involve + * implicit back refs. For a tree block created in old transaction, the + * only way to drop a reference to it is COW it. So we can detect the + * event that tree block loses its owner tree's reference and do the + * back refs conversion. + * + * When a tree block is COW'd through a tree, there are four cases: + * + * The reference count of the block is one and the tree is the block's + * owner tree. Nothing to do in this case. + * + * The reference count of the block is one and the tree is not the + * block's owner tree. In this case, full back refs is used for pointers + * in the block. Remove these full back refs, add implicit back refs for + * every pointers in the new block. + * + * The reference count of the block is greater than one and the tree is + * the block's owner tree. In this case, implicit back refs is used for + * pointers in the block. Add full back refs for every pointers in the + * block, increase lower level extents' reference counts. The original + * implicit back refs are entailed to the new block. + * + * The reference count of the block is greater than one and the tree is + * not the block's owner tree. Add implicit back refs for every pointer in + * the new block, increase lower level extents' reference count. + * + * Back Reference Key composing: + * + * The key objectid corresponds to the first byte in the extent, + * The key type is used to differentiate between types of back refs. + * There are different meanings of the key offset for different types + * of back refs. + * * File extents can be referenced by: * * - multiple snapshots, subvolumes, or different generations in one subvol * - different files inside a single subvolume * - different offsets inside a file (bookend extents in file.c) * - * The extent ref structure has fields for: + * The extent ref structure for the implicit back refs has fields for: * * - Objectid of the subvolume root - * - Generation number of the tree holding the reference * - objectid of the file holding the reference - * - number of references holding by parent node (alway 1 for tree blocks) - * - * Btree leaf may hold multiple references to a file extent. In most cases, - * these references are from same file and the corresponding offsets inside - * the file are close together. - * - * When a file extent is allocated the fields are filled in: - * (root_key.objectid, trans->transid, inode objectid, 1) - * - * When a leaf is cow'd new references are added for every file extent found - * in the leaf. It looks similar to the create case, but trans->transid will - * be different when the block is cow'd. + * - original offset in the file + * - how many bookend extents * - * (root_key.objectid, trans->transid, inode objectid, - * number of references in the leaf) + * The key offset for the implicit back refs is hash of the first + * three fields. * - * When a file extent is removed either during snapshot deletion or - * file truncation, we find the corresponding back reference and check - * the following fields: + * The extent ref structure for the full back refs has field for: * - * (btrfs_header_owner(leaf), btrfs_header_generation(leaf), - * inode objectid) + * - number of pointers in the tree leaf * - * Btree extents can be referenced by: - * - * - Different subvolumes - * - Different generations of the same subvolume + * The key offset for the implicit back refs is the first byte of + * the tree leaf * - * When a tree block is created, back references are inserted: + * When a file extent is allocated, The implicit back refs is used. + * the fields are filled in: * - * (root->root_key.objectid, trans->transid, level, 1) + * (root_key.objectid, inode objectid, offset in file, 1) * - * When a tree block is cow'd, new back references are added for all the - * blocks it points to. If the tree block isn't in reference counted root, - * the old back references are removed. These new back references are of - * the form (trans->transid will have increased since creation): + * When a file extent is removed file truncation, we find the + * corresponding implicit back refs and check the following fields: * - * (root->root_key.objectid, trans->transid, level, 1) + * (btrfs_header_owner(leaf), inode objectid, offset in file) * - * When a backref is in deleting, the following fields are checked: + * Btree extents can be referenced by: * - * if backref was for a tree root: - * (btrfs_header_owner(itself), btrfs_header_generation(itself), level) - * else - * (btrfs_header_owner(parent), btrfs_header_generation(parent), level) + * - Different subvolumes * - * Back Reference Key composing: + * Both the implicit back refs and the full back refs for tree blocks + * only consist of key. The key offset for the implicit back refs is + * objectid of block's owner tree. The key offset for the full back refs + * is the first byte of parent block. * - * The key objectid corresponds to the first byte in the extent, the key - * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first - * byte of parent extent. If a extent is tree root, the key offset is set - * to the key objectid. + * When implicit back refs is used, information about the lowest key and + * level of the tree block are required. These information are stored in + * tree block info structure. */ -static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 ref_root, u64 ref_generation, - u64 owner_objectid, int del) +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static int convert_extent_item_v0(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 owner, u32 extra_size) { + struct btrfs_extent_item *item; + struct btrfs_extent_item_v0 *ei0; + struct btrfs_extent_ref_v0 *ref0; + struct btrfs_tree_block_info *bi; + struct extent_buffer *leaf; struct btrfs_key key; - struct btrfs_extent_ref *ref; + struct btrfs_key found_key; + u32 new_size = sizeof(*item); + u64 refs; + int ret; + + leaf = path->nodes[0]; + BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0)); + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ei0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item_v0); + refs = btrfs_extent_refs_v0(leaf, ei0); + + if (owner == (u64)-1) { + while (1) { + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + BUG_ON(ret > 0); + leaf = path->nodes[0]; + } + btrfs_item_key_to_cpu(leaf, &found_key, + path->slots[0]); + BUG_ON(key.objectid != found_key.objectid); + if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) { + path->slots[0]++; + continue; + } + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + owner = btrfs_ref_objectid_v0(leaf, ref0); + break; + } + } + btrfs_release_path(root, path); + + if (owner < BTRFS_FIRST_FREE_OBJECTID) + new_size += sizeof(*bi); + + new_size -= sizeof(*ei0); + ret = btrfs_search_slot(trans, root, &key, path, + new_size + extra_size, 1); + if (ret < 0) + return ret; + BUG_ON(ret); + + ret = btrfs_extend_item(trans, root, path, new_size); + BUG_ON(ret); + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, item, refs); + /* FIXME: get real generation */ + btrfs_set_extent_generation(leaf, item, 0); + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + btrfs_set_extent_flags(leaf, item, + BTRFS_EXTENT_FLAG_TREE_BLOCK | + BTRFS_BLOCK_FLAG_FULL_BACKREF); + bi = (struct btrfs_tree_block_info *)(item + 1); + /* FIXME: get first key of the block */ + memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); + btrfs_set_tree_block_level(leaf, bi, (int)owner); + } else { + btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); + } + btrfs_mark_buffer_dirty(leaf); + return 0; +} +#endif + +static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) +{ + u32 high_crc = ~(u32)0; + u32 low_crc = ~(u32)0; + __le64 lenum; + + lenum = cpu_to_le64(root_objectid); + high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); + lenum = cpu_to_le64(owner); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + lenum = cpu_to_le64(offset); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + + return ((u64)high_crc << 31) ^ (u64)low_crc; +} + +static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref) +{ + return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), + btrfs_extent_data_ref_objectid(leaf, ref), + btrfs_extent_data_ref_offset(leaf, ref)); +} + +static int match_extent_data_ref(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref, + u64 root_objectid, u64 owner, u64 offset) +{ + if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != owner || + btrfs_extent_data_ref_offset(leaf, ref) != offset) + return 0; + return 1; +} + +static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid, + u64 owner, u64 offset) +{ + struct btrfs_key key; + struct btrfs_extent_data_ref *ref; struct extent_buffer *leaf; - u64 ref_objectid; + u32 nritems; int ret; + int recow; + int err = -ENOENT; key.objectid = bytenr; - key.type = BTRFS_EXTENT_REF_KEY; - key.offset = parent; + if (parent) { + key.type = BTRFS_SHARED_DATA_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_EXTENT_DATA_REF_KEY; + key.offset = hash_extent_data_ref(root_objectid, + owner, offset); + } +again: + recow = 0; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto fail; + } - ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1); - if (ret < 0) - goto out; - if (ret > 0) { - ret = -ENOENT; - goto out; + if (parent) { + if (!ret) + return 0; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + key.type = BTRFS_EXTENT_REF_V0_KEY; + btrfs_release_path(root, path); + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto fail; + } + if (!ret) + return 0; +#endif + goto fail; } leaf = path->nodes[0]; - ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); - ref_objectid = btrfs_ref_objectid(leaf, ref); - if (btrfs_ref_root(leaf, ref) != ref_root || - btrfs_ref_generation(leaf, ref) != ref_generation || - (ref_objectid != owner_objectid && - ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { - ret = -EIO; - WARN_ON(1); - goto out; + nritems = btrfs_header_nritems(leaf); + while (1) { + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + err = ret; + if (ret) + goto fail; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + recow = 1; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != bytenr || + key.type != BTRFS_EXTENT_DATA_REF_KEY) + goto fail; + + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + + if (match_extent_data_ref(leaf, ref, root_objectid, + owner, offset)) { + if (recow) { + btrfs_release_path(root, path); + goto again; + } + err = 0; + break; + } + path->slots[0]++; + } +fail: + return err; +} + +static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid, u64 owner, + u64 offset, int refs_to_add) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + u32 size; + u32 num_refs; + int ret; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_DATA_REF_KEY; + key.offset = parent; + size = sizeof(struct btrfs_shared_data_ref); + } else { + key.type = BTRFS_EXTENT_DATA_REF_KEY; + key.offset = hash_extent_data_ref(root_objectid, + owner, offset); + size = sizeof(struct btrfs_extent_data_ref); + } + + ret = btrfs_insert_empty_item(trans, root, path, &key, size); + if (ret && ret != -EEXIST) + goto fail; + + leaf = path->nodes[0]; + if (parent) { + struct btrfs_shared_data_ref *ref; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + if (ret == 0) { + btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); + } else { + num_refs = btrfs_shared_data_ref_count(leaf, ref); + num_refs += refs_to_add; + btrfs_set_shared_data_ref_count(leaf, ref, num_refs); + } + } else { + struct btrfs_extent_data_ref *ref; + while (ret == -EEXIST) { + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + if (match_extent_data_ref(leaf, ref, root_objectid, + owner, offset)) + break; + btrfs_release_path(root, path); + key.offset++; + ret = btrfs_insert_empty_item(trans, root, path, &key, + size); + if (ret && ret != -EEXIST) + goto fail; + + leaf = path->nodes[0]; + } + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + if (ret == 0) { + btrfs_set_extent_data_ref_root(leaf, ref, + root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, ref, owner); + btrfs_set_extent_data_ref_offset(leaf, ref, offset); + btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); + } else { + num_refs = btrfs_extent_data_ref_count(leaf, ref); + num_refs += refs_to_add; + btrfs_set_extent_data_ref_count(leaf, ref, num_refs); + } } + btrfs_mark_buffer_dirty(leaf); ret = 0; -out: +fail: + btrfs_release_path(root, path); return ret; } -/* - * updates all the backrefs that are pending on update_list for the - * extent_root - */ -static noinline int update_backrefs(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct btrfs_path *path, - struct list_head *update_list) +static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int refs_to_drop) { struct btrfs_key key; - struct btrfs_extent_ref *ref; - struct btrfs_fs_info *info = extent_root->fs_info; - struct pending_extent_op *op; + struct btrfs_extent_data_ref *ref1 = NULL; + struct btrfs_shared_data_ref *ref2 = NULL; struct extent_buffer *leaf; + u32 num_refs = 0; int ret = 0; - struct list_head *cur = update_list->next; - u64 ref_objectid; - u64 ref_root = extent_root->root_key.objectid; - op = list_entry(cur, struct pending_extent_op, list); + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ref2 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + num_refs = btrfs_ref_count_v0(leaf, ref0); +#endif + } else { + BUG(); + } -search: - key.objectid = op->bytenr; - key.type = BTRFS_EXTENT_REF_KEY; - key.offset = op->orig_parent; + BUG_ON(num_refs < refs_to_drop); + num_refs -= refs_to_drop; - ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1); - BUG_ON(ret); + if (num_refs == 0) { + ret = btrfs_del_item(trans, root, path); + } else { + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) + btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); + else if (key.type == BTRFS_SHARED_DATA_REF_KEY) + btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + else { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + btrfs_set_ref_count_v0(leaf, ref0, num_refs); + } +#endif + btrfs_mark_buffer_dirty(leaf); + } + return ret; +} - leaf = path->nodes[0]; +static noinline u32 extent_data_ref_count(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_data_ref *ref1; + struct btrfs_shared_data_ref *ref2; + u32 num_refs = 0; -loop: - ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); - - ref_objectid = btrfs_ref_objectid(leaf, ref); - - if (btrfs_ref_root(leaf, ref) != ref_root || - btrfs_ref_generation(leaf, ref) != op->orig_generation || - (ref_objectid != op->level && - ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { - printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, " - "root %llu, owner %u\n", - (unsigned long long)op->bytenr, - (unsigned long long)op->orig_parent, - (unsigned long long)ref_root, op->level); - btrfs_print_leaf(extent_root, leaf); - BUG(); + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (iref) { + if (btrfs_extent_inline_ref_type(leaf, iref) == + BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else { + ref2 = (struct btrfs_shared_data_ref *)(iref + 1); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); + } + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ref2 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + num_refs = btrfs_ref_count_v0(leaf, ref0); +#endif + } else { + WARN_ON(1); } + return num_refs; +} - key.objectid = op->bytenr; - key.offset = op->parent; - key.type = BTRFS_EXTENT_REF_KEY; - ret = btrfs_set_item_key_safe(trans, extent_root, path, &key); - BUG_ON(ret); - ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); - btrfs_set_ref_generation(leaf, ref, op->generation); +static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid) +{ + struct btrfs_key key; + int ret; - cur = cur->next; + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } - list_del_init(&op->list); - unlock_extent(&info->extent_ins, op->bytenr, - op->bytenr + op->num_bytes - 1, GFP_NOFS); - kfree(op); + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (ret == -ENOENT && parent) { + btrfs_release_path(root, path); + key.type = BTRFS_EXTENT_REF_V0_KEY; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; + } +#endif + return ret; +} - if (cur == update_list) { - btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_release_path(extent_root, path); - goto out; +static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid) +{ + struct btrfs_key key; + int ret; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; } - op = list_entry(cur, struct pending_extent_op, list); + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + btrfs_release_path(root, path); + return ret; +} - path->slots[0]++; - while (path->slots[0] < btrfs_header_nritems(leaf)) { - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid == op->bytenr && - key.type == BTRFS_EXTENT_REF_KEY) - goto loop; - path->slots[0]++; +static inline int extent_ref_type(u64 parent, u64 owner) +{ + int type; + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (parent > 0) + type = BTRFS_SHARED_BLOCK_REF_KEY; + else + type = BTRFS_TREE_BLOCK_REF_KEY; + } else { + if (parent > 0) + type = BTRFS_SHARED_DATA_REF_KEY; + else + type = BTRFS_EXTENT_DATA_REF_KEY; } + return type; +} - btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_release_path(extent_root, path); - goto search; +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key) -out: - return 0; +{ + for (; level < BTRFS_MAX_LEVEL; level++) { + if (!path->nodes[level]) + break; + if (path->slots[level] + 1 >= + btrfs_header_nritems(path->nodes[level])) + continue; + if (level == 0) + btrfs_item_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + else + btrfs_node_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + return 0; + } + return 1; } -static noinline int insert_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct btrfs_path *path, - struct list_head *insert_list, int nr) +/* + * look for inline back ref. if back ref is found, *ref_ret is set + * to the address of inline back ref, and 0 is returned. + * + * if back ref isn't found, *ref_ret is set to the address where it + * should be inserted, and -ENOENT is returned. + * + * if insert is true and there are too many inline back refs, the path + * points to the extent item, and -EAGAIN is returned. + * + * NOTE: inline back refs are ordered in the same way that back ref + * items in the tree are ordered. + */ +static noinline_for_stack +int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref **ref_ret, + u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int insert) { - struct btrfs_key *keys; - u32 *data_size; - struct pending_extent_op *op; + struct btrfs_key key; struct extent_buffer *leaf; - struct list_head *cur = insert_list->next; - struct btrfs_fs_info *info = extent_root->fs_info; - u64 ref_root = extent_root->root_key.objectid; - int i = 0, last = 0, ret; - int total = nr * 2; - - if (!nr) - return 0; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + u64 flags; + u64 item_size; + unsigned long ptr; + unsigned long end; + int extra_size; + int type; + int want; + int ret; + int err = 0; - keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS); - if (!keys) - return -ENOMEM; + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; - data_size = kzalloc(total * sizeof(u32), GFP_NOFS); - if (!data_size) { - kfree(keys); - return -ENOMEM; + want = extent_ref_type(parent, owner); + if (insert) { + extra_size = btrfs_extent_inline_ref_size(want); + path->keep_locks = 1; + } else + extra_size = -1; + ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); + if (ret < 0) { + err = ret; + goto out; } + BUG_ON(ret); - list_for_each_entry(op, insert_list, list) { - keys[i].objectid = op->bytenr; - keys[i].offset = op->num_bytes; - keys[i].type = BTRFS_EXTENT_ITEM_KEY; - data_size[i] = sizeof(struct btrfs_extent_item); - i++; - - keys[i].objectid = op->bytenr; - keys[i].offset = op->parent; - keys[i].type = BTRFS_EXTENT_REF_KEY; - data_size[i] = sizeof(struct btrfs_extent_ref); - i++; + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + if (!insert) { + err = -ENOENT; + goto out; + } + ret = convert_extent_item_v0(trans, root, path, owner, + extra_size); + if (ret < 0) { + err = ret; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); } +#endif + BUG_ON(item_size < sizeof(*ei)); - op = list_entry(cur, struct pending_extent_op, list); - i = 0; - while (i < total) { - int c; - ret = btrfs_insert_some_items(trans, extent_root, path, - keys+i, data_size+i, total-i); - BUG_ON(ret < 0); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); - if (last && ret > 1) - BUG(); + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + item_size; - leaf = path->nodes[0]; - for (c = 0; c < ret; c++) { - int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY; + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ptr += sizeof(struct btrfs_tree_block_info); + BUG_ON(ptr > end); + } else { + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); + } - /* - * if the first item we inserted was a backref, then - * the EXTENT_ITEM will be the odd c's, else it will - * be the even c's - */ - if ((ref_first && (c % 2)) || - (!ref_first && !(c % 2))) { - struct btrfs_extent_item *itm; - - itm = btrfs_item_ptr(leaf, path->slots[0] + c, - struct btrfs_extent_item); - btrfs_set_extent_refs(path->nodes[0], itm, 1); - op->del++; - } else { - struct btrfs_extent_ref *ref; - - ref = btrfs_item_ptr(leaf, path->slots[0] + c, - struct btrfs_extent_ref); - btrfs_set_ref_root(leaf, ref, ref_root); - btrfs_set_ref_generation(leaf, ref, - op->generation); - btrfs_set_ref_objectid(leaf, ref, op->level); - btrfs_set_ref_num_refs(leaf, ref, 1); - op->del++; - } + err = -ENOENT; + while (1) { + if (ptr >= end) { + WARN_ON(ptr > end); + break; + } + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(leaf, iref); + if (want < type) + break; + if (want > type) { + ptr += btrfs_extent_inline_ref_size(type); + continue; + } - /* - * using del to see when its ok to free up the - * pending_extent_op. In the case where we insert the - * last item on the list in order to help do batching - * we need to not free the extent op until we actually - * insert the extent_item - */ - if (op->del == 2) { - unlock_extent(&info->extent_ins, op->bytenr, - op->bytenr + op->num_bytes - 1, - GFP_NOFS); - cur = cur->next; - list_del_init(&op->list); - kfree(op); - if (cur != insert_list) - op = list_entry(cur, - struct pending_extent_op, - list); + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + struct btrfs_extent_data_ref *dref; + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + if (match_extent_data_ref(leaf, dref, root_objectid, + owner, offset)) { + err = 0; + break; + } + if (hash_extent_data_ref_item(leaf, dref) < + hash_extent_data_ref(root_objectid, owner, offset)) + break; + } else { + u64 ref_offset; + ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); + if (parent > 0) { + if (parent == ref_offset) { + err = 0; + break; + } + if (ref_offset < parent) + break; + } else { + if (root_objectid == ref_offset) { + err = 0; + break; + } + if (ref_offset < root_objectid) + break; } } - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(extent_root, path); - + ptr += btrfs_extent_inline_ref_size(type); + } + if (err == -ENOENT && insert) { + if (item_size + extra_size >= + BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { + err = -EAGAIN; + goto out; + } /* - * Ok backref's and items usually go right next to eachother, - * but if we could only insert 1 item that means that we - * inserted on the end of a leaf, and we have no idea what may - * be on the next leaf so we just play it safe. In order to - * try and help this case we insert the last thing on our - * insert list so hopefully it will end up being the last - * thing on the leaf and everything else will be before it, - * which will let us insert a whole bunch of items at the same - * time. + * To add new inline back ref, we have to make sure + * there is no corresponding back ref item. + * For simplicity, we just do not add new inline back + * ref if there is any kind of item for this block */ - if (ret == 1 && !last && (i + ret < total)) { - /* - * last: where we will pick up the next time around - * i: our current key to insert, will be total - 1 - * cur: the current op we are screwing with - * op: duh - */ - last = i + ret; - i = total - 1; - cur = insert_list->prev; - op = list_entry(cur, struct pending_extent_op, list); - } else if (last) { - /* - * ok we successfully inserted the last item on the - * list, lets reset everything - * - * i: our current key to insert, so where we left off - * last time - * last: done with this - * cur: the op we are messing with - * op: duh - * total: since we inserted the last key, we need to - * decrement total so we dont overflow - */ - i = last; - last = 0; - total--; - if (i < total) { - cur = insert_list->next; - op = list_entry(cur, struct pending_extent_op, - list); - } - } else { - i += ret; + if (find_next_key(path, 0, &key) == 0 && + key.objectid == bytenr && + key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { + err = -EAGAIN; + goto out; } - - cond_resched(); } - ret = 0; - kfree(keys); - kfree(data_size); - return ret; + *ref_ret = (struct btrfs_extent_inline_ref *)ptr; +out: + if (insert) { + path->keep_locks = 0; + btrfs_unlock_up_safe(path, 1); + } + return err; } -static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 ref_root, u64 ref_generation, - u64 owner_objectid) +/* + * helper to add new inline back ref + */ +static noinline_for_stack +int setup_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_key key; struct extent_buffer *leaf; - struct btrfs_extent_ref *ref; - u32 num_refs; + struct btrfs_extent_item *ei; + unsigned long ptr; + unsigned long end; + unsigned long item_offset; + u64 refs; + int size; + int type; int ret; - key.objectid = bytenr; - key.type = BTRFS_EXTENT_REF_KEY; - key.offset = parent; + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + item_offset = (unsigned long)iref - (unsigned long)ei; - ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref)); - if (ret == 0) { - leaf = path->nodes[0]; - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref); - btrfs_set_ref_root(leaf, ref, ref_root); - btrfs_set_ref_generation(leaf, ref, ref_generation); - btrfs_set_ref_objectid(leaf, ref, owner_objectid); - btrfs_set_ref_num_refs(leaf, ref, 1); - } else if (ret == -EEXIST) { - u64 existing_owner; - BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); - leaf = path->nodes[0]; - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref); - if (btrfs_ref_root(leaf, ref) != ref_root || - btrfs_ref_generation(leaf, ref) != ref_generation) { - ret = -EIO; - WARN_ON(1); - goto out; - } + type = extent_ref_type(parent, owner); + size = btrfs_extent_inline_ref_size(type); - num_refs = btrfs_ref_num_refs(leaf, ref); - BUG_ON(num_refs == 0); - btrfs_set_ref_num_refs(leaf, ref, num_refs + 1); + ret = btrfs_extend_item(trans, root, path, size); + BUG_ON(ret); - existing_owner = btrfs_ref_objectid(leaf, ref); - if (existing_owner != owner_objectid && - existing_owner != BTRFS_MULTIPLE_OBJECTIDS) { - btrfs_set_ref_objectid(leaf, ref, - BTRFS_MULTIPLE_OBJECTIDS); - } - ret = 0; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + refs += refs_to_add; + btrfs_set_extent_refs(leaf, ei, refs); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + + ptr = (unsigned long)ei + item_offset; + end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); + if (ptr < end - size) + memmove_extent_buffer(leaf, ptr + size, ptr, + end - size - ptr); + + iref = (struct btrfs_extent_inline_ref *)ptr; + btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + struct btrfs_extent_data_ref *dref; + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, dref, owner); + btrfs_set_extent_data_ref_offset(leaf, dref, offset); + btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + struct btrfs_shared_data_ref *sref; + sref = (struct btrfs_shared_data_ref *)(iref + 1); + btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); } else { - goto out; + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } - btrfs_mark_buffer_dirty(path->nodes[0]); -out: + btrfs_mark_buffer_dirty(leaf); + return 0; +} + +static int lookup_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref **ref_ret, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset) +{ + int ret; + + ret = lookup_inline_extent_backref(trans, root, path, ref_ret, + bytenr, num_bytes, parent, + root_objectid, owner, offset, 0); + if (ret != -ENOENT) + return ret; + btrfs_release_path(root, path); + *ref_ret = NULL; + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = lookup_tree_block_ref(trans, root, path, bytenr, parent, + root_objectid); + } else { + ret = lookup_extent_data_ref(trans, root, path, bytenr, parent, + root_objectid, owner, offset); + } return ret; } -static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path) +/* + * helper to update/remove inline back ref + */ +static noinline_for_stack +int update_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_mod, + struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf; - struct btrfs_extent_ref *ref; - u32 num_refs; - int ret = 0; + struct btrfs_extent_item *ei; + struct btrfs_extent_data_ref *dref = NULL; + struct btrfs_shared_data_ref *sref = NULL; + unsigned long ptr; + unsigned long end; + u32 item_size; + int size; + int type; + int ret; + u64 refs; leaf = path->nodes[0]; - ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); - num_refs = btrfs_ref_num_refs(leaf, ref); - BUG_ON(num_refs == 0); - num_refs -= 1; - if (num_refs == 0) { - ret = btrfs_del_item(trans, root, path); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); + refs += refs_to_mod; + btrfs_set_extent_refs(leaf, ei, refs); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + + type = btrfs_extent_inline_ref_type(leaf, iref); + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + refs = btrfs_extent_data_ref_count(leaf, dref); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + sref = (struct btrfs_shared_data_ref *)(iref + 1); + refs = btrfs_shared_data_ref_count(leaf, sref); } else { - btrfs_set_ref_num_refs(leaf, ref, num_refs); - btrfs_mark_buffer_dirty(leaf); + refs = 1; + BUG_ON(refs_to_mod != -1); + } + + BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); + refs += refs_to_mod; + + if (refs > 0) { + if (type == BTRFS_EXTENT_DATA_REF_KEY) + btrfs_set_extent_data_ref_count(leaf, dref, refs); + else + btrfs_set_shared_data_ref_count(leaf, sref, refs); + } else { + size = btrfs_extent_inline_ref_size(type); + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + if (ptr + size < end) + memmove_extent_buffer(leaf, ptr, ptr + size, + end - ptr - size); + item_size -= size; + ret = btrfs_truncate_item(trans, root, path, item_size, 1); + BUG_ON(ret); + } + btrfs_mark_buffer_dirty(leaf); + return 0; +} + +static noinline_for_stack +int insert_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, + u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_extent_inline_ref *iref; + int ret; + + ret = lookup_inline_extent_backref(trans, root, path, &iref, + bytenr, num_bytes, parent, + root_objectid, owner, offset, 1); + if (ret == 0) { + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); + ret = update_inline_extent_backref(trans, root, path, iref, + refs_to_add, extent_op); + } else if (ret == -ENOENT) { + ret = setup_inline_extent_backref(trans, root, path, iref, + parent, root_objectid, + owner, offset, refs_to_add, + extent_op); + } + return ret; +} + +static int insert_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add) +{ + int ret; + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + BUG_ON(refs_to_add != 1); + ret = insert_tree_block_ref(trans, root, path, bytenr, + parent, root_objectid); + } else { + ret = insert_extent_data_ref(trans, root, path, bytenr, + parent, root_objectid, + owner, offset, refs_to_add); + } + return ret; +} + +static int remove_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_drop, int is_data) +{ + int ret; + + BUG_ON(!is_data && refs_to_drop != 1); + if (iref) { + ret = update_inline_extent_backref(trans, root, path, iref, + -refs_to_drop, NULL); + } else if (is_data) { + ret = remove_extent_data_ref(trans, root, path, refs_to_drop); + } else { + ret = btrfs_del_item(trans, root, path); } - btrfs_release_path(root, path); return ret; } @@ -927,502 +1444,632 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, #endif } -static noinline int free_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct list_head *del_list) +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset) +{ + int ret; + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && + root_objectid == BTRFS_TREE_LOG_OBJECTID); + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + parent, root_objectid, (int)owner, + BTRFS_ADD_DELAYED_REF, NULL); + } else { + ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, + parent, root_objectid, owner, offset, + BTRFS_ADD_DELAYED_REF, NULL); + } + return ret; +} + +static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_fs_info *info = extent_root->fs_info; struct btrfs_path *path; - struct btrfs_key key, found_key; struct extent_buffer *leaf; - struct list_head *cur; - struct pending_extent_op *op; - struct btrfs_extent_item *ei; - int ret, num_to_del, extent_slot = 0, found_extent = 0; - u32 refs; - u64 bytes_freed = 0; + struct btrfs_extent_item *item; + u64 refs; + int ret; + int err = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->reada = 1; + path->leave_spinning = 1; + /* this will setup the path even if it fails to insert the back ref */ + ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, + path, bytenr, num_bytes, parent, + root_objectid, owner, offset, + refs_to_add, extent_op); + if (ret == 0) + goto out; -search: - /* search for the backref for the current ref we want to delete */ - cur = del_list->next; - op = list_entry(cur, struct pending_extent_op, list); - ret = lookup_extent_backref(trans, extent_root, path, op->bytenr, - op->orig_parent, - extent_root->root_key.objectid, - op->orig_generation, op->level, 1); - if (ret) { - printk(KERN_ERR "btrfs unable to find backref byte nr %llu " - "root %llu gen %llu owner %u\n", - (unsigned long long)op->bytenr, - (unsigned long long)extent_root->root_key.objectid, - (unsigned long long)op->orig_generation, op->level); - btrfs_print_leaf(extent_root, path->nodes[0]); - WARN_ON(1); + if (ret != -EAGAIN) { + err = ret; goto out; } - extent_slot = path->slots[0]; - num_to_del = 1; - found_extent = 0; + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, item); + btrfs_set_extent_refs(leaf, item, refs + refs_to_add); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, item); - /* - * if we aren't the first item on the leaf we can move back one and see - * if our ref is right next to our extent item - */ - if (likely(extent_slot)) { - extent_slot--; - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - extent_slot); - if (found_key.objectid == op->bytenr && - found_key.type == BTRFS_EXTENT_ITEM_KEY && - found_key.offset == op->num_bytes) { - num_to_del++; - found_extent = 1; + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(root->fs_info->extent_root, path); + + path->reada = 1; + path->leave_spinning = 1; + + /* now insert the actual backref */ + ret = insert_extent_backref(trans, root->fs_info->extent_root, + path, bytenr, parent, root_objectid, + owner, offset, refs_to_add); + BUG_ON(ret); +out: + btrfs_free_path(path); + return err; +} + +static int run_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) +{ + int ret = 0; + struct btrfs_delayed_data_ref *ref; + struct btrfs_key ins; + u64 parent = 0; + u64 ref_root = 0; + u64 flags = 0; + + ins.objectid = node->bytenr; + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + ref = btrfs_delayed_node_to_data_ref(node); + if (node->type == BTRFS_SHARED_DATA_REF_KEY) + parent = ref->parent; + else + ref_root = ref->root; + + if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + if (extent_op) { + BUG_ON(extent_op->update_key); + flags |= extent_op->flags_to_set; } + ret = alloc_reserved_file_extent(trans, root, + parent, ref_root, flags, + ref->objectid, ref->offset, + &ins, node->ref_mod); + update_reserved_extents(root, ins.objectid, ins.offset, 0); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, + ref_root, ref->objectid, + ref->offset, node->ref_mod, + extent_op); + } else if (node->action == BTRFS_DROP_DELAYED_REF) { + ret = __btrfs_free_extent(trans, root, node->bytenr, + node->num_bytes, parent, + ref_root, ref->objectid, + ref->offset, node->ref_mod, + extent_op); + } else { + BUG(); } + return ret; +} - /* - * if we didn't find the extent we need to delete the backref and then - * search for the extent item key so we can update its ref count - */ - if (!found_extent) { - key.objectid = op->bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = op->num_bytes; +static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, + struct extent_buffer *leaf, + struct btrfs_extent_item *ei) +{ + u64 flags = btrfs_extent_flags(leaf, ei); + if (extent_op->update_flags) { + flags |= extent_op->flags_to_set; + btrfs_set_extent_flags(leaf, ei, flags); + } - ret = remove_extent_backref(trans, extent_root, path); - BUG_ON(ret); - btrfs_release_path(extent_root, path); - ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); - BUG_ON(ret); - extent_slot = path->slots[0]; + if (extent_op->update_key) { + struct btrfs_tree_block_info *bi; + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); + bi = (struct btrfs_tree_block_info *)(ei + 1); + btrfs_set_tree_block_key(leaf, bi, &extent_op->key); + } +} + +static int run_delayed_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct extent_buffer *leaf; + u32 item_size; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = node->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = node->num_bytes; + + path->reada = 1; + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, + path, 0, 1); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + err = -EIO; + goto out; } - /* this is where we update the ref count for the extent */ leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); - refs = btrfs_extent_refs(leaf, ei); - BUG_ON(refs == 0); - refs--; - btrfs_set_extent_refs(leaf, ei, refs); + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + ret = convert_extent_item_v0(trans, root->fs_info->extent_root, + path, (u64)-1, 0); + if (ret < 0) { + err = ret; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + } +#endif + BUG_ON(item_size < sizeof(*ei)); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + __run_delayed_extent_op(extent_op, leaf, ei); btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return err; +} + +static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) +{ + int ret = 0; + struct btrfs_delayed_tree_ref *ref; + struct btrfs_key ins; + u64 parent = 0; + u64 ref_root = 0; + + ins.objectid = node->bytenr; + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + ref = btrfs_delayed_node_to_tree_ref(node); + if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) + parent = ref->parent; + else + ref_root = ref->root; + + BUG_ON(node->ref_mod != 1); + if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + BUG_ON(!extent_op || !extent_op->update_flags || + !extent_op->update_key); + ret = alloc_reserved_tree_block(trans, root, + parent, ref_root, + extent_op->flags_to_set, + &extent_op->key, + ref->level, &ins); + update_reserved_extents(root, ins.objectid, ins.offset, 0); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, ref_root, + ref->level, 0, 1, extent_op); + } else if (node->action == BTRFS_DROP_DELAYED_REF) { + ret = __btrfs_free_extent(trans, root, node->bytenr, + node->num_bytes, parent, ref_root, + ref->level, 0, 1, extent_op); + } else { + BUG(); + } + return ret; +} - /* - * This extent needs deleting. The reason cur_slot is extent_slot + - * num_to_del is because extent_slot points to the slot where the extent - * is, and if the backref was not right next to the extent we will be - * deleting at least 1 item, and will want to start searching at the - * slot directly next to extent_slot. However if we did find the - * backref next to the extent item them we will be deleting at least 2 - * items and will want to start searching directly after the ref slot - */ - if (!refs) { - struct list_head *pos, *n, *end; - int cur_slot = extent_slot+num_to_del; - u64 super_used; - u64 root_used; - - path->slots[0] = extent_slot; - bytes_freed = op->num_bytes; - - mutex_lock(&info->pinned_mutex); - ret = pin_down_bytes(trans, extent_root, op->bytenr, - op->num_bytes, op->level >= - BTRFS_FIRST_FREE_OBJECTID); - mutex_unlock(&info->pinned_mutex); - BUG_ON(ret < 0); - op->del = ret; +/* helper function to actually process a single delayed ref entry */ +static int run_one_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) +{ + int ret; + if (btrfs_delayed_ref_is_head(node)) { + struct btrfs_delayed_ref_head *head; /* - * we need to see if we can delete multiple things at once, so - * start looping through the list of extents we are wanting to - * delete and see if their extent/backref's are right next to - * eachother and the extents only have 1 ref + * we've hit the end of the chain and we were supposed + * to insert this extent into the tree. But, it got + * deleted before we ever needed to insert it, so all + * we have to do is clean up the accounting */ - for (pos = cur->next; pos != del_list; pos = pos->next) { - struct pending_extent_op *tmp; - - tmp = list_entry(pos, struct pending_extent_op, list); + BUG_ON(extent_op); + head = btrfs_delayed_node_to_head(node); + if (insert_reserved) { + if (head->is_data) { + ret = btrfs_del_csums(trans, root, + node->bytenr, + node->num_bytes); + BUG_ON(ret); + } + btrfs_update_pinned_extents(root, node->bytenr, + node->num_bytes, 1); + update_reserved_extents(root, node->bytenr, + node->num_bytes, 0); + } + mutex_unlock(&head->mutex); + return 0; + } - /* we only want to delete extent+ref at this stage */ - if (cur_slot >= btrfs_header_nritems(leaf) - 1) - break; + if (node->type == BTRFS_TREE_BLOCK_REF_KEY || + node->type == BTRFS_SHARED_BLOCK_REF_KEY) + ret = run_delayed_tree_ref(trans, root, node, extent_op, + insert_reserved); + else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || + node->type == BTRFS_SHARED_DATA_REF_KEY) + ret = run_delayed_data_ref(trans, root, node, extent_op, + insert_reserved); + else + BUG(); + return ret; +} - btrfs_item_key_to_cpu(leaf, &found_key, cur_slot); - if (found_key.objectid != tmp->bytenr || - found_key.type != BTRFS_EXTENT_ITEM_KEY || - found_key.offset != tmp->num_bytes) - break; +static noinline struct btrfs_delayed_ref_node * +select_delayed_ref(struct btrfs_delayed_ref_head *head) +{ + struct rb_node *node; + struct btrfs_delayed_ref_node *ref; + int action = BTRFS_ADD_DELAYED_REF; +again: + /* + * select delayed ref of type BTRFS_ADD_DELAYED_REF first. + * this prevents ref count from going down to zero when + * there still are pending delayed ref. + */ + node = rb_prev(&head->node.rb_node); + while (1) { + if (!node) + break; + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + if (ref->bytenr != head->node.bytenr) + break; + if (ref->action == action) + return ref; + node = rb_prev(node); + } + if (action == BTRFS_ADD_DELAYED_REF) { + action = BTRFS_DROP_DELAYED_REF; + goto again; + } + return NULL; +} - /* check to make sure this extent only has one ref */ - ei = btrfs_item_ptr(leaf, cur_slot, - struct btrfs_extent_item); - if (btrfs_extent_refs(leaf, ei) != 1) - break; +static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct list_head *cluster) +{ + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_head *locked_ref = NULL; + struct btrfs_delayed_extent_op *extent_op; + int ret; + int count = 0; + int must_insert_reserved = 0; - btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1); - if (found_key.objectid != tmp->bytenr || - found_key.type != BTRFS_EXTENT_REF_KEY || - found_key.offset != tmp->orig_parent) + delayed_refs = &trans->transaction->delayed_refs; + while (1) { + if (!locked_ref) { + /* pick a new head ref from the cluster list */ + if (list_empty(cluster)) break; - /* - * the ref is right next to the extent, we can set the - * ref count to 0 since we will delete them both now - */ - btrfs_set_extent_refs(leaf, ei, 0); + locked_ref = list_entry(cluster->next, + struct btrfs_delayed_ref_head, cluster); - /* pin down the bytes for this extent */ - mutex_lock(&info->pinned_mutex); - ret = pin_down_bytes(trans, extent_root, tmp->bytenr, - tmp->num_bytes, tmp->level >= - BTRFS_FIRST_FREE_OBJECTID); - mutex_unlock(&info->pinned_mutex); - BUG_ON(ret < 0); + /* grab the lock that says we are going to process + * all the refs for this head */ + ret = btrfs_delayed_ref_lock(trans, locked_ref); /* - * use the del field to tell if we need to go ahead and - * free up the extent when we delete the item or not. + * we may have dropped the spin lock to get the head + * mutex lock, and that might have given someone else + * time to free the head. If that's true, it has been + * removed from our list and we can move on. */ - tmp->del = ret; - bytes_freed += tmp->num_bytes; - - num_to_del += 2; - cur_slot += 2; + if (ret == -EAGAIN) { + locked_ref = NULL; + count++; + continue; + } } - end = pos; - - /* update the free space counters */ - spin_lock(&info->delalloc_lock); - super_used = btrfs_super_bytes_used(&info->super_copy); - btrfs_set_super_bytes_used(&info->super_copy, - super_used - bytes_freed); - - root_used = btrfs_root_used(&extent_root->root_item); - btrfs_set_root_used(&extent_root->root_item, - root_used - bytes_freed); - spin_unlock(&info->delalloc_lock); - - /* delete the items */ - ret = btrfs_del_items(trans, extent_root, path, - path->slots[0], num_to_del); - BUG_ON(ret); /* - * loop through the extents we deleted and do the cleanup work - * on them + * record the must insert reserved flag before we + * drop the spin lock. */ - for (pos = cur, n = pos->next; pos != end; - pos = n, n = pos->next) { - struct pending_extent_op *tmp; - tmp = list_entry(pos, struct pending_extent_op, list); + must_insert_reserved = locked_ref->must_insert_reserved; + locked_ref->must_insert_reserved = 0; - /* - * remember tmp->del tells us wether or not we pinned - * down the extent - */ - ret = update_block_group(trans, extent_root, - tmp->bytenr, tmp->num_bytes, 0, - tmp->del); - BUG_ON(ret); + extent_op = locked_ref->extent_op; + locked_ref->extent_op = NULL; - list_del_init(&tmp->list); - unlock_extent(&info->extent_ins, tmp->bytenr, - tmp->bytenr + tmp->num_bytes - 1, - GFP_NOFS); - kfree(tmp); - } - } else if (refs && found_extent) { /* - * the ref and extent were right next to eachother, but the - * extent still has a ref, so just free the backref and keep - * going + * locked_ref is the head node, so we have to go one + * node back for any delayed ref updates */ - ret = remove_extent_backref(trans, extent_root, path); - BUG_ON(ret); + ref = select_delayed_ref(locked_ref); + if (!ref) { + /* All delayed refs have been processed, Go ahead + * and send the head node to run_one_delayed_ref, + * so that any accounting fixes can happen + */ + ref = &locked_ref->node; - list_del_init(&op->list); - unlock_extent(&info->extent_ins, op->bytenr, - op->bytenr + op->num_bytes - 1, GFP_NOFS); - kfree(op); - } else { - /* - * the extent has multiple refs and the backref we were looking - * for was not right next to it, so just unlock and go next, - * we're good to go - */ - list_del_init(&op->list); - unlock_extent(&info->extent_ins, op->bytenr, - op->bytenr + op->num_bytes - 1, GFP_NOFS); - kfree(op); - } + if (extent_op && must_insert_reserved) { + kfree(extent_op); + extent_op = NULL; + } - btrfs_release_path(extent_root, path); - if (!list_empty(del_list)) - goto search; + if (extent_op) { + spin_unlock(&delayed_refs->lock); -out: - btrfs_free_path(path); - return ret; -} + ret = run_delayed_extent_op(trans, root, + ref, extent_op); + BUG_ON(ret); + kfree(extent_op); -static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 orig_parent, u64 parent, - u64 orig_root, u64 ref_root, - u64 orig_generation, u64 ref_generation, - u64 owner_objectid) -{ - int ret; - struct btrfs_root *extent_root = root->fs_info->extent_root; - struct btrfs_path *path; + cond_resched(); + spin_lock(&delayed_refs->lock); + continue; + } - if (root == root->fs_info->extent_root) { - struct pending_extent_op *extent_op; - u64 num_bytes; - - BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL); - num_bytes = btrfs_level_size(root, (int)owner_objectid); - mutex_lock(&root->fs_info->extent_ins_mutex); - if (test_range_bit(&root->fs_info->extent_ins, bytenr, - bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) { - u64 priv; - ret = get_state_private(&root->fs_info->extent_ins, - bytenr, &priv); - BUG_ON(ret); - extent_op = (struct pending_extent_op *) - (unsigned long)priv; - BUG_ON(extent_op->parent != orig_parent); - BUG_ON(extent_op->generation != orig_generation); + list_del_init(&locked_ref->cluster); + locked_ref = NULL; + } - extent_op->parent = parent; - extent_op->generation = ref_generation; - } else { - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); - - extent_op->type = PENDING_BACKREF_UPDATE; - extent_op->bytenr = bytenr; - extent_op->num_bytes = num_bytes; - extent_op->parent = parent; - extent_op->orig_parent = orig_parent; - extent_op->generation = ref_generation; - extent_op->orig_generation = orig_generation; - extent_op->level = (int)owner_objectid; - INIT_LIST_HEAD(&extent_op->list); - extent_op->del = 0; - - set_extent_bits(&root->fs_info->extent_ins, - bytenr, bytenr + num_bytes - 1, - EXTENT_WRITEBACK, GFP_NOFS); - set_state_private(&root->fs_info->extent_ins, - bytenr, (unsigned long)extent_op); - } - mutex_unlock(&root->fs_info->extent_ins_mutex); - return 0; - } + ref->in_tree = 0; + rb_erase(&ref->rb_node, &delayed_refs->root); + delayed_refs->num_entries--; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - ret = lookup_extent_backref(trans, extent_root, path, - bytenr, orig_parent, orig_root, - orig_generation, owner_objectid, 1); - if (ret) - goto out; - ret = remove_extent_backref(trans, extent_root, path); - if (ret) - goto out; - ret = insert_extent_backref(trans, extent_root, path, bytenr, - parent, ref_root, ref_generation, - owner_objectid); - BUG_ON(ret); - finish_current_insert(trans, extent_root, 0); - del_pending_extents(trans, extent_root, 0); -out: - btrfs_free_path(path); - return ret; -} + spin_unlock(&delayed_refs->lock); -int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 orig_parent, u64 parent, - u64 ref_root, u64 ref_generation, - u64 owner_objectid) -{ - int ret; - if (ref_root == BTRFS_TREE_LOG_OBJECTID && - owner_objectid < BTRFS_FIRST_FREE_OBJECTID) - return 0; - ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent, - parent, ref_root, ref_root, - ref_generation, ref_generation, - owner_objectid); - return ret; + ret = run_one_delayed_ref(trans, root, ref, extent_op, + must_insert_reserved); + BUG_ON(ret); + + btrfs_put_delayed_ref(ref); + kfree(extent_op); + count++; + + cond_resched(); + spin_lock(&delayed_refs->lock); + } + return count; } -static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 orig_parent, u64 parent, - u64 orig_root, u64 ref_root, - u64 orig_generation, u64 ref_generation, - u64 owner_objectid) +/* + * this starts processing the delayed reference count updates and + * extent insertions we have queued up so far. count can be + * 0, which means to process everything in the tree at the start + * of the run (but not newly added entries), or it can be some target + * number you'd like to process. + */ +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long count) { - struct btrfs_path *path; + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + struct list_head cluster; int ret; - struct btrfs_key key; - struct extent_buffer *l; - struct btrfs_extent_item *item; - u32 refs; + int run_all = count == (unsigned long)-1; + int run_most = 0; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (root == root->fs_info->extent_root) + root = root->fs_info->tree_root; - path->reada = 1; - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)-1; + delayed_refs = &trans->transaction->delayed_refs; + INIT_LIST_HEAD(&cluster); +again: + spin_lock(&delayed_refs->lock); + if (count == 0) { + count = delayed_refs->num_entries * 2; + run_most = 1; + } + while (1) { + if (!(run_all || run_most) && + delayed_refs->num_heads_ready < 64) + break; - ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, - 0, 1); - if (ret < 0) - return ret; - BUG_ON(ret == 0 || path->slots[0] == 0); + /* + * go find something we can process in the rbtree. We start at + * the beginning of the tree, and then build a cluster + * of refs to process starting at the first one we are able to + * lock + */ + ret = btrfs_find_ref_cluster(trans, &cluster, + delayed_refs->run_delayed_start); + if (ret) + break; - path->slots[0]--; - l = path->nodes[0]; + ret = run_clustered_refs(trans, root, &cluster); + BUG_ON(ret < 0); - btrfs_item_key_to_cpu(l, &key, path->slots[0]); - if (key.objectid != bytenr) { - btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]); - printk(KERN_ERR "btrfs wanted %llu found %llu\n", - (unsigned long long)bytenr, - (unsigned long long)key.objectid); - BUG(); + count -= min_t(unsigned long, ret, count); + + if (count == 0) + break; } - BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); - item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); - refs = btrfs_extent_refs(l, item); - btrfs_set_extent_refs(l, item, refs + 1); - btrfs_mark_buffer_dirty(path->nodes[0]); + if (run_all) { + node = rb_first(&delayed_refs->root); + if (!node) + goto out; + count = (unsigned long)-1; - btrfs_release_path(root->fs_info->extent_root, path); + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + if (btrfs_delayed_ref_is_head(ref)) { + struct btrfs_delayed_ref_head *head; - path->reada = 1; - ret = insert_extent_backref(trans, root->fs_info->extent_root, - path, bytenr, parent, - ref_root, ref_generation, - owner_objectid); - BUG_ON(ret); - finish_current_insert(trans, root->fs_info->extent_root, 0); - del_pending_extents(trans, root->fs_info->extent_root, 0); + head = btrfs_delayed_node_to_head(ref); + atomic_inc(&ref->refs); - btrfs_free_path(path); + spin_unlock(&delayed_refs->lock); + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + + btrfs_put_delayed_ref(ref); + cond_resched(); + goto again; + } + node = rb_next(node); + } + spin_unlock(&delayed_refs->lock); + schedule_timeout(1); + goto again; + } +out: + spin_unlock(&delayed_refs->lock); return 0; } -int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, u64 ref_generation, - u64 owner_objectid) +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 flags, + int is_data) { + struct btrfs_delayed_extent_op *extent_op; int ret; - if (ref_root == BTRFS_TREE_LOG_OBJECTID && - owner_objectid < BTRFS_FIRST_FREE_OBJECTID) - return 0; - ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent, - 0, ref_root, 0, ref_generation, - owner_objectid); + + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + if (!extent_op) + return -ENOMEM; + + extent_op->flags_to_set = flags; + extent_op->update_flags = 1; + extent_op->update_key = 0; + extent_op->is_data = is_data ? 1 : 0; + + ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); + if (ret) + kfree(extent_op); return ret; } -int btrfs_extent_post_op(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, u64 offset, u64 bytenr) { - u64 start; - u64 end; - int ret; + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_data_ref *data_ref; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *node; + int ret = 0; - while(1) { - finish_current_insert(trans, root->fs_info->extent_root, 1); - del_pending_extents(trans, root->fs_info->extent_root, 1); + ret = -ENOENT; + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (!head) + goto out; - /* is there more work to do? */ - ret = find_first_extent_bit(&root->fs_info->pending_del, - 0, &start, &end, EXTENT_WRITEBACK); - if (!ret) - continue; - ret = find_first_extent_bit(&root->fs_info->extent_ins, - 0, &start, &end, EXTENT_WRITEBACK); - if (!ret) - continue; - break; + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(root->fs_info->extent_root, path); + + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + return -EAGAIN; } - return 0; -} -int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u32 *refs) -{ - struct btrfs_path *path; - int ret; - struct btrfs_key key; - struct extent_buffer *l; - struct btrfs_extent_item *item; + node = rb_prev(&head->node.rb_node); + if (!node) + goto out_unlock; - WARN_ON(num_bytes < root->sectorsize); - path = btrfs_alloc_path(); - path->reada = 1; - key.objectid = bytenr; - key.offset = num_bytes; - btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); - ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, - 0, 0); - if (ret < 0) - goto out; - if (ret != 0) { - btrfs_print_leaf(root, path->nodes[0]); - printk(KERN_INFO "btrfs failed to find block number %llu\n", - (unsigned long long)bytenr); - BUG(); + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + + if (ref->bytenr != bytenr) + goto out_unlock; + + ret = 1; + if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) + goto out_unlock; + + data_ref = btrfs_delayed_node_to_data_ref(ref); + + node = rb_prev(node); + if (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + if (ref->bytenr == bytenr) + goto out_unlock; } - l = path->nodes[0]; - item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); - *refs = btrfs_extent_refs(l, item); + + if (data_ref->root != root->root_key.objectid || + data_ref->objectid != objectid || data_ref->offset != offset) + goto out_unlock; + + ret = 0; +out_unlock: + mutex_unlock(&head->mutex); out: - btrfs_free_path(path); - return 0; + spin_unlock(&delayed_refs->lock); + return ret; } -int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 objectid, u64 bytenr) +static noinline int check_committed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, u64 offset, u64 bytenr) { struct btrfs_root *extent_root = root->fs_info->extent_root; - struct btrfs_path *path; struct extent_buffer *leaf; - struct btrfs_extent_ref *ref_item; + struct btrfs_extent_data_ref *ref; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_item *ei; struct btrfs_key key; - struct btrfs_key found_key; - u64 ref_root; - u64 last_snapshot; - u32 nritems; + u32 item_size; int ret; key.objectid = bytenr; key.offset = (u64)-1; key.type = BTRFS_EXTENT_ITEM_KEY; - path = btrfs_alloc_path(); ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out; @@ -1434,55 +2081,83 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, path->slots[0]--; leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (found_key.objectid != bytenr || - found_key.type != BTRFS_EXTENT_ITEM_KEY) + if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) goto out; - last_snapshot = btrfs_root_last_snapshot(&root->root_item); - while (1) { - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(extent_root, path); - if (ret < 0) - goto out; - if (ret == 0) - continue; - break; - } - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid != bytenr) - break; + ret = 1; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + goto out; + } +#endif + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - if (found_key.type != BTRFS_EXTENT_REF_KEY) { - path->slots[0]++; - continue; - } + if (item_size != sizeof(*ei) + + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) + goto out; - ref_item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref); - ref_root = btrfs_ref_root(leaf, ref_item); - if ((ref_root != root->root_key.objectid && - ref_root != BTRFS_TREE_LOG_OBJECTID) || - objectid != btrfs_ref_objectid(leaf, ref_item)) { - ret = 1; - goto out; - } - if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) { - ret = 1; + if (btrfs_extent_generation(leaf, ei) <= + btrfs_root_last_snapshot(&root->root_item)) + goto out; + + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + if (btrfs_extent_inline_ref_type(leaf, iref) != + BTRFS_EXTENT_DATA_REF_KEY) + goto out; + + ref = (struct btrfs_extent_data_ref *)(&iref->offset); + if (btrfs_extent_refs(leaf, ei) != + btrfs_extent_data_ref_count(leaf, ref) || + btrfs_extent_data_ref_root(leaf, ref) != + root->root_key.objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != objectid || + btrfs_extent_data_ref_offset(leaf, ref) != offset) + goto out; + + ret = 0; +out: + return ret; +} + +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 offset, u64 bytenr) +{ + struct btrfs_path *path; + int ret; + int ret2; + + path = btrfs_alloc_path(); + if (!path) + return -ENOENT; + + do { + ret = check_committed_ref(trans, root, path, objectid, + offset, bytenr); + if (ret && ret != -ENOENT) goto out; - } - path->slots[0]++; + ret2 = check_delayed_ref(trans, root, path, objectid, + offset, bytenr); + } while (ret2 == -EAGAIN); + + if (ret2 && ret2 != -ENOENT) { + ret = ret2; + goto out; } - ret = 0; + + if (ret != -ENOENT || ret2 != -ENOENT) + ret = 0; out: btrfs_free_path(path); return ret; } +#if 0 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, u32 nr_extents) { @@ -1600,62 +2275,44 @@ static int refsort_cmp(const void *a_void, const void *b_void) return 1; return 0; } +#endif - -noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, +static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *orig_buf, - struct extent_buffer *buf, u32 *nr_extents) + struct extent_buffer *buf, + int full_backref, int inc) { u64 bytenr; + u64 num_bytes; + u64 parent; u64 ref_root; - u64 orig_root; - u64 ref_generation; - u64 orig_generation; - struct refsort *sorted; u32 nritems; - u32 nr_file_extents = 0; struct btrfs_key key; struct btrfs_file_extent_item *fi; int i; int level; int ret = 0; - int faili = 0; - int refi = 0; - int slot; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64, u64, u64); + u64, u64, u64, u64, u64, u64); ref_root = btrfs_header_owner(buf); - ref_generation = btrfs_header_generation(buf); - orig_root = btrfs_header_owner(orig_buf); - orig_generation = btrfs_header_generation(orig_buf); - nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); - sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS); - BUG_ON(!sorted); + if (!root->ref_cows && level == 0) + return 0; - if (root->ref_cows) { - process_func = __btrfs_inc_extent_ref; - } else { - if (level == 0 && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) - goto out; - if (level != 0 && - root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) - goto out; - process_func = __btrfs_update_extent_ref; - } + if (inc) + process_func = btrfs_inc_extent_ref; + else + process_func = btrfs_free_extent; + + if (full_backref) + parent = buf->start; + else + parent = 0; - /* - * we make two passes through the items. In the first pass we - * only record the byte number and slot. Then we sort based on - * byte number and do the actual work based on the sorted results - */ for (i = 0; i < nritems; i++) { - cond_resched(); if (level == 0) { btrfs_item_key_to_cpu(buf, &key, i); if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) @@ -1669,144 +2326,38 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, if (bytenr == 0) continue; - nr_file_extents++; - sorted[refi].bytenr = bytenr; - sorted[refi].slot = i; - refi++; - } else { - bytenr = btrfs_node_blockptr(buf, i); - sorted[refi].bytenr = bytenr; - sorted[refi].slot = i; - refi++; - } - } - /* - * if refi == 0, we didn't actually put anything into the sorted - * array and we're done - */ - if (refi == 0) - goto out; - - sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); - - for (i = 0; i < refi; i++) { - cond_resched(); - slot = sorted[i].slot; - bytenr = sorted[i].bytenr; - - if (level == 0) { - btrfs_item_key_to_cpu(buf, &key, slot); - - ret = process_func(trans, root, bytenr, - orig_buf->start, buf->start, - orig_root, ref_root, - orig_generation, ref_generation, - key.objectid); - - if (ret) { - faili = slot; - WARN_ON(1); + num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); + key.offset -= btrfs_file_extent_offset(buf, fi); + ret = process_func(trans, root, bytenr, num_bytes, + parent, ref_root, key.objectid, + key.offset); + if (ret) goto fail; - } } else { - ret = process_func(trans, root, bytenr, - orig_buf->start, buf->start, - orig_root, ref_root, - orig_generation, ref_generation, - level - 1); - if (ret) { - faili = slot; - WARN_ON(1); + bytenr = btrfs_node_blockptr(buf, i); + num_bytes = btrfs_level_size(root, level - 1); + ret = process_func(trans, root, bytenr, num_bytes, + parent, ref_root, level - 1, 0); + if (ret) goto fail; - } } } -out: - kfree(sorted); - if (nr_extents) { - if (level == 0) - *nr_extents = nr_file_extents; - else - *nr_extents = nritems; - } return 0; fail: - kfree(sorted); - WARN_ON(1); + BUG(); return ret; } -int btrfs_update_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *orig_buf, - struct extent_buffer *buf, int start_slot, int nr) - +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref) { - u64 bytenr; - u64 ref_root; - u64 orig_root; - u64 ref_generation; - u64 orig_generation; - struct btrfs_key key; - struct btrfs_file_extent_item *fi; - int i; - int ret; - int slot; - int level; - - BUG_ON(start_slot < 0); - BUG_ON(start_slot + nr > btrfs_header_nritems(buf)); - - ref_root = btrfs_header_owner(buf); - ref_generation = btrfs_header_generation(buf); - orig_root = btrfs_header_owner(orig_buf); - orig_generation = btrfs_header_generation(orig_buf); - level = btrfs_header_level(buf); - - if (!root->ref_cows) { - if (level == 0 && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) - return 0; - if (level != 0 && - root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) - return 0; - } + return __btrfs_mod_ref(trans, root, buf, full_backref, 1); +} - for (i = 0, slot = start_slot; i < nr; i++, slot++) { - cond_resched(); - if (level == 0) { - btrfs_item_key_to_cpu(buf, &key, slot); - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(buf, slot, - struct btrfs_file_extent_item); - if (btrfs_file_extent_type(buf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - bytenr = btrfs_file_extent_disk_bytenr(buf, fi); - if (bytenr == 0) - continue; - ret = __btrfs_update_extent_ref(trans, root, bytenr, - orig_buf->start, buf->start, - orig_root, ref_root, - orig_generation, ref_generation, - key.objectid); - if (ret) - goto fail; - } else { - bytenr = btrfs_node_blockptr(buf, slot); - ret = __btrfs_update_extent_ref(trans, root, bytenr, - orig_buf->start, buf->start, - orig_root, ref_root, - orig_generation, ref_generation, - level - 1); - if (ret) - goto fail; - } - } - return 0; -fail: - WARN_ON(1); - return -1; +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref) +{ + return __btrfs_mod_ref(trans, root, buf, full_backref, 0); } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -1815,7 +2366,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *cache) { int ret; - int pending_ret; struct btrfs_root *extent_root = root->fs_info->extent_root; unsigned long bi; struct extent_buffer *leaf; @@ -1831,12 +2381,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(extent_root, path); fail: - finish_current_insert(trans, extent_root, 0); - pending_ret = del_pending_extents(trans, extent_root, 0); if (ret) return ret; - if (pending_ret) - return pending_ret; return 0; } @@ -1900,7 +2446,7 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) if (!block_group || block_group->ro) readonly = 1; if (block_group) - put_block_group(block_group); + btrfs_put_block_group(block_group); return readonly; } @@ -2150,11 +2696,15 @@ again: printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" ", %llu bytes_used, %llu bytes_reserved, " - "%llu bytes_pinned, %llu bytes_readonly, %llu may use" - "%llu total\n", bytes, data_sinfo->bytes_delalloc, - data_sinfo->bytes_used, data_sinfo->bytes_reserved, - data_sinfo->bytes_pinned, data_sinfo->bytes_readonly, - data_sinfo->bytes_may_use, data_sinfo->total_bytes); + "%llu bytes_pinned, %llu bytes_readonly, %llu may use " + "%llu total\n", (unsigned long long)bytes, + (unsigned long long)data_sinfo->bytes_delalloc, + (unsigned long long)data_sinfo->bytes_used, + (unsigned long long)data_sinfo->bytes_reserved, + (unsigned long long)data_sinfo->bytes_pinned, + (unsigned long long)data_sinfo->bytes_readonly, + (unsigned long long)data_sinfo->bytes_may_use, + (unsigned long long)data_sinfo->total_bytes); return -ENOSPC; } data_sinfo->bytes_may_use += bytes; @@ -2225,15 +2775,29 @@ void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, spin_unlock(&info->lock); } +static void force_metadata_allocation(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + found->force_alloc = 1; + } + rcu_read_unlock(); +} + static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force) { struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = extent_root->fs_info; u64 thresh; int ret = 0; - mutex_lock(&extent_root->fs_info->chunk_mutex); + mutex_lock(&fs_info->chunk_mutex); flags = btrfs_reduce_alloc_profile(extent_root, flags); @@ -2265,6 +2829,18 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, } spin_unlock(&space_info->lock); + /* + * if we're doing a data chunk, go ahead and make sure that + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. + */ + if (flags & BTRFS_BLOCK_GROUP_DATA) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) + force_metadata_allocation(fs_info); + } + ret = btrfs_alloc_chunk(trans, extent_root, flags); if (ret) space_info->full = 1; @@ -2284,6 +2860,24 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 old_val; u64 byte_in_group; + /* block accounting for super block */ + spin_lock(&info->delalloc_lock); + old_val = btrfs_super_bytes_used(&info->super_copy); + if (alloc) + old_val += num_bytes; + else + old_val -= num_bytes; + btrfs_set_super_bytes_used(&info->super_copy, old_val); + + /* block accounting for root item */ + old_val = btrfs_root_used(&root->root_item); + if (alloc) + old_val += num_bytes; + else + old_val -= num_bytes; + btrfs_set_root_used(&root->root_item, old_val); + spin_unlock(&info->delalloc_lock); + while (total) { cache = btrfs_lookup_block_group(info, bytenr); if (!cache) @@ -2324,7 +2918,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, WARN_ON(ret); } } - put_block_group(cache); + btrfs_put_block_group(cache); total -= num_bytes; bytenr += num_bytes; } @@ -2341,7 +2935,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) return 0; bytenr = cache->key.objectid; - put_block_group(cache); + btrfs_put_block_group(cache); return bytenr; } @@ -2353,7 +2947,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, struct btrfs_block_group_cache *cache; struct btrfs_fs_info *fs_info = root->fs_info; - WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex)); if (pin) { set_extent_dirty(&fs_info->pinned_extents, bytenr, bytenr + num - 1, GFP_NOFS); @@ -2361,6 +2954,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, clear_extent_dirty(&fs_info->pinned_extents, bytenr, bytenr + num - 1, GFP_NOFS); } + while (num > 0) { cache = btrfs_lookup_block_group(fs_info, bytenr); BUG_ON(!cache); @@ -2385,7 +2979,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, if (cache->cached) btrfs_add_free_space(cache, bytenr, len); } - put_block_group(cache); + btrfs_put_block_group(cache); bytenr += len; num -= len; } @@ -2416,7 +3010,7 @@ static int update_reserved_extents(struct btrfs_root *root, } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - put_block_group(cache); + btrfs_put_block_group(cache); bytenr += len; num -= len; } @@ -2431,7 +3025,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; int ret; - mutex_lock(&root->fs_info->pinned_mutex); while (1) { ret = find_first_extent_bit(pinned_extents, last, &start, &end, EXTENT_DIRTY); @@ -2440,7 +3033,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) set_extent_dirty(copy, start, end, GFP_NOFS); last = end + 1; } - mutex_unlock(&root->fs_info->pinned_mutex); return 0; } @@ -2452,7 +3044,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, u64 end; int ret; - mutex_lock(&root->fs_info->pinned_mutex); while (1) { ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); @@ -2461,209 +3052,20 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, ret = btrfs_discard_extent(root, start, end + 1 - start); + /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, start, end + 1 - start, 0); clear_extent_dirty(unpin, start, end, GFP_NOFS); - if (need_resched()) { - mutex_unlock(&root->fs_info->pinned_mutex); - cond_resched(); - mutex_lock(&root->fs_info->pinned_mutex); - } + cond_resched(); } - mutex_unlock(&root->fs_info->pinned_mutex); return ret; } -static int finish_current_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, int all) -{ - u64 start; - u64 end; - u64 priv; - u64 search = 0; - struct btrfs_fs_info *info = extent_root->fs_info; - struct btrfs_path *path; - struct pending_extent_op *extent_op, *tmp; - struct list_head insert_list, update_list; - int ret; - int num_inserts = 0, max_inserts, restart = 0; - - path = btrfs_alloc_path(); - INIT_LIST_HEAD(&insert_list); - INIT_LIST_HEAD(&update_list); - - max_inserts = extent_root->leafsize / - (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) + - sizeof(struct btrfs_extent_ref) + - sizeof(struct btrfs_extent_item)); -again: - mutex_lock(&info->extent_ins_mutex); - while (1) { - ret = find_first_extent_bit(&info->extent_ins, search, &start, - &end, EXTENT_WRITEBACK); - if (ret) { - if (restart && !num_inserts && - list_empty(&update_list)) { - restart = 0; - search = 0; - continue; - } - break; - } - - ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); - if (!ret) { - if (all) - restart = 1; - search = end + 1; - if (need_resched()) { - mutex_unlock(&info->extent_ins_mutex); - cond_resched(); - mutex_lock(&info->extent_ins_mutex); - } - continue; - } - - ret = get_state_private(&info->extent_ins, start, &priv); - BUG_ON(ret); - extent_op = (struct pending_extent_op *)(unsigned long) priv; - - if (extent_op->type == PENDING_EXTENT_INSERT) { - num_inserts++; - list_add_tail(&extent_op->list, &insert_list); - search = end + 1; - if (num_inserts == max_inserts) { - restart = 1; - break; - } - } else if (extent_op->type == PENDING_BACKREF_UPDATE) { - list_add_tail(&extent_op->list, &update_list); - search = end + 1; - } else { - BUG(); - } - } - - /* - * process the update list, clear the writeback bit for it, and if - * somebody marked this thing for deletion then just unlock it and be - * done, the free_extents will handle it - */ - list_for_each_entry_safe(extent_op, tmp, &update_list, list) { - clear_extent_bits(&info->extent_ins, extent_op->bytenr, - extent_op->bytenr + extent_op->num_bytes - 1, - EXTENT_WRITEBACK, GFP_NOFS); - if (extent_op->del) { - list_del_init(&extent_op->list); - unlock_extent(&info->extent_ins, extent_op->bytenr, - extent_op->bytenr + extent_op->num_bytes - - 1, GFP_NOFS); - kfree(extent_op); - } - } - mutex_unlock(&info->extent_ins_mutex); - - /* - * still have things left on the update list, go ahead an update - * everything - */ - if (!list_empty(&update_list)) { - ret = update_backrefs(trans, extent_root, path, &update_list); - BUG_ON(ret); - - /* we may have COW'ed new blocks, so lets start over */ - if (all) - restart = 1; - } - - /* - * if no inserts need to be done, but we skipped some extents and we - * need to make sure everything is cleaned then reset everything and - * go back to the beginning - */ - if (!num_inserts && restart) { - search = 0; - restart = 0; - INIT_LIST_HEAD(&update_list); - INIT_LIST_HEAD(&insert_list); - goto again; - } else if (!num_inserts) { - goto out; - } - - /* - * process the insert extents list. Again if we are deleting this - * extent, then just unlock it, pin down the bytes if need be, and be - * done with it. Saves us from having to actually insert the extent - * into the tree and then subsequently come along and delete it - */ - mutex_lock(&info->extent_ins_mutex); - list_for_each_entry_safe(extent_op, tmp, &insert_list, list) { - clear_extent_bits(&info->extent_ins, extent_op->bytenr, - extent_op->bytenr + extent_op->num_bytes - 1, - EXTENT_WRITEBACK, GFP_NOFS); - if (extent_op->del) { - u64 used; - list_del_init(&extent_op->list); - unlock_extent(&info->extent_ins, extent_op->bytenr, - extent_op->bytenr + extent_op->num_bytes - - 1, GFP_NOFS); - - mutex_lock(&extent_root->fs_info->pinned_mutex); - ret = pin_down_bytes(trans, extent_root, - extent_op->bytenr, - extent_op->num_bytes, 0); - mutex_unlock(&extent_root->fs_info->pinned_mutex); - - spin_lock(&info->delalloc_lock); - used = btrfs_super_bytes_used(&info->super_copy); - btrfs_set_super_bytes_used(&info->super_copy, - used - extent_op->num_bytes); - used = btrfs_root_used(&extent_root->root_item); - btrfs_set_root_used(&extent_root->root_item, - used - extent_op->num_bytes); - spin_unlock(&info->delalloc_lock); - - ret = update_block_group(trans, extent_root, - extent_op->bytenr, - extent_op->num_bytes, - 0, ret > 0); - BUG_ON(ret); - kfree(extent_op); - num_inserts--; - } - } - mutex_unlock(&info->extent_ins_mutex); - - ret = insert_extents(trans, extent_root, path, &insert_list, - num_inserts); - BUG_ON(ret); - - /* - * if restart is set for whatever reason we need to go back and start - * searching through the pending list again. - * - * We just inserted some extents, which could have resulted in new - * blocks being allocated, which would result in new blocks needing - * updates, so if all is set we _must_ restart to get the updated - * blocks. - */ - if (restart || all) { - INIT_LIST_HEAD(&insert_list); - INIT_LIST_HEAD(&update_list); - search = 0; - restart = 0; - num_inserts = 0; - goto again; - } -out: - btrfs_free_path(path); - return 0; -} - static int pin_down_bytes(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int is_data) + struct btrfs_path *path, + u64 bytenr, u64 num_bytes, int is_data, + struct extent_buffer **must_clean) { int err = 0; struct extent_buffer *buf; @@ -2685,77 +3087,94 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, u64 header_owner = btrfs_header_owner(buf); u64 header_transid = btrfs_header_generation(buf); if (header_owner != BTRFS_TREE_LOG_OBJECTID && - header_owner != BTRFS_TREE_RELOC_OBJECTID && header_transid == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - clean_tree_block(NULL, root, buf); - btrfs_tree_unlock(buf); - free_extent_buffer(buf); + *must_clean = buf; return 1; } btrfs_tree_unlock(buf); } free_extent_buffer(buf); pinit: + btrfs_set_path_blocking(path); + /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); BUG_ON(err < 0); return 0; } -/* - * remove an extent from the root, returns 0 on success - */ -static int __free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin, int mark_free) + +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner_objectid, + u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_path *path; struct btrfs_key key; + struct btrfs_path *path; struct btrfs_fs_info *info = root->fs_info; struct btrfs_root *extent_root = info->extent_root; struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; int ret; + int is_data; int extent_slot = 0; int found_extent = 0; int num_to_del = 1; - struct btrfs_extent_item *ei; - u32 refs; + u32 item_size; + u64 refs; - key.objectid = bytenr; - btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); - key.offset = num_bytes; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = 1; - ret = lookup_extent_backref(trans, extent_root, path, - bytenr, parent, root_objectid, - ref_generation, owner_objectid, 1); + path->leave_spinning = 1; + + is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; + BUG_ON(!is_data && refs_to_drop != 1); + + ret = lookup_extent_backref(trans, extent_root, path, &iref, + bytenr, num_bytes, parent, + root_objectid, owner_objectid, + owner_offset); if (ret == 0) { - struct btrfs_key found_key; extent_slot = path->slots[0]; - while (extent_slot > 0) { - extent_slot--; - btrfs_item_key_to_cpu(path->nodes[0], &found_key, + while (extent_slot >= 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, extent_slot); - if (found_key.objectid != bytenr) + if (key.objectid != bytenr) break; - if (found_key.type == BTRFS_EXTENT_ITEM_KEY && - found_key.offset == num_bytes) { + if (key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) { found_extent = 1; break; } if (path->slots[0] - extent_slot > 5) break; + extent_slot--; } +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); + if (found_extent && item_size < sizeof(*ei)) + found_extent = 0; +#endif if (!found_extent) { - ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(iref); + ret = remove_extent_backref(trans, extent_root, path, + NULL, refs_to_drop, + is_data); BUG_ON(ret); btrfs_release_path(extent_root, path); + path->leave_spinning = 1; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); if (ret) { @@ -2771,78 +3190,125 @@ static int __free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); WARN_ON(1); printk(KERN_ERR "btrfs unable to find ref byte nr %llu " - "root %llu gen %llu owner %llu\n", + "parent %llu root %llu owner %llu offset %llu\n", (unsigned long long)bytenr, + (unsigned long long)parent, (unsigned long long)root_objectid, - (unsigned long long)ref_generation, - (unsigned long long)owner_objectid); + (unsigned long long)owner_objectid, + (unsigned long long)owner_offset); } leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, extent_slot); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + BUG_ON(found_extent || extent_slot != path->slots[0]); + ret = convert_extent_item_v0(trans, extent_root, path, + owner_objectid, 0); + BUG_ON(ret < 0); + + btrfs_release_path(extent_root, path); + path->leave_spinning = 1; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + ret = btrfs_search_slot(trans, extent_root, &key, path, + -1, 1); + if (ret) { + printk(KERN_ERR "umm, got %d back from search" + ", was looking for %llu\n", ret, + (unsigned long long)bytenr); + btrfs_print_leaf(extent_root, path->nodes[0]); + } + BUG_ON(ret); + extent_slot = path->slots[0]; + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, extent_slot); + } +#endif + BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); - refs = btrfs_extent_refs(leaf, ei); - BUG_ON(refs == 0); - refs -= 1; - btrfs_set_extent_refs(leaf, ei, refs); + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { + struct btrfs_tree_block_info *bi; + BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); + bi = (struct btrfs_tree_block_info *)(ei + 1); + WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); + } - btrfs_mark_buffer_dirty(leaf); + refs = btrfs_extent_refs(leaf, ei); + BUG_ON(refs < refs_to_drop); + refs -= refs_to_drop; - if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) { - struct btrfs_extent_ref *ref; - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref); - BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1); - /* if the back ref and the extent are next to each other - * they get deleted below in one shot + if (refs > 0) { + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + /* + * In the case of inline back ref, reference count will + * be updated by remove_extent_backref */ - path->slots[0] = extent_slot; - num_to_del = 2; - } else if (found_extent) { - /* otherwise delete the extent back ref */ - ret = remove_extent_backref(trans, extent_root, path); - BUG_ON(ret); - /* if refs are 0, we need to setup the path for deletion */ - if (refs == 0) { - btrfs_release_path(extent_root, path); - ret = btrfs_search_slot(trans, extent_root, &key, path, - -1, 1); + if (iref) { + BUG_ON(!found_extent); + } else { + btrfs_set_extent_refs(leaf, ei, refs); + btrfs_mark_buffer_dirty(leaf); + } + if (found_extent) { + ret = remove_extent_backref(trans, extent_root, path, + iref, refs_to_drop, + is_data); BUG_ON(ret); } - } + } else { + int mark_free = 0; + struct extent_buffer *must_clean = NULL; + + if (found_extent) { + BUG_ON(is_data && refs_to_drop != + extent_data_ref_count(root, path, iref)); + if (iref) { + BUG_ON(path->slots[0] != extent_slot); + } else { + BUG_ON(path->slots[0] != extent_slot + 1); + path->slots[0] = extent_slot; + num_to_del = 2; + } + } - if (refs == 0) { - u64 super_used; - u64 root_used; + ret = pin_down_bytes(trans, root, path, bytenr, + num_bytes, is_data, &must_clean); + if (ret > 0) + mark_free = 1; + BUG_ON(ret < 0); + /* + * it is going to be very rare for someone to be waiting + * on the block we're freeing. del_items might need to + * schedule, so rather than get fancy, just force it + * to blocking here + */ + if (must_clean) + btrfs_set_lock_blocking(must_clean); - if (pin) { - mutex_lock(&root->fs_info->pinned_mutex); - ret = pin_down_bytes(trans, root, bytenr, num_bytes, - owner_objectid >= BTRFS_FIRST_FREE_OBJECTID); - mutex_unlock(&root->fs_info->pinned_mutex); - if (ret > 0) - mark_free = 1; - BUG_ON(ret < 0); - } - /* block accounting for super block */ - spin_lock(&info->delalloc_lock); - super_used = btrfs_super_bytes_used(&info->super_copy); - btrfs_set_super_bytes_used(&info->super_copy, - super_used - num_bytes); - - /* block accounting for root item */ - root_used = btrfs_root_used(&root->root_item); - btrfs_set_root_used(&root->root_item, - root_used - num_bytes); - spin_unlock(&info->delalloc_lock); ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); BUG_ON(ret); btrfs_release_path(extent_root, path); - if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + if (must_clean) { + clean_tree_block(NULL, root, must_clean); + btrfs_tree_unlock(must_clean); + free_extent_buffer(must_clean); + } + + if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); BUG_ON(ret); + } else { + invalidate_mapping_pages(info->btree_inode->i_mapping, + bytenr >> PAGE_CACHE_SHIFT, + (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); } ret = update_block_group(trans, root, bytenr, num_bytes, 0, @@ -2850,231 +3316,115 @@ static int __free_extent(struct btrfs_trans_handle *trans, BUG_ON(ret); } btrfs_free_path(path); - finish_current_insert(trans, extent_root, 0); return ret; } /* - * find all the blocks marked as pending in the radix tree and remove - * them from the extent map + * when we free an extent, it is possible (and likely) that we free the last + * delayed ref for that extent as well. This searches the delayed ref tree for + * a given extent, and if there are no other delayed refs to be processed, it + * removes it from the tree. */ -static int del_pending_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, int all) +static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr) { + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + struct rb_node *node; int ret; - int err = 0; - u64 start; - u64 end; - u64 priv; - u64 search = 0; - int nr = 0, skipped = 0; - struct extent_io_tree *pending_del; - struct extent_io_tree *extent_ins; - struct pending_extent_op *extent_op; - struct btrfs_fs_info *info = extent_root->fs_info; - struct list_head delete_list; - - INIT_LIST_HEAD(&delete_list); - extent_ins = &extent_root->fs_info->extent_ins; - pending_del = &extent_root->fs_info->pending_del; -again: - mutex_lock(&info->extent_ins_mutex); - while (1) { - ret = find_first_extent_bit(pending_del, search, &start, &end, - EXTENT_WRITEBACK); - if (ret) { - if (all && skipped && !nr) { - search = 0; - skipped = 0; - continue; - } - mutex_unlock(&info->extent_ins_mutex); - break; - } - - ret = try_lock_extent(extent_ins, start, end, GFP_NOFS); - if (!ret) { - search = end+1; - skipped = 1; - - if (need_resched()) { - mutex_unlock(&info->extent_ins_mutex); - cond_resched(); - mutex_lock(&info->extent_ins_mutex); - } - - continue; - } - BUG_ON(ret < 0); - - ret = get_state_private(pending_del, start, &priv); - BUG_ON(ret); - extent_op = (struct pending_extent_op *)(unsigned long)priv; - - clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK, - GFP_NOFS); - if (!test_range_bit(extent_ins, start, end, - EXTENT_WRITEBACK, 0)) { - list_add_tail(&extent_op->list, &delete_list); - nr++; - } else { - kfree(extent_op); - - ret = get_state_private(&info->extent_ins, start, - &priv); - BUG_ON(ret); - extent_op = (struct pending_extent_op *) - (unsigned long)priv; - - clear_extent_bits(&info->extent_ins, start, end, - EXTENT_WRITEBACK, GFP_NOFS); - - if (extent_op->type == PENDING_BACKREF_UPDATE) { - list_add_tail(&extent_op->list, &delete_list); - search = end + 1; - nr++; - continue; - } - - mutex_lock(&extent_root->fs_info->pinned_mutex); - ret = pin_down_bytes(trans, extent_root, start, - end + 1 - start, 0); - mutex_unlock(&extent_root->fs_info->pinned_mutex); - - ret = update_block_group(trans, extent_root, start, - end + 1 - start, 0, ret > 0); - - unlock_extent(extent_ins, start, end, GFP_NOFS); - BUG_ON(ret); - kfree(extent_op); - } - if (ret) - err = ret; + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (!head) + goto out; - search = end + 1; + node = rb_prev(&head->node.rb_node); + if (!node) + goto out; - if (need_resched()) { - mutex_unlock(&info->extent_ins_mutex); - cond_resched(); - mutex_lock(&info->extent_ins_mutex); - } - } + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (nr) { - ret = free_extents(trans, extent_root, &delete_list); - BUG_ON(ret); - } + /* there are still entries for this ref, we can't drop it */ + if (ref->bytenr == bytenr) + goto out; - if (all && skipped) { - INIT_LIST_HEAD(&delete_list); - search = 0; - nr = 0; - goto again; + if (head->extent_op) { + if (!head->must_insert_reserved) + goto out; + kfree(head->extent_op); + head->extent_op = NULL; } - if (!err) - finish_current_insert(trans, extent_root, 0); - return err; -} - -/* - * remove an extent from the root, returns 0 on success - */ -static int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin) -{ - struct btrfs_root *extent_root = root->fs_info->extent_root; - int pending_ret; - int ret; - - WARN_ON(num_bytes < root->sectorsize); - if (root == extent_root) { - struct pending_extent_op *extent_op = NULL; - - mutex_lock(&root->fs_info->extent_ins_mutex); - if (test_range_bit(&root->fs_info->extent_ins, bytenr, - bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) { - u64 priv; - ret = get_state_private(&root->fs_info->extent_ins, - bytenr, &priv); - BUG_ON(ret); - extent_op = (struct pending_extent_op *) - (unsigned long)priv; - - extent_op->del = 1; - if (extent_op->type == PENDING_EXTENT_INSERT) { - mutex_unlock(&root->fs_info->extent_ins_mutex); - return 0; - } - } - - if (extent_op) { - ref_generation = extent_op->orig_generation; - parent = extent_op->orig_parent; - } + /* + * waiting for the lock here would deadlock. If someone else has it + * locked they are already in the process of dropping it anyway + */ + if (!mutex_trylock(&head->mutex)) + goto out; - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); + /* + * at this point we have a head with no other entries. Go + * ahead and process it. + */ + head->node.in_tree = 0; + rb_erase(&head->node.rb_node, &delayed_refs->root); - extent_op->type = PENDING_EXTENT_DELETE; - extent_op->bytenr = bytenr; - extent_op->num_bytes = num_bytes; - extent_op->parent = parent; - extent_op->orig_parent = parent; - extent_op->generation = ref_generation; - extent_op->orig_generation = ref_generation; - extent_op->level = (int)owner_objectid; - INIT_LIST_HEAD(&extent_op->list); - extent_op->del = 0; - - set_extent_bits(&root->fs_info->pending_del, - bytenr, bytenr + num_bytes - 1, - EXTENT_WRITEBACK, GFP_NOFS); - set_state_private(&root->fs_info->pending_del, - bytenr, (unsigned long)extent_op); - mutex_unlock(&root->fs_info->extent_ins_mutex); - return 0; - } - /* if metadata always pin */ - if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { - if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { - mutex_lock(&root->fs_info->pinned_mutex); - btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); - mutex_unlock(&root->fs_info->pinned_mutex); - update_reserved_extents(root, bytenr, num_bytes, 0); - return 0; - } - pin = 1; - } + delayed_refs->num_entries--; - /* if data pin when any transaction has committed this */ - if (ref_generation != trans->transid) - pin = 1; + /* + * we don't take a ref on the node because we're removing it from the + * tree, so we just steal the ref the tree was holding. + */ + delayed_refs->num_heads--; + if (list_empty(&head->cluster)) + delayed_refs->num_heads_ready--; - ret = __free_extent(trans, root, bytenr, num_bytes, parent, - root_objectid, ref_generation, - owner_objectid, pin, pin == 0); + list_del_init(&head->cluster); + spin_unlock(&delayed_refs->lock); - finish_current_insert(trans, root->fs_info->extent_root, 0); - pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0); - return ret ? ret : pending_ret; + ret = run_one_delayed_ref(trans, root->fs_info->tree_root, + &head->node, head->extent_op, + head->must_insert_reserved); + BUG_ON(ret); + btrfs_put_delayed_ref(&head->node); + return 0; +out: + spin_unlock(&delayed_refs->lock); + return 0; } int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin) + u64 root_objectid, u64 owner, u64 offset) { int ret; - ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent, - root_objectid, ref_generation, - owner_objectid, pin); + /* + * tree log blocks never actually go into the extent allocation + * tree, just update pinning info and exit early. + */ + if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { + WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); + /* unlocks the pinned mutex */ + btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); + update_reserved_extents(root, bytenr, num_bytes, 0); + ret = 0; + } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + parent, root_objectid, (int)owner, + BTRFS_DROP_DELAYED_REF, NULL); + BUG_ON(ret); + ret = check_ref_cleanup(trans, root, bytenr); + BUG_ON(ret); + } else { + ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, + parent, root_objectid, owner, + offset, BTRFS_DROP_DELAYED_REF, NULL); + BUG_ON(ret); + } return ret; } @@ -3103,228 +3453,262 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; - u64 total_needed = num_bytes; - u64 *last_ptr = NULL; - u64 last_wanted = 0; + struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; - int chunk_alloc_done = 0; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; - struct list_head *head = NULL, *cur = NULL; - int loop = 0; - int extra_loop = 0; struct btrfs_space_info *space_info; + int last_ptr_loop = 0; + int loop = 0; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); ins->objectid = 0; ins->offset = 0; + space_info = __find_space_info(root->fs_info, data); + if (orig_root->ref_cows || empty_size) allowed_chunk_alloc = 1; if (data & BTRFS_BLOCK_GROUP_METADATA) { - last_ptr = &root->fs_info->last_alloc; + last_ptr = &root->fs_info->meta_alloc_cluster; if (!btrfs_test_opt(root, SSD)) empty_cluster = 64 * 1024; } - if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) - last_ptr = &root->fs_info->last_data_alloc; + if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) { + last_ptr = &root->fs_info->data_alloc_cluster; + } if (last_ptr) { - if (*last_ptr) { - hint_byte = *last_ptr; - last_wanted = *last_ptr; - } else - empty_size += empty_cluster; - } else { - empty_cluster = 0; + spin_lock(&last_ptr->lock); + if (last_ptr->block_group) + hint_byte = last_ptr->window_start; + spin_unlock(&last_ptr->lock); } + search_start = max(search_start, first_logical_byte(root, 0)); search_start = max(search_start, hint_byte); - if (last_wanted && search_start != last_wanted) { - last_wanted = 0; - empty_size += empty_cluster; + if (!last_ptr) { + empty_cluster = 0; + loop = 1; } - total_needed += empty_size; - block_group = btrfs_lookup_block_group(root->fs_info, search_start); - if (!block_group) - block_group = btrfs_lookup_first_block_group(root->fs_info, - search_start); - space_info = __find_space_info(root->fs_info, data); + if (search_start == hint_byte) { + block_group = btrfs_lookup_block_group(root->fs_info, + search_start); + if (block_group && block_group_bits(block_group, data)) { + down_read(&space_info->groups_sem); + if (list_empty(&block_group->list) || + block_group->ro) { + /* + * someone is removing this block group, + * we can't jump into the have_block_group + * target because our list pointers are not + * valid + */ + btrfs_put_block_group(block_group); + up_read(&space_info->groups_sem); + } else + goto have_block_group; + } else if (block_group) { + btrfs_put_block_group(block_group); + } + } +search: down_read(&space_info->groups_sem); - while (1) { - struct btrfs_free_space *free_space; - /* - * the only way this happens if our hint points to a block - * group thats not of the proper type, while looping this - * should never happen - */ - if (empty_size) - extra_loop = 1; + list_for_each_entry(block_group, &space_info->block_groups, list) { + u64 offset; - if (!block_group) - goto new_group_no_lock; + atomic_inc(&block_group->count); + search_start = block_group->key.objectid; +have_block_group: if (unlikely(!block_group->cached)) { mutex_lock(&block_group->cache_mutex); ret = cache_block_group(root, block_group); mutex_unlock(&block_group->cache_mutex); - if (ret) + if (ret) { + btrfs_put_block_group(block_group); break; + } } - mutex_lock(&block_group->alloc_mutex); - if (unlikely(!block_group_bits(block_group, data))) - goto new_group; - if (unlikely(block_group->ro)) - goto new_group; + goto loop; - free_space = btrfs_find_free_space(block_group, search_start, - total_needed); - if (free_space) { - u64 start = block_group->key.objectid; - u64 end = block_group->key.objectid + - block_group->key.offset; + if (last_ptr) { + /* + * the refill lock keeps out other + * people trying to start a new cluster + */ + spin_lock(&last_ptr->refill_lock); + if (last_ptr->block_group && + (last_ptr->block_group->ro || + !block_group_bits(last_ptr->block_group, data))) { + offset = 0; + goto refill_cluster; + } - search_start = stripe_align(root, free_space->offset); + offset = btrfs_alloc_from_cluster(block_group, last_ptr, + num_bytes, search_start); + if (offset) { + /* we have a block, we're done */ + spin_unlock(&last_ptr->refill_lock); + goto checks; + } - /* move on to the next group */ - if (search_start + num_bytes >= search_end) - goto new_group; + spin_lock(&last_ptr->lock); + /* + * whoops, this cluster doesn't actually point to + * this block group. Get a ref on the block + * group is does point to and try again + */ + if (!last_ptr_loop && last_ptr->block_group && + last_ptr->block_group != block_group) { - /* move on to the next group */ - if (search_start + num_bytes > end) - goto new_group; + btrfs_put_block_group(block_group); + block_group = last_ptr->block_group; + atomic_inc(&block_group->count); + spin_unlock(&last_ptr->lock); + spin_unlock(&last_ptr->refill_lock); - if (last_wanted && search_start != last_wanted) { - total_needed += empty_cluster; - empty_size += empty_cluster; - last_wanted = 0; + last_ptr_loop = 1; + search_start = block_group->key.objectid; /* - * if search_start is still in this block group - * then we just re-search this block group + * we know this block group is properly + * in the list because + * btrfs_remove_block_group, drops the + * cluster before it removes the block + * group from the list */ - if (search_start >= start && - search_start < end) { - mutex_unlock(&block_group->alloc_mutex); - continue; - } - - /* else we go to the next block group */ - goto new_group; + goto have_block_group; } + spin_unlock(&last_ptr->lock); +refill_cluster: + /* + * this cluster didn't work out, free it and + * start over + */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); - if (exclude_nr > 0 && - (search_start + num_bytes > exclude_start && - search_start < exclude_start + exclude_nr)) { - search_start = exclude_start + exclude_nr; + last_ptr_loop = 0; + + /* allocate a cluster in this block group */ + ret = btrfs_find_space_cluster(trans, root, + block_group, last_ptr, + offset, num_bytes, + empty_cluster + empty_size); + if (ret == 0) { /* - * if search_start is still in this block group - * then we just re-search this block group + * now pull our allocation out of this + * cluster */ - if (search_start >= start && - search_start < end) { - mutex_unlock(&block_group->alloc_mutex); - last_wanted = 0; - continue; + offset = btrfs_alloc_from_cluster(block_group, + last_ptr, num_bytes, + search_start); + if (offset) { + /* we found one, proceed */ + spin_unlock(&last_ptr->refill_lock); + goto checks; } - - /* else we go to the next block group */ - goto new_group; } + /* + * at this point we either didn't find a cluster + * or we weren't able to allocate a block from our + * cluster. Free the cluster we've been trying + * to use, and go to the next block group + */ + if (loop < 2) { + btrfs_return_cluster_to_free_space(NULL, + last_ptr); + spin_unlock(&last_ptr->refill_lock); + goto loop; + } + spin_unlock(&last_ptr->refill_lock); + } - ins->objectid = search_start; - ins->offset = num_bytes; + offset = btrfs_find_space_for_alloc(block_group, search_start, + num_bytes, empty_size); + if (!offset) + goto loop; +checks: + search_start = stripe_align(root, offset); - btrfs_remove_free_space_lock(block_group, search_start, - num_bytes); - /* we are all good, lets return */ - mutex_unlock(&block_group->alloc_mutex); - break; + /* move on to the next group */ + if (search_start + num_bytes >= search_end) { + btrfs_add_free_space(block_group, offset, num_bytes); + goto loop; } -new_group: - mutex_unlock(&block_group->alloc_mutex); - put_block_group(block_group); - block_group = NULL; -new_group_no_lock: - /* don't try to compare new allocations against the - * last allocation any more - */ - last_wanted = 0; - /* - * Here's how this works. - * loop == 0: we were searching a block group via a hint - * and didn't find anything, so we start at - * the head of the block groups and keep searching - * loop == 1: we're searching through all of the block groups - * if we hit the head again we have searched - * all of the block groups for this space and we - * need to try and allocate, if we cant error out. - * loop == 2: we allocated more space and are looping through - * all of the block groups again. - */ - if (loop == 0) { - head = &space_info->block_groups; - cur = head->next; - loop++; - } else if (loop == 1 && cur == head) { - int keep_going; - - /* at this point we give up on the empty_size - * allocations and just try to allocate the min - * space. - * - * The extra_loop field was set if an empty_size - * allocation was attempted above, and if this - * is try we need to try the loop again without - * the additional empty_size. + /* move on to the next group */ + if (search_start + num_bytes > + block_group->key.objectid + block_group->key.offset) { + btrfs_add_free_space(block_group, offset, num_bytes); + goto loop; + } + + if (exclude_nr > 0 && + (search_start + num_bytes > exclude_start && + search_start < exclude_start + exclude_nr)) { + search_start = exclude_start + exclude_nr; + + btrfs_add_free_space(block_group, offset, num_bytes); + /* + * if search_start is still in this block group + * then we just re-search this block group */ - total_needed -= empty_size; - empty_size = 0; - keep_going = extra_loop; - loop++; + if (search_start >= block_group->key.objectid && + search_start < (block_group->key.objectid + + block_group->key.offset)) + goto have_block_group; + goto loop; + } - if (allowed_chunk_alloc && !chunk_alloc_done) { - up_read(&space_info->groups_sem); - ret = do_chunk_alloc(trans, root, num_bytes + - 2 * 1024 * 1024, data, 1); - down_read(&space_info->groups_sem); - if (ret < 0) - goto loop_check; - head = &space_info->block_groups; - /* - * we've allocated a new chunk, keep - * trying - */ - keep_going = 1; - chunk_alloc_done = 1; - } else if (!allowed_chunk_alloc) { - space_info->force_alloc = 1; - } -loop_check: - if (keep_going) { - cur = head->next; - extra_loop = 0; - } else { - break; - } - } else if (cur == head) { - break; + ins->objectid = search_start; + ins->offset = num_bytes; + + if (offset < search_start) + btrfs_add_free_space(block_group, offset, + search_start - offset); + BUG_ON(offset > search_start); + + /* we are all good, lets return */ + break; +loop: + btrfs_put_block_group(block_group); + } + up_read(&space_info->groups_sem); + + /* loop == 0, try to find a clustered alloc in every block group + * loop == 1, try again after forcing a chunk allocation + * loop == 2, set empty_size and empty_cluster to 0 and try again + */ + if (!ins->objectid && loop < 3 && + (empty_size || empty_cluster || allowed_chunk_alloc)) { + if (loop >= 2) { + empty_size = 0; + empty_cluster = 0; } - block_group = list_entry(cur, struct btrfs_block_group_cache, - list); - atomic_inc(&block_group->count); + if (allowed_chunk_alloc) { + ret = do_chunk_alloc(trans, root, num_bytes + + 2 * 1024 * 1024, data, 1); + allowed_chunk_alloc = 0; + } else { + space_info->force_alloc = 1; + } - search_start = block_group->key.objectid; - cur = cur->next; + if (loop < 3) { + loop++; + goto search; + } + ret = -ENOSPC; + } else if (!ins->objectid) { + ret = -ENOSPC; } /* we found what we needed */ @@ -3332,21 +3716,10 @@ loop_check: if (!(data & BTRFS_BLOCK_GROUP_DATA)) trans->block_group = block_group->key.objectid; - if (last_ptr) - *last_ptr = ins->objectid + ins->offset; + btrfs_put_block_group(block_group); ret = 0; - } else if (!ret) { - printk(KERN_ERR "btrfs searching for %llu bytes, " - "num_bytes %llu, loop %d, allowed_alloc %d\n", - (unsigned long long)total_needed, - (unsigned long long)num_bytes, - loop, allowed_chunk_alloc); - ret = -ENOSPC; } - if (block_group) - put_block_group(block_group); - up_read(&space_info->groups_sem); return ret; } @@ -3359,9 +3732,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes) info->bytes_pinned - info->bytes_reserved), (info->full) ? "" : "not "); printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," - " may_use=%llu, used=%llu\n", info->total_bytes, - info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use, - info->bytes_used); + " may_use=%llu, used=%llu\n", + (unsigned long long)info->total_bytes, + (unsigned long long)info->bytes_pinned, + (unsigned long long)info->bytes_delalloc, + (unsigned long long)info->bytes_may_use, + (unsigned long long)info->bytes_used); down_read(&info->groups_sem); list_for_each_entry(cache, &info->block_groups, list) { @@ -3451,7 +3827,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) ret = btrfs_discard_extent(root, start, len); btrfs_add_free_space(cache, start, len); - put_block_group(cache); + btrfs_put_block_group(cache); update_reserved_extents(root, start, len, 0); return ret; @@ -3472,129 +3848,147 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, return ret; } -static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins) +static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, u64 owner, u64 offset, + struct btrfs_key *ins, int ref_mod) { int ret; - int pending_ret; - u64 super_used; - u64 root_used; - u64 num_bytes = ins->offset; - u32 sizes[2]; - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_root *extent_root = info->extent_root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_extent_item *extent_item; - struct btrfs_extent_ref *ref; + struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; - struct btrfs_key keys[2]; + struct extent_buffer *leaf; + int type; + u32 size; - if (parent == 0) - parent = ins->objectid; + if (parent > 0) + type = BTRFS_SHARED_DATA_REF_KEY; + else + type = BTRFS_EXTENT_DATA_REF_KEY; - /* block accounting for super block */ - spin_lock(&info->delalloc_lock); - super_used = btrfs_super_bytes_used(&info->super_copy); - btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes); + size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); - /* block accounting for root item */ - root_used = btrfs_root_used(&root->root_item); - btrfs_set_root_used(&root->root_item, root_used + num_bytes); - spin_unlock(&info->delalloc_lock); + path = btrfs_alloc_path(); + BUG_ON(!path); - if (root == extent_root) { - struct pending_extent_op *extent_op; + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, + ins, size); + BUG_ON(ret); - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); + leaf = path->nodes[0]; + extent_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, extent_item, ref_mod); + btrfs_set_extent_generation(leaf, extent_item, trans->transid); + btrfs_set_extent_flags(leaf, extent_item, + flags | BTRFS_EXTENT_FLAG_DATA); + + iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (parent > 0) { + struct btrfs_shared_data_ref *ref; + ref = (struct btrfs_shared_data_ref *)(iref + 1); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); + } else { + struct btrfs_extent_data_ref *ref; + ref = (struct btrfs_extent_data_ref *)(&iref->offset); + btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, ref, owner); + btrfs_set_extent_data_ref_offset(leaf, ref, offset); + btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); + } + + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); - extent_op->type = PENDING_EXTENT_INSERT; - extent_op->bytenr = ins->objectid; - extent_op->num_bytes = ins->offset; - extent_op->parent = parent; - extent_op->orig_parent = 0; - extent_op->generation = ref_generation; - extent_op->orig_generation = 0; - extent_op->level = (int)owner; - INIT_LIST_HEAD(&extent_op->list); - extent_op->del = 0; - - mutex_lock(&root->fs_info->extent_ins_mutex); - set_extent_bits(&root->fs_info->extent_ins, ins->objectid, - ins->objectid + ins->offset - 1, - EXTENT_WRITEBACK, GFP_NOFS); - set_state_private(&root->fs_info->extent_ins, - ins->objectid, (unsigned long)extent_op); - mutex_unlock(&root->fs_info->extent_ins_mutex); - goto update_block; - } - - memcpy(&keys[0], ins, sizeof(*ins)); - keys[1].objectid = ins->objectid; - keys[1].type = BTRFS_EXTENT_REF_KEY; - keys[1].offset = parent; - sizes[0] = sizeof(*extent_item); - sizes[1] = sizeof(*ref); + ret = update_block_group(trans, root, ins->objectid, ins->offset, + 1, 0); + if (ret) { + printk(KERN_ERR "btrfs update block group failed for %llu " + "%llu\n", (unsigned long long)ins->objectid, + (unsigned long long)ins->offset); + BUG(); + } + return ret; +} + +static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, struct btrfs_disk_key *key, + int level, struct btrfs_key *ins) +{ + int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_extent_item *extent_item; + struct btrfs_tree_block_info *block_info; + struct btrfs_extent_inline_ref *iref; + struct btrfs_path *path; + struct extent_buffer *leaf; + u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); path = btrfs_alloc_path(); BUG_ON(!path); - ret = btrfs_insert_empty_items(trans, extent_root, path, keys, - sizes, 2); + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, + ins, size); BUG_ON(ret); - extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + leaf = path->nodes[0]; + extent_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - btrfs_set_extent_refs(path->nodes[0], extent_item, 1); - ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, - struct btrfs_extent_ref); - - btrfs_set_ref_root(path->nodes[0], ref, root_objectid); - btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); - btrfs_set_ref_objectid(path->nodes[0], ref, owner); - btrfs_set_ref_num_refs(path->nodes[0], ref, 1); - - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_set_extent_refs(leaf, extent_item, 1); + btrfs_set_extent_generation(leaf, extent_item, trans->transid); + btrfs_set_extent_flags(leaf, extent_item, + flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); + block_info = (struct btrfs_tree_block_info *)(extent_item + 1); + + btrfs_set_tree_block_key(leaf, block_info, key); + btrfs_set_tree_block_level(leaf, block_info, level); + + iref = (struct btrfs_extent_inline_ref *)(block_info + 1); + if (parent > 0) { + BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_SHARED_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_TREE_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } - trans->alloc_exclude_start = 0; - trans->alloc_exclude_nr = 0; + btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - finish_current_insert(trans, extent_root, 0); - pending_ret = del_pending_extents(trans, extent_root, 0); - - if (ret) - goto out; - if (pending_ret) { - ret = pending_ret; - goto out; - } -update_block: - ret = update_block_group(trans, root, ins->objectid, - ins->offset, 1, 0); + ret = update_block_group(trans, root, ins->objectid, ins->offset, + 1, 0); if (ret) { printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, (unsigned long long)ins->offset); BUG(); } -out: return ret; } -int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins) +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, + u64 offset, struct btrfs_key *ins) { int ret; - if (root_objectid == BTRFS_TREE_LOG_OBJECTID) - return 0; - ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, - ref_generation, owner, ins); - update_reserved_extents(root, ins->objectid, ins->offset, 0); + BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); + + ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, + 0, root_objectid, owner, offset, + BTRFS_ADD_DELAYED_EXTENT, NULL); return ret; } @@ -3603,10 +3997,10 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, * an extent has been allocated and makes sure to clear the free * space cache bits as well */ -int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins) +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins) { int ret; struct btrfs_block_group_cache *block_group; @@ -3619,9 +4013,9 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset); BUG_ON(ret); - put_block_group(block_group); - ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, - ref_generation, owner, ins); + btrfs_put_block_group(block_group); + ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, + 0, owner, offset, ins, 1); return ret; } @@ -3632,27 +4026,47 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, * * returns 0 if everything worked, non-zero otherwise. */ -int btrfs_alloc_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 parent, u64 min_alloc_size, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, u64 data) +static int alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 parent, u64 root_objectid, + struct btrfs_disk_key *key, int level, + u64 empty_size, u64 hint_byte, u64 search_end, + struct btrfs_key *ins) { int ret; + u64 flags = 0; - ret = __btrfs_reserve_extent(trans, root, num_bytes, - min_alloc_size, empty_size, hint_byte, - search_end, ins, data); + ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, + empty_size, hint_byte, search_end, + ins, 0); BUG_ON(ret); + + if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent == 0) + parent = ins->objectid; + flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else + BUG_ON(parent > 0); + + update_reserved_extents(root, ins->objectid, ins->offset, 1); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = __btrfs_alloc_reserved_extent(trans, root, parent, - root_objectid, ref_generation, - owner_objectid, ins); + struct btrfs_delayed_extent_op *extent_op; + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + if (key) + memcpy(&extent_op->key, key, sizeof(extent_op->key)); + else + memset(&extent_op->key, 0, sizeof(extent_op->key)); + extent_op->flags_to_set = flags; + extent_op->update_key = 1; + extent_op->update_flags = 1; + extent_op->is_data = 0; + + ret = btrfs_add_delayed_tree_ref(trans, ins->objectid, + ins->offset, parent, root_objectid, + level, BTRFS_ADD_DELAYED_EXTENT, + extent_op); BUG_ON(ret); - - } else { - update_reserved_extents(root, ins->objectid, ins->offset, 1); } return ret; } @@ -3692,21 +4106,17 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, * returns the tree buffer or NULL. */ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u32 blocksize, u64 parent, - u64 root_objectid, - u64 ref_generation, - int level, - u64 hint, - u64 empty_size) + struct btrfs_root *root, u32 blocksize, + u64 parent, u64 root_objectid, + struct btrfs_disk_key *key, int level, + u64 hint, u64 empty_size) { struct btrfs_key ins; int ret; struct extent_buffer *buf; - ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize, - root_objectid, ref_generation, level, - empty_size, hint, (u64)-1, &ins, 0); + ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, + key, level, empty_size, hint, (u64)-1, &ins); if (ret) { BUG_ON(ret > 0); return ERR_PTR(ret); @@ -3717,35 +4127,23 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return buf; } +#if 0 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf) { - u64 leaf_owner; - u64 leaf_generation; - struct refsort *sorted; + u64 disk_bytenr; + u64 num_bytes; struct btrfs_key key; struct btrfs_file_extent_item *fi; + u32 nritems; int i; - int nritems; int ret; - int refi = 0; - int slot; BUG_ON(!btrfs_is_leaf(leaf)); nritems = btrfs_header_nritems(leaf); - leaf_owner = btrfs_header_owner(leaf); - leaf_generation = btrfs_header_generation(leaf); - sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); - /* we do this loop twice. The first time we build a list - * of the extents we have a reference on, then we sort the list - * by bytenr. The second time around we actually do the - * extent freeing. - */ for (i = 0; i < nritems; i++) { - u64 disk_bytenr; cond_resched(); - btrfs_item_key_to_cpu(leaf, &key, i); /* only extents have references, skip everything else */ @@ -3765,42 +4163,11 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, if (disk_bytenr == 0) continue; - sorted[refi].bytenr = disk_bytenr; - sorted[refi].slot = i; - refi++; - } - - if (refi == 0) - goto out; - - sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); - - for (i = 0; i < refi; i++) { - u64 disk_bytenr; - - disk_bytenr = sorted[i].bytenr; - slot = sorted[i].slot; - - cond_resched(); - - btrfs_item_key_to_cpu(leaf, &key, slot); - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) - continue; - - fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - - ret = __btrfs_free_extent(trans, root, disk_bytenr, - btrfs_file_extent_disk_num_bytes(leaf, fi), - leaf->start, leaf_owner, leaf_generation, - key.objectid, 0); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, + leaf->start, 0, key.objectid, 0); BUG_ON(ret); - - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); - cond_resched(); } -out: - kfree(sorted); return 0; } @@ -3829,7 +4196,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, */ for (i = 0; i < ref->nritems; i++) { info = ref->extents + sorted[i].slot; - ret = __btrfs_free_extent(trans, root, info->bytenr, + ret = btrfs_free_extent(trans, root, info->bytenr, info->num_bytes, ref->bytenr, ref->owner, ref->generation, info->objectid, 0); @@ -3846,12 +4213,14 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, return 0; } -static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, + +static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 start, u64 len, u32 *refs) { int ret; - ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs); + ret = btrfs_lookup_extent_refs(trans, root, start, len, refs); BUG_ON(ret); #if 0 /* some debugging code in case we see problems here */ @@ -3886,6 +4255,7 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, return ret; } + /* * this is used while deleting old snapshots, and it drops the refs * on a whole subtree starting from a level 1 node. @@ -3959,7 +4329,8 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, * we just decrement it below and don't update any * of the refs the leaf points to. */ - ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); + ret = drop_snap_lookup_refcount(trans, root, bytenr, + blocksize, &refs); BUG_ON(ret); if (refs != 1) continue; @@ -4010,7 +4381,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, */ for (i = 0; i < refi; i++) { bytenr = sorted[i].bytenr; - ret = __btrfs_free_extent(trans, root, bytenr, + ret = btrfs_free_extent(trans, root, bytenr, blocksize, eb->start, root_owner, root_gen, 0, 1); BUG_ON(ret); @@ -4053,7 +4424,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, WARN_ON(*level < 0); WARN_ON(*level >= BTRFS_MAX_LEVEL); - ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start, + ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start, path->nodes[*level]->len, &refs); BUG_ON(ret); if (refs > 1) @@ -4104,7 +4475,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); blocksize = btrfs_level_size(root, *level - 1); - ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); + ret = drop_snap_lookup_refcount(trans, root, bytenr, + blocksize, &refs); BUG_ON(ret); /* @@ -4119,7 +4491,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, root_gen = btrfs_header_generation(parent); path->slots[*level]++; - ret = __btrfs_free_extent(trans, root, bytenr, + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start, root_owner, root_gen, *level - 1, 1); @@ -4165,7 +4537,7 @@ out: * cleanup and free the reference on the last node * we processed */ - ret = __btrfs_free_extent(trans, root, bytenr, blocksize, + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start, root_owner, root_gen, *level, 1); free_extent_buffer(path->nodes[*level]); @@ -4177,268 +4549,473 @@ out: cond_resched(); return 0; } +#endif + +struct walk_control { + u64 refs[BTRFS_MAX_LEVEL]; + u64 flags[BTRFS_MAX_LEVEL]; + struct btrfs_key update_progress; + int stage; + int level; + int shared_level; + int update_ref; + int keep_locks; +}; + +#define DROP_REFERENCE 1 +#define UPDATE_BACKREF 2 /* - * helper function for drop_subtree, this function is similar to - * walk_down_tree. The main difference is that it checks reference - * counts while tree blocks are locked. + * hepler to process tree block while walking down the tree. + * + * when wc->stage == DROP_REFERENCE, this function checks + * reference count of the block. if the block is shared and + * we need update back refs for the subtree rooted at the + * block, this function changes wc->stage to UPDATE_BACKREF + * + * when wc->stage == UPDATE_BACKREF, this function updates + * back refs for pointers in the block. + * + * NOTE: return value 1 means we should stop walking down. */ -static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int *level) +static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc) +{ + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; + struct btrfs_key key; + u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; + int ret; + + if (wc->stage == UPDATE_BACKREF && + btrfs_header_owner(eb) != root->root_key.objectid) + return 1; + + /* + * when reference count of tree block is 1, it won't increase + * again. once full backref flag is set, we never clear it. + */ + if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || + (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) { + BUG_ON(!path->locks[level]); + ret = btrfs_lookup_extent_info(trans, root, + eb->start, eb->len, + &wc->refs[level], + &wc->flags[level]); + BUG_ON(ret); + BUG_ON(wc->refs[level] == 0); + } + + if (wc->stage == DROP_REFERENCE && + wc->update_ref && wc->refs[level] > 1) { + BUG_ON(eb == root->node); + BUG_ON(path->slots[level] > 0); + if (level == 0) + btrfs_item_key_to_cpu(eb, &key, path->slots[level]); + else + btrfs_node_key_to_cpu(eb, &key, path->slots[level]); + if (btrfs_header_owner(eb) == root->root_key.objectid && + btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) { + wc->stage = UPDATE_BACKREF; + wc->shared_level = level; + } + } + + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level] > 1) + return 1; + + if (path->locks[level] && !wc->keep_locks) { + btrfs_tree_unlock(eb); + path->locks[level] = 0; + } + return 0; + } + + /* wc->stage == UPDATE_BACKREF */ + if (!(wc->flags[level] & flag)) { + BUG_ON(!path->locks[level]); + ret = btrfs_inc_ref(trans, root, eb, 1); + BUG_ON(ret); + ret = btrfs_dec_ref(trans, root, eb, 0); + BUG_ON(ret); + ret = btrfs_set_disk_extent_flags(trans, root, eb->start, + eb->len, flag, 0); + BUG_ON(ret); + wc->flags[level] |= flag; + } + + /* + * the block is shared by multiple trees, so it's not good to + * keep the tree lock + */ + if (path->locks[level] && level > 0) { + btrfs_tree_unlock(eb); + path->locks[level] = 0; + } + return 0; +} + +/* + * hepler to process tree block while walking up the tree. + * + * when wc->stage == DROP_REFERENCE, this function drops + * reference count on the block. + * + * when wc->stage == UPDATE_BACKREF, this function changes + * wc->stage back to DROP_REFERENCE if we changed wc->stage + * to UPDATE_BACKREF previously while processing the block. + * + * NOTE: return value 1 means we should stop walking up. + */ +static noinline int walk_up_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc) +{ + int ret = 0; + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; + u64 parent = 0; + + if (wc->stage == UPDATE_BACKREF) { + BUG_ON(wc->shared_level < level); + if (level < wc->shared_level) + goto out; + + BUG_ON(wc->refs[level] <= 1); + ret = find_next_key(path, level + 1, &wc->update_progress); + if (ret > 0) + wc->update_ref = 0; + + wc->stage = DROP_REFERENCE; + wc->shared_level = -1; + path->slots[level] = 0; + + /* + * check reference count again if the block isn't locked. + * we should start walking down the tree again if reference + * count is one. + */ + if (!path->locks[level]) { + BUG_ON(level == 0); + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + path->locks[level] = 1; + + ret = btrfs_lookup_extent_info(trans, root, + eb->start, eb->len, + &wc->refs[level], + &wc->flags[level]); + BUG_ON(ret); + BUG_ON(wc->refs[level] == 0); + if (wc->refs[level] == 1) { + btrfs_tree_unlock(eb); + path->locks[level] = 0; + return 1; + } + } else { + BUG_ON(level != 0); + } + } + + /* wc->stage == DROP_REFERENCE */ + BUG_ON(wc->refs[level] > 1 && !path->locks[level]); + + if (wc->refs[level] == 1) { + if (level == 0) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + ret = btrfs_dec_ref(trans, root, eb, 1); + else + ret = btrfs_dec_ref(trans, root, eb, 0); + BUG_ON(ret); + } + /* make block locked assertion in clean_tree_block happy */ + if (!path->locks[level] && + btrfs_header_generation(eb) == trans->transid) { + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + path->locks[level] = 1; + } + clean_tree_block(trans, root, eb); + } + + if (eb == root->node) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + parent = eb->start; + else + BUG_ON(root->root_key.objectid != + btrfs_header_owner(eb)); + } else { + if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + parent = path->nodes[level + 1]->start; + else + BUG_ON(root->root_key.objectid != + btrfs_header_owner(path->nodes[level + 1])); + } + + ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, + root->root_key.objectid, level, 0); + BUG_ON(ret); +out: + wc->refs[level] = 0; + wc->flags[level] = 0; + return ret; +} + +static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc) { struct extent_buffer *next; struct extent_buffer *cur; - struct extent_buffer *parent; u64 bytenr; u64 ptr_gen; u32 blocksize; - u32 refs; + int level = wc->level; int ret; - cur = path->nodes[*level]; - ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len, - &refs); - BUG_ON(ret); - if (refs > 1) - goto out; + while (level >= 0) { + cur = path->nodes[level]; + BUG_ON(path->slots[level] >= btrfs_header_nritems(cur)); - while (*level >= 0) { - cur = path->nodes[*level]; - if (*level == 0) { - ret = btrfs_drop_leaf_ref(trans, root, cur); - BUG_ON(ret); - clean_tree_block(trans, root, cur); + ret = walk_down_proc(trans, root, path, wc); + if (ret > 0) break; - } - if (path->slots[*level] >= btrfs_header_nritems(cur)) { - clean_tree_block(trans, root, cur); + + if (level == 0) break; - } - bytenr = btrfs_node_blockptr(cur, path->slots[*level]); - blocksize = btrfs_level_size(root, *level - 1); - ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + bytenr = btrfs_node_blockptr(cur, path->slots[level]); + blocksize = btrfs_level_size(root, level - 1); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]); next = read_tree_block(root, bytenr, blocksize, ptr_gen); btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, - &refs); - BUG_ON(ret); - if (refs > 1) { - parent = path->nodes[*level]; - ret = btrfs_free_extent(trans, root, bytenr, - blocksize, parent->start, - btrfs_header_owner(parent), - btrfs_header_generation(parent), - *level - 1, 1); - BUG_ON(ret); - path->slots[*level]++; - btrfs_tree_unlock(next); - free_extent_buffer(next); - continue; - } - - *level = btrfs_header_level(next); - path->nodes[*level] = next; - path->slots[*level] = 0; - path->locks[*level] = 1; - cond_resched(); - } -out: - parent = path->nodes[*level + 1]; - bytenr = path->nodes[*level]->start; - blocksize = path->nodes[*level]->len; - - ret = btrfs_free_extent(trans, root, bytenr, blocksize, - parent->start, btrfs_header_owner(parent), - btrfs_header_generation(parent), *level, 1); - BUG_ON(ret); - - if (path->locks[*level]) { - btrfs_tree_unlock(path->nodes[*level]); - path->locks[*level] = 0; + level--; + BUG_ON(level != btrfs_header_level(next)); + path->nodes[level] = next; + path->slots[level] = 0; + path->locks[level] = 1; + wc->level = level; } - free_extent_buffer(path->nodes[*level]); - path->nodes[*level] = NULL; - *level += 1; - cond_resched(); return 0; } -/* - * helper for dropping snapshots. This walks back up the tree in the path - * to find the first node higher up where we haven't yet gone through - * all the slots - */ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - int *level, int max_level) + struct walk_control *wc, int max_level) { - u64 root_owner; - u64 root_gen; - struct btrfs_root_item *root_item = &root->root_item; - int i; - int slot; + int level = wc->level; int ret; - for (i = *level; i < max_level && path->nodes[i]; i++) { - slot = path->slots[i]; - if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { - struct extent_buffer *node; - struct btrfs_disk_key disk_key; - - /* - * there is more work to do in this level. - * Update the drop_progress marker to reflect - * the work we've done so far, and then bump - * the slot number - */ - node = path->nodes[i]; - path->slots[i]++; - *level = i; - WARN_ON(*level == 0); - btrfs_node_key(node, &disk_key, path->slots[i]); - memcpy(&root_item->drop_progress, - &disk_key, sizeof(disk_key)); - root_item->drop_level = i; + path->slots[level] = btrfs_header_nritems(path->nodes[level]); + while (level < max_level && path->nodes[level]) { + wc->level = level; + if (path->slots[level] + 1 < + btrfs_header_nritems(path->nodes[level])) { + path->slots[level]++; return 0; } else { - struct extent_buffer *parent; - - /* - * this whole node is done, free our reference - * on it and go up one level - */ - if (path->nodes[*level] == root->node) - parent = path->nodes[*level]; - else - parent = path->nodes[*level + 1]; - - root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); + ret = walk_up_proc(trans, root, path, wc); + if (ret > 0) + return 0; - clean_tree_block(trans, root, path->nodes[*level]); - ret = btrfs_free_extent(trans, root, - path->nodes[*level]->start, - path->nodes[*level]->len, - parent->start, root_owner, - root_gen, *level, 1); - BUG_ON(ret); - if (path->locks[*level]) { - btrfs_tree_unlock(path->nodes[*level]); - path->locks[*level] = 0; + if (path->locks[level]) { + btrfs_tree_unlock(path->nodes[level]); + path->locks[level] = 0; } - free_extent_buffer(path->nodes[*level]); - path->nodes[*level] = NULL; - *level = i + 1; + free_extent_buffer(path->nodes[level]); + path->nodes[level] = NULL; + level++; } } return 1; } /* - * drop the reference count on the tree rooted at 'snap'. This traverses - * the tree freeing any blocks that have a ref count of zero after being - * decremented. + * drop a subvolume tree. + * + * this function traverses the tree freeing any blocks that only + * referenced by the tree. + * + * when a shared tree block is found. this function decreases its + * reference count by one. if update_ref is true, this function + * also make sure backrefs for the shared block and all lower level + * blocks are properly updated. */ -int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root - *root) +int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) { - int ret = 0; - int wret; - int level; struct btrfs_path *path; - int i; - int orig_level; + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = root->fs_info->tree_root; struct btrfs_root_item *root_item = &root->root_item; + struct walk_control *wc; + struct btrfs_key key; + int err = 0; + int ret; + int level; - WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); path = btrfs_alloc_path(); BUG_ON(!path); - level = btrfs_header_level(root->node); - orig_level = level; + wc = kzalloc(sizeof(*wc), GFP_NOFS); + BUG_ON(!wc); + + trans = btrfs_start_transaction(tree_root, 1); + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { - path->nodes[level] = root->node; - extent_buffer_get(root->node); + level = btrfs_header_level(root->node); + path->nodes[level] = btrfs_lock_root_node(root); + btrfs_set_lock_blocking(path->nodes[level]); path->slots[level] = 0; + path->locks[level] = 1; + memset(&wc->update_progress, 0, + sizeof(wc->update_progress)); } else { - struct btrfs_key key; - struct btrfs_disk_key found_key; - struct extent_buffer *node; - btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + memcpy(&wc->update_progress, &key, + sizeof(wc->update_progress)); + level = root_item->drop_level; + BUG_ON(level == 0); path->lowest_level = level; - wret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (wret < 0) { - ret = wret; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + path->lowest_level = 0; + if (ret < 0) { + err = ret; goto out; } - node = path->nodes[level]; - btrfs_node_key(node, &found_key, path->slots[level]); - WARN_ON(memcmp(&found_key, &root_item->drop_progress, - sizeof(found_key))); + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key))); + /* * unlock our path, this is safe because only this * function is allowed to delete this snapshot */ - for (i = 0; i < BTRFS_MAX_LEVEL; i++) { - if (path->nodes[i] && path->locks[i]) { - path->locks[i] = 0; - btrfs_tree_unlock(path->nodes[i]); - } + btrfs_unlock_up_safe(path, 0); + + level = btrfs_header_level(root->node); + while (1) { + btrfs_tree_lock(path->nodes[level]); + btrfs_set_lock_blocking(path->nodes[level]); + + ret = btrfs_lookup_extent_info(trans, root, + path->nodes[level]->start, + path->nodes[level]->len, + &wc->refs[level], + &wc->flags[level]); + BUG_ON(ret); + BUG_ON(wc->refs[level] == 0); + + if (level == root_item->drop_level) + break; + + btrfs_tree_unlock(path->nodes[level]); + WARN_ON(wc->refs[level] != 1); + level--; } } + + wc->level = level; + wc->shared_level = -1; + wc->stage = DROP_REFERENCE; + wc->update_ref = update_ref; + wc->keep_locks = 0; + while (1) { - wret = walk_down_tree(trans, root, path, &level); - if (wret > 0) + ret = walk_down_tree(trans, root, path, wc); + if (ret < 0) { + err = ret; break; - if (wret < 0) - ret = wret; + } - wret = walk_up_tree(trans, root, path, &level, - BTRFS_MAX_LEVEL); - if (wret > 0) + ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); + if (ret < 0) { + err = ret; break; - if (wret < 0) - ret = wret; - if (trans->transaction->in_commit) { - ret = -EAGAIN; + } + + if (ret > 0) { + BUG_ON(wc->stage != DROP_REFERENCE); break; } - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); - } - for (i = 0; i <= orig_level; i++) { - if (path->nodes[i]) { - free_extent_buffer(path->nodes[i]); - path->nodes[i] = NULL; + + if (wc->stage == DROP_REFERENCE) { + level = wc->level; + btrfs_node_key(path->nodes[level], + &root_item->drop_progress, + path->slots[level]); + root_item->drop_level = level; + } + + BUG_ON(wc->level == 0); + if (trans->transaction->in_commit || + trans->transaction->delayed_refs.flushing) { + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + root_item); + BUG_ON(ret); + + btrfs_end_transaction(trans, tree_root); + trans = btrfs_start_transaction(tree_root, 1); + } else { + unsigned long update; + update = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (update) + btrfs_run_delayed_refs(trans, tree_root, + update); } } + btrfs_release_path(root, path); + BUG_ON(err); + + ret = btrfs_del_root(trans, tree_root, &root->root_key); + BUG_ON(ret); + + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root); out: + btrfs_end_transaction(trans, tree_root); + kfree(wc); btrfs_free_path(path); - return ret; + return err; } +/* + * drop subtree rooted at tree block 'node'. + * + * NOTE: this function will unlock and release tree block 'node' + */ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, struct extent_buffer *parent) { struct btrfs_path *path; + struct walk_control *wc; int level; int parent_level; int ret = 0; int wret; + BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + path = btrfs_alloc_path(); BUG_ON(!path); + wc = kzalloc(sizeof(*wc), GFP_NOFS); + BUG_ON(!wc); + btrfs_assert_tree_locked(parent); parent_level = btrfs_header_level(parent); extent_buffer_get(parent); @@ -4447,28 +5024,38 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, btrfs_assert_tree_locked(node); level = btrfs_header_level(node); - extent_buffer_get(node); path->nodes[level] = node; path->slots[level] = 0; + path->locks[level] = 1; + + wc->refs[parent_level] = 1; + wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; + wc->level = level; + wc->shared_level = -1; + wc->stage = DROP_REFERENCE; + wc->update_ref = 0; + wc->keep_locks = 1; while (1) { - wret = walk_down_subtree(trans, root, path, &level); - if (wret < 0) + wret = walk_down_tree(trans, root, path, wc); + if (wret < 0) { ret = wret; - if (wret != 0) break; + } - wret = walk_up_tree(trans, root, path, &level, parent_level); + wret = walk_up_tree(trans, root, path, wc, parent_level); if (wret < 0) ret = wret; if (wret != 0) break; } + kfree(wc); btrfs_free_path(path); return ret; } +#if 0 static unsigned long calc_ra(unsigned long start, unsigned long last, unsigned long nr) { @@ -5457,6 +6044,7 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans, root->root_key.objectid, trans->transid, key.objectid); BUG_ON(ret); + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, leaf->start, btrfs_header_owner(leaf), @@ -5768,9 +6356,6 @@ static noinline int relocate_tree_block(struct btrfs_trans_handle *trans, ref_path, NULL, NULL); BUG_ON(ret); - if (root == root->fs_info->extent_root) - btrfs_extent_post_op(trans, root); - return 0; } @@ -5952,6 +6537,7 @@ out: kfree(ref_path); return ret; } +#endif static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) { @@ -6000,7 +6586,8 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root, u64 calc; spin_lock(&shrink_block_group->lock); - if (btrfs_block_group_used(&shrink_block_group->item) > 0) { + if (btrfs_block_group_used(&shrink_block_group->item) + + shrink_block_group->reserved > 0) { spin_unlock(&shrink_block_group->lock); trans = btrfs_start_transaction(root, 1); @@ -6025,6 +6612,17 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root, return 0; } + +int btrfs_prepare_block_group_relocation(struct btrfs_root *root, + struct btrfs_block_group_cache *group) + +{ + __alloc_chunk_for_shrink(root, group, 1); + set_block_group_readonly(group); + return 0; +} + +#if 0 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 size) @@ -6038,6 +6636,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; + path->leave_spinning = 1; ret = btrfs_insert_empty_inode(trans, root, path, objectid); if (ret) goto out; @@ -6208,6 +6807,9 @@ again: btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); mutex_unlock(&root->fs_info->cleaner_mutex); + trans = btrfs_start_transaction(info->tree_root, 1); + btrfs_commit_transaction(trans, info->tree_root); + while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -6294,12 +6896,13 @@ next: WARN_ON(block_group->reserved > 0); WARN_ON(btrfs_block_group_used(&block_group->item) > 0); spin_unlock(&block_group->lock); - put_block_group(block_group); + btrfs_put_block_group(block_group); ret = 0; out: btrfs_free_path(path); return ret; } +#endif static int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key) @@ -6421,9 +7024,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); - mutex_init(&cache->alloc_mutex); + spin_lock_init(&cache->tree_lock); mutex_init(&cache->cache_mutex); INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(cache->item)); @@ -6466,7 +7070,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, extent_root = root->fs_info->extent_root; - root->fs_info->last_trans_new_blockgroup = trans->transid; + root->fs_info->last_trans_log_full_commit = trans->transid; cache = kzalloc(sizeof(*cache), GFP_NOFS); if (!cache) @@ -6477,9 +7081,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); - mutex_init(&cache->alloc_mutex); + spin_lock_init(&cache->tree_lock); mutex_init(&cache->cache_mutex); INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); btrfs_set_block_group_used(&cache->item, bytes_used); btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); @@ -6500,9 +7105,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, sizeof(cache->item)); BUG_ON(ret); - finish_current_insert(trans, extent_root, 0); - ret = del_pending_extents(trans, extent_root, 0); - BUG_ON(ret); set_avail_alloc_bits(extent_root->fs_info, type); return 0; @@ -6513,6 +7115,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, { struct btrfs_path *path; struct btrfs_block_group_cache *block_group; + struct btrfs_free_cluster *cluster; struct btrfs_key key; int ret; @@ -6524,6 +7127,21 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, memcpy(&key, &block_group->key, sizeof(key)); + /* make sure this block group isn't part of an allocation cluster */ + cluster = &root->fs_info->data_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + /* + * make sure this block group isn't part of a metadata + * allocation cluster + */ + cluster = &root->fs_info->meta_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + path = btrfs_alloc_path(); BUG_ON(!path); @@ -6533,7 +7151,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&root->fs_info->block_group_cache_lock); btrfs_remove_free_space_cache(block_group); down_write(&block_group->space_info->groups_sem); - list_del(&block_group->list); + /* + * we must use list_del_init so people can check to see if they + * are still on the list after taking the semaphore + */ + list_del_init(&block_group->list); up_write(&block_group->space_info->groups_sem); spin_lock(&block_group->space_info->lock); @@ -6542,8 +7164,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&block_group->space_info->lock); block_group->space_info->full = 0; - put_block_group(block_group); - put_block_group(block_group); + btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ebe6b29e6069..68260180f587 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -17,12 +17,6 @@ #include "ctree.h" #include "btrfs_inode.h" -/* temporary define until extent_map moves out of btrfs */ -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *, struct kmem_cache *, - unsigned long)); - static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -50,20 +44,23 @@ struct extent_page_data { /* tells writepage not to lock the state bits for this range * it still does the unlocking */ - int extent_locked; + unsigned int extent_locked:1; + + /* tells the submit_bio code to use a WRITE_SYNC */ + unsigned int sync_io:1; }; int __init extent_io_init(void) { - extent_state_cache = btrfs_cache_create("extent_state", - sizeof(struct extent_state), 0, - NULL); + extent_state_cache = kmem_cache_create("extent_state", + sizeof(struct extent_state), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; - extent_buffer_cache = btrfs_cache_create("extent_buffers", - sizeof(struct extent_buffer), 0, - NULL); + extent_buffer_cache = kmem_cache_create("extent_buffers", + sizeof(struct extent_buffer), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) goto free_state_cache; return 0; @@ -479,6 +476,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; + u64 last_end; int err; int set = 0; @@ -501,6 +499,7 @@ again: if (state->start > end) goto out; WARN_ON(state->end < start); + last_end = state->end; /* * | ---- desired range ---- | @@ -527,9 +526,11 @@ again: if (err) goto out; if (state->end <= end) { - start = state->end + 1; set |= clear_state_bit(tree, state, bits, wake, delete); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; } else { start = state->start; } @@ -555,8 +556,10 @@ again: goto out; } - start = state->end + 1; set |= clear_state_bit(tree, state, bits, wake, delete); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; goto search_again; out: @@ -710,8 +713,10 @@ again: goto out; } set_state_bits(tree, state, bits); - start = state->end + 1; merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; goto search_again; } @@ -745,8 +750,10 @@ again: goto out; if (state->end <= end) { set_state_bits(tree, state, bits); - start = state->end + 1; merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; } else { start = state->start; } @@ -1404,69 +1411,6 @@ out: return total_bytes; } -#if 0 -/* - * helper function to lock both pages and extents in the tree. - * pages must be locked first. - */ -static int lock_range(struct extent_io_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - int err; - - while (index <= end_index) { - page = grab_cache_page(tree->mapping, index); - if (!page) { - err = -ENOMEM; - goto failed; - } - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto failed; - } - index++; - } - lock_extent(tree, start, end, GFP_NOFS); - return 0; - -failed: - /* - * we failed above in getting the page at 'index', so we undo here - * up to but not including the page at 'index' - */ - end_index = index; - index = start >> PAGE_CACHE_SHIFT; - while (index < end_index) { - page = find_get_page(tree->mapping, index); - unlock_page(page); - page_cache_release(page); - index++; - } - return err; -} - -/* - * helper function to unlock both pages and extents in the tree. - */ -static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(tree->mapping, index); - unlock_page(page); - page_cache_release(page); - index++; - } - unlock_extent(tree, start, end, GFP_NOFS); - return 0; -} -#endif - /* * set the private field for a given byte offset in the tree. If there isn't * an extent_state there already, this does nothing. @@ -2101,6 +2045,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, return ret; } +static noinline void update_nr_written(struct page *page, + struct writeback_control *wbc, + unsigned long nr_written) +{ + wbc->nr_to_write -= nr_written; + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && + wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) + page->mapping->writeback_index = page->index + nr_written; +} + /* * the writepage semantics are similar to regular writepage. extent * records are inserted to lock ranges in the tree, and as dirty areas @@ -2136,8 +2090,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, u64 delalloc_end; int page_started; int compressed; + int write_flags; unsigned long nr_written = 0; + if (wbc->sync_mode == WB_SYNC_ALL) + write_flags = WRITE_SYNC_PLUG; + else + write_flags = WRITE; + WARN_ON(!PageLocked(page)); pg_offset = i_size & (PAGE_CACHE_SIZE - 1); if (page->index > end_index || @@ -2164,6 +2124,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, delalloc_end = 0; page_started = 0; if (!epd->extent_locked) { + /* + * make sure the wbc mapping index is at least updated + * to this page. + */ + update_nr_written(page, wbc, 0); + while (delalloc_end < page_end) { nr_delalloc = find_lock_delalloc_range(inode, tree, page, @@ -2185,7 +2151,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, */ if (page_started) { ret = 0; - goto update_nr_written; + /* + * we've unlocked the page, so we can't update + * the mapping's writeback index, just update + * nr_to_write. + */ + wbc->nr_to_write -= nr_written; + goto done_unlocked; } } lock_extent(tree, start, page_end, GFP_NOFS); @@ -2198,13 +2170,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (ret == -EAGAIN) { unlock_extent(tree, start, page_end, GFP_NOFS); redirty_page_for_writepage(wbc, page); + update_nr_written(page, wbc, nr_written); unlock_page(page); ret = 0; - goto update_nr_written; + goto done_unlocked; } } - nr_written++; + /* + * we don't want to touch the inode after unlocking the page, + * so we update the mapping writeback index now + */ + update_nr_written(page, wbc, nr_written + 1); end = page_end; if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) @@ -2314,9 +2291,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, (unsigned long long)end); } - ret = submit_extent_page(WRITE, tree, page, sector, - iosize, pg_offset, bdev, - &epd->bio, max_nr, + ret = submit_extent_page(write_flags, tree, page, + sector, iosize, pg_offset, + bdev, &epd->bio, max_nr, end_bio_extent_writepage, 0, 0, 0); if (ret) @@ -2336,11 +2313,8 @@ done: unlock_extent(tree, unlock_start, page_end, GFP_NOFS); unlock_page(page); -update_nr_written: - wbc->nr_to_write -= nr_written; - if (wbc->range_cyclic || (wbc->nr_to_write > 0 && - wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) - page->mapping->writeback_index = page->index + nr_written; +done_unlocked: + return 0; } @@ -2460,15 +2434,23 @@ retry: return ret; } -static noinline void flush_write_bio(void *data) +static void flush_epd_write_bio(struct extent_page_data *epd) { - struct extent_page_data *epd = data; if (epd->bio) { - submit_one_bio(WRITE, epd->bio, 0, 0); + if (epd->sync_io) + submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); + else + submit_one_bio(WRITE, epd->bio, 0, 0); epd->bio = NULL; } } +static noinline void flush_write_bio(void *data) +{ + struct extent_page_data *epd = data; + flush_epd_write_bio(epd); +} + int extent_write_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, struct writeback_control *wbc) @@ -2480,23 +2462,22 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, .tree = tree, .get_extent = get_extent, .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; struct writeback_control wbc_writepages = { .bdi = wbc->bdi, - .sync_mode = WB_SYNC_NONE, + .sync_mode = wbc->sync_mode, .older_than_this = NULL, .nr_to_write = 64, .range_start = page_offset(page) + PAGE_CACHE_SIZE, .range_end = (loff_t)-1, }; - ret = __extent_writepage(page, wbc, &epd); extent_write_cache_pages(tree, mapping, &wbc_writepages, __extent_writepage, &epd, flush_write_bio); - if (epd.bio) - submit_one_bio(WRITE, epd.bio, 0, 0); + flush_epd_write_bio(&epd); return ret; } @@ -2515,6 +2496,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, .tree = tree, .get_extent = get_extent, .extent_locked = 1, + .sync_io = mode == WB_SYNC_ALL, }; struct writeback_control wbc_writepages = { .bdi = inode->i_mapping->backing_dev_info, @@ -2540,8 +2522,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, start += PAGE_CACHE_SIZE; } - if (epd.bio) - submit_one_bio(WRITE, epd.bio, 0, 0); + flush_epd_write_bio(&epd); return ret; } @@ -2556,13 +2537,13 @@ int extent_writepages(struct extent_io_tree *tree, .tree = tree, .get_extent = get_extent, .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; ret = extent_write_cache_pages(tree, mapping, wbc, __extent_writepage, &epd, flush_write_bio); - if (epd.bio) - submit_one_bio(WRITE, epd.bio, 0, 0); + flush_epd_write_bio(&epd); return ret; } @@ -2884,25 +2865,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, disko = 0; flags = 0; - switch (em->block_start) { - case EXTENT_MAP_LAST_BYTE: + if (em->block_start == EXTENT_MAP_LAST_BYTE) { end = 1; flags |= FIEMAP_EXTENT_LAST; - break; - case EXTENT_MAP_HOLE: + } else if (em->block_start == EXTENT_MAP_HOLE) { flags |= FIEMAP_EXTENT_UNWRITTEN; - break; - case EXTENT_MAP_INLINE: + } else if (em->block_start == EXTENT_MAP_INLINE) { flags |= (FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED); - break; - case EXTENT_MAP_DELALLOC: + } else if (em->block_start == EXTENT_MAP_DELALLOC) { flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); - break; - default: + } else { disko = em->block_start; - break; } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; @@ -3124,20 +3099,15 @@ void free_extent_buffer(struct extent_buffer *eb) int clear_extent_buffer_dirty(struct extent_io_tree *tree, struct extent_buffer *eb) { - int set; unsigned long i; unsigned long num_pages; struct page *page; - u64 start = eb->start; - u64 end = start + eb->len - 1; - - set = clear_extent_dirty(tree, start, end, GFP_NOFS); num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if (!set && !PageDirty(page)) + if (!PageDirty(page)) continue; lock_page(page); @@ -3146,22 +3116,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, else set_page_private(page, EXTENT_PAGE_PRIVATE); - /* - * if we're on the last page or the first page and the - * block isn't aligned on a page boundary, do extra checks - * to make sure we don't clean page that is partially dirty - */ - if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || - ((i == num_pages - 1) && - ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { - start = (u64)page->index << PAGE_CACHE_SHIFT; - end = start + PAGE_CACHE_SIZE - 1; - if (test_range_bit(tree, start, end, - EXTENT_DIRTY, 0)) { - unlock_page(page); - continue; - } - } clear_page_dirty_for_io(page); spin_lock_irq(&page->mapping->tree_lock); if (!PageDirty(page)) { @@ -3187,29 +3141,13 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree, { unsigned long i; unsigned long num_pages; + int was_dirty = 0; + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { - struct page *page = extent_buffer_page(eb, i); - /* writepage may need to do something special for the - * first page, we have to make sure page->private is - * properly set. releasepage may drop page->private - * on us if the page isn't already dirty. - */ - lock_page(page); - if (i == 0) { - set_page_extent_head(page, eb->len); - } else if (PagePrivate(page) && - page->private != EXTENT_PAGE_PRIVATE) { - set_page_extent_mapped(page); - } + for (i = 0; i < num_pages; i++) __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); - set_extent_dirty(tree, page_offset(page), - page_offset(page) + PAGE_CACHE_SIZE - 1, - GFP_NOFS); - unlock_page(page); - } - return 0; + return was_dirty; } int clear_extent_buffer_uptodate(struct extent_io_tree *tree, @@ -3789,6 +3727,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) ret = 0; goto out; } + if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + ret = 0; + goto out; + } /* at this point we can safely release the extent buffer */ num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 1f9df88afbf6..5bc20abf3f3d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -25,6 +25,7 @@ /* these are bit numbers for test/set bit */ #define EXTENT_BUFFER_UPTODATE 0 #define EXTENT_BUFFER_BLOCKING 1 +#define EXTENT_BUFFER_DIRTY 2 /* * page->private values. Every page that is controlled by the extent @@ -254,6 +255,8 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, struct extent_buffer *eb); int set_extent_buffer_dirty(struct extent_io_tree *tree, struct extent_buffer *eb); +int test_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); int set_extent_buffer_uptodate(struct extent_io_tree *tree, struct extent_buffer *eb); int clear_extent_buffer_uptodate(struct extent_io_tree *tree, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 50da69da20ce..30c9365861e6 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -6,19 +6,14 @@ #include <linux/hardirq.h> #include "extent_map.h" -/* temporary define until extent_map moves out of btrfs */ -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *, struct kmem_cache *, - unsigned long)); static struct kmem_cache *extent_map_cache; int __init extent_map_init(void) { - extent_map_cache = btrfs_cache_create("extent_map", - sizeof(struct extent_map), 0, - NULL); + extent_map_cache = kmem_cache_create("extent_map", + sizeof(struct extent_map), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_map_cache) return -ENOMEM; return 0; @@ -43,7 +38,6 @@ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) tree->map.rb_node = NULL; spin_lock_init(&tree->lock); } -EXPORT_SYMBOL(extent_map_tree_init); /** * alloc_extent_map - allocate new extent map structure @@ -64,7 +58,6 @@ struct extent_map *alloc_extent_map(gfp_t mask) atomic_set(&em->refs, 1); return em; } -EXPORT_SYMBOL(alloc_extent_map); /** * free_extent_map - drop reference count of an extent_map @@ -83,7 +76,6 @@ void free_extent_map(struct extent_map *em) kmem_cache_free(extent_map_cache, em); } } -EXPORT_SYMBOL(free_extent_map); static struct rb_node *tree_insert(struct rb_root *root, u64 offset, struct rb_node *node) @@ -234,7 +226,6 @@ int add_extent_mapping(struct extent_map_tree *tree, rb = tree_insert(&tree->map, em->start, &em->rb_node); if (rb) { ret = -EEXIST; - free_extent_map(merge); goto out; } atomic_inc(&em->refs); @@ -265,7 +256,6 @@ int add_extent_mapping(struct extent_map_tree *tree, out: return ret; } -EXPORT_SYMBOL(add_extent_mapping); /* simple helper to do math around the end of an extent, handling wrap */ static u64 range_end(u64 start, u64 len) @@ -327,7 +317,6 @@ found: out: return em; } -EXPORT_SYMBOL(lookup_extent_mapping); /** * remove_extent_mapping - removes an extent_map from the extent tree @@ -347,4 +336,3 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) em->in_tree = 0; return ret; } -EXPORT_SYMBOL(remove_extent_mapping); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 964652435fd1..9b99886562d0 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -52,6 +52,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, file_key.offset = pos; btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &file_key, sizeof(*item)); if (ret < 0) @@ -523,6 +524,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, key.offset = end_byte - 1; key.type = BTRFS_EXTENT_CSUM_KEY; + path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { if (path->slots[0] == 0) @@ -757,8 +759,10 @@ insert: } else { ins_size = csum_size; } + path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &file_key, ins_size); + path->leave_spinning = 0; if (ret < 0) goto fail_unlock; if (ret != 0) { @@ -776,7 +780,6 @@ found: item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + btrfs_item_size_nr(leaf, path->slots[0])); eb_token = NULL; - cond_resched(); next_sector: if (!eb_token || @@ -817,9 +820,9 @@ next_sector: eb_token = NULL; } btrfs_mark_buffer_dirty(path->nodes[0]); - cond_resched(); if (total_bytes < sums->len) { btrfs_release_path(root, path); + cond_resched(); goto again; } out: diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index dc78954861b3..7c3cd248d8d6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -151,7 +151,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, } if (end_pos > isize) { i_size_write(inode, end_pos); - btrfs_update_inode(trans, root, inode); + /* we've only changed i_size in ram, and we haven't updated + * the disk i_size. There is no need to log the inode + * at this time. + */ } err = btrfs_end_transaction(trans, root); out_unlock: @@ -272,83 +275,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, return 0; } -int btrfs_check_file(struct btrfs_root *root, struct inode *inode) -{ - return 0; -#if 0 - struct btrfs_path *path; - struct btrfs_key found_key; - struct extent_buffer *leaf; - struct btrfs_file_extent_item *extent; - u64 last_offset = 0; - int nritems; - int slot; - int found_type; - int ret; - int err = 0; - u64 extent_end = 0; - - path = btrfs_alloc_path(); - ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino, - last_offset, 0); - while (1) { - nritems = btrfs_header_nritems(path->nodes[0]); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret) - goto out; - nritems = btrfs_header_nritems(path->nodes[0]); - } - slot = path->slots[0]; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, slot); - if (found_key.objectid != inode->i_ino) - break; - if (found_key.type != BTRFS_EXTENT_DATA_KEY) - goto out; - - if (found_key.offset < last_offset) { - WARN_ON(1); - btrfs_print_leaf(root, leaf); - printk(KERN_ERR "inode %lu found offset %llu " - "expected %llu\n", inode->i_ino, - (unsigned long long)found_key.offset, - (unsigned long long)last_offset); - err = 1; - goto out; - } - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(leaf, extent); - if (found_type == BTRFS_FILE_EXTENT_REG) { - extent_end = found_key.offset + - btrfs_file_extent_num_bytes(leaf, extent); - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - struct btrfs_item *item; - item = btrfs_item_nr(leaf, slot); - extent_end = found_key.offset + - btrfs_file_extent_inline_len(leaf, extent); - extent_end = (extent_end + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); - } - last_offset = extent_end; - path->slots[0]++; - } - if (0 && last_offset < inode->i_size) { - WARN_ON(1); - btrfs_print_leaf(root, leaf); - printk(KERN_ERR "inode %lu found offset %llu size %llu\n", - inode->i_ino, (unsigned long long)last_offset, - (unsigned long long)inode->i_size); - err = 1; - - } -out: - btrfs_free_path(path); - return err; -#endif -} - /* * this is very complex, but the basic idea is to drop all extents * in the range start - end. hint_block is filled in with a block number @@ -363,20 +289,17 @@ out: */ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 inline_limit, u64 *hint_byte) + u64 start, u64 end, u64 locked_end, + u64 inline_limit, u64 *hint_byte) { u64 extent_end = 0; - u64 locked_end = end; u64 search_start = start; - u64 leaf_start; u64 ram_bytes = 0; - u64 orig_parent = 0; u64 disk_bytenr = 0; + u64 orig_locked_end = locked_end; u8 compression; u8 encryption; u16 other_encoding = 0; - u64 root_gen; - u64 root_owner; struct extent_buffer *leaf; struct btrfs_file_extent_item *extent; struct btrfs_path *path; @@ -416,9 +339,6 @@ next_slot: bookend = 0; found_extent = 0; found_inline = 0; - leaf_start = 0; - root_gen = 0; - root_owner = 0; compression = 0; encryption = 0; extent = NULL; @@ -493,9 +413,6 @@ next_slot: if (found_extent) { read_extent_buffer(leaf, &old, (unsigned long)extent, sizeof(old)); - root_gen = btrfs_header_generation(leaf); - root_owner = btrfs_header_owner(leaf); - leaf_start = leaf->start; } if (end < extent_end && end >= key.offset) { @@ -519,14 +436,14 @@ next_slot: } locked_end = extent_end; } - orig_parent = path->nodes[0]->start; disk_bytenr = le64_to_cpu(old.disk_bytenr); if (disk_bytenr != 0) { ret = btrfs_inc_extent_ref(trans, root, disk_bytenr, - le64_to_cpu(old.disk_num_bytes), - orig_parent, root->root_key.objectid, - trans->transid, inode->i_ino); + le64_to_cpu(old.disk_num_bytes), 0, + root->root_key.objectid, + key.objectid, key.offset - + le64_to_cpu(old.offset)); BUG_ON(ret); } } @@ -606,6 +523,7 @@ next_slot: btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); btrfs_release_path(root, path); + path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*extent)); BUG_ON(ret); @@ -639,17 +557,11 @@ next_slot: ram_bytes); btrfs_set_file_extent_type(leaf, extent, found_type); + btrfs_unlock_up_safe(path, 1); btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_set_lock_blocking(path->nodes[0]); - if (disk_bytenr != 0) { - ret = btrfs_update_extent_ref(trans, root, - disk_bytenr, orig_parent, - leaf->start, - root->root_key.objectid, - trans->transid, ins.objectid); - - BUG_ON(ret); - } + path->leave_spinning = 0; btrfs_release_path(root, path); if (disk_bytenr != 0) inode_add_bytes(inode, extent_end - end); @@ -664,8 +576,9 @@ next_slot: ret = btrfs_free_extent(trans, root, old_disk_bytenr, le64_to_cpu(old.disk_num_bytes), - leaf_start, root_owner, - root_gen, key.objectid, 0); + 0, root->root_key.objectid, + key.objectid, key.offset - + le64_to_cpu(old.offset)); BUG_ON(ret); *hint_byte = old_disk_bytenr; } @@ -678,11 +591,10 @@ next_slot: } out: btrfs_free_path(path); - if (locked_end > end) { - unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, - GFP_NOFS); + if (locked_end > orig_locked_end) { + unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end, + locked_end - 1, GFP_NOFS); } - btrfs_check_file(root, inode); return ret; } @@ -735,12 +647,11 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, u64 bytenr; u64 num_bytes; u64 extent_end; - u64 extent_offset; + u64 orig_offset; u64 other_start; u64 other_end; u64 split = start; u64 locked_end = end; - u64 orig_parent; int extent_type; int split_end = 1; int ret; @@ -774,7 +685,7 @@ again: bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - extent_offset = btrfs_file_extent_offset(leaf, fi); + orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); if (key.offset == start) split = end; @@ -782,8 +693,6 @@ again: if (key.offset == start && extent_end == end) { int del_nr = 0; int del_slot = 0; - u64 leaf_owner = btrfs_header_owner(leaf); - u64 leaf_gen = btrfs_header_generation(leaf); other_start = end; other_end = 0; if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, @@ -792,8 +701,8 @@ again: del_slot = path->slots[0] + 1; del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - leaf->start, leaf_owner, - leaf_gen, inode->i_ino, 0); + 0, root->root_key.objectid, + inode->i_ino, orig_offset); BUG_ON(ret); } other_start = 0; @@ -804,8 +713,8 @@ again: del_slot = path->slots[0]; del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - leaf->start, leaf_owner, - leaf_gen, inode->i_ino, 0); + 0, root->root_key.objectid, + inode->i_ino, orig_offset); BUG_ON(ret); } split_end = 0; @@ -824,7 +733,7 @@ again: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); BUG_ON(ret); - goto done; + goto release; } else if (split == start) { if (locked_end < extent_end) { ret = try_lock_extent(&BTRFS_I(inode)->io_tree, @@ -839,13 +748,12 @@ again: locked_end = extent_end; } btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); - extent_offset += split - key.offset; } else { BUG_ON(key.offset != start); - btrfs_set_file_extent_offset(leaf, fi, extent_offset + - split - key.offset); - btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); key.offset = split; + btrfs_set_file_extent_offset(leaf, fi, key.offset - + orig_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); btrfs_set_item_key_safe(trans, root, path, &key); extent_end = split; } @@ -864,7 +772,8 @@ again: struct btrfs_file_extent_item); key.offset = split; btrfs_set_item_key_safe(trans, root, path, &key); - btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_offset(leaf, fi, key.offset - + orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, other_end - split); goto done; @@ -886,10 +795,9 @@ again: btrfs_mark_buffer_dirty(leaf); - orig_parent = leaf->start; - ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, - orig_parent, root->root_key.objectid, - trans->transid, inode->i_ino); + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, + root->root_key.objectid, + inode->i_ino, orig_offset); BUG_ON(ret); btrfs_release_path(root, path); @@ -904,22 +812,16 @@ again: btrfs_set_file_extent_type(leaf, fi, extent_type); btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); - btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_compression(leaf, fi, 0); btrfs_set_file_extent_encryption(leaf, fi, 0); btrfs_set_file_extent_other_encoding(leaf, fi, 0); - - if (orig_parent != leaf->start) { - ret = btrfs_update_extent_ref(trans, root, bytenr, - orig_parent, leaf->start, - root->root_key.objectid, - trans->transid, inode->i_ino); - BUG_ON(ret); - } done: btrfs_mark_buffer_dirty(leaf); + +release: btrfs_release_path(root, path); if (split_end && split == start) { split = end; @@ -1125,7 +1027,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, if (will_write) { btrfs_fdatawrite_range(inode->i_mapping, pos, pos + write_bytes - 1, - WB_SYNC_NONE); + WB_SYNC_ALL); } else { balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages); @@ -1155,6 +1057,20 @@ out_nolock: page_cache_release(pinned[1]); *ppos = pos; + /* + * we want to make sure fsync finds this change + * but we haven't joined a transaction running right now. + * + * Later on, someone is sure to update the inode and get the + * real transid recorded. + * + * We set last_trans now to the fs_info generation + 1, + * this will either be one more than the running transaction + * or the generation used for the next transaction if there isn't + * one running right now. + */ + BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; + if (num_written > 0 && will_write) { struct btrfs_trans_handle *trans; @@ -1167,8 +1083,11 @@ out_nolock: ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); if (ret == 0) { - btrfs_sync_log(trans, root); - btrfs_end_transaction(trans, root); + ret = btrfs_sync_log(trans, root); + if (ret == 0) + btrfs_end_transaction(trans, root); + else + btrfs_commit_transaction(trans, root); } else { btrfs_commit_transaction(trans, root); } @@ -1185,6 +1104,18 @@ out_nolock: int btrfs_release_file(struct inode *inode, struct file *filp) { + /* + * ordered_data_close is set by settattr when we are about to truncate + * a file from a non-zero size to a zero size. This tries to + * flush down new bytes that may have been written if the + * application were using truncate to replace a file in place. + */ + if (BTRFS_I(inode)->ordered_data_close) { + BTRFS_I(inode)->ordered_data_close = 0; + btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); + if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + filemap_flush(inode->i_mapping); + } if (filp->private_data) btrfs_ioctl_trans_end(filp); return 0; @@ -1229,6 +1160,8 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) btrfs_wait_ordered_range(inode, 0, (u64)-1); root->log_batch++; + if (datasync && !(inode->i_state & I_DIRTY_PAGES)) + goto out; /* * ok we haven't committed the transaction yet, lets do a commit */ @@ -1260,8 +1193,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) if (ret > 0) { ret = btrfs_commit_transaction(trans, root); } else { - btrfs_sync_log(trans, root); - ret = btrfs_end_transaction(trans, root); + ret = btrfs_sync_log(trans, root); + if (ret == 0) + ret = btrfs_end_transaction(trans, root); + else + ret = btrfs_commit_transaction(trans, root); } mutex_lock(&dentry->d_inode->i_mutex); out: diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d1e5f0e84c58..4538e48581a5 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -18,6 +18,15 @@ #include <linux/sched.h> #include "ctree.h" +#include "free-space-cache.h" +#include "transaction.h" + +struct btrfs_free_space { + struct rb_node bytes_index; + struct rb_node offset_index; + u64 offset; + u64 bytes; +}; static int tree_insert_offset(struct rb_root *root, u64 offset, struct rb_node *node) @@ -68,14 +77,24 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes, } /* - * searches the tree for the given offset. If contains is set we will return - * the free space that contains the given offset. If contains is not set we - * will return the free space that starts at or after the given offset and is - * at least bytes long. + * searches the tree for the given offset. + * + * fuzzy == 1: this is used for allocations where we are given a hint of where + * to look for free space. Because the hint may not be completely on an offset + * mark, or the hint may no longer point to free space we need to fudge our + * results a bit. So we look for free space starting at or after offset with at + * least bytes size. We prefer to find as close to the given offset as we can. + * Also if the offset is within a free space range, then we will return the free + * space that contains the given offset, which means we can return a free space + * chunk with an offset before the provided offset. + * + * fuzzy == 0: this is just a normal tree search. Give us the free space that + * starts at the given offset which is at least bytes size, and if its not there + * return NULL. */ static struct btrfs_free_space *tree_search_offset(struct rb_root *root, u64 offset, u64 bytes, - int contains) + int fuzzy) { struct rb_node *n = root->rb_node; struct btrfs_free_space *entry, *ret = NULL; @@ -84,13 +103,14 @@ static struct btrfs_free_space *tree_search_offset(struct rb_root *root, entry = rb_entry(n, struct btrfs_free_space, offset_index); if (offset < entry->offset) { - if (!contains && + if (fuzzy && (!ret || entry->offset < ret->offset) && (bytes <= entry->bytes)) ret = entry; n = n->rb_left; } else if (offset > entry->offset) { - if ((entry->offset + entry->bytes - 1) >= offset && + if (fuzzy && + (entry->offset + entry->bytes - 1) >= offset && bytes <= entry->bytes) { ret = entry; break; @@ -171,6 +191,7 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, int ret = 0; + BUG_ON(!info->bytes); ret = tree_insert_offset(&block_group->free_space_offset, info->offset, &info->offset_index); if (ret) @@ -184,108 +205,70 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, return ret; } -static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) { struct btrfs_free_space *right_info; struct btrfs_free_space *left_info; struct btrfs_free_space *info = NULL; - struct btrfs_free_space *alloc_info; int ret = 0; - alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); - if (!alloc_info) + info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + if (!info) return -ENOMEM; + info->offset = offset; + info->bytes = bytes; + + spin_lock(&block_group->tree_lock); + /* * first we want to see if there is free space adjacent to the range we * are adding, if there is remove that struct and add a new one to * cover the entire range */ right_info = tree_search_offset(&block_group->free_space_offset, - offset+bytes, 0, 1); + offset+bytes, 0, 0); left_info = tree_search_offset(&block_group->free_space_offset, offset-1, 0, 1); - if (right_info && right_info->offset == offset+bytes) { + if (right_info) { unlink_free_space(block_group, right_info); - info = right_info; - info->offset = offset; - info->bytes += bytes; - } else if (right_info && right_info->offset != offset+bytes) { - printk(KERN_ERR "btrfs adding space in the middle of an " - "existing free space area. existing: " - "offset=%llu, bytes=%llu. new: offset=%llu, " - "bytes=%llu\n", (unsigned long long)right_info->offset, - (unsigned long long)right_info->bytes, - (unsigned long long)offset, - (unsigned long long)bytes); - BUG(); + info->bytes += right_info->bytes; + kfree(right_info); } - if (left_info) { + if (left_info && left_info->offset + left_info->bytes == offset) { unlink_free_space(block_group, left_info); - - if (unlikely((left_info->offset + left_info->bytes) != - offset)) { - printk(KERN_ERR "btrfs free space to the left " - "of new free space isn't " - "quite right. existing: offset=%llu, " - "bytes=%llu. new: offset=%llu, bytes=%llu\n", - (unsigned long long)left_info->offset, - (unsigned long long)left_info->bytes, - (unsigned long long)offset, - (unsigned long long)bytes); - BUG(); - } - - if (info) { - info->offset = left_info->offset; - info->bytes += left_info->bytes; - kfree(left_info); - } else { - info = left_info; - info->bytes += bytes; - } + info->offset = left_info->offset; + info->bytes += left_info->bytes; + kfree(left_info); } - if (info) { - ret = link_free_space(block_group, info); - if (!ret) - info = NULL; - goto out; - } - - info = alloc_info; - alloc_info = NULL; - info->offset = offset; - info->bytes = bytes; - ret = link_free_space(block_group, info); if (ret) kfree(info); -out: + + spin_unlock(&block_group->tree_lock); + if (ret) { printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); - if (ret == -EEXIST) - BUG(); + BUG_ON(ret == -EEXIST); } - kfree(alloc_info); - return ret; } -static int -__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) { struct btrfs_free_space *info; int ret = 0; + spin_lock(&block_group->tree_lock); + info = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); - if (info && info->offset == offset) { if (info->bytes < bytes) { printk(KERN_ERR "Found free space at %llu, size %llu," @@ -295,12 +278,14 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, (unsigned long long)bytes); WARN_ON(1); ret = -EINVAL; + spin_unlock(&block_group->tree_lock); goto out; } unlink_free_space(block_group, info); if (info->bytes == bytes) { kfree(info); + spin_unlock(&block_group->tree_lock); goto out; } @@ -308,6 +293,7 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, info->bytes -= bytes; ret = link_free_space(block_group, info); + spin_unlock(&block_group->tree_lock); BUG_ON(ret); } else if (info && info->offset < offset && info->offset + info->bytes >= offset + bytes) { @@ -333,70 +319,37 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, */ kfree(info); } - + spin_unlock(&block_group->tree_lock); /* step two, insert a new info struct to cover anything * before the hole */ - ret = __btrfs_add_free_space(block_group, old_start, - offset - old_start); + ret = btrfs_add_free_space(block_group, old_start, + offset - old_start); BUG_ON(ret); } else { + spin_unlock(&block_group->tree_lock); + if (!info) { + printk(KERN_ERR "couldn't find space %llu to free\n", + (unsigned long long)offset); + printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n", + block_group->cached, + (unsigned long long)block_group->key.objectid, + (unsigned long long)block_group->key.offset); + btrfs_dump_free_space(block_group, bytes); + } else if (info) { + printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, " + "but wanted offset=%llu bytes=%llu\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (unsigned long long)offset, + (unsigned long long)bytes); + } WARN_ON(1); } out: return ret; } -int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - int ret; - struct btrfs_free_space *sp; - - mutex_lock(&block_group->alloc_mutex); - ret = __btrfs_add_free_space(block_group, offset, bytes); - sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); - BUG_ON(!sp); - mutex_unlock(&block_group->alloc_mutex); - - return ret; -} - -int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - int ret; - struct btrfs_free_space *sp; - - ret = __btrfs_add_free_space(block_group, offset, bytes); - sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); - BUG_ON(!sp); - - return ret; -} - -int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - int ret = 0; - - mutex_lock(&block_group->alloc_mutex); - ret = __btrfs_remove_free_space(block_group, offset, bytes); - mutex_unlock(&block_group->alloc_mutex); - - return ret; -} - -int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - int ret; - - ret = __btrfs_remove_free_space(block_group, offset, bytes); - - return ret; -} - void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, u64 bytes) { @@ -408,6 +361,9 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes) count++; + printk(KERN_ERR "entry offset %llu, bytes %llu\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes); } printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" "\n", count); @@ -428,68 +384,341 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group) return ret; } +/* + * for a given cluster, put all of its extents back into the free + * space cache. If the block group passed doesn't match the block group + * pointed to by the cluster, someone else raced in and freed the + * cluster already. In that case, we just return without changing anything + */ +static int +__btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster) +{ + struct btrfs_free_space *entry; + struct rb_node *node; + + spin_lock(&cluster->lock); + if (cluster->block_group != block_group) + goto out; + + cluster->window_start = 0; + node = rb_first(&cluster->root); + while(node) { + entry = rb_entry(node, struct btrfs_free_space, offset_index); + node = rb_next(&entry->offset_index); + rb_erase(&entry->offset_index, &cluster->root); + link_free_space(block_group, entry); + } + list_del_init(&cluster->block_group_list); + + btrfs_put_block_group(cluster->block_group); + cluster->block_group = NULL; + cluster->root.rb_node = NULL; +out: + spin_unlock(&cluster->lock); + return 0; +} + void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) { struct btrfs_free_space *info; struct rb_node *node; + struct btrfs_free_cluster *cluster; + struct btrfs_free_cluster *safe; + + spin_lock(&block_group->tree_lock); + + list_for_each_entry_safe(cluster, safe, &block_group->cluster_list, + block_group_list) { + + WARN_ON(cluster->block_group != block_group); + __btrfs_return_cluster_to_free_space(block_group, cluster); + } - mutex_lock(&block_group->alloc_mutex); while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { info = rb_entry(node, struct btrfs_free_space, bytes_index); unlink_free_space(block_group, info); kfree(info); if (need_resched()) { - mutex_unlock(&block_group->alloc_mutex); + spin_unlock(&block_group->tree_lock); cond_resched(); - mutex_lock(&block_group->alloc_mutex); + spin_lock(&block_group->tree_lock); } } - mutex_unlock(&block_group->alloc_mutex); + spin_unlock(&block_group->tree_lock); } -#if 0 -static struct btrfs_free_space *btrfs_find_free_space_offset(struct - btrfs_block_group_cache - *block_group, u64 offset, - u64 bytes) +u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes, u64 empty_size) { - struct btrfs_free_space *ret; + struct btrfs_free_space *entry = NULL; + u64 ret = 0; - mutex_lock(&block_group->alloc_mutex); - ret = tree_search_offset(&block_group->free_space_offset, offset, - bytes, 0); - mutex_unlock(&block_group->alloc_mutex); + spin_lock(&block_group->tree_lock); + entry = tree_search_offset(&block_group->free_space_offset, offset, + bytes + empty_size, 1); + if (!entry) + entry = tree_search_bytes(&block_group->free_space_bytes, + offset, bytes + empty_size); + if (entry) { + unlink_free_space(block_group, entry); + ret = entry->offset; + entry->offset += bytes; + entry->bytes -= bytes; + + if (!entry->bytes) + kfree(entry); + else + link_free_space(block_group, entry); + } + spin_unlock(&block_group->tree_lock); return ret; } -static struct btrfs_free_space *btrfs_find_free_space_bytes(struct - btrfs_block_group_cache - *block_group, u64 offset, - u64 bytes) +/* + * given a cluster, put all of its extents back into the free space + * cache. If a block group is passed, this function will only free + * a cluster that belongs to the passed block group. + * + * Otherwise, it'll get a reference on the block group pointed to by the + * cluster and remove the cluster from it. + */ +int btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster) { - struct btrfs_free_space *ret; + int ret; - mutex_lock(&block_group->alloc_mutex); + /* first, get a safe pointer to the block group */ + spin_lock(&cluster->lock); + if (!block_group) { + block_group = cluster->block_group; + if (!block_group) { + spin_unlock(&cluster->lock); + return 0; + } + } else if (cluster->block_group != block_group) { + /* someone else has already freed it don't redo their work */ + spin_unlock(&cluster->lock); + return 0; + } + atomic_inc(&block_group->count); + spin_unlock(&cluster->lock); - ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); - mutex_unlock(&block_group->alloc_mutex); + /* now return any extents the cluster had on it */ + spin_lock(&block_group->tree_lock); + ret = __btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&block_group->tree_lock); + /* finally drop our ref */ + btrfs_put_block_group(block_group); return ret; } -#endif -struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache - *block_group, u64 offset, - u64 bytes) +/* + * given a cluster, try to allocate 'bytes' from it, returns 0 + * if it couldn't find anything suitably large, or a logical disk offset + * if things worked out + */ +u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, u64 bytes, + u64 min_start) { - struct btrfs_free_space *ret = NULL; + struct btrfs_free_space *entry = NULL; + struct rb_node *node; + u64 ret = 0; - ret = tree_search_offset(&block_group->free_space_offset, offset, - bytes, 0); - if (!ret) - ret = tree_search_bytes(&block_group->free_space_bytes, - offset, bytes); + spin_lock(&cluster->lock); + if (bytes > cluster->max_size) + goto out; + if (cluster->block_group != block_group) + goto out; + + node = rb_first(&cluster->root); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + + while(1) { + if (entry->bytes < bytes || entry->offset < min_start) { + struct rb_node *node; + + node = rb_next(&entry->offset_index); + if (!node) + break; + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } + ret = entry->offset; + + entry->offset += bytes; + entry->bytes -= bytes; + + if (entry->bytes == 0) { + rb_erase(&entry->offset_index, &cluster->root); + kfree(entry); + } + break; + } +out: + spin_unlock(&cluster->lock); return ret; } + +/* + * here we try to find a cluster of blocks in a block group. The goal + * is to find at least bytes free and up to empty_size + bytes free. + * We might not find them all in one contiguous area. + * + * returns zero and sets up cluster if things worked out, otherwise + * it returns -enospc + */ +int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size) +{ + struct btrfs_free_space *entry = NULL; + struct rb_node *node; + struct btrfs_free_space *next; + struct btrfs_free_space *last; + u64 min_bytes; + u64 window_start; + u64 window_free; + u64 max_extent = 0; + int total_retries = 0; + int ret; + + /* for metadata, allow allocates with more holes */ + if (btrfs_test_opt(root, SSD_SPREAD)) { + min_bytes = bytes + empty_size; + } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { + /* + * we want to do larger allocations when we are + * flushing out the delayed refs, it helps prevent + * making more work as we go along. + */ + if (trans->transaction->delayed_refs.flushing) + min_bytes = max(bytes, (bytes + empty_size) >> 1); + else + min_bytes = max(bytes, (bytes + empty_size) >> 4); + } else + min_bytes = max(bytes, (bytes + empty_size) >> 2); + + spin_lock(&block_group->tree_lock); + spin_lock(&cluster->lock); + + /* someone already found a cluster, hooray */ + if (cluster->block_group) { + ret = 0; + goto out; + } +again: + min_bytes = min(min_bytes, bytes + empty_size); + entry = tree_search_bytes(&block_group->free_space_bytes, + offset, min_bytes); + if (!entry) { + ret = -ENOSPC; + goto out; + } + window_start = entry->offset; + window_free = entry->bytes; + last = entry; + max_extent = entry->bytes; + + while(1) { + /* out window is just right, lets fill it */ + if (window_free >= bytes + empty_size) + break; + + node = rb_next(&last->offset_index); + if (!node) { + ret = -ENOSPC; + goto out; + } + next = rb_entry(node, struct btrfs_free_space, offset_index); + + /* + * we haven't filled the empty size and the window is + * very large. reset and try again + */ + if (next->offset - (last->offset + last->bytes) > 128 * 1024 || + next->offset - window_start > (bytes + empty_size) * 2) { + entry = next; + window_start = entry->offset; + window_free = entry->bytes; + last = entry; + max_extent = 0; + total_retries++; + if (total_retries % 64 == 0) { + if (min_bytes >= (bytes + empty_size)) { + ret = -ENOSPC; + goto out; + } + /* + * grow our allocation a bit, we're not having + * much luck + */ + min_bytes *= 2; + goto again; + } + } else { + last = next; + window_free += next->bytes; + if (entry->bytes > max_extent) + max_extent = entry->bytes; + } + } + + cluster->window_start = entry->offset; + + /* + * now we've found our entries, pull them out of the free space + * cache and put them into the cluster rbtree + * + * The cluster includes an rbtree, but only uses the offset index + * of each free space cache entry. + */ + while(1) { + node = rb_next(&entry->offset_index); + unlink_free_space(block_group, entry); + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index); + BUG_ON(ret); + + if (!node || entry == last) + break; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + } + ret = 0; + cluster->max_size = max_extent; + atomic_inc(&block_group->count); + list_add_tail(&cluster->block_group_list, &block_group->cluster_list); + cluster->block_group = block_group; +out: + spin_unlock(&cluster->lock); + spin_unlock(&block_group->tree_lock); + + return ret; +} + +/* + * simple code to zero out a cluster + */ +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) +{ + spin_lock_init(&cluster->lock); + spin_lock_init(&cluster->refill_lock); + cluster->root.rb_node = NULL; + cluster->max_size = 0; + INIT_LIST_HEAD(&cluster->block_group_list); + cluster->block_group = NULL; +} + diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h new file mode 100644 index 000000000000..266fb8764054 --- /dev/null +++ b/fs/btrfs/free-space-cache.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_FREE_SPACE_CACHE +#define __BTRFS_FREE_SPACE_CACHE + +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache + *block_group); +u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes, u64 empty_size); +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes); +u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); +int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size); +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster); +u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, u64 bytes, + u64 min_start); +int btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster); +#endif diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h index 2a020b276768..db2ff9773b99 100644 --- a/fs/btrfs/hash.h +++ b/fs/btrfs/hash.h @@ -19,9 +19,9 @@ #ifndef __HASH__ #define __HASH__ -#include "crc32c.h" +#include <linux/crc32c.h> static inline u64 btrfs_name_hash(const char *name, int len) { - return btrfs_crc32c((u32)~1, name, len); + return crc32c((u32)~1, name, len); } #endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 3d46fa1f29a4..6b627c611808 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -73,6 +73,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = -ENOENT; @@ -127,6 +129,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; + path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len); if (ret == -EEXIST) { diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index cc7334d833c9..9abbced1123d 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -79,7 +79,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, } path = btrfs_alloc_path(); BUG_ON(!path); - search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID); + search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID); search_key.objectid = search_start; search_key.type = 0; search_key.offset = 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7d4f948bc22a..7ffa3d34ea19 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -48,7 +48,6 @@ #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" -#include "ref-cache.h" #include "compression.h" #include "locking.h" @@ -70,7 +69,6 @@ static struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; -struct kmem_cache *btrfs_bit_radix_cachep; struct kmem_cache *btrfs_path_cachep; #define S_SHIFT 12 @@ -134,6 +132,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; + path->leave_spinning = 1; btrfs_set_trans_block_group(trans, inode); key.objectid = inode->i_ino; @@ -167,9 +166,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_CACHE_SIZE); - kaddr = kmap(cpage); + kaddr = kmap_atomic(cpage, KM_USER0); write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap(cpage); + kunmap_atomic(kaddr, KM_USER0); i++; ptr += cur_size; @@ -204,7 +203,7 @@ fail: * does the checks required to make sure the data is small enough * to fit as an inline extent. */ -static int cow_file_range_inline(struct btrfs_trans_handle *trans, +static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, size_t compressed_size, @@ -233,7 +232,7 @@ static int cow_file_range_inline(struct btrfs_trans_handle *trans, } ret = btrfs_drop_extents(trans, root, inode, start, - aligned_end, start, &hint_byte); + aligned_end, aligned_end, start, &hint_byte); BUG_ON(ret); if (isize > actual_end) @@ -369,7 +368,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (!btrfs_test_flag(inode, NOCOMPRESS) && + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && btrfs_test_opt(root, COMPRESS)) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); @@ -470,7 +469,7 @@ again: nr_pages_ret = 0; /* flag the file so we don't compress in the future */ - btrfs_set_flag(inode, NOCOMPRESS); + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; } if (will_compress) { *num_added += 1; @@ -854,11 +853,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, u64 cur_end; int limit = 10 * 1024 * 1042; - if (!btrfs_test_opt(root, COMPRESS)) { - return cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); - } - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC, 1, 0, GFP_NOFS); while (start < end) { @@ -868,7 +862,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->locked_page = locked_page; async_cow->start = start; - if (btrfs_test_flag(inode, NOCOMPRESS)) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) cur_end = end; else cur_end = min(end, start + 512 * 1024 - 1); @@ -935,7 +929,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root, * If no cow copies or snapshots exist, we write directly to the existing * blocks on disk */ -static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, +static noinline int run_delalloc_nocow(struct inode *inode, + struct page *locked_page, u64 start, u64 end, int *page_started, int force, unsigned long *nr_written) { @@ -948,6 +943,7 @@ static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, u64 cow_start; u64 cur_offset; u64 extent_end; + u64 extent_offset; u64 disk_bytenr; u64 num_bytes; int extent_type; @@ -1009,6 +1005,7 @@ next_slot: if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); extent_end = found_key.offset + btrfs_file_extent_num_bytes(leaf, fi); if (extent_end <= start) { @@ -1026,9 +1023,10 @@ next_slot: if (btrfs_extent_readonly(root, disk_bytenr)) goto out_check; if (btrfs_cross_ref_exist(trans, root, inode->i_ino, - disk_bytenr)) + found_key.offset - + extent_offset, disk_bytenr)) goto out_check; - disk_bytenr += btrfs_file_extent_offset(leaf, fi); + disk_bytenr += extent_offset; disk_bytenr += cur_offset - found_key.offset; num_bytes = min(end + 1, extent_end) - cur_offset; /* @@ -1133,17 +1131,20 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, unsigned long *nr_written) { int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; - if (btrfs_test_flag(inode, NODATACOW)) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); - else if (btrfs_test_flag(inode, PREALLOC)) + else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); + else if (!btrfs_test_opt(root, COMPRESS)) + ret = cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); else ret = cow_file_range_async(inode, locked_page, start, end, page_started, nr_written); - return ret; } @@ -1289,7 +1290,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int ret = 0; int skip_sum; - skip_sum = btrfs_test_flag(inode, NODATASUM); + skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); BUG_ON(ret); @@ -1439,6 +1440,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct inode *inode, u64 file_pos, u64 disk_bytenr, u64 disk_num_bytes, u64 num_bytes, u64 ram_bytes, + u64 locked_end, u8 compression, u8 encryption, u16 other_encoding, int extent_type) { @@ -1453,8 +1455,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); + path->leave_spinning = 1; ret = btrfs_drop_extents(trans, root, inode, file_pos, - file_pos + num_bytes, file_pos, &hint); + file_pos + num_bytes, locked_end, + file_pos, &hint); BUG_ON(ret); ins.objectid = inode->i_ino; @@ -1475,6 +1479,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, fi, compression); btrfs_set_file_extent_encryption(leaf, fi, encryption); btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); + + btrfs_unlock_up_safe(path, 1); + btrfs_set_lock_blocking(leaf); + btrfs_mark_buffer_dirty(leaf); inode_add_bytes(inode, num_bytes); @@ -1483,15 +1491,39 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_alloc_reserved_extent(trans, root, leaf->start, - root->root_key.objectid, - trans->transid, inode->i_ino, &ins); + ret = btrfs_alloc_reserved_file_extent(trans, root, + root->root_key.objectid, + inode->i_ino, file_pos, &ins); BUG_ON(ret); - btrfs_free_path(path); + return 0; } +/* + * helper function for btrfs_finish_ordered_io, this + * just reads in some of the csum leaves to prime them into ram + * before we start the transaction. It limits the amount of btree + * reads required while inside the transaction. + */ +static noinline void reada_csum(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_ordered_extent *ordered_extent) +{ + struct btrfs_ordered_sum *sum; + u64 bytenr; + + sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum, + list); + bytenr = sum->sums[0].bytenr; + + /* + * we don't care about the results, the point of this search is + * just to get the btree leaves into ram + */ + btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0); +} + /* as ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. @@ -1500,8 +1532,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - struct btrfs_ordered_extent *ordered_extent; + struct btrfs_ordered_extent *ordered_extent = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_path *path; int compressed = 0; int ret; @@ -1509,9 +1542,33 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) if (!ret) return 0; + /* + * before we join the transaction, try to do some of our IO. + * This will limit the amount of IO that we have to do with + * the transaction running. We're unlikely to need to do any + * IO if the file extents are new, the disk_i_size checks + * covers the most common case. + */ + if (start < BTRFS_I(inode)->disk_i_size) { + path = btrfs_alloc_path(); + if (path) { + ret = btrfs_lookup_file_extent(NULL, root, path, + inode->i_ino, + start, 0); + ordered_extent = btrfs_lookup_ordered_extent(inode, + start); + if (!list_empty(&ordered_extent->list)) { + btrfs_release_path(root, path); + reada_csum(root, path, ordered_extent); + } + btrfs_free_path(path); + } + } + trans = btrfs_join_transaction(root, 1); - ordered_extent = btrfs_lookup_ordered_extent(inode, start); + if (!ordered_extent) + ordered_extent = btrfs_lookup_ordered_extent(inode, start); BUG_ON(!ordered_extent); if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) goto nocow; @@ -1536,6 +1593,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->disk_len, ordered_extent->len, ordered_extent->len, + ordered_extent->file_offset + + ordered_extent->len, compressed, 0, 0, BTRFS_FILE_EXTENT_REG); BUG_ON(ret); @@ -1731,7 +1790,8 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, ClearPageChecked(page); goto good; } - if (btrfs_test_flag(inode, NODATASUM)) + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) return 0; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && @@ -1765,10 +1825,12 @@ good: return 0; zeroit: - printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " - "private %llu\n", page->mapping->host->i_ino, - (unsigned long long)start, csum, - (unsigned long long)private); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " + "private %llu\n", page->mapping->host->i_ino, + (unsigned long long)start, csum, + (unsigned long long)private); + } memset(kaddr + offset, 1, end - start + 1); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); @@ -1897,23 +1959,13 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) * crossing root thing. we store the inode number in the * offset of the orphan item. */ - inode = btrfs_iget_locked(root->fs_info->sb, - found_key.offset, root); - if (!inode) + found_key.objectid = found_key.offset; + found_key.type = BTRFS_INODE_ITEM_KEY; + found_key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &found_key, root); + if (IS_ERR(inode)) break; - if (inode->i_state & I_NEW) { - BTRFS_I(inode)->root = root; - - /* have to set the location manually */ - BTRFS_I(inode)->location.objectid = inode->i_ino; - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.offset = 0; - - btrfs_read_locked_inode(inode); - unlock_new_inode(inode); - } - /* * add this inode to the orphan list so btrfs_orphan_del does * the proper thing when we hit it @@ -1957,9 +2009,60 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) } /* + * very simple check to peek ahead in the leaf looking for xattrs. If we + * don't find any xattrs, we know there can't be any acls. + * + * slot is the slot the inode is in, objectid is the objectid of the inode + */ +static noinline int acls_after_inode_item(struct extent_buffer *leaf, + int slot, u64 objectid) +{ + u32 nritems = btrfs_header_nritems(leaf); + struct btrfs_key found_key; + int scanned = 0; + + slot++; + while (slot < nritems) { + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* we found a different objectid, there must not be acls */ + if (found_key.objectid != objectid) + return 0; + + /* we found an xattr, assume we've got an acl */ + if (found_key.type == BTRFS_XATTR_ITEM_KEY) + return 1; + + /* + * we found a key greater than an xattr key, there can't + * be any acls later on + */ + if (found_key.type > BTRFS_XATTR_ITEM_KEY) + return 0; + + slot++; + scanned++; + + /* + * it goes inode, inode backrefs, xattrs, extents, + * so if there are a ton of hard links to an inode there can + * be a lot of backrefs. Don't waste time searching too hard, + * this is just an optimization + */ + if (scanned >= 8) + break; + } + /* we hit the end of the leaf before we found an xattr or + * something larger than an xattr. We have to assume the inode + * has acls + */ + return 1; +} + +/* * read an inode from the btree into the in-memory inode */ -void btrfs_read_locked_inode(struct inode *inode) +static void btrfs_read_locked_inode(struct inode *inode) { struct btrfs_path *path; struct extent_buffer *leaf; @@ -1967,6 +2070,7 @@ void btrfs_read_locked_inode(struct inode *inode) struct btrfs_timespec *tspec; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; + int maybe_acls; u64 alloc_group_block; u32 rdev; int ret; @@ -2013,6 +2117,14 @@ void btrfs_read_locked_inode(struct inode *inode) alloc_group_block = btrfs_inode_block_group(leaf, inode_item); + /* + * try to precache a NULL acl entry for files that don't have + * any xattrs or acls + */ + maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); + if (!maybe_acls) + cache_no_acl(inode); + BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, alloc_group_block, 0); btrfs_free_path(path); @@ -2043,6 +2155,8 @@ void btrfs_read_locked_inode(struct inode *inode) init_special_inode(inode, inode->i_mode, rdev); break; } + + btrfs_update_iflags(inode); return; make_bad: @@ -2101,6 +2215,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); + path->leave_spinning = 1; ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, 1); if (ret) { @@ -2147,6 +2262,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto err; } + path->leave_spinning = 1; di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, name, name_len, -1); if (IS_ERR(di)) { @@ -2190,8 +2306,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, dir->i_ino); BUG_ON(ret != 0 && ret != -ENOENT); - if (ret != -ENOENT) - BTRFS_I(dir)->log_dirty_trans = trans->transid; ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); @@ -2206,7 +2320,6 @@ err: btrfs_update_inode(trans, root, dir); btrfs_drop_nlink(inode); ret = btrfs_update_inode(trans, root, inode); - dir->i_sb->s_dirt = 1; out: return ret; } @@ -2224,6 +2337,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, dir); + + btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); + ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, dentry->d_name.name, dentry->d_name.len); @@ -2475,9 +2591,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *fi; u64 extent_start = 0; u64 extent_num_bytes = 0; + u64 extent_offset = 0; u64 item_end = 0; - u64 root_gen = 0; - u64 root_owner = 0; int found_extent; int del_item; int pending_del_nr = 0; @@ -2498,6 +2613,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, key.type = (u8)-1; search_again: + path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto error; @@ -2591,6 +2707,9 @@ search_again: extent_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + extent_offset = found_key.offset - + btrfs_file_extent_offset(leaf, fi); + /* FIXME blocksize != 4096 */ num_dec = btrfs_file_extent_num_bytes(leaf, fi); if (extent_start != 0) { @@ -2598,8 +2717,6 @@ search_again: if (root->ref_cows) inode_sub_bytes(inode, num_dec); } - root_gen = btrfs_header_generation(leaf); - root_owner = btrfs_header_owner(leaf); } } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { /* @@ -2643,11 +2760,12 @@ delete: } else { break; } - if (found_extent) { + if (found_extent && root->ref_cows) { + btrfs_set_path_blocking(path); ret = btrfs_free_extent(trans, root, extent_start, - extent_num_bytes, - leaf->start, root_owner, - root_gen, inode->i_ino, 0); + extent_num_bytes, 0, + btrfs_header_owner(leaf), + inode->i_ino, extent_offset); BUG_ON(ret); } next: @@ -2685,7 +2803,6 @@ error: pending_del_nr); } btrfs_free_path(path); - inode->i_sb->s_dirt = 1; return ret; } @@ -2818,6 +2935,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) err = btrfs_drop_extents(trans, root, inode, cur_offset, cur_offset + hole_size, + block_end, cur_offset, &hint_byte); if (err) break; @@ -2848,11 +2966,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; - if (S_ISREG(inode->i_mode) && - attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { - err = btrfs_cont_expand(inode, attr->ia_size); - if (err) - return err; + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { + if (attr->ia_size > inode->i_size) { + err = btrfs_cont_expand(inode, attr->ia_size); + if (err) + return err; + } else if (inode->i_size > 0 && + attr->ia_size == 0) { + + /* we're truncating a file that used to have good + * data down to zero. Make sure it gets into + * the ordered flush list so that any new writes + * get down to disk quickly. + */ + BTRFS_I(inode)->ordered_data_close = 1; + } } err = inode_setattr(inode, attr); @@ -2968,13 +3096,49 @@ static int fixup_tree_root_location(struct btrfs_root *root, return 0; } +static void inode_tree_add(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *entry; + struct rb_node **p = &root->inode_tree.rb_node; + struct rb_node *parent = NULL; + + spin_lock(&root->inode_lock); + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_inode, rb_node); + + if (inode->i_ino < entry->vfs_inode.i_ino) + p = &(*p)->rb_left; + else if (inode->i_ino > entry->vfs_inode.i_ino) + p = &(*p)->rb_right; + else { + WARN_ON(!(entry->vfs_inode.i_state & + (I_WILL_FREE | I_FREEING | I_CLEAR))); + break; + } + } + rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); + rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); + spin_unlock(&root->inode_lock); +} + +static void inode_tree_del(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { + spin_lock(&root->inode_lock); + rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); + spin_unlock(&root->inode_lock); + RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + } +} + static noinline void init_btrfs_i(struct inode *inode) { struct btrfs_inode *bi = BTRFS_I(inode); - bi->i_acl = NULL; - bi->i_default_acl = NULL; - bi->generation = 0; bi->sequence = 0; bi->last_trans = 0; @@ -2984,13 +3148,16 @@ static noinline void init_btrfs_i(struct inode *inode) bi->disk_i_size = 0; bi->flags = 0; bi->index_cnt = (u64)-1; - bi->log_dirty_trans = 0; + bi->last_unlink_trans = 0; + bi->ordered_data_close = 0; extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping, GFP_NOFS); extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, inode->i_mapping, GFP_NOFS); INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); + INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); + RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); mutex_init(&BTRFS_I(inode)->extent_mutex); mutex_init(&BTRFS_I(inode)->log_mutex); @@ -3013,26 +3180,9 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) args->root == BTRFS_I(inode)->root; } -struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, - struct btrfs_root *root, int wait) -{ - struct inode *inode; - struct btrfs_iget_args args; - args.ino = objectid; - args.root = root; - - if (wait) { - inode = ilookup5(s, objectid, btrfs_find_actor, - (void *)&args); - } else { - inode = ilookup5_nowait(s, objectid, btrfs_find_actor, - (void *)&args); - } - return inode; -} - -struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, - struct btrfs_root *root) +static struct inode *btrfs_iget_locked(struct super_block *s, + u64 objectid, + struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; @@ -3049,24 +3199,21 @@ struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, * Returns in *is_new if the inode was read from disk */ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *is_new) + struct btrfs_root *root) { struct inode *inode; inode = btrfs_iget_locked(s, location->objectid, root); if (!inode) - return ERR_PTR(-EACCES); + return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { BTRFS_I(inode)->root = root; memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); btrfs_read_locked_inode(inode); + + inode_tree_add(inode); unlock_new_inode(inode); - if (is_new) - *is_new = 1; - } else { - if (is_new) - *is_new = 0; } return inode; @@ -3079,7 +3226,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) struct btrfs_root *root = bi->root; struct btrfs_root *sub_root = root; struct btrfs_key location; - int ret, new; + int ret; if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -3097,7 +3244,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_PTR(ret); if (ret > 0) return ERR_PTR(-ENOENT); - inode = btrfs_iget(dir->i_sb, &location, sub_root, &new); + inode = btrfs_iget(dir->i_sb, &location, sub_root); if (IS_ERR(inode)) return ERR_CAST(inode); } @@ -3411,8 +3558,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (dir) { ret = btrfs_set_inode_index(dir, index); - if (ret) + if (ret) { + iput(inode); return ERR_PTR(ret); + } } /* * index_cnt is ignored for everything but a dir, @@ -3431,12 +3580,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, owner = 1; BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, alloc_hint, owner); - if ((mode & S_IFREG)) { - if (btrfs_test_opt(root, NODATASUM)) - btrfs_set_flag(inode, NODATASUM); - if (btrfs_test_opt(root, NODATACOW)) - btrfs_set_flag(inode, NODATACOW); - } key[0].objectid = objectid; btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); @@ -3449,6 +3592,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, sizes[0] = sizeof(struct btrfs_inode_item); sizes[1] = name_len + sizeof(*ref); + path->leave_spinning = 1; ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); if (ret != 0) goto fail; @@ -3488,12 +3632,23 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, location->offset = 0; btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + btrfs_inherit_iflags(inode, dir); + + if ((mode & S_IFREG)) { + if (btrfs_test_opt(root, NODATASUM)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + if (btrfs_test_opt(root, NODATACOW)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; + } + insert_inode_hash(inode); + inode_tree_add(inode); return inode; fail: if (dir) BTRFS_I(dir)->index_cnt--; btrfs_free_path(path); + iput(inode); return ERR_PTR(ret); } @@ -3607,7 +3762,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); btrfs_update_inode(trans, root, inode); } - dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); btrfs_update_inode_block_group(trans, dir); out_unlock: @@ -3672,7 +3826,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } - dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); btrfs_update_inode_block_group(trans, dir); out_unlock: @@ -3719,7 +3872,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (err) drop_inode = 1; - dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, dir); err = btrfs_update_inode(trans, root, inode); @@ -3727,6 +3879,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, drop_inode = 1; nr = trans->blocks_used; + + btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); btrfs_end_transaction_throttle(trans, root); fail: if (drop_inode) { @@ -3799,7 +3953,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) d_instantiate(dentry, inode); drop_on_err = 0; - dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); btrfs_update_inode_block_group(trans, dir); @@ -4151,7 +4304,6 @@ out: } if (err) { free_extent_map(em); - WARN_ON(1); return ERR_PTR(err); } return em; @@ -4292,8 +4444,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. */ -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = fdentry(vma->vm_file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -4306,10 +4459,15 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) u64 page_end; ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); - if (ret) + if (ret) { + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else /* -ENOSPC, -EIO, etc */ + ret = VM_FAULT_SIGBUS; goto out; + } - ret = -EINVAL; + ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ again: lock_page(page); size = i_size_read(inode); @@ -4357,6 +4515,8 @@ again: } ClearPageChecked(page); set_page_dirty(page); + + BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: @@ -4382,6 +4542,27 @@ static void btrfs_truncate(struct inode *inode) btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); trans = btrfs_start_transaction(root, 1); + + /* + * setattr is responsible for setting the ordered_data_close flag, + * but that is only tested during the last file release. That + * could happen well after the next commit, leaving a great big + * window where new writes may get lost if someone chooses to write + * to this file after truncating to zero + * + * The inode doesn't have any dirty data here, and so if we commit + * this is a noop. If someone immediately starts writing to the inode + * it is very likely we'll catch some of their writes in this + * transaction, and the commit will find this file on the ordered + * data list with good things to send down. + * + * This is a best effort solution, there is still a window where + * using truncate to replace the contents of the file will + * end up with a zero length file after a crash. + */ + if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) + btrfs_add_ordered_operation(trans, root, inode); + btrfs_set_trans_block_group(trans, inode); btrfs_i_size_write(inode, inode->i_size); @@ -4455,32 +4636,37 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_trans = 0; ei->logged_trans = 0; btrfs_ordered_inode_tree_init(&ei->ordered_tree); - ei->i_acl = BTRFS_ACL_NOT_CACHED; - ei->i_default_acl = BTRFS_ACL_NOT_CACHED; INIT_LIST_HEAD(&ei->i_orphan); + INIT_LIST_HEAD(&ei->ordered_operations); return &ei->vfs_inode; } void btrfs_destroy_inode(struct inode *inode) { struct btrfs_ordered_extent *ordered; + struct btrfs_root *root = BTRFS_I(inode)->root; + WARN_ON(!list_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); - if (BTRFS_I(inode)->i_acl && - BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED) - posix_acl_release(BTRFS_I(inode)->i_acl); - if (BTRFS_I(inode)->i_default_acl && - BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) - posix_acl_release(BTRFS_I(inode)->i_default_acl); + /* + * Make sure we're properly removed from the ordered operation + * lists. + */ + smp_mb(); + if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { + spin_lock(&root->fs_info->ordered_extent_lock); + list_del_init(&BTRFS_I(inode)->ordered_operations); + spin_unlock(&root->fs_info->ordered_extent_lock); + } - spin_lock(&BTRFS_I(inode)->root->list_lock); + spin_lock(&root->list_lock); if (!list_empty(&BTRFS_I(inode)->i_orphan)) { printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" " list\n", inode->i_ino); dump_stack(); } - spin_unlock(&BTRFS_I(inode)->root->list_lock); + spin_unlock(&root->list_lock); while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); @@ -4496,6 +4682,7 @@ void btrfs_destroy_inode(struct inode *inode) btrfs_put_ordered_extent(ordered); } } + inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } @@ -4515,47 +4702,36 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_trans_handle_cachep); if (btrfs_transaction_cachep) kmem_cache_destroy(btrfs_transaction_cachep); - if (btrfs_bit_radix_cachep) - kmem_cache_destroy(btrfs_bit_radix_cachep); if (btrfs_path_cachep) kmem_cache_destroy(btrfs_path_cachep); } -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *)) -{ - return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD | extra_flags), ctor); -} - int btrfs_init_cachep(void) { - btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache", - sizeof(struct btrfs_inode), - 0, init_once); + btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", + sizeof(struct btrfs_inode), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); if (!btrfs_inode_cachep) goto fail; - btrfs_trans_handle_cachep = - btrfs_cache_create("btrfs_trans_handle_cache", - sizeof(struct btrfs_trans_handle), - 0, NULL); + + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", + sizeof(struct btrfs_trans_handle), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_trans_handle_cachep) goto fail; - btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache", - sizeof(struct btrfs_transaction), - 0, NULL); + + btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", + sizeof(struct btrfs_transaction), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_transaction_cachep) goto fail; - btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache", - sizeof(struct btrfs_path), - 0, NULL); + + btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", + sizeof(struct btrfs_path), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_path_cachep) goto fail; - btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256, - SLAB_DESTROY_BY_RCU, NULL); - if (!btrfs_bit_radix_cachep) - goto fail; + return 0; fail: btrfs_destroy_cachep(); @@ -4605,8 +4781,36 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret) goto out_unlock; + /* + * we're using rename to replace one file with another. + * and the replacement file is large. Start IO on it now so + * we don't add too much work to the end of the transaction + */ + if (new_inode && old_inode && S_ISREG(old_inode->i_mode) && + new_inode->i_size && + old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + filemap_flush(old_inode->i_mapping); + trans = btrfs_start_transaction(root, 1); + /* + * make sure the inode gets flushed if it is replacing + * something. + */ + if (new_inode && new_inode->i_size && + old_inode && S_ISREG(old_inode->i_mode)) { + btrfs_add_ordered_operation(trans, root, old_inode); + } + + /* + * this is an ugly little race, but the rename is required to make + * sure that if we crash, the inode is either at the old name + * or the new one. pinning the log transaction lets us make sure + * we don't allow a log commit to come in after we unlink the + * name but before we add the new name back in. + */ + btrfs_pin_log_trans(root); + btrfs_set_trans_block_group(trans, new_dir); btrfs_inc_nlink(old_dentry->d_inode); @@ -4614,6 +4818,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dir->i_ctime = new_dir->i_mtime = ctime; old_inode->i_ctime = ctime; + if (old_dentry->d_parent != new_dentry->d_parent) + btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); + ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, old_dentry->d_name.name, old_dentry->d_name.len); @@ -4645,7 +4852,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret) goto out_fail; + btrfs_log_new_name(trans, old_inode, old_dir, + new_dentry->d_parent); out_fail: + + /* this btrfs_end_log_trans just allows the current + * log-sub transaction to complete + */ + btrfs_end_log_trans(root); btrfs_end_transaction_throttle(trans, root); out_unlock: return ret; @@ -4758,7 +4972,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } - dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); btrfs_update_inode_block_group(trans, dir); if (drop_inode) @@ -4813,10 +5026,10 @@ out_fail: return err; } -static int prealloc_file_range(struct inode *inode, u64 start, u64 end, - u64 alloc_hint, int mode) +static int prealloc_file_range(struct btrfs_trans_handle *trans, + struct inode *inode, u64 start, u64 end, + u64 locked_end, u64 alloc_hint, int mode) { - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 alloc_size; @@ -4824,10 +5037,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, u64 num_bytes = end - start; int ret = 0; - trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); - btrfs_set_trans_block_group(trans, inode); - while (num_bytes > 0) { alloc_size = min(num_bytes, root->fs_info->max_extent); ret = btrfs_reserve_extent(trans, root, alloc_size, @@ -4840,7 +5049,8 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, ret = insert_reserved_file_extent(trans, inode, cur_offset, ins.objectid, ins.offset, ins.offset, - ins.offset, 0, 0, 0, + ins.offset, locked_end, + 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); BUG_ON(ret); num_bytes -= ins.offset; @@ -4850,7 +5060,7 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, out: if (cur_offset > start) { inode->i_ctime = CURRENT_TIME; - btrfs_set_flag(inode, PREALLOC); + BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && cur_offset > i_size_read(inode)) btrfs_i_size_write(inode, cur_offset); @@ -4858,7 +5068,6 @@ out: BUG_ON(ret); } - btrfs_end_transaction(trans, root); return ret; } @@ -4870,13 +5079,22 @@ static long btrfs_fallocate(struct inode *inode, int mode, u64 alloc_start; u64 alloc_end; u64 alloc_hint = 0; + u64 locked_end; u64 mask = BTRFS_I(inode)->root->sectorsize - 1; struct extent_map *em; + struct btrfs_trans_handle *trans; + struct btrfs_root *root; int ret; alloc_start = offset & ~mask; alloc_end = (offset + len + mask) & ~mask; + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. + */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + mutex_lock(&inode->i_mutex); if (alloc_start > inode->i_size) { ret = btrfs_cont_expand(inode, alloc_start); @@ -4884,10 +5102,28 @@ static long btrfs_fallocate(struct inode *inode, int mode, goto out; } + root = BTRFS_I(inode)->root; + + ret = btrfs_check_data_free_space(root, inode, + alloc_end - alloc_start); + if (ret) + goto out; + + locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; - lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, - alloc_end - 1, GFP_NOFS); + + trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1); + if (!trans) { + ret = -EIO; + goto out_free; + } + + /* the extent lock is ordered inside the running + * transaction + */ + lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + GFP_NOFS); ordered = btrfs_lookup_first_ordered_extent(inode, alloc_end - 1); if (ordered && @@ -4895,7 +5131,13 @@ static long btrfs_fallocate(struct inode *inode, int mode, ordered->file_offset < alloc_end) { btrfs_put_ordered_extent(ordered); unlock_extent(&BTRFS_I(inode)->io_tree, - alloc_start, alloc_end - 1, GFP_NOFS); + alloc_start, locked_end, GFP_NOFS); + btrfs_end_transaction(trans, BTRFS_I(inode)->root); + + /* + * we can't wait on the range with the transaction + * running or with the extent lock held + */ btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); } else { @@ -4913,8 +5155,9 @@ static long btrfs_fallocate(struct inode *inode, int mode, last_byte = min(extent_map_end(em), alloc_end); last_byte = (last_byte + mask) & ~mask; if (em->block_start == EXTENT_MAP_HOLE) { - ret = prealloc_file_range(inode, cur_offset, - last_byte, alloc_hint, mode); + ret = prealloc_file_range(trans, inode, cur_offset, + last_byte, locked_end + 1, + alloc_hint, mode); if (ret < 0) { free_extent_map(em); break; @@ -4930,8 +5173,12 @@ static long btrfs_fallocate(struct inode *inode, int mode, break; } } - unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, + unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, GFP_NOFS); + + btrfs_end_transaction(trans, BTRFS_I(inode)->root); +out_free: + btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start); out: mutex_unlock(&inode->i_mutex); return ret; @@ -4944,7 +5191,7 @@ static int btrfs_set_page_dirty(struct page *page) static int btrfs_permission(struct inode *inode, int mask) { - if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE)) + if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) return -EACCES; return generic_permission(inode, mask, btrfs_check_acl); } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bca729fc80c8..9f4db848db10 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -50,7 +50,177 @@ #include "volumes.h" #include "locking.h" +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & ~FS_DIRSYNC_FL; + else + return flags & (FS_NODUMP_FL | FS_NOATIME_FL); +} + +/* + * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl. + */ +static unsigned int btrfs_flags_to_ioctl(unsigned int flags) +{ + unsigned int iflags = 0; + + if (flags & BTRFS_INODE_SYNC) + iflags |= FS_SYNC_FL; + if (flags & BTRFS_INODE_IMMUTABLE) + iflags |= FS_IMMUTABLE_FL; + if (flags & BTRFS_INODE_APPEND) + iflags |= FS_APPEND_FL; + if (flags & BTRFS_INODE_NODUMP) + iflags |= FS_NODUMP_FL; + if (flags & BTRFS_INODE_NOATIME) + iflags |= FS_NOATIME_FL; + if (flags & BTRFS_INODE_DIRSYNC) + iflags |= FS_DIRSYNC_FL; + + return iflags; +} + +/* + * Update inode->i_flags based on the btrfs internal flags. + */ +void btrfs_update_iflags(struct inode *inode) +{ + struct btrfs_inode *ip = BTRFS_I(inode); + + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + + if (ip->flags & BTRFS_INODE_SYNC) + inode->i_flags |= S_SYNC; + if (ip->flags & BTRFS_INODE_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + if (ip->flags & BTRFS_INODE_APPEND) + inode->i_flags |= S_APPEND; + if (ip->flags & BTRFS_INODE_NOATIME) + inode->i_flags |= S_NOATIME; + if (ip->flags & BTRFS_INODE_DIRSYNC) + inode->i_flags |= S_DIRSYNC; +} + +/* + * Inherit flags from the parent inode. + * + * Unlike extN we don't have any flags we don't want to inherit currently. + */ +void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) +{ + unsigned int flags; + + if (!dir) + return; + + flags = BTRFS_I(dir)->flags; + + if (S_ISREG(inode->i_mode)) + flags &= ~BTRFS_INODE_DIRSYNC; + else if (!S_ISDIR(inode->i_mode)) + flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); + + BTRFS_I(inode)->flags = flags; + btrfs_update_iflags(inode); +} +static int btrfs_ioctl_getflags(struct file *file, void __user *arg) +{ + struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode); + unsigned int flags = btrfs_flags_to_ioctl(ip->flags); + + if (copy_to_user(arg, &flags, sizeof(flags))) + return -EFAULT; + return 0; +} + +static int btrfs_ioctl_setflags(struct file *file, void __user *arg) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct btrfs_inode *ip = BTRFS_I(inode); + struct btrfs_root *root = ip->root; + struct btrfs_trans_handle *trans; + unsigned int flags, oldflags; + int ret; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL | FS_DIRSYNC_FL)) + return -EOPNOTSUPP; + + if (!is_owner_or_cap(inode)) + return -EACCES; + + mutex_lock(&inode->i_mutex); + + flags = btrfs_mask_flags(inode->i_mode, flags); + oldflags = btrfs_flags_to_ioctl(ip->flags); + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + ret = -EPERM; + goto out_unlock; + } + } + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + goto out_unlock; + + if (flags & FS_SYNC_FL) + ip->flags |= BTRFS_INODE_SYNC; + else + ip->flags &= ~BTRFS_INODE_SYNC; + if (flags & FS_IMMUTABLE_FL) + ip->flags |= BTRFS_INODE_IMMUTABLE; + else + ip->flags &= ~BTRFS_INODE_IMMUTABLE; + if (flags & FS_APPEND_FL) + ip->flags |= BTRFS_INODE_APPEND; + else + ip->flags &= ~BTRFS_INODE_APPEND; + if (flags & FS_NODUMP_FL) + ip->flags |= BTRFS_INODE_NODUMP; + else + ip->flags &= ~BTRFS_INODE_NODUMP; + if (flags & FS_NOATIME_FL) + ip->flags |= BTRFS_INODE_NOATIME; + else + ip->flags &= ~BTRFS_INODE_NOATIME; + if (flags & FS_DIRSYNC_FL) + ip->flags |= BTRFS_INODE_DIRSYNC; + else + ip->flags &= ~BTRFS_INODE_DIRSYNC; + + + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + + btrfs_update_iflags(inode); + inode->i_ctime = CURRENT_TIME; + btrfs_end_transaction(trans, root); + + mnt_drop_write(file->f_path.mnt); + out_unlock: + mutex_unlock(&inode->i_mutex); + return 0; +} + +static int btrfs_ioctl_getversion(struct file *file, int __user *arg) +{ + struct inode *inode = file->f_path.dentry->d_inode; + + return put_user(inode->i_generation, arg); +} static noinline int create_subvol(struct btrfs_root *root, struct dentry *dentry, @@ -82,22 +252,25 @@ static noinline int create_subvol(struct btrfs_root *root, if (ret) goto fail; - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - objectid, trans->transid, 0, 0, 0); + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, + 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; } - btrfs_set_header_nritems(leaf, 0); - btrfs_set_header_level(leaf, 0); + memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(leaf, leaf->start); btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(leaf, objectid); write_extent_buffer(leaf, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(leaf), BTRFS_FSID_SIZE); + write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(leaf), + BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); inode_item = &root_item.inode; @@ -125,7 +298,7 @@ static noinline int create_subvol(struct btrfs_root *root, btrfs_set_root_dirid(&root_item, new_dirid); key.objectid = objectid; - key.offset = 1; + key.offset = 0; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, &root_item); @@ -267,7 +440,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, goto out_dput; if (!IS_POSIXACL(parent->dentry->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = mnt_want_write(parent->mnt); if (error) @@ -437,10 +610,6 @@ out_unlock: return 0; } -/* - * Called inside transaction, so use GFP_NOFS - */ - static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) { u64 new_size; @@ -461,15 +630,9 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); - - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; namelen = strlen(vol_args->name); @@ -483,11 +646,13 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) *devstr = '\0'; devstr = vol_args->name; devid = simple_strtoull(devstr, &end, 10); - printk(KERN_INFO "resizing devid %llu\n", devid); + printk(KERN_INFO "resizing devid %llu\n", + (unsigned long long)devid); } device = btrfs_find_device(root, devid, NULL, NULL); if (!device) { - printk(KERN_INFO "resizer unable to find device %llu\n", devid); + printk(KERN_INFO "resizer unable to find device %llu\n", + (unsigned long long)devid); ret = -EINVAL; goto out_unlock; } @@ -545,7 +710,6 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) out_unlock: mutex_unlock(&root->fs_info->volume_mutex); -out: kfree(vol_args); return ret; } @@ -565,15 +729,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); - - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; namelen = strlen(vol_args->name); @@ -675,19 +833,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_init_new_device(root, vol_args->name); -out: kfree(vol_args); return ret; } @@ -703,19 +855,13 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); - - if (!vol_args) - return -ENOMEM; + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_rm_device(root, vol_args->name); -out: kfree(vol_args); return ret; } @@ -830,7 +976,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, BUG_ON(!trans); /* punch hole in destination first */ - btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte); + btrfs_drop_extents(trans, root, inode, off, off + len, + off + len, 0, &hint_byte); /* clone data */ key.objectid = src->i_ino; @@ -881,7 +1028,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, struct btrfs_file_extent_item); comp = btrfs_file_extent_compression(leaf, extent); type = btrfs_file_extent_type(leaf, extent); - if (type == BTRFS_FILE_EXTENT_REG) { + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { disko = btrfs_file_extent_disk_bytenr(leaf, extent); diskl = btrfs_file_extent_disk_num_bytes(leaf, @@ -904,7 +1052,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, new_key.objectid = inode->i_ino; new_key.offset = key.offset + destoff - off; - if (type == BTRFS_FILE_EXTENT_REG) { + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); if (ret) @@ -937,10 +1086,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (disko) { inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, root, - disko, diskl, leaf->start, - root->root_key.objectid, - trans->transid, - inode->i_ino); + disko, diskl, 0, + root->root_key.objectid, + inode->i_ino, + new_key.offset - datao); BUG_ON(ret); } } else if (type == BTRFS_FILE_EXTENT_INLINE) { @@ -1100,6 +1249,12 @@ long btrfs_ioctl(struct file *file, unsigned int void __user *argp = (void __user *)arg; switch (cmd) { + case FS_IOC_GETFLAGS: + return btrfs_ioctl_getflags(file, argp); + case FS_IOC_SETFLAGS: + return btrfs_ioctl_setflags(file, argp); + case FS_IOC_GETVERSION: + return btrfs_ioctl_getversion(file, argp); case BTRFS_IOC_SNAP_CREATE: return btrfs_ioctl_snap_create(file, argp, 0); case BTRFS_IOC_SUBVOL_CREATE: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 47b0a88c12a2..1c36e5cd8f55 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -60,8 +60,8 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb) /* * unfortunately, many of the places that currently set a lock to blocking - * don't end up blocking for every long, and often they don't block - * at all. For a dbench 50 run, if we don't spin one the blocking bit + * don't end up blocking for very long, and often they don't block + * at all. For a dbench 50 run, if we don't spin on the blocking bit * at all, the context switch rate can jump up to 400,000/sec or more. * * So, we're still stuck with this crummy spin on the blocking bit, @@ -71,12 +71,13 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb) static int btrfs_spin_on_block(struct extent_buffer *eb) { int i; + for (i = 0; i < 512; i++) { - cpu_relax(); if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) return 1; if (need_resched()) break; + cpu_relax(); } return 0; } @@ -95,13 +96,15 @@ int btrfs_try_spin_lock(struct extent_buffer *eb) { int i; - spin_nested(eb); - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 1; - spin_unlock(&eb->lock); - + if (btrfs_spin_on_block(eb)) { + spin_nested(eb); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + spin_unlock(&eb->lock); + } /* spin for a bit on the BLOCKING flag */ for (i = 0; i < 2; i++) { + cpu_relax(); if (!btrfs_spin_on_block(eb)) break; @@ -148,6 +151,9 @@ int btrfs_tree_lock(struct extent_buffer *eb) DEFINE_WAIT(wait); wait.func = btrfs_wake_function; + if (!btrfs_spin_on_block(eb)) + goto sleep; + while(1) { spin_nested(eb); @@ -165,9 +171,10 @@ int btrfs_tree_lock(struct extent_buffer *eb) * spin for a bit, and if the blocking flag goes away, * loop around */ + cpu_relax(); if (btrfs_spin_on_block(eb)) continue; - +sleep: prepare_to_wait_exclusive(&eb->lock_wq, &wait, TASK_UNINTERRUPTIBLE); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 77c2411a5f0f..d6f0806c682f 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode, spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); + + /* + * we have no more ordered extents for this inode and + * no dirty pages. We can safely remove it from the + * list of ordered extents + */ + if (RB_EMPTY_ROOT(&tree->tree) && + !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { + list_del_init(&BTRFS_I(inode)->ordered_operations); + } spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); mutex_unlock(&tree->mutex); @@ -370,6 +380,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) } /* + * this is used during transaction commit to write all the inodes + * added to the ordered operation list. These files must be fully on + * disk before the transaction commits. + * + * we have two modes here, one is to just start the IO via filemap_flush + * and the other is to wait for all the io. When we wait, we have an + * extra check to make sure the ordered operation list really is empty + * before we return + */ +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) +{ + struct btrfs_inode *btrfs_inode; + struct inode *inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + mutex_lock(&root->fs_info->ordered_operations_mutex); + spin_lock(&root->fs_info->ordered_extent_lock); +again: + list_splice_init(&root->fs_info->ordered_operations, &splice); + + while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, + ordered_operations); + + inode = &btrfs_inode->vfs_inode; + + list_del_init(&btrfs_inode->ordered_operations); + + /* + * the inode may be getting freed (in sys_unlink path). + */ + inode = igrab(inode); + + if (!wait && inode) { + list_add_tail(&BTRFS_I(inode)->ordered_operations, + &root->fs_info->ordered_operations); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + + if (inode) { + if (wait) + btrfs_wait_ordered_range(inode, 0, (u64)-1); + else + filemap_flush(inode->i_mapping); + iput(inode); + } + + cond_resched(); + spin_lock(&root->fs_info->ordered_extent_lock); + } + if (wait && !list_empty(&root->fs_info->ordered_operations)) + goto again; + + spin_unlock(&root->fs_info->ordered_extent_lock); + mutex_unlock(&root->fs_info->ordered_operations_mutex); + + return 0; +} + +/* * Used to start IO or wait for a given ordered extent to finish. * * If wait is one, this effectively waits on page writeback for all the pages @@ -417,7 +489,7 @@ again: /* start IO across the range first to instantiate any delalloc * extents */ - btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE); + btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); /* The compression code will leave pages locked but return from * writepage without setting the page writeback. Starting again @@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping, return ret; } + +/* + * add a given inode to the list of inodes that must be fully on + * disk before a transaction commit finishes. + * + * This basically gives us the ext3 style data=ordered mode, and it is mostly + * used to make sure renamed files are fully on disk. + * + * It is a noop if the inode is already fully on disk. + * + * If trans is not null, we'll do a friendly check for a transaction that + * is already flushing things and force the IO down ourselves. + */ +int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + u64 last_mod; + + last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); + + /* + * if this file hasn't been changed since the last transaction + * commit, we can safely return without doing anything + */ + if (last_mod < root->fs_info->last_trans_committed) + return 0; + + /* + * the transaction is already committing. Just start the IO and + * don't bother with all of this list nonsense + */ + if (trans && root->fs_info->running_transaction->blocked) { + btrfs_wait_ordered_range(inode, 0, (u64)-1); + return 0; + } + + spin_lock(&root->fs_info->ordered_extent_lock); + if (list_empty(&BTRFS_I(inode)->ordered_operations)) { + list_add_tail(&BTRFS_I(inode)->ordered_operations, + &root->fs_info->ordered_operations); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + + return 0; +} diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index ab66d5e8d6d6..3d31c8827b01 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping, int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode); int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); +int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode); #endif diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 5f8f218c1005..6d6523da0a30 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -45,22 +45,132 @@ static void print_dev_item(struct extent_buffer *eb, (unsigned long long)btrfs_device_total_bytes(eb, dev_item), (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); } +static void print_extent_data_ref(struct extent_buffer *eb, + struct btrfs_extent_data_ref *ref) +{ + printk(KERN_INFO "\t\textent data backref root %llu " + "objectid %llu offset %llu count %u\n", + (unsigned long long)btrfs_extent_data_ref_root(eb, ref), + (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref), + (unsigned long long)btrfs_extent_data_ref_offset(eb, ref), + btrfs_extent_data_ref_count(eb, ref)); +} + +static void print_extent_item(struct extent_buffer *eb, int slot) +{ + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct btrfs_disk_key key; + unsigned long end; + unsigned long ptr; + int type; + u32 item_size = btrfs_item_size_nr(eb, slot); + u64 flags; + u64 offset; + + if (item_size < sizeof(*ei)) { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + struct btrfs_extent_item_v0 *ei0; + BUG_ON(item_size != sizeof(*ei0)); + ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0); + printk(KERN_INFO "\t\textent refs %u\n", + btrfs_extent_refs_v0(eb, ei0)); + return; +#else + BUG(); +#endif + } + + ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(eb, ei); + + printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n", + (unsigned long long)btrfs_extent_refs(eb, ei), + (unsigned long long)btrfs_extent_generation(eb, ei), + (unsigned long long)flags); + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + info = (struct btrfs_tree_block_info *)(ei + 1); + btrfs_tree_block_key(eb, info, &key); + printk(KERN_INFO "\t\ttree block key (%llu %x %llu) " + "level %d\n", + (unsigned long long)btrfs_disk_key_objectid(&key), + key.type, + (unsigned long long)btrfs_disk_key_offset(&key), + btrfs_tree_block_level(eb, info)); + iref = (struct btrfs_extent_inline_ref *)(info + 1); + } else { + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + while (ptr < end) { + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(eb, iref); + offset = btrfs_extent_inline_ref_offset(eb, iref); + switch (type) { + case BTRFS_TREE_BLOCK_REF_KEY: + printk(KERN_INFO "\t\ttree block backref " + "root %llu\n", (unsigned long long)offset); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + printk(KERN_INFO "\t\tshared block backref " + "parent %llu\n", (unsigned long long)offset); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + print_extent_data_ref(eb, dref); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = (struct btrfs_shared_data_ref *)(iref + 1); + printk(KERN_INFO "\t\tshared data backref " + "parent %llu count %u\n", + (unsigned long long)offset, + btrfs_shared_data_ref_count(eb, sref)); + break; + default: + BUG(); + } + ptr += btrfs_extent_inline_ref_size(type); + } + WARN_ON(ptr > end); +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static void print_extent_ref_v0(struct extent_buffer *eb, int slot) +{ + struct btrfs_extent_ref_v0 *ref0; + + ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0); + printk("\t\textent back ref root %llu gen %llu " + "owner %llu num_refs %lu\n", + (unsigned long long)btrfs_ref_root_v0(eb, ref0), + (unsigned long long)btrfs_ref_generation_v0(eb, ref0), + (unsigned long long)btrfs_ref_objectid_v0(eb, ref0), + (unsigned long)btrfs_ref_count_v0(eb, ref0)); +} +#endif + void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) { int i; + u32 type; u32 nr = btrfs_header_nritems(l); struct btrfs_item *item; - struct btrfs_extent_item *ei; struct btrfs_root_item *ri; struct btrfs_dir_item *di; struct btrfs_inode_item *ii; struct btrfs_block_group_item *bi; struct btrfs_file_extent_item *fi; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct btrfs_dev_extent *dev_extent; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_extent_ref *ref; - struct btrfs_dev_extent *dev_extent; - u32 type; printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", (unsigned long long)btrfs_header_bytenr(l), nr, @@ -100,20 +210,25 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_disk_root_refs(l, ri)); break; case BTRFS_EXTENT_ITEM_KEY: - ei = btrfs_item_ptr(l, i, struct btrfs_extent_item); - printk(KERN_INFO "\t\textent data refs %u\n", - btrfs_extent_refs(l, ei)); - break; - case BTRFS_EXTENT_REF_KEY: - ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref); - printk(KERN_INFO "\t\textent back ref root %llu " - "gen %llu owner %llu num_refs %lu\n", - (unsigned long long)btrfs_ref_root(l, ref), - (unsigned long long)btrfs_ref_generation(l, ref), - (unsigned long long)btrfs_ref_objectid(l, ref), - (unsigned long)btrfs_ref_num_refs(l, ref)); + print_extent_item(l, i); + break; + case BTRFS_TREE_BLOCK_REF_KEY: + printk(KERN_INFO "\t\ttree block backref\n"); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + printk(KERN_INFO "\t\tshared block backref\n"); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = btrfs_item_ptr(l, i, + struct btrfs_extent_data_ref); + print_extent_data_ref(l, dref); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = btrfs_item_ptr(l, i, + struct btrfs_shared_data_ref); + printk(KERN_INFO "\t\tshared data backref count %u\n", + btrfs_shared_data_ref_count(l, sref)); break; - case BTRFS_EXTENT_DATA_KEY: fi = btrfs_item_ptr(l, i, struct btrfs_file_extent_item); @@ -139,6 +254,12 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) (unsigned long long) btrfs_file_extent_ram_bytes(l, fi)); break; + case BTRFS_EXTENT_REF_V0_KEY: +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + print_extent_ref_v0(l, i); +#else + BUG(); +#endif case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c new file mode 100644 index 000000000000..008397934778 --- /dev/null +++ b/fs/btrfs/relocation.c @@ -0,0 +1,3708 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/rbtree.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "volumes.h" +#include "locking.h" +#include "btrfs_inode.h" +#include "async-thread.h" + +/* + * backref_node, mapping_node and tree_block start with this + */ +struct tree_entry { + struct rb_node rb_node; + u64 bytenr; +}; + +/* + * present a tree block in the backref cache + */ +struct backref_node { + struct rb_node rb_node; + u64 bytenr; + /* objectid tree block owner */ + u64 owner; + /* list of upper level blocks reference this block */ + struct list_head upper; + /* list of child blocks in the cache */ + struct list_head lower; + /* NULL if this node is not tree root */ + struct btrfs_root *root; + /* extent buffer got by COW the block */ + struct extent_buffer *eb; + /* level of tree block */ + unsigned int level:8; + /* 1 if the block is root of old snapshot */ + unsigned int old_root:1; + /* 1 if no child blocks in the cache */ + unsigned int lowest:1; + /* is the extent buffer locked */ + unsigned int locked:1; + /* has the block been processed */ + unsigned int processed:1; + /* have backrefs of this block been checked */ + unsigned int checked:1; +}; + +/* + * present a block pointer in the backref cache + */ +struct backref_edge { + struct list_head list[2]; + struct backref_node *node[2]; + u64 blockptr; +}; + +#define LOWER 0 +#define UPPER 1 + +struct backref_cache { + /* red black tree of all backref nodes in the cache */ + struct rb_root rb_root; + /* list of backref nodes with no child block in the cache */ + struct list_head pending[BTRFS_MAX_LEVEL]; + spinlock_t lock; +}; + +/* + * map address of tree root to tree + */ +struct mapping_node { + struct rb_node rb_node; + u64 bytenr; + void *data; +}; + +struct mapping_tree { + struct rb_root rb_root; + spinlock_t lock; +}; + +/* + * present a tree block to process + */ +struct tree_block { + struct rb_node rb_node; + u64 bytenr; + struct btrfs_key key; + unsigned int level:8; + unsigned int key_ready:1; +}; + +/* inode vector */ +#define INODEVEC_SIZE 16 + +struct inodevec { + struct list_head list; + struct inode *inode[INODEVEC_SIZE]; + int nr; +}; + +struct reloc_control { + /* block group to relocate */ + struct btrfs_block_group_cache *block_group; + /* extent tree */ + struct btrfs_root *extent_root; + /* inode for moving data */ + struct inode *data_inode; + struct btrfs_workers workers; + /* tree blocks have been processed */ + struct extent_io_tree processed_blocks; + /* map start of tree root to corresponding reloc tree */ + struct mapping_tree reloc_root_tree; + /* list of reloc trees */ + struct list_head reloc_roots; + u64 search_start; + u64 extents_found; + u64 extents_skipped; + int stage; + int create_reloc_root; + unsigned int found_file_extent:1; + unsigned int found_old_snapshot:1; +}; + +/* stages of data relocation */ +#define MOVE_DATA_EXTENTS 0 +#define UPDATE_DATA_PTRS 1 + +/* + * merge reloc tree to corresponding fs tree in worker threads + */ +struct async_merge { + struct btrfs_work work; + struct reloc_control *rc; + struct btrfs_root *root; + struct completion *done; + atomic_t *num_pending; +}; + +static void mapping_tree_init(struct mapping_tree *tree) +{ + tree->rb_root.rb_node = NULL; + spin_lock_init(&tree->lock); +} + +static void backref_cache_init(struct backref_cache *cache) +{ + int i; + cache->rb_root.rb_node = NULL; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + INIT_LIST_HEAD(&cache->pending[i]); + spin_lock_init(&cache->lock); +} + +static void backref_node_init(struct backref_node *node) +{ + memset(node, 0, sizeof(*node)); + INIT_LIST_HEAD(&node->upper); + INIT_LIST_HEAD(&node->lower); + RB_CLEAR_NODE(&node->rb_node); +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct tree_entry *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return parent; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) +{ + struct rb_node *n = root->rb_node; + struct tree_entry *entry; + + while (n) { + entry = rb_entry(n, struct tree_entry, rb_node); + + if (bytenr < entry->bytenr) + n = n->rb_left; + else if (bytenr > entry->bytenr) + n = n->rb_right; + else + return n; + } + return NULL; +} + +/* + * walk up backref nodes until reach node presents tree root + */ +static struct backref_node *walk_up_backref(struct backref_node *node, + struct backref_edge *edges[], + int *index) +{ + struct backref_edge *edge; + int idx = *index; + + while (!list_empty(&node->upper)) { + edge = list_entry(node->upper.next, + struct backref_edge, list[LOWER]); + edges[idx++] = edge; + node = edge->node[UPPER]; + } + *index = idx; + return node; +} + +/* + * walk down backref nodes to find start of next reference path + */ +static struct backref_node *walk_down_backref(struct backref_edge *edges[], + int *index) +{ + struct backref_edge *edge; + struct backref_node *lower; + int idx = *index; + + while (idx > 0) { + edge = edges[idx - 1]; + lower = edge->node[LOWER]; + if (list_is_last(&edge->list[LOWER], &lower->upper)) { + idx--; + continue; + } + edge = list_entry(edge->list[LOWER].next, + struct backref_edge, list[LOWER]); + edges[idx - 1] = edge; + *index = idx; + return edge->node[UPPER]; + } + *index = 0; + return NULL; +} + +static void drop_node_buffer(struct backref_node *node) +{ + if (node->eb) { + if (node->locked) { + btrfs_tree_unlock(node->eb); + node->locked = 0; + } + free_extent_buffer(node->eb); + node->eb = NULL; + } +} + +static void drop_backref_node(struct backref_cache *tree, + struct backref_node *node) +{ + BUG_ON(!node->lowest); + BUG_ON(!list_empty(&node->upper)); + + drop_node_buffer(node); + list_del(&node->lower); + + rb_erase(&node->rb_node, &tree->rb_root); + kfree(node); +} + +/* + * remove a backref node from the backref cache + */ +static void remove_backref_node(struct backref_cache *cache, + struct backref_node *node) +{ + struct backref_node *upper; + struct backref_edge *edge; + + if (!node) + return; + + BUG_ON(!node->lowest); + while (!list_empty(&node->upper)) { + edge = list_entry(node->upper.next, struct backref_edge, + list[LOWER]); + upper = edge->node[UPPER]; + list_del(&edge->list[LOWER]); + list_del(&edge->list[UPPER]); + kfree(edge); + /* + * add the node to pending list if no other + * child block cached. + */ + if (list_empty(&upper->lower)) { + list_add_tail(&upper->lower, + &cache->pending[upper->level]); + upper->lowest = 1; + } + } + drop_backref_node(cache, node); +} + +/* + * find reloc tree by address of tree root + */ +static struct btrfs_root *find_reloc_root(struct reloc_control *rc, + u64 bytenr) +{ + struct rb_node *rb_node; + struct mapping_node *node; + struct btrfs_root *root = NULL; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + root = (struct btrfs_root *)node->data; + } + spin_unlock(&rc->reloc_root_tree.lock); + return root; +} + +static int is_cowonly_root(u64 root_objectid) +{ + if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || + root_objectid == BTRFS_EXTENT_TREE_OBJECTID || + root_objectid == BTRFS_CHUNK_TREE_OBJECTID || + root_objectid == BTRFS_DEV_TREE_OBJECTID || + root_objectid == BTRFS_TREE_LOG_OBJECTID || + root_objectid == BTRFS_CSUM_TREE_OBJECTID) + return 1; + return 0; +} + +static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, + u64 root_objectid) +{ + struct btrfs_key key; + + key.objectid = root_objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + if (is_cowonly_root(root_objectid)) + key.offset = 0; + else + key.offset = (u64)-1; + + return btrfs_read_fs_root_no_name(fs_info, &key); +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static noinline_for_stack +struct btrfs_root *find_tree_root(struct reloc_control *rc, + struct extent_buffer *leaf, + struct btrfs_extent_ref_v0 *ref0) +{ + struct btrfs_root *root; + u64 root_objectid = btrfs_ref_root_v0(leaf, ref0); + u64 generation = btrfs_ref_generation_v0(leaf, ref0); + + BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID); + + root = read_fs_root(rc->extent_root->fs_info, root_objectid); + BUG_ON(IS_ERR(root)); + + if (root->ref_cows && + generation != btrfs_root_generation(&root->root_item)) + return NULL; + + return root; +} +#endif + +static noinline_for_stack +int find_inline_backref(struct extent_buffer *leaf, int slot, + unsigned long *ptr, unsigned long *end) +{ + struct btrfs_extent_item *ei; + struct btrfs_tree_block_info *bi; + u32 item_size; + + item_size = btrfs_item_size_nr(leaf, slot); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + return 1; + } +#endif + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + WARN_ON(!(btrfs_extent_flags(leaf, ei) & + BTRFS_EXTENT_FLAG_TREE_BLOCK)); + + if (item_size <= sizeof(*ei) + sizeof(*bi)) { + WARN_ON(item_size < sizeof(*ei) + sizeof(*bi)); + return 1; + } + + bi = (struct btrfs_tree_block_info *)(ei + 1); + *ptr = (unsigned long)(bi + 1); + *end = (unsigned long)ei + item_size; + return 0; +} + +/* + * build backref tree for a given tree block. root of the backref tree + * corresponds the tree block, leaves of the backref tree correspond + * roots of b-trees that reference the tree block. + * + * the basic idea of this function is check backrefs of a given block + * to find upper level blocks that refernece the block, and then check + * bakcrefs of these upper level blocks recursively. the recursion stop + * when tree root is reached or backrefs for the block is cached. + * + * NOTE: if we find backrefs for a block are cached, we know backrefs + * for all upper level blocks that directly/indirectly reference the + * block are also cached. + */ +static struct backref_node *build_backref_tree(struct reloc_control *rc, + struct backref_cache *cache, + struct btrfs_key *node_key, + int level, u64 bytenr) +{ + struct btrfs_path *path1; + struct btrfs_path *path2; + struct extent_buffer *eb; + struct btrfs_root *root; + struct backref_node *cur; + struct backref_node *upper; + struct backref_node *lower; + struct backref_node *node = NULL; + struct backref_node *exist = NULL; + struct backref_edge *edge; + struct rb_node *rb_node; + struct btrfs_key key; + unsigned long end; + unsigned long ptr; + LIST_HEAD(list); + int ret; + int err = 0; + + path1 = btrfs_alloc_path(); + path2 = btrfs_alloc_path(); + if (!path1 || !path2) { + err = -ENOMEM; + goto out; + } + + node = kmalloc(sizeof(*node), GFP_NOFS); + if (!node) { + err = -ENOMEM; + goto out; + } + + backref_node_init(node); + node->bytenr = bytenr; + node->owner = 0; + node->level = level; + node->lowest = 1; + cur = node; +again: + end = 0; + ptr = 0; + key.objectid = cur->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; + + path1->search_commit_root = 1; + path1->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1, + 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + BUG_ON(!ret || !path1->slots[0]); + + path1->slots[0]--; + + WARN_ON(cur->checked); + if (!list_empty(&cur->upper)) { + /* + * the backref was added previously when processsing + * backref of type BTRFS_TREE_BLOCK_REF_KEY + */ + BUG_ON(!list_is_singular(&cur->upper)); + edge = list_entry(cur->upper.next, struct backref_edge, + list[LOWER]); + BUG_ON(!list_empty(&edge->list[UPPER])); + exist = edge->node[UPPER]; + /* + * add the upper level block to pending list if we need + * check its backrefs + */ + if (!exist->checked) + list_add_tail(&edge->list[UPPER], &list); + } else { + exist = NULL; + } + + while (1) { + cond_resched(); + eb = path1->nodes[0]; + + if (ptr >= end) { + if (path1->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(rc->extent_root, path1); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + break; + eb = path1->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, &key, path1->slots[0]); + if (key.objectid != cur->bytenr) { + WARN_ON(exist); + break; + } + + if (key.type == BTRFS_EXTENT_ITEM_KEY) { + ret = find_inline_backref(eb, path1->slots[0], + &ptr, &end); + if (ret) + goto next; + } + } + + if (ptr < end) { + /* update key for inline back ref */ + struct btrfs_extent_inline_ref *iref; + iref = (struct btrfs_extent_inline_ref *)ptr; + key.type = btrfs_extent_inline_ref_type(eb, iref); + key.offset = btrfs_extent_inline_ref_offset(eb, iref); + WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY && + key.type != BTRFS_SHARED_BLOCK_REF_KEY); + } + + if (exist && + ((key.type == BTRFS_TREE_BLOCK_REF_KEY && + exist->owner == key.offset) || + (key.type == BTRFS_SHARED_BLOCK_REF_KEY && + exist->bytenr == key.offset))) { + exist = NULL; + goto next; + } + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || + key.type == BTRFS_EXTENT_REF_V0_KEY) { + if (key.objectid == key.offset && + key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(eb, path1->slots[0], + struct btrfs_extent_ref_v0); + root = find_tree_root(rc, eb, ref0); + if (root) + cur->root = root; + else + cur->old_root = 1; + break; + } +#else + BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); + if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { +#endif + if (key.objectid == key.offset) { + /* + * only root blocks of reloc trees use + * backref of this type. + */ + root = find_reloc_root(rc, cur->bytenr); + BUG_ON(!root); + cur->root = root; + break; + } + + edge = kzalloc(sizeof(*edge), GFP_NOFS); + if (!edge) { + err = -ENOMEM; + goto out; + } + rb_node = tree_search(&cache->rb_root, key.offset); + if (!rb_node) { + upper = kmalloc(sizeof(*upper), GFP_NOFS); + if (!upper) { + kfree(edge); + err = -ENOMEM; + goto out; + } + backref_node_init(upper); + upper->bytenr = key.offset; + upper->owner = 0; + upper->level = cur->level + 1; + /* + * backrefs for the upper level block isn't + * cached, add the block to pending list + */ + list_add_tail(&edge->list[UPPER], &list); + } else { + upper = rb_entry(rb_node, struct backref_node, + rb_node); + INIT_LIST_HEAD(&edge->list[UPPER]); + } + list_add(&edge->list[LOWER], &cur->upper); + edge->node[UPPER] = upper; + edge->node[LOWER] = cur; + + goto next; + } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { + goto next; + } + + /* key.type == BTRFS_TREE_BLOCK_REF_KEY */ + root = read_fs_root(rc->extent_root->fs_info, key.offset); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out; + } + + if (btrfs_root_level(&root->root_item) == cur->level) { + /* tree root */ + BUG_ON(btrfs_root_bytenr(&root->root_item) != + cur->bytenr); + cur->root = root; + break; + } + + level = cur->level + 1; + + /* + * searching the tree to find upper level blocks + * reference the block. + */ + path2->search_commit_root = 1; + path2->skip_locking = 1; + path2->lowest_level = level; + ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0); + path2->lowest_level = 0; + if (ret < 0) { + err = ret; + goto out; + } + + eb = path2->nodes[level]; + WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != + cur->bytenr); + + lower = cur; + for (; level < BTRFS_MAX_LEVEL; level++) { + if (!path2->nodes[level]) { + BUG_ON(btrfs_root_bytenr(&root->root_item) != + lower->bytenr); + lower->root = root; + break; + } + + edge = kzalloc(sizeof(*edge), GFP_NOFS); + if (!edge) { + err = -ENOMEM; + goto out; + } + + eb = path2->nodes[level]; + rb_node = tree_search(&cache->rb_root, eb->start); + if (!rb_node) { + upper = kmalloc(sizeof(*upper), GFP_NOFS); + if (!upper) { + kfree(edge); + err = -ENOMEM; + goto out; + } + backref_node_init(upper); + upper->bytenr = eb->start; + upper->owner = btrfs_header_owner(eb); + upper->level = lower->level + 1; + + /* + * if we know the block isn't shared + * we can void checking its backrefs. + */ + if (btrfs_block_can_be_shared(root, eb)) + upper->checked = 0; + else + upper->checked = 1; + + /* + * add the block to pending list if we + * need check its backrefs. only block + * at 'cur->level + 1' is added to the + * tail of pending list. this guarantees + * we check backrefs from lower level + * blocks to upper level blocks. + */ + if (!upper->checked && + level == cur->level + 1) { + list_add_tail(&edge->list[UPPER], + &list); + } else + INIT_LIST_HEAD(&edge->list[UPPER]); + } else { + upper = rb_entry(rb_node, struct backref_node, + rb_node); + BUG_ON(!upper->checked); + INIT_LIST_HEAD(&edge->list[UPPER]); + } + list_add_tail(&edge->list[LOWER], &lower->upper); + edge->node[UPPER] = upper; + edge->node[LOWER] = lower; + + if (rb_node) + break; + lower = upper; + upper = NULL; + } + btrfs_release_path(root, path2); +next: + if (ptr < end) { + ptr += btrfs_extent_inline_ref_size(key.type); + if (ptr >= end) { + WARN_ON(ptr > end); + ptr = 0; + end = 0; + } + } + if (ptr >= end) + path1->slots[0]++; + } + btrfs_release_path(rc->extent_root, path1); + + cur->checked = 1; + WARN_ON(exist); + + /* the pending list isn't empty, take the first block to process */ + if (!list_empty(&list)) { + edge = list_entry(list.next, struct backref_edge, list[UPPER]); + list_del_init(&edge->list[UPPER]); + cur = edge->node[UPPER]; + goto again; + } + + /* + * everything goes well, connect backref nodes and insert backref nodes + * into the cache. + */ + BUG_ON(!node->checked); + rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); + BUG_ON(rb_node); + + list_for_each_entry(edge, &node->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &list); + + while (!list_empty(&list)) { + edge = list_entry(list.next, struct backref_edge, list[UPPER]); + list_del_init(&edge->list[UPPER]); + upper = edge->node[UPPER]; + + if (!RB_EMPTY_NODE(&upper->rb_node)) { + if (upper->lowest) { + list_del_init(&upper->lower); + upper->lowest = 0; + } + + list_add_tail(&edge->list[UPPER], &upper->lower); + continue; + } + + BUG_ON(!upper->checked); + rb_node = tree_insert(&cache->rb_root, upper->bytenr, + &upper->rb_node); + BUG_ON(rb_node); + + list_add_tail(&edge->list[UPPER], &upper->lower); + + list_for_each_entry(edge, &upper->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &list); + } +out: + btrfs_free_path(path1); + btrfs_free_path(path2); + if (err) { + INIT_LIST_HEAD(&list); + upper = node; + while (upper) { + if (RB_EMPTY_NODE(&upper->rb_node)) { + list_splice_tail(&upper->upper, &list); + kfree(upper); + } + + if (list_empty(&list)) + break; + + edge = list_entry(list.next, struct backref_edge, + list[LOWER]); + upper = edge->node[UPPER]; + kfree(edge); + } + return ERR_PTR(err); + } + return node; +} + +/* + * helper to add 'address of tree root -> reloc tree' mapping + */ +static int __add_reloc_root(struct btrfs_root *root) +{ + struct rb_node *rb_node; + struct mapping_node *node; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + node = kmalloc(sizeof(*node), GFP_NOFS); + BUG_ON(!node); + + node->bytenr = root->node->start; + node->data = root; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + BUG_ON(rb_node); + + list_add_tail(&root->root_list, &rc->reloc_roots); + return 0; +} + +/* + * helper to update/delete the 'address of tree root -> reloc tree' + * mapping + */ +static int __update_reloc_root(struct btrfs_root *root, int del) +{ + struct rb_node *rb_node; + struct mapping_node *node = NULL; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, + root->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); + } + spin_unlock(&rc->reloc_root_tree.lock); + + BUG_ON((struct btrfs_root *)node->data != root); + + if (!del) { + spin_lock(&rc->reloc_root_tree.lock); + node->bytenr = root->node->start; + rb_node = tree_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + BUG_ON(rb_node); + } else { + list_del_init(&root->root_list); + kfree(node); + } + return 0; +} + +/* + * create reloc tree for a given fs tree. reloc tree is just a + * snapshot of the fs tree with special root objectid. + */ +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct extent_buffer *eb; + struct btrfs_root_item *root_item; + struct btrfs_key root_key; + int ret; + + if (root->reloc_root) { + reloc_root = root->reloc_root; + reloc_root->last_trans = trans->transid; + return 0; + } + + if (!root->fs_info->reloc_ctl || + !root->fs_info->reloc_ctl->create_reloc_root || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + return 0; + + root_item = kmalloc(sizeof(*root_item), GFP_NOFS); + BUG_ON(!root_item); + + root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = root->root_key.objectid; + + ret = btrfs_copy_root(trans, root, root->commit_root, &eb, + BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + + btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1); + memcpy(root_item, &root->root_item, sizeof(*root_item)); + btrfs_set_root_refs(root_item, 1); + btrfs_set_root_bytenr(root_item, eb->start); + btrfs_set_root_level(root_item, btrfs_header_level(eb)); + btrfs_set_root_generation(root_item, trans->transid); + memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); + root_item->drop_level = 0; + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + ret = btrfs_insert_root(trans, root->fs_info->tree_root, + &root_key, root_item); + BUG_ON(ret); + kfree(root_item); + + reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, + &root_key); + BUG_ON(IS_ERR(reloc_root)); + reloc_root->last_trans = trans->transid; + + __add_reloc_root(reloc_root); + root->reloc_root = reloc_root; + return 0; +} + +/* + * update root item of reloc tree + */ +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct btrfs_root_item *root_item; + int del = 0; + int ret; + + if (!root->reloc_root) + return 0; + + reloc_root = root->reloc_root; + root_item = &reloc_root->root_item; + + if (btrfs_root_refs(root_item) == 0) { + root->reloc_root = NULL; + del = 1; + } + + __update_reloc_root(reloc_root, del); + + if (reloc_root->commit_root != reloc_root->node) { + btrfs_set_root_node(root_item, reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + reloc_root->commit_root = btrfs_root_node(reloc_root); + } + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &reloc_root->root_key, root_item); + BUG_ON(ret); + return 0; +} + +/* + * helper to find first cached inode with inode number >= objectid + * in a subvolume + */ +static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) +{ + struct rb_node *node; + struct rb_node *prev; + struct btrfs_inode *entry; + struct inode *inode; + + spin_lock(&root->inode_lock); +again: + node = root->inode_tree.rb_node; + prev = NULL; + while (node) { + prev = node; + entry = rb_entry(node, struct btrfs_inode, rb_node); + + if (objectid < entry->vfs_inode.i_ino) + node = node->rb_left; + else if (objectid > entry->vfs_inode.i_ino) + node = node->rb_right; + else + break; + } + if (!node) { + while (prev) { + entry = rb_entry(prev, struct btrfs_inode, rb_node); + if (objectid <= entry->vfs_inode.i_ino) { + node = prev; + break; + } + prev = rb_next(prev); + } + } + while (node) { + entry = rb_entry(node, struct btrfs_inode, rb_node); + inode = igrab(&entry->vfs_inode); + if (inode) { + spin_unlock(&root->inode_lock); + return inode; + } + + objectid = entry->vfs_inode.i_ino + 1; + if (cond_resched_lock(&root->inode_lock)) + goto again; + + node = rb_next(node); + } + spin_unlock(&root->inode_lock); + return NULL; +} + +static int in_block_group(u64 bytenr, + struct btrfs_block_group_cache *block_group) +{ + if (bytenr >= block_group->key.objectid && + bytenr < block_group->key.objectid + block_group->key.offset) + return 1; + return 0; +} + +/* + * get new location of data + */ +static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(reloc_inode)->root; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + bytenr -= BTRFS_I(reloc_inode)->index_cnt; + ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, + bytenr, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + BUG_ON(btrfs_file_extent_offset(leaf, fi) || + btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)); + + if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { + ret = 1; + goto out; + } + + if (new_bytenr) + *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +/* + * update file extent items in the tree leaf to point to + * the new locations. + */ +static int replace_file_extents(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *root, + struct extent_buffer *leaf, + struct list_head *inode_list) +{ + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + struct inode *inode = NULL; + struct inodevec *ivec = NULL; + u64 parent; + u64 bytenr; + u64 new_bytenr; + u64 num_bytes; + u64 end; + u32 nritems; + u32 i; + int ret; + int first = 1; + int dirty = 0; + + if (rc->stage != UPDATE_DATA_PTRS) + return 0; + + /* reloc trees always use full backref */ + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + parent = leaf->start; + else + parent = 0; + + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + cond_resched(); + btrfs_item_key_to_cpu(leaf, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + if (bytenr == 0) + continue; + if (!in_block_group(bytenr, rc->block_group)) + continue; + + /* + * if we are modifying block in fs tree, wait for readpage + * to complete and drop the extent cache + */ + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + if (!ivec || ivec->nr == INODEVEC_SIZE) { + ivec = kmalloc(sizeof(*ivec), GFP_NOFS); + BUG_ON(!ivec); + ivec->nr = 0; + list_add_tail(&ivec->list, inode_list); + } + if (first) { + inode = find_next_inode(root, key.objectid); + if (inode) + ivec->inode[ivec->nr++] = inode; + first = 0; + } else if (inode && inode->i_ino < key.objectid) { + inode = find_next_inode(root, key.objectid); + if (inode) + ivec->inode[ivec->nr++] = inode; + } + if (inode && inode->i_ino == key.objectid) { + end = key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + WARN_ON(!IS_ALIGNED(key.offset, + root->sectorsize)); + WARN_ON(!IS_ALIGNED(end, root->sectorsize)); + end--; + ret = try_lock_extent(&BTRFS_I(inode)->io_tree, + key.offset, end, + GFP_NOFS); + if (!ret) + continue; + + btrfs_drop_extent_cache(inode, key.offset, end, + 1); + unlock_extent(&BTRFS_I(inode)->io_tree, + key.offset, end, GFP_NOFS); + } + } + + ret = get_new_location(rc->data_inode, &new_bytenr, + bytenr, num_bytes); + if (ret > 0) + continue; + BUG_ON(ret < 0); + + btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); + dirty = 1; + + key.offset -= btrfs_file_extent_offset(leaf, fi); + ret = btrfs_inc_extent_ref(trans, root, new_bytenr, + num_bytes, parent, + btrfs_header_owner(leaf), + key.objectid, key.offset); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + parent, btrfs_header_owner(leaf), + key.objectid, key.offset); + BUG_ON(ret); + } + if (dirty) + btrfs_mark_buffer_dirty(leaf); + return 0; +} + +static noinline_for_stack +int memcmp_node_keys(struct extent_buffer *eb, int slot, + struct btrfs_path *path, int level) +{ + struct btrfs_disk_key key1; + struct btrfs_disk_key key2; + btrfs_node_key(eb, &key1, slot); + btrfs_node_key(path->nodes[level], &key2, path->slots[level]); + return memcmp(&key1, &key2, sizeof(key1)); +} + +/* + * try to replace tree blocks in fs tree with the new blocks + * in reloc tree. tree blocks haven't been modified since the + * reloc tree was create can be replaced. + * + * if a block was replaced, level of the block + 1 is returned. + * if no block got replaced, 0 is returned. if there are other + * errors, a negative error number is returned. + */ +static int replace_path(struct btrfs_trans_handle *trans, + struct btrfs_root *dest, struct btrfs_root *src, + struct btrfs_path *path, struct btrfs_key *next_key, + struct extent_buffer **leaf, + int lowest_level, int max_level) +{ + struct extent_buffer *eb; + struct extent_buffer *parent; + struct btrfs_key key; + u64 old_bytenr; + u64 new_bytenr; + u64 old_ptr_gen; + u64 new_ptr_gen; + u64 last_snapshot; + u32 blocksize; + int level; + int ret; + int slot; + + BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(lowest_level > 1 && leaf); + + last_snapshot = btrfs_root_last_snapshot(&src->root_item); + + slot = path->slots[lowest_level]; + btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); + + eb = btrfs_lock_root_node(dest); + btrfs_set_lock_blocking(eb); + level = btrfs_header_level(eb); + + if (level < lowest_level) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + return 0; + } + + ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); + BUG_ON(ret); + btrfs_set_lock_blocking(eb); + + if (next_key) { + next_key->objectid = (u64)-1; + next_key->type = (u8)-1; + next_key->offset = (u64)-1; + } + + parent = eb; + while (1) { + level = btrfs_header_level(parent); + BUG_ON(level < lowest_level); + + ret = btrfs_bin_search(parent, &key, level, &slot); + if (ret && slot > 0) + slot--; + + if (next_key && slot + 1 < btrfs_header_nritems(parent)) + btrfs_node_key_to_cpu(parent, next_key, slot + 1); + + old_bytenr = btrfs_node_blockptr(parent, slot); + blocksize = btrfs_level_size(dest, level - 1); + old_ptr_gen = btrfs_node_ptr_generation(parent, slot); + + if (level <= max_level) { + eb = path->nodes[level]; + new_bytenr = btrfs_node_blockptr(eb, + path->slots[level]); + new_ptr_gen = btrfs_node_ptr_generation(eb, + path->slots[level]); + } else { + new_bytenr = 0; + new_ptr_gen = 0; + } + + if (new_bytenr > 0 && new_bytenr == old_bytenr) { + WARN_ON(1); + ret = level; + break; + } + + if (new_bytenr == 0 || old_ptr_gen > last_snapshot || + memcmp_node_keys(parent, slot, path, level)) { + if (level <= lowest_level && !leaf) { + ret = 0; + break; + } + + eb = read_tree_block(dest, old_bytenr, blocksize, + old_ptr_gen); + btrfs_tree_lock(eb); + ret = btrfs_cow_block(trans, dest, eb, parent, + slot, &eb); + BUG_ON(ret); + btrfs_set_lock_blocking(eb); + + if (level <= lowest_level) { + *leaf = eb; + ret = 0; + break; + } + + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + + parent = eb; + continue; + } + + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + btrfs_release_path(src, path); + + path->lowest_level = level; + ret = btrfs_search_slot(trans, src, &key, path, 0, 1); + path->lowest_level = 0; + BUG_ON(ret); + + /* + * swap blocks in fs tree and reloc tree. + */ + btrfs_set_node_blockptr(parent, slot, new_bytenr); + btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); + btrfs_mark_buffer_dirty(parent); + + btrfs_set_node_blockptr(path->nodes[level], + path->slots[level], old_bytenr); + btrfs_set_node_ptr_generation(path->nodes[level], + path->slots[level], old_ptr_gen); + btrfs_mark_buffer_dirty(path->nodes[level]); + + ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, + path->nodes[level]->start, + src->root_key.objectid, level - 1, 0); + BUG_ON(ret); + ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, + 0, dest->root_key.objectid, level - 1, + 0); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, + path->nodes[level]->start, + src->root_key.objectid, level - 1, 0); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, + 0, dest->root_key.objectid, level - 1, + 0); + BUG_ON(ret); + + btrfs_unlock_up_safe(path, 0); + + ret = level; + break; + } + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + return ret; +} + +/* + * helper to find next relocated block in reloc tree + */ +static noinline_for_stack +int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, + int *level) +{ + struct extent_buffer *eb; + int i; + u64 last_snapshot; + u32 nritems; + + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + + for (i = 0; i < *level; i++) { + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + + for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { + eb = path->nodes[i]; + nritems = btrfs_header_nritems(eb); + while (path->slots[i] + 1 < nritems) { + path->slots[i]++; + if (btrfs_node_ptr_generation(eb, path->slots[i]) <= + last_snapshot) + continue; + + *level = i; + return 0; + } + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + return 1; +} + +/* + * walk down reloc tree to find relocated block of lowest level + */ +static noinline_for_stack +int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, + int *level) +{ + struct extent_buffer *eb = NULL; + int i; + u64 bytenr; + u64 ptr_gen = 0; + u64 last_snapshot; + u32 blocksize; + u32 nritems; + + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + + for (i = *level; i > 0; i--) { + eb = path->nodes[i]; + nritems = btrfs_header_nritems(eb); + while (path->slots[i] < nritems) { + ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]); + if (ptr_gen > last_snapshot) + break; + path->slots[i]++; + } + if (path->slots[i] >= nritems) { + if (i == *level) + break; + *level = i + 1; + return 0; + } + if (i == 1) { + *level = i; + return 0; + } + + bytenr = btrfs_node_blockptr(eb, path->slots[i]); + blocksize = btrfs_level_size(root, i - 1); + eb = read_tree_block(root, bytenr, blocksize, ptr_gen); + BUG_ON(btrfs_header_level(eb) != i - 1); + path->nodes[i - 1] = eb; + path->slots[i - 1] = 0; + } + return 1; +} + +/* + * invalidate extent cache for file extents whose key in range of + * [min_key, max_key) + */ +static int invalidate_extent_cache(struct btrfs_root *root, + struct btrfs_key *min_key, + struct btrfs_key *max_key) +{ + struct inode *inode = NULL; + u64 objectid; + u64 start, end; + + objectid = min_key->objectid; + while (1) { + cond_resched(); + iput(inode); + + if (objectid > max_key->objectid) + break; + + inode = find_next_inode(root, objectid); + if (!inode) + break; + + if (inode->i_ino > max_key->objectid) { + iput(inode); + break; + } + + objectid = inode->i_ino + 1; + if (!S_ISREG(inode->i_mode)) + continue; + + if (unlikely(min_key->objectid == inode->i_ino)) { + if (min_key->type > BTRFS_EXTENT_DATA_KEY) + continue; + if (min_key->type < BTRFS_EXTENT_DATA_KEY) + start = 0; + else { + start = min_key->offset; + WARN_ON(!IS_ALIGNED(start, root->sectorsize)); + } + } else { + start = 0; + } + + if (unlikely(max_key->objectid == inode->i_ino)) { + if (max_key->type < BTRFS_EXTENT_DATA_KEY) + continue; + if (max_key->type > BTRFS_EXTENT_DATA_KEY) { + end = (u64)-1; + } else { + if (max_key->offset == 0) + continue; + end = max_key->offset; + WARN_ON(!IS_ALIGNED(end, root->sectorsize)); + end--; + } + } else { + end = (u64)-1; + } + + /* the lock_extent waits for readpage to complete */ + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + btrfs_drop_extent_cache(inode, start, end, 1); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + } + return 0; +} + +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key) + +{ + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + break; + if (path->slots[level] + 1 < + btrfs_header_nritems(path->nodes[level])) { + btrfs_node_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + return 0; + } + level++; + } + return 1; +} + +/* + * merge the relocated tree blocks in reloc tree with corresponding + * fs tree. + */ +static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, + struct btrfs_root *root) +{ + LIST_HEAD(inode_list); + struct btrfs_key key; + struct btrfs_key next_key; + struct btrfs_trans_handle *trans; + struct btrfs_root *reloc_root; + struct btrfs_root_item *root_item; + struct btrfs_path *path; + struct extent_buffer *leaf = NULL; + unsigned long nr; + int level; + int max_level; + int replaced = 0; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + reloc_root = root->reloc_root; + root_item = &reloc_root->root_item; + + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { + level = btrfs_root_level(root_item); + extent_buffer_get(reloc_root->node); + path->nodes[level] = reloc_root->node; + path->slots[level] = 0; + } else { + btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + + level = root_item->drop_level; + BUG_ON(level == 0); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); + if (ret < 0) { + btrfs_free_path(path); + return ret; + } + + btrfs_node_key_to_cpu(path->nodes[level], &next_key, + path->slots[level]); + WARN_ON(memcmp(&key, &next_key, sizeof(key))); + + btrfs_unlock_up_safe(path, 0); + } + + if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { + trans = btrfs_start_transaction(root, 1); + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, 0); + btrfs_release_path(reloc_root, path); + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) { + err = ret; + goto out; + } + + leaf = path->nodes[0]; + btrfs_unlock_up_safe(path, 1); + ret = replace_file_extents(trans, rc, root, leaf, + &inode_list); + if (ret < 0) + err = ret; + goto out; + } + + memset(&next_key, 0, sizeof(next_key)); + + while (1) { + leaf = NULL; + replaced = 0; + trans = btrfs_start_transaction(root, 1); + max_level = level; + + ret = walk_down_reloc_tree(reloc_root, path, &level); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + break; + + if (!find_next_key(path, level, &key) && + btrfs_comp_cpu_keys(&next_key, &key) >= 0) { + ret = 0; + } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) { + ret = replace_path(trans, root, reloc_root, + path, &next_key, &leaf, + level, max_level); + } else { + ret = replace_path(trans, root, reloc_root, + path, &next_key, NULL, + level, max_level); + } + if (ret < 0) { + err = ret; + goto out; + } + + if (ret > 0) { + level = ret; + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + replaced = 1; + } else if (leaf) { + /* + * no block got replaced, try replacing file extents + */ + btrfs_item_key_to_cpu(leaf, &key, 0); + ret = replace_file_extents(trans, rc, root, leaf, + &inode_list); + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + BUG_ON(ret < 0); + } + + ret = walk_up_reloc_tree(reloc_root, path, &level); + if (ret > 0) + break; + + BUG_ON(level == 0); + /* + * save the merging progress in the drop_progress. + * this is OK since root refs == 1 in this case. + */ + btrfs_node_key(path->nodes[level], &root_item->drop_progress, + path->slots[level]); + root_item->drop_level = level; + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + + btrfs_btree_balance_dirty(root, nr); + + if (replaced && rc->stage == UPDATE_DATA_PTRS) + invalidate_extent_cache(root, &key, &next_key); + } + + /* + * handle the case only one block in the fs tree need to be + * relocated and the block is tree root. + */ + leaf = btrfs_lock_root_node(root); + ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + if (ret < 0) + err = ret; +out: + btrfs_free_path(path); + + if (err == 0) { + memset(&root_item->drop_progress, 0, + sizeof(root_item->drop_progress)); + root_item->drop_level = 0; + btrfs_set_root_refs(root_item, 0); + } + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + + btrfs_btree_balance_dirty(root, nr); + + /* + * put inodes while we aren't holding the tree locks + */ + while (!list_empty(&inode_list)) { + struct inodevec *ivec; + ivec = list_entry(inode_list.next, struct inodevec, list); + list_del(&ivec->list); + while (ivec->nr > 0) { + ivec->nr--; + iput(ivec->inode[ivec->nr]); + } + kfree(ivec); + } + + if (replaced && rc->stage == UPDATE_DATA_PTRS) + invalidate_extent_cache(root, &key, &next_key); + + return err; +} + +/* + * callback for the work threads. + * this function merges reloc tree with corresponding fs tree, + * and then drops the reloc tree. + */ +static void merge_func(struct btrfs_work *work) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root; + struct btrfs_root *reloc_root; + struct async_merge *async; + + async = container_of(work, struct async_merge, work); + reloc_root = async->root; + + if (btrfs_root_refs(&reloc_root->root_item) > 0) { + root = read_fs_root(reloc_root->fs_info, + reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); + + merge_reloc_root(async->rc, root); + + trans = btrfs_start_transaction(root, 1); + btrfs_update_reloc_root(trans, root); + btrfs_end_transaction(trans, root); + } + + btrfs_drop_snapshot(reloc_root, 0); + + if (atomic_dec_and_test(async->num_pending)) + complete(async->done); + + kfree(async); +} + +static int merge_reloc_roots(struct reloc_control *rc) +{ + struct async_merge *async; + struct btrfs_root *root; + struct completion done; + atomic_t num_pending; + + init_completion(&done); + atomic_set(&num_pending, 1); + + while (!list_empty(&rc->reloc_roots)) { + root = list_entry(rc->reloc_roots.next, + struct btrfs_root, root_list); + list_del_init(&root->root_list); + + async = kmalloc(sizeof(*async), GFP_NOFS); + BUG_ON(!async); + async->work.func = merge_func; + async->work.flags = 0; + async->rc = rc; + async->root = root; + async->done = &done; + async->num_pending = &num_pending; + atomic_inc(&num_pending); + btrfs_queue_worker(&rc->workers, &async->work); + } + + if (!atomic_dec_and_test(&num_pending)) + wait_for_completion(&done); + + BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); + return 0; +} + +static void free_block_list(struct rb_root *blocks) +{ + struct tree_block *block; + struct rb_node *rb_node; + while ((rb_node = rb_first(blocks))) { + block = rb_entry(rb_node, struct tree_block, rb_node); + rb_erase(rb_node, blocks); + kfree(block); + } +} + +static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *reloc_root) +{ + struct btrfs_root *root; + + if (reloc_root->last_trans == trans->transid) + return 0; + + root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); + + return btrfs_record_root_in_trans(trans, root); +} + +/* + * select one tree from trees that references the block. + * for blocks in refernce counted trees, we preper reloc tree. + * if no reloc tree found and reloc_only is true, NULL is returned. + */ +static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans, + struct backref_node *node, + struct backref_edge *edges[], + int *nr, int reloc_only) +{ + struct backref_node *next; + struct btrfs_root *root; + int index; + int loop = 0; +again: + index = 0; + next = node; + while (1) { + cond_resched(); + next = walk_up_backref(next, edges, &index); + root = next->root; + if (!root) { + BUG_ON(!node->old_root); + goto skip; + } + + /* no other choice for non-refernce counted tree */ + if (!root->ref_cows) { + BUG_ON(reloc_only); + break; + } + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + record_reloc_root_in_trans(trans, root); + break; + } + + if (loop) { + btrfs_record_root_in_trans(trans, root); + break; + } + + if (reloc_only || next != node) { + if (!root->reloc_root) + btrfs_record_root_in_trans(trans, root); + root = root->reloc_root; + /* + * if the reloc tree was created in current + * transation, there is no node in backref tree + * corresponds to the root of the reloc tree. + */ + if (btrfs_root_last_snapshot(&root->root_item) == + trans->transid - 1) + break; + } +skip: + root = NULL; + next = walk_down_backref(edges, &index); + if (!next || next->level <= node->level) + break; + } + + if (!root && !loop && !reloc_only) { + loop = 1; + goto again; + } + + if (root) + *nr = index; + else + *nr = 0; + + return root; +} + +static noinline_for_stack +struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, + struct backref_node *node) +{ + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + int nr; + return __select_one_root(trans, node, edges, &nr, 0); +} + +static noinline_for_stack +struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, + struct backref_node *node, + struct backref_edge *edges[], int *nr) +{ + return __select_one_root(trans, node, edges, nr, 1); +} + +static void grab_path_buffers(struct btrfs_path *path, + struct backref_node *node, + struct backref_edge *edges[], int nr) +{ + int i = 0; + while (1) { + drop_node_buffer(node); + node->eb = path->nodes[node->level]; + BUG_ON(!node->eb); + if (path->locks[node->level]) + node->locked = 1; + path->nodes[node->level] = NULL; + path->locks[node->level] = 0; + + if (i >= nr) + break; + + edges[i]->blockptr = node->eb->start; + node = edges[i]->node[UPPER]; + i++; + } +} + +/* + * relocate a block tree, and then update pointers in upper level + * blocks that reference the block to point to the new location. + * + * if called by link_to_upper, the block has already been relocated. + * in that case this function just updates pointers. + */ +static int do_relocation(struct btrfs_trans_handle *trans, + struct backref_node *node, + struct btrfs_key *key, + struct btrfs_path *path, int lowest) +{ + struct backref_node *upper; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + struct btrfs_root *root; + struct extent_buffer *eb; + u32 blocksize; + u64 bytenr; + u64 generation; + int nr; + int slot; + int ret; + int err = 0; + + BUG_ON(lowest && node->eb); + + path->lowest_level = node->level + 1; + list_for_each_entry(edge, &node->upper, list[LOWER]) { + cond_resched(); + if (node->eb && node->eb->start == edge->blockptr) + continue; + + upper = edge->node[UPPER]; + root = select_reloc_root(trans, upper, edges, &nr); + if (!root) + continue; + + if (upper->eb && !upper->locked) + drop_node_buffer(upper); + + if (!upper->eb) { + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + if (ret < 0) { + err = ret; + break; + } + BUG_ON(ret > 0); + + slot = path->slots[upper->level]; + + btrfs_unlock_up_safe(path, upper->level + 1); + grab_path_buffers(path, upper, edges, nr); + + btrfs_release_path(NULL, path); + } else { + ret = btrfs_bin_search(upper->eb, key, upper->level, + &slot); + BUG_ON(ret); + } + + bytenr = btrfs_node_blockptr(upper->eb, slot); + if (!lowest) { + if (node->eb->start == bytenr) { + btrfs_tree_unlock(upper->eb); + upper->locked = 0; + continue; + } + } else { + BUG_ON(node->bytenr != bytenr); + } + + blocksize = btrfs_level_size(root, node->level); + generation = btrfs_node_ptr_generation(upper->eb, slot); + eb = read_tree_block(root, bytenr, blocksize, generation); + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + + if (!node->eb) { + ret = btrfs_cow_block(trans, root, eb, upper->eb, + slot, &eb); + if (ret < 0) { + err = ret; + break; + } + btrfs_set_lock_blocking(eb); + node->eb = eb; + node->locked = 1; + } else { + btrfs_set_node_blockptr(upper->eb, slot, + node->eb->start); + btrfs_set_node_ptr_generation(upper->eb, slot, + trans->transid); + btrfs_mark_buffer_dirty(upper->eb); + + ret = btrfs_inc_extent_ref(trans, root, + node->eb->start, blocksize, + upper->eb->start, + btrfs_header_owner(upper->eb), + node->level, 0); + BUG_ON(ret); + + ret = btrfs_drop_subtree(trans, root, eb, upper->eb); + BUG_ON(ret); + } + if (!lowest) { + btrfs_tree_unlock(upper->eb); + upper->locked = 0; + } + } + path->lowest_level = 0; + return err; +} + +static int link_to_upper(struct btrfs_trans_handle *trans, + struct backref_node *node, + struct btrfs_path *path) +{ + struct btrfs_key key; + if (!node->eb || list_empty(&node->upper)) + return 0; + + btrfs_node_key_to_cpu(node->eb, &key, 0); + return do_relocation(trans, node, &key, path, 0); +} + +static int finish_pending_nodes(struct btrfs_trans_handle *trans, + struct backref_cache *cache, + struct btrfs_path *path) +{ + struct backref_node *node; + int level; + int ret; + int err = 0; + + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + while (!list_empty(&cache->pending[level])) { + node = list_entry(cache->pending[level].next, + struct backref_node, lower); + BUG_ON(node->level != level); + + ret = link_to_upper(trans, node, path); + if (ret < 0) + err = ret; + /* + * this remove the node from the pending list and + * may add some other nodes to the level + 1 + * pending list + */ + remove_backref_node(cache, node); + } + } + BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); + return err; +} + +static void mark_block_processed(struct reloc_control *rc, + struct backref_node *node) +{ + u32 blocksize; + if (node->level == 0 || + in_block_group(node->bytenr, rc->block_group)) { + blocksize = btrfs_level_size(rc->extent_root, node->level); + set_extent_bits(&rc->processed_blocks, node->bytenr, + node->bytenr + blocksize - 1, EXTENT_DIRTY, + GFP_NOFS); + } + node->processed = 1; +} + +/* + * mark a block and all blocks directly/indirectly reference the block + * as processed. + */ +static void update_processed_blocks(struct reloc_control *rc, + struct backref_node *node) +{ + struct backref_node *next = node; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + int index = 0; + + while (next) { + cond_resched(); + while (1) { + if (next->processed) + break; + + mark_block_processed(rc, next); + + if (list_empty(&next->upper)) + break; + + edge = list_entry(next->upper.next, + struct backref_edge, list[LOWER]); + edges[index++] = edge; + next = edge->node[UPPER]; + } + next = walk_down_backref(edges, &index); + } +} + +static int tree_block_processed(u64 bytenr, u32 blocksize, + struct reloc_control *rc) +{ + if (test_range_bit(&rc->processed_blocks, bytenr, + bytenr + blocksize - 1, EXTENT_DIRTY, 1)) + return 1; + return 0; +} + +/* + * check if there are any file extent pointers in the leaf point to + * data require processing + */ +static int check_file_extents(struct reloc_control *rc, + u64 bytenr, u32 blocksize, u64 ptr_gen) +{ + struct btrfs_key found_key; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + u32 nritems; + int i; + int ret = 0; + + leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen); + + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + cond_resched(); + btrfs_item_key_to_cpu(leaf, &found_key, i); + if (found_key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (bytenr == 0) + continue; + if (in_block_group(bytenr, rc->block_group)) { + ret = 1; + break; + } + } + free_extent_buffer(leaf); + return ret; +} + +/* + * scan child blocks of a given block to find blocks require processing + */ +static int add_child_blocks(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct rb_root *blocks) +{ + struct tree_block *block; + struct rb_node *rb_node; + u64 bytenr; + u64 ptr_gen; + u32 blocksize; + u32 nritems; + int i; + int err = 0; + + nritems = btrfs_header_nritems(node->eb); + blocksize = btrfs_level_size(rc->extent_root, node->level - 1); + for (i = 0; i < nritems; i++) { + cond_resched(); + bytenr = btrfs_node_blockptr(node->eb, i); + ptr_gen = btrfs_node_ptr_generation(node->eb, i); + if (ptr_gen == trans->transid) + continue; + if (!in_block_group(bytenr, rc->block_group) && + (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) + continue; + if (tree_block_processed(bytenr, blocksize, rc)) + continue; + + readahead_tree_block(rc->extent_root, + bytenr, blocksize, ptr_gen); + } + + for (i = 0; i < nritems; i++) { + cond_resched(); + bytenr = btrfs_node_blockptr(node->eb, i); + ptr_gen = btrfs_node_ptr_generation(node->eb, i); + if (ptr_gen == trans->transid) + continue; + if (!in_block_group(bytenr, rc->block_group) && + (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) + continue; + if (tree_block_processed(bytenr, blocksize, rc)) + continue; + if (!in_block_group(bytenr, rc->block_group) && + !check_file_extents(rc, bytenr, blocksize, ptr_gen)) + continue; + + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) { + err = -ENOMEM; + break; + } + block->bytenr = bytenr; + btrfs_node_key_to_cpu(node->eb, &block->key, i); + block->level = node->level - 1; + block->key_ready = 1; + rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); + BUG_ON(rb_node); + } + if (err) + free_block_list(blocks); + return err; +} + +/* + * find adjacent blocks require processing + */ +static noinline_for_stack +int add_adjacent_blocks(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_cache *cache, + struct rb_root *blocks, int level, + struct backref_node **upper) +{ + struct backref_node *node; + int ret = 0; + + WARN_ON(!list_empty(&cache->pending[level])); + + if (list_empty(&cache->pending[level + 1])) + return 1; + + node = list_entry(cache->pending[level + 1].next, + struct backref_node, lower); + if (node->eb) + ret = add_child_blocks(trans, rc, node, blocks); + + *upper = node; + return ret; +} + +static int get_tree_block_key(struct reloc_control *rc, + struct tree_block *block) +{ + struct extent_buffer *eb; + + BUG_ON(block->key_ready); + eb = read_tree_block(rc->extent_root, block->bytenr, + block->key.objectid, block->key.offset); + WARN_ON(btrfs_header_level(eb) != block->level); + if (block->level == 0) + btrfs_item_key_to_cpu(eb, &block->key, 0); + else + btrfs_node_key_to_cpu(eb, &block->key, 0); + free_extent_buffer(eb); + block->key_ready = 1; + return 0; +} + +static int reada_tree_block(struct reloc_control *rc, + struct tree_block *block) +{ + BUG_ON(block->key_ready); + readahead_tree_block(rc->extent_root, block->bytenr, + block->key.objectid, block->key.offset); + return 0; +} + +/* + * helper function to relocate a tree block + */ +static int relocate_tree_block(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct btrfs_key *key, + struct btrfs_path *path) +{ + struct btrfs_root *root; + int ret; + + root = select_one_root(trans, node); + if (unlikely(!root)) { + rc->found_old_snapshot = 1; + update_processed_blocks(rc, node); + return 0; + } + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + ret = do_relocation(trans, node, key, path, 1); + if (ret < 0) + goto out; + if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) { + ret = replace_file_extents(trans, rc, root, + node->eb, NULL); + if (ret < 0) + goto out; + } + drop_node_buffer(node); + } else if (!root->ref_cows) { + path->lowest_level = node->level; + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + btrfs_release_path(root, path); + if (ret < 0) + goto out; + } else if (root != node->root) { + WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS); + } + + update_processed_blocks(rc, node); + ret = 0; +out: + drop_node_buffer(node); + return ret; +} + +/* + * relocate a list of blocks + */ +static noinline_for_stack +int relocate_tree_blocks(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct rb_root *blocks) +{ + struct backref_cache *cache; + struct backref_node *node; + struct btrfs_path *path; + struct tree_block *block; + struct rb_node *rb_node; + int level = -1; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + cache = kmalloc(sizeof(*cache), GFP_NOFS); + if (!cache) { + btrfs_free_path(path); + return -ENOMEM; + } + + backref_cache_init(cache); + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + if (level == -1) + level = block->level; + else + BUG_ON(level != block->level); + if (!block->key_ready) + reada_tree_block(rc, block); + rb_node = rb_next(rb_node); + } + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + if (!block->key_ready) + get_tree_block_key(rc, block); + rb_node = rb_next(rb_node); + } + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + + node = build_backref_tree(rc, cache, &block->key, + block->level, block->bytenr); + if (IS_ERR(node)) { + err = PTR_ERR(node); + goto out; + } + + ret = relocate_tree_block(trans, rc, node, &block->key, + path); + if (ret < 0) { + err = ret; + goto out; + } + remove_backref_node(cache, node); + rb_node = rb_next(rb_node); + } + + if (level > 0) + goto out; + + free_block_list(blocks); + + /* + * now backrefs of some upper level tree blocks have been cached, + * try relocating blocks referenced by these upper level blocks. + */ + while (1) { + struct backref_node *upper = NULL; + if (trans->transaction->in_commit || + trans->transaction->delayed_refs.flushing) + break; + + ret = add_adjacent_blocks(trans, rc, cache, blocks, level, + &upper); + if (ret < 0) + err = ret; + if (ret != 0) + break; + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + if (trans->transaction->in_commit || + trans->transaction->delayed_refs.flushing) + goto out; + BUG_ON(!block->key_ready); + node = build_backref_tree(rc, cache, &block->key, + level, block->bytenr); + if (IS_ERR(node)) { + err = PTR_ERR(node); + goto out; + } + + ret = relocate_tree_block(trans, rc, node, + &block->key, path); + if (ret < 0) { + err = ret; + goto out; + } + remove_backref_node(cache, node); + rb_node = rb_next(rb_node); + } + free_block_list(blocks); + + if (upper) { + ret = link_to_upper(trans, upper, path); + if (ret < 0) { + err = ret; + break; + } + remove_backref_node(cache, upper); + } + } +out: + free_block_list(blocks); + + ret = finish_pending_nodes(trans, cache, path); + if (ret < 0) + err = ret; + + kfree(cache); + btrfs_free_path(path); + return err; +} + +static noinline_for_stack +int relocate_inode_pages(struct inode *inode, u64 start, u64 len) +{ + u64 page_start; + u64 page_end; + unsigned long i; + unsigned long first_index; + unsigned long last_index; + unsigned int total_read = 0; + unsigned int total_dirty = 0; + struct page *page; + struct file_ra_state *ra; + struct btrfs_ordered_extent *ordered; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int ret = 0; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + + mutex_lock(&inode->i_mutex); + first_index = start >> PAGE_CACHE_SHIFT; + last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; + + /* make sure the dirty trick played by the caller work */ + ret = invalidate_inode_pages2_range(inode->i_mapping, + first_index, last_index); + if (ret) + goto out_unlock; + + file_ra_state_init(ra, inode->i_mapping); + + for (i = first_index ; i <= last_index; i++) { + if (total_read % ra->ra_pages == 0) { + btrfs_force_ra(inode->i_mapping, ra, NULL, i, + min(last_index, ra->ra_pages + i - 1)); + } + total_read++; +again: + if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) + BUG_ON(1); + page = grab_cache_page(inode->i_mapping, i); + if (!page) { + ret = -ENOMEM; + goto out_unlock; + } + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + ret = -EIO; + goto out_unlock; + } + } + wait_on_page_writeback(page); + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + set_page_extent_mapped(page); + + if (i == first_index) + set_extent_bits(io_tree, page_start, page_end, + EXTENT_BOUNDARY, GFP_NOFS); + btrfs_set_extent_delalloc(inode, page_start, page_end); + + set_page_dirty(page); + total_dirty++; + + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + } +out_unlock: + mutex_unlock(&inode->i_mutex); + kfree(ra); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); + return ret; +} + +static noinline_for_stack +int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt; + u64 end = start + extent_key->offset - 1; + + em = alloc_extent_map(GFP_NOFS); + em->start = start; + em->len = extent_key->offset; + em->block_len = extent_key->offset; + em->block_start = extent_key->objectid; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + /* setup extent map to cheat btrfs_readpage */ + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + while (1) { + int ret; + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, start, end, 0); + } + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + + return relocate_inode_pages(inode, start, extent_key->offset); +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static int get_ref_objectid_v0(struct reloc_control *rc, + struct btrfs_path *path, + struct btrfs_key *extent_key, + u64 *ref_objectid, int *path_change) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_ref_v0 *ref0; + int ret; + int slot; + + leaf = path->nodes[0]; + slot = path->slots[0]; + while (1) { + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret < 0) + return ret; + BUG_ON(ret > 0); + leaf = path->nodes[0]; + slot = path->slots[0]; + if (path_change) + *path_change = 1; + } + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != extent_key->objectid) + return -ENOENT; + + if (key.type != BTRFS_EXTENT_REF_V0_KEY) { + slot++; + continue; + } + ref0 = btrfs_item_ptr(leaf, slot, + struct btrfs_extent_ref_v0); + *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0); + break; + } + return 0; +} +#endif + +/* + * helper to add a tree block to the list. + * the major work is getting the generation and level of the block + */ +static int add_tree_block(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct btrfs_path *path, + struct rb_root *blocks) +{ + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_tree_block_info *bi; + struct tree_block *block; + struct rb_node *rb_node; + u32 item_size; + int level = -1; + int generation; + + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + if (item_size >= sizeof(*ei) + sizeof(*bi)) { + ei = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_item); + bi = (struct btrfs_tree_block_info *)(ei + 1); + generation = btrfs_extent_generation(eb, ei); + level = btrfs_tree_block_level(eb, bi); + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + u64 ref_owner; + int ret; + + BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + ret = get_ref_objectid_v0(rc, path, extent_key, + &ref_owner, NULL); + BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); + level = (int)ref_owner; + /* FIXME: get real generation */ + generation = 0; +#else + BUG(); +#endif + } + + btrfs_release_path(rc->extent_root, path); + + BUG_ON(level == -1); + + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) + return -ENOMEM; + + block->bytenr = extent_key->objectid; + block->key.objectid = extent_key->offset; + block->key.offset = generation; + block->level = level; + block->key_ready = 0; + + rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); + BUG_ON(rb_node); + + return 0; +} + +/* + * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY + */ +static int __add_tree_block(struct reloc_control *rc, + u64 bytenr, u32 blocksize, + struct rb_root *blocks) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + if (tree_block_processed(bytenr, blocksize, rc)) + return 0; + + if (tree_search(blocks, bytenr)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = blocksize; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret); + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + ret = add_tree_block(rc, &key, path, blocks); +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to check if the block use full backrefs for pointers in it + */ +static int block_use_full_backref(struct reloc_control *rc, + struct extent_buffer *eb) +{ + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct btrfs_key key; + u64 flags; + int ret; + + if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) || + btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) + return 1; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = eb->start; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = eb->len; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, + &key, path, 0, 0); + BUG_ON(ret); + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + flags = btrfs_extent_flags(path->nodes[0], ei); + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); + if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) + ret = 1; + else + ret = 0; + btrfs_free_path(path); + return ret; +} + +/* + * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY + * this function scans fs tree to find blocks reference the data extent + */ +static int find_data_references(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref, + struct rb_root *blocks) +{ + struct btrfs_path *path; + struct tree_block *block; + struct btrfs_root *root; + struct btrfs_file_extent_item *fi; + struct rb_node *rb_node; + struct btrfs_key key; + u64 ref_root; + u64 ref_objectid; + u64 ref_offset; + u32 ref_count; + u32 nritems; + int err = 0; + int added = 0; + int counted; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ref_root = btrfs_extent_data_ref_root(leaf, ref); + ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); + ref_offset = btrfs_extent_data_ref_offset(leaf, ref); + ref_count = btrfs_extent_data_ref_count(leaf, ref); + + root = read_fs_root(rc->extent_root->fs_info, ref_root); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out; + } + + key.objectid = ref_objectid; + key.offset = ref_offset; + key.type = BTRFS_EXTENT_DATA_KEY; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + /* + * the references in tree blocks that use full backrefs + * are not counted in + */ + if (block_use_full_backref(rc, leaf)) + counted = 0; + else + counted = 1; + rb_node = tree_search(blocks, leaf->start); + if (rb_node) { + if (counted) + added = 1; + else + path->slots[0] = nritems; + } + + while (ref_count > 0) { + while (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + WARN_ON(1); + goto out; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + added = 0; + + if (block_use_full_backref(rc, leaf)) + counted = 0; + else + counted = 1; + rb_node = tree_search(blocks, leaf->start); + if (rb_node) { + if (counted) + added = 1; + else + path->slots[0] = nritems; + } + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ref_objectid || + key.type != BTRFS_EXTENT_DATA_KEY) { + WARN_ON(1); + break; + } + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + goto next; + + if (btrfs_file_extent_disk_bytenr(leaf, fi) != + extent_key->objectid) + goto next; + + key.offset -= btrfs_file_extent_offset(leaf, fi); + if (key.offset != ref_offset) + goto next; + + if (counted) + ref_count--; + if (added) + goto next; + + if (!tree_block_processed(leaf->start, leaf->len, rc)) { + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) { + err = -ENOMEM; + break; + } + block->bytenr = leaf->start; + btrfs_item_key_to_cpu(leaf, &block->key, 0); + block->level = 0; + block->key_ready = 1; + rb_node = tree_insert(blocks, block->bytenr, + &block->rb_node); + BUG_ON(rb_node); + } + if (counted) + added = 1; + else + path->slots[0] = nritems; +next: + path->slots[0]++; + + } +out: + btrfs_free_path(path); + return err; +} + +/* + * hepler to find all tree blocks that reference a given data extent + */ +static noinline_for_stack +int add_data_references(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct btrfs_path *path, + struct rb_root *blocks) +{ + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_extent_data_ref *dref; + struct btrfs_extent_inline_ref *iref; + unsigned long ptr; + unsigned long end; + u32 blocksize; + int ret; + int err = 0; + + ret = get_new_location(rc->data_inode, NULL, extent_key->objectid, + extent_key->offset); + BUG_ON(ret < 0); + if (ret > 0) { + /* the relocated data is fragmented */ + rc->extents_skipped++; + btrfs_release_path(rc->extent_root, path); + return 0; + } + + blocksize = btrfs_level_size(rc->extent_root, 0); + + eb = path->nodes[0]; + ptr = btrfs_item_ptr_offset(eb, path->slots[0]); + end = ptr + btrfs_item_size_nr(eb, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (ptr + sizeof(struct btrfs_extent_item_v0) == end) + ptr = end; + else +#endif + ptr += sizeof(struct btrfs_extent_item); + + while (ptr < end) { + iref = (struct btrfs_extent_inline_ref *)ptr; + key.type = btrfs_extent_inline_ref_type(eb, iref); + if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + key.offset = btrfs_extent_inline_ref_offset(eb, iref); + ret = __add_tree_block(rc, key.offset, blocksize, + blocks); + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + ret = find_data_references(rc, extent_key, + eb, dref, blocks); + } else { + BUG(); + } + ptr += btrfs_extent_inline_ref_size(key.type); + } + WARN_ON(ptr > end); + + while (1) { + cond_resched(); + eb = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret < 0) { + err = ret; + break; + } + if (ret > 0) + break; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, &key, path->slots[0]); + if (key.objectid != extent_key->objectid) + break; + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (key.type == BTRFS_SHARED_DATA_REF_KEY || + key.type == BTRFS_EXTENT_REF_V0_KEY) { +#else + BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); + if (key.type == BTRFS_SHARED_DATA_REF_KEY) { +#endif + ret = __add_tree_block(rc, key.offset, blocksize, + blocks); + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_data_ref); + ret = find_data_references(rc, extent_key, + eb, dref, blocks); + } else { + ret = 0; + } + if (ret) { + err = ret; + break; + } + path->slots[0]++; + } + btrfs_release_path(rc->extent_root, path); + if (err) + free_block_list(blocks); + return err; +} + +/* + * hepler to find next unprocessed extent + */ +static noinline_for_stack +int find_next_extent(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct btrfs_path *path) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + u64 start, end, last; + int ret; + + last = rc->block_group->key.objectid + rc->block_group->key.offset; + while (1) { + cond_resched(); + if (rc->search_start >= last) { + ret = 1; + break; + } + + key.objectid = rc->search_start; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, + 0, 0); + if (ret < 0) + break; +next: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret != 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid >= last) { + ret = 1; + break; + } + + if (key.type != BTRFS_EXTENT_ITEM_KEY || + key.objectid + key.offset <= rc->search_start) { + path->slots[0]++; + goto next; + } + + ret = find_first_extent_bit(&rc->processed_blocks, + key.objectid, &start, &end, + EXTENT_DIRTY); + + if (ret == 0 && start <= key.objectid) { + btrfs_release_path(rc->extent_root, path); + rc->search_start = end + 1; + } else { + rc->search_start = key.objectid + key.offset; + return 0; + } + } + btrfs_release_path(rc->extent_root, path); + return ret; +} + +static void set_reloc_control(struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + mutex_lock(&fs_info->trans_mutex); + fs_info->reloc_ctl = rc; + mutex_unlock(&fs_info->trans_mutex); +} + +static void unset_reloc_control(struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + mutex_lock(&fs_info->trans_mutex); + fs_info->reloc_ctl = NULL; + mutex_unlock(&fs_info->trans_mutex); +} + +static int check_extent_flags(u64 flags) +{ + if ((flags & BTRFS_EXTENT_FLAG_DATA) && + (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) + return 1; + if (!(flags & BTRFS_EXTENT_FLAG_DATA) && + !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) + return 1; + if ((flags & BTRFS_EXTENT_FLAG_DATA) && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + return 1; + return 0; +} + +static noinline_for_stack int relocate_block_group(struct reloc_control *rc) +{ + struct rb_root blocks = RB_ROOT; + struct btrfs_key key; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + unsigned long nr; + u64 flags; + u32 item_size; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + rc->search_start = rc->block_group->key.objectid; + clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, + GFP_NOFS); + + rc->create_reloc_root = 1; + set_reloc_control(rc); + + trans = btrfs_start_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); + + while (1) { + trans = btrfs_start_transaction(rc->extent_root, 1); + + ret = find_next_extent(trans, rc, path); + if (ret < 0) + err = ret; + if (ret != 0) + break; + + rc->extents_found++; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + item_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (item_size >= sizeof(*ei)) { + flags = btrfs_extent_flags(path->nodes[0], ei); + ret = check_extent_flags(flags); + BUG_ON(ret); + + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + u64 ref_owner; + int path_change = 0; + + BUG_ON(item_size != + sizeof(struct btrfs_extent_item_v0)); + ret = get_ref_objectid_v0(rc, path, &key, &ref_owner, + &path_change); + if (ref_owner < BTRFS_FIRST_FREE_OBJECTID) + flags = BTRFS_EXTENT_FLAG_TREE_BLOCK; + else + flags = BTRFS_EXTENT_FLAG_DATA; + + if (path_change) { + btrfs_release_path(rc->extent_root, path); + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, + &key, path, 0, 0); + if (ret < 0) { + err = ret; + break; + } + BUG_ON(ret > 0); + } +#else + BUG(); +#endif + } + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ret = add_tree_block(rc, &key, path, &blocks); + } else if (rc->stage == UPDATE_DATA_PTRS && + (flags & BTRFS_EXTENT_FLAG_DATA)) { + ret = add_data_references(rc, &key, path, &blocks); + } else { + btrfs_release_path(rc->extent_root, path); + ret = 0; + } + if (ret < 0) { + err = 0; + break; + } + + if (!RB_EMPTY_ROOT(&blocks)) { + ret = relocate_tree_blocks(trans, rc, &blocks); + if (ret < 0) { + err = ret; + break; + } + } + + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, rc->extent_root); + trans = NULL; + btrfs_btree_balance_dirty(rc->extent_root, nr); + + if (rc->stage == MOVE_DATA_EXTENTS && + (flags & BTRFS_EXTENT_FLAG_DATA)) { + rc->found_file_extent = 1; + ret = relocate_data_extent(rc->data_inode, &key); + if (ret < 0) { + err = ret; + break; + } + } + } + btrfs_free_path(path); + + if (trans) { + nr = trans->blocks_used; + btrfs_end_transaction(trans, rc->extent_root); + btrfs_btree_balance_dirty(rc->extent_root, nr); + } + + rc->create_reloc_root = 0; + smp_mb(); + + if (rc->extents_found > 0) { + trans = btrfs_start_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); + } + + merge_reloc_roots(rc); + + unset_reloc_control(rc); + + /* get rid of pinned extents */ + trans = btrfs_start_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); + + return err; +} + +static int __insert_orphan_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 size) +{ + struct btrfs_path *path; + struct btrfs_inode_item *item; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_inode(trans, root, path, objectid); + if (ret) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + btrfs_set_inode_generation(leaf, item, 1); + btrfs_set_inode_size(leaf, item, size); + btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(root, path); +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to create inode for data relocation. + * the inode is in data relocation tree and its link count is 0 + */ +static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *group) +{ + struct inode *inode = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root; + struct btrfs_key key; + unsigned long nr; + u64 objectid = BTRFS_FIRST_FREE_OBJECTID; + int err = 0; + + root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); + if (IS_ERR(root)) + return ERR_CAST(root); + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + err = btrfs_find_free_objectid(trans, root, objectid, &objectid); + if (err) + goto out; + + err = __insert_orphan_inode(trans, root, objectid, group->key.offset); + BUG_ON(err); + + err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, + group->key.offset, 0, group->key.offset, + 0, 0, 0); + BUG_ON(err); + + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &key, root); + BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); + BTRFS_I(inode)->index_cnt = group->key.objectid; + + err = btrfs_orphan_add(trans, inode); +out: + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + + btrfs_btree_balance_dirty(root, nr); + if (err) { + if (inode) + iput(inode); + inode = ERR_PTR(err); + } + return inode; +} + +/* + * function to relocate all extents in a block group. + */ +int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) +{ + struct btrfs_fs_info *fs_info = extent_root->fs_info; + struct reloc_control *rc; + int ret; + int err = 0; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) + return -ENOMEM; + + mapping_tree_init(&rc->reloc_root_tree); + extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS); + INIT_LIST_HEAD(&rc->reloc_roots); + + rc->block_group = btrfs_lookup_block_group(fs_info, group_start); + BUG_ON(!rc->block_group); + + btrfs_init_workers(&rc->workers, "relocate", + fs_info->thread_pool_size); + + rc->extent_root = extent_root; + btrfs_prepare_block_group_relocation(extent_root, rc->block_group); + + rc->data_inode = create_reloc_inode(fs_info, rc->block_group); + if (IS_ERR(rc->data_inode)) { + err = PTR_ERR(rc->data_inode); + rc->data_inode = NULL; + goto out; + } + + printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", + (unsigned long long)rc->block_group->key.objectid, + (unsigned long long)rc->block_group->flags); + + btrfs_start_delalloc_inodes(fs_info->tree_root); + btrfs_wait_ordered_extents(fs_info->tree_root, 0); + + while (1) { + mutex_lock(&fs_info->cleaner_mutex); + btrfs_clean_old_snapshots(fs_info->tree_root); + mutex_unlock(&fs_info->cleaner_mutex); + + rc->extents_found = 0; + rc->extents_skipped = 0; + + ret = relocate_block_group(rc); + if (ret < 0) { + err = ret; + break; + } + + if (rc->extents_found == 0) + break; + + printk(KERN_INFO "btrfs: found %llu extents\n", + (unsigned long long)rc->extents_found); + + if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { + btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); + invalidate_mapping_pages(rc->data_inode->i_mapping, + 0, -1); + rc->stage = UPDATE_DATA_PTRS; + } else if (rc->stage == UPDATE_DATA_PTRS && + rc->extents_skipped >= rc->extents_found) { + iput(rc->data_inode); + rc->data_inode = create_reloc_inode(fs_info, + rc->block_group); + if (IS_ERR(rc->data_inode)) { + err = PTR_ERR(rc->data_inode); + rc->data_inode = NULL; + break; + } + rc->stage = MOVE_DATA_EXTENTS; + rc->found_file_extent = 0; + } + } + + filemap_fdatawrite_range(fs_info->btree_inode->i_mapping, + rc->block_group->key.objectid, + rc->block_group->key.objectid + + rc->block_group->key.offset - 1); + + WARN_ON(rc->block_group->pinned > 0); + WARN_ON(rc->block_group->reserved > 0); + WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); +out: + iput(rc->data_inode); + btrfs_stop_workers(&rc->workers); + btrfs_put_block_group(rc->block_group); + kfree(rc); + return err; +} + +/* + * recover relocation interrupted by system crash. + * + * this function resumes merging reloc trees with corresponding fs trees. + * this is important for keeping the sharing of tree blocks + */ +int btrfs_recover_relocation(struct btrfs_root *root) +{ + LIST_HEAD(reloc_roots); + struct btrfs_key key; + struct btrfs_root *fs_root; + struct btrfs_root *reloc_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct reloc_control *rc = NULL; + struct btrfs_trans_handle *trans; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_TREE_RELOC_OBJECTID; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, + path, 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(root->fs_info->tree_root, path); + + if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || + key.type != BTRFS_ROOT_ITEM_KEY) + break; + + reloc_root = btrfs_read_fs_root_no_radix(root, &key); + if (IS_ERR(reloc_root)) { + err = PTR_ERR(reloc_root); + goto out; + } + + list_add(&reloc_root->root_list, &reloc_roots); + + if (btrfs_root_refs(&reloc_root->root_item) > 0) { + fs_root = read_fs_root(root->fs_info, + reloc_root->root_key.offset); + if (IS_ERR(fs_root)) { + err = PTR_ERR(fs_root); + goto out; + } + } + + if (key.offset == 0) + break; + + key.offset--; + } + btrfs_release_path(root->fs_info->tree_root, path); + + if (list_empty(&reloc_roots)) + goto out; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) { + err = -ENOMEM; + goto out; + } + + mapping_tree_init(&rc->reloc_root_tree); + INIT_LIST_HEAD(&rc->reloc_roots); + btrfs_init_workers(&rc->workers, "relocate", + root->fs_info->thread_pool_size); + rc->extent_root = root->fs_info->extent_root; + + set_reloc_control(rc); + + while (!list_empty(&reloc_roots)) { + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); + list_del(&reloc_root->root_list); + + if (btrfs_root_refs(&reloc_root->root_item) == 0) { + list_add_tail(&reloc_root->root_list, + &rc->reloc_roots); + continue; + } + + fs_root = read_fs_root(root->fs_info, + reloc_root->root_key.offset); + BUG_ON(IS_ERR(fs_root)); + + __add_reloc_root(reloc_root); + fs_root->reloc_root = reloc_root; + } + + trans = btrfs_start_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); + + merge_reloc_roots(rc); + + unset_reloc_control(rc); + + trans = btrfs_start_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); +out: + if (rc) { + btrfs_stop_workers(&rc->workers); + kfree(rc); + } + while (!list_empty(&reloc_roots)) { + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); + list_del(&reloc_root->root_list); + free_extent_buffer(reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + kfree(reloc_root); + } + btrfs_free_path(path); + + if (err == 0) { + /* cleanup orphan inode in data relocation tree */ + fs_root = read_fs_root(root->fs_info, + BTRFS_DATA_RELOC_TREE_OBJECTID); + if (IS_ERR(fs_root)) + err = PTR_ERR(fs_root); + } + return err; +} + +/* + * helper to add ordered checksum for data relocation. + * + * cloning checksum properly handles the nodatasum extents. + * it also saves CPU time to re-calculate the checksum. + */ +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) +{ + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_ordered_extent *ordered; + struct btrfs_root *root = BTRFS_I(inode)->root; + size_t offset; + int ret; + u64 disk_bytenr; + LIST_HEAD(list); + + ordered = btrfs_lookup_ordered_extent(inode, file_pos); + BUG_ON(ordered->file_offset != file_pos || ordered->len != len); + + disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; + ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, + disk_bytenr + len - 1, &list); + + while (!list_empty(&list)) { + sums = list_entry(list.next, struct btrfs_ordered_sum, list); + list_del_init(&sums->list); + + sector_sum = sums->sums; + sums->bytenr = ordered->start; + + offset = 0; + while (offset < sums->len) { + sector_sum->bytenr += ordered->start - disk_bytenr; + sector_sum++; + offset += root->sectorsize; + } + + btrfs_add_ordered_sum(inode, ordered, sums); + } + btrfs_put_ordered_extent(ordered); + return 0; +} diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index b48650de4472..0ddc6d61c55a 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -111,6 +111,15 @@ out: return ret; } +int btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node) +{ + btrfs_set_root_bytenr(item, node->start); + btrfs_set_root_level(item, btrfs_header_level(node)); + btrfs_set_root_generation(item, btrfs_header_generation(node)); + return 0; +} + /* * copy the data in 'item' into the btree */ @@ -164,8 +173,7 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root * offset lower than the latest root. They need to be queued for deletion to * finish what was happening when we crashed. */ -int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, - struct btrfs_root *latest) +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) { struct btrfs_root *dead_root; struct btrfs_item *item; @@ -227,10 +235,7 @@ again: goto err; } - if (objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_add_dead_reloc_root(dead_root); - else - ret = btrfs_add_dead_root(dead_root, latest); + ret = btrfs_add_dead_root(dead_root); if (ret) goto err; goto again; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 19a4daf03ccb..9f179d4832d5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -24,6 +24,7 @@ #include <linux/highmem.h> #include <linux/time.h> #include <linux/init.h> +#include <linux/seq_file.h> #include <linux/string.h> #include <linux/smp_lock.h> #include <linux/backing-dev.h> @@ -51,7 +52,6 @@ #include "export.h" #include "compression.h" - static struct super_operations btrfs_super_ops; static void btrfs_put_super(struct super_block *sb) @@ -66,7 +66,8 @@ static void btrfs_put_super(struct super_block *sb) enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, - Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, + Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, + Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err, }; static match_table_t tokens = { @@ -82,7 +83,12 @@ static match_table_t tokens = { {Opt_thread_pool, "thread_pool=%d"}, {Opt_compress, "compress"}, {Opt_ssd, "ssd"}, + {Opt_ssd_spread, "ssd_spread"}, + {Opt_nossd, "nossd"}, {Opt_noacl, "noacl"}, + {Opt_notreelog, "notreelog"}, + {Opt_flushoncommit, "flushoncommit"}, + {Opt_ratio, "metadata_ratio=%d"}, {Opt_err, NULL}, }; @@ -153,7 +159,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) */ break; case Opt_nodatasum: - printk(KERN_INFO "btrfs: setting nodatacsum\n"); + printk(KERN_INFO "btrfs: setting nodatasum\n"); btrfs_set_opt(info->mount_opt, NODATASUM); break; case Opt_nodatacow: @@ -169,6 +175,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); btrfs_set_opt(info->mount_opt, SSD); break; + case Opt_ssd_spread: + printk(KERN_INFO "btrfs: use spread ssd " + "allocation scheme\n"); + btrfs_set_opt(info->mount_opt, SSD); + btrfs_set_opt(info->mount_opt, SSD_SPREAD); + break; + case Opt_nossd: + printk(KERN_INFO "btrfs: not using ssd allocation " + "scheme\n"); + btrfs_set_opt(info->mount_opt, NOSSD); + btrfs_clear_opt(info->mount_opt, SSD); + btrfs_clear_opt(info->mount_opt, SSD_SPREAD); + break; case Opt_nobarrier: printk(KERN_INFO "btrfs: turning off barriers\n"); btrfs_set_opt(info->mount_opt, NOBARRIER); @@ -191,7 +210,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->max_extent = max_t(u64, info->max_extent, root->sectorsize); printk(KERN_INFO "btrfs: max_extent at %llu\n", - info->max_extent); + (unsigned long long)info->max_extent); } break; case Opt_max_inline: @@ -206,7 +225,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) root->sectorsize); } printk(KERN_INFO "btrfs: max_inline at %llu\n", - info->max_inline); + (unsigned long long)info->max_inline); } break; case Opt_alloc_start: @@ -216,12 +235,29 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) kfree(num); printk(KERN_INFO "btrfs: allocations start at %llu\n", - info->alloc_start); + (unsigned long long)info->alloc_start); } break; case Opt_noacl: root->fs_info->sb->s_flags &= ~MS_POSIXACL; break; + case Opt_notreelog: + printk(KERN_INFO "btrfs: disabling tree log\n"); + btrfs_set_opt(info->mount_opt, NOTREELOG); + break; + case Opt_flushoncommit: + printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); + btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); + break; + case Opt_ratio: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->metadata_ratio = intarg; + printk(KERN_INFO "btrfs: metadata ratio %d\n", + info->metadata_ratio); + } + break; default: break; } @@ -300,7 +336,7 @@ static int btrfs_fill_super(struct super_block *sb, struct dentry *root_dentry; struct btrfs_super_block *disk_super; struct btrfs_root *tree_root; - struct btrfs_inode *bi; + struct btrfs_key key; int err; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -319,23 +355,15 @@ static int btrfs_fill_super(struct super_block *sb, } sb->s_fs_info = tree_root; disk_super = &tree_root->fs_info->super_copy; - inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID, - tree_root->fs_info->fs_root); - bi = BTRFS_I(inode); - bi->location.objectid = inode->i_ino; - bi->location.offset = 0; - bi->root = tree_root->fs_info->fs_root; - - btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY); - if (!inode) { - err = -ENOMEM; + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); goto fail_close; } - if (inode->i_state & I_NEW) { - btrfs_read_locked_inode(inode); - unlock_new_inode(inode); - } root_dentry = d_alloc_root(inode); if (!root_dentry) { @@ -363,14 +391,9 @@ fail_close: int btrfs_sync_fs(struct super_block *sb, int wait) { struct btrfs_trans_handle *trans; - struct btrfs_root *root; + struct btrfs_root *root = btrfs_sb(sb); int ret; - root = btrfs_sb(sb); - - if (sb->s_flags & MS_RDONLY) - return 0; - sb->s_dirt = 0; if (!wait) { filemap_flush(root->fs_info->btree_inode->i_mapping); return 0; @@ -381,13 +404,49 @@ int btrfs_sync_fs(struct super_block *sb, int wait) trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); - sb->s_dirt = 0; return ret; } -static void btrfs_write_super(struct super_block *sb) +static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) { - sb->s_dirt = 0; + struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); + struct btrfs_fs_info *info = root->fs_info; + + if (btrfs_test_opt(root, DEGRADED)) + seq_puts(seq, ",degraded"); + if (btrfs_test_opt(root, NODATASUM)) + seq_puts(seq, ",nodatasum"); + if (btrfs_test_opt(root, NODATACOW)) + seq_puts(seq, ",nodatacow"); + if (btrfs_test_opt(root, NOBARRIER)) + seq_puts(seq, ",nobarrier"); + if (info->max_extent != (u64)-1) + seq_printf(seq, ",max_extent=%llu", + (unsigned long long)info->max_extent); + if (info->max_inline != 8192 * 1024) + seq_printf(seq, ",max_inline=%llu", + (unsigned long long)info->max_inline); + if (info->alloc_start != 0) + seq_printf(seq, ",alloc_start=%llu", + (unsigned long long)info->alloc_start); + if (info->thread_pool_size != min_t(unsigned long, + num_online_cpus() + 2, 8)) + seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); + if (btrfs_test_opt(root, COMPRESS)) + seq_puts(seq, ",compress"); + if (btrfs_test_opt(root, NOSSD)) + seq_puts(seq, ",nossd"); + if (btrfs_test_opt(root, SSD_SPREAD)) + seq_puts(seq, ",ssd_spread"); + else if (btrfs_test_opt(root, SSD)) + seq_puts(seq, ",ssd"); + if (btrfs_test_opt(root, NOTREELOG)) + seq_puts(seq, ",notreelog"); + if (btrfs_test_opt(root, FLUSHONCOMMIT)) + seq_puts(seq, ",flushoncommit"); + if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) + seq_puts(seq, ",noacl"); + return 0; } static int btrfs_test_super(struct super_block *s, void *data) @@ -443,8 +502,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, if (s->s_root) { if ((flags ^ s->s_flags) & MS_RDONLY) { - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); error = -EBUSY; goto error_close_devices; } @@ -458,8 +516,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); if (error) { - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); goto error_free_subvol_name; } @@ -476,15 +533,13 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, mutex_unlock(&s->s_root->d_inode->i_mutex); if (IS_ERR(root)) { - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); error = PTR_ERR(root); goto error_free_subvol_name; } if (!root->d_inode) { dput(root); - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); error = -ENXIO; goto error_free_subvol_name; } @@ -529,7 +584,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) return -EINVAL; - ret = btrfs_cleanup_reloc_trees(root); + /* recover relocation */ + ret = btrfs_recover_relocation(root); WARN_ON(ret); ret = btrfs_cleanup_fs_roots(root->fs_info); @@ -589,14 +645,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - vol = kmalloc(sizeof(*vol), GFP_KERNEL); - if (!vol) - return -ENOMEM; - - if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { - ret = -EFAULT; - goto out; - } + vol = memdup_user((void __user *)arg, sizeof(*vol)); + if (IS_ERR(vol)) + return PTR_ERR(vol); switch (cmd) { case BTRFS_IOC_SCAN_DEV: @@ -604,7 +655,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, &btrfs_fs_type, &fs_devices); break; } -out: + kfree(vol); return ret; } @@ -628,9 +679,8 @@ static int btrfs_unfreeze(struct super_block *sb) static struct super_operations btrfs_super_ops = { .delete_inode = btrfs_delete_inode, .put_super = btrfs_put_super, - .write_super = btrfs_write_super, .sync_fs = btrfs_sync_fs, - .show_options = generic_show_options, + .show_options = btrfs_show_options, .write_inode = btrfs_write_inode, .dirty_inode = btrfs_dirty_inode, .alloc_inode = btrfs_alloc_inode, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 4112d53d4f4d..2dbf1c1f56ee 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -25,7 +25,6 @@ #include "disk-io.h" #include "transaction.h" #include "locking.h" -#include "ref-cache.h" #include "tree-log.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -53,8 +52,6 @@ static noinline int join_transaction(struct btrfs_root *root) GFP_NOFS); BUG_ON(!cur_trans); root->fs_info->generation++; - root->fs_info->last_alloc = 0; - root->fs_info->last_data_alloc = 0; cur_trans->num_writers = 1; cur_trans->num_joined = 0; cur_trans->transid = root->fs_info->generation; @@ -65,6 +62,15 @@ static noinline int join_transaction(struct btrfs_root *root) cur_trans->use_count = 1; cur_trans->commit_done = 0; cur_trans->start_time = get_seconds(); + + cur_trans->delayed_refs.root.rb_node = NULL; + cur_trans->delayed_refs.num_entries = 0; + cur_trans->delayed_refs.num_heads_ready = 0; + cur_trans->delayed_refs.num_heads = 0; + cur_trans->delayed_refs.flushing = 0; + cur_trans->delayed_refs.run_delayed_start = 0; + spin_lock_init(&cur_trans->delayed_refs.lock); + INIT_LIST_HEAD(&cur_trans->pending_snapshots); list_add_tail(&cur_trans->list, &root->fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, @@ -87,48 +93,40 @@ static noinline int join_transaction(struct btrfs_root *root) * to make sure the old root from before we joined the transaction is deleted * when the transaction commits */ -noinline int btrfs_record_root_in_trans(struct btrfs_root *root) +static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { - struct btrfs_dirty_root *dirty; - u64 running_trans_id = root->fs_info->running_transaction->transid; - if (root->ref_cows && root->last_trans < running_trans_id) { + if (root->ref_cows && root->last_trans < trans->transid) { WARN_ON(root == root->fs_info->extent_root); - if (root->root_item.refs != 0) { - radix_tree_tag_set(&root->fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - - dirty = kmalloc(sizeof(*dirty), GFP_NOFS); - BUG_ON(!dirty); - dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS); - BUG_ON(!dirty->root); - dirty->latest_root = root; - INIT_LIST_HEAD(&dirty->list); - - root->commit_root = btrfs_root_node(root); - - memcpy(dirty->root, root, sizeof(*root)); - spin_lock_init(&dirty->root->node_lock); - spin_lock_init(&dirty->root->list_lock); - mutex_init(&dirty->root->objectid_mutex); - mutex_init(&dirty->root->log_mutex); - INIT_LIST_HEAD(&dirty->root->dead_list); - dirty->root->node = root->commit_root; - dirty->root->commit_root = NULL; - - spin_lock(&root->list_lock); - list_add(&dirty->root->dead_list, &root->dead_list); - spin_unlock(&root->list_lock); - - root->dirty_root = dirty; - } else { - WARN_ON(1); - } - root->last_trans = running_trans_id; + WARN_ON(root->root_item.refs == 0); + WARN_ON(root->commit_root != root->node); + + radix_tree_tag_set(&root->fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + root->last_trans = trans->transid; + btrfs_init_reloc_root(trans, root); } return 0; } +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!root->ref_cows) + return 0; + + mutex_lock(&root->fs_info->trans_mutex); + if (root->last_trans == trans->transid) { + mutex_unlock(&root->fs_info->trans_mutex); + return 0; + } + + record_root_in_trans(trans, root); + mutex_unlock(&root->fs_info->trans_mutex); + return 0; +} + /* wait for commit against the current transaction to become unblocked * when this is done, it is safe to start a new transaction, but the current * transaction might not be fully on disk. @@ -174,7 +172,6 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, ret = join_transaction(root); BUG_ON(ret); - btrfs_record_root_in_trans(root); h->transid = root->fs_info->running_transaction->transid; h->transaction = root->fs_info->running_transaction; h->blocks_reserved = num_blocks; @@ -182,7 +179,10 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->block_group = 0; h->alloc_exclude_nr = 0; h->alloc_exclude_start = 0; + h->delayed_ref_updates = 0; + root->fs_info->running_transaction->use_count++; + record_root_in_trans(h, root); mutex_unlock(&root->fs_info->trans_mutex); return h; } @@ -224,6 +224,7 @@ static noinline int wait_for_commit(struct btrfs_root *root, return 0; } +#if 0 /* * rate limit against the drop_snapshot code. This helps to slow down new * operations if the drop_snapshot code isn't able to keep up. @@ -264,6 +265,7 @@ harder: goto harder; } } +#endif void btrfs_throttle(struct btrfs_root *root) { @@ -271,8 +273,6 @@ void btrfs_throttle(struct btrfs_root *root) if (!root->fs_info->open_ioctl_trans) wait_current_trans(root); mutex_unlock(&root->fs_info->trans_mutex); - - throttle_on_drops(root); } static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, @@ -280,6 +280,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_transaction *cur_trans; struct btrfs_fs_info *info = root->fs_info; + int count = 0; + + while (count < 4) { + unsigned long cur = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (cur && + trans->transaction->delayed_refs.num_heads_ready > 64) { + trans->delayed_ref_updates = 0; + + /* + * do a full flush if the transaction is trying + * to close + */ + if (trans->transaction->delayed_refs.flushing) + cur = 0; + btrfs_run_delayed_refs(trans, root, cur); + } else { + break; + } + count++; + } mutex_lock(&info->trans_mutex); cur_trans = info->running_transaction; @@ -294,9 +315,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, memset(trans, 0, sizeof(*trans)); kmem_cache_free(btrfs_trans_handle_cachep, trans); - if (throttle) - throttle_on_drops(root); - return 0; } @@ -424,50 +442,52 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, u64 old_root_bytenr; struct btrfs_root *tree_root = root->fs_info->tree_root; - btrfs_extent_post_op(trans, root); btrfs_write_dirty_block_groups(trans, root); - btrfs_extent_post_op(trans, root); + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); while (1) { old_root_bytenr = btrfs_root_bytenr(&root->root_item); if (old_root_bytenr == root->node->start) break; - btrfs_set_root_bytenr(&root->root_item, - root->node->start); - btrfs_set_root_level(&root->root_item, - btrfs_header_level(root->node)); - btrfs_set_root_generation(&root->root_item, trans->transid); - - btrfs_extent_post_op(trans, root); + btrfs_set_root_node(&root->root_item, root->node); ret = btrfs_update_root(trans, tree_root, &root->root_key, &root->root_item); BUG_ON(ret); btrfs_write_dirty_block_groups(trans, root); - btrfs_extent_post_op(trans, root); + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); } + free_extent_buffer(root->commit_root); + root->commit_root = btrfs_root_node(root); return 0; } /* * update all the cowonly tree roots on disk */ -int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct list_head *next; struct extent_buffer *eb; + int ret; - btrfs_extent_post_op(trans, fs_info->tree_root); + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); eb = btrfs_lock_root_node(fs_info->tree_root); - btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0); + btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb); btrfs_tree_unlock(eb); free_extent_buffer(eb); - btrfs_extent_post_op(trans, fs_info->tree_root); + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; @@ -475,6 +495,9 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, root = list_entry(next, struct btrfs_root, dirty_list); update_cowonly_root(trans, root); + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); } return 0; } @@ -484,118 +507,54 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, * a dirty root struct and adds it into the list of dead roots that need to * be deleted */ -int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest) +int btrfs_add_dead_root(struct btrfs_root *root) { - struct btrfs_dirty_root *dirty; - - dirty = kmalloc(sizeof(*dirty), GFP_NOFS); - if (!dirty) - return -ENOMEM; - dirty->root = root; - dirty->latest_root = latest; - mutex_lock(&root->fs_info->trans_mutex); - list_add(&dirty->list, &latest->fs_info->dead_roots); + list_add(&root->root_list, &root->fs_info->dead_roots); mutex_unlock(&root->fs_info->trans_mutex); return 0; } /* - * at transaction commit time we need to schedule the old roots for - * deletion via btrfs_drop_snapshot. This runs through all the - * reference counted roots that were modified in the current - * transaction and puts them into the drop list + * update all the cowonly tree roots on disk */ -static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, - struct radix_tree_root *radix, - struct list_head *list) +static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { - struct btrfs_dirty_root *dirty; struct btrfs_root *gang[8]; - struct btrfs_root *root; + struct btrfs_fs_info *fs_info = root->fs_info; int i; int ret; int err = 0; - u32 refs; while (1) { - ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0, + ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, + (void **)gang, 0, ARRAY_SIZE(gang), BTRFS_ROOT_TRANS_TAG); if (ret == 0) break; for (i = 0; i < ret; i++) { root = gang[i]; - radix_tree_tag_clear(radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - - BUG_ON(!root->ref_tree); - dirty = root->dirty_root; + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); btrfs_free_log(trans, root); - btrfs_free_reloc_root(trans, root); - - if (root->commit_root == root->node) { - WARN_ON(root->node->start != - btrfs_root_bytenr(&root->root_item)); + btrfs_update_reloc_root(trans, root); + if (root->commit_root != root->node) { free_extent_buffer(root->commit_root); - root->commit_root = NULL; - root->dirty_root = NULL; - - spin_lock(&root->list_lock); - list_del_init(&dirty->root->dead_list); - spin_unlock(&root->list_lock); - - kfree(dirty->root); - kfree(dirty); - - /* make sure to update the root on disk - * so we get any updates to the block used - * counts - */ - err = btrfs_update_root(trans, - root->fs_info->tree_root, - &root->root_key, - &root->root_item); - continue; + root->commit_root = btrfs_root_node(root); + btrfs_set_root_node(&root->root_item, + root->node); } - memset(&root->root_item.drop_progress, 0, - sizeof(struct btrfs_disk_key)); - root->root_item.drop_level = 0; - root->commit_root = NULL; - root->dirty_root = NULL; - root->root_key.offset = root->fs_info->generation; - btrfs_set_root_bytenr(&root->root_item, - root->node->start); - btrfs_set_root_level(&root->root_item, - btrfs_header_level(root->node)); - btrfs_set_root_generation(&root->root_item, - root->root_key.offset); - - err = btrfs_insert_root(trans, root->fs_info->tree_root, + err = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); if (err) break; - - refs = btrfs_root_refs(&dirty->root->root_item); - btrfs_set_root_refs(&dirty->root->root_item, refs - 1); - err = btrfs_update_root(trans, root->fs_info->tree_root, - &dirty->root->root_key, - &dirty->root->root_item); - - BUG_ON(err); - if (refs == 1) { - list_add(&dirty->list, list); - } else { - WARN_ON(1); - free_extent_buffer(dirty->root->node); - kfree(dirty->root); - kfree(dirty); - } } } return err; @@ -634,102 +593,96 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) return 0; } +#if 0 /* - * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on - * all of them + * when dropping snapshots, we generate a ton of delayed refs, and it makes + * sense not to join the transaction while it is trying to flush the current + * queue of delayed refs out. + * + * This is used by the drop snapshot code only */ -static noinline int drop_dirty_roots(struct btrfs_root *tree_root, - struct list_head *list) +static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) { - struct btrfs_dirty_root *dirty; - struct btrfs_trans_handle *trans; - unsigned long nr; - u64 num_bytes; - u64 bytes_used; - u64 max_useless; - int ret = 0; - int err; - - while (!list_empty(list)) { - struct btrfs_root *root; + DEFINE_WAIT(wait); - dirty = list_entry(list->prev, struct btrfs_dirty_root, list); - list_del_init(&dirty->list); + mutex_lock(&info->trans_mutex); + while (info->running_transaction && + info->running_transaction->delayed_refs.flushing) { + prepare_to_wait(&info->transaction_wait, &wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&info->trans_mutex); - num_bytes = btrfs_root_used(&dirty->root->root_item); - root = dirty->latest_root; - atomic_inc(&root->fs_info->throttles); + schedule(); - while (1) { - trans = btrfs_start_transaction(tree_root, 1); - mutex_lock(&root->fs_info->drop_mutex); - ret = btrfs_drop_snapshot(trans, dirty->root); - if (ret != -EAGAIN) - break; - mutex_unlock(&root->fs_info->drop_mutex); + mutex_lock(&info->trans_mutex); + finish_wait(&info->transaction_wait, &wait); + } + mutex_unlock(&info->trans_mutex); + return 0; +} - err = btrfs_update_root(trans, - tree_root, - &dirty->root->root_key, - &dirty->root->root_item); - if (err) - ret = err; - nr = trans->blocks_used; - ret = btrfs_end_transaction(trans, tree_root); - BUG_ON(ret); +/* + * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on + * all of them + */ +int btrfs_drop_dead_root(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = root->fs_info->tree_root; + unsigned long nr; + int ret; - btrfs_btree_balance_dirty(tree_root, nr); - cond_resched(); + while (1) { + /* + * we don't want to jump in and create a bunch of + * delayed refs if the transaction is starting to close + */ + wait_transaction_pre_flush(tree_root->fs_info); + trans = btrfs_start_transaction(tree_root, 1); + + /* + * we've joined a transaction, make sure it isn't + * closing right now + */ + if (trans->transaction->delayed_refs.flushing) { + btrfs_end_transaction(trans, tree_root); + continue; } - BUG_ON(ret); - atomic_dec(&root->fs_info->throttles); - wake_up(&root->fs_info->transaction_throttle); - num_bytes -= btrfs_root_used(&dirty->root->root_item); - bytes_used = btrfs_root_used(&root->root_item); - if (num_bytes) { - mutex_lock(&root->fs_info->trans_mutex); - btrfs_record_root_in_trans(root); - mutex_unlock(&root->fs_info->trans_mutex); - btrfs_set_root_used(&root->root_item, - bytes_used - num_bytes); - } + ret = btrfs_drop_snapshot(trans, root); + if (ret != -EAGAIN) + break; - ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key); - if (ret) { - BUG(); + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + &root->root_item); + if (ret) break; - } - mutex_unlock(&root->fs_info->drop_mutex); - - spin_lock(&root->list_lock); - list_del_init(&dirty->root->dead_list); - if (!list_empty(&root->dead_list)) { - struct btrfs_root *oldest; - oldest = list_entry(root->dead_list.prev, - struct btrfs_root, dead_list); - max_useless = oldest->root_key.offset - 1; - } else { - max_useless = root->root_key.offset - 1; - } - spin_unlock(&root->list_lock); nr = trans->blocks_used; ret = btrfs_end_transaction(trans, tree_root); BUG_ON(ret); - ret = btrfs_remove_leaf_refs(root, max_useless, 0); - BUG_ON(ret); - - free_extent_buffer(dirty->root->node); - kfree(dirty->root); - kfree(dirty); - btrfs_btree_balance_dirty(tree_root, nr); cond_resched(); } + BUG_ON(ret); + + ret = btrfs_del_root(trans, tree_root, &root->root_key); + BUG_ON(ret); + + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, tree_root); + BUG_ON(ret); + + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root); + + btrfs_btree_balance_dirty(tree_root, nr); return ret; } +#endif /* * new snapshots need to be created at a very specific time in the @@ -757,24 +710,23 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) goto fail; - btrfs_record_root_in_trans(root); + record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); key.objectid = objectid; - key.offset = trans->transid; + key.offset = 0; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); old = btrfs_lock_root_node(root); - btrfs_cow_block(trans, root, old, NULL, 0, &old, 0); + btrfs_cow_block(trans, root, old, NULL, 0, &old); + btrfs_set_lock_blocking(old); btrfs_copy_root(trans, root, old, &tmp, objectid); btrfs_tree_unlock(old); free_extent_buffer(old); - btrfs_set_root_bytenr(new_root_item, tmp->start); - btrfs_set_root_level(new_root_item, btrfs_header_level(tmp)); - btrfs_set_root_generation(new_root_item, trans->transid); + btrfs_set_root_node(new_root_item, tmp); ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, new_root_item); btrfs_tree_unlock(tmp); @@ -882,6 +834,24 @@ static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans, return 0; } +static void update_super_roots(struct btrfs_root *root) +{ + struct btrfs_root_item *root_item; + struct btrfs_super_block *super; + + super = &root->fs_info->super_copy; + + root_item = &root->fs_info->chunk_root->root_item; + super->chunk_root = root_item->bytenr; + super->chunk_root_generation = root_item->generation; + super->chunk_root_level = root_item->level; + + root_item = &root->fs_info->tree_root->root_item; + super->root = root_item->bytenr; + super->generation = root_item->generation; + super->root_level = root_item->level; +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -889,17 +859,34 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, unsigned long timeout = 1; struct btrfs_transaction *cur_trans; struct btrfs_transaction *prev_trans = NULL; - struct btrfs_root *chunk_root = root->fs_info->chunk_root; - struct list_head dirty_fs_roots; struct extent_io_tree *pinned_copy; DEFINE_WAIT(wait); int ret; + int should_grow = 0; + unsigned long now = get_seconds(); + int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); + + btrfs_run_ordered_operations(root, 0); + + /* make a pass through all the delayed refs we have so far + * any runnings procs may add more while we are here + */ + ret = btrfs_run_delayed_refs(trans, root, 0); + BUG_ON(ret); + + cur_trans = trans->transaction; + /* + * set the flushing flag so procs in this transaction have to + * start sending their work down. + */ + cur_trans->delayed_refs.flushing = 1; + + ret = btrfs_run_delayed_refs(trans, root, 0); + BUG_ON(ret); - INIT_LIST_HEAD(&dirty_fs_roots); mutex_lock(&root->fs_info->trans_mutex); - if (trans->transaction->in_commit) { - cur_trans = trans->transaction; - trans->transaction->use_count++; + if (cur_trans->in_commit) { + cur_trans->use_count++; mutex_unlock(&root->fs_info->trans_mutex); btrfs_end_transaction(trans, root); @@ -922,7 +909,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, trans->transaction->in_commit = 1; trans->transaction->blocked = 1; - cur_trans = trans->transaction; if (cur_trans->list.prev != &root->fs_info->trans_list) { prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); @@ -937,6 +923,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } } + if (now < cur_trans->start_time || now - cur_trans->start_time < 1) + should_grow = 1; + do { int snap_pending = 0; joined = cur_trans->num_joined; @@ -949,26 +938,42 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (cur_trans->num_writers > 1) timeout = MAX_SCHEDULE_TIMEOUT; - else + else if (should_grow) timeout = 1; mutex_unlock(&root->fs_info->trans_mutex); - if (snap_pending) { + if (flush_on_commit || snap_pending) { + if (flush_on_commit) + btrfs_start_delalloc_inodes(root); ret = btrfs_wait_ordered_extents(root, 1); BUG_ON(ret); } - schedule_timeout(timeout); + /* + * rename don't use btrfs_join_transaction, so, once we + * set the transaction to blocked above, we aren't going + * to get any new ordered operations. We can safely run + * it here and no for sure that nothing new will be added + * to the list + */ + btrfs_run_ordered_operations(root, 1); + + smp_mb(); + if (cur_trans->num_writers > 1 || should_grow) + schedule_timeout(timeout); mutex_lock(&root->fs_info->trans_mutex); finish_wait(&cur_trans->writer_wait, &wait); } while (cur_trans->num_writers > 1 || - (cur_trans->num_joined != joined)); + (should_grow && cur_trans->num_joined != joined)); ret = create_pending_snapshots(trans, root->fs_info); BUG_ON(ret); + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); + WARN_ON(cur_trans != trans->transaction); /* btrfs_commit_tree_roots is responsible for getting the @@ -985,41 +990,36 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * with the tree-log code. */ mutex_lock(&root->fs_info->tree_log_mutex); - /* - * keep tree reloc code from adding new reloc trees - */ - mutex_lock(&root->fs_info->tree_reloc_mutex); - - ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix, - &dirty_fs_roots); + ret = commit_fs_roots(trans, root); BUG_ON(ret); - /* add_dirty_roots gets rid of all the tree log roots, it is now + /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots */ btrfs_free_log_root_tree(trans, root->fs_info); - ret = btrfs_commit_tree_roots(trans, root); + ret = commit_cowonly_roots(trans, root); BUG_ON(ret); cur_trans = root->fs_info->running_transaction; spin_lock(&root->fs_info->new_trans_lock); root->fs_info->running_transaction = NULL; spin_unlock(&root->fs_info->new_trans_lock); - btrfs_set_super_generation(&root->fs_info->super_copy, - cur_trans->transid); - btrfs_set_super_root(&root->fs_info->super_copy, - root->fs_info->tree_root->node->start); - btrfs_set_super_root_level(&root->fs_info->super_copy, - btrfs_header_level(root->fs_info->tree_root->node)); - - btrfs_set_super_chunk_root(&root->fs_info->super_copy, - chunk_root->node->start); - btrfs_set_super_chunk_root_level(&root->fs_info->super_copy, - btrfs_header_level(chunk_root->node)); - btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy, - btrfs_header_generation(chunk_root->node)); + + btrfs_set_root_node(&root->fs_info->tree_root->root_item, + root->fs_info->tree_root->node); + free_extent_buffer(root->fs_info->tree_root->commit_root); + root->fs_info->tree_root->commit_root = + btrfs_root_node(root->fs_info->tree_root); + + btrfs_set_root_node(&root->fs_info->chunk_root->root_item, + root->fs_info->chunk_root->node); + free_extent_buffer(root->fs_info->chunk_root->commit_root); + root->fs_info->chunk_root->commit_root = + btrfs_root_node(root->fs_info->chunk_root); + + update_super_roots(root); if (!root->fs_info->log_root_recovering) { btrfs_set_super_log_root(&root->fs_info->super_copy, 0); @@ -1032,7 +1032,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_copy_pinned(root, pinned_copy); trans->transaction->blocked = 0; - wake_up(&root->fs_info->transaction_throttle); + wake_up(&root->fs_info->transaction_wait); mutex_unlock(&root->fs_info->trans_mutex); @@ -1049,31 +1049,22 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root, pinned_copy); kfree(pinned_copy); - btrfs_drop_dead_reloc_roots(root); - mutex_unlock(&root->fs_info->tree_reloc_mutex); - /* do the directory inserts of any pending snapshot creations */ finish_pending_snapshots(trans, root->fs_info); mutex_lock(&root->fs_info->trans_mutex); cur_trans->commit_done = 1; + root->fs_info->last_trans_committed = cur_trans->transid; wake_up(&cur_trans->commit_wait); put_transaction(cur_trans); put_transaction(cur_trans); - list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots); - if (root->fs_info->closing) - list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots); - mutex_unlock(&root->fs_info->trans_mutex); kmem_cache_free(btrfs_trans_handle_cachep, trans); - - if (root->fs_info->closing) - drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots); return ret; } @@ -1082,16 +1073,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ int btrfs_clean_old_snapshots(struct btrfs_root *root) { - struct list_head dirty_roots; - INIT_LIST_HEAD(&dirty_roots); -again: - mutex_lock(&root->fs_info->trans_mutex); - list_splice_init(&root->fs_info->dead_roots, &dirty_roots); - mutex_unlock(&root->fs_info->trans_mutex); + LIST_HEAD(list); + struct btrfs_fs_info *fs_info = root->fs_info; + + mutex_lock(&fs_info->trans_mutex); + list_splice_init(&fs_info->dead_roots, &list); + mutex_unlock(&fs_info->trans_mutex); - if (!list_empty(&dirty_roots)) { - drop_dirty_roots(root, &dirty_roots); - goto again; + while (!list_empty(&list)) { + root = list_entry(list.next, struct btrfs_root, root_list); + list_del_init(&root->root_list); + btrfs_drop_snapshot(root, 0); } return 0; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index ea292117f882..961c3ee5a2e1 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -19,10 +19,16 @@ #ifndef __BTRFS_TRANSACTION__ #define __BTRFS_TRANSACTION__ #include "btrfs_inode.h" +#include "delayed-ref.h" struct btrfs_transaction { u64 transid; + /* + * total writers in this transaction, it must be zero before the + * transaction can end + */ unsigned long num_writers; + unsigned long num_joined; int in_commit; int use_count; @@ -34,6 +40,7 @@ struct btrfs_transaction { wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; struct list_head pending_snapshots; + struct btrfs_delayed_ref_root delayed_refs; }; struct btrfs_trans_handle { @@ -44,6 +51,7 @@ struct btrfs_trans_handle { u64 block_group; u64 alloc_exclude_start; u64 alloc_exclude_nr; + unsigned long delayed_ref_updates; }; struct btrfs_pending_snapshot { @@ -54,12 +62,6 @@ struct btrfs_pending_snapshot { struct list_head list; }; -struct btrfs_dirty_root { - struct list_head list; - struct btrfs_root *root; - struct btrfs_root *latest_root; -}; - static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, struct inode *inode) { @@ -92,7 +94,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest); +int btrfs_add_dead_root(struct btrfs_root *root); +int btrfs_drop_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); int btrfs_clean_old_snapshots(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, @@ -100,7 +103,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_throttle(struct btrfs_root *root); -int btrfs_record_root_in_trans(struct btrfs_root *root); +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages); #endif diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 98d25fa4570e..b10eacdb1620 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -124,8 +124,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, } btrfs_release_path(root, path); - if (is_extent) - btrfs_extent_post_op(trans, root); out: if (path) btrfs_free_path(path); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9c462fbd60fa..c13922206d1b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -35,6 +35,49 @@ #define LOG_INODE_EXISTS 1 /* + * directory trouble cases + * + * 1) on rename or unlink, if the inode being unlinked isn't in the fsync + * log, we must force a full commit before doing an fsync of the directory + * where the unlink was done. + * ---> record transid of last unlink/rename per directory + * + * mkdir foo/some_dir + * normal commit + * rename foo/some_dir foo2/some_dir + * mkdir foo/some_dir + * fsync foo/some_dir/some_file + * + * The fsync above will unlink the original some_dir without recording + * it in its new location (foo2). After a crash, some_dir will be gone + * unless the fsync of some_file forces a full commit + * + * 2) we must log any new names for any file or dir that is in the fsync + * log. ---> check inode while renaming/linking. + * + * 2a) we must log any new names for any file or dir during rename + * when the directory they are being removed from was logged. + * ---> check inode and old parent dir during rename + * + * 2a is actually the more important variant. With the extra logging + * a crash might unlink the old name without recreating the new one + * + * 3) after a crash, we must go through any directories with a link count + * of zero and redo the rm -rf + * + * mkdir f1/foo + * normal commit + * rm -rf f1/foo + * fsync(f1) + * + * The directory f1 was fully removed from the FS, but fsync was never + * called on f1, only its parent dir. After a crash the rm -rf must + * be replayed. This must be able to recurse down the entire + * directory tree. The inode link count fixup code takes care of the + * ugly details. + */ + +/* * stages for the tree walking. The first * stage (0) is to only pin down the blocks we find * the second stage (1) is to make sure that all the inodes @@ -47,12 +90,17 @@ #define LOG_WALK_REPLAY_INODES 1 #define LOG_WALK_REPLAY_ALL 2 -static int __btrfs_log_inode(struct btrfs_trans_handle *trans, +static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, int inode_only); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); +static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dirid, int del_all); /* * tree logging is a special write ahead log used to make sure that @@ -133,10 +181,25 @@ static int join_running_log_trans(struct btrfs_root *root) } /* + * This either makes the current running log transaction wait + * until you call btrfs_end_log_trans() or it makes any future + * log transactions wait until you call btrfs_end_log_trans() + */ +int btrfs_pin_log_trans(struct btrfs_root *root) +{ + int ret = -ENOENT; + + mutex_lock(&root->log_mutex); + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); + return ret; +} + +/* * indicate we're done making changes to the log tree * and wake up anyone waiting to do a sync */ -static int end_log_trans(struct btrfs_root *root) +int btrfs_end_log_trans(struct btrfs_root *root) { if (atomic_dec_and_test(&root->log_writers)) { smp_mb(); @@ -199,12 +262,9 @@ static int process_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct walk_control *wc, u64 gen) { - if (wc->pin) { - mutex_lock(&log->fs_info->pinned_mutex); + if (wc->pin) btrfs_update_pinned_extents(log->fs_info->extent_root, eb->start, eb->len, 1); - mutex_unlock(&log->fs_info->pinned_mutex); - } if (btrfs_buffer_uptodate(eb, gen)) { if (wc->write) @@ -370,18 +430,16 @@ no_copy: static noinline struct inode *read_one_inode(struct btrfs_root *root, u64 objectid) { + struct btrfs_key key; struct inode *inode; - inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); - if (inode->i_state & I_NEW) { - BTRFS_I(inode)->root = root; - BTRFS_I(inode)->location.objectid = objectid; - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.offset = 0; - btrfs_read_locked_inode(inode); - unlock_new_inode(inode); - } - if (is_bad_inode(inode)) { + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &key, root); + if (IS_ERR(inode)) { + inode = NULL; + } else if (is_bad_inode(inode)) { iput(inode); inode = NULL; } @@ -476,11 +534,12 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ ret = btrfs_drop_extents(trans, root, inode, - start, extent_end, start, &alloc_hint); + start, extent_end, extent_end, start, &alloc_hint); BUG_ON(ret); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 offset; unsigned long dest_offset; struct btrfs_key ins; @@ -495,6 +554,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); ins.type = BTRFS_EXTENT_ITEM_KEY; + offset = key->offset - btrfs_file_extent_offset(eb, item); if (ins.objectid > 0) { u64 csum_start; @@ -509,19 +569,16 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (ret == 0) { ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, - path->nodes[0]->start, - root->root_key.objectid, - trans->transid, key->objectid); + 0, root->root_key.objectid, + key->objectid, offset); } else { /* * insert the extent pointer in the extent * allocation tree */ - ret = btrfs_alloc_logged_extent(trans, root, - path->nodes[0]->start, - root->root_key.objectid, - trans->transid, key->objectid, - &ins); + ret = btrfs_alloc_logged_file_extent(trans, + root, root->root_key.objectid, + key->objectid, offset, &ins); BUG_ON(ret); } btrfs_release_path(root, path); @@ -603,6 +660,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, ret = link_to_fixup_dir(trans, root, path, location.objectid); BUG_ON(ret); + ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); BUG_ON(ret); kfree(name); @@ -804,6 +862,7 @@ conflict_again: victim_name_len)) { btrfs_inc_nlink(inode); btrfs_release_path(root, path); + ret = btrfs_unlink_inode(trans, root, dir, inode, victim_name, victim_name_len); @@ -922,13 +981,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, key.offset--; btrfs_release_path(root, path); } - btrfs_free_path(path); + btrfs_release_path(root, path); if (nlink != inode->i_nlink) { inode->i_nlink = nlink; btrfs_update_inode(trans, root, inode); } BTRFS_I(inode)->index_cnt = (u64)-1; + if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) { + ret = replay_dir_deletes(trans, root, NULL, path, + inode->i_ino, 1); + BUG_ON(ret); + } + btrfs_free_path(path); + return 0; } @@ -971,9 +1037,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, iput(inode); - if (key.offset == 0) - break; - key.offset--; + /* + * fixup on a directory may create new entries, + * make sure we always look for the highset possible + * offset + */ + key.offset = (u64)-1; } btrfs_release_path(root, path); return 0; @@ -1150,8 +1219,7 @@ insert: ret = insert_one_name(trans, root, path, key->objectid, key->offset, name, name_len, log_type, &log_key); - if (ret && ret != -ENOENT) - BUG(); + BUG_ON(ret && ret != -ENOENT); goto out; } @@ -1313,11 +1381,11 @@ again: read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); log_di = NULL; - if (dir_key->type == BTRFS_DIR_ITEM_KEY) { + if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { log_di = btrfs_lookup_dir_item(trans, log, log_path, dir_key->objectid, name, name_len, 0); - } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { + } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { log_di = btrfs_lookup_dir_index_item(trans, log, log_path, dir_key->objectid, @@ -1378,7 +1446,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, - u64 dirid) + u64 dirid, int del_all) { u64 range_start; u64 range_end; @@ -1408,10 +1476,14 @@ again: range_start = 0; range_end = 0; while (1) { - ret = find_dir_range(log, path, dirid, key_type, - &range_start, &range_end); - if (ret != 0) - break; + if (del_all) + range_end = (u64)-1; + else { + ret = find_dir_range(log, path, dirid, key_type, + &range_start, &range_end); + if (ret != 0) + break; + } dir_key.offset = range_start; while (1) { @@ -1437,7 +1509,8 @@ again: break; ret = check_item_in_log(trans, root, log, path, - log_path, dir, &found_key); + log_path, dir, + &found_key); BUG_ON(ret); if (found_key.offset == (u64)-1) break; @@ -1514,7 +1587,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, mode = btrfs_inode_mode(eb, inode_item); if (S_ISDIR(mode)) { ret = replay_dir_deletes(wc->trans, - root, log, path, key.objectid); + root, log, path, key.objectid, 0); BUG_ON(ret); } ret = overwrite_item(wc->trans, root, path, @@ -1533,6 +1606,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); BUG_ON(ret); + + /* if the nlink count is zero here, the iput + * will free the inode. We bump it to make + * sure it doesn't get freed until the link + * count fixup is done + */ + if (inode->i_nlink == 0) { + btrfs_inc_nlink(inode); + btrfs_update_inode(wc->trans, + root, inode); + } iput(inode); } ret = link_to_fixup_dir(wc->trans, root, @@ -1619,9 +1703,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); - ret = btrfs_drop_leaf_ref(trans, root, next); - BUG_ON(ret); - WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(root, @@ -1666,10 +1747,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); - if (*level == 0) { - ret = btrfs_drop_leaf_ref(trans, root, next); - BUG_ON(ret); - } WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(root, bytenr, blocksize); BUG_ON(ret); @@ -1724,12 +1801,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); - if (*level == 0) { - ret = btrfs_drop_leaf_ref(trans, root, - next); - BUG_ON(ret); - } - WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(root, path->nodes[*level]->start, @@ -1797,11 +1868,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); - if (orig_level == 0) { - ret = btrfs_drop_leaf_ref(trans, log, - next); - BUG_ON(ret); - } WARN_ON(log->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(log, next->start, @@ -1840,7 +1906,8 @@ static int update_log_root(struct btrfs_trans_handle *trans, return ret; } -static int wait_log_commit(struct btrfs_root *root, unsigned long transid) +static int wait_log_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long transid) { DEFINE_WAIT(wait); int index = transid % 2; @@ -1854,9 +1921,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid) prepare_to_wait(&root->log_commit_wait[index], &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (root->log_transid < transid + 2 && + + if (root->fs_info->last_trans_log_full_commit != + trans->transid && root->log_transid < transid + 2 && atomic_read(&root->log_commit[index])) schedule(); + finish_wait(&root->log_commit_wait[index], &wait); mutex_lock(&root->log_mutex); } while (root->log_transid < transid + 2 && @@ -1864,14 +1934,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid) return 0; } -static int wait_for_writer(struct btrfs_root *root) +static int wait_for_writer(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { DEFINE_WAIT(wait); while (atomic_read(&root->log_writers)) { prepare_to_wait(&root->log_writer_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (atomic_read(&root->log_writers)) + if (root->fs_info->last_trans_log_full_commit != + trans->transid && atomic_read(&root->log_writers)) schedule(); mutex_lock(&root->log_mutex); finish_wait(&root->log_writer_wait, &wait); @@ -1882,7 +1954,14 @@ static int wait_for_writer(struct btrfs_root *root) /* * btrfs_sync_log does sends a given tree log down to the disk and * updates the super blocks to record it. When this call is done, - * you know that any inodes previously logged are safely on disk + * you know that any inodes previously logged are safely on disk only + * if it returns 0. + * + * Any other return value means you need to call btrfs_commit_transaction. + * Some of the edge cases for fsyncing directories that have had unlinks + * or renames done in the past mean that sometimes the only safe + * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, + * that has happened. */ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -1896,7 +1975,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); index1 = root->log_transid % 2; if (atomic_read(&root->log_commit[index1])) { - wait_log_commit(root, root->log_transid); + wait_log_commit(trans, root, root->log_transid); mutex_unlock(&root->log_mutex); return 0; } @@ -1904,24 +1983,30 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) - wait_log_commit(root, root->log_transid - 1); + wait_log_commit(trans, root, root->log_transid - 1); while (1) { unsigned long batch = root->log_batch; mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); mutex_lock(&root->log_mutex); - wait_for_writer(root); + + wait_for_writer(trans, root); if (batch == root->log_batch) break; } + /* bail out if we need to do a full commit */ + if (root->fs_info->last_trans_log_full_commit == trans->transid) { + ret = -EAGAIN; + mutex_unlock(&root->log_mutex); + goto out; + } + ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); BUG_ON(ret); - btrfs_set_root_bytenr(&log->root_item, log->node->start); - btrfs_set_root_generation(&log->root_item, trans->transid); - btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); + btrfs_set_root_node(&log->root_item, log->node); root->log_batch = 0; root->log_transid++; @@ -1951,16 +2036,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { - wait_log_commit(log_root_tree, log_root_tree->log_transid); + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out; } atomic_set(&log_root_tree->log_commit[index2], 1); - if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) - wait_log_commit(log_root_tree, log_root_tree->log_transid - 1); + if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid - 1); + } + + wait_for_writer(trans, log_root_tree); - wait_for_writer(log_root_tree); + /* + * now that we've moved on to the tree of log tree roots, + * check the full commit flag again + */ + if (root->fs_info->last_trans_log_full_commit == trans->transid) { + mutex_unlock(&log_root_tree->log_mutex); + ret = -EAGAIN; + goto out_wake_log_root; + } ret = btrfs_write_and_wait_marked_extents(log_root_tree, &log_root_tree->dirty_log_pages); @@ -1985,7 +2083,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * in and cause problems either. */ write_ctree_super(trans, root->fs_info->tree_root, 2); + ret = 0; +out_wake_log_root: atomic_set(&log_root_tree->log_commit[index2], 0); smp_mb(); if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) @@ -1998,7 +2098,8 @@ out: return 0; } -/* * free all the extents used by the tree log. This should be called +/* + * free all the extents used by the tree log. This should be called * at commit time of the full transaction */ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -2132,7 +2233,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, btrfs_free_path(path); mutex_unlock(&BTRFS_I(dir)->log_mutex); - end_log_trans(root); + btrfs_end_log_trans(root); return 0; } @@ -2159,7 +2260,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, dirid, &index); mutex_unlock(&BTRFS_I(inode)->log_mutex); - end_log_trans(root); + btrfs_end_log_trans(root); return ret; } @@ -2457,7 +2558,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, ins_keys, ins_sizes, nr); BUG_ON(ret); - for (i = 0; i < nr; i++) { + for (i = 0; i < nr; i++, dst_path->slots[0]++) { dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_path->slots[0]); @@ -2493,36 +2594,31 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, found_type = btrfs_file_extent_type(src, extent); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { - u64 ds = btrfs_file_extent_disk_bytenr(src, - extent); - u64 dl = btrfs_file_extent_disk_num_bytes(src, - extent); - u64 cs = btrfs_file_extent_offset(src, extent); - u64 cl = btrfs_file_extent_num_bytes(src, - extent);; + u64 ds, dl, cs, cl; + ds = btrfs_file_extent_disk_bytenr(src, + extent); + /* ds == 0 is a hole */ + if (ds == 0) + continue; + + dl = btrfs_file_extent_disk_num_bytes(src, + extent); + cs = btrfs_file_extent_offset(src, extent); + cl = btrfs_file_extent_num_bytes(src, + extent);; if (btrfs_file_extent_compression(src, extent)) { cs = 0; cl = dl; } - /* ds == 0 is a hole */ - if (ds != 0) { - ret = btrfs_inc_extent_ref(trans, log, - ds, dl, - dst_path->nodes[0]->start, - BTRFS_TREE_LOG_OBJECTID, - trans->transid, - ins_keys[i].objectid); - BUG_ON(ret); - ret = btrfs_lookup_csums_range( - log->fs_info->csum_root, - ds + cs, ds + cs + cl - 1, - &ordered_sums); - BUG_ON(ret); - } + + ret = btrfs_lookup_csums_range( + log->fs_info->csum_root, + ds + cs, ds + cs + cl - 1, + &ordered_sums); + BUG_ON(ret); } } - dst_path->slots[0]++; } btrfs_mark_buffer_dirty(dst_path->nodes[0]); @@ -2559,7 +2655,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * * This handles both files and directories. */ -static int __btrfs_log_inode(struct btrfs_trans_handle *trans, +static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, int inode_only) { @@ -2585,28 +2681,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans, min_key.offset = 0; max_key.objectid = inode->i_ino; + + /* today the code can only do partial logging of directories */ + if (!S_ISDIR(inode->i_mode)) + inode_only = LOG_INODE_ALL; + if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) max_key.type = BTRFS_XATTR_ITEM_KEY; else max_key.type = (u8)-1; max_key.offset = (u64)-1; - /* - * if this inode has already been logged and we're in inode_only - * mode, we don't want to delete the things that have already - * been written to the log. - * - * But, if the inode has been through an inode_only log, - * the logged_trans field is not set. This allows us to catch - * any new names for this inode in the backrefs by logging it - * again - */ - if (inode_only == LOG_INODE_EXISTS && - BTRFS_I(inode)->logged_trans == trans->transid) { - btrfs_free_path(path); - btrfs_free_path(dst_path); - goto out; - } mutex_lock(&BTRFS_I(inode)->log_mutex); /* @@ -2693,7 +2778,6 @@ next_slot: if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { btrfs_release_path(root, path); btrfs_release_path(log, dst_path); - BTRFS_I(inode)->log_dirty_trans = 0; ret = log_directory_changes(trans, root, inode, path, dst_path); BUG_ON(ret); } @@ -2702,19 +2786,69 @@ next_slot: btrfs_free_path(path); btrfs_free_path(dst_path); -out: return 0; } -int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - int inode_only) +/* + * follow the dentry parent pointers up the chain and see if any + * of the directories in it require a full commit before they can + * be logged. Returns zero if nothing special needs to be done or 1 if + * a full commit is required. + */ +static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, + struct inode *inode, + struct dentry *parent, + struct super_block *sb, + u64 last_committed) { - int ret; + int ret = 0; + struct btrfs_root *root; - start_log_trans(trans, root); - ret = __btrfs_log_inode(trans, root, inode, inode_only); - end_log_trans(root); + /* + * for regular files, if its inode is already on disk, we don't + * have to worry about the parents at all. This is because + * we can use the last_unlink_trans field to record renames + * and other fun in this file. + */ + if (S_ISREG(inode->i_mode) && + BTRFS_I(inode)->generation <= last_committed && + BTRFS_I(inode)->last_unlink_trans <= last_committed) + goto out; + + if (!S_ISDIR(inode->i_mode)) { + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + goto out; + inode = parent->d_inode; + } + + while (1) { + BTRFS_I(inode)->logged_trans = trans->transid; + smp_mb(); + + if (BTRFS_I(inode)->last_unlink_trans > last_committed) { + root = BTRFS_I(inode)->root; + + /* + * make sure any commits to the log are forced + * to be full commits + */ + root->fs_info->last_trans_log_full_commit = + trans->transid; + ret = 1; + break; + } + + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + break; + + if (parent == sb->s_root) + break; + + parent = parent->d_parent; + inode = parent->d_inode; + + } +out: return ret; } @@ -2724,31 +2858,70 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans, * only logging is done of any parent directories that are older than * the last committed transaction */ -int btrfs_log_dentry(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry) +int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct dentry *parent, int exists_only) { - int inode_only = LOG_INODE_ALL; + int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; struct super_block *sb; - int ret; + int ret = 0; + u64 last_committed = root->fs_info->last_trans_committed; + + sb = inode->i_sb; + + if (btrfs_test_opt(root, NOTREELOG)) { + ret = 1; + goto end_no_trans; + } + + if (root->fs_info->last_trans_log_full_commit > + root->fs_info->last_trans_committed) { + ret = 1; + goto end_no_trans; + } + + ret = check_parent_dirs_for_sync(trans, inode, parent, + sb, last_committed); + if (ret) + goto end_no_trans; start_log_trans(trans, root); - sb = dentry->d_inode->i_sb; - while (1) { - ret = __btrfs_log_inode(trans, root, dentry->d_inode, - inode_only); - BUG_ON(ret); - inode_only = LOG_INODE_EXISTS; - dentry = dentry->d_parent; - if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) + ret = btrfs_log_inode(trans, root, inode, inode_only); + BUG_ON(ret); + + /* + * for regular files, if its inode is already on disk, we don't + * have to worry about the parents at all. This is because + * we can use the last_unlink_trans field to record renames + * and other fun in this file. + */ + if (S_ISREG(inode->i_mode) && + BTRFS_I(inode)->generation <= last_committed && + BTRFS_I(inode)->last_unlink_trans <= last_committed) + goto no_parent; + + inode_only = LOG_INODE_EXISTS; + while (1) { + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) break; - if (BTRFS_I(dentry->d_inode)->generation <= - root->fs_info->last_trans_committed) + inode = parent->d_inode; + if (BTRFS_I(inode)->generation > + root->fs_info->last_trans_committed) { + ret = btrfs_log_inode(trans, root, inode, inode_only); + BUG_ON(ret); + } + if (parent == sb->s_root) break; + + parent = parent->d_parent; } - end_log_trans(root); - return 0; +no_parent: + ret = 0; + btrfs_end_log_trans(root); +end_no_trans: + return ret; } /* @@ -2760,12 +2933,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans, int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry) { - u64 gen; - gen = root->fs_info->last_trans_new_blockgroup; - if (gen > root->fs_info->last_trans_committed) - return 1; - else - return btrfs_log_dentry(trans, root, dentry); + return btrfs_log_inode_parent(trans, root, dentry->d_inode, + dentry->d_parent, 0); } /* @@ -2832,9 +3001,7 @@ again: BUG_ON(!wc.replay_dest); wc.replay_dest->log_root = log; - mutex_lock(&fs_info->trans_mutex); - btrfs_record_root_in_trans(wc.replay_dest); - mutex_unlock(&fs_info->trans_mutex); + btrfs_record_root_in_trans(trans, wc.replay_dest); ret = walk_log_tree(trans, log, &wc); BUG_ON(ret); @@ -2852,6 +3019,7 @@ again: key.offset = found_key.offset - 1; wc.replay_dest->log_root = NULL; free_extent_buffer(log->node); + free_extent_buffer(log->commit_root); kfree(log); if (found_key.offset == 0) @@ -2884,3 +3052,94 @@ again: kfree(log_root_tree); return 0; } + +/* + * there are some corner cases where we want to force a full + * commit instead of allowing a directory to be logged. + * + * They revolve around files there were unlinked from the directory, and + * this function updates the parent directory so that a full commit is + * properly done if it is fsync'd later after the unlinks are done. + */ +void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, + struct inode *dir, struct inode *inode, + int for_rename) +{ + /* + * when we're logging a file, if it hasn't been renamed + * or unlinked, and its inode is fully committed on disk, + * we don't have to worry about walking up the directory chain + * to log its parents. + * + * So, we use the last_unlink_trans field to put this transid + * into the file. When the file is logged we check it and + * don't log the parents if the file is fully on disk. + */ + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->last_unlink_trans = trans->transid; + + /* + * if this directory was already logged any new + * names for this file/dir will get recorded + */ + smp_mb(); + if (BTRFS_I(dir)->logged_trans == trans->transid) + return; + + /* + * if the inode we're about to unlink was logged, + * the log will be properly updated for any new names + */ + if (BTRFS_I(inode)->logged_trans == trans->transid) + return; + + /* + * when renaming files across directories, if the directory + * there we're unlinking from gets fsync'd later on, there's + * no way to find the destination directory later and fsync it + * properly. So, we have to be conservative and force commits + * so the new name gets discovered. + */ + if (for_rename) + goto record; + + /* we can safely do the unlink without any special recording */ + return; + +record: + BTRFS_I(dir)->last_unlink_trans = trans->transid; +} + +/* + * Call this after adding a new name for a file and it will properly + * update the log to reflect the new name. + * + * It will return zero if all goes well, and it will return 1 if a + * full transaction commit is required. + */ +int btrfs_log_new_name(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *old_dir, + struct dentry *parent) +{ + struct btrfs_root * root = BTRFS_I(inode)->root; + + /* + * this will force the logging code to walk the dentry chain + * up for the file + */ + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->last_unlink_trans = trans->transid; + + /* + * if this inode hasn't been logged and directory we're renaming it + * from hasn't been logged, we don't need to log it + */ + if (BTRFS_I(inode)->logged_trans <= + root->fs_info->last_trans_committed && + (!old_dir || BTRFS_I(old_dir)->logged_trans <= + root->fs_info->last_trans_committed)) + return 0; + + return btrfs_log_inode_parent(trans, root, inode, parent, 1); +} + diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index b9409b32ed02..d09c7609e16b 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -22,14 +22,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_log_dentry(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry); int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry); -int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - int inode_only); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, @@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, struct inode *inode, u64 dirid); +int btrfs_join_running_log_trans(struct btrfs_root *root); +int btrfs_end_log_trans(struct btrfs_root *root); +int btrfs_pin_log_trans(struct btrfs_root *root); +int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct dentry *parent, int exists_only); +void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, + struct inode *dir, struct inode *inode, + int for_rename); +int btrfs_log_new_name(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *old_dir, + struct dentry *parent); #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index dd06e18e5aac..3ab80e9cd767 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -20,6 +20,7 @@ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/random.h> +#include <linux/iocontext.h> #include <asm/div64.h> #include "compat.h" #include "ctree.h" @@ -124,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) return NULL; } +static void requeue_list(struct btrfs_pending_bios *pending_bios, + struct bio *head, struct bio *tail) +{ + + struct bio *old_head; + + old_head = pending_bios->head; + pending_bios->head = head; + if (pending_bios->tail) + tail->bi_next = old_head; + else + pending_bios->tail = tail; +} + /* * we try to collect pending bios for a device so we don't get a large * number of procs sending bios down to the same device. This greatly @@ -140,31 +155,49 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) struct bio *pending; struct backing_dev_info *bdi; struct btrfs_fs_info *fs_info; + struct btrfs_pending_bios *pending_bios; struct bio *tail; struct bio *cur; int again = 0; - unsigned long num_run = 0; + unsigned long num_run; + unsigned long num_sync_run; + unsigned long batch_run = 0; unsigned long limit; + unsigned long last_waited = 0; + int force_reg = 0; - bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; + bdi = blk_get_backing_dev_info(device->bdev); fs_info = device->dev_root->fs_info; limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; + /* we want to make sure that every time we switch from the sync + * list to the normal list, we unplug + */ + num_sync_run = 0; + loop: spin_lock(&device->io_lock); loop_lock: + num_run = 0; + /* take all the bios off the list at once and process them * later on (without the lock held). But, remember the * tail and other pointers so the bios can be properly reinserted * into the list if we hit congestion */ - pending = device->pending_bios; - tail = device->pending_bio_tail; + if (!force_reg && device->pending_sync_bios.head) { + pending_bios = &device->pending_sync_bios; + force_reg = 1; + } else { + pending_bios = &device->pending_bios; + force_reg = 0; + } + + pending = pending_bios->head; + tail = pending_bios->tail; WARN_ON(pending && !tail); - device->pending_bios = NULL; - device->pending_bio_tail = NULL; /* * if pending was null this time around, no bios need processing @@ -174,16 +207,45 @@ loop_lock: * device->running_pending is used to synchronize with the * schedule_bio code. */ - if (pending) { - again = 1; - device->running_pending = 1; - } else { + if (device->pending_sync_bios.head == NULL && + device->pending_bios.head == NULL) { again = 0; device->running_pending = 0; + } else { + again = 1; + device->running_pending = 1; } + + pending_bios->head = NULL; + pending_bios->tail = NULL; + spin_unlock(&device->io_lock); + /* + * if we're doing the regular priority list, make sure we unplug + * for any high prio bios we've sent down + */ + if (pending_bios == &device->pending_bios && num_sync_run > 0) { + num_sync_run = 0; + blk_run_backing_dev(bdi, NULL); + } + while (pending) { + + rmb(); + /* we want to work on both lists, but do more bios on the + * sync list than the regular list + */ + if ((num_run > 32 && + pending_bios != &device->pending_sync_bios && + device->pending_sync_bios.head) || + (num_run > 64 && pending_bios == &device->pending_sync_bios && + device->pending_bios.head)) { + spin_lock(&device->io_lock); + requeue_list(pending_bios, pending, tail); + goto loop_lock; + } + cur = pending; pending = pending->bi_next; cur->bi_next = NULL; @@ -194,29 +256,63 @@ loop_lock: wake_up(&fs_info->async_submit_wait); BUG_ON(atomic_read(&cur->bi_cnt) == 0); - bio_get(cur); submit_bio(cur->bi_rw, cur); - bio_put(cur); num_run++; + batch_run++; + + if (bio_sync(cur)) + num_sync_run++; + + if (need_resched()) { + if (num_sync_run) { + blk_run_backing_dev(bdi, NULL); + num_sync_run = 0; + } + cond_resched(); + } /* * we made progress, there is more work to do and the bdi * is now congested. Back off and let other work structs * run instead */ - if (pending && bdi_write_congested(bdi) && num_run > 16 && + if (pending && bdi_write_congested(bdi) && batch_run > 32 && fs_info->fs_devices->open_devices > 1) { - struct bio *old_head; + struct io_context *ioc; - spin_lock(&device->io_lock); - - old_head = device->pending_bios; - device->pending_bios = pending; - if (device->pending_bio_tail) - tail->bi_next = old_head; - else - device->pending_bio_tail = tail; + ioc = current->io_context; + /* + * the main goal here is that we don't want to + * block if we're going to be able to submit + * more requests without blocking. + * + * This code does two great things, it pokes into + * the elevator code from a filesystem _and_ + * it makes assumptions about how batching works. + */ + if (ioc && ioc->nr_batch_requests > 0 && + time_before(jiffies, ioc->last_waited + HZ/50UL) && + (last_waited == 0 || + ioc->last_waited == last_waited)) { + /* + * we want to go through our batch of + * requests and stop. So, we copy out + * the ioc->last_waited time and test + * against it before looping + */ + last_waited = ioc->last_waited; + if (need_resched()) { + if (num_sync_run) { + blk_run_backing_dev(bdi, NULL); + num_sync_run = 0; + } + cond_resched(); + } + continue; + } + spin_lock(&device->io_lock); + requeue_list(pending_bios, pending, tail); device->running_pending = 1; spin_unlock(&device->io_lock); @@ -224,13 +320,32 @@ loop_lock: goto done; } } + + if (num_sync_run) { + num_sync_run = 0; + blk_run_backing_dev(bdi, NULL); + } + + cond_resched(); if (again) goto loop; spin_lock(&device->io_lock); - if (device->pending_bios) + if (device->pending_bios.head || device->pending_sync_bios.head) goto loop_lock; spin_unlock(&device->io_lock); + + /* + * IO has already been through a long path to get here. Checksumming, + * async helper threads, perhaps compression. We've done a pretty + * good job of collecting a batch of IO and should just unplug + * the device right away. + * + * This will help anyone who is waiting on the IO, they might have + * already unplugged, but managed to do so before the bio they + * cared about found its way down here. + */ + blk_run_backing_dev(bdi, NULL); done: return 0; } @@ -262,6 +377,7 @@ static noinline int device_list_add(const char *path, memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); fs_devices->latest_devid = devid; fs_devices->latest_trans = found_transid; + mutex_init(&fs_devices->device_list_mutex); device = NULL; } else { device = __find_device(&fs_devices->devices, devid, @@ -288,7 +404,11 @@ static noinline int device_list_add(const char *path, return -ENOMEM; } INIT_LIST_HEAD(&device->dev_alloc_list); + + mutex_lock(&fs_devices->device_list_mutex); list_add(&device->dev_list, &fs_devices->devices); + mutex_unlock(&fs_devices->device_list_mutex); + device->fs_devices = fs_devices; fs_devices->num_devices++; } @@ -314,10 +434,12 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) INIT_LIST_HEAD(&fs_devices->devices); INIT_LIST_HEAD(&fs_devices->alloc_list); INIT_LIST_HEAD(&fs_devices->list); + mutex_init(&fs_devices->device_list_mutex); fs_devices->latest_devid = orig->latest_devid; fs_devices->latest_trans = orig->latest_trans; memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); + mutex_lock(&orig->device_list_mutex); list_for_each_entry(orig_dev, &orig->devices, dev_list) { device = kzalloc(sizeof(*device), GFP_NOFS); if (!device) @@ -339,8 +461,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) device->fs_devices = fs_devices; fs_devices->num_devices++; } + mutex_unlock(&orig->device_list_mutex); return fs_devices; error: + mutex_unlock(&orig->device_list_mutex); free_fs_devices(fs_devices); return ERR_PTR(-ENOMEM); } @@ -351,6 +475,7 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&uuid_mutex); again: + mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { if (device->in_fs_metadata) continue; @@ -370,6 +495,7 @@ again: kfree(device->name); kfree(device); } + mutex_unlock(&fs_devices->device_list_mutex); if (fs_devices->seed) { fs_devices = fs_devices->seed; @@ -490,6 +616,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, device->in_fs_metadata = 0; device->mode = flags; + if (!blk_queue_nonrot(bdev_get_queue(bdev))) + fs_devices->rotating = 1; + fs_devices->open_devices++; if (device->writeable) { fs_devices->rw_devices++; @@ -1017,12 +1146,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) device = NULL; devices = &root->fs_info->fs_devices->devices; + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_for_each_entry(tmp, devices, dev_list) { if (tmp->in_fs_metadata && !tmp->bdev) { device = tmp; break; } } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); bdev = NULL; bh = NULL; disk_super = NULL; @@ -1077,7 +1208,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto error_brelse; device->in_fs_metadata = 0; + + /* + * the device list mutex makes sure that we don't change + * the device list while someone else is writing out all + * the device supers. + */ + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_del_init(&device->dev_list); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + device->fs_devices->num_devices--; next_device = list_entry(root->fs_info->fs_devices->devices.next, @@ -1171,6 +1311,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, seed_devices->opened = 1; INIT_LIST_HEAD(&seed_devices->devices); INIT_LIST_HEAD(&seed_devices->alloc_list); + mutex_init(&seed_devices->device_list_mutex); list_splice_init(&fs_devices->devices, &seed_devices->devices); list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); list_for_each_entry(device, &seed_devices->devices, dev_list) { @@ -1296,6 +1437,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) mutex_lock(&root->fs_info->volume_mutex); devices = &root->fs_info->fs_devices->devices; + /* + * we have the volume lock, so we don't need the extra + * device list mutex while reading the list here. + */ list_for_each_entry(device, devices, dev_list) { if (device->bdev == bdev) { ret = -EEXIST; @@ -1336,6 +1481,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->io_align = root->sectorsize; device->sector_size = root->sectorsize; device->total_bytes = i_size_read(bdev->bd_inode); + device->disk_total_bytes = device->total_bytes; device->dev_root = root->fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; @@ -1349,6 +1495,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) } device->fs_devices = root->fs_info->fs_devices; + + /* + * we don't want write_supers to jump in here with our device + * half setup + */ + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_add(&device->dev_list, &root->fs_info->fs_devices->devices); list_add(&device->dev_alloc_list, &root->fs_info->fs_devices->alloc_list); @@ -1357,6 +1509,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->rw_devices++; root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; + if (!blk_queue_nonrot(bdev_get_queue(bdev))) + root->fs_info->fs_devices->rotating = 1; + total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); btrfs_set_super_total_bytes(&root->fs_info->super_copy, total_bytes + device->total_bytes); @@ -1364,6 +1519,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); btrfs_set_super_num_devices(&root->fs_info->super_copy, total_bytes + 1); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (seeding_dev) { ret = init_first_rw_device(trans, root, device); @@ -1439,7 +1595,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); - btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); btrfs_mark_buffer_dirty(leaf); @@ -1566,8 +1722,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, int ret; int i; - printk(KERN_INFO "btrfs relocating chunk %llu\n", - (unsigned long long)chunk_offset); root = root->fs_info->chunk_root; extent_root = root->fs_info->extent_root; em_tree = &root->fs_info->mapping_tree.map_tree; @@ -1836,14 +1990,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) device->total_bytes = new_size; if (device->writeable) device->fs_devices->total_rw_bytes -= diff; - ret = btrfs_update_device(trans, device); - if (ret) { - unlock_chunks(root); - btrfs_end_transaction(trans, root); - goto done; - } - WARN_ON(diff > old_total); - btrfs_set_super_total_bytes(super_copy, old_total - diff); unlock_chunks(root); btrfs_end_transaction(trans, root); @@ -1875,7 +2021,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) length = btrfs_dev_extent_length(l, dev_extent); if (key.offset + length <= new_size) - goto done; + break; chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); @@ -1888,6 +2034,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) goto done; } + /* Shrinking succeeded, else we would be at "done". */ + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto done; + } + lock_chunks(root); + + device->disk_total_bytes = new_size; + /* Now btrfs_update_device() will change the on-disk size. */ + ret = btrfs_update_device(trans, device); + if (ret) { + unlock_chunks(root); + btrfs_end_transaction(trans, root); + goto done; + } + WARN_ON(diff > old_total); + btrfs_set_super_total_bytes(super_copy, old_total - diff); + unlock_chunks(root); + btrfs_end_transaction(trans, root); done: btrfs_free_path(path); return ret; @@ -2458,7 +2624,7 @@ again: max_errors = 1; } } - if (multi_ret && rw == WRITE && + if (multi_ret && (rw & (1 << BIO_RW)) && stripes_allocated < stripes_required) { stripes_allocated = map->num_stripes; free_extent_map(em); @@ -2723,6 +2889,7 @@ static noinline int schedule_bio(struct btrfs_root *root, int rw, struct bio *bio) { int should_queue = 1; + struct btrfs_pending_bios *pending_bios; /* don't bother with additional async steps for reads, right now */ if (!(rw & (1 << BIO_RW))) { @@ -2744,13 +2911,17 @@ static noinline int schedule_bio(struct btrfs_root *root, bio->bi_rw |= rw; spin_lock(&device->io_lock); + if (bio_sync(bio)) + pending_bios = &device->pending_sync_bios; + else + pending_bios = &device->pending_bios; - if (device->pending_bio_tail) - device->pending_bio_tail->bi_next = bio; + if (pending_bios->tail) + pending_bios->tail->bi_next = bio; - device->pending_bio_tail = bio; - if (!device->pending_bios) - device->pending_bios = bio; + pending_bios->tail = bio; + if (!pending_bios->head) + pending_bios->head = bio; if (device->running_pending) should_queue = 0; @@ -2967,7 +3138,8 @@ static int fill_device_from_item(struct extent_buffer *leaf, unsigned long ptr; device->devid = btrfs_device_id(leaf, dev_item); - device->total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->total_bytes = device->disk_total_bytes; device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); device->type = btrfs_device_type(leaf, dev_item); device->io_align = btrfs_device_io_align(leaf, dev_item); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 86c44e9ae110..5139a833f721 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -23,13 +23,22 @@ #include "async-thread.h" struct buffer_head; +struct btrfs_pending_bios { + struct bio *head; + struct bio *tail; +}; + struct btrfs_device { struct list_head dev_list; struct list_head dev_alloc_list; struct btrfs_fs_devices *fs_devices; struct btrfs_root *dev_root; - struct bio *pending_bios; - struct bio *pending_bio_tail; + + /* regular prio bios */ + struct btrfs_pending_bios pending_bios; + /* WRITE_SYNC bios */ + struct btrfs_pending_bios pending_sync_bios; + int running_pending; u64 generation; @@ -52,6 +61,9 @@ struct btrfs_device { /* size of the device */ u64 total_bytes; + /* size of the disk */ + u64 disk_total_bytes; + /* bytes used */ u64 bytes_used; @@ -76,7 +88,7 @@ struct btrfs_device { struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ - /* the device with this id has the most recent coyp of the super */ + /* the device with this id has the most recent copy of the super */ u64 latest_devid; u64 latest_trans; u64 num_devices; @@ -84,7 +96,12 @@ struct btrfs_fs_devices { u64 rw_devices; u64 total_rw_bytes; struct block_device *latest_bdev; - /* all of the devices in the FS */ + + /* all of the devices in the FS, protected by a mutex + * so we can safely walk it to write out the supers without + * worrying about add/remove by the multi-device code + */ + struct mutex device_list_mutex; struct list_head devices; /* devices not currently being allocated */ @@ -95,6 +112,11 @@ struct btrfs_fs_devices { int seeding; int opened; + + /* set when we find or add a device that doesn't have the + * nonrot flag set + */ + int rotating; }; struct btrfs_bio_stripe { |
