From 8d875f95da43c6a8f18f77869f2ef26e9594fecc Mon Sep 17 00:00:00 2001
From: Chris Mason
Date: Tue, 12 Aug 2014 10:47:42 -0700
Subject: btrfs: disable strict file flushes for renames and truncates

Truncates and renames are often used to replace old versions of a file
with new versions.  Applications often expect this to be an atomic
replacement, even if they haven't done anything to make sure the new
version is fully on disk.

Btrfs has strict flushing in place to make sure that renaming over an
old file with a new file will fully flush out the new file before
allowing the transaction commit with the rename to complete.

This ordering means the commit code needs to be able to lock file
pages, and there are a few paths in the filesystem where we will try
to end a transaction with the page lock held.  It's rare, but these
things can deadlock.

This patch removes the ordered flushes and switches to a best effort
filemap_flush like ext4 uses.  It's not perfect, but it should fix the
deadlocks.

Signed-off-by: Chris Mason
---
 fs/btrfs/ordered-data.c | 123 ------------------------------------------------
 1 file changed, 123 deletions(-)

(limited to 'fs/btrfs/ordered-data.c')

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7187b14faa6c..963895c1f801 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -571,18 +571,6 @@ void btrfs_remove_ordered_extent(struct inode *inode,
 
 	trace_btrfs_ordered_extent_remove(inode, entry);
 
-	/*
-	 * we have no more ordered extents for this inode and
-	 * no dirty pages.  We can safely remove it from the
-	 * list of ordered extents
-	 */
-	if (RB_EMPTY_ROOT(&tree->tree) &&
-	    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
-		spin_lock(&root->fs_info->ordered_root_lock);
-		list_del_init(&BTRFS_I(inode)->ordered_operations);
-		spin_unlock(&root->fs_info->ordered_root_lock);
-	}
-
 	if (!root->nr_ordered_extents) {
 		spin_lock(&root->fs_info->ordered_root_lock);
 		BUG_ON(list_empty(&root->ordered_root));
@@ -686,81 +674,6 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
 	mutex_unlock(&fs_info->ordered_operations_mutex);
 }
 
-/*
- * this is used during transaction commit to write all the inodes
- * added to the ordered operation list.  These files must be fully on
- * disk before the transaction commits.
- *
- * we have two modes here, one is to just start the IO via filemap_flush
- * and the other is to wait for all the io.  When we wait, we have an
- * extra check to make sure the ordered operation list really is empty
- * before we return
- */
-int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root, int wait)
-{
-	struct btrfs_inode *btrfs_inode;
-	struct inode *inode;
-	struct btrfs_transaction *cur_trans = trans->transaction;
-	struct list_head splice;
-	struct list_head works;
-	struct btrfs_delalloc_work *work, *next;
-	int ret = 0;
-
-	INIT_LIST_HEAD(&splice);
-	INIT_LIST_HEAD(&works);
-
-	mutex_lock(&root->fs_info->ordered_extent_flush_mutex);
-	spin_lock(&root->fs_info->ordered_root_lock);
-	list_splice_init(&cur_trans->ordered_operations, &splice);
-	while (!list_empty(&splice)) {
-		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
-					 ordered_operations);
-		inode = &btrfs_inode->vfs_inode;
-
-		list_del_init(&btrfs_inode->ordered_operations);
-
-		/*
-		 * the inode may be getting freed (in sys_unlink path).
-		 */
-		inode = igrab(inode);
-		if (!inode)
-			continue;
-
-		if (!wait)
-			list_add_tail(&BTRFS_I(inode)->ordered_operations,
-				      &cur_trans->ordered_operations);
-		spin_unlock(&root->fs_info->ordered_root_lock);
-
-		work = btrfs_alloc_delalloc_work(inode, wait, 1);
-		if (!work) {
-			spin_lock(&root->fs_info->ordered_root_lock);
-			if (list_empty(&BTRFS_I(inode)->ordered_operations))
-				list_add_tail(&btrfs_inode->ordered_operations,
-					      &splice);
-			list_splice_tail(&splice,
-					 &cur_trans->ordered_operations);
-			spin_unlock(&root->fs_info->ordered_root_lock);
-			ret = -ENOMEM;
-			goto out;
-		}
-		list_add_tail(&work->list, &works);
-		btrfs_queue_work(root->fs_info->flush_workers,
-				 &work->work);
-
-		cond_resched();
-		spin_lock(&root->fs_info->ordered_root_lock);
-	}
-	spin_unlock(&root->fs_info->ordered_root_lock);
-out:
-	list_for_each_entry_safe(work, next, &works, list) {
-		list_del_init(&work->list);
-		btrfs_wait_and_free_delalloc_work(work);
-	}
-	mutex_unlock(&root->fs_info->ordered_extent_flush_mutex);
-	return ret;
-}
-
 /*
  * Used to start IO or wait for a given ordered extent to finish.
  *
@@ -1120,42 +1033,6 @@ out:
 	return index;
 }
 
-
-/*
- * add a given inode to the list of inodes that must be fully on
- * disk before a transaction commit finishes.
- *
- * This basically gives us the ext3 style data=ordered mode, and it is mostly
- * used to make sure renamed files are fully on disk.
- *
- * It is a noop if the inode is already fully on disk.
- *
- * If trans is not null, we'll do a friendly check for a transaction that
- * is already flushing things and force the IO down ourselves.
- */
-void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root, struct inode *inode)
-{
-	struct btrfs_transaction *cur_trans = trans->transaction;
-	u64 last_mod;
-
-	last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
-
-	/*
-	 * if this file hasn't been changed since the last transaction
-	 * commit, we can safely return without doing anything
-	 */
-	if (last_mod <= root->fs_info->last_trans_committed)
-		return;
-
-	spin_lock(&root->fs_info->ordered_root_lock);
-	if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
-		list_add_tail(&BTRFS_I(inode)->ordered_operations,
-			      &cur_trans->ordered_operations);
-	}
-	spin_unlock(&root->fs_info->ordered_root_lock);
-}
-
 int __init ordered_data_init(void)
 {
 	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
--
cgit v1.2.3
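As context for the commit above: a minimal sketch of the "best effort"
flushing the message describes, assuming a hypothetical
replace_file_best_effort() helper (the matching inode.c changes are
outside the diff shown here; only filemap_flush() is the real kernel
API).  Instead of putting the inode on the transaction's
ordered_operations list and making the commit path flush it while
holding page locks, the rename/truncate path just starts asynchronous
writeback and moves on:

#include <linux/fs.h>

/*
 * Sketch only, not the patch itself: the function name is made up for
 * illustration.  filemap_flush() starts asynchronous writeback on any
 * dirty pages of the mapping and returns without waiting for the I/O
 * to finish, so the transaction commit never has to take file page
 * locks.
 */
static int replace_file_best_effort(struct inode *old_inode)
{
	return filemap_flush(old_inode->i_mapping);
}

The trade-off is the one the commit message concedes: an application
that needs the new contents durable before the rename must fsync() the
file itself, rather than relying on the filesystem to order the flush
against the commit.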
From 9e0af23764344f7f1b68e4eefbe7dc865018b63d Mon Sep 17 00:00:00 2001
From: Liu Bo
Date: Fri, 15 Aug 2014 23:36:53 +0800
Subject: Btrfs: fix task hang under heavy compressed write

This has been reported and discussed for a long time, and the hang
occurs in both 3.15 and 3.16.  Btrfs has migrated to the kernel
workqueue, and the migration introduced this hang problem.

Btrfs has a kind of work that is queued in an ordered way, which means
that its ordered_func() must be processed in FIFO order, so it usually
looks like --

normal_work_helper(arg)
    work = container_of(arg, struct btrfs_work, normal_work);

    work->func() <---- (we name it work X)
    for ordered_work in wq->ordered_list
            ordered_work->ordered_func()
            ordered_work->ordered_free()

The hang is a rare case.  First, when we find free space, we get an
uncached block group, then we go to read its free space cache inode
for free space information, so it files a readahead request --

btrfs_readpages()
     for page that is not in page cache
             __do_readpage()
                  submit_extent_page()
                        btrfs_submit_bio_hook()
                              btrfs_bio_wq_end_io()
                              submit_bio()
                                    end_workqueue_bio() <--(ret by the 1st endio)
                                         queue a work (named work Y) for
                                         the 2nd, also the real, endio()

So the hang occurs when work Y's work_struct and work X's work_struct
happen to share the same address.

A bit more explanation --

A,B,C -- struct btrfs_work
arg   -- struct work_struct

kthread:
worker_thread()
  pick up a work_struct from @worklist
  process_one_work(arg)
    worker->current_work = arg;  <-- arg is A->normal_work
    worker->current_func(arg)
      normal_work_helper(arg)
        A = container_of(arg, struct btrfs_work, normal_work);

        A->func()
        A->ordered_func()
        A->ordered_free()  <-- A gets freed

        B->ordered_func()
          submit_compressed_extents()
            find_free_extent()
              load_free_space_inode()
                ...  <-- (the above readahead stack)
                end_workqueue_bio()
                  btrfs_queue_work(work C)
        B->ordered_free()

Since work A sits at the front of wq->ordered_list and there are more
ordered works queued after it, such as B, A's memory can be freed
before normal_work_helper() returns, which means the kernel workqueue
code's worker_thread() still has worker->current_work pointing at
A->normal_work, i.e. at arg's address.

Meanwhile, work C is allocated after work A is freed, so
C->normal_work and A->normal_work are likely to share the same address
(I confirmed this with ftrace output, so I'm not just guessing; it is
rare, though).

When another kthread picks up C->normal_work to process and finds our
kthread still "processing" it (see find_worker_executing_work()), it
treats work C as a collision and skips it, which ends up with nobody
processing work C.  So the situation is that our kthread is waiting
forever on work C.

Besides this, there are other cases that can lead to the deadlock, but
the real problem is that all btrfs workqueues share one work->func --
normal_work_helper.  So this patch makes each workqueue have its own
helper function, which is only a wrapper of normal_work_helper.

With this patch, I no longer hit the above hang.

Signed-off-by: Liu Bo
Signed-off-by: Chris Mason
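As context for the fix in the diff below: a minimal sketch of the
helper-per-workqueue pattern (the struct layout and the ordered-list
draining are heavily simplified; BTRFS_WORK_HELPER and
btrfs_flush_delalloc_helper follow the patch's naming, the rest is
illustrative).  find_worker_executing_work() only reports a collision
when both the work_struct address and the worker's current function
match, so giving every btrfs workqueue its own wrapper makes a
recycled work_struct address harmless:

#include <linux/kernel.h>
#include <linux/workqueue.h>

struct btrfs_work {
	void (*func)(struct btrfs_work *work);
	struct work_struct normal_work;
	/* ordered_func/ordered_free and list linkage elided */
};

/* shared logic: run the work, then drain wq->ordered_list (elided) */
static void normal_work_helper(struct btrfs_work *work)
{
	work->func(work);
}

/*
 * One thin wrapper per workqueue.  Each expansion yields a distinct
 * function address, so two works that recycle the same work_struct
 * address are still distinguishable by worker->current_func.
 */
#define BTRFS_WORK_HELPER(name)						\
void btrfs_##name(struct work_struct *arg)				\
{									\
	struct btrfs_work *work = container_of(arg, struct btrfs_work,	\
					       normal_work);		\
	normal_work_helper(work);					\
}

BTRFS_WORK_HELPER(flush_delalloc_helper);

With distinct wrappers, work C in the scenario above no longer matches
worker->current_func on the kthread still pointing at freed work A, so
it is executed normally instead of being skipped.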
---
 fs/btrfs/ordered-data.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs/ordered-data.c')

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 963895c1f801..ac734ec4cc20 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -615,6 +615,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
 		spin_unlock(&root->ordered_extent_lock);
 
 		btrfs_init_work(&ordered->flush_work,
+				btrfs_flush_delalloc_helper,
 				btrfs_run_ordered_extent_work, NULL, NULL);
 		list_add_tail(&ordered->work_list, &works);
 		btrfs_queue_work(root->fs_info->flush_workers,
--
cgit v1.2.3