Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 73beece9 authored by Liu Bo's avatar Liu Bo Committed by David Sterba
Browse files

Btrfs: fix lockdep deadlock warning due to dev_replace



Xfstests btrfs/011 complains about a deadlock warning,

[ 1226.649039] =========================================================
[ 1226.649039] [ INFO: possible irq lock inversion dependency detected ]
[ 1226.649039] 4.1.0+ #270 Not tainted
[ 1226.649039] ---------------------------------------------------------
[ 1226.652955] kswapd0/46 just changed the state of lock:
[ 1226.652955]  (&delayed_node->mutex){+.+.-.}, at: [<ffffffff81458735>] __btrfs_release_delayed_node+0x45/0x1d0
[ 1226.652955] but this lock took another, RECLAIM_FS-unsafe lock in the past:
[ 1226.652955]  (&fs_info->dev_replace.lock){+.+.+.}

and interrupts could create inverse lock ordering between them.

[ 1226.652955]
other info that might help us debug this:
[ 1226.652955] Chain exists of:
  &delayed_node->mutex --> &found->groups_sem --> &fs_info->dev_replace.lock

[ 1226.652955]  Possible interrupt unsafe locking scenario:

[ 1226.652955]        CPU0                    CPU1
[ 1226.652955]        ----                    ----
[ 1226.652955]   lock(&fs_info->dev_replace.lock);
[ 1226.652955]                                local_irq_disable();
[ 1226.652955]                                lock(&delayed_node->mutex);
[ 1226.652955]                                lock(&found->groups_sem);
[ 1226.652955]   <Interrupt>
[ 1226.652955]     lock(&delayed_node->mutex);
[ 1226.652955]
 *** DEADLOCK ***

Commit 084b6e7c ("btrfs: Fix a lockdep warning when running xfstest.") tried
to fix a similar one that has the exactly same warning, but with that, we still
run to this.

The above lock chain comes from
btrfs_commit_transaction
  ->btrfs_run_delayed_items
    ...
    ->__btrfs_update_delayed_inode
      ...
      ->__btrfs_cow_block
         ...
         ->find_free_extent
            ->cache_block_group
              ->load_free_space_cache
                ->btrfs_readpages
                  ->submit_one_bio
                    ...
                    ->__btrfs_map_block
                      ->btrfs_dev_replace_lock

However, with high memory pressure, tasks which hold dev_replace.lock can
be interrupted by kswapd and then kswapd is intended to release memory occupied
by superblock, inodes and dentries, where we may call evict_inode, and it comes
to

[ 1226.652955]  [<ffffffff81458735>] __btrfs_release_delayed_node+0x45/0x1d0
[ 1226.652955]  [<ffffffff81459e74>] btrfs_remove_delayed_node+0x24/0x30
[ 1226.652955]  [<ffffffff8140c5fe>] btrfs_evict_inode+0x34e/0x700

delayed_node->mutex may be acquired in __btrfs_release_delayed_node(), and it leads
to a ABBA deadlock.

To fix this, we can use "blocking rwlock" used in the case of extent_buffer, but
things are simpler here since we only needs read's spinlock to blocking lock.

With this, btrfs/011 no more produces warnings in dmesg.

Signed-off-by: default avatarLiu Bo <bo.li.liu@oracle.com>
Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
parent 18558cae
Loading
Loading
Loading
Loading
+4 −2
Original line number Diff line number Diff line
@@ -1002,8 +1002,10 @@ struct btrfs_dev_replace {
	pid_t lock_owner;
	atomic_t nesting_level;
	struct mutex lock_finishing_cancel_unmount;
	struct mutex lock_management_lock;
	struct mutex lock;
	rwlock_t lock;
	atomic_t read_locks;
	atomic_t blocking_readers;
	wait_queue_head_t read_lock_wq;

	struct btrfs_scrub_progress scrub_progress;
};
+70 −60
Original line number Diff line number Diff line
@@ -202,13 +202,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
	struct btrfs_dev_replace_item *ptr;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

	btrfs_dev_replace_lock(dev_replace);
	btrfs_dev_replace_lock(dev_replace, 0);
	if (!dev_replace->is_valid ||
	    !dev_replace->item_needs_writeback) {
		btrfs_dev_replace_unlock(dev_replace);
		btrfs_dev_replace_unlock(dev_replace, 0);
		return 0;
	}
	btrfs_dev_replace_unlock(dev_replace);
	btrfs_dev_replace_unlock(dev_replace, 0);

	key.objectid = 0;
	key.type = BTRFS_DEV_REPLACE_KEY;
@@ -264,7 +264,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
	ptr = btrfs_item_ptr(eb, path->slots[0],
			     struct btrfs_dev_replace_item);

	btrfs_dev_replace_lock(dev_replace);
	btrfs_dev_replace_lock(dev_replace, 1);
	if (dev_replace->srcdev)
		btrfs_set_dev_replace_src_devid(eb, ptr,
			dev_replace->srcdev->devid);
@@ -287,7 +287,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
	btrfs_set_dev_replace_cursor_right(eb, ptr,
		dev_replace->cursor_right);
	dev_replace->item_needs_writeback = 0;
	btrfs_dev_replace_unlock(dev_replace);
	btrfs_dev_replace_unlock(dev_replace, 1);

	btrfs_mark_buffer_dirty(eb);

@@ -356,7 +356,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
		return PTR_ERR(trans);
	}

	btrfs_dev_replace_lock(dev_replace);
	btrfs_dev_replace_lock(dev_replace, 1);
	switch (dev_replace->replace_state) {
	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -395,7 +395,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
	dev_replace->is_valid = 1;
	dev_replace->item_needs_writeback = 1;
	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
	btrfs_dev_replace_unlock(dev_replace);
	btrfs_dev_replace_unlock(dev_replace, 1);

	ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
	if (ret)
@@ -407,7 +407,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		btrfs_dev_replace_lock(dev_replace);
		btrfs_dev_replace_lock(dev_replace, 1);
		goto leave;
	}

@@ -433,7 +433,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
leave:
	dev_replace->srcdev = NULL;
	dev_replace->tgtdev = NULL;
	btrfs_dev_replace_unlock(dev_replace);
	btrfs_dev_replace_unlock(dev_replace, 1);
	btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
	return ret;
}
@@ -471,18 +471,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
	/* don't allow cancel or unmount to disturb the finishing procedure */
	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);

	btrfs_dev_replace_lock(dev_replace);
	btrfs_dev_replace_lock(dev_replace, 0);
	/* was the operation canceled, or is it finished? */
	if (dev_replace->replace_state !=
	    BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
		btrfs_dev_replace_unlock(dev_replace);
		btrfs_dev_replace_unlock(dev_replace, 0);
		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
		return 0;
	}

	tgt_device = dev_replace->tgtdev;
	src_device = dev_replace->srcdev;
	btrfs_dev_replace_unlock(dev_replace);
	btrfs_dev_replace_unlock(dev_replace, 0);

	/*
	 * flush all outstanding I/O and inode extent mappings before the
@@ -507,7 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
	/* keep away write_all_supers() during the finishing procedure */
	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	mutex_lock(&root->fs_info->chunk_mutex);
	btrfs_dev_replace_lock(dev_replace);
	btrfs_dev_replace_lock(dev_replace, 1);
	dev_replace->replace_state =
		scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
			  : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
@@ -528,7 +528,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
			        rcu_str_deref(src_device->name),
			      src_device->devid,
			      rcu_str_deref(tgt_device->name), scrub_ret);
		btrfs_dev_replace_unlock(dev_replace);
		btrfs_dev_replace_unlock(dev_replace, 1);
		mutex_unlock(&root->fs_info->chunk_mutex);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		mutex_unlock(&uuid_mutex);
@@ -565,7 +565,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
	fs_info->fs_devices->rw_devices++;

	btrfs_dev_replace_unlock(dev_replace);
	btrfs_dev_replace_unlock(dev_replace, 1);

	btrfs_rm_dev_replace_blocked(fs_info);

@@ -649,7 +649,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	struct btrfs_device *srcdev;

	btrfs_dev_replace_lock(dev_replace);
	btrfs_dev_replace_lock(dev_replace, 0);
	/* even if !dev_replace_is_valid, the values are good enough for
	 * the replace_status ioctl */
	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
@@ -675,7 +675,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
			div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
		break;
	}
	btrfs_dev_replace_unlock(dev_replace);
	btrfs_dev_replace_unlock(dev_replace, 0);
}

int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
@@ -698,13 +698,13 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
		return -EROFS;

	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
	btrfs_dev_replace_lock(dev_replace);
	btrfs_dev_replace_lock(dev_replace, 1);
	switch (dev_replace->replace_state) {
	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
		btrfs_dev_replace_unlock(dev_replace);
		btrfs_dev_replace_unlock(dev_replace, 1);
		goto leave;
	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
@@ -717,7 +717,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
	dev_replace->time_stopped = get_seconds();
	dev_replace->item_needs_writeback = 1;
	btrfs_dev_replace_unlock(dev_replace);
	btrfs_dev_replace_unlock(dev_replace, 1);
	btrfs_scrub_cancel(fs_info);

	trans = btrfs_start_transaction(root, 0);
@@ -740,7 +740,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
	btrfs_dev_replace_lock(dev_replace);
	btrfs_dev_replace_lock(dev_replace, 1);
	switch (dev_replace->replace_state) {
	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -756,7 +756,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
		break;
	}

	btrfs_dev_replace_unlock(dev_replace);
	btrfs_dev_replace_unlock(dev_replace, 1);
	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
}

@@ -766,12 +766,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
	struct task_struct *task;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

	btrfs_dev_replace_lock(dev_replace);
	btrfs_dev_replace_lock(dev_replace, 1);
	switch (dev_replace->replace_state) {
	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
		btrfs_dev_replace_unlock(dev_replace);
		btrfs_dev_replace_unlock(dev_replace, 1);
		return 0;
	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
		break;
@@ -784,10 +784,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
		btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing");
		btrfs_info(fs_info,
			"you may cancel the operation after 'mount -o degraded'");
		btrfs_dev_replace_unlock(dev_replace);
		btrfs_dev_replace_unlock(dev_replace, 1);
		return 0;
	}
	btrfs_dev_replace_unlock(dev_replace);
	btrfs_dev_replace_unlock(dev_replace, 1);

	WARN_ON(atomic_xchg(
		&fs_info->mutually_exclusive_operation_running, 1));
@@ -865,50 +865,60 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
	return 1;
}

void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw)
{
	/* the beginning is just an optimization for the typical case */
	if (atomic_read(&dev_replace->nesting_level) == 0) {
acquire_lock:
		/* this is not a nested case where the same thread
		 * is trying to acqurire the same lock twice */
		mutex_lock(&dev_replace->lock);
		mutex_lock(&dev_replace->lock_management_lock);
		dev_replace->lock_owner = current->pid;
		atomic_inc(&dev_replace->nesting_level);
		mutex_unlock(&dev_replace->lock_management_lock);
		return;
	if (rw == 1) {
		/* write */
again:
		wait_event(dev_replace->read_lock_wq,
			   atomic_read(&dev_replace->blocking_readers) == 0);
		write_lock(&dev_replace->lock);
		if (atomic_read(&dev_replace->blocking_readers)) {
			write_unlock(&dev_replace->lock);
			goto again;
		}

	mutex_lock(&dev_replace->lock_management_lock);
	if (atomic_read(&dev_replace->nesting_level) > 0 &&
	    dev_replace->lock_owner == current->pid) {
		WARN_ON(!mutex_is_locked(&dev_replace->lock));
		atomic_inc(&dev_replace->nesting_level);
		mutex_unlock(&dev_replace->lock_management_lock);
		return;
	} else {
		read_lock(&dev_replace->lock);
		atomic_inc(&dev_replace->read_locks);
	}

	mutex_unlock(&dev_replace->lock_management_lock);
	goto acquire_lock;
}

void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw)
{
	WARN_ON(!mutex_is_locked(&dev_replace->lock));
	mutex_lock(&dev_replace->lock_management_lock);
	WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
	WARN_ON(dev_replace->lock_owner != current->pid);
	atomic_dec(&dev_replace->nesting_level);
	if (atomic_read(&dev_replace->nesting_level) == 0) {
		dev_replace->lock_owner = 0;
		mutex_unlock(&dev_replace->lock_management_lock);
		mutex_unlock(&dev_replace->lock);
	if (rw == 1) {
		/* write */
		ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
		write_unlock(&dev_replace->lock);
	} else {
		mutex_unlock(&dev_replace->lock_management_lock);
		ASSERT(atomic_read(&dev_replace->read_locks) > 0);
		atomic_dec(&dev_replace->read_locks);
		read_unlock(&dev_replace->lock);
	}
}

/* inc blocking cnt and release read lock */
void btrfs_dev_replace_set_lock_blocking(
					struct btrfs_dev_replace *dev_replace)
{
	/* only set blocking for read lock */
	ASSERT(atomic_read(&dev_replace->read_locks) > 0);
	atomic_inc(&dev_replace->blocking_readers);
	read_unlock(&dev_replace->lock);
}

/* acquire read lock and dec blocking cnt */
void btrfs_dev_replace_clear_lock_blocking(
					struct btrfs_dev_replace *dev_replace)
{
	/* only set blocking for read lock */
	ASSERT(atomic_read(&dev_replace->read_locks) > 0);
	ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
	read_lock(&dev_replace->lock);
	if (atomic_dec_and_test(&dev_replace->blocking_readers) &&
	    waitqueue_active(&dev_replace->read_lock_wq))
		wake_up(&dev_replace->read_lock_wq);
}

void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
{
	percpu_counter_inc(&fs_info->bio_counter);
+5 −2
Original line number Diff line number Diff line
@@ -34,8 +34,11 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw);
void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw);
void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_clear_lock_blocking(
					struct btrfs_dev_replace *dev_replace);

static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
{
+4 −2
Original line number Diff line number Diff line
@@ -2272,9 +2272,11 @@ static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
	fs_info->dev_replace.lock_owner = 0;
	atomic_set(&fs_info->dev_replace.nesting_level, 0);
	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
	mutex_init(&fs_info->dev_replace.lock_management_lock);
	mutex_init(&fs_info->dev_replace.lock);
	rwlock_init(&fs_info->dev_replace.lock);
	atomic_set(&fs_info->dev_replace.read_locks, 0);
	atomic_set(&fs_info->dev_replace.blocking_readers, 0);
	init_waitqueue_head(&fs_info->replace_wait);
	init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);
}

static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
+5 −5
Original line number Diff line number Diff line
@@ -396,7 +396,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
	}

	/* insert extent in reada_tree + all per-device trees, all or nothing */
	btrfs_dev_replace_lock(&fs_info->dev_replace);
	btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
	spin_lock(&fs_info->reada_lock);
	ret = radix_tree_insert(&fs_info->reada_tree, index, re);
	if (ret == -EEXIST) {
@@ -404,12 +404,12 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
		BUG_ON(!re_exist);
		re_exist->refcnt++;
		spin_unlock(&fs_info->reada_lock);
		btrfs_dev_replace_unlock(&fs_info->dev_replace);
		btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
		goto error;
	}
	if (ret) {
		spin_unlock(&fs_info->reada_lock);
		btrfs_dev_replace_unlock(&fs_info->dev_replace);
		btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
		goto error;
	}
	prev_dev = NULL;
@@ -456,12 +456,12 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
			BUG_ON(fs_info == NULL);
			radix_tree_delete(&fs_info->reada_tree, index);
			spin_unlock(&fs_info->reada_lock);
			btrfs_dev_replace_unlock(&fs_info->dev_replace);
			btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
			goto error;
		}
	}
	spin_unlock(&fs_info->reada_lock);
	btrfs_dev_replace_unlock(&fs_info->dev_replace);
	btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);

	btrfs_put_bbio(bbio);
	return re;
Loading