Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit c50db428 authored by Linux Build Service Account's avatar Linux Build Service Account Committed by Gerrit - the friendly Code Review server
Browse files

Merge "mm/page-writeback.c: add strictlimit feature"

parents 0a3c7b35 c2bf56d4
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -930,7 +930,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
	fc->bdi.name = "fuse";
	fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
	/* fuse does its own writeback accounting */
	fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
	fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;

	err = bdi_init(&fc->bdi);
	if (err)
+3 −0
Original line number Diff line number Diff line
@@ -243,6 +243,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 * BDI_CAP_EXEC_MAP:       Can be mapped for execution
 *
 * BDI_CAP_SWAP_BACKED:    Count shmem/tmpfs objects as swap-backed.
 *
 * BDI_CAP_STRICTLIMIT:    Keep number of dirty pages below bdi threshold.
 */
#define BDI_CAP_NO_ACCT_DIRTY	0x00000001
#define BDI_CAP_NO_WRITEBACK	0x00000002
@@ -254,6 +256,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
#define BDI_CAP_NO_ACCT_WB	0x00000080
#define BDI_CAP_SWAP_BACKED	0x00000100
#define BDI_CAP_STABLE_WRITES	0x00000200
#define BDI_CAP_STRICTLIMIT	0x00000400

#define BDI_CAP_VMFLAGS \
	(BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
+202 −61
Original line number Diff line number Diff line
@@ -582,6 +582,37 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
	return bdi_dirty;
}

/*
 *                           setpoint - dirty 3
 *        f(dirty) := 1.0 + (----------------)
 *                           limit - setpoint
 *
 * it's a 3rd order polynomial that subjects to
 *
 * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
 * (2) f(setpoint) = 1.0 => the balance point
 * (3) f(limit)    = 0   => the hard limit
 * (4) df/dx      <= 0	 => negative feedback control
 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
 *     => fast response on large errors; small oscillation near setpoint
 *
 * @setpoint: dirty page count at which the ratio is 1.0
 * @dirty:    current dirty page count
 * @limit:    dirty page count at which the ratio drops to 0
 *
 * All arithmetic is fixed point, with 1.0 represented as
 * (1 << RATELIMIT_CALC_SHIFT).  The returned ratio uses the same scale and
 * is clamped to the range [0, 2.0].
 */
static inline long long pos_ratio_polynom(unsigned long setpoint,
					  unsigned long dirty,
					  unsigned long limit)
{
	long long pos_ratio;
	long x;

	/*
	 * x = (setpoint - dirty) / (limit - setpoint), in fixed point.
	 *
	 * The divisor is derived from unsigned long values, so it must go
	 * through the full 64-bit divide: div_s64() only accepts a 32-bit
	 * divisor, and a large "limit - setpoint + 1" would be truncated
	 * (possibly to zero, i.e. a divide by zero).  The "+ 1" keeps the
	 * divisor non-zero when limit == setpoint.
	 */
	x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
		      (s64)(limit - setpoint + 1));

	/* pos_ratio = 1.0 + x^3, rescaling after each multiplication */
	pos_ratio = x;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio += 1 << RATELIMIT_CALC_SHIFT;

	return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
}

/*
 * Dirty position control.
 *
@@ -680,26 +711,80 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
	/*
	 * global setpoint
	 *
	 *                           setpoint - dirty 3
	 *        f(dirty) := 1.0 + (----------------)
	 *                           limit - setpoint
	 * See comment for pos_ratio_polynom().
	 */
	setpoint = (freerun + limit) / 2;
	pos_ratio = pos_ratio_polynom(setpoint, dirty, limit);

	/*
	 * The strictlimit feature is a tool preventing mistrusted filesystems
	 * from growing a large number of dirty pages before throttling. For
	 * such filesystems balance_dirty_pages always checks bdi counters
	 * against bdi limits. Even if global "nr_dirty" is under "freerun".
	 * This is especially important for fuse which sets bdi->max_ratio to
	 * 1% by default. Without strictlimit feature, fuse writeback may
	 * consume arbitrary amount of RAM because it is accounted in
	 * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
	 *
	 * Here, in bdi_position_ratio(), we calculate pos_ratio based on
	 * two values: bdi_dirty and bdi_thresh. Let's consider an example:
	 * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
	 * limits are set by default to 10% and 20% (background and throttle).
	 * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
	 * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is
	 * about ~6K pages (as the average of background and throttle bdi
	 * limits). The 3rd order polynomial will provide positive feedback if
	 * bdi_dirty is under bdi_setpoint and vice versa.
	 *
	 * Note, that we cannot use global counters in these calculations
	 * because we want to throttle process writing to a strictlimit BDI
	 * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
	 * in the example above).
	 */
	if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
		long long bdi_pos_ratio;
		unsigned long bdi_bg_thresh;

		if (bdi_dirty < 8)
			return min_t(long long, pos_ratio * 2,
				     2 << RATELIMIT_CALC_SHIFT);

		if (bdi_dirty >= bdi_thresh)
			return 0;

		bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh);
		bdi_setpoint = dirty_freerun_ceiling(bdi_thresh,
						     bdi_bg_thresh);

		if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh)
			return 0;

		bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty,
						  bdi_thresh);

		/*
		 * Typically, for strictlimit case, bdi_setpoint << setpoint
		 * and pos_ratio >> bdi_pos_ratio. In other words, the global
		 * state ("dirty") is not the limiting factor and we have to
		 * make the decision based on bdi counters. But there is an
		 * important case when global pos_ratio should get precedence:
		 * global limits are exceeded (e.g. due to activities on other
		 * BDIs) while given strictlimit BDI is below limit.
		 *
	 * it's a 3rd order polynomial that subjects to
		 * "pos_ratio * bdi_pos_ratio" would work for the case above,
		 * but it would look too non-natural for the case of all
		 * activity in the system coming from a single strictlimit BDI
		 * with bdi->max_ratio == 100%.
		 *
	 * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
	 * (2) f(setpoint) = 1.0 => the balance point
	 * (3) f(limit)    = 0   => the hard limit
	 * (4) df/dx      <= 0	 => negative feedback control
	 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
	 *     => fast response on large errors; small oscillation near setpoint
		 * Note that min() below somewhat changes the dynamics of the
		 * control system. Normally, pos_ratio value can be well over 3
		 * (when globally we are at freerun and bdi is well below bdi
		 * setpoint). Now the maximum pos_ratio in the same situation
		 * is 2. We might want to tweak this if we observe the control
		 * system is too slow to adapt.
		 */
	setpoint = (freerun + limit) / 2;
	x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
		    limit - setpoint + 1);
	pos_ratio = x;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
		return min(pos_ratio, bdi_pos_ratio);
	}

	/*
	 * We have computed basic pos_ratio above based on global situation. If
@@ -992,6 +1077,27 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
	 * keep that period small to reduce time lags).
	 */
	step = 0;

	/*
	 * For strictlimit case, calculations above were based on bdi counters
	 * and limits (starting from pos_ratio = bdi_position_ratio() and up to
	 * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
	 * Hence, to calculate "step" properly, we have to use bdi_dirty as
	 * "dirty" and bdi_setpoint as "setpoint".
	 *
	 * We rampup dirty_ratelimit forcibly if bdi_dirty is low because
	 * it's possible that bdi_thresh is close to zero due to inactivity
	 * of backing device (see the implementation of bdi_dirty_limit()).
	 */
	if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
		dirty = bdi_dirty;
		if (bdi_dirty < 8)
			setpoint = bdi_dirty + 1;
		else
			setpoint = (bdi_thresh +
				    bdi_dirty_limit(bdi, bg_thresh)) / 2;
	}

	if (dirty < setpoint) {
		x = min(bdi->balanced_dirty_ratelimit,
			 min(balanced_dirty_ratelimit, task_ratelimit));
@@ -1196,6 +1302,56 @@ static long bdi_min_pause(struct backing_dev_info *bdi,
	return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
}

/*
 * bdi_dirty_limits - compute the per-bdi dirty page count and thresholds
 * @bdi:               backing device to inspect
 * @dirty_thresh:      global dirty threshold
 * @background_thresh: global background threshold
 * @bdi_dirty:         out: reclaimable + writeback pages of @bdi
 * @bdi_thresh:        out: bdi_dirty_limit(@bdi, @dirty_thresh)
 * @bdi_bg_thresh:     out, may be NULL: @background_thresh scaled by the
 *                     ratio *@bdi_thresh / @dirty_thresh
 */
static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
				    unsigned long dirty_thresh,
				    unsigned long background_thresh,
				    unsigned long *bdi_dirty,
				    unsigned long *bdi_thresh,
				    unsigned long *bdi_bg_thresh)
{
	unsigned long bdi_reclaimable;

	/*
	 * bdi_thresh is not treated as some limiting factor as
	 * dirty_thresh, due to reasons
	 * - in JBOD setup, bdi_thresh can fluctuate a lot
	 * - in a system with HDD and USB key, the USB key may somehow
	 *   go into state (bdi_dirty >> bdi_thresh) either because
	 *   bdi_dirty starts high, or because bdi_thresh drops low.
	 *   In this case we don't want to hard throttle the USB key
	 *   dirtiers for 100 seconds until bdi_dirty drops under
	 *   bdi_thresh. Instead the auxiliary bdi control line in
	 *   bdi_position_ratio() will let the dirtier task progress
	 *   at some rate <= (write_bw / 2) for bringing down bdi_dirty.
	 */
	*bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);

	/*
	 * Resolve the proportion
	 *   bdi_bg_thresh : bdi_thresh = background_thresh : dirty_thresh
	 *
	 * dirty_thresh is supplied by the caller and is not validated here,
	 * so guard against a zero value instead of dividing by it.
	 */
	if (bdi_bg_thresh)
		*bdi_bg_thresh = dirty_thresh ?
			div_u64((u64)*bdi_thresh * background_thresh,
				dirty_thresh) : 0;

	/*
	 * In order to avoid the stacked BDI deadlock we need
	 * to ensure we accurately count the 'dirty' pages when
	 * the threshold is low.
	 *
	 * Otherwise it would be possible to get thresh+n pages
	 * reported dirty, even though there are thresh-m pages
	 * actually dirty; with m+n sitting in the percpu
	 * deltas.
	 */
	if (*bdi_thresh < 2 * bdi_stat_error(bdi)) {
		bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
		*bdi_dirty = bdi_reclaimable +
			bdi_stat_sum(bdi, BDI_WRITEBACK);
	} else {
		bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
		*bdi_dirty = bdi_reclaimable +
			bdi_stat(bdi, BDI_WRITEBACK);
	}
}

/*
 * balance_dirty_pages() must be called by processes which are generating dirty
 * data.  It looks at the number of dirty pages in the machine and will force
@@ -1207,13 +1363,9 @@ static void balance_dirty_pages(struct address_space *mapping,
				unsigned long pages_dirtied)
{
	unsigned long nr_reclaimable;	/* = file_dirty + unstable_nfs */
	unsigned long bdi_reclaimable;
	unsigned long nr_dirty;  /* = file_dirty + writeback + unstable_nfs */
	unsigned long bdi_dirty;
	unsigned long freerun;
	unsigned long background_thresh;
	unsigned long dirty_thresh;
	unsigned long bdi_thresh;
	long period;
	long pause;
	long max_pause;
@@ -1224,10 +1376,16 @@ static void balance_dirty_pages(struct address_space *mapping,
	unsigned long dirty_ratelimit;
	unsigned long pos_ratio;
	struct backing_dev_info *bdi = mapping->backing_dev_info;
	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
	unsigned long start_time = jiffies;

	for (;;) {
		unsigned long now = jiffies;
		unsigned long uninitialized_var(bdi_thresh);
		unsigned long thresh;
		unsigned long uninitialized_var(bdi_dirty);
		unsigned long dirty;
		unsigned long bg_thresh;

		/*
		 * Unstable writes are a feature of certain networked
@@ -1241,61 +1399,44 @@ static void balance_dirty_pages(struct address_space *mapping,

		global_dirty_limits(&background_thresh, &dirty_thresh);

		if (unlikely(strictlimit)) {
			bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
					 &bdi_dirty, &bdi_thresh, &bg_thresh);

			dirty = bdi_dirty;
			thresh = bdi_thresh;
		} else {
			dirty = nr_dirty;
			thresh = dirty_thresh;
			bg_thresh = background_thresh;
		}

		/*
		 * Throttle it only when the background writeback cannot
		 * catch-up. This avoids (excessively) small writeouts
		 * when the bdi limits are ramping up.
		 * when the bdi limits are ramping up in case of !strictlimit.
		 *
		 * In strictlimit case make decision based on the bdi counters
		 * and limits. Small writeouts when the bdi limits are ramping
		 * up are the price we consciously pay for strictlimit-ing.
		 */
		freerun = dirty_freerun_ceiling(dirty_thresh,
						background_thresh);
		if (nr_dirty <= freerun) {
		if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {
			current->dirty_paused_when = now;
			current->nr_dirtied = 0;
			current->nr_dirtied_pause =
				dirty_poll_interval(nr_dirty, dirty_thresh);
				dirty_poll_interval(dirty, thresh);
			break;
		}

		if (unlikely(!writeback_in_progress(bdi)))
			bdi_start_background_writeback(bdi);

		/*
		 * bdi_thresh is not treated as some limiting factor as
		 * dirty_thresh, due to reasons
		 * - in JBOD setup, bdi_thresh can fluctuate a lot
		 * - in a system with HDD and USB key, the USB key may somehow
		 *   go into state (bdi_dirty >> bdi_thresh) either because
		 *   bdi_dirty starts high, or because bdi_thresh drops low.
		 *   In this case we don't want to hard throttle the USB key
		 *   dirtiers for 100 seconds until bdi_dirty drops under
		 *   bdi_thresh. Instead the auxiliary bdi control line in
		 *   bdi_position_ratio() will let the dirtier task progress
		 *   at some rate <= (write_bw / 2) for bringing down bdi_dirty.
		 */
		bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);

		/*
		 * In order to avoid the stacked BDI deadlock we need
		 * to ensure we accurately count the 'dirty' pages when
		 * the threshold is low.
		 *
		 * Otherwise it would be possible to get thresh+n pages
		 * reported dirty, even though there are thresh-m pages
		 * actually dirty; with m+n sitting in the percpu
		 * deltas.
		 */
		if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
			bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
			bdi_dirty = bdi_reclaimable +
				    bdi_stat_sum(bdi, BDI_WRITEBACK);
		} else {
			bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
			bdi_dirty = bdi_reclaimable +
				    bdi_stat(bdi, BDI_WRITEBACK);
		}
		if (!strictlimit)
			bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
					 &bdi_dirty, &bdi_thresh, NULL);

		dirty_exceeded = (bdi_dirty > bdi_thresh) &&
				  (nr_dirty > dirty_thresh);
				 ((nr_dirty > dirty_thresh) || strictlimit);
		if (dirty_exceeded && !bdi->dirty_exceeded)
			bdi->dirty_exceeded = 1;