Loading fs/fuse/inode.c +1 −1 Original line number Diff line number Diff line Loading @@ -930,7 +930,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) fc->bdi.name = "fuse"; fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; /* fuse does it's own writeback accounting */ fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; err = bdi_init(&fc->bdi); if (err) Loading include/linux/backing-dev.h +3 −0 Original line number Diff line number Diff line Loading @@ -243,6 +243,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_EXEC_MAP: Can be mapped for execution * * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed. * * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. */ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 Loading @@ -254,6 +256,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_NO_ACCT_WB 0x00000080 #define BDI_CAP_SWAP_BACKED 0x00000100 #define BDI_CAP_STABLE_WRITES 0x00000200 #define BDI_CAP_STRICTLIMIT 0x00000400 #define BDI_CAP_VMFLAGS \ (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) Loading mm/page-writeback.c +202 −61 Original line number Diff line number Diff line Loading @@ -582,6 +582,37 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) return bdi_dirty; }
/*
 *                           setpoint - dirty 3
 *        f(dirty) := 1.0 + (----------------)
 *                           limit - setpoint
 *
 * It is a 3rd order polynomial that is subject to
 *
 * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
 * (2) f(setpoint) = 1.0 => the balance point
 * (3) f(limit)    = 0   => the hard limit
 * (4) df/dx      <= 0   => negative feedback control
 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
 *     => fast response on large errors; small oscillation near setpoint
 *
 * The result is a fixed-point value scaled by (1 << RATELIMIT_CALC_SHIFT)
 * and clamped to the range [0, 2.0].
 */
static inline long long pos_ratio_polynom(unsigned long setpoint,
					  unsigned long dirty,
					  unsigned long limit)
{
	long long pos_ratio;
	long x;

	/*
	 * Use the full 64-bit divide: div_s64() takes a 32-bit divisor, so
	 * "limit - setpoint + 1" could be truncated to zero and trigger a
	 * divide-by-zero.  OR-ing in 1 guarantees a non-zero divisor.
	 */
	x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
		      (limit - setpoint) | 1);

	/* x^3 in fixed point, rescaling after each multiplication */
	pos_ratio = x;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;

	/* add the constant 1.0 term */
	pos_ratio += 1 << RATELIMIT_CALC_SHIFT;

	return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
}
/* * Dirty position control. * Loading Loading @@ -680,26 +711,80 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, /* * global setpoint * * setpoint - dirty 3 * f(dirty) := 1.0 + (----------------) * limit - setpoint * See comment for pos_ratio_polynom(). */ setpoint = (freerun + limit) / 2; pos_ratio = pos_ratio_polynom(setpoint, dirty, limit); /* * The strictlimit feature is a tool preventing mistrusted filesystems * from growing a large number of dirty pages before throttling. For * such filesystems balance_dirty_pages always checks bdi counters * against bdi limits. Even if global "nr_dirty" is under "freerun". * This is especially important for fuse which sets bdi->max_ratio to * 1% by default. Without strictlimit feature, fuse writeback may * consume arbitrary amount of RAM because it is accounted in * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". * * Here, in bdi_position_ratio(), we calculate pos_ratio based on * two values: bdi_dirty and bdi_thresh. Let's consider an example: * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global * limits are set by default to 10% and 20% (background and throttle). * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is * about ~6K pages (as the average of background and throttle bdi * limits). The 3rd order polynomial will provide positive feedback if * bdi_dirty is under bdi_setpoint and vice versa. 
* * Note, that we cannot use global counters in these calculations * because we want to throttle process writing to a strictlimit BDI * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB * in the example above). */ if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { long long bdi_pos_ratio; unsigned long bdi_bg_thresh; if (bdi_dirty < 8) return min_t(long long, pos_ratio * 2, 2 << RATELIMIT_CALC_SHIFT); if (bdi_dirty >= bdi_thresh) return 0; bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh); bdi_setpoint = dirty_freerun_ceiling(bdi_thresh, bdi_bg_thresh); if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh) return 0; bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty, bdi_thresh); /* * Typically, for strictlimit case, bdi_setpoint << setpoint * and pos_ratio >> bdi_pos_ratio. In the other words global * state ("dirty") is not limiting factor and we have to * make decision based on bdi counters. But there is an * important case when global pos_ratio should get precedence: * global limits are exceeded (e.g. due to activities on other * BDIs) while given strictlimit BDI is below limit. * * it's a 3rd order polynomial that subjects to * "pos_ratio * bdi_pos_ratio" would work for the case above, * but it would look too non-natural for the case of all * activity in the system coming from a single strictlimit BDI * with bdi->max_ratio == 100%. * * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast * (2) f(setpoint) = 1.0 => the balance point * (3) f(limit) = 0 => the hard limit * (4) df/dx <= 0 => negative feedback control * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) * => fast response on large errors; small oscillation near setpoint * Note that min() below somewhat changes the dynamics of the * control system. Normally, pos_ratio value can be well over 3 * (when globally we are at freerun and bdi is well below bdi * setpoint). Now the maximum pos_ratio in the same situation * is 2. 
We might want to tweak this if we observe the control * system is too slow to adapt. */ setpoint = (freerun + limit) / 2; x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, limit - setpoint + 1); pos_ratio = x; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio += 1 << RATELIMIT_CALC_SHIFT; return min(pos_ratio, bdi_pos_ratio); } /* * We have computed basic pos_ratio above based on global situation. If Loading Loading @@ -992,6 +1077,27 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * keep that period small to reduce time lags). */ step = 0; /* * For strictlimit case, calculations above were based on bdi counters * and limits (starting from pos_ratio = bdi_position_ratio() and up to * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). * Hence, to calculate "step" properly, we have to use bdi_dirty as * "dirty" and bdi_setpoint as "setpoint". * * We rampup dirty_ratelimit forcibly if bdi_dirty is low because * it's possible that bdi_thresh is close to zero due to inactivity * of backing device (see the implementation of bdi_dirty_limit()). */ if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { dirty = bdi_dirty; if (bdi_dirty < 8) setpoint = bdi_dirty + 1; else setpoint = (bdi_thresh + bdi_dirty_limit(bdi, bg_thresh)) / 2; } if (dirty < setpoint) { x = min(bdi->balanced_dirty_ratelimit, min(balanced_dirty_ratelimit, task_ratelimit)); Loading Loading @@ -1196,6 +1302,56 @@ static long bdi_min_pause(struct backing_dev_info *bdi, return pages >= DIRTY_POLL_THRESH ? 
1 + t / 2 : t; } static inline void bdi_dirty_limits(struct backing_dev_info *bdi, unsigned long dirty_thresh, unsigned long background_thresh, unsigned long *bdi_dirty, unsigned long *bdi_thresh, unsigned long *bdi_bg_thresh) { unsigned long bdi_reclaimable; /* * bdi_thresh is not treated as some limiting factor as * dirty_thresh, due to reasons * - in JBOD setup, bdi_thresh can fluctuate a lot * - in a system with HDD and USB key, the USB key may somehow * go into state (bdi_dirty >> bdi_thresh) either because * bdi_dirty starts high, or because bdi_thresh drops low. * In this case we don't want to hard throttle the USB key * dirtiers for 100 seconds until bdi_dirty drops under * bdi_thresh. Instead the auxiliary bdi control line in * bdi_position_ratio() will let the dirtier task progress * at some rate <= (write_bw / 2) for bringing down bdi_dirty. */ *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); if (bdi_bg_thresh) *bdi_bg_thresh = div_u64((u64)*bdi_thresh * background_thresh, dirty_thresh); /* * In order to avoid the stacked BDI deadlock we need * to ensure we accurately count the 'dirty' pages when * the threshold is low. * * Otherwise it would be possible to get thresh+n pages * reported dirty, even though there are thresh-m pages * actually dirty; with m+n sitting in the percpu * deltas. */ if (*bdi_thresh < 2 * bdi_stat_error(bdi)) { bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); *bdi_dirty = bdi_reclaimable + bdi_stat_sum(bdi, BDI_WRITEBACK); } else { bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); *bdi_dirty = bdi_reclaimable + bdi_stat(bdi, BDI_WRITEBACK); } } /* * balance_dirty_pages() must be called by processes which are generating dirty * data. 
It looks at the number of dirty pages in the machine and will force Loading @@ -1207,13 +1363,9 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long pages_dirtied) { unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ unsigned long bdi_reclaimable; unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ unsigned long bdi_dirty; unsigned long freerun; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; long period; long pause; long max_pause; Loading @@ -1224,10 +1376,16 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long dirty_ratelimit; unsigned long pos_ratio; struct backing_dev_info *bdi = mapping->backing_dev_info; bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; for (;;) { unsigned long now = jiffies; unsigned long uninitialized_var(bdi_thresh); unsigned long thresh; unsigned long uninitialized_var(bdi_dirty); unsigned long dirty; unsigned long bg_thresh; /* * Unstable writes are a feature of certain networked Loading @@ -1241,61 +1399,44 @@ static void balance_dirty_pages(struct address_space *mapping, global_dirty_limits(&background_thresh, &dirty_thresh); if (unlikely(strictlimit)) { bdi_dirty_limits(bdi, dirty_thresh, background_thresh, &bdi_dirty, &bdi_thresh, &bg_thresh); dirty = bdi_dirty; thresh = bdi_thresh; } else { dirty = nr_dirty; thresh = dirty_thresh; bg_thresh = background_thresh; } /* * Throttle it only when the background writeback cannot * catch-up. This avoids (excessively) small writeouts * when the bdi limits are ramping up. * when the bdi limits are ramping up in case of !strictlimit. * * In strictlimit case make decision based on the bdi counters * and limits. Small writeouts when the bdi limits are ramping * up are the price we consciously pay for strictlimit-ing. 
*/ freerun = dirty_freerun_ceiling(dirty_thresh, background_thresh); if (nr_dirty <= freerun) { if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) { current->dirty_paused_when = now; current->nr_dirtied = 0; current->nr_dirtied_pause = dirty_poll_interval(nr_dirty, dirty_thresh); dirty_poll_interval(dirty, thresh); break; } if (unlikely(!writeback_in_progress(bdi))) bdi_start_background_writeback(bdi); /* * bdi_thresh is not treated as some limiting factor as * dirty_thresh, due to reasons * - in JBOD setup, bdi_thresh can fluctuate a lot * - in a system with HDD and USB key, the USB key may somehow * go into state (bdi_dirty >> bdi_thresh) either because * bdi_dirty starts high, or because bdi_thresh drops low. * In this case we don't want to hard throttle the USB key * dirtiers for 100 seconds until bdi_dirty drops under * bdi_thresh. Instead the auxiliary bdi control line in * bdi_position_ratio() will let the dirtier task progress * at some rate <= (write_bw / 2) for bringing down bdi_dirty. */ bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); /* * In order to avoid the stacked BDI deadlock we need * to ensure we accurately count the 'dirty' pages when * the threshold is low. * * Otherwise it would be possible to get thresh+n pages * reported dirty, even though there are thresh-m pages * actually dirty; with m+n sitting in the percpu * deltas. */ if (bdi_thresh < 2 * bdi_stat_error(bdi)) { bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); bdi_dirty = bdi_reclaimable + bdi_stat_sum(bdi, BDI_WRITEBACK); } else { bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); bdi_dirty = bdi_reclaimable + bdi_stat(bdi, BDI_WRITEBACK); } if (!strictlimit) bdi_dirty_limits(bdi, dirty_thresh, background_thresh, &bdi_dirty, &bdi_thresh, NULL); dirty_exceeded = (bdi_dirty > bdi_thresh) && (nr_dirty > dirty_thresh); ((nr_dirty > dirty_thresh) || strictlimit); if (dirty_exceeded && !bdi->dirty_exceeded) bdi->dirty_exceeded = 1; Loading Loading
fs/fuse/inode.c +1 −1 Original line number Diff line number Diff line Loading @@ -930,7 +930,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) fc->bdi.name = "fuse"; fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; /* fuse does it's own writeback accounting */ fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; err = bdi_init(&fc->bdi); if (err) Loading
include/linux/backing-dev.h +3 −0 Original line number Diff line number Diff line Loading @@ -243,6 +243,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_EXEC_MAP: Can be mapped for execution * * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed. * * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. */ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 Loading @@ -254,6 +256,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_NO_ACCT_WB 0x00000080 #define BDI_CAP_SWAP_BACKED 0x00000100 #define BDI_CAP_STABLE_WRITES 0x00000200 #define BDI_CAP_STRICTLIMIT 0x00000400 #define BDI_CAP_VMFLAGS \ (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) Loading
mm/page-writeback.c +202 −61 Original line number Diff line number Diff line Loading @@ -582,6 +582,37 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) return bdi_dirty; } /* * setpoint - dirty 3 * f(dirty) := 1.0 + (----------------) * limit - setpoint * * it's a 3rd order polynomial that subjects to * * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast * (2) f(setpoint) = 1.0 => the balance point * (3) f(limit) = 0 => the hard limit * (4) df/dx <= 0 => negative feedback control * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) * => fast response on large errors; small oscillation near setpoint */ static inline long long pos_ratio_polynom(unsigned long setpoint, unsigned long dirty, unsigned long limit) { long long pos_ratio; long x; x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, limit - setpoint + 1); pos_ratio = x; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio += 1 << RATELIMIT_CALC_SHIFT; return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT); } /* * Dirty position control. * Loading Loading @@ -680,26 +711,80 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, /* * global setpoint * * setpoint - dirty 3 * f(dirty) := 1.0 + (----------------) * limit - setpoint * See comment for pos_ratio_polynom(). */ setpoint = (freerun + limit) / 2; pos_ratio = pos_ratio_polynom(setpoint, dirty, limit); /* * The strictlimit feature is a tool preventing mistrusted filesystems * from growing a large number of dirty pages before throttling. For * such filesystems balance_dirty_pages always checks bdi counters * against bdi limits. Even if global "nr_dirty" is under "freerun". * This is especially important for fuse which sets bdi->max_ratio to * 1% by default. 
Without strictlimit feature, fuse writeback may * consume arbitrary amount of RAM because it is accounted in * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". * * Here, in bdi_position_ratio(), we calculate pos_ratio based on * two values: bdi_dirty and bdi_thresh. Let's consider an example: * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global * limits are set by default to 10% and 20% (background and throttle). * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is * about ~6K pages (as the average of background and throttle bdi * limits). The 3rd order polynomial will provide positive feedback if * bdi_dirty is under bdi_setpoint and vice versa. * * Note, that we cannot use global counters in these calculations * because we want to throttle process writing to a strictlimit BDI * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB * in the example above). */ if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { long long bdi_pos_ratio; unsigned long bdi_bg_thresh; if (bdi_dirty < 8) return min_t(long long, pos_ratio * 2, 2 << RATELIMIT_CALC_SHIFT); if (bdi_dirty >= bdi_thresh) return 0; bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh); bdi_setpoint = dirty_freerun_ceiling(bdi_thresh, bdi_bg_thresh); if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh) return 0; bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty, bdi_thresh); /* * Typically, for strictlimit case, bdi_setpoint << setpoint * and pos_ratio >> bdi_pos_ratio. In the other words global * state ("dirty") is not limiting factor and we have to * make decision based on bdi counters. But there is an * important case when global pos_ratio should get precedence: * global limits are exceeded (e.g. due to activities on other * BDIs) while given strictlimit BDI is below limit. 
* * it's a 3rd order polynomial that subjects to * "pos_ratio * bdi_pos_ratio" would work for the case above, * but it would look too non-natural for the case of all * activity in the system coming from a single strictlimit BDI * with bdi->max_ratio == 100%. * * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast * (2) f(setpoint) = 1.0 => the balance point * (3) f(limit) = 0 => the hard limit * (4) df/dx <= 0 => negative feedback control * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) * => fast response on large errors; small oscillation near setpoint * Note that min() below somewhat changes the dynamics of the * control system. Normally, pos_ratio value can be well over 3 * (when globally we are at freerun and bdi is well below bdi * setpoint). Now the maximum pos_ratio in the same situation * is 2. We might want to tweak this if we observe the control * system is too slow to adapt. */ setpoint = (freerun + limit) / 2; x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, limit - setpoint + 1); pos_ratio = x; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio += 1 << RATELIMIT_CALC_SHIFT; return min(pos_ratio, bdi_pos_ratio); } /* * We have computed basic pos_ratio above based on global situation. If Loading Loading @@ -992,6 +1077,27 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * keep that period small to reduce time lags). */ step = 0; /* * For strictlimit case, calculations above were based on bdi counters * and limits (starting from pos_ratio = bdi_position_ratio() and up to * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). * Hence, to calculate "step" properly, we have to use bdi_dirty as * "dirty" and bdi_setpoint as "setpoint". 
* * We rampup dirty_ratelimit forcibly if bdi_dirty is low because * it's possible that bdi_thresh is close to zero due to inactivity * of backing device (see the implementation of bdi_dirty_limit()). */ if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { dirty = bdi_dirty; if (bdi_dirty < 8) setpoint = bdi_dirty + 1; else setpoint = (bdi_thresh + bdi_dirty_limit(bdi, bg_thresh)) / 2; } if (dirty < setpoint) { x = min(bdi->balanced_dirty_ratelimit, min(balanced_dirty_ratelimit, task_ratelimit)); Loading Loading @@ -1196,6 +1302,56 @@ static long bdi_min_pause(struct backing_dev_info *bdi, return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; } static inline void bdi_dirty_limits(struct backing_dev_info *bdi, unsigned long dirty_thresh, unsigned long background_thresh, unsigned long *bdi_dirty, unsigned long *bdi_thresh, unsigned long *bdi_bg_thresh) { unsigned long bdi_reclaimable; /* * bdi_thresh is not treated as some limiting factor as * dirty_thresh, due to reasons * - in JBOD setup, bdi_thresh can fluctuate a lot * - in a system with HDD and USB key, the USB key may somehow * go into state (bdi_dirty >> bdi_thresh) either because * bdi_dirty starts high, or because bdi_thresh drops low. * In this case we don't want to hard throttle the USB key * dirtiers for 100 seconds until bdi_dirty drops under * bdi_thresh. Instead the auxiliary bdi control line in * bdi_position_ratio() will let the dirtier task progress * at some rate <= (write_bw / 2) for bringing down bdi_dirty. */ *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); if (bdi_bg_thresh) *bdi_bg_thresh = div_u64((u64)*bdi_thresh * background_thresh, dirty_thresh); /* * In order to avoid the stacked BDI deadlock we need * to ensure we accurately count the 'dirty' pages when * the threshold is low. * * Otherwise it would be possible to get thresh+n pages * reported dirty, even though there are thresh-m pages * actually dirty; with m+n sitting in the percpu * deltas. 
*/ if (*bdi_thresh < 2 * bdi_stat_error(bdi)) { bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); *bdi_dirty = bdi_reclaimable + bdi_stat_sum(bdi, BDI_WRITEBACK); } else { bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); *bdi_dirty = bdi_reclaimable + bdi_stat(bdi, BDI_WRITEBACK); } } /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force Loading @@ -1207,13 +1363,9 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long pages_dirtied) { unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ unsigned long bdi_reclaimable; unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ unsigned long bdi_dirty; unsigned long freerun; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; long period; long pause; long max_pause; Loading @@ -1224,10 +1376,16 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long dirty_ratelimit; unsigned long pos_ratio; struct backing_dev_info *bdi = mapping->backing_dev_info; bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; for (;;) { unsigned long now = jiffies; unsigned long uninitialized_var(bdi_thresh); unsigned long thresh; unsigned long uninitialized_var(bdi_dirty); unsigned long dirty; unsigned long bg_thresh; /* * Unstable writes are a feature of certain networked Loading @@ -1241,61 +1399,44 @@ static void balance_dirty_pages(struct address_space *mapping, global_dirty_limits(&background_thresh, &dirty_thresh); if (unlikely(strictlimit)) { bdi_dirty_limits(bdi, dirty_thresh, background_thresh, &bdi_dirty, &bdi_thresh, &bg_thresh); dirty = bdi_dirty; thresh = bdi_thresh; } else { dirty = nr_dirty; thresh = dirty_thresh; bg_thresh = background_thresh; } /* * Throttle it only when the background writeback cannot * catch-up. 
This avoids (excessively) small writeouts * when the bdi limits are ramping up. * when the bdi limits are ramping up in case of !strictlimit. * * In strictlimit case make decision based on the bdi counters * and limits. Small writeouts when the bdi limits are ramping * up are the price we consciously pay for strictlimit-ing. */ freerun = dirty_freerun_ceiling(dirty_thresh, background_thresh); if (nr_dirty <= freerun) { if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) { current->dirty_paused_when = now; current->nr_dirtied = 0; current->nr_dirtied_pause = dirty_poll_interval(nr_dirty, dirty_thresh); dirty_poll_interval(dirty, thresh); break; } if (unlikely(!writeback_in_progress(bdi))) bdi_start_background_writeback(bdi); /* * bdi_thresh is not treated as some limiting factor as * dirty_thresh, due to reasons * - in JBOD setup, bdi_thresh can fluctuate a lot * - in a system with HDD and USB key, the USB key may somehow * go into state (bdi_dirty >> bdi_thresh) either because * bdi_dirty starts high, or because bdi_thresh drops low. * In this case we don't want to hard throttle the USB key * dirtiers for 100 seconds until bdi_dirty drops under * bdi_thresh. Instead the auxiliary bdi control line in * bdi_position_ratio() will let the dirtier task progress * at some rate <= (write_bw / 2) for bringing down bdi_dirty. */ bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); /* * In order to avoid the stacked BDI deadlock we need * to ensure we accurately count the 'dirty' pages when * the threshold is low. * * Otherwise it would be possible to get thresh+n pages * reported dirty, even though there are thresh-m pages * actually dirty; with m+n sitting in the percpu * deltas. 
*/ if (bdi_thresh < 2 * bdi_stat_error(bdi)) { bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); bdi_dirty = bdi_reclaimable + bdi_stat_sum(bdi, BDI_WRITEBACK); } else { bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); bdi_dirty = bdi_reclaimable + bdi_stat(bdi, BDI_WRITEBACK); } if (!strictlimit) bdi_dirty_limits(bdi, dirty_thresh, background_thresh, &bdi_dirty, &bdi_thresh, NULL); dirty_exceeded = (bdi_dirty > bdi_thresh) && (nr_dirty > dirty_thresh); ((nr_dirty > dirty_thresh) || strictlimit); if (dirty_exceeded && !bdi->dirty_exceeded) bdi->dirty_exceeded = 1; Loading