Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit f8891e5e authored by Christoph Lameter's avatar Christoph Lameter Committed by Linus Torvalds
Browse files

[PATCH] Light weight event counters

The remaining counters in page_state after the zoned VM counter patches
have been applied are all just for show in /proc/vmstat.  They have no
essential function for the VM.

We use a simple increment of per cpu variables.  In order to avoid the most
severe races we disable preempt.  Preempt does not prevent the race between
an increment and an interrupt handler incrementing the same statistics
counter.  However, that race is exceedingly rare, we may only loose one
increment or so and there is no requirement (at least not in kernel) that
the vm event counters have to be accurate.

In the non preempt case this results in a simple increment for each
counter.  For many architectures this will be reduced by the compiler to a
single instruction.  This single instruction is atomic for i386 and x86_64.
 And therefore even the rare race condition in an interrupt is avoided for
both architectures in most cases.

The patchset also adds an off switch for embedded systems that allows a
building of linux kernels without these counters.

The implementation of these counters is through inline code that hopefully
results in only a single instruction increment instruction being emitted
(i386, x86_64) or in the increment being hidden though instruction
concurrency (EPIC architectures such as ia64 can get that done).

Benefits:
- VM event counter operations usually reduce to a single inline instruction
  on i386 and x86_64.
- No interrupt disable, only preempt disable for the preempt case.
  Preempt disable can also be avoided by moving the counter into a spinlock.
- Handling is similar to zoned VM counters.
- Simple and easily extendable.
- Can be omitted to reduce memory use for embedded use.

References:

RFC http://marc.theaimsgroup.com/?l=linux-kernel&m=113512330605497&w=2
RFC http://marc.theaimsgroup.com/?l=linux-kernel&m=114988082814934&w=2
local_t http://marc.theaimsgroup.com/?l=linux-kernel&m=114991748606690&w=2
V2 http://marc.theaimsgroup.com/?t=115014808400007&r=1&w=2
V3 http://marc.theaimsgroup.com/?l=linux-kernel&m=115024767022346&w=2
V4 http://marc.theaimsgroup.com/?l=linux-kernel&m=115047968808926&w=2



Signed-off-by: default avatarChristoph Lameter <clameter@sgi.com>
Signed-off-by: default avatarAndrew Morton <akpm@osdl.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@osdl.org>
parent ca889e6c
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -766,7 +766,6 @@ unsigned long nr_iowait(void)
#endif /* MODULE */
EXPORT_SYMBOL_GPL(si_swapinfo);
EXPORT_SYMBOL_GPL(nr_threads);
EXPORT_SYMBOL_GPL(get_full_page_state);
EXPORT_SYMBOL_GPL(nr_running);
EXPORT_SYMBOL_GPL(nr_iowait);
//EXPORT_SYMBOL_GPL(nr_context_switches);
+10 −10
Original line number Diff line number Diff line
@@ -107,21 +107,21 @@ static void appldata_get_mem_data(void *data)
	 * serialized through the appldata_ops_lock and can use static
	 */
	static struct sysinfo val;
	static struct page_state ps;
	unsigned long ev[NR_VM_EVENT_ITEMS];
	struct appldata_mem_data *mem_data;

	mem_data = data;
	mem_data->sync_count_1++;

	get_full_page_state(&ps);
	mem_data->pgpgin     = ps.pgpgin >> 1;
	mem_data->pgpgout    = ps.pgpgout >> 1;
	mem_data->pswpin     = ps.pswpin;
	mem_data->pswpout    = ps.pswpout;
	mem_data->pgalloc    = ps.pgalloc_high + ps.pgalloc_normal +
			       ps.pgalloc_dma;
	mem_data->pgfault    = ps.pgfault;
	mem_data->pgmajfault = ps.pgmajfault;
	all_vm_events(ev);
	mem_data->pgpgin     = ev[PGPGIN] >> 1;
	mem_data->pgpgout    = ev[PGPGOUT] >> 1;
	mem_data->pswpin     = ev[PSWPIN];
	mem_data->pswpout    = ev[PSWPOUT];
	mem_data->pgalloc    = ev[PGALLOC_HIGH] + ev[PGALLOC_NORMAL] +
			       ev[PGALLOC_DMA];
	mem_data->pgfault    = ev[PGFAULT];
	mem_data->pgmajfault = ev[PGMAJFAULT];

	si_meminfo(&val);
	mem_data->sharedram = val.sharedram;
+2 −2
Original line number Diff line number Diff line
@@ -3117,9 +3117,9 @@ void submit_bio(int rw, struct bio *bio)
	BIO_BUG_ON(!bio->bi_io_vec);
	bio->bi_rw |= rw;
	if (rw & WRITE)
		mod_page_state(pgpgout, count);
		count_vm_events(PGPGOUT, count);
	else
		mod_page_state(pgpgin, count);
		count_vm_events(PGPGIN, count);

	if (unlikely(block_dump)) {
		char b[BDEVNAME_SIZE];
+6 −5
Original line number Diff line number Diff line
@@ -411,16 +411,17 @@ static __inline__ int led_get_net_activity(void)
static __inline__ int led_get_diskio_activity(void)
{	
	static unsigned long last_pgpgin, last_pgpgout;
	struct page_state pgstat;
	unsigned long events[NR_VM_EVENT_ITEMS];
	int changed;

	get_full_page_state(&pgstat); /* get no of sectors in & out */
	all_vm_events(events);

	/* Just use a very simple calculation here. Do not care about overflow,
	   since we only want to know if there was activity or not. */
	changed = (pgstat.pgpgin != last_pgpgin) || (pgstat.pgpgout != last_pgpgout);
	last_pgpgin  = pgstat.pgpgin;
	last_pgpgout = pgstat.pgpgout;
	changed = (events[PGPGIN] != last_pgpgin) ||
		  (events[PGPGOUT] != last_pgpgout);
	last_pgpgin  = events[PGPGIN];
	last_pgpgout = events[PGPGOUT];

	return (changed ? LED_DISK_IO : 0);
}
+4 −5
Original line number Diff line number Diff line
@@ -452,15 +452,14 @@ static void prune_icache(int nr_to_scan)
		nr_pruned++;
	}
	inodes_stat.nr_unused -= nr_pruned;
	if (current_is_kswapd())
		__count_vm_events(KSWAPD_INODESTEAL, reap);
	else
		__count_vm_events(PGINODESTEAL, reap);
	spin_unlock(&inode_lock);

	dispose_list(&freeable);
	mutex_unlock(&iprune_mutex);

	if (current_is_kswapd())
		mod_page_state(kswapd_inodesteal, reap);
	else
		mod_page_state(pginodesteal, reap);
}

/*
Loading