
Commit 54c707e9 authored by Glauber Costa, committed by Ingo Molnar

sched/accounting: Re-use scheduler statistics for the root cgroup



Right now, after we collect tick statistics for user and system time and store
them in a well-known location, we keep the same statistics again for cpuacct.
Since cpuacct is hierarchical, the numbers for the root cgroup should be
absolutely equal to the system-wide numbers.

So it would be better to just re-use them: this patch changes cpuacct accounting
so that the cpustat statistics are kept in a struct kernel_cpustat percpu
array. In the root cgroup case, we just point it to the main array. The rest of
the hierarchy walk can be totally disabled later with a static branch - but I am
not doing that here.
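
The idea in miniature (an editor's sketch, not part of the patch): per-group
stats live in a percpu struct kernel_cpustat, and the root group's pointer
simply aliases the system-wide array, so the root's numbers equal the
system-wide numbers by construction and the hierarchy walk can skip the root.
The plain userspace C below mimics this with ordinary arrays; the names
account_field, global_cpustat, NR_CPUS and NR_STATS are invented for the
illustration and are not kernel API.

#include <stdio.h>

#define NR_CPUS 4
#define NR_STATS 2	/* stand-ins for CPUTIME_USER, CPUTIME_SYSTEM */

struct kernel_cpustat {
	unsigned long long cpustat[NR_STATS];
};

/* system-wide statistics: one entry per cpu */
static struct kernel_cpustat global_cpustat[NR_CPUS];

struct cpuacct {
	struct cpuacct *parent;
	struct kernel_cpustat *cpustat;	/* this group's per-cpu array */
};

/*
 * The root group gets no storage of its own: it aliases the system-wide
 * array, so its numbers match the system-wide ones by construction.
 */
static struct cpuacct root_cpuacct = {
	.parent = NULL,
	.cpustat = global_cpustat,
};

static void account_field(struct cpuacct *ca, int cpu, int idx,
			  unsigned long long delta)
{
	/* every update touches the root (== system-wide) stats first */
	global_cpustat[cpu].cpustat[idx] += delta;

	/* then walk only the non-root part of the hierarchy */
	while (ca && ca != &root_cpuacct) {
		ca->cpustat[cpu].cpustat[idx] += delta;
		ca = ca->parent;
	}
}

int main(void)
{
	static struct kernel_cpustat child_stats[NR_CPUS];
	struct cpuacct child = { .parent = &root_cpuacct, .cpustat = child_stats };

	account_field(&child, 0, 0, 100);	/* tick for a task in the child group */
	account_field(&root_cpuacct, 0, 0, 50);	/* tick for a task in the root group */

	printf("root: %llu, child: %llu\n",
	       root_cpuacct.cpustat[0].cpustat[0],	/* 150: sees everything */
	       child.cpustat[0].cpustat[0]);		/* 100: only its own share */
	return 0;
}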

Signed-off-by: Glauber Costa <glommer@parallels.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Turner <pjt@google.com>
Link: http://lkml.kernel.org/r/1322498719-2255-4-git-send-email-glommer@parallels.com


Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent b39e66ea
kernel/sched/core.c +76 −89
@@ -2556,6 +2556,42 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	return ns;
 }
 
+#ifdef CONFIG_CGROUP_CPUACCT
+struct cgroup_subsys cpuacct_subsys;
+struct cpuacct root_cpuacct;
+#endif
+
+static inline void task_group_account_field(struct task_struct *p,
+					    int index, u64 tmp)
+{
+#ifdef CONFIG_CGROUP_CPUACCT
+	struct kernel_cpustat *kcpustat;
+	struct cpuacct *ca;
+#endif
+	/*
+	 * Since all updates are sure to touch the root cgroup, we
+	 * get ourselves ahead and touch it first. If the root cgroup
+	 * is the only cgroup, then nothing else should be necessary.
+	 *
+	 */
+	__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
+
+#ifdef CONFIG_CGROUP_CPUACCT
+	if (unlikely(!cpuacct_subsys.active))
+		return;
+
+	rcu_read_lock();
+	ca = task_ca(p);
+	while (ca && (ca != &root_cpuacct)) {
+		kcpustat = this_cpu_ptr(ca->cpustat);
+		kcpustat->cpustat[index] += tmp;
+		ca = parent_ca(ca);
+	}
+	rcu_read_unlock();
+#endif
+}
+
+
 /*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
@@ -2580,7 +2616,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
 	index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 	cpustat[index] += tmp;
 
-	cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
+	task_group_account_field(p, index, cputime);
 	/* Account for user time used */
 	acct_update_integrals(p);
 }
@@ -2636,7 +2672,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,

 	/* Add system time to cpustat. */
 	cpustat[index] += tmp;
-	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+	task_group_account_field(p, index, cputime);
 
 	/* Account for system time used */
 	acct_update_integrals(p);
@@ -6781,8 +6817,15 @@ void __init sched_init(void)
 	INIT_LIST_HEAD(&root_task_group.children);
 	INIT_LIST_HEAD(&root_task_group.siblings);
 	autogroup_init(&init_task);
+
 #endif /* CONFIG_CGROUP_SCHED */
 
+#ifdef CONFIG_CGROUP_CPUACCT
+	root_cpuacct.cpustat = &kernel_cpustat;
+	root_cpuacct.cpuusage = alloc_percpu(u64);
+	/* Too early, not expected to fail */
+	BUG_ON(!root_cpuacct.cpuusage);
+#endif
 	for_each_possible_cpu(i) {
 		struct rq *rq;

@@ -7843,44 +7886,16 @@ struct cgroup_subsys cpu_cgroup_subsys = {
  * (balbir@in.ibm.com).
  */
 
-/* track cpu usage of a group of tasks and its child groups */
-struct cpuacct {
-	struct cgroup_subsys_state css;
-	/* cpuusage holds pointer to a u64-type object on every cpu */
-	u64 __percpu *cpuusage;
-	struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
-};
-
-struct cgroup_subsys cpuacct_subsys;
-
-/* return cpu accounting group corresponding to this container */
-static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
-{
-	return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
-			    struct cpuacct, css);
-}
-
-/* return cpu accounting group to which this task belongs */
-static inline struct cpuacct *task_ca(struct task_struct *tsk)
-{
-	return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
-			    struct cpuacct, css);
-}
-
-static inline struct cpuacct *parent_ca(struct cpuacct *ca)
-{
-	if (!ca || !ca->css.cgroup->parent)
-		return NULL;
-	return cgroup_ca(ca->css.cgroup->parent);
-}
-
 /* create a new cpu accounting group */
 static struct cgroup_subsys_state *cpuacct_create(
 	struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
-	struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
-	int i;
+	struct cpuacct *ca;
 
+	if (!cgrp->parent)
+		return &root_cpuacct.css;
+
+	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
 	if (!ca)
 		goto out;

@@ -7888,15 +7903,13 @@ static struct cgroup_subsys_state *cpuacct_create(
 	if (!ca->cpuusage)
 		goto out_free_ca;
 
-	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
-		if (percpu_counter_init(&ca->cpustat[i], 0))
-			goto out_free_counters;
+	ca->cpustat = alloc_percpu(struct kernel_cpustat);
+	if (!ca->cpustat)
+		goto out_free_cpuusage;
 
 	return &ca->css;
 
-out_free_counters:
-	while (--i >= 0)
-		percpu_counter_destroy(&ca->cpustat[i]);
+out_free_cpuusage:
 	free_percpu(ca->cpuusage);
 out_free_ca:
 	kfree(ca);
@@ -7909,10 +7922,8 @@ static void
 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
 	struct cpuacct *ca = cgroup_ca(cgrp);
-	int i;
 
-	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
-		percpu_counter_destroy(&ca->cpustat[i]);
+	free_percpu(ca->cpustat);
 	free_percpu(ca->cpuusage);
 	kfree(ca);
 }
@@ -8008,13 +8019,28 @@ static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
 			      struct cgroup_map_cb *cb)
 {
 	struct cpuacct *ca = cgroup_ca(cgrp);
-	int i;
+	int cpu;
+	s64 val = 0;
 
-	for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
-		s64 val = percpu_counter_read(&ca->cpustat[i]);
-		val = cputime64_to_clock_t(val);
-		cb->fill(cb, cpuacct_stat_desc[i], val);
+	for_each_online_cpu(cpu) {
+		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+		val += kcpustat->cpustat[CPUTIME_USER];
+		val += kcpustat->cpustat[CPUTIME_NICE];
 	}
+	val = cputime64_to_clock_t(val);
+	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
+
+	val = 0;
+	for_each_online_cpu(cpu) {
+		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+		val += kcpustat->cpustat[CPUTIME_SYSTEM];
+		val += kcpustat->cpustat[CPUTIME_IRQ];
+		val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+	}
+
+	val = cputime64_to_clock_t(val);
+	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+
 	return 0;
 }

@@ -8066,45 +8092,6 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 	rcu_read_unlock();
 }
 
-/*
- * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
- * in cputime_t units. As a result, cpuacct_update_stats calls
- * percpu_counter_add with values large enough to always overflow the
- * per cpu batch limit causing bad SMP scalability.
- *
- * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
- * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
- * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
- */
-#ifdef CONFIG_SMP
-#define CPUACCT_BATCH	\
-	min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
-#else
-#define CPUACCT_BATCH	0
-#endif
-
-/*
- * Charge the system/user time to the task's accounting group.
- */
-void cpuacct_update_stats(struct task_struct *tsk,
-		enum cpuacct_stat_index idx, cputime_t val)
-{
-	struct cpuacct *ca;
-	int batch = CPUACCT_BATCH;
-
-	if (unlikely(!cpuacct_subsys.active))
-		return;
-
-	rcu_read_lock();
-	ca = task_ca(tsk);
-
-	do {
-		__percpu_counter_add(&ca->cpustat[idx], val, batch);
-		ca = parent_ca(ca);
-	} while (ca);
-	rcu_read_unlock();
-}
-
 struct cgroup_subsys cpuacct_subsys = {
 	.name = "cpuacct",
 	.create = cpuacct_create,
kernel/sched/sched.h +30 −4
@@ -830,13 +830,39 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
 extern void update_cpu_load(struct rq *this_rq);
 
 #ifdef CONFIG_CGROUP_CPUACCT
+#include <linux/cgroup.h>
+/* track cpu usage of a group of tasks and its child groups */
+struct cpuacct {
+	struct cgroup_subsys_state css;
+	/* cpuusage holds pointer to a u64-type object on every cpu */
+	u64 __percpu *cpuusage;
+	struct kernel_cpustat __percpu *cpustat;
+};
+
+/* return cpu accounting group corresponding to this container */
+static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
+{
+	return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
+			    struct cpuacct, css);
+}
+
+/* return cpu accounting group to which this task belongs */
+static inline struct cpuacct *task_ca(struct task_struct *tsk)
+{
+	return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
+			    struct cpuacct, css);
+}
+
+static inline struct cpuacct *parent_ca(struct cpuacct *ca)
+{
+	if (!ca || !ca->css.cgroup->parent)
+		return NULL;
+	return cgroup_ca(ca->css.cgroup->parent);
+}
+
 extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-extern void cpuacct_update_stats(struct task_struct *tsk,
-		enum cpuacct_stat_index idx, cputime_t val);
 #else
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
-static inline void cpuacct_update_stats(struct task_struct *tsk,
-		enum cpuacct_stat_index idx, cputime_t val) {}
 #endif
 
 static inline void inc_nr_running(struct rq *rq)
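
For reference, the aggregation done by the new cpuacct_stats_show() can be
sketched in plain userspace C as below. It skips the cputime64_to_clock_t
conversion; the helper names stat_user and stat_system and the fixed NR_CPUS
are invented for the illustration, while the cpustat indices mirror the
kernel's enum cpu_usage_stat.

#include <stdio.h>

#define NR_CPUS 2

/* subset of the kernel's enum cpu_usage_stat indices */
enum { CPUTIME_USER, CPUTIME_NICE, CPUTIME_SYSTEM,
       CPUTIME_SOFTIRQ, CPUTIME_IRQ, NR_CPUTIME };

struct kernel_cpustat {
	long long cpustat[NR_CPUTIME];
};

/* "user" line of cpuacct.stat: USER + NICE summed over all cpus */
static long long stat_user(const struct kernel_cpustat *pcpu)
{
	long long val = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		val += pcpu[cpu].cpustat[CPUTIME_USER] +
		       pcpu[cpu].cpustat[CPUTIME_NICE];
	return val;
}

/* "system" line: SYSTEM + IRQ + SOFTIRQ summed over all cpus */
static long long stat_system(const struct kernel_cpustat *pcpu)
{
	long long val = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		val += pcpu[cpu].cpustat[CPUTIME_SYSTEM] +
		       pcpu[cpu].cpustat[CPUTIME_IRQ] +
		       pcpu[cpu].cpustat[CPUTIME_SOFTIRQ];
	return val;
}

int main(void)
{
	struct kernel_cpustat stats[NR_CPUS] = {
		{ { 10, 2, 5, 1, 1 } },
		{ { 20, 0, 7, 0, 2 } },
	};

	printf("user %lld\nsystem %lld\n",
	       stat_user(stats), stat_system(stats));	/* user 32, system 16 */
	return 0;
}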