include/linux/sched/sysctl.h +18 −0

@@ -20,11 +20,17 @@ extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
 enum { sysctl_hung_task_timeout_secs = 0 };
 #endif

 #define MAX_CLUSTERS 3
 /* MAX_MARGIN_LEVELS should be one less than MAX_CLUSTERS */
 #define MAX_MARGIN_LEVELS (MAX_CLUSTERS - 1)

 extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;

 #ifdef CONFIG_SCHED_WALT
 extern unsigned int __weak sysctl_sched_capacity_margin_up[MAX_MARGIN_LEVELS];
 extern unsigned int __weak sysctl_sched_capacity_margin_down[MAX_MARGIN_LEVELS];
 extern unsigned int __weak sysctl_sched_user_hint;
 extern const int __weak sched_user_hint_max;
 extern unsigned int __weak sysctl_sched_cpu_high_irqload;

@@ -55,12 +61,24 @@ walt_proc_user_hint_handler(struct ctl_table *table, int write,
 			void __user *buffer, size_t *lenp, loff_t *ppos);

 extern int __weak
 sched_updown_migrate_handler(struct ctl_table *table, int write,
 			void __user *buffer, size_t *lenp, loff_t *ppos);

 extern int __weak
 sched_ravg_window_handler(struct ctl_table *table, int write,
 			void __user *buffer, size_t *lenp, loff_t *ppos);
 #endif

 #if defined(CONFIG_PREEMPT_TRACER) || defined(CONFIG_DEBUG_PREEMPT)
 extern unsigned int sysctl_preemptoff_tracing_threshold_ns;
 #endif
 #if defined(CONFIG_PREEMPTIRQ_EVENTS) && defined(CONFIG_IRQSOFF_TRACER)
 extern unsigned int sysctl_irqsoff_tracing_threshold_ns;
 #endif

 enum sched_tunable_scaling {
 	SCHED_TUNABLESCALING_NONE,
 	SCHED_TUNABLESCALING_LOG,

include/linux/sysctl.h +3 −0

@@ -73,6 +73,9 @@ extern int proc_do_large_bitmap(struct ctl_table *, int,
 extern int proc_do_static_key(struct ctl_table *table, int write,
 			      void __user *buffer, size_t *lenp,
 			      loff_t *ppos);
 extern int proc_douintvec_capacity(struct ctl_table *table, int write,
 				   void __user *buffer, size_t *lenp,
 				   loff_t *ppos);
 extern int proc_douintvec_ravg_window(struct ctl_table *table, int write,
 				      void __user *buffer, size_t *lenp,
 				      loff_t *ppos);
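These declarations only take effect once they are wired into a sysctl table, and that wiring is not part of the hunks shown here. The sketch below is one plausible shape for it: the table name is assumed, and it simply reuses the array sizes and the sched_updown_migrate_handler declared above.

/*
 * Illustrative sketch only: the actual kernel/sysctl.c entries in this
 * patch are not shown in these hunks. Table and entry layout assumed.
 */
static struct ctl_table sched_walt_table[] = {
	{
		.procname	= "sched_capacity_margin_up",
		.data		= sysctl_sched_capacity_margin_up,
		.maxlen		= sizeof(sysctl_sched_capacity_margin_up),
		.mode		= 0644,
		/* custom handler validates margins before applying them */
		.proc_handler	= sched_updown_migrate_handler,
	},
	{
		.procname	= "sched_capacity_margin_down",
		.data		= sysctl_sched_capacity_margin_down,
		.maxlen		= sizeof(sysctl_sched_capacity_margin_down),
		.mode		= 0644,
		.proc_handler	= sched_updown_migrate_handler,
	},
	{ }
};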
include/trace/events/preemptirq.h +28 −0

@@ -62,6 +62,34 @@ DEFINE_EVENT(preemptirq_template, preempt_enable,
 #define trace_preempt_disable_rcuidle(...)
 #endif

 TRACE_EVENT(irqs_disable,

 	TP_PROTO(u64 delta, unsigned long caddr0, unsigned long caddr1,
 		 unsigned long caddr2, unsigned long caddr3),

 	TP_ARGS(delta, caddr0, caddr1, caddr2, caddr3),

 	TP_STRUCT__entry(
 		__field(u64, delta)
 		__field(void*, caddr0)
 		__field(void*, caddr1)
 		__field(void*, caddr2)
 		__field(void*, caddr3)
 	),

 	TP_fast_assign(
 		__entry->delta = delta;
 		__entry->caddr0 = (void *)caddr0;
 		__entry->caddr1 = (void *)caddr1;
 		__entry->caddr2 = (void *)caddr2;
 		__entry->caddr3 = (void *)caddr3;
 	),

 	TP_printk("delta=%llu(ns) Callers:(%ps<-%ps<-%ps<-%ps)",
 		  __entry->delta, __entry->caddr0, __entry->caddr1,
 		  __entry->caddr2, __entry->caddr3)
 );

 #endif /* _TRACE_PREEMPTIRQ_H */

 #include <trace/define_trace.h>
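The emit side of this event is not in the hunks shown here; presumably it sits on the irqsoff path and pairs with the sysctl_irqsoff_tracing_threshold_ns knob declared in sched/sysctl.h, mirroring the preempt-off path in core.c below. A minimal sketch of that pairing — the per-CPU variable and both function names are assumptions for illustration:

static DEFINE_PER_CPU(u64, irq_disable_ts);

/* Called when local IRQs are disabled: remember when. */
static inline void note_irqs_off(void)
{
	this_cpu_write(irq_disable_ts, sched_clock());
}

/*
 * Called when local IRQs are re-enabled: emit the event only if the
 * IRQs-off section exceeded the sysctl threshold.
 */
static inline void note_irqs_on(void)
{
	u64 ts = this_cpu_read(irq_disable_ts);
	u64 delta = ts ? sched_clock() - ts : 0;

	if (delta > sysctl_irqsoff_tracing_threshold_ns)
		trace_irqs_disable(delta, CALLER_ADDR0, CALLER_ADDR1,
				   CALLER_ADDR2, CALLER_ADDR3);
}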
"newly_idle" : "busy"), __entry->balance, __entry->group_mask, __entry->busiest_nr_running, __entry->imbalance, __entry->env_flags, __entry->ld_moved, __entry->balance_interval, __entry->active_balance) ); TRACE_EVENT(sched_load_balance_nohz_kick, TP_PROTO(int cpu, int kick_cpu), TP_ARGS(cpu, kick_cpu), TP_STRUCT__entry( __field(int, cpu) __field(unsigned int, cpu_nr) __field(unsigned long, misfit_task_load) __field(int, cpu_overutil) __field(int, kick_cpu) __field(unsigned long, nohz_flags) ), TP_fast_assign( __entry->cpu = cpu; __entry->cpu_nr = cpu_rq(cpu)->nr_running; __entry->misfit_task_load = cpu_rq(cpu)->misfit_task_load; __entry->cpu_overutil = cpu_overutilized(cpu); __entry->kick_cpu = kick_cpu; __entry->nohz_flags = atomic_read(nohz_flags(kick_cpu)); ), TP_printk("cpu=%d nr_run=%u misfit_task_load=%lu overutilized=%d kick_cpu=%d nohz_flags=0x%lx", __entry->cpu, __entry->cpu_nr, __entry->misfit_task_load, __entry->cpu_overutil, __entry->kick_cpu, __entry->nohz_flags) ); TRACE_EVENT(sched_load_balance_sg_stats, TP_PROTO(unsigned long sg_cpus, int group_type, unsigned int idle_cpus, unsigned int sum_nr_running, unsigned long group_load, unsigned long group_capacity, unsigned long group_util, int group_no_capacity, unsigned long load_per_task, unsigned long misfit_load, unsigned long busiest), TP_ARGS(sg_cpus, group_type, idle_cpus, sum_nr_running, group_load, group_capacity, group_util, group_no_capacity, load_per_task, misfit_load, busiest), TP_STRUCT__entry( __field(unsigned long, group_mask) __field(int, group_type) __field(unsigned int, group_idle_cpus) __field(unsigned int, sum_nr_running) __field(unsigned long, group_load) __field(unsigned long, group_capacity) __field(unsigned long, group_util) __field(int, group_no_capacity) __field(unsigned long, load_per_task) __field(unsigned long, misfit_task_load) __field(unsigned long, busiest) ), TP_fast_assign( __entry->group_mask = sg_cpus; __entry->group_type = group_type; __entry->group_idle_cpus = idle_cpus; __entry->sum_nr_running = sum_nr_running; __entry->group_load = group_load; __entry->group_capacity = group_capacity; __entry->group_util = group_util; __entry->group_no_capacity = group_no_capacity; __entry->load_per_task = load_per_task; __entry->misfit_task_load = misfit_load; __entry->busiest = busiest; ), TP_printk("sched_group=%#lx type=%d idle_cpus=%u sum_nr_run=%u group_load=%lu capacity=%lu util=%lu no_capacity=%d lpt=%lu misfit_tload=%lu busiest_group=%#lx", __entry->group_mask, __entry->group_type, __entry->group_idle_cpus, __entry->sum_nr_running, __entry->group_load, __entry->group_capacity, __entry->group_util, __entry->group_no_capacity, __entry->load_per_task, __entry->misfit_task_load, __entry->busiest) ); TRACE_EVENT(sched_load_balance_stats, TP_PROTO(unsigned long busiest, int bgroup_type, unsigned long bavg_load, unsigned long bload_per_task, unsigned long local, int lgroup_type, unsigned long lavg_load, unsigned long lload_per_task, unsigned long sds_avg_load, unsigned long imbalance), TP_ARGS(busiest, bgroup_type, bavg_load, bload_per_task, local, lgroup_type, lavg_load, lload_per_task, sds_avg_load, imbalance), TP_STRUCT__entry( __field(unsigned long, busiest) __field(int, bgp_type) __field(unsigned long, bavg_load) __field(unsigned long, blpt) __field(unsigned long, local) __field(int, lgp_type) __field(unsigned long, lavg_load) __field(unsigned long, llpt) __field(unsigned long, sds_avg) __field(unsigned long, imbalance) ), TP_fast_assign( __entry->busiest = busiest; __entry->bgp_type = bgroup_type; 
 #endif /* NR_CPUS > BITS_PER_LONG */
 #endif /* CONFIG_SMP */

 #ifdef CONFIG_SCHED_WALT
 TRACE_EVENT(sched_load_balance_skip_tasks,

 	TP_PROTO(int scpu, int dcpu, int grp_type, int pid,
 		 unsigned long h_load, unsigned long task_util,
 		 unsigned long affinity),

 	TP_ARGS(scpu, dcpu, grp_type, pid, h_load, task_util, affinity),

 	TP_STRUCT__entry(
 		__field(int, scpu)
 		__field(unsigned long, src_util_cum)
 		__field(int, grp_type)
 		__field(int, dcpu)
 		__field(unsigned long, dst_util_cum)
 		__field(int, pid)
 		__field(unsigned long, affinity)
 		__field(unsigned long, task_util)
 		__field(unsigned long, h_load)
 	),

 	TP_fast_assign(
 		__entry->scpu = scpu;
 		__entry->src_util_cum =
 				cpu_rq(scpu)->cum_window_demand_scaled;
 		__entry->grp_type = grp_type;
 		__entry->dcpu = dcpu;
 		__entry->dst_util_cum =
 				cpu_rq(dcpu)->cum_window_demand_scaled;
 		__entry->pid = pid;
 		__entry->affinity = affinity;
 		__entry->task_util = task_util;
 		__entry->h_load = h_load;
 	),

 	TP_printk("source_cpu=%d util_cum=%lu group_type=%d dest_cpu=%d util_cum=%lu pid=%d affinity=%#lx task_util=%lu task_h_load=%lu",
 		  __entry->scpu, __entry->src_util_cum, __entry->grp_type,
 		  __entry->dcpu, __entry->dst_util_cum, __entry->pid,
 		  __entry->affinity, __entry->task_util, __entry->h_load)
 );
 #endif

 DECLARE_EVENT_CLASS(sched_process_template,

 	TP_PROTO(struct task_struct *p),

@@ -671,6 +899,238 @@ DECLARE_TRACE(sched_overutilized_tp,
 	TP_PROTO(struct root_domain *rd, bool overutilized),
 	TP_ARGS(rd, overutilized));

 TRACE_EVENT(sched_cpu_util,

 	TP_PROTO(int cpu),

 	TP_ARGS(cpu),

 	TP_STRUCT__entry(
 		__field(unsigned int, cpu)
 		__field(unsigned int, nr_running)
 		__field(long, cpu_util)
 		__field(long, cpu_util_cum)
 		__field(unsigned int, capacity_curr)
 		__field(unsigned int, capacity)
 		__field(unsigned int, capacity_orig)
 		__field(int, idle_state)
 		__field(u64, irqload)
 		__field(int, online)
 		__field(int, isolated)
 		__field(int, reserved)
 		__field(int, high_irq_load)
 	),

 	TP_fast_assign(
 		__entry->cpu = cpu;
 		__entry->nr_running = cpu_rq(cpu)->nr_running;
 		__entry->cpu_util = cpu_util(cpu);
 		__entry->cpu_util_cum = cpu_util_cum(cpu, 0);
 		__entry->capacity_curr = capacity_curr_of(cpu);
 		__entry->capacity = capacity_of(cpu);
 		__entry->capacity_orig = capacity_orig_of(cpu);
 		__entry->idle_state = idle_get_state_idx(cpu_rq(cpu));
 		__entry->irqload = sched_irqload(cpu);
 		__entry->online = cpu_online(cpu);
 		__entry->isolated = cpu_isolated(cpu);
 		__entry->reserved = is_reserved(cpu);
 		__entry->high_irq_load = sched_cpu_high_irqload(cpu);
 	),

 	TP_printk("cpu=%d nr_running=%d cpu_util=%ld cpu_util_cum=%ld capacity_curr=%u capacity=%u capacity_orig=%u idle_state=%d irqload=%llu online=%u isolated=%u reserved=%u high_irq_load=%u",
 		  __entry->cpu, __entry->nr_running, __entry->cpu_util,
 		  __entry->cpu_util_cum, __entry->capacity_curr,
 		  __entry->capacity, __entry->capacity_orig,
 		  __entry->idle_state, __entry->irqload, __entry->online,
 		  __entry->isolated, __entry->reserved,
 		  __entry->high_irq_load)
 );
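sched_cpu_util takes only a CPU id and samples everything else at emit time, so callers can cheaply dump a whole set of CPUs. A call-pattern sketch (the loop and helper are illustrative, not part of this diff) using the trace_<event>_enabled() guard that TRACE_EVENT generates:

/* Illustrative: emit sched_cpu_util for each candidate CPU. */
static void trace_candidate_cpus(const struct cpumask *candidates)
{
	int cpu;

	/* Skip the loop entirely when the event is not enabled. */
	if (!trace_sched_cpu_util_enabled())
		return;

	for_each_cpu(cpu, candidates)
		trace_sched_cpu_util(cpu);
}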
 TRACE_EVENT(sched_compute_energy,

 	TP_PROTO(struct task_struct *p, int eval_cpu,
 		 unsigned long eval_energy, unsigned long prev_energy,
 		 unsigned long best_energy, unsigned long best_energy_cpu),

 	TP_ARGS(p, eval_cpu, eval_energy, prev_energy, best_energy,
 		best_energy_cpu),

 	TP_STRUCT__entry(
 		__field(int, pid)
 		__array(char, comm, TASK_COMM_LEN)
 		__field(unsigned long, util)
 		__field(int, prev_cpu)
 		__field(unsigned long, prev_energy)
 		__field(int, eval_cpu)
 		__field(unsigned long, eval_energy)
 		__field(int, best_energy_cpu)
 		__field(unsigned long, best_energy)
 	),

 	TP_fast_assign(
 		__entry->pid = p->pid;
 		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
 		__entry->util = task_util(p);
 		__entry->prev_cpu = task_cpu(p);
 		__entry->prev_energy = prev_energy;
 		__entry->eval_cpu = eval_cpu;
 		__entry->eval_energy = eval_energy;
 		__entry->best_energy_cpu = best_energy_cpu;
 		__entry->best_energy = best_energy;
 	),

 	TP_printk("pid=%d comm=%s util=%lu prev_cpu=%d prev_energy=%lu eval_cpu=%d eval_energy=%lu best_energy_cpu=%d best_energy=%lu",
 		  __entry->pid, __entry->comm, __entry->util,
 		  __entry->prev_cpu, __entry->prev_energy, __entry->eval_cpu,
 		  __entry->eval_energy, __entry->best_energy_cpu,
 		  __entry->best_energy)
 );

 TRACE_EVENT(sched_task_util,

 	TP_PROTO(struct task_struct *p, unsigned long candidates,
 		 int best_energy_cpu, bool sync, bool need_idle, int fastpath,
 		 bool placement_boost, u64 start_t, bool stune_boosted,
 		 bool is_rtg, bool rtg_skip_min, int start_cpu),

 	TP_ARGS(p, candidates, best_energy_cpu, sync, need_idle, fastpath,
 		placement_boost, start_t, stune_boosted, is_rtg, rtg_skip_min,
 		start_cpu),

 	TP_STRUCT__entry(
 		__field(int, pid)
 		__array(char, comm, TASK_COMM_LEN)
 		__field(unsigned long, util)
 		__field(unsigned long, candidates)
 		__field(int, prev_cpu)
 		__field(int, best_energy_cpu)
 		__field(bool, sync)
 		__field(bool, need_idle)
 		__field(int, fastpath)
 		__field(int, placement_boost)
 		__field(int, rtg_cpu)
 		__field(u64, latency)
 		__field(bool, stune_boosted)
 		__field(bool, is_rtg)
 		__field(bool, rtg_skip_min)
 		__field(int, start_cpu)
 		__field(int, unfilter)
 		__field(unsigned long, cpus_allowed)
 	),

 	TP_fast_assign(
 		__entry->pid = p->pid;
 		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
 		__entry->util = task_util(p);
 		__entry->prev_cpu = task_cpu(p);
 		__entry->candidates = candidates;
 		__entry->best_energy_cpu = best_energy_cpu;
 		__entry->sync = sync;
 		__entry->need_idle = need_idle;
 		__entry->fastpath = fastpath;
 		__entry->placement_boost = placement_boost;
 		__entry->latency = (sched_clock() - start_t);
 		__entry->stune_boosted = stune_boosted;
 		__entry->is_rtg = is_rtg;
 		__entry->rtg_skip_min = rtg_skip_min;
 		__entry->start_cpu = start_cpu;
 #ifdef CONFIG_SCHED_WALT
 		__entry->unfilter = p->unfilter;
 #else
 		__entry->unfilter = 0;
 #endif
 		__entry->cpus_allowed = cpumask_bits(&p->cpus_mask)[0];
 	),

 	TP_printk("pid=%d comm=%s util=%lu prev_cpu=%d candidates=%#lx best_energy_cpu=%d sync=%d need_idle=%d fastpath=%d placement_boost=%d latency=%llu stune_boosted=%d is_rtg=%d rtg_skip_min=%d start_cpu=%d unfilter=%d affinity=%lx",
 		  __entry->pid, __entry->comm, __entry->util,
 		  __entry->prev_cpu, __entry->candidates,
 		  __entry->best_energy_cpu, __entry->sync, __entry->need_idle,
 		  __entry->fastpath, __entry->placement_boost,
 		  __entry->latency, __entry->stune_boosted, __entry->is_rtg,
 		  __entry->rtg_skip_min, __entry->start_cpu,
 		  __entry->unfilter, __entry->cpus_allowed)
 );

 /*
  * Tracepoint for find_best_target
  */
 TRACE_EVENT(sched_find_best_target,

 	TP_PROTO(struct task_struct *tsk, unsigned long min_util,
 		 int start_cpu, int best_idle, int best_active,
 		 int most_spare_cap, int target, int backup),

 	TP_ARGS(tsk, min_util, start_cpu, best_idle, best_active,
 		most_spare_cap, target, backup),

 	TP_STRUCT__entry(
 		__array(char, comm, TASK_COMM_LEN)
 		__field(pid_t, pid)
 		__field(unsigned long, min_util)
 		__field(int, start_cpu)
 		__field(int, best_idle)
 		__field(int, best_active)
 		__field(int, most_spare_cap)
 		__field(int, target)
 		__field(int, backup)
 	),

 	TP_fast_assign(
 		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
 		__entry->pid = tsk->pid;
 		__entry->min_util = min_util;
 		__entry->start_cpu = start_cpu;
 		__entry->best_idle = best_idle;
 		__entry->best_active = best_active;
 		__entry->most_spare_cap = most_spare_cap;
 		__entry->target = target;
 		__entry->backup = backup;
 	),

 	TP_printk("pid=%d comm=%s start_cpu=%d best_idle=%d best_active=%d most_spare_cap=%d target=%d backup=%d",
 		  __entry->pid, __entry->comm, __entry->start_cpu,
 		  __entry->best_idle, __entry->best_active,
 		  __entry->most_spare_cap, __entry->target, __entry->backup)
 );
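sched_task_util records the evaluated CPUs the same way it records cpus_allowed: one bit per CPU, taken from the first word of the cpumask. A post-processing sketch for walking such a mask (the function is illustrative, not part of this diff):

/*
 * Illustrative: walk a candidates mask as recorded by sched_task_util.
 * Like the tracepoint itself, this only covers CPUs in the first
 * BITS_PER_LONG-sized word of the cpumask.
 */
static void dump_candidates(unsigned long candidates)
{
	unsigned long cpu;

	for_each_set_bit(cpu, &candidates, BITS_PER_LONG)
		pr_debug("candidate cpu%lu\n", cpu);
}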
 TRACE_EVENT(sched_preempt_disable,

 	TP_PROTO(u64 delta, bool irqs_disabled, unsigned long caddr0,
 		 unsigned long caddr1, unsigned long caddr2,
 		 unsigned long caddr3),

 	TP_ARGS(delta, irqs_disabled, caddr0, caddr1, caddr2, caddr3),

 	TP_STRUCT__entry(
 		__field(u64, delta)
 		__field(bool, irqs_disabled)
 		__field(void*, caddr0)
 		__field(void*, caddr1)
 		__field(void*, caddr2)
 		__field(void*, caddr3)
 	),

 	TP_fast_assign(
 		__entry->delta = delta;
 		__entry->irqs_disabled = irqs_disabled;
 		__entry->caddr0 = (void *)caddr0;
 		__entry->caddr1 = (void *)caddr1;
 		__entry->caddr2 = (void *)caddr2;
 		__entry->caddr3 = (void *)caddr3;
 	),

 	TP_printk("delta=%llu(ns) irqs_d=%d Callers:(%ps<-%ps<-%ps<-%ps)",
 		  __entry->delta, __entry->irqs_disabled, __entry->caddr0,
 		  __entry->caddr1, __entry->caddr2, __entry->caddr3)
 );

 #endif /* _TRACE_SCHED_H */

 /* This part must be outside protection */

kernel/sched/core.c +56 −1

@@ -3652,6 +3652,8 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	return ns;
 }

 unsigned int capacity_margin_freq = 1280; /* ~20% margin */

 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.

@@ -3847,17 +3849,55 @@ static inline void sched_tick_stop(int cpu) { }

 #if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
 				defined(CONFIG_TRACE_PREEMPT_TOGGLE))
 /*
  * preemptoff stack tracing threshold in ns.
  * default: 1ms
  */
 unsigned int sysctl_preemptoff_tracing_threshold_ns = 1000000UL;

 struct preempt_store {
 	u64 ts;
 	unsigned long caddr[4];
 	bool irqs_disabled;
 };

 DEFINE_PER_CPU(struct preempt_store, the_ps);

 /*
  * This is only called from __schedule() upon context switch.
  *
  * schedule() calls __schedule() with preemption disabled. If we entered
  * idle and are now exiting it, reset the preemption tracking; otherwise
  * we may think preemption was disabled the whole time, up until the
  * non-idle task re-enables preemption in schedule().
  */
 static inline void preempt_latency_reset(void)
 {
 	if (is_idle_task(this_rq()->curr))
 		this_cpu_ptr(&the_ps)->ts = 0;
 }

 /*
  * If the value passed in is equal to the current preempt count
  * then we just disabled preemption. Start timing the latency.
  */
 static inline void preempt_latency_start(int val)
 {
 	int cpu = raw_smp_processor_id();
 	struct preempt_store *ps = &per_cpu(the_ps, cpu);

 	if (preempt_count() == val) {
 		unsigned long ip = get_lock_parent_ip();
 #ifdef CONFIG_DEBUG_PREEMPT
 		current->preempt_disable_ip = ip;
 #endif
 		ps->ts = sched_clock();
 		ps->caddr[0] = CALLER_ADDR0;
 		ps->caddr[1] = CALLER_ADDR1;
 		ps->caddr[2] = CALLER_ADDR2;
 		ps->caddr[3] = CALLER_ADDR3;
 		ps->irqs_disabled = irqs_disabled();

 		trace_preempt_off(CALLER_ADDR0, ip);
 	}
 }

@@ -3890,9 +3930,22 @@ NOKPROBE_SYMBOL(preempt_count_add);
  */
 static inline void preempt_latency_stop(int val)
 {
 	if (preempt_count() == val) {
 		struct preempt_store *ps = &per_cpu(the_ps,
 						raw_smp_processor_id());
 		u64 delta = ps->ts ? (sched_clock() - ps->ts) : 0;

 		/*
 		 * Trace the preempt-disable stack if preemption was
 		 * disabled for longer than the threshold.
 		 */
 		if (delta > sysctl_preemptoff_tracing_threshold_ns)
 			trace_sched_preempt_disable(delta, ps->irqs_disabled,
 					ps->caddr[0], ps->caddr[1],
 					ps->caddr[2], ps->caddr[3]);

 		trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
 	}
 }

 void preempt_count_sub(int val)
 {

@@ -3919,6 +3972,7 @@ NOKPROBE_SYMBOL(preempt_count_sub);
 #else
 static inline void preempt_latency_start(int val) { }
 static inline void preempt_latency_stop(int val) { }
 static inline void preempt_latency_reset(void) { }
 #endif

 static inline unsigned long get_preempt_disable_ip(struct task_struct *p)

@@ -4153,6 +4207,7 @@ static void __sched notrace __schedule(bool preempt)
 		prev->last_sleep_ts = wallclock;
 #endif

 	preempt_latency_reset();
 	walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
 	walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
 	rq->nr_switches++;
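With the default threshold of 1000000 ns, only preempt-off sections longer than 1 ms produce a sched_preempt_disable event. To see the event fire, any section that keeps preemption disabled beyond the threshold will do; a hypothetical debug-only test snippet (the busy-wait under preempt_disable() is deliberate here and should never ship in production code):

#include <linux/delay.h>
#include <linux/preempt.h>

static void provoke_preempt_disable_event(void)
{
	preempt_disable();
	mdelay(2);		/* 2 ms > the default 1 ms threshold */
	preempt_enable();	/* preempt_latency_stop() emits the event */
}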