Loading include/linux/cpuset.h +1 −1 Original line number Diff line number Diff line Loading @@ -160,7 +160,7 @@ static inline int current_cpuset_is_being_rebound(void) static inline void rebuild_sched_domains(void) { partition_sched_domains(0, NULL, NULL); partition_sched_domains(1, NULL, NULL); } #endif /* !CONFIG_CPUSETS */ Loading kernel/cpuset.c +182 −130 Original line number Diff line number Diff line Loading @@ -14,6 +14,8 @@ * 2003-10-22 Updates by Stephen Hemminger. * 2004 May-July Rework by Paul Jackson. * 2006 Rework by Paul Menage to use generic cgroups * 2008 Rework of the scheduler domains and CPU hotplug handling * by Max Krasnyansky * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of the Linux Loading Loading @@ -236,9 +238,11 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(callback_mutex); /* This is ugly, but preserves the userspace API for existing cpuset /* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead */ * silently switch it to mount "cgroup" instead */ static int cpuset_get_sb(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data, struct vfsmount *mnt) Loading Loading @@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) } /* * Helper routine for rebuild_sched_domains(). * Helper routine for generate_sched_domains(). * Do cpusets a, b have overlapping cpus_allowed masks? */ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) { return cpus_intersects(a->cpus_allowed, b->cpus_allowed); Loading Loading @@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) } /* * rebuild_sched_domains() * * This routine will be called to rebuild the scheduler's dynamic * sched domains: * - if the flag 'sched_load_balance' of any cpuset with non-empty * 'cpus' changes, * - or if the 'cpus' allowed changes in any cpuset which has that * flag enabled, * - or if the 'sched_relax_domain_level' of any cpuset which has * that flag enabled and with non-empty 'cpus' changes, * - or if any cpuset with non-empty 'cpus' is removed, * - or if a cpu gets offlined. * * This routine builds a partial partition of the systems CPUs * (the set of non-overlappping cpumask_t's in the array 'part' * below), and passes that partial partition to the kernel/sched.c * partition_sched_domains() routine, which will rebuild the * schedulers load balancing domains (sched domains) as specified * by that partial partition. A 'partial partition' is a set of * non-overlapping subsets whose union is a subset of that set. * generate_sched_domains() * * This function builds a partial partition of the systems CPUs * A 'partial partition' is a set of non-overlapping subsets whose * union is a subset of that set. * The output of this function needs to be passed to kernel/sched.c * partition_sched_domains() routine, which will rebuild the scheduler's * load balancing domains (sched domains) as specified by that partial * partition. * * See "What is sched_load_balance" in Documentation/cpusets.txt * for a background explanation of this. Loading @@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * * Call with cgroup_mutex held. May take callback_mutex during * call due to the kfifo_alloc() and kmalloc() calls. May nest * a call to the get_online_cpus()/put_online_cpus() pair. * Must not be called holding callback_mutex, because we must not * call get_online_cpus() while holding callback_mutex. Elsewhere * the kernel nests callback_mutex inside get_online_cpus() calls. * So the reverse nesting would risk an ABBA deadlock. * Must be called with cgroup_lock held. * * The three key local variables below are: * q - a linked-list queue of cpuset pointers, used to implement a Loading Loading @@ -588,8 +574,8 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * element of the partition (one sched domain) to be passed to * partition_sched_domains(). */ void rebuild_sched_domains(void) static int generate_sched_domains(cpumask_t **domains, struct sched_domain_attr **attributes) { LIST_HEAD(q); /* queue of cpusets to be scanned */ struct cpuset *cp; /* scans q */ Loading @@ -601,23 +587,26 @@ void rebuild_sched_domains(void) int ndoms; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ csa = NULL; ndoms = 0; doms = NULL; dattr = NULL; csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { ndoms = 1; doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms) goto rebuild; goto done; dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); if (dattr) { *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } *doms = top_cpuset.cpus_allowed; goto rebuild; ndoms = 1; goto done; } csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); Loading Loading @@ -680,18 +669,33 @@ void rebuild_sched_domains(void) } } /* Convert <csn, csa> to <ndoms, doms> */ /* * Now we know how many domains to create. * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. */ doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); if (!doms) goto rebuild; if (!doms) { ndoms = 0; goto done; } /* * The rest of the code, including the scheduler, can deal with * dattr==NULL case. No need to abort if alloc fails. */ dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; cpumask_t *dp; int apn = a->pn; if (apn >= 0) { cpumask_t *dp = doms + nslot; if (apn < 0) { /* Skip completed partitions */ continue; } dp = doms + nslot; if (nslot == ndoms) { static int warnings = 10; Loading @@ -714,27 +718,92 @@ void rebuild_sched_domains(void) if (apn == b->pn) { cpus_or(*dp, *dp, b->cpus_allowed); b->pn = -1; if (dattr) update_domain_attr_tree(dattr + nslot, b); update_domain_attr_tree(dattr + nslot, b); /* Done with this partition */ b->pn = -1; } } nslot++; } } BUG_ON(nslot != ndoms); rebuild: /* Have scheduler rebuild sched domains */ done: kfree(csa); *domains = doms; *attributes = dattr; return ndoms; } /* * Rebuild scheduler domains. * * Call with neither cgroup_mutex held nor within get_online_cpus(). * Takes both cgroup_mutex and get_online_cpus(). * * Cannot be directly called from cpuset code handling changes * to the cpuset pseudo-filesystem, because it cannot be called * from code that already holds cgroup_mutex. */ static void do_rebuild_sched_domains(struct work_struct *unused) { struct sched_domain_attr *attr; cpumask_t *doms; int ndoms; get_online_cpus(); partition_sched_domains(ndoms, doms, dattr); /* Generate domain masks and attrs */ cgroup_lock(); ndoms = generate_sched_domains(&doms, &attr); cgroup_unlock(); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); put_online_cpus(); } done: kfree(csa); /* Don't kfree(doms) -- partition_sched_domains() does that. */ /* Don't kfree(dattr) -- partition_sched_domains() does that. */ static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); /* * Rebuild scheduler domains, asynchronously via workqueue. * * If the flag 'sched_load_balance' of any cpuset with non-empty * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset * which has that flag enabled, or if any cpuset with a non-empty * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * * The rebuild_sched_domains() and partition_sched_domains() * routines must nest cgroup_lock() inside get_online_cpus(), * but such cpuset changes as these must nest that locking the * other way, holding cgroup_lock() for much of the code. * * So in order to avoid an ABBA deadlock, the cpuset code handling * these user changes delegates the actual sched domain rebuilding * to a separate workqueue thread, which ends up processing the * above do_rebuild_sched_domains() function. */ static void async_rebuild_sched_domains(void) { schedule_work(&rebuild_sched_domains_work); } /* * Accomplishes the same scheduler domain rebuild as the above * async_rebuild_sched_domains(), however it directly calls the * rebuild routine synchronously rather than calling it via an * asynchronous work thread. * * This can only be called from code that is not holding * cgroup_mutex (not nested in a cgroup_lock() call.) */ void rebuild_sched_domains(void) { do_rebuild_sched_domains(NULL); } /** Loading Loading @@ -863,7 +932,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf) return retval; if (is_load_balanced) rebuild_sched_domains(); async_rebuild_sched_domains(); return 0; } Loading Loading @@ -1090,7 +1159,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) if (val != cs->relax_domain_level) { cs->relax_domain_level = val; if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) rebuild_sched_domains(); async_rebuild_sched_domains(); } return 0; Loading Loading @@ -1131,7 +1200,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, mutex_unlock(&callback_mutex); if (cpus_nonempty && balance_flag_changed) rebuild_sched_domains(); async_rebuild_sched_domains(); return 0; } Loading Loading @@ -1492,6 +1561,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) default: BUG(); } /* Unreachable but makes gcc happy */ return 0; } static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) Loading @@ -1504,6 +1576,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) default: BUG(); } /* Unrechable but makes gcc happy */ return 0; } Loading Loading @@ -1692,15 +1767,9 @@ static struct cgroup_subsys_state *cpuset_create( } /* * Locking note on the strange update_flag() call below: * * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which * will call rebuild_sched_domains(). The get_online_cpus() * call in rebuild_sched_domains() must not be made while holding * callback_mutex. Elsewhere the kernel nests callback_mutex inside * get_online_cpus() calls. So the reverse nesting would risk an * ABBA deadlock. * will call async_rebuild_sched_domains(). */ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) Loading Loading @@ -1811,7 +1880,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) } /* * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs * If CPU and/or memory hotplug handlers, below, unplug any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, * removing that CPU or node from all cpusets. If this removes the * last CPU or node from a cpuset, then move the tasks in the empty Loading Loading @@ -1902,35 +1971,6 @@ static void scan_for_empty_cpusets(const struct cpuset *root) } } /* * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to * track what's online after any CPU or memory node hotplug or unplug event. * * Since there are two callers of this routine, one for CPU hotplug * events and one for memory node hotplug events, we could have coded * two separate routines here. We code it as a single common routine * in order to minimize text size. */ static void common_cpu_mem_hotplug_unplug(int rebuild_sd) { cgroup_lock(); top_cpuset.cpus_allowed = cpu_online_map; top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; scan_for_empty_cpusets(&top_cpuset); /* * Scheduler destroys domains on hotplug events. * Rebuild them based on the current settings. */ if (rebuild_sd) rebuild_sched_domains(); cgroup_unlock(); } /* * The top_cpuset tracks what CPUs and Memory Nodes are online, * period. This is necessary in order to make cpusets transparent Loading @@ -1939,40 +1979,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd) * * This routine ensures that top_cpuset.cpus_allowed tracks * cpu_online_map on each CPU hotplug (cpuhp) event. * * Called within get_online_cpus(). Needs to call cgroup_lock() * before calling generate_sched_domains(). */ static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, static int cpuset_track_online_cpus(struct notifier_block *unused_nb, unsigned long phase, void *unused_cpu) { struct sched_domain_attr *attr; cpumask_t *doms; int ndoms; switch (phase) { case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: case CPU_ONLINE: case CPU_ONLINE_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: common_cpu_mem_hotplug_unplug(1); break; default: return NOTIFY_DONE; } cgroup_lock(); top_cpuset.cpus_allowed = cpu_online_map; scan_for_empty_cpusets(&top_cpuset); ndoms = generate_sched_domains(&doms, &attr); cgroup_unlock(); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); return NOTIFY_OK; } #ifdef CONFIG_MEMORY_HOTPLUG /* * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. * Call this routine anytime after you change * node_states[N_HIGH_MEMORY]. * See also the previous routine cpuset_handle_cpuhp(). * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. * See also the previous routine cpuset_track_online_cpus(). */ void cpuset_track_online_nodes(void) { common_cpu_mem_hotplug_unplug(0); cgroup_lock(); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; scan_for_empty_cpusets(&top_cpuset); cgroup_unlock(); } #endif Loading @@ -1987,7 +2039,7 @@ void __init cpuset_init_smp(void) top_cpuset.cpus_allowed = cpu_online_map; top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; hotcpu_notifier(cpuset_handle_cpuhp, 0); hotcpu_notifier(cpuset_track_online_cpus, 0); } /** Loading kernel/sched.c +13 −6 Original line number Diff line number Diff line Loading @@ -7696,24 +7696,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * and partition_sched_domains() will fallback to the single partition * 'fallback_doms', it also forces the domains to be rebuilt. * * If doms_new==NULL it will be replaced with cpu_online_map. * ndoms_new==0 is a special case for destroying existing domains. * It will not create the default domain. * * Call with hotplug lock held */ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, struct sched_domain_attr *dattr_new) { int i, j; int i, j, n; mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); if (doms_new == NULL) ndoms_new = 0; n = doms_new ? ndoms_new : 0; /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < ndoms_new; j++) { for (j = 0; j < n; j++) { if (cpus_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; Loading @@ -7726,7 +7729,6 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, if (doms_new == NULL) { ndoms_cur = 0; ndoms_new = 1; doms_new = &fallback_doms; cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); dattr_new = NULL; Loading Loading @@ -7763,8 +7765,13 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, int arch_reinit_sched_domains(void) { get_online_cpus(); /* Destroy domains first to force the rebuild */ partition_sched_domains(0, NULL, NULL); rebuild_sched_domains(); put_online_cpus(); return 0; } Loading Loading @@ -7848,7 +7855,7 @@ static int update_sched_domains(struct notifier_block *nfb, case CPU_ONLINE_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: partition_sched_domains(0, NULL, NULL); partition_sched_domains(1, NULL, NULL); return NOTIFY_OK; default: Loading Loading
include/linux/cpuset.h +1 −1 Original line number Diff line number Diff line Loading @@ -160,7 +160,7 @@ static inline int current_cpuset_is_being_rebound(void) static inline void rebuild_sched_domains(void) { partition_sched_domains(0, NULL, NULL); partition_sched_domains(1, NULL, NULL); } #endif /* !CONFIG_CPUSETS */ Loading
kernel/cpuset.c +182 −130 Original line number Diff line number Diff line Loading @@ -14,6 +14,8 @@ * 2003-10-22 Updates by Stephen Hemminger. * 2004 May-July Rework by Paul Jackson. * 2006 Rework by Paul Menage to use generic cgroups * 2008 Rework of the scheduler domains and CPU hotplug handling * by Max Krasnyansky * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of the Linux Loading Loading @@ -236,9 +238,11 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(callback_mutex); /* This is ugly, but preserves the userspace API for existing cpuset /* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead */ * silently switch it to mount "cgroup" instead */ static int cpuset_get_sb(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data, struct vfsmount *mnt) Loading Loading @@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) } /* * Helper routine for rebuild_sched_domains(). * Helper routine for generate_sched_domains(). * Do cpusets a, b have overlapping cpus_allowed masks? */ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) { return cpus_intersects(a->cpus_allowed, b->cpus_allowed); Loading Loading @@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) } /* * rebuild_sched_domains() * * This routine will be called to rebuild the scheduler's dynamic * sched domains: * - if the flag 'sched_load_balance' of any cpuset with non-empty * 'cpus' changes, * - or if the 'cpus' allowed changes in any cpuset which has that * flag enabled, * - or if the 'sched_relax_domain_level' of any cpuset which has * that flag enabled and with non-empty 'cpus' changes, * - or if any cpuset with non-empty 'cpus' is removed, * - or if a cpu gets offlined. * * This routine builds a partial partition of the systems CPUs * (the set of non-overlappping cpumask_t's in the array 'part' * below), and passes that partial partition to the kernel/sched.c * partition_sched_domains() routine, which will rebuild the * schedulers load balancing domains (sched domains) as specified * by that partial partition. A 'partial partition' is a set of * non-overlapping subsets whose union is a subset of that set. * generate_sched_domains() * * This function builds a partial partition of the systems CPUs * A 'partial partition' is a set of non-overlapping subsets whose * union is a subset of that set. * The output of this function needs to be passed to kernel/sched.c * partition_sched_domains() routine, which will rebuild the scheduler's * load balancing domains (sched domains) as specified by that partial * partition. * * See "What is sched_load_balance" in Documentation/cpusets.txt * for a background explanation of this. Loading @@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * * Call with cgroup_mutex held. May take callback_mutex during * call due to the kfifo_alloc() and kmalloc() calls. May nest * a call to the get_online_cpus()/put_online_cpus() pair. * Must not be called holding callback_mutex, because we must not * call get_online_cpus() while holding callback_mutex. Elsewhere * the kernel nests callback_mutex inside get_online_cpus() calls. * So the reverse nesting would risk an ABBA deadlock. * Must be called with cgroup_lock held. * * The three key local variables below are: * q - a linked-list queue of cpuset pointers, used to implement a Loading Loading @@ -588,8 +574,8 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * element of the partition (one sched domain) to be passed to * partition_sched_domains(). */ void rebuild_sched_domains(void) static int generate_sched_domains(cpumask_t **domains, struct sched_domain_attr **attributes) { LIST_HEAD(q); /* queue of cpusets to be scanned */ struct cpuset *cp; /* scans q */ Loading @@ -601,23 +587,26 @@ void rebuild_sched_domains(void) int ndoms; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ csa = NULL; ndoms = 0; doms = NULL; dattr = NULL; csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { ndoms = 1; doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms) goto rebuild; goto done; dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); if (dattr) { *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } *doms = top_cpuset.cpus_allowed; goto rebuild; ndoms = 1; goto done; } csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); Loading Loading @@ -680,18 +669,33 @@ void rebuild_sched_domains(void) } } /* Convert <csn, csa> to <ndoms, doms> */ /* * Now we know how many domains to create. * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. */ doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); if (!doms) goto rebuild; if (!doms) { ndoms = 0; goto done; } /* * The rest of the code, including the scheduler, can deal with * dattr==NULL case. No need to abort if alloc fails. */ dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; cpumask_t *dp; int apn = a->pn; if (apn >= 0) { cpumask_t *dp = doms + nslot; if (apn < 0) { /* Skip completed partitions */ continue; } dp = doms + nslot; if (nslot == ndoms) { static int warnings = 10; Loading @@ -714,27 +718,92 @@ void rebuild_sched_domains(void) if (apn == b->pn) { cpus_or(*dp, *dp, b->cpus_allowed); b->pn = -1; if (dattr) update_domain_attr_tree(dattr + nslot, b); update_domain_attr_tree(dattr + nslot, b); /* Done with this partition */ b->pn = -1; } } nslot++; } } BUG_ON(nslot != ndoms); rebuild: /* Have scheduler rebuild sched domains */ done: kfree(csa); *domains = doms; *attributes = dattr; return ndoms; } /* * Rebuild scheduler domains. * * Call with neither cgroup_mutex held nor within get_online_cpus(). * Takes both cgroup_mutex and get_online_cpus(). * * Cannot be directly called from cpuset code handling changes * to the cpuset pseudo-filesystem, because it cannot be called * from code that already holds cgroup_mutex. */ static void do_rebuild_sched_domains(struct work_struct *unused) { struct sched_domain_attr *attr; cpumask_t *doms; int ndoms; get_online_cpus(); partition_sched_domains(ndoms, doms, dattr); /* Generate domain masks and attrs */ cgroup_lock(); ndoms = generate_sched_domains(&doms, &attr); cgroup_unlock(); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); put_online_cpus(); } done: kfree(csa); /* Don't kfree(doms) -- partition_sched_domains() does that. */ /* Don't kfree(dattr) -- partition_sched_domains() does that. */ static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); /* * Rebuild scheduler domains, asynchronously via workqueue. * * If the flag 'sched_load_balance' of any cpuset with non-empty * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset * which has that flag enabled, or if any cpuset with a non-empty * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * * The rebuild_sched_domains() and partition_sched_domains() * routines must nest cgroup_lock() inside get_online_cpus(), * but such cpuset changes as these must nest that locking the * other way, holding cgroup_lock() for much of the code. * * So in order to avoid an ABBA deadlock, the cpuset code handling * these user changes delegates the actual sched domain rebuilding * to a separate workqueue thread, which ends up processing the * above do_rebuild_sched_domains() function. */ static void async_rebuild_sched_domains(void) { schedule_work(&rebuild_sched_domains_work); } /* * Accomplishes the same scheduler domain rebuild as the above * async_rebuild_sched_domains(), however it directly calls the * rebuild routine synchronously rather than calling it via an * asynchronous work thread. * * This can only be called from code that is not holding * cgroup_mutex (not nested in a cgroup_lock() call.) */ void rebuild_sched_domains(void) { do_rebuild_sched_domains(NULL); } /** Loading Loading @@ -863,7 +932,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf) return retval; if (is_load_balanced) rebuild_sched_domains(); async_rebuild_sched_domains(); return 0; } Loading Loading @@ -1090,7 +1159,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) if (val != cs->relax_domain_level) { cs->relax_domain_level = val; if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) rebuild_sched_domains(); async_rebuild_sched_domains(); } return 0; Loading Loading @@ -1131,7 +1200,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, mutex_unlock(&callback_mutex); if (cpus_nonempty && balance_flag_changed) rebuild_sched_domains(); async_rebuild_sched_domains(); return 0; } Loading Loading @@ -1492,6 +1561,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) default: BUG(); } /* Unreachable but makes gcc happy */ return 0; } static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) Loading @@ -1504,6 +1576,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) default: BUG(); } /* Unrechable but makes gcc happy */ return 0; } Loading Loading @@ -1692,15 +1767,9 @@ static struct cgroup_subsys_state *cpuset_create( } /* * Locking note on the strange update_flag() call below: * * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which * will call rebuild_sched_domains(). The get_online_cpus() * call in rebuild_sched_domains() must not be made while holding * callback_mutex. Elsewhere the kernel nests callback_mutex inside * get_online_cpus() calls. So the reverse nesting would risk an * ABBA deadlock. * will call async_rebuild_sched_domains(). */ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) Loading Loading @@ -1811,7 +1880,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) } /* * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs * If CPU and/or memory hotplug handlers, below, unplug any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, * removing that CPU or node from all cpusets. If this removes the * last CPU or node from a cpuset, then move the tasks in the empty Loading Loading @@ -1902,35 +1971,6 @@ static void scan_for_empty_cpusets(const struct cpuset *root) } } /* * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to * track what's online after any CPU or memory node hotplug or unplug event. * * Since there are two callers of this routine, one for CPU hotplug * events and one for memory node hotplug events, we could have coded * two separate routines here. We code it as a single common routine * in order to minimize text size. */ static void common_cpu_mem_hotplug_unplug(int rebuild_sd) { cgroup_lock(); top_cpuset.cpus_allowed = cpu_online_map; top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; scan_for_empty_cpusets(&top_cpuset); /* * Scheduler destroys domains on hotplug events. * Rebuild them based on the current settings. */ if (rebuild_sd) rebuild_sched_domains(); cgroup_unlock(); } /* * The top_cpuset tracks what CPUs and Memory Nodes are online, * period. This is necessary in order to make cpusets transparent Loading @@ -1939,40 +1979,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd) * * This routine ensures that top_cpuset.cpus_allowed tracks * cpu_online_map on each CPU hotplug (cpuhp) event. * * Called within get_online_cpus(). Needs to call cgroup_lock() * before calling generate_sched_domains(). */ static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, static int cpuset_track_online_cpus(struct notifier_block *unused_nb, unsigned long phase, void *unused_cpu) { struct sched_domain_attr *attr; cpumask_t *doms; int ndoms; switch (phase) { case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: case CPU_ONLINE: case CPU_ONLINE_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: common_cpu_mem_hotplug_unplug(1); break; default: return NOTIFY_DONE; } cgroup_lock(); top_cpuset.cpus_allowed = cpu_online_map; scan_for_empty_cpusets(&top_cpuset); ndoms = generate_sched_domains(&doms, &attr); cgroup_unlock(); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); return NOTIFY_OK; } #ifdef CONFIG_MEMORY_HOTPLUG /* * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. * Call this routine anytime after you change * node_states[N_HIGH_MEMORY]. * See also the previous routine cpuset_handle_cpuhp(). * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. * See also the previous routine cpuset_track_online_cpus(). */ void cpuset_track_online_nodes(void) { common_cpu_mem_hotplug_unplug(0); cgroup_lock(); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; scan_for_empty_cpusets(&top_cpuset); cgroup_unlock(); } #endif Loading @@ -1987,7 +2039,7 @@ void __init cpuset_init_smp(void) top_cpuset.cpus_allowed = cpu_online_map; top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; hotcpu_notifier(cpuset_handle_cpuhp, 0); hotcpu_notifier(cpuset_track_online_cpus, 0); } /** Loading
kernel/sched.c +13 −6 Original line number Diff line number Diff line Loading @@ -7696,24 +7696,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * and partition_sched_domains() will fallback to the single partition * 'fallback_doms', it also forces the domains to be rebuilt. * * If doms_new==NULL it will be replaced with cpu_online_map. * ndoms_new==0 is a special case for destroying existing domains. * It will not create the default domain. * * Call with hotplug lock held */ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, struct sched_domain_attr *dattr_new) { int i, j; int i, j, n; mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); if (doms_new == NULL) ndoms_new = 0; n = doms_new ? ndoms_new : 0; /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < ndoms_new; j++) { for (j = 0; j < n; j++) { if (cpus_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; Loading @@ -7726,7 +7729,6 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, if (doms_new == NULL) { ndoms_cur = 0; ndoms_new = 1; doms_new = &fallback_doms; cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); dattr_new = NULL; Loading Loading @@ -7763,8 +7765,13 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, int arch_reinit_sched_domains(void) { get_online_cpus(); /* Destroy domains first to force the rebuild */ partition_sched_domains(0, NULL, NULL); rebuild_sched_domains(); put_online_cpus(); return 0; } Loading Loading @@ -7848,7 +7855,7 @@ static int update_sched_domains(struct notifier_block *nfb, case CPU_ONLINE_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: partition_sched_domains(0, NULL, NULL); partition_sched_domains(1, NULL, NULL); return NOTIFY_OK; default: Loading