diff --git a/init/Kconfig b/init/Kconfig
index a9a2e2c..53f2615 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -46,6 +46,15 @@ config BROKEN_ON_SMP
 	depends on BROKEN || !SMP
 	default y
 
+config BLD
+	bool "An alternate CPU load distribution technique for the task scheduler"
+	depends on SMP && !NUMA_BALANCING
+	default y
+	help
+	  This is an alternate CPU load distribution technique for the task
+	  scheduler, based on the Barbershop Load Distribution algorithm. It
+	  is not suitable for NUMA, but should work well on SMP.
+
 config INIT_ENV_ARG_LIMIT
 	int
 	default 32 if !UML
diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h
new file mode 100644
index 0000000..e2a487d
--- /dev/null
+++ b/kernel/sched/bld.h
@@ -0,0 +1,215 @@
+#ifdef CONFIG_BLD
+
+static DEFINE_RWLOCK(rt_list_lock);
+static LIST_HEAD(rt_rq_head);
+static LIST_HEAD(cfs_rq_head);
+static DEFINE_RWLOCK(cfs_list_lock);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->rq;
+}
+#else
+static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq)
+{
+	return container_of(cfs_rq, struct rq, cfs);
+}
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static inline struct rq *rq_of_rt(struct rt_rq *rt_rq)
+{
+	return rt_rq->rq;
+}
+#else
+static inline struct rq *rq_of_rt(struct rt_rq *rt_rq)
+{
+	return container_of(rt_rq, struct rq, rt);
+}
+#endif
+
+static int select_cpu_for_wakeup(int task_type, struct cpumask *mask)
+{
+	int cpu = smp_processor_id(), i;
+	unsigned long load, varload;
+	struct rq *rq;
+
+	if (task_type) {
+		varload = ULONG_MAX;
+		for_each_cpu(i, mask) {
+			rq = cpu_rq(i);
+			load = rq->cfs.load.weight;
+			if (load < varload) {
+				varload = load;
+				cpu = i;
+			}
+		}
+	} else {
+		/* Here's an attempt to get a CPU within the mask where
+		 * we can preempt easily. To achieve this we try to
+		 * maintain a lowbit, which indicates the lowest bit set in
+		 * the array bitmap. Since all CPUs contain high priority
+		 * kernel threads, we eliminate 0; so it might not be
+		 * right every time, but it's just an indicator.
+		 */
+		varload = 1;
+
+		for_each_cpu(i, mask) {
+			rq = cpu_rq(i);
+			load = rq->rt.lowbit;
+			if (load >= varload) {
+				varload = load;
+				cpu = i;
+			}
+		}
+	}
+
+	return cpu;
+}
+
+static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags)
+{
+	struct cfs_rq *cfs;
+	unsigned long flags;
+	unsigned int cpu = smp_processor_id();
+
+	read_lock_irqsave(&cfs_list_lock, flags);
+	list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) {
+		cpu = cpu_of(rq_of_cfs(cfs));
+		if (cpu_online(cpu))
+			break;
+	}
+	read_unlock_irqrestore(&cfs_list_lock, flags);
+	return cpu;
+}
+
+static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags)
+{
+	struct rt_rq *rt;
+	unsigned long flags;
+	unsigned int cpu = smp_processor_id();
+
+	read_lock_irqsave(&rt_list_lock, flags);
+	list_for_each_entry(rt, &rt_rq_head, bld_rt_list) {
+		cpu = cpu_of(rq_of_rt(rt));
+		if (cpu_online(cpu))
+			break;
+	}
+	read_unlock_irqrestore(&rt_list_lock, flags);
+	return cpu;
+}
+
+static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags)
+{
+	unsigned int cpu = smp_processor_id(), want_affine = 0;
+	struct cpumask *tmpmask;
+
+	if (p->nr_cpus_allowed == 1)
+		return task_cpu(p);
+
+	if (sd_flags & SD_BALANCE_WAKE) {
+		if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
+			want_affine = 1;
+		}
+	}
+
+	if (want_affine)
+		tmpmask = &p->cpus_allowed;
+	else
+		tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd);
+
+	if (rt_task(p))
+		cpu = select_cpu_for_wakeup(0, tmpmask);
+	else
+		cpu = select_cpu_for_wakeup(1, tmpmask);
+
+	return cpu;
+}
+
+static void track_load_rt(struct rq *rq, struct task_struct *p)
+{
+	unsigned long flag;
+	int firstbit;
+	struct rt_rq *first;
+	struct rt_prio_array *array = &rq->rt.active;
+
+	first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list);
+	firstbit = sched_find_first_bit(array->bitmap);
+
+	/* Maintaining rt.lowbit */
+	if (firstbit > 0 && firstbit <= rq->rt.lowbit)
+		rq->rt.lowbit = firstbit;
+
+	if (rq->rt.lowbit < first->lowbit) {
+		write_lock_irqsave(&rt_list_lock, flag);
+		list_del(&rq->rt.bld_rt_list);
+		list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head);
+		write_unlock_irqrestore(&rt_list_lock, flag);
+	}
+}
+
+static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags)
+{
+	unsigned int cpu;
+
+	if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1)))
+		cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags);
+	else {
+		if (rt_task(p))
+			cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags);
+		else
+			cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags);
+	}
+
+	return cpu;
+}
+
+static void bld_track_load_activate(struct rq *rq, struct task_struct *p)
+{
+	unsigned long flag;
+	if (rt_task(p)) {
+		track_load_rt(rq, p);
+	} else {
+		if (rq->cfs.pos != 2) {
+			struct cfs_rq *last;
+			last = list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list);
+			if (rq->cfs.load.weight >= last->load.weight) {
+				write_lock_irqsave(&cfs_list_lock, flag);
+				list_del(&rq->cfs.bld_cfs_list);
+				list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head);
+				rq->cfs.pos = 2; last->pos = 1;
+				write_unlock_irqrestore(&cfs_list_lock, flag);
+			}
+		}
+	}
+}
+
+static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p)
+{
+	unsigned long flag;
+	if (rt_task(p)) {
+		track_load_rt(rq, p);
+	} else {
+		if (rq->cfs.pos != 0) {
+			struct cfs_rq *first;
+			first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list);
+			if (rq->cfs.load.weight <= first->load.weight) {
+				write_lock_irqsave(&cfs_list_lock, flag);
+				list_del(&rq->cfs.bld_cfs_list);
+				list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head);
+				rq->cfs.pos = 0; first->pos = 1;
+				write_unlock_irqrestore(&cfs_list_lock, flag);
+			}
+		}
+	}
+}
+#else
+static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p)
+{
+}
+#endif /* CONFIG_BLD */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a7bf32a..a560eb2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -38,6 +38,7 @@
 #include "sched.h"
 #include "../workqueue_internal.h"
 #include "../smpboot.h"
+#include "bld.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
@@ -769,6 +770,8 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 		sched_info_queued(rq, p);
 
 	p->sched_class->enqueue_task(rq, p, flags);
+	if (!dl_task(p))
+		bld_track_load_activate(rq, p);
 }
 
 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -780,6 +783,8 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 		sched_info_dequeued(rq, p);
 
 	p->sched_class->dequeue_task(rq, p, flags);
+	if (!dl_task(p))
+		bld_track_load_deactivate(rq, p);
 }
 
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1549,11 +1554,17 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
 	lockdep_assert_held(&p->pi_lock);
 
+#ifndef CONFIG_BLD
 	if (p->nr_cpus_allowed > 1)
 		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 	else
 		cpu = cpumask_any(&p->cpus_allowed);
-
+#else
+	if (dl_task(p))
+		cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags);
+	else
+		cpu = bld_get_cpu(p, sd_flags, wake_flags);
+#endif
 	/*
 	 * In order not to call set_task_cpu() on a blocking task we need
 	 * to rely on ttwu() to place the task on a valid ->cpus_allowed
@@ -1774,7 +1785,11 @@ void scheduler_ipi(void)
 	 */
 	preempt_fold_need_resched();
 
+#ifndef CONFIG_BLD
 	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
+#else
+	if (llist_empty(&this_rq()->wake_list))
+#endif
 		return;
 
 	/*
@@ -1796,13 +1811,16 @@ void scheduler_ipi(void)
 
 	/*
 	 * Check if someone kicked us for doing the nohz idle load balance.
 	 */
+#ifndef CONFIG_BLD
 	if (unlikely(got_nohz_idle_kick())) {
 		this_rq()->idle_balance = 1;
 		raise_softirq_irqoff(SCHED_SOFTIRQ);
 	}
+#endif
 	irq_exit();
 }
 
+#ifndef CONFIG_BLD
 static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -1816,6 +1834,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
 			trace_sched_wake_idle_without_ipi(cpu);
 	}
 }
+#endif /*CONFIG_BLD */
 
 void wake_up_if_idle(int cpu)
 {
@@ -1852,7 +1871,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
 	struct rq *rq = cpu_rq(cpu);
 	struct rq_flags rf;
 
-#if defined(CONFIG_SMP)
+#if defined(CONFIG_SMP) && !defined(CONFIG_BLD)
 	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
 		sched_clock_cpu(cpu); /* Sync clocks across CPUs */
 		ttwu_queue_remote(p, cpu, wake_flags);
@@ -2921,7 +2940,14 @@ void sched_exec(void)
 	int dest_cpu;
 
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
+#ifndef CONFIG_BLD
 	dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
+#else
+	if (dl_task(p))
+		dest_cpu = task_cpu(p);
+	else
+		dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0);
+#endif
 	if (dest_cpu == smp_processor_id())
 		goto unlock;
 
@@ -3031,8 +3057,10 @@ void scheduler_tick(void)
 
 #ifdef CONFIG_SMP
 	rq->idle_balance = idle_cpu(cpu);
+#ifndef CONFIG_BLD
 	trigger_load_balance(rq);
 #endif
+#endif
 	rq_last_tick_reset(rq);
 }
 
@@ -5736,7 +5764,9 @@ int sched_cpu_dying(unsigned int cpu)
 
 	calc_load_migrate(rq);
 	update_max_interval();
+#ifndef CONFIG_BLD
 	nohz_balance_exit_idle(cpu);
+#endif
 	hrtick_clear(rq);
 	return 0;
 }
@@ -5957,6 +5987,15 @@ void __init sched_init(void)
 #endif /* CONFIG_SMP */
 		init_rq_hrtick(rq);
 		atomic_set(&rq->nr_iowait, 0);
+#ifdef CONFIG_BLD
+		INIT_LIST_HEAD(&rq->cfs.bld_cfs_list);
+		list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head);
+		rq->cfs.pos = 0;
+
+		INIT_LIST_HEAD(&rq->rt.bld_rt_list);
+		list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head);
+		rq->rt.lowbit = INT_MAX;
+#endif
 	}
 
 	set_load_weight(&init_task, false);
@@ -5986,6 +6025,9 @@ void __init sched_init(void)
 	init_schedstats();
 
 	scheduler_running = 1;
+#ifdef CONFIG_BLD
+	printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n");
+#endif
 }
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 26a71eb..55076de 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -151,6 +151,11 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w)
 	lw->inv_weight = 0;
 }
 
+static unsigned long capacity_of(int cpu)
+{
+	return cpu_rq(cpu)->cpu_capacity;
+}
+
 /*
  * Increase the granularity value when there are more CPUs,
  * because with more CPUs the 'effective latency' as visible
@@ -710,7 +715,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 #include "sched-pelt.h"
 
+#ifndef CONFIG_BLD
 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
+#endif
 static unsigned long task_h_load(struct task_struct *p);
 
 /* Give new sched_entity start runnable values to heavy its load in infant time */
@@ -1422,7 +1429,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 static unsigned long weighted_cpuload(struct rq *rq);
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
-static unsigned long capacity_of(int cpu);
 
 /* Cached statistics for all CPUs within a node */
 struct numa_stats {
@@ -5582,6 +5588,8 @@ void cpu_load_update_active(struct rq *this_rq)
 		cpu_load_update_periodic(this_rq, load);
 }
 
+#ifndef CONFIG_BLD
+
 /*
  * Return a low guess at the load of a migration-source cpu weighted
  * according to the scheduling class and "nice" value.
@@ -5615,11 +5623,6 @@ static unsigned long target_load(int cpu, int type)
 	return max(rq->cpu_load[type-1], total);
 }
 
-static unsigned long capacity_of(int cpu)
-{
-	return cpu_rq(cpu)->cpu_capacity;
-}
-
 static unsigned long capacity_orig_of(int cpu)
 {
 	return cpu_rq(cpu)->cpu_capacity_orig;
@@ -6380,6 +6383,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
 	return new_cpu;
 }
+#endif /* CONFIG_BLD */
 
 static void detach_entity_cfs_rq(struct sched_entity *se);
 
@@ -6732,6 +6736,7 @@ done: __maybe_unused
 	return p;
 
 idle:
+#ifndef CONFIG_BLD
 	new_tasks = idle_balance(rq, rf);
 
 	/*
@@ -6744,7 +6749,7 @@ done: __maybe_unused
 
 	if (new_tasks > 0)
 		goto again;
-
+#endif /* CONFIG_BLD */
 	return NULL;
 }
 
@@ -7447,8 +7452,8 @@ static unsigned long task_h_load(struct task_struct *p)
 }
 #endif
 
+#ifndef CONFIG_BLD
 /********** Helpers for find_busiest_group ************************/
-
 enum group_type {
 	group_other = 0,
 	group_imbalanced,
@@ -7542,120 +7547,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
 	return load_idx;
 }
 
-static unsigned long scale_rt_capacity(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	u64 total, used, age_stamp, avg;
-	s64 delta;
-
-	/*
-	 * Since we're reading these variables without serialization make sure
-	 * we read them once before doing sanity checks on them.
-	 */
-	age_stamp = READ_ONCE(rq->age_stamp);
-	avg = READ_ONCE(rq->rt_avg);
-	delta = __rq_clock_broken(rq) - age_stamp;
-
-	if (unlikely(delta < 0))
-		delta = 0;
-
-	total = sched_avg_period() + delta;
-
-	used = div_u64(avg, total);
-
-	if (likely(used < SCHED_CAPACITY_SCALE))
-		return SCHED_CAPACITY_SCALE - used;
-
-	return 1;
-}
-
-static void update_cpu_capacity(struct sched_domain *sd, int cpu)
-{
-	unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
-	struct sched_group *sdg = sd->groups;
-
-	cpu_rq(cpu)->cpu_capacity_orig = capacity;
-
-	capacity *= scale_rt_capacity(cpu);
-	capacity >>= SCHED_CAPACITY_SHIFT;
-
-	if (!capacity)
-		capacity = 1;
-
-	cpu_rq(cpu)->cpu_capacity = capacity;
-	sdg->sgc->capacity = capacity;
-	sdg->sgc->min_capacity = capacity;
-}
-
-void update_group_capacity(struct sched_domain *sd, int cpu)
-{
-	struct sched_domain *child = sd->child;
-	struct sched_group *group, *sdg = sd->groups;
-	unsigned long capacity, min_capacity;
-	unsigned long interval;
-
-	interval = msecs_to_jiffies(sd->balance_interval);
-	interval = clamp(interval, 1UL, max_load_balance_interval);
-	sdg->sgc->next_update = jiffies + interval;
-
-	if (!child) {
-		update_cpu_capacity(sd, cpu);
-		return;
-	}
-
-	capacity = 0;
-	min_capacity = ULONG_MAX;
-
-	if (child->flags & SD_OVERLAP) {
-		/*
-		 * SD_OVERLAP domains cannot assume that child groups
-		 * span the current group.
-		 */
-
-		for_each_cpu(cpu, sched_group_span(sdg)) {
-			struct sched_group_capacity *sgc;
-			struct rq *rq = cpu_rq(cpu);
-
-			/*
-			 * build_sched_domains() -> init_sched_groups_capacity()
-			 * gets here before we've attached the domains to the
-			 * runqueues.
-			 *
-			 * Use capacity_of(), which is set irrespective of domains
-			 * in update_cpu_capacity().
-			 *
-			 * This avoids capacity from being 0 and
-			 * causing divide-by-zero issues on boot.
-			 */
-			if (unlikely(!rq->sd)) {
-				capacity += capacity_of(cpu);
-			} else {
-				sgc = rq->sd->groups->sgc;
-				capacity += sgc->capacity;
-			}
-
-			min_capacity = min(capacity, min_capacity);
-		}
-	} else {
-		/*
-		 * !SD_OVERLAP domains can assume that child groups
-		 * span the current group.
-		 */
-
-		group = child->groups;
-		do {
-			struct sched_group_capacity *sgc = group->sgc;
-
-			capacity += sgc->capacity;
-			min_capacity = min(sgc->min_capacity, min_capacity);
-			group = group->next;
-		} while (group != child->groups);
-	}
-
-	sdg->sgc->capacity = capacity;
-	sdg->sgc->min_capacity = min_capacity;
-}
-
 /*
  * Check whether the capacity of the rq has been noticeably reduced by side
  * activity. The imbalance_pct is used for the threshold.
@@ -8954,6 +8845,123 @@ static int active_load_balance_cpu_stop(void *data)
 
 	return 0;
 }
+#endif /* CONFIG_BLD */
+
+static unsigned long scale_rt_capacity(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	u64 total, used, age_stamp, avg;
+	s64 delta;
+
+	/*
+	 * Since we're reading these variables without serialization make sure
+	 * we read them once before doing sanity checks on them.
+	 */
+	age_stamp = READ_ONCE(rq->age_stamp);
+	avg = READ_ONCE(rq->rt_avg);
+	delta = __rq_clock_broken(rq) - age_stamp;
+
+	if (unlikely(delta < 0))
+		delta = 0;
+
+	total = sched_avg_period() + delta;
+
+	used = div_u64(avg, total);
+
+	if (likely(used < SCHED_CAPACITY_SCALE))
+		return SCHED_CAPACITY_SCALE - used;
+
+	return 1;
+}
+
+
+
+static void update_cpu_capacity(struct sched_domain *sd, int cpu)
+{
+	unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
+	struct sched_group *sdg = sd->groups;
+
+	cpu_rq(cpu)->cpu_capacity_orig = capacity;
+
+	capacity *= scale_rt_capacity(cpu);
+	capacity >>= SCHED_CAPACITY_SHIFT;
+
+	if (!capacity)
+		capacity = 1;
+
+	cpu_rq(cpu)->cpu_capacity = capacity;
+	sdg->sgc->capacity = capacity;
+	sdg->sgc->min_capacity = capacity;
+}
+
+void update_group_capacity(struct sched_domain *sd, int cpu)
+{
+	struct sched_domain *child = sd->child;
+	struct sched_group *group, *sdg = sd->groups;
+	unsigned long capacity, min_capacity;
+	unsigned long interval;
+
+	interval = msecs_to_jiffies(sd->balance_interval);
+	interval = clamp(interval, 1UL, max_load_balance_interval);
+	sdg->sgc->next_update = jiffies + interval;
+
+	if (!child) {
+		update_cpu_capacity(sd, cpu);
+		return;
+	}
+
+	capacity = 0;
+	min_capacity = ULONG_MAX;
+
+	if (child->flags & SD_OVERLAP) {
+		/*
+		 * SD_OVERLAP domains cannot assume that child groups
+		 * span the current group.
+		 */
+
+		for_each_cpu(cpu, sched_group_span(sdg)) {
+			struct sched_group_capacity *sgc;
+			struct rq *rq = cpu_rq(cpu);
+
+			/*
+			 * build_sched_domains() -> init_sched_groups_capacity()
+			 * gets here before we've attached the domains to the
+			 * runqueues.
+			 *
+			 * Use capacity_of(), which is set irrespective of domains
+			 * in update_cpu_capacity().
+			 *
+			 * This avoids capacity from being 0 and
+			 * causing divide-by-zero issues on boot.
+			 */
+			if (unlikely(!rq->sd)) {
+				capacity += capacity_of(cpu);
+			} else {
+				sgc = rq->sd->groups->sgc;
+				capacity += sgc->capacity;
+			}
+
+			min_capacity = min(capacity, min_capacity);
+		}
+	} else {
+		/*
+		 * !SD_OVERLAP domains can assume that child groups
+		 * span the current group.
+ */ + + group = child->groups; + do { + struct sched_group_capacity *sgc = group->sgc; + + capacity += sgc->capacity; + min_capacity = min(sgc->min_capacity, min_capacity); + group = group->next; + } while (group != child->groups); + } + + sdg->sgc->capacity = capacity; + sdg->sgc->min_capacity = min_capacity; +} static inline int on_null_domain(struct rq *rq) { @@ -8967,12 +8975,27 @@ static inline int on_null_domain(struct rq *rq) * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. */ +#ifndef CONFIG_BLD static struct { cpumask_var_t idle_cpus_mask; atomic_t nr_cpus; unsigned long next_balance; /* in jiffy units */ } nohz ____cacheline_aligned; +void nohz_balance_exit_idle(unsigned int cpu) +{ + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { + /* + * Completely isolated CPUs don't ever set, so we must test. + */ + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + } + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); + } +} + static inline int find_new_ilb(void) { int ilb = cpumask_first(nohz.idle_cpus_mask); @@ -9011,20 +9034,6 @@ static void nohz_balancer_kick(void) return; } -void nohz_balance_exit_idle(unsigned int cpu) -{ - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { - /* - * Completely isolated CPUs don't ever set, so we must test. - */ - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); - } - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); - } -} - static inline void set_cpu_sd_state_busy(void) { struct sched_domain *sd; @@ -9041,23 +9050,7 @@ static inline void set_cpu_sd_state_busy(void) unlock: rcu_read_unlock(); } - -void set_cpu_sd_state_idle(void) -{ - struct sched_domain *sd; - int cpu = smp_processor_id(); - - rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - - if (!sd || sd->nohz_idle) - goto unlock; - sd->nohz_idle = 1; - - atomic_dec(&sd->shared->nr_busy_cpus); -unlock: - rcu_read_unlock(); -} +#endif /* CONFIG_BLD */ /* * This routine will record that the cpu is going idle with tick stopped. @@ -9065,6 +9058,7 @@ void set_cpu_sd_state_idle(void) */ void nohz_balance_enter_idle(int cpu) { +#ifndef CONFIG_BLD /* * If this cpu is going down, then nothing needs to be done. */ @@ -9087,10 +9081,27 @@ void nohz_balance_enter_idle(int cpu) cpumask_set_cpu(cpu, nohz.idle_cpus_mask); atomic_inc(&nohz.nr_cpus); set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); -} #endif +} -static DEFINE_SPINLOCK(balancing); +void set_cpu_sd_state_idle(void) +{ + struct sched_domain *sd; + int cpu = smp_processor_id(); + + rcu_read_lock(); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + + if (!sd || sd->nohz_idle) + goto unlock; + sd->nohz_idle = 1; + + atomic_dec(&sd->shared->nr_busy_cpus); +unlock: + rcu_read_unlock(); +} + +#endif /* NO_HZ_COMMON */ /* * Scale the max load_balance interval with the number of CPUs in the system. @@ -9101,6 +9112,8 @@ void update_max_interval(void) max_load_balance_interval = HZ*num_online_cpus()/10; } +#ifndef CONFIG_BLD +static DEFINE_SPINLOCK(balancing); /* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. 
@@ -9359,7 +9372,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
 }
 #else
 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
-#endif
+#endif /* CONFIG_NO_HZ_COMMON */
 
 /*
  * run_rebalance_domains is triggered when needed from the scheduler tick.
@@ -9399,6 +9412,7 @@ void trigger_load_balance(struct rq *rq)
 		nohz_balancer_kick();
 #endif
 }
+#endif /* CONFIG_BLD */
 
 static void rq_online_fair(struct rq *rq)
 {
@@ -9905,7 +9919,9 @@ const struct sched_class fair_sched_class = {
 	.put_prev_task		= put_prev_task_fair,
 
 #ifdef CONFIG_SMP
+#ifndef CONFIG_BLD
 	.select_task_rq		= select_task_rq_fair,
+#endif
 	.migrate_task_rq	= migrate_task_rq_fair,
 
 	.rq_online		= rq_online_fair,
@@ -9966,6 +9982,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
 
 __init void init_sched_fair_class(void)
 {
+#ifndef CONFIG_BLD
 #ifdef CONFIG_SMP
 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
 
@@ -9974,5 +9991,5 @@ __init void init_sched_fair_class(void)
 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
 #endif
 #endif /* SMP */
-
+#endif /* BLD */
 }
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 665ace2..25ee9af 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1376,6 +1376,7 @@ static void yield_task_rt(struct rq *rq)
 #ifdef CONFIG_SMP
 static int find_lowest_rq(struct task_struct *task);
 
+#ifndef CONFIG_BLD
 static int
 select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 {
@@ -1431,6 +1432,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 out:
 	return cpu;
 }
+#endif /* CONFIG_BLD */
 
 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 {
@@ -2350,7 +2352,9 @@ const struct sched_class rt_sched_class = {
 	.put_prev_task		= put_prev_task_rt,
 
 #ifdef CONFIG_SMP
+#ifndef CONFIG_BLD
 	.select_task_rq		= select_task_rq_rt,
+#endif
 	.set_cpus_allowed       = set_cpus_allowed_common,
 
 	.rq_online              = rq_online_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b19552a2..64dcaaf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -472,9 +472,8 @@ struct cfs_rq {
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #endif /* CONFIG_SMP */
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
-
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	/*
 	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
 	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
@@ -498,6 +497,11 @@ struct cfs_rq {
 	struct list_head throttled_list;
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
+
+#ifdef CONFIG_BLD
+	struct list_head bld_cfs_list;
+	char pos;
+#endif
 };
 
 static inline int rt_bandwidth_enabled(void)
@@ -537,12 +541,15 @@ struct rt_rq {
 	/* Nests inside the rq lock: */
 	raw_spinlock_t rt_runtime_lock;
 
+	struct rq *rq;
 #ifdef CONFIG_RT_GROUP_SCHED
 	unsigned long rt_nr_boosted;
-
-	struct rq *rq;
 	struct task_group *tg;
 #endif
+#ifdef CONFIG_BLD
+	struct list_head bld_rt_list;
+	int lowbit;
+#endif
 };
 
 /* Deadline class' related fields in a runqueue */
@@ -824,7 +831,7 @@ static inline int cpu_of(struct rq *rq)
 }
 
 
-#ifdef CONFIG_SCHED_SMT
+#if (defined CONFIG_SCHED_SMT) && (!defined CONFIG_BLD)
 
 extern struct static_key_false sched_smt_present;