sched/fair: Prefer the previous CPU for wakeup if it is an idle core

From Andrea Righi:
  - the is_idle_core() helper
  - select target in select_idle_sibling() only when it is an idle core

From Mario Roy:
  - prefer the previous CPU for wakeup when it is an idle core
  - move the recent_used_cpu heuristic to the top of select_idle_sibling(),
    also requiring an idle core

Keeping a waking task on its previous CPU when its whole SMT core is idle
preserves cache locality and avoids the cost of scanning for an idle
sibling (a simplified sketch of this decision precedes the diff below).

Results captured with the EEVDF (baseline), patched EEVDF, scx_cosmos, and
BMQ schedulers, tested on an AMD Ryzen Threadripper 3970X CPU (32 cores,
64 threads). Times are in seconds; lower is faster.

x265 time

x265 -p slow -b 6 -o /dev/null --no-progress \
     --log-level none --input /tmp/bosphorus_hd.y4m

 eevdf   patch  cosmos     bmq
------  ------  ------  ------
 19.15   17.81   17.44   17.20
 19.21   17.87   17.46   17.18
 19.45   17.85   17.44   17.22
 19.28   17.89   17.53   17.23
 19.25   17.88   17.51   17.23

Query one million rows with SQLite [1]

./create.pl 1000000
./query1.pl | tail

 eevdf   patch  cosmos     bmq
------  ------  ------  ------
  9.63    8.68   10.50    9.92
  9.82    8.75   10.07    9.89
  9.89    8.61    9.80    9.51
  9.62    8.61   10.42    9.77
  9.63    8.72   10.59    9.52

Chameneos-redux [2]

python3 pipe2.py 600000

 eevdf   patch  cosmos     bmq
------  ------  ------  ------
  7.00    6.26    7.07    7.27
  6.85    6.19    6.92    6.39
  6.83    6.19    7.15    6.25
  7.00    6.20    7.30    7.46
  6.89    6.25    7.18    6.11

Algorithm3 50% CPU Saturation [3]

./algorithm3.pl 1e12 --threads=50%

 eevdf   patch  cosmos     bmq
------  ------  ------  ------
 20.01   15.81   15.83   17.25
 19.94   15.90   15.94   17.30
 20.43   15.97   15.86   17.27
 20.80   16.12   15.97   17.42
 20.41   15.99   15.96   17.53

Algorithm3 31.25% CPU Saturation [3]

./algorithm3.pl 1e12 --threads=31.25%

 eevdf   patch  cosmos     bmq
------  ------  ------  ------
 29.70   23.74   24.36   24.08
 30.19   23.99   24.57   23.87
 30.26   24.16   24.59   24.33
 30.26   23.93   24.69   24.33
 30.63   24.14   24.37   24.20

Primesieve 31.25% CPU Saturation [4]

primesieve 2e12 --threads=20

 eevdf   patch  cosmos     bmq
------  ------  ------  ------
 18.57   18.30   18.97   18.41
 18.57   18.37   18.60   18.43
 18.51   18.34   18.81   18.55
 18.35   18.41   18.81   18.42
 18.33   18.37   19.05   18.37

[1] https://github.com/marioroy/mce-examples/tree/main/sampledb
[2] https://github.com/marioroy/mce-examples/tree/main/chameneos
[3] https://github.com/marioroy/mce-sandbox
[4] https://github.com/kimwalisch/primesieve

Tested-by: Mario Roy <...>
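Below is a minimal user-space sketch of the wake-up decision this patch
introduces. It is an illustration only, not kernel code: the fixed
SMT-sibling layout (CPUs 2i and 2i+1 paired), the idle[] table, and the
linear scan fallback are hypothetical stand-ins for cpu_smt_mask(),
available_idle_cpu()/sched_idle_cpu(), and the real select_idle_sibling()
search.

	#include <stdbool.h>
	#include <stdio.h>

	#define NR_CPUS 8

	/* Hypothetical idle state, standing in for available_idle_cpu()/sched_idle_cpu(). */
	static bool idle[NR_CPUS] = { false, false, true, true, false, true, true, true };

	/* True if both hardware threads of the core containing @cpu are idle. */
	static bool is_idle_core(int cpu)
	{
		int first = cpu & ~1;	/* assume CPUs (2i, 2i + 1) are SMT siblings */

		return idle[first] && idle[first + 1];
	}

	/* Stand-in for the idle-sibling scan: pick the first fully idle core. */
	static int select_idle_sibling(int prev)
	{
		for (int cpu = 0; cpu < NR_CPUS; cpu++)
			if (is_idle_core(cpu))
				return cpu;
		return prev;
	}

	/* The patched fast path: stay on the previous CPU when its core is idle. */
	static int select_wakeup_cpu(int prev)
	{
		if (is_idle_core(prev))
			return prev;	/* keep cache locality, skip the scan */
		return select_idle_sibling(prev);
	}

	int main(void)
	{
		printf("prev=3 -> CPU %d (core fully idle, task stays put)\n",
		       select_wakeup_cpu(3));
		printf("prev=5 -> CPU %d (SMT sibling busy, falls back to the scan)\n",
		       select_wakeup_cpu(5));
		return 0;
	}

In this model a task that last ran on CPU 3 stays there because sibling
CPU 2 is also idle, while a task that last ran on CPU 5 moves: sibling
CPU 4 is busy, so the core is not fully idle and the scan picks the first
fully idle core instead.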
diff -uarp a/kernel/sched/fair.c b/kernel/sched/fair.c
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7542,6 +7542,24 @@ static inline int __select_idle_cpu(int cpu, struct task_struct *p)
 DEFINE_STATIC_KEY_FALSE(sched_smt_present);
 EXPORT_SYMBOL_GPL(sched_smt_present);
 
+/*
+ * Return true if all the CPUs in the SMT core where @cpu belongs are idle,
+ * false otherwise.
+ */
+static bool is_idle_core(int cpu)
+{
+	int sibling;
+
+	if (!sched_smt_active())
+		return (available_idle_cpu(cpu) || sched_idle_cpu(cpu));
+
+	for_each_cpu(sibling, cpu_smt_mask(cpu))
+		if (!available_idle_cpu(sibling) && !sched_idle_cpu(sibling))
+			return false;
+
+	return true;
+}
+
 static inline void set_idle_cores(int cpu, int val)
 {
 	struct sched_domain_shared *sds;
@@ -7817,6 +7835,19 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	unsigned long task_util, util_min, util_max;
 	int i, recent_used_cpu, prev_aff = -1;
 
+	/* Check a recently used CPU as a potential idle candidate: */
+	recent_used_cpu = p->recent_used_cpu;
+	p->recent_used_cpu = prev;
+	if (recent_used_cpu != prev &&
+	    recent_used_cpu != target &&
+	    cpus_share_cache(recent_used_cpu, target) &&
+	    is_idle_core(recent_used_cpu) &&
+	    cpumask_test_cpu(recent_used_cpu, p->cpus_ptr)) {
+		return recent_used_cpu;
+	} else {
+		recent_used_cpu = -1;
+	}
+
 	/*
 	 * On asymmetric system, update task utilization because we will check
 	 * that the task fits with CPU's capacity.
@@ -7833,7 +7864,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	 */
 	lockdep_assert_irqs_disabled();
 
-	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+	if (is_idle_core(target) &&
 	    asym_fits_cpu(task_util, util_min, util_max, target))
 		return target;
 
@@ -7867,24 +7898,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 		return prev;
 	}
 
-	/* Check a recently used CPU as a potential idle candidate: */
-	recent_used_cpu = p->recent_used_cpu;
-	p->recent_used_cpu = prev;
-	if (recent_used_cpu != prev &&
-	    recent_used_cpu != target &&
-	    cpus_share_cache(recent_used_cpu, target) &&
-	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
-	    cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
-	    asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
-
-		if (!static_branch_unlikely(&sched_cluster_active) ||
-		    cpus_share_resources(recent_used_cpu, target))
-			return recent_used_cpu;
-
-	} else {
-		recent_used_cpu = -1;
-	}
-
 	/*
 	 * For asymmetric CPU capacity systems, our domain of interest is
 	 * sd_asym_cpucapacity rather than sd_llc.
@@ -8618,7 +8631,14 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 		new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
 	} else if (wake_flags & WF_TTWU) { /* XXX always ? */
 		/* Fast path */
-		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+		/*
+		 * If the previous CPU is an idle core, retain the same for
+		 * cache locality. Otherwise, search for an idle sibling.
+		 */
+		if (is_idle_core(prev_cpu))
+			new_cpu = prev_cpu;
+		else
+			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 	}
 	rcu_read_unlock();
 
-- 
2.40.2