diff --git a/include/linux/sched.h b/include/linux/sched.h
index 12aa57de8eea..4ec389d7e7c2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -452,6 +452,11 @@ struct sched_entity {
 	/* For load-balancing: */
 	struct load_weight	load;
 	struct rb_node		run_node;
+#ifdef CONFIG_CACHY_SCHED
+	struct sched_entity	*next;
+	struct sched_entity	*prev;
+	u64			hrrn_start_time;
+#endif
 	struct list_head	group_node;
 	unsigned int		on_rq;
 
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 3c31ba88aca5..cfeefb3d5681 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -31,6 +31,11 @@ extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
 
+#ifdef CONFIG_CACHY_SCHED
+extern int hrrn_max_lifetime;
+extern int cachy_harsh_mode;
+#endif
+
 enum sched_tunable_scaling {
 	SCHED_TUNABLESCALING_NONE,
 	SCHED_TUNABLESCALING_LOG,
diff --git a/init/Kconfig b/init/Kconfig
index 2a5df1cf838c..23c583cc4935 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -822,6 +822,17 @@ config UCLAMP_BUCKETS_COUNT
 
 endmenu
 
+config CACHY_SCHED
+	bool "Cachy CPU scheduler"
+	default y
+	help
+	  The Cachy CPU scheduler is based on the Highest Response Ratio Next
+	  (HRRN) policy.
+
+	  It is designed for desktop usage, since it focuses on responsiveness.
+
+	  If unsure, say Y here.
+
 #
 # For architectures that want to enable the support for NUMA-affine scheduler
 # balancing logic:
@@ -939,6 +950,7 @@ config CGROUP_WRITEBACK
 
 menuconfig CGROUP_SCHED
 	bool "CPU controller"
+	depends on !CACHY_SCHED
 	default n
 	help
 	  This feature lets CPU scheduler recognize task groups and control CPU
@@ -1201,6 +1213,7 @@ config CHECKPOINT_RESTORE
 
 config SCHED_AUTOGROUP
 	bool "Automatic process group scheduling"
+	depends on !CACHY_SCHED
 	select CGROUPS
 	select CGROUP_SCHED
 	select FAIR_GROUP_SCHED
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c4da7e17b906..22538c0307e8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3359,6 +3359,11 @@ void wake_up_new_task(struct task_struct *p)
 	update_rq_clock(rq);
 	post_init_entity_util_avg(p);
 
+#ifdef CONFIG_CACHY_SCHED
+	if (!cachy_harsh_mode)
+		p->se.hrrn_start_time = sched_clock();
+#endif
+
 	activate_task(rq, p, ENQUEUE_NOCLOCK);
 	trace_sched_wakeup_new(p);
 	check_preempt_curr(rq, p, WF_FORK);
@@ -7073,6 +7078,10 @@ void __init sched_init(void)
 	BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
 #endif
 
+#ifdef CONFIG_CACHY_SCHED
+	printk(KERN_INFO "Cachy CPU scheduler v5.9-r9 by Hamad Al Marri.\n");
+#endif
+
 	wait_bit_init();
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 36c54265bb2b..3c3cff283ab4 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -481,8 +481,12 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 
 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 {
-	s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
-		spread, rq0_min_vruntime, spread0;
+	s64 MIN_vruntime = -1,
+#if !defined(CONFIG_CACHY_SCHED)
+	    min_vruntime, rq0_min_vruntime,
+	    spread0,
+#endif
+	    max_vruntime = -1, spread;
 	struct rq *rq = cpu_rq(cpu);
 	struct sched_entity *last;
 	unsigned long flags;
@@ -503,21 +507,27 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	last = __pick_last_entity(cfs_rq);
 	if (last)
 		max_vruntime = last->vruntime;
+#if !defined(CONFIG_CACHY_SCHED)
 	min_vruntime = cfs_rq->min_vruntime;
 	rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
+#endif raw_spin_unlock_irqrestore(&rq->lock, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", SPLIT_NS(MIN_vruntime)); +#if !defined(CONFIG_CACHY_SCHED) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", SPLIT_NS(min_vruntime)); +#endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", SPLIT_NS(max_vruntime)); spread = max_vruntime - MIN_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); +#if !defined(CONFIG_CACHY_SCHED) spread0 = min_vruntime - rq0_min_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", SPLIT_NS(spread0)); +#endif SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c0c4d9ad7da8..c29ab40d3a7a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -19,6 +19,10 @@ * * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * + * Cachy enhancements CPU cache and scheduler based on + * Highest Response Ratio Next (HRRN) policy. + * (C) 2020 Hamad Al Marri */ #include "sched.h" @@ -38,6 +42,12 @@ unsigned int sysctl_sched_latency = 6000000ULL; static unsigned int normalized_sysctl_sched_latency = 6000000ULL; +#ifdef CONFIG_CACHY_SCHED +int hrrn_max_lifetime = 30000; // in ms +int cachy_harsh_mode = 0; +#define U64_TO_U32(X) ((u32) (((X) >> 8) & 0xFFFFFFFF)) +#endif + /* * The initial- and re-scaling of tunables is configurable * @@ -512,7 +522,7 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); /************************************************************** * Scheduling class tree data structure manipulation methods: */ - +#if !defined(CONFIG_CACHY_SCHED) static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) { s64 delta = (s64)(vruntime - max_vruntime); @@ -568,7 +578,129 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif } +#endif /* CONFIG_CACHY_SCHED */ + +#ifdef CONFIG_CACHY_SCHED +/* + * Does se have higher HRRN value than curr? If yes, return 1, + * otherwise return -1 + * se is before curr if se has higher HRRN + */ +static int +entity_before(u64 now, struct sched_entity *curr, struct sched_entity *se) +{ + u32 l_curr, l_se, r_curr, r_se, vr_curr, vr_se; + s32 diff; + + /* + * in case of vruntime==0, logical OR with 1 would + * make sure that the least sig. bit is 1 + */ + vr_curr = U64_TO_U32(curr->vruntime) | 1; + vr_se = U64_TO_U32(se->vruntime) | 1; + + l_curr = U64_TO_U32(now - curr->hrrn_start_time); + l_se = U64_TO_U32(now - se->hrrn_start_time); + + r_curr = l_curr / vr_curr; + r_se = l_se / vr_se; + + diff = r_se - r_curr; + + // take the remainder if equal + if (diff == 0) { + r_curr = l_curr % vr_curr; + r_se = l_se % vr_se; + diff = r_se - r_curr; + } + + if (diff > 0) + return 1; + + return -1; +} +/* + * Enqueue an entity + */ +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct sched_entity *iter, *prev = NULL; + u64 now = sched_clock(); + se->next = NULL; + se->prev = NULL; + + if (likely(cfs_rq->head)) { + + // start from head + iter = cfs_rq->head; + + // does iter have higher HRRN value than se? 
+ while (iter && entity_before(now, se, iter) == 1) { + prev = iter; + iter = iter->next; + } + + // if iter == NULL, insert se at the end + if (iter == NULL) { + prev->next = se; + se->prev = prev; + } + // else if not head, insert se before iter + else if (iter != cfs_rq->head) { + se->next = iter; + se->prev = prev; + + iter->prev = se; + prev->next = se; + } + // else iter == head, insert se at head + else { + se->next = cfs_rq->head; + cfs_rq->head->prev = se; + + // lastly reset the head + cfs_rq->head = se; + } + return; + } + + // if empty rq + cfs_rq->head = se; +} + +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + // if only one se in rq + if (cfs_rq->head->next == NULL) + { + cfs_rq->head = NULL; + WRITE_ONCE(cfs_rq->hrrn_head, 0UL); + } + else if (se == cfs_rq->head) + { + // if it is the head + cfs_rq->head = cfs_rq->head->next; + cfs_rq->head->prev = NULL; + } + else + { + // if in the middle + struct sched_entity *prev = se->prev; + struct sched_entity *next = se->next; + + prev->next = next; + + if (next) + next->prev = prev; + } +} + +struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) +{ + return cfs_rq->head; +} +#else /* * Enqueue an entity into the rb-tree: */ @@ -626,16 +758,21 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) return rb_entry(next, struct sched_entity, run_node); } +#endif /* CONFIG_CACHY_SCHED */ #ifdef CONFIG_SCHED_DEBUG struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { +#ifdef CONFIG_CACHY_SCHED + return cfs_rq->head; +#else struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); if (!last) return NULL; return rb_entry(last, struct sched_entity, run_node); +#endif /* CONFIG_CACHY_SCHED */ } /************************************************************** @@ -720,6 +857,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) return slice; } +#if !defined(CONFIG_CACHY_SCHED) /* * We calculate the vruntime slice of a to-be-inserted task. * @@ -729,6 +867,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) { return calc_delta_fair(sched_slice(cfs_rq, se), se); } +#endif /* CONFIG_CACHY_SCHED */ #include "pelt.h" #ifdef CONFIG_SMP @@ -836,13 +975,41 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) } #endif /* CONFIG_SMP */ +#ifdef CONFIG_CACHY_SCHED +static void reset_lifetime(u64 now, struct sched_entity *se) +{ + /* + * left shift 20 bits is approximately = * 1000000 + * we don't need the precision of life time + * Ex. for 30s, with left shift (20bits) == 31.457s + */ + u64 max_life_ns = ((u64) hrrn_max_lifetime) << 20; + u64 life_time = now - se->hrrn_start_time; + s64 diff = life_time - max_life_ns; + + if (unlikely(diff > 0)) { + // multiply life_time by 8 for more precision + u64 old_hrrn_x8 = life_time / ((se->vruntime >> 3) | 1); + + // reset life to half max_life (i.e ~15s) + se->hrrn_start_time = now - (max_life_ns >> 1); + + // avoid division by zero + if (old_hrrn_x8 == 0) old_hrrn_x8 = 1; + + // reset vruntime based on old hrrn ration + se->vruntime = (max_life_ns << 2) / old_hrrn_x8; + } +} +#endif /* CONFIG_CACHY_SCHED */ + /* * Update the current task's runtime statistics. 
*/ static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_clock_task(rq_of(cfs_rq)); + u64 now = sched_clock(); u64 delta_exec; if (unlikely(!curr)) @@ -861,12 +1028,20 @@ static void update_curr(struct cfs_rq *cfs_rq) schedstat_add(cfs_rq->exec_clock, delta_exec); curr->vruntime += calc_delta_fair(delta_exec, curr); + +#ifdef CONFIG_CACHY_SCHED + reset_lifetime(sched_clock(), curr); +#else update_min_vruntime(cfs_rq); +#endif if (entity_is_task(curr)) { struct task_struct *curtask = task_of(curr); +#if !defined(CONFIG_CACHY_SCHED) trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); +#endif + cgroup_account_cputime(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } @@ -882,6 +1057,7 @@ static void update_curr_fair(struct rq *rq) static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { +#if !defined(CONFIG_CACHY_SCHED) u64 wait_start, prev_wait_start; if (!schedstat_enabled()) @@ -895,11 +1071,13 @@ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) wait_start -= prev_wait_start; __schedstat_set(se->statistics.wait_start, wait_start); +#endif } static inline void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { +#if !defined(CONFIG_CACHY_SCHED) struct task_struct *p; u64 delta; @@ -927,11 +1105,13 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) __schedstat_inc(se->statistics.wait_count); __schedstat_add(se->statistics.wait_sum, delta); __schedstat_set(se->statistics.wait_start, 0); +#endif } static inline void update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { +#if !defined(CONFIG_CACHY_SCHED) struct task_struct *tsk = NULL; u64 sleep_start, block_start; @@ -995,6 +1175,7 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) account_scheduler_latency(tsk, delta >> 10, 0); } } +#endif } /* @@ -1003,6 +1184,7 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { +#if !defined(CONFIG_CACHY_SCHED) if (!schedstat_enabled()) return; @@ -1015,12 +1197,13 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & ENQUEUE_WAKEUP) update_stats_enqueue_sleeper(cfs_rq, se); +#endif } static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - +#if !defined(CONFIG_CACHY_SCHED) if (!schedstat_enabled()) return; @@ -1041,6 +1224,7 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) __schedstat_set(se->statistics.block_start, rq_clock(rq_of(cfs_rq))); } +#endif } /* @@ -1052,7 +1236,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * We are starting a new run period: */ - se->exec_start = rq_clock_task(rq_of(cfs_rq)); + se->exec_start = sched_clock(); } /************************************************** @@ -3060,15 +3244,19 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { +#if !defined(CONFIG_CACHY_SCHED) cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; +#endif } static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { +#if !defined(CONFIG_CACHY_SCHED) sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); 
sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); +#endif } #else static inline void @@ -3324,6 +3512,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { +#if !defined(CONFIG_CACHY_SCHED) u64 p_last_update_time; u64 n_last_update_time; @@ -3363,6 +3552,7 @@ void set_task_rq_fair(struct sched_entity *se, #endif __update_load_avg_blocked_se(p_last_update_time, se); se->avg.last_update_time = n_last_update_time; +#endif } @@ -3642,6 +3832,9 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { +#ifdef CONFIG_CACHY_SCHED + return 0; +#else unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0; struct sched_avg *sa = &cfs_rq->avg; int decayed = 0; @@ -3687,8 +3880,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) #endif return decayed; +#endif } +#if !defined(CONFIG_CACHY_SCHED) /** * attach_entity_load_avg - attach this entity to its cfs_rq load avg * @cfs_rq: cfs_rq to attach to @@ -3766,6 +3961,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s trace_pelt_cfs_tp(cfs_rq); } +#endif /* * Optional action to be done while updating the load average @@ -3777,6 +3973,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { +#if !defined(CONFIG_CACHY_SCHED) u64 now = cfs_rq_clock_pelt(cfs_rq); int decayed; @@ -3808,8 +4005,10 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s if (flags & UPDATE_TG) update_tg_load_avg(cfs_rq, 0); } +#endif } +#if !defined(CONFIG_CACHY_SCHED) #ifndef CONFIG_64BIT static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) { @@ -3830,6 +4029,7 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) return cfs_rq->avg.last_update_time; } #endif +#endif /* * Synchronize entity load avg of dequeued entity without locking @@ -3837,11 +4037,13 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) */ static void sync_entity_load_avg(struct sched_entity *se) { +#if !defined(CONFIG_CACHY_SCHED) struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 last_update_time; last_update_time = cfs_rq_last_update_time(cfs_rq); __update_load_avg_blocked_se(last_update_time, se); +#endif } /* @@ -3850,6 +4052,7 @@ static void sync_entity_load_avg(struct sched_entity *se) */ static void remove_entity_load_avg(struct sched_entity *se) { +#if !defined(CONFIG_CACHY_SCHED) struct cfs_rq *cfs_rq = cfs_rq_of(se); unsigned long flags; @@ -3867,6 +4070,7 @@ static void remove_entity_load_avg(struct sched_entity *se) cfs_rq->removed.load_avg += se->avg.load_avg; cfs_rq->removed.runnable_avg += se->avg.runnable_avg; raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); +#endif } static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq) @@ -4089,7 +4293,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) { -#ifdef CONFIG_SCHED_DEBUG +#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_CACHY_SCHED) s64 d = se->vruntime - cfs_rq->min_vruntime; if (d < 0) @@ -4100,6 +4304,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) #endif } +#if 
!defined(CONFIG_CACHY_SCHED) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { @@ -4133,9 +4338,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +#endif /* CONFIG_CACHY_SCHED */ static inline void check_schedstat_required(void) { +#if !defined(CONFIG_CACHY_SCHED) #ifdef CONFIG_SCHEDSTATS if (schedstat_enabled()) return; @@ -4152,6 +4359,7 @@ static inline void check_schedstat_required(void) "kernel.sched_schedstats=1\n"); } #endif +#endif } static inline bool cfs_bandwidth_used(void); @@ -4189,18 +4397,23 @@ static inline bool cfs_bandwidth_used(void); static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { +#if !defined(CONFIG_CACHY_SCHED) bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); +#endif bool curr = cfs_rq->curr == se; +#if !defined(CONFIG_CACHY_SCHED) /* * If we're the current task, we must renormalise before calling * update_curr(). */ if (renorm && curr) se->vruntime += cfs_rq->min_vruntime; +#endif update_curr(cfs_rq); +#if !defined(CONFIG_CACHY_SCHED) /* * Otherwise, renormalise after, such that we're placed at the current * moment in time, instead of some random moment in the past. Being @@ -4209,6 +4422,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ if (renorm && !curr) se->vruntime += cfs_rq->min_vruntime; +#endif /* * When enqueuing a sched_entity, we must: @@ -4223,8 +4437,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_group(se); account_entity_enqueue(cfs_rq, se); +#if !defined(CONFIG_CACHY_SCHED) if (flags & ENQUEUE_WAKEUP) place_entity(cfs_rq, se, 0); +#endif check_schedstat_required(); update_stats_enqueue(cfs_rq, se, flags); @@ -4233,6 +4449,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) __enqueue_entity(cfs_rq, se); se->on_rq = 1; +#if !defined(CONFIG_CACHY_SCHED) /* * When bandwidth control is enabled, cfs might have been removed * because of a parent been throttled but cfs->nr_running > 1. Try to @@ -4243,8 +4460,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (cfs_rq->nr_running == 1) check_enqueue_throttle(cfs_rq); +#endif } +#if !defined(CONFIG_CACHY_SCHED) static void __clear_buddies_last(struct sched_entity *se) { for_each_sched_entity(se) { @@ -4289,6 +4508,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) if (cfs_rq->skip == se) __clear_buddies_skip(se); } +#endif // !CONFIG_CACHY_SCHED static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); @@ -4313,13 +4533,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_stats_dequeue(cfs_rq, se, flags); +#if !defined(CONFIG_CACHY_SCHED) clear_buddies(cfs_rq, se); +#endif if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; account_entity_dequeue(cfs_rq, se); +#if !defined(CONFIG_CACHY_SCHED) /* * Normalize after update_curr(); which will also have moved * min_vruntime if @se is the one holding it back. 
But before doing @@ -4328,12 +4551,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ if (!(flags & DEQUEUE_SLEEP)) se->vruntime -= cfs_rq->min_vruntime; +#endif /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); update_cfs_group(se); +#if !defined(CONFIG_CACHY_SCHED) /* * Now advance min_vruntime if @se was the entity holding it back, * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be @@ -4342,8 +4567,35 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) update_min_vruntime(cfs_rq); +#endif +} + +#ifdef CONFIG_CACHY_SCHED +static inline u32 +hrrn_of(u64 now, struct sched_entity *se) +{ + u32 l_se, r_se, vr_se; + + vr_se = U64_TO_U32(se->vruntime) | 1; + l_se = U64_TO_U32(now - se->hrrn_start_time); + r_se = l_se / vr_se; + + return r_se; } +/* + * Preempt the current task with a newly woken task if needed: + */ +static void +check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +{ + u64 now = sched_clock(); + + // does head have higher HRRN value than curr + if (entity_before(now, curr, cfs_rq->head) == 1) + resched_curr(rq_of(cfs_rq)); +} +#else /* * Preempt the current task with a newly woken task if needed: */ @@ -4383,6 +4635,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (delta > ideal_runtime) resched_curr(rq_of(cfs_rq)); } +#endif /* CONFIG_CACHY_SCHED */ static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -4417,6 +4670,21 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } +#ifdef CONFIG_CACHY_SCHED +static struct sched_entity * +pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) +{ + struct sched_entity *se = cfs_rq->head; + u64 now = sched_clock(); + + if (unlikely(!se)) + se = curr; + else if (unlikely(curr && entity_before(now, se, curr) == 1)) + se = curr; + + return se; +} +#else static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); @@ -4477,6 +4745,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) return se; } +#endif /* CONFIG_CACHY_SCHED */ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); @@ -4736,6 +5005,9 @@ static int tg_throttle_down(struct task_group *tg, void *data) static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { +#ifdef CONFIG_CACHY_SCHED + return false; +#else struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; @@ -4801,10 +5073,12 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); return true; +#endif } void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) { +#if !defined(CONFIG_CACHY_SCHED) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; @@ -4886,6 +5160,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* Determine whether we need to wake up potentially idle CPU: */ if (rq->curr == rq->idle && rq->cfs.nr_running) resched_curr(rq); +#endif } static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) @@ -5341,7 +5616,11 @@ static inline bool cfs_bandwidth_used(void) static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } + +#if !defined(CONFIG_CACHY_SCHED) static void check_enqueue_throttle(struct 
cfs_rq *cfs_rq) {} +#endif + static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -5472,7 +5751,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; +#if !defined(CONFIG_CACHY_SCHED) int idle_h_nr_running = task_has_idle_policy(p); +#endif int task_new = !(flags & ENQUEUE_WAKEUP); /* @@ -5491,6 +5772,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (p->in_iowait) cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); +#ifdef CONFIG_CACHY_SCHED + if (!se->on_rq) { + cfs_rq = cfs_rq_of(se); + enqueue_entity(cfs_rq, se, flags); + cfs_rq->h_nr_running++; + } +#else for_each_sched_entity(se) { if (se->on_rq) break; @@ -5528,6 +5816,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (throttled_hierarchy(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); } +#endif /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); @@ -5549,6 +5838,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!task_new) update_overutilized_status(rq); +#if !defined(CONFIG_CACHY_SCHED) enqueue_throttle: if (cfs_bandwidth_used()) { /* @@ -5564,13 +5854,15 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; } } - +#endif assert_list_leaf_cfs_rq(rq); hrtick_update(rq); } +#if !defined(CONFIG_CACHY_SCHED) static void set_next_buddy(struct sched_entity *se); +#endif /* * The dequeue_task method is called before nr_running is @@ -5582,6 +5874,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; + +#ifdef CONFIG_CACHY_SCHED + cfs_rq = cfs_rq_of(se); + dequeue_entity(cfs_rq, se, flags); + cfs_rq->h_nr_running--; +#else int idle_h_nr_running = task_has_idle_policy(p); bool was_sched_idle = sched_idle_rq(rq); @@ -5600,12 +5898,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ se = parent_entity(se); + /* * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. */ if (task_sleep && se && !throttled_hierarchy(cfs_rq)) set_next_buddy(se); + break; } flags |= DEQUEUE_SLEEP; @@ -5626,15 +5926,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) goto dequeue_throttle; } - +#endif /* At this point se is NULL and we are at root level*/ sub_nr_running(rq, 1); +#if !defined(CONFIG_CACHY_SCHED) /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance = jiffies; dequeue_throttle: +#endif util_est_dequeue(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -5721,6 +6023,7 @@ static unsigned long capacity_of(int cpu) return cpu_rq(cpu)->cpu_capacity; } +#if !defined(CONFIG_CACHY_SCHED) static void record_wakee(struct task_struct *p) { /* @@ -5767,7 +6070,9 @@ static int wake_wide(struct task_struct *p) return 0; return 1; } +#endif +#if !defined(CONFIG_CACHY_SCHED) /* * The purpose of wake_affine() is to quickly determine on which CPU we can run * soonest. 
For the purpose of speed we only consider the waking and previous @@ -5866,6 +6171,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, schedstat_inc(p->se.statistics.nr_wakeups_affine); return target; } +#endif static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); @@ -6424,6 +6730,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) return min_t(unsigned long, util, capacity_orig_of(cpu)); } +#if !defined(CONFIG_CACHY_SCHED) /* * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) * to @dst_cpu. @@ -6657,6 +6964,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) return -1; } +#endif /* CONFIG_CACHY_SCHED */ /* * select_task_rq_fair: Select target runqueue for the waking task in domains @@ -6673,6 +6981,11 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) static int select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) { +#ifdef CONFIG_CACHY_SCHED + int new_cpu = prev_cpu; + + return select_idle_sibling(p, prev_cpu, new_cpu); +#else struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; @@ -6727,9 +7040,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f rcu_read_unlock(); return new_cpu; +#endif /* CONFIG_CACHY_SCHED */ } +#if !defined(CONFIG_CACHY_SCHED) static void detach_entity_cfs_rq(struct sched_entity *se); +#endif /* * Called immediately before a task is migrated to a new CPU; task_cpu(p) and @@ -6738,6 +7054,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se); */ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { +#if !defined(CONFIG_CACHY_SCHED) /* * As blocked tasks retain absolute vruntime the migration needs to * deal with this by subtracting the old and adding the new @@ -6783,6 +7100,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) */ remove_entity_load_avg(&p->se); } +#endif /* CONFIG_CACHY_SCHED */ /* Tell new CPU we are migrated */ p->se.avg.last_update_time = 0; @@ -6808,6 +7126,7 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } #endif /* CONFIG_SMP */ +#if !defined(CONFIG_CACHY_SCHED) static unsigned long wakeup_gran(struct sched_entity *se) { unsigned long gran = sysctl_sched_wakeup_granularity; @@ -6886,6 +7205,7 @@ static void set_skip_buddy(struct sched_entity *se) for_each_sched_entity(se) cfs_rq_of(se)->skip = se; } +#endif /* CONFIG_CACHY_SCHED */ /* * Preempt the current task with a newly woken task if needed: @@ -6894,9 +7214,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; + +#if !defined(CONFIG_CACHY_SCHED) struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; int next_buddy_marked = 0; +#endif /* CONFIG_CACHY_SCHED */ if (unlikely(se == pse)) return; @@ -6910,10 +7233,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) return; +#if !defined(CONFIG_CACHY_SCHED) if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { set_next_buddy(pse); next_buddy_marked = 1; } +#endif /* CONFIG_CACHY_SCHED */ /* * We can come here with TIF_NEED_RESCHED already set from new task @@ -6943,6 +7268,11 @@ static void check_preempt_wakeup(struct rq *rq, 
struct task_struct *p, int wake_ find_matching_se(&se, &pse); update_curr(cfs_rq_of(se)); BUG_ON(!pse); + +#ifdef CONFIG_CACHY_SCHED + if (entity_before(sched_clock(), se, pse) == 1) + goto preempt; +#else if (wakeup_preempt_entity(se, pse) == 1) { /* * Bias pick_next to pick the sched entity that is @@ -6952,11 +7282,14 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ set_next_buddy(pse); goto preempt; } +#endif /* CONFIG_CACHY_SCHED */ return; preempt: resched_curr_lazy(rq); + +#if !defined(CONFIG_CACHY_SCHED) /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved @@ -6971,6 +7304,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) set_last_buddy(se); +#endif /* CONFIG_CACHY_SCHED */ } struct task_struct * @@ -7066,11 +7400,23 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf if (prev) put_prev_task(rq, prev); +#ifdef CONFIG_CACHY_SCHED + se = pick_next_entity(cfs_rq, NULL); + set_next_entity(cfs_rq, se); + + if (cfs_rq->head) { + u32 hrrn = hrrn_of(sched_clock(), cfs_rq->head); + WRITE_ONCE(cfs_rq->hrrn_head, hrrn); + } else { + WRITE_ONCE(cfs_rq->hrrn_head, 0UL); + } +#else do { se = pick_next_entity(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); +#endif p = task_of(se); @@ -7092,6 +7438,10 @@ done: __maybe_unused; return p; idle: +#ifdef CONFIG_CACHY_SCHED + WRITE_ONCE(cfs_rq->hrrn_head, 0UL); +#endif + if (!rf) return NULL; @@ -7145,7 +7495,10 @@ static void yield_task_fair(struct rq *rq) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); + +#if !defined(CONFIG_CACHY_SCHED) struct sched_entity *se = &curr->se; +#endif /* * Are we the only task in the tree? @@ -7153,7 +7506,9 @@ static void yield_task_fair(struct rq *rq) if (unlikely(rq->nr_running == 1)) return; +#if !defined(CONFIG_CACHY_SCHED) clear_buddies(cfs_rq, se); +#endif if (curr->policy != SCHED_BATCH) { update_rq_clock(rq); @@ -7169,7 +7524,9 @@ static void yield_task_fair(struct rq *rq) rq_clock_skip_update(rq); } +#if !defined(CONFIG_CACHY_SCHED) set_skip_buddy(se); +#endif } static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) @@ -7180,8 +7537,10 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) return false; +#if !defined(CONFIG_CACHY_SCHED) /* Tell the scheduler that we'd really like pse to run next. */ set_next_buddy(se); +#endif yield_task_fair(rq); @@ -7390,6 +7749,7 @@ struct lb_env { struct list_head tasks; }; +#if !defined(CONFIG_CACHY_SCHED) /* * Is this task likely cache-hot: */ @@ -7737,7 +8097,9 @@ static int detach_tasks(struct lb_env *env) return detached; } +#endif +#if !defined(CONFIG_CACHY_SCHED) /* * attach_task() -- attach the task detached by detach_task() to its new rq. 
*/ @@ -7786,6 +8148,7 @@ static void attach_tasks(struct lb_env *env) rq_unlock(env->dst_rq, &rf); } +#endif #ifdef CONFIG_NO_HZ_COMMON static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) @@ -7831,6 +8194,7 @@ static inline bool others_have_blocked(struct rq *rq) { return false; } static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} #endif +#if !defined(CONFIG_CACHY_SCHED) static bool __update_blocked_others(struct rq *rq, bool *done) { const struct sched_class *curr_class; @@ -7856,6 +8220,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done) return decayed; } +#endif #ifdef CONFIG_FAIR_GROUP_SCHED @@ -7963,6 +8328,7 @@ static unsigned long task_h_load(struct task_struct *p) cfs_rq_load_avg(cfs_rq) + 1); } #else +#if !defined(CONFIG_CACHY_SCHED) static bool __update_blocked_fair(struct rq *rq, bool *done) { struct cfs_rq *cfs_rq = &rq->cfs; @@ -7974,6 +8340,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) return decayed; } +#endif static unsigned long task_h_load(struct task_struct *p) { @@ -7983,6 +8350,7 @@ static unsigned long task_h_load(struct task_struct *p) static void update_blocked_averages(int cpu) { +#if !defined(CONFIG_CACHY_SCHED) bool decayed = false, done = true; struct rq *rq = cpu_rq(cpu); struct rq_flags rf; @@ -7997,6 +8365,7 @@ static void update_blocked_averages(int cpu) if (decayed) cpufreq_update_util(rq, 0); rq_unlock_irqrestore(rq, &rf); +#endif } /********** Helpers for find_busiest_group ************************/ @@ -9140,6 +9509,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * different in groups. */ +#if !defined(CONFIG_CACHY_SCHED) /** * find_busiest_group - Returns the busiest group within the sched_domain * if there is an imbalance. @@ -9405,6 +9775,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, return busiest; } +#endif /* * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but @@ -9451,6 +9822,7 @@ voluntary_active_balance(struct lb_env *env) return 0; } +#if !defined(CONFIG_CACHY_SCHED) static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; @@ -9463,6 +9835,7 @@ static int need_active_balance(struct lb_env *env) static int active_load_balance_cpu_stop(void *data); + static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; @@ -9772,6 +10145,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, out: return ld_moved; } +#endif static inline unsigned long get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) @@ -9801,6 +10175,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance) *next_balance = next; } +#if !defined(CONFIG_CACHY_SCHED) /* * active_load_balance_cpu_stop is run by the CPU stopper. It pushes * running tasks off the busiest CPU onto idle CPUs. It requires at @@ -9892,6 +10267,7 @@ static int active_load_balance_cpu_stop(void *data) } static DEFINE_SPINLOCK(balancing); +#endif /* * Scale the max load_balance interval with the number of CPUs in the system. @@ -9902,6 +10278,7 @@ void update_max_interval(void) max_load_balance_interval = HZ*num_online_cpus()/10; } +#if !defined(CONFIG_CACHY_SCHED) /* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. 
@@ -10007,6 +10384,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) #endif } } +#endif static inline int on_null_domain(struct rq *rq) { @@ -10036,6 +10414,7 @@ static inline int find_new_ilb(void) return nr_cpu_ids; } +#if !defined(CONFIG_CACHY_SCHED) /* * Kick a CPU to do the nohz balancing, if it is time for it. We pick any * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one). @@ -10186,6 +10565,7 @@ static void nohz_balancer_kick(struct rq *rq) if (flags) kick_ilb(flags); } +#endif static void set_cpu_sd_state_busy(int cpu) { @@ -10293,6 +10673,7 @@ void nohz_balance_enter_idle(int cpu) WRITE_ONCE(nohz.has_blocked, 1); } +#if !defined(CONFIG_CACHY_SCHED) /* * Internal function that runs load balance for all idle cpus. The load balance * can be a simple update of blocked load or a complete load balance with @@ -10453,8 +10834,10 @@ static void nohz_newidle_balance(struct rq *this_rq) kick_ilb(NOHZ_STATS_KICK); raw_spin_lock(&this_rq->lock); } +#endif #else /* !CONFIG_NO_HZ_COMMON */ +#if !defined(CONFIG_CACHY_SCHED) static inline void nohz_balancer_kick(struct rq *rq) { } static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) @@ -10463,8 +10846,111 @@ static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle } static inline void nohz_newidle_balance(struct rq *this_rq) { } +#endif + #endif /* CONFIG_NO_HZ_COMMON */ +#ifdef CONFIG_CACHY_SCHED +static int +cachy_can_migrate_task(struct task_struct *p, int dst_cpu, struct rq *src_rq) +{ + /* + * We do not migrate tasks that are: + * 1) throttled_lb_pair, or + * 2) cannot be migrated to this CPU due to cpus_ptr, or + * 3) running (obviously), or + */ + + if (task_running(src_rq, p)) + return 0; + + if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr)) + return 0; + + if (p->se.exec_start == 0) + return 0; + + return 1; +} + +static void pull_from_unlock(struct rq *this_rq, + struct rq *src_rq, + struct rq_flags *rf, + struct task_struct *p, + int dst_cpu) +{ + // detach task + deactivate_task(src_rq, p, DEQUEUE_NOCLOCK); + set_task_cpu(p, dst_cpu); + + // unlock src rq + rq_unlock(src_rq, rf); + local_irq_restore(rf->flags); + + // lock this rq + raw_spin_lock(&this_rq->lock); + update_rq_clock(this_rq); + + activate_task(this_rq, p, ENQUEUE_NOCLOCK); + check_preempt_curr(this_rq, p, 0); + + // unlock this rq + raw_spin_unlock(&this_rq->lock); +} + +static inline struct rq * +find_max_hrrn_rq(struct cfs_rq *cfs_rq, int dst_cpu) +{ + struct rq *tmp_rq, *max_rq = NULL; + int cpu; + u32 max_hrrn = cfs_rq->hrrn_head; + u32 local_hrrn; + + // find max hrrn + for_each_online_cpu(cpu) { + if (cpu == dst_cpu) + continue; + + tmp_rq = cpu_rq(cpu); + + if (tmp_rq->cfs.nr_running < 2 || !tmp_rq->cfs.head) + continue; + + local_hrrn = READ_ONCE(tmp_rq->cfs.hrrn_head); + + if (local_hrrn > max_hrrn) { + max_hrrn = local_hrrn; + max_rq = tmp_rq; + } + } + + return max_rq; +} + +static int try_pull_from(struct rq *src_rq, struct rq *this_rq) +{ + struct rq_flags rf; + int dst_cpu = cpu_of(this_rq); + struct task_struct *p; + + rq_lock_irqsave(src_rq, &rf); + update_rq_clock(src_rq); + + if (src_rq->cfs.head && src_rq->cfs.nr_running > 1) { + p = task_of(src_rq->cfs.head); + + if (cachy_can_migrate_task(p, dst_cpu, src_rq)) { + pull_from_unlock(this_rq, src_rq, &rf, p, dst_cpu); + return 1; + } + } + + rq_unlock(src_rq, &rf); + local_irq_restore(rf.flags); + + return 0; +} + /* * idle_balance is called by schedule() if this_cpu is about to become * idle. 
Attempts to pull tasks from other CPUs. @@ -10475,6 +10961,105 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * > 0 - success, new (fair) tasks present */ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +{ + int this_cpu = this_rq->cpu; + struct task_struct *p = NULL; + struct rq *src_rq; + int src_cpu; + struct rq_flags src_rf; + int pulled_task = 0; + int cores_round = 1; + + update_misfit_status(NULL, this_rq); + /* + * We must set idle_stamp _before_ calling idle_balance(), such that we + * measure the duration of idle_balance() as idle time. + */ + this_rq->idle_stamp = rq_clock(this_rq); + + /* + * Do not pull tasks towards !active CPUs... + */ + if (!cpu_active(this_cpu)) + return 0; + + /* + * This is OK, because current is on_cpu, which avoids it being picked + * for load-balance and preemption/IRQs are still disabled avoiding + * further scheduler activity on it and we're being very careful to + * re-start the picking loop. + */ + rq_unpin_lock(this_rq, rf); + raw_spin_unlock(&this_rq->lock); + +again: + for_each_online_cpu(src_cpu) { + + if (src_cpu == this_cpu) + continue; + + if (cores_round && !cpus_share_cache(src_cpu, this_cpu)) + continue; + + src_rq = cpu_rq(src_cpu); + + rq_lock_irqsave(src_rq, &src_rf); + update_rq_clock(src_rq); + + if (src_rq->cfs.nr_running < 2 || !(src_rq->cfs.head)) + goto next; + + p = task_of(src_rq->cfs.head); + + if (cachy_can_migrate_task(p, this_cpu, src_rq)) { + pull_from_unlock(this_rq, src_rq, &src_rf, p, this_cpu); + + pulled_task = 1; + goto out; + } + +next: + rq_unlock(src_rq, &src_rf); + local_irq_restore(src_rf.flags); + + /* + * Stop searching for tasks to pull if there are + * now runnable tasks on this rq. + */ + if (pulled_task || this_rq->nr_running > 0) + goto out; + } + + if (cores_round) { + // now search for all cpus + cores_round = 0; + goto again; + } + +out: + raw_spin_lock(&this_rq->lock); + + /* + * While browsing the domains, we released the rq lock, a task could + * have been enqueued in the meantime. Since we're not going idle, + * pretend we pulled a task. + */ + if (this_rq->cfs.h_nr_running && !pulled_task) + pulled_task = 1; + + /* Is there a task of a high priority class? 
*/ + if (this_rq->nr_running != this_rq->cfs.h_nr_running) + pulled_task = -1; + + if (pulled_task) + this_rq->idle_stamp = 0; + + rq_repin_lock(this_rq, rf); + + return pulled_task; +} +#else +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; @@ -10625,6 +11210,159 @@ void trigger_load_balance(struct rq *rq) nohz_balancer_kick(rq); } +#endif + +#ifdef CONFIG_CACHY_SCHED +static int +idle_try_pull_any(struct cfs_rq *cfs_rq) +{ + struct task_struct *p = NULL; + struct rq *this_rq = rq_of(cfs_rq), *src_rq; + int dst_cpu = cpu_of(this_rq); + int src_cpu; + struct rq_flags rf; + int pulled = 0; + int cores_round = 1; + +again: + for_each_online_cpu(src_cpu) { + + if (src_cpu == dst_cpu) + continue; + + if (cores_round && !cpus_share_cache(src_cpu, dst_cpu)) + continue; + + src_rq = cpu_rq(src_cpu); + + rq_lock_irqsave(src_rq, &rf); + update_rq_clock(src_rq); + + if (src_rq->cfs.nr_running < 2 || !src_rq->cfs.head) + goto next; + + p = task_of(src_rq->cfs.head); + + if (cachy_can_migrate_task(p, dst_cpu, src_rq)) { + pull_from_unlock(this_rq, src_rq, &rf, p, dst_cpu); + pulled = 1; + goto out; + } + +next: + rq_unlock(src_rq, &rf); + local_irq_restore(rf.flags); + } + + if (cores_round) { + // now search for all cpus + cores_round = 0; + goto again; + } + +out: + return pulled; +} + + +static int +try_pull_higher_HRRN(struct cfs_rq *cfs_rq) +{ + struct rq *this_rq = rq_of(cfs_rq), *max_rq; + int dst_cpu = cpu_of(this_rq); + + max_rq = find_max_hrrn_rq(cfs_rq, dst_cpu); + + if (!max_rq) + return 0; + + if (try_pull_from(max_rq, this_rq)) + return 1; + + return 0; +} + +static void try_pull_any(struct rq *this_rq) +{ + struct task_struct *p = NULL; + struct rq *src_rq; + int dst_cpu = cpu_of(this_rq); + int src_cpu; + struct rq_flags src_rf; + int cores_round = 1; + +again: + for_each_online_cpu(src_cpu) { + + if (src_cpu == dst_cpu) + continue; + + if (cores_round && !cpus_share_cache(src_cpu, dst_cpu)) + continue; + + src_rq = cpu_rq(src_cpu); + + if (src_rq->cfs.nr_running < 2 || !(src_rq->cfs.head) + || src_rq->cfs.nr_running <= this_rq->cfs.nr_running) + continue; + + rq_lock_irqsave(src_rq, &src_rf); + update_rq_clock(src_rq); + + if (src_rq->cfs.nr_running < 2 || !(src_rq->cfs.head) + || src_rq->cfs.nr_running <= this_rq->cfs.nr_running) + goto next; + + p = task_of(src_rq->cfs.head); + + if (cachy_can_migrate_task(p, dst_cpu, src_rq)) { + pull_from_unlock(this_rq, src_rq, &src_rf, p, dst_cpu); + return; + } + +next: + rq_unlock(src_rq, &src_rf); + local_irq_restore(src_rf.flags); + } + + if (cores_round) { + // now search for all cpus + cores_round = 0; + goto again; + } +} + +static inline void +active_balance(struct rq *rq) +{ + struct cfs_rq *cfs_rq = &rq->cfs; + + if (!cfs_rq->head || cfs_rq->nr_running < 2) + try_pull_higher_HRRN(&rq->cfs); + else + try_pull_any(rq); +} + +void trigger_load_balance(struct rq *rq) +{ + //int pulled = 0; + + /* Don't need to rebalance while attached to NULL domain */ + if (unlikely(on_null_domain(rq))) + return; + + if (rq->idle_balance) { + //pulled = idle_try_pull_any(&rq->cfs); + idle_try_pull_any(&rq->cfs); + + //if (pulled) + //resched_curr(rq); + } + else { + active_balance(rq); + } +} +#endif static void rq_online_fair(struct rq *rq) { @@ -10668,12 +11406,31 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) update_overutilized_status(task_rq(curr)); } +#ifdef CONFIG_CACHY_SCHED /* * called on fork with the child task as 
argument from the parent's context * - child not yet on the tasklist * - preemption disabled */ static void task_fork_fair(struct task_struct *p) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *curr; + struct rq *rq = this_rq(); + struct rq_flags rf; + + rq_lock(rq, &rf); + update_rq_clock(rq); + + cfs_rq = task_cfs_rq(current); + curr = cfs_rq->curr; + if (curr) + update_curr(cfs_rq); + + rq_unlock(rq, &rf); +} +#else +static void task_fork_fair(struct task_struct *p) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; @@ -10703,6 +11460,7 @@ static void task_fork_fair(struct task_struct *p) se->vruntime -= cfs_rq->min_vruntime; rq_unlock(rq, &rf); } +#endif /* CONFIG_CACHY_SCHED */ /* * Priority of the task has changed. Check to see if we preempt @@ -10779,9 +11537,12 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) } } #else +#if !defined(CONFIG_CACHY_SCHED) static void propagate_entity_cfs_rq(struct sched_entity *se) { } #endif +#endif +#if !defined(CONFIG_CACHY_SCHED) static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -10792,9 +11553,11 @@ static void detach_entity_cfs_rq(struct sched_entity *se) update_tg_load_avg(cfs_rq, false); propagate_entity_cfs_rq(se); } +#endif static void attach_entity_cfs_rq(struct sched_entity *se) { +#if !defined(CONFIG_CACHY_SCHED) struct cfs_rq *cfs_rq = cfs_rq_of(se); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -10810,10 +11573,12 @@ static void attach_entity_cfs_rq(struct sched_entity *se) attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); propagate_entity_cfs_rq(se); +#endif } static void detach_task_cfs_rq(struct task_struct *p) { +#if !defined(CONFIG_CACHY_SCHED) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -10827,17 +11592,19 @@ static void detach_task_cfs_rq(struct task_struct *p) } detach_entity_cfs_rq(se); +#endif } static void attach_task_cfs_rq(struct task_struct *p) { +#if !defined(CONFIG_CACHY_SCHED) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); attach_entity_cfs_rq(se); - if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; +#endif } static void switched_from_fair(struct rq *rq, struct task_struct *p) @@ -10893,13 +11660,21 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; + +#if !defined(CONFIG_CACHY_SCHED) cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif +#endif /* CONFIG_CACHY_SCHED */ + #ifdef CONFIG_SMP raw_spin_lock_init(&cfs_rq->removed.lock); #endif + +#ifdef CONFIG_CACHY_SCHED + cfs_rq->head = NULL; +#endif } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -11224,7 +11999,9 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) __init void init_sched_fair_class(void) { #ifdef CONFIG_SMP +#if !defined(CONFIG_CACHY_SCHED) open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); +#endif #ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8d1ca65db3b0..65ffb0244f1e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -523,10 +523,13 @@ struct cfs_rq { unsigned int idle_h_nr_running; /* SCHED_IDLE */ u64 exec_clock; + +#if !defined(CONFIG_CACHY_SCHED) u64 min_vruntime; #ifndef CONFIG_64BIT u64 min_vruntime_copy; #endif +#endif // CONFIG_CACHY_SCHED struct rb_root_cached tasks_timeline; @@ -535,9 +538,14 @@ struct 
cfs_rq {
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
 	struct sched_entity	*curr;
+#ifdef CONFIG_CACHY_SCHED
+	struct sched_entity	*head;
+	u32			hrrn_head;
+#else
 	struct sched_entity	*next;
 	struct sched_entity	*last;
 	struct sched_entity	*skip;
+#endif // CONFIG_CACHY_SCHED
 
 #ifdef CONFIG_SCHED_DEBUG
 	unsigned int		nr_spread_over;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index afad085960b8..bfe341f83969 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1659,6 +1659,22 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_CACHY_SCHED
+	{
+		.procname	= "sched_hrrn_max_lifetime_ms",
+		.data		= &hrrn_max_lifetime,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sched_cachy_harsh_mode_enabled",
+		.data		= &cachy_harsh_mode,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
 #ifdef CONFIG_SCHED_DEBUG
 	{
 		.procname	= "sched_min_granularity_ns",
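
Notes on the HRRN ordering (illustration only, not part of the patch):

entity_before() in the fair.c changes ranks runnable tasks by an approximated
response ratio R = (now - hrrn_start_time) / vruntime, i.e. how long a task
has existed relative to the CPU time it has been charged; the entity with the
highest ratio runs first, which is the Highest Response Ratio Next rule. The
standalone sketch below mirrors that comparison in plain userspace C so the
integer arithmetic is easy to experiment with; the struct, field names and
demo values are stand-ins for the kernel's sched_entity, not kernel code.

/*
 * hrrn_demo.c - minimal userspace sketch of the entity_before() comparison.
 * Build with: gcc -O2 -o hrrn_demo hrrn_demo.c
 */
#include <stdint.h>
#include <stdio.h>

/* Same truncation the patch uses: drop the low 8 bits, keep 32 bits. */
#define U64_TO_U32(x) ((uint32_t)(((x) >> 8) & 0xFFFFFFFF))

struct demo_entity {
	uint64_t vruntime;		/* weighted CPU time consumed (ns) */
	uint64_t hrrn_start_time;	/* start of the HRRN life time (ns) */
};

/*
 * Return 1 if se has a higher response ratio than curr (se should run
 * before curr), otherwise -1, following the convention in the patch.
 */
static int entity_before(uint64_t now, const struct demo_entity *curr,
			 const struct demo_entity *se)
{
	/* "| 1" keeps the divisor non-zero for freshly started tasks. */
	uint32_t vr_curr = U64_TO_U32(curr->vruntime) | 1;
	uint32_t vr_se   = U64_TO_U32(se->vruntime) | 1;
	uint32_t l_curr  = U64_TO_U32(now - curr->hrrn_start_time);
	uint32_t l_se    = U64_TO_U32(now - se->hrrn_start_time);
	int32_t diff     = (int32_t)(l_se / vr_se) - (int32_t)(l_curr / vr_curr);

	if (diff == 0)	/* equal quotients: break the tie on the remainders */
		diff = (int32_t)(l_se % vr_se) - (int32_t)(l_curr % vr_curr);

	return diff > 0 ? 1 : -1;
}

int main(void)
{
	uint64_t now = 10000000000ULL;				/* t = 10 s */
	struct demo_entity heavy = { 4000000000ULL, 0 };	/* ran a lot since boot */
	struct demo_entity light = { 100000000ULL, 8000000000ULL }; /* newer, ran little */

	/* Expect 1: the lightly used task has the higher response ratio. */
	printf("light before heavy: %d\n", entity_before(now, &heavy, &light));
	return 0;
}

Two runtime knobs come with the patch. kernel.sched_hrrn_max_lifetime_ms
bounds the life time used in the ratio: reset_lifetime() converts it with a
"<< 20" shift, so the default of 30000 becomes 30000 * 2^20 ns, roughly
31.46 s rather than an exact 30 s, as its comment notes.
kernel.sched_cachy_harsh_mode_enabled controls the wake_up_new_task() hunk:
when it is 0 (the default) a newly woken task starts its HRRN life time at
sched_clock(), and when it is non-zero that reset is skipped. Both should be
adjustable at runtime once the patch is applied, e.g.
"sysctl kernel.sched_hrrn_max_lifetime_ms=30000".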