From 42468b19eb6a0e20bf9b27fe8b617e5f91a5d059 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Fri, 13 May 2022 22:42:02 +0200 Subject: [PATCH] latency_nice Signed-off-by: Peter Jung --- include/linux/sched.h | 3 + include/uapi/linux/sched.h | 4 +- include/uapi/linux/sched/types.h | 19 ++++++ init/init_task.c | 1 + kernel/sched/core.c | 90 ++++++++++++++++++++++++++ kernel/sched/debug.c | 1 + kernel/sched/fair.c | 105 ++++++++++++++++++++++++++++++- kernel/sched/sched.h | 34 ++++++++++ tools/include/uapi/linux/sched.h | 4 +- 9 files changed, 257 insertions(+), 4 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4b4cc633b266..b605c402c1e2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -560,6 +560,8 @@ struct sched_entity { unsigned long runnable_weight; #endif + int latency_weight; + #ifdef CONFIG_SMP /* * Per entity load average tracking. @@ -775,6 +777,7 @@ struct task_struct { int static_prio; int normal_prio; unsigned int rt_priority; + int latency_prio; struct sched_entity se; struct sched_rt_entity rt; diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 3bac0a8ceab2..b2e932c25be6 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -132,6 +132,7 @@ struct clone_args { #define SCHED_FLAG_KEEP_PARAMS 0x10 #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 +#define SCHED_FLAG_LATENCY_NICE 0x80 #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ SCHED_FLAG_KEEP_PARAMS) @@ -143,6 +144,7 @@ struct clone_args { SCHED_FLAG_RECLAIM | \ SCHED_FLAG_DL_OVERRUN | \ SCHED_FLAG_KEEP_ALL | \ - SCHED_FLAG_UTIL_CLAMP) + SCHED_FLAG_UTIL_CLAMP | \ + SCHED_FLAG_LATENCY_NICE) #endif /* _UAPI_LINUX_SCHED_H */ diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h index f2c4589d4dbf..db1e8199e8c8 100644 --- a/include/uapi/linux/sched/types.h +++ b/include/uapi/linux/sched/types.h @@ -10,6 +10,7 @@ struct sched_param { #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ #define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */ +#define SCHED_ATTR_SIZE_VER2 60 /* add: latency_nice */ /* * Extended scheduling parameters data structure. @@ -98,6 +99,22 @@ struct sched_param { * scheduled on a CPU with no more capacity than the specified value. * * A task utilization boundary can be reset by setting the attribute to -1. + * + * Latency Tolerance Attributes + * =========================== + * + * A subset of sched_attr attributes allows to specify the relative latency + * requirements of a task with respect to the other tasks running/queued in the + * system. + * + * @ sched_latency_nice task's latency_nice value + * + * The latency_nice of a task can have any value in a range of + * [MIN_LATENCY_NICE..MAX_LATENCY_NICE]. + * + * A task with latency_nice with the value of LATENCY_NICE_MIN can be + * taken for a task requiring a lower latency as opposed to the task with + * higher latency_nice. */ struct sched_attr { __u32 size; @@ -120,6 +137,8 @@ struct sched_attr { __u32 sched_util_min; __u32 sched_util_max; + /* latency requirement hints */ + __s32 sched_latency_nice; }; #endif /* _UAPI_LINUX_SCHED_TYPES_H */ diff --git a/init/init_task.c b/init/init_task.c index 73cc8f03511a..e98c71f24981 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -78,6 +78,7 @@ struct task_struct init_task .prio = MAX_PRIO - 20, .static_prio = MAX_PRIO - 20, .normal_prio = MAX_PRIO - 20, + .latency_prio = NICE_WIDTH - 20, .policy = SCHED_NORMAL, .cpus_ptr = &init_task.cpus_mask, .user_cpus_ptr = NULL, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1eec4925b8c6..40cf9338196b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1241,6 +1241,11 @@ static void set_load_weight(struct task_struct *p, bool update_load) } } +static void set_latency_weight(struct task_struct *p) +{ + p->se.latency_weight = sched_latency_to_weight[p->latency_prio]; +} + #ifdef CONFIG_UCLAMP_TASK /* * Serializes updates of utilization clamp values @@ -4393,6 +4398,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) */ p->prio = current->normal_prio; + /* Propagate the parent's latency requirements to the child as well */ + p->latency_prio = current->latency_prio; + uclamp_fork(p); /* @@ -4409,6 +4417,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->prio = p->normal_prio = p->static_prio; set_load_weight(p, false); + p->latency_prio = NICE_TO_LATENCY(0); + set_latency_weight(p); + /* * We don't need the reset flag anymore after the fork. It has * fulfilled its duty: @@ -7229,6 +7240,16 @@ static void __setscheduler_params(struct task_struct *p, p->rt_priority = attr->sched_priority; p->normal_prio = normal_prio(p); set_load_weight(p, true); + +} + +static void __setscheduler_latency(struct task_struct *p, + const struct sched_attr *attr) +{ + if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) { + p->latency_prio = NICE_TO_LATENCY(attr->sched_latency_nice); + set_latency_weight(p); + } } /* @@ -7355,6 +7376,17 @@ static int __sched_setscheduler(struct task_struct *p, return retval; } + if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) { + if (attr->sched_latency_nice > MAX_LATENCY_NICE) + return -EINVAL; + if (attr->sched_latency_nice < MIN_LATENCY_NICE) + return -EINVAL; + /* Use the same security checks as NICE */ + if (attr->sched_latency_nice < LATENCY_TO_NICE(p->latency_prio) && + !capable(CAP_SYS_NICE)) + return -EPERM; + } + if (pi) cpuset_read_lock(); @@ -7389,6 +7421,9 @@ static int __sched_setscheduler(struct task_struct *p, goto change; if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) goto change; + if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE && + attr->sched_latency_nice != LATENCY_TO_NICE(p->latency_prio)) + goto change; p->sched_reset_on_fork = reset_on_fork; retval = 0; @@ -7477,6 +7512,7 @@ static int __sched_setscheduler(struct task_struct *p, __setscheduler_params(p, attr); __setscheduler_prio(p, newprio); } + __setscheduler_latency(p, attr); __setscheduler_uclamp(p, attr); if (queued) { @@ -7687,6 +7723,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a size < SCHED_ATTR_SIZE_VER1) return -EINVAL; + if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) && + size < SCHED_ATTR_SIZE_VER2) + return -EINVAL; /* * XXX: Do we want to be lenient like existing syscalls; or do we want * to be strict and return an error on out-of-bounds values? @@ -7924,6 +7963,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, get_params(p, &kattr); kattr.sched_flags &= SCHED_FLAG_ALL; + kattr.sched_latency_nice = LATENCY_TO_NICE(p->latency_prio); + #ifdef CONFIG_UCLAMP_TASK /* * This could race with another potential updater, but this is fine @@ -10601,6 +10642,30 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, { return sched_group_set_idle(css_tg(css), idle); } + +static s64 cpu_latency_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->latency_prio; +} + +static int cpu_latency_write_s64(struct cgroup_subsys_state *css, + struct cftype *cft, s64 latency_prio) +{ + return sched_group_set_latency(css_tg(css), latency_prio); +} + +static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return LATENCY_TO_NICE(css_tg(css)->latency_prio); +} + +static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css, + struct cftype *cft, s64 latency_nice) +{ + return sched_group_set_latency(css_tg(css), NICE_TO_LATENCY(latency_nice)); +} #endif static struct cftype cpu_legacy_files[] = { @@ -10615,6 +10680,11 @@ static struct cftype cpu_legacy_files[] = { .read_s64 = cpu_idle_read_s64, .write_s64 = cpu_idle_write_s64, }, + { + .name = "latency", + .read_s64 = cpu_latency_nice_read_s64, + .write_s64 = cpu_latency_nice_write_s64, + }, #endif #ifdef CONFIG_CFS_BANDWIDTH { @@ -10832,6 +10902,12 @@ static struct cftype cpu_files[] = { .read_s64 = cpu_idle_read_s64, .write_s64 = cpu_idle_write_s64, }, + { + .name = "latency", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = cpu_latency_nice_read_s64, + .write_s64 = cpu_latency_nice_write_s64, + }, #endif #ifdef CONFIG_CFS_BANDWIDTH { @@ -10928,6 +11004,20 @@ const u32 sched_prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; +/* + * latency weight for wakeup preemption + */ +const int sched_latency_to_weight[40] = { + /* -20 */ 1024, 973, 922, 870, 819, + /* -15 */ 768, 717, 666, 614, 563, + /* -10 */ 512, 461, 410, 358, 307, + /* -5 */ 256, 205, 154, 102, 51, + /* 0 */ 0, -51, -102, -154, -205, + /* 5 */ -256, -307, -358, -410, -461, + /* 10 */ -512, -563, -614, -666, -717, + /* 15 */ -768, -819, -870, -922, -973, +}; + void call_trace_sched_update_nr_running(struct rq *rq, int count) { trace_sched_update_nr_running_tp(rq, count); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 102d6f70e84d..253e52ec73fb 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1043,6 +1043,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, #endif P(policy); P(prio); + P(latency_prio); if (task_has_dl_policy(p)) { P(dl.runtime); P(dl.deadline); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2f461f059278..ae5f83be7c7a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5584,6 +5584,35 @@ static int sched_idle_cpu(int cpu) } #endif +static void set_next_buddy(struct sched_entity *se); + +static void check_preempt_from_idle(struct cfs_rq *cfs, struct sched_entity *se) +{ + struct sched_entity *next; + + if (se->latency_weight <= 0) + return; + + if (cfs->nr_running <= 1) + return; + /* + * When waking from idle, we don't need to check to preempt at wakeup + * the idle thread and don't set next buddy as a candidate for being + * picked in priority. + * In case of simultaneous wakeup from idle, the latency sensitive tasks + * lost opportunity to preempt non sensitive tasks which woke up + * simultaneously. + */ + + if (cfs->next) + next = cfs->next; + else + next = __pick_first_entity(cfs); + + if (next && wakeup_preempt_entity(next, se) == 1) + set_next_buddy(se); +} + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -5677,6 +5706,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!task_new) update_overutilized_status(rq); + if (rq->curr == rq->idle) + check_preempt_from_idle(cfs_rq_of(&p->se), &p->se); + enqueue_throttle: if (cfs_bandwidth_used()) { /* @@ -5698,8 +5730,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) hrtick_update(rq); } -static void set_next_buddy(struct sched_entity *se); - /* * The dequeue_task method is called before nr_running is * decreased. We remove the task from the rbtree and @@ -6999,6 +7029,37 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } #endif /* CONFIG_SMP */ +static long wakeup_latency_gran(struct sched_entity *curr, struct sched_entity *se) +{ + int latency_weight = se->latency_weight; + long thresh = sysctl_sched_latency; + + /* + * A positive latency weigth means that the sched_entity has latency + * requirement that needs to be evaluated versus other entity. + * Otherwise, use the latency weight to evaluate how much scheduling + * delay is acceptable by se. + */ + if ((se->latency_weight > 0) || (curr->latency_weight > 0)) + latency_weight -= curr->latency_weight; + + if (!latency_weight) + return 0; + + if (sched_feat(GENTLE_FAIR_SLEEPERS)) + thresh >>= 1; + + /* + * Clamp the delta to stay in the scheduler period range + * [-sysctl_sched_latency:sysctl_sched_latency] + */ + latency_weight = clamp_t(long, latency_weight, + -1 * NICE_LATENCY_WEIGHT_MAX, + NICE_LATENCY_WEIGHT_MAX); + + return (thresh * latency_weight) >> NICE_LATENCY_SHIFT; +} + static unsigned long wakeup_gran(struct sched_entity *se) { unsigned long gran = sysctl_sched_wakeup_granularity; @@ -7038,6 +7099,9 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) { s64 gran, vdiff = curr->vruntime - se->vruntime; + /* Take into account latency priority */ + vdiff += wakeup_latency_gran(curr, se); + if (vdiff <= 0) return -1; @@ -7146,6 +7210,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; update_curr(cfs_rq_of(se)); + if (wakeup_preempt_entity(se, pse) == 1) { /* * Bias pick_next to pick the sched entity that is @@ -11136,6 +11201,10 @@ bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) delta = (s64)(sea->vruntime - seb->vruntime) + (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi); + /* Take into account latency prio */ + delta += wakeup_latency_gran(sea, seb); + + return delta > 0; } #else @@ -11472,6 +11541,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) goto err; tg->shares = NICE_0_LOAD; + tg->latency_prio = DEFAULT_LATENCY_PRIO; init_cfs_bandwidth(tg_cfs_bandwidth(tg)); @@ -11570,6 +11640,7 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, } se->my_q = cfs_rq; + se->latency_weight = sched_latency_to_weight[tg->latency_prio]; /* guarantee group entities always have weight */ update_load_set(&se->load, NICE_0_LOAD); se->parent = parent; @@ -11700,6 +11771,36 @@ int sched_group_set_idle(struct task_group *tg, long idle) return 0; } +int sched_group_set_latency(struct task_group *tg, long latency_prio) +{ + int i; + + if (tg == &root_task_group) + return -EINVAL; + + if (latency_prio < 0 || + latency_prio > LATENCY_NICE_WIDTH) + return -EINVAL; + + mutex_lock(&shares_mutex); + + if (tg->latency_prio == latency_prio) { + mutex_unlock(&shares_mutex); + return 0; + } + + tg->latency_prio = latency_prio; + + for_each_possible_cpu(i) { + struct sched_entity *se = tg->se[i]; + + WRITE_ONCE(se->latency_weight, sched_latency_to_weight[latency_prio]); + } + + mutex_unlock(&shares_mutex); + return 0; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ void free_fair_sched_group(struct task_group *tg) { } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e8a5549488dd..4b188eb6e599 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -105,6 +105,35 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count); */ #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) +/* + * Latency nice is meant to provide scheduler hints about the relative + * latency requirements of a task with respect to other tasks. + * Thus a task with latency_nice == 19 can be hinted as the task with no + * latency requirements, in contrast to the task with latency_nice == -20 + * which should be given priority in terms of lower latency. + */ +#define MAX_LATENCY_NICE 19 +#define MIN_LATENCY_NICE -20 + +#define LATENCY_NICE_WIDTH \ + (MAX_LATENCY_NICE - MIN_LATENCY_NICE + 1) + +/* + * Default tasks should be treated as a task with latency_nice = 0. + */ +#define DEFAULT_LATENCY_NICE 0 +#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2) + +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static latency [ 0..39 ], + * and back. + */ +#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO) +#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO) +#define NICE_LATENCY_SHIFT (SCHED_FIXEDPOINT_SHIFT) +#define NICE_LATENCY_WEIGHT_MAX (1L << NICE_LATENCY_SHIFT) + /* * Increase resolution of nice-level calculations for 64-bit architectures. * The extra resolution improves shares distribution and load balancing of @@ -400,6 +429,8 @@ struct task_group { /* A positive value indicates that this is a SCHED_IDLE group. */ int idle; + /* latency priority of the group. */ + int latency_prio; #ifdef CONFIG_SMP /* @@ -513,6 +544,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern int sched_group_set_idle(struct task_group *tg, long idle); +extern int sched_group_set_latency(struct task_group *tg, long latency); + #ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); @@ -2074,6 +2107,7 @@ static_assert(WF_TTWU == SD_BALANCE_WAKE); extern const int sched_prio_to_weight[40]; extern const u32 sched_prio_to_wmult[40]; +extern const int sched_latency_to_weight[40]; /* * {de,en}queue flags: diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h index 3bac0a8ceab2..ecc4884bfe4b 100644 --- a/tools/include/uapi/linux/sched.h +++ b/tools/include/uapi/linux/sched.h @@ -132,6 +132,7 @@ struct clone_args { #define SCHED_FLAG_KEEP_PARAMS 0x10 #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 +#define SCHED_FLAG_LATENCY_NICE 0X80 #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ SCHED_FLAG_KEEP_PARAMS) @@ -143,6 +144,7 @@ struct clone_args { SCHED_FLAG_RECLAIM | \ SCHED_FLAG_DL_OVERRUN | \ SCHED_FLAG_KEEP_ALL | \ - SCHED_FLAG_UTIL_CLAMP) + SCHED_FLAG_UTIL_CLAMP | \ + SCHED_FLAG_LATENCY_NICE) #endif /* _UAPI_LINUX_SCHED_H */ -- 2.36.1