From e12219abc1eadabb1e7e7ad2908a0373c744c37e Mon Sep 17 00:00:00 2001 From: Piotr Gorski Date: Wed, 1 May 2024 00:27:24 +0200 Subject: [PATCH] sched-ext Signed-off-by: Piotr Gorski --- Documentation/scheduler/index.rst | 1 + Documentation/scheduler/sched-ext.rst | 294 + MAINTAINERS | 13 + Makefile | 8 +- drivers/tty/sysrq.c | 1 + include/asm-generic/vmlinux.lds.h | 1 + include/linux/cgroup-defs.h | 8 + include/linux/cgroup.h | 5 +- include/linux/kernfs.h | 5 +- include/linux/sched.h | 5 + include/linux/sched/ext.h | 756 +++ include/linux/sched/task.h | 3 +- include/uapi/linux/sched.h | 1 + init/Kconfig | 5 + init/init_task.c | 15 + kernel/Kconfig.preempt | 24 +- kernel/bpf/bpf_struct_ops_types.h | 4 + kernel/bpf/verifier.c | 3 +- kernel/cgroup/cgroup.c | 97 +- kernel/fork.c | 17 +- kernel/sched/build_policy.c | 8 + kernel/sched/core.c | 318 +- kernel/sched/deadline.c | 4 +- kernel/sched/debug.c | 3 + kernel/sched/ext.c | 5374 +++++++++++++++++ kernel/sched/ext.h | 277 + kernel/sched/fair.c | 9 +- kernel/sched/idle.c | 2 + kernel/sched/rt.c | 4 +- kernel/sched/sched.h | 119 +- kernel/sched/topology.c | 4 +- lib/dump_stack.c | 1 + tools/Makefile | 10 +- tools/sched_ext/.gitignore | 10 + tools/sched_ext/Kconfig | 9 + tools/sched_ext/Makefile | 251 + tools/sched_ext/README.md | 269 + .../sched_ext/include/bpf-compat/gnu/stubs.h | 11 + tools/sched_ext/include/scx/common.bpf.h | 245 + tools/sched_ext/include/scx/common.h | 69 + tools/sched_ext/include/scx/ravg.bpf.h | 42 + tools/sched_ext/include/scx/ravg_impl.bpf.h | 358 ++ tools/sched_ext/include/scx/user_exit_info.h | 67 + tools/sched_ext/scx_qmap.bpf.c | 400 ++ tools/sched_ext/scx_qmap.c | 106 + tools/sched_ext/scx_show_state.py | 39 + tools/sched_ext/scx_simple.bpf.c | 155 + tools/sched_ext/scx_simple.c | 99 + tools/testing/selftests/scx/.gitignore | 6 + tools/testing/selftests/scx/Makefile | 210 + tools/testing/selftests/scx/config | 5 + .../selftests/scx/ddsp_bogus_dsq_fail.bpf.c | 50 + .../selftests/scx/ddsp_bogus_dsq_fail.c | 57 + .../selftests/scx/ddsp_vtimelocal_fail.bpf.c | 47 + .../selftests/scx/ddsp_vtimelocal_fail.c | 56 + .../selftests/scx/enq_last_no_enq_fails.bpf.c | 29 + .../selftests/scx/enq_last_no_enq_fails.c | 60 + .../selftests/scx/enq_select_cpu_fails.bpf.c | 51 + .../selftests/scx/enq_select_cpu_fails.c | 61 + .../selftests/scx/init_enable_count.bpf.c | 63 + .../testing/selftests/scx/init_enable_count.c | 160 + tools/testing/selftests/scx/maximal.bpf.c | 164 + tools/testing/selftests/scx/maximal.c | 51 + tools/testing/selftests/scx/maybe_null.bpf.c | 26 + tools/testing/selftests/scx/maybe_null.c | 40 + .../selftests/scx/maybe_null_fail.bpf.c | 25 + tools/testing/selftests/scx/minimal.bpf.c | 32 + tools/testing/selftests/scx/minimal.c | 58 + tools/testing/selftests/scx/reload_loop.c | 75 + tools/testing/selftests/scx/runner.c | 201 + tools/testing/selftests/scx/scx_test.h | 123 + .../selftests/scx/select_cpu_dfl.bpf.c | 48 + tools/testing/selftests/scx/select_cpu_dfl.c | 72 + .../scx/select_cpu_dfl_nodispatch.bpf.c | 97 + .../selftests/scx/select_cpu_dfl_nodispatch.c | 72 + .../selftests/scx/select_cpu_dispatch.bpf.c | 49 + .../selftests/scx/select_cpu_dispatch.c | 70 + .../scx/select_cpu_dispatch_bad_dsq.bpf.c | 45 + .../scx/select_cpu_dispatch_bad_dsq.c | 56 + .../scx/select_cpu_dispatch_dbl_dsp.bpf.c | 46 + .../scx/select_cpu_dispatch_dbl_dsp.c | 56 + .../selftests/scx/select_cpu_vtime.bpf.c | 94 + .../testing/selftests/scx/select_cpu_vtime.c | 59 + tools/testing/selftests/scx/test_example.c | 49 + 84 files changed, 
11883 insertions(+), 109 deletions(-) create mode 100644 Documentation/scheduler/sched-ext.rst create mode 100644 include/linux/sched/ext.h create mode 100644 kernel/sched/ext.c create mode 100644 kernel/sched/ext.h create mode 100644 tools/sched_ext/.gitignore create mode 100644 tools/sched_ext/Kconfig create mode 100644 tools/sched_ext/Makefile create mode 100644 tools/sched_ext/README.md create mode 100644 tools/sched_ext/include/bpf-compat/gnu/stubs.h create mode 100644 tools/sched_ext/include/scx/common.bpf.h create mode 100644 tools/sched_ext/include/scx/common.h create mode 100644 tools/sched_ext/include/scx/ravg.bpf.h create mode 100644 tools/sched_ext/include/scx/ravg_impl.bpf.h create mode 100644 tools/sched_ext/include/scx/user_exit_info.h create mode 100644 tools/sched_ext/scx_qmap.bpf.c create mode 100644 tools/sched_ext/scx_qmap.c create mode 100644 tools/sched_ext/scx_show_state.py create mode 100644 tools/sched_ext/scx_simple.bpf.c create mode 100644 tools/sched_ext/scx_simple.c create mode 100644 tools/testing/selftests/scx/.gitignore create mode 100644 tools/testing/selftests/scx/Makefile create mode 100644 tools/testing/selftests/scx/config create mode 100644 tools/testing/selftests/scx/ddsp_bogus_dsq_fail.bpf.c create mode 100644 tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c create mode 100644 tools/testing/selftests/scx/ddsp_vtimelocal_fail.bpf.c create mode 100644 tools/testing/selftests/scx/ddsp_vtimelocal_fail.c create mode 100644 tools/testing/selftests/scx/enq_last_no_enq_fails.bpf.c create mode 100644 tools/testing/selftests/scx/enq_last_no_enq_fails.c create mode 100644 tools/testing/selftests/scx/enq_select_cpu_fails.bpf.c create mode 100644 tools/testing/selftests/scx/enq_select_cpu_fails.c create mode 100644 tools/testing/selftests/scx/init_enable_count.bpf.c create mode 100644 tools/testing/selftests/scx/init_enable_count.c create mode 100644 tools/testing/selftests/scx/maximal.bpf.c create mode 100644 tools/testing/selftests/scx/maximal.c create mode 100644 tools/testing/selftests/scx/maybe_null.bpf.c create mode 100644 tools/testing/selftests/scx/maybe_null.c create mode 100644 tools/testing/selftests/scx/maybe_null_fail.bpf.c create mode 100644 tools/testing/selftests/scx/minimal.bpf.c create mode 100644 tools/testing/selftests/scx/minimal.c create mode 100644 tools/testing/selftests/scx/reload_loop.c create mode 100644 tools/testing/selftests/scx/runner.c create mode 100644 tools/testing/selftests/scx/scx_test.h create mode 100644 tools/testing/selftests/scx/select_cpu_dfl.bpf.c create mode 100644 tools/testing/selftests/scx/select_cpu_dfl.c create mode 100644 tools/testing/selftests/scx/select_cpu_dfl_nodispatch.bpf.c create mode 100644 tools/testing/selftests/scx/select_cpu_dfl_nodispatch.c create mode 100644 tools/testing/selftests/scx/select_cpu_dispatch.bpf.c create mode 100644 tools/testing/selftests/scx/select_cpu_dispatch.c create mode 100644 tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.bpf.c create mode 100644 tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.c create mode 100644 tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.bpf.c create mode 100644 tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.c create mode 100644 tools/testing/selftests/scx/select_cpu_vtime.bpf.c create mode 100644 tools/testing/selftests/scx/select_cpu_vtime.c create mode 100644 tools/testing/selftests/scx/test_example.c diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst index 317074722..0b650bb55 100644 
--- a/Documentation/scheduler/index.rst +++ b/Documentation/scheduler/index.rst @@ -19,6 +19,7 @@ Scheduler sched-nice-design sched-rt-group sched-stats + sched-ext sched-debug text_files diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst new file mode 100644 index 000000000..e3847b44e --- /dev/null +++ b/Documentation/scheduler/sched-ext.rst @@ -0,0 +1,294 @@ +========================== +Extensible Scheduler Class +========================== + +sched_ext is a scheduler class whose behavior can be defined by a set of BPF +programs - the BPF scheduler. + +* sched_ext exports a full scheduling interface so that any scheduling + algorithm can be implemented on top. + +* The BPF scheduler can group CPUs however it sees fit and schedule them + together, as tasks aren't tied to specific CPUs at the time of wakeup. + +* The BPF scheduler can be turned on and off dynamically anytime. + +* The system integrity is maintained no matter what the BPF scheduler does. + The default scheduling behavior is restored anytime an error is detected, + a runnable task stalls, or on invoking the SysRq key sequence + :kbd:`SysRq-S`. + +Switching to and from sched_ext +=============================== + +``CONFIG_SCHED_CLASS_EXT`` is the config option to enable sched_ext and +``tools/sched_ext`` contains the example schedulers. + +sched_ext is used only when the BPF scheduler is loaded and running. + +If a task explicitly sets its scheduling policy to ``SCHED_EXT``, it will be +treated as ``SCHED_NORMAL`` and scheduled by CFS until the BPF scheduler is +loaded. On load, such tasks will be switched to and scheduled by sched_ext. + +The BPF scheduler can choose to schedule all normal and lower class tasks by +calling ``scx_bpf_switch_all()`` from its ``init()`` operation. In this +case, all ``SCHED_NORMAL``, ``SCHED_BATCH``, ``SCHED_IDLE`` and +``SCHED_EXT`` tasks are scheduled by sched_ext. In the example schedulers, +this mode can be selected with the ``-a`` option. + +Terminating the sched_ext scheduler program, triggering :kbd:`SysRq-S`, or +detection of any internal error including stalled runnable tasks aborts the +BPF scheduler and reverts all tasks back to CFS. + +.. code-block:: none + + # make -j16 -C tools/sched_ext + # tools/sched_ext/scx_simple + local=0 global=3 + local=5 global=24 + local=9 global=44 + local=13 global=56 + local=17 global=72 + ^CEXIT: BPF scheduler unregistered + +The current status of the BPF scheduler can be determined as follows: + +.. code-block:: none + + # cat /sys/kernel/sched_ext/state + enabled + # cat /sys/kernel/sched_ext/root/ops + simple + +``tools/sched_ext/scx_show_state.py`` is a drgn script which shows more +detailed information: + +.. code-block:: none + + # tools/sched_ext/scx_show_state.py + ops : simple + enabled : 1 + switching_all : 1 + switched_all : 1 + enable_state : enabled (2) + bypass_depth : 0 + nr_rejected : 0 + +If ``CONFIG_SCHED_DEBUG`` is set, whether a given task is on sched_ext can +be determined as follows: + +.. code-block:: none + + # grep ext /proc/self/sched + ext.enabled : 1 + +The Basics +========== + +Userspace can implement an arbitrary BPF scheduler by loading a set of BPF +programs that implement ``struct sched_ext_ops``. The only mandatory field +is ``ops.name`` which must be a valid BPF object name. All operations are +optional. The following modified excerpt is from +``tools/sched/scx_simple.bpf.c`` showing a minimal global FIFO scheduler. + +.. 
code-block:: c + + /* + * Decide which CPU a task should be migrated to before being + * enqueued (either at wakeup, fork time, or exec time). If an + * idle core is found by the default ops.select_cpu() implementation, + * then dispatch the task directly to SCX_DSQ_LOCAL and skip the + * ops.enqueue() callback. + * + * Note that this implementation has exactly the same behavior as the + * default ops.select_cpu implementation. The behavior of the scheduler + * would be exactly the same if the implementation just didn't define the + * simple_select_cpu() struct_ops prog. + */ + s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) + { + s32 cpu; + /* Need to initialize or the BPF verifier will reject the program */ + bool direct = false; + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &direct); + + if (direct) + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + + return cpu; + } + + /* + * Do a direct dispatch of a task to the global DSQ. This ops.enqueue() + * callback will only be invoked if we failed to find a core to dispatch + * to in ops.select_cpu() above. + * + * Note that this implementation has exactly the same behavior as the + * default ops.enqueue implementation, which just dispatches the task + * to SCX_DSQ_GLOBAL. The behavior of the scheduler would be exactly the same + * if the implementation just didn't define the simple_enqueue struct_ops + * prog. + */ + void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) + { + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } + + s32 BPF_STRUCT_OPS(simple_init) + { + /* + * All SCHED_OTHER, SCHED_IDLE, and SCHED_BATCH tasks should + * use sched_ext. + */ + scx_bpf_switch_all(); + return 0; + } + + void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) + { + exit_type = ei->type; + } + + SEC(".struct_ops") + struct sched_ext_ops simple_ops = { + .select_cpu = (void *)simple_select_cpu, + .enqueue = (void *)simple_enqueue, + .init = (void *)simple_init, + .exit = (void *)simple_exit, + .name = "simple", + }; + +Dispatch Queues +--------------- + +To match the impedance between the scheduler core and the BPF scheduler, +sched_ext uses DSQs (dispatch queues) which can operate as both a FIFO and a +priority queue. By default, there is one global FIFO (``SCX_DSQ_GLOBAL``), +and one local dsq per CPU (``SCX_DSQ_LOCAL``). The BPF scheduler can manage +an arbitrary number of dsq's using ``scx_bpf_create_dsq()`` and +``scx_bpf_destroy_dsq()``. + +A CPU always executes a task from its local DSQ. A task is "dispatched" to a +DSQ. A non-local DSQ is "consumed" to transfer a task to the consuming CPU's +local DSQ. + +When a CPU is looking for the next task to run, if the local DSQ is not +empty, the first task is picked. Otherwise, the CPU tries to consume the +global DSQ. If that doesn't yield a runnable task either, ``ops.dispatch()`` +is invoked. + +Scheduling Cycle +---------------- + +The following briefly shows how a waking task is scheduled and executed. + +1. When a task is waking up, ``ops.select_cpu()`` is the first operation + invoked. This serves two purposes. First, CPU selection optimization + hint. Second, waking up the selected CPU if idle. + + The CPU selected by ``ops.select_cpu()`` is an optimization hint and not + binding. The actual decision is made at the last step of scheduling. + However, there is a small performance gain if the CPU + ``ops.select_cpu()`` returns matches the CPU the task eventually runs on.
+ + A side-effect of selecting a CPU is waking it up from idle. While a BPF + scheduler can wake up any CPU using the ``scx_bpf_kick_cpu()`` helper, + using ``ops.select_cpu()`` judiciously can be simpler and more efficient. + + A task can be immediately dispatched to a DSQ from ``ops.select_cpu()`` by + calling ``scx_bpf_dispatch()``. If the task is dispatched to + ``SCX_DSQ_LOCAL`` from ``ops.select_cpu()``, it will be dispatched to the + local DSQ of whichever CPU is returned from ``ops.select_cpu()``. + Additionally, dispatching directly from ``ops.select_cpu()`` will cause the + ``ops.enqueue()`` callback to be skipped. + + Note that the scheduler core will ignore an invalid CPU selection, for + example, if it's outside the allowed cpumask of the task. + +2. Once the target CPU is selected, ``ops.enqueue()`` is invoked (unless the + task was dispatched directly from ``ops.select_cpu()``). ``ops.enqueue()`` + can make one of the following decisions: + + * Immediately dispatch the task to either the global or local DSQ by + calling ``scx_bpf_dispatch()`` with ``SCX_DSQ_GLOBAL`` or + ``SCX_DSQ_LOCAL``, respectively. + + * Immediately dispatch the task to a custom DSQ by calling + ``scx_bpf_dispatch()`` with a DSQ ID which is smaller than 2^63. + + * Queue the task on the BPF side. + +3. When a CPU is ready to schedule, it first looks at its local DSQ. If + empty, it then looks at the global DSQ. If there still isn't a task to + run, ``ops.dispatch()`` is invoked which can use the following two + functions to populate the local DSQ. + + * ``scx_bpf_dispatch()`` dispatches a task to a DSQ. Any target DSQ can + be used - ``SCX_DSQ_LOCAL``, ``SCX_DSQ_LOCAL_ON | cpu``, + ``SCX_DSQ_GLOBAL`` or a custom DSQ. While ``scx_bpf_dispatch()`` + currently can't be called with BPF locks held, this is being worked on + and will be supported. ``scx_bpf_dispatch()`` schedules dispatches + rather than performing them immediately. There can be up to + ``ops.dispatch_max_batch`` pending tasks. + + * ``scx_bpf_consume()`` transfers a task from the specified non-local DSQ + to the dispatching DSQ. This function cannot be called with any BPF + locks held. ``scx_bpf_consume()`` flushes the pending dispatched tasks + before trying to consume the specified DSQ. + +4. After ``ops.dispatch()`` returns, if there are tasks in the local DSQ, + the CPU runs the first one. If empty, the following steps are taken: + + * Try to consume the global DSQ. If successful, run the task. + + * If ``ops.dispatch()`` has dispatched any tasks, retry #3. + + * If the previous task is an SCX task and still runnable, keep executing + it (see ``SCX_OPS_ENQ_LAST``). + + * Go idle. + +Note that the BPF scheduler can always choose to dispatch tasks immediately +in ``ops.enqueue()`` as illustrated in the above simple example. If only the +built-in DSQs are used, there is no need to implement ``ops.dispatch()`` as +a task is never queued on the BPF scheduler and both the local and global +DSQs are consumed automatically. + +``scx_bpf_dispatch()`` queues the task on the FIFO of the target DSQ. Use +``scx_bpf_dispatch_vtime()`` for the priority queue. Internal DSQs such as +``SCX_DSQ_LOCAL`` and ``SCX_DSQ_GLOBAL`` do not support priority-queue +dispatching, and must be dispatched to with ``scx_bpf_dispatch()``. See the +function documentation and usage in ``tools/sched_ext/scx_simple.bpf.c`` for +more information. + +Where to Look +============= + +* ``include/linux/sched/ext.h`` defines the core data structures, ops table + and constants.
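Returning to the scheduling cycle above, the queue-then-consume pattern of steps 2 and 3 looks roughly as follows. This is an illustrative sketch rather than part of the patch: it assumes the ``scx/common.bpf.h`` header and the ``scx_bpf_create_dsq()``, ``scx_bpf_dispatch()`` and ``scx_bpf_consume()`` kfuncs added by this patch, and the DSQ ID ``0`` and the ``sketch_*`` names are arbitrary choices of this example.

.. code-block:: c

   #include <scx/common.bpf.h>

   char _license[] SEC("license") = "GPL";

   /* An ops-created DSQ; any ID below 2^63 works, 0 is arbitrary. */
   #define SHARED_DSQ 0

   /* Step 2: queue every task on the shared DSQ instead of dispatching directly. */
   void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
   {
           scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
   }

   /* Step 3: the local and global DSQs were empty; refill from the shared DSQ. */
   void BPF_STRUCT_OPS(sketch_dispatch, s32 cpu, struct task_struct *prev)
   {
           scx_bpf_consume(SHARED_DSQ);
   }

   /* Create the shared DSQ (on any NUMA node) when the scheduler is loaded. */
   s32 BPF_STRUCT_OPS(sketch_init)
   {
           return scx_bpf_create_dsq(SHARED_DSQ, -1);
   }

   SEC(".struct_ops")
   struct sched_ext_ops sketch_ops = {
           .enqueue        = (void *)sketch_enqueue,
           .dispatch       = (void *)sketch_dispatch,
           .init           = (void *)sketch_init,
           .name           = "dsq_sketch",
   };

If the shared DSQ turns out to be empty, the CPU falls back to the global DSQ and eventually idles, as described in step 4.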
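For the priority-queue variant mentioned above, the following sketch shows how ``p->scx.dsq_vtime`` and ``scx_bpf_dispatch_vtime()`` fit together. The weighting rule, the ``VTIME_DSQ`` ID and the helper names are choices of this sketch, not part of the patch; since built-in DSQs cannot be vtime-dispatched, the DSQ is assumed to be created with ``scx_bpf_create_dsq()`` in ``ops.init()`` (not shown).

.. code-block:: c

   /* An ops-created DSQ for vtime dispatching; created in ops.init(), not shown. */
   #define VTIME_DSQ 1

   static u64 vtime_now;

   /* u64 comparison that is safe against wrap-around. */
   static inline bool vtime_before(u64 a, u64 b)
   {
           return (s64)(a - b) < 0;
   }

   void BPF_STRUCT_OPS(vtime_sketch_enqueue, struct task_struct *p, u64 enq_flags)
   {
           u64 vtime = p->scx.dsq_vtime;

           /* Limit the budget a long-sleeping task can accumulate to one slice. */
           if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL))
                   vtime = vtime_now - SCX_SLICE_DFL;

           scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, vtime, enq_flags);
   }

   void BPF_STRUCT_OPS(vtime_sketch_running, struct task_struct *p)
   {
           /* Keep the global vtime in step with the task that is about to run. */
           if (vtime_before(vtime_now, p->scx.dsq_vtime))
                   vtime_now = p->scx.dsq_vtime;
   }

   void BPF_STRUCT_OPS(vtime_sketch_stopping, struct task_struct *p, bool runnable)
   {
           /* Charge the consumed slice, scaled inversely by weight (100 is the default). */
           p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
   }

Consuming ``VTIME_DSQ`` from ``ops.dispatch()`` then always transfers the queued task with the smallest ``dsq_vtime`` first.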
+ +* ``kernel/sched/ext.c`` contains sched_ext core implementation and helpers. + The functions prefixed with ``scx_bpf_`` can be called from the BPF + scheduler. + +* ``tools/sched_ext/`` hosts example BPF scheduler implementations. + + * ``scx_simple[.bpf].c``: Minimal global FIFO scheduler example using a + custom DSQ. + + * ``scx_qmap[.bpf].c``: A multi-level FIFO scheduler supporting five + levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``. + +ABI Instability +=============== + +The APIs provided by sched_ext to BPF schedulers programs have no stability +guarantees. This includes the ops table callbacks and constants defined in +``include/linux/sched/ext.h``, as well as the ``scx_bpf_`` kfuncs defined in +``kernel/sched/ext.c``. + +While we will attempt to provide a relatively stable API surface when +possible, they are subject to change without warning between kernel +versions. diff --git a/MAINTAINERS b/MAINTAINERS index 91af49091..595a7a6fb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19504,6 +19504,19 @@ F: include/linux/wait.h F: include/uapi/linux/sched.h F: kernel/sched/ +SCHEDULER - SCHED_EXT +R: Tejun Heo +R: David Vernet +L: linux-kernel@vger.kernel.org +S: Maintained +W: https://github.com/sched-ext/scx +T: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git +F: include/linux/sched/ext.h +F: kernel/sched/ext.h +F: kernel/sched/ext.c +F: tools/sched_ext/ +F: tools/testing/selftests/sched_ext + SCSI LIBSAS SUBSYSTEM R: John Garry R: Jason Yan diff --git a/Makefile b/Makefile index d1922dea1..527fbecdb 100644 --- a/Makefile +++ b/Makefile @@ -1504,6 +1504,12 @@ ifneq ($(wildcard $(resolve_btfids_O)),) $(Q)$(MAKE) -sC $(srctree)/tools/bpf/resolve_btfids O=$(resolve_btfids_O) clean endif +tools-clean-targets := sched_ext +PHONY += $(tools-clean-targets) +$(tools-clean-targets): + $(Q)$(MAKE) -sC tools $@_clean +tools_clean: $(tools-clean-targets) + # Clear a bunch of variables before executing the submake ifeq ($(quiet),silent_) tools_silent=s @@ -1673,7 +1679,7 @@ PHONY += $(mrproper-dirs) mrproper $(mrproper-dirs): $(Q)$(MAKE) $(clean)=$(patsubst _mrproper_%,%,$@) -mrproper: clean $(mrproper-dirs) +mrproper: clean $(mrproper-dirs) tools_clean $(call cmd,rmfiles) @find . 
$(RCS_FIND_IGNORE) \ \( -name '*.rmeta' \) \ diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 02217e3c9..1ce3535cb 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -520,6 +520,7 @@ static const struct sysrq_key_op *sysrq_key_table[62] = { NULL, /* P */ NULL, /* Q */ NULL, /* R */ + /* S: May be registered by sched_ext for resetting */ NULL, /* S */ NULL, /* T */ NULL, /* U */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 5dd3a61d6..b84816895 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -131,6 +131,7 @@ *(__dl_sched_class) \ *(__rt_sched_class) \ *(__fair_sched_class) \ + *(__ext_sched_class) \ *(__idle_sched_class) \ __sched_class_lowest = .; diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ea48c861c..bfc027311 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -132,12 +132,18 @@ enum { CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ + CFTYPE_HIDDEN = (1 << 6), /* file type hidden, see cgroup_show_cftypes() */ + /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ __CFTYPE_ADDED = (1 << 18), }; +enum cfile_flags { + CFILE_HIDDEN = (1 << 0), /* file instance hidden */ +}; + /* * cgroup_file is the handle for a file instance created in a cgroup which * is used, for example, to generate file changed notifications. This can @@ -145,7 +151,9 @@ enum { */ struct cgroup_file { /* do not access any fields from outside cgroup core */ + struct cftype *cft; struct kernfs_node *kn; + unsigned int flags; unsigned long notified_at; struct timer_list notify_timer; }; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 34aaf0e87..32679fcff 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -29,8 +29,6 @@ struct kernel_clone_args; -#ifdef CONFIG_CGROUPS - /* * All weight knobs on the default hierarchy should use the following min, * default and max values. 
The default value is the logarithmic center of @@ -40,6 +38,8 @@ struct kernel_clone_args; #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 +#ifdef CONFIG_CGROUPS + enum { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ @@ -114,6 +114,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); +void cgroup_show_cftype(struct cftype *cft, bool show); void cgroup_file_notify(struct cgroup_file *cfile); void cgroup_file_show(struct cgroup_file *cfile, bool show); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index e857a150b..05dcbae7e 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -206,6 +206,9 @@ struct kernfs_node { const void *ns; /* namespace tag */ unsigned int hash; /* ns + name hash */ + unsigned short flags; + umode_t mode; + union { struct kernfs_elem_dir dir; struct kernfs_elem_symlink symlink; @@ -220,8 +223,6 @@ struct kernfs_node { */ u64 id; - unsigned short flags; - umode_t mode; struct kernfs_iattrs *iattr; struct rcu_head rcu; diff --git a/include/linux/sched.h b/include/linux/sched.h index ffe8f618a..fbdf57f32 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -80,6 +80,8 @@ struct task_group; struct task_struct; struct user_event_mm; +#include + /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). @@ -798,6 +800,9 @@ struct task_struct { struct sched_rt_entity rt; struct sched_dl_entity dl; struct sched_dl_entity *dl_server; +#ifdef CONFIG_SCHED_CLASS_EXT + struct sched_ext_entity scx; +#endif const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h new file mode 100644 index 000000000..474131a20 --- /dev/null +++ b/include/linux/sched/ext.h @@ -0,0 +1,756 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef _LINUX_SCHED_EXT_H +#define _LINUX_SCHED_EXT_H + +#ifdef CONFIG_SCHED_CLASS_EXT + +#include +#include + +struct cgroup; + +enum scx_consts { + SCX_OPS_NAME_LEN = 128, + + SCX_SLICE_DFL = 20 * NSEC_PER_MSEC, + SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ +}; + +/* + * DSQ (dispatch queue) IDs are 64bit of the format: + * + * Bits: [63] [62 .. 0] + * [ B] [ ID ] + * + * B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs + * ID: 63 bit ID + * + * Built-in IDs: + * + * Bits: [63] [62] [61..32] [31 .. 0] + * [ 1] [ L] [ R ] [ V ] + * + * 1: 1 for built-in DSQs. + * L: 1 for LOCAL_ON DSQ IDs, 0 for others + * V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value. 
+ */ +enum scx_dsq_id_flags { + SCX_DSQ_FLAG_BUILTIN = 1LLU << 63, + SCX_DSQ_FLAG_LOCAL_ON = 1LLU << 62, + + SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, + SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, + SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, + SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, + SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, +}; + +enum scx_exit_kind { + SCX_EXIT_NONE, + SCX_EXIT_DONE, + + SCX_EXIT_UNREG = 64, /* BPF unregistration */ + SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ + + SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ + SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ + SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ +}; + +/* + * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is + * being disabled. + */ +struct scx_exit_info { + /* %SCX_EXIT_* - broad category of the exit reason */ + enum scx_exit_kind kind; + + /* textual representation of the above */ + const char *reason; + + /* backtrace if exiting due to an error */ + unsigned long *bt; + u32 bt_len; + + /* informational message */ + char *msg; + + /* debug dump */ + char *dump; +}; + +/* sched_ext_ops.flags */ +enum scx_ops_flags { + /* + * Keep built-in idle tracking even if ops.update_idle() is implemented. + */ + SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, + + /* + * By default, if there are no other task to run on the CPU, ext core + * keeps running the current task even after its slice expires. If this + * flag is specified, such tasks are passed to ops.enqueue() with + * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. + */ + SCX_OPS_ENQ_LAST = 1LLU << 1, + + /* + * An exiting task may schedule after PF_EXITING is set. In such cases, + * bpf_task_from_pid() may not be able to find the task and if the BPF + * scheduler depends on pid lookup for dispatching, the task will be + * lost leading to various issues including RCU grace period stalls. + * + * To mask this problem, by default, unhashed tasks are automatically + * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't + * depend on pid lookups and wants to handle these tasks directly, the + * following flag can be used. + */ + SCX_OPS_ENQ_EXITING = 1LLU << 2, + + /* + * CPU cgroup knob enable flags + */ + SCX_OPS_CGROUP_KNOB_WEIGHT = 1LLU << 16, /* cpu.weight */ + + SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | + SCX_OPS_ENQ_LAST | + SCX_OPS_ENQ_EXITING | + SCX_OPS_CGROUP_KNOB_WEIGHT, +}; + +/* argument container for ops.init_task() */ +struct scx_init_task_args { +#ifdef CONFIG_EXT_GROUP_SCHED + /* the cgroup the task is joining */ + struct cgroup *cgroup; +#endif + /* + * Set if ops.init_task() is being invoked on the fork path, as opposed + * to the scheduler transition path. + */ + bool fork; +}; + +/* argument container for ops.exit_task() */ +struct scx_exit_task_args { + /* Whether the task exited before running on sched_ext. */ + bool cancelled; +}; + +/* argument container for ops->cgroup_init() */ +struct scx_cgroup_init_args { + /* the weight of the cgroup [1..10000] */ + u32 weight; +}; + +enum scx_cpu_preempt_reason { + /* next task is being scheduled by &sched_class_rt */ + SCX_CPU_PREEMPT_RT, + /* next task is being scheduled by &sched_class_dl */ + SCX_CPU_PREEMPT_DL, + /* next task is being scheduled by &sched_class_stop */ + SCX_CPU_PREEMPT_STOP, + /* unknown reason for SCX being preempted */ + SCX_CPU_PREEMPT_UNKNOWN, +}; + +/* + * Argument container for ops->cpu_acquire(). 
Currently empty, but may be + * expanded in the future. + */ +struct scx_cpu_acquire_args {}; + +/* argument container for ops->cpu_release() */ +struct scx_cpu_release_args { + /* the reason the CPU was preempted */ + enum scx_cpu_preempt_reason reason; + + /* the task that's going to be scheduled on the CPU */ + struct task_struct *task; +}; + +/** + * struct sched_ext_ops - Operation table for BPF scheduler implementation + * + * Userland can implement an arbitrary scheduling policy by implementing and + * loading operations in this table. + */ +struct sched_ext_ops { + /** + * select_cpu - Pick the target CPU for a task which is being woken up + * @p: task being woken up + * @prev_cpu: the cpu @p was on before sleeping + * @wake_flags: SCX_WAKE_* + * + * Decision made here isn't final. @p may be moved to any CPU while it + * is getting dispatched for execution later. However, as @p is not on + * the rq at this point, getting the eventual execution CPU right here + * saves a small bit of overhead down the line. + * + * If an idle CPU is returned, the CPU is kicked and will try to + * dispatch. While an explicit custom mechanism can be added, + * select_cpu() serves as the default way to wake up idle CPUs. + * + * @p may be dispatched directly by calling scx_bpf_dispatch(). If @p + * is dispatched, the ops.enqueue() callback will be skipped. Finally, + * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the + * local DSQ of whatever CPU is returned by this callback. + */ + s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); + + /** + * enqueue - Enqueue a task on the BPF scheduler + * @p: task being enqueued + * @enq_flags: %SCX_ENQ_* + * + * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch() + * or enqueue on the BPF scheduler. If not directly dispatched, the bpf + * scheduler owns @p and if it fails to dispatch @p, the task will + * stall. + * + * If @p was dispatched from ops.select_cpu(), this callback is + * skipped. + */ + void (*enqueue)(struct task_struct *p, u64 enq_flags); + + /** + * dequeue - Remove a task from the BPF scheduler + * @p: task being dequeued + * @deq_flags: %SCX_DEQ_* + * + * Remove @p from the BPF scheduler. This is usually called to isolate + * the task while updating its scheduling properties (e.g. priority). + * + * The ext core keeps track of whether the BPF side owns a given task or + * not and can gracefully ignore spurious dispatches from BPF side, + * which makes it safe to not implement this method. However, depending + * on the scheduling logic, this can lead to confusing behaviors - e.g. + * scheduling position not being updated across a priority change. + */ + void (*dequeue)(struct task_struct *p, u64 deq_flags); + + /** + * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs + * @cpu: CPU to dispatch tasks for + * @prev: previous task being switched out + * + * Called when a CPU's local dsq is empty. The operation should dispatch + * one or more tasks from the BPF scheduler into the DSQs using + * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using + * scx_bpf_consume(). + * + * The maximum number of times scx_bpf_dispatch() can be called without + * an intervening scx_bpf_consume() is specified by + * ops.dispatch_max_batch. See the comments on top of the two functions + * for more details. + * + * When not %NULL, @prev is an SCX task with its slice depleted. 
If + * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in + * @prev->scx.flags, it is not enqueued yet and will be enqueued after + * ops.dispatch() returns. To keep executing @prev, return without + * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST. + */ + void (*dispatch)(s32 cpu, struct task_struct *prev); + + /** + * runnable - A task is becoming runnable on its associated CPU + * @p: task becoming runnable + * @enq_flags: %SCX_ENQ_* + * + * This and the following three functions can be used to track a task's + * execution state transitions. A task becomes ->runnable() on a CPU, + * and then goes through one or more ->running() and ->stopping() pairs + * as it runs on the CPU, and eventually becomes ->quiescent() when it's + * done running on the CPU. + * + * @p is becoming runnable on the CPU because it's + * + * - waking up (%SCX_ENQ_WAKEUP) + * - being moved from another CPU + * - being restored after temporarily taken off the queue for an + * attribute change. + * + * This and ->enqueue() are related but not coupled. This operation + * notifies @p's state transition and may not be followed by ->enqueue() + * e.g. when @p is being dispatched to a remote CPU. Likewise, a task + * may be ->enqueue()'d without being preceded by this operation e.g. + * after exhausting its slice. + */ + void (*runnable)(struct task_struct *p, u64 enq_flags); + + /** + * running - A task is starting to run on its associated CPU + * @p: task starting to run + * + * See ->runnable() for explanation on the task state notifiers. + */ + void (*running)(struct task_struct *p); + + /** + * stopping - A task is stopping execution + * @p: task stopping to run + * @runnable: is task @p still runnable? + * + * See ->runnable() for explanation on the task state notifiers. If + * !@runnable, ->quiescent() will be invoked after this operation + * returns. + */ + void (*stopping)(struct task_struct *p, bool runnable); + + /** + * quiescent - A task is becoming not runnable on its associated CPU + * @p: task becoming not runnable + * @deq_flags: %SCX_DEQ_* + * + * See ->runnable() for explanation on the task state notifiers. + * + * @p is becoming quiescent on the CPU because it's + * + * - sleeping (%SCX_DEQ_SLEEP) + * - being moved to another CPU + * - being temporarily taken off the queue for an attribute change + * (%SCX_DEQ_SAVE) + * + * This and ->dequeue() are related but not coupled. This operation + * notifies @p's state transition and may not be preceded by ->dequeue() + * e.g. when @p is being dispatched to a remote CPU. + */ + void (*quiescent)(struct task_struct *p, u64 deq_flags); + + /** + * yield - Yield CPU + * @from: yielding task + * @to: optional yield target task + * + * If @to is NULL, @from is yielding the CPU to other runnable tasks. + * The BPF scheduler should ensure that other available tasks are + * dispatched before the yielding task. Return value is ignored in this + * case. + * + * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf + * scheduler can implement the request, return %true; otherwise, %false. + */ + bool (*yield)(struct task_struct *from, struct task_struct *to); + + /** + * core_sched_before - Task ordering for core-sched + * @a: task A + * @b: task B + * + * Used by core-sched to determine the ordering between two tasks. See + * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on + * core-sched. + * + * Both @a and @b are runnable and may or may not currently be queued on + * the BPF scheduler. 
Should return %true if @a should run before @b. + * %false if there's no required ordering or @b should run before @a. + * + * If not specified, the default is ordering them according to when they + * became runnable. + */ + bool (*core_sched_before)(struct task_struct *a,struct task_struct *b); + + /** + * set_weight - Set task weight + * @p: task to set weight for + * @weight: new eight [1..10000] + * + * Update @p's weight to @weight. + */ + void (*set_weight)(struct task_struct *p, u32 weight); + + /** + * set_cpumask - Set CPU affinity + * @p: task to set CPU affinity for + * @cpumask: cpumask of cpus that @p can run on + * + * Update @p's CPU affinity to @cpumask. + */ + void (*set_cpumask)(struct task_struct *p, + const struct cpumask *cpumask); + + /** + * update_idle - Update the idle state of a CPU + * @cpu: CPU to udpate the idle state for + * @idle: whether entering or exiting the idle state + * + * This operation is called when @rq's CPU goes or leaves the idle + * state. By default, implementing this operation disables the built-in + * idle CPU tracking and the following helpers become unavailable: + * + * - scx_bpf_select_cpu_dfl() + * - scx_bpf_test_and_clear_cpu_idle() + * - scx_bpf_pick_idle_cpu() + * + * The user also must implement ops.select_cpu() as the default + * implementation relies on scx_bpf_select_cpu_dfl(). + * + * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle + * tracking. + */ + void (*update_idle)(s32 cpu, bool idle); + + /** + * cpu_acquire - A CPU is becoming available to the BPF scheduler + * @cpu: The CPU being acquired by the BPF scheduler. + * @args: Acquire arguments, see the struct definition. + * + * A CPU that was previously released from the BPF scheduler is now once + * again under its control. + */ + void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); + + /** + * cpu_release - A CPU is taken away from the BPF scheduler + * @cpu: The CPU being released by the BPF scheduler. + * @args: Release arguments, see the struct definition. + * + * The specified CPU is no longer under the control of the BPF + * scheduler. This could be because it was preempted by a higher + * priority sched_class, though there may be other reasons as well. The + * caller should consult @args->reason to determine the cause. + */ + void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); + + /** + * init_task - Initialize a task to run in a BPF scheduler + * @p: task to initialize for BPF scheduling + * @args: init arguments, see the struct definition + * + * Either we're loading a BPF scheduler or a new task is being forked. + * Initialize @p for BPF scheduling. This operation may block and can + * be used for allocations, and is called exactly once for a task. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. During a fork, it + * will abort that specific fork. + */ + s32 (*init_task)(struct task_struct *p, + struct scx_init_task_args *args); + + /** + * exit_task - Exit a previously-running task from the system + * @p: task to exit + * + * @p is exiting or the BPF scheduler is being unloaded. Perform any + * necessary cleanup for @p. + */ + void (*exit_task)(struct task_struct *p, + struct scx_exit_task_args *args); + + /** + * enable - Enable BPF scheduling for a task + * @p: task to enable BPF scheduling for + * + * Enable @p for BPF scheduling. @p is now in the cgroup specified in + * @args. 
enable() is called on @p any time it enters SCX, and is + * always paired with a matching disable(). + */ + void (*enable)(struct task_struct *p); + + /** + * disable - Disable BPF scheduling for a task + * @p: task to disable BPF scheduling for + * + * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. + * Disable BPF scheduling for @p. A disable() call is always matched + * with a prior enable() call. + */ + void (*disable)(struct task_struct *p); + +#ifdef CONFIG_EXT_GROUP_SCHED + /** + * cgroup_init - Initialize a cgroup + * @cgrp: cgroup being initialized + * @args: init arguments, see the struct definition + * + * Either the BPF scheduler is being loaded or @cgrp created, initialize + * @cgrp for sched_ext. This operation may block. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. During cgroup + * creation, it will abort the specific cgroup creation. + */ + s32 (*cgroup_init)(struct cgroup *cgrp, + struct scx_cgroup_init_args *args); + + /** + * cgroup_exit - Exit a cgroup + * @cgrp: cgroup being exited + * + * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit + * @cgrp for sched_ext. This operation my block. + */ + void (*cgroup_exit)(struct cgroup *cgrp); + + /** + * cgroup_prep_move - Prepare a task to be moved to a different cgroup + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Prepare @p for move from cgroup @from to @to. This operation may + * block and can be used for allocations. + * + * Return 0 for success, -errno for failure. An error return aborts the + * migration. + */ + s32 (*cgroup_prep_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * cgroup_move - Commit cgroup move + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Commit the move. @p is dequeued during this operation. + */ + void (*cgroup_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * cgroup_cancel_move - Cancel cgroup move + * @p: task whose cgroup move is being canceled + * @from: cgroup @p was being moved from + * @to: cgroup @p was being moved to + * + * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). + * Undo the preparation. + */ + void (*cgroup_cancel_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * cgroup_set_weight - A cgroup's weight is being changed + * @cgrp: cgroup whose weight is being updated + * @weight: new weight [1..10000] + * + * Update @tg's weight to @weight. + */ + void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); +#endif /* CONFIG_CGROUPS */ + + /* + * All online ops must come before ops.cpu_online(). + */ + + /** + * cpu_online - A CPU became online + * @cpu: CPU which just came up + * + * @cpu just came online. @cpu doesn't call ops.enqueue() or run tasks + * associated with other CPUs beforehand. + */ + void (*cpu_online)(s32 cpu); + + /** + * cpu_offline - A CPU is going offline + * @cpu: CPU which is going offline + * + * @cpu is going offline. @cpu doesn't call ops.enqueue() or run tasks + * associated with other CPUs afterwards. + */ + void (*cpu_offline)(s32 cpu); + + /* + * All CPU hotplug ops must come before ops.init(). 
+ */ + + /** + * init - Initialize the BPF scheduler + */ + s32 (*init)(void); + + /** + * exit - Clean up after the BPF scheduler + * @info: Exit info + */ + void (*exit)(struct scx_exit_info *info); + + /** + * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch + */ + u32 dispatch_max_batch; + + /** + * flags - %SCX_OPS_* flags + */ + u64 flags; + + /** + * timeout_ms - The maximum amount of time, in milliseconds, that a + * runnable task should be able to wait before being scheduled. The + * maximum timeout may not exceed the default timeout of 30 seconds. + * + * Defaults to the maximum allowed timeout value of 30 seconds. + */ + u32 timeout_ms; + + /** + * name - BPF scheduler's name + * + * Must be a non-zero valid BPF object name including only isalnum(), + * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the + * BPF scheduler is enabled. + */ + char name[SCX_OPS_NAME_LEN]; +}; + +/* + * Dispatch queue (dsq) is a simple FIFO which is used to buffer between the + * scheduler core and the BPF scheduler. See the documentation for more details. + */ +struct scx_dispatch_q { + raw_spinlock_t lock; + struct list_head fifo; /* processed in dispatching order */ + struct rb_root_cached priq; /* processed in p->scx.dsq_vtime order */ + u32 nr; + u64 id; + struct rhash_head hash_node; + struct llist_node free_node; + struct rcu_head rcu; +}; + +/* scx_entity.flags */ +enum scx_ent_flags { + SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ + SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */ + SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ + SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ + + SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */ + SCX_TASK_STATE_BITS = 2, + SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, + + SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ +}; + +/* scx_entity.flags & SCX_TASK_STATE_MASK */ +enum scx_task_state { + SCX_TASK_NONE, /* ops.init_task() not called yet */ + SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */ + SCX_TASK_READY, /* fully initialized, but not in sched_ext */ + SCX_TASK_ENABLED, /* fully initialized and in sched_ext */ + + SCX_TASK_NR_STATES, +}; + +/* scx_entity.dsq_flags */ +enum scx_ent_dsq_flags { + SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */ +}; + +/* + * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from + * everywhere and the following bits track which kfunc sets are currently + * allowed for %current. This simple per-task tracking works because SCX ops + * nest in a limited way. BPF will likely implement a way to allow and disallow + * kfuncs depending on the calling context which will replace this manual + * mechanism. See scx_kf_allow(). 
+ */ +enum scx_kf_mask { + SCX_KF_UNLOCKED = 0, /* not sleepable, not rq locked */ + /* all non-sleepables may be nested inside INIT and SLEEPABLE */ + SCX_KF_INIT = 1 << 0, /* running ops.init() */ + SCX_KF_SLEEPABLE = 1 << 1, /* other sleepable init operations */ + /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ + SCX_KF_CPU_RELEASE = 1 << 2, /* ops.cpu_release() */ + /* ops.dequeue (in REST) may be nested inside DISPATCH */ + SCX_KF_DISPATCH = 1 << 3, /* ops.dispatch() */ + SCX_KF_ENQUEUE = 1 << 4, /* ops.enqueue() and ops.select_cpu() */ + SCX_KF_SELECT_CPU = 1 << 5, /* ops.select_cpu() */ + SCX_KF_REST = 1 << 6, /* other rq-locked operations */ + + __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH | + SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, + __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, +}; + +/* + * The following is embedded in task_struct and contains all fields necessary + * for a task to be scheduled by SCX. + */ +struct sched_ext_entity { + struct scx_dispatch_q *dsq; + struct { + struct list_head fifo; /* dispatch order */ + struct rb_node priq; /* p->scx.dsq_vtime order */ + } dsq_node; + u32 flags; /* protected by rq lock */ + u32 dsq_flags; /* protected by dsq lock */ + u32 weight; + s32 sticky_cpu; + s32 holding_cpu; + u32 kf_mask; /* see scx_kf_mask above */ + struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ + atomic_long_t ops_state; + + struct list_head runnable_node; /* rq->scx.runnable_list */ + unsigned long runnable_at; + +#ifdef CONFIG_SCHED_CORE + u64 core_sched_at; /* see scx_prio_less() */ +#endif + u64 ddsp_dsq_id; + u64 ddsp_enq_flags; + + /* BPF scheduler modifiable fields */ + + /* + * Runtime budget in nsecs. This is usually set through + * scx_bpf_dispatch() but can also be modified directly by the BPF + * scheduler. Automatically decreased by SCX as the task executes. On + * depletion, a scheduling event is triggered. + * + * This value is cleared to zero if the task is preempted by + * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the + * task ran. Use p->se.sum_exec_runtime instead. + */ + u64 slice; + + /* + * Used to order tasks when dispatching to the vtime-ordered priority + * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime() + * but can also be modified directly by the BPF scheduler. Modifying it + * while a task is queued on a dsq may mangle the ordering and is not + * recommended. + */ + u64 dsq_vtime; + + /* + * If set, reject future sched_setscheduler(2) calls updating the policy + * to %SCHED_EXT with -%EACCES. + * + * If set from ops.init_task() and the task's policy is already + * %SCHED_EXT, which can happen while the BPF scheduler is being loaded + * or by inhering the parent's policy during fork, the task's policy is + * rejected and forcefully reverted to %SCHED_NORMAL. The number of + * such events are reported through /sys/kernel/debug/sched_ext::nr_rejected. 
+ */ + bool disallow; /* reject switching into SCX */ + + /* cold fields */ + struct list_head tasks_node; +#ifdef CONFIG_EXT_GROUP_SCHED + struct cgroup *cgrp_moving_from; +#endif +}; + +void sched_ext_free(struct task_struct *p); +void print_scx_info(const char *log_lvl, struct task_struct *p); + +#else /* !CONFIG_SCHED_CLASS_EXT */ + +static inline void sched_ext_free(struct task_struct *p) {} +static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} + +#endif /* CONFIG_SCHED_CLASS_EXT */ +#endif /* _LINUX_SCHED_EXT_H */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index d362aacf9..4df2f9055 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -63,7 +63,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); -extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern void sched_cancel_fork(struct task_struct *p); extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 3bac0a8ce..359a14cc7 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -118,6 +118,7 @@ struct clone_args { /* SCHED_ISO: reserved but not implemented yet */ #define SCHED_IDLE 5 #define SCHED_DEADLINE 6 +#define SCHED_EXT 7 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ #define SCHED_RESET_ON_FORK 0x40000000 diff --git a/init/Kconfig b/init/Kconfig index 9ea39297f..83f3a745a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1037,6 +1037,11 @@ config RT_GROUP_SCHED realtime bandwidth for them. See Documentation/scheduler/sched-rt-group.rst for more information. +config EXT_GROUP_SCHED + bool + depends on SCHED_CLASS_EXT && CGROUP_SCHED + default y + endif #CGROUP_SCHED config SCHED_MM_CID diff --git a/init/init_task.c b/init/init_task.c index 7ecb458eb..10a1df7df 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -97,6 +98,20 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { #endif #ifdef CONFIG_CGROUP_SCHED .sched_task_group = &root_task_group, +#endif +#ifdef CONFIG_SCHED_CLASS_EXT + .scx = { + .dsq_node.fifo = LIST_HEAD_INIT(init_task.scx.dsq_node.fifo), + .runnable_node = LIST_HEAD_INIT(init_task.scx.runnable_node), + .flags = 0, + .sticky_cpu = -1, + .holding_cpu = -1, + .ops_state = ATOMIC_INIT(0), + .runnable_at = INITIAL_JIFFIES, + .slice = SCX_SLICE_DFL, + .ddsp_dsq_id = SCX_DSQ_INVALID, + .ddsp_enq_flags = 0, + }, #endif .ptraced = LIST_HEAD_INIT(init_task.ptraced), .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index c2f1fd95a..bae49b743 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -133,4 +133,26 @@ config SCHED_CORE which is the likely usage by Linux distributions, there should be no measurable impact on performance. 
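The ``SCHED_EXT`` policy number added to ``include/uapi/linux/sched.h`` above can be requested through the regular ``sched_setscheduler(2)`` interface. A minimal userspace sketch follows; the local ``SCHED_EXT`` define is only needed while libc headers lack the value. As described in the documentation earlier in the patch, such a task keeps running under CFS until a BPF scheduler is loaded, and a scheduler that sets ``p->scx.disallow`` causes the call to fail with ``EACCES``.

.. code-block:: c

   #include <errno.h>
   #include <sched.h>
   #include <stdio.h>
   #include <string.h>

   #ifndef SCHED_EXT
   #define SCHED_EXT 7     /* matches the uapi value added above */
   #endif

   int main(void)
   {
           const struct sched_param param = { .sched_priority = 0 };

           /* pid 0 selects the calling thread. */
           if (sched_setscheduler(0, SCHED_EXT, &param)) {
                   fprintf(stderr, "SCHED_EXT rejected: %s\n", strerror(errno));
                   return 1;
           }

           printf("policy is now %d\n", sched_getscheduler(0));
           return 0;
   }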
- +config SCHED_CLASS_EXT + bool "Extensible Scheduling Class" + depends on BPF_SYSCALL && BPF_JIT + help + This option enables a new scheduler class sched_ext (SCX), which + allows scheduling policies to be implemented as BPF programs to + achieve the following: + + - Ease of experimentation and exploration: Enabling rapid + iteration of new scheduling policies. + - Customization: Building application-specific schedulers which + implement policies that are not applicable to general-purpose + schedulers. + - Rapid scheduler deployments: Non-disruptive swap outs of + scheduling policies in production environments. + + sched_ext leverages BPF’s struct_ops feature to define a structure + which exports function callbacks and flags to BPF programs that + wish to implement scheduling policies. The struct_ops structure + exported by sched_ext is struct sched_ext_ops, and is conceptually + similar to struct sched_class. + + See Documentation/scheduler/sched-ext.rst for more details. diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h index 5678a9ddf..3618769d8 100644 --- a/kernel/bpf/bpf_struct_ops_types.h +++ b/kernel/bpf/bpf_struct_ops_types.h @@ -9,4 +9,8 @@ BPF_STRUCT_OPS_TYPE(bpf_dummy_ops) #include BPF_STRUCT_OPS_TYPE(tcp_congestion_ops) #endif +#ifdef CONFIG_SCHED_CLASS_EXT +#include +BPF_STRUCT_OPS_TYPE(sched_ext_ops) +#endif #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 19e575e6b..a51c5c253 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17600,8 +17600,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - if (env->cur_state->active_rcu_lock && - !in_rbtree_lock_required_cb(env)) { + if (env->cur_state->active_rcu_lock && !env->cur_state->curframe) { verbose(env, "bpf_rcu_read_unlock is missing\n"); return -EINVAL; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a66c088c8..b316d3b7f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4206,10 +4206,13 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, if (IS_ERR(kn)) return PTR_ERR(kn); + kernfs_show(kn, !(cft->flags & CFTYPE_HIDDEN)); + if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); + cfile->cft = cft; spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = kn; @@ -4485,6 +4488,24 @@ void cgroup_file_notify(struct cgroup_file *cfile) spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } +static struct kernfs_node *cfile_kn_get(struct cgroup_file *cfile) +{ + struct kernfs_node *kn; + + spin_lock_irq(&cgroup_file_kn_lock); + kn = cfile->kn; + kernfs_get(kn); + spin_unlock_irq(&cgroup_file_kn_lock); + + return kn; +} + +static bool cfile_visible(struct cgroup_file *cfile) +{ + return !(cfile->cft->flags & CFTYPE_HIDDEN) && + !(cfile->flags & CFILE_HIDDEN); +} + /** * cgroup_file_show - show or hide a hidden cgroup file * @cfile: target cgroup_file obtained by setting cftype->file_offset @@ -4494,15 +4515,20 @@ void cgroup_file_show(struct cgroup_file *cfile, bool show) { struct kernfs_node *kn; - spin_lock_irq(&cgroup_file_kn_lock); - kn = cfile->kn; - kernfs_get(kn); - spin_unlock_irq(&cgroup_file_kn_lock); + mutex_lock(&cgroup_mutex); - if (kn) - kernfs_show(kn, show); + if (show) + cfile->flags &= ~CFILE_HIDDEN; + else + cfile->flags |= CFILE_HIDDEN; - kernfs_put(kn); + kn = cfile_kn_get(cfile); + if (kn) { + kernfs_show(kn, cfile_visible(cfile)); + kernfs_put(kn); + } + + 
mutex_unlock(&cgroup_mutex); } /** @@ -5526,6 +5552,63 @@ static void offline_css(struct cgroup_subsys_state *css) wake_up_all(&css->cgroup->offline_waitq); } +/** + * cgroup_show_cftype - show or hide a cgroup file type + * @cft: cftype to show or hide + * @show: whether to show or hide + * + * Sets %CFTYPE_HIDDEN and shows/hides the matching files according to @show. + * @cft may or may not be added at the time of this call. After hiding, it's + * guaranteed that there are no in-flight operations on the hidden files. + */ +void cgroup_show_cftype(struct cftype *cft, bool show) +{ + struct cgroup_subsys *ss = cft->ss; + struct cgroup *root = ss ? &ss->root->cgrp : &cgrp_dfl_root.cgrp; + struct cgroup_subsys_state *css; + + mutex_lock(&cgroup_mutex); + + if (show) + cft->flags &= ~CFTYPE_HIDDEN; + else + cft->flags |= CFTYPE_HIDDEN; + + if (!(cft->flags & __CFTYPE_ADDED)) + goto out_unlock; + + css_for_each_descendant_pre(css, cgroup_css(root, ss)) { + struct cgroup *cgrp = css->cgroup; + struct kernfs_node *kn; + + if (!(css->flags & CSS_VISIBLE)) + continue; + + if (cft->file_offset) { + struct cgroup_file *cfile = + (void *)css + cft->file_offset; + + kn = cfile_kn_get(cfile); + if (kn) { + kernfs_show(kn, cfile_visible(cfile)); + kernfs_put(kn); + } + } else { + char buf[CGROUP_FILE_NAME_MAX]; + + kn = kernfs_find_and_get(cgrp->kn, + cgroup_file_name(cgrp, cft, buf)); + if (kn) { + kernfs_show(kn, show); + kernfs_put(kn); + } + } + } + +out_unlock: + mutex_unlock(&cgroup_mutex); +} + /** * css_create - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with diff --git a/kernel/fork.c b/kernel/fork.c index cda4ff635..78e2f54c1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -973,6 +974,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); + sched_ext_free(tsk); io_uring_free(tsk); cgroup_free(tsk); task_numa_free(tsk, true); @@ -2479,7 +2481,7 @@ __latent_entropy struct task_struct *copy_process( retval = perf_event_init_task(p, clone_flags); if (retval) - goto bad_fork_cleanup_policy; + goto bad_fork_sched_cancel_fork; retval = audit_alloc(p); if (retval) goto bad_fork_cleanup_perf; @@ -2610,7 +2612,9 @@ __latent_entropy struct task_struct *copy_process( * cgroup specific, it unconditionally needs to place the task on a * runqueue. */ - sched_cgroup_fork(p, args); + retval = sched_cgroup_fork(p, args); + if (retval) + goto bad_fork_cancel_cgroup; /* * From this point on we must avoid any synchronous user-space @@ -2656,13 +2660,13 @@ __latent_entropy struct task_struct *copy_process( /* Don't start children in a dying pid namespace */ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; - goto bad_fork_cancel_cgroup; + goto bad_fork_core_free; } /* Let kill terminate clone/fork in the middle */ if (fatal_signal_pending(current)) { retval = -EINTR; - goto bad_fork_cancel_cgroup; + goto bad_fork_core_free; } /* No more failure paths after this point. 
*/ @@ -2736,10 +2740,11 @@ __latent_entropy struct task_struct *copy_process( return p; -bad_fork_cancel_cgroup: +bad_fork_core_free: sched_core_free(p); spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); +bad_fork_cancel_cgroup: cgroup_cancel_fork(p, args); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) { @@ -2778,6 +2783,8 @@ __latent_entropy struct task_struct *copy_process( audit_free(p); bad_fork_cleanup_perf: perf_event_free_task(p); +bad_fork_sched_cancel_fork: + sched_cancel_fork(p); bad_fork_cleanup_policy: lockdep_free_task(p); #ifdef CONFIG_NUMA diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index d9dc9ab37..e0e73b44a 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -21,13 +21,18 @@ #include #include +#include #include +#include #include +#include #include #include #include #include #include +#include +#include #include @@ -52,3 +57,6 @@ #include "cputime.c" #include "deadline.c" +#ifdef CONFIG_SCHED_CLASS_EXT +# include "ext.c" +#endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9116bcc90..a80d257b4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -168,7 +168,10 @@ static inline int __task_prio(const struct task_struct *p) if (p->sched_class == &idle_sched_class) return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ - return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */ + if (task_on_scx(p)) + return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */ + + return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */ } /* @@ -197,6 +200,11 @@ static inline bool prio_less(const struct task_struct *a, if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ return cfs_prio_less(a, b, in_fi); +#ifdef CONFIG_SCHED_CLASS_EXT + if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */ + return scx_prio_less(a, b, in_fi); +#endif + return false; } @@ -1255,11 +1263,14 @@ bool sched_can_stop_tick(struct rq *rq) return true; /* - * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; - * if there's more than one we need the tick for involuntary - * preemption. + * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks + * left. For CFS, if there's more than one we need the tick for + * involuntary preemption. For SCX, ask. */ - if (rq->nr_running > 1) + if (!scx_switched_all() && rq->nr_running > 1) + return false; + + if (scx_enabled() && !scx_can_stop_tick(rq)) return false; /* @@ -1342,8 +1353,8 @@ static void set_load_weight(struct task_struct *p, bool update_load) * SCHED_OTHER tasks have to update their load when changing their * weight */ - if (update_load && p->sched_class == &fair_sched_class) { - reweight_task(p, prio); + if (update_load && p->sched_class->reweight_task) { + p->sched_class->reweight_task(task_rq(p), p, prio); } else { load->weight = scale_load(sched_prio_to_weight[prio]); load->inv_weight = sched_prio_to_wmult[prio]; } @@ -2216,6 +2227,17 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } +/* + * ->switching_to() is called with the pi_lock and rq_lock held and must not + * mess with locking. + */ +void check_class_changing(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class) +{ + if (prev_class != p->sched_class && p->sched_class->switching_to) + p->sched_class->switching_to(rq, p); +} + /* * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, * use the balance_callback list if you want balancing. 
@@ -2223,9 +2245,9 @@ inline int task_curr(const struct task_struct *p) * this means any call to check_class_changed() must be followed by a call to * balance_callback(). */ -static inline void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio) +void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio) { if (prev_class != p->sched_class) { if (prev_class->switched_from) @@ -3977,6 +3999,15 @@ bool cpus_share_resources(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { + /* + * The BPF scheduler may depend on select_task_rq() being invoked during + * wakeups. In addition, @p may end up executing on a different CPU + * regardless of what happens in the wakeup path making the ttwu_queue + * optimization less meaningful. Skip if on SCX. + */ + if (task_on_scx(p)) + return false; + /* * Do not complicate things with the async wake_list while the CPU is * in hotplug state. @@ -4544,6 +4575,23 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->rt.on_rq = 0; p->rt.on_list = 0; +#ifdef CONFIG_SCHED_CLASS_EXT + p->scx.dsq = NULL; + INIT_LIST_HEAD(&p->scx.dsq_node.fifo); + RB_CLEAR_NODE(&p->scx.dsq_node.priq); + INIT_LIST_HEAD(&p->scx.runnable_node); + p->scx.flags = 0; + p->scx.weight = 0; + p->scx.sticky_cpu = -1; + p->scx.holding_cpu = -1; + p->scx.kf_mask = 0; + atomic_long_set(&p->scx.ops_state, 0); + p->scx.runnable_at = INITIAL_JIFFIES; + p->scx.slice = SCX_SLICE_DFL; + p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; + p->scx.ddsp_enq_flags = 0; +#endif + #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif @@ -4747,6 +4795,8 @@ late_initcall(sched_core_sysctl_init); */ int sched_fork(unsigned long clone_flags, struct task_struct *p) { + int ret; + __sched_fork(clone_flags, p); /* * We mark the process as NEW here. 
This guarantees that @@ -4783,12 +4833,20 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_reset_on_fork = 0; } - if (dl_prio(p->prio)) - return -EAGAIN; - else if (rt_prio(p->prio)) + scx_pre_fork(p); + + if (dl_prio(p->prio)) { + ret = -EAGAIN; + goto out_cancel; + } else if (rt_prio(p->prio)) { p->sched_class = &rt_sched_class; - else +#ifdef CONFIG_SCHED_CLASS_EXT + } else if (task_should_scx(p)) { + p->sched_class = &ext_sched_class; +#endif + } else { p->sched_class = &fair_sched_class; + } init_entity_runnable_average(&p->se); @@ -4806,9 +4864,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) RB_CLEAR_NODE(&p->pushable_dl_tasks); #endif return 0; + +out_cancel: + scx_cancel_fork(p); + return ret; } -void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) +int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) { unsigned long flags; @@ -4835,11 +4897,19 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return scx_fork(p); +} + +void sched_cancel_fork(struct task_struct *p) +{ + scx_cancel_fork(p); } void sched_post_fork(struct task_struct *p) { uclamp_post_fork(p); + scx_post_fork(p); } unsigned long to_ratio(u64 period, u64 runtime) @@ -5684,14 +5754,17 @@ void scheduler_tick(void) if (sched_feat(LATENCY_WARN) && resched_latency) resched_latency_warn(cpu, resched_latency); + scx_notify_sched_tick(); perf_event_task_tick(); if (curr->flags & PF_WQ_WORKER) wq_worker_tick(curr); #ifdef CONFIG_SMP - rq->idle_balance = idle_cpu(cpu); - trigger_load_balance(rq); + if (!scx_switched_all()) { + rq->idle_balance = idle_cpu(cpu); + trigger_load_balance(rq); + } #endif } @@ -5991,7 +6064,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, * We can terminate the balance pass as soon as we know there is * a runnable task of @class priority or higher. */ - for_class_range(class, prev->sched_class, &idle_sched_class) { + for_balance_class_range(class, prev->sched_class, &idle_sched_class) { if (class->balance(rq, prev, rf)) break; } @@ -6009,6 +6082,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) const struct sched_class *class; struct task_struct *p; + if (scx_enabled()) + goto restart; + /* * Optimization: we know that if all tasks are in the fair class we can * call that function directly, but only if the @prev task wasn't of a @@ -6049,10 +6125,12 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (prev->dl_server) prev->dl_server = NULL; - for_each_class(class) { + for_each_active_class(class) { p = class->pick_next_task(rq); - if (p) + if (p) { + scx_notify_pick_next_task(rq, p, class); return p; + } } BUG(); /* The idle class should always have a runnable task. 
*/ @@ -6082,7 +6160,7 @@ static inline struct task_struct *pick_task(struct rq *rq) const struct sched_class *class; struct task_struct *p; - for_each_class(class) { + for_each_active_class(class) { p = class->pick_task(rq); if (p) return p; @@ -7060,12 +7138,16 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag } EXPORT_SYMBOL(default_wake_function); -static void __setscheduler_prio(struct task_struct *p, int prio) +void __setscheduler_prio(struct task_struct *p, int prio) { if (dl_prio(prio)) p->sched_class = &dl_sched_class; else if (rt_prio(prio)) p->sched_class = &rt_sched_class; +#ifdef CONFIG_SCHED_CLASS_EXT + else if (task_should_scx(p)) + p->sched_class = &ext_sched_class; +#endif else p->sched_class = &fair_sched_class; @@ -7226,6 +7308,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) } __setscheduler_prio(p, prio); + check_class_changing(rq, p, prev_class); if (queued) enqueue_task(rq, p, queue_flag); @@ -7769,6 +7852,10 @@ static int __sched_setscheduler(struct task_struct *p, goto unlock; } + retval = scx_check_setscheduler(p, policy); + if (retval) + goto unlock; + /* * If not changing anything there's no need to proceed further, * but store a possible modification of reset_on_fork. @@ -7871,6 +7958,7 @@ static int __sched_setscheduler(struct task_struct *p, __setscheduler_prio(p, newprio); } __setscheduler_uclamp(p, attr); + check_class_changing(rq, p, prev_class); if (queued) { /* @@ -9046,6 +9134,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) case SCHED_NORMAL: case SCHED_BATCH: case SCHED_IDLE: + case SCHED_EXT: ret = 0; break; } @@ -9073,6 +9162,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) case SCHED_NORMAL: case SCHED_BATCH: case SCHED_IDLE: + case SCHED_EXT: ret = 0; } return ret; @@ -9168,6 +9258,7 @@ void sched_show_task(struct task_struct *p) print_worker_info(KERN_INFO, p); print_stop_info(KERN_INFO, p); + print_scx_info(KERN_INFO, p); show_stack(p, NULL, KERN_INFO); put_task_stack(p); } @@ -9553,7 +9644,7 @@ static inline void balance_hotplug_wait(void) #endif /* CONFIG_HOTPLUG_CPU */ -void set_rq_online(struct rq *rq) +void set_rq_online(struct rq *rq, enum rq_onoff_reason reason) { if (!rq->online) { const struct sched_class *class; @@ -9563,12 +9654,12 @@ void set_rq_online(struct rq *rq) for_each_class(class) { if (class->rq_online) - class->rq_online(rq); + class->rq_online(rq, reason); } } } -void set_rq_offline(struct rq *rq) +void set_rq_offline(struct rq *rq, enum rq_onoff_reason reason) { if (rq->online) { const struct sched_class *class; @@ -9576,7 +9667,7 @@ void set_rq_offline(struct rq *rq) update_rq_clock(rq); for_each_class(class) { if (class->rq_offline) - class->rq_offline(rq); + class->rq_offline(rq, reason); } cpumask_clear_cpu(rq->cpu, rq->rd->online); @@ -9672,7 +9763,7 @@ int sched_cpu_activate(unsigned int cpu) rq_lock_irqsave(rq, &rf); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_online(rq); + set_rq_online(rq, RQ_ONOFF_HOTPLUG); } rq_unlock_irqrestore(rq, &rf); @@ -9716,7 +9807,7 @@ int sched_cpu_deactivate(unsigned int cpu) rq_lock_irqsave(rq, &rf); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); + set_rq_offline(rq, RQ_ONOFF_HOTPLUG); } rq_unlock_irqrestore(rq, &rf); @@ -9903,11 +9994,15 @@ void __init sched_init(void) int i; /* Make sure the linker didn't screw up */ - BUG_ON(&idle_sched_class != &fair_sched_class + 1 || - &fair_sched_class != &rt_sched_class + 1 || - &rt_sched_class != 
&dl_sched_class + 1); #ifdef CONFIG_SMP - BUG_ON(&dl_sched_class != &stop_sched_class + 1); + BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); +#endif + BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); + BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); + BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); +#ifdef CONFIG_SCHED_CLASS_EXT + BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class)); + BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); #endif wait_bit_init(); @@ -9931,6 +10026,9 @@ void __init sched_init(void) root_task_group.shares = ROOT_TASK_GROUP_LOAD; init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_EXT_GROUP_SCHED + root_task_group.scx_weight = CGROUP_WEIGHT_DFL; +#endif /* CONFIG_EXT_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); @@ -10076,6 +10174,7 @@ void __init sched_init(void) balance_push_set(smp_processor_id(), false); #endif init_sched_fair_class(); + init_sched_ext_class(); psi_init(); @@ -10361,6 +10460,7 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; + scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); alloc_uclamp_sched_group(tg, parent); return tg; @@ -10488,6 +10588,7 @@ void sched_move_task(struct task_struct *tsk) put_prev_task(rq, tsk); sched_change_group(tsk, group); + scx_move_task(tsk); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -10502,11 +10603,6 @@ void sched_move_task(struct task_struct *tsk) } } -static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct task_group, css) : NULL; -} - static struct cgroup_subsys_state * cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -10530,6 +10626,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); struct task_group *parent = css_tg(css->parent); + int ret; + + ret = scx_tg_online(tg); + if (ret) + return ret; if (parent) sched_online_group(tg, parent); @@ -10544,6 +10645,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) return 0; } +static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + + scx_tg_offline(tg); +} + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); @@ -10561,9 +10669,10 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_unregister_group(tg); } -#ifdef CONFIG_RT_GROUP_SCHED +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { +#ifdef CONFIG_RT_GROUP_SCHED struct task_struct *task; struct cgroup_subsys_state *css; @@ -10571,7 +10680,8 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; } - return 0; +#endif + return scx_cgroup_can_attach(tset); } #endif @@ -10582,8 +10692,17 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(task, css, tset) sched_move_task(task); + + scx_cgroup_finish_attach(); } +#ifdef CONFIG_EXT_GROUP_SCHED +static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) +{ + scx_cgroup_cancel_attach(tset); +} +#endif + #ifdef CONFIG_UCLAMP_TASK_GROUP static void cpu_util_update_eff(struct 
cgroup_subsys_state *css) { @@ -10762,9 +10881,15 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) { + int ret; + if (shareval > scale_load_down(ULONG_MAX)) shareval = MAX_SHARES; - return sched_group_set_shares(css_tg(css), scale_load(shareval)); + ret = sched_group_set_shares(css_tg(css), scale_load(shareval)); + if (!ret) + scx_group_set_weight(css_tg(css), + sched_weight_to_cgroup(shareval)); + return ret; } static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, @@ -11161,7 +11286,7 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, } #endif -static struct cftype cpu_legacy_files[] = { +static struct cftype cpu_legacy_cftypes[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { .name = "shares", @@ -11272,38 +11397,44 @@ static int cpu_local_stat_show(struct seq_file *sf, return 0; } +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) + +static unsigned long tg_weight(struct task_group *tg) +{ #ifdef CONFIG_FAIR_GROUP_SCHED + return scale_load_down(tg->shares); +#else + return sched_weight_from_cgroup(tg->scx_weight); +#endif +} + static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct task_group *tg = css_tg(css); - u64 weight = scale_load_down(tg->shares); - - return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); + return sched_weight_to_cgroup(tg_weight(css_tg(css))); } static int cpu_weight_write_u64(struct cgroup_subsys_state *css, - struct cftype *cft, u64 weight) + struct cftype *cft, u64 cgrp_weight) { - /* - * cgroup weight knobs should use the common MIN, DFL and MAX - * values which are 1, 100 and 10000 respectively. While it loses - * a bit of range on both ends, it maps pretty well onto the shares - * value used by scheduler and the round-trip conversions preserve - * the original value over the entire range. 
- */ - if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) + unsigned long weight; + int ret; + + if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) return -ERANGE; - weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); + weight = sched_weight_from_cgroup(cgrp_weight); - return sched_group_set_shares(css_tg(css), scale_load(weight)); + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); + if (!ret) + scx_group_set_weight(css_tg(css), cgrp_weight); + return ret; } static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { - unsigned long weight = scale_load_down(css_tg(css)->shares); + unsigned long weight = tg_weight(css_tg(css)); int last_delta = INT_MAX; int prio, delta; @@ -11322,7 +11453,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 nice) { unsigned long weight; - int idx; + int idx, ret; if (nice < MIN_NICE || nice > MAX_NICE) return -ERANGE; @@ -11331,7 +11462,11 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, idx = array_index_nospec(idx, 40); weight = sched_prio_to_weight[idx]; - return sched_group_set_shares(css_tg(css), scale_load(weight)); + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); + if (!ret) + scx_group_set_weight(css_tg(css), + sched_weight_to_cgroup(weight)); + return ret; } #endif @@ -11392,21 +11527,23 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, } #endif -static struct cftype cpu_files[] = { -#ifdef CONFIG_FAIR_GROUP_SCHED - { +struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1] = { +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) + [CPU_CFTYPE_WEIGHT] = { .name = "weight", .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = cpu_weight_read_u64, .write_u64 = cpu_weight_write_u64, }, - { + [CPU_CFTYPE_WEIGHT_NICE] = { .name = "weight.nice", .flags = CFTYPE_NOT_ON_ROOT, .read_s64 = cpu_weight_nice_read_s64, .write_s64 = cpu_weight_nice_write_s64, }, - { +#endif +#ifdef CONFIG_FAIR_GROUP_SCHED + [CPU_CFTYPE_IDLE] = { .name = "idle", .flags = CFTYPE_NOT_ON_ROOT, .read_s64 = cpu_idle_read_s64, @@ -11414,13 +11551,13 @@ static struct cftype cpu_files[] = { }, #endif #ifdef CONFIG_CFS_BANDWIDTH - { + [CPU_CFTYPE_MAX] = { .name = "max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_max_show, .write = cpu_max_write, }, - { + [CPU_CFTYPE_MAX_BURST] = { .name = "max.burst", .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = cpu_cfs_burst_read_u64, @@ -11428,13 +11565,13 @@ static struct cftype cpu_files[] = { }, #endif #ifdef CONFIG_UCLAMP_TASK_GROUP - { + [CPU_CFTYPE_UCLAMP_MIN] = { .name = "uclamp.min", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_uclamp_min_show, .write = cpu_uclamp_min_write, }, - { + [CPU_CFTYPE_UCLAMP_MAX] = { .name = "uclamp.max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_uclamp_max_show, @@ -11447,16 +11584,20 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_online = cpu_cgroup_css_online, + .css_offline = cpu_cgroup_css_offline, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, .css_local_stat_show = cpu_local_stat_show, -#ifdef CONFIG_RT_GROUP_SCHED +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) .can_attach = cpu_cgroup_can_attach, #endif .attach = cpu_cgroup_attach, - .legacy_cftypes = cpu_legacy_files, - .dfl_cftypes = cpu_files, +#ifdef CONFIG_EXT_GROUP_SCHED + .cancel_attach = 
cpu_cgroup_cancel_attach, +#endif + .legacy_cftypes = cpu_legacy_cftypes, + .dfl_cftypes = cpu_cftypes, .early_init = true, .threaded = true, }; @@ -12044,3 +12185,38 @@ void sched_mm_cid_fork(struct task_struct *t) t->mm_cid_active = 1; } #endif + +#ifdef CONFIG_SCHED_CLASS_EXT +void sched_deq_and_put_task(struct task_struct *p, int queue_flags, + struct sched_enq_and_set_ctx *ctx) +{ + struct rq *rq = task_rq(p); + + lockdep_assert_rq_held(rq); + + *ctx = (struct sched_enq_and_set_ctx){ + .p = p, + .queue_flags = queue_flags, + .queued = task_on_rq_queued(p), + .running = task_current(rq, p), + }; + + update_rq_clock(rq); + if (ctx->queued) + dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); + if (ctx->running) + put_prev_task(rq, p); +} + +void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) +{ + struct rq *rq = task_rq(ctx->p); + + lockdep_assert_rq_held(rq); + + if (ctx->queued) + enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); + if (ctx->running) + set_next_task(rq, ctx->p); +} +#endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a04a436af..010d1dc5f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2607,7 +2607,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, } /* Assumes rq->lock is held */ -static void rq_online_dl(struct rq *rq) +static void rq_online_dl(struct rq *rq, enum rq_onoff_reason reason) { if (rq->dl.overloaded) dl_set_overload(rq); @@ -2618,7 +2618,7 @@ static void rq_online_dl(struct rq *rq) } /* Assumes rq->lock is held */ -static void rq_offline_dl(struct rq *rq) +static void rq_offline_dl(struct rq *rq, enum rq_onoff_reason reason) { if (rq->dl.overloaded) dl_clear_overload(rq); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8d5d98a58..6f306e1c9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1089,6 +1089,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(dl.runtime); P(dl.deadline); } +#ifdef CONFIG_SCHED_CLASS_EXT + __PS("ext.enabled", task_on_scx(p)); +#endif #undef PN_SCHEDSTAT #undef P_SCHEDSTAT diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c new file mode 100644 index 000000000..3b3d38fb1 --- /dev/null +++ b/kernel/sched/ext.c @@ -0,0 +1,5374 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ + +#include + +#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) + +enum scx_internal_consts { + SCX_OPI_BEGIN = 0, + SCX_OPI_NORMAL_BEGIN = 0, + SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), + SCX_OPI_END = SCX_OP_IDX(init), + SCX_DSP_DFL_MAX_BATCH = 32, + SCX_DSP_MAX_LOOPS = 32, + SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, + + SCX_EXIT_BT_LEN = 64, + SCX_EXIT_MSG_LEN = 1024, + SCX_EXIT_DUMP_LEN = 32768, +}; + +enum scx_ops_enable_state { + SCX_OPS_PREPPING, + SCX_OPS_ENABLING, + SCX_OPS_ENABLED, + SCX_OPS_DISABLING, + SCX_OPS_DISABLED, +}; + +static const char *scx_ops_enable_state_str[] = { + [SCX_OPS_PREPPING] = "prepping", + [SCX_OPS_ENABLING] = "enabling", + [SCX_OPS_ENABLED] = "enabled", + [SCX_OPS_DISABLING] = "disabling", + [SCX_OPS_DISABLED] = "disabled", +}; + +/* + * sched_ext_entity->ops_state + * + * Used to track the task ownership between the SCX core and the BPF scheduler. + * State transitions look as follows: + * + * NONE -> QUEUEING -> QUEUED -> DISPATCHING + * ^ | | + * | v v + * \-------------------------------/ + * + * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call + * sites for explanations on the conditions being waited upon and why they are + * safe. Transitions out of them into NONE or QUEUED must store_release and the + * waiters should load_acquire. + * + * Tracking scx_ops_state enables sched_ext core to reliably determine whether + * any given task can be dispatched by the BPF scheduler at all times and thus + * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler + * to try to dispatch any task anytime regardless of its state as the SCX core + * can safely reject invalid dispatches. + */ +enum scx_ops_state { + SCX_OPSS_NONE, /* owned by the SCX core */ + SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ + SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ + SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ + + /* + * QSEQ brands each QUEUED instance so that, when dispatch races + * dequeue/requeue, the dispatcher can tell whether it still has a claim + * on the task being dispatched. + * + * As some 32bit archs can't do 64bit store_release/load_acquire, + * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on + * 32bit machines. The dispatch race window QSEQ protects is very narrow + * and runs with IRQ disabled. 30 bits should be sufficient. + */ + SCX_OPSS_QSEQ_SHIFT = 2, +}; + +/* Use macros to ensure that the type is unsigned long for the masks */ +#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) +#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) + +/* + * During exit, a task may schedule after losing its PIDs. When disabling the + * BPF scheduler, we need to be able to iterate tasks in every state to + * guarantee system safety. Maintain a dedicated task list which contains every + * task between its fork and eventual free. 
+ */ +static DEFINE_SPINLOCK(scx_tasks_lock); +static LIST_HEAD(scx_tasks); + +/* ops enable/disable */ +static struct kthread_worker *scx_ops_helper; +static DEFINE_MUTEX(scx_ops_enable_mutex); +DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); +DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); +static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); +static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); +static bool scx_switch_all_req; +static bool scx_switching_all; +DEFINE_STATIC_KEY_FALSE(__scx_switched_all); + +static struct sched_ext_ops scx_ops; +static bool scx_warned_zero_slice; + +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); +DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); + +struct static_key_false scx_has_op[SCX_OPI_END] = + { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; + +static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); +static struct scx_exit_info *scx_exit_info; + +static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); + +/* + * The maximum amount of time in jiffies that a task may be runnable without + * being scheduled on a CPU. If this timeout is exceeded, it will trigger + * scx_ops_error(). + */ +unsigned long scx_watchdog_timeout; + +/* + * The last time the delayed work was run. This delayed work relies on + * ksoftirqd being able to run to service timer interrupts, so it's possible + * that this work itself could get wedged. To account for this, we check that + * it's not stalled in the timer tick, and trigger an error if it is. + */ +unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; + +static struct delayed_work scx_watchdog_work; + +/* idle tracking */ +#ifdef CONFIG_SMP +#ifdef CONFIG_CPUMASK_OFFSTACK +#define CL_ALIGNED_IF_ONSTACK +#else +#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp +#endif + +static struct { + cpumask_var_t cpu; + cpumask_var_t smt; +} idle_masks CL_ALIGNED_IF_ONSTACK; + +#endif /* CONFIG_SMP */ + +/* for %SCX_KICK_WAIT */ +static unsigned long __percpu *scx_kick_cpus_pnt_seqs; + +/* + * Direct dispatch marker. + * + * Non-NULL values are used for direct dispatch from enqueue path. A valid + * pointer points to the task currently being enqueued. An ERR_PTR value is used + * to indicate that direct dispatch has already happened. 
+ */ +static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); + +/* dispatch queues */ +static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global; + +static const struct rhashtable_params dsq_hash_params = { + .key_len = 8, + .key_offset = offsetof(struct scx_dispatch_q, id), + .head_offset = offsetof(struct scx_dispatch_q, hash_node), +}; + +static struct rhashtable dsq_hash; +static LLIST_HEAD(dsqs_to_free); + +/* dispatch buf */ +struct scx_dsp_buf_ent { + struct task_struct *task; + unsigned long qseq; + u64 dsq_id; + u64 enq_flags; +}; + +static u32 scx_dsp_max_batch; +static struct scx_dsp_buf_ent __percpu *scx_dsp_buf; + +struct scx_dsp_ctx { + struct rq *rq; + struct rq_flags *rf; + u32 buf_cursor; + u32 nr_tasks; +}; + +static DEFINE_PER_CPU(struct scx_dsp_ctx, scx_dsp_ctx); + +/* /sys/kernel/sched_ext interface */ +static struct kset *scx_kset; +static struct kobject *scx_root_kobj; + +static void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, + u64 enq_flags); +static void scx_bpf_kick_cpu(s32 cpu, u64 flags); + +struct scx_task_iter { + struct sched_ext_entity cursor; + struct task_struct *locked; + struct rq *rq; + struct rq_flags rf; +}; + +#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)]) + +static long jiffies_delta_msecs(unsigned long at, unsigned long now) +{ + if (time_after(at, now)) + return jiffies_to_msecs(at - now); + else + return -(long)jiffies_to_msecs(now - at); +} + +/* if the highest set bit is N, return a mask with bits [N+1, 31] set */ +static u32 higher_bits(u32 flags) +{ + return ~((1 << fls(flags)) - 1); +} + +/* return the mask with only the highest bit set */ +static u32 highest_bit(u32 flags) +{ + int bit = fls(flags); + return bit ? 1 << (bit - 1) : 0; +} + +/* + * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX + * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate + * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check + * whether it's running from an allowed context. + * + * @mask is constant, always inline to cull the mask calculations. + */ +static __always_inline void scx_kf_allow(u32 mask) +{ + /* nesting is allowed only in increasing scx_kf_mask order */ + WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask, + "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n", + current->scx.kf_mask, mask); + current->scx.kf_mask |= mask; +} + +static void scx_kf_disallow(u32 mask) +{ + current->scx.kf_mask &= ~mask; +} + +#define SCX_CALL_OP(mask, op, args...) \ +do { \ + if (mask) { \ + scx_kf_allow(mask); \ + scx_ops.op(args); \ + scx_kf_disallow(mask); \ + } else { \ + scx_ops.op(args); \ + } \ +} while (0) + +#define SCX_CALL_OP_RET(mask, op, args...) \ +({ \ + __typeof__(scx_ops.op(args)) __ret; \ + if (mask) { \ + scx_kf_allow(mask); \ + __ret = scx_ops.op(args); \ + scx_kf_disallow(mask); \ + } else { \ + __ret = scx_ops.op(args); \ + } \ + __ret; \ +}) + +/* + * Some kfuncs are allowed only on the tasks that are subjects of the + * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such + * restrictions, the following SCX_CALL_OP_*() variants should be used when + * invoking scx_ops operations that take task arguments. These can only be used + * for non-nesting operations due to the way the tasks are tracked. + * + * kfuncs which can only operate on such tasks can in turn use + * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on + * the specific task. 
+ */ +#define SCX_CALL_OP_TASK(mask, op, task, args...) \ +do { \ + BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + current->scx.kf_tasks[0] = task; \ + SCX_CALL_OP(mask, op, task, ##args); \ + current->scx.kf_tasks[0] = NULL; \ +} while (0) + +#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \ +({ \ + __typeof__(scx_ops.op(task, ##args)) __ret; \ + BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + current->scx.kf_tasks[0] = task; \ + __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \ + current->scx.kf_tasks[0] = NULL; \ + __ret; \ +}) + +#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) \ +({ \ + __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \ + BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + current->scx.kf_tasks[0] = task0; \ + current->scx.kf_tasks[1] = task1; \ + __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \ + current->scx.kf_tasks[0] = NULL; \ + current->scx.kf_tasks[1] = NULL; \ + __ret; \ +}) + +/* @mask is constant, always inline to cull unnecessary branches */ +static __always_inline bool scx_kf_allowed(u32 mask) +{ + if (unlikely(!(current->scx.kf_mask & mask))) { + scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", + mask, current->scx.kf_mask); + return false; + } + + if (unlikely((mask & (SCX_KF_INIT | SCX_KF_SLEEPABLE)) && + in_interrupt())) { + scx_ops_error("sleepable kfunc called from non-sleepable context"); + return false; + } + + /* + * Enforce nesting boundaries. e.g. A kfunc which can be called from + * DISPATCH must not be called if we're running DEQUEUE which is nested + * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE + * boundary thanks to the above in_interrupt() check. + */ + if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && + (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { + scx_ops_error("cpu_release kfunc called from a nested operation"); + return false; + } + + if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && + (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { + scx_ops_error("dispatch kfunc called from a nested operation"); + return false; + } + + return true; +} + +/* see SCX_CALL_OP_TASK() */ +static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, + struct task_struct *p) +{ + if (!scx_kf_allowed(__SCX_KF_RQ_LOCKED)) + return false; + + if (unlikely((p != current->scx.kf_tasks[0] && + p != current->scx.kf_tasks[1]))) { + scx_ops_error("called on a task not being operated on"); + return false; + } + + return true; +} + +/** + * scx_task_iter_init - Initialize a task iterator + * @iter: iterator to init + * + * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, + * @iter must eventually be exited with scx_task_iter_exit(). + * + * scx_tasks_lock may be released between this and the first next() call or + * between any two next() calls. If scx_tasks_lock is released between two + * next() calls, the caller is responsible for ensuring that the task being + * iterated remains accessible either through RCU read lock or obtaining a + * reference count. + * + * All tasks which existed when the iteration started are guaranteed to be + * visited as long as they still exist. 
+ */ +static void scx_task_iter_init(struct scx_task_iter *iter) +{ + lockdep_assert_held(&scx_tasks_lock); + + iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; + list_add(&iter->cursor.tasks_node, &scx_tasks); + iter->locked = NULL; +} + +/** + * scx_task_iter_exit - Exit a task iterator + * @iter: iterator to exit + * + * Exit a previously initialized @iter. Must be called with scx_tasks_lock held. + * If the iterator holds a task's rq lock, that rq lock is released. See + * scx_task_iter_init() for details. + */ +static void scx_task_iter_exit(struct scx_task_iter *iter) +{ + struct list_head *cursor = &iter->cursor.tasks_node; + + lockdep_assert_held(&scx_tasks_lock); + + if (iter->locked) { + task_rq_unlock(iter->rq, iter->locked, &iter->rf); + iter->locked = NULL; + } + + if (list_empty(cursor)) + return; + + list_del_init(cursor); +} + +/** + * scx_task_iter_next - Next task + * @iter: iterator to walk + * + * Visit the next task. See scx_task_iter_init() for details. + */ +static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) +{ + struct list_head *cursor = &iter->cursor.tasks_node; + struct sched_ext_entity *pos; + + lockdep_assert_held(&scx_tasks_lock); + + list_for_each_entry(pos, cursor, tasks_node) { + if (&pos->tasks_node == &scx_tasks) + return NULL; + if (!(pos->flags & SCX_TASK_CURSOR)) { + list_move(cursor, &pos->tasks_node); + return container_of(pos, struct task_struct, scx); + } + } + + /* can't happen, should always terminate at scx_tasks above */ + BUG(); +} + +/** + * scx_task_iter_next_filtered - Next non-idle task + * @iter: iterator to walk + * + * Visit the next non-idle task. See scx_task_iter_init() for details. + */ +static struct task_struct * +scx_task_iter_next_filtered(struct scx_task_iter *iter) +{ + struct task_struct *p; + + while ((p = scx_task_iter_next(iter))) { + /* + * is_idle_task() tests %PF_IDLE which may not be set for CPUs + * which haven't yet been onlined. Test sched_class directly. + */ + if (p->sched_class != &idle_sched_class) + return p; + } + return NULL; +} + +/** + * scx_task_iter_next_filtered_locked - Next non-idle task with its rq locked + * @iter: iterator to walk + * + * Visit the next non-idle task with its rq lock held. See scx_task_iter_init() + * for details. + */ +static struct task_struct * +scx_task_iter_next_filtered_locked(struct scx_task_iter *iter) +{ + struct task_struct *p; + + if (iter->locked) { + task_rq_unlock(iter->rq, iter->locked, &iter->rf); + iter->locked = NULL; + } + + p = scx_task_iter_next_filtered(iter); + if (!p) + return NULL; + + iter->rq = task_rq_lock(p, &iter->rf); + iter->locked = p; + return p; +} + +static enum scx_ops_enable_state scx_ops_enable_state(void) +{ + return atomic_read(&scx_ops_enable_state_var); +} + +static enum scx_ops_enable_state +scx_ops_set_enable_state(enum scx_ops_enable_state to) +{ + return atomic_xchg(&scx_ops_enable_state_var, to); +} + +static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to, + enum scx_ops_enable_state from) +{ + int from_v = from; + + return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to); +} + +static bool scx_ops_bypassing(void) +{ + return unlikely(atomic_read(&scx_ops_bypass_depth)); +} + +/** + * wait_ops_state - Busy-wait the specified ops state to end + * @p: target task + * @opss: state to wait the end of + * + * Busy-wait for @p to transition out of @opss. This can only be used when the + * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. 
This function also + * has load_acquire semantics to ensure that the caller can see the updates made + * in the enqueueing and dispatching paths. + */ +static void wait_ops_state(struct task_struct *p, unsigned long opss) +{ + do { + cpu_relax(); + } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); +} + +/** + * ops_cpu_valid - Verify a cpu number + * @cpu: cpu number which came from a BPF ops + * + * @cpu is a cpu number which came from the BPF scheduler and can be any value. + * Verify that it is in range and one of the possible cpus. + */ +static bool ops_cpu_valid(s32 cpu) +{ + return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu)); +} + +/** + * ops_sanitize_err - Sanitize a -errno value + * @ops_name: operation to blame on failure + * @err: -errno value to sanitize + * + * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return + * -%EPROTO. This is necessary because returning a rogue -errno up the chain + * can cause misbehaviors. For an example, a large negative return from + * ops.init_task() triggers an oops when passed up the call chain because the + * value fails IS_ERR() test after being encoded with ERR_PTR() and then is + * handled as a pointer. + */ +static int ops_sanitize_err(const char *ops_name, s32 err) +{ + if (err < 0 && err >= -MAX_ERRNO) + return err; + + scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err); + return -EPROTO; +} + +/** + * touch_core_sched - Update timestamp used for core-sched task ordering + * @rq: rq to read clock from, must be locked + * @p: task to update the timestamp for + * + * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to + * implement global or local-DSQ FIFO ordering for core-sched. Should be called + * when a task becomes runnable and its turn on the CPU ends (e.g. slice + * exhaustion). + */ +static void touch_core_sched(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SCHED_CORE + /* + * It's okay to update the timestamp spuriously. Use + * sched_core_disabled() which is cheaper than enabled(). + */ + if (!sched_core_disabled()) + p->scx.core_sched_at = rq_clock_task(rq); +#endif +} + +/** + * touch_core_sched_dispatch - Update core-sched timestamp on dispatch + * @rq: rq to read clock from, must be locked + * @p: task being dispatched + * + * If the BPF scheduler implements custom core-sched ordering via + * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO + * ordering within each local DSQ. This function is called from dispatch paths + * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect. 
+ */ +static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + assert_clock_updated(rq); + +#ifdef CONFIG_SCHED_CORE + if (SCX_HAS_OP(core_sched_before)) + touch_core_sched(rq, p); +#endif +} + +static void update_curr_scx(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + u64 now = rq_clock_task(rq); + u64 delta_exec; + + if (time_before_eq64(now, curr->se.exec_start)) + return; + + delta_exec = now - curr->se.exec_start; + curr->se.exec_start = now; + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); + + if (curr->scx.slice != SCX_SLICE_INF) { + curr->scx.slice -= min(curr->scx.slice, delta_exec); + if (!curr->scx.slice) + touch_core_sched(rq, curr); + } +} + +static bool scx_dsq_priq_less(struct rb_node *node_a, + const struct rb_node *node_b) +{ + const struct task_struct *a = + container_of(node_a, struct task_struct, scx.dsq_node.priq); + const struct task_struct *b = + container_of(node_b, struct task_struct, scx.dsq_node.priq); + + return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); +} + +static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, + u64 enq_flags) +{ + bool is_local = dsq->id == SCX_DSQ_LOCAL; + + WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node.fifo)); + WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) || + !RB_EMPTY_NODE(&p->scx.dsq_node.priq)); + + if (!is_local) { + raw_spin_lock(&dsq->lock); + if (unlikely(dsq->id == SCX_DSQ_INVALID)) { + scx_ops_error("attempting to dispatch to a destroyed dsq"); + /* fall back to the global dsq */ + raw_spin_unlock(&dsq->lock); + dsq = &scx_dsq_global; + raw_spin_lock(&dsq->lock); + } + } + + if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) && + (enq_flags & SCX_ENQ_DSQ_PRIQ))) { + /* + * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from + * their FIFO queues. To avoid confusion and accidentally + * starving vtime-dispatched tasks by FIFO-dispatched tasks, we + * disallow any internal DSQ from doing vtime ordering of + * tasks. + */ + scx_ops_error("Cannot use vtime ordering for built-in DSQs"); + enq_flags &= ~SCX_ENQ_DSQ_PRIQ; + } + + if (enq_flags & SCX_ENQ_DSQ_PRIQ) { + p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; + rb_add_cached(&p->scx.dsq_node.priq, &dsq->priq, + scx_dsq_priq_less); + /* A DSQ should only be using either FIFO or PRIQ enqueuing. */ + if (unlikely(!list_empty(&dsq->fifo))) + scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", + dsq->id); + } else { + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) + list_add(&p->scx.dsq_node.fifo, &dsq->fifo); + else + list_add_tail(&p->scx.dsq_node.fifo, &dsq->fifo); + /* A DSQ should only be using either FIFO or PRIQ enqueuing. */ + if (unlikely(rb_first_cached(&dsq->priq))) + scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks", + dsq->id); + } + dsq->nr++; + p->scx.dsq = dsq; + + /* + * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the + * direct dispatch path, but we clear them here because the direct + * dispatch verdict may be overridden on the enqueue path during e.g. + * bypass. + */ + p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; + p->scx.ddsp_enq_flags = 0; + + /* + * We're transitioning out of QUEUEING or DISPATCHING. store_release to + * match waiters' load_acquire. 
+ */ + if (enq_flags & SCX_ENQ_CLEAR_OPSS) + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); + + if (is_local) { + struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + bool preempt = false; + + if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && + rq->curr->sched_class == &ext_sched_class) { + rq->curr->scx.slice = 0; + preempt = true; + } + + if (preempt || sched_class_above(&ext_sched_class, + rq->curr->sched_class)) + resched_curr(rq); + } else { + raw_spin_unlock(&dsq->lock); + } +} + +static void task_unlink_from_dsq(struct task_struct *p, + struct scx_dispatch_q *dsq) +{ + if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) { + rb_erase_cached(&p->scx.dsq_node.priq, &dsq->priq); + RB_CLEAR_NODE(&p->scx.dsq_node.priq); + p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; + } else { + list_del_init(&p->scx.dsq_node.fifo); + } +} + +static bool task_linked_on_dsq(struct task_struct *p) +{ + return !list_empty(&p->scx.dsq_node.fifo) || + !RB_EMPTY_NODE(&p->scx.dsq_node.priq); +} + +static void dispatch_dequeue(struct scx_rq *scx_rq, struct task_struct *p) +{ + struct scx_dispatch_q *dsq = p->scx.dsq; + bool is_local = dsq == &scx_rq->local_dsq; + + if (!dsq) { + WARN_ON_ONCE(task_linked_on_dsq(p)); + /* + * When dispatching directly from the BPF scheduler to a local + * DSQ, the task isn't associated with any DSQ but + * @p->scx.holding_cpu may be set under the protection of + * %SCX_OPSS_DISPATCHING. + */ + if (p->scx.holding_cpu >= 0) + p->scx.holding_cpu = -1; + return; + } + + if (!is_local) + raw_spin_lock(&dsq->lock); + + /* + * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_node + * can't change underneath us. + */ + if (p->scx.holding_cpu < 0) { + /* @p must still be on @dsq, dequeue */ + WARN_ON_ONCE(!task_linked_on_dsq(p)); + task_unlink_from_dsq(p, dsq); + dsq->nr--; + } else { + /* + * We're racing against dispatch_to_local_dsq() which already + * removed @p from @dsq and set @p->scx.holding_cpu. Clear the + * holding_cpu which tells dispatch_to_local_dsq() that it lost + * the race. + */ + WARN_ON_ONCE(task_linked_on_dsq(p)); + p->scx.holding_cpu = -1; + } + p->scx.dsq = NULL; + + if (!is_local) + raw_spin_unlock(&dsq->lock); +} + +static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) +{ + lockdep_assert(rcu_read_lock_any_held()); + + if (dsq_id == SCX_DSQ_GLOBAL) + return &scx_dsq_global; + else + return rhashtable_lookup_fast(&dsq_hash, &dsq_id, + dsq_hash_params); +} + +static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, + struct task_struct *p) +{ + struct scx_dispatch_q *dsq; + + if (dsq_id == SCX_DSQ_LOCAL) + return &rq->scx.local_dsq; + + dsq = find_non_local_dsq(dsq_id); + if (unlikely(!dsq)) { + scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", + dsq_id, p->comm, p->pid); + return &scx_dsq_global; + } + + return dsq; +} + +static void mark_direct_dispatch(struct task_struct *ddsp_task, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + /* + * Mark that dispatch already happened from ops.select_cpu() or + * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value + * which can never match a valid task pointer. 
+ */ + __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); + + /* @p must match the task on the enqueue path */ + if (unlikely(p != ddsp_task)) { + if (IS_ERR(ddsp_task)) + scx_ops_error("%s[%d] already direct-dispatched", + p->comm, p->pid); + else + scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", + ddsp_task->comm, ddsp_task->pid, + p->comm, p->pid); + return; + } + + /* + * %SCX_DSQ_LOCAL_ON is not supported during direct dispatch because + * dispatching to the local DSQ of a different CPU requires unlocking + * the current rq which isn't allowed in the enqueue path. Use + * ops.select_cpu() to be on the target CPU and then %SCX_DSQ_LOCAL. + */ + if (unlikely((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON)) { + scx_ops_error("SCX_DSQ_LOCAL_ON can't be used for direct-dispatch"); + return; + } + + WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID); + WARN_ON_ONCE(p->scx.ddsp_enq_flags); + + p->scx.ddsp_dsq_id = dsq_id; + p->scx.ddsp_enq_flags = enq_flags; +} + +static void direct_dispatch(struct task_struct *p, u64 enq_flags) +{ + struct scx_dispatch_q *dsq; + + touch_core_sched_dispatch(task_rq(p), p); + + enq_flags |= (p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); + dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsp_dsq_id, p); + dispatch_enqueue(dsq, p, enq_flags); +} + +static bool test_rq_online(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq->online; +#else + return true; +#endif +} + +static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, + int sticky_cpu) +{ + struct task_struct **ddsp_taskp; + unsigned long qseq; + + WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); + + /* rq migration */ + if (sticky_cpu == cpu_of(rq)) + goto local_norefill; + + /* + * If !rq->online, we already told the BPF scheduler that the CPU is + * offline. We're just trying to on/offline the CPU. Don't bother the + * BPF scheduler. + */ + if (unlikely(!test_rq_online(rq))) + goto local; + + if (scx_ops_bypassing()) { + if (enq_flags & SCX_ENQ_LAST) + goto local; + else + goto global; + } + + if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) + goto direct; + + /* see %SCX_OPS_ENQ_EXITING */ + if (!static_branch_unlikely(&scx_ops_enq_exiting) && + unlikely(p->flags & PF_EXITING)) + goto local; + + /* see %SCX_OPS_ENQ_LAST */ + if (!static_branch_unlikely(&scx_ops_enq_last) && + (enq_flags & SCX_ENQ_LAST)) + goto local; + + if (!SCX_HAS_OP(enqueue)) + goto global; + + /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ + qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; + + WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); + atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); + + ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); + WARN_ON_ONCE(*ddsp_taskp); + *ddsp_taskp = p; + + SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags); + + *ddsp_taskp = NULL; + if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) + goto direct; + + /* + * If not directly dispatched, QUEUEING isn't clear yet and dispatch or + * dequeue may be waiting. The store_release matches their load_acquire. + */ + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); + return; + +direct: + direct_dispatch(p, enq_flags); + return; + +local: + /* + * For task-ordering, slice refill must be treated as implying the end + * of the current slice. Otherwise, the longer @p stays on the CPU, the + * higher priority it becomes from scx_prio_less()'s POV. 
+ */ + touch_core_sched(rq, p); + p->scx.slice = SCX_SLICE_DFL; +local_norefill: + dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); + return; + +global: + touch_core_sched(rq, p); /* see the comment in local: */ + p->scx.slice = SCX_SLICE_DFL; + dispatch_enqueue(&scx_dsq_global, p, enq_flags); +} + +static bool task_runnable(const struct task_struct *p) +{ + return !list_empty(&p->scx.runnable_node); +} + +static void set_task_runnable(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + + if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) { + p->scx.runnable_at = jiffies; + p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; + } + + /* + * list_add_tail() must be used. scx_ops_bypass() depends on tasks being + * appended to the runnable_list. + */ + list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); +} + +static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) +{ + list_del_init(&p->scx.runnable_node); + if (reset_runnable_at) + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; +} + +static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) +{ + int sticky_cpu = p->scx.sticky_cpu; + + enq_flags |= rq->scx.extra_enq_flags; + + if (sticky_cpu >= 0) + p->scx.sticky_cpu = -1; + + /* + * Restoring a running task will be immediately followed by + * set_next_task_scx() which expects the task to not be on the BPF + * scheduler as tasks can only start running through local DSQs. Force + * direct-dispatch into the local DSQ by setting the sticky_cpu. + */ + if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p)) + sticky_cpu = cpu_of(rq); + + if (p->scx.flags & SCX_TASK_QUEUED) { + WARN_ON_ONCE(!task_runnable(p)); + return; + } + + set_task_runnable(rq, p); + p->scx.flags |= SCX_TASK_QUEUED; + rq->scx.nr_running++; + add_nr_running(rq, 1); + + if (SCX_HAS_OP(runnable)) + SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags); + + if (enq_flags & SCX_ENQ_WAKEUP) + touch_core_sched(rq, p); + + do_enqueue_task(rq, p, enq_flags, sticky_cpu); +} + +static void ops_dequeue(struct task_struct *p, u64 deq_flags) +{ + unsigned long opss; + + /* dequeue is always temporary, don't reset runnable_at */ + clr_task_runnable(p, false); + + /* acquire ensures that we see the preceding updates on QUEUED */ + opss = atomic_long_read_acquire(&p->scx.ops_state); + + switch (opss & SCX_OPSS_STATE_MASK) { + case SCX_OPSS_NONE: + break; + case SCX_OPSS_QUEUEING: + /* + * QUEUEING is started and finished while holding @p's rq lock. + * As we're holding the rq lock now, we shouldn't see QUEUEING. + */ + BUG(); + case SCX_OPSS_QUEUED: + if (SCX_HAS_OP(dequeue)) + SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags); + + if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, + SCX_OPSS_NONE)) + break; + fallthrough; + case SCX_OPSS_DISPATCHING: + /* + * If @p is being dispatched from the BPF scheduler to a DSQ, + * wait for the transfer to complete so that @p doesn't get + * added to its DSQ after dequeueing is complete. + * + * As we're waiting on DISPATCHING with the rq locked, the + * dispatching side shouldn't try to lock the rq while + * DISPATCHING is set. See dispatch_to_local_dsq(). + * + * DISPATCHING shouldn't have qseq set and control can reach + * here with NONE @opss from the above QUEUED case block. + * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. 
+ */ + wait_ops_state(p, SCX_OPSS_DISPATCHING); + BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); + break; + } +} + +static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) +{ + struct scx_rq *scx_rq = &rq->scx; + + if (!(p->scx.flags & SCX_TASK_QUEUED)) { + WARN_ON_ONCE(task_runnable(p)); + return; + } + + ops_dequeue(p, deq_flags); + + /* + * A currently running task which is going off @rq first gets dequeued + * and then stops running. As we want running <-> stopping transitions + * to be contained within runnable <-> quiescent transitions, trigger + * ->stopping() early here instead of in put_prev_task_scx(). + * + * @p may go through multiple stopping <-> running transitions between + * here and put_prev_task_scx() if task attribute changes occur while + * balance_scx() leaves @rq unlocked. However, they don't contain any + * information meaningful to the BPF scheduler and can be suppressed by + * skipping the callbacks if the task is !QUEUED. + */ + if (SCX_HAS_OP(stopping) && task_current(rq, p)) { + update_curr_scx(rq); + SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false); + } + + if (SCX_HAS_OP(quiescent)) + SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags); + + if (deq_flags & SCX_DEQ_SLEEP) + p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; + else + p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP; + + p->scx.flags &= ~SCX_TASK_QUEUED; + scx_rq->nr_running--; + sub_nr_running(rq, 1); + + dispatch_dequeue(scx_rq, p); +} + +static void yield_task_scx(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + if (SCX_HAS_OP(yield)) + SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL); + else + p->scx.slice = 0; +} + +static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) +{ + struct task_struct *from = rq->curr; + + if (SCX_HAS_OP(yield)) + return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to); + else + return false; +} + +#ifdef CONFIG_SMP +/** + * move_task_to_local_dsq - Move a task from a different rq to a local DSQ + * @rq: rq to move the task into, currently locked + * @p: task to move + * @enq_flags: %SCX_ENQ_* + * + * Move @p which is currently on a different rq to @rq's local DSQ. The caller + * must: + * + * 1. Start with exclusive access to @p either through its DSQ lock or + * %SCX_OPSS_DISPATCHING flag. + * + * 2. Set @p->scx.holding_cpu to raw_smp_processor_id(). + * + * 3. Remember task_rq(@p). Release the exclusive access so that we don't + * deadlock with dequeue. + * + * 4. Lock @rq and the task_rq from #3. + * + * 5. Call this function. + * + * Returns %true if @p was successfully moved. %false after racing dequeue and + * losing. + */ +static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p, + u64 enq_flags) +{ + struct rq *task_rq; + + lockdep_assert_rq_held(rq); + + /* + * If dequeue got to @p while we were trying to lock both rq's, it'd + * have cleared @p->scx.holding_cpu to -1. While other cpus may have + * updated it to different values afterwards, as this operation can't be + * preempted or recurse, @p->scx.holding_cpu can never become + * raw_smp_processor_id() again before we're done. Thus, we can tell + * whether we lost to dequeue by testing whether @p->scx.holding_cpu is + * still raw_smp_processor_id(). + * + * See dispatch_dequeue() for the counterpart. 
+ */
+	if (unlikely(p->scx.holding_cpu != raw_smp_processor_id()))
+		return false;
+
+	/* @p->rq couldn't have changed if we're still the holding cpu */
+	task_rq = task_rq(p);
+	lockdep_assert_rq_held(task_rq);
+
+	WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr));
+	deactivate_task(task_rq, p, 0);
+	set_task_cpu(p, cpu_of(rq));
+	p->scx.sticky_cpu = cpu_of(rq);
+
+	/*
+	 * We want to pass scx-specific enq_flags but activate_task() will
+	 * truncate the upper 32 bit. As we own @rq, we can pass them through
+	 * @rq->scx.extra_enq_flags instead.
+	 */
+	WARN_ON_ONCE(rq->scx.extra_enq_flags);
+	rq->scx.extra_enq_flags = enq_flags;
+	activate_task(rq, p, 0);
+	rq->scx.extra_enq_flags = 0;
+
+	return true;
+}
+
+/**
+ * dispatch_to_local_dsq_lock - Ensure source and destination rq's are locked
+ * @rq: current rq which is locked
+ * @rf: rq_flags to use when unlocking @rq
+ * @src_rq: rq to move task from
+ * @dst_rq: rq to move task to
+ *
+ * We're holding @rq lock and trying to dispatch a task from @src_rq to
+ * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq. Whether
+ * @rq stays locked isn't important as long as the state is restored after
+ * dispatch_to_local_dsq_unlock().
+ */
+static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf,
+				       struct rq *src_rq, struct rq *dst_rq)
+{
+	rq_unpin_lock(rq, rf);
+
+	if (src_rq == dst_rq) {
+		raw_spin_rq_unlock(rq);
+		raw_spin_rq_lock(dst_rq);
+	} else if (rq == src_rq) {
+		double_lock_balance(rq, dst_rq);
+		rq_repin_lock(rq, rf);
+	} else if (rq == dst_rq) {
+		double_lock_balance(rq, src_rq);
+		rq_repin_lock(rq, rf);
+	} else {
+		raw_spin_rq_unlock(rq);
+		double_rq_lock(src_rq, dst_rq);
+	}
+}
+
+/**
+ * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock()
+ * @rq: current rq which is locked
+ * @rf: rq_flags to use when unlocking @rq
+ * @src_rq: rq to move task from
+ * @dst_rq: rq to move task to
+ *
+ * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return.
+ */ +static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf, + struct rq *src_rq, struct rq *dst_rq) +{ + if (src_rq == dst_rq) { + raw_spin_rq_unlock(dst_rq); + raw_spin_rq_lock(rq); + rq_repin_lock(rq, rf); + } else if (rq == src_rq) { + double_unlock_balance(rq, dst_rq); + } else if (rq == dst_rq) { + double_unlock_balance(rq, src_rq); + } else { + double_rq_unlock(src_rq, dst_rq); + raw_spin_rq_lock(rq); + rq_repin_lock(rq, rf); + } +} +#endif /* CONFIG_SMP */ + + +static bool task_can_run_on_rq(struct task_struct *p, struct rq *rq) +{ + return likely(test_rq_online(rq)) && !is_migration_disabled(p) && + cpumask_test_cpu(cpu_of(rq), p->cpus_ptr); +} + +static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, + struct scx_dispatch_q *dsq) +{ + struct scx_rq *scx_rq = &rq->scx; + struct task_struct *p; + struct rb_node *rb_node; + struct rq *task_rq; + bool moved = false; +retry: + if (list_empty(&dsq->fifo) && !rb_first_cached(&dsq->priq)) + return false; + + raw_spin_lock(&dsq->lock); + + list_for_each_entry(p, &dsq->fifo, scx.dsq_node.fifo) { + task_rq = task_rq(p); + if (rq == task_rq) + goto this_rq; + if (task_can_run_on_rq(p, rq)) + goto remote_rq; + } + + for (rb_node = rb_first_cached(&dsq->priq); rb_node; + rb_node = rb_next(rb_node)) { + p = container_of(rb_node, struct task_struct, scx.dsq_node.priq); + task_rq = task_rq(p); + if (rq == task_rq) + goto this_rq; + if (task_can_run_on_rq(p, rq)) + goto remote_rq; + } + + raw_spin_unlock(&dsq->lock); + return false; + +this_rq: + /* @dsq is locked and @p is on this rq */ + WARN_ON_ONCE(p->scx.holding_cpu >= 0); + task_unlink_from_dsq(p, dsq); + list_add_tail(&p->scx.dsq_node.fifo, &scx_rq->local_dsq.fifo); + dsq->nr--; + scx_rq->local_dsq.nr++; + p->scx.dsq = &scx_rq->local_dsq; + raw_spin_unlock(&dsq->lock); + return true; + +remote_rq: +#ifdef CONFIG_SMP + /* + * @dsq is locked and @p is on a remote rq. @p is currently protected by + * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab + * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the + * rq lock or fail, do a little dancing from our side. See + * move_task_to_local_dsq(). + */ + WARN_ON_ONCE(p->scx.holding_cpu >= 0); + task_unlink_from_dsq(p, dsq); + dsq->nr--; + p->scx.holding_cpu = raw_smp_processor_id(); + raw_spin_unlock(&dsq->lock); + + rq_unpin_lock(rq, rf); + double_lock_balance(rq, task_rq); + rq_repin_lock(rq, rf); + + moved = move_task_to_local_dsq(rq, p, 0); + + double_unlock_balance(rq, task_rq); +#endif /* CONFIG_SMP */ + if (likely(moved)) + return true; + goto retry; +} + +enum dispatch_to_local_dsq_ret { + DTL_DISPATCHED, /* successfully dispatched */ + DTL_LOST, /* lost race to dequeue */ + DTL_NOT_LOCAL, /* destination is not a local DSQ */ + DTL_INVALID, /* invalid local dsq_id */ +}; + +/** + * dispatch_to_local_dsq - Dispatch a task to a local dsq + * @rq: current rq which is locked + * @rf: rq_flags to use when unlocking @rq + * @dsq_id: destination dsq ID + * @p: task to dispatch + * @enq_flags: %SCX_ENQ_* + * + * We're holding @rq lock and want to dispatch @p to the local DSQ identified by + * @dsq_id. This function performs all the synchronization dancing needed + * because local DSQs are protected with rq locks. + * + * The caller must have exclusive ownership of @p (e.g. through + * %SCX_OPSS_DISPATCHING). 
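+ *
+ * For illustration, a BPF scheduler typically reaches this path via an
+ * ops.dispatch() verdict targeting a local DSQ. A minimal sketch using the
+ * kfuncs and flags introduced by this patch (the callback name and the pid
+ * lookup are only illustrative):
+ *
+ *	void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
+ *	{
+ *		struct task_struct *p = bpf_task_from_pid(some_queued_pid);
+ *
+ *		if (!p)
+ *			return;
+ *		scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0);
+ *		bpf_task_release(p);
+ *	}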
+ */ +static enum dispatch_to_local_dsq_ret +dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id, + struct task_struct *p, u64 enq_flags) +{ + struct rq *src_rq = task_rq(p); + struct rq *dst_rq; + + /* + * We're synchronized against dequeue through DISPATCHING. As @p can't + * be dequeued, its task_rq and cpus_allowed are stable too. + */ + if (dsq_id == SCX_DSQ_LOCAL) { + dst_rq = rq; + } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { + s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; + + if (!ops_cpu_valid(cpu)) { + scx_ops_error("invalid cpu %d in SCX_DSQ_LOCAL_ON verdict for %s[%d]", + cpu, p->comm, p->pid); + return DTL_INVALID; + } + dst_rq = cpu_rq(cpu); + } else { + return DTL_NOT_LOCAL; + } + + /* if dispatching to @rq that @p is already on, no lock dancing needed */ + if (rq == src_rq && rq == dst_rq) { + dispatch_enqueue(&dst_rq->scx.local_dsq, p, + enq_flags | SCX_ENQ_CLEAR_OPSS); + return DTL_DISPATCHED; + } + +#ifdef CONFIG_SMP + if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) { + struct rq *locked_dst_rq = dst_rq; + bool dsp; + + /* + * @p is on a possibly remote @src_rq which we need to lock to + * move the task. If dequeue is in progress, it'd be locking + * @src_rq and waiting on DISPATCHING, so we can't grab @src_rq + * lock while holding DISPATCHING. + * + * As DISPATCHING guarantees that @p is wholly ours, we can + * pretend that we're moving from a DSQ and use the same + * mechanism - mark the task under transfer with holding_cpu, + * release DISPATCHING and then follow the same protocol. + */ + p->scx.holding_cpu = raw_smp_processor_id(); + + /* store_release ensures that dequeue sees the above */ + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); + + dispatch_to_local_dsq_lock(rq, rf, src_rq, locked_dst_rq); + + /* + * We don't require the BPF scheduler to avoid dispatching to + * offline CPUs mostly for convenience but also because CPUs can + * go offline between scx_bpf_dispatch() calls and here. If @p + * is destined to an offline CPU, queue it on its current CPU + * instead, which should always be safe. As this is an allowed + * behavior, don't trigger an ops error. + */ + if (unlikely(!test_rq_online(dst_rq))) + dst_rq = src_rq; + + if (src_rq == dst_rq) { + /* + * As @p is staying on the same rq, there's no need to + * go through the full deactivate/activate cycle. + * Optimize by abbreviating the operations in + * move_task_to_local_dsq(). + */ + dsp = p->scx.holding_cpu == raw_smp_processor_id(); + if (likely(dsp)) { + p->scx.holding_cpu = -1; + dispatch_enqueue(&dst_rq->scx.local_dsq, p, + enq_flags); + } + } else { + dsp = move_task_to_local_dsq(dst_rq, p, enq_flags); + } + + /* if the destination CPU is idle, wake it up */ + if (dsp && p->sched_class > dst_rq->curr->sched_class) + resched_curr(dst_rq); + + dispatch_to_local_dsq_unlock(rq, rf, src_rq, locked_dst_rq); + + return dsp ? DTL_DISPATCHED : DTL_LOST; + } +#endif /* CONFIG_SMP */ + + scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", + cpu_of(dst_rq), p->comm, p->pid); + return DTL_INVALID; +} + +/** + * finish_dispatch - Asynchronously finish dispatching a task + * @rq: current rq which is locked + * @rf: rq_flags to use when unlocking @rq + * @p: task to finish dispatching + * @qseq_at_dispatch: qseq when @p started getting dispatched + * @dsq_id: destination DSQ ID + * @enq_flags: %SCX_ENQ_* + * + * Dispatching to local DSQs may need to wait for queueing to complete or + * require rq lock dancing. 
As we don't wanna do either while inside + * ops.dispatch() to avoid locking order inversion, we split dispatching into + * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the + * task and its qseq. Once ops.dispatch() returns, this function is called to + * finish up. + * + * There is no guarantee that @p is still valid for dispatching or even that it + * was valid in the first place. Make sure that the task is still owned by the + * BPF scheduler and claim the ownership before dispatching. + */ +static void finish_dispatch(struct rq *rq, struct rq_flags *rf, + struct task_struct *p, + unsigned long qseq_at_dispatch, + u64 dsq_id, u64 enq_flags) +{ + struct scx_dispatch_q *dsq; + unsigned long opss; + + touch_core_sched_dispatch(rq, p); +retry: + /* + * No need for _acquire here. @p is accessed only after a successful + * try_cmpxchg to DISPATCHING. + */ + opss = atomic_long_read(&p->scx.ops_state); + + switch (opss & SCX_OPSS_STATE_MASK) { + case SCX_OPSS_DISPATCHING: + case SCX_OPSS_NONE: + /* someone else already got to it */ + return; + case SCX_OPSS_QUEUED: + /* + * If qseq doesn't match, @p has gone through at least one + * dispatch/dequeue and re-enqueue cycle between + * scx_bpf_dispatch() and here and we have no claim on it. + */ + if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) + return; + + /* + * While we know @p is accessible, we don't yet have a claim on + * it - the BPF scheduler is allowed to dispatch tasks + * spuriously and there can be a racing dequeue attempt. Let's + * claim @p by atomically transitioning it from QUEUED to + * DISPATCHING. + */ + if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, + SCX_OPSS_DISPATCHING))) + break; + goto retry; + case SCX_OPSS_QUEUEING: + /* + * do_enqueue_task() is in the process of transferring the task + * to the BPF scheduler while holding @p's rq lock. As we aren't + * holding any kernel or BPF resource that the enqueue path may + * depend upon, it's safe to wait. + */ + wait_ops_state(p, opss); + goto retry; + } + + BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); + + switch (dispatch_to_local_dsq(rq, rf, dsq_id, p, enq_flags)) { + case DTL_DISPATCHED: + break; + case DTL_LOST: + break; + case DTL_INVALID: + dsq_id = SCX_DSQ_GLOBAL; + fallthrough; + case DTL_NOT_LOCAL: + dsq = find_dsq_for_dispatch(cpu_rq(raw_smp_processor_id()), + dsq_id, p); + dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + break; + } +} + +static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf) +{ + struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); + u32 u; + + for (u = 0; u < dspc->buf_cursor; u++) { + struct scx_dsp_buf_ent *ent = &this_cpu_ptr(scx_dsp_buf)[u]; + + finish_dispatch(rq, rf, ent->task, ent->qseq, ent->dsq_id, + ent->enq_flags); + } + + dspc->nr_tasks += dspc->buf_cursor; + dspc->buf_cursor = 0; +} + +static int balance_one(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf, bool local) +{ + struct scx_rq *scx_rq = &rq->scx; + struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); + bool prev_on_scx = prev->sched_class == &ext_sched_class; + int nr_loops = SCX_DSP_MAX_LOOPS; + bool has_tasks = false; + + lockdep_assert_rq_held(rq); + scx_rq->flags |= SCX_RQ_BALANCING; + + if (static_branch_unlikely(&scx_ops_cpu_preempt) && + unlikely(rq->scx.cpu_released)) { + /* + * If the previous sched_class for the current CPU was not SCX, + * notify the BPF scheduler that it again has control of the + * core. 
This callback complements ->cpu_release(), which is + * emitted in scx_notify_pick_next_task(). + */ + if (SCX_HAS_OP(cpu_acquire)) + SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_acquire, cpu_of(rq), + NULL); + rq->scx.cpu_released = false; + } + + if (prev_on_scx) { + WARN_ON_ONCE(local && (prev->scx.flags & SCX_TASK_BAL_KEEP)); + update_curr_scx(rq); + + /* + * If @prev is runnable & has slice left, it has priority and + * fetching more just increases latency for the fetched tasks. + * Tell put_prev_task_scx() to put @prev on local_dsq. If the + * BPF scheduler wants to handle this explicitly, it should + * implement ->cpu_released(). + * + * See scx_ops_disable_workfn() for the explanation on the + * disabling() test. + * + * When balancing a remote CPU for core-sched, there won't be a + * following put_prev_task_scx() call and we don't own + * %SCX_TASK_BAL_KEEP. Instead, pick_task_scx() will test the + * same conditions later and pick @rq->curr accordingly. + */ + if ((prev->scx.flags & SCX_TASK_QUEUED) && + prev->scx.slice && !scx_ops_bypassing()) { + if (local) + prev->scx.flags |= SCX_TASK_BAL_KEEP; + goto has_tasks; + } + } + + /* if there already are tasks to run, nothing to do */ + if (scx_rq->local_dsq.nr) + goto has_tasks; + + if (consume_dispatch_q(rq, rf, &scx_dsq_global)) + goto has_tasks; + + if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing()) + goto out; + + dspc->rq = rq; + dspc->rf = rf; + + /* + * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock, + * the local DSQ might still end up empty after a successful + * ops.dispatch(). If the local DSQ is empty even after ops.dispatch() + * produced some tasks, retry. The BPF scheduler may depend on this + * looping behavior to simplify its implementation. + */ + do { + dspc->nr_tasks = 0; + + SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq), + prev_on_scx ? prev : NULL); + + flush_dispatch_buf(rq, rf); + + if (scx_rq->local_dsq.nr) + goto has_tasks; + if (consume_dispatch_q(rq, rf, &scx_dsq_global)) + goto has_tasks; + + /* + * ops.dispatch() can trap us in this loop by repeatedly + * dispatching ineligible tasks. Break out once in a while to + * allow the watchdog to run. As IRQ can't be enabled in + * balance(), we want to complete this scheduling cycle and then + * start a new one. IOW, we want to call resched_curr() on the + * next, most likely idle, task, not the current one. Use + * scx_bpf_kick_cpu() for deferred kicking. + */ + if (unlikely(!--nr_loops)) { + scx_bpf_kick_cpu(cpu_of(rq), 0); + break; + } + } while (dspc->nr_tasks); + + goto out; + +has_tasks: + has_tasks = true; +out: + scx_rq->flags &= ~SCX_RQ_BALANCING; + return has_tasks; +} + +static int balance_scx(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) +{ + int ret; + + ret = balance_one(rq, prev, rf, true); + +#ifdef CONFIG_SCHED_SMT + /* + * When core-sched is enabled, this ops.balance() call will be followed + * by put_prev_scx() and pick_task_scx() on this CPU and pick_task_scx() + * on the SMT siblings. Balance the siblings too. + */ + if (sched_core_enabled(rq)) { + const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); + int scpu; + + for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { + struct rq *srq = cpu_rq(scpu); + struct rq_flags srf; + struct task_struct *sprev = srq->curr; + + /* + * While core-scheduling, rq lock is shared among + * siblings but the debug annotations and rq clock + * aren't. Do pinning dance to transfer the ownership. 
+ */ + WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); + rq_unpin_lock(rq, rf); + rq_pin_lock(srq, &srf); + + update_rq_clock(srq); + balance_one(srq, sprev, &srf, false); + + rq_unpin_lock(srq, &srf); + rq_repin_lock(rq, rf); + } + } +#endif + return ret; +} + +static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) +{ + if (p->scx.flags & SCX_TASK_QUEUED) { + /* + * Core-sched might decide to execute @p before it is + * dispatched. Call ops_dequeue() to notify the BPF scheduler. + */ + ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC); + dispatch_dequeue(&rq->scx, p); + } + + p->se.exec_start = rq_clock_task(rq); + + /* see dequeue_task_scx() on why we skip when !QUEUED */ + if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) + SCX_CALL_OP_TASK(SCX_KF_REST, running, p); + + clr_task_runnable(p, true); + + /* + * @p is getting newly scheduled or got kicked after someone updated its + * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). + */ + if ((p->scx.slice == SCX_SLICE_INF) != + (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { + if (p->scx.slice == SCX_SLICE_INF) + rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; + else + rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; + + sched_update_tick_dependency(rq); + } +} + +static void put_prev_task_scx(struct rq *rq, struct task_struct *p) +{ +#ifndef CONFIG_SMP + /* + * UP workaround. + * + * Because SCX may transfer tasks across CPUs during dispatch, dispatch + * is performed from its balance operation which isn't called in UP. + * Let's work around by calling it from the operations which come right + * after. + * + * 1. If the prev task is on SCX, pick_next_task() calls + * .put_prev_task() right after. As .put_prev_task() is also called + * from other places, we need to distinguish the calls which can be + * done by looking at the previous task's state - if still queued or + * dequeued with %SCX_DEQ_SLEEP, the caller must be pick_next_task(). + * This case is handled here. + * + * 2. If the prev task is not on SCX, the first following call into SCX + * will be .pick_next_task(), which is covered by calling + * balance_scx() from pick_next_task_scx(). + * + * Note that we can't merge the first case into the second as + * balance_scx() must be called before the previous SCX task goes + * through put_prev_task_scx(). + * + * As UP doesn't transfer tasks around, balance_scx() doesn't need @rf. + * Pass in %NULL. + */ + if (p->scx.flags & (SCX_TASK_QUEUED | SCX_TASK_DEQD_FOR_SLEEP)) + balance_scx(rq, p, NULL); +#endif + + update_curr_scx(rq); + + /* see dequeue_task_scx() on why we skip when !QUEUED */ + if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) + SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); + + /* + * If we're being called from put_prev_task_balance(), balance_scx() may + * have decided that @p should keep running. + */ + if (p->scx.flags & SCX_TASK_BAL_KEEP) { + p->scx.flags &= ~SCX_TASK_BAL_KEEP; + set_task_runnable(rq, p); + dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); + return; + } + + if (p->scx.flags & SCX_TASK_QUEUED) { + set_task_runnable(rq, p); + + /* + * If @p has slice left and balance_scx() didn't tag it for + * keeping, @p is getting preempted by a higher priority + * scheduler class or core-sched forcing a different task. Leave + * it at the head of the local DSQ. 
+ */ + if (p->scx.slice && !scx_ops_bypassing()) { + dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); + return; + } + + /* + * If we're in the pick_next_task path, balance_scx() should + * have already populated the local DSQ if there are any other + * available tasks. If empty, tell ops.enqueue() that @p is the + * only one available for this cpu. ops.enqueue() should put it + * on the local DSQ so that the subsequent pick_next_task_scx() + * can find the task unless it wants to trigger a separate + * follow-up scheduling event. + */ + if (list_empty(&rq->scx.local_dsq.fifo)) + do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); + else + do_enqueue_task(rq, p, 0, -1); + } +} + +static struct task_struct *first_local_task(struct rq *rq) +{ + WARN_ON_ONCE(rb_first_cached(&rq->scx.local_dsq.priq)); + return list_first_entry_or_null(&rq->scx.local_dsq.fifo, + struct task_struct, scx.dsq_node.fifo); +} + +static struct task_struct *pick_next_task_scx(struct rq *rq) +{ + struct task_struct *p; + +#ifndef CONFIG_SMP + /* UP workaround - see the comment at the head of put_prev_task_scx() */ + if (unlikely(rq->curr->sched_class != &ext_sched_class)) + balance_scx(rq, rq->curr, NULL); +#endif + + p = first_local_task(rq); + if (!p) + return NULL; + + if (unlikely(!p->scx.slice)) { + if (!scx_ops_bypassing() && !scx_warned_zero_slice) { + printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", + p->comm, p->pid); + scx_warned_zero_slice = true; + } + p->scx.slice = SCX_SLICE_DFL; + } + + set_next_task_scx(rq, p, true); + + return p; +} + +#ifdef CONFIG_SCHED_CORE +/** + * scx_prio_less - Task ordering for core-sched + * @a: task A + * @b: task B + * + * Core-sched is implemented as an additional scheduling layer on top of the + * usual sched_class'es and needs to find out the expected task ordering. For + * SCX, core-sched calls this function to interrogate the task ordering. + * + * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used + * to implement the default task ordering. The older the timestamp, the higher + * prority the task - the global FIFO ordering matching the default scheduling + * behavior. + * + * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to + * implement FIFO ordering within each local DSQ. See pick_task_scx(). + */ +bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi) +{ + /* + * The const qualifiers are dropped from task_struct pointers when + * calling ops.core_sched_before(). Accesses are controlled by the + * verifier. + */ + if (SCX_HAS_OP(core_sched_before) && !scx_ops_bypassing()) + return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, + (struct task_struct *)a, + (struct task_struct *)b); + else + return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); +} + +/** + * pick_task_scx - Pick a candidate task for core-sched + * @rq: rq to pick the candidate task from + * + * Core-sched calls this function on each SMT sibling to determine the next + * tasks to run on the SMT siblings. balance_one() has been called on all + * siblings and put_prev_task_scx() has been called only for the current CPU. + * + * As put_prev_task_scx() hasn't been called on remote CPUs, we can't just look + * at the first task in the local dsq. @rq->curr has to be considered explicitly + * to mimic %SCX_TASK_BAL_KEEP. 
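+ *
+ * For illustration, the ops.core_sched_before() ordering referenced above
+ * (see scx_prio_less()) could be supplied by a BPF scheduler which maintains
+ * its own virtual time in @p->scx.dsq_vtime. A minimal sketch (the callback
+ * name is illustrative; the vtime field is the one used by the
+ * tools/sched_ext examples):
+ *
+ *	bool BPF_STRUCT_OPS(example_core_sched_before, struct task_struct *a,
+ *			    struct task_struct *b)
+ *	{
+ *		return (s64)(a->scx.dsq_vtime - b->scx.dsq_vtime) < 0;
+ *	}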
+ */ +static struct task_struct *pick_task_scx(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct task_struct *first = first_local_task(rq); + + if (curr->scx.flags & SCX_TASK_QUEUED) { + /* is curr the only runnable task? */ + if (!first) + return curr; + + /* + * Does curr trump first? We can always go by core_sched_at for + * this comparison as it represents global FIFO ordering when + * the default core-sched ordering is used and local-DSQ FIFO + * ordering otherwise. + * + * We can have a task with an earlier timestamp on the DSQ. For + * example, when a current task is preempted by a sibling + * picking a different cookie, the task would be requeued at the + * head of the local DSQ with an earlier timestamp than the + * core-sched picked next task. Besides, the BPF scheduler may + * dispatch any tasks to the local DSQ anytime. + */ + if (curr->scx.slice && time_before64(curr->scx.core_sched_at, + first->scx.core_sched_at)) + return curr; + } + + return first; /* this may be %NULL */ +} +#endif /* CONFIG_SCHED_CORE */ + +static enum scx_cpu_preempt_reason +preempt_reason_from_class(const struct sched_class *class) +{ +#ifdef CONFIG_SMP + if (class == &stop_sched_class) + return SCX_CPU_PREEMPT_STOP; +#endif + if (class == &dl_sched_class) + return SCX_CPU_PREEMPT_DL; + if (class == &rt_sched_class) + return SCX_CPU_PREEMPT_RT; + return SCX_CPU_PREEMPT_UNKNOWN; +} + +void __scx_notify_pick_next_task(struct rq *rq, struct task_struct *task, + const struct sched_class *active) +{ + lockdep_assert_rq_held(rq); + + /* + * The callback is conceptually meant to convey that the CPU is no + * longer under the control of SCX. Therefore, don't invoke the + * callback if the CPU is is staying on SCX, or going idle (in which + * case the SCX scheduler has actively decided not to schedule any + * tasks on the CPU). + */ + if (likely(active >= &ext_sched_class)) + return; + + /* + * At this point we know that SCX was preempted by a higher priority + * sched_class, so invoke the ->cpu_release() callback if we have not + * done so already. We only send the callback once between SCX being + * preempted, and it regaining control of the CPU. + * + * ->cpu_release() complements ->cpu_acquire(), which is emitted the + * next time that balance_scx() is invoked. + */ + if (!rq->scx.cpu_released) { + if (SCX_HAS_OP(cpu_release)) { + struct scx_cpu_release_args args = { + .reason = preempt_reason_from_class(active), + .task = task, + }; + + SCX_CALL_OP(SCX_KF_CPU_RELEASE, + cpu_release, cpu_of(rq), &args); + } + rq->scx.cpu_released = true; + } +} + +#ifdef CONFIG_SMP + +static bool test_and_clear_cpu_idle(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + /* + * SMT mask should be cleared whether we can claim @cpu or not. The SMT + * cluster is not wholly idle either way. This also prevents + * scx_pick_idle_cpu() from getting caught in an infinite loop. + */ + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + + /* + * If offline, @cpu is not its own sibling and + * scx_pick_idle_cpu() can get caught in an infinite loop as + * @cpu is never cleared from idle_masks.smt. Ensure that @cpu + * is eventually cleared. 
+ */ + if (cpumask_intersects(smt, idle_masks.smt)) + cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); + else if (cpumask_test_cpu(cpu, idle_masks.smt)) + __cpumask_clear_cpu(cpu, idle_masks.smt); + } +#endif + return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); +} + +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) +{ + int cpu; + +retry: + if (sched_smt_active()) { + cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); + if (cpu < nr_cpu_ids) + goto found; + + if (flags & SCX_PICK_IDLE_CORE) + return -EBUSY; + } + + cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); + if (cpu >= nr_cpu_ids) + return -EBUSY; + +found: + if (test_and_clear_cpu_idle(cpu)) + return cpu; + else + goto retry; +} + +static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, + u64 wake_flags, bool *found) +{ + s32 cpu; + + *found = false; + + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return prev_cpu; + } + + /* + * If WAKE_SYNC, the waker's local DSQ is empty, and the system is + * under utilized, wake up @p to the local DSQ of the waker. Checking + * only for an empty local DSQ is insufficient as it could give the + * wakee an unfair advantage when the system is oversaturated. + * Checking only for the presence of idle CPUs is also insufficient as + * the local DSQ of the waker could have tasks piled up on it even if + * there is an idle core elsewhere on the system. + */ + cpu = smp_processor_id(); + if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && + !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) && + cpu_rq(cpu)->scx.local_dsq.nr == 0) { + if (cpumask_test_cpu(cpu, p->cpus_ptr)) + goto cpu_found; + } + + if (p->nr_cpus_allowed == 1) { + if (test_and_clear_cpu_idle(prev_cpu)) { + cpu = prev_cpu; + goto cpu_found; + } else { + return prev_cpu; + } + } + + /* + * If CPU has SMT, any wholly idle CPU is likely a better pick than + * partially idle @prev_cpu. + */ + if (sched_smt_active()) { + if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && + test_and_clear_cpu_idle(prev_cpu)) { + cpu = prev_cpu; + goto cpu_found; + } + + cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto cpu_found; + } + + if (test_and_clear_cpu_idle(prev_cpu)) { + cpu = prev_cpu; + goto cpu_found; + } + + cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + goto cpu_found; + + return prev_cpu; + +cpu_found: + *found = true; + return cpu; +} + +__bpf_kfunc_start_defs(); + +__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, + u64 wake_flags, bool *found) +{ + if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) { + *found = false; + return prev_cpu; + } + + return scx_select_cpu_dfl(p, prev_cpu, wake_flags, found); +} + +__bpf_kfunc_end_defs(); + +static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) +{ + /* + * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it + * can be a good migration opportunity with low cache and memory + * footprint. Returning a CPU different than @prev_cpu triggers + * immediate rq migration. However, for SCX, as the current rq + * association doesn't dictate where the task is going to run, this + * doesn't fit well. If necessary, we can later add a dedicated method + * which can decide to preempt self to force it through the regular + * scheduling path. 
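+ *
+ * For illustration, a minimal ops.select_cpu() on the BPF side can simply
+ * wrap scx_bpf_select_cpu_dfl() above and direct-dispatch when an idle CPU
+ * was found, in the spirit of tools/sched_ext/scx_simple.bpf.c (the callback
+ * name is illustrative):
+ *
+ *	s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
+ *			   s32 prev_cpu, u64 wake_flags)
+ *	{
+ *		bool is_idle = false;
+ *		s32 cpu;
+ *
+ *		cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
+ *		if (is_idle)
+ *			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+ *		return cpu;
+ *	}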
+ */ + if (unlikely(wake_flags & WF_EXEC)) + return prev_cpu; + + if (SCX_HAS_OP(select_cpu)) { + s32 cpu; + struct task_struct **ddsp_taskp; + + ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); + WARN_ON_ONCE(*ddsp_taskp); + *ddsp_taskp = p; + + cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, + select_cpu, p, prev_cpu, wake_flags); + *ddsp_taskp = NULL; + if (ops_cpu_valid(cpu)) { + return cpu; + } else { + scx_ops_error("select_cpu returned invalid cpu %d", cpu); + return prev_cpu; + } + } else { + bool found; + s32 cpu; + + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); + if (found) { + p->scx.slice = SCX_SLICE_DFL; + p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; + } + return cpu; + } +} + +static void set_cpus_allowed_scx(struct task_struct *p, + struct affinity_context *ac) +{ + set_cpus_allowed_common(p, ac); + + /* + * The effective cpumask is stored in @p->cpus_ptr which may temporarily + * differ from the configured one in @p->cpus_mask. Always tell the bpf + * scheduler the effective one. + * + * Fine-grained memory write control is enforced by BPF making the const + * designation pointless. Cast it away when calling the operation. + */ + if (SCX_HAS_OP(set_cpumask)) + SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, + (struct cpumask *)p->cpus_ptr); +} + +static void reset_idle_masks(void) +{ + /* + * Consider all online cpus idle. Should converge to the actual state + * quickly. + */ + cpumask_copy(idle_masks.cpu, cpu_online_mask); + cpumask_copy(idle_masks.smt, cpu_online_mask); +} + +void __scx_update_idle(struct rq *rq, bool idle) +{ + int cpu = cpu_of(rq); + + if (SCX_HAS_OP(update_idle)) { + SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); + if (!static_branch_unlikely(&scx_builtin_idle_enabled)) + return; + } + + if (idle) + cpumask_set_cpu(cpu, idle_masks.cpu); + else + cpumask_clear_cpu(cpu, idle_masks.cpu); + +#ifdef CONFIG_SCHED_SMT + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + + if (idle) { + /* + * idle_masks.smt handling is racy but that's fine as + * it's only for optimization and self-correcting. 
+ */ + for_each_cpu(cpu, smt) { + if (!cpumask_test_cpu(cpu, idle_masks.cpu)) + return; + } + cpumask_or(idle_masks.smt, idle_masks.smt, smt); + } else { + cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); + } + } +#endif +} + +static void rq_online_scx(struct rq *rq, enum rq_onoff_reason reason) +{ + if (SCX_HAS_OP(cpu_online) && reason == RQ_ONOFF_HOTPLUG) + SCX_CALL_OP(SCX_KF_REST, cpu_online, cpu_of(rq)); +} + +static void rq_offline_scx(struct rq *rq, enum rq_onoff_reason reason) +{ + if (SCX_HAS_OP(cpu_offline) && reason == RQ_ONOFF_HOTPLUG) + SCX_CALL_OP(SCX_KF_REST, cpu_offline, cpu_of(rq)); +} + +#else /* !CONFIG_SMP */ + +static bool test_and_clear_cpu_idle(int cpu) { return false; } +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } +static void reset_idle_masks(void) {} + +#endif /* CONFIG_SMP */ + +static bool check_rq_for_timeouts(struct rq *rq) +{ + struct task_struct *p; + struct rq_flags rf; + bool timed_out = false; + + rq_lock_irqsave(rq, &rf); + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { + unsigned long last_runnable = p->scx.runnable_at; + + if (unlikely(time_after(jiffies, + last_runnable + scx_watchdog_timeout))) { + u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); + + scx_ops_error_kind(SCX_EXIT_ERROR_STALL, + "%s[%d] failed to run for %u.%03us", + p->comm, p->pid, + dur_ms / 1000, dur_ms % 1000); + timed_out = true; + break; + } + } + rq_unlock_irqrestore(rq, &rf); + + return timed_out; +} + +static void scx_watchdog_workfn(struct work_struct *work) +{ + int cpu; + + WRITE_ONCE(scx_watchdog_timestamp, jiffies); + + for_each_online_cpu(cpu) { + if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) + break; + + cond_resched(); + } + queue_delayed_work(system_unbound_wq, to_delayed_work(work), + scx_watchdog_timeout / 2); +} + +static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) +{ + update_curr_scx(rq); + + /* + * While disabling, always resched and refresh core-sched timestamp as + * we can't trust the slice management or ops.core_sched_before(). + */ + if (scx_ops_bypassing()) { + curr->scx.slice = 0; + touch_core_sched(rq, curr); + } + + if (!curr->scx.slice) + resched_curr(rq); +} + +#ifdef CONFIG_EXT_GROUP_SCHED +static struct cgroup *tg_cgrp(struct task_group *tg) +{ + /* + * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, + * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the + * root cgroup. 
+ */ + if (tg && tg->css.cgroup) + return tg->css.cgroup; + else + return &cgrp_dfl_root.cgrp; +} + +#define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), + +#else /* CONFIG_EXT_GROUP_SCHED */ + +#define SCX_INIT_TASK_ARGS_CGROUP(tg) + +#endif /* CONFIG_EXT_GROUP_SCHED */ + +static enum scx_task_state scx_get_task_state(const struct task_struct *p) +{ + return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; +} + +static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) +{ + enum scx_task_state prev_state = scx_get_task_state(p); + bool warn = false; + + BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS)); + + switch (state) { + case SCX_TASK_NONE: + break; + case SCX_TASK_INIT: + warn = prev_state != SCX_TASK_NONE; + break; + case SCX_TASK_READY: + warn = prev_state == SCX_TASK_NONE; + break; + case SCX_TASK_ENABLED: + warn = prev_state != SCX_TASK_READY; + break; + default: + warn = true; + return; + } + + WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]", + prev_state, state, p->comm, p->pid); + + p->scx.flags &= ~SCX_TASK_STATE_MASK; + p->scx.flags |= state << SCX_TASK_STATE_SHIFT; +} + +static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork) +{ + int ret; + + p->scx.disallow = false; + + if (SCX_HAS_OP(init_task)) { + struct scx_init_task_args args = { + SCX_INIT_TASK_ARGS_CGROUP(tg) + .fork = fork, + }; + + ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init_task, p, &args); + if (unlikely(ret)) { + ret = ops_sanitize_err("init_task", ret); + return ret; + } + } + + scx_set_task_state(p, SCX_TASK_INIT); + + if (p->scx.disallow) { + struct rq *rq; + struct rq_flags rf; + + rq = task_rq_lock(p, &rf); + + /* + * We're either in fork or load path and @p->policy will be + * applied right after. Reverting @p->policy here and rejecting + * %SCHED_EXT transitions from scx_check_setscheduler() + * guarantees that if ops.init_task() sets @p->disallow, @p can + * never be in SCX. + */ + if (p->policy == SCHED_EXT) { + p->policy = SCHED_NORMAL; + atomic_long_inc(&scx_nr_rejected); + } + + task_rq_unlock(rq, p, &rf); + } + + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; + return 0; +} + +static void set_task_scx_weight(struct task_struct *p) +{ + u32 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; + + p->scx.weight = sched_weight_to_cgroup(weight); +} + +static void scx_ops_enable_task(struct task_struct *p) +{ + lockdep_assert_rq_held(task_rq(p)); + + /* + * Set the weight before calling ops.enable() so that the scheduler + * doesn't see a stale value if they inspect the task struct. 
+ */ + set_task_scx_weight(p); + if (SCX_HAS_OP(enable)) + SCX_CALL_OP_TASK(SCX_KF_REST, enable, p); + scx_set_task_state(p, SCX_TASK_ENABLED); + + if (SCX_HAS_OP(set_weight)) + SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); +} + +static void scx_ops_disable_task(struct task_struct *p) +{ + lockdep_assert_rq_held(task_rq(p)); + WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); + + if (SCX_HAS_OP(disable)) + SCX_CALL_OP(SCX_KF_REST, disable, p); + scx_set_task_state(p, SCX_TASK_READY); +} + +static void scx_ops_exit_task(struct task_struct *p) +{ + struct scx_exit_task_args args = { + .cancelled = false, + }; + + lockdep_assert_rq_held(task_rq(p)); + switch (scx_get_task_state(p)) { + case SCX_TASK_NONE: + return; + case SCX_TASK_INIT: + args.cancelled = true; + break; + case SCX_TASK_READY: + break; + case SCX_TASK_ENABLED: + scx_ops_disable_task(p); + break; + default: + WARN_ON_ONCE(true); + return; + } + + if (SCX_HAS_OP(exit_task)) + SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args); + scx_set_task_state(p, SCX_TASK_NONE); +} + +void scx_pre_fork(struct task_struct *p) +{ + /* + * BPF scheduler enable/disable paths want to be able to iterate and + * update all tasks which can become complex when racing forks. As + * enable/disable are very cold paths, let's use a percpu_rwsem to + * exclude forks. + */ + percpu_down_read(&scx_fork_rwsem); +} + +int scx_fork(struct task_struct *p) +{ + percpu_rwsem_assert_held(&scx_fork_rwsem); + + if (scx_enabled()) + return scx_ops_init_task(p, task_group(p), true); + else + return 0; +} + +void scx_post_fork(struct task_struct *p) +{ + if (scx_enabled()) { + scx_set_task_state(p, SCX_TASK_READY); + /* + * Enable the task immediately if it's running on sched_ext. + * Otherwise, it'll be enabled in switching_to_scx() if and + * when it's ever configured to run with a SCHED_EXT policy. + */ + if (p->sched_class == &ext_sched_class) { + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + scx_ops_enable_task(p); + task_rq_unlock(rq, p, &rf); + } + } + + spin_lock_irq(&scx_tasks_lock); + list_add_tail(&p->scx.tasks_node, &scx_tasks); + spin_unlock_irq(&scx_tasks_lock); + + percpu_up_read(&scx_fork_rwsem); +} + +void scx_cancel_fork(struct task_struct *p) +{ + if (scx_enabled()) { + struct rq *rq; + struct rq_flags rf; + + rq = task_rq_lock(p, &rf); + WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); + scx_ops_exit_task(p); + task_rq_unlock(rq, p, &rf); + } + percpu_up_read(&scx_fork_rwsem); +} + +void sched_ext_free(struct task_struct *p) +{ + unsigned long flags; + + spin_lock_irqsave(&scx_tasks_lock, flags); + list_del_init(&p->scx.tasks_node); + spin_unlock_irqrestore(&scx_tasks_lock, flags); + + /* + * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY -> + * ENABLED transitions can't race us. Disable ops for @p. 
+ */ + if (scx_get_task_state(p) != SCX_TASK_NONE) { + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + scx_ops_exit_task(p); + task_rq_unlock(rq, p, &rf); + } +} + +static void reweight_task_scx(struct rq *rq, struct task_struct *p, int newprio) +{ + lockdep_assert_rq_held(task_rq(p)); + + set_task_scx_weight(p); + if (SCX_HAS_OP(set_weight)) + SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); +} + +static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) +{ +} + +static void switching_to_scx(struct rq *rq, struct task_struct *p) +{ + scx_ops_enable_task(p); + + /* + * set_cpus_allowed_scx() is not called while @p is associated with a + * different scheduler class. Keep the BPF scheduler up-to-date. + */ + if (SCX_HAS_OP(set_cpumask)) + SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, + (struct cpumask *)p->cpus_ptr); +} + +static void switched_from_scx(struct rq *rq, struct task_struct *p) +{ + scx_ops_disable_task(p); +} + +static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} +static void switched_to_scx(struct rq *rq, struct task_struct *p) {} + +int scx_check_setscheduler(struct task_struct *p, int policy) +{ + lockdep_assert_rq_held(task_rq(p)); + + /* if disallow, reject transitioning into SCX */ + if (scx_enabled() && READ_ONCE(p->scx.disallow) && + p->policy != policy && policy == SCHED_EXT) + return -EACCES; + + return 0; +} + +#ifdef CONFIG_NO_HZ_FULL +bool scx_can_stop_tick(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + if (scx_ops_bypassing()) + return false; + + if (p->sched_class != &ext_sched_class) + return true; + + /* + * @rq can dispatch from different DSQs, so we can't tell whether it + * needs the tick or not by looking at nr_running. Allow stopping ticks + * iff the BPF scheduler indicated so. See set_next_task_scx(). + */ + return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; +} +#endif + +#ifdef CONFIG_EXT_GROUP_SCHED + +DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); + +int scx_tg_online(struct task_group *tg) +{ + int ret = 0; + + WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); + + percpu_down_read(&scx_cgroup_rwsem); + + if (SCX_HAS_OP(cgroup_init)) { + struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + + ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_init, + tg->css.cgroup, &args); + if (!ret) + tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; + else + ret = ops_sanitize_err("cgroup_init", ret); + } else { + tg->scx_flags |= SCX_TG_ONLINE; + } + + percpu_up_read(&scx_cgroup_rwsem); + return ret; +} + +void scx_tg_offline(struct task_group *tg) +{ + WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); + + percpu_down_read(&scx_cgroup_rwsem); + + if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) + SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_exit, tg->css.cgroup); + tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); + + percpu_up_read(&scx_cgroup_rwsem); +} + +int scx_cgroup_can_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct task_struct *p; + int ret; + + /* released in scx_finish/cancel_attach() */ + percpu_down_read(&scx_cgroup_rwsem); + + if (!scx_enabled()) + return 0; + + cgroup_taskset_for_each(p, css, tset) { + struct cgroup *from = tg_cgrp(task_group(p)); + struct cgroup *to = tg_cgrp(css_tg(css)); + + WARN_ON_ONCE(p->scx.cgrp_moving_from); + + /* + * sched_move_task() omits identity migrations. Let's match the + * behavior so that ops.cgroup_prep_move() and ops.cgroup_move() + * always match one-to-one. 
+ */ + if (from == to) + continue; + + if (SCX_HAS_OP(cgroup_prep_move)) { + ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_prep_move, + p, from, css->cgroup); + if (ret) + goto err; + } + + p->scx.cgrp_moving_from = from; + } + + return 0; + +err: + cgroup_taskset_for_each(p, css, tset) { + if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) + SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_cancel_move, p, + p->scx.cgrp_moving_from, css->cgroup); + p->scx.cgrp_moving_from = NULL; + } + + percpu_up_read(&scx_cgroup_rwsem); + return ops_sanitize_err("cgroup_prep_move", ret); +} + +void scx_move_task(struct task_struct *p) +{ + /* + * We're called from sched_move_task() which handles both cgroup and + * autogroup moves. Ignore the latter. + * + * Also ignore exiting tasks, because in the exit path tasks transition + * from the autogroup to the root group, so task_group_is_autogroup() + * alone isn't able to catch exiting autogroup tasks. This is safe for + * cgroup_move(), because cgroup migrations never happen for PF_EXITING + * tasks. + */ + if (p->flags & PF_EXITING || task_group_is_autogroup(task_group(p))) + return; + + if (!scx_enabled()) + return; + + /* + * @p must have ops.cgroup_prep_move() called on it and thus + * cgrp_moving_from set. + */ + if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) + SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p, + p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); + p->scx.cgrp_moving_from = NULL; +} + +void scx_cgroup_finish_attach(void) +{ + percpu_up_read(&scx_cgroup_rwsem); +} + +void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct task_struct *p; + + if (!scx_enabled()) + goto out_unlock; + + cgroup_taskset_for_each(p, css, tset) { + if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) + SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_cancel_move, p, + p->scx.cgrp_moving_from, css->cgroup); + p->scx.cgrp_moving_from = NULL; + } +out_unlock: + percpu_up_read(&scx_cgroup_rwsem); +} + +void scx_group_set_weight(struct task_group *tg, unsigned long weight) +{ + percpu_down_read(&scx_cgroup_rwsem); + + if (tg->scx_weight != weight) { + if (SCX_HAS_OP(cgroup_set_weight)) + SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_set_weight, + tg_cgrp(tg), weight); + tg->scx_weight = weight; + } + + percpu_up_read(&scx_cgroup_rwsem); +} + +static void scx_cgroup_lock(void) +{ + percpu_down_write(&scx_cgroup_rwsem); +} + +static void scx_cgroup_unlock(void) +{ + percpu_up_write(&scx_cgroup_rwsem); +} + +#else /* CONFIG_EXT_GROUP_SCHED */ + +static inline void scx_cgroup_lock(void) {} +static inline void scx_cgroup_unlock(void) {} + +#endif /* CONFIG_EXT_GROUP_SCHED */ + +/* + * Omitted operations: + * + * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task + * isn't tied to the CPU at that point. Preemption is implemented by resetting + * the victim task's slice to 0 and triggering reschedule on the target CPU. + * + * - migrate_task_rq: Unncessary as task to cpu mapping is transient. + * + * - task_fork/dead: We need fork/dead notifications for all tasks regardless of + * their current sched_class. Call them directly from sched core instead. + * + * - task_woken: Unnecessary. 
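+ *
+ * For illustration, the preemption described under wakeup_preempt above is
+ * driven from the BPF side, e.g. (sketch; @victim and @victim_cpu stand for
+ * a task and CPU picked by the scheduler):
+ *
+ *	victim->scx.slice = 0;
+ *	scx_bpf_kick_cpu(victim_cpu, 0);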
+ */ +DEFINE_SCHED_CLASS(ext) = { + .enqueue_task = enqueue_task_scx, + .dequeue_task = dequeue_task_scx, + .yield_task = yield_task_scx, + .yield_to_task = yield_to_task_scx, + + .wakeup_preempt = wakeup_preempt_scx, + + .pick_next_task = pick_next_task_scx, + + .put_prev_task = put_prev_task_scx, + .set_next_task = set_next_task_scx, + +#ifdef CONFIG_SMP + .balance = balance_scx, + .select_task_rq = select_task_rq_scx, + .set_cpus_allowed = set_cpus_allowed_scx, + + .rq_online = rq_online_scx, + .rq_offline = rq_offline_scx, +#endif + +#ifdef CONFIG_SCHED_CORE + .pick_task = pick_task_scx, +#endif + + .task_tick = task_tick_scx, + + .switching_to = switching_to_scx, + .switched_from = switched_from_scx, + .switched_to = switched_to_scx, + .reweight_task = reweight_task_scx, + .prio_changed = prio_changed_scx, + + .update_curr = update_curr_scx, + +#ifdef CONFIG_UCLAMP_TASK + .uclamp_enabled = 0, +#endif +}; + +static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) +{ + memset(dsq, 0, sizeof(*dsq)); + + raw_spin_lock_init(&dsq->lock); + INIT_LIST_HEAD(&dsq->fifo); + dsq->id = dsq_id; +} + +static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) +{ + struct scx_dispatch_q *dsq; + int ret; + + if (dsq_id & SCX_DSQ_FLAG_BUILTIN) + return ERR_PTR(-EINVAL); + + dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); + if (!dsq) + return ERR_PTR(-ENOMEM); + + init_dsq(dsq, dsq_id); + + ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node, + dsq_hash_params); + if (ret) { + kfree(dsq); + return ERR_PTR(ret); + } + return dsq; +} + +static void free_dsq_irq_workfn(struct irq_work *irq_work) +{ + struct llist_node *to_free = llist_del_all(&dsqs_to_free); + struct scx_dispatch_q *dsq, *tmp_dsq; + + llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) + kfree_rcu(dsq, rcu); +} + +static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); + +static void destroy_dsq(u64 dsq_id) +{ + struct scx_dispatch_q *dsq; + unsigned long flags; + + rcu_read_lock(); + + dsq = rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); + if (!dsq) + goto out_unlock_rcu; + + raw_spin_lock_irqsave(&dsq->lock, flags); + + if (dsq->nr) { + scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)", + dsq->id, dsq->nr); + goto out_unlock_dsq; + } + + if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params)) + goto out_unlock_dsq; + + /* + * Mark dead by invalidating ->id to prevent dispatch_enqueue() from + * queueing more tasks. As this function can be called from anywhere, + * freeing is bounced through an irq work to avoid nesting RCU + * operations inside scheduler locks. + */ + dsq->id = SCX_DSQ_INVALID; + llist_add(&dsq->free_node, &dsqs_to_free); + irq_work_queue(&free_dsq_irq_work); + +out_unlock_dsq: + raw_spin_unlock_irqrestore(&dsq->lock, flags); +out_unlock_rcu: + rcu_read_unlock(); +} + +#ifdef CONFIG_EXT_GROUP_SCHED +static void scx_cgroup_exit(void) +{ + struct cgroup_subsys_state *css; + + percpu_rwsem_assert_held(&scx_cgroup_rwsem); + + /* + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * cgroups and exit all the inited ones, all online cgroups are exited. 
+ */ + rcu_read_lock(); + css_for_each_descendant_post(css, &root_task_group.css) { + struct task_group *tg = css_tg(css); + + if (!(tg->scx_flags & SCX_TG_INITED)) + continue; + tg->scx_flags &= ~SCX_TG_INITED; + + if (!scx_ops.cgroup_exit) + continue; + + if (WARN_ON_ONCE(!css_tryget(css))) + continue; + rcu_read_unlock(); + + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup); + + rcu_read_lock(); + css_put(css); + } + rcu_read_unlock(); +} + +static int scx_cgroup_init(void) +{ + struct cgroup_subsys_state *css; + int ret; + + percpu_rwsem_assert_held(&scx_cgroup_rwsem); + + /* + * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk + * cgroups and init, all online cgroups are initialized. + */ + rcu_read_lock(); + css_for_each_descendant_pre(css, &root_task_group.css) { + struct task_group *tg = css_tg(css); + struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + + if ((tg->scx_flags & + (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) + continue; + + if (!scx_ops.cgroup_init) { + tg->scx_flags |= SCX_TG_INITED; + continue; + } + + if (WARN_ON_ONCE(!css_tryget(css))) + continue; + rcu_read_unlock(); + + ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_init, + css->cgroup, &args); + if (ret) { + css_put(css); + return ret; + } + tg->scx_flags |= SCX_TG_INITED; + + rcu_read_lock(); + css_put(css); + } + rcu_read_unlock(); + + return 0; +} + +static void scx_cgroup_config_knobs(void) +{ + static DEFINE_MUTEX(cgintf_mutex); + DECLARE_BITMAP(mask, CPU_CFTYPE_CNT) = { }; + u64 knob_flags; + int i; + + /* + * Called from both class switch and ops enable/disable paths, + * synchronize internally. + */ + mutex_lock(&cgintf_mutex); + + /* if fair is in use, all knobs should be shown */ + if (!scx_switched_all()) { + bitmap_fill(mask, CPU_CFTYPE_CNT); + goto apply; + } + + /* + * On ext, only show the supported knobs. Otherwise, show all possible + * knobs so that configuration attempts succeed and the states are + * remembered while ops is not loaded. + */ + if (scx_enabled()) + knob_flags = scx_ops.flags; + else + knob_flags = SCX_OPS_ALL_FLAGS; + + if (knob_flags & SCX_OPS_CGROUP_KNOB_WEIGHT) { + __set_bit(CPU_CFTYPE_WEIGHT, mask); + __set_bit(CPU_CFTYPE_WEIGHT_NICE, mask); + } +apply: + for (i = 0; i < CPU_CFTYPE_CNT; i++) + cgroup_show_cftype(&cpu_cftypes[i], test_bit(i, mask)); + + mutex_unlock(&cgintf_mutex); +} + +#else +static void scx_cgroup_exit(void) {} +static int scx_cgroup_init(void) { return 0; } +static void scx_cgroup_config_knobs(void) {} +#endif + + +/******************************************************************************** + * Sysfs interface and ops enable/disable. 
+ */ + +#define SCX_ATTR(_name) \ + static struct kobj_attribute scx_attr_##_name = { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = scx_attr_##_name##_show, \ + } + +static ssize_t scx_attr_state_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%s\n", + scx_ops_enable_state_str[scx_ops_enable_state()]); +} +SCX_ATTR(state); + +static ssize_t scx_attr_switch_all_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all)); +} +SCX_ATTR(switch_all); + +static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected)); +} +SCX_ATTR(nr_rejected); + +static struct attribute *scx_global_attrs[] = { + &scx_attr_state.attr, + &scx_attr_switch_all.attr, + &scx_attr_nr_rejected.attr, + NULL, +}; + +static const struct attribute_group scx_global_attr_group = { + .attrs = scx_global_attrs, +}; + +static void scx_kobj_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static ssize_t scx_attr_ops_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%s\n", scx_ops.name); +} +SCX_ATTR(ops); + +static struct attribute *scx_sched_attrs[] = { + &scx_attr_ops.attr, + NULL, +}; +ATTRIBUTE_GROUPS(scx_sched); + +static const struct kobj_type scx_ktype = { + .release = scx_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = scx_sched_groups, +}; + +static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) +{ + return add_uevent_var(env, "SCXOPS=%s", scx_ops.name); +} + +static const struct kset_uevent_ops scx_uevent_ops = { + .uevent = scx_uevent, +}; + +/* + * Used by sched_fork() and __setscheduler_prio() to pick the matching + * sched_class. dl/rt are already handled. + */ +bool task_should_scx(struct task_struct *p) +{ + if (!scx_enabled() || + unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING)) + return false; + if (READ_ONCE(scx_switching_all)) + return true; + return p->policy == SCHED_EXT; +} + +/** + * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress + * + * Bypassing guarantees that all runnable tasks make forward progress without + * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might + * be held by tasks that the BPF scheduler is forgetting to run, which + * unfortunately also excludes toggling the static branches. + * + * Let's work around by overriding a couple ops and modifying behaviors based on + * the DISABLING state and then cycling the queued tasks through dequeue/enqueue + * to force global FIFO scheduling. + * + * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. + * + * b. ops.dispatch() is ignored. + * + * c. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be + * trusted. Whenever a tick triggers, the running task is rotated to the tail + * of the queue with core_sched_at touched. + * + * d. pick_next_task() suppresses zero slice warning. + * + * e. scx_prio_less() reverts to the default core_sched_at order. + * + * f. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * operations. 
+ */ +static void scx_ops_bypass(bool bypass) +{ + int depth, cpu; + + if (bypass) { + depth = atomic_inc_return(&scx_ops_bypass_depth); + WARN_ON_ONCE(depth <= 0); + if (depth != 1) + return; + } else { + depth = atomic_dec_return(&scx_ops_bypass_depth); + WARN_ON_ONCE(depth < 0); + if (depth != 0) + return; + } + + mutex_lock(&scx_ops_enable_mutex); + if (!scx_enabled()) + goto out_unlock; + + /* + * No task property is changing. We just need to make sure all currently + * queued tasks are re-queued according to the new scx_ops_bypassing() + * state. As an optimization, walk each rq's runnable_list instead of + * the scx_tasks list. + * + * This function can't trust the scheduler and thus can't use + * cpus_read_lock(). Walk all possible CPUs instead of online. + */ + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + struct task_struct *p, *n; + + rq_lock_irqsave(rq, &rf); + + /* + * The use of list_for_each_entry_safe_reverse() is required + * because each task is going to be removed from and added back + * to the runnable_list during iteration. Because they're added + * to the tail of the list, safe reverse iteration can still + * visit all nodes. + */ + list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, + scx.runnable_node) { + struct sched_enq_and_set_ctx ctx; + + /* cycling deq/enq is enough, see the function comment */ + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + sched_enq_and_set_task(&ctx); + } + + rq_unlock_irqrestore(rq, &rf); + + /* kick to restore ticks */ + resched_cpu(cpu); + } + +out_unlock: + mutex_unlock(&scx_ops_enable_mutex); +} + +static void free_exit_info(struct scx_exit_info *ei) +{ + kfree(ei->dump); + kfree(ei->msg); + kfree(ei->bt); + kfree(ei); +} + +static struct scx_exit_info *alloc_exit_info(void) +{ + struct scx_exit_info *ei; + + ei = kzalloc(sizeof(*ei), GFP_KERNEL); + if (!ei) + return NULL; + + ei->bt = kcalloc(sizeof(ei->bt[0]), SCX_EXIT_BT_LEN, GFP_KERNEL); + ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); + ei->dump = kzalloc(SCX_EXIT_DUMP_LEN, GFP_KERNEL); + + if (!ei->bt || !ei->msg || !ei->dump) { + free_exit_info(ei); + return NULL; + } + + return ei; +} + +static const char *scx_exit_reason(enum scx_exit_kind kind) +{ + switch (kind) { + case SCX_EXIT_UNREG: + return "BPF scheduler unregistered"; + case SCX_EXIT_SYSRQ: + return "disabled by sysrq-S"; + case SCX_EXIT_ERROR: + return "runtime error"; + case SCX_EXIT_ERROR_BPF: + return "scx_bpf_error"; + case SCX_EXIT_ERROR_STALL: + return "runnable task stall"; + default: + return ""; + } +} + +static void scx_ops_disable_workfn(struct kthread_work *work) +{ + struct scx_exit_info *ei = scx_exit_info; + struct scx_task_iter sti; + struct task_struct *p; + struct rhashtable_iter rht_iter; + struct scx_dispatch_q *dsq; + int i, kind; + + kind = atomic_read(&scx_exit_kind); + while (true) { + /* + * NONE indicates that a new scx_ops has been registered since + * disable was scheduled - don't kill the new ops. DONE + * indicates that the ops has already been disabled. 
+ */ + if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE) + return; + if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE)) + break; + } + ei->kind = kind; + ei->reason = scx_exit_reason(ei->kind); + + /* guarantee forward progress by bypassing scx_ops */ + scx_ops_bypass(true); + + switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) { + case SCX_OPS_DISABLING: + WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); + break; + case SCX_OPS_DISABLED: + pr_warn("sched_ext: ops error detected without ops (%s)\n", + scx_exit_info->msg); + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != + SCX_OPS_DISABLING); + goto done; + default: + break; + } + + /* + * Here, every runnable task is guaranteed to make forward progress and + * we can safely use blocking synchronization constructs. Actually + * disable ops. + */ + mutex_lock(&scx_ops_enable_mutex); + + static_branch_disable(&__scx_switched_all); + WRITE_ONCE(scx_switching_all, false); + + /* + * Avoid racing against fork and cgroup changes. See scx_ops_enable() + * for explanation on the locking order. + */ + percpu_down_write(&scx_fork_rwsem); + cpus_read_lock(); + scx_cgroup_lock(); + + spin_lock_irq(&scx_tasks_lock); + scx_task_iter_init(&sti); + while ((p = scx_task_iter_next_filtered_locked(&sti))) { + const struct sched_class *old_class = p->sched_class; + struct sched_enq_and_set_ctx ctx; + + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + + p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); + __setscheduler_prio(p, p->prio); + check_class_changing(task_rq(p), p, old_class); + + sched_enq_and_set_task(&ctx); + + check_class_changed(task_rq(p), p, old_class, p->prio); + scx_ops_exit_task(p); + } + scx_task_iter_exit(&sti); + spin_unlock_irq(&scx_tasks_lock); + + /* no task is on scx, turn off all the switches and flush in-progress calls */ + static_branch_disable_cpuslocked(&__scx_ops_enabled); + for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) + static_branch_disable_cpuslocked(&scx_has_op[i]); + static_branch_disable_cpuslocked(&scx_ops_enq_last); + static_branch_disable_cpuslocked(&scx_ops_enq_exiting); + static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); + static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + synchronize_rcu(); + + scx_cgroup_exit(); + + scx_cgroup_unlock(); + cpus_read_unlock(); + percpu_up_write(&scx_fork_rwsem); + + if (ei->kind >= SCX_EXIT_ERROR) { + printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name); + + if (ei->msg[0] == '\0') + printk(KERN_ERR "sched_ext: %s\n", ei->reason); + else + printk(KERN_ERR "sched_ext: %s (%s)\n", ei->reason, ei->msg); + + stack_trace_print(ei->bt, ei->bt_len, 2); + } + + if (scx_ops.exit) + SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); + + cancel_delayed_work_sync(&scx_watchdog_work); + /* + * Delete the kobject from the hierarchy eagerly in addition to just + * dropping a reference. Otherwise, if the object is deleted + * asynchronously, sysfs could observe an object of the same name still + * in the hierarchy when another scheduler is loaded. 
+ */ + kobject_del(scx_root_kobj); + kobject_put(scx_root_kobj); + scx_root_kobj = NULL; + + memset(&scx_ops, 0, sizeof(scx_ops)); + + rhashtable_walk_enter(&dsq_hash, &rht_iter); + do { + rhashtable_walk_start(&rht_iter); + + while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) + destroy_dsq(dsq->id); + + rhashtable_walk_stop(&rht_iter); + } while (dsq == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&rht_iter); + + free_percpu(scx_dsp_buf); + scx_dsp_buf = NULL; + scx_dsp_max_batch = 0; + + free_exit_info(scx_exit_info); + scx_exit_info = NULL; + + mutex_unlock(&scx_ops_enable_mutex); + + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != + SCX_OPS_DISABLING); + + scx_cgroup_config_knobs(); +done: + scx_ops_bypass(false); +} + +static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); + +static void schedule_scx_ops_disable_work(void) +{ + struct kthread_worker *helper = READ_ONCE(scx_ops_helper); + + /* + * We may be called spuriously before the first bpf_sched_ext_reg(). If + * scx_ops_helper isn't set up yet, there's nothing to do. + */ + if (helper) + kthread_queue_work(helper, &scx_ops_disable_work); +} + +static void scx_ops_disable(enum scx_exit_kind kind) +{ + int none = SCX_EXIT_NONE; + + if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) + kind = SCX_EXIT_ERROR; + + atomic_try_cmpxchg(&scx_exit_kind, &none, kind); + + schedule_scx_ops_disable_work(); +} + +static void scx_dump_task(struct seq_buf *s, struct task_struct *p, char marker, + unsigned long now) +{ + static unsigned long bt[SCX_EXIT_BT_LEN]; + char dsq_id_buf[19] = "(n/a)"; + unsigned long ops_state = atomic_long_read(&p->scx.ops_state); + unsigned int bt_len; + size_t avail, used; + char *buf; + + if (p->scx.dsq) + scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", + (unsigned long long)p->scx.dsq->id); + + seq_buf_printf(s, "\n %c%c %s[%d] %+ldms\n", + marker, task_state_to_char(p), p->comm, p->pid, + jiffies_delta_msecs(p->scx.runnable_at, now)); + seq_buf_printf(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu\n", + scx_get_task_state(p), + p->scx.flags & ~SCX_TASK_STATE_MASK, + p->scx.dsq_flags, + ops_state & SCX_OPSS_STATE_MASK, + ops_state >> SCX_OPSS_QSEQ_SHIFT); + seq_buf_printf(s, " sticky/holding_cpu=%d/%d dsq_id=%s\n", + p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); + seq_buf_printf(s, " cpus=%*pb\n\n", cpumask_pr_args(p->cpus_ptr)); + + bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); + + avail = seq_buf_get_buf(s, &buf); + used = stack_trace_snprint(buf, avail, bt, bt_len, 3); + seq_buf_commit(s, used < avail ? used : -1); +} + +static void scx_dump_state(struct scx_exit_info *ei) +{ + const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; + unsigned long now = jiffies; + struct seq_buf s; + size_t avail, used; + char *buf; + int cpu; + + seq_buf_init(&s, ei->dump, SCX_EXIT_DUMP_LEN - sizeof(trunc_marker)); + + seq_buf_printf(&s, "%s[%d] triggered exit kind %d:\n %s (%s)\n\n", + current->comm, current->pid, ei->kind, ei->reason, ei->msg); + seq_buf_printf(&s, "Backtrace:\n"); + avail = seq_buf_get_buf(&s, &buf); + used = stack_trace_snprint(buf, avail, ei->bt, ei->bt_len, 1); + seq_buf_commit(&s, used < avail ? 
used : -1); + + seq_buf_printf(&s, "\nRunqueue states\n"); + seq_buf_printf(&s, "---------------\n"); + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + struct task_struct *p; + + rq_lock(rq, &rf); + + if (list_empty(&rq->scx.runnable_list) && + rq->curr->sched_class == &idle_sched_class) + goto next; + + seq_buf_printf(&s, "\nCPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu\n", + cpu, rq->scx.nr_running, rq->scx.flags, + rq->scx.cpu_released, rq->scx.ops_qseq, + rq->scx.pnt_seq); + seq_buf_printf(&s, " curr=%s[%d] class=%ps\n", + rq->curr->comm, rq->curr->pid, + rq->curr->sched_class); + if (!cpumask_empty(rq->scx.cpus_to_kick)) + seq_buf_printf(&s, " cpus_to_kick : %*pb\n", + cpumask_pr_args(rq->scx.cpus_to_kick)); + if (!cpumask_empty(rq->scx.cpus_to_preempt)) + seq_buf_printf(&s, " cpus_to_preempt: %*pb\n", + cpumask_pr_args(rq->scx.cpus_to_preempt)); + if (!cpumask_empty(rq->scx.cpus_to_wait)) + seq_buf_printf(&s, " cpus_to_wait : %*pb\n", + cpumask_pr_args(rq->scx.cpus_to_wait)); + + if (rq->curr->sched_class == &ext_sched_class) + scx_dump_task(&s, rq->curr, '*', now); + + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) + scx_dump_task(&s, p, ' ', now); + next: + rq_unlock(rq, &rf); + } + + if (seq_buf_has_overflowed(&s)) { + used = strlen(seq_buf_str(&s)); + memcpy(ei->dump + used, trunc_marker, sizeof(trunc_marker)); + } +} + +static void scx_ops_error_irq_workfn(struct irq_work *irq_work) +{ + scx_dump_state(scx_exit_info); + schedule_scx_ops_disable_work(); +} + +static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); + +__printf(2, 3) void scx_ops_error_kind(enum scx_exit_kind kind, + const char *fmt, ...) +{ + struct scx_exit_info *ei = scx_exit_info; + int none = SCX_EXIT_NONE; + va_list args; + + if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind)) + return; + + ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); + + va_start(args, fmt); + vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); + va_end(args); + + /* + * Set ei->kind and ->reason for scx_dump_state(). They'll be set again + * in scx_ops_disable_workfn(). + */ + ei->kind = kind; + ei->reason = scx_exit_reason(ei->kind); + + irq_work_queue(&scx_ops_error_irq_work); +} + +static struct kthread_worker *scx_create_rt_helper(const char *name) +{ + struct kthread_worker *helper; + + helper = kthread_create_worker(0, name); + if (helper) + sched_set_fifo(helper->task); + return helper; +} + +static int validate_ops(const struct sched_ext_ops *ops) +{ + /* + * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the + * ops.enqueue() callback isn't implemented. 
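For reference, the flag checked here is something a BPF scheduler sets from its ops struct. The following is an illustrative sketch, not part of the patch, assuming vmlinux.h plus the scx headers added under tools/sched_ext/include/scx in this series and the usual libbpf struct_ops conventions; the names minimal_enqueue, minimal_ops and the "minimal" scheduler name are made up for the example.

/* a valid combination: SCX_OPS_ENQ_LAST together with an enqueue callback */
SEC("struct_ops/enqueue")
void BPF_PROG(minimal_enqueue, struct task_struct *p, u64 enq_flags)
{
	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}

SEC(".struct_ops.link")
struct sched_ext_ops minimal_ops = {
	.enqueue	= (void *)minimal_enqueue,
	.flags		= SCX_OPS_ENQ_LAST,
	.name		= "minimal",
};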
+ */ + if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { + scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); + return -EINVAL; + } + + return 0; +} + +static int scx_ops_enable(struct sched_ext_ops *ops) +{ + struct scx_task_iter sti; + struct task_struct *p; + unsigned long timeout; + int i, ret; + + mutex_lock(&scx_ops_enable_mutex); + + if (!scx_ops_helper) { + WRITE_ONCE(scx_ops_helper, + scx_create_rt_helper("sched_ext_ops_helper")); + if (!scx_ops_helper) { + ret = -ENOMEM; + goto err_unlock; + } + } + + if (scx_ops_enable_state() != SCX_OPS_DISABLED) { + ret = -EBUSY; + goto err_unlock; + } + + scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL); + if (!scx_root_kobj) { + ret = -ENOMEM; + goto err_unlock; + } + + scx_root_kobj->kset = scx_kset; + ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root"); + if (ret < 0) + goto err; + + scx_exit_info = alloc_exit_info(); + if (!scx_exit_info) { + ret = -ENOMEM; + goto err_del; + } + + /* + * Set scx_ops, transition to PREPPING and clear exit info to arm the + * disable path. Failure triggers full disabling from here on. + */ + scx_ops = *ops; + + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != + SCX_OPS_DISABLED); + + atomic_set(&scx_exit_kind, SCX_EXIT_NONE); + scx_warned_zero_slice = false; + + atomic_long_set(&scx_nr_rejected, 0); + + /* + * Keep CPUs stable during enable so that the BPF scheduler can track + * online CPUs by watching ->on/offline_cpu() after ->init(). + */ + cpus_read_lock(); + + scx_switch_all_req = false; + if (scx_ops.init) { + ret = SCX_CALL_OP_RET(SCX_KF_INIT, init); + if (ret) { + ret = ops_sanitize_err("init", ret); + goto err_disable_unlock_cpus; + } + + /* + * Exit early if ops.init() triggered scx_bpf_error(). Not + * strictly necessary as we'll fail transitioning into ENABLING + * later but that'd be after calling ops.init_task() on all + * tasks and with -EBUSY which isn't very intuitive. Let's exit + * early with success so that the condition is notified through + * ops.exit() like other scx_bpf_error() invocations. + */ + if (atomic_read(&scx_exit_kind) != SCX_EXIT_NONE) + goto err_disable_unlock_cpus; + } + + for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) + if (((void (**)(void))ops)[i]) + static_branch_enable_cpuslocked(&scx_has_op[i]); + + cpus_read_unlock(); + + ret = validate_ops(ops); + if (ret) + goto err_disable; + + WARN_ON_ONCE(scx_dsp_buf); + scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; + scx_dsp_buf = __alloc_percpu(sizeof(scx_dsp_buf[0]) * scx_dsp_max_batch, + __alignof__(scx_dsp_buf[0])); + if (!scx_dsp_buf) { + ret = -ENOMEM; + goto err_disable; + } + + if (ops->timeout_ms) + timeout = msecs_to_jiffies(ops->timeout_ms); + else + timeout = SCX_WATCHDOG_MAX_TIMEOUT; + + WRITE_ONCE(scx_watchdog_timeout, timeout); + WRITE_ONCE(scx_watchdog_timestamp, jiffies); + queue_delayed_work(system_unbound_wq, &scx_watchdog_work, + scx_watchdog_timeout / 2); + + /* + * Lock out forks, cgroup on/offlining and moves before opening the + * floodgate so that they don't wander into the operations prematurely. + * + * We don't need to keep the CPUs stable but static_branch_*() requires + * cpus_read_lock() and scx_cgroup_rwsem must nest inside + * cpu_hotplug_lock because of the following dependency chain: + * + * cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem + * + * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use + * static_branch_*_cpuslocked(). 
+ * + * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the + * following dependency chain: + * + * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock + */ + percpu_down_write(&scx_fork_rwsem); + cpus_read_lock(); + scx_cgroup_lock(); + + for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) + if (((void (**)(void))ops)[i]) + static_branch_enable_cpuslocked(&scx_has_op[i]); + + if (ops->flags & SCX_OPS_ENQ_LAST) + static_branch_enable_cpuslocked(&scx_ops_enq_last); + + if (ops->flags & SCX_OPS_ENQ_EXITING) + static_branch_enable_cpuslocked(&scx_ops_enq_exiting); + if (scx_ops.cpu_acquire || scx_ops.cpu_release) + static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); + + if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { + reset_idle_masks(); + static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); + } else { + static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + } + + /* + * All cgroups should be initialized before letting in tasks. cgroup + * on/offlining and task migrations are already locked out. + */ + ret = scx_cgroup_init(); + if (ret) + goto err_disable_unlock_all; + + static_branch_enable_cpuslocked(&__scx_ops_enabled); + + /* + * Enable ops for every task. Fork is excluded by scx_fork_rwsem + * preventing new tasks from being added. No need to exclude tasks + * leaving as sched_ext_free() can handle both prepped and enabled + * tasks. Prep all tasks first and then enable them with preemption + * disabled. + */ + spin_lock_irq(&scx_tasks_lock); + + scx_task_iter_init(&sti); + while ((p = scx_task_iter_next_filtered(&sti))) { + get_task_struct(p); + spin_unlock_irq(&scx_tasks_lock); + + ret = scx_ops_init_task(p, task_group(p), false); + if (ret) { + put_task_struct(p); + spin_lock_irq(&scx_tasks_lock); + scx_task_iter_exit(&sti); + spin_unlock_irq(&scx_tasks_lock); + pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", + ret, p->comm, p->pid); + goto err_disable_unlock_all; + } + + put_task_struct(p); + spin_lock_irq(&scx_tasks_lock); + } + scx_task_iter_exit(&sti); + + /* + * All tasks are prepped but are still ops-disabled. Ensure that + * %current can't be scheduled out and switch everyone. + * preempt_disable() is necessary because we can't guarantee that + * %current won't be starved if scheduled out while switching. + */ + preempt_disable(); + + /* + * From here on, the disable path must assume that tasks have ops + * enabled and need to be recovered. + */ + if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) { + preempt_enable(); + spin_unlock_irq(&scx_tasks_lock); + ret = -EBUSY; + goto err_disable_unlock_all; + } + + /* + * We're fully committed and can't fail. The PREPPED -> ENABLED + * transitions here are synchronized against sched_ext_free() through + * scx_tasks_lock. 
+ */ + WRITE_ONCE(scx_switching_all, scx_switch_all_req); + + scx_task_iter_init(&sti); + while ((p = scx_task_iter_next_filtered_locked(&sti))) { + const struct sched_class *old_class = p->sched_class; + struct sched_enq_and_set_ctx ctx; + + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + + scx_set_task_state(p, SCX_TASK_READY); + __setscheduler_prio(p, p->prio); + check_class_changing(task_rq(p), p, old_class); + + sched_enq_and_set_task(&ctx); + + check_class_changed(task_rq(p), p, old_class, p->prio); + } + scx_task_iter_exit(&sti); + + spin_unlock_irq(&scx_tasks_lock); + preempt_enable(); + scx_cgroup_unlock(); + cpus_read_unlock(); + percpu_up_write(&scx_fork_rwsem); + + if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { + ret = -EBUSY; + goto err_disable; + } + + if (scx_switch_all_req) + static_branch_enable(&__scx_switched_all); + + kobject_uevent(scx_root_kobj, KOBJ_ADD); + mutex_unlock(&scx_ops_enable_mutex); + + scx_cgroup_config_knobs(); + + return 0; + +err_del: + kobject_del(scx_root_kobj); +err: + kobject_put(scx_root_kobj); + scx_root_kobj = NULL; + if (scx_exit_info) { + free_exit_info(scx_exit_info); + scx_exit_info = NULL; + } +err_unlock: + mutex_unlock(&scx_ops_enable_mutex); + return ret; + +err_disable_unlock_all: + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); +err_disable_unlock_cpus: + cpus_read_unlock(); +err_disable: + mutex_unlock(&scx_ops_enable_mutex); + /* must be fully disabled before returning */ + scx_ops_disable(SCX_EXIT_ERROR); + kthread_flush_work(&scx_ops_disable_work); + return ret; +} + + +/******************************************************************************** + * bpf_struct_ops plumbing. + */ +#include +#include + +extern struct btf *btf_vmlinux; +static const struct btf_type *task_struct_type; +static u32 task_struct_type_id; + +/* Make the 2nd argument of .dispatch a pointer that can be NULL. */ +static bool promote_dispatch_2nd_arg(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const struct bpf_struct_ops *st_ops; + const struct btf_member *member; + const struct btf_type *t; + u32 btf_id, member_idx; + const char *mname; + + /* btf_id should be the type id of struct sched_ext_ops */ + btf_id = prog->aux->attach_btf_id; + st_ops = bpf_struct_ops_find(btf_id); + if (!st_ops) + return false; + + /* BTF type of struct sched_ext_ops */ + t = st_ops->type; + + member_idx = prog->expected_attach_type; + if (member_idx >= btf_type_vlen(t)) + return false; + + /* + * Get the member name of this struct_ops program, which corresponds to + * a field in struct sched_ext_ops. For example, the member name of the + * dispatch struct_ops program (callback) is "dispatch". + */ + member = &btf_type_member(t)[member_idx]; + mname = btf_name_by_offset(btf_vmlinux, member->name_off); + + /* + * Check if it is the second argument of the function pointer at + * "dispatch" in struct sched_ext_ops. The arguments of struct_ops + * operators are sequential and 64-bit, so the second argument is at + * offset sizeof(__u64). + */ + if (strcmp(mname, "dispatch") == 0 && + off == sizeof(__u64)) { + /* + * The value is a pointer to a type (struct task_struct) given + * by a BTF ID (PTR_TO_BTF_ID). It is trusted (PTR_TRUSTED), + * however, can be a NULL (PTR_MAYBE_NULL). The BPF program + * should check the pointer to make sure it is not NULL before + * using it, or the verifier will reject the program. 
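In practice this means an ops.dispatch() program has to test the previous-task pointer before dereferencing it. A rough sketch, assuming vmlinux.h plus the scx common headers from tools/sched_ext for the kfunc and flag declarations; the function name is illustrative only.

SEC("struct_ops/dispatch")
void BPF_PROG(example_dispatch, s32 cpu, struct task_struct *prev)
{
	/* @prev may be NULL; the verifier only allows loads after this check */
	if (prev && (prev->scx.flags & SCX_TASK_QUEUED))
		scx_bpf_dispatch(prev, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
}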
+ * + * Longer term, this is something that should be addressed by + * BTF, and be fully contained within the verifier. + */ + info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | + PTR_TRUSTED; + info->btf = btf_vmlinux; + info->btf_id = task_struct_type_id; + + return true; + } + + return false; +} + +static bool bpf_scx_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (type != BPF_READ) + return false; + if (promote_dispatch_2nd_arg(off, size, type, prog, info)) + return true; + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) + return false; + if (off % size != 0) + return false; + + return btf_ctx_access(off, size, type, prog, info); +} + +static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, int off, + int size) +{ + const struct btf_type *t; + + t = btf_type_by_id(reg->btf, reg->btf_id); + if (t == task_struct_type) { + if (off >= offsetof(struct task_struct, scx.slice) && + off + size <= offsetofend(struct task_struct, scx.slice)) + return SCALAR_VALUE; + if (off >= offsetof(struct task_struct, scx.dsq_vtime) && + off + size <= offsetofend(struct task_struct, scx.dsq_vtime)) + return SCALAR_VALUE; + if (off >= offsetof(struct task_struct, scx.disallow) && + off + size <= offsetofend(struct task_struct, scx.disallow)) + return SCALAR_VALUE; + } + + return -EACCES; +} + +static const struct bpf_func_proto * +bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_task_storage_get: + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + return &bpf_task_storage_delete_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +const struct bpf_verifier_ops bpf_scx_verifier_ops = { + .get_func_proto = bpf_scx_get_func_proto, + .is_valid_access = bpf_scx_is_valid_access, + .btf_struct_access = bpf_scx_btf_struct_access, +}; + +static int bpf_scx_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct sched_ext_ops *uops = udata; + struct sched_ext_ops *ops = kdata; + u32 moff = __btf_member_bit_offset(t, member) / 8; + int ret; + + switch (moff) { + case offsetof(struct sched_ext_ops, dispatch_max_batch): + if (*(u32 *)(udata + moff) > INT_MAX) + return -E2BIG; + ops->dispatch_max_batch = *(u32 *)(udata + moff); + return 1; + case offsetof(struct sched_ext_ops, flags): + if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) + return -EINVAL; + ops->flags = *(u64 *)(udata + moff); + return 1; + case offsetof(struct sched_ext_ops, name): + ret = bpf_obj_name_cpy(ops->name, uops->name, + sizeof(ops->name)); + if (ret < 0) + return ret; + if (ret == 0) + return -EINVAL; + return 1; + case offsetof(struct sched_ext_ops, timeout_ms): + if (msecs_to_jiffies(*(u32 *)(udata + moff)) > + SCX_WATCHDOG_MAX_TIMEOUT) + return -E2BIG; + ops->timeout_ms = *(u32 *)(udata + moff); + return 1; + } + + return 0; +} + +static int bpf_scx_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + u32 moff = __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct sched_ext_ops, init_task): +#ifdef CONFIG_EXT_GROUP_SCHED + case offsetof(struct sched_ext_ops, cgroup_init): + case offsetof(struct sched_ext_ops, cgroup_exit): + case offsetof(struct sched_ext_ops, cgroup_prep_move): +#endif + case offsetof(struct sched_ext_ops, init): + case 
offsetof(struct sched_ext_ops, exit): + break; + default: + if (prog->aux->sleepable) + return -EINVAL; + } + + return 0; +} + +static int bpf_scx_reg(void *kdata) +{ + return scx_ops_enable(kdata); +} + +static void bpf_scx_unreg(void *kdata) +{ + scx_ops_disable(SCX_EXIT_UNREG); + kthread_flush_work(&scx_ops_disable_work); +} + +static int bpf_scx_init(struct btf *btf) +{ + u32 type_id; + + type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + task_struct_type = btf_type_by_id(btf, type_id); + task_struct_type_id = type_id; + + return 0; +} + +static int bpf_scx_update(void *kdata, void *old_kdata) +{ + /* + * sched_ext does not support updating the actively-loaded BPF + * scheduler, as registering a BPF scheduler can always fail if the + * scheduler returns an error code for e.g. ops.init(), + * ops.init_task(), etc. Similarly, we can always race with + * unregistration happening elsewhere, such as with sysrq. + */ + return -EOPNOTSUPP; +} + +static int bpf_scx_validate(void *kdata) +{ + return 0; +} + +/* "extern" to avoid sparse warning, only used in this file */ +extern struct bpf_struct_ops bpf_sched_ext_ops; + +static s32 select_cpu_stub(struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + return -EINVAL; +} + +static void enqueue_stub(struct task_struct *p, u64 enq_flags) +{} + +static void dequeue_stub(struct task_struct *p, u64 enq_flags) +{} + +static void dispatch_stub(s32 prev_cpu, struct task_struct *p) +{} + +static void runnable_stub(struct task_struct *p, u64 enq_flags) +{} + +static void running_stub(struct task_struct *p) +{} + +static void stopping_stub(struct task_struct *p, bool runnable) +{} + +static void quiescent_stub(struct task_struct *p, u64 deq_flags) +{} + +static bool yield_stub(struct task_struct *from, struct task_struct *to) +{ + return false; +} + +static bool core_sched_before_stub(struct task_struct *a, struct task_struct *b) +{ + return false; +} + +static void set_weight_stub(struct task_struct *p, u32 weight) +{} + +static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) +{} + +static void update_idle_stub(s32 cpu, bool idle) +{} + +static void cpu_acquire_stub(s32 cpu, struct scx_cpu_acquire_args *args) +{} + +static void cpu_release_stub(s32 cpu, struct scx_cpu_release_args *args) +{} + +static s32 init_task_stub(struct task_struct *p, + struct scx_init_task_args *args) +{ + return -EINVAL; +} + + +static void exit_task_stub(struct task_struct *p, + struct scx_exit_task_args *args) +{} + +static void enable_stub(struct task_struct *p) +{} + +static void disable_stub(struct task_struct *p) +{} + +#ifdef CONFIG_EXT_GROUP_SCHED +static s32 cgroup_init_stub(struct cgroup *cgrp, + struct scx_cgroup_init_args *args) +{ + return -EINVAL; +} + +static void cgroup_exit_stub(struct cgroup *cgrp) +{} + +static s32 cgroup_prep_move_stub(struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{ + return -EINVAL; +} + +static void cgroup_move_stub(struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{} + +static void cgroup_cancel_move_stub(struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{} + +static void cgroup_set_weight_stub(struct cgroup *cgrp, u32 weight) +{} +#endif + +static void cpu_online_stub(s32 cpu) +{} + +static void cpu_offline_stub(s32 cpu) +{} + +static s32 init_stub(void) +{ + return -EINVAL; +} + +static void exit_stub(struct scx_exit_info *info) +{} + +static struct sched_ext_ops __bpf_ops_sched_ext_ops = { + 
.select_cpu = select_cpu_stub, + .enqueue = enqueue_stub, + .dequeue = dequeue_stub, + .dispatch = dispatch_stub, + .runnable = runnable_stub, + .running = running_stub, + .stopping = stopping_stub, + .quiescent = quiescent_stub, + .yield = yield_stub, + .core_sched_before = core_sched_before_stub, + .set_weight = set_weight_stub, + .set_cpumask = set_cpumask_stub, + .update_idle = update_idle_stub, + .cpu_acquire = cpu_acquire_stub, + .cpu_release = cpu_release_stub, + .init_task = init_task_stub, + .exit_task = exit_task_stub, + .enable = enable_stub, + .disable = disable_stub, +#ifdef CONFIG_EXT_GROUP_SCHED + .cgroup_init = cgroup_init_stub, + .cgroup_exit = cgroup_exit_stub, + .cgroup_prep_move = cgroup_prep_move_stub, + .cgroup_move = cgroup_move_stub, + .cgroup_cancel_move = cgroup_cancel_move_stub, + .cgroup_set_weight = cgroup_set_weight_stub, +#endif + .cpu_online = cpu_online_stub, + .cpu_offline = cpu_offline_stub, + .init = init_stub, + .exit = exit_stub, +}; + +struct bpf_struct_ops bpf_sched_ext_ops = { + .verifier_ops = &bpf_scx_verifier_ops, + .reg = bpf_scx_reg, + .unreg = bpf_scx_unreg, + .check_member = bpf_scx_check_member, + .init_member = bpf_scx_init_member, + .init = bpf_scx_init, + .update = bpf_scx_update, + .validate = bpf_scx_validate, + .name = "sched_ext_ops", + .cfi_stubs = &__bpf_ops_sched_ext_ops +}; + + +/******************************************************************************** + * System integration and init. + */ + +static void sysrq_handle_sched_ext_reset(u8 key) +{ + if (scx_ops_helper) + scx_ops_disable(SCX_EXIT_SYSRQ); + else + pr_info("sched_ext: BPF scheduler not yet used\n"); +} + +static const struct sysrq_key_op sysrq_sched_ext_reset_op = { + .handler = sysrq_handle_sched_ext_reset, + .help_msg = "reset-sched-ext(S)", + .action_msg = "Disable sched_ext and revert all tasks to CFS", + .enable_mask = SYSRQ_ENABLE_RTNICE, +}; + +static bool can_skip_idle_kick(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + + /* + * We can skip idle kicking if @rq is going to go through at least one + * full SCX scheduling cycle before going idle. Just checking whether + * curr is not idle is insufficient because we could be racing + * balance_one() trying to pull the next task from a remote rq, which + * may fail, and @rq may become idle afterwards. + * + * The race window is small and we don't and can't guarantee that @rq is + * only kicked while idle anyway. Skip only when sure. + */ + return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_BALANCING); +} + +static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs) +{ + struct rq *rq = cpu_rq(cpu); + struct scx_rq *this_scx = &this_rq->scx; + bool should_wait = false; + unsigned long flags; + + raw_spin_rq_lock_irqsave(rq, flags); + + /* + * During CPU hotplug, a CPU may depend on kicking itself to make + * forward progress. Allow kicking self regardless of online state. 
+ */ + if (cpu_online(cpu) || cpu == cpu_of(this_rq)) { + if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { + if (rq->curr->sched_class == &ext_sched_class) + rq->curr->scx.slice = 0; + cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); + } + + if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { + pseqs[cpu] = rq->scx.pnt_seq; + should_wait = true; + } + + resched_curr(rq); + } else { + cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); + } + + raw_spin_rq_unlock_irqrestore(rq, flags); + + return should_wait; +} + +static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + raw_spin_rq_lock_irqsave(rq, flags); + + if (!can_skip_idle_kick(rq) && + (cpu_online(cpu) || cpu == cpu_of(this_rq))) + resched_curr(rq); + + raw_spin_rq_unlock_irqrestore(rq, flags); +} + +static void kick_cpus_irq_workfn(struct irq_work *irq_work) +{ + struct rq *this_rq = this_rq(); + struct scx_rq *this_scx = &this_rq->scx; + unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); + bool should_wait = false; + s32 cpu; + + for_each_cpu(cpu, this_scx->cpus_to_kick) { + should_wait |= kick_one_cpu(cpu, this_rq, pseqs); + cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); + cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); + } + + for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) { + kick_one_cpu_if_idle(cpu, this_rq); + cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); + } + + if (!should_wait) + return; + + for_each_cpu(cpu, this_scx->cpus_to_wait) { + unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq; + + if (cpu != cpu_of(this_rq)) { + /* + * Pairs with smp_store_release() issued by this CPU in + * scx_notify_pick_next_task() on the resched path. + * + * We busy-wait here to guarantee that no other task can + * be scheduled on our core before the target CPU has + * entered the resched path. + */ + while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu]) + cpu_relax(); + } + + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); + } +} + +/** + * print_scx_info - print out sched_ext scheduler state + * @log_lvl: the log level to use when printing + * @p: target task + * + * If a sched_ext scheduler is enabled, print the name and state of the + * scheduler. If @p is on sched_ext, print further information about the task. + * + * This function can be safely called on any task as long as the task_struct + * itself is accessible. While safe, this function isn't synchronized and may + * print out mixups or garbages of limited length. + */ +void print_scx_info(const char *log_lvl, struct task_struct *p) +{ + enum scx_ops_enable_state state = scx_ops_enable_state(); + const char *all = READ_ONCE(scx_switching_all) ? "+all" : ""; + char runnable_at_buf[22] = "?"; + struct sched_class *class; + unsigned long runnable_at; + + if (state == SCX_OPS_DISABLED) + return; + + /* + * Carefully check if the task was running on sched_ext, and then + * carefully copy the time it's been runnable, and its state. 
+ */
+ if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) ||
+ class != &ext_sched_class) {
+ printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name,
+ scx_ops_enable_state_str[state], all);
+ return;
+ }
+
+ if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at,
+ sizeof(runnable_at)))
+ scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms",
+ jiffies_delta_msecs(runnable_at, jiffies));
+
+ /* print everything onto one line to conserve console space */
+ printk("%sSched_ext: %s (%s%s), task: runnable_at=%s",
+ log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all,
+ runnable_at_buf);
+}
+
+static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr)
+{
+ /*
+ * SCX schedulers often have userspace components which are sometimes
+ * involved in critical scheduling paths. PM operations involve freezing
+ * userspace which can lead to scheduling misbehaviors including stalls.
+ * Let's bypass while PM operations are in progress.
+ */
+ switch (event) {
+ case PM_HIBERNATION_PREPARE:
+ case PM_SUSPEND_PREPARE:
+ case PM_RESTORE_PREPARE:
+ scx_ops_bypass(true);
+ break;
+ case PM_POST_HIBERNATION:
+ case PM_POST_SUSPEND:
+ case PM_POST_RESTORE:
+ scx_ops_bypass(false);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block scx_pm_notifier = {
+ .notifier_call = scx_pm_handler,
+};
+
+void __init init_sched_ext_class(void)
+{
+ s32 cpu, v;
+
+ /*
+ * The following is to prevent the compiler from optimizing out the enum
+ * definitions so that BPF scheduler implementations can use them
+ * through the generated vmlinux.h.
+ */
+ WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_TG_ONLINE |
+ SCX_KICK_PREEMPT);
+
+ BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
+ init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL);
+#ifdef CONFIG_SMP
+ BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
+#endif
+ scx_kick_cpus_pnt_seqs =
+ __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) *
+ num_possible_cpus(),
+ __alignof__(scx_kick_cpus_pnt_seqs[0]));
+ BUG_ON(!scx_kick_cpus_pnt_seqs);
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
+ INIT_LIST_HEAD(&rq->scx.runnable_list);
+
+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL));
+ init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
+ }
+
+ register_sysrq_key('S', &sysrq_sched_ext_reset_op);
+ INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn);
+ scx_cgroup_config_knobs();
+}
+
+
+/********************************************************************************
+ * Helpers that can be called from the BPF scheduler.
+ */
+#include
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_switch_all - Switch all tasks into SCX
+ *
+ * Switch all existing and future non-dl/rt tasks to SCX. This can only be
+ * called from ops.init(), and actual switching is performed asynchronously. 
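As a sketch of the intended call site, an ops.init() implementation would request the switch before returning. This is illustrative only and assumes the scx common headers from tools/sched_ext plus the usual libbpf sleepable struct_ops section naming; example_init is a made-up name.

SEC("struct_ops.s/init")
s32 BPF_PROG(example_init)
{
	scx_bpf_switch_all();	/* opt all existing and future eligible tasks into SCX */
	return 0;
}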
+ */ +__bpf_kfunc void scx_bpf_switch_all(void) +{ + if (!scx_kf_allowed(SCX_KF_INIT)) + return; + + scx_switch_all_req = true; +} + +BTF_SET8_START(scx_kfunc_ids_init) +BTF_ID_FLAGS(func, scx_bpf_switch_all) +BTF_SET8_END(scx_kfunc_ids_init) + +static const struct btf_kfunc_id_set scx_kfunc_set_init = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_init, +}; + +/** + * scx_bpf_create_dsq - Create a custom DSQ + * @dsq_id: DSQ to create + * @node: NUMA node to allocate from + * + * Create a custom DSQ identified by @dsq_id. Can be called from ops.init(), + * ops.init_task(), ops.cgroup_init() and ops.cgroup_prep_move(). + */ +__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) +{ + if (!scx_kf_allowed(SCX_KF_INIT | SCX_KF_SLEEPABLE)) + return -EINVAL; + + if (unlikely(node >= (int)nr_node_ids || + (node < 0 && node != NUMA_NO_NODE))) + return -EINVAL; + return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node)); +} + +BTF_SET8_START(scx_kfunc_ids_sleepable) +BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) +BTF_SET8_END(scx_kfunc_ids_sleepable) + +static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_sleepable, +}; + +__bpf_kfunc_end_defs(); + +static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) +{ + if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) + return false; + + lockdep_assert_irqs_disabled(); + + if (unlikely(!p)) { + scx_ops_error("called with NULL task"); + return false; + } + + if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { + scx_ops_error("invalid enq_flags 0x%llx", enq_flags); + return false; + } + + return true; +} + +static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags) +{ + struct task_struct *ddsp_task; + int idx; + + ddsp_task = __this_cpu_read(direct_dispatch_task); + if (ddsp_task) { + mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags); + return; + } + + idx = __this_cpu_read(scx_dsp_ctx.buf_cursor); + if (unlikely(idx >= scx_dsp_max_batch)) { + scx_ops_error("dispatch buffer overflow"); + return; + } + + this_cpu_ptr(scx_dsp_buf)[idx] = (struct scx_dsp_buf_ent){ + .task = p, + .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, + .dsq_id = dsq_id, + .enq_flags = enq_flags, + }; + __this_cpu_inc(scx_dsp_ctx.buf_cursor); +} + +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ + * @p: task_struct to dispatch + * @dsq_id: DSQ to dispatch to + * @slice: duration @p can run for in nsecs + * @enq_flags: SCX_ENQ_* + * + * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe + * to call this function spuriously. Can be called from ops.enqueue(), + * ops.select_cpu(), and ops.dispatch(). + * + * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch + * and @p must match the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be + * used to target the local DSQ of a CPU other than the enqueueing one. Use + * ops.select_cpu() to be on the target CPU in the first place. + * + * When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p + * will be directly dispatched to the corresponding dispatch queue after + * ops.select_cpu() returns. If @p is dispatched to SCX_DSQ_LOCAL, it will be + * dispatched to the local DSQ of the CPU returned by ops.select_cpu(). + * @enq_flags are OR'd with the enqueue flags on the enqueue path before the + * task is dispatched. 
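For illustration, the direct-dispatch case described above could look as follows in a BPF scheduler. SHARED_DSQ is a scheduler-chosen DSQ id assumed to have been created from ops.init() with scx_bpf_create_dsq(); the function name and the scx common headers are assumptions of the sketch, not part of the patch.

#define SHARED_DSQ 0	/* scheduler-defined DSQ id, created in ops.init() */

SEC("struct_ops/enqueue")
void BPF_PROG(shared_enqueue, struct task_struct *p, u64 enq_flags)
{
	/* direct dispatch: queue @p on the shared DSQ with the default slice */
	scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
}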
+ * + * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id + * and this function can be called upto ops.dispatch_max_batch times to dispatch + * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the + * remaining slots. scx_bpf_consume() flushes the batch and resets the counter. + * + * This function doesn't have any locking restrictions and may be called under + * BPF locks (in the future when BPF introduces more flexible locking). + * + * @p is allowed to run for @slice. The scheduling path is triggered on slice + * exhaustion. If zero, the current residual slice is maintained. If + * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with + * scx_bpf_kick_cpu() to trigger scheduling. + */ +__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, + u64 enq_flags) +{ + if (!scx_dispatch_preamble(p, enq_flags)) + return; + + if (slice) + p->scx.slice = slice; + else + p->scx.slice = p->scx.slice ?: 1; + + scx_dispatch_commit(p, dsq_id, enq_flags); +} + +/** + * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ + * @p: task_struct to dispatch + * @dsq_id: DSQ to dispatch to + * @slice: duration @p can run for in nsecs + * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ + * @enq_flags: SCX_ENQ_* + * + * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id. + * Tasks queued into the priority queue are ordered by @vtime and always + * consumed after the tasks in the FIFO queue. All other aspects are identical + * to scx_bpf_dispatch(). + * + * @vtime ordering is according to time_before64() which considers wrapping. A + * numerically larger vtime may indicate an earlier position in the ordering and + * vice-versa. + */ +__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, + u64 vtime, u64 enq_flags) +{ + if (!scx_dispatch_preamble(p, enq_flags)) + return; + + if (slice) + p->scx.slice = slice; + else + p->scx.slice = p->scx.slice ?: 1; + + p->scx.dsq_vtime = vtime; + + scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); +} + +BTF_SET8_START(scx_kfunc_ids_enqueue_dispatch) +BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) +BTF_SET8_END(scx_kfunc_ids_enqueue_dispatch) + +static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_enqueue_dispatch, +}; + +/** + * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots + * + * Can only be called from ops.dispatch(). + */ +__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) +{ + if (!scx_kf_allowed(SCX_KF_DISPATCH)) + return 0; + + return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx.buf_cursor); +} + +/** + * scx_bpf_dispatch_cancel - Cancel the latest dispatch + * + * Cancel the latest dispatch. Can be called multiple times to cancel further + * dispatches. Can only be called from ops.dispatch(). + */ +__bpf_kfunc void scx_bpf_dispatch_cancel(void) +{ + struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); + + if (!scx_kf_allowed(SCX_KF_DISPATCH)) + return; + + if (dspc->buf_cursor > 0) + dspc->buf_cursor--; + else + scx_ops_error("dispatch buffer underflow"); +} + +/** + * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ + * @dsq_id: DSQ to consume + * + * Consume a task from the non-local DSQ identified by @dsq_id and transfer it + * to the current CPU's local DSQ for execution. 
Can only be called from + * ops.dispatch(). + * + * This function flushes the in-flight dispatches from scx_bpf_dispatch() before + * trying to consume the specified DSQ. It may also grab rq locks and thus can't + * be called under any BPF locks. + * + * Returns %true if a task has been consumed, %false if there isn't any task to + * consume. + */ +__bpf_kfunc bool scx_bpf_consume(u64 dsq_id) +{ + struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); + struct scx_dispatch_q *dsq; + + if (!scx_kf_allowed(SCX_KF_DISPATCH)) + return false; + + flush_dispatch_buf(dspc->rq, dspc->rf); + + dsq = find_non_local_dsq(dsq_id); + if (unlikely(!dsq)) { + scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); + return false; + } + + if (consume_dispatch_q(dspc->rq, dspc->rf, dsq)) { + /* + * A successfully consumed task can be dequeued before it starts + * running while the CPU is trying to migrate other dispatched + * tasks. Bump nr_tasks to tell balance_scx() to retry on empty + * local DSQ. + */ + dspc->nr_tasks++; + return true; + } else { + return false; + } +} + +BTF_SET8_START(scx_kfunc_ids_dispatch) +BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) +BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) +BTF_ID_FLAGS(func, scx_bpf_consume) +BTF_SET8_END(scx_kfunc_ids_dispatch) + +static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_dispatch, +}; + +/** + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ + * + * Iterate over all of the tasks currently enqueued on the local DSQ of the + * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of + * processed tasks. Can only be called from ops.cpu_release(). + */ +__bpf_kfunc u32 scx_bpf_reenqueue_local(void) +{ + u32 nr_enqueued, i; + struct rq *rq; + struct scx_rq *scx_rq; + + if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) + return 0; + + rq = cpu_rq(smp_processor_id()); + lockdep_assert_rq_held(rq); + scx_rq = &rq->scx; + + /* + * Get the number of tasks on the local DSQ before iterating over it to + * pull off tasks. The enqueue callback below can signal that it wants + * the task to stay on the local DSQ, and we want to prevent the BPF + * scheduler from causing us to loop indefinitely. + */ + nr_enqueued = scx_rq->local_dsq.nr; + for (i = 0; i < nr_enqueued; i++) { + struct task_struct *p; + + p = first_local_task(rq); + WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != + SCX_OPSS_NONE); + WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); + WARN_ON_ONCE(p->scx.holding_cpu != -1); + dispatch_dequeue(scx_rq, p); + do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); + } + + return nr_enqueued; +} + +BTF_SET8_START(scx_kfunc_ids_cpu_release) +BTF_ID_FLAGS(func, scx_bpf_reenqueue_local) +BTF_SET8_END(scx_kfunc_ids_cpu_release) + +static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_cpu_release, +}; + +/** + * scx_bpf_kick_cpu - Trigger reschedule on a CPU + * @cpu: cpu to kick + * @flags: %SCX_KICK_* flags + * + * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or + * trigger rescheduling on a busy CPU. This can be called from any online + * scx_ops operation and the actual kicking is performed asynchronously through + * an irq work. 
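Taken together with scx_bpf_consume() above, a common pattern is for ops.dispatch() to pull work from a shared DSQ while a waker nudges an idle CPU so that the dispatch path actually runs there. A hedged sketch under the same assumptions as the earlier examples, reusing the illustrative SHARED_DSQ id:

SEC("struct_ops/dispatch")
void BPF_PROG(shared_dispatch, s32 cpu, struct task_struct *prev)
{
	/* move one task from the shared DSQ onto this CPU's local DSQ */
	scx_bpf_consume(SHARED_DSQ);
}

/* e.g. called after queueing work for @cpu from ops.enqueue() */
static __always_inline void nudge_cpu(s32 cpu)
{
	/* no-op unless @cpu is (going) idle; the kick itself is asynchronous */
	scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
}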
+ */ +__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) +{ + struct rq *this_rq; + unsigned long irq_flags; + + if (!ops_cpu_valid(cpu)) { + scx_ops_error("invalid cpu %d", cpu); + return; + } + + /* + * While bypassing for PM ops, IRQ handling may not be online which can + * lead to irq_work_queue() malfunction such as infinite busy wait for + * IRQ status update. Suppress kicking. + */ + if (scx_ops_bypassing()) + return; + + local_irq_save(irq_flags); + + this_rq = this_rq(); + + /* + * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting + * rq locks. We can probably be smarter and avoid bouncing if called + * from ops which don't hold a rq lock. + */ + if (flags & SCX_KICK_IDLE) { + struct rq *target_rq = cpu_rq(cpu); + + if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) + scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); + + if (raw_spin_rq_trylock(target_rq)) { + if (can_skip_idle_kick(target_rq)) { + raw_spin_rq_unlock(target_rq); + goto out; + } + raw_spin_rq_unlock(target_rq); + } + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); + } else { + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); + + if (flags & SCX_KICK_PREEMPT) + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); + if (flags & SCX_KICK_WAIT) + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait); + } + + irq_work_queue(&this_rq->scx.kick_cpus_irq_work); +out: + local_irq_restore(irq_flags); +} + +/** + * scx_bpf_dsq_nr_queued - Return the number of queued tasks + * @dsq_id: id of the DSQ + * + * Return the number of tasks in the DSQ matching @dsq_id. If not found, + * -%ENOENT is returned. Can be called from any non-sleepable online scx_ops + * operations. + */ +__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) +{ + struct scx_dispatch_q *dsq; + + lockdep_assert(rcu_read_lock_any_held()); + + if (dsq_id == SCX_DSQ_LOCAL) { + return this_rq()->scx.local_dsq.nr; + } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { + s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; + + if (ops_cpu_valid(cpu)) + return cpu_rq(cpu)->scx.local_dsq.nr; + } else { + dsq = find_non_local_dsq(dsq_id); + if (dsq) + return dsq->nr; + } + return -ENOENT; +} + +/** + * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state + * @cpu: cpu to test and clear idle for + * + * Returns %true if @cpu was idle and its idle state was successfully cleared. + * %false otherwise. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + */ +__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) +{ + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return false; + } + + if (ops_cpu_valid(cpu)) + return test_and_clear_cpu_idle(cpu); + else + return false; +} + +/** + * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu + * number on success. -%EBUSY if no matching cpu was found. + * + * Idle CPU tracking may race against CPU scheduling state transitions. For + * example, this function may return -%EBUSY as CPUs are transitioning into the + * idle state. If the caller then assumes that there will be dispatch events on + * the CPUs as they were all busy, the scheduler may end up stalling with CPUs + * idling while there are pending tasks. 
Use scx_bpf_pick_any_cpu() and + * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch + * event in the near future. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + */ +__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) +{ + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return -EBUSY; + } + + return scx_pick_idle_cpu(cpus_allowed, flags); +} + +/** + * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is + * empty. + * + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not + * set, this function can't tell which CPUs are idle and will always pick any + * CPU. + */ +__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, u64 flags) +{ + s32 cpu; + + if (static_branch_likely(&scx_builtin_idle_enabled)) { + cpu = scx_pick_idle_cpu(cpus_allowed, flags); + if (cpu >= 0) + return cpu; + } + + cpu = cpumask_any_distribute(cpus_allowed); + if (cpu < nr_cpu_ids) + return cpu; + else + return -EBUSY; +} + +/** + * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking + * per-CPU cpumask. + * + * Returns NULL if idle tracking is not enabled, or running on a UP kernel. + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) +{ + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return cpu_none_mask; + } + +#ifdef CONFIG_SMP + return idle_masks.cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, + * per-physical-core cpumask. Can be used to determine if an entire physical + * core is free. + * + * Returns NULL if idle tracking is not enabled, or running on a UP kernel. + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) +{ + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return cpu_none_mask; + } + +#ifdef CONFIG_SMP + if (sched_smt_active()) + return idle_masks.smt; + else + return idle_masks.cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to + * either the percpu, or SMT idle-tracking cpumask. + */ +__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) +{ + /* + * Empty function body because we aren't actually acquiring or + * releasing a reference to a global idle cpumask, which is read-only + * in the caller and is never released. The acquire / release semantics + * here are just used to make the cpumask is a trusted pointer in the + * caller. 
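To make the acquire/release pairing concrete, an ops.select_cpu() implementation would typically claim an idle CPU with scx_bpf_pick_idle_cpu() and release any idle cpumask it looked at before returning. A sketch under the same assumptions as the earlier examples; example_select_cpu is an illustrative name.

SEC("struct_ops/select_cpu")
s32 BPF_PROG(example_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	const struct cpumask *idle;
	s32 cpu;

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0) {
		/* lands on the local DSQ of the CPU returned below */
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
		return cpu;
	}

	/* read-only peek at the idle mask; must be released before returning */
	idle = scx_bpf_get_idle_cpumask();
	/* e.g. inspect @idle here to bias the placement decision */
	scx_bpf_put_idle_cpumask(idle);

	return prev_cpu;
}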
+ */ +} + +struct scx_bpf_error_bstr_bufs { + u64 data[MAX_BPRINTF_VARARGS]; + char msg[SCX_EXIT_MSG_LEN]; +}; + +static DEFINE_PER_CPU(struct scx_bpf_error_bstr_bufs, scx_bpf_error_bstr_bufs); + +/** + * scx_bpf_error_bstr - Indicate fatal error + * @fmt: error message format string + * @data: format string parameters packaged using ___bpf_fill() macro + * @data__sz: @data len, must end in '__sz' for the verifier + * + * Indicate that the BPF scheduler encountered a fatal error and initiate ops + * disabling. + */ +__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, + u32 data__sz) +{ + struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; + struct scx_bpf_error_bstr_bufs *bufs; + unsigned long flags; + int ret; + + local_irq_save(flags); + bufs = this_cpu_ptr(&scx_bpf_error_bstr_bufs); + + if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || + (data__sz && !data)) { + scx_ops_error("invalid data=%p and data__sz=%u", + (void *)data, data__sz); + goto out_restore; + } + + ret = copy_from_kernel_nofault(bufs->data, data, data__sz); + if (ret) { + scx_ops_error("failed to read data fields (%d)", ret); + goto out_restore; + } + + ret = bpf_bprintf_prepare(fmt, UINT_MAX, bufs->data, data__sz / 8, + &bprintf_data); + if (ret < 0) { + scx_ops_error("failed to format prepration (%d)", ret); + goto out_restore; + } + + ret = bstr_printf(bufs->msg, sizeof(bufs->msg), fmt, + bprintf_data.bin_args); + bpf_bprintf_cleanup(&bprintf_data); + if (ret < 0) { + scx_ops_error("scx_ops_error(\"%s\", %p, %u) failed to format", + fmt, data, data__sz); + goto out_restore; + } + + scx_ops_error_kind(SCX_EXIT_ERROR_BPF, "%s", bufs->msg); +out_restore: + local_irq_restore(flags); +} + +/** + * scx_bpf_destroy_dsq - Destroy a custom DSQ + * @dsq_id: DSQ to destroy + * + * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with + * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is + * empty and no further tasks are dispatched to it. Ignored if called on a DSQ + * which doesn't exist. Can be called from any online scx_ops operations. + */ +__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) +{ + destroy_dsq(dsq_id); +} + +/** + * scx_bpf_task_running - Is task currently running? + * @p: task of interest + */ +__bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p) +{ + return task_rq(p)->curr == p; +} + +/** + * scx_bpf_task_cpu - CPU a task is currently associated with + * @p: task of interest + */ +__bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) +{ + return task_cpu(p); +} + +/** + * scx_bpf_task_cgroup - Return the sched cgroup of a task + * @p: task of interest + * + * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with + * from the scheduler's POV. SCX operations should use this function to + * determine @p's current cgroup as, unlike following @p->cgroups, + * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all + * rq-locked operations. Can be called on the parameter tasks of rq-locked + * operations. The restriction guarantees that @p's rq is locked by the caller. + */ +#ifdef CONFIG_CGROUP_SCHED +__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) +{ + struct task_group *tg = p->sched_task_group; + struct cgroup *cgrp = &cgrp_dfl_root.cgrp; + + if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) + goto out; + + /* + * A task_group may either be a cgroup or an autogroup. In the latter + * case, @tg->css.cgroup is %NULL. 
A task_group can't become the other + * kind once created. + */ + if (tg && tg->css.cgroup) + cgrp = tg->css.cgroup; + else + cgrp = &cgrp_dfl_root.cgrp; +out: + cgroup_get(cgrp); + return cgrp; +} +#endif + +BTF_SET8_START(scx_kfunc_ids_ops_only) +BTF_ID_FLAGS(func, scx_bpf_kick_cpu) +BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) +BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) +BTF_SET8_END(scx_kfunc_ids_ops_only) + +static const struct btf_kfunc_id_set scx_kfunc_set_ops_only = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_ops_only, +}; + +BTF_SET8_START(scx_kfunc_ids_any) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) +BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) +#ifdef CONFIG_CGROUP_SCHED +BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) +#endif +BTF_SET8_END(scx_kfunc_ids_any) + +static const struct btf_kfunc_id_set scx_kfunc_set_any = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_any, +}; + +__bpf_kfunc_end_defs(); + +static int __init scx_init(void) +{ + int ret; + + /* + * kfunc registration can't be done from init_sched_ext_class() as + * register_btf_kfunc_id_set() needs most of the system to be up. + * + * Some kfuncs are context-sensitive and can only be called from + * specific SCX ops. They are grouped into BTF sets accordingly. + * Unfortunately, BPF currently doesn't have a way of enforcing such + * restrictions. Eventually, the verifier should be able to enforce + * them. For now, register them the same and make each kfunc explicitly + * check using scx_kf_allowed(). 
+ */ + if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_init)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_sleepable)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_enqueue_dispatch)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_dispatch)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_cpu_release)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_ops_only)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_any)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, + &scx_kfunc_set_any))) { + pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret); + return ret; + } + + ret = register_pm_notifier(&scx_pm_notifier); + if (ret) { + pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret); + return ret; + } + + scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj); + if (!scx_kset) { + pr_err("sched_ext: Failed to create /sys/sched_ext\n"); + return -ENOMEM; + } + + ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group); + if (ret < 0) { + pr_err("sched_ext: Failed to add global attributes\n"); + return ret; + } + + return 0; +} +__initcall(scx_init); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h new file mode 100644 index 000000000..c61291fb9 --- /dev/null +++ b/kernel/sched/ext.h @@ -0,0 +1,277 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +enum scx_wake_flags { + /* expose select WF_* flags as enums */ + SCX_WAKE_FORK = WF_FORK, + SCX_WAKE_TTWU = WF_TTWU, + SCX_WAKE_SYNC = WF_SYNC, +}; + +enum scx_enq_flags { + /* expose select ENQUEUE_* flags as enums */ + SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, + SCX_ENQ_HEAD = ENQUEUE_HEAD, + + /* high 32bits are SCX specific */ + + /* + * Set the following to trigger preemption when calling + * scx_bpf_dispatch() with a local dsq as the target. The slice of the + * current task is cleared to zero and the CPU is kicked into the + * scheduling path. Implies %SCX_ENQ_HEAD. + */ + SCX_ENQ_PREEMPT = 1LLU << 32, + + /* + * The task being enqueued was previously enqueued on the current CPU's + * %SCX_DSQ_LOCAL, but was removed from it in a call to the + * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was + * invoked in a ->cpu_release() callback, and the task is again + * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the + * task will not be scheduled on the CPU until at least the next invocation + * of the ->cpu_acquire() callback. + */ + SCX_ENQ_REENQ = 1LLU << 40, + + /* + * The task being enqueued is the only task available for the cpu. By + * default, ext core keeps executing such tasks but when + * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the + * %SCX_ENQ_LAST flag set. + * + * If the BPF scheduler wants to continue executing the task, + * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately. + * If the task gets queued on a different dsq or the BPF side, the BPF + * scheduler is responsible for triggering a follow-up scheduling event. + * Otherwise, Execution may stall. 
+ */ + SCX_ENQ_LAST = 1LLU << 41, + + /* high 8 bits are internal */ + __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, + + SCX_ENQ_CLEAR_OPSS = 1LLU << 56, + SCX_ENQ_DSQ_PRIQ = 1LLU << 57, +}; + +enum scx_deq_flags { + /* expose select DEQUEUE_* flags as enums */ + SCX_DEQ_SLEEP = DEQUEUE_SLEEP, + + /* high 32bits are SCX specific */ + + /* + * The generic core-sched layer decided to execute the task even though + * it hasn't been dispatched yet. Dequeue from the BPF side. + */ + SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, +}; + +enum scx_pick_idle_cpu_flags { + SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ +}; + +enum scx_kick_flags { + /* + * Kick the target CPU if idle. Guarantees that the target CPU goes + * through at least one full scheduling cycle before going idle. If the + * target CPU can be determined to be currently not idle and going to go + * through a scheduling cycle before going idle, noop. + */ + SCX_KICK_IDLE = 1LLU << 0, + + /* + * Preempt the current task and execute the dispatch path. If the + * current task of the target CPU is an SCX task, its ->scx.slice is + * cleared to zero before the scheduling path is invoked so that the + * task expires and the dispatch path is invoked. + */ + SCX_KICK_PREEMPT = 1LLU << 1, + + /* + * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will + * return after the target CPU finishes picking the next task. + */ + SCX_KICK_WAIT = 1LLU << 2, +}; + +enum scx_tg_flags { + SCX_TG_ONLINE = 1U << 0, + SCX_TG_INITED = 1U << 1, +}; + +#ifdef CONFIG_SCHED_CLASS_EXT + +struct sched_enq_and_set_ctx { + struct task_struct *p; + int queue_flags; + bool queued; + bool running; +}; + +void sched_deq_and_put_task(struct task_struct *p, int queue_flags, + struct sched_enq_and_set_ctx *ctx); +void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); + +extern const struct sched_class ext_sched_class; +extern const struct bpf_verifier_ops bpf_sched_ext_verifier_ops; +extern unsigned long scx_watchdog_timeout; +extern unsigned long scx_watchdog_timestamp; + +DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); +DECLARE_STATIC_KEY_FALSE(__scx_switched_all); +#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) +#define scx_switched_all() static_branch_unlikely(&__scx_switched_all) + +DECLARE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); + +static inline bool task_on_scx(const struct task_struct *p) +{ + return scx_enabled() && p->sched_class == &ext_sched_class; +} + +bool task_should_scx(struct task_struct *p); +void scx_pre_fork(struct task_struct *p); +int scx_fork(struct task_struct *p); +void scx_post_fork(struct task_struct *p); +void scx_cancel_fork(struct task_struct *p); +int scx_check_setscheduler(struct task_struct *p, int policy); +bool scx_can_stop_tick(struct rq *rq); +void init_sched_ext_class(void); + +__printf(2, 3) void scx_ops_error_kind(enum scx_exit_kind kind, + const char *fmt, ...); +#define scx_ops_error(fmt, args...) \ + scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args) + +void __scx_notify_pick_next_task(struct rq *rq, + struct task_struct *p, + const struct sched_class *active); + +static inline void scx_notify_pick_next_task(struct rq *rq, + struct task_struct *p, + const struct sched_class *active) +{ + if (!scx_enabled()) + return; +#ifdef CONFIG_SMP + /* + * Pairs with the smp_load_acquire() issued by a CPU in + * kick_cpus_irq_workfn() who is waiting for this CPU to perform a + * resched. 
+ */ + smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); +#endif + if (!static_branch_unlikely(&scx_ops_cpu_preempt)) + return; + __scx_notify_pick_next_task(rq, p, active); +} + +static inline void scx_notify_sched_tick(void) +{ + unsigned long last_check; + + if (!scx_enabled()) + return; + + last_check = READ_ONCE(scx_watchdog_timestamp); + if (unlikely(time_after(jiffies, + last_check + READ_ONCE(scx_watchdog_timeout)))) { + u32 dur_ms = jiffies_to_msecs(jiffies - last_check); + + scx_ops_error_kind(SCX_EXIT_ERROR_STALL, + "watchdog failed to check in for %u.%03us", + dur_ms / 1000, dur_ms % 1000); + } +} + +static inline const struct sched_class *next_active_class(const struct sched_class *class) +{ + class++; + if (scx_switched_all() && class == &fair_sched_class) + class++; + if (!scx_enabled() && class == &ext_sched_class) + class++; + return class; +} + +#define for_active_class_range(class, _from, _to) \ + for (class = (_from); class != (_to); class = next_active_class(class)) + +#define for_each_active_class(class) \ + for_active_class_range(class, __sched_class_highest, __sched_class_lowest) + +/* + * SCX requires a balance() call before every pick_next_task() call including + * when waking up from idle. + */ +#define for_balance_class_range(class, prev_class, end_class) \ + for_active_class_range(class, (prev_class) > &ext_sched_class ? \ + &ext_sched_class : (prev_class), (end_class)) + +#ifdef CONFIG_SCHED_CORE +bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi); +#endif + +#else /* CONFIG_SCHED_CLASS_EXT */ + +#define scx_enabled() false +#define scx_switched_all() false + +static inline bool task_on_scx(const struct task_struct *p) { return false; } +static inline void scx_pre_fork(struct task_struct *p) {} +static inline int scx_fork(struct task_struct *p) { return 0; } +static inline void scx_post_fork(struct task_struct *p) {} +static inline void scx_cancel_fork(struct task_struct *p) {} +static inline int scx_check_setscheduler(struct task_struct *p, + int policy) { return 0; } +static inline bool scx_can_stop_tick(struct rq *rq) { return true; } +static inline void init_sched_ext_class(void) {} +static inline void scx_notify_pick_next_task(struct rq *rq, + const struct task_struct *p, + const struct sched_class *active) {} +static inline void scx_notify_sched_tick(void) {} + +#define for_each_active_class for_each_class +#define for_balance_class_range for_class_range + +#endif /* CONFIG_SCHED_CLASS_EXT */ + +#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP) +void __scx_update_idle(struct rq *rq, bool idle); + +static inline void scx_update_idle(struct rq *rq, bool idle) +{ + if (scx_enabled()) + __scx_update_idle(rq, idle); +} +#else +static inline void scx_update_idle(struct rq *rq, bool idle) {} +#endif + +#ifdef CONFIG_CGROUP_SCHED +#ifdef CONFIG_EXT_GROUP_SCHED +int scx_tg_online(struct task_group *tg); +void scx_tg_offline(struct task_group *tg); +int scx_cgroup_can_attach(struct cgroup_taskset *tset); +void scx_move_task(struct task_struct *p); +void scx_cgroup_finish_attach(void); +void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); +void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); +#else /* CONFIG_EXT_GROUP_SCHED */ +static inline int scx_tg_online(struct task_group *tg) { return 0; } +static inline void scx_tg_offline(struct task_group *tg) {} +static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } +static inline void scx_move_task(struct 
task_struct *p) {} +static inline void scx_cgroup_finish_attach(void) {} +static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} +static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} +#endif /* CONFIG_EXT_GROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 97983b041..d65dcbaac 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3842,7 +3842,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, } } -void reweight_task(struct task_struct *p, int prio) +static void reweight_task_fair(struct rq *rq, struct task_struct *p, int prio) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -8329,7 +8329,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * Batch and idle tasks do not preempt non-idle tasks (their preemption * is driven by the tick): */ - if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) + if (unlikely(!normal_policy(p->policy)) || !sched_feat(WAKEUP_PREEMPTION)) return; find_matching_se(&se, &pse); @@ -12477,14 +12477,14 @@ void trigger_load_balance(struct rq *rq) nohz_balancer_kick(rq); } -static void rq_online_fair(struct rq *rq) +static void rq_online_fair(struct rq *rq, enum rq_onoff_reason reason) { update_sysctl(); update_runtime_enabled(rq); } -static void rq_offline_fair(struct rq *rq) +static void rq_offline_fair(struct rq *rq, enum rq_onoff_reason reason) { update_sysctl(); @@ -13166,6 +13166,7 @@ DEFINE_SCHED_CLASS(fair) = { .task_tick = task_tick_fair, .task_fork = task_fork_fair, + .reweight_task = reweight_task_fair, .prio_changed = prio_changed_fair, .switched_from = switched_from_fair, .switched_to = switched_to_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 31231925f..0179ce0c7 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -438,11 +438,13 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { + scx_update_idle(rq, false); } static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) { update_idle_core(rq); + scx_update_idle(rq, true); schedstat_inc(rq->sched_goidle); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3261b067b..8620474d1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2426,7 +2426,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) } /* Assumes rq->lock is held */ -static void rq_online_rt(struct rq *rq) +static void rq_online_rt(struct rq *rq, enum rq_onoff_reason reason) { if (rq->rt.overloaded) rt_set_overload(rq); @@ -2437,7 +2437,7 @@ static void rq_online_rt(struct rq *rq) } /* Assumes rq->lock is held */ -static void rq_offline_rt(struct rq *rq) +static void rq_offline_rt(struct rq *rq, enum rq_onoff_reason reason) { if (rq->rt.overloaded) rt_clear_overload(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ce68f50e7..cf1714a12 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -173,9 +173,19 @@ static inline int idle_policy(int policy) { return policy == SCHED_IDLE; } + +static inline int normal_policy(int policy) +{ +#ifdef CONFIG_SCHED_CLASS_EXT + if (policy == SCHED_EXT) + return true; +#endif + return policy == SCHED_NORMAL; +} + static inline int fair_policy(int policy) { - return policy == SCHED_NORMAL || policy == SCHED_BATCH; + return normal_policy(policy) || policy == 
SCHED_BATCH; } static inline int rt_policy(int policy) @@ -223,6 +233,24 @@ static inline void update_avg(u64 *avg, u64 sample) #define shr_bound(val, shift) \ (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1)) +/* + * cgroup weight knobs should use the common MIN, DFL and MAX values which are + * 1, 100 and 10000 respectively. While it loses a bit of range on both ends, it + * maps pretty well onto the shares value used by scheduler and the round-trip + * conversions preserve the original value over the entire range. + */ +static inline unsigned long sched_weight_from_cgroup(unsigned long cgrp_weight) +{ + return DIV_ROUND_CLOSEST_ULL(cgrp_weight * 1024, CGROUP_WEIGHT_DFL); +} + +static inline unsigned long sched_weight_to_cgroup(unsigned long weight) +{ + return clamp_t(unsigned long, + DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024), + CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); +} + /* * !! For sched_setattr_nocheck() (kernel) only !! * @@ -406,6 +434,11 @@ struct task_group { struct rt_bandwidth rt_bandwidth; #endif +#ifdef CONFIG_EXT_GROUP_SCHED + u32 scx_flags; /* SCX_TG_* */ + u32 scx_weight; +#endif + struct rcu_head rcu; struct list_head list; @@ -461,6 +494,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) return walk_tg_tree_from(&root_task_group, down, up, data); } +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct task_group, css) : NULL; +} + extern int tg_nop(struct task_group *tg, void *data); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -517,6 +555,11 @@ extern void set_task_rq_fair(struct sched_entity *se, static inline void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { } #endif /* CONFIG_SMP */ +#else /* CONFIG_FAIR_GROUP_SCHED */ +static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + return 0; +} #endif /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_CGROUP_SCHED */ @@ -677,6 +720,30 @@ struct cfs_rq { #endif /* CONFIG_FAIR_GROUP_SCHED */ }; +#ifdef CONFIG_SCHED_CLASS_EXT +/* scx_rq->flags, protected by the rq lock */ +enum scx_rq_flags { + SCX_RQ_BALANCING = 1 << 0, + SCX_RQ_CAN_STOP_TICK = 1 << 1, +}; + +struct scx_rq { + struct scx_dispatch_q local_dsq; + struct list_head runnable_list; /* runnable tasks on this rq */ + unsigned long ops_qseq; + u64 extra_enq_flags; /* see move_task_to_local_dsq() */ + u32 nr_running; + u32 flags; + bool cpu_released; + cpumask_var_t cpus_to_kick; + cpumask_var_t cpus_to_kick_if_idle; + cpumask_var_t cpus_to_preempt; + cpumask_var_t cpus_to_wait; + unsigned long pnt_seq; + struct irq_work kick_cpus_irq_work; +}; +#endif /* CONFIG_SCHED_CLASS_EXT */ + static inline int rt_bandwidth_enabled(void) { return sysctl_sched_rt_runtime >= 0; @@ -1017,6 +1084,9 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; +#ifdef CONFIG_SCHED_CLASS_EXT + struct scx_rq scx; +#endif #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ @@ -2250,6 +2320,11 @@ extern const u32 sched_prio_to_wmult[40]; #define RETRY_TASK ((void *)-1UL) +enum rq_onoff_reason { + RQ_ONOFF_HOTPLUG, /* CPU is going on/offline */ + RQ_ONOFF_TOPOLOGY, /* sched domain topology update */ +}; + struct affinity_context { const struct cpumask *new_mask; struct cpumask *user_mask; @@ -2288,8 +2363,8 @@ struct sched_class { void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx); - void (*rq_online)(struct rq *rq); - void (*rq_offline)(struct rq *rq); + 
void (*rq_online)(struct rq *rq, enum rq_onoff_reason reason); + void (*rq_offline)(struct rq *rq, enum rq_onoff_reason reason); struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); #endif @@ -2303,8 +2378,11 @@ struct sched_class { * cannot assume the switched_from/switched_to pair is serialized by * rq->lock. They are however serialized by p->pi_lock. */ + void (*switching_to) (struct rq *this_rq, struct task_struct *task); void (*switched_from)(struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*reweight_task)(struct rq *this_rq, struct task_struct *task, + int newprio); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, int oldprio); @@ -2462,7 +2540,7 @@ extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); -extern void reweight_task(struct task_struct *p, int prio); +extern void __setscheduler_prio(struct task_struct *p, int prio); extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); @@ -2542,6 +2620,12 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); +extern void check_class_changing(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class); +extern void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio); + extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); #if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_CACHY) @@ -2823,8 +2907,8 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) raw_spin_rq_unlock(rq1); } -extern void set_rq_online (struct rq *rq); -extern void set_rq_offline(struct rq *rq); +extern void set_rq_online (struct rq *rq, enum rq_onoff_reason reason); +extern void set_rq_offline(struct rq *rq, enum rq_onoff_reason reason); extern bool sched_smp_initialized; #else /* CONFIG_SMP */ @@ -3481,4 +3565,27 @@ static inline void init_sched_mm_cid(struct task_struct *t) { } extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); +#ifdef CONFIG_CGROUP_SCHED +enum cpu_cftype_id { +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) + CPU_CFTYPE_WEIGHT, + CPU_CFTYPE_WEIGHT_NICE, + CPU_CFTYPE_IDLE, +#endif +#ifdef CONFIG_CFS_BANDWIDTH + CPU_CFTYPE_MAX, + CPU_CFTYPE_MAX_BURST, +#endif +#ifdef CONFIG_UCLAMP_TASK_GROUP + CPU_CFTYPE_UCLAMP_MIN, + CPU_CFTYPE_UCLAMP_MAX, +#endif + CPU_CFTYPE_CNT, +}; + +extern struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1]; +#endif /* CONFIG_CGROUP_SCHED */ + +#include "ext.h" + #endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 10d1391e7..7798063b3 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -497,7 +497,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) old_rd = rq->rd; if (cpumask_test_cpu(rq->cpu, old_rd->online)) - set_rq_offline(rq); + set_rq_offline(rq, RQ_ONOFF_TOPOLOGY); cpumask_clear_cpu(rq->cpu, old_rd->span); @@ -515,7 +515,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) cpumask_set_cpu(rq->cpu, rd->span); if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) - set_rq_online(rq); + set_rq_online(rq, RQ_ONOFF_TOPOLOGY); rq_unlock_irqrestore(rq, &rf); diff --git 
a/lib/dump_stack.c b/lib/dump_stack.c index 222c6d6c8..9581ef4ef 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -68,6 +68,7 @@ void dump_stack_print_info(const char *log_lvl) print_worker_info(log_lvl, current); print_stop_info(log_lvl, current); + print_scx_info(log_lvl, current); } /** diff --git a/tools/Makefile b/tools/Makefile index 37e9f6804..8021267f7 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -29,6 +29,7 @@ help: @echo ' pci - PCI tools' @echo ' perf - Linux performance measurement and analysis tool' @echo ' selftests - various kernel selftests' + @echo ' sched_ext - sched_ext example schedulers' @echo ' bootconfig - boot config tool' @echo ' spi - spi tools' @echo ' tmon - thermal monitoring and tuning tool' @@ -92,6 +93,9 @@ perf: FORCE $(Q)mkdir -p $(PERF_O) . $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= +sched_ext: FORCE + $(call descend,sched_ext) + selftests: FORCE $(call descend,testing/$@) @@ -185,6 +189,9 @@ perf_clean: $(Q)mkdir -p $(PERF_O) . $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean +sched_ext_clean: + $(call descend,sched_ext,clean) + selftests_clean: $(call descend,testing/$(@:_clean=),clean) @@ -214,6 +221,7 @@ clean: acpi_clean cgroup_clean counter_clean cpupower_clean hv_clean firewire_cl mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ freefall_clean build_clean libbpf_clean libsubcmd_clean \ gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \ - intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean + intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean \ + sched_ext_clean .PHONY: FORCE diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore new file mode 100644 index 000000000..215ed36b2 --- /dev/null +++ b/tools/sched_ext/.gitignore @@ -0,0 +1,10 @@ +scx_simple +scx_qmap +scx_central +scx_pair +scx_flatcg +scx_userland +*.skel.h +*.subskel.h +/tools/ +build/ diff --git a/tools/sched_ext/Kconfig b/tools/sched_ext/Kconfig new file mode 100644 index 000000000..6543fcf19 --- /dev/null +++ b/tools/sched_ext/Kconfig @@ -0,0 +1,9 @@ +CONFIG_BPF=y +CONFIG_SCHED_CLASS_EXT=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT=y +CONFIG_DEBUG_INFO_BTF=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_JIT_DEFAULT_ON=y +CONFIG_PAHOLE_HAS_SPLIT_BTF=y +CONFIG_PAHOLE_HAS_BTF_TAG=y diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile new file mode 100644 index 000000000..731bf6278 --- /dev/null +++ b/tools/sched_ext/Makefile @@ -0,0 +1,251 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+include ../build/Build.include +include ../scripts/Makefile.arch +include ../scripts/Makefile.include + +all: all_targets + +ifneq ($(LLVM),) +ifneq ($(filter %/,$(LLVM)),) +LLVM_PREFIX := $(LLVM) +else ifneq ($(filter -%,$(LLVM)),) +LLVM_SUFFIX := $(LLVM) +endif + +CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi +CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu +CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl +CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu +CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu +CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu +CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu +CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu +CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu +CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) + +ifeq ($(CROSS_COMPILE),) +ifeq ($(CLANG_TARGET_FLAGS),) +$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk) +else +CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS) +endif # CLANG_TARGET_FLAGS +else +CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) +endif # CROSS_COMPILE + +CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as +else +CC := $(CROSS_COMPILE)gcc +endif # LLVM + +CURDIR := $(abspath .) +TOOLSDIR := $(abspath ..) +LIBDIR := $(TOOLSDIR)/lib +BPFDIR := $(LIBDIR)/bpf +TOOLSINCDIR := $(TOOLSDIR)/include +BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool +APIDIR := $(TOOLSINCDIR)/uapi +GENDIR := $(abspath ../../include/generated) +GENHDR := $(GENDIR)/autoconf.h + +ifeq ($(O),) +OUTPUT_DIR := $(CURDIR)/build +else +OUTPUT_DIR := $(O)/build +endif # O +OBJ_DIR := $(OUTPUT_DIR)/obj +INCLUDE_DIR := $(OUTPUT_DIR)/include +BPFOBJ_DIR := $(OBJ_DIR)/libbpf +SCXOBJ_DIR := $(OBJ_DIR)/sched_ext +BINDIR := $(OUTPUT_DIR)/bin +BPFOBJ := $(BPFOBJ_DIR)/libbpf.a +ifneq ($(CROSS_COMPILE),) +HOST_BUILD_DIR := $(OBJ_DIR)/host +HOST_OUTPUT_DIR := host-tools +HOST_INCLUDE_DIR := $(HOST_OUTPUT_DIR)/include +else +HOST_BUILD_DIR := $(OBJ_DIR) +HOST_OUTPUT_DIR := $(OUTPUT_DIR) +HOST_INCLUDE_DIR := $(INCLUDE_DIR) +endif +HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a +RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids +DEFAULT_BPFTOOL := $(HOST_OUTPUT_DIR)/sbin/bpftool + +VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ + ../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) +ifeq ($(VMLINUX_BTF),) +$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") +endif + +BPFTOOL ?= $(DEFAULT_BPFTOOL) + +ifneq ($(wildcard $(GENHDR)),) + GENFLAGS := -DHAVE_GENHDR +endif + +CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ + -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ + -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include + +CARGOFLAGS := --release --target-dir $(OUTPUT_DIR) +ifneq ($(CARGO_OFFLINE),) +CARGOFLAGS += --offline +endif + +# Silence some warnings when compiled with clang +ifneq ($(LLVM),) +CFLAGS += -Wno-unused-command-line-argument +endif + +LDFLAGS = -lelf -lz -lpthread + +IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - &1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) -dM -E - $@ +else + $(call msg,CP,,$@) + $(Q)cp "$(VMLINUX_H)" $@ +endif + +$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h include/scx/*.h \ + | $(BPFOBJ) $(SCXOBJ_DIR) + $(call msg,CLNG-BPF,,$(notdir $@)) + $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ + +$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o 
$(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) + $(eval sched=$(notdir $@)) + $(call msg,GEN-SKEL,,$(sched)) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< + $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) + $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) + $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ + $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) + +SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) + +c-sched-targets = scx_simple scx_qmap + +$(addprefix $(BINDIR)/,$(c-sched-targets)): \ + $(BINDIR)/%: \ + $(filter-out %.bpf.c,%.c) \ + $(INCLUDE_DIR)/%.bpf.skel.h \ + $(SCX_COMMON_DEPS) + $(eval sched=$(notdir $@)) + $(CC) $(CFLAGS) -c $(sched).c -o $(SCXOBJ_DIR)/$(sched).o + $(CC) -o $@ $(SCXOBJ_DIR)/$(sched).o $(HOST_BPFOBJ) $(LDFLAGS) + +$(c-sched-targets): %: $(BINDIR)/% + +install: all + $(Q)mkdir -p $(DESTDIR)/usr/local/bin/ + $(Q)cp $(BINDIR)/* $(DESTDIR)/usr/local/bin/ + +clean: + rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR) + rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h + rm -f $(c-sched-targets) + +help: + @echo 'Building targets' + @echo '================' + @echo '' + @echo ' all - Compile all schedulers' + @echo '' + @echo 'Alternatively, you may compile individual schedulers:' + @echo '' + @printf ' %s\n' $(c-sched-targets) + @echo '' + @echo 'For any scheduler build target, you may specify an alternative' + @echo 'build output path with the O= environment variable. For example:' + @echo '' + @echo ' O=/tmp/sched_ext make all' + @echo '' + @echo 'will compile all schedulers, and emit the build artifacts to' + @echo '/tmp/sched_ext/build.' + @echo '' + @echo '' + @echo 'Installing targets' + @echo '==================' + @echo '' + @echo ' install - Compile and install all schedulers to /usr/bin.' + @echo ' You may specify the DESTDIR= environment variable' + @echo ' to indicate a prefix for /usr/bin. For example:' + @echo '' + @echo ' DESTDIR=/tmp/sched_ext make install' + @echo '' + @echo ' will build the schedulers in CWD/build, and' + @echo ' install the schedulers to /tmp/sched_ext/usr/bin.' + @echo '' + @echo '' + @echo 'Cleaning targets' + @echo '================' + @echo '' + @echo ' clean - Remove all generated files' + +all_targets: $(c-sched-targets) + +.PHONY: all all_targets $(c-sched-targets) clean help + +# delete failed targets +.DELETE_ON_ERROR: + +# keep intermediate (.bpf.skel.h, .bpf.o, etc) targets +.SECONDARY: diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md new file mode 100644 index 000000000..407327494 --- /dev/null +++ b/tools/sched_ext/README.md @@ -0,0 +1,269 @@ +SCHED_EXT EXAMPLE SCHEDULERS +============================ + +# Introduction + +This directory contains a number of example sched_ext schedulers. These +schedulers are meant to provide examples of different types of schedulers +that can be built using sched_ext, and illustrate how various features of +sched_ext can be used. + +Some of the examples are performant, production-ready schedulers. That is, for +the correct workload and with the correct tuning, they may be deployed in a +production environment with acceptable or possibly even improved performance. +Others are just examples that in practice, would not provide acceptable +performance (though they could be improved to get there). 
+ +This README describes these example schedulers, including the types of +workloads or scenarios they're designed to accommodate, and whether or +not they're production ready. For more details on any of these schedulers, +please see the header comment in their .bpf.c file. + + +# Compiling the examples + +There are a few toolchain dependencies for compiling the example schedulers. + +## Toolchain dependencies + +1. clang >= 16.0.0 + +The schedulers are BPF programs, and therefore must be compiled with clang. gcc +is actively working on adding a BPF backend compiler as well, but it is still +missing some features such as BTF type tags which are necessary for using +kptrs. + +2. pahole >= 1.25 + +You may need pahole in order to generate BTF from DWARF. + +3. rust >= 1.70.0 + +Rust schedulers use features present in the rust toolchain >= 1.70.0. You +should be able to use the stable build from rustup, but if that doesn't +work, try using the rustup nightly build. + +There are other requirements as well, such as make, but these are the main / +non-trivial ones. + +## Compiling the kernel + +In order to run a sched_ext scheduler, you'll have to run a kernel compiled +with the patches in this repository, and with a minimum set of necessary +Kconfig options: + +``` +CONFIG_BPF=y +CONFIG_SCHED_CLASS_EXT=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT=y +CONFIG_DEBUG_INFO_BTF=y +``` + +It's also recommended that you include the following Kconfig options: + +``` +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_JIT_DEFAULT_ON=y +CONFIG_PAHOLE_HAS_SPLIT_BTF=y +CONFIG_PAHOLE_HAS_BTF_TAG=y +``` + +There is a `Kconfig` file in this directory whose contents you can append to +your local `.config` file, as long as there are no conflicts with any existing +options in the file. + +## Getting a vmlinux.h file + +You may notice that most of the example schedulers include a "vmlinux.h" file. +This is a large, auto-generated header file that contains all of the types +defined in some vmlinux binary that was compiled with +[BTF](https://docs.kernel.org/bpf/btf.html) (i.e. with the BTF-related Kconfig +options specified above). + +The header file is created using `bpftool`, by passing it a vmlinux binary +compiled with BTF as follows: + +```bash +$ bpftool btf dump file /path/to/vmlinux format c > vmlinux.h +``` + +`bpftool` analyzes all of the BTF encodings in the binary, and produces a +header file that can be included by BPF programs to access those types. For +example, using vmlinux.h allows a scheduler to access fields defined directly +in vmlinux as follows: + +```c +#include "vmlinux.h" +// vmlinux.h is also implicitly included by scx_common.bpf.h. +#include "scx_common.bpf.h" + +/* + * vmlinux.h provides definitions for struct task_struct and + * struct scx_enable_args. + */ +void BPF_STRUCT_OPS(example_enable, struct task_struct *p, + struct scx_enable_args *args) +{ + bpf_printk("Task %s enabled in example scheduler", p->comm); +} + +// vmlinux.h provides the definition for struct sched_ext_ops. +SEC(".struct_ops.link") +struct sched_ext_ops example_ops = { + .enable = (void *)example_enable, + .name = "example", +}; +``` + +The scheduler build system will generate this vmlinux.h file as part of the +scheduler build pipeline. It looks for a vmlinux file in the following +dependency order: + +1. If the O= environment variable is defined, at `$O/vmlinux` +2. If the KBUILD_OUTPUT= environment variable is defined, at + `$KBUILD_OUTPUT/vmlinux` +3. At `../../vmlinux` (i.e.
at the root of the kernel tree where you're + compiling the schedulers) +4. `/sys/kernel/btf/vmlinux` +5. `/boot/vmlinux-$(uname -r)` + +In other words, if you have compiled a kernel in your local repo, its vmlinux +file will be used to generate vmlinux.h. Otherwise, it will be the vmlinux of +the kernel you're currently running on. This means that if you're running on a +kernel with sched_ext support, you may not need to compile a local kernel at +all. + +### Aside on CO-RE + +One of the cooler features of BPF is that it supports +[CO-RE](https://nakryiko.com/posts/bpf-core-reference-guide/) (Compile Once Run +Everywhere). This feature allows you to reference fields inside of structs with +types defined internal to the kernel, and not have to recompile if you load the +BPF program on a different kernel with the field at a different offset. In our +example above, we print out a task name with `p->comm`. CO-RE would perform +relocations for that access when the program is loaded to ensure that it's +referencing the correct offset for the currently running kernel. + +## Compiling the schedulers + +Once you have your toolchain set up, and a vmlinux that can be used to generate +a full vmlinux.h file, you can compile the schedulers using `make`: + +```bash +$ make -j$(nproc) +``` + +# Schedulers + +This directory contains the following simple schedulers as examples. For +more, visit https://github.com/sched-ext/scx. + +-------------------------------------------------------------------------------- + +## scx_simple + +### Overview + +A simple scheduler that provides an example of a minimal sched_ext +scheduler. scx_simple can be run in either global weighted vtime mode, or +FIFO mode. + +### Typical Use Case + +Though very simple, this scheduler should perform reasonably well on +single-socket CPUs with a uniform L3 cache topology. Note that while running in +global FIFO mode may work well for some workloads, saturating threads can +easily drown out inactive ones. + +### Production Ready? + +This scheduler could be used in a production environment, assuming the hardware +constraints enumerated above, and assuming the workload can accommodate a +simple scheduling policy. + +-------------------------------------------------------------------------------- + +## scx_qmap + +### Overview + +Another simple, yet slightly more complex scheduler that provides an example of +a basic weighted FIFO queuing policy. It also provides examples of some common +useful BPF features, such as sleepable per-task storage allocation in the +`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to +enqueue tasks. It also illustrates how core-sched support could be implemented. + +### Typical Use Case + +Purely used to illustrate sched_ext features. + +### Production Ready? + +No + +-------------------------------------------------------------------------------- + +# Troubleshooting + +There are a number of common issues that you may run into when building the +schedulers. We'll go over some of the common ones here. + +## Build Failures + +### Old version of clang + +``` +error: static assertion failed due to requirement 'SCX_DSQ_FLAG_BUILTIN': bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole + _Static_assert(SCX_DSQ_FLAG_BUILTIN, + ^~~~~~~~~~~~~~~~~~~~ +1 error generated. +``` + +This means you built the kernel or the schedulers with an older version of +clang than what's supported (i.e. older than 16.0.0). To remediate this: + +1.
`which clang` to make sure you're using a sufficiently new version of clang. + +2. `make fullclean` in the root path of the repository, and rebuild the kernel + and schedulers. + +3. Rebuild the kernel, and then your example schedulers. + +The schedulers are also cleaned if you invoke `make mrproper` in the root +directory of the tree. + +### Stale kernel build / incomplete vmlinux.h file + +As described above, you'll need a `vmlinux.h` file that was generated from a +vmlinux built with BTF, and with sched_ext support enabled. If you don't, +you'll see errors such as the following which indicate that a type being +referenced in a scheduler is unknown: + +``` +/path/to/sched_ext/tools/sched_ext/user_exit_info.h:25:23: note: forward declaration of 'struct scx_exit_info' + +const struct scx_exit_info *ei) + +^ +``` + +In order to resolve this, please follow the steps above in +[Getting a vmlinux.h file](#getting-a-vmlinuxh-file) in order to ensure your +schedulers are using a vmlinux.h file that includes the requisite types. + +## Misc + +### llvm: [OFF] + +You may see the following output when building the schedulers: + +``` +Auto-detecting system features: +... clang-bpf-co-re: [ on ] +... llvm: [ OFF ] +... libcap: [ on ] +... libbfd: [ on ] +``` + +Seeing `llvm: [ OFF ]` here is not an issue. You can safely ignore. diff --git a/tools/sched_ext/include/bpf-compat/gnu/stubs.h b/tools/sched_ext/include/bpf-compat/gnu/stubs.h new file mode 100644 index 000000000..ad7d139ce --- /dev/null +++ b/tools/sched_ext/include/bpf-compat/gnu/stubs.h @@ -0,0 +1,11 @@ +/* + * Dummy gnu/stubs.h. clang can end up including /usr/include/gnu/stubs.h when + * compiling BPF files although its content doesn't play any role. The file in + * turn includes stubs-64.h or stubs-32.h depending on whether __x86_64__ is + * defined. When compiling a BPF source, __x86_64__ isn't set and thus + * stubs-32.h is selected. However, the file is not there if the system doesn't + * have 32bit glibc devel package installed leading to a build failure. + * + * The problem is worked around by making this file available in the include + * search paths before the system one when building BPF. + */ diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h new file mode 100644 index 000000000..f2336d357 --- /dev/null +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -0,0 +1,245 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef __SCHED_EXT_COMMON_BPF_H +#define __SCHED_EXT_COMMON_BPF_H + +#include "vmlinux.h" +#include +#include +#include +#include "user_exit_info.h" + +#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ +#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ +#define PF_EXITING 0x00000004 +#define CLOCK_MONOTONIC 1 + +/* + * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can + * lead to really confusing misbehaviors. Let's trigger a build failure. + */ +static inline void ___vmlinux_h_sanity_check___(void) +{ + _Static_assert(SCX_DSQ_FLAG_BUILTIN, + "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole"); +} + +void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; + +static inline __attribute__((format(printf, 1, 2))) +void ___scx_bpf_error_format_checker(const char *fmt, ...) 
{} + +/* + * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments + * instead of an array of u64. Note that __param[] must have at least one + * element to keep the verifier happy. + */ +#define scx_bpf_error(fmt, args...) \ +({ \ + static char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ + \ + ___scx_bpf_error_format_checker(fmt, ##args); \ +}) + +void scx_bpf_switch_all(void) __ksym; +s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; +bool scx_bpf_consume(u64 dsq_id) __ksym; +u32 scx_bpf_dispatch_nr_slots(void) __ksym; +void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; +void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym; +void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; +s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; +bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; +s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; +s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; +const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; +const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; +void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; +void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; +bool scx_bpf_task_running(const struct task_struct *p) __ksym; +s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; +struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; +u32 scx_bpf_reenqueue_local(void) __ksym; + +#define BPF_STRUCT_OPS(name, args...) \ +SEC("struct_ops/"#name) \ +BPF_PROG(name, ##args) + +#define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ +SEC("struct_ops.s/"#name) \ +BPF_PROG(name, ##args) + +/** + * RESIZABLE_ARRAY - Generates annotations for an array that may be resized + * @elfsec: the data section of the BPF program in which to place the array + * @arr: the name of the array + * + * libbpf has an API for setting map value sizes. Since data sections (i.e. + * bss, data, rodata) themselves are maps, a data section can be resized. If + * a data section has an array as its last element, the BTF info for that + * array will be adjusted so that length of the array is extended to meet the + * new length of the data section. This macro annotates an array to have an + * element count of one with the assumption that this array can be resized + * within the userspace program. It also annotates the section specifier so + * this array exists in a custom sub data section which can be resized + * independently. + * + * See RESIZE_ARRAY() for the userspace convenience macro for resizing an + * array declared with RESIZABLE_ARRAY(). + */ +#define RESIZABLE_ARRAY(elfsec, arr) arr[1] SEC("."#elfsec"."#arr) + +/** + * MEMBER_VPTR - Obtain the verified pointer to a struct or array member + * @base: struct or array to index + * @member: dereferenced member (e.g. .field, [idx0][idx1], .field[idx0] ...) + * + * The verifier often gets confused by the instruction sequence the compiler + * generates for indexing struct fields or arrays. 
This macro forces the + * compiler to generate a code sequence which first calculates the byte offset, + * checks it against the struct or array size and adds that byte offset to + * generate the pointer to the member to help the verifier. + * + * Ideally, we want to abort if the calculated offset is out-of-bounds. However, + * BPF currently doesn't support abort, so evaluate to %NULL instead. The caller + * must check for %NULL and take appropriate action to appease the verifier. To + * avoid confusing the verifier, it's best to check for %NULL and dereference + * immediately. + * + * vptr = MEMBER_VPTR(my_array, [i][j]); + * if (!vptr) + * return error; + * *vptr = new_value; + * + * sizeof(@base) should encompass the memory area to be accessed and thus can't + * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of + * `MEMBER_VPTR(ptr, ->member)`. + */ +#define MEMBER_VPTR(base, member) (typeof((base) member) *)({ \ + u64 __base = (u64)&(base); \ + u64 __addr = (u64)&((base) member) - __base; \ + _Static_assert(sizeof(base) >= sizeof((base) member), \ + "@base is smaller than @member, is @base a pointer?"); \ + asm volatile ( \ + "if %0 <= %[max] goto +2\n" \ + "%0 = 0\n" \ + "goto +1\n" \ + "%0 += %1\n" \ + : "+r"(__addr) \ + : "r"(__base), \ + [max]"i"(sizeof(base) - sizeof((base) member))); \ + __addr; \ +}) + +/** + * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element + * @arr: array to index into + * @i: array index + * @n: number of elements in array + * + * Similar to MEMBER_VPTR() but is intended for use with arrays where the + * element count needs to be explicit. + * It can be used in cases where a global array is defined with an initial + * size but is intended to be resized before loading the BPF program. + * Without this version of the macro, MEMBER_VPTR() will use the compile time + * size of the array to compute the max, which will result in rejection by + * the verifier. + */ +#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *)({ \ + u64 __base = (u64)arr; \ + u64 __addr = (u64)&(arr[i]) - __base; \ + asm volatile ( \ + "if %0 <= %[max] goto +2\n" \ + "%0 = 0\n" \ + "goto +1\n" \ + "%0 += %1\n" \ + : "+r"(__addr) \ + : "r"(__base), \ + [max]"r"(sizeof(arr[0]) * ((n) - 1))); \ + __addr; \ +}) + +/* + * BPF core and other generic helpers + */ + +/* list and rbtree */ +#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node))) +#define private(name) SEC(".data."
#name) __hidden __attribute__((aligned(8))) + +void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym; +void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; + +#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL)) +#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL) + +void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym; +struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym; +struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, + struct bpf_rb_node *node) __ksym; +int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, + bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), + void *meta, __u64 off) __ksym; +#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0) + +struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; + +/* task */ +struct task_struct *bpf_task_from_pid(s32 pid) __ksym; +struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; +void bpf_task_release(struct task_struct *p) __ksym; + +/* cgroup */ +struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym; +void bpf_cgroup_release(struct cgroup *cgrp) __ksym; +struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; + +/* cpumask */ +struct bpf_cpumask *bpf_cpumask_create(void) __ksym; +struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym; +u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym; +u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym; +void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym; +bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +void bpf_cpumask_xor(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym; +bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym; +void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym; +u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym; +u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, + const struct cpumask *src2) __ksym; + +/* rcu */ +void bpf_rcu_read_lock(void) __ksym; +void bpf_rcu_read_unlock(void) __ksym; + +#endif /* __SCHED_EXT_COMMON_BPF_H */ diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h new file 
mode 100644 index 000000000..7019d9f2d --- /dev/null +++ b/tools/sched_ext/include/scx/common.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 Tejun Heo + * Copyright (c) 2023 David Vernet + */ +#ifndef __SCHED_EXT_COMMON_H +#define __SCHED_EXT_COMMON_H + +#ifdef __KERNEL__ +#error "Should not be included by BPF programs" +#endif + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "user_exit_info.h" + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; + +#define SCX_BUG(__fmt, ...) \ + do { \ + fprintf(stderr, "%s:%d [scx panic]: %s\n", __FILE__, __LINE__, \ + strerror(errno)); \ + fprintf(stderr, __fmt __VA_OPT__(,) __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + \ + exit(EXIT_FAILURE); \ + } while (0) + +#define SCX_BUG_ON(__cond, __fmt, ...) \ + do { \ + if (__cond) \ + SCX_BUG((__fmt) __VA_OPT__(,) __VA_ARGS__); \ + } while (0) + +/** + * RESIZE_ARRAY - Convenience macro for resizing a BPF array + * @elfsec: the data section of the BPF program in which the array exists + * @arr: the name of the array + * @n: the desired array element count + * + * For BPF arrays declared with RESIZABLE_ARRAY(), this macro performs two + * operations. It resizes the map which corresponds to the custom data + * section that contains the target array. As a side effect, the BTF info for + * the array is adjusted so that the array length is sized to cover the new + * data section size. The second operation is reassigning the skeleton pointer + * for that custom data section so that it points to the newly memory mapped + * region. + */ +#define RESIZE_ARRAY(elfsec, arr, n) \ + do { \ + size_t __sz; \ + bpf_map__set_value_size(skel->maps.elfsec##_##arr, \ + sizeof(skel->elfsec##_##arr->arr[0]) * (n)); \ + skel->elfsec##_##arr = \ + bpf_map__initial_value(skel->maps.elfsec##_##arr, &__sz); \ + } while (0) + +#endif /* __SCHED_EXT_COMMON_H */ diff --git a/tools/sched_ext/include/scx/ravg.bpf.h b/tools/sched_ext/include/scx/ravg.bpf.h new file mode 100644 index 000000000..a233d85d0 --- /dev/null +++ b/tools/sched_ext/include/scx/ravg.bpf.h @@ -0,0 +1,42 @@ +#ifndef __SCX_RAVG_BPF_H__ +#define __SCX_RAVG_BPF_H__ + +/* + * Running average helpers to be used in BPF progs. Assumes vmlinux.h has + * already been included. + */ +enum ravg_consts { + RAVG_VAL_BITS = 44, /* input values are 44bit */ + RAVG_FRAC_BITS = 20, /* 1048576 is 1.0 */ +}; + +/* + * Running avg mechanism. Accumulates values between 0 and RAVG_MAX_VAL in + * arbitrary time intervals. The accumulated values are halved every half_life + * with each period starting when the current time % half_life is 0. Zeroing is + * enough for initialization. + * + * See ravg_accumulate() and ravg_read() for more details. + */ +struct ravg_data { + /* current value */ + u64 val; + + /* + * The timestamp of @val. The latest completed seq #: + * + * (val_at / half_life) - 1 + */ + u64 val_at; + + /* running avg as of the latest completed seq */ + u64 old; + + /* + * Accumulated value of the current period. Input values are 44bit and + * durations are normalized to 20bit fractions, so it should fit in a u64.
+ */ + u64 cur; +}; + +#endif /* __SCX_RAVG_BPF_H__ */ diff --git a/tools/sched_ext/include/scx/ravg_impl.bpf.h b/tools/sched_ext/include/scx/ravg_impl.bpf.h new file mode 100644 index 000000000..4922a3e68 --- /dev/null +++ b/tools/sched_ext/include/scx/ravg_impl.bpf.h @@ -0,0 +1,358 @@ +/* to be included in the main bpf.c file */ +#include "ravg.bpf.h" + +#define RAVG_FN_ATTRS inline __attribute__((unused, always_inline)) + +static RAVG_FN_ATTRS void ravg_add(u64 *sum, u64 addend) +{ + u64 new = *sum + addend; + + if (new >= *sum) + *sum = new; + else + *sum = -1; +} + +static RAVG_FN_ATTRS u64 ravg_decay(u64 v, u32 shift) +{ + if (shift >= 64) + return 0; + else + return v >> shift; +} + +static RAVG_FN_ATTRS u32 ravg_normalize_dur(u32 dur, u32 half_life) +{ + if (dur < half_life) + return (((u64)dur << RAVG_FRAC_BITS) + half_life - 1) / + half_life; + else + return 1 << RAVG_FRAC_BITS; +} + +/* + * Pre-computed decayed full-period values. This is quicker and keeps the bpf + * verifier happy by removing the need for looping. + * + * [0] = ravg_decay(1 << RAVG_FRAC_BITS, 1) + * [1] = [0] + ravg_decay(1 << RAVG_FRAC_BITS, 2) + * [2] = [1] + ravg_decay(1 << RAVG_FRAC_BITS, 3) + * ... + */ +static u64 ravg_full_sum[] = { + 524288, 786432, 917504, 983040, + 1015808, 1032192, 1040384, 1044480, + 1046528, 1047552, 1048064, 1048320, + 1048448, 1048512, 1048544, 1048560, + 1048568, 1048572, 1048574, 1048575, + /* the same from here on */ +}; + +static const int ravg_full_sum_len = sizeof(ravg_full_sum) / sizeof(ravg_full_sum[0]); + +/** + * ravg_accumulate - Accumulate a new value + * @rd: ravg_data to accumulate into + * @new_val: new value + * @now: current timestamp + * @half_life: decay period, must be the same across calls + * + * The current value is changing to @val at @now. Accumulate accordingly. + */ +static RAVG_FN_ATTRS void ravg_accumulate(struct ravg_data *rd, u64 new_val, u64 now, + u32 half_life) +{ + u32 cur_seq, val_seq, seq_delta; + + /* + * It may be difficult for the caller to guarantee monotonic progress if + * multiple CPUs accumulate to the same ravg_data. Handle @now being in + * the past of @rd->val_at. + */ + if (now < rd->val_at) + now = rd->val_at; + + cur_seq = now / half_life; + val_seq = rd->val_at / half_life; + seq_delta = cur_seq - val_seq; + + /* + * Decay ->old and fold ->cur into it. + * + * @end + * v + * timeline |---------|---------|---------|---------|---------| + * seq delta 4 3 2 1 0 + * seq ->seq cur_seq + * val ->old ->cur ^ + * | | | + * \---------+------------------/ + */ + if (seq_delta > 0) { + /* decay ->old to bring it upto the cur_seq - 1 */ + rd->old = ravg_decay(rd->old, seq_delta); + /* non-zero ->cur must be from val_seq, calc and fold */ + ravg_add(&rd->old, ravg_decay(rd->cur, seq_delta)); + /* clear */ + rd->cur = 0; + } + + if (!rd->val) + goto out; + + /* + * Accumulate @rd->val between @rd->val_at and @now. 
+ * + * @rd->val_at @now + * v v + * timeline |---------|---------|---------|---------|---------| + * seq delta [ 3 | 2 | 1 | 0 ] + */ + if (seq_delta > 0) { + u32 dur; + + /* fold the oldest period which may be partial */ + dur = ravg_normalize_dur(half_life - rd->val_at % half_life, half_life); + ravg_add(&rd->old, rd->val * ravg_decay(dur, seq_delta)); + + /* fold the full periods in the middle with precomputed vals */ + if (seq_delta > 1) { + u32 idx = seq_delta - 2; + + if (idx >= ravg_full_sum_len) + idx = ravg_full_sum_len - 1; + + ravg_add(&rd->old, rd->val * ravg_full_sum[idx]); + } + + /* accumulate the current period duration into ->cur */ + rd->cur += rd->val * ravg_normalize_dur(now % half_life, + half_life); + } else { + rd->cur += rd->val * ravg_normalize_dur(now - rd->val_at, + half_life); + } +out: + if (new_val >= 1LLU << RAVG_VAL_BITS) + rd->val = (1LLU << RAVG_VAL_BITS) - 1; + else + rd->val = new_val; + rd->val_at = now; +} + +/** + * ravg_transfer - Transfer in or out a component running avg + * @base: ravg_data to transfer @xfer into or out of + * @base_new_val: new value for @base + * @xfer: ravg_data to transfer + * @xfer_new_val: new value for @xfer + * @is_xfer_in: transfer direction + * + * An ravg may be a sum of component ravgs. For example, a scheduling domain's + * load is the sum of the load values of all member tasks. If a task is migrated + * to a different domain, its contribution should be subtracted from the source + * ravg and added to the destination one. + * + * This function can be used for such component transfers. Both @base and @xfer + * must have been accumulated at the same timestamp. @xfer's contribution is + * subtracted if @is_fer_in is %false and added if %true. + */ +static RAVG_FN_ATTRS void ravg_transfer(struct ravg_data *base, u64 base_new_val, + struct ravg_data *xfer, u64 xfer_new_val, + u32 half_life, bool is_xfer_in) +{ + /* synchronize @base and @xfer */ + if ((s64)(base->val_at - xfer->val_at) < 0) + ravg_accumulate(base, base_new_val, xfer->val_at, half_life); + else if ((s64)(base->val_at - xfer->val_at) > 0) + ravg_accumulate(xfer, xfer_new_val, base->val_at, half_life); + + /* transfer */ + if (is_xfer_in) { + base->old += xfer->old; + base->cur += xfer->cur; + } else { + if (base->old > xfer->old) + base->old -= xfer->old; + else + base->old = 0; + + if (base->cur > xfer->cur) + base->cur -= xfer->cur; + else + base->cur = 0; + } +} + +/** + * u64_x_u32_rshift - Calculate ((u64 * u32) >> rshift) + * @a: multiplicand + * @b: multiplier + * @rshift: number of bits to shift right + * + * Poor man's 128bit arithmetic. Calculate ((@a * @b) >> @rshift) where @a is + * u64 and @b is u32 and (@a * @b) may be bigger than #U64_MAX. The caller must + * ensure that the final shifted result fits in u64. 
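+ *
+ * For example, multiplying a 2^40 value by a 2^30 value overflows u64, but
+ * with @rshift = 32 the final result is 2^38, which is returned correctly.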
+ */ +static inline __attribute__((always_inline)) +u64 u64_x_u32_rshift(u64 a, u32 b, u32 rshift) +{ + const u64 mask32 = (u32)-1; + u64 al = a & mask32; + u64 ah = (a & (mask32 << 32)) >> 32; + + /* + * ah: high 32 al: low 32 + * a |--------------||--------------| + * + * ah * b |--------------||--------------| + * al * b |--------------||--------------| + */ + al *= b; + ah *= b; + + /* + * (ah * b) >> rshift |--------------||--------------| + * (al * b) >> rshift |--------------||--------| + * <--------> + * 32 - rshift + */ + al >>= rshift; + if (rshift <= 32) + ah <<= 32 - rshift; + else + ah >>= rshift - 32; + + return al + ah; +} + +/** + * ravg_scale - Scale a running avg + * @rd: ravg_data to scale + * @mult: multipler + * @rshift: right shift amount + * + * Scale @rd by multiplying the tracked values by @mult and shifting right by + * @rshift. + */ +static RAVG_FN_ATTRS void ravg_scale(struct ravg_data *rd, u32 mult, u32 rshift) +{ + rd->val = u64_x_u32_rshift(rd->val, mult, rshift); + rd->old = u64_x_u32_rshift(rd->old, mult, rshift); + rd->cur = u64_x_u32_rshift(rd->cur, mult, rshift); +} + +/** + * ravg_read - Read the current running avg + * @rd: ravg_data to read from + * @now: timestamp as of which to read the running avg + * @half_life: decay period, must match ravg_accumulate()'s + * + * Read running avg from @rd as of @now. + */ +static RAVG_FN_ATTRS u64 ravg_read(struct ravg_data *rd, u64 now, u64 half_life) +{ + struct ravg_data trd; + u32 elapsed; + + /* + * It may be difficult for the caller to guarantee monotonic progress if + * multiple CPUs accumulate to the same ravg_data. Handle @now being in + * the past of @rd->val_at. + */ + if (now < rd->val_at) + now = rd->val_at; + + elapsed = now % half_life; + + /* + * Accumulate the ongoing period into a temporary copy. This allows + * external readers to access up-to-date avg without strongly + * synchronizing with the updater (we need to add a seq lock tho). + */ + trd = *rd; + rd = &trd; + ravg_accumulate(rd, 0, now, half_life); + + /* + * At the beginning of a new half_life period, the running avg is the + * same as @rd->old. At the beginning of the next, it'd be old load / 2 + * + current load / 2. Inbetween, we blend the two linearly. + */ + if (elapsed) { + u32 progress = ravg_normalize_dur(elapsed, half_life); + /* + * `H` is the duration of the half-life window, and `E` is how + * much time has elapsed in this window. `P` is [0.0, 1.0] + * representing how much the current window has progressed: + * + * P = E / H + * + * If `old` is @rd->old, we would want to calculate the + * following for blending: + * + * old * (1.0 - P / 2) + * + * Because @progress is [0, 1 << RAVG_FRAC_BITS], let's multiply + * and then divide by 1 << RAVG_FRAC_BITS: + * + * (1 << RAVG_FRAC_BITS) - (1 << RAVG_FRAC_BITS) * P / 2 + * old * ----------------------------------------------------- + * 1 << RAVG_FRAC_BITS + * + * As @progress is (1 << RAVG_FRAC_BITS) * P: + * + * (1 << RAVG_FRAC_BITS) - progress / 2 + * old * ------------------------------------ + * 1 << RAVG_FRAC_BITS + * + * As @rd->old uses full 64bit, the multiplication can overflow, + * but we also know that the final result is gonna be smaller + * than @rd->old and thus fit. Use u64_x_u32_rshift() to handle + * the interim multiplication correctly. 
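+ *
+ * For example, halfway through a window (P = 0.5), @progress is
+ * 1 << (RAVG_FRAC_BITS - 1) and the blend works out to
+ * 0.75 * old + cur / 2.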
+ */ + u64 old = u64_x_u32_rshift(rd->old, + (1 << RAVG_FRAC_BITS) - progress / 2, + RAVG_FRAC_BITS); + /* + * If `S` is the Sum(val * duration) for this half-life window, + * the avg for this window is: + * + * S / E + * + * We would want to calculate the following for blending: + * + * S / E * (P / 2) + * + * As P = E / H, + * + * S / E * (E / H / 2) + * S / H / 2 + * + * Expanding S, the above becomes: + * + * Sum(val * duration) / H / 2 + * Sum(val * (duration / H)) / 2 + * + * As we use RAVG_FRAC_BITS bits for fixed point arithmetic, + * let's multiply the whole result accordingly: + * + * (Sum(val * (duration / H)) / 2) * (1 << RAVG_FRAC_BITS) + * + * duration * (1 << RAVG_FRAC_BITS) + * Sum(val * --------------------------------) / 2 + * H + * + * The righthand multiplier inside Sum() is the normalized + * duration returned from ravg_normalize_dur(), so, the whole + * Sum term equals @rd->cur. + * + * rd->cur / 2 + */ + u64 cur = rd->cur / 2; + + return old + cur; + } else { + return rd->old; + } +} diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h new file mode 100644 index 000000000..a51b0baf1 --- /dev/null +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef __USER_EXIT_INFO_H +#define __USER_EXIT_INFO_H + +enum uei_sizes { + UEI_REASON_SIZE = 128, + UEI_MSG_SIZE = 1024, + UEI_DUMP_SIZE = 32768, +}; + +struct user_exit_info { + int kind; + char reason[UEI_REASON_SIZE]; + char msg[UEI_MSG_SIZE]; + char dump[UEI_DUMP_SIZE]; +}; + +#ifdef __bpf__ + +#include "vmlinux.h" +#include + +static inline void uei_record(struct user_exit_info *uei, + const struct scx_exit_info *ei) +{ + bpf_probe_read_kernel_str(uei->reason, sizeof(uei->reason), ei->reason); + bpf_probe_read_kernel_str(uei->msg, sizeof(uei->msg), ei->msg); + bpf_probe_read_kernel_str(uei->dump, sizeof(uei->dump), ei->dump); + /* use __sync to force memory barrier */ + __sync_val_compare_and_swap(&uei->kind, uei->kind, ei->kind); +} + +#else /* !__bpf__ */ + +#include +#include + +static inline bool uei_exited(struct user_exit_info *uei) +{ + /* use __sync to force memory barrier */ + return __sync_val_compare_and_swap(&uei->kind, -1, -1); +} + +static inline void uei_print(const struct user_exit_info *uei) +{ + if (uei->dump[0] != '\0') { + fputs("\nDEBUG DUMP\n", stderr); + fputs("================================================================================\n\n", stderr); + fputs(uei->dump, stderr); + fputs("\n================================================================================\n\n", stderr); + } + fprintf(stderr, "EXIT: %s", uei->reason); + if (uei->msg[0] != '\0') + fprintf(stderr, " (%s)", uei->msg); + fputs("\n", stderr); +} + +#endif /* __bpf__ */ +#endif /* __USER_EXIT_INFO_H */ diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c new file mode 100644 index 000000000..2fb75543a --- /dev/null +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -0,0 +1,400 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A simple five-level FIFO queue scheduler. + * + * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets + * assigned to one depending on its compound weight. 
Each CPU round robins + * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from + * queue0, 2 from queue1, 4 from queue2 and so on. + * + * This scheduler demonstrates: + * + * - BPF-side queueing using PIDs. + * - Sleepable per-task storage allocation using ops.prep_enable(). + * - Using ops.cpu_release() to handle a higher priority scheduling class taking + * the CPU away. + * - Core-sched support. + * + * This scheduler is primarily for demonstration and testing of sched_ext + * features and unlikely to be useful for actual workloads. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include + +char _license[] SEC("license") = "GPL"; + +const volatile u64 slice_ns = SCX_SLICE_DFL; +const volatile bool switch_partial; +const volatile u32 stall_user_nth; +const volatile u32 stall_kernel_nth; +const volatile u32 dsp_inf_loop_after; +const volatile s32 disallow_tgid; + +u32 test_error_cnt; + +struct user_exit_info uei; + +struct qmap { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, 4096); + __type(value, u32); +} queue0 SEC(".maps"), + queue1 SEC(".maps"), + queue2 SEC(".maps"), + queue3 SEC(".maps"), + queue4 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 5); + __type(key, int); + __array(values, struct qmap); +} queue_arr SEC(".maps") = { + .values = { + [0] = &queue0, + [1] = &queue1, + [2] = &queue2, + [3] = &queue3, + [4] = &queue4, + }, +}; + +/* + * Per-queue sequence numbers to implement core-sched ordering. + * + * Tail seq is assigned to each queued task and incremented. Head seq tracks the + * sequence number of the latest dispatched task. The distance between the a + * task's seq and the associated queue's head seq is called the queue distance + * and used when comparing two tasks for ordering. See qmap_core_sched_before(). + */ +static u64 core_sched_head_seqs[5]; +static u64 core_sched_tail_seqs[5]; + +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* Dispatch directly to local_dsq */ + u64 core_sched_seq; +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +/* Per-cpu dispatch index and remaining count */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 2); + __type(key, u32); + __type(value, u64); +} dispatch_idx_cnt SEC(".maps"); + +/* Statistics */ +u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; +u64 nr_core_sched_execed; + +s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + struct task_ctx *tctx; + s32 cpu; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return -ESRCH; + } + + if (p->nr_cpus_allowed == 1 || + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + tctx->force_local = true; + return prev_cpu; + } + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + return cpu; + + return prev_cpu; +} + +static int weight_to_idx(u32 weight) +{ + /* Coarsely map the compound weight to a FIFO. 
*/ + if (weight <= 25) + return 0; + else if (weight <= 50) + return 1; + else if (weight < 200) + return 2; + else if (weight < 400) + return 3; + else + return 4; +} + +void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) +{ + static u32 user_cnt, kernel_cnt; + struct task_ctx *tctx; + u32 pid = p->pid; + int idx = weight_to_idx(p->scx.weight); + void *ring; + + if (p->flags & PF_KTHREAD) { + if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) + return; + } else { + if (stall_user_nth && !(++user_cnt % stall_user_nth)) + return; + } + + if (test_error_cnt && !--test_error_cnt) + scx_bpf_error("test triggering error"); + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + /* + * All enqueued tasks must have their core_sched_seq updated for correct + * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in + * qmap_ops.flags. + */ + tctx->core_sched_seq = core_sched_tail_seqs[idx]++; + + /* + * If qmap_select_cpu() is telling us to or this is the last runnable + * task on the CPU, enqueue locally. + */ + if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) { + tctx->force_local = false; + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); + return; + } + + /* + * If the task was re-enqueued due to the CPU being preempted by a + * higher priority scheduling class, just re-enqueue the task directly + * on the global DSQ. As we want another CPU to pick it up, find and + * kick an idle CPU. + */ + if (enq_flags & SCX_ENQ_REENQ) { + s32 cpu; + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, 0, enq_flags); + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + scx_bpf_kick_cpu(cpu, 0); + return; + } + + ring = bpf_map_lookup_elem(&queue_arr, &idx); + if (!ring) { + scx_bpf_error("failed to find ring %d", idx); + return; + } + + /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */ + if (bpf_map_push_elem(ring, &pid, 0)) { + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, enq_flags); + return; + } + + __sync_fetch_and_add(&nr_enqueued, 1); +} + +/* + * The BPF queue map doesn't support removal and sched_ext can handle spurious + * dispatches. qmap_dequeue() is only used to collect statistics. + */ +void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) +{ + __sync_fetch_and_add(&nr_dequeued, 1); + if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC) + __sync_fetch_and_add(&nr_core_sched_execed, 1); +} + +static void update_core_sched_head_seq(struct task_struct *p) +{ + struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + int idx = weight_to_idx(p->scx.weight); + + if (tctx) + core_sched_head_seqs[idx] = tctx->core_sched_seq; + else + scx_bpf_error("task_ctx lookup failed"); +} + +void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) +{ + u32 zero = 0, one = 1; + u64 *idx = bpf_map_lookup_elem(&dispatch_idx_cnt, &zero); + u64 *cnt = bpf_map_lookup_elem(&dispatch_idx_cnt, &one); + void *fifo; + s32 pid; + int i; + + if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { + struct task_struct *p; + + /* + * PID 2 should be kthreadd which should mostly be idle and off + * the scheduler. Let's keep dispatching it to force the kernel + * to call this function over and over again. 
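+ * If kthreadd can't be looked up for some reason, simply fall through
+ * to the regular dispatch path below.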
+ */ + p = bpf_task_from_pid(2); + if (p) { + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, 0); + bpf_task_release(p); + return; + } + } + + if (!idx || !cnt) { + scx_bpf_error("failed to lookup idx[%p], cnt[%p]", idx, cnt); + return; + } + + for (i = 0; i < 5; i++) { + /* Advance the dispatch cursor and pick the fifo. */ + if (!*cnt) { + *idx = (*idx + 1) % 5; + *cnt = 1 << *idx; + } + (*cnt)--; + + fifo = bpf_map_lookup_elem(&queue_arr, idx); + if (!fifo) { + scx_bpf_error("failed to find ring %llu", *idx); + return; + } + + /* Dispatch or advance. */ + if (!bpf_map_pop_elem(fifo, &pid)) { + struct task_struct *p; + + p = bpf_task_from_pid(pid); + if (p) { + update_core_sched_head_seq(p); + __sync_fetch_and_add(&nr_dispatched, 1); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, 0); + bpf_task_release(p); + return; + } + } + + *cnt = 0; + } +} + +/* + * The distance from the head of the queue scaled by the weight of the queue. + * The lower the number, the older the task and the higher the priority. + */ +static s64 task_qdist(struct task_struct *p) +{ + int idx = weight_to_idx(p->scx.weight); + struct task_ctx *tctx; + s64 qdist; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return 0; + } + + qdist = tctx->core_sched_seq - core_sched_head_seqs[idx]; + + /* + * As queue index increments, the priority doubles. The queue w/ index 3 + * is dispatched twice more frequently than 2. Reflect the difference by + * scaling qdists accordingly. Note that the shift amount needs to be + * flipped depending on the sign to avoid flipping priority direction. + */ + if (qdist >= 0) + return qdist << (4 - idx); + else + return qdist << idx; +} + +/* + * This is called to determine the task ordering when core-sched is picking + * tasks to execute on SMT siblings and should encode about the same ordering as + * the regular scheduling path. Use the priority-scaled distances from the head + * of the queues to compare the two tasks which should be consistent with the + * dispatch path behavior. + */ +bool BPF_STRUCT_OPS(qmap_core_sched_before, + struct task_struct *a, struct task_struct *b) +{ + return task_qdist(a) > task_qdist(b); +} + +void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) +{ + u32 cnt; + + /* + * Called when @cpu is taken by a higher priority scheduling class. This + * makes @cpu no longer available for executing sched_ext tasks. As we + * don't want the tasks in @cpu's local dsq to sit there until @cpu + * becomes available again, re-enqueue them into the global dsq. See + * %SCX_ENQ_REENQ handling in qmap_enqueue(). + */ + cnt = scx_bpf_reenqueue_local(); + if (cnt) + __sync_fetch_and_add(&nr_reenqueued, cnt); +} + +s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + if (p->tgid == disallow_tgid) + p->scx.disallow = true; + + /* + * @p is new. Let's ensure that its task_ctx is available. We can sleep + * in this function and the following will automatically use GFP_KERNEL. 
+ */ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +s32 BPF_STRUCT_OPS(qmap_init) +{ + if (!switch_partial) + scx_bpf_switch_all(); + return 0; +} + +void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops qmap_ops = { + .select_cpu = (void *)qmap_select_cpu, + .enqueue = (void *)qmap_enqueue, + .dequeue = (void *)qmap_dequeue, + .dispatch = (void *)qmap_dispatch, + .core_sched_before = (void *)qmap_core_sched_before, + .cpu_release = (void *)qmap_cpu_release, + .init_task = (void *)qmap_init_task, + .init = (void *)qmap_init, + .exit = (void *)qmap_exit, + .flags = SCX_OPS_ENQ_LAST, + .timeout_ms = 5000U, + .name = "qmap", +}; diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c new file mode 100644 index 000000000..7008b9138 --- /dev/null +++ b/tools/sched_ext/scx_qmap.c @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "scx_qmap.bpf.skel.h" + +const char help_fmt[] = +"A simple five-level FIFO queue sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-d PID] [-p]\n" +"\n" +" -s SLICE_US Override slice duration\n" +" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" +" -t COUNT Stall every COUNT'th user thread\n" +" -T COUNT Stall every COUNT'th kernel thread\n" +" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" +" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" +" -p Switch only tasks on SCHED_EXT policy intead of all\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_qmap *skel; + struct bpf_link *link; + int opt; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = scx_qmap__open(); + SCX_BUG_ON(!skel, "Failed to open skel"); + + while ((opt = getopt(argc, argv, "s:e:t:T:l:d:ph")) != -1) { + switch (opt) { + case 's': + skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; + break; + case 'e': + skel->bss->test_error_cnt = strtoul(optarg, NULL, 0); + break; + case 't': + skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0); + break; + case 'T': + skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0); + break; + case 'l': + skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0); + break; + case 'd': + skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); + if (skel->rodata->disallow_tgid < 0) + skel->rodata->disallow_tgid = getpid(); + break; + case 'p': + skel->rodata->switch_partial = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + SCX_BUG_ON(scx_qmap__load(skel), "Failed to load skel"); + + link = bpf_map__attach_struct_ops(skel->maps.qmap_ops); + SCX_BUG_ON(!link, "Failed to attach struct_ops"); + + while (!exit_req && !uei_exited(&skel->bss->uei)) { + long nr_enqueued = skel->bss->nr_enqueued; + long nr_dispatched = skel->bss->nr_dispatched; + + printf("enq=%lu, dsp=%lu, delta=%ld, reenq=%" PRIu64 ", deq=%" PRIu64 ", core=%" 
PRIu64 "\n", + nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, + skel->bss->nr_reenqueued, skel->bss->nr_dequeued, + skel->bss->nr_core_sched_execed); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + uei_print(&skel->bss->uei); + scx_qmap__destroy(skel); + return 0; +} diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py new file mode 100644 index 000000000..d457d2a74 --- /dev/null +++ b/tools/sched_ext/scx_show_state.py @@ -0,0 +1,39 @@ +#!/usr/bin/env drgn +# +# Copyright (C) 2024 Tejun Heo +# Copyright (C) 2024 Meta Platforms, Inc. and affiliates. + +desc = """ +This is a drgn script to show the current sched_ext state. +For more info on drgn, visit https://github.com/osandov/drgn. +""" + +import drgn +import sys + +def err(s): + print(s, file=sys.stderr, flush=True) + sys.exit(1) + +def read_int(name): + return int(prog[name].value_()) + +def read_atomic(name): + return prog[name].counter.value_() + +def read_static_key(name): + return prog[name].key.enabled.counter.value_() + +def ops_state_str(state): + return prog['scx_ops_enable_state_str'][state].string_().decode() + +ops = prog['scx_ops'] +enable_state = read_atomic("scx_ops_enable_state_var") + +print(f'ops : {ops.name.string_().decode()}') +print(f'enabled : {read_static_key("__scx_ops_enabled")}') +print(f'switching_all : {read_int("scx_switching_all")}') +print(f'switched_all : {read_static_key("__scx_switched_all")}') +print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})') +print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}') +print(f'nr_rejected : {read_atomic("scx_nr_rejected")}') diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c new file mode 100644 index 000000000..95035aa29 --- /dev/null +++ b/tools/sched_ext/scx_simple.bpf.c @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A simple scheduler. + * + * By default, it operates as a simple global weighted vtime scheduler and can + * be switched to FIFO scheduling. It also demonstrates the following niceties. + * + * - Statistics tracking how many tasks are queued to local and global dsq's. + * - Termination notification for userspace. + * + * While very simple, this scheduler should work reasonably well on CPUs with a + * uniform L3 cache topology. While preemption is not implemented, the fact that + * the scheduling queue is shared across all CPUs means that whatever is at the + * front of the queue is likely to be executed fairly quickly given enough + * number of CPUs. The FIFO scheduling mode may be beneficial to some workloads + * but comes with the usual problems with FIFO scheduling where saturating + * threads can easily drown out interactive ones. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include + +char _license[] SEC("license") = "GPL"; + +const volatile bool fifo_sched; +const volatile bool switch_partial; + +static u64 vtime_now; +struct user_exit_info uei; + +#define SHARED_DSQ 0 + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, 2); /* [local, global] */ +} stats SEC(".maps"); + +static void stat_inc(u32 idx) +{ + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); + if (cnt_p) + (*cnt_p)++; +} + +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + bool is_idle = false; + s32 cpu; + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); + if (is_idle) { + stat_inc(0); /* count local queueing */ + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + } + + return cpu; +} + +void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) +{ + stat_inc(1); /* count global queueing */ + + if (fifo_sched) { + scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); + } else { + u64 vtime = p->scx.dsq_vtime; + + /* + * Limit the amount of budget that an idling task can accumulate + * to one slice. + */ + if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) + vtime = vtime_now - SCX_SLICE_DFL; + + scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, + enq_flags); + } +} + +void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev) +{ + scx_bpf_consume(SHARED_DSQ); +} + +void BPF_STRUCT_OPS(simple_running, struct task_struct *p) +{ + if (fifo_sched) + return; + + /* + * Global vtime always progresses forward as tasks start executing. The + * test and update can be performed concurrently from multiple CPUs and + * thus racy. Any error should be contained and temporary. Let's just + * live with it. + */ + if (vtime_before(vtime_now, p->scx.dsq_vtime)) + vtime_now = p->scx.dsq_vtime; +} + +void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) +{ + if (fifo_sched) + return; + + /* + * Scale the execution time by the inverse of the weight and charge. + * + * Note that the default yield implementation yields by setting + * @p->scx.slice to zero and the following would treat the yielding task + * as if it has consumed all its slice. If this penalizes yielding tasks + * too much, determine the execution time by taking explicit timestamps + * instead of depending on @p->scx.slice. 
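+ *
+ * For example, a task with the default weight of 100 that consumed its
+ * whole slice is charged SCX_SLICE_DFL worth of vtime, while a weight
+ * 200 task running the same amount is charged half of that.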
+ */ + p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; +} + +void BPF_STRUCT_OPS(simple_enable, struct task_struct *p) +{ + p->scx.dsq_vtime = vtime_now; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) +{ + if (!switch_partial) + scx_bpf_switch_all(); + + return scx_bpf_create_dsq(SHARED_DSQ, -1); +} + +void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops simple_ops = { + .select_cpu = (void *)simple_select_cpu, + .enqueue = (void *)simple_enqueue, + .dispatch = (void *)simple_dispatch, + .running = (void *)simple_running, + .stopping = (void *)simple_stopping, + .enable = (void *)simple_enable, + .init = (void *)simple_init, + .exit = (void *)simple_exit, + .name = "simple", +}; diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c new file mode 100644 index 000000000..5c5589770 --- /dev/null +++ b/tools/sched_ext/scx_simple.c @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include +#include +#include +#include +#include +#include +#include "scx_simple.bpf.skel.h" + +const char help_fmt[] = +"A simple sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-f] [-p]\n" +"\n" +" -f Use FIFO scheduling instead of weighted vtime scheduling\n" +" -p Switch only tasks on SCHED_EXT policy intead of all\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; + +static void sigint_handler(int simple) +{ + exit_req = 1; +} + +static void read_stats(struct scx_simple *skel, __u64 *stats) +{ + int nr_cpus = libbpf_num_possible_cpus(); + __u64 cnts[2][nr_cpus]; + __u32 idx; + + memset(stats, 0, sizeof(stats[0]) * 2); + + for (idx = 0; idx < 2; idx++) { + int ret, cpu; + + ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), + &idx, cnts[idx]); + if (ret < 0) + continue; + for (cpu = 0; cpu < nr_cpus; cpu++) + stats[idx] += cnts[idx][cpu]; + } +} + +int main(int argc, char **argv) +{ + struct scx_simple *skel; + struct bpf_link *link; + __u32 opt; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = scx_simple__open(); + SCX_BUG_ON(!skel, "Failed to open skel"); + + while ((opt = getopt(argc, argv, "fph")) != -1) { + switch (opt) { + case 'f': + skel->rodata->fifo_sched = true; + break; + case 'p': + skel->rodata->switch_partial = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + SCX_BUG_ON(scx_simple__load(skel), "Failed to load skel"); + + link = bpf_map__attach_struct_ops(skel->maps.simple_ops); + SCX_BUG_ON(!link, "Failed to attach struct_ops"); + + while (!exit_req && !uei_exited(&skel->bss->uei)) { + __u64 stats[2]; + + read_stats(skel, stats); + printf("local=%llu global=%llu\n", stats[0], stats[1]); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + uei_print(&skel->bss->uei); + scx_simple__destroy(skel); + return 0; +} diff --git a/tools/testing/selftests/scx/.gitignore b/tools/testing/selftests/scx/.gitignore new file mode 100644 index 000000000..ae5491a11 --- /dev/null +++ b/tools/testing/selftests/scx/.gitignore @@ -0,0 +1,6 @@ +* +!*.c +!*.h +!Makefile +!.gitignore +!config diff --git a/tools/testing/selftests/scx/Makefile b/tools/testing/selftests/scx/Makefile new file mode 100644 index 
000000000..c8d5231af --- /dev/null +++ b/tools/testing/selftests/scx/Makefile @@ -0,0 +1,210 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. +include ../../../build/Build.include +include ../../../scripts/Makefile.arch +include ../../../scripts/Makefile.include +include ../lib.mk + +ifneq ($(LLVM),) +ifneq ($(filter %/,$(LLVM)),) +LLVM_PREFIX := $(LLVM) +else ifneq ($(filter -%,$(LLVM)),) +LLVM_SUFFIX := $(LLVM) +endif + +CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as +else +CC := gcc +endif # LLVM + +ifneq ($(CROSS_COMPILE),) +$(error CROSS_COMPILE not supported for scx selftests) +endif # CROSS_COMPILE + +CURDIR := $(abspath .) +REPOROOT := $(abspath ../../../..) +TOOLSDIR := $(REPOROOT)/tools +LIBDIR := $(TOOLSDIR)/lib +BPFDIR := $(LIBDIR)/bpf +TOOLSINCDIR := $(TOOLSDIR)/include +BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool +APIDIR := $(TOOLSINCDIR)/uapi +GENDIR := $(REPOROOT)/include/generated +GENHDR := $(GENDIR)/autoconf.h +SCXTOOLSDIR := $(TOOLSDIR)/sched_ext +SCXTOOLSINCDIR := $(TOOLSDIR)/sched_ext/include + +OUTPUT_DIR := $(CURDIR)/build +OBJ_DIR := $(OUTPUT_DIR)/obj +INCLUDE_DIR := $(OUTPUT_DIR)/include +BPFOBJ_DIR := $(OBJ_DIR)/libbpf +SCXOBJ_DIR := $(OBJ_DIR)/sched_ext +BPFOBJ := $(BPFOBJ_DIR)/libbpf.a +LIBBPF_OUTPUT := $(OBJ_DIR)/libbpf/libbpf.a +DEFAULT_BPFTOOL := $(OUTPUT_DIR)/sbin/bpftool +HOST_BUILD_DIR := $(OBJ_DIR) +HOST_OUTPUT_DIR := $(OUTPUT_DIR) + +VMLINUX_BTF_PATHS ?= ../../../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) +ifeq ($(VMLINUX_BTF),) +$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") +endif + +BPFTOOL ?= $(DEFAULT_BPFTOOL) + +ifneq ($(wildcard $(GENHDR)),) + GENFLAGS := -DHAVE_GENHDR +endif + +CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ + -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ + -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include -I$(SCXTOOLSINCDIR) + +# Silence some warnings when compiled with clang +ifneq ($(LLVM),) +CFLAGS += -Wno-unused-command-line-argument +endif + +LDFLAGS = -lelf -lz -lpthread -lzstd + +IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - &1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) -dM -E - $@ +else + $(call msg,CP,,$@) + $(Q)cp "$(VMLINUX_H)" $@ +endif + +$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h | $(BPFOBJ) $(SCXOBJ_DIR) + $(call msg,CLNG-BPF,,$(notdir $@)) + $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ + +$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) | $(INCLUDE_DIR) + $(eval sched=$(notdir $@)) + $(call msg,GEN-SKEL,,$(sched)) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< + $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) + $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) + $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ + $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) + +################ +# C schedulers # +################ + +override define CLEAN + rm -rf $(OUTPUT_DIR) + rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h + rm -f $(TEST_GEN_PROGS) + rm -f runner +endef + +# Every testcase takes all of the BPF progs are dependencies by default. 
This +# allows testcases to load any BPF scheduler, which is useful for testcases +# that don't need their own prog to run their test. +all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubst %.c,%.skel.h,$(prog))) + +auto-test-targets := \ + enq_last_no_enq_fails \ + enq_select_cpu_fails \ + ddsp_bogus_dsq_fail \ + ddsp_vtimelocal_fail \ + init_enable_count \ + maximal \ + maybe_null \ + minimal \ + reload_loop \ + select_cpu_dfl \ + select_cpu_dfl_nodispatch \ + select_cpu_dispatch \ + select_cpu_dispatch_bad_dsq \ + select_cpu_dispatch_dbl_dsp \ + select_cpu_vtime \ + test_example \ + +testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets))) + +$(SCXOBJ_DIR)/runner.o: runner.c | $(SCXOBJ_DIR) + $(CC) $(CFLAGS) -c $< -o $@ + +# Create all of the test targets object files, whose testcase objects will be +# registered into the runner in ELF constructors. +# +# Note that we must do double expansion here in order to support conditionally +# compiling BPF object files only if one is present, as the wildcard Make +# function doesn't support using implicit rules otherwise. +$(testcase-targets): $(SCXOBJ_DIR)/%.o: %.c $(SCXOBJ_DIR)/runner.o $(all_test_bpfprogs) | $(SCXOBJ_DIR) + $(eval test=$(patsubst %.o,%.c,$(notdir $@))) + $(CC) $(CFLAGS) -c $< -o $@ $(SCXOBJ_DIR)/runner.o + +runner: $(SCXOBJ_DIR)/runner.o $(BPFOBJ) $(testcase-targets) + @echo "$(testcase-targets)" + $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) + +TEST_GEN_PROGS := runner + +all: runner + +.PHONY: all clean help + +.DEFAULT_GOAL := all + +.DELETE_ON_ERROR: + +.SECONDARY: diff --git a/tools/testing/selftests/scx/config b/tools/testing/selftests/scx/config new file mode 100644 index 000000000..fef8f81ad --- /dev/null +++ b/tools/testing/selftests/scx/config @@ -0,0 +1,5 @@ +CONFIG_SCHED_DEBUG=y +CONFIG_SCHED_CLASS_EXT=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_SCHED=y +CONFIG_EXT_GROUP_SCHED=y diff --git a/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.bpf.c b/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.bpf.c new file mode 100644 index 000000000..dd32b1899 --- /dev/null +++ b/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.bpf.c @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include + +char _license[] SEC("license") = "GPL"; + +struct user_exit_info uei; + +s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + + if (cpu >= 0) { + /* + * If we dispatch to a bogus DSQ that will fall back to the + * builtin global DSQ, we fail gracefully. 
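+ * Note that 0xcafef00d below is just an arbitrary DSQ id that this
+ * scheduler never created.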
+ */ + scx_bpf_dispatch_vtime(p, 0xcafef00d, SCX_SLICE_DFL, + p->scx.dsq_vtime, 0); + return cpu; + } + + return prev_cpu; +} + +void BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops ddsp_bogus_dsq_fail_ops = { + .select_cpu = ddsp_bogus_dsq_fail_select_cpu, + .exit = ddsp_bogus_dsq_fail_exit, + .init = ddsp_bogus_dsq_fail_init, + .name = "ddsp_bogus_dsq_fail", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c b/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c new file mode 100644 index 000000000..ef8ee04ff --- /dev/null +++ b/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include +#include "ddsp_bogus_dsq_fail.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct ddsp_bogus_dsq_fail *skel; + + skel = ddsp_bogus_dsq_fail__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct ddsp_bogus_dsq_fail *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.ddsp_bogus_dsq_fail_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + sleep(1); + + SCX_EQ(skel->bss->uei.kind, SCX_EXIT_ERROR); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct ddsp_bogus_dsq_fail *skel = ctx; + + ddsp_bogus_dsq_fail__destroy(skel); +} + +struct scx_test ddsp_bogus_dsq_fail = { + .name = "ddsp_bogus_dsq_fail", + .description = "Verify we gracefully fail, and fall back to using a " + "built-in DSQ, if we do a direct dispatch to an invalid" + " DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&ddsp_bogus_dsq_fail) diff --git a/tools/testing/selftests/scx/ddsp_vtimelocal_fail.bpf.c b/tools/testing/selftests/scx/ddsp_vtimelocal_fail.bpf.c new file mode 100644 index 000000000..9b21c1d57 --- /dev/null +++ b/tools/testing/selftests/scx/ddsp_vtimelocal_fail.bpf.c @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include + +char _license[] SEC("license") = "GPL"; + +struct user_exit_info uei; + +s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + + if (cpu >= 0) { + /* Shouldn't be allowed to vtime dispatch to a builtin DSQ. 
*/ + scx_bpf_dispatch_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, + p->scx.dsq_vtime, 0); + return cpu; + } + + return prev_cpu; +} + +void BPF_STRUCT_OPS(ddsp_vtimelocal_fail_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops ddsp_vtimelocal_fail_ops = { + .select_cpu = ddsp_vtimelocal_fail_select_cpu, + .init = ddsp_vtimelocal_fail_init, + .exit = ddsp_vtimelocal_fail_exit, + .name = "ddsp_vtimelocal_fail", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/ddsp_vtimelocal_fail.c b/tools/testing/selftests/scx/ddsp_vtimelocal_fail.c new file mode 100644 index 000000000..b55611cd0 --- /dev/null +++ b/tools/testing/selftests/scx/ddsp_vtimelocal_fail.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include "ddsp_vtimelocal_fail.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct ddsp_vtimelocal_fail *skel; + + skel = ddsp_vtimelocal_fail__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct ddsp_vtimelocal_fail *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.ddsp_vtimelocal_fail_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + sleep(1); + + SCX_EQ(skel->bss->uei.kind, SCX_EXIT_ERROR); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct ddsp_vtimelocal_fail *skel = ctx; + + ddsp_vtimelocal_fail__destroy(skel); +} + +struct scx_test ddsp_vtimelocal_fail = { + .name = "ddsp_vtimelocal_fail", + .description = "Verify we gracefully fail, and fall back to using a " + "built-in DSQ, if we do a direct vtime dispatch to a " + "built-in DSQ from DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&ddsp_vtimelocal_fail) diff --git a/tools/testing/selftests/scx/enq_last_no_enq_fails.bpf.c b/tools/testing/selftests/scx/enq_last_no_enq_fails.bpf.c new file mode 100644 index 000000000..4b0f84568 --- /dev/null +++ b/tools/testing/selftests/scx/enq_last_no_enq_fails.bpf.c @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +s32 BPF_STRUCT_OPS(enq_last_no_enq_fails_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops enq_last_no_enq_fails_ops = { + .init = enq_last_no_enq_fails_init, + .name = "enq_last_no_enq_fails", + /* Need to define ops.enqueue() with SCX_OPS_ENQ_LAST */ + .flags = SCX_OPS_ENQ_LAST, + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/enq_last_no_enq_fails.c b/tools/testing/selftests/scx/enq_last_no_enq_fails.c new file mode 100644 index 000000000..2a3eda5e2 --- /dev/null +++ b/tools/testing/selftests/scx/enq_last_no_enq_fails.c @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "enq_last_no_enq_fails.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct enq_last_no_enq_fails *skel; + + skel = enq_last_no_enq_fails__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct enq_last_no_enq_fails *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.enq_last_no_enq_fails_ops); + if (link) { + SCX_ERR("Incorrectly succeeded in to attaching scheduler"); + return SCX_TEST_FAIL; + } + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct enq_last_no_enq_fails *skel = ctx; + + enq_last_no_enq_fails__destroy(skel); +} + +struct scx_test enq_last_no_enq_fails = { + .name = "enq_last_no_enq_fails", + .description = "Verify we fail to load a scheduler if we specify " + "the SCX_OPS_ENQ_LAST flag without defining " + "ops.enqueue()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&enq_last_no_enq_fails) diff --git a/tools/testing/selftests/scx/enq_select_cpu_fails.bpf.c b/tools/testing/selftests/scx/enq_select_cpu_fails.bpf.c new file mode 100644 index 000000000..40ea393b2 --- /dev/null +++ b/tools/testing/selftests/scx/enq_select_cpu_fails.bpf.c @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +/* Manually specify the signature until the kfunc is added to the scx repo. */ +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + bool *found) __ksym; + +s32 BPF_STRUCT_OPS(enq_select_cpu_fails_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + return prev_cpu; +} + +void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p, + u64 enq_flags) +{ + /* + * Need to initialize the variable or the verifier will fail to load. + * Improving these semantics is actively being worked on. + */ + bool found = false; + + /* Can only call from ops.select_cpu() */ + scx_bpf_select_cpu_dfl(p, 0, 0, &found); + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +s32 BPF_STRUCT_OPS(enq_select_cpu_fails_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops enq_select_cpu_fails_ops = { + .select_cpu = enq_select_cpu_fails_select_cpu, + .enqueue = enq_select_cpu_fails_enqueue, + .init = enq_select_cpu_fails_init, + .name = "enq_select_cpu_fails", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/enq_select_cpu_fails.c b/tools/testing/selftests/scx/enq_select_cpu_fails.c new file mode 100644 index 000000000..dd1350e5f --- /dev/null +++ b/tools/testing/selftests/scx/enq_select_cpu_fails.c @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "enq_select_cpu_fails.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct enq_select_cpu_fails *skel; + + skel = enq_select_cpu_fails__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct enq_select_cpu_fails *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.enq_select_cpu_fails_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + sleep(1); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct enq_select_cpu_fails *skel = ctx; + + enq_select_cpu_fails__destroy(skel); +} + +struct scx_test enq_select_cpu_fails = { + .name = "enq_select_cpu_fails", + .description = "Verify we fail to call scx_bpf_select_cpu_dfl() " + "from ops.enqueue()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&enq_select_cpu_fails) diff --git a/tools/testing/selftests/scx/init_enable_count.bpf.c b/tools/testing/selftests/scx/init_enable_count.bpf.c new file mode 100644 index 000000000..a5053aeb0 --- /dev/null +++ b/tools/testing/selftests/scx/init_enable_count.bpf.c @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that verifies that we do proper counting of init, enable, etc + * callbacks. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +u64 init_task_cnt, exit_task_cnt, enable_cnt, disable_cnt; +u64 init_fork_cnt, init_transition_cnt; +volatile const bool switch_all; + +s32 BPF_STRUCT_OPS_SLEEPABLE(cnt_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + __sync_fetch_and_add(&init_task_cnt, 1); + + if (args->fork) + __sync_fetch_and_add(&init_fork_cnt, 1); + else + __sync_fetch_and_add(&init_transition_cnt, 1); + + return 0; +} + +void BPF_STRUCT_OPS(cnt_exit_task, struct task_struct *p) +{ + __sync_fetch_and_add(&exit_task_cnt, 1); +} + +void BPF_STRUCT_OPS(cnt_enable, struct task_struct *p) +{ + __sync_fetch_and_add(&enable_cnt, 1); +} + +void BPF_STRUCT_OPS(cnt_disable, struct task_struct *p) +{ + __sync_fetch_and_add(&disable_cnt, 1); +} + +s32 BPF_STRUCT_OPS(cnt_init) +{ + if (switch_all) + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops init_enable_count_ops = { + .init_task = cnt_init_task, + .exit_task = cnt_exit_task, + .enable = cnt_enable, + .disable = cnt_disable, + .init = cnt_init, + .name = "init_enable_count", +}; diff --git a/tools/testing/selftests/scx/init_enable_count.c b/tools/testing/selftests/scx/init_enable_count.c new file mode 100644 index 000000000..1d89f9f6f --- /dev/null +++ b/tools/testing/selftests/scx/init_enable_count.c @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include +#include +#include "scx_test.h" +#include "init_enable_count.bpf.skel.h" + +#define SCHED_EXT 7 + +static struct init_enable_count * +open_load_prog(bool global) +{ + struct init_enable_count *skel; + + skel = init_enable_count__open(); + SCX_BUG_ON(!skel, "Failed to open skel"); + + if (global) + skel->rodata->switch_all = global; + + SCX_BUG_ON(init_enable_count__load(skel), "Failed to load skel"); + + return skel; +} + +static enum scx_test_status run_test(bool global) +{ + struct init_enable_count *skel; + struct bpf_link *link; + const u32 num_children = 5, num_pre_forks = 1024; + int ret, i, status; + struct sched_param param = {}; + pid_t pids[num_pre_forks]; + + skel = open_load_prog(global); + + /* + * Fork a bunch of children before we attach the scheduler so that we + * ensure (at least in practical terms) that there are more tasks that + * transition from SCHED_OTHER -> SCHED_EXT than there are tasks that + * take the fork() path either below or in other processes. + */ + for (i = 0; i < num_pre_forks; i++) { + pids[i] = fork(); + SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + for (i = 0; i < num_pre_forks; i++) { + SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], + "Failed to wait for pre-forked child\n"); + + SCX_FAIL_IF(status != 0, "Pre-forked child %d exited with status %d\n", i, + status); + } + + /* SCHED_EXT children */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); + + if (pids[i] == 0) { + ret = sched_setscheduler(0, SCHED_EXT, ¶m); + SCX_BUG_ON(ret, "Failed to set sched to sched_ext"); + + /* + * Reset to SCHED_OTHER for half of them. Counts for + * everything should still be the same regardless, as + * ops.disable() is invoked even if a task is still on + * SCHED_EXT before it exits. + */ + if (i % 2 == 0) { + ret = sched_setscheduler(0, SCHED_OTHER, ¶m); + SCX_BUG_ON(ret, "Failed to reset sched to normal"); + } + exit(0); + } + } + for (i = 0; i < num_children; i++) { + SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], + "Failed to wait for SCX child\n"); + + SCX_FAIL_IF(status != 0, "SCX child %d exited with status %d\n", i, + status); + } + + /* SCHED_OTHER children */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + if (pids[i] == 0) + exit(0); + } + + for (i = 0; i < num_children; i++) { + SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], + "Failed to wait for normal child\n"); + + SCX_FAIL_IF(status != 0, "Normal child %d exited with status %d\n", i, + status); + } + + sleep(1); + + SCX_GE(skel->bss->init_task_cnt, 2 * num_children); + SCX_GE(skel->bss->exit_task_cnt, 2 * num_children); + + if (global) { + SCX_GE(skel->bss->enable_cnt, 2 * num_children); + SCX_GE(skel->bss->disable_cnt, 2 * num_children); + } else { + SCX_EQ(skel->bss->enable_cnt, num_children); + SCX_EQ(skel->bss->disable_cnt, num_children); + } + /* + * We forked a ton of tasks before we attached the scheduler above, so + * this should be fine. Technically it could be flaky if a ton of forks + * are happening at the same time in other processes, but that should + * be exceedingly unlikely. 
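+ *
+ * The pre-forked children are still alive (sleeping) when the scheduler
+ * is attached, so they are counted as transitions, while only tasks
+ * forked after the attach contribute to init_fork_cnt.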
+ */ + SCX_GT(skel->bss->init_transition_cnt, skel->bss->init_fork_cnt); + SCX_GE(skel->bss->init_fork_cnt, 2 * num_children); + + bpf_link__destroy(link); + init_enable_count__destroy(skel); + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + enum scx_test_status status; + + status = run_test(true); + if (status != SCX_TEST_PASS) + return status; + + return run_test(false); +} + +struct scx_test init_enable_count = { + .name = "init_enable_count", + .description = "Verify we do the correct amount of counting of init, " + "enable, etc callbacks.", + .run = run, +}; +REGISTER_SCX_TEST(&init_enable_count) diff --git a/tools/testing/selftests/scx/maximal.bpf.c b/tools/testing/selftests/scx/maximal.bpf.c new file mode 100644 index 000000000..00bfa9cb9 --- /dev/null +++ b/tools/testing/selftests/scx/maximal.bpf.c @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler with every callback defined. + * + * This scheduler defines every callback. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ + +#include + +char _license[] SEC("license") = "GPL"; + +s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, + u64 wake_flags) +{ + return prev_cpu; +} + +void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) +{ + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) +{} + +void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) +{ + scx_bpf_consume(SCX_DSQ_GLOBAL); +} + +void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) +{} + +void BPF_STRUCT_OPS(maximal_running, struct task_struct *p) +{} + +void BPF_STRUCT_OPS(maximal_stopping, struct task_struct *p, bool runnable) +{} + +void BPF_STRUCT_OPS(maximal_quiescent, struct task_struct *p, u64 deq_flags) +{} + +bool BPF_STRUCT_OPS(maximal_yield, struct task_struct *from, + struct task_struct *to) +{ + return false; +} + +bool BPF_STRUCT_OPS(maximal_core_sched_before, struct task_struct *a, + struct task_struct *b) +{ + return false; +} + +void BPF_STRUCT_OPS(maximal_set_weight, struct task_struct *p, u32 weight) +{} + +void BPF_STRUCT_OPS(maximal_set_cpumask, struct task_struct *p, + const struct cpumask *cpumask) +{} + +void BPF_STRUCT_OPS(maximal_update_idle, s32 cpu, bool idle) +{} + +void BPF_STRUCT_OPS(maximal_cpu_acquire, s32 cpu, + struct scx_cpu_acquire_args *args) +{} + +void BPF_STRUCT_OPS(maximal_cpu_release, s32 cpu, + struct scx_cpu_release_args *args) +{} + +void BPF_STRUCT_OPS(maximal_cpu_online, s32 cpu) +{} + +void BPF_STRUCT_OPS(maximal_cpu_offline, s32 cpu) +{} + +s32 BPF_STRUCT_OPS(maximal_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + return 0; +} + +void BPF_STRUCT_OPS(maximal_enable, struct task_struct *p) +{} + +void BPF_STRUCT_OPS(maximal_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) +{} + +void BPF_STRUCT_OPS(maximal_disable, struct task_struct *p) +{} + +s32 BPF_STRUCT_OPS(maximal_cgroup_init, struct cgroup *cgrp, + struct scx_cgroup_init_args *args) +{ + return 0; +} + +void BPF_STRUCT_OPS(maximal_cgroup_exit, struct cgroup *cgrp) +{} + +s32 BPF_STRUCT_OPS(maximal_cgroup_prep_move, struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{ + return 0; +} + +void BPF_STRUCT_OPS(maximal_cgroup_move, struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{} + +void 
BPF_STRUCT_OPS(maximal_cgroup_cancel_move, struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{} + +void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight) +{} + +s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) +{ + return 0; +} + +void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info) +{} + +SEC(".struct_ops.link") +struct sched_ext_ops maximal_ops = { + .select_cpu = maximal_select_cpu, + .enqueue = maximal_enqueue, + .dequeue = maximal_dequeue, + .dispatch = maximal_dispatch, + .runnable = maximal_runnable, + .running = maximal_running, + .stopping = maximal_stopping, + .quiescent = maximal_quiescent, + .yield = maximal_yield, + .core_sched_before = maximal_core_sched_before, + .set_weight = maximal_set_weight, + .set_cpumask = maximal_set_cpumask, + .update_idle = maximal_update_idle, + .cpu_acquire = maximal_cpu_acquire, + .cpu_release = maximal_cpu_release, + .cpu_online = maximal_cpu_online, + .cpu_offline = maximal_cpu_offline, + .init_task = maximal_init_task, + .enable = maximal_enable, + .exit_task = maximal_exit_task, + .disable = maximal_disable, + .cgroup_init = maximal_cgroup_init, + .cgroup_exit = maximal_cgroup_exit, + .cgroup_prep_move = maximal_cgroup_prep_move, + .cgroup_move = maximal_cgroup_move, + .cgroup_cancel_move = maximal_cgroup_cancel_move, + .cgroup_set_weight = maximal_cgroup_set_weight, + .init = maximal_init, + .exit = maximal_exit, + .name = "maximal", +}; diff --git a/tools/testing/selftests/scx/maximal.c b/tools/testing/selftests/scx/maximal.c new file mode 100644 index 000000000..f38fc973c --- /dev/null +++ b/tools/testing/selftests/scx/maximal.c @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ +#include +#include +#include +#include +#include "maximal.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct maximal *skel; + + skel = maximal__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct maximal *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.maximal_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct maximal *skel = ctx; + + maximal__destroy(skel); +} + +struct scx_test maximal = { + .name = "maximal", + .description = "Verify we can load a scheduler with every callback defined", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&maximal) diff --git a/tools/testing/selftests/scx/maybe_null.bpf.c b/tools/testing/selftests/scx/maybe_null.bpf.c new file mode 100644 index 000000000..ad5e69422 --- /dev/null +++ b/tools/testing/selftests/scx/maybe_null.bpf.c @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ */ + +#include + +char _license[] SEC("license") = "GPL"; + +u64 vtime_test; + +void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) +{} + +void BPF_STRUCT_OPS(maybe_null_success_dispatch, s32 cpu, struct task_struct *p) +{ + if (p != NULL) + vtime_test = p->scx.dsq_vtime; +} + +SEC(".struct_ops.link") +struct sched_ext_ops maybe_null_success = { + .dispatch = maybe_null_success_dispatch, + .enable = maybe_null_running, + .name = "minimal", +}; diff --git a/tools/testing/selftests/scx/maybe_null.c b/tools/testing/selftests/scx/maybe_null.c new file mode 100644 index 000000000..3f26b784f --- /dev/null +++ b/tools/testing/selftests/scx/maybe_null.c @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + */ +#include +#include +#include +#include +#include "maybe_null.bpf.skel.h" +#include "maybe_null_fail.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status run(void *ctx) +{ + struct maybe_null *skel; + struct maybe_null_fail *fail_skel; + + skel = maybe_null__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load maybe_null skel"); + return SCX_TEST_FAIL; + } + maybe_null__destroy(skel); + + fail_skel = maybe_null_fail__open_and_load(); + if (fail_skel) { + maybe_null_fail__destroy(fail_skel); + SCX_ERR("Should have failed to open and load maybe_null_fail skel"); + return SCX_TEST_FAIL; + } + + return SCX_TEST_PASS; +} + +struct scx_test maybe_null = { + .name = "maybe_null", + .description = "Verify that PTR_MAYBE_NULL works for .dispatch", + .run = run, +}; +REGISTER_SCX_TEST(&maybe_null) diff --git a/tools/testing/selftests/scx/maybe_null_fail.bpf.c b/tools/testing/selftests/scx/maybe_null_fail.bpf.c new file mode 100644 index 000000000..1607fe07b --- /dev/null +++ b/tools/testing/selftests/scx/maybe_null_fail.bpf.c @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + */ + +#include + +char _license[] SEC("license") = "GPL"; + +u64 vtime_test; + +void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) +{} + +void BPF_STRUCT_OPS(maybe_null_fail_dispatch, s32 cpu, struct task_struct *p) +{ + vtime_test = p->scx.dsq_vtime; +} + +SEC(".struct_ops.link") +struct sched_ext_ops maybe_null_fail = { + .dispatch = maybe_null_fail_dispatch, + .enable = maybe_null_running, + .name = "minimal", +}; diff --git a/tools/testing/selftests/scx/minimal.bpf.c b/tools/testing/selftests/scx/minimal.bpf.c new file mode 100644 index 000000000..14b3d44d9 --- /dev/null +++ b/tools/testing/selftests/scx/minimal.bpf.c @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A completely minimal scheduler. + * + * This scheduler defines the absolute minimal set of struct sched_ext_ops + * fields: its name (and until a bug is fixed in libbpf, also an ops.running() + * callback). It should _not_ fail to be loaded, and can be used to exercise + * the default scheduling paths in ext.c. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +void BPF_STRUCT_OPS(minimal_running, struct task_struct *p) +{} + +SEC(".struct_ops.link") +struct sched_ext_ops minimal_ops = { + /* + * It shouldn't be necessary to define this minimal_running op, but + * libbpf currently expects that a struct_ops map will always have at + * least one struct_ops prog when loading.
Until that issue is fixed, + * let's also define a minimal prog so that we can load and test. + */ + .enable = minimal_running, + .name = "minimal", +}; diff --git a/tools/testing/selftests/scx/minimal.c b/tools/testing/selftests/scx/minimal.c new file mode 100644 index 000000000..6c5db8ebb --- /dev/null +++ b/tools/testing/selftests/scx/minimal.c @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "minimal.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct minimal *skel; + + skel = minimal__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct minimal *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.minimal_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct minimal *skel = ctx; + + minimal__destroy(skel); +} + +struct scx_test minimal = { + .name = "minimal", + .description = "Verify we can load a fully minimal scheduler", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&minimal) diff --git a/tools/testing/selftests/scx/reload_loop.c b/tools/testing/selftests/scx/reload_loop.c new file mode 100644 index 000000000..5cfba2d6e --- /dev/null +++ b/tools/testing/selftests/scx/reload_loop.c @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + */ +#include +#include +#include +#include +#include +#include "maximal.bpf.skel.h" +#include "scx_test.h" + +static struct maximal *skel; +static pthread_t threads[2]; + +bool force_exit = false; + +static enum scx_test_status setup(void **ctx) +{ + skel = maximal__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + + return SCX_TEST_PASS; +} + +static void *do_reload_loop(void *arg) +{ + u32 i; + + for (i = 0; i < 1024 && !force_exit; i++) { + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.maximal_ops); + if (link) + bpf_link__destroy(link); + } + + return NULL; +} + +static enum scx_test_status run(void *ctx) +{ + int err; + void *ret; + + err = pthread_create(&threads[0], NULL, do_reload_loop, NULL); + SCX_FAIL_IF(err, "Failed to create thread 0"); + + err = pthread_create(&threads[1], NULL, do_reload_loop, NULL); + SCX_FAIL_IF(err, "Failed to create thread 1"); + + SCX_FAIL_IF(pthread_join(threads[0], &ret), "thread 0 failed"); + SCX_FAIL_IF(pthread_join(threads[1], &ret), "thread 1 failed"); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + force_exit = true; + maximal__destroy(skel); +} + +struct scx_test reload_loop = { + .name = "reload_loop", + .description = "Stress test loading and unloading schedulers repeatedly in a tight loop", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&reload_loop) diff --git a/tools/testing/selftests/scx/runner.c b/tools/testing/selftests/scx/runner.c new file mode 100644 index 000000000..37b25b283 --- /dev/null +++ b/tools/testing/selftests/scx/runner.c @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include +#include +#include "scx_test.h" + +const char help_fmt[] = +"The runner for sched_ext tests.\n" +"\n" +"The runner is statically linked against all testcases, and runs them all serially.\n" +"It's required for the testcases to be serial, as only a single host-wide sched_ext\n" +"scheduler may be loaded at any given time." +"\n" +"Usage: %s [-t TEST] [-h]\n" +"\n" +" -t TEST Only run tests whose name includes this string\n" +" -s Include print output for skipped tests\n" +" -q Don't print the test descriptions during run\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; +static bool quiet, print_skipped; + +#define MAX_SCX_TESTS 2048 + +static struct scx_test __scx_tests[MAX_SCX_TESTS]; +static unsigned __scx_num_tests = 0; + +static void sigint_handler(int simple) +{ + exit_req = 1; +} + +static void print_test_preamble(const struct scx_test *test, bool quiet) +{ + printf("===== START =====\n"); + printf("TEST: %s\n", test->name); + if (!quiet) + printf("DESCRIPTION: %s\n", test->description); + printf("OUTPUT:\n"); +} + +static const char *status_to_result(enum scx_test_status status) +{ + switch (status) { + case SCX_TEST_PASS: + case SCX_TEST_SKIP: + return "ok"; + case SCX_TEST_FAIL: + return "not ok"; + } + + return NULL; +} + +static void print_test_result(const struct scx_test *test, + enum scx_test_status status, + unsigned int testnum) +{ + const char *result = status_to_result(status); + const char *directive = status == SCX_TEST_SKIP ? 
"SKIP " : ""; + + printf("%s %u %s # %s\n", result, testnum, test->name, directive); + printf("===== END =====\n"); +} + +static bool should_skip_test(const struct scx_test *test, const char * filter) +{ + return !strstr(test->name, filter); +} + +static enum scx_test_status run_test(const struct scx_test *test) +{ + enum scx_test_status status; + void *context = NULL; + + if (test->setup) { + status = test->setup(&context); + if (status != SCX_TEST_PASS) + return status; + } + + status = test->run(context); + + if (test->cleanup) + test->cleanup(context); + + return status; +} + +static bool test_valid(const struct scx_test *test) +{ + if (!test) { + fprintf(stderr, "NULL test detected\n"); + return false; + } + + if (!test->name) { + fprintf(stderr, + "Test with no name found. Must specify test name.\n"); + return false; + } + + if (!test->description) { + fprintf(stderr, "Test %s requires description.\n", test->name); + return false; + } + + if (!test->run) { + fprintf(stderr, "Test %s has no run() callback\n", test->name); + return false; + } + + return true; +} + +int main(int argc, char **argv) +{ + const char *filter = NULL; + unsigned testnum = 0, i; + unsigned passed = 0, skipped = 0, failed = 0; + int opt; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + while ((opt = getopt(argc, argv, "qst:h")) != -1) { + switch (opt) { + case 'q': + quiet = true; + break; + case 's': + print_skipped = true; + break; + case 't': + filter = optarg; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + for (i = 0; i < __scx_num_tests; i++) { + enum scx_test_status status; + struct scx_test *test = &__scx_tests[i]; + + if (filter && should_skip_test(test, filter)) { + /* + * Printing the skipped tests and their preambles can + * add a lot of noise to the runner output. Printing + * this is only really useful for CI, so let's skip it + * by default. + */ + if (print_skipped) { + print_test_preamble(test, quiet); + print_test_result(test, SCX_TEST_SKIP, ++testnum); + } + continue; + } + + print_test_preamble(test, quiet); + status = run_test(test); + print_test_result(test, status, ++testnum); + switch (status) { + case SCX_TEST_PASS: + passed++; + break; + case SCX_TEST_SKIP: + skipped++; + break; + case SCX_TEST_FAIL: + failed++; + break; + } + } + printf("\n\n=============================\n\n"); + printf("RESULTS:\n\n"); + printf("PASSED: %u\n", passed); + printf("SKIPPED: %u\n", skipped); + printf("FAILED: %u\n", failed); + + return 0; +} + +void scx_test_register(struct scx_test *test) +{ + SCX_BUG_ON(!test_valid(test), "Invalid test found"); + SCX_BUG_ON(__scx_num_tests >= MAX_SCX_TESTS, "Maximum tests exceeded"); + + __scx_tests[__scx_num_tests++] = *test; +} diff --git a/tools/testing/selftests/scx/scx_test.h b/tools/testing/selftests/scx/scx_test.h new file mode 100644 index 000000000..4b70bf75f --- /dev/null +++ b/tools/testing/selftests/scx/scx_test.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 Tejun Heo + * Copyright (c) 2023 David Vernet + */ + +#ifndef __SCX_TEST_H__ +#define __SCX_TEST_H__ + +#include +#include + +enum scx_test_status { + SCX_TEST_PASS = 0, + SCX_TEST_SKIP, + SCX_TEST_FAIL, +}; + +/* Copied from include/linux/sched/ext.h */ +enum scx_test_exit_kind { + SCX_EXIT_NONE, + SCX_EXIT_DONE, + + SCX_EXIT_UNREG = 64, /* BPF unregistration */ + SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ + + SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ + SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ + SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ +}; + +struct scx_test { + /** + * name - The name of the testcase. + */ + const char *name; + + /** + * description - A description of your testcase: what it tests and is + * meant to validate. + */ + const char *description; + + /* + * setup - Setup the test. + * @ctx: A pointer to a context object that will be passed to run and + * cleanup. + * + * An optional callback that allows a testcase to perform setup for its + * run. A test may return SCX_TEST_SKIP to skip the run. + */ + enum scx_test_status (*setup)(void **ctx); + + /* + * run - Run the test. + * @ctx: Context set in the setup() callback. If @ctx was not set in + * setup(), it is NULL. + * + * The main test. Callers should return one of: + * + * - SCX_TEST_PASS: Test passed + * - SCX_TEST_SKIP: Test should be skipped + * - SCX_TEST_FAIL: Test failed + * + * This callback must be defined. + */ + enum scx_test_status (*run)(void *ctx); + + /* + * cleanup - Perform cleanup following the test + * @ctx: Context set in the setup() callback. If @ctx was not set in + * setup(), it is NULL. + * + * An optional callback that allows a test to perform cleanup after + * being run. This callback is run even if the run() callback returns + * SCX_TEST_SKIP or SCX_TEST_FAIL. It is not run if setup() returns + * SCX_TEST_SKIP or SCX_TEST_FAIL. + */ + void (*cleanup)(void *ctx); +}; + +void scx_test_register(struct scx_test *test); + +#define REGISTER_SCX_TEST(__test) \ + __attribute__((constructor)) \ + static void ___scxregister##__LINE__(void) \ + { \ + scx_test_register(__test); \ + } + +#define SCX_ERR(__fmt, ...) \ + do { \ + fprintf(stderr, "ERR: %s:%d\n", __FILE__, __LINE__); \ + fprintf(stderr, __fmt, ##__VA_ARGS__); \ + } while (0) + +#define SCX_FAIL(__fmt, ...) \ + do { \ + SCX_ERR(__fmt, ##__VA_ARGS__); \ + return SCX_TEST_FAIL; \ + } while (0) + +#define SCX_FAIL_IF(__cond, __fmt, ...) 
\ + do { \ + if (__cond) \ + SCX_FAIL(__fmt, ##__VA_ARGS__); \ + } while (0) + +#define SCX_GT(_x, _y) SCX_FAIL_IF((_x) <= (_y), "Expected %s > %s (%lu > %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_GE(_x, _y) SCX_FAIL_IF((_x) < (_y), "Expected %s >= %s (%lu >= %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_LT(_x, _y) SCX_FAIL_IF((_x) >= (_y), "Expected %s < %s (%lu < %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_LE(_x, _y) SCX_FAIL_IF((_x) > (_y), "Expected %s <= %s (%lu <= %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_EQ(_x, _y) SCX_FAIL_IF((_x) != (_y), "Expected %s == %s (%lu == %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_ASSERT(_x) SCX_FAIL_IF(!(_x), "Expected %s to be true (%lu)", \ + #_x, (u64)(_x)) + +#endif // # __SCX_TEST_H__ diff --git a/tools/testing/selftests/scx/select_cpu_dfl.bpf.c b/tools/testing/selftests/scx/select_cpu_dfl.bpf.c new file mode 100644 index 000000000..f2fa80628 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dfl.bpf.c @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +bool saw_local = false; + +static bool task_is_test(const struct task_struct *p) +{ + return !bpf_strncmp(p->comm, 9, "select_cpu"); +} + +void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p, + u64 enq_flags) +{ + const struct cpumask *idle_mask = scx_bpf_get_idle_cpumask(); + + if (task_is_test(p) && + bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), idle_mask)) { + saw_local = true; + } + scx_bpf_put_idle_cpumask(idle_mask); + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +s32 BPF_STRUCT_OPS(select_cpu_dfl_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dfl_ops = { + .enqueue = select_cpu_dfl_enqueue, + .init = select_cpu_dfl_init, + .name = "select_cpu_dfl", +}; diff --git a/tools/testing/selftests/scx/select_cpu_dfl.c b/tools/testing/selftests/scx/select_cpu_dfl.c new file mode 100644 index 000000000..a53a40c2d --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dfl.c @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dfl.bpf.skel.h" +#include "scx_test.h" + +#define NUM_CHILDREN 1028 + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dfl *skel; + + skel = select_cpu_dfl__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dfl *skel = ctx; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + SCX_ASSERT(!skel->bss->saw_local); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dfl *skel = ctx; + + select_cpu_dfl__destroy(skel); +} + +struct scx_test select_cpu_dfl = { + .name = "select_cpu_dfl", + .description = "Verify the default ops.select_cpu() dispatches tasks " + "when idle cores are found, and skips ops.enqueue()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dfl) diff --git a/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.bpf.c b/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.bpf.c new file mode 100644 index 000000000..636ea1de1 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.bpf.c @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation, and with the SCX_OPS_ENQ_DFL_NO_DISPATCH ops flag + * specified. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +bool saw_local = false; + +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* CPU changed by ops.select_cpu() */ +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); + } task_ctx_stor SEC(".maps"); + +/* Manually specify the signature until the kfunc is added to the scx repo.
*/ +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + bool *found) __ksym; + +s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + struct task_ctx *tctx; + s32 cpu; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return -ESRCH; + } + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, + &tctx->force_local); + + return cpu; +} + +void BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_enqueue, struct task_struct *p, + u64 enq_flags) +{ + u64 dsq_id = SCX_DSQ_GLOBAL; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + if (tctx->force_local) { + dsq_id = SCX_DSQ_LOCAL; + tctx->force_local = false; + saw_local = true; + } + + scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); +} + +s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task, + struct task_struct *p, struct scx_init_task_args *args) +{ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dfl_nodispatch_ops = { + .select_cpu = select_cpu_dfl_nodispatch_select_cpu, + .enqueue = select_cpu_dfl_nodispatch_enqueue, + .init_task = select_cpu_dfl_nodispatch_init_task, + .init = select_cpu_dfl_nodispatch_init, + .name = "select_cpu_dfl_nodispatch", +}; diff --git a/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.c b/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.c new file mode 100644 index 000000000..1d85bf4bf --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.c @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dfl_nodispatch.bpf.skel.h" +#include "scx_test.h" + +#define NUM_CHILDREN 1028 + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dfl_nodispatch *skel; + + skel = select_cpu_dfl_nodispatch__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dfl_nodispatch *skel = ctx; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_nodispatch_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + SCX_ASSERT(skel->bss->saw_local); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dfl_nodispatch *skel = ctx; + + select_cpu_dfl_nodispatch__destroy(skel); +} + +struct scx_test select_cpu_dfl_nodispatch = { + .name = "select_cpu_dfl_nodispatch", + .description = "Verify behavior of scx_bpf_select_cpu_dfl() in " + "ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dfl_nodispatch) diff --git a/tools/testing/selftests/scx/select_cpu_dispatch.bpf.c b/tools/testing/selftests/scx/select_cpu_dispatch.bpf.c new file mode 100644 index 000000000..0fda97769 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dispatch.bpf.c @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + u64 dsq_id = SCX_DSQ_LOCAL; + s32 cpu = prev_cpu; + + if (scx_bpf_test_and_clear_cpu_idle(cpu)) + goto dispatch; + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + goto dispatch; + + dsq_id = SCX_DSQ_GLOBAL; + cpu = prev_cpu; + +dispatch: + scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, 0); + return cpu; +} + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_ops = { + .select_cpu = select_cpu_dispatch_select_cpu, + .init = select_cpu_dispatch_init, + .name = "select_cpu_dispatch", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/select_cpu_dispatch.c b/tools/testing/selftests/scx/select_cpu_dispatch.c new file mode 100644 index 000000000..0309ca878 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dispatch.c @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dispatch.bpf.skel.h" +#include "scx_test.h" + +#define NUM_CHILDREN 1028 + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dispatch *skel; + + skel = select_cpu_dispatch__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch *skel = ctx; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch *skel = ctx; + + select_cpu_dispatch__destroy(skel); +} + +struct scx_test select_cpu_dispatch = { + .name = "select_cpu_dispatch", + .description = "Test direct dispatching to built-in DSQs from " + "ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch) diff --git a/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.bpf.c b/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.bpf.c new file mode 100644 index 000000000..c9105add9 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.bpf.c @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +struct user_exit_info uei; + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* Dispatching to a random DSQ should fail. */ + scx_bpf_dispatch(p, 0xcafef00d, SCX_SLICE_DFL, 0); + + return prev_cpu; +} + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +void BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_bad_dsq_ops = { + .select_cpu = select_cpu_dispatch_bad_dsq_select_cpu, + .init = select_cpu_dispatch_bad_dsq_init, + .exit = select_cpu_dispatch_bad_dsq_exit, + .name = "select_cpu_dispatch_bad_dsq", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.c b/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.c new file mode 100644 index 000000000..a7b91d58c --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dispatch_bad_dsq.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dispatch_bad_dsq *skel; + + skel = select_cpu_dispatch_bad_dsq__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch_bad_dsq *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_bad_dsq_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + sleep(1); + + SCX_EQ(skel->bss->uei.kind, SCX_EXIT_ERROR); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch_bad_dsq *skel = ctx; + + select_cpu_dispatch_bad_dsq__destroy(skel); +} + +struct scx_test select_cpu_dispatch_bad_dsq = { + .name = "select_cpu_dispatch_bad_dsq", + .description = "Verify graceful failure if we direct-dispatch to a " + "bogus DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch_bad_dsq) diff --git a/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.bpf.c b/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.bpf.c new file mode 100644 index 000000000..82d814839 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.bpf.c @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +struct user_exit_info uei; + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* Dispatching twice in a row is disallowed. */ + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + + return prev_cpu; +} + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +void BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_dbl_dsp_ops = { + .select_cpu = select_cpu_dispatch_dbl_dsp_select_cpu, + .init = select_cpu_dispatch_dbl_dsp_init, + .exit = select_cpu_dispatch_dbl_dsp_exit, + .name = "select_cpu_dispatch_dbl_dsp", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.c b/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.c new file mode 100644 index 000000000..e32b22963 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dispatch_dbl_dsp.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dispatch_dbl_dsp *skel; + + skel = select_cpu_dispatch_dbl_dsp__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch_dbl_dsp *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_dbl_dsp_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + sleep(1); + + SCX_EQ(skel->bss->uei.kind, SCX_EXIT_ERROR); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch_dbl_dsp *skel = ctx; + + select_cpu_dispatch_dbl_dsp__destroy(skel); +} + +struct scx_test select_cpu_dispatch_dbl_dsp = { + .name = "select_cpu_dispatch_dbl_dsp", + .description = "Verify graceful failure if we dispatch twice to a " + "DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch_dbl_dsp) diff --git a/tools/testing/selftests/scx/select_cpu_vtime.bpf.c b/tools/testing/selftests/scx/select_cpu_vtime.bpf.c new file mode 100644 index 000000000..b8bdadf3e --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_vtime.bpf.c @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates that enqueue flags are properly stored and + * applied at dispatch time when a task is directly dispatched from + * ops.select_cpu(). We validate this by using scx_bpf_dispatch_vtime(), and + * making the test a very basic vtime scheduler. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +volatile bool consumed; + +static u64 vtime_now; + +#define VTIME_DSQ 0 + +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +static inline u64 task_vtime(const struct task_struct *p) +{ + u64 vtime = p->scx.dsq_vtime; + + if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) + return vtime_now - SCX_SLICE_DFL; + else + return vtime; +} + +s32 BPF_STRUCT_OPS(select_cpu_vtime_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + s32 cpu; + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + goto ddsp; + + cpu = prev_cpu; + scx_bpf_test_and_clear_cpu_idle(cpu); +ddsp: + scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); + return cpu; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p) +{ + if (scx_bpf_consume(VTIME_DSQ)) + consumed = true; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_running, struct task_struct *p) +{ + if (vtime_before(vtime_now, p->scx.dsq_vtime)) + vtime_now = p->scx.dsq_vtime; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_stopping, struct task_struct *p, + bool runnable) +{ + p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_enable, struct task_struct *p) +{ + p->scx.dsq_vtime = vtime_now; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(select_cpu_vtime_init) +{ + scx_bpf_switch_all(); + + return scx_bpf_create_dsq(VTIME_DSQ, -1); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_vtime_ops = { + .select_cpu = select_cpu_vtime_select_cpu, + .dispatch = select_cpu_vtime_dispatch, + .running = select_cpu_vtime_running, + .stopping = select_cpu_vtime_stopping, + .enable = select_cpu_vtime_enable, + .init = select_cpu_vtime_init, + .name = "select_cpu_vtime", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/select_cpu_vtime.c b/tools/testing/selftests/scx/select_cpu_vtime.c new file mode 100644 index 000000000..b4629c236 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_vtime.c @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_vtime.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_vtime *skel; + + skel = select_cpu_vtime__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_vtime *skel = ctx; + struct bpf_link *link; + + SCX_ASSERT(!skel->bss->consumed); + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_vtime_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + sleep(1); + + SCX_ASSERT(skel->bss->consumed); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_vtime *skel = ctx; + + select_cpu_vtime__destroy(skel); +} + +struct scx_test select_cpu_vtime = { + .name = "select_cpu_vtime", + .description = "Test doing direct vtime-dispatching from " + "ops.select_cpu(), to a non-built-in DSQ", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_vtime) diff --git a/tools/testing/selftests/scx/test_example.c b/tools/testing/selftests/scx/test_example.c new file mode 100644 index 000000000..ce36cdf03 --- /dev/null +++ b/tools/testing/selftests/scx/test_example.c @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 Tejun Heo + * Copyright (c) 2024 David Vernet + */ +#include +#include +#include "scx_test.h" + +static bool setup_called = false; +static bool run_called = false; +static bool cleanup_called = false; + +static int context = 10; + +static enum scx_test_status setup(void **ctx) +{ + setup_called = true; + *ctx = &context; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + int *arg = ctx; + + SCX_ASSERT(setup_called); + SCX_ASSERT(!run_called && !cleanup_called); + SCX_EQ(*arg, context); + + run_called = true; + return SCX_TEST_PASS; +} + +static void cleanup (void *ctx) +{ + SCX_BUG_ON(!run_called || cleanup_called, "Wrong callbacks invoked"); +} + +struct scx_test example = { + .name = "example", + .description = "Validate the basic function of the test suite itself", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&example) -- 2.44.0.325.g11c821f2f2
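For reference, the init_enable_count test above switches its children into sched_ext by calling sched_setscheduler() with an empty struct sched_param and a policy value of 7. The following is a minimal standalone sketch (not part of the patch) of that userspace side; the SCHED_EXT value of 7 is an assumption copied from the selftest's own #define, and the call only takes effect while a sched_ext scheduler is attached.

#include <sched.h>
#include <stdio.h>

/* Assumption: mirrors the "#define SCHED_EXT 7" used by the selftest above. */
#define SCHED_EXT 7

int main(void)
{
	struct sched_param param = {};	/* sched_ext ignores the rt priority */

	/* Opt this task into the currently loaded sched_ext scheduler. */
	if (sched_setscheduler(0, SCHED_EXT, &param))
		perror("sched_setscheduler(SCHED_EXT)");

	/* ... run some work under the BPF scheduler ... */

	/* Return to the default policy, as half of the test children do. */
	if (sched_setscheduler(0, SCHED_OTHER, &param))
		perror("sched_setscheduler(SCHED_OTHER)");

	return 0;
}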
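Similarly, a small sketch (again not part of the patch) of why select_cpu_vtime.bpf.c compares vtimes through a signed difference, (s64)(a - b) < 0, instead of a plain '<': the signed form keeps the ordering correct across u64 wraparound as long as the two values stay within 2^63 of each other.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Same comparison as vtime_before() in select_cpu_vtime.bpf.c. */
static bool vtime_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t near_wrap = UINT64_MAX - 5;	/* vtime just before wraparound */
	uint64_t wrapped = 10;			/* vtime just after wraparound */

	/* Plain '<' misorders these; the signed-difference form does not. */
	printf("plain <: %d, vtime_before: %d\n",
	       near_wrap < wrapped, vtime_before(near_wrap, wrapped));
	return 0;
}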