From c4464cd35ae633591bebd716038401881ec49e46 Mon Sep 17 00:00:00 2001 From: Piotr Gorski Date: Wed, 1 May 2024 00:23:28 +0200 Subject: [PATCH] sched-ext Signed-off-by: Piotr Gorski --- .../bindings/net/renesas,etheravb.yaml | 12 +- Documentation/netlink/specs/rt_link.yaml | 80 +- Documentation/networking/dns_resolver.rst | 4 +- Documentation/scheduler/index.rst | 1 + Documentation/scheduler/sched-ext.rst | 307 + MAINTAINERS | 13 + Makefile | 8 +- arch/arm/net/bpf_jit_32.c | 25 +- arch/arm64/net/bpf_jit_comp.c | 7 +- arch/loongarch/net/bpf_jit.c | 22 +- arch/mips/net/bpf_jit_comp.c | 3 +- arch/parisc/net/bpf_jit_core.c | 8 +- arch/s390/net/bpf_jit_comp.c | 6 +- arch/sparc/net/bpf_jit_comp_64.c | 6 +- arch/x86/net/bpf_jit_comp.c | 10 +- arch/x86/net/bpf_jit_comp32.c | 3 +- drivers/net/ethernet/amd/amd8111e.c | 1 - drivers/net/ethernet/amd/amd8111e.h | 1 - .../net/ethernet/chelsio/cxgb4/cxgb4_uld.c | 2 +- drivers/net/ethernet/renesas/ravb_main.c | 9 +- .../net/ethernet/stmicro/stmmac/dwmac-rk.c | 2 - drivers/tty/sysrq.c | 1 + include/asm-generic/vmlinux.lds.h | 1 + include/linux/bpf.h | 9 +- include/linux/cgroup-defs.h | 8 + include/linux/cgroup.h | 5 +- include/linux/filter.h | 20 +- include/linux/netdevice.h | 1 + include/linux/sched.h | 5 + include/linux/sched/ext.h | 201 + include/linux/sched/task.h | 3 +- include/linux/skbuff.h | 6 - include/linux/trace_events.h | 36 +- include/trace/bpf_probe.h | 3 +- include/trace/events/bpf_test_run.h | 17 + include/trace/events/net_probe_common.h | 29 + include/trace/events/sock.h | 35 +- include/trace/events/tcp.h | 29 - include/uapi/linux/bpf.h | 26 +- include/uapi/linux/sched.h | 1 + init/Kconfig | 5 + init/init_task.c | 15 + kernel/Kconfig.preempt | 24 +- kernel/bpf/arena.c | 2 +- kernel/bpf/bpf_struct_ops.c | 10 +- kernel/bpf/cgroup.c | 2 - kernel/bpf/core.c | 32 +- kernel/bpf/hashtab.c | 1 + kernel/bpf/helpers.c | 5 + kernel/bpf/lpm_trie.c | 18 +- kernel/bpf/syscall.c | 22 +- kernel/bpf/trampoline.c | 15 +- kernel/bpf/verifier.c | 35 +- kernel/cgroup/cgroup.c | 97 +- kernel/fork.c | 17 +- kernel/sched/build_policy.c | 8 + kernel/sched/core.c | 337 +- kernel/sched/cpufreq_schedutil.c | 50 +- kernel/sched/deadline.c | 4 +- kernel/sched/debug.c | 3 + kernel/sched/ext.c | 6284 +++++++++++++++++ kernel/sched/ext.h | 139 + kernel/sched/fair.c | 25 +- kernel/sched/idle.c | 2 + kernel/sched/rt.c | 4 +- kernel/sched/sched.h | 123 +- kernel/sched/topology.c | 4 +- kernel/trace/bpf_trace.c | 52 +- lib/dump_stack.c | 1 + net/bpf/bpf_dummy_struct_ops.c | 4 +- net/bpf/test_run.c | 8 + net/core/datagram.c | 19 - net/core/dev.c | 219 +- net/core/filter.c | 17 +- net/core/skbuff.c | 4 +- net/ipv4/tcp_bbr.c | 4 - net/ipv4/tcp_cubic.c | 4 - net/ipv4/tcp_dctcp.c | 4 - tools/Makefile | 10 +- tools/bpf/bpftool/Makefile | 14 +- tools/bpf/bpftool/feature.c | 3 +- tools/bpf/bpftool/gen.c | 3 +- tools/bpf/bpftool/pids.c | 19 +- tools/bpf/bpftool/skeleton/pid_iter.bpf.c | 4 +- tools/include/linux/compiler.h | 4 + tools/include/uapi/linux/bpf.h | 26 +- tools/lib/bpf/bpf.c | 16 +- tools/lib/bpf/bpf.h | 9 + tools/lib/bpf/libbpf.c | 137 +- tools/lib/bpf/libbpf.h | 11 + tools/lib/bpf/libbpf.map | 2 + tools/lib/bpf/libbpf_internal.h | 5 - tools/lib/bpf/libbpf_probes.c | 6 +- tools/net/ynl/lib/ynl.py | 5 +- tools/sched_ext/.gitignore | 2 + tools/sched_ext/Makefile | 246 + tools/sched_ext/README.md | 270 + .../sched_ext/include/bpf-compat/gnu/stubs.h | 11 + tools/sched_ext/include/scx/common.bpf.h | 281 + tools/sched_ext/include/scx/common.h | 71 + 
tools/sched_ext/include/scx/compat.bpf.h | 47 + tools/sched_ext/include/scx/compat.h | 178 + tools/sched_ext/include/scx/user_exit_info.h | 103 + tools/sched_ext/scx_central.bpf.c | 361 + tools/sched_ext/scx_central.c | 122 + tools/sched_ext/scx_flatcg.bpf.c | 939 +++ tools/sched_ext/scx_flatcg.c | 219 + tools/sched_ext/scx_flatcg.h | 51 + tools/sched_ext/scx_qmap.bpf.c | 587 ++ tools/sched_ext/scx_qmap.c | 117 + tools/sched_ext/scx_show_state.py | 39 + tools/sched_ext/scx_simple.bpf.c | 149 + tools/sched_ext/scx_simple.c | 93 + tools/testing/selftests/bpf/bench.c | 33 +- .../bpf/benchs/bench_local_storage_create.c | 2 +- .../selftests/bpf/benchs/bench_trigger.c | 433 +- .../selftests/bpf/benchs/run_bench_trigger.sh | 22 +- .../selftests/bpf/benchs/run_bench_uprobes.sh | 2 +- tools/testing/selftests/bpf/config | 2 + .../selftests/bpf/prog_tests/bpf_cookie.c | 114 +- .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 27 +- .../selftests/bpf/prog_tests/btf_map_in_map.c | 26 +- .../selftests/bpf/prog_tests/fib_lookup.c | 132 +- .../bpf/prog_tests/kprobe_multi_test.c | 248 +- .../testing/selftests/bpf/prog_tests/ksyms.c | 30 +- .../bpf/prog_tests/ns_current_pid_tgid.c | 214 +- .../bpf/prog_tests/test_struct_ops_module.c | 47 + tools/testing/selftests/bpf/progs/iters.c | 2 +- .../selftests/bpf/progs/struct_ops_module.c | 16 +- .../selftests/bpf/progs/tcp_ca_kfunc.c | 121 + .../selftests/bpf/progs/test_bpf_cookie.c | 16 + .../bpf/progs/test_ns_current_pid_tgid.c | 31 +- .../selftests/bpf/progs/trigger_bench.c | 107 +- .../bpf/progs/verifier_helper_restricted.c | 8 +- tools/testing/selftests/bpf/test_cpp.cpp | 5 + tools/testing/selftests/bpf/test_tc_tunnel.sh | 13 +- tools/testing/selftests/bpf/trace_helpers.c | 46 +- tools/testing/selftests/bpf/trace_helpers.h | 7 + .../testing/selftests/drivers/net/hw/Makefile | 25 + .../drivers/net/hw/devlink_port_split.py | 309 + .../selftests/drivers/net/hw/ethtool.sh | 300 + .../drivers/net/hw/ethtool_extended_state.sh | 116 + .../selftests/drivers/net/hw/ethtool_lib.sh | 120 + .../selftests/drivers/net/hw/ethtool_mm.sh | 341 + .../selftests/drivers/net/hw/ethtool_rmon.sh | 144 + .../selftests/drivers/net/hw/hw_stats_l3.sh | 333 + .../drivers/net/hw/hw_stats_l3_gre.sh | 110 + .../selftests/drivers/net/hw/loopback.sh | 103 + .../testing/selftests/drivers/net/hw/settings | 1 + .../selftests/drivers/net/mlxsw/mlxsw_lib.sh | 2 +- .../net/mlxsw/spectrum-2/resource_scale.sh | 1 - .../net/mlxsw/spectrum/resource_scale.sh | 1 - tools/testing/selftests/net/Makefile | 1 - .../testing/selftests/net/forwarding/Makefile | 9 +- tools/testing/selftests/net/forwarding/README | 33 + .../net/forwarding/forwarding.config.sample | 53 +- .../selftests/net/forwarding/ipip_lib.sh | 1 - tools/testing/selftests/net/forwarding/lib.sh | 255 +- .../selftests/net/forwarding/lib_sh_test.sh | 208 + .../net/forwarding/router_mpath_nh_lib.sh | 12 +- .../selftests/net/forwarding/sch_ets_tests.sh | 19 +- .../selftests/net/forwarding/sch_red.sh | 10 +- .../selftests/net/forwarding/sch_tbf_core.sh | 2 +- .../selftests/net/forwarding/tc_common.sh | 2 +- .../selftests/net/forwarding/tc_tunnel_key.sh | 2 - tools/testing/selftests/net/lib.sh | 48 +- tools/testing/selftests/sched_ext/.gitignore | 6 + tools/testing/selftests/sched_ext/Makefile | 216 + tools/testing/selftests/sched_ext/config | 9 + .../sched_ext/ddsp_bogus_dsq_fail.bpf.c | 42 + .../selftests/sched_ext/ddsp_bogus_dsq_fail.c | 57 + .../sched_ext/ddsp_vtimelocal_fail.bpf.c | 39 + .../sched_ext/ddsp_vtimelocal_fail.c | 56 + 
.../sched_ext/enq_last_no_enq_fails.bpf.c | 21 + .../sched_ext/enq_last_no_enq_fails.c | 60 + .../sched_ext/enq_select_cpu_fails.bpf.c | 43 + .../sched_ext/enq_select_cpu_fails.c | 61 + tools/testing/selftests/sched_ext/exit.bpf.c | 84 + tools/testing/selftests/sched_ext/exit.c | 55 + tools/testing/selftests/sched_ext/exit_test.h | 20 + .../testing/selftests/sched_ext/hotplug.bpf.c | 55 + tools/testing/selftests/sched_ext/hotplug.c | 168 + .../selftests/sched_ext/hotplug_test.h | 15 + .../sched_ext/init_enable_count.bpf.c | 53 + .../selftests/sched_ext/init_enable_count.c | 166 + .../testing/selftests/sched_ext/maximal.bpf.c | 164 + tools/testing/selftests/sched_ext/maximal.c | 51 + .../selftests/sched_ext/maybe_null.bpf.c | 26 + .../testing/selftests/sched_ext/maybe_null.c | 40 + .../selftests/sched_ext/maybe_null_fail.bpf.c | 25 + .../testing/selftests/sched_ext/minimal.bpf.c | 21 + tools/testing/selftests/sched_ext/minimal.c | 58 + .../selftests/sched_ext/prog_run.bpf.c | 32 + tools/testing/selftests/sched_ext/prog_run.c | 78 + .../testing/selftests/sched_ext/reload_loop.c | 75 + tools/testing/selftests/sched_ext/runner.c | 201 + tools/testing/selftests/sched_ext/scx_test.h | 131 + .../selftests/sched_ext/select_cpu_dfl.bpf.c | 40 + .../selftests/sched_ext/select_cpu_dfl.c | 72 + .../sched_ext/select_cpu_dfl_nodispatch.bpf.c | 89 + .../sched_ext/select_cpu_dfl_nodispatch.c | 72 + .../sched_ext/select_cpu_dispatch.bpf.c | 41 + .../selftests/sched_ext/select_cpu_dispatch.c | 70 + .../select_cpu_dispatch_bad_dsq.bpf.c | 37 + .../sched_ext/select_cpu_dispatch_bad_dsq.c | 56 + .../select_cpu_dispatch_dbl_dsp.bpf.c | 38 + .../sched_ext/select_cpu_dispatch_dbl_dsp.c | 56 + .../sched_ext/select_cpu_vtime.bpf.c | 92 + .../selftests/sched_ext/select_cpu_vtime.c | 59 + .../selftests/sched_ext/test_example.c | 49 + tools/testing/selftests/sched_ext/util.c | 71 + tools/testing/selftests/sched_ext/util.h | 13 + 212 files changed, 18907 insertions(+), 1086 deletions(-) create mode 100644 Documentation/scheduler/sched-ext.rst create mode 100644 include/linux/sched/ext.h create mode 100644 kernel/sched/ext.c create mode 100644 kernel/sched/ext.h create mode 100644 tools/sched_ext/.gitignore create mode 100644 tools/sched_ext/Makefile create mode 100644 tools/sched_ext/README.md create mode 100644 tools/sched_ext/include/bpf-compat/gnu/stubs.h create mode 100644 tools/sched_ext/include/scx/common.bpf.h create mode 100644 tools/sched_ext/include/scx/common.h create mode 100644 tools/sched_ext/include/scx/compat.bpf.h create mode 100644 tools/sched_ext/include/scx/compat.h create mode 100644 tools/sched_ext/include/scx/user_exit_info.h create mode 100644 tools/sched_ext/scx_central.bpf.c create mode 100644 tools/sched_ext/scx_central.c create mode 100644 tools/sched_ext/scx_flatcg.bpf.c create mode 100644 tools/sched_ext/scx_flatcg.c create mode 100644 tools/sched_ext/scx_flatcg.h create mode 100644 tools/sched_ext/scx_qmap.bpf.c create mode 100644 tools/sched_ext/scx_qmap.c create mode 100644 tools/sched_ext/scx_show_state.py create mode 100644 tools/sched_ext/scx_simple.bpf.c create mode 100644 tools/sched_ext/scx_simple.c create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c create mode 100644 tools/testing/selftests/drivers/net/hw/Makefile create mode 100755 tools/testing/selftests/drivers/net/hw/devlink_port_split.py create mode 100755 tools/testing/selftests/drivers/net/hw/ethtool.sh create mode 100755 tools/testing/selftests/drivers/net/hw/ethtool_extended_state.sh create mode 100644 
tools/testing/selftests/drivers/net/hw/ethtool_lib.sh create mode 100755 tools/testing/selftests/drivers/net/hw/ethtool_mm.sh create mode 100755 tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh create mode 100755 tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh create mode 100755 tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh create mode 100755 tools/testing/selftests/drivers/net/hw/loopback.sh create mode 100644 tools/testing/selftests/drivers/net/hw/settings create mode 100755 tools/testing/selftests/net/forwarding/lib_sh_test.sh create mode 100644 tools/testing/selftests/sched_ext/.gitignore create mode 100644 tools/testing/selftests/sched_ext/Makefile create mode 100644 tools/testing/selftests/sched_ext/config create mode 100644 tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c create mode 100644 tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c create mode 100644 tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c create mode 100644 tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c create mode 100644 tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c create mode 100644 tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c create mode 100644 tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c create mode 100644 tools/testing/selftests/sched_ext/enq_select_cpu_fails.c create mode 100644 tools/testing/selftests/sched_ext/exit.bpf.c create mode 100644 tools/testing/selftests/sched_ext/exit.c create mode 100644 tools/testing/selftests/sched_ext/exit_test.h create mode 100644 tools/testing/selftests/sched_ext/hotplug.bpf.c create mode 100644 tools/testing/selftests/sched_ext/hotplug.c create mode 100644 tools/testing/selftests/sched_ext/hotplug_test.h create mode 100644 tools/testing/selftests/sched_ext/init_enable_count.bpf.c create mode 100644 tools/testing/selftests/sched_ext/init_enable_count.c create mode 100644 tools/testing/selftests/sched_ext/maximal.bpf.c create mode 100644 tools/testing/selftests/sched_ext/maximal.c create mode 100644 tools/testing/selftests/sched_ext/maybe_null.bpf.c create mode 100644 tools/testing/selftests/sched_ext/maybe_null.c create mode 100644 tools/testing/selftests/sched_ext/maybe_null_fail.bpf.c create mode 100644 tools/testing/selftests/sched_ext/minimal.bpf.c create mode 100644 tools/testing/selftests/sched_ext/minimal.c create mode 100644 tools/testing/selftests/sched_ext/prog_run.bpf.c create mode 100644 tools/testing/selftests/sched_ext/prog_run.c create mode 100644 tools/testing/selftests/sched_ext/reload_loop.c create mode 100644 tools/testing/selftests/sched_ext/runner.c create mode 100644 tools/testing/selftests/sched_ext/scx_test.h create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c create mode 100644 
tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_vtime.c create mode 100644 tools/testing/selftests/sched_ext/test_example.c create mode 100644 tools/testing/selftests/sched_ext/util.c create mode 100644 tools/testing/selftests/sched_ext/util.h diff --git a/Documentation/devicetree/bindings/net/renesas,etheravb.yaml b/Documentation/devicetree/bindings/net/renesas,etheravb.yaml index de7ba7f34..21a92f179 100644 --- a/Documentation/devicetree/bindings/net/renesas,etheravb.yaml +++ b/Documentation/devicetree/bindings/net/renesas,etheravb.yaml @@ -88,10 +88,16 @@ properties: '#address-cells': description: Number of address cells for the MDIO bus. const: 1 + deprecated: true '#size-cells': description: Number of size cells on the MDIO bus. const: 0 + deprecated: true + + mdio: + $ref: /schemas/net/mdio.yaml# + unevaluatedProperties: false renesas,no-ether-link: type: boolean @@ -110,9 +116,13 @@ properties: tx-internal-delay-ps: enum: [0, 2000] +# In older bindings there was no mdio child-node to describe the MDIO bus +# and the PHY. To not fail older bindings, accept any node with an address. New +# users should describe the PHY inside the mdio child-node. patternProperties: "@[0-9a-f]$": type: object + deprecated: true required: - compatible @@ -123,8 +133,6 @@ required: - resets - phy-mode - phy-handle - - '#address-cells' - - '#size-cells' allOf: - $ref: ethernet-controller.yaml# diff --git a/Documentation/netlink/specs/rt_link.yaml b/Documentation/netlink/specs/rt_link.yaml index 8e4d19ade..81a5a3d1b 100644 --- a/Documentation/netlink/specs/rt_link.yaml +++ b/Documentation/netlink/specs/rt_link.yaml @@ -50,7 +50,16 @@ definitions: name: dormant - name: echo - + - + name: vlan-protocols + type: enum + entries: + - + name: 8021q + value: 33024 + - + name: 8021ad + value: 34984 - name: rtgenmsg type: struct @@ -729,7 +738,38 @@ definitions: - name: filter-mask type: u32 - + - + name: ifla-vlan-flags + type: struct + members: + - + name: flags + type: u32 + enum: vlan-flags + enum-as-flags: true + - + name: mask + type: u32 + display-hint: hex + - + name: vlan-flags + type: flags + entries: + - reorder-hdr + - gvrp + - loose-binding + - mvrp + - bridge-binding + - + name: ifla-vlan-qos-mapping + type: struct + members: + - + name: from + type: u32 + - + name: to + type: u32 attribute-sets: - @@ -1507,6 +1547,39 @@ attribute-sets: - name: num-disabled-queues type: u32 + - + name: linkinfo-vlan-attrs + name-prefix: ifla-vlan- + attributes: + - + name: id + type: u16 + - + name: flag + type: binary + struct: ifla-vlan-flags + - + name: egress-qos + type: nest + nested-attributes: ifla-vlan-qos + - + name: ingress-qos + type: nest + nested-attributes: ifla-vlan-qos + - + name: protocol + type: u16 + enum: vlan-protocols + byte-order: big-endian + - + name: ifla-vlan-qos + name-prefix: ifla-vlan-qos + attributes: + - + name: mapping + type: binary + multi-attr: true + struct: ifla-vlan-qos-mapping - name: linkinfo-vrf-attrs name-prefix: ifla-vrf- @@ -1666,6 +1739,9 @@ sub-messages: - value: tun attribute-set: linkinfo-tun-attrs + - + value: vlan + attribute-set: linkinfo-vlan-attrs - value: vrf attribute-set: linkinfo-vrf-attrs diff --git a/Documentation/networking/dns_resolver.rst b/Documentation/networking/dns_resolver.rst index add4d59a9..c0364f707 100644 --- a/Documentation/networking/dns_resolver.rst +++ b/Documentation/networking/dns_resolver.rst @@ -118,7 +118,7 @@ Keys of dns_resolver type can be read from userspace
using keyctl_read() or
 
 Mechanism
 =========
 
-The dnsresolver module registers a key type called "dns_resolver". Keys of
+The dns_resolver module registers a key type called "dns_resolver". Keys of
 this type are used to transport and cache DNS lookup results from userspace.
 
 When dns_query() is invoked, it calls request_key() to search the local
@@ -152,4 +152,4 @@ Debugging
 Debugging messages can be turned on dynamically by writing a 1 into the
 following file::
 
-	/sys/module/dnsresolver/parameters/debug
+	/sys/module/dns_resolver/parameters/debug
diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst
index 43bd8a145..0611dc3dd 100644
--- a/Documentation/scheduler/index.rst
+++ b/Documentation/scheduler/index.rst
@@ -20,6 +20,7 @@ Scheduler
     sched-nice-design
     sched-rt-group
     sched-stats
+    sched-ext
     sched-debug
 
     text_files
diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
new file mode 100644
index 000000000..b01c4f22f
--- /dev/null
+++ b/Documentation/scheduler/sched-ext.rst
@@ -0,0 +1,307 @@
+==========================
+Extensible Scheduler Class
+==========================
+
+sched_ext is a scheduler class whose behavior can be defined by a set of BPF
+programs - the BPF scheduler.
+
+* sched_ext exports a full scheduling interface so that any scheduling
+  algorithm can be implemented on top.
+
+* The BPF scheduler can group CPUs however it sees fit and schedule them
+  together, as tasks aren't tied to specific CPUs at the time of wakeup.
+
+* The BPF scheduler can be turned on and off dynamically anytime.
+
+* The system integrity is maintained no matter what the BPF scheduler does.
+  The default scheduling behavior is restored anytime an error is detected,
+  a runnable task stalls, or on invoking the SysRq key sequence
+  :kbd:`SysRq-S`.
+
+Switching to and from sched_ext
+===============================
+
+``CONFIG_SCHED_CLASS_EXT`` is the config option to enable sched_ext and
+``tools/sched_ext`` contains the example schedulers. The following config
+options should be enabled to use sched_ext:
+
+.. code-block:: none
+
+   CONFIG_BPF=y
+   CONFIG_SCHED_CLASS_EXT=y
+   CONFIG_BPF_SYSCALL=y
+   CONFIG_BPF_JIT=y
+   CONFIG_DEBUG_INFO_BTF=y
+   CONFIG_BPF_JIT_ALWAYS_ON=y
+   CONFIG_BPF_JIT_DEFAULT_ON=y
+   CONFIG_PAHOLE_HAS_SPLIT_BTF=y
+   CONFIG_PAHOLE_HAS_BTF_TAG=y
+
+sched_ext is used only when the BPF scheduler is loaded and running.
+
+If a task explicitly sets its scheduling policy to ``SCHED_EXT``, it will be
+treated as ``SCHED_NORMAL`` and scheduled by CFS until the BPF scheduler is
+loaded. On load, such tasks will be switched to and scheduled by sched_ext.
+
+The BPF scheduler can choose to schedule all normal and lower class tasks by
+calling ``scx_bpf_switch_all()`` from its ``init()`` operation. In this
+case, all ``SCHED_NORMAL``, ``SCHED_BATCH``, ``SCHED_IDLE`` and
+``SCHED_EXT`` tasks are scheduled by sched_ext. In the example schedulers,
+this mode can be selected with the ``-a`` option.
+
+Terminating the sched_ext scheduler program, triggering :kbd:`SysRq-S`, or
+detecting any internal error, including stalled runnable tasks, aborts the
+BPF scheduler and reverts all tasks back to CFS.
+
+.. code-block:: none
+
+   # make -j16 -C tools/sched_ext
+   # tools/sched_ext/scx_simple
+   local=0 global=3
+   local=5 global=24
+   local=9 global=44
+   local=13 global=56
+   local=17 global=72
+   ^CEXIT: BPF scheduler unregistered
+
+The current status of the BPF scheduler can be determined as follows:
+
+.. code-block:: none
+
+   # cat /sys/kernel/sched_ext/state
+   enabled
+   # cat /sys/kernel/sched_ext/root/ops
+   simple
+
+``tools/sched_ext/scx_show_state.py`` is a drgn script which shows more
+detailed information:
+
+.. code-block:: none
+
+   # tools/sched_ext/scx_show_state.py
+   ops           : simple
+   enabled       : 1
+   switching_all : 1
+   switched_all  : 1
+   enable_state  : enabled (2)
+   bypass_depth  : 0
+   nr_rejected   : 0
+
+If ``CONFIG_SCHED_DEBUG`` is set, whether a given task is on sched_ext can
+be determined as follows:
+
+.. code-block:: none
+
+   # grep ext /proc/self/sched
+   ext.enabled : 1
+
+The Basics
+==========
+
+Userspace can implement an arbitrary BPF scheduler by loading a set of BPF
+programs that implement ``struct sched_ext_ops``. The only mandatory field
+is ``ops.name`` which must be a valid BPF object name. All operations are
+optional. The following modified excerpt is from
+``tools/sched_ext/scx_simple.bpf.c`` showing a minimal global FIFO scheduler.
+
+.. code-block:: c
+
+   /*
+    * Decide which CPU a task should be migrated to before being
+    * enqueued (either at wakeup, fork time, or exec time). If an
+    * idle core is found by the default ops.select_cpu() implementation,
+    * then dispatch the task directly to SCX_DSQ_LOCAL and skip the
+    * ops.enqueue() callback.
+    *
+    * Note that this implementation has exactly the same behavior as the
+    * default ops.select_cpu implementation. The behavior of the scheduler
+    * would be exactly the same if the implementation just didn't define the
+    * simple_select_cpu() struct_ops prog.
+    */
+   s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p,
+                      s32 prev_cpu, u64 wake_flags)
+   {
+       s32 cpu;
+       /* Need to initialize or the BPF verifier will reject the program */
+       bool direct = false;
+
+       cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &direct);
+
+       if (direct)
+           scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+
+       return cpu;
+   }
+
+   /*
+    * Do a direct dispatch of a task to the global DSQ. This ops.enqueue()
+    * callback will only be invoked if we failed to find a core to dispatch
+    * to in ops.select_cpu() above.
+    *
+    * Note that this implementation has exactly the same behavior as the
+    * default ops.enqueue implementation, which just dispatches the task
+    * to SCX_DSQ_GLOBAL. The behavior of the scheduler would be exactly the
+    * same if the implementation just didn't define the simple_enqueue
+    * struct_ops prog.
+    */
+   void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
+   {
+       scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+   }
+
+   s32 BPF_STRUCT_OPS(simple_init)
+   {
+       /*
+        * All SCHED_OTHER, SCHED_IDLE, and SCHED_BATCH tasks should
+        * use sched_ext.
+        */
+       scx_bpf_switch_all();
+       return 0;
+   }
+
+   void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
+   {
+       exit_type = ei->type;
+   }
+
+   SEC(".struct_ops")
+   struct sched_ext_ops simple_ops = {
+       .select_cpu  = (void *)simple_select_cpu,
+       .enqueue     = (void *)simple_enqueue,
+       .init        = (void *)simple_init,
+       .exit        = (void *)simple_exit,
+       .name        = "simple",
+   };
+
+Dispatch Queues
+---------------
+
+To match the impedance between the scheduler core and the BPF scheduler,
+sched_ext uses DSQs (dispatch queues) which can operate as both a FIFO and a
+priority queue. By default, there is one global FIFO (``SCX_DSQ_GLOBAL``),
+and one local DSQ per CPU (``SCX_DSQ_LOCAL``). The BPF scheduler can manage
+an arbitrary number of DSQs using ``scx_bpf_create_dsq()`` and
+``scx_bpf_destroy_dsq()``.
+
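+For example, a scheduler which funnels all tasks through a single custom
+DSQ could be structured as follows. This is an illustrative sketch rather
+than an excerpt from the tree - the DSQ ID, the prog names and the (absent)
+error handling are all made up:
+
+.. code-block:: c
+
+   /* arbitrary ID for the one custom DSQ - any value below 2^63 works */
+   #define MY_DSQ_ID 0
+
+   s32 BPF_STRUCT_OPS_SLEEPABLE(sketch_init)
+   {
+       /* the second argument is the NUMA node, -1 for no preference */
+       return scx_bpf_create_dsq(MY_DSQ_ID, -1);
+   }
+
+   void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
+   {
+       /* queue on the custom DSQ instead of SCX_DSQ_GLOBAL */
+       scx_bpf_dispatch(p, MY_DSQ_ID, SCX_SLICE_DFL, enq_flags);
+   }
+
+   void BPF_STRUCT_OPS(sketch_dispatch, s32 cpu, struct task_struct *prev)
+   {
+       /* refill this CPU's local DSQ from the custom DSQ */
+       scx_bpf_consume(MY_DSQ_ID);
+   }
+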
+A CPU always executes a task from its local DSQ. A task is "dispatched" to a
+DSQ. A non-local DSQ is "consumed" to transfer a task to the consuming CPU's
+local DSQ.
+
+When a CPU is looking for the next task to run, if the local DSQ is not
+empty, the first task is picked. Otherwise, the CPU tries to consume the
+global DSQ. If that doesn't yield a runnable task either, ``ops.dispatch()``
+is invoked.
+
+Scheduling Cycle
+----------------
+
+The following briefly shows how a waking task is scheduled and executed.
+
+1. When a task is waking up, ``ops.select_cpu()`` is the first operation
+   invoked. This serves two purposes. First, it provides a CPU selection
+   optimization hint. Second, it wakes up the selected CPU if it is idle.
+
+   The CPU selected by ``ops.select_cpu()`` is an optimization hint and not
+   binding. The actual decision is made at the last step of scheduling.
+   However, there is a small performance gain if the CPU
+   ``ops.select_cpu()`` returns matches the CPU the task eventually runs on.
+
+   A side-effect of selecting a CPU is waking it up from idle. While a BPF
+   scheduler can wake up any CPU using the ``scx_bpf_kick_cpu()`` helper,
+   using ``ops.select_cpu()`` judiciously can be simpler and more efficient.
+
+   A task can be immediately dispatched to a DSQ from ``ops.select_cpu()`` by
+   calling ``scx_bpf_dispatch()``. If the task is dispatched to
+   ``SCX_DSQ_LOCAL`` from ``ops.select_cpu()``, it will be dispatched to the
+   local DSQ of whichever CPU is returned from ``ops.select_cpu()``.
+   Additionally, dispatching directly from ``ops.select_cpu()`` will cause the
+   ``ops.enqueue()`` callback to be skipped.
+
+   Note that the scheduler core will ignore an invalid CPU selection, for
+   example, if it's outside the allowed cpumask of the task.
+
+2. Once the target CPU is selected, ``ops.enqueue()`` is invoked (unless the
+   task was dispatched directly from ``ops.select_cpu()``). ``ops.enqueue()``
+   can make one of the following decisions:
+
+   * Immediately dispatch the task to either the global or local DSQ by
+     calling ``scx_bpf_dispatch()`` with ``SCX_DSQ_GLOBAL`` or
+     ``SCX_DSQ_LOCAL``, respectively.
+
+   * Immediately dispatch the task to a custom DSQ by calling
+     ``scx_bpf_dispatch()`` with a DSQ ID which is smaller than 2^63.
+
+   * Queue the task on the BPF side (see the sketch below).
+
+3. When a CPU is ready to schedule, it first looks at its local DSQ. If
+   empty, it then looks at the global DSQ. If there still isn't a task to
+   run, ``ops.dispatch()`` is invoked which can use the following two
+   functions to populate the local DSQ.
+
+   * ``scx_bpf_dispatch()`` dispatches a task to a DSQ. Any target DSQ can
+     be used - ``SCX_DSQ_LOCAL``, ``SCX_DSQ_LOCAL_ON | cpu``,
+     ``SCX_DSQ_GLOBAL`` or a custom DSQ. While ``scx_bpf_dispatch()``
+     currently can't be called with BPF locks held, this is being worked on
+     and will be supported. ``scx_bpf_dispatch()`` schedules dispatches
+     rather than performing them immediately. There can be up to
+     ``ops.dispatch_max_batch`` pending tasks.
+
+   * ``scx_bpf_consume()`` transfers a task from the specified non-local DSQ
+     to the dispatching DSQ. This function cannot be called with any BPF
+     locks held. ``scx_bpf_consume()`` flushes the pending dispatched tasks
+     before trying to consume the specified DSQ.
+
+4. After ``ops.dispatch()`` returns, if there are tasks in the local DSQ,
+   the CPU runs the first one. If empty, the following steps are taken:
+
+   * Try to consume the global DSQ. If successful, run the task.
+
+   * If ``ops.dispatch()`` has dispatched any tasks, retry #3.
+
+   * If the previous task is an SCX task and still runnable, keep executing
+     it (see ``SCX_OPS_ENQ_LAST``).
+
+   * Go idle.
+
+Note that the BPF scheduler can always choose to dispatch tasks immediately
+in ``ops.enqueue()`` as illustrated in the above simple example. If only the
+built-in DSQs are used, there is no need to implement ``ops.dispatch()`` as
+a task is never queued on the BPF scheduler and both the local and global
+DSQs are consumed automatically.
+
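+As a sketch of the "queue the task on the BPF side" option above, a
+scheduler can park runnable tasks in a regular BPF map from ``ops.enqueue()``
+and pull them back out in ``ops.dispatch()``. The fragment below is loosely
+modeled on ``scx_qmap`` but heavily simplified; the map and prog names are
+illustrative:
+
+.. code-block:: c
+
+   struct {
+       __uint(type, BPF_MAP_TYPE_QUEUE);
+       __uint(max_entries, 4096);
+       __type(value, u32);
+   } queue SEC(".maps");
+
+   void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
+   {
+       u32 pid = p->pid;
+
+       /* queue on the BPF side; fall back to the global DSQ on overflow */
+       if (bpf_map_push_elem(&queue, &pid, 0))
+           scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+   }
+
+   void BPF_STRUCT_OPS(sketch_dispatch, s32 cpu, struct task_struct *prev)
+   {
+       struct task_struct *p;
+       u32 pid;
+
+       if (bpf_map_pop_elem(&queue, &pid))
+           return;
+
+       /* the task may have exited since it was queued; re-look it up */
+       p = bpf_task_from_pid(pid);
+       if (p) {
+           scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+           bpf_task_release(p);
+       }
+   }
+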
+``scx_bpf_dispatch()`` queues the task on the FIFO of the target DSQ. Use
+``scx_bpf_dispatch_vtime()`` for the priority queue. Internal DSQs such as
+``SCX_DSQ_LOCAL`` and ``SCX_DSQ_GLOBAL`` do not support priority-queue
+dispatching, and must be dispatched to with ``scx_bpf_dispatch()``. See the
+function documentation and usage in ``tools/sched_ext/scx_simple.bpf.c`` for
+more information.
+
+Where to Look
+=============
+
+* ``include/linux/sched/ext.h`` defines the core data structures, ops table
+  and constants.
+
+* ``kernel/sched/ext.c`` contains sched_ext core implementation and helpers.
+  The functions prefixed with ``scx_bpf_`` can be called from the BPF
+  scheduler.
+
+* ``tools/sched_ext/`` hosts example BPF scheduler implementations.
+
+  * ``scx_simple[.bpf].c``: Minimal global FIFO scheduler example using a
+    custom DSQ.
+
+  * ``scx_qmap[.bpf].c``: A multi-level FIFO scheduler supporting five
+    levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``.
+
+ABI Instability
+===============
+
+The APIs provided by sched_ext to BPF scheduler programs have no stability
+guarantees. This includes the ops table callbacks and constants defined in
+``include/linux/sched/ext.h``, as well as the ``scx_bpf_`` kfuncs defined in
+``kernel/sched/ext.c``.
+
+While we will attempt to provide a relatively stable API surface when
+possible, these interfaces are subject to change without warning between
+kernel versions.
diff --git a/MAINTAINERS b/MAINTAINERS
index a60b6d4d0..84c9d8308 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -19634,6 +19634,19 @@ F: include/linux/wait.h
 F: include/uapi/linux/sched.h
 F: kernel/sched/
 
+SCHEDULER - SCHED_EXT
+R: Tejun Heo
+R: David Vernet
+L: linux-kernel@vger.kernel.org
+S: Maintained
+W: https://github.com/sched-ext/scx
+T: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git
+F: include/linux/sched/ext.h
+F: kernel/sched/ext.h
+F: kernel/sched/ext.c
+F: tools/sched_ext/
+F: tools/testing/selftests/sched_ext
+
 SCSI LIBSAS SUBSYSTEM
 R: John Garry
 R: Jason Yan
diff --git a/Makefile b/Makefile
index 6151f438f..05a39090c 100644
--- a/Makefile
+++ b/Makefile
@@ -1502,6 +1502,12 @@ ifneq ($(wildcard $(resolve_btfids_O)),)
 	$(Q)$(MAKE) -sC $(srctree)/tools/bpf/resolve_btfids O=$(resolve_btfids_O) clean
 endif
 
+tools-clean-targets := sched_ext
+PHONY += $(tools-clean-targets)
+$(tools-clean-targets):
+	$(Q)$(MAKE) -sC tools $@_clean
+tools_clean: $(tools-clean-targets)
+
 # Clear a bunch of variables before executing the submake
 ifeq ($(quiet),silent_)
 tools_silent=s
@@ -1671,7 +1677,7 @@ PHONY += $(mrproper-dirs) mrproper
 $(mrproper-dirs):
 	$(Q)$(MAKE) $(clean)=$(patsubst _mrproper_%,%,$@)
 
-mrproper: clean $(mrproper-dirs)
+mrproper: clean $(mrproper-dirs) tools_clean
 	$(call cmd,rmfiles)
 	@find .
$(RCS_FIND_IGNORE) \ \( -name '*.rmeta' \) \ diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 1d672457d..01516f83a 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -2222,28 +2222,21 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) /* If building the body of the JITed code fails somehow, * we fall back to the interpretation. */ - if (build_body(&ctx) < 0) { - image_ptr = NULL; - bpf_jit_binary_free(header); - prog = orig_prog; - goto out_imms; - } + if (build_body(&ctx) < 0) + goto out_free; build_epilogue(&ctx); /* 3.) Extra pass to validate JITed Code */ - if (validate_code(&ctx)) { - image_ptr = NULL; - bpf_jit_binary_free(header); - prog = orig_prog; - goto out_imms; - } + if (validate_code(&ctx)) + goto out_free; flush_icache_range((u32)header, (u32)(ctx.target + ctx.idx)); if (bpf_jit_enable > 1) /* there are 2 passes here */ bpf_jit_dump(prog->len, image_size, 2, ctx.target); - bpf_jit_binary_lock_ro(header); + if (bpf_jit_binary_lock_ro(header)) + goto out_free; prog->bpf_func = (void *)ctx.target; prog->jited = 1; prog->jited_len = image_size; @@ -2260,5 +2253,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) bpf_jit_prog_release_other(prog, prog == orig_prog ? tmp : orig_prog); return prog; + +out_free: + image_ptr = NULL; + bpf_jit_binary_free(header); + prog = orig_prog; + goto out_imms; } diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 122021f9b..82d6c562e 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -2176,12 +2176,9 @@ void arch_free_bpf_trampoline(void *image, unsigned int size) bpf_prog_pack_free(image, size); } -void arch_protect_bpf_trampoline(void *image, unsigned int size) -{ -} - -void arch_unprotect_bpf_trampoline(void *image, unsigned int size) +int arch_protect_bpf_trampoline(void *image, unsigned int size) { + return 0; } int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c index e73323d75..7dbefd4ba 100644 --- a/arch/loongarch/net/bpf_jit.c +++ b/arch/loongarch/net/bpf_jit.c @@ -1294,16 +1294,19 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) flush_icache_range((unsigned long)header, (unsigned long)(ctx.image + ctx.idx)); if (!prog->is_func || extra_pass) { + int err; + if (extra_pass && ctx.idx != jit_data->ctx.idx) { pr_err_once("multi-func JIT bug %d != %d\n", ctx.idx, jit_data->ctx.idx); - bpf_jit_binary_free(header); - prog->bpf_func = NULL; - prog->jited = 0; - prog->jited_len = 0; - goto out_offset; + goto out_free; + } + err = bpf_jit_binary_lock_ro(header); + if (err) { + pr_err_once("bpf_jit_binary_lock_ro() returned %d\n", + err); + goto out_free; } - bpf_jit_binary_lock_ro(header); } else { jit_data->ctx = ctx; jit_data->image = image_ptr; @@ -1334,6 +1337,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) out_offset = -1; return prog; + +out_free: + bpf_jit_binary_free(header); + prog->bpf_func = NULL; + prog->jited = 0; + prog->jited_len = 0; + goto out_offset; } /* Indicate the JIT backend supports mixing bpf2bpf and tailcalls. 
*/ diff --git a/arch/mips/net/bpf_jit_comp.c b/arch/mips/net/bpf_jit_comp.c index a40d926b6..e355dfca4 100644 --- a/arch/mips/net/bpf_jit_comp.c +++ b/arch/mips/net/bpf_jit_comp.c @@ -1012,7 +1012,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) bpf_prog_fill_jited_linfo(prog, &ctx.descriptors[1]); /* Set as read-only exec and flush instruction cache */ - bpf_jit_binary_lock_ro(header); + if (bpf_jit_binary_lock_ro(header)) + goto out_err; flush_icache_range((unsigned long)header, (unsigned long)&ctx.target[ctx.jit_index]); diff --git a/arch/parisc/net/bpf_jit_core.c b/arch/parisc/net/bpf_jit_core.c index d6ee2fd45..979f45d4d 100644 --- a/arch/parisc/net/bpf_jit_core.c +++ b/arch/parisc/net/bpf_jit_core.c @@ -167,7 +167,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) bpf_flush_icache(jit_data->header, ctx->insns + ctx->ninsns); if (!prog->is_func || extra_pass) { - bpf_jit_binary_lock_ro(jit_data->header); + if (bpf_jit_binary_lock_ro(jit_data->header)) { + bpf_jit_binary_free(jit_data->header); + prog->bpf_func = NULL; + prog->jited = 0; + prog->jited_len = 0; + goto out_offset; + } prologue_len = ctx->epilogue_offset - ctx->body_len; for (i = 0; i < prog->len; i++) ctx->offset[i] += prologue_len; diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 5af0402e9..fa2f824e3 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -2108,7 +2108,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) print_fn_code(jit.prg_buf, jit.size_prg); } if (!fp->is_func || extra_pass) { - bpf_jit_binary_lock_ro(header); + if (bpf_jit_binary_lock_ro(header)) { + bpf_jit_binary_free(header); + fp = orig_fp; + goto free_addrs; + } } else { jit_data->header = header; jit_data->ctx = jit; diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index fa0759bfe..73bf0aea8 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1602,7 +1602,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) bpf_flush_icache(header, (u8 *)header + header->size); if (!prog->is_func || extra_pass) { - bpf_jit_binary_lock_ro(header); + if (bpf_jit_binary_lock_ro(header)) { + bpf_jit_binary_free(header); + prog = orig_prog; + goto out_off; + } } else { jit_data->ctx = ctx; jit_data->image = image_ptr; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index df5fac428..788a3d6f6 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1351,8 +1351,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image break; case BPF_ALU64 | BPF_MOV | BPF_X: - if (insn->off == BPF_ADDR_SPACE_CAST && - insn->imm == 1U << 16) { + if (insn_is_cast_user(insn)) { if (dst_reg != src_reg) /* 32-bit mov */ emit_mov_reg(&prog, false, dst_reg, src_reg); @@ -3001,12 +3000,9 @@ void arch_free_bpf_trampoline(void *image, unsigned int size) bpf_prog_pack_free(image, size); } -void arch_protect_bpf_trampoline(void *image, unsigned int size) -{ -} - -void arch_unprotect_bpf_trampoline(void *image, unsigned int size) +int arch_protect_bpf_trampoline(void *image, unsigned int size) { + return 0; } int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index c10083a8e..de0f9e5f9 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -2600,8 +2600,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog 
*prog) if (bpf_jit_enable > 1) bpf_jit_dump(prog->len, proglen, pass + 1, image); - if (image) { - bpf_jit_binary_lock_ro(header); + if (image && !bpf_jit_binary_lock_ro(header)) { prog->bpf_func = (void *)image; prog->jited = 1; prog->jited_len = proglen; diff --git a/drivers/net/ethernet/amd/amd8111e.c b/drivers/net/ethernet/amd/amd8111e.c index ea6cfc209..c1b5e9a94 100644 --- a/drivers/net/ethernet/amd/amd8111e.c +++ b/drivers/net/ethernet/amd/amd8111e.c @@ -1796,7 +1796,6 @@ static int amd8111e_probe_one(struct pci_dev *pdev, lp = netdev_priv(dev); lp->pci_dev = pdev; lp->amd8111e_net_dev = dev; - lp->pm_cap = pdev->pm_cap; spin_lock_init(&lp->lock); diff --git a/drivers/net/ethernet/amd/amd8111e.h b/drivers/net/ethernet/amd/amd8111e.h index 9d570adb2..305232f54 100644 --- a/drivers/net/ethernet/amd/amd8111e.h +++ b/drivers/net/ethernet/amd/amd8111e.h @@ -764,7 +764,6 @@ struct amd8111e_priv{ u32 ext_phy_id; struct amd8111e_link_config link_config; - int pm_cap; struct net_device *next; int mii; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c index 17faac715..5c13bcb45 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c @@ -406,7 +406,7 @@ free_sge_txq_uld(struct adapter *adap, struct sge_uld_txq_info *txq_info) for (i = 0; i < nq; i++) { struct sge_uld_txq *txq = &txq_info->uldtxq[i]; - if (txq && txq->q.desc) { + if (txq->q.desc) { tasklet_kill(&txq->qresume_tsk); t4_ofld_eq_free(adap, adap->mbox, adap->pf, 0, txq->q.cntxt_id); diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 9b1f639f6..384ddad65 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -2564,6 +2564,7 @@ static int ravb_mdio_init(struct ravb_private *priv) { struct platform_device *pdev = priv->pdev; struct device *dev = &pdev->dev; + struct device_node *mdio_node; struct phy_device *phydev; struct device_node *pn; int error; @@ -2583,7 +2584,13 @@ static int ravb_mdio_init(struct ravb_private *priv) pdev->name, pdev->id); /* Register MDIO bus */ - error = of_mdiobus_register(priv->mii_bus, dev->of_node); + mdio_node = of_get_child_by_name(dev->of_node, "mdio"); + if (!mdio_node) { + /* backwards compatibility for DT lacking mdio subnode */ + mdio_node = of_node_get(dev->of_node); + } + error = of_mdiobus_register(priv->mii_bus, mdio_node); + of_node_put(mdio_node); if (error) goto out_free_bus; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c index 382e8de12..7ae04d8d2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c @@ -12,10 +12,8 @@ #include #include #include -#include #include #include -#include #include #include #include diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 02217e3c9..1ce3535cb 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -520,6 +520,7 @@ static const struct sysrq_key_op *sysrq_key_table[62] = { NULL, /* P */ NULL, /* Q */ NULL, /* R */ + /* S: May be registered by sched_ext for resetting */ NULL, /* S */ NULL, /* T */ NULL, /* U */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f7749d0f2..05bfe4acb 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -131,6 +131,7 @@ *(__dl_sched_class) \ *(__rt_sched_class) \ *(__fair_sched_class) \ + 
*(__ext_sched_class) \ *(__idle_sched_class) \ __sched_class_lowest = .; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 890e152d5..e52d5b3ee 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1116,8 +1116,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i void *func_addr); void *arch_alloc_bpf_trampoline(unsigned int size); void arch_free_bpf_trampoline(void *image, unsigned int size); -void arch_protect_bpf_trampoline(void *image, unsigned int size); -void arch_unprotect_bpf_trampoline(void *image, unsigned int size); +int __must_check arch_protect_bpf_trampoline(void *image, unsigned int size); int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr); @@ -1622,6 +1621,12 @@ struct bpf_tracing_link { struct bpf_prog *tgt_prog; }; +struct bpf_raw_tp_link { + struct bpf_link link; + struct bpf_raw_event_map *btp; + u64 cookie; +}; + struct bpf_link_primer { struct bpf_link *link; struct file *file; diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ea48c861c..bfc027311 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -132,12 +132,18 @@ enum { CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ + CFTYPE_HIDDEN = (1 << 6), /* file type hidden, see cgroup_show_cftypes() */ + /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ __CFTYPE_ADDED = (1 << 18), }; +enum cfile_flags { + CFILE_HIDDEN = (1 << 0), /* file instance hidden */ +}; + /* * cgroup_file is the handle for a file instance created in a cgroup which * is used, for example, to generate file changed notifications. This can @@ -145,7 +151,9 @@ enum { */ struct cgroup_file { /* do not access any fields from outside cgroup core */ + struct cftype *cft; struct kernfs_node *kn; + unsigned int flags; unsigned long notified_at; struct timer_list notify_timer; }; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 34aaf0e87..32679fcff 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -29,8 +29,6 @@ struct kernel_clone_args; -#ifdef CONFIG_CGROUPS - /* * All weight knobs on the default hierarchy should use the following min, * default and max values. 
The default value is the logarithmic center of @@ -40,6 +38,8 @@ struct kernel_clone_args; #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 +#ifdef CONFIG_CGROUPS + enum { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ @@ -114,6 +114,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); +void cgroup_show_cftype(struct cftype *cft, bool show); void cgroup_file_notify(struct cgroup_file *cfile); void cgroup_file_show(struct cgroup_file *cfile, bool show); diff --git a/include/linux/filter.h b/include/linux/filter.h index c99bc3df2..44934b968 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -228,6 +228,16 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) return insn->code == (BPF_ALU | BPF_MOV | BPF_X) && insn->imm == 1; } +/* addr_space_cast from as(0) to as(1) is for converting bpf arena pointers + * to pointers in user vma. + */ +static inline bool insn_is_cast_user(const struct bpf_insn *insn) +{ + return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && + insn->off == BPF_ADDR_SPACE_CAST && + insn->imm == 1U << 16; +} + /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ #define BPF_LD_IMM64(DST, IMM) \ BPF_LD_IMM64_RAW(DST, 0, IMM) @@ -887,20 +897,22 @@ bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default) #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) -static inline void bpf_prog_lock_ro(struct bpf_prog *fp) +static inline int __must_check bpf_prog_lock_ro(struct bpf_prog *fp) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON if (!fp->jited) { set_vm_flush_reset_perms(fp); - set_memory_ro((unsigned long)fp, fp->pages); + return set_memory_ro((unsigned long)fp, fp->pages); } #endif + return 0; } -static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) +static inline int __must_check +bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { set_vm_flush_reset_perms(hdr); - set_memory_rox((unsigned long)hdr, hdr->size >> PAGE_SHIFT); + return set_memory_rox((unsigned long)hdr, hdr->size >> PAGE_SHIFT); } int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cb37817d6..e41d30eba 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3287,6 +3287,7 @@ static inline void dev_xmit_recursion_dec(void) __this_cpu_dec(softnet_data.xmit.recursion); } +void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); void __netif_schedule(struct Qdisc *q); void netif_schedule_queue(struct netdev_queue *txq); diff --git a/include/linux/sched.h b/include/linux/sched.h index 3c2abbc58..dc07eb0d3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -80,6 +80,8 @@ struct task_group; struct task_struct; struct user_event_mm; +#include + /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). 
@@ -798,6 +800,9 @@ struct task_struct { struct sched_rt_entity rt; struct sched_dl_entity dl; struct sched_dl_entity *dl_server; +#ifdef CONFIG_SCHED_CLASS_EXT + struct sched_ext_entity scx; +#endif const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h new file mode 100644 index 000000000..dd113d4a9 --- /dev/null +++ b/include/linux/sched/ext.h @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef _LINUX_SCHED_EXT_H +#define _LINUX_SCHED_EXT_H + +#ifdef CONFIG_SCHED_CLASS_EXT + +#include +#include + +enum scx_public_consts { + SCX_OPS_NAME_LEN = 128, + + SCX_SLICE_DFL = 20 * NSEC_PER_MSEC, + SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ +}; + +/* + * DSQ (dispatch queue) IDs are 64bit of the format: + * + * Bits: [63] [62 .. 0] + * [ B] [ ID ] + * + * B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs + * ID: 63 bit ID + * + * Built-in IDs: + * + * Bits: [63] [62] [61..32] [31 .. 0] + * [ 1] [ L] [ R ] [ V ] + * + * 1: 1 for built-in DSQs. + * L: 1 for LOCAL_ON DSQ IDs, 0 for others + * V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value. + */ +enum scx_dsq_id_flags { + SCX_DSQ_FLAG_BUILTIN = 1LLU << 63, + SCX_DSQ_FLAG_LOCAL_ON = 1LLU << 62, + + SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, + SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, + SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, + SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, + SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, +}; + +/* + * Dispatch queue (dsq) is a simple FIFO which is used to buffer between the + * scheduler core and the BPF scheduler. See the documentation for more details. + */ +struct scx_dispatch_q { + raw_spinlock_t lock; + struct list_head fifo; /* processed in dispatching order */ + struct rb_root_cached priq; /* processed in p->scx.dsq_vtime order */ + u32 nr; + u64 id; + struct rhash_head hash_node; + struct llist_node free_node; + struct rcu_head rcu; +}; + +/* scx_entity.flags */ +enum scx_ent_flags { + SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ + SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */ + SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ + SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ + + SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */ + SCX_TASK_STATE_BITS = 2, + SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, + + SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ +}; + +/* scx_entity.flags & SCX_TASK_STATE_MASK */ +enum scx_task_state { + SCX_TASK_NONE, /* ops.init_task() not called yet */ + SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */ + SCX_TASK_READY, /* fully initialized, but not in sched_ext */ + SCX_TASK_ENABLED, /* fully initialized and in sched_ext */ + + SCX_TASK_NR_STATES, +}; + +/* scx_entity.dsq_flags */ +enum scx_ent_dsq_flags { + SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */ +}; + +/* + * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from + * everywhere and the following bits track which kfunc sets are currently + * allowed for %current. This simple per-task tracking works because SCX ops + * nest in a limited way. 
BPF will likely implement a way to allow and disallow + * kfuncs depending on the calling context which will replace this manual + * mechanism. See scx_kf_allow(). + */ +enum scx_kf_mask { + SCX_KF_UNLOCKED = 0, /* not sleepable, not rq locked */ + /* all non-sleepables may be nested inside SLEEPABLE */ + SCX_KF_SLEEPABLE = 1 << 0, /* sleepable init operations */ + /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ + SCX_KF_CPU_RELEASE = 1 << 1, /* ops.cpu_release() */ + /* ops.dequeue (in REST) may be nested inside DISPATCH */ + SCX_KF_DISPATCH = 1 << 2, /* ops.dispatch() */ + SCX_KF_ENQUEUE = 1 << 3, /* ops.enqueue() and ops.select_cpu() */ + SCX_KF_SELECT_CPU = 1 << 4, /* ops.select_cpu() */ + SCX_KF_REST = 1 << 5, /* other rq-locked operations */ + + __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH | + SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, + __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, +}; + +/* + * The following is embedded in task_struct and contains all fields necessary + * for a task to be scheduled by SCX. + */ +struct sched_ext_entity { + struct scx_dispatch_q *dsq; + struct { + struct list_head fifo; /* dispatch order */ + struct rb_node priq; /* p->scx.dsq_vtime order */ + } dsq_node; + u32 flags; /* protected by rq lock */ + u32 dsq_flags; /* protected by dsq lock */ + u32 weight; + s32 sticky_cpu; + s32 holding_cpu; + u32 kf_mask; /* see scx_kf_mask above */ + struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ + atomic_long_t ops_state; + + struct list_head runnable_node; /* rq->scx.runnable_list */ + unsigned long runnable_at; + +#ifdef CONFIG_SCHED_CORE + u64 core_sched_at; /* see scx_prio_less() */ +#endif + u64 ddsp_dsq_id; + u64 ddsp_enq_flags; + + /* BPF scheduler modifiable fields */ + + /* + * Runtime budget in nsecs. This is usually set through + * scx_bpf_dispatch() but can also be modified directly by the BPF + * scheduler. Automatically decreased by SCX as the task executes. On + * depletion, a scheduling event is triggered. + * + * This value is cleared to zero if the task is preempted by + * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the + * task ran. Use p->se.sum_exec_runtime instead. + */ + u64 slice; + + /* + * Used to order tasks when dispatching to the vtime-ordered priority + * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime() + * but can also be modified directly by the BPF scheduler. Modifying it + * while a task is queued on a dsq may mangle the ordering and is not + * recommended. + */ + u64 dsq_vtime; + + /* + * If set, reject future sched_setscheduler(2) calls updating the policy + * to %SCHED_EXT with -%EACCES. + * + * If set from ops.init_task() and the task's policy is already + * %SCHED_EXT, which can happen while the BPF scheduler is being loaded + * or by inheriting the parent's policy during fork, the task's policy is + * rejected and forcefully reverted to %SCHED_NORMAL. The number of + * such events is reported through /sys/kernel/debug/sched_ext::nr_rejected.
+ */ + bool disallow; /* reject switching into SCX */ + + /* cold fields */ + struct list_head tasks_node; +#ifdef CONFIG_EXT_GROUP_SCHED + struct cgroup *cgrp_moving_from; +#endif +}; + +void sched_ext_free(struct task_struct *p); +void print_scx_info(const char *log_lvl, struct task_struct *p); + +#else /* !CONFIG_SCHED_CLASS_EXT */ + +static inline void sched_ext_free(struct task_struct *p) {} +static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} + +#endif /* CONFIG_SCHED_CLASS_EXT */ +#endif /* _LINUX_SCHED_EXT_H */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index d362aacf9..4df2f9055 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -63,7 +63,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); -extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern void sched_cancel_fork(struct task_struct *p); extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9d24aec06..517e546a1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4058,12 +4058,6 @@ int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len); int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm); void skb_free_datagram(struct sock *sk, struct sk_buff *skb); -void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len); -static inline void skb_free_datagram_locked(struct sock *sk, - struct sk_buff *skb) -{ - __skb_free_datagram_locked(sk, skb, 0); -} int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags); int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len); int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len); diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 6f9bdfb09..9df3e2973 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -765,8 +765,11 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); -int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); -int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); + +struct bpf_raw_tp_link; +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link); +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link); + struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name); void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp); int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, @@ -794,11 +797,12 @@ perf_event_query_prog_array(struct perf_event *event, void __user *info) { return -EOPNOTSUPP; } -static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *p) +struct bpf_raw_tp_link; +static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link) { return -EOPNOTSUPP; } -static 
inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *p) +static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link) { return -EOPNOTSUPP; } @@ -909,31 +913,31 @@ void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_free_bpf_prog(struct perf_event *event); -void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); -void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2); -void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2, +void bpf_trace_run1(struct bpf_raw_tp_link *link, u64 arg1); +void bpf_trace_run2(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2); +void bpf_trace_run3(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2, u64 arg3); -void bpf_trace_run4(struct bpf_prog *prog, u64 arg1, u64 arg2, +void bpf_trace_run4(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2, u64 arg3, u64 arg4); -void bpf_trace_run5(struct bpf_prog *prog, u64 arg1, u64 arg2, +void bpf_trace_run5(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5); -void bpf_trace_run6(struct bpf_prog *prog, u64 arg1, u64 arg2, +void bpf_trace_run6(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6); -void bpf_trace_run7(struct bpf_prog *prog, u64 arg1, u64 arg2, +void bpf_trace_run7(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7); -void bpf_trace_run8(struct bpf_prog *prog, u64 arg1, u64 arg2, +void bpf_trace_run8(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8); -void bpf_trace_run9(struct bpf_prog *prog, u64 arg1, u64 arg2, +void bpf_trace_run9(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8, u64 arg9); -void bpf_trace_run10(struct bpf_prog *prog, u64 arg1, u64 arg2, +void bpf_trace_run10(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8, u64 arg9, u64 arg10); -void bpf_trace_run11(struct bpf_prog *prog, u64 arg1, u64 arg2, +void bpf_trace_run11(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8, u64 arg9, u64 arg10, u64 arg11); -void bpf_trace_run12(struct bpf_prog *prog, u64 arg1, u64 arg2, +void bpf_trace_run12(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8, u64 arg9, u64 arg10, u64 arg11, u64 arg12); void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index e609cd7da..a2ea11cc9 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -46,8 +46,7 @@ static notrace void \ __bpf_trace_##call(void *__data, proto) \ { \ - struct bpf_prog *prog = __data; \ - CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(prog, CAST_TO_U64(args)); \ + CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(__data, CAST_TO_U64(args)); \ } #undef DECLARE_EVENT_CLASS diff --git a/include/trace/events/bpf_test_run.h b/include/trace/events/bpf_test_run.h index 265447e3f..0c924d39b 100644 --- a/include/trace/events/bpf_test_run.h +++ b/include/trace/events/bpf_test_run.h @@ -7,6 +7,23 @@ #include +TRACE_EVENT(bpf_trigger_tp, + + TP_PROTO(int nonce), + + TP_ARGS(nonce), + + TP_STRUCT__entry( + __field(int, nonce) + ), + + TP_fast_assign( + 
__entry->nonce = nonce; + ), + + TP_printk("nonce %d", __entry->nonce) +); + DECLARE_EVENT_CLASS(bpf_test_finish, TP_PROTO(int *err), diff --git a/include/trace/events/net_probe_common.h b/include/trace/events/net_probe_common.h index 3930119ca..b1f9a4d3e 100644 --- a/include/trace/events/net_probe_common.h +++ b/include/trace/events/net_probe_common.h @@ -41,4 +41,33 @@ #endif +#define TP_STORE_V4MAPPED(__entry, saddr, daddr) \ + do { \ + struct in6_addr *pin6; \ + \ + pin6 = (struct in6_addr *)__entry->saddr_v6; \ + ipv6_addr_set_v4mapped(saddr, pin6); \ + pin6 = (struct in6_addr *)__entry->daddr_v6; \ + ipv6_addr_set_v4mapped(daddr, pin6); \ + } while (0) + +#if IS_ENABLED(CONFIG_IPV6) +#define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6) \ + do { \ + if (sk->sk_family == AF_INET6) { \ + struct in6_addr *pin6; \ + \ + pin6 = (struct in6_addr *)__entry->saddr_v6; \ + *pin6 = saddr6; \ + pin6 = (struct in6_addr *)__entry->daddr_v6; \ + *pin6 = daddr6; \ + } else { \ + TP_STORE_V4MAPPED(__entry, saddr, daddr); \ + } \ + } while (0) +#else +#define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6) \ + TP_STORE_V4MAPPED(__entry, saddr, daddr) +#endif + #endif diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h index fd206a6ab..0d1c5ce4e 100644 --- a/include/trace/events/sock.h +++ b/include/trace/events/sock.h @@ -10,6 +10,7 @@ #include #include #include +#include #define family_names \ EM(AF_INET) \ @@ -159,7 +160,6 @@ TRACE_EVENT(inet_sock_set_state, TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); - struct in6_addr *pin6; __be32 *p32; __entry->skaddr = sk; @@ -177,20 +177,8 @@ TRACE_EVENT(inet_sock_set_state, p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) { - pin6 = (struct in6_addr *)__entry->saddr_v6; - *pin6 = sk->sk_v6_rcv_saddr; - pin6 = (struct in6_addr *)__entry->daddr_v6; - *pin6 = sk->sk_v6_daddr; - } else -#endif - { - pin6 = (struct in6_addr *)__entry->saddr_v6; - ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); - pin6 = (struct in6_addr *)__entry->daddr_v6; - ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); - } + TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, + sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", @@ -223,7 +211,6 @@ TRACE_EVENT(inet_sk_error_report, TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); - struct in6_addr *pin6; __be32 *p32; __entry->error = sk->sk_err; @@ -238,20 +225,8 @@ TRACE_EVENT(inet_sk_error_report, p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) { - pin6 = (struct in6_addr *)__entry->saddr_v6; - *pin6 = sk->sk_v6_rcv_saddr; - pin6 = (struct in6_addr *)__entry->daddr_v6; - *pin6 = sk->sk_v6_daddr; - } else -#endif - { - pin6 = (struct in6_addr *)__entry->saddr_v6; - ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); - pin6 = (struct in6_addr *)__entry->daddr_v6; - ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); - } + TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, + sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c error=%d", diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 699dafd20..3c08a0846 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -12,35 +12,6 @@ #include 
#include -#define TP_STORE_V4MAPPED(__entry, saddr, daddr) \ - do { \ - struct in6_addr *pin6; \ - \ - pin6 = (struct in6_addr *)__entry->saddr_v6; \ - ipv6_addr_set_v4mapped(saddr, pin6); \ - pin6 = (struct in6_addr *)__entry->daddr_v6; \ - ipv6_addr_set_v4mapped(daddr, pin6); \ - } while (0) - -#if IS_ENABLED(CONFIG_IPV6) -#define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6) \ - do { \ - if (sk->sk_family == AF_INET6) { \ - struct in6_addr *pin6; \ - \ - pin6 = (struct in6_addr *)__entry->saddr_v6; \ - *pin6 = saddr6; \ - pin6 = (struct in6_addr *)__entry->daddr_v6; \ - *pin6 = daddr6; \ - } else { \ - TP_STORE_V4MAPPED(__entry, saddr, daddr); \ - } \ - } while (0) -#else -#define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6) \ - TP_STORE_V4MAPPED(__entry, saddr, daddr) -#endif - /* * tcp event with arguments sk and skb * diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3c42b9f1b..96d57e483 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1662,8 +1662,10 @@ union bpf_attr { } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ - __u64 name; - __u32 prog_fd; + __u64 name; + __u32 prog_fd; + __u32 :32; + __aligned_u64 cookie; } raw_tracepoint; struct { /* anonymous struct for BPF_BTF_LOAD */ @@ -3392,6 +3394,10 @@ union bpf_attr { * for the nexthop. If the src addr cannot be derived, * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this * case, *params*->dmac and *params*->smac are not set either. + * **BPF_FIB_LOOKUP_MARK** + * Use the mark present in *params*->mark for the fib lookup. + * This option should not be used with BPF_FIB_LOOKUP_DIRECT, + * as it only has meaning for full lookups. * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. @@ -7118,6 +7124,7 @@ enum { BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), BPF_FIB_LOOKUP_TBID = (1U << 3), BPF_FIB_LOOKUP_SRC = (1U << 4), + BPF_FIB_LOOKUP_MARK = (1U << 5), }; enum { @@ -7195,8 +7202,19 @@ struct bpf_fib_lookup { __u32 tbid; }; - __u8 smac[6]; /* ETH_ALEN */ - __u8 dmac[6]; /* ETH_ALEN */ + union { + /* input */ + struct { + __u32 mark; /* policy routing */ + /* 2 4-byte holes for input */ + }; + + /* output: source and dest mac */ + struct { + __u8 smac[6]; /* ETH_ALEN */ + __u8 dmac[6]; /* ETH_ALEN */ + }; + }; }; struct bpf_redir_neigh { diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 3bac0a8ce..359a14cc7 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -118,6 +118,7 @@ struct clone_args { /* SCHED_ISO: reserved but not implemented yet */ #define SCHED_IDLE 5 #define SCHED_DEADLINE 6 +#define SCHED_EXT 7 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ #define SCHED_RESET_ON_FORK 0x40000000 diff --git a/init/Kconfig b/init/Kconfig index 98246963d..5faa865ba 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1037,6 +1037,11 @@ config RT_GROUP_SCHED realtime bandwidth for them. See Documentation/scheduler/sched-rt-group.rst for more information. 
+config EXT_GROUP_SCHED + bool + depends on SCHED_CLASS_EXT && CGROUP_SCHED + default y + endif #CGROUP_SCHED config SCHED_MM_CID diff --git a/init/init_task.c b/init/init_task.c index 4daee6d76..a32c606cc 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -97,6 +98,20 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { #endif #ifdef CONFIG_CGROUP_SCHED .sched_task_group = &root_task_group, +#endif +#ifdef CONFIG_SCHED_CLASS_EXT + .scx = { + .dsq_node.fifo = LIST_HEAD_INIT(init_task.scx.dsq_node.fifo), + .flags = 0, + .sticky_cpu = -1, + .holding_cpu = -1, + .ops_state = ATOMIC_INIT(0), + .runnable_node = LIST_HEAD_INIT(init_task.scx.runnable_node), + .runnable_at = INITIAL_JIFFIES, + .ddsp_dsq_id = SCX_DSQ_INVALID, + .ddsp_enq_flags = 0, + .slice = SCX_SLICE_DFL, + }, #endif .ptraced = LIST_HEAD_INIT(init_task.ptraced), .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index c2f1fd95a..bae49b743 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -133,4 +133,26 @@ config SCHED_CORE which is the likely usage by Linux distributions, there should be no measurable impact on performance. - +config SCHED_CLASS_EXT + bool "Extensible Scheduling Class" + depends on BPF_SYSCALL && BPF_JIT + help + This option enables a new scheduler class sched_ext (SCX), which + allows scheduling policies to be implemented as BPF programs to + achieve the following: + + - Ease of experimentation and exploration: Enabling rapid + iteration of new scheduling policies. + - Customization: Building application-specific schedulers which + implement policies that are not applicable to general-purpose + schedulers. + - Rapid scheduler deployments: Non-disruptive swap outs of + scheduling policies in production environments. + + sched_ext leverages BPF's struct_ops feature to define a structure + which exports function callbacks and flags to BPF programs that + wish to implement scheduling policies. The struct_ops structure + exported by sched_ext is struct sched_ext_ops, and is conceptually + similar to struct sched_class. + + See Documentation/scheduler/sched-ext.rst for more details.
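To make the help text above concrete, here is a minimal sketch of such a BPF scheduler: a single global FIFO implemented as a struct_ops program. It is modeled loosely on the scx_simple example added under tools/sched_ext/ by this series; the gfifo_* names are invented for illustration, while BPF_STRUCT_OPS(), scx_bpf_dispatch(), SCX_DSQ_GLOBAL and SCX_SLICE_DFL come from the sched_ext headers introduced here. Treat it as a sketch of the sched_ext_ops shape, not a tested scheduler.

/* gfifo: every runnable task goes onto the built-in global DSQ */
#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

void BPF_STRUCT_OPS(gfifo_enqueue, struct task_struct *p, u64 enq_flags)
{
	/*
	 * Queue @p on the global DSQ; CPUs whose local DSQs are empty
	 * consume from it automatically, yielding FIFO ordering.
	 */
	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}

void BPF_STRUCT_OPS(gfifo_exit, struct scx_exit_info *ei)
{
	/* ei->kind and ei->msg report why the scheduler was unloaded */
}

SEC(".struct_ops.link")
struct sched_ext_ops gfifo_ops = {
	.enqueue	= (void *)gfifo_enqueue,
	.exit		= (void *)gfifo_exit,
	.name		= "gfifo",
};

Attached through a struct_ops link (e.g. via a libbpf skeleton), this policy takes over SCHED_EXT tasks (and, without SCX_OPS_SWITCH_PARTIAL, SCHED_NORMAL tasks as well); destroying the link reverts all tasks to the stock scheduler.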
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 343c3456c..6c81630c5 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -37,7 +37,7 @@ */ /* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */ -#define GUARD_SZ (1ull << sizeof(((struct bpf_insn *)0)->off) * 8) +#define GUARD_SZ (1ull << sizeof_field(struct bpf_insn, off) * 8) #define KERN_VM_SZ (SZ_4G + GUARD_SZ) struct bpf_arena { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 43356faaa..86c7884ab 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -728,8 +728,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, cur_image = image; trampoline_start = 0; } - if (err < 0) - goto reset_unlock; *(void **)(kdata + moff) = image + trampoline_start + cfi_get_offset(); @@ -742,8 +740,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (err) goto reset_unlock; } - for (i = 0; i < st_map->image_pages_cnt; i++) - arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE); + for (i = 0; i < st_map->image_pages_cnt; i++) { + err = arch_protect_bpf_trampoline(st_map->image_pages[i], + PAGE_SIZE); + if (err) + goto reset_unlock; + } if (st_map->map.map_flags & BPF_F_LINK) { err = 0; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 82243cb6c..8ba73042a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2575,8 +2575,6 @@ cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_current_comm: return &bpf_get_current_comm_proto; #ifdef CONFIG_CGROUP_NET_CLASSID diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 696bc55de..5aacb1d3c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -908,23 +908,30 @@ static LIST_HEAD(pack_list); static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_prog_pack *pack; + int err; pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)), GFP_KERNEL); if (!pack) return NULL; pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE); - if (!pack->ptr) { - kfree(pack); - return NULL; - } + if (!pack->ptr) + goto out; bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE); bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE); - list_add_tail(&pack->list, &pack_list); set_vm_flush_reset_perms(pack->ptr); - set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); + err = set_memory_rox((unsigned long)pack->ptr, + BPF_PROG_PACK_SIZE / PAGE_SIZE); + if (err) + goto out; + list_add_tail(&pack->list, &pack_list); return pack; + +out: + bpf_jit_free_exec(pack->ptr); + kfree(pack); + return NULL; } void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) @@ -939,9 +946,16 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) size = round_up(size, PAGE_SIZE); ptr = bpf_jit_alloc_exec(size); if (ptr) { + int err; + bpf_fill_ill_insns(ptr, size); set_vm_flush_reset_perms(ptr); - set_memory_rox((unsigned long)ptr, size / PAGE_SIZE); + err = set_memory_rox((unsigned long)ptr, + size / PAGE_SIZE); + if (err) { + bpf_jit_free_exec(ptr); + ptr = NULL; + } } goto out; } @@ -2403,7 +2417,9 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) } finalize: - bpf_prog_lock_ro(fp); + *err = 
bpf_prog_lock_ro(fp); + if (*err) + return fp; /* The tail call compatibility check can only be done at * this late stage as we need to determine, if we deal diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3a088a534..e81059faa 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1490,6 +1490,7 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_nulls_del_rcu(&l->hash_node); htab_elem_free(htab, l); } + cond_resched(); } migrate_enable(); } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 449b9a5d3..d9e7aca8a 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1730,6 +1730,10 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_strtol_proto; case BPF_FUNC_strtoul: return &bpf_strtoul_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_ns_current_pid_tgid: + return &bpf_get_ns_current_pid_tgid_proto; default: break; } @@ -2621,6 +2625,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) +BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 050fe1ebf..939620b91 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -155,16 +155,17 @@ static inline int extract_bit(const u8 *data, size_t index) } /** - * longest_prefix_match() - determine the longest prefix + * __longest_prefix_match() - determine the longest prefix * @trie: The trie to get internal sizes from * @node: The node to operate on * @key: The key to compare to @node * * Determine the longest prefix of @node that matches the bits in @key. */ -static size_t longest_prefix_match(const struct lpm_trie *trie, - const struct lpm_trie_node *node, - const struct bpf_lpm_trie_key_u8 *key) +static __always_inline +size_t __longest_prefix_match(const struct lpm_trie *trie, + const struct lpm_trie_node *node, + const struct bpf_lpm_trie_key_u8 *key) { u32 limit = min(node->prefixlen, key->prefixlen); u32 prefixlen = 0, i = 0; @@ -224,6 +225,13 @@ static size_t longest_prefix_match(const struct lpm_trie *trie, return prefixlen; } +static size_t longest_prefix_match(const struct lpm_trie *trie, + const struct lpm_trie_node *node, + const struct bpf_lpm_trie_key_u8 *key) +{ + return __longest_prefix_match(trie, node, key); +} + /* Called from syscall or from eBPF program */ static void *trie_lookup_elem(struct bpf_map *map, void *_key) { @@ -245,7 +253,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) * If it's the maximum possible prefix for this trie, we have * an exact match and can return it directly. 
*/ - matchlen = longest_prefix_match(trie, node, key); + matchlen = __longest_prefix_match(trie, node, key); if (matchlen == trie->max_prefixlen) { found = node; break; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c28792547..c0f2f052a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3498,17 +3498,12 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, return err; } -struct bpf_raw_tp_link { - struct bpf_link link; - struct bpf_raw_event_map *btp; -}; - static void bpf_raw_tp_link_release(struct bpf_link *link) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); - bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog); + bpf_probe_unregister(raw_tp->btp, raw_tp); bpf_put_raw_tracepoint(raw_tp->btp); } @@ -3808,7 +3803,7 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro #endif /* CONFIG_PERF_EVENTS */ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, - const char __user *user_tp_name) + const char __user *user_tp_name, u64 cookie) { struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; @@ -3855,6 +3850,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, &bpf_raw_tp_link_lops, prog); link->btp = btp; + link->cookie = cookie; err = bpf_link_prime(&link->link, &link_primer); if (err) { @@ -3862,7 +3858,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, goto out_put_btp; } - err = bpf_probe_register(link->btp, prog); + err = bpf_probe_register(link->btp, link); if (err) { bpf_link_cleanup(&link_primer); goto out_put_btp; @@ -3875,11 +3871,13 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, return err; } -#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd +#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { struct bpf_prog *prog; + void __user *tp_name; + __u64 cookie; int fd; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) @@ -3889,7 +3887,9 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); - fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name)); + tp_name = u64_to_user_ptr(attr->raw_tracepoint.name); + cookie = attr->raw_tracepoint.cookie; + fd = bpf_raw_tp_link_attach(prog, tp_name, cookie); if (fd < 0) bpf_prog_put(prog); return fd; @@ -5227,7 +5227,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) goto out; } if (prog->expected_attach_type == BPF_TRACE_RAW_TP) - ret = bpf_raw_tp_link_attach(prog, NULL); + ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie); else if (prog->expected_attach_type == BPF_TRACE_ITER) ret = bpf_iter_link_attach(attr, uattr, prog); else if (prog->expected_attach_type == BPF_LSM_CGROUP) diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index db7599c59..cc50607f8 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -456,7 +456,9 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut if (err < 0) goto out_free; - arch_protect_bpf_trampoline(im->image, im->size); + err = arch_protect_bpf_trampoline(im->image, im->size); + if (err) + goto out_free; WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) @@ -1072,17 +1074,10 @@ void __weak arch_free_bpf_trampoline(void *image, unsigned int size) bpf_jit_free_exec(image); } -void __weak arch_protect_bpf_trampoline(void *image, unsigned int size) -{ - 
WARN_ON_ONCE(size > PAGE_SIZE); - set_memory_rox((long)image, 1); -} - -void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size) +int __weak arch_protect_bpf_trampoline(void *image, unsigned int size) { WARN_ON_ONCE(size > PAGE_SIZE); - set_memory_nx((long)image, 1); - set_memory_rw((long)image, 1); + return set_memory_rox((long)image, 1); } int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 98188379d..58952f040 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10201,8 +10201,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (env->ops->get_func_proto) fn = env->ops->get_func_proto(func_id, env->prog); if (!fn) { - verbose(env, "unknown func %s#%d\n", func_id_name(func_id), - func_id); + verbose(env, "program of this type cannot use helper %s#%d\n", + func_id_name(func_id), func_id); return -EINVAL; } @@ -14564,7 +14564,19 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state struct tnum t; u64 val; -again: + /* In case of GE/GT/SGE/SGT, reuse LE/LT/SLE/SLT logic from below */ + switch (opcode) { + case BPF_JGE: + case BPF_JGT: + case BPF_JSGE: + case BPF_JSGT: + opcode = flip_opcode(opcode); + swap(reg1, reg2); + break; + default: + break; + } + switch (opcode) { case BPF_JEQ: if (is_jmp32) { @@ -14707,14 +14719,6 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value); } break; - case BPF_JGE: - case BPF_JGT: - case BPF_JSGE: - case BPF_JSGT: - /* just reuse LE/LT logic above */ - opcode = flip_opcode(opcode); - swap(reg1, reg2); - goto again; default: return; } @@ -19181,6 +19185,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) if (bpf_prog_calc_tag(func[i])) goto out_free; func[i]->is_func = 1; + func[i]->sleepable = prog->sleepable; func[i]->aux->func_idx = i; /* Below members will be freed only at prog->aux */ func[i]->aux->btf = prog->aux->btf; @@ -19285,10 +19290,14 @@ static int jit_subprogs(struct bpf_verifier_env *env) * bpf_prog_load will add the kallsyms for the main program. */ for (i = 1; i < env->subprog_cnt; i++) { - bpf_prog_lock_ro(func[i]); - bpf_prog_kallsyms_add(func[i]); + err = bpf_prog_lock_ro(func[i]); + if (err) + goto out_free; } + for (i = 1; i < env->subprog_cnt; i++) + bpf_prog_kallsyms_add(func[i]); + /* Last step: make now unused interpreter insns from main * prog consistent for later dump requests, so they can * later look the same as if they were interpreted only.
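The regs_refine_cond_op() rewrite above replaces the old goto-based retry with an up-front canonicalization: GE/GT/SGE/SGT comparisons are flipped to their LE/LT/SLE/SLT mirrors by swapping the two register states, so the range-refinement logic only has to be written for one direction. The identity relied on is simply that a > b holds exactly when b < a. A self-contained userspace illustration of that identity (ordinary C with invented names, not verifier code):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* stand-ins for the BPF conditional jump opcodes involved */
enum jmp_op { OP_JLT, OP_JGT };

static bool branch_taken(enum jmp_op op, uint64_t r1, uint64_t r2)
{
	return op == OP_JLT ? r1 < r2 : r1 > r2;
}

int main(void)
{
	/*
	 * "r1 JGT r2" is taken exactly when "r2 JLT r1" is, so refining
	 * register ranges for GT (and GE/SGT/SGE) can reuse the LT-side
	 * logic after flipping the opcode and swapping the operands.
	 */
	for (uint64_t a = 0; a < 8; a++)
		for (uint64_t b = 0; b < 8; b++)
			assert(branch_taken(OP_JGT, a, b) ==
			       branch_taken(OP_JLT, b, a));
	return 0;
}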
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a66c088c8..b316d3b7f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4206,10 +4206,13 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, if (IS_ERR(kn)) return PTR_ERR(kn); + kernfs_show(kn, !(cft->flags & CFTYPE_HIDDEN)); + if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); + cfile->cft = cft; spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = kn; @@ -4485,6 +4488,24 @@ void cgroup_file_notify(struct cgroup_file *cfile) spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } +static struct kernfs_node *cfile_kn_get(struct cgroup_file *cfile) +{ + struct kernfs_node *kn; + + spin_lock_irq(&cgroup_file_kn_lock); + kn = cfile->kn; + kernfs_get(kn); + spin_unlock_irq(&cgroup_file_kn_lock); + + return kn; +} + +static bool cfile_visible(struct cgroup_file *cfile) +{ + return !(cfile->cft->flags & CFTYPE_HIDDEN) && + !(cfile->flags & CFILE_HIDDEN); +} + /** * cgroup_file_show - show or hide a hidden cgroup file * @cfile: target cgroup_file obtained by setting cftype->file_offset @@ -4494,15 +4515,20 @@ void cgroup_file_show(struct cgroup_file *cfile, bool show) { struct kernfs_node *kn; - spin_lock_irq(&cgroup_file_kn_lock); - kn = cfile->kn; - kernfs_get(kn); - spin_unlock_irq(&cgroup_file_kn_lock); + mutex_lock(&cgroup_mutex); - if (kn) - kernfs_show(kn, show); + if (show) + cfile->flags &= ~CFILE_HIDDEN; + else + cfile->flags |= CFILE_HIDDEN; - kernfs_put(kn); + kn = cfile_kn_get(cfile); + if (kn) { + kernfs_show(kn, cfile_visible(cfile)); + kernfs_put(kn); + } + + mutex_unlock(&cgroup_mutex); } /** @@ -5526,6 +5552,63 @@ static void offline_css(struct cgroup_subsys_state *css) wake_up_all(&css->cgroup->offline_waitq); } +/** + * cgroup_show_cftype - show or hide a cgroup file type + * @cft: cftype to show or hide + * @show: whether to show or hide + * + * Sets %CFTYPE_HIDDEN and shows/hides the matching files according to @show. + * @cft may or may not be added at the time of this call. After hiding, it's + * guaranteed that there are no in-flight operations on the hidden files. + */ +void cgroup_show_cftype(struct cftype *cft, bool show) +{ + struct cgroup_subsys *ss = cft->ss; + struct cgroup *root = ss ? 
&ss->root->cgrp : &cgrp_dfl_root.cgrp; + struct cgroup_subsys_state *css; + + mutex_lock(&cgroup_mutex); + + if (show) + cft->flags &= ~CFTYPE_HIDDEN; + else + cft->flags |= CFTYPE_HIDDEN; + + if (!(cft->flags & __CFTYPE_ADDED)) + goto out_unlock; + + css_for_each_descendant_pre(css, cgroup_css(root, ss)) { + struct cgroup *cgrp = css->cgroup; + struct kernfs_node *kn; + + if (!(css->flags & CSS_VISIBLE)) + continue; + + if (cft->file_offset) { + struct cgroup_file *cfile = + (void *)css + cft->file_offset; + + kn = cfile_kn_get(cfile); + if (kn) { + kernfs_show(kn, cfile_visible(cfile)); + kernfs_put(kn); + } + } else { + char buf[CGROUP_FILE_NAME_MAX]; + + kn = kernfs_find_and_get(cgrp->kn, + cgroup_file_name(cgrp, cft, buf)); + if (kn) { + kernfs_show(kn, show); + kernfs_put(kn); + } + } + } + +out_unlock: + mutex_unlock(&cgroup_mutex); +} + /** * css_create - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with diff --git a/kernel/fork.c b/kernel/fork.c index f186c7298..770835e60 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -975,6 +976,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); + sched_ext_free(tsk); io_uring_free(tsk); cgroup_free(tsk); task_numa_free(tsk, true); @@ -2371,7 +2373,7 @@ __latent_entropy struct task_struct *copy_process( retval = perf_event_init_task(p, clone_flags); if (retval) - goto bad_fork_cleanup_policy; + goto bad_fork_sched_cancel_fork; retval = audit_alloc(p); if (retval) goto bad_fork_cleanup_perf; @@ -2504,7 +2506,9 @@ __latent_entropy struct task_struct *copy_process( * cgroup specific, it unconditionally needs to place the task on a * runqueue. */ - sched_cgroup_fork(p, args); + retval = sched_cgroup_fork(p, args); + if (retval) + goto bad_fork_cancel_cgroup; /* * From this point on we must avoid any synchronous user-space @@ -2550,13 +2554,13 @@ __latent_entropy struct task_struct *copy_process( /* Don't start children in a dying pid namespace */ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; - goto bad_fork_cancel_cgroup; + goto bad_fork_core_free; } /* Let kill terminate clone/fork in the middle */ if (fatal_signal_pending(current)) { retval = -EINTR; - goto bad_fork_cancel_cgroup; + goto bad_fork_core_free; } /* No more failure paths after this point. 
*/ @@ -2630,10 +2634,11 @@ __latent_entropy struct task_struct *copy_process( return p; -bad_fork_cancel_cgroup: +bad_fork_core_free: sched_core_free(p); spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); +bad_fork_cancel_cgroup: cgroup_cancel_fork(p, args); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) { @@ -2672,6 +2677,8 @@ __latent_entropy struct task_struct *copy_process( audit_free(p); bad_fork_cleanup_perf: perf_event_free_task(p); +bad_fork_sched_cancel_fork: + sched_cancel_fork(p); bad_fork_cleanup_policy: lockdep_free_task(p); #ifdef CONFIG_NUMA diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index d9dc9ab37..e0e73b44a 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -21,13 +21,18 @@ #include #include +#include #include +#include #include +#include #include #include #include #include #include +#include +#include #include @@ -52,3 +57,6 @@ #include "cputime.c" #include "deadline.c" +#ifdef CONFIG_SCHED_CLASS_EXT +# include "ext.c" +#endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7019a4045..ae7039676 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -168,7 +168,10 @@ static inline int __task_prio(const struct task_struct *p) if (p->sched_class == &idle_sched_class) return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ - return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */ + if (task_on_scx(p)) + return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */ + + return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */ } /* @@ -197,6 +200,11 @@ static inline bool prio_less(const struct task_struct *a, if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ return cfs_prio_less(a, b, in_fi); +#ifdef CONFIG_SCHED_CLASS_EXT + if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */ + return scx_prio_less(a, b, in_fi); +#endif + return false; } @@ -1255,11 +1263,14 @@ bool sched_can_stop_tick(struct rq *rq) return true; /* - * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; - * if there's more than one we need the tick for involuntary - * preemption. + * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks + * left. For CFS, if there's more than one we need the tick for + * involuntary preemption. For SCX, ask. */ - if (rq->nr_running > 1) + if (!scx_switched_all() && rq->nr_running > 1) + return false; + + if (scx_enabled() && !scx_can_stop_tick(rq)) return false; /* @@ -1342,8 +1353,8 @@ static void set_load_weight(struct task_struct *p, bool update_load) * SCHED_OTHER tasks have to update their load when changing their * weight */ - if (update_load && p->sched_class == &fair_sched_class) { - reweight_task(p, prio); + if (update_load && p->sched_class->reweight_task) { + p->sched_class->reweight_task(task_rq(p), p, prio); } else { load->weight = scale_load(sched_prio_to_weight[prio]); load->inv_weight = sched_prio_to_wmult[prio]; @@ -2214,6 +2225,17 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } +/* + * ->switching_to() is called with the pi_lock and rq_lock held and must not + * mess with locking. + */ +void check_class_changing(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class) +{ + if (prev_class != p->sched_class && p->sched_class->switching_to) + p->sched_class->switching_to(rq, p); +} + /* * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, * use the balance_callback list if you want balancing. 
@@ -2221,9 +2243,9 @@ inline int task_curr(const struct task_struct *p) * this means any call to check_class_changed() must be followed by a call to * balance_callback(). */ -static inline void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio) +void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio) { if (prev_class != p->sched_class) { if (prev_class->switched_from) @@ -3986,6 +4008,15 @@ bool cpus_share_resources(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { + /* + * The BPF scheduler may depend on select_task_rq() being invoked during + * wakeups. In addition, @p may end up executing on a different CPU + * regardless of what happens in the wakeup path making the ttwu_queue + * optimization less meaningful. Skip if on SCX. + */ + if (task_on_scx(p)) + return false; + /* * Do not complicate things with the async wake_list while the CPU is * in hotplug state. @@ -4553,6 +4584,23 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->rt.on_rq = 0; p->rt.on_list = 0; +#ifdef CONFIG_SCHED_CLASS_EXT + p->scx.dsq = NULL; + INIT_LIST_HEAD(&p->scx.dsq_node.fifo); + RB_CLEAR_NODE(&p->scx.dsq_node.priq); + p->scx.flags = 0; + p->scx.weight = 0; + p->scx.sticky_cpu = -1; + p->scx.holding_cpu = -1; + p->scx.kf_mask = 0; + atomic_long_set(&p->scx.ops_state, 0); + INIT_LIST_HEAD(&p->scx.runnable_node); + p->scx.runnable_at = jiffies; + p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; + p->scx.ddsp_enq_flags = 0; + p->scx.slice = SCX_SLICE_DFL; +#endif + #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif @@ -4756,6 +4804,8 @@ late_initcall(sched_core_sysctl_init); */ int sched_fork(unsigned long clone_flags, struct task_struct *p) { + int ret; + __sched_fork(clone_flags, p); /* * We mark the process as NEW here. 
This guarantees that @@ -4792,12 +4842,20 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_reset_on_fork = 0; } - if (dl_prio(p->prio)) - return -EAGAIN; - else if (rt_prio(p->prio)) + scx_pre_fork(p); + + if (dl_prio(p->prio)) { + ret = -EAGAIN; + goto out_cancel; + } else if (rt_prio(p->prio)) { p->sched_class = &rt_sched_class; - else +#ifdef CONFIG_SCHED_CLASS_EXT + } else if (task_should_scx(p)) { + p->sched_class = &ext_sched_class; +#endif + } else { p->sched_class = &fair_sched_class; + } init_entity_runnable_average(&p->se); @@ -4815,9 +4873,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) RB_CLEAR_NODE(&p->pushable_dl_tasks); #endif return 0; + +out_cancel: + scx_cancel_fork(p); + return ret; } -void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) +int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) { unsigned long flags; @@ -4844,11 +4906,19 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return scx_fork(p); +} + +void sched_cancel_fork(struct task_struct *p) +{ + scx_cancel_fork(p); } void sched_post_fork(struct task_struct *p) { uclamp_post_fork(p); + scx_post_fork(p); } unsigned long to_ratio(u64 period, u64 runtime) @@ -5687,6 +5757,7 @@ void scheduler_tick(void) calc_global_load_tick(rq); sched_core_tick(rq); task_tick_mm_cid(rq, curr); + scx_tick(rq); rq_unlock(rq, &rf); @@ -5699,8 +5770,10 @@ void scheduler_tick(void) wq_worker_tick(curr); #ifdef CONFIG_SMP - rq->idle_balance = idle_cpu(cpu); - trigger_load_balance(rq); + if (!scx_switched_all()) { + rq->idle_balance = idle_cpu(cpu); + trigger_load_balance(rq); + } #endif } @@ -6000,7 +6073,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, * We can terminate the balance pass as soon as we know there is * a runnable task of @class priority or higher. */ - for_class_range(class, prev->sched_class, &idle_sched_class) { + for_balance_class_range(class, prev->sched_class, &idle_sched_class) { if (class->balance(rq, prev, rf)) break; } @@ -6018,6 +6091,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) const struct sched_class *class; struct task_struct *p; + if (scx_enabled()) + goto restart; + /* * Optimization: we know that if all tasks are in the fair class we can * call that function directly, but only if the @prev task wasn't of a @@ -6058,10 +6134,12 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (prev->dl_server) prev->dl_server = NULL; - for_each_class(class) { + for_each_active_class(class) { p = class->pick_next_task(rq); - if (p) + if (p) { + scx_next_task_picked(rq, p, class); return p; + } } BUG(); /* The idle class should always have a runnable task. 
*/ @@ -6091,7 +6169,7 @@ static inline struct task_struct *pick_task(struct rq *rq) const struct sched_class *class; struct task_struct *p; - for_each_class(class) { + for_each_active_class(class) { p = class->pick_task(rq); if (p) return p; @@ -7081,12 +7159,16 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag } EXPORT_SYMBOL(default_wake_function); -static void __setscheduler_prio(struct task_struct *p, int prio) +void __setscheduler_prio(struct task_struct *p, int prio) { if (dl_prio(prio)) p->sched_class = &dl_sched_class; else if (rt_prio(prio)) p->sched_class = &rt_sched_class; +#ifdef CONFIG_SCHED_CLASS_EXT + else if (task_should_scx(p)) + p->sched_class = &ext_sched_class; +#endif else p->sched_class = &fair_sched_class; @@ -7247,6 +7329,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) } __setscheduler_prio(p, prio); + check_class_changing(rq, p, prev_class); if (queued) enqueue_task(rq, p, queue_flag); @@ -7468,6 +7551,25 @@ int sched_core_idle_cpu(int cpu) #endif #ifdef CONFIG_SMP +/* + * Load avg and utilization metrics need to be updated periodically and before + * consumption. This function updates the metrics for all subsystems except for + * the fair class. @rq must be locked and have its clock updated. + */ +bool update_other_load_avgs(struct rq *rq) +{ + u64 now = rq_clock_pelt(rq); + const struct sched_class *curr_class = rq->curr->sched_class; + unsigned long thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); + + lockdep_assert_rq_held(rq); + + return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | + update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | + update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | + update_irq_load_avg(rq, 0); +} + /* * This function computes an effective utilization for the given CPU, to be * used for frequency selection given the linear relation: f = u * f_max. @@ -7790,6 +7892,10 @@ static int __sched_setscheduler(struct task_struct *p, goto unlock; } + retval = scx_check_setscheduler(p, policy); + if (retval) + goto unlock; + /* * If not changing anything there's no need to proceed further, * but store a possible modification of reset_on_fork.
@@ -7892,6 +7998,7 @@ static int __sched_setscheduler(struct task_struct *p, __setscheduler_prio(p, newprio); } __setscheduler_uclamp(p, attr); + check_class_changing(rq, p, prev_class); if (queued) { /* @@ -9067,6 +9174,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) case SCHED_NORMAL: case SCHED_BATCH: case SCHED_IDLE: + case SCHED_EXT: ret = 0; break; } @@ -9094,6 +9202,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) case SCHED_NORMAL: case SCHED_BATCH: case SCHED_IDLE: + case SCHED_EXT: ret = 0; } return ret; @@ -9189,6 +9298,7 @@ void sched_show_task(struct task_struct *p) print_worker_info(KERN_INFO, p); print_stop_info(KERN_INFO, p); + print_scx_info(KERN_INFO, p); show_stack(p, NULL, KERN_INFO); put_task_stack(p); } @@ -9574,7 +9684,7 @@ static inline void balance_hotplug_wait(void) #endif /* CONFIG_HOTPLUG_CPU */ -void set_rq_online(struct rq *rq) +void set_rq_online(struct rq *rq, enum rq_onoff_reason reason) { if (!rq->online) { const struct sched_class *class; @@ -9584,12 +9694,12 @@ void set_rq_online(struct rq *rq) for_each_class(class) { if (class->rq_online) - class->rq_online(rq); + class->rq_online(rq, reason); } } } -void set_rq_offline(struct rq *rq) +void set_rq_offline(struct rq *rq, enum rq_onoff_reason reason) { if (rq->online) { const struct sched_class *class; @@ -9597,7 +9707,7 @@ void set_rq_offline(struct rq *rq) update_rq_clock(rq); for_each_class(class) { if (class->rq_offline) - class->rq_offline(rq); + class->rq_offline(rq, reason); } cpumask_clear_cpu(rq->cpu, rq->rd->online); @@ -9693,7 +9803,7 @@ int sched_cpu_activate(unsigned int cpu) rq_lock_irqsave(rq, &rf); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_online(rq); + set_rq_online(rq, RQ_ONOFF_HOTPLUG); } rq_unlock_irqrestore(rq, &rf); @@ -9737,7 +9847,7 @@ int sched_cpu_deactivate(unsigned int cpu) rq_lock_irqsave(rq, &rf); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); + set_rq_offline(rq, RQ_ONOFF_HOTPLUG); } rq_unlock_irqrestore(rq, &rf); @@ -9924,11 +10034,15 @@ void __init sched_init(void) int i; /* Make sure the linker didn't screw up */ - BUG_ON(&idle_sched_class != &fair_sched_class + 1 || - &fair_sched_class != &rt_sched_class + 1 || - &rt_sched_class != &dl_sched_class + 1); #ifdef CONFIG_SMP - BUG_ON(&dl_sched_class != &stop_sched_class + 1); + BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); +#endif + BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); + BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); + BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); +#ifdef CONFIG_SCHED_CLASS_EXT + BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class)); + BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); #endif wait_bit_init(); @@ -9952,6 +10066,9 @@ void __init sched_init(void) root_task_group.shares = ROOT_TASK_GROUP_LOAD; init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_EXT_GROUP_SCHED + root_task_group.scx_weight = CGROUP_WEIGHT_DFL; +#endif /* CONFIG_EXT_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); @@ -10097,6 +10214,7 @@ void __init sched_init(void) balance_push_set(smp_processor_id(), false); #endif init_sched_fair_class(); + init_sched_ext_class(); psi_init(); @@ -10382,6 +10500,7 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) 
goto err; + scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); alloc_uclamp_sched_group(tg, parent); return tg; @@ -10509,6 +10628,7 @@ void sched_move_task(struct task_struct *tsk) put_prev_task(rq, tsk); sched_change_group(tsk, group); + scx_move_task(tsk); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -10523,11 +10643,6 @@ void sched_move_task(struct task_struct *tsk) } } -static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct task_group, css) : NULL; -} - static struct cgroup_subsys_state * cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -10551,6 +10666,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); struct task_group *parent = css_tg(css->parent); + int ret; + + ret = scx_tg_online(tg); + if (ret) + return ret; if (parent) sched_online_group(tg, parent); @@ -10565,6 +10685,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) return 0; } +static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + + scx_tg_offline(tg); +} + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); @@ -10582,9 +10709,10 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_unregister_group(tg); } -#ifdef CONFIG_RT_GROUP_SCHED +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { +#ifdef CONFIG_RT_GROUP_SCHED struct task_struct *task; struct cgroup_subsys_state *css; @@ -10592,7 +10720,8 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; } - return 0; +#endif + return scx_cgroup_can_attach(tset); } #endif @@ -10603,8 +10732,17 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(task, css, tset) sched_move_task(task); + + scx_cgroup_finish_attach(); } +#ifdef CONFIG_EXT_GROUP_SCHED +static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) +{ + scx_cgroup_cancel_attach(tset); +} +#endif + #ifdef CONFIG_UCLAMP_TASK_GROUP static void cpu_util_update_eff(struct cgroup_subsys_state *css) { @@ -10783,9 +10921,15 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) { + int ret; + if (shareval > scale_load_down(ULONG_MAX)) shareval = MAX_SHARES; - return sched_group_set_shares(css_tg(css), scale_load(shareval)); + ret = sched_group_set_shares(css_tg(css), scale_load(shareval)); + if (!ret) + scx_group_set_weight(css_tg(css), + sched_weight_to_cgroup(shareval)); + return ret; } static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, @@ -11182,7 +11326,7 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, } #endif -static struct cftype cpu_legacy_files[] = { +static struct cftype cpu_legacy_cftypes[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { .name = "shares", @@ -11293,38 +11437,44 @@ static int cpu_local_stat_show(struct seq_file *sf, return 0; } +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) + +static unsigned long tg_weight(struct task_group *tg) +{ #ifdef CONFIG_FAIR_GROUP_SCHED + return scale_load_down(tg->shares); +#else + return sched_weight_from_cgroup(tg->scx_weight); +#endif +} + static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct 
task_group *tg = css_tg(css); - u64 weight = scale_load_down(tg->shares); - - return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); + return sched_weight_to_cgroup(tg_weight(css_tg(css))); } static int cpu_weight_write_u64(struct cgroup_subsys_state *css, - struct cftype *cft, u64 weight) + struct cftype *cft, u64 cgrp_weight) { - /* - * cgroup weight knobs should use the common MIN, DFL and MAX - * values which are 1, 100 and 10000 respectively. While it loses - * a bit of range on both ends, it maps pretty well onto the shares - * value used by scheduler and the round-trip conversions preserve - * the original value over the entire range. - */ - if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) + unsigned long weight; + int ret; + + if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) return -ERANGE; - weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); + weight = sched_weight_from_cgroup(cgrp_weight); - return sched_group_set_shares(css_tg(css), scale_load(weight)); + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); + if (!ret) + scx_group_set_weight(css_tg(css), cgrp_weight); + return ret; } static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { - unsigned long weight = scale_load_down(css_tg(css)->shares); + unsigned long weight = tg_weight(css_tg(css)); int last_delta = INT_MAX; int prio, delta; @@ -11343,7 +11493,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 nice) { unsigned long weight; - int idx; + int idx, ret; if (nice < MIN_NICE || nice > MAX_NICE) return -ERANGE; @@ -11352,7 +11502,11 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, idx = array_index_nospec(idx, 40); weight = sched_prio_to_weight[idx]; - return sched_group_set_shares(css_tg(css), scale_load(weight)); + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); + if (!ret) + scx_group_set_weight(css_tg(css), + sched_weight_to_cgroup(weight)); + return ret; } #endif @@ -11413,21 +11567,23 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, } #endif -static struct cftype cpu_files[] = { -#ifdef CONFIG_FAIR_GROUP_SCHED - { +struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1] = { +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) + [CPU_CFTYPE_WEIGHT] = { .name = "weight", .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = cpu_weight_read_u64, .write_u64 = cpu_weight_write_u64, }, - { + [CPU_CFTYPE_WEIGHT_NICE] = { .name = "weight.nice", .flags = CFTYPE_NOT_ON_ROOT, .read_s64 = cpu_weight_nice_read_s64, .write_s64 = cpu_weight_nice_write_s64, }, - { +#endif +#ifdef CONFIG_FAIR_GROUP_SCHED + [CPU_CFTYPE_IDLE] = { .name = "idle", .flags = CFTYPE_NOT_ON_ROOT, .read_s64 = cpu_idle_read_s64, @@ -11435,13 +11591,13 @@ static struct cftype cpu_files[] = { }, #endif #ifdef CONFIG_CFS_BANDWIDTH - { + [CPU_CFTYPE_MAX] = { .name = "max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_max_show, .write = cpu_max_write, }, - { + [CPU_CFTYPE_MAX_BURST] = { .name = "max.burst", .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = cpu_cfs_burst_read_u64, @@ -11449,13 +11605,13 @@ static struct cftype cpu_files[] = { }, #endif #ifdef CONFIG_UCLAMP_TASK_GROUP - { + [CPU_CFTYPE_UCLAMP_MIN] = { .name = "uclamp.min", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_uclamp_min_show, .write = cpu_uclamp_min_write, }, - { + [CPU_CFTYPE_UCLAMP_MAX] = { .name = "uclamp.max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_uclamp_max_show, @@ -11468,16 +11624,20 @@ 
static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_online = cpu_cgroup_css_online, + .css_offline = cpu_cgroup_css_offline, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, .css_local_stat_show = cpu_local_stat_show, -#ifdef CONFIG_RT_GROUP_SCHED +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) .can_attach = cpu_cgroup_can_attach, #endif .attach = cpu_cgroup_attach, - .legacy_cftypes = cpu_legacy_files, - .dfl_cftypes = cpu_files, +#ifdef CONFIG_EXT_GROUP_SCHED + .cancel_attach = cpu_cgroup_cancel_attach, +#endif + .legacy_cftypes = cpu_legacy_cftypes, + .dfl_cftypes = cpu_cftypes, .early_init = true, .threaded = true, }; @@ -12065,3 +12225,38 @@ void sched_mm_cid_fork(struct task_struct *t) t->mm_cid_active = 1; } #endif + +#ifdef CONFIG_SCHED_CLASS_EXT +void sched_deq_and_put_task(struct task_struct *p, int queue_flags, + struct sched_enq_and_set_ctx *ctx) +{ + struct rq *rq = task_rq(p); + + lockdep_assert_rq_held(rq); + + *ctx = (struct sched_enq_and_set_ctx){ + .p = p, + .queue_flags = queue_flags, + .queued = task_on_rq_queued(p), + .running = task_current(rq, p), + }; + + update_rq_clock(rq); + if (ctx->queued) + dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); + if (ctx->running) + put_prev_task(rq, p); +} + +void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) +{ + struct rq *rq = task_rq(ctx->p); + + lockdep_assert_rq_held(rq); + + if (ctx->queued) + enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); + if (ctx->running) + set_next_task(rq, ctx->p); +} +#endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index eece6244f..12174c013 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -197,7 +197,9 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost) { - unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu); + unsigned long min, max; + unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu) + + scx_cpuperf_target(sg_cpu->cpu); util = effective_cpu_util(sg_cpu->cpu, util, &min, &max); util = max(util, boost); @@ -325,16 +327,35 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, } #ifdef CONFIG_NO_HZ_COMMON -static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) +static bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { - unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); - bool ret = idle_calls == sg_cpu->saved_idle_calls; + unsigned long idle_calls; + bool ret; + + /* + * The heuristics in this function are for the fair class. For SCX, the + * performance target comes directly from the BPF scheduler. Let's just + * follow it. + */ + if (scx_switched_all()) + return false; + + /* if capped by uclamp_max, always update to be in compliance */ + if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu))) + return false; + + /* + * Maintain the frequency if the CPU has not been idle recently, as + * reduction is likely to be premature.
+ */ + idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); + ret = idle_calls == sg_cpu->saved_idle_calls; sg_cpu->saved_idle_calls = idle_calls; return ret; } #else -static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } +static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } #endif /* CONFIG_NO_HZ_COMMON */ /* @@ -382,14 +403,8 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, return; next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap); - /* - * Do not reduce the frequency if the CPU has not been idle - * recently, as the reduction is likely to be premature then. - * - * Except when the rq is capped by uclamp_max. - */ - if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && - sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq && + + if (sugov_hold_freq(sg_cpu) && next_f < sg_policy->next_freq && !sg_policy->need_freq_update) { next_f = sg_policy->next_freq; @@ -436,14 +451,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) return; - /* - * Do not reduce the target performance level if the CPU has not been - * idle recently, as the reduction is likely to be premature then. - * - * Except when the rq is capped by uclamp_max. - */ - if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && - sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) + if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util) sg_cpu->util = prev_util; cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min, diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a04a436af..010d1dc5f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2607,7 +2607,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, } /* Assumes rq->lock is held */ -static void rq_online_dl(struct rq *rq) +static void rq_online_dl(struct rq *rq, enum rq_onoff_reason reason) { if (rq->dl.overloaded) dl_set_overload(rq); @@ -2618,7 +2618,7 @@ static void rq_online_dl(struct rq *rq) } /* Assumes rq->lock is held */ -static void rq_offline_dl(struct rq *rq) +static void rq_offline_dl(struct rq *rq, enum rq_onoff_reason reason) { if (rq->dl.overloaded) dl_clear_overload(rq); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8d5d98a58..6f306e1c9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1089,6 +1089,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(dl.runtime); P(dl.deadline); } +#ifdef CONFIG_SCHED_CLASS_EXT + __PS("ext.enabled", task_on_scx(p)); +#endif #undef PN_SCHEDSTAT #undef P_SCHEDSTAT diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c new file mode 100644 index 000000000..fc695b13e --- /dev/null +++ b/kernel/sched/ext.c @@ -0,0 +1,6284 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo
+ * Copyright (c) 2022 David Vernet
+ */
+#define SCX_OP_IDX(op)		(offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
+
+enum scx_consts {
+	SCX_DSP_DFL_MAX_BATCH		= 32,
+	SCX_DSP_MAX_LOOPS		= 32,
+	SCX_WATCHDOG_MAX_TIMEOUT	= 30 * HZ,
+
+	SCX_EXIT_BT_LEN			= 64,
+	SCX_EXIT_MSG_LEN		= 1024,
+	SCX_EXIT_DUMP_DFL_LEN		= 32768,
+
+	SCX_CPUPERF_ONE			= SCHED_CAPACITY_SCALE,
+};
+
+enum scx_exit_kind {
+	SCX_EXIT_NONE,
+	SCX_EXIT_DONE,
+
+	SCX_EXIT_UNREG = 64,	/* user-space initiated unregistration */
+	SCX_EXIT_UNREG_BPF,	/* BPF-initiated unregistration */
+	SCX_EXIT_UNREG_KERN,	/* kernel-initiated unregistration */
+	SCX_EXIT_SYSRQ,		/* requested by 'S' sysrq */
+
+	SCX_EXIT_ERROR = 1024,	/* runtime error, error msg contains details */
+	SCX_EXIT_ERROR_BPF,	/* ERROR but triggered through scx_bpf_error() */
+	SCX_EXIT_ERROR_STALL,	/* watchdog detected stalled runnable tasks */
+};
+
+/*
+ * An exit code can be specified when exiting with scx_bpf_exit() or
+ * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN
+ * respectively. The codes are 64-bit values in the following format:
+ *
+ *	Bits: [63 .. 48 47 .. 32 31 .. 0]
+ *	      [ SYS ACT ] [ SYS RSN ] [ USR ]
+ *
+ *	SYS ACT: System-defined exit actions
+ *	SYS RSN: System-defined exit reasons
+ *	USR    : User-defined exit codes and reasons
+ *
+ * Using the above, users may communicate intention and context by ORing system
+ * actions and/or system reasons with a user-defined exit code.
+ */
+enum scx_exit_code {
+	/* Reasons */
+	SCX_ECODE_RSN_HOTPLUG	= 1LLU << 32,
+
+	/* Actions */
+	SCX_ECODE_ACT_RESTART	= 1LLU << 48,
+};
+
+/*
+ * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
+ * being disabled.
+ */
+struct scx_exit_info {
+	/* %SCX_EXIT_* - broad category of the exit reason */
+	enum scx_exit_kind	kind;
+
+	/* exit code if gracefully exiting */
+	s64			exit_code;
+
+	/* textual representation of the above */
+	const char		*reason;
+
+	/* backtrace if exiting due to an error */
+	unsigned long		*bt;
+	u32			bt_len;
+
+	/* informational message */
+	char			*msg;
+
+	/* debug dump */
+	char			*dump;
+};
+
+/* sched_ext_ops.flags */
+enum scx_ops_flags {
+	/*
+	 * Keep built-in idle tracking even if ops.update_idle() is implemented.
+	 */
+	SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
+
+	/*
+	 * By default, if there are no other tasks to run on the CPU, ext core
+	 * keeps running the current task even after its slice expires. If this
+	 * flag is specified, such tasks are passed to ops.enqueue() with
+	 * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
+	 */
+	SCX_OPS_ENQ_LAST = 1LLU << 1,
+
+	/*
+	 * An exiting task may schedule after PF_EXITING is set. In such cases,
+	 * bpf_task_from_pid() may not be able to find the task and if the BPF
+	 * scheduler depends on pid lookup for dispatching, the task will be
+	 * lost leading to various issues including RCU grace period stalls.
+	 *
+	 * To mask this problem, by default, unhashed tasks are automatically
+	 * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
+	 * depend on pid lookups and wants to handle these tasks directly, the
+	 * following flag can be used.
+	 */
+	SCX_OPS_ENQ_EXITING = 1LLU << 2,
+
+	/*
+	 * If set, only tasks with policy set to SCHED_EXT are attached to
+	 * sched_ext. If clear, SCHED_NORMAL tasks are also included.
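+	 *
+	 * As an illustrative sketch (not normative), a scheduler which only
+	 * wants to manage tasks explicitly switched to SCHED_EXT would set:
+	 *
+	 *	.flags = SCX_OPS_SWITCH_PARTIAL,
+	 *
+	 * in its struct sched_ext_ops.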
+	 */
+	SCX_OPS_SWITCH_PARTIAL	= 1LLU << 3,
+
+	/*
+	 * CPU cgroup knob enable flags
+	 */
+	SCX_OPS_CGROUP_KNOB_WEIGHT = 1LLU << 16,	/* cpu.weight */
+
+	SCX_OPS_ALL_FLAGS	= SCX_OPS_SWITCH_PARTIAL |
+				  SCX_OPS_KEEP_BUILTIN_IDLE |
+				  SCX_OPS_ENQ_LAST |
+				  SCX_OPS_ENQ_EXITING |
+				  SCX_OPS_CGROUP_KNOB_WEIGHT,
+};
+
+/* argument container for ops.init_task() */
+struct scx_init_task_args {
+	/*
+	 * Set if ops.init_task() is being invoked on the fork path, as opposed
+	 * to the scheduler transition path.
+	 */
+	bool			fork;
+#ifdef CONFIG_EXT_GROUP_SCHED
+	/* the cgroup the task is joining */
+	struct cgroup		*cgroup;
+#endif
+};
+
+/* argument container for ops.exit_task() */
+struct scx_exit_task_args {
+	/* Whether the task exited before running on sched_ext. */
+	bool cancelled;
+};
+
+/* argument container for ops->cgroup_init() */
+struct scx_cgroup_init_args {
+	/* the weight of the cgroup [1..10000] */
+	u32			weight;
+};
+
+enum scx_cpu_preempt_reason {
+	/* next task is being scheduled by &sched_class_rt */
+	SCX_CPU_PREEMPT_RT,
+	/* next task is being scheduled by &sched_class_dl */
+	SCX_CPU_PREEMPT_DL,
+	/* next task is being scheduled by &sched_class_stop */
+	SCX_CPU_PREEMPT_STOP,
+	/* unknown reason for SCX being preempted */
+	SCX_CPU_PREEMPT_UNKNOWN,
+};
+
+/*
+ * Argument container for ops->cpu_acquire(). Currently empty, but may be
+ * expanded in the future.
+ */
+struct scx_cpu_acquire_args {};
+
+/* argument container for ops->cpu_release() */
+struct scx_cpu_release_args {
+	/* the reason the CPU was preempted */
+	enum scx_cpu_preempt_reason reason;
+
+	/* the task that's going to be scheduled on the CPU */
+	struct task_struct	*task;
+};
+
+/**
+ * struct sched_ext_ops - Operation table for BPF scheduler implementation
+ *
+ * Userland can implement an arbitrary scheduling policy by implementing and
+ * loading operations in this table.
+ */
+struct sched_ext_ops {
+	/**
+	 * select_cpu - Pick the target CPU for a task which is being woken up
+	 * @p: task being woken up
+	 * @prev_cpu: the cpu @p was on before sleeping
+	 * @wake_flags: SCX_WAKE_*
+	 *
+	 * The decision made here isn't final. @p may be moved to any CPU while
+	 * it is getting dispatched for execution later. However, as @p is not
+	 * on the rq at this point, getting the eventual execution CPU right
+	 * here saves a small bit of overhead down the line.
+	 *
+	 * If an idle CPU is returned, the CPU is kicked and will try to
+	 * dispatch. While an explicit custom mechanism can be added,
+	 * select_cpu() serves as the default way to wake up idle CPUs.
+	 *
+	 * @p may be dispatched directly by calling scx_bpf_dispatch(). If @p
+	 * is dispatched, the ops.enqueue() callback will be skipped. Finally,
+	 * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the
+	 * local DSQ of whatever CPU is returned by this callback.
+	 */
+	s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
+
+	/**
+	 * enqueue - Enqueue a task on the BPF scheduler
+	 * @p: task being enqueued
+	 * @enq_flags: %SCX_ENQ_*
+	 *
+	 * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch()
+	 * or enqueue on the BPF scheduler. If not directly dispatched, the BPF
+	 * scheduler owns @p and if it fails to dispatch @p, the task will
+	 * stall.
+	 *
+	 * If @p was dispatched from ops.select_cpu(), this callback is
+	 * skipped.
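+	 *
+	 * As a minimal illustrative sketch (assuming the BPF_STRUCT_OPS()
+	 * helper from the example tooling), an enqueue implementation which
+	 * sends every task to the global DSQ could look like:
+	 *
+	 *	void BPF_STRUCT_OPS(minimal_enqueue, struct task_struct *p,
+	 *			    u64 enq_flags)
+	 *	{
+	 *		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL,
+	 *				 enq_flags);
+	 *	}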
+	 */
+	void (*enqueue)(struct task_struct *p, u64 enq_flags);
+
+	/**
+	 * dequeue - Remove a task from the BPF scheduler
+	 * @p: task being dequeued
+	 * @deq_flags: %SCX_DEQ_*
+	 *
+	 * Remove @p from the BPF scheduler. This is usually called to isolate
+	 * the task while updating its scheduling properties (e.g. priority).
+	 *
+	 * The ext core keeps track of whether the BPF side owns a given task or
+	 * not and can gracefully ignore spurious dispatches from the BPF side,
+	 * which makes it safe to not implement this method. However, depending
+	 * on the scheduling logic, this can lead to confusing behaviors - e.g.
+	 * scheduling position not being updated across a priority change.
+	 */
+	void (*dequeue)(struct task_struct *p, u64 deq_flags);
+
+	/**
+	 * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs
+	 * @cpu: CPU to dispatch tasks for
+	 * @prev: previous task being switched out
+	 *
+	 * Called when a CPU's local dsq is empty. The operation should dispatch
+	 * one or more tasks from the BPF scheduler into the DSQs using
+	 * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using
+	 * scx_bpf_consume().
+	 *
+	 * The maximum number of times scx_bpf_dispatch() can be called without
+	 * an intervening scx_bpf_consume() is specified by
+	 * ops.dispatch_max_batch. See the comments on top of the two functions
+	 * for more details.
+	 *
+	 * When not %NULL, @prev is an SCX task with its slice depleted. If
+	 * @prev is still runnable, as indicated by %SCX_TASK_QUEUED being set
+	 * in @prev->scx.flags, it is not enqueued yet and will be enqueued
+	 * after ops.dispatch() returns. To keep executing @prev, return without
+	 * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST.
+	 */
+	void (*dispatch)(s32 cpu, struct task_struct *prev);
+
+	/**
+	 * tick - Periodic tick
+	 * @p: task running currently
+	 *
+	 * This operation is called every 1/HZ seconds on CPUs which are
+	 * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
+	 * immediate dispatch cycle on the CPU.
+	 */
+	void (*tick)(struct task_struct *p);
+
+	/**
+	 * runnable - A task is becoming runnable on its associated CPU
+	 * @p: task becoming runnable
+	 * @enq_flags: %SCX_ENQ_*
+	 *
+	 * This and the following three functions can be used to track a task's
+	 * execution state transitions. A task becomes ->runnable() on a CPU,
+	 * and then goes through one or more ->running() and ->stopping() pairs
+	 * as it runs on the CPU, and eventually becomes ->quiescent() when it's
+	 * done running on the CPU.
+	 *
+	 * @p is becoming runnable on the CPU because it's
+	 *
+	 * - waking up (%SCX_ENQ_WAKEUP)
+	 * - being moved from another CPU
+	 * - being restored after being temporarily taken off the queue for an
+	 *   attribute change.
+	 *
+	 * This and ->enqueue() are related but not coupled. This operation
+	 * notifies @p's state transition and may not be followed by ->enqueue()
+	 * e.g. when @p is being dispatched to a remote CPU. Likewise, a task
+	 * may be ->enqueue()'d without being preceded by this operation e.g.
+	 * after exhausting its slice.
+	 */
+	void (*runnable)(struct task_struct *p, u64 enq_flags);
+
+	/**
+	 * running - A task is starting to run on its associated CPU
+	 * @p: task starting to run
+	 *
+	 * See ->runnable() for explanation on the task state notifiers.
+	 */
+	void (*running)(struct task_struct *p);
+
+	/**
+	 * stopping - A task is stopping execution
+	 * @p: task that is stopping
+	 * @runnable: is task @p still runnable?
+	 *
+	 * See ->runnable() for explanation on the task state notifiers.
If + * !@runnable, ->quiescent() will be invoked after this operation + * returns. + */ + void (*stopping)(struct task_struct *p, bool runnable); + + /** + * quiescent - A task is becoming not runnable on its associated CPU + * @p: task becoming not runnable + * @deq_flags: %SCX_DEQ_* + * + * See ->runnable() for explanation on the task state notifiers. + * + * @p is becoming quiescent on the CPU because it's + * + * - sleeping (%SCX_DEQ_SLEEP) + * - being moved to another CPU + * - being temporarily taken off the queue for an attribute change + * (%SCX_DEQ_SAVE) + * + * This and ->dequeue() are related but not coupled. This operation + * notifies @p's state transition and may not be preceded by ->dequeue() + * e.g. when @p is being dispatched to a remote CPU. + */ + void (*quiescent)(struct task_struct *p, u64 deq_flags); + + /** + * yield - Yield CPU + * @from: yielding task + * @to: optional yield target task + * + * If @to is NULL, @from is yielding the CPU to other runnable tasks. + * The BPF scheduler should ensure that other available tasks are + * dispatched before the yielding task. Return value is ignored in this + * case. + * + * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf + * scheduler can implement the request, return %true; otherwise, %false. + */ + bool (*yield)(struct task_struct *from, struct task_struct *to); + + /** + * core_sched_before - Task ordering for core-sched + * @a: task A + * @b: task B + * + * Used by core-sched to determine the ordering between two tasks. See + * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on + * core-sched. + * + * Both @a and @b are runnable and may or may not currently be queued on + * the BPF scheduler. Should return %true if @a should run before @b. + * %false if there's no required ordering or @b should run before @a. + * + * If not specified, the default is ordering them according to when they + * became runnable. + */ + bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); + + /** + * set_weight - Set task weight + * @p: task to set weight for + * @weight: new eight [1..10000] + * + * Update @p's weight to @weight. + */ + void (*set_weight)(struct task_struct *p, u32 weight); + + /** + * set_cpumask - Set CPU affinity + * @p: task to set CPU affinity for + * @cpumask: cpumask of cpus that @p can run on + * + * Update @p's CPU affinity to @cpumask. + */ + void (*set_cpumask)(struct task_struct *p, + const struct cpumask *cpumask); + + /** + * update_idle - Update the idle state of a CPU + * @cpu: CPU to udpate the idle state for + * @idle: whether entering or exiting the idle state + * + * This operation is called when @rq's CPU goes or leaves the idle + * state. By default, implementing this operation disables the built-in + * idle CPU tracking and the following helpers become unavailable: + * + * - scx_bpf_select_cpu_dfl() + * - scx_bpf_test_and_clear_cpu_idle() + * - scx_bpf_pick_idle_cpu() + * + * The user also must implement ops.select_cpu() as the default + * implementation relies on scx_bpf_select_cpu_dfl(). + * + * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle + * tracking. + */ + void (*update_idle)(s32 cpu, bool idle); + + /** + * cpu_acquire - A CPU is becoming available to the BPF scheduler + * @cpu: The CPU being acquired by the BPF scheduler. + * @args: Acquire arguments, see the struct definition. + * + * A CPU that was previously released from the BPF scheduler is now once + * again under its control. 
+	 */
+	void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
+
+	/**
+	 * cpu_release - A CPU is taken away from the BPF scheduler
+	 * @cpu: The CPU being released by the BPF scheduler.
+	 * @args: Release arguments, see the struct definition.
+	 *
+	 * The specified CPU is no longer under the control of the BPF
+	 * scheduler. This could be because it was preempted by a higher
+	 * priority sched_class, though there may be other reasons as well. The
+	 * caller should consult @args->reason to determine the cause.
+	 */
+	void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
+
+	/**
+	 * init_task - Initialize a task to run in a BPF scheduler
+	 * @p: task to initialize for BPF scheduling
+	 * @args: init arguments, see the struct definition
+	 *
+	 * Either we're loading a BPF scheduler or a new task is being forked.
+	 * Initialize @p for BPF scheduling. This operation may block and can
+	 * be used for allocations, and is called exactly once for a task.
+	 *
+	 * Return 0 for success, -errno for failure. An error return while
+	 * loading will abort loading of the BPF scheduler. During a fork, it
+	 * will abort that specific fork.
+	 */
+	s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
+
+	/**
+	 * exit_task - Exit a previously-running task from the system
+	 * @p: task to exit
+	 * @args: exit arguments, see the struct definition
+	 *
+	 * @p is exiting or the BPF scheduler is being unloaded. Perform any
+	 * necessary cleanup for @p.
+	 */
+	void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
+
+	/**
+	 * enable - Enable BPF scheduling for a task
+	 * @p: task to enable BPF scheduling for
+	 *
+	 * Enable @p for BPF scheduling. enable() is called on @p any time it
+	 * enters SCX, and is always paired with a matching disable().
+	 */
+	void (*enable)(struct task_struct *p);
+
+	/**
+	 * disable - Disable BPF scheduling for a task
+	 * @p: task to disable BPF scheduling for
+	 *
+	 * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
+	 * Disable BPF scheduling for @p. A disable() call is always matched
+	 * with a prior enable() call.
+	 */
+	void (*disable)(struct task_struct *p);
+
+#ifdef CONFIG_EXT_GROUP_SCHED
+	/**
+	 * cgroup_init - Initialize a cgroup
+	 * @cgrp: cgroup being initialized
+	 * @args: init arguments, see the struct definition
+	 *
+	 * Either the BPF scheduler is being loaded or @cgrp is being created;
+	 * initialize @cgrp for sched_ext. This operation may block.
+	 *
+	 * Return 0 for success, -errno for failure. An error return while
+	 * loading will abort loading of the BPF scheduler. During cgroup
+	 * creation, it will abort the specific cgroup creation.
+	 */
+	s32 (*cgroup_init)(struct cgroup *cgrp,
+			   struct scx_cgroup_init_args *args);
+
+	/**
+	 * cgroup_exit - Exit a cgroup
+	 * @cgrp: cgroup being exited
+	 *
+	 * Either the BPF scheduler is being unloaded or @cgrp is being
+	 * destroyed; exit @cgrp for sched_ext. This operation may block.
+	 */
+	void (*cgroup_exit)(struct cgroup *cgrp);
+
+	/**
+	 * cgroup_prep_move - Prepare a task to be moved to a different cgroup
+	 * @p: task being moved
+	 * @from: cgroup @p is being moved from
+	 * @to: cgroup @p is being moved to
+	 *
+	 * Prepare @p for move from cgroup @from to @to. This operation may
+	 * block and can be used for allocations.
+	 *
+	 * Return 0 for success, -errno for failure. An error return aborts the
+	 * migration.
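+	 *
+	 * Together with cgroup_move() and cgroup_cancel_move() below, this
+	 * forms a two-phase commit: a successful prep is followed by exactly
+	 * one of move (commit) or cancel_move (abort).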
+	 */
+	s32 (*cgroup_prep_move)(struct task_struct *p,
+				struct cgroup *from, struct cgroup *to);
+
+	/**
+	 * cgroup_move - Commit cgroup move
+	 * @p: task being moved
+	 * @from: cgroup @p is being moved from
+	 * @to: cgroup @p is being moved to
+	 *
+	 * Commit the move. @p is dequeued during this operation.
+	 */
+	void (*cgroup_move)(struct task_struct *p,
+			    struct cgroup *from, struct cgroup *to);
+
+	/**
+	 * cgroup_cancel_move - Cancel cgroup move
+	 * @p: task whose cgroup move is being canceled
+	 * @from: cgroup @p was being moved from
+	 * @to: cgroup @p was being moved to
+	 *
+	 * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
+	 * Undo the preparation.
+	 */
+	void (*cgroup_cancel_move)(struct task_struct *p,
+				   struct cgroup *from, struct cgroup *to);
+
+	/**
+	 * cgroup_set_weight - A cgroup's weight is being changed
+	 * @cgrp: cgroup whose weight is being updated
+	 * @weight: new weight [1..10000]
+	 *
+	 * Update @cgrp's weight to @weight.
+	 */
+	void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
+#endif	/* CONFIG_EXT_GROUP_SCHED */
+
+	/*
+	 * All online ops must come before ops.cpu_online().
+	 */
+
+	/**
+	 * cpu_online - A CPU became online
+	 * @cpu: CPU which just came up
+	 *
+	 * @cpu just came online. @cpu doesn't call ops.enqueue() or run tasks
+	 * associated with other CPUs beforehand.
+	 */
+	void (*cpu_online)(s32 cpu);
+
+	/**
+	 * cpu_offline - A CPU is going offline
+	 * @cpu: CPU which is going offline
+	 *
+	 * @cpu is going offline. @cpu doesn't call ops.enqueue() or run tasks
+	 * associated with other CPUs afterwards.
+	 */
+	void (*cpu_offline)(s32 cpu);
+
+	/*
+	 * All CPU hotplug ops must come before ops.init().
+	 */
+
+	/**
+	 * init - Initialize the BPF scheduler
+	 */
+	s32 (*init)(void);
+
+	/**
+	 * exit - Clean up after the BPF scheduler
+	 * @info: Exit info
+	 */
+	void (*exit)(struct scx_exit_info *info);
+
+	/**
+	 * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch
+	 */
+	u32 dispatch_max_batch;
+
+	/**
+	 * flags - %SCX_OPS_* flags
+	 */
+	u64 flags;
+
+	/**
+	 * timeout_ms - The maximum amount of time, in milliseconds, that a
+	 * runnable task should be able to wait before being scheduled. The
+	 * maximum timeout may not exceed the default timeout of 30 seconds.
+	 *
+	 * Defaults to the maximum allowed timeout value of 30 seconds.
+	 */
+	u32 timeout_ms;
+
+	/**
+	 * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default
+	 * value of 32768 is used.
+	 */
+	u32 exit_dump_len;
+
+	/**
+	 * hotplug_seq - A sequence number that may be set by the scheduler to
+	 * detect when a hotplug event has occurred during the loading process.
+	 * If 0, no detection occurs. Otherwise, the scheduler will fail to
+	 * load if the sequence number does not match @scx_hotplug_seq on the
+	 * enable path.
+	 */
+	u64 hotplug_seq;
+
+	/**
+	 * name - BPF scheduler's name
+	 *
+	 * Must be a non-zero valid BPF object name including only isalnum(),
+	 * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
+	 * BPF scheduler is enabled.
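+	 *
+	 * For example (hypothetical): .name = "minimal".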
+	 */
+	char name[SCX_OPS_NAME_LEN];
+};
+
+enum scx_opi {
+	SCX_OPI_BEGIN			= 0,
+	SCX_OPI_NORMAL_BEGIN		= 0,
+	SCX_OPI_NORMAL_END		= SCX_OP_IDX(cpu_online),
+	SCX_OPI_CPU_HOTPLUG_BEGIN	= SCX_OP_IDX(cpu_online),
+	SCX_OPI_CPU_HOTPLUG_END		= SCX_OP_IDX(init),
+	SCX_OPI_END			= SCX_OP_IDX(init),
+};
+
+enum scx_wake_flags {
+	/* expose select WF_* flags as enums */
+	SCX_WAKE_FORK		= WF_FORK,
+	SCX_WAKE_TTWU		= WF_TTWU,
+	SCX_WAKE_SYNC		= WF_SYNC,
+};
+
+enum scx_enq_flags {
+	/* expose select ENQUEUE_* flags as enums */
+	SCX_ENQ_WAKEUP		= ENQUEUE_WAKEUP,
+	SCX_ENQ_HEAD		= ENQUEUE_HEAD,
+
+	/* high 32bits are SCX specific */
+
+	/*
+	 * Set the following to trigger preemption when calling
+	 * scx_bpf_dispatch() with a local dsq as the target. The slice of the
+	 * current task is cleared to zero and the CPU is kicked into the
+	 * scheduling path. Implies %SCX_ENQ_HEAD.
+	 */
+	SCX_ENQ_PREEMPT		= 1LLU << 32,
+
+	/*
+	 * The task being enqueued was previously enqueued on the current CPU's
+	 * %SCX_DSQ_LOCAL, but was removed from it in a call to the
+	 * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was
+	 * invoked in a ->cpu_release() callback, and the task is again
+	 * dispatched back to %SCX_DSQ_LOCAL by this current ->enqueue(), the
+	 * task will not be scheduled on the CPU until at least the next
+	 * invocation of the ->cpu_acquire() callback.
+	 */
+	SCX_ENQ_REENQ		= 1LLU << 40,
+
+	/*
+	 * The task being enqueued is the only task available for the CPU. By
+	 * default, ext core keeps executing such tasks but when
+	 * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
+	 * %SCX_ENQ_LAST flag set.
+	 *
+	 * If the BPF scheduler wants to continue executing the task,
+	 * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately.
+	 * If the task gets queued on a different dsq or the BPF side, the BPF
+	 * scheduler is responsible for triggering a follow-up scheduling event.
+	 * Otherwise, execution may stall.
+	 */
+	SCX_ENQ_LAST		= 1LLU << 41,
+
+	/* high 8 bits are internal */
+	__SCX_ENQ_INTERNAL_MASK	= 0xffLLU << 56,
+
+	SCX_ENQ_CLEAR_OPSS	= 1LLU << 56,
+	SCX_ENQ_DSQ_PRIQ	= 1LLU << 57,
+};
+
+enum scx_deq_flags {
+	/* expose select DEQUEUE_* flags as enums */
+	SCX_DEQ_SLEEP		= DEQUEUE_SLEEP,
+
+	/* high 32bits are SCX specific */
+
+	/*
+	 * The generic core-sched layer decided to execute the task even though
+	 * it hasn't been dispatched yet. Dequeue from the BPF side.
+	 */
+	SCX_DEQ_CORE_SCHED_EXEC	= 1LLU << 32,
+};
+
+enum scx_pick_idle_cpu_flags {
+	SCX_PICK_IDLE_CORE	= 1LLU << 0,	/* pick a CPU whose SMT siblings are also idle */
+};
+
+enum scx_kick_flags {
+	/*
+	 * Kick the target CPU if idle. Guarantees that the target CPU goes
+	 * through at least one full scheduling cycle before going idle. If the
+	 * target CPU can be determined to be currently not idle and going to go
+	 * through a scheduling cycle before going idle, noop.
+	 */
+	SCX_KICK_IDLE		= 1LLU << 0,
+
+	/*
+	 * Preempt the current task and execute the dispatch path. If the
+	 * current task of the target CPU is an SCX task, its ->scx.slice is
+	 * cleared to zero before the scheduling path is invoked so that the
+	 * task expires and the dispatch path is invoked.
+	 */
+	SCX_KICK_PREEMPT	= 1LLU << 1,
+
+	/*
+	 * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
+	 * return after the target CPU finishes picking the next task.
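+	 *
+	 * For example (illustrative), scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT |
+	 * SCX_KICK_WAIT) returns only after the target CPU has gone through
+	 * the scheduling path and picked its next task.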
+ */ + SCX_KICK_WAIT = 1LLU << 2, +}; + +enum scx_tg_flags { + SCX_TG_ONLINE = 1U << 0, + SCX_TG_INITED = 1U << 1, +}; + +enum scx_ops_enable_state { + SCX_OPS_PREPPING, + SCX_OPS_ENABLING, + SCX_OPS_ENABLED, + SCX_OPS_DISABLING, + SCX_OPS_DISABLED, +}; + +static const char *scx_ops_enable_state_str[] = { + [SCX_OPS_PREPPING] = "prepping", + [SCX_OPS_ENABLING] = "enabling", + [SCX_OPS_ENABLED] = "enabled", + [SCX_OPS_DISABLING] = "disabling", + [SCX_OPS_DISABLED] = "disabled", +}; + +/* + * sched_ext_entity->ops_state + * + * Used to track the task ownership between the SCX core and the BPF scheduler. + * State transitions look as follows: + * + * NONE -> QUEUEING -> QUEUED -> DISPATCHING + * ^ | | + * | v v + * \-------------------------------/ + * + * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call + * sites for explanations on the conditions being waited upon and why they are + * safe. Transitions out of them into NONE or QUEUED must store_release and the + * waiters should load_acquire. + * + * Tracking scx_ops_state enables sched_ext core to reliably determine whether + * any given task can be dispatched by the BPF scheduler at all times and thus + * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler + * to try to dispatch any task anytime regardless of its state as the SCX core + * can safely reject invalid dispatches. + */ +enum scx_ops_state { + SCX_OPSS_NONE, /* owned by the SCX core */ + SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ + SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ + SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ + + /* + * QSEQ brands each QUEUED instance so that, when dispatch races + * dequeue/requeue, the dispatcher can tell whether it still has a claim + * on the task being dispatched. + * + * As some 32bit archs can't do 64bit store_release/load_acquire, + * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on + * 32bit machines. The dispatch race window QSEQ protects is very narrow + * and runs with IRQ disabled. 30 bits should be sufficient. + */ + SCX_OPSS_QSEQ_SHIFT = 2, +}; + +/* Use macros to ensure that the type is unsigned long for the masks */ +#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) +#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) + +/* + * During exit, a task may schedule after losing its PIDs. When disabling the + * BPF scheduler, we need to be able to iterate tasks in every state to + * guarantee system safety. Maintain a dedicated task list which contains every + * task between its fork and eventual free. + */ +static DEFINE_SPINLOCK(scx_tasks_lock); +static LIST_HEAD(scx_tasks); + +/* ops enable/disable */ +static struct kthread_worker *scx_ops_helper; +static DEFINE_MUTEX(scx_ops_enable_mutex); +DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); +DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); +static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); +static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); +static bool scx_switching_all; +DEFINE_STATIC_KEY_FALSE(__scx_switched_all); + +static struct sched_ext_ops scx_ops; +static bool scx_warned_zero_slice; + +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); +DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); + +struct static_key_false scx_has_op[SCX_OPI_END] = + { [0 ... 
SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; + +static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); +static struct scx_exit_info *scx_exit_info; + +static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); +static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); + +/* + * The maximum amount of time in jiffies that a task may be runnable without + * being scheduled on a CPU. If this timeout is exceeded, it will trigger + * scx_ops_error(). + */ +static unsigned long scx_watchdog_timeout; + +/* + * The last time the delayed work was run. This delayed work relies on + * ksoftirqd being able to run to service timer interrupts, so it's possible + * that this work itself could get wedged. To account for this, we check that + * it's not stalled in the timer tick, and trigger an error if it is. + */ +static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; + +static struct delayed_work scx_watchdog_work; + +/* idle tracking */ +#ifdef CONFIG_SMP +#ifdef CONFIG_CPUMASK_OFFSTACK +#define CL_ALIGNED_IF_ONSTACK +#else +#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp +#endif + +static struct { + cpumask_var_t cpu; + cpumask_var_t smt; +} idle_masks CL_ALIGNED_IF_ONSTACK; + +#endif /* CONFIG_SMP */ + +/* for %SCX_KICK_WAIT */ +static unsigned long __percpu *scx_kick_cpus_pnt_seqs; + +/* + * Direct dispatch marker. + * + * Non-NULL values are used for direct dispatch from enqueue path. A valid + * pointer points to the task currently being enqueued. An ERR_PTR value is used + * to indicate that direct dispatch has already happened. + */ +static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); + +/* dispatch queues */ +static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global; + +static const struct rhashtable_params dsq_hash_params = { + .key_len = 8, + .key_offset = offsetof(struct scx_dispatch_q, id), + .head_offset = offsetof(struct scx_dispatch_q, hash_node), +}; + +static struct rhashtable dsq_hash; +static LLIST_HEAD(dsqs_to_free); + +/* dispatch buf */ +struct scx_dsp_buf_ent { + struct task_struct *task; + unsigned long qseq; + u64 dsq_id; + u64 enq_flags; +}; + +static u32 scx_dsp_max_batch; +static struct scx_dsp_buf_ent __percpu *scx_dsp_buf; + +struct scx_dsp_ctx { + struct rq *rq; + struct rq_flags *rf; + u32 buf_cursor; + u32 nr_tasks; +}; + +static DEFINE_PER_CPU(struct scx_dsp_ctx, scx_dsp_ctx); + +/* /sys/kernel/sched_ext interface */ +static struct kset *scx_kset; +static struct kobject *scx_root_kobj; + +static void scx_bpf_kick_cpu(s32 cpu, u64 flags); +static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, + s64 exit_code, + const char *fmt, ...); + +#define scx_ops_error_kind(__err, fmt, args...) \ + scx_ops_exit_kind(__err, 0, fmt, ##args) + +#define scx_ops_exit(__code, fmt, args...) \ + scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, __code, fmt, ##args) + +#define scx_ops_error(fmt, args...) 
\ + scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args) + +struct scx_task_iter { + struct sched_ext_entity cursor; + struct task_struct *locked; + struct rq *rq; + struct rq_flags rf; +}; + +#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)]) + +static long jiffies_delta_msecs(unsigned long at, unsigned long now) +{ + if (time_after(at, now)) + return jiffies_to_msecs(at - now); + else + return -(long)jiffies_to_msecs(now - at); +} + +/* if the highest set bit is N, return a mask with bits [N+1, 31] set */ +static u32 higher_bits(u32 flags) +{ + return ~((1 << fls(flags)) - 1); +} + +/* return the mask with only the highest bit set */ +static u32 highest_bit(u32 flags) +{ + int bit = fls(flags); + return bit ? 1 << (bit - 1) : 0; +} + +/* + * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX + * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate + * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check + * whether it's running from an allowed context. + * + * @mask is constant, always inline to cull the mask calculations. + */ +static __always_inline void scx_kf_allow(u32 mask) +{ + /* nesting is allowed only in increasing scx_kf_mask order */ + WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask, + "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n", + current->scx.kf_mask, mask); + current->scx.kf_mask |= mask; +} + +static void scx_kf_disallow(u32 mask) +{ + current->scx.kf_mask &= ~mask; +} + +#define SCX_CALL_OP(mask, op, args...) \ +do { \ + if (mask) { \ + scx_kf_allow(mask); \ + scx_ops.op(args); \ + scx_kf_disallow(mask); \ + } else { \ + scx_ops.op(args); \ + } \ +} while (0) + +#define SCX_CALL_OP_RET(mask, op, args...) \ +({ \ + __typeof__(scx_ops.op(args)) __ret; \ + if (mask) { \ + scx_kf_allow(mask); \ + __ret = scx_ops.op(args); \ + scx_kf_disallow(mask); \ + } else { \ + __ret = scx_ops.op(args); \ + } \ + __ret; \ +}) + +/* + * Some kfuncs are allowed only on the tasks that are subjects of the + * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such + * restrictions, the following SCX_CALL_OP_*() variants should be used when + * invoking scx_ops operations that take task arguments. These can only be used + * for non-nesting operations due to the way the tasks are tracked. + * + * kfuncs which can only operate on such tasks can in turn use + * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on + * the specific task. + */ +#define SCX_CALL_OP_TASK(mask, op, task, args...) \ +do { \ + BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + current->scx.kf_tasks[0] = task; \ + SCX_CALL_OP(mask, op, task, ##args); \ + current->scx.kf_tasks[0] = NULL; \ +} while (0) + +#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \ +({ \ + __typeof__(scx_ops.op(task, ##args)) __ret; \ + BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + current->scx.kf_tasks[0] = task; \ + __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \ + current->scx.kf_tasks[0] = NULL; \ + __ret; \ +}) + +#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) 
\ +({ \ + __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \ + BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + current->scx.kf_tasks[0] = task0; \ + current->scx.kf_tasks[1] = task1; \ + __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \ + current->scx.kf_tasks[0] = NULL; \ + current->scx.kf_tasks[1] = NULL; \ + __ret; \ +}) + +/* @mask is constant, always inline to cull unnecessary branches */ +static __always_inline bool scx_kf_allowed(u32 mask) +{ + if (unlikely(!(current->scx.kf_mask & mask))) { + scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", + mask, current->scx.kf_mask); + return false; + } + + if (unlikely((mask & SCX_KF_SLEEPABLE) && in_interrupt())) { + scx_ops_error("sleepable kfunc called from non-sleepable context"); + return false; + } + + /* + * Enforce nesting boundaries. e.g. A kfunc which can be called from + * DISPATCH must not be called if we're running DEQUEUE which is nested + * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE + * boundary thanks to the above in_interrupt() check. + */ + if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && + (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { + scx_ops_error("cpu_release kfunc called from a nested operation"); + return false; + } + + if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && + (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { + scx_ops_error("dispatch kfunc called from a nested operation"); + return false; + } + + return true; +} + +/* see SCX_CALL_OP_TASK() */ +static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, + struct task_struct *p) +{ + if (!scx_kf_allowed(__SCX_KF_RQ_LOCKED)) + return false; + + if (unlikely((p != current->scx.kf_tasks[0] && + p != current->scx.kf_tasks[1]))) { + scx_ops_error("called on a task not being operated on"); + return false; + } + + return true; +} + +/** + * scx_task_iter_init - Initialize a task iterator + * @iter: iterator to init + * + * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, + * @iter must eventually be exited with scx_task_iter_exit(). + * + * scx_tasks_lock may be released between this and the first next() call or + * between any two next() calls. If scx_tasks_lock is released between two + * next() calls, the caller is responsible for ensuring that the task being + * iterated remains accessible either through RCU read lock or obtaining a + * reference count. + * + * All tasks which existed when the iteration started are guaranteed to be + * visited as long as they still exist. + */ +static void scx_task_iter_init(struct scx_task_iter *iter) +{ + lockdep_assert_held(&scx_tasks_lock); + + iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; + list_add(&iter->cursor.tasks_node, &scx_tasks); + iter->locked = NULL; +} + +/** + * scx_task_iter_exit - Exit a task iterator + * @iter: iterator to exit + * + * Exit a previously initialized @iter. Must be called with scx_tasks_lock held. + * If the iterator holds a task's rq lock, that rq lock is released. See + * scx_task_iter_init() for details. + */ +static void scx_task_iter_exit(struct scx_task_iter *iter) +{ + struct list_head *cursor = &iter->cursor.tasks_node; + + lockdep_assert_held(&scx_tasks_lock); + + if (iter->locked) { + task_rq_unlock(iter->rq, iter->locked, &iter->rf); + iter->locked = NULL; + } + + if (list_empty(cursor)) + return; + + list_del_init(cursor); +} + +/** + * scx_task_iter_next - Next task + * @iter: iterator to walk + * + * Visit the next task. 
See scx_task_iter_init() for details. + */ +static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) +{ + struct list_head *cursor = &iter->cursor.tasks_node; + struct sched_ext_entity *pos; + + lockdep_assert_held(&scx_tasks_lock); + + list_for_each_entry(pos, cursor, tasks_node) { + if (&pos->tasks_node == &scx_tasks) + return NULL; + if (!(pos->flags & SCX_TASK_CURSOR)) { + list_move(cursor, &pos->tasks_node); + return container_of(pos, struct task_struct, scx); + } + } + + /* can't happen, should always terminate at scx_tasks above */ + BUG(); +} + +/** + * scx_task_iter_next_filtered - Next non-idle task + * @iter: iterator to walk + * + * Visit the next non-idle task. See scx_task_iter_init() for details. + */ +static struct task_struct * +scx_task_iter_next_filtered(struct scx_task_iter *iter) +{ + struct task_struct *p; + + while ((p = scx_task_iter_next(iter))) { + /* + * is_idle_task() tests %PF_IDLE which may not be set for CPUs + * which haven't yet been onlined. Test sched_class directly. + */ + if (p->sched_class != &idle_sched_class) + return p; + } + return NULL; +} + +/** + * scx_task_iter_next_filtered_locked - Next non-idle task with its rq locked + * @iter: iterator to walk + * + * Visit the next non-idle task with its rq lock held. See scx_task_iter_init() + * for details. + */ +static struct task_struct * +scx_task_iter_next_filtered_locked(struct scx_task_iter *iter) +{ + struct task_struct *p; + + if (iter->locked) { + task_rq_unlock(iter->rq, iter->locked, &iter->rf); + iter->locked = NULL; + } + + p = scx_task_iter_next_filtered(iter); + if (!p) + return NULL; + + iter->rq = task_rq_lock(p, &iter->rf); + iter->locked = p; + return p; +} + +static enum scx_ops_enable_state scx_ops_enable_state(void) +{ + return atomic_read(&scx_ops_enable_state_var); +} + +static enum scx_ops_enable_state +scx_ops_set_enable_state(enum scx_ops_enable_state to) +{ + return atomic_xchg(&scx_ops_enable_state_var, to); +} + +static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to, + enum scx_ops_enable_state from) +{ + int from_v = from; + + return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to); +} + +static bool scx_ops_bypassing(void) +{ + return unlikely(atomic_read(&scx_ops_bypass_depth)); +} + +/** + * wait_ops_state - Busy-wait the specified ops state to end + * @p: target task + * @opss: state to wait the end of + * + * Busy-wait for @p to transition out of @opss. This can only be used when the + * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also + * has load_acquire semantics to ensure that the caller can see the updates made + * in the enqueueing and dispatching paths. + */ +static void wait_ops_state(struct task_struct *p, unsigned long opss) +{ + do { + cpu_relax(); + } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); +} + +/** + * ops_cpu_valid - Verify a cpu number + * @cpu: cpu number which came from a BPF ops + * @where: extra information reported on error + * + * @cpu is a cpu number which came from the BPF scheduler and can be any value. + * Verify that it is in range and one of the possible cpus. If invalid, trigger + * an ops error. + */ +static bool ops_cpu_valid(s32 cpu, const char *where) +{ + if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) { + return true; + } else { + scx_ops_error("invalid CPU %d%s%s", cpu, + where ? 
" " : "", where ?: ""); + return false; + } +} + +/** + * ops_sanitize_err - Sanitize a -errno value + * @ops_name: operation to blame on failure + * @err: -errno value to sanitize + * + * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return + * -%EPROTO. This is necessary because returning a rogue -errno up the chain can + * cause misbehaviors. For an example, a large negative return from + * ops.init_task() triggers an oops when passed up the call chain because the + * value fails IS_ERR() test after being encoded with ERR_PTR() and then is + * handled as a pointer. + */ +static int ops_sanitize_err(const char *ops_name, s32 err) +{ + if (err < 0 && err >= -MAX_ERRNO) + return err; + + scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err); + return -EPROTO; +} + +/** + * touch_core_sched - Update timestamp used for core-sched task ordering + * @rq: rq to read clock from, must be locked + * @p: task to update the timestamp for + * + * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to + * implement global or local-DSQ FIFO ordering for core-sched. Should be called + * when a task becomes runnable and its turn on the CPU ends (e.g. slice + * exhaustion). + */ +static void touch_core_sched(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SCHED_CORE + /* + * It's okay to update the timestamp spuriously. Use + * sched_core_disabled() which is cheaper than enabled(). + */ + if (!sched_core_disabled()) + p->scx.core_sched_at = rq_clock_task(rq); +#endif +} + +/** + * touch_core_sched_dispatch - Update core-sched timestamp on dispatch + * @rq: rq to read clock from, must be locked + * @p: task being dispatched + * + * If the BPF scheduler implements custom core-sched ordering via + * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO + * ordering within each local DSQ. This function is called from dispatch paths + * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect. 
+ */ +static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + assert_clock_updated(rq); + +#ifdef CONFIG_SCHED_CORE + if (SCX_HAS_OP(core_sched_before)) + touch_core_sched(rq, p); +#endif +} + +static void update_curr_scx(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + u64 now = rq_clock_task(rq); + u64 delta_exec; + + if (time_before_eq64(now, curr->se.exec_start)) + return; + + delta_exec = now - curr->se.exec_start; + curr->se.exec_start = now; + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); + + if (curr->scx.slice != SCX_SLICE_INF) { + curr->scx.slice -= min(curr->scx.slice, delta_exec); + if (!curr->scx.slice) + touch_core_sched(rq, curr); + } +} + +static bool scx_dsq_priq_less(struct rb_node *node_a, + const struct rb_node *node_b) +{ + const struct task_struct *a = + container_of(node_a, struct task_struct, scx.dsq_node.priq); + const struct task_struct *b = + container_of(node_b, struct task_struct, scx.dsq_node.priq); + + return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); +} + +static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, + u64 enq_flags) +{ + bool is_local = dsq->id == SCX_DSQ_LOCAL; + + WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node.fifo)); + WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) || + !RB_EMPTY_NODE(&p->scx.dsq_node.priq)); + + if (!is_local) { + raw_spin_lock(&dsq->lock); + if (unlikely(dsq->id == SCX_DSQ_INVALID)) { + scx_ops_error("attempting to dispatch to a destroyed dsq"); + /* fall back to the global dsq */ + raw_spin_unlock(&dsq->lock); + dsq = &scx_dsq_global; + raw_spin_lock(&dsq->lock); + } + } + + if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) && + (enq_flags & SCX_ENQ_DSQ_PRIQ))) { + /* + * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from + * their FIFO queues. To avoid confusion and accidentally + * starving vtime-dispatched tasks by FIFO-dispatched tasks, we + * disallow any internal DSQ from doing vtime ordering of + * tasks. + */ + scx_ops_error("cannot use vtime ordering for built-in DSQs"); + enq_flags &= ~SCX_ENQ_DSQ_PRIQ; + } + + if (enq_flags & SCX_ENQ_DSQ_PRIQ) { + p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; + rb_add_cached(&p->scx.dsq_node.priq, &dsq->priq, + scx_dsq_priq_less); + /* A DSQ should only be using either FIFO or PRIQ enqueuing. */ + if (unlikely(!list_empty(&dsq->fifo))) + scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", + dsq->id); + } else { + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) + list_add(&p->scx.dsq_node.fifo, &dsq->fifo); + else + list_add_tail(&p->scx.dsq_node.fifo, &dsq->fifo); + /* A DSQ should only be using either FIFO or PRIQ enqueuing. */ + if (unlikely(rb_first_cached(&dsq->priq))) + scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks", + dsq->id); + } + dsq->nr++; + p->scx.dsq = dsq; + + /* + * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the + * direct dispatch path, but we clear them here because the direct + * dispatch verdict may be overridden on the enqueue path during e.g. + * bypass. + */ + p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; + p->scx.ddsp_enq_flags = 0; + + /* + * We're transitioning out of QUEUEING or DISPATCHING. store_release to + * match waiters' load_acquire. 
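+	 * The atomic_long_set_release() below pairs with the
+	 * atomic_long_read_acquire() loop in wait_ops_state().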
+ */ + if (enq_flags & SCX_ENQ_CLEAR_OPSS) + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); + + if (is_local) { + struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + bool preempt = false; + + if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && + rq->curr->sched_class == &ext_sched_class) { + rq->curr->scx.slice = 0; + preempt = true; + } + + if (preempt || sched_class_above(&ext_sched_class, + rq->curr->sched_class)) + resched_curr(rq); + } else { + raw_spin_unlock(&dsq->lock); + } +} + +static void task_unlink_from_dsq(struct task_struct *p, + struct scx_dispatch_q *dsq) +{ + if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) { + rb_erase_cached(&p->scx.dsq_node.priq, &dsq->priq); + RB_CLEAR_NODE(&p->scx.dsq_node.priq); + p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; + } else { + list_del_init(&p->scx.dsq_node.fifo); + } +} + +static bool task_linked_on_dsq(struct task_struct *p) +{ + return !list_empty(&p->scx.dsq_node.fifo) || + !RB_EMPTY_NODE(&p->scx.dsq_node.priq); +} + +static void dispatch_dequeue(struct scx_rq *scx_rq, struct task_struct *p) +{ + struct scx_dispatch_q *dsq = p->scx.dsq; + bool is_local = dsq == &scx_rq->local_dsq; + + if (!dsq) { + WARN_ON_ONCE(task_linked_on_dsq(p)); + /* + * When dispatching directly from the BPF scheduler to a local + * DSQ, the task isn't associated with any DSQ but + * @p->scx.holding_cpu may be set under the protection of + * %SCX_OPSS_DISPATCHING. + */ + if (p->scx.holding_cpu >= 0) + p->scx.holding_cpu = -1; + return; + } + + if (!is_local) + raw_spin_lock(&dsq->lock); + + /* + * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_node + * can't change underneath us. + */ + if (p->scx.holding_cpu < 0) { + /* @p must still be on @dsq, dequeue */ + WARN_ON_ONCE(!task_linked_on_dsq(p)); + task_unlink_from_dsq(p, dsq); + dsq->nr--; + } else { + /* + * We're racing against dispatch_to_local_dsq() which already + * removed @p from @dsq and set @p->scx.holding_cpu. Clear the + * holding_cpu which tells dispatch_to_local_dsq() that it lost + * the race. + */ + WARN_ON_ONCE(task_linked_on_dsq(p)); + p->scx.holding_cpu = -1; + } + p->scx.dsq = NULL; + + if (!is_local) + raw_spin_unlock(&dsq->lock); +} + +static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) +{ + lockdep_assert(rcu_read_lock_any_held()); + + if (dsq_id == SCX_DSQ_GLOBAL) + return &scx_dsq_global; + else + return rhashtable_lookup_fast(&dsq_hash, &dsq_id, + dsq_hash_params); +} + +static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, + struct task_struct *p) +{ + struct scx_dispatch_q *dsq; + + if (dsq_id == SCX_DSQ_LOCAL) + return &rq->scx.local_dsq; + + dsq = find_non_local_dsq(dsq_id); + if (unlikely(!dsq)) { + scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", + dsq_id, p->comm, p->pid); + return &scx_dsq_global; + } + + return dsq; +} + +static void mark_direct_dispatch(struct task_struct *ddsp_task, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + /* + * Mark that dispatch already happened from ops.select_cpu() or + * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value + * which can never match a valid task pointer. 
+ */ + __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); + + /* @p must match the task on the enqueue path */ + if (unlikely(p != ddsp_task)) { + if (IS_ERR(ddsp_task)) + scx_ops_error("%s[%d] already direct-dispatched", + p->comm, p->pid); + else + scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", + ddsp_task->comm, ddsp_task->pid, + p->comm, p->pid); + return; + } + + /* + * %SCX_DSQ_LOCAL_ON is not supported during direct dispatch because + * dispatching to the local DSQ of a different CPU requires unlocking + * the current rq which isn't allowed in the enqueue path. Use + * ops.select_cpu() to be on the target CPU and then %SCX_DSQ_LOCAL. + */ + if (unlikely((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON)) { + scx_ops_error("SCX_DSQ_LOCAL_ON can't be used for direct-dispatch"); + return; + } + + WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID); + WARN_ON_ONCE(p->scx.ddsp_enq_flags); + + p->scx.ddsp_dsq_id = dsq_id; + p->scx.ddsp_enq_flags = enq_flags; +} + +static void direct_dispatch(struct task_struct *p, u64 enq_flags) +{ + struct scx_dispatch_q *dsq; + + touch_core_sched_dispatch(task_rq(p), p); + + enq_flags |= (p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); + dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsp_dsq_id, p); + dispatch_enqueue(dsq, p, enq_flags); +} + +static bool test_rq_online(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq->online; +#else + return true; +#endif +} + +static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, + int sticky_cpu) +{ + struct task_struct **ddsp_taskp; + unsigned long qseq; + + WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); + + /* rq migration */ + if (sticky_cpu == cpu_of(rq)) + goto local_norefill; + + /* + * If !rq->online, we already told the BPF scheduler that the CPU is + * offline. We're just trying to on/offline the CPU. Don't bother the + * BPF scheduler. + */ + if (unlikely(!test_rq_online(rq))) + goto local; + + if (scx_ops_bypassing()) { + if (enq_flags & SCX_ENQ_LAST) + goto local; + else + goto global; + } + + if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) + goto direct; + + /* see %SCX_OPS_ENQ_EXITING */ + if (!static_branch_unlikely(&scx_ops_enq_exiting) && + unlikely(p->flags & PF_EXITING)) + goto local; + + /* see %SCX_OPS_ENQ_LAST */ + if (!static_branch_unlikely(&scx_ops_enq_last) && + (enq_flags & SCX_ENQ_LAST)) + goto local; + + if (!SCX_HAS_OP(enqueue)) + goto global; + + /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ + qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; + + WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); + atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); + + ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); + WARN_ON_ONCE(*ddsp_taskp); + *ddsp_taskp = p; + + SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags); + + *ddsp_taskp = NULL; + if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) + goto direct; + + /* + * If not directly dispatched, QUEUEING isn't clear yet and dispatch or + * dequeue may be waiting. The store_release matches their load_acquire. + */ + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); + return; + +direct: + direct_dispatch(p, enq_flags); + return; + +local: + /* + * For task-ordering, slice refill must be treated as implying the end + * of the current slice. Otherwise, the longer @p stays on the CPU, the + * higher priority it becomes from scx_prio_less()'s POV. 
+	 */
+	touch_core_sched(rq, p);
+	p->scx.slice = SCX_SLICE_DFL;
+local_norefill:
+	dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
+	return;
+
+global:
+	touch_core_sched(rq, p);	/* see the comment in local: */
+	p->scx.slice = SCX_SLICE_DFL;
+	dispatch_enqueue(&scx_dsq_global, p, enq_flags);
+}
+
+static bool task_runnable(const struct task_struct *p)
+{
+	return !list_empty(&p->scx.runnable_node);
+}
+
+static void set_task_runnable(struct rq *rq, struct task_struct *p)
+{
+	lockdep_assert_rq_held(rq);
+
+	if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) {
+		p->scx.runnable_at = jiffies;
+		p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT;
+	}
+
+	/*
+	 * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
+	 * appended to the runnable_list.
+	 */
+	list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
+}
+
+static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
+{
+	list_del_init(&p->scx.runnable_node);
+	if (reset_runnable_at)
+		p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
+}
+
+static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
+{
+	int sticky_cpu = p->scx.sticky_cpu;
+
+	enq_flags |= rq->scx.extra_enq_flags;
+
+	if (sticky_cpu >= 0)
+		p->scx.sticky_cpu = -1;
+
+	/*
+	 * Restoring a running task will be immediately followed by
+	 * set_next_task_scx() which expects the task to not be on the BPF
+	 * scheduler as tasks can only start running through local DSQs. Force
+	 * direct-dispatch into the local DSQ by setting the sticky_cpu.
+	 */
+	if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p))
+		sticky_cpu = cpu_of(rq);
+
+	if (p->scx.flags & SCX_TASK_QUEUED) {
+		WARN_ON_ONCE(!task_runnable(p));
+		return;
+	}
+
+	set_task_runnable(rq, p);
+	p->scx.flags |= SCX_TASK_QUEUED;
+	rq->scx.nr_running++;
+	add_nr_running(rq, 1);
+
+	if (SCX_HAS_OP(runnable))
+		SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags);
+
+	if (enq_flags & SCX_ENQ_WAKEUP)
+		touch_core_sched(rq, p);
+
+	do_enqueue_task(rq, p, enq_flags, sticky_cpu);
+}
+
+static void ops_dequeue(struct task_struct *p, u64 deq_flags)
+{
+	unsigned long opss;
+
+	/* dequeue is always temporary, don't reset runnable_at */
+	clr_task_runnable(p, false);
+
+	/* acquire ensures that we see the preceding updates on QUEUED */
+	opss = atomic_long_read_acquire(&p->scx.ops_state);
+
+	switch (opss & SCX_OPSS_STATE_MASK) {
+	case SCX_OPSS_NONE:
+		break;
+	case SCX_OPSS_QUEUEING:
+		/*
+		 * QUEUEING is started and finished while holding @p's rq lock.
+		 * As we're holding the rq lock now, we shouldn't see QUEUEING.
+		 */
+		BUG();
+	case SCX_OPSS_QUEUED:
+		if (SCX_HAS_OP(dequeue))
+			SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags);
+
+		if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
+					    SCX_OPSS_NONE))
+			break;
+		fallthrough;
+	case SCX_OPSS_DISPATCHING:
+		/*
+		 * If @p is being dispatched from the BPF scheduler to a DSQ,
+		 * wait for the transfer to complete so that @p doesn't get
+		 * added to its DSQ after dequeueing is complete.
+		 *
+		 * As we're waiting on DISPATCHING with the rq locked, the
+		 * dispatching side shouldn't try to lock the rq while
+		 * DISPATCHING is set. See dispatch_to_local_dsq().
+		 *
+		 * DISPATCHING shouldn't have qseq set and control can reach
+		 * here with NONE @opss from the above QUEUED case block.
+		 * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss.
+ */ + wait_ops_state(p, SCX_OPSS_DISPATCHING); + BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); + break; + } +} + +static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) +{ + struct scx_rq *scx_rq = &rq->scx; + + if (!(p->scx.flags & SCX_TASK_QUEUED)) { + WARN_ON_ONCE(task_runnable(p)); + return; + } + + ops_dequeue(p, deq_flags); + + /* + * A currently running task which is going off @rq first gets dequeued + * and then stops running. As we want running <-> stopping transitions + * to be contained within runnable <-> quiescent transitions, trigger + * ->stopping() early here instead of in put_prev_task_scx(). + * + * @p may go through multiple stopping <-> running transitions between + * here and put_prev_task_scx() if task attribute changes occur while + * balance_scx() leaves @rq unlocked. However, they don't contain any + * information meaningful to the BPF scheduler and can be suppressed by + * skipping the callbacks if the task is !QUEUED. + */ + if (SCX_HAS_OP(stopping) && task_current(rq, p)) { + update_curr_scx(rq); + SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false); + } + + if (SCX_HAS_OP(quiescent)) + SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags); + + if (deq_flags & SCX_DEQ_SLEEP) + p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; + else + p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP; + + p->scx.flags &= ~SCX_TASK_QUEUED; + scx_rq->nr_running--; + sub_nr_running(rq, 1); + + dispatch_dequeue(scx_rq, p); +} + +static void yield_task_scx(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + if (SCX_HAS_OP(yield)) + SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL); + else + p->scx.slice = 0; +} + +static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) +{ + struct task_struct *from = rq->curr; + + if (SCX_HAS_OP(yield)) + return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to); + else + return false; +} + +#ifdef CONFIG_SMP +/** + * move_task_to_local_dsq - Move a task from a different rq to a local DSQ + * @rq: rq to move the task into, currently locked + * @p: task to move + * @enq_flags: %SCX_ENQ_* + * + * Move @p which is currently on a different rq to @rq's local DSQ. The caller + * must: + * + * 1. Start with exclusive access to @p either through its DSQ lock or + * %SCX_OPSS_DISPATCHING flag. + * + * 2. Set @p->scx.holding_cpu to raw_smp_processor_id(). + * + * 3. Remember task_rq(@p). Release the exclusive access so that we don't + * deadlock with dequeue. + * + * 4. Lock @rq and the task_rq from #3. + * + * 5. Call this function. + * + * Returns %true if @p was successfully moved. %false after racing dequeue and + * losing. + */ +static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p, + u64 enq_flags) +{ + struct rq *task_rq; + + lockdep_assert_rq_held(rq); + + /* + * If dequeue got to @p while we were trying to lock both rq's, it'd + * have cleared @p->scx.holding_cpu to -1. While other cpus may have + * updated it to different values afterwards, as this operation can't be + * preempted or recurse, @p->scx.holding_cpu can never become + * raw_smp_processor_id() again before we're done. Thus, we can tell + * whether we lost to dequeue by testing whether @p->scx.holding_cpu is + * still raw_smp_processor_id(). + * + * See dispatch_dequeue() for the counterpart. 
+ */
+ if (unlikely(p->scx.holding_cpu != raw_smp_processor_id()))
+ return false;
+
+ /* @p->rq couldn't have changed if we're still the holding cpu */
+ task_rq = task_rq(p);
+ lockdep_assert_rq_held(task_rq);
+
+ WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr));
+ deactivate_task(task_rq, p, 0);
+ set_task_cpu(p, cpu_of(rq));
+ p->scx.sticky_cpu = cpu_of(rq);
+
+ /*
+ * We want to pass scx-specific enq_flags but activate_task() will
+ * truncate the upper 32 bits. As we own @rq, we can pass them through
+ * @rq->scx.extra_enq_flags instead.
+ */
+ WARN_ON_ONCE(rq->scx.extra_enq_flags);
+ rq->scx.extra_enq_flags = enq_flags;
+ activate_task(rq, p, 0);
+ rq->scx.extra_enq_flags = 0;
+
+ return true;
+}
+
+/**
+ * dispatch_to_local_dsq_lock - Ensure source and destination rq's are locked
+ * @rq: current rq which is locked
+ * @rf: rq_flags to use when unlocking @rq
+ * @src_rq: rq to move task from
+ * @dst_rq: rq to move task to
+ *
+ * We're holding @rq lock and trying to dispatch a task from @src_rq to
+ * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq. Whether
+ * @rq stays locked isn't important as long as the state is restored after
+ * dispatch_to_local_dsq_unlock().
+ */
+static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf,
+ struct rq *src_rq, struct rq *dst_rq)
+{
+ rq_unpin_lock(rq, rf);
+
+ if (src_rq == dst_rq) {
+ raw_spin_rq_unlock(rq);
+ raw_spin_rq_lock(dst_rq);
+ } else if (rq == src_rq) {
+ double_lock_balance(rq, dst_rq);
+ rq_repin_lock(rq, rf);
+ } else if (rq == dst_rq) {
+ double_lock_balance(rq, src_rq);
+ rq_repin_lock(rq, rf);
+ } else {
+ raw_spin_rq_unlock(rq);
+ double_rq_lock(src_rq, dst_rq);
+ }
+}
+
+/**
+ * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock()
+ * @rq: current rq which is locked
+ * @rf: rq_flags to use when unlocking @rq
+ * @src_rq: rq to move task from
+ * @dst_rq: rq to move task to
+ *
+ * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return.
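+ *
+ * The four cases below mirror dispatch_to_local_dsq_lock() one to one:
+ *
+ * src == dst: unlock @dst_rq, then retake and repin @rq
+ * rq == src: double_unlock_balance(@rq, @dst_rq)
+ * rq == dst: double_unlock_balance(@rq, @src_rq)
+ * otherwise: double_rq_unlock(@src_rq, @dst_rq), then retake and repin @rq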
+ */ +static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf, + struct rq *src_rq, struct rq *dst_rq) +{ + if (src_rq == dst_rq) { + raw_spin_rq_unlock(dst_rq); + raw_spin_rq_lock(rq); + rq_repin_lock(rq, rf); + } else if (rq == src_rq) { + double_unlock_balance(rq, dst_rq); + } else if (rq == dst_rq) { + double_unlock_balance(rq, src_rq); + } else { + double_rq_unlock(src_rq, dst_rq); + raw_spin_rq_lock(rq); + rq_repin_lock(rq, rf); + } +} +#endif /* CONFIG_SMP */ + + +static bool task_can_run_on_rq(struct task_struct *p, struct rq *rq) +{ + return likely(test_rq_online(rq)) && !is_migration_disabled(p) && + cpumask_test_cpu(cpu_of(rq), p->cpus_ptr); +} + +static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, + struct scx_dispatch_q *dsq) +{ + struct scx_rq *scx_rq = &rq->scx; + struct task_struct *p; + struct rb_node *rb_node; + struct rq *task_rq; + bool moved = false; +retry: + if (list_empty(&dsq->fifo) && !rb_first_cached(&dsq->priq)) + return false; + + raw_spin_lock(&dsq->lock); + + list_for_each_entry(p, &dsq->fifo, scx.dsq_node.fifo) { + task_rq = task_rq(p); + if (rq == task_rq) + goto this_rq; + if (task_can_run_on_rq(p, rq)) + goto remote_rq; + } + + for (rb_node = rb_first_cached(&dsq->priq); rb_node; + rb_node = rb_next(rb_node)) { + p = container_of(rb_node, struct task_struct, scx.dsq_node.priq); + task_rq = task_rq(p); + if (rq == task_rq) + goto this_rq; + if (task_can_run_on_rq(p, rq)) + goto remote_rq; + } + + raw_spin_unlock(&dsq->lock); + return false; + +this_rq: + /* @dsq is locked and @p is on this rq */ + WARN_ON_ONCE(p->scx.holding_cpu >= 0); + task_unlink_from_dsq(p, dsq); + list_add_tail(&p->scx.dsq_node.fifo, &scx_rq->local_dsq.fifo); + dsq->nr--; + scx_rq->local_dsq.nr++; + p->scx.dsq = &scx_rq->local_dsq; + raw_spin_unlock(&dsq->lock); + return true; + +remote_rq: +#ifdef CONFIG_SMP + /* + * @dsq is locked and @p is on a remote rq. @p is currently protected by + * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab + * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the + * rq lock or fail, do a little dancing from our side. See + * move_task_to_local_dsq(). + */ + WARN_ON_ONCE(p->scx.holding_cpu >= 0); + task_unlink_from_dsq(p, dsq); + dsq->nr--; + p->scx.holding_cpu = raw_smp_processor_id(); + raw_spin_unlock(&dsq->lock); + + rq_unpin_lock(rq, rf); + double_lock_balance(rq, task_rq); + rq_repin_lock(rq, rf); + + moved = move_task_to_local_dsq(rq, p, 0); + + double_unlock_balance(rq, task_rq); +#endif /* CONFIG_SMP */ + if (likely(moved)) + return true; + goto retry; +} + +enum dispatch_to_local_dsq_ret { + DTL_DISPATCHED, /* successfully dispatched */ + DTL_LOST, /* lost race to dequeue */ + DTL_NOT_LOCAL, /* destination is not a local DSQ */ + DTL_INVALID, /* invalid local dsq_id */ +}; + +/** + * dispatch_to_local_dsq - Dispatch a task to a local dsq + * @rq: current rq which is locked + * @rf: rq_flags to use when unlocking @rq + * @dsq_id: destination dsq ID + * @p: task to dispatch + * @enq_flags: %SCX_ENQ_* + * + * We're holding @rq lock and want to dispatch @p to the local DSQ identified by + * @dsq_id. This function performs all the synchronization dancing needed + * because local DSQs are protected with rq locks. + * + * The caller must have exclusive ownership of @p (e.g. through + * %SCX_OPSS_DISPATCHING). 
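+ *
+ * For example, a BPF scheduler targeting a specific CPU's local DSQ
+ * would have dispatched with something like:
+ *
+ * scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, enq_flags);
+ *
+ * which reaches here with @dsq_id == SCX_DSQ_LOCAL_ON | cpu.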
+ */ +static enum dispatch_to_local_dsq_ret +dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id, + struct task_struct *p, u64 enq_flags) +{ + struct rq *src_rq = task_rq(p); + struct rq *dst_rq; + + /* + * We're synchronized against dequeue through DISPATCHING. As @p can't + * be dequeued, its task_rq and cpus_allowed are stable too. + */ + if (dsq_id == SCX_DSQ_LOCAL) { + dst_rq = rq; + } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { + s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; + + if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) + return DTL_INVALID; + dst_rq = cpu_rq(cpu); + } else { + return DTL_NOT_LOCAL; + } + + /* if dispatching to @rq that @p is already on, no lock dancing needed */ + if (rq == src_rq && rq == dst_rq) { + dispatch_enqueue(&dst_rq->scx.local_dsq, p, + enq_flags | SCX_ENQ_CLEAR_OPSS); + return DTL_DISPATCHED; + } + +#ifdef CONFIG_SMP + if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) { + struct rq *locked_dst_rq = dst_rq; + bool dsp; + + /* + * @p is on a possibly remote @src_rq which we need to lock to + * move the task. If dequeue is in progress, it'd be locking + * @src_rq and waiting on DISPATCHING, so we can't grab @src_rq + * lock while holding DISPATCHING. + * + * As DISPATCHING guarantees that @p is wholly ours, we can + * pretend that we're moving from a DSQ and use the same + * mechanism - mark the task under transfer with holding_cpu, + * release DISPATCHING and then follow the same protocol. + */ + p->scx.holding_cpu = raw_smp_processor_id(); + + /* store_release ensures that dequeue sees the above */ + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); + + dispatch_to_local_dsq_lock(rq, rf, src_rq, locked_dst_rq); + + /* + * We don't require the BPF scheduler to avoid dispatching to + * offline CPUs mostly for convenience but also because CPUs can + * go offline between scx_bpf_dispatch() calls and here. If @p + * is destined to an offline CPU, queue it on its current CPU + * instead, which should always be safe. As this is an allowed + * behavior, don't trigger an ops error. + */ + if (unlikely(!test_rq_online(dst_rq))) + dst_rq = src_rq; + + if (src_rq == dst_rq) { + /* + * As @p is staying on the same rq, there's no need to + * go through the full deactivate/activate cycle. + * Optimize by abbreviating the operations in + * move_task_to_local_dsq(). + */ + dsp = p->scx.holding_cpu == raw_smp_processor_id(); + if (likely(dsp)) { + p->scx.holding_cpu = -1; + dispatch_enqueue(&dst_rq->scx.local_dsq, p, + enq_flags); + } + } else { + dsp = move_task_to_local_dsq(dst_rq, p, enq_flags); + } + + /* if the destination CPU is idle, wake it up */ + if (dsp && p->sched_class > dst_rq->curr->sched_class) + resched_curr(dst_rq); + + dispatch_to_local_dsq_unlock(rq, rf, src_rq, locked_dst_rq); + + return dsp ? DTL_DISPATCHED : DTL_LOST; + } +#endif /* CONFIG_SMP */ + + scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", + cpu_of(dst_rq), p->comm, p->pid); + return DTL_INVALID; +} + +/** + * finish_dispatch - Asynchronously finish dispatching a task + * @rq: current rq which is locked + * @rf: rq_flags to use when unlocking @rq + * @p: task to finish dispatching + * @qseq_at_dispatch: qseq when @p started getting dispatched + * @dsq_id: destination DSQ ID + * @enq_flags: %SCX_ENQ_* + * + * Dispatching to local DSQs may need to wait for queueing to complete or + * require rq lock dancing. 
As we don't wanna do either while inside + * ops.dispatch() to avoid locking order inversion, we split dispatching into + * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the + * task and its qseq. Once ops.dispatch() returns, this function is called to + * finish up. + * + * There is no guarantee that @p is still valid for dispatching or even that it + * was valid in the first place. Make sure that the task is still owned by the + * BPF scheduler and claim the ownership before dispatching. + */ +static void finish_dispatch(struct rq *rq, struct rq_flags *rf, + struct task_struct *p, + unsigned long qseq_at_dispatch, + u64 dsq_id, u64 enq_flags) +{ + struct scx_dispatch_q *dsq; + unsigned long opss; + + touch_core_sched_dispatch(rq, p); +retry: + /* + * No need for _acquire here. @p is accessed only after a successful + * try_cmpxchg to DISPATCHING. + */ + opss = atomic_long_read(&p->scx.ops_state); + + switch (opss & SCX_OPSS_STATE_MASK) { + case SCX_OPSS_DISPATCHING: + case SCX_OPSS_NONE: + /* someone else already got to it */ + return; + case SCX_OPSS_QUEUED: + /* + * If qseq doesn't match, @p has gone through at least one + * dispatch/dequeue and re-enqueue cycle between + * scx_bpf_dispatch() and here and we have no claim on it. + */ + if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) + return; + + /* + * While we know @p is accessible, we don't yet have a claim on + * it - the BPF scheduler is allowed to dispatch tasks + * spuriously and there can be a racing dequeue attempt. Let's + * claim @p by atomically transitioning it from QUEUED to + * DISPATCHING. + */ + if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, + SCX_OPSS_DISPATCHING))) + break; + goto retry; + case SCX_OPSS_QUEUEING: + /* + * do_enqueue_task() is in the process of transferring the task + * to the BPF scheduler while holding @p's rq lock. As we aren't + * holding any kernel or BPF resource that the enqueue path may + * depend upon, it's safe to wait. + */ + wait_ops_state(p, opss); + goto retry; + } + + BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); + + switch (dispatch_to_local_dsq(rq, rf, dsq_id, p, enq_flags)) { + case DTL_DISPATCHED: + break; + case DTL_LOST: + break; + case DTL_INVALID: + dsq_id = SCX_DSQ_GLOBAL; + fallthrough; + case DTL_NOT_LOCAL: + dsq = find_dsq_for_dispatch(cpu_rq(raw_smp_processor_id()), + dsq_id, p); + dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + break; + } +} + +static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf) +{ + struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); + u32 u; + + for (u = 0; u < dspc->buf_cursor; u++) { + struct scx_dsp_buf_ent *ent = &this_cpu_ptr(scx_dsp_buf)[u]; + + finish_dispatch(rq, rf, ent->task, ent->qseq, ent->dsq_id, + ent->enq_flags); + } + + dspc->nr_tasks += dspc->buf_cursor; + dspc->buf_cursor = 0; +} + +static int balance_one(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf, bool local) +{ + struct scx_rq *scx_rq = &rq->scx; + struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); + bool prev_on_scx = prev->sched_class == &ext_sched_class; + int nr_loops = SCX_DSP_MAX_LOOPS; + bool has_tasks = false; + + lockdep_assert_rq_held(rq); + scx_rq->flags |= SCX_RQ_BALANCING; + + if (static_branch_unlikely(&scx_ops_cpu_preempt) && + unlikely(rq->scx.cpu_released)) { + /* + * If the previous sched_class for the current CPU was not SCX, + * notify the BPF scheduler that it again has control of the + * core. 
This callback complements ->cpu_release(), which is
+ * emitted in scx_next_task_picked().
+ */
+ if (SCX_HAS_OP(cpu_acquire))
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_acquire, cpu_of(rq),
+ NULL);
+ rq->scx.cpu_released = false;
+ }
+
+ if (prev_on_scx) {
+ WARN_ON_ONCE(local && (prev->scx.flags & SCX_TASK_BAL_KEEP));
+ update_curr_scx(rq);
+
+ /*
+ * If @prev is runnable & has slice left, it has priority and
+ * fetching more just increases latency for the fetched tasks.
+ * Tell put_prev_task_scx() to put @prev on local_dsq. If the
+ * BPF scheduler wants to handle this explicitly, it should
+ * implement ->cpu_release().
+ *
+ * See scx_ops_disable_workfn() for the explanation on the
+ * bypassing test.
+ *
+ * When balancing a remote CPU for core-sched, there won't be a
+ * following put_prev_task_scx() call and we don't own
+ * %SCX_TASK_BAL_KEEP. Instead, pick_task_scx() will test the
+ * same conditions later and pick @rq->curr accordingly.
+ */
+ if ((prev->scx.flags & SCX_TASK_QUEUED) &&
+ prev->scx.slice && !scx_ops_bypassing()) {
+ if (local)
+ prev->scx.flags |= SCX_TASK_BAL_KEEP;
+ goto has_tasks;
+ }
+ }
+
+ /* if there already are tasks to run, nothing to do */
+ if (scx_rq->local_dsq.nr)
+ goto has_tasks;
+
+ if (consume_dispatch_q(rq, rf, &scx_dsq_global))
+ goto has_tasks;
+
+ if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing())
+ goto out;
+
+ dspc->rq = rq;
+ dspc->rf = rf;
+
+ /*
+ * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock,
+ * the local DSQ might still end up empty after a successful
+ * ops.dispatch(). If the local DSQ is empty even after ops.dispatch()
+ * produced some tasks, retry. The BPF scheduler may depend on this
+ * looping behavior to simplify its implementation.
+ */
+ do {
+ dspc->nr_tasks = 0;
+
+ SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq),
+ prev_on_scx ? prev : NULL);
+
+ flush_dispatch_buf(rq, rf);
+
+ if (scx_rq->local_dsq.nr)
+ goto has_tasks;
+ if (consume_dispatch_q(rq, rf, &scx_dsq_global))
+ goto has_tasks;
+
+ /*
+ * ops.dispatch() can trap us in this loop by repeatedly
+ * dispatching ineligible tasks. Break out once in a while to
+ * allow the watchdog to run. As IRQ can't be enabled in
+ * balance(), we want to complete this scheduling cycle and then
+ * start a new one. IOW, we want to call resched_curr() on the
+ * next, most likely idle, task, not the current one. Use
+ * scx_bpf_kick_cpu() for deferred kicking.
+ */
+ if (unlikely(!--nr_loops)) {
+ scx_bpf_kick_cpu(cpu_of(rq), 0);
+ break;
+ }
+ } while (dspc->nr_tasks);
+
+ goto out;
+
+has_tasks:
+ has_tasks = true;
+out:
+ scx_rq->flags &= ~SCX_RQ_BALANCING;
+ return has_tasks;
+}
+
+static int balance_scx(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf)
+{
+ int ret;
+
+ ret = balance_one(rq, prev, rf, true);
+
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * When core-sched is enabled, this ops.balance() call will be followed
+ * by put_prev_task_scx() and pick_task_scx() on this CPU and
+ * pick_task_scx() on the SMT siblings. Balance the siblings too.
+ */
+ if (sched_core_enabled(rq)) {
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
+ int scpu;
+
+ for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) {
+ struct rq *srq = cpu_rq(scpu);
+ struct rq_flags srf;
+ struct task_struct *sprev = srq->curr;
+
+ /*
+ * While core-scheduling, rq lock is shared among
+ * siblings but the debug annotations and rq clock
+ * aren't. Do pinning dance to transfer the ownership.
+ */ + WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); + rq_unpin_lock(rq, rf); + rq_pin_lock(srq, &srf); + + update_rq_clock(srq); + balance_one(srq, sprev, &srf, false); + + rq_unpin_lock(srq, &srf); + rq_repin_lock(rq, rf); + } + } +#endif + return ret; +} + +static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) +{ + if (p->scx.flags & SCX_TASK_QUEUED) { + /* + * Core-sched might decide to execute @p before it is + * dispatched. Call ops_dequeue() to notify the BPF scheduler. + */ + ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC); + dispatch_dequeue(&rq->scx, p); + } + + p->se.exec_start = rq_clock_task(rq); + + /* see dequeue_task_scx() on why we skip when !QUEUED */ + if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) + SCX_CALL_OP_TASK(SCX_KF_REST, running, p); + + clr_task_runnable(p, true); + + /* + * @p is getting newly scheduled or got kicked after someone updated its + * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). + */ + if ((p->scx.slice == SCX_SLICE_INF) != + (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { + if (p->scx.slice == SCX_SLICE_INF) + rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; + else + rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; + + sched_update_tick_dependency(rq); + + /* + * For now, let's refresh the load_avgs just when transitioning + * in and out of nohz. In the future, we might want to add a + * mechanism which calls the following periodically on + * tick-stopped CPUs. + */ + update_other_load_avgs(rq); + } +} + +static void put_prev_task_scx(struct rq *rq, struct task_struct *p) +{ +#ifndef CONFIG_SMP + /* + * UP workaround. + * + * Because SCX may transfer tasks across CPUs during dispatch, dispatch + * is performed from its balance operation which isn't called in UP. + * Let's work around by calling it from the operations which come right + * after. + * + * 1. If the prev task is on SCX, pick_next_task() calls + * .put_prev_task() right after. As .put_prev_task() is also called + * from other places, we need to distinguish the calls which can be + * done by looking at the previous task's state - if still queued or + * dequeued with %SCX_DEQ_SLEEP, the caller must be pick_next_task(). + * This case is handled here. + * + * 2. If the prev task is not on SCX, the first following call into SCX + * will be .pick_next_task(), which is covered by calling + * balance_scx() from pick_next_task_scx(). + * + * Note that we can't merge the first case into the second as + * balance_scx() must be called before the previous SCX task goes + * through put_prev_task_scx(). + * + * As UP doesn't transfer tasks around, balance_scx() doesn't need @rf. + * Pass in %NULL. + */ + if (p->scx.flags & (SCX_TASK_QUEUED | SCX_TASK_DEQD_FOR_SLEEP)) + balance_scx(rq, p, NULL); +#endif + + update_curr_scx(rq); + + /* see dequeue_task_scx() on why we skip when !QUEUED */ + if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) + SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); + + /* + * If we're being called from put_prev_task_balance(), balance_scx() may + * have decided that @p should keep running. + */ + if (p->scx.flags & SCX_TASK_BAL_KEEP) { + p->scx.flags &= ~SCX_TASK_BAL_KEEP; + set_task_runnable(rq, p); + dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); + return; + } + + if (p->scx.flags & SCX_TASK_QUEUED) { + set_task_runnable(rq, p); + + /* + * If @p has slice left and balance_scx() didn't tag it for + * keeping, @p is getting preempted by a higher priority + * scheduler class or core-sched forcing a different task. 
Leave
+ * it at the head of the local DSQ.
+ */
+ if (p->scx.slice && !scx_ops_bypassing()) {
+ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
+ return;
+ }
+
+ /*
+ * If we're in the pick_next_task path, balance_scx() should
+ * have already populated the local DSQ if there are any other
+ * available tasks. If empty, tell ops.enqueue() that @p is the
+ * only one available for this cpu. ops.enqueue() should put it
+ * on the local DSQ so that the subsequent pick_next_task_scx()
+ * can find the task unless it wants to trigger a separate
+ * follow-up scheduling event.
+ */
+ if (list_empty(&rq->scx.local_dsq.fifo))
+ do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
+ else
+ do_enqueue_task(rq, p, 0, -1);
+ }
+}
+
+static struct task_struct *first_local_task(struct rq *rq)
+{
+ WARN_ON_ONCE(rb_first_cached(&rq->scx.local_dsq.priq));
+ return list_first_entry_or_null(&rq->scx.local_dsq.fifo,
+ struct task_struct, scx.dsq_node.fifo);
+}
+
+static struct task_struct *pick_next_task_scx(struct rq *rq)
+{
+ struct task_struct *p;
+
+#ifndef CONFIG_SMP
+ /* UP workaround - see the comment at the head of put_prev_task_scx() */
+ if (unlikely(rq->curr->sched_class != &ext_sched_class))
+ balance_scx(rq, rq->curr, NULL);
+#endif
+
+ p = first_local_task(rq);
+ if (!p)
+ return NULL;
+
+ if (unlikely(!p->scx.slice)) {
+ if (!scx_ops_bypassing() && !scx_warned_zero_slice) {
+ printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
+ p->comm, p->pid);
+ scx_warned_zero_slice = true;
+ }
+ p->scx.slice = SCX_SLICE_DFL;
+ }
+
+ set_next_task_scx(rq, p, true);
+
+ return p;
+}
+
+#ifdef CONFIG_SCHED_CORE
+/**
+ * scx_prio_less - Task ordering for core-sched
+ * @a: task A
+ * @b: task B
+ *
+ * Core-sched is implemented as an additional scheduling layer on top of the
+ * usual sched_class'es and needs to find out the expected task ordering. For
+ * SCX, core-sched calls this function to interrogate the task ordering.
+ *
+ * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
+ * to implement the default task ordering. The older the timestamp, the higher
+ * priority the task - the global FIFO ordering matching the default scheduling
+ * behavior.
+ *
+ * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
+ * implement FIFO ordering within each local DSQ. See pick_task_scx().
+ */
+bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
+ bool in_fi)
+{
+ /*
+ * The const qualifiers are dropped from task_struct pointers when
+ * calling ops.core_sched_before(). Accesses are controlled by the
+ * verifier.
+ */
+ if (SCX_HAS_OP(core_sched_before) && !scx_ops_bypassing())
+ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
+ (struct task_struct *)a,
+ (struct task_struct *)b);
+ else
+ return time_after64(a->scx.core_sched_at, b->scx.core_sched_at);
+}
+
+/**
+ * pick_task_scx - Pick a candidate task for core-sched
+ * @rq: rq to pick the candidate task from
+ *
+ * Core-sched calls this function on each SMT sibling to determine the next
+ * tasks to run on the SMT siblings. balance_one() has been called on all
+ * siblings and put_prev_task_scx() has been called only for the current CPU.
+ *
+ * As put_prev_task_scx() hasn't been called on remote CPUs, we can't just look
+ * at the first task in the local DSQ. @rq->curr has to be considered explicitly
+ * to mimic %SCX_TASK_BAL_KEEP.
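+ *
+ * The pick below thus reduces to, roughly:
+ *
+ * @rq->curr: QUEUED with slice left and an older core_sched_at than @first
+ * @first: otherwise (may be %NULL)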
+ */
+static struct task_struct *pick_task_scx(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ struct task_struct *first = first_local_task(rq);
+
+ if (curr->scx.flags & SCX_TASK_QUEUED) {
+ /* is curr the only runnable task? */
+ if (!first)
+ return curr;
+
+ /*
+ * Does curr trump first? We can always go by core_sched_at for
+ * this comparison as it represents global FIFO ordering when
+ * the default core-sched ordering is used and local-DSQ FIFO
+ * ordering otherwise.
+ *
+ * We can have a task with an earlier timestamp on the DSQ. For
+ * example, when a current task is preempted by a sibling
+ * picking a different cookie, the task would be requeued at the
+ * head of the local DSQ with an earlier timestamp than the
+ * core-sched picked next task. Besides, the BPF scheduler may
+ * dispatch any tasks to the local DSQ anytime.
+ */
+ if (curr->scx.slice && time_before64(curr->scx.core_sched_at,
+ first->scx.core_sched_at))
+ return curr;
+ }
+
+ return first; /* this may be %NULL */
+}
+#endif /* CONFIG_SCHED_CORE */
+
+static enum scx_cpu_preempt_reason
+preempt_reason_from_class(const struct sched_class *class)
+{
+#ifdef CONFIG_SMP
+ if (class == &stop_sched_class)
+ return SCX_CPU_PREEMPT_STOP;
+#endif
+ if (class == &dl_sched_class)
+ return SCX_CPU_PREEMPT_DL;
+ if (class == &rt_sched_class)
+ return SCX_CPU_PREEMPT_RT;
+ return SCX_CPU_PREEMPT_UNKNOWN;
+}
+
+void scx_next_task_picked(struct rq *rq, struct task_struct *p,
+ const struct sched_class *active)
+{
+ lockdep_assert_rq_held(rq);
+
+ if (!scx_enabled())
+ return;
+#ifdef CONFIG_SMP
+ /*
+ * Pairs with the smp_load_acquire() issued by a CPU in
+ * kick_cpus_irq_workfn() who is waiting for this CPU to perform a
+ * resched.
+ */
+ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
+#endif
+ if (!static_branch_unlikely(&scx_ops_cpu_preempt))
+ return;
+
+ /*
+ * The callback is conceptually meant to convey that the CPU is no
+ * longer under the control of SCX. Therefore, don't invoke the
+ * callback if the CPU is staying on SCX, or going idle (in which
+ * case the SCX scheduler has actively decided not to schedule any
+ * tasks on the CPU).
+ */
+ if (likely(active >= &ext_sched_class))
+ return;
+
+ /*
+ * At this point we know that SCX was preempted by a higher priority
+ * sched_class, so invoke the ->cpu_release() callback if we have not
+ * done so already. We only send the callback once between SCX being
+ * preempted, and it regaining control of the CPU.
+ *
+ * ->cpu_release() complements ->cpu_acquire(), which is emitted the
+ * next time that balance_scx() is invoked.
+ */
+ if (!rq->scx.cpu_released) {
+ if (SCX_HAS_OP(cpu_release)) {
+ struct scx_cpu_release_args args = {
+ .reason = preempt_reason_from_class(active),
+ .task = p,
+ };
+
+ SCX_CALL_OP(SCX_KF_CPU_RELEASE,
+ cpu_release, cpu_of(rq), &args);
+ }
+ rq->scx.cpu_released = true;
+ }
+}
+
+#ifdef CONFIG_SMP
+
+static bool test_and_clear_cpu_idle(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * SMT mask should be cleared whether we can claim @cpu or not. The SMT
+ * cluster is not wholly idle either way. This also prevents
+ * scx_pick_idle_cpu() from getting caught in an infinite loop.
+ */
+ if (sched_smt_active()) {
+ const struct cpumask *smt = cpu_smt_mask(cpu);
+
+ /*
+ * If offline, @cpu is not its own sibling and
+ * scx_pick_idle_cpu() can get caught in an infinite loop as
+ * @cpu is never cleared from idle_masks.smt. Ensure that @cpu
+ * is eventually cleared.
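+ *
+ * e.g. when @cpu has gone offline, cpu_smt_mask(@cpu) no longer
+ * contains @cpu, so clearing the sibling mask alone would never clear
+ * @cpu itself; the explicit cpumask_test_cpu() branch below does.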
+ */ + if (cpumask_intersects(smt, idle_masks.smt)) + cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); + else if (cpumask_test_cpu(cpu, idle_masks.smt)) + __cpumask_clear_cpu(cpu, idle_masks.smt); + } +#endif + return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); +} + +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) +{ + int cpu; + +retry: + if (sched_smt_active()) { + cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); + if (cpu < nr_cpu_ids) + goto found; + + if (flags & SCX_PICK_IDLE_CORE) + return -EBUSY; + } + + cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); + if (cpu >= nr_cpu_ids) + return -EBUSY; + +found: + if (test_and_clear_cpu_idle(cpu)) + return cpu; + else + goto retry; +} + +static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, + u64 wake_flags, bool *found) +{ + s32 cpu; + + *found = false; + + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return prev_cpu; + } + + /* + * If WAKE_SYNC, the waker's local DSQ is empty, and the system is + * under utilized, wake up @p to the local DSQ of the waker. Checking + * only for an empty local DSQ is insufficient as it could give the + * wakee an unfair advantage when the system is oversaturated. + * Checking only for the presence of idle CPUs is also insufficient as + * the local DSQ of the waker could have tasks piled up on it even if + * there is an idle core elsewhere on the system. + */ + cpu = smp_processor_id(); + if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && + !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) && + cpu_rq(cpu)->scx.local_dsq.nr == 0) { + if (cpumask_test_cpu(cpu, p->cpus_ptr)) + goto cpu_found; + } + + if (p->nr_cpus_allowed == 1) { + if (test_and_clear_cpu_idle(prev_cpu)) { + cpu = prev_cpu; + goto cpu_found; + } else { + return prev_cpu; + } + } + + /* + * If CPU has SMT, any wholly idle CPU is likely a better pick than + * partially idle @prev_cpu. + */ + if (sched_smt_active()) { + if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && + test_and_clear_cpu_idle(prev_cpu)) { + cpu = prev_cpu; + goto cpu_found; + } + + cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto cpu_found; + } + + if (test_and_clear_cpu_idle(prev_cpu)) { + cpu = prev_cpu; + goto cpu_found; + } + + cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + goto cpu_found; + + return prev_cpu; + +cpu_found: + *found = true; + return cpu; +} + +static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) +{ + /* + * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it + * can be a good migration opportunity with low cache and memory + * footprint. Returning a CPU different than @prev_cpu triggers + * immediate rq migration. However, for SCX, as the current rq + * association doesn't dictate where the task is going to run, this + * doesn't fit well. If necessary, we can later add a dedicated method + * which can decide to preempt self to force it through the regular + * scheduling path. 
+ */ + if (unlikely(wake_flags & WF_EXEC)) + return prev_cpu; + + if (SCX_HAS_OP(select_cpu)) { + s32 cpu; + struct task_struct **ddsp_taskp; + + ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); + WARN_ON_ONCE(*ddsp_taskp); + *ddsp_taskp = p; + + cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, + select_cpu, p, prev_cpu, wake_flags); + *ddsp_taskp = NULL; + if (ops_cpu_valid(cpu, "from ops.select_cpu()")) + return cpu; + else + return prev_cpu; + } else { + bool found; + s32 cpu; + + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); + if (found) { + p->scx.slice = SCX_SLICE_DFL; + p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; + } + return cpu; + } +} + +static void set_cpus_allowed_scx(struct task_struct *p, + struct affinity_context *ac) +{ + set_cpus_allowed_common(p, ac); + + /* + * The effective cpumask is stored in @p->cpus_ptr which may temporarily + * differ from the configured one in @p->cpus_mask. Always tell the bpf + * scheduler the effective one. + * + * Fine-grained memory write control is enforced by BPF making the const + * designation pointless. Cast it away when calling the operation. + */ + if (SCX_HAS_OP(set_cpumask)) + SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, + (struct cpumask *)p->cpus_ptr); +} + +static void reset_idle_masks(void) +{ + /* + * Consider all online cpus idle. Should converge to the actual state + * quickly. + */ + cpumask_copy(idle_masks.cpu, cpu_online_mask); + cpumask_copy(idle_masks.smt, cpu_online_mask); +} + +void __scx_update_idle(struct rq *rq, bool idle) +{ + int cpu = cpu_of(rq); + + if (SCX_HAS_OP(update_idle)) { + SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); + if (!static_branch_unlikely(&scx_builtin_idle_enabled)) + return; + } + + if (idle) + cpumask_set_cpu(cpu, idle_masks.cpu); + else + cpumask_clear_cpu(cpu, idle_masks.cpu); + +#ifdef CONFIG_SCHED_SMT + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + + if (idle) { + /* + * idle_masks.smt handling is racy but that's fine as + * it's only for optimization and self-correcting. + */ + for_each_cpu(cpu, smt) { + if (!cpumask_test_cpu(cpu, idle_masks.cpu)) + return; + } + cpumask_or(idle_masks.smt, idle_masks.smt, smt); + } else { + cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); + } + } +#endif +} + +static void handle_hotplug(struct rq *rq, bool online) +{ + int cpu = cpu_of(rq); + + atomic_long_inc(&scx_hotplug_seq); + + if (online && SCX_HAS_OP(cpu_online)) + SCX_CALL_OP(SCX_KF_REST, cpu_online, cpu); + else if (!online && SCX_HAS_OP(cpu_offline)) + SCX_CALL_OP(SCX_KF_REST, cpu_offline, cpu); + else + scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "cpu %d going %s, exiting scheduler", cpu, + online ? 
"online" : "offline"); +} + +static void rq_online_scx(struct rq *rq, enum rq_onoff_reason reason) +{ + if (reason == RQ_ONOFF_HOTPLUG) + handle_hotplug(rq, true); +} + +static void rq_offline_scx(struct rq *rq, enum rq_onoff_reason reason) +{ + if (reason == RQ_ONOFF_HOTPLUG) + handle_hotplug(rq, false); +} + +#else /* !CONFIG_SMP */ + +static bool test_and_clear_cpu_idle(int cpu) { return false; } +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } +static void reset_idle_masks(void) {} + +#endif /* CONFIG_SMP */ + +static bool check_rq_for_timeouts(struct rq *rq) +{ + struct task_struct *p; + struct rq_flags rf; + bool timed_out = false; + + rq_lock_irqsave(rq, &rf); + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { + unsigned long last_runnable = p->scx.runnable_at; + + if (unlikely(time_after(jiffies, + last_runnable + scx_watchdog_timeout))) { + u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); + + scx_ops_error_kind(SCX_EXIT_ERROR_STALL, + "%s[%d] failed to run for %u.%03us", + p->comm, p->pid, + dur_ms / 1000, dur_ms % 1000); + timed_out = true; + break; + } + } + rq_unlock_irqrestore(rq, &rf); + + return timed_out; +} + +static void scx_watchdog_workfn(struct work_struct *work) +{ + int cpu; + + WRITE_ONCE(scx_watchdog_timestamp, jiffies); + + for_each_online_cpu(cpu) { + if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) + break; + + cond_resched(); + } + queue_delayed_work(system_unbound_wq, to_delayed_work(work), + scx_watchdog_timeout / 2); +} + +void scx_tick(struct rq *rq) +{ + unsigned long last_check; + + if (!scx_enabled()) + return; + + last_check = READ_ONCE(scx_watchdog_timestamp); + if (unlikely(time_after(jiffies, + last_check + READ_ONCE(scx_watchdog_timeout)))) { + u32 dur_ms = jiffies_to_msecs(jiffies - last_check); + + scx_ops_error_kind(SCX_EXIT_ERROR_STALL, + "watchdog failed to check in for %u.%03us", + dur_ms / 1000, dur_ms % 1000); + } + + update_other_load_avgs(rq); +} + +static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) +{ + update_curr_scx(rq); + + /* + * While disabling, always resched and refresh core-sched timestamp as + * we can't trust the slice management or ops.core_sched_before(). + */ + if (scx_ops_bypassing()) { + curr->scx.slice = 0; + touch_core_sched(rq, curr); + } else if (SCX_HAS_OP(tick)) { + SCX_CALL_OP(SCX_KF_REST, tick, curr); + } + + if (!curr->scx.slice) + resched_curr(rq); +} + +#ifdef CONFIG_EXT_GROUP_SCHED +static struct cgroup *tg_cgrp(struct task_group *tg) +{ + /* + * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, + * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the + * root cgroup. 
+ */ + if (tg && tg->css.cgroup) + return tg->css.cgroup; + else + return &cgrp_dfl_root.cgrp; +} + +#define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), + +#else /* CONFIG_EXT_GROUP_SCHED */ + +#define SCX_INIT_TASK_ARGS_CGROUP(tg) + +#endif /* CONFIG_EXT_GROUP_SCHED */ + +static enum scx_task_state scx_get_task_state(const struct task_struct *p) +{ + return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; +} + +static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) +{ + enum scx_task_state prev_state = scx_get_task_state(p); + bool warn = false; + + BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS)); + + switch (state) { + case SCX_TASK_NONE: + break; + case SCX_TASK_INIT: + warn = prev_state != SCX_TASK_NONE; + break; + case SCX_TASK_READY: + warn = prev_state == SCX_TASK_NONE; + break; + case SCX_TASK_ENABLED: + warn = prev_state != SCX_TASK_READY; + break; + default: + warn = true; + return; + } + + WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]", + prev_state, state, p->comm, p->pid); + + p->scx.flags &= ~SCX_TASK_STATE_MASK; + p->scx.flags |= state << SCX_TASK_STATE_SHIFT; +} + +static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork) +{ + int ret; + + p->scx.disallow = false; + + if (SCX_HAS_OP(init_task)) { + struct scx_init_task_args args = { + SCX_INIT_TASK_ARGS_CGROUP(tg) + .fork = fork, + }; + + ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init_task, p, &args); + if (unlikely(ret)) { + ret = ops_sanitize_err("init_task", ret); + return ret; + } + } + + scx_set_task_state(p, SCX_TASK_INIT); + + if (p->scx.disallow) { + struct rq *rq; + struct rq_flags rf; + + rq = task_rq_lock(p, &rf); + + /* + * We're either in fork or load path and @p->policy will be + * applied right after. Reverting @p->policy here and rejecting + * %SCHED_EXT transitions from scx_check_setscheduler() + * guarantees that if ops.init_task() sets @p->disallow, @p can + * never be in SCX. + */ + if (p->policy == SCHED_EXT) { + p->policy = SCHED_NORMAL; + atomic_long_inc(&scx_nr_rejected); + } + + task_rq_unlock(rq, p, &rf); + } + + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; + return 0; +} + +static void set_task_scx_weight(struct task_struct *p) +{ + u32 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; + + p->scx.weight = sched_weight_to_cgroup(weight); +} + +static void scx_ops_enable_task(struct task_struct *p) +{ + lockdep_assert_rq_held(task_rq(p)); + + /* + * Set the weight before calling ops.enable() so that the scheduler + * doesn't see a stale value if they inspect the task struct. 
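+ *
+ * The callback order a task observes when enabled is thus:
+ *
+ * ops.init_task() -> ops.enable() -> ops.set_weight()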
+ */ + set_task_scx_weight(p); + if (SCX_HAS_OP(enable)) + SCX_CALL_OP_TASK(SCX_KF_REST, enable, p); + scx_set_task_state(p, SCX_TASK_ENABLED); + + if (SCX_HAS_OP(set_weight)) + SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); +} + +static void scx_ops_disable_task(struct task_struct *p) +{ + lockdep_assert_rq_held(task_rq(p)); + WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); + + if (SCX_HAS_OP(disable)) + SCX_CALL_OP(SCX_KF_REST, disable, p); + scx_set_task_state(p, SCX_TASK_READY); +} + +static void scx_ops_exit_task(struct task_struct *p) +{ + struct scx_exit_task_args args = { + .cancelled = false, + }; + + lockdep_assert_rq_held(task_rq(p)); + + switch (scx_get_task_state(p)) { + case SCX_TASK_NONE: + return; + case SCX_TASK_INIT: + args.cancelled = true; + break; + case SCX_TASK_READY: + break; + case SCX_TASK_ENABLED: + scx_ops_disable_task(p); + break; + default: + WARN_ON_ONCE(true); + return; + } + + if (SCX_HAS_OP(exit_task)) + SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args); + scx_set_task_state(p, SCX_TASK_NONE); +} + +void scx_pre_fork(struct task_struct *p) +{ + /* + * BPF scheduler enable/disable paths want to be able to iterate and + * update all tasks which can become complex when racing forks. As + * enable/disable are very cold paths, let's use a percpu_rwsem to + * exclude forks. + */ + percpu_down_read(&scx_fork_rwsem); +} + +int scx_fork(struct task_struct *p) +{ + percpu_rwsem_assert_held(&scx_fork_rwsem); + + if (scx_enabled()) + return scx_ops_init_task(p, task_group(p), true); + else + return 0; +} + +void scx_post_fork(struct task_struct *p) +{ + if (scx_enabled()) { + scx_set_task_state(p, SCX_TASK_READY); + + /* + * Enable the task immediately if it's running on sched_ext. + * Otherwise, it'll be enabled in switching_to_scx() if and + * when it's ever configured to run with a SCHED_EXT policy. + */ + if (p->sched_class == &ext_sched_class) { + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + scx_ops_enable_task(p); + task_rq_unlock(rq, p, &rf); + } + } + + spin_lock_irq(&scx_tasks_lock); + list_add_tail(&p->scx.tasks_node, &scx_tasks); + spin_unlock_irq(&scx_tasks_lock); + + percpu_up_read(&scx_fork_rwsem); +} + +void scx_cancel_fork(struct task_struct *p) +{ + if (scx_enabled()) { + struct rq *rq; + struct rq_flags rf; + + rq = task_rq_lock(p, &rf); + WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); + scx_ops_exit_task(p); + task_rq_unlock(rq, p, &rf); + } + + percpu_up_read(&scx_fork_rwsem); +} + +void sched_ext_free(struct task_struct *p) +{ + unsigned long flags; + + spin_lock_irqsave(&scx_tasks_lock, flags); + list_del_init(&p->scx.tasks_node); + spin_unlock_irqrestore(&scx_tasks_lock, flags); + + /* + * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY -> + * ENABLED transitions can't race us. Disable ops for @p. 
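+ *
+ * The teardown performed by scx_ops_exit_task() mirrors the enable
+ * path:
+ *
+ * [ops.disable() if ENABLED] -> ops.exit_task() -> SCX_TASK_NONE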
+ */ + if (scx_get_task_state(p) != SCX_TASK_NONE) { + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + scx_ops_exit_task(p); + task_rq_unlock(rq, p, &rf); + } +} + +static void reweight_task_scx(struct rq *rq, struct task_struct *p, int newprio) +{ + lockdep_assert_rq_held(task_rq(p)); + + set_task_scx_weight(p); + if (SCX_HAS_OP(set_weight)) + SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); +} + +static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) +{ +} + +static void switching_to_scx(struct rq *rq, struct task_struct *p) +{ + scx_ops_enable_task(p); + + /* + * set_cpus_allowed_scx() is not called while @p is associated with a + * different scheduler class. Keep the BPF scheduler up-to-date. + */ + if (SCX_HAS_OP(set_cpumask)) + SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, + (struct cpumask *)p->cpus_ptr); +} + +static void switched_from_scx(struct rq *rq, struct task_struct *p) +{ + scx_ops_disable_task(p); +} + +static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} +static void switched_to_scx(struct rq *rq, struct task_struct *p) {} + +int scx_check_setscheduler(struct task_struct *p, int policy) +{ + lockdep_assert_rq_held(task_rq(p)); + + /* if disallow, reject transitioning into SCX */ + if (scx_enabled() && READ_ONCE(p->scx.disallow) && + p->policy != policy && policy == SCHED_EXT) + return -EACCES; + + return 0; +} + +#ifdef CONFIG_NO_HZ_FULL +bool scx_can_stop_tick(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + if (scx_ops_bypassing()) + return false; + + if (p->sched_class != &ext_sched_class) + return true; + + /* + * @rq can dispatch from different DSQs, so we can't tell whether it + * needs the tick or not by looking at nr_running. Allow stopping ticks + * iff the BPF scheduler indicated so. See set_next_task_scx(). + */ + return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; +} +#endif + +#ifdef CONFIG_EXT_GROUP_SCHED + +DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); + +int scx_tg_online(struct task_group *tg) +{ + int ret = 0; + + WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); + + percpu_down_read(&scx_cgroup_rwsem); + + if (SCX_HAS_OP(cgroup_init)) { + struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + + ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_init, + tg->css.cgroup, &args); + if (!ret) + tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; + else + ret = ops_sanitize_err("cgroup_init", ret); + } else { + tg->scx_flags |= SCX_TG_ONLINE; + } + + percpu_up_read(&scx_cgroup_rwsem); + return ret; +} + +void scx_tg_offline(struct task_group *tg) +{ + WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); + + percpu_down_read(&scx_cgroup_rwsem); + + if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) + SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_exit, tg->css.cgroup); + tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); + + percpu_up_read(&scx_cgroup_rwsem); +} + +int scx_cgroup_can_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct task_struct *p; + int ret; + + /* released in scx_finish/cancel_attach() */ + percpu_down_read(&scx_cgroup_rwsem); + + if (!scx_enabled()) + return 0; + + cgroup_taskset_for_each(p, css, tset) { + struct cgroup *from = tg_cgrp(task_group(p)); + struct cgroup *to = tg_cgrp(css_tg(css)); + + WARN_ON_ONCE(p->scx.cgrp_moving_from); + + /* + * sched_move_task() omits identity migrations. Let's match the + * behavior so that ops.cgroup_prep_move() and ops.cgroup_move() + * always match one-to-one. 
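+ *
+ * i.e. each migrating task sees exactly one of the pairings:
+ *
+ * ops.cgroup_prep_move() -> ops.cgroup_move()
+ * ops.cgroup_prep_move() -> ops.cgroup_cancel_move()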
+ */
+ if (from == to)
+ continue;
+
+ if (SCX_HAS_OP(cgroup_prep_move)) {
+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_prep_move,
+ p, from, css->cgroup);
+ if (ret)
+ goto err;
+ }
+
+ p->scx.cgrp_moving_from = from;
+ }
+
+ return 0;
+
+err:
+ cgroup_taskset_for_each(p, css, tset) {
+ if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_cancel_move, p,
+ p->scx.cgrp_moving_from, css->cgroup);
+ p->scx.cgrp_moving_from = NULL;
+ }
+
+ percpu_up_read(&scx_cgroup_rwsem);
+ return ops_sanitize_err("cgroup_prep_move", ret);
+}
+
+void scx_move_task(struct task_struct *p)
+{
+ /*
+ * We're called from sched_move_task() which handles both cgroup and
+ * autogroup moves. Ignore the latter.
+ *
+ * Also ignore exiting tasks, because in the exit path tasks transition
+ * from the autogroup to the root group, so task_group_is_autogroup()
+ * alone isn't able to catch exiting autogroup tasks. This is safe for
+ * cgroup_move(), because cgroup migrations never happen for PF_EXITING
+ * tasks.
+ */
+ if (p->flags & PF_EXITING || task_group_is_autogroup(task_group(p)))
+ return;
+
+ if (!scx_enabled())
+ return;
+
+ /*
+ * @p must have ops.cgroup_prep_move() called on it and thus
+ * cgrp_moving_from set.
+ */
+ if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
+ SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p,
+ p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
+ p->scx.cgrp_moving_from = NULL;
+}
+
+void scx_cgroup_finish_attach(void)
+{
+ percpu_up_read(&scx_cgroup_rwsem);
+}
+
+void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys_state *css;
+ struct task_struct *p;
+
+ if (!scx_enabled())
+ goto out_unlock;
+
+ cgroup_taskset_for_each(p, css, tset) {
+ if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_cancel_move, p,
+ p->scx.cgrp_moving_from, css->cgroup);
+ p->scx.cgrp_moving_from = NULL;
+ }
+out_unlock:
+ percpu_up_read(&scx_cgroup_rwsem);
+}
+
+void scx_group_set_weight(struct task_group *tg, unsigned long weight)
+{
+ percpu_down_read(&scx_cgroup_rwsem);
+
+ if (tg->scx_weight != weight) {
+ if (SCX_HAS_OP(cgroup_set_weight))
+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_set_weight,
+ tg_cgrp(tg), weight);
+ tg->scx_weight = weight;
+ }
+
+ percpu_up_read(&scx_cgroup_rwsem);
+}
+
+static void scx_cgroup_lock(void)
+{
+ percpu_down_write(&scx_cgroup_rwsem);
+}
+
+static void scx_cgroup_unlock(void)
+{
+ percpu_up_write(&scx_cgroup_rwsem);
+}
+
+#else /* CONFIG_EXT_GROUP_SCHED */
+
+static inline void scx_cgroup_lock(void) {}
+static inline void scx_cgroup_unlock(void) {}
+
+#endif /* CONFIG_EXT_GROUP_SCHED */
+
+/*
+ * Omitted operations:
+ *
+ * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task
+ * isn't tied to the CPU at that point. Preemption is implemented by resetting
+ * the victim task's slice to 0 and triggering reschedule on the target CPU.
+ *
+ * - migrate_task_rq: Unnecessary as task to cpu mapping is transient.
+ *
+ * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
+ * their current sched_class. Call them directly from sched core instead.
+ *
+ * - task_woken: Unnecessary.
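+ *
+ * e.g. the preemption described above is what a BPF scheduler does with
+ * something like (victim/victim_cpu illustrative):
+ *
+ * victim->scx.slice = 0;
+ * scx_bpf_kick_cpu(victim_cpu, 0);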
+ */ +DEFINE_SCHED_CLASS(ext) = { + .enqueue_task = enqueue_task_scx, + .dequeue_task = dequeue_task_scx, + .yield_task = yield_task_scx, + .yield_to_task = yield_to_task_scx, + + .wakeup_preempt = wakeup_preempt_scx, + + .pick_next_task = pick_next_task_scx, + + .put_prev_task = put_prev_task_scx, + .set_next_task = set_next_task_scx, + +#ifdef CONFIG_SMP + .balance = balance_scx, + .select_task_rq = select_task_rq_scx, + .set_cpus_allowed = set_cpus_allowed_scx, + + .rq_online = rq_online_scx, + .rq_offline = rq_offline_scx, +#endif + +#ifdef CONFIG_SCHED_CORE + .pick_task = pick_task_scx, +#endif + + .task_tick = task_tick_scx, + + .switching_to = switching_to_scx, + .switched_from = switched_from_scx, + .switched_to = switched_to_scx, + .reweight_task = reweight_task_scx, + .prio_changed = prio_changed_scx, + + .update_curr = update_curr_scx, + +#ifdef CONFIG_UCLAMP_TASK + .uclamp_enabled = 1, +#endif +}; + +static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) +{ + memset(dsq, 0, sizeof(*dsq)); + + raw_spin_lock_init(&dsq->lock); + INIT_LIST_HEAD(&dsq->fifo); + dsq->id = dsq_id; +} + +static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) +{ + struct scx_dispatch_q *dsq; + int ret; + + if (dsq_id & SCX_DSQ_FLAG_BUILTIN) + return ERR_PTR(-EINVAL); + + dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); + if (!dsq) + return ERR_PTR(-ENOMEM); + + init_dsq(dsq, dsq_id); + + ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node, + dsq_hash_params); + if (ret) { + kfree(dsq); + return ERR_PTR(ret); + } + return dsq; +} + +static void free_dsq_irq_workfn(struct irq_work *irq_work) +{ + struct llist_node *to_free = llist_del_all(&dsqs_to_free); + struct scx_dispatch_q *dsq, *tmp_dsq; + + llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) + kfree_rcu(dsq, rcu); +} + +static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); + +static void destroy_dsq(u64 dsq_id) +{ + struct scx_dispatch_q *dsq; + unsigned long flags; + + rcu_read_lock(); + + dsq = rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); + if (!dsq) + goto out_unlock_rcu; + + raw_spin_lock_irqsave(&dsq->lock, flags); + + if (dsq->nr) { + scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)", + dsq->id, dsq->nr); + goto out_unlock_dsq; + } + + if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params)) + goto out_unlock_dsq; + + /* + * Mark dead by invalidating ->id to prevent dispatch_enqueue() from + * queueing more tasks. As this function can be called from anywhere, + * freeing is bounced through an irq work to avoid nesting RCU + * operations inside scheduler locks. + */ + dsq->id = SCX_DSQ_INVALID; + llist_add(&dsq->free_node, &dsqs_to_free); + irq_work_queue(&free_dsq_irq_work); + +out_unlock_dsq: + raw_spin_unlock_irqrestore(&dsq->lock, flags); +out_unlock_rcu: + rcu_read_unlock(); +} + +#ifdef CONFIG_EXT_GROUP_SCHED +static void scx_cgroup_exit(void) +{ + struct cgroup_subsys_state *css; + + percpu_rwsem_assert_held(&scx_cgroup_rwsem); + + /* + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * cgroups and exit all the inited ones, all online cgroups are exited. 
+ */
+ rcu_read_lock();
+ css_for_each_descendant_post(css, &root_task_group.css) {
+ struct task_group *tg = css_tg(css);
+
+ if (!(tg->scx_flags & SCX_TG_INITED))
+ continue;
+ tg->scx_flags &= ~SCX_TG_INITED;
+
+ if (!scx_ops.cgroup_exit)
+ continue;
+
+ if (WARN_ON_ONCE(!css_tryget(css)))
+ continue;
+ rcu_read_unlock();
+
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup);
+
+ rcu_read_lock();
+ css_put(css);
+ }
+ rcu_read_unlock();
+}
+
+static int scx_cgroup_init(void)
+{
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ percpu_rwsem_assert_held(&scx_cgroup_rwsem);
+
+ /*
+ * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
+ * cgroups and init, all online cgroups are initialized.
+ */
+ rcu_read_lock();
+ css_for_each_descendant_pre(css, &root_task_group.css) {
+ struct task_group *tg = css_tg(css);
+ struct scx_cgroup_init_args args = { .weight = tg->scx_weight };
+
+ if ((tg->scx_flags &
+ (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
+ continue;
+
+ if (!scx_ops.cgroup_init) {
+ tg->scx_flags |= SCX_TG_INITED;
+ continue;
+ }
+
+ if (WARN_ON_ONCE(!css_tryget(css)))
+ continue;
+ rcu_read_unlock();
+
+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_init,
+ css->cgroup, &args);
+ if (ret) {
+ css_put(css);
+ return ret;
+ }
+ tg->scx_flags |= SCX_TG_INITED;
+
+ rcu_read_lock();
+ css_put(css);
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static void scx_cgroup_config_knobs(void)
+{
+ static DEFINE_MUTEX(cgintf_mutex);
+ DECLARE_BITMAP(mask, CPU_CFTYPE_CNT) = { };
+ u64 knob_flags;
+ int i;
+
+ /*
+ * Called from both class switch and ops enable/disable paths,
+ * synchronize internally.
+ */
+ mutex_lock(&cgintf_mutex);
+
+ /* if fair is in use, all knobs should be shown */
+ if (!scx_switched_all()) {
+ bitmap_fill(mask, CPU_CFTYPE_CNT);
+ goto apply;
+ }
+
+ /*
+ * On ext, only show the supported knobs. Otherwise, show all possible
+ * knobs so that configuration attempts succeed and the states are
+ * remembered while ops is not loaded.
+ */
+ if (scx_enabled())
+ knob_flags = scx_ops.flags;
+ else
+ knob_flags = SCX_OPS_ALL_FLAGS;
+
+ if (knob_flags & SCX_OPS_CGROUP_KNOB_WEIGHT) {
+ __set_bit(CPU_CFTYPE_WEIGHT, mask);
+ __set_bit(CPU_CFTYPE_WEIGHT_NICE, mask);
+ }
+apply:
+ for (i = 0; i < CPU_CFTYPE_CNT; i++)
+ cgroup_show_cftype(&cpu_cftypes[i], test_bit(i, mask));
+
+ mutex_unlock(&cgintf_mutex);
+}
+
+#else
+static void scx_cgroup_exit(void) {}
+static int scx_cgroup_init(void) { return 0; }
+static void scx_cgroup_config_knobs(void) {}
+#endif
+
+
+/********************************************************************************
+ * Sysfs interface and ops enable/disable.
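+ *
+ * The attributes defined below surface as the global state, switch_all,
+ * nr_rejected and hotplug_seq files plus a per-scheduler ops file
+ * (under /sys/kernel/sched_ext/ in practice).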
+ */ + +#define SCX_ATTR(_name) \ + static struct kobj_attribute scx_attr_##_name = { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = scx_attr_##_name##_show, \ + } + +static ssize_t scx_attr_state_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%s\n", + scx_ops_enable_state_str[scx_ops_enable_state()]); +} +SCX_ATTR(state); + +static ssize_t scx_attr_switch_all_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all)); +} +SCX_ATTR(switch_all); + +static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected)); +} +SCX_ATTR(nr_rejected); + +static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq)); +} +SCX_ATTR(hotplug_seq); + +static struct attribute *scx_global_attrs[] = { + &scx_attr_state.attr, + &scx_attr_switch_all.attr, + &scx_attr_nr_rejected.attr, + &scx_attr_hotplug_seq.attr, + NULL, +}; + +static const struct attribute_group scx_global_attr_group = { + .attrs = scx_global_attrs, +}; + +static void scx_kobj_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static ssize_t scx_attr_ops_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%s\n", scx_ops.name); +} +SCX_ATTR(ops); + +static struct attribute *scx_sched_attrs[] = { + &scx_attr_ops.attr, + NULL, +}; +ATTRIBUTE_GROUPS(scx_sched); + +static const struct kobj_type scx_ktype = { + .release = scx_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = scx_sched_groups, +}; + +static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) +{ + return add_uevent_var(env, "SCXOPS=%s", scx_ops.name); +} + +static const struct kset_uevent_ops scx_uevent_ops = { + .uevent = scx_uevent, +}; + +/* + * Used by sched_fork() and __setscheduler_prio() to pick the matching + * sched_class. dl/rt are already handled. + */ +bool task_should_scx(struct task_struct *p) +{ + if (!scx_enabled() || + unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING)) + return false; + if (READ_ONCE(scx_switching_all)) + return true; + return p->policy == SCHED_EXT; +} + +/** + * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress + * + * Bypassing guarantees that all runnable tasks make forward progress without + * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might + * be held by tasks that the BPF scheduler is forgetting to run, which + * unfortunately also excludes toggling the static branches. + * + * Let's work around by overriding a couple ops and modifying behaviors based on + * the DISABLING state and then cycling the queued tasks through dequeue/enqueue + * to force global FIFO scheduling. + * + * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. + * + * b. ops.dispatch() is ignored. + * + * c. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be + * trusted. Whenever a tick triggers, the running task is rotated to the tail + * of the queue with core_sched_at touched. + * + * d. pick_next_task() suppresses zero slice warning. + * + * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * operations. + * + * f. scx_prio_less() reverts to the default core_sched_at order. 
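+ *
+ * Bypass calls nest through scx_ops_bypass_depth, e.g.:
+ *
+ * scx_ops_bypass(true);  depth 0 -> 1, bypass takes effect
+ * scx_ops_bypass(true);  depth 1 -> 2, no-op
+ * scx_ops_bypass(false); depth 2 -> 1, no-op
+ * scx_ops_bypass(false); depth 1 -> 0, normal operation restored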
+ */ +static void scx_ops_bypass(bool bypass) +{ + int depth, cpu; + + if (bypass) { + depth = atomic_inc_return(&scx_ops_bypass_depth); + WARN_ON_ONCE(depth <= 0); + if (depth != 1) + return; + } else { + depth = atomic_dec_return(&scx_ops_bypass_depth); + WARN_ON_ONCE(depth < 0); + if (depth != 0) + return; + } + + mutex_lock(&scx_ops_enable_mutex); + if (!scx_enabled()) + goto out_unlock; + + /* + * No task property is changing. We just need to make sure all currently + * queued tasks are re-queued according to the new scx_ops_bypassing() + * state. As an optimization, walk each rq's runnable_list instead of + * the scx_tasks list. + * + * This function can't trust the scheduler and thus can't use + * cpus_read_lock(). Walk all possible CPUs instead of online. + */ + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + struct task_struct *p, *n; + + rq_lock_irqsave(rq, &rf); + + /* + * The use of list_for_each_entry_safe_reverse() is required + * because each task is going to be removed from and added back + * to the runnable_list during iteration. Because they're added + * to the tail of the list, safe reverse iteration can still + * visit all nodes. + */ + list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, + scx.runnable_node) { + struct sched_enq_and_set_ctx ctx; + + /* cycling deq/enq is enough, see the function comment */ + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + sched_enq_and_set_task(&ctx); + } + + rq_unlock_irqrestore(rq, &rf); + + /* kick to restore ticks */ + resched_cpu(cpu); + } + +out_unlock: + mutex_unlock(&scx_ops_enable_mutex); +} + +static void free_exit_info(struct scx_exit_info *ei) +{ + kfree(ei->dump); + kfree(ei->msg); + kfree(ei->bt); + kfree(ei); +} + +static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) +{ + struct scx_exit_info *ei; + + ei = kzalloc(sizeof(*ei), GFP_KERNEL); + if (!ei) + return NULL; + + ei->bt = kcalloc(sizeof(ei->bt[0]), SCX_EXIT_BT_LEN, GFP_KERNEL); + ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); + ei->dump = kzalloc(exit_dump_len, GFP_KERNEL); + + if (!ei->bt || !ei->msg || !ei->dump) { + free_exit_info(ei); + return NULL; + } + + return ei; +} + +static const char *scx_exit_reason(enum scx_exit_kind kind) +{ + switch (kind) { + case SCX_EXIT_UNREG: + return "Scheduler unregistered from user space"; + case SCX_EXIT_UNREG_BPF: + return "Scheduler unregistered from BPF"; + case SCX_EXIT_UNREG_KERN: + return "Scheduler unregistered from the main kernel"; + case SCX_EXIT_SYSRQ: + return "disabled by sysrq-S"; + case SCX_EXIT_ERROR: + return "runtime error"; + case SCX_EXIT_ERROR_BPF: + return "scx_bpf_error"; + case SCX_EXIT_ERROR_STALL: + return "runnable task stall"; + default: + return ""; + } +} + +static void scx_ops_disable_workfn(struct kthread_work *work) +{ + struct scx_exit_info *ei = scx_exit_info; + struct scx_task_iter sti; + struct task_struct *p; + struct rhashtable_iter rht_iter; + struct scx_dispatch_q *dsq; + int i, kind; + + kind = atomic_read(&scx_exit_kind); + while (true) { + /* + * NONE indicates that a new scx_ops has been registered since + * disable was scheduled - don't kill the new ops. DONE + * indicates that the ops has already been disabled. 
+ */ + if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE) + return; + if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE)) + break; + } + ei->kind = kind; + ei->reason = scx_exit_reason(ei->kind); + + /* guarantee forward progress by bypassing scx_ops */ + scx_ops_bypass(true); + + switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) { + case SCX_OPS_DISABLING: + WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); + break; + case SCX_OPS_DISABLED: + pr_warn("sched_ext: ops error detected without ops (%s)\n", + scx_exit_info->msg); + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != + SCX_OPS_DISABLING); + goto done; + default: + break; + } + + /* + * Here, every runnable task is guaranteed to make forward progress and + * we can safely use blocking synchronization constructs. Actually + * disable ops. + */ + mutex_lock(&scx_ops_enable_mutex); + + static_branch_disable(&__scx_switched_all); + WRITE_ONCE(scx_switching_all, false); + + /* + * Avoid racing against fork and cgroup changes. See scx_ops_enable() + * for explanation on the locking order. + */ + percpu_down_write(&scx_fork_rwsem); + cpus_read_lock(); + scx_cgroup_lock(); + + spin_lock_irq(&scx_tasks_lock); + scx_task_iter_init(&sti); + while ((p = scx_task_iter_next_filtered_locked(&sti))) { + const struct sched_class *old_class = p->sched_class; + struct sched_enq_and_set_ctx ctx; + + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + + p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); + __setscheduler_prio(p, p->prio); + check_class_changing(task_rq(p), p, old_class); + + sched_enq_and_set_task(&ctx); + + check_class_changed(task_rq(p), p, old_class, p->prio); + scx_ops_exit_task(p); + } + scx_task_iter_exit(&sti); + spin_unlock_irq(&scx_tasks_lock); + + /* no task is on scx, turn off all the switches and flush in-progress calls */ + static_branch_disable_cpuslocked(&__scx_ops_enabled); + for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) + static_branch_disable_cpuslocked(&scx_has_op[i]); + static_branch_disable_cpuslocked(&scx_ops_enq_last); + static_branch_disable_cpuslocked(&scx_ops_enq_exiting); + static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); + static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + synchronize_rcu(); + + scx_cgroup_exit(); + + scx_cgroup_unlock(); + cpus_read_unlock(); + percpu_up_write(&scx_fork_rwsem); + + if (ei->kind >= SCX_EXIT_ERROR) { + printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name); + + if (ei->msg[0] == '\0') + printk(KERN_ERR "sched_ext: %s\n", ei->reason); + else + printk(KERN_ERR "sched_ext: %s (%s)\n", ei->reason, ei->msg); + + stack_trace_print(ei->bt, ei->bt_len, 2); + } + + if (scx_ops.exit) + SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); + + cancel_delayed_work_sync(&scx_watchdog_work); + + /* + * Delete the kobject from the hierarchy eagerly in addition to just + * dropping a reference. Otherwise, if the object is deleted + * asynchronously, sysfs could observe an object of the same name still + * in the hierarchy when another scheduler is loaded. 
+ */ + kobject_del(scx_root_kobj); + kobject_put(scx_root_kobj); + scx_root_kobj = NULL; + + memset(&scx_ops, 0, sizeof(scx_ops)); + + rhashtable_walk_enter(&dsq_hash, &rht_iter); + do { + rhashtable_walk_start(&rht_iter); + + while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) + destroy_dsq(dsq->id); + + rhashtable_walk_stop(&rht_iter); + } while (dsq == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&rht_iter); + + free_percpu(scx_dsp_buf); + scx_dsp_buf = NULL; + scx_dsp_max_batch = 0; + + free_exit_info(scx_exit_info); + scx_exit_info = NULL; + + mutex_unlock(&scx_ops_enable_mutex); + + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != + SCX_OPS_DISABLING); + + scx_cgroup_config_knobs(); +done: + scx_ops_bypass(false); +} + +static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); + +static void schedule_scx_ops_disable_work(void) +{ + struct kthread_worker *helper = READ_ONCE(scx_ops_helper); + + /* + * We may be called spuriously before the first bpf_sched_ext_reg(). If + * scx_ops_helper isn't set up yet, there's nothing to do. + */ + if (helper) + kthread_queue_work(helper, &scx_ops_disable_work); +} + +static void scx_ops_disable(enum scx_exit_kind kind) +{ + int none = SCX_EXIT_NONE; + + if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) + kind = SCX_EXIT_ERROR; + + atomic_try_cmpxchg(&scx_exit_kind, &none, kind); + + schedule_scx_ops_disable_work(); +} + +static void scx_dump_task(struct seq_buf *s, struct task_struct *p, char marker, + unsigned long now) +{ + static unsigned long bt[SCX_EXIT_BT_LEN]; + char dsq_id_buf[19] = "(n/a)"; + unsigned long ops_state = atomic_long_read(&p->scx.ops_state); + unsigned int bt_len; + size_t avail, used; + char *buf; + + if (p->scx.dsq) + scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", + (unsigned long long)p->scx.dsq->id); + + seq_buf_printf(s, "\n %c%c %s[%d] %+ldms\n", + marker, task_state_to_char(p), p->comm, p->pid, + jiffies_delta_msecs(p->scx.runnable_at, now)); + seq_buf_printf(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu\n", + scx_get_task_state(p), + p->scx.flags & ~SCX_TASK_STATE_MASK, + p->scx.dsq_flags, + ops_state & SCX_OPSS_STATE_MASK, + ops_state >> SCX_OPSS_QSEQ_SHIFT); + seq_buf_printf(s, " sticky/holding_cpu=%d/%d dsq_id=%s\n", + p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); + seq_buf_printf(s, " cpus=%*pb\n\n", cpumask_pr_args(p->cpus_ptr)); + + bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); + + avail = seq_buf_get_buf(s, &buf); + used = stack_trace_snprint(buf, avail, bt, bt_len, 3); + seq_buf_commit(s, used < avail ? used : -1); +} + +static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) +{ + const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; + unsigned long now = jiffies; + struct seq_buf s; + size_t avail, used; + char *buf; + int cpu; + + if (dump_len <= sizeof(trunc_marker)) + return; + + seq_buf_init(&s, ei->dump, dump_len - sizeof(trunc_marker)); + + seq_buf_printf(&s, "%s[%d] triggered exit kind %d:\n %s (%s)\n\n", + current->comm, current->pid, ei->kind, ei->reason, ei->msg); + seq_buf_printf(&s, "Backtrace:\n"); + avail = seq_buf_get_buf(&s, &buf); + used = stack_trace_snprint(buf, avail, ei->bt, ei->bt_len, 1); + seq_buf_commit(&s, used < avail ? 
used : -1); + + seq_buf_printf(&s, "\nRunqueue states\n"); + seq_buf_printf(&s, "---------------\n"); + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + struct task_struct *p; + + rq_lock(rq, &rf); + + if (list_empty(&rq->scx.runnable_list) && + rq->curr->sched_class == &idle_sched_class) + goto next; + + seq_buf_printf(&s, "\nCPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu\n", + cpu, rq->scx.nr_running, rq->scx.flags, + rq->scx.cpu_released, rq->scx.ops_qseq, + rq->scx.pnt_seq); + seq_buf_printf(&s, " curr=%s[%d] class=%ps\n", + rq->curr->comm, rq->curr->pid, + rq->curr->sched_class); + if (!cpumask_empty(rq->scx.cpus_to_kick)) + seq_buf_printf(&s, " cpus_to_kick : %*pb\n", + cpumask_pr_args(rq->scx.cpus_to_kick)); + if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) + seq_buf_printf(&s, " idle_to_kick : %*pb\n", + cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); + if (!cpumask_empty(rq->scx.cpus_to_preempt)) + seq_buf_printf(&s, " cpus_to_preempt: %*pb\n", + cpumask_pr_args(rq->scx.cpus_to_preempt)); + if (!cpumask_empty(rq->scx.cpus_to_wait)) + seq_buf_printf(&s, " cpus_to_wait : %*pb\n", + cpumask_pr_args(rq->scx.cpus_to_wait)); + + if (rq->curr->sched_class == &ext_sched_class) + scx_dump_task(&s, rq->curr, '*', now); + + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) + scx_dump_task(&s, p, ' ', now); + next: + rq_unlock(rq, &rf); + } + + if (seq_buf_has_overflowed(&s)) + memcpy(ei->dump + seq_buf_used(&s) - 1, trunc_marker, + sizeof(trunc_marker)); +} + +static void scx_ops_error_irq_workfn(struct irq_work *irq_work) +{ + scx_dump_state(scx_exit_info, scx_ops.exit_dump_len); + schedule_scx_ops_disable_work(); +} + +static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); + +static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, + s64 exit_code, + const char *fmt, ...) +{ + struct scx_exit_info *ei = scx_exit_info; + int none = SCX_EXIT_NONE; + va_list args; + + if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind)) + return; + + if (kind >= SCX_EXIT_ERROR) + ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); + + va_start(args, fmt); + vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); + va_end(args); + + ei->exit_code = exit_code; + + /* + * Set ei->kind and ->reason for scx_dump_state(). They'll be set again + * in scx_ops_disable_workfn(). + */ + ei->kind = kind; + ei->reason = scx_exit_reason(ei->kind); + + irq_work_queue(&scx_ops_error_irq_work); +} + +static struct kthread_worker *scx_create_rt_helper(const char *name) +{ + struct kthread_worker *helper; + + helper = kthread_create_worker(0, name); + if (helper) + sched_set_fifo(helper->task); + return helper; +} + +static void check_hotplug_seq(const struct sched_ext_ops *ops) +{ + unsigned long long global_hotplug_seq; + + /* + * If a hotplug event has occurred between when a scheduler was + * initialized, and when we were able to attach, exit and notify user + * space about it. + */ + if (ops->hotplug_seq) { + global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); + if (ops->hotplug_seq != global_hotplug_seq) { + scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "expected hotplug seq %llu did not match actual %llu", + ops->hotplug_seq, global_hotplug_seq); + } + } +} + +static int validate_ops(const struct sched_ext_ops *ops) +{ + /* + * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the + * ops.enqueue() callback isn't implemented. 
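+ *
+ * For illustration, a hypothetical scheduler that wants %SCX_OPS_ENQ_LAST
+ * notifications would pair the flag with an enqueue callback in its ops
+ * definition, in the style of the example schedulers under
+ * tools/sched_ext/ (sketch only; sketch_enqueue is assumed):
+ *
+ *	SEC(".struct_ops.link")
+ *	struct sched_ext_ops sketch_ops = {
+ *		.enqueue	= (void *)sketch_enqueue,
+ *		.flags		= SCX_OPS_ENQ_LAST,
+ *		.name		= "sketch",
+ *	};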
+ */ + if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { + scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); + return -EINVAL; + } + + return 0; +} + +static int scx_ops_enable(struct sched_ext_ops *ops) +{ + struct scx_task_iter sti; + struct task_struct *p; + unsigned long timeout; + int i, cpu, ret; + + mutex_lock(&scx_ops_enable_mutex); + + if (!scx_ops_helper) { + WRITE_ONCE(scx_ops_helper, + scx_create_rt_helper("sched_ext_ops_helper")); + if (!scx_ops_helper) { + ret = -ENOMEM; + goto err_unlock; + } + } + + if (scx_ops_enable_state() != SCX_OPS_DISABLED) { + ret = -EBUSY; + goto err_unlock; + } + + scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL); + if (!scx_root_kobj) { + ret = -ENOMEM; + goto err_unlock; + } + + scx_root_kobj->kset = scx_kset; + ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root"); + if (ret < 0) + goto err; + + scx_exit_info = alloc_exit_info(ops->exit_dump_len); + if (!scx_exit_info) { + ret = -ENOMEM; + goto err_del; + } + + /* + * Set scx_ops, transition to PREPPING and clear exit info to arm the + * disable path. Failure triggers full disabling from here on. + */ + scx_ops = *ops; + + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != + SCX_OPS_DISABLED); + + atomic_set(&scx_exit_kind, SCX_EXIT_NONE); + scx_warned_zero_slice = false; + + atomic_long_set(&scx_nr_rejected, 0); + + for_each_possible_cpu(cpu) + cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE; + + /* + * Keep CPUs stable during enable so that the BPF scheduler can track + * online CPUs by watching ->on/offline_cpu() after ->init(). + */ + cpus_read_lock(); + + if (scx_ops.init) { + ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init); + if (ret) { + ret = ops_sanitize_err("init", ret); + goto err_disable_unlock_cpus; + } + } + + for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) + if (((void (**)(void))ops)[i]) + static_branch_enable_cpuslocked(&scx_has_op[i]); + + cpus_read_unlock(); + + ret = validate_ops(ops); + if (ret) + goto err_disable; + + WARN_ON_ONCE(scx_dsp_buf); + scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; + scx_dsp_buf = __alloc_percpu(sizeof(scx_dsp_buf[0]) * scx_dsp_max_batch, + __alignof__(scx_dsp_buf[0])); + if (!scx_dsp_buf) { + ret = -ENOMEM; + goto err_disable; + } + + if (ops->timeout_ms) + timeout = msecs_to_jiffies(ops->timeout_ms); + else + timeout = SCX_WATCHDOG_MAX_TIMEOUT; + + WRITE_ONCE(scx_watchdog_timeout, timeout); + WRITE_ONCE(scx_watchdog_timestamp, jiffies); + queue_delayed_work(system_unbound_wq, &scx_watchdog_work, + scx_watchdog_timeout / 2); + + /* + * Lock out forks, cgroup on/offlining and moves before opening the + * floodgate so that they don't wander into the operations prematurely. + * + * We don't need to keep the CPUs stable but static_branch_*() requires + * cpus_read_lock() and scx_cgroup_rwsem must nest inside + * cpu_hotplug_lock because of the following dependency chain: + * + * cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem + * + * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use + * static_branch_*_cpuslocked(). 
+ * + * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the + * following dependency chain: + * + * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock + */ + percpu_down_write(&scx_fork_rwsem); + cpus_read_lock(); + scx_cgroup_lock(); + + check_hotplug_seq(ops); + + for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) + if (((void (**)(void))ops)[i]) + static_branch_enable_cpuslocked(&scx_has_op[i]); + + if (ops->flags & SCX_OPS_ENQ_LAST) + static_branch_enable_cpuslocked(&scx_ops_enq_last); + + if (ops->flags & SCX_OPS_ENQ_EXITING) + static_branch_enable_cpuslocked(&scx_ops_enq_exiting); + if (scx_ops.cpu_acquire || scx_ops.cpu_release) + static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); + + if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { + reset_idle_masks(); + static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); + } else { + static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + } + + /* + * All cgroups should be initialized before letting in tasks. cgroup + * on/offlining and task migrations are already locked out. + */ + ret = scx_cgroup_init(); + if (ret) + goto err_disable_unlock_all; + + static_branch_enable_cpuslocked(&__scx_ops_enabled); + + /* + * Enable ops for every task. Fork is excluded by scx_fork_rwsem + * preventing new tasks from being added. No need to exclude tasks + * leaving as sched_ext_free() can handle both prepped and enabled + * tasks. Prep all tasks first and then enable them with preemption + * disabled. + */ + spin_lock_irq(&scx_tasks_lock); + + scx_task_iter_init(&sti); + while ((p = scx_task_iter_next_filtered(&sti))) { + get_task_struct(p); + spin_unlock_irq(&scx_tasks_lock); + + ret = scx_ops_init_task(p, task_group(p), false); + if (ret) { + put_task_struct(p); + spin_lock_irq(&scx_tasks_lock); + scx_task_iter_exit(&sti); + spin_unlock_irq(&scx_tasks_lock); + pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", + ret, p->comm, p->pid); + goto err_disable_unlock_all; + } + + put_task_struct(p); + spin_lock_irq(&scx_tasks_lock); + } + scx_task_iter_exit(&sti); + + /* + * All tasks are prepped but are still ops-disabled. Ensure that + * %current can't be scheduled out and switch everyone. + * preempt_disable() is necessary because we can't guarantee that + * %current won't be starved if scheduled out while switching. + */ + preempt_disable(); + + /* + * From here on, the disable path must assume that tasks have ops + * enabled and need to be recovered. + * + * Transition to ENABLING fails iff the BPF scheduler has already + * triggered scx_bpf_error(). Returning an error code here would lose + * the recorded error information. Exit indicating success so that the + * error is notified through ops.exit() with all the details. + */ + if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) { + preempt_enable(); + spin_unlock_irq(&scx_tasks_lock); + WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); + ret = 0; + goto err_disable_unlock_all; + } + + /* + * We're fully committed and can't fail. The PREPPED -> ENABLED + * transitions here are synchronized against sched_ext_free() through + * scx_tasks_lock. 
+ */
+	WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
+
+	scx_task_iter_init(&sti);
+	while ((p = scx_task_iter_next_filtered_locked(&sti))) {
+		const struct sched_class *old_class = p->sched_class;
+		struct sched_enq_and_set_ctx ctx;
+
+		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+
+		scx_set_task_state(p, SCX_TASK_READY);
+		__setscheduler_prio(p, p->prio);
+		check_class_changing(task_rq(p), p, old_class);
+
+		sched_enq_and_set_task(&ctx);
+
+		check_class_changed(task_rq(p), p, old_class, p->prio);
+	}
+	scx_task_iter_exit(&sti);
+
+	spin_unlock_irq(&scx_tasks_lock);
+	preempt_enable();
+	scx_cgroup_unlock();
+	cpus_read_unlock();
+	percpu_up_write(&scx_fork_rwsem);
+
+	/* see above ENABLING transition for the explanation on exiting with 0 */
+	if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
+		WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
+		ret = 0;
+		goto err_disable;
+	}
+
+	if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
+		static_branch_enable(&__scx_switched_all);
+
+	kobject_uevent(scx_root_kobj, KOBJ_ADD);
+	mutex_unlock(&scx_ops_enable_mutex);
+
+	scx_cgroup_config_knobs();
+
+	return 0;
+
+err_del:
+	kobject_del(scx_root_kobj);
+err:
+	kobject_put(scx_root_kobj);
+	scx_root_kobj = NULL;
+	if (scx_exit_info) {
+		free_exit_info(scx_exit_info);
+		scx_exit_info = NULL;
+	}
+err_unlock:
+	mutex_unlock(&scx_ops_enable_mutex);
+	return ret;
+
+err_disable_unlock_all:
+	scx_cgroup_unlock();
+	percpu_up_write(&scx_fork_rwsem);
+err_disable_unlock_cpus:
+	cpus_read_unlock();
+err_disable:
+	mutex_unlock(&scx_ops_enable_mutex);
+	/* must be fully disabled before returning */
+	scx_ops_disable(SCX_EXIT_ERROR);
+	kthread_flush_work(&scx_ops_disable_work);
+	return ret;
+}
+
+
+/********************************************************************************
+ * bpf_struct_ops plumbing.
+ */
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+
+extern struct btf *btf_vmlinux;
+static const struct btf_type *task_struct_type;
+static u32 task_struct_type_id;
+
+/* Make the 2nd argument of .dispatch a pointer that can be NULL. */
+static bool promote_dispatch_2nd_arg(int off, int size,
+				     enum bpf_access_type type,
+				     const struct bpf_prog *prog,
+				     struct bpf_insn_access_aux *info)
+{
+	struct btf *btf = bpf_get_btf_vmlinux();
+	const struct bpf_struct_ops_desc *st_ops_desc;
+	const struct btf_member *member;
+	const struct btf_type *t;
+	u32 btf_id, member_idx;
+	const char *mname;
+
+	/* btf_id should be the type id of struct sched_ext_ops */
+	btf_id = prog->aux->attach_btf_id;
+	st_ops_desc = bpf_struct_ops_find(btf, btf_id);
+	if (!st_ops_desc)
+		return false;
+
+	/* BTF type of struct sched_ext_ops */
+	t = st_ops_desc->type;
+
+	member_idx = prog->expected_attach_type;
+	if (member_idx >= btf_type_vlen(t))
+		return false;
+
+	/*
+	 * Get the member name of this struct_ops program, which corresponds to
+	 * a field in struct sched_ext_ops. For example, the member name of the
+	 * dispatch struct_ops program (callback) is "dispatch".
+	 */
+	member = &btf_type_member(t)[member_idx];
+	mname = btf_name_by_offset(btf_vmlinux, member->name_off);
+
+	/*
+	 * Check if it is the second argument of the function pointer at
+	 * "dispatch" in struct sched_ext_ops. The arguments of struct_ops
+	 * operators are sequential and 64-bit, so the second argument is at
+	 * offset sizeof(__u64).
+	 */
+	if (strcmp(mname, "dispatch") == 0 &&
+	    off == sizeof(__u64)) {
+		/*
+		 * The value is a pointer to a type (struct task_struct) given
+		 * by a BTF ID (PTR_TO_BTF_ID).
It is trusted (PTR_TRUSTED), + * however, can be a NULL (PTR_MAYBE_NULL). The BPF program + * should check the pointer to make sure it is not NULL before + * using it, or the verifier will reject the program. + * + * Longer term, this is something that should be addressed by + * BTF, and be fully contained within the verifier. + */ + info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | PTR_TRUSTED; + info->btf = btf_vmlinux; + info->btf_id = task_struct_type_id; + + return true; + } + + return false; +} + +static bool bpf_scx_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (type != BPF_READ) + return false; + if (promote_dispatch_2nd_arg(off, size, type, prog, info)) + return true; + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) + return false; + if (off % size != 0) + return false; + + return btf_ctx_access(off, size, type, prog, info); +} + +static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, int off, + int size) +{ + const struct btf_type *t; + + t = btf_type_by_id(reg->btf, reg->btf_id); + if (t == task_struct_type) { + if (off >= offsetof(struct task_struct, scx.slice) && + off + size <= offsetofend(struct task_struct, scx.slice)) + return SCALAR_VALUE; + if (off >= offsetof(struct task_struct, scx.dsq_vtime) && + off + size <= offsetofend(struct task_struct, scx.dsq_vtime)) + return SCALAR_VALUE; + if (off >= offsetof(struct task_struct, scx.disallow) && + off + size <= offsetofend(struct task_struct, scx.disallow)) + return SCALAR_VALUE; + } + + return -EACCES; +} + +static const struct bpf_func_proto * +bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_task_storage_get: + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + return &bpf_task_storage_delete_proto; + default: + return bpf_base_func_proto(func_id, prog); + } +} + +static const struct bpf_verifier_ops bpf_scx_verifier_ops = { + .get_func_proto = bpf_scx_get_func_proto, + .is_valid_access = bpf_scx_is_valid_access, + .btf_struct_access = bpf_scx_btf_struct_access, +}; + +static int bpf_scx_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct sched_ext_ops *uops = udata; + struct sched_ext_ops *ops = kdata; + u32 moff = __btf_member_bit_offset(t, member) / 8; + int ret; + + switch (moff) { + case offsetof(struct sched_ext_ops, dispatch_max_batch): + if (*(u32 *)(udata + moff) > INT_MAX) + return -E2BIG; + ops->dispatch_max_batch = *(u32 *)(udata + moff); + return 1; + case offsetof(struct sched_ext_ops, flags): + if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) + return -EINVAL; + ops->flags = *(u64 *)(udata + moff); + return 1; + case offsetof(struct sched_ext_ops, name): + ret = bpf_obj_name_cpy(ops->name, uops->name, + sizeof(ops->name)); + if (ret < 0) + return ret; + if (ret == 0) + return -EINVAL; + return 1; + case offsetof(struct sched_ext_ops, timeout_ms): + if (msecs_to_jiffies(*(u32 *)(udata + moff)) > + SCX_WATCHDOG_MAX_TIMEOUT) + return -E2BIG; + ops->timeout_ms = *(u32 *)(udata + moff); + return 1; + case offsetof(struct sched_ext_ops, exit_dump_len): + ops->exit_dump_len = + *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; + return 1; + case offsetof(struct sched_ext_ops, hotplug_seq): + ops->hotplug_seq = *(u64 *)(udata + moff); + return 1; + } + + return 0; +} + +static int bpf_scx_check_member(const 
struct btf_type *t,
+				const struct btf_member *member,
+				const struct bpf_prog *prog)
+{
+	u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+	switch (moff) {
+	case offsetof(struct sched_ext_ops, init_task):
+#ifdef CONFIG_EXT_GROUP_SCHED
+	case offsetof(struct sched_ext_ops, cgroup_init):
+	case offsetof(struct sched_ext_ops, cgroup_exit):
+	case offsetof(struct sched_ext_ops, cgroup_prep_move):
+#endif
+	case offsetof(struct sched_ext_ops, init):
+	case offsetof(struct sched_ext_ops, exit):
+		break;
+	default:
+		if (prog->sleepable)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int bpf_scx_reg(void *kdata)
+{
+	return scx_ops_enable(kdata);
+}
+
+static void bpf_scx_unreg(void *kdata)
+{
+	scx_ops_disable(SCX_EXIT_UNREG);
+	kthread_flush_work(&scx_ops_disable_work);
+}
+
+static int bpf_scx_init(struct btf *btf)
+{
+	s32 type_id;
+
+	type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT);
+	if (type_id < 0)
+		return -EINVAL;
+	task_struct_type = btf_type_by_id(btf, type_id);
+	task_struct_type_id = type_id;
+
+	return 0;
+}
+
+static int bpf_scx_update(void *kdata, void *old_kdata)
+{
+	/*
+	 * sched_ext does not support updating the actively-loaded BPF
+	 * scheduler, as registering a BPF scheduler can always fail if the
+	 * scheduler returns an error code for e.g. ops.init(), ops.init_task(),
+	 * etc. Similarly, we can always race with unregistration happening
+	 * elsewhere, such as with sysrq.
+	 */
+	return -EOPNOTSUPP;
+}
+
+static int bpf_scx_validate(void *kdata)
+{
+	return 0;
+}
+
+static s32 select_cpu_stub(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; }
+static void enqueue_stub(struct task_struct *p, u64 enq_flags) {}
+static void dequeue_stub(struct task_struct *p, u64 enq_flags) {}
+static void dispatch_stub(s32 prev_cpu, struct task_struct *p) {}
+static void runnable_stub(struct task_struct *p, u64 enq_flags) {}
+static void running_stub(struct task_struct *p) {}
+static void stopping_stub(struct task_struct *p, bool runnable) {}
+static void quiescent_stub(struct task_struct *p, u64 deq_flags) {}
+static bool yield_stub(struct task_struct *from, struct task_struct *to) { return false; }
+static bool core_sched_before_stub(struct task_struct *a, struct task_struct *b) { return false; }
+static void set_weight_stub(struct task_struct *p, u32 weight) {}
+static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {}
+static void update_idle_stub(s32 cpu, bool idle) {}
+static void cpu_acquire_stub(s32 cpu, struct scx_cpu_acquire_args *args) {}
+static void cpu_release_stub(s32 cpu, struct scx_cpu_release_args *args) {}
+static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; }
+static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {}
+static void enable_stub(struct task_struct *p) {}
+static void disable_stub(struct task_struct *p) {}
+#ifdef CONFIG_EXT_GROUP_SCHED
+static s32 cgroup_init_stub(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; }
+static void cgroup_exit_stub(struct cgroup *cgrp) {}
+static s32 cgroup_prep_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; }
+static void cgroup_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
+static void cgroup_cancel_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
+static void cgroup_set_weight_stub(struct cgroup *cgrp, u32 weight) {}
+#endif
+static void
cpu_online_stub(s32 cpu) {} +static void cpu_offline_stub(s32 cpu) {} +static s32 init_stub(void) { return -EINVAL; } +static void exit_stub(struct scx_exit_info *info) {} + +static struct sched_ext_ops __bpf_ops_sched_ext_ops = { + .select_cpu = select_cpu_stub, + .enqueue = enqueue_stub, + .dequeue = dequeue_stub, + .dispatch = dispatch_stub, + .runnable = runnable_stub, + .running = running_stub, + .stopping = stopping_stub, + .quiescent = quiescent_stub, + .yield = yield_stub, + .core_sched_before = core_sched_before_stub, + .set_weight = set_weight_stub, + .set_cpumask = set_cpumask_stub, + .update_idle = update_idle_stub, + .cpu_acquire = cpu_acquire_stub, + .cpu_release = cpu_release_stub, + .init_task = init_task_stub, + .exit_task = exit_task_stub, + .enable = enable_stub, + .disable = disable_stub, +#ifdef CONFIG_EXT_GROUP_SCHED + .cgroup_init = cgroup_init_stub, + .cgroup_exit = cgroup_exit_stub, + .cgroup_prep_move = cgroup_prep_move_stub, + .cgroup_move = cgroup_move_stub, + .cgroup_cancel_move = cgroup_cancel_move_stub, + .cgroup_set_weight = cgroup_set_weight_stub, +#endif + .cpu_online = cpu_online_stub, + .cpu_offline = cpu_offline_stub, + .init = init_stub, + .exit = exit_stub, +}; + +static struct bpf_struct_ops bpf_sched_ext_ops = { + .verifier_ops = &bpf_scx_verifier_ops, + .reg = bpf_scx_reg, + .unreg = bpf_scx_unreg, + .check_member = bpf_scx_check_member, + .init_member = bpf_scx_init_member, + .init = bpf_scx_init, + .update = bpf_scx_update, + .validate = bpf_scx_validate, + .name = "sched_ext_ops", + .owner = THIS_MODULE, + .cfi_stubs = &__bpf_ops_sched_ext_ops +}; + + +/******************************************************************************** + * System integration and init. + */ + +static void sysrq_handle_sched_ext_reset(u8 key) +{ + if (scx_ops_helper) + scx_ops_disable(SCX_EXIT_SYSRQ); + else + pr_info("sched_ext: BPF scheduler not yet used\n"); +} + +static const struct sysrq_key_op sysrq_sched_ext_reset_op = { + .handler = sysrq_handle_sched_ext_reset, + .help_msg = "reset-sched-ext(S)", + .action_msg = "Disable sched_ext and revert all tasks to CFS", + .enable_mask = SYSRQ_ENABLE_RTNICE, +}; + +static bool can_skip_idle_kick(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + + /* + * We can skip idle kicking if @rq is going to go through at least one + * full SCX scheduling cycle before going idle. Just checking whether + * curr is not idle is insufficient because we could be racing + * balance_one() trying to pull the next task from a remote rq, which + * may fail, and @rq may become idle afterwards. + * + * The race window is small and we don't and can't guarantee that @rq is + * only kicked while idle anyway. Skip only when sure. + */ + return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_BALANCING); +} + +static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs) +{ + struct rq *rq = cpu_rq(cpu); + struct scx_rq *this_scx = &this_rq->scx; + bool should_wait = false; + unsigned long flags; + + raw_spin_rq_lock_irqsave(rq, flags); + + /* + * During CPU hotplug, a CPU may depend on kicking itself to make + * forward progress. Allow kicking self regardless of online state. 
+ */
+	if (cpu_online(cpu) || cpu == cpu_of(this_rq)) {
+		if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) {
+			if (rq->curr->sched_class == &ext_sched_class)
+				rq->curr->scx.slice = 0;
+			cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
+		}
+
+		if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
+			pseqs[cpu] = rq->scx.pnt_seq;
+			should_wait = true;
+		}
+
+		resched_curr(rq);
+	} else {
+		cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
+		cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
+	}
+
+	raw_spin_rq_unlock_irqrestore(rq, flags);
+
+	return should_wait;
+}
+
+static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	raw_spin_rq_lock_irqsave(rq, flags);
+
+	if (!can_skip_idle_kick(rq) &&
+	    (cpu_online(cpu) || cpu == cpu_of(this_rq)))
+		resched_curr(rq);
+
+	raw_spin_rq_unlock_irqrestore(rq, flags);
+}
+
+static void kick_cpus_irq_workfn(struct irq_work *irq_work)
+{
+	struct rq *this_rq = this_rq();
+	struct scx_rq *this_scx = &this_rq->scx;
+	unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs);
+	bool should_wait = false;
+	s32 cpu;
+
+	for_each_cpu(cpu, this_scx->cpus_to_kick) {
+		should_wait |= kick_one_cpu(cpu, this_rq, pseqs);
+		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
+		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
+	}
+
+	for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) {
+		kick_one_cpu_if_idle(cpu, this_rq);
+		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
+	}
+
+	if (!should_wait)
+		return;
+
+	for_each_cpu(cpu, this_scx->cpus_to_wait) {
+		unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq;
+
+		if (cpu != cpu_of(this_rq)) {
+			/*
+			 * Pairs with smp_store_release() issued by this CPU in
+			 * scx_next_task_picked() on the resched path.
+			 *
+			 * We busy-wait here to guarantee that no other task can
+			 * be scheduled on our core before the target CPU has
+			 * entered the resched path.
+			 */
+			while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu])
+				cpu_relax();
+		}
+
+		cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
+	}
+}
+
+/**
+ * print_scx_info - print out sched_ext scheduler state
+ * @log_lvl: the log level to use when printing
+ * @p: target task
+ *
+ * If a sched_ext scheduler is enabled, print the name and state of the
+ * scheduler. If @p is on sched_ext, print further information about the task.
+ *
+ * This function can be safely called on any task as long as the task_struct
+ * itself is accessible. While safe, this function isn't synchronized and may
+ * print out mixed-up or garbled information of limited length.
+ */
+void print_scx_info(const char *log_lvl, struct task_struct *p)
+{
+	enum scx_ops_enable_state state = scx_ops_enable_state();
+	const char *all = READ_ONCE(scx_switching_all) ? "+all" : "";
+	char runnable_at_buf[22] = "?";
+	struct sched_class *class;
+	unsigned long runnable_at;
+
+	if (state == SCX_OPS_DISABLED)
+		return;
+
+	/*
+	 * Carefully check if the task was running on sched_ext, and then
+	 * carefully copy the time it's been runnable, and its state.
+ */
+	if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) ||
+	    class != &ext_sched_class) {
+		printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name,
+		       scx_ops_enable_state_str[state], all);
+		return;
+	}
+
+	if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at,
+				      sizeof(runnable_at)))
+		scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms",
+			  jiffies_delta_msecs(runnable_at, jiffies));
+
+	/* print everything onto one line to conserve console space */
+	printk("%sSched_ext: %s (%s%s), task: runnable_at=%s",
+	       log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all,
+	       runnable_at_buf);
+}
+
+static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr)
+{
+	/*
+	 * SCX schedulers often have userspace components which are sometimes
+	 * involved in critical scheduling paths. PM operations involve freezing
+	 * userspace which can lead to scheduling misbehaviors including stalls.
+	 * Let's bypass while PM operations are in progress.
+	 */
+	switch (event) {
+	case PM_HIBERNATION_PREPARE:
+	case PM_SUSPEND_PREPARE:
+	case PM_RESTORE_PREPARE:
+		scx_ops_bypass(true);
+		break;
+	case PM_POST_HIBERNATION:
+	case PM_POST_SUSPEND:
+	case PM_POST_RESTORE:
+		scx_ops_bypass(false);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block scx_pm_notifier = {
+	.notifier_call = scx_pm_handler,
+};
+
+void __init init_sched_ext_class(void)
+{
+	s32 cpu, v;
+
+	/*
+	 * The following is to prevent the compiler from optimizing out the enum
+	 * definitions so that BPF scheduler implementations can use them
+	 * through the generated vmlinux.h.
+	 */
+	WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT |
+		   SCX_TG_ONLINE);
+
+	BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
+	init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL);
+#ifdef CONFIG_SMP
+	BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
+	BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
+#endif
+	scx_kick_cpus_pnt_seqs =
+		__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
+			       __alignof__(scx_kick_cpus_pnt_seqs[0]));
+	BUG_ON(!scx_kick_cpus_pnt_seqs);
+
+	for_each_possible_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+
+		init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
+		INIT_LIST_HEAD(&rq->scx.runnable_list);
+
+		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL));
+		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL));
+		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
+		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL));
+		init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
+	}
+
+	register_sysrq_key('S', &sysrq_sched_ext_reset_op);
+	INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn);
+	scx_cgroup_config_knobs();
+}
+
+
+/********************************************************************************
+ * Helpers that can be called from the BPF scheduler.
+ */
+#include <linux/btf_ids.h>
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_create_dsq - Create a custom DSQ
+ * @dsq_id: DSQ to create
+ * @node: NUMA node to allocate from
+ *
+ * Create a custom DSQ identified by @dsq_id. Can be called from ops.init(),
+ * ops.init_task(), ops.cgroup_init() and ops.cgroup_prep_move().
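+ *
+ * For illustration, a minimal sketch creating one shared DSQ from
+ * ops.init(), using the BPF_STRUCT_OPS_SLEEPABLE() helper from
+ * tools/sched_ext/include/scx/common.bpf.h; SHARED_DSQ is a hypothetical
+ * scheduler-defined ID:
+ *
+ *	s32 BPF_STRUCT_OPS_SLEEPABLE(sketch_init)
+ *	{
+ *		return scx_bpf_create_dsq(SHARED_DSQ, -1 /* any NUMA node */);
+ *	}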
+ */
+__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
+{
+	if (!scx_kf_allowed(SCX_KF_SLEEPABLE))
+		return -EINVAL;
+
+	if (unlikely(node >= (int)nr_node_ids ||
+		     (node < 0 && node != NUMA_NO_NODE)))
+		return -EINVAL;
+	return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node));
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_sleepable)
+BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
+BTF_KFUNCS_END(scx_kfunc_ids_sleepable)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = {
+	.owner	= THIS_MODULE,
+	.set	= &scx_kfunc_ids_sleepable,
+};
+
+static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags)
+{
+	if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
+		return false;
+
+	lockdep_assert_irqs_disabled();
+
+	if (unlikely(!p)) {
+		scx_ops_error("called with NULL task");
+		return false;
+	}
+
+	if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
+		scx_ops_error("invalid enq_flags 0x%llx", enq_flags);
+		return false;
+	}
+
+	return true;
+}
+
+static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags)
+{
+	struct task_struct *ddsp_task;
+	int idx;
+
+	ddsp_task = __this_cpu_read(direct_dispatch_task);
+	if (ddsp_task) {
+		mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags);
+		return;
+	}
+
+	idx = __this_cpu_read(scx_dsp_ctx.buf_cursor);
+	if (unlikely(idx >= scx_dsp_max_batch)) {
+		scx_ops_error("dispatch buffer overflow");
+		return;
+	}
+
+	this_cpu_ptr(scx_dsp_buf)[idx] = (struct scx_dsp_buf_ent){
+		.task = p,
+		.qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK,
+		.dsq_id = dsq_id,
+		.enq_flags = enq_flags,
+	};
+	__this_cpu_inc(scx_dsp_ctx.buf_cursor);
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ
+ * @p: task_struct to dispatch
+ * @dsq_id: DSQ to dispatch to
+ * @slice: duration @p can run for in nsecs
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe
+ * to call this function spuriously. Can be called from ops.enqueue(),
+ * ops.select_cpu(), and ops.dispatch().
+ *
+ * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch
+ * and @p must match the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be
+ * used to target the local DSQ of a CPU other than the enqueueing one. Use
+ * ops.select_cpu() to be on the target CPU in the first place.
+ *
+ * When called from ops.select_cpu(), @enq_flags and @dsq_id are stored, and @p
+ * will be directly dispatched to the corresponding dispatch queue after
+ * ops.select_cpu() returns. If @p is dispatched to SCX_DSQ_LOCAL, it will be
+ * dispatched to the local DSQ of the CPU returned by ops.select_cpu().
+ * @enq_flags are OR'd with the enqueue flags on the enqueue path before the
+ * task is dispatched.
+ *
+ * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id
+ * and this function can be called up to ops.dispatch_max_batch times to
+ * dispatch multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of
+ * remaining slots. scx_bpf_consume() flushes the batch and resets the counter.
+ *
+ * This function doesn't have any locking restrictions and may be called under
+ * BPF locks (in the future when BPF introduces more flexible locking).
+ *
+ * @p is allowed to run for @slice. The scheduling path is triggered on slice
+ * exhaustion. If zero, the current residual slice is maintained.
If + * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with + * scx_bpf_kick_cpu() to trigger scheduling. + */ +__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, + u64 enq_flags) +{ + if (!scx_dispatch_preamble(p, enq_flags)) + return; + + if (slice) + p->scx.slice = slice; + else + p->scx.slice = p->scx.slice ?: 1; + + scx_dispatch_commit(p, dsq_id, enq_flags); +} + +/** + * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ + * @p: task_struct to dispatch + * @dsq_id: DSQ to dispatch to + * @slice: duration @p can run for in nsecs + * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ + * @enq_flags: SCX_ENQ_* + * + * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id. + * Tasks queued into the priority queue are ordered by @vtime and always + * consumed after the tasks in the FIFO queue. All other aspects are identical + * to scx_bpf_dispatch(). + * + * @vtime ordering is according to time_before64() which considers wrapping. A + * numerically larger vtime may indicate an earlier position in the ordering and + * vice-versa. + */ +__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, + u64 slice, u64 vtime, u64 enq_flags) +{ + if (!scx_dispatch_preamble(p, enq_flags)) + return; + + if (slice) + p->scx.slice = slice; + else + p->scx.slice = p->scx.slice ?: 1; + + p->scx.dsq_vtime = vtime; + + scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) +BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) + +static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_enqueue_dispatch, +}; + +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots + * + * Can only be called from ops.dispatch(). + */ +__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) +{ + if (!scx_kf_allowed(SCX_KF_DISPATCH)) + return 0; + + return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx.buf_cursor); +} + +/** + * scx_bpf_dispatch_cancel - Cancel the latest dispatch + * + * Cancel the latest dispatch. Can be called multiple times to cancel further + * dispatches. Can only be called from ops.dispatch(). + */ +__bpf_kfunc void scx_bpf_dispatch_cancel(void) +{ + struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); + + if (!scx_kf_allowed(SCX_KF_DISPATCH)) + return; + + if (dspc->buf_cursor > 0) + dspc->buf_cursor--; + else + scx_ops_error("dispatch buffer underflow"); +} + +/** + * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ + * @dsq_id: DSQ to consume + * + * Consume a task from the non-local DSQ identified by @dsq_id and transfer it + * to the current CPU's local DSQ for execution. Can only be called from + * ops.dispatch(). + * + * This function flushes the in-flight dispatches from scx_bpf_dispatch() before + * trying to consume the specified DSQ. It may also grab rq locks and thus can't + * be called under any BPF locks. + * + * Returns %true if a task has been consumed, %false if there isn't any task to + * consume. 
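+ *
+ * For illustration, a minimal ops.dispatch() draining a hypothetical
+ * scheduler-defined SHARED_DSQ, in the style of the example schedulers under
+ * tools/sched_ext/ (BPF_STRUCT_OPS() comes from the tools headers in this
+ * patch):
+ *
+ *	void BPF_STRUCT_OPS(sketch_dispatch, s32 cpu, struct task_struct *prev)
+ *	{
+ *		scx_bpf_consume(SHARED_DSQ);
+ *	}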
+ */ +__bpf_kfunc bool scx_bpf_consume(u64 dsq_id) +{ + struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); + struct scx_dispatch_q *dsq; + + if (!scx_kf_allowed(SCX_KF_DISPATCH)) + return false; + + flush_dispatch_buf(dspc->rq, dspc->rf); + + dsq = find_non_local_dsq(dsq_id); + if (unlikely(!dsq)) { + scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); + return false; + } + + if (consume_dispatch_q(dspc->rq, dspc->rf, dsq)) { + /* + * A successfully consumed task can be dequeued before it starts + * running while the CPU is trying to migrate other dispatched + * tasks. Bump nr_tasks to tell balance_scx() to retry on empty + * local DSQ. + */ + dspc->nr_tasks++; + return true; + } else { + return false; + } +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_dispatch) +BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) +BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) +BTF_ID_FLAGS(func, scx_bpf_consume) +BTF_KFUNCS_END(scx_kfunc_ids_dispatch) + +static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_dispatch, +}; + +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ + * + * Iterate over all of the tasks currently enqueued on the local DSQ of the + * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of + * processed tasks. Can only be called from ops.cpu_release(). + */ +__bpf_kfunc u32 scx_bpf_reenqueue_local(void) +{ + u32 nr_enqueued, i; + struct rq *rq; + struct scx_rq *scx_rq; + + if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) + return 0; + + rq = cpu_rq(smp_processor_id()); + lockdep_assert_rq_held(rq); + scx_rq = &rq->scx; + + /* + * Get the number of tasks on the local DSQ before iterating over it to + * pull off tasks. The enqueue callback below can signal that it wants + * the task to stay on the local DSQ, and we want to prevent the BPF + * scheduler from causing us to loop indefinitely. + */ + nr_enqueued = scx_rq->local_dsq.nr; + for (i = 0; i < nr_enqueued; i++) { + struct task_struct *p; + + p = first_local_task(rq); + WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != + SCX_OPSS_NONE); + WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); + WARN_ON_ONCE(p->scx.holding_cpu != -1); + dispatch_dequeue(scx_rq, p); + do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); + } + + return nr_enqueued; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) +BTF_ID_FLAGS(func, scx_bpf_reenqueue_local) +BTF_KFUNCS_END(scx_kfunc_ids_cpu_release) + +static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_cpu_release, +}; + +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_kick_cpu - Trigger reschedule on a CPU + * @cpu: cpu to kick + * @flags: %SCX_KICK_* flags + * + * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or + * trigger rescheduling on a busy CPU. This can be called from any online + * scx_ops operation and the actual kicking is performed asynchronously through + * an irq work. + */ +__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) +{ + struct rq *this_rq; + unsigned long irq_flags; + + if (!ops_cpu_valid(cpu, NULL)) + return; + + /* + * While bypassing for PM ops, IRQ handling may not be online which can + * lead to irq_work_queue() malfunction such as infinite busy wait for + * IRQ status update. Suppress kicking. 
+ */ + if (scx_ops_bypassing()) + return; + + local_irq_save(irq_flags); + + this_rq = this_rq(); + + /* + * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting + * rq locks. We can probably be smarter and avoid bouncing if called + * from ops which don't hold a rq lock. + */ + if (flags & SCX_KICK_IDLE) { + struct rq *target_rq = cpu_rq(cpu); + + if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) + scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); + + if (raw_spin_rq_trylock(target_rq)) { + if (can_skip_idle_kick(target_rq)) { + raw_spin_rq_unlock(target_rq); + goto out; + } + raw_spin_rq_unlock(target_rq); + } + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); + } else { + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); + + if (flags & SCX_KICK_PREEMPT) + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); + if (flags & SCX_KICK_WAIT) + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait); + } + + irq_work_queue(&this_rq->scx.kick_cpus_irq_work); +out: + local_irq_restore(irq_flags); +} + +/** + * scx_bpf_dsq_nr_queued - Return the number of queued tasks + * @dsq_id: id of the DSQ + * + * Return the number of tasks in the DSQ matching @dsq_id. If not found, + * -%ENOENT is returned. Can be called from any non-sleepable online scx_ops + * operations. + */ +__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) +{ + struct scx_dispatch_q *dsq; + + lockdep_assert(rcu_read_lock_any_held()); + + if (dsq_id == SCX_DSQ_LOCAL) { + return this_rq()->scx.local_dsq.nr; + } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { + s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; + + if (ops_cpu_valid(cpu, NULL)) + return cpu_rq(cpu)->scx.local_dsq.nr; + } else { + dsq = find_non_local_dsq(dsq_id); + if (dsq) + return dsq->nr; + } + return -ENOENT; +} + +/** + * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state + * @cpu: cpu to test and clear idle for + * + * Returns %true if @cpu was idle and its idle state was successfully cleared. + * %false otherwise. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + */ +__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) +{ + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return false; + } + + if (ops_cpu_valid(cpu, NULL)) + return test_and_clear_cpu_idle(cpu); + else + return false; +} + +/** + * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu + * number on success. -%EBUSY if no matching cpu was found. + * + * Idle CPU tracking may race against CPU scheduling state transitions. For + * example, this function may return -%EBUSY as CPUs are transitioning into the + * idle state. If the caller then assumes that there will be dispatch events on + * the CPUs as they were all busy, the scheduler may end up stalling with CPUs + * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and + * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch + * event in the near future. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. 
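+ *
+ * For illustration, a sketch of the claim-and-kick pattern from an enqueue
+ * path (fallback and error handling omitted; the surrounding logic is
+ * assumed):
+ *
+ *	s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
+ *
+ *	if (cpu >= 0) {
+ *		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
+ *		scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
+ *	}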
+ */ +__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, + u64 flags) +{ + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return -EBUSY; + } + + return scx_pick_idle_cpu(cpus_allowed, flags); +} + +/** + * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is + * empty. + * + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not + * set, this function can't tell which CPUs are idle and will always pick any + * CPU. + */ +__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, + u64 flags) +{ + s32 cpu; + + if (static_branch_likely(&scx_builtin_idle_enabled)) { + cpu = scx_pick_idle_cpu(cpus_allowed, flags); + if (cpu >= 0) + return cpu; + } + + cpu = cpumask_any_distribute(cpus_allowed); + if (cpu < nr_cpu_ids) + return cpu; + else + return -EBUSY; +} + +/** + * scx_bpf_destroy_dsq - Destroy a custom DSQ + * @dsq_id: DSQ to destroy + * + * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with + * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is + * empty and no further tasks are dispatched to it. Ignored if called on a DSQ + * which doesn't exist. Can be called from any online scx_ops operations. + */ +__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) +{ + destroy_dsq(dsq_id); +} + +/** + * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() + * @p: task_struct to select a CPU for + * @prev_cpu: CPU @p was on previously + * @wake_flags: %SCX_WAKE_* flags + * @is_idle: out parameter indicating whether the returned CPU is idle + * + * Can only be called from ops.select_cpu() if the built-in CPU selection is + * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. + * @p, @prev_cpu and @wake_flags match ops.select_cpu(). + * + * Returns the picked CPU with *@is_idle indicating whether the picked CPU is + * currently idle and thus a good candidate for direct dispatching. 
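+ *
+ * A minimal ops.select_cpu() built on this helper, as also done by the
+ * example schedulers in this patch (sketch for illustration):
+ *
+ *	s32 BPF_STRUCT_OPS(sketch_select_cpu, struct task_struct *p,
+ *			   s32 prev_cpu, u64 wake_flags)
+ *	{
+ *		bool is_idle = false;
+ *		s32 cpu;
+ *
+ *		cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
+ *		if (is_idle)
+ *			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+ *		return cpu;
+ *	}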
+ */
+__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
+				       u64 wake_flags, bool *is_idle)
+{
+	if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) {
+		*is_idle = false;
+		return prev_cpu;
+	}
+
+	return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_ops_only)
+BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
+BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
+BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_ops_only)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_ops_only = {
+	.owner	= THIS_MODULE,
+	.set	= &scx_kfunc_ids_ops_only,
+};
+
+struct scx_bpf_error_bstr_bufs {
+	u64 data[MAX_BPRINTF_VARARGS];
+	char msg[SCX_EXIT_MSG_LEN];
+};
+
+static DEFINE_PER_CPU(struct scx_bpf_error_bstr_bufs, scx_bpf_error_bstr_bufs);
+
+static void bpf_exit_bstr_common(enum scx_exit_kind kind, s64 exit_code,
+				 char *fmt, unsigned long long *data,
+				 u32 data__sz)
+{
+	struct bpf_bprintf_data bprintf_data = { .get_bin_args = true };
+	struct scx_bpf_error_bstr_bufs *bufs;
+	unsigned long flags;
+	int ret;
+
+	BUILD_BUG_ON(sizeof(enum scx_exit_code) != sizeof_field(struct scx_exit_info, exit_code));
+	local_irq_save(flags);
+	bufs = this_cpu_ptr(&scx_bpf_error_bstr_bufs);
+
+	if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
+	    (data__sz && !data)) {
+		scx_ops_error("invalid data=%p and data__sz=%u",
+			      (void *)data, data__sz);
+		goto out_restore;
+	}
+
+	ret = copy_from_kernel_nofault(bufs->data, data, data__sz);
+	if (ret) {
+		scx_ops_error("failed to read data fields (%d)", ret);
+		goto out_restore;
+	}
+
+	ret = bpf_bprintf_prepare(fmt, UINT_MAX, bufs->data, data__sz / 8,
+				  &bprintf_data);
+	if (ret < 0) {
+		scx_ops_error("failed to prepare format string (%d)", ret);
+		goto out_restore;
+	}
+
+	ret = bstr_printf(bufs->msg, sizeof(bufs->msg), fmt,
+			  bprintf_data.bin_args);
+	bpf_bprintf_cleanup(&bprintf_data);
+	if (ret < 0) {
+		scx_ops_error("scx_ops_error(\"%s\", %p, %u) failed to format",
+			      fmt, data, data__sz);
+		goto out_restore;
+	}
+
+	scx_ops_exit_kind(kind, exit_code, "%s", bufs->msg);
+out_restore:
+	local_irq_restore(flags);
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler.
+ * @exit_code: Exit value to pass to user space via struct scx_exit_info.
+ * @fmt: error message format string
+ * @data: format string parameters packaged using ___bpf_fill() macro
+ * @data__sz: @data len, must end in '__sz' for the verifier
+ *
+ * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops
+ * disabling.
+ */
+__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
+				   unsigned long long *data, u32 data__sz)
+{
+	bpf_exit_bstr_common(SCX_EXIT_UNREG_BPF, exit_code, fmt, data,
+			     data__sz);
+}
+
+/**
+ * scx_bpf_error_bstr - Indicate fatal error
+ * @fmt: error message format string
+ * @data: format string parameters packaged using ___bpf_fill() macro
+ * @data__sz: @data len, must end in '__sz' for the verifier
+ *
+ * Indicate that the BPF scheduler encountered a fatal error and initiate ops
+ * disabling.
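+ *
+ * BPF schedulers normally invoke this through a wrapper that packs @data
+ * with ___bpf_fill(), such as the scx_bpf_error() macro in
+ * tools/sched_ext/include/scx/common.bpf.h, e.g.:
+ *
+ *	if (task_state_bad)	/* hypothetical condition */
+ *		scx_bpf_error("%s[%d] is in an unexpected state",
+ *			      p->comm, p->pid);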
+ */
+__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
+				    u32 data__sz)
+{
+	bpf_exit_bstr_common(SCX_EXIT_ERROR_BPF, 0, fmt, data,
+			     data__sz);
+}
+
+/**
+ * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
+ *
+ * All valid CPU IDs in the system are smaller than the returned value.
+ */
+__bpf_kfunc u32 scx_bpf_nr_cpu_ids(void)
+{
+	return nr_cpu_ids;
+}
+
+/**
+ * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
+ * @cpu: CPU of interest
+ *
+ * Return the maximum relative capacity of @cpu in relation to the most
+ * performant CPU in the system. The return value is in the range [1,
+ * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur().
+ */
+__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
+{
+	if (ops_cpu_valid(cpu, NULL))
+		return arch_scale_cpu_capacity(cpu);
+	else
+		return SCX_CPUPERF_ONE;
+}
+
+/**
+ * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU
+ * @cpu: CPU of interest
+ *
+ * Return the current relative performance of @cpu in relation to its maximum.
+ * The return value is in the range [1, %SCX_CPUPERF_ONE].
+ *
+ * The current performance level of a CPU in relation to the maximum performance
+ * available in the system can be calculated as follows:
+ *
+ *   scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE
+ *
+ * The result is in the range [1, %SCX_CPUPERF_ONE].
+ */
+__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
+{
+	if (ops_cpu_valid(cpu, NULL))
+		return arch_scale_freq_capacity(cpu);
+	else
+		return SCX_CPUPERF_ONE;
+}
+
+/**
+ * scx_bpf_cpuperf_set - Set the relative performance target of a CPU
+ * @cpu: CPU of interest
+ * @perf: target performance level [0, %SCX_CPUPERF_ONE]
+ *
+ * Set the target performance level of @cpu to @perf. @perf is in linear
+ * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
+ * schedutil cpufreq governor chooses the target frequency.
+ *
+ * The actual performance level chosen, CPU grouping, and the overhead and
+ * latency of the operations are dependent on the hardware and cpufreq driver in
+ * use. Consult hardware and cpufreq documentation for more information. The
+ * current performance level can be monitored using scx_bpf_cpuperf_cur().
+ */
+__bpf_kfunc void scx_bpf_cpuperf_set(u32 cpu, u32 perf)
+{
+	if (unlikely(perf > SCX_CPUPERF_ONE)) {
+		scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
+		return;
+	}
+
+	if (ops_cpu_valid(cpu, NULL)) {
+		struct rq *rq = cpu_rq(cpu);
+
+		rq->scx.cpuperf_target = perf;
+
+		rcu_read_lock_sched_notrace();
+		cpufreq_update_util(cpu_rq(cpu), 0);
+		rcu_read_unlock_sched_notrace();
+	}
+}
+
+/**
+ * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void)
+{
+	return cpu_possible_mask;
+}
+
+/**
+ * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void)
+{
+	return cpu_online_mask;
+}
+
+/**
+ * scx_bpf_put_cpumask - Release a possible/online cpumask
+ * @cpumask: cpumask to release
+ */
+__bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
+{
+	/*
+	 * Empty function body because we aren't actually acquiring or releasing
+	 * a reference to a global cpumask, which is read-only in the caller and
+	 * is never released. The acquire / release semantics here are just used
+	 * to make the cpumask a trusted pointer in the caller.
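+	 *
+	 * For illustration, the acquire/use/release pairing looks like this
+	 * (bpf_cpumask_test_cpu() is the generic BPF cpumask kfunc; the
+	 * surrounding logic is assumed):
+	 *
+	 *	const struct cpumask *online = scx_bpf_get_online_cpumask();
+	 *	bool on = bpf_cpumask_test_cpu(cpu, online);
+	 *
+	 *	scx_bpf_put_cpumask(online);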
+
+/**
+ * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void)
+{
+	return cpu_possible_mask;
+}
+
+/**
+ * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void)
+{
+	return cpu_online_mask;
+}
+
+/**
+ * scx_bpf_put_cpumask - Release a possible/online cpumask
+ * @cpumask: cpumask to release
+ */
+__bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
+{
+	/*
+	 * Empty function body because we aren't actually acquiring or
+	 * releasing a reference to a global cpumask, which is read-only in
+	 * the caller and is never released. The acquire / release semantics
+	 * here are just used to make the cpumask a trusted pointer in the
+	 * caller.
+	 */
+}
+
+/**
+ * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
+ * per-CPU cpumask.
+ *
+ * Returns an empty mask if idle tracking is not enabled, or running on a UP
+ * kernel.
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
+{
+	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
+		scx_ops_error("built-in idle tracking is disabled");
+		return cpu_none_mask;
+	}
+
+#ifdef CONFIG_SMP
+	return idle_masks.cpu;
+#else
+	return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
+ * per-physical-core cpumask. Can be used to determine if an entire physical
+ * core is free.
+ *
+ * Returns an empty mask if idle tracking is not enabled, or running on a UP
+ * kernel.
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
+{
+	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
+		scx_ops_error("built-in idle tracking is disabled");
+		return cpu_none_mask;
+	}
+
+#ifdef CONFIG_SMP
+	if (sched_smt_active())
+		return idle_masks.smt;
+	else
+		return idle_masks.cpu;
+#else
+	return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
+ * either the per-CPU or SMT idle-tracking cpumask.
+ * @idle_mask: cpumask to release
+ */
+__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
+{
+	/*
+	 * Empty function body because we aren't actually acquiring or
+	 * releasing a reference to a global idle cpumask, which is read-only
+	 * in the caller and is never released. The acquire / release semantics
+	 * here are just used to make the cpumask a trusted pointer in the
+	 * caller.
+	 */
+}
+
+/**
+ * scx_bpf_task_running - Is task currently running?
+ * @p: task of interest
+ */
+__bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p)
+{
+	return task_rq(p)->curr == p;
+}
+
+/**
+ * scx_bpf_task_cpu - CPU a task is currently associated with
+ * @p: task of interest
+ */
+__bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
+{
+	return task_cpu(p);
+}
+
+/**
+ * scx_bpf_task_cgroup - Return the sched cgroup of a task
+ * @p: task of interest
+ *
+ * @p->sched_task_group->css.cgroup represents the cgroup @p is associated
+ * with from the scheduler's POV. SCX operations should use this function to
+ * determine @p's current cgroup as, unlike following @p->cgroups,
+ * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t.
+ * all rq-locked operations. Can be called on the parameter tasks of rq-locked
+ * operations. The restriction guarantees that @p's rq is locked by the
+ * caller.
+ */
+#ifdef CONFIG_CGROUP_SCHED
+__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
+{
+	struct task_group *tg = p->sched_task_group;
+	struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
+
+	if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p))
+		goto out;
+
+	/*
+	 * A task_group may either be a cgroup or an autogroup. In the latter
+	 * case, @tg->css.cgroup is %NULL. A task_group can't become the other
+	 * kind once created.
+ */ + if (tg && tg->css.cgroup) + cgrp = tg->css.cgroup; + else + cgrp = &cgrp_dfl_root.cgrp; +out: + cgroup_get(cgrp); + return cgrp; +} +#endif + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_any) +BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) +BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) +BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) +#ifdef CONFIG_CGROUP_SCHED +BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) +#endif +BTF_KFUNCS_END(scx_kfunc_ids_any) + +static const struct btf_kfunc_id_set scx_kfunc_set_any = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_any, +}; + +static int __init scx_init(void) +{ + int ret; + + /* + * kfunc registration can't be done from init_sched_ext_class() as + * register_btf_kfunc_id_set() needs most of the system to be up. + * + * Some kfuncs are context-sensitive and can only be called from + * specific SCX ops. They are grouped into BTF sets accordingly. + * Unfortunately, BPF currently doesn't have a way of enforcing such + * restrictions. Eventually, the verifier should be able to enforce + * them. For now, register them the same and make each kfunc explicitly + * check using scx_kf_allowed(). + */ + if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_sleepable)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_enqueue_dispatch)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_dispatch)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_cpu_release)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_ops_only)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_any)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, + &scx_kfunc_set_any)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, + &scx_kfunc_set_any))) { + pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret); + return ret; + } + + ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); + if (ret) { + pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); + return ret; + } + + ret = register_pm_notifier(&scx_pm_notifier); + if (ret) { + pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret); + return ret; + } + + scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj); + if (!scx_kset) { + pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n"); + return -ENOMEM; + } + + ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group); + if (ret < 0) { + pr_err("sched_ext: Failed to add global attributes\n"); + return ret; + } + + return 0; +} +__initcall(scx_init); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h new file mode 100644 index 000000000..b443be80a --- /dev/null +++ b/kernel/sched/ext.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: 
Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifdef CONFIG_SCHED_CLASS_EXT + +struct sched_enq_and_set_ctx { + struct task_struct *p; + int queue_flags; + bool queued; + bool running; +}; + +void sched_deq_and_put_task(struct task_struct *p, int queue_flags, + struct sched_enq_and_set_ctx *ctx); +void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); + +extern const struct sched_class ext_sched_class; + +DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); +DECLARE_STATIC_KEY_FALSE(__scx_switched_all); +#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) +#define scx_switched_all() static_branch_unlikely(&__scx_switched_all) + +DECLARE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); + +static inline bool task_on_scx(const struct task_struct *p) +{ + return scx_enabled() && p->sched_class == &ext_sched_class; +} + +bool task_should_scx(struct task_struct *p); +void scx_pre_fork(struct task_struct *p); +int scx_fork(struct task_struct *p); +void scx_post_fork(struct task_struct *p); +void scx_cancel_fork(struct task_struct *p); +int scx_check_setscheduler(struct task_struct *p, int policy); +bool scx_can_stop_tick(struct rq *rq); +void scx_tick(struct rq *rq); +void scx_next_task_picked(struct rq *rq, struct task_struct *p, + const struct sched_class *active); +void init_sched_ext_class(void); + +static inline u32 scx_cpuperf_target(s32 cpu) +{ + if (scx_enabled()) + return cpu_rq(cpu)->scx.cpuperf_target; + else + return 0; +} + +static inline const struct sched_class *next_active_class(const struct sched_class *class) +{ + class++; + if (scx_switched_all() && class == &fair_sched_class) + class++; + if (!scx_enabled() && class == &ext_sched_class) + class++; + return class; +} + +#define for_active_class_range(class, _from, _to) \ + for (class = (_from); class != (_to); class = next_active_class(class)) + +#define for_each_active_class(class) \ + for_active_class_range(class, __sched_class_highest, __sched_class_lowest) + +/* + * SCX requires a balance() call before every pick_next_task() call including + * when waking up from idle. + */ +#define for_balance_class_range(class, prev_class, end_class) \ + for_active_class_range(class, (prev_class) > &ext_sched_class ? 
\ + &ext_sched_class : (prev_class), (end_class)) + +#ifdef CONFIG_SCHED_CORE +bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi); +#endif + +#else /* CONFIG_SCHED_CLASS_EXT */ + +#define scx_enabled() false +#define scx_switched_all() false + +static inline bool task_on_scx(const struct task_struct *p) { return false; } +static inline void scx_pre_fork(struct task_struct *p) {} +static inline int scx_fork(struct task_struct *p) { return 0; } +static inline void scx_post_fork(struct task_struct *p) {} +static inline void scx_cancel_fork(struct task_struct *p) {} +static inline int scx_check_setscheduler(struct task_struct *p, + int policy) { return 0; } +static inline bool scx_can_stop_tick(struct rq *rq) { return true; } +static inline void scx_tick(struct rq *rq) {} +static inline void scx_next_task_picked(struct rq *rq, struct task_struct *p, + const struct sched_class *active) {} +static inline void init_sched_ext_class(void) {} +static inline u32 scx_cpuperf_target(s32 cpu) { return 0; } + +#define for_each_active_class for_each_class +#define for_balance_class_range for_class_range + +#endif /* CONFIG_SCHED_CLASS_EXT */ + +#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP) +void __scx_update_idle(struct rq *rq, bool idle); + +static inline void scx_update_idle(struct rq *rq, bool idle) +{ + if (scx_enabled()) + __scx_update_idle(rq, idle); +} +#else +static inline void scx_update_idle(struct rq *rq, bool idle) {} +#endif + +#ifdef CONFIG_CGROUP_SCHED +#ifdef CONFIG_EXT_GROUP_SCHED +int scx_tg_online(struct task_group *tg); +void scx_tg_offline(struct task_group *tg); +int scx_cgroup_can_attach(struct cgroup_taskset *tset); +void scx_move_task(struct task_struct *p); +void scx_cgroup_finish_attach(void); +void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); +void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); +#else /* CONFIG_EXT_GROUP_SCHED */ +static inline int scx_tg_online(struct task_group *tg) { return 0; } +static inline void scx_tg_offline(struct task_group *tg) {} +static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } +static inline void scx_move_task(struct task_struct *p) {} +static inline void scx_cgroup_finish_attach(void) {} +static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} +static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} +#endif /* CONFIG_EXT_GROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 95bade948..ae42b39ad 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3854,7 +3854,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, } } -void reweight_task(struct task_struct *p, int prio) +static void reweight_task_fair(struct rq *rq, struct task_struct *p, int prio) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -8341,7 +8341,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * Batch and idle tasks do not preempt non-idle tasks (their preemption * is driven by the tick): */ - if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) + if (unlikely(!normal_policy(p->policy)) || !sched_feat(WAKEUP_PREEMPTION)) return; find_matching_se(&se, &pse); @@ -9302,28 +9302,18 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) { static bool __update_blocked_others(struct rq *rq, bool *done) { - const 
struct sched_class *curr_class; - u64 now = rq_clock_pelt(rq); - unsigned long thermal_pressure; - bool decayed; + bool updated; /* * update_load_avg() can call cpufreq_update_util(). Make sure that RT, * DL and IRQ signals have been updated before updating CFS. */ - curr_class = rq->curr->sched_class; - - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); - - decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | - update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | - update_irq_load_avg(rq, 0); + updated = update_other_load_avgs(rq); if (others_have_blocked(rq)) *done = false; - return decayed; + return updated; } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -12465,14 +12455,14 @@ void trigger_load_balance(struct rq *rq) nohz_balancer_kick(rq); } -static void rq_online_fair(struct rq *rq) +static void rq_online_fair(struct rq *rq, enum rq_onoff_reason reason) { update_sysctl(); update_runtime_enabled(rq); } -static void rq_offline_fair(struct rq *rq) +static void rq_offline_fair(struct rq *rq, enum rq_onoff_reason reason) { update_sysctl(); @@ -13154,6 +13144,7 @@ DEFINE_SCHED_CLASS(fair) = { .task_tick = task_tick_fair, .task_fork = task_fork_fair, + .reweight_task = reweight_task_fair, .prio_changed = prio_changed_fair, .switched_from = switched_from_fair, .switched_to = switched_to_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 6135fbe83..3b6540cc4 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -458,11 +458,13 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { + scx_update_idle(rq, false); } static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) { update_idle_core(rq); + scx_update_idle(rq, true); schedstat_inc(rq->sched_goidle); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3261b067b..8620474d1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2426,7 +2426,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) } /* Assumes rq->lock is held */ -static void rq_online_rt(struct rq *rq) +static void rq_online_rt(struct rq *rq, enum rq_onoff_reason reason) { if (rq->rt.overloaded) rt_set_overload(rq); @@ -2437,7 +2437,7 @@ static void rq_online_rt(struct rq *rq) } /* Assumes rq->lock is held */ -static void rq_offline_rt(struct rq *rq) +static void rq_offline_rt(struct rq *rq, enum rq_onoff_reason reason) { if (rq->rt.overloaded) rt_clear_overload(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f7d2fb6cc..2522f3553 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -173,9 +173,19 @@ static inline int idle_policy(int policy) { return policy == SCHED_IDLE; } + +static inline int normal_policy(int policy) +{ +#ifdef CONFIG_SCHED_CLASS_EXT + if (policy == SCHED_EXT) + return true; +#endif + return policy == SCHED_NORMAL; +} + static inline int fair_policy(int policy) { - return policy == SCHED_NORMAL || policy == SCHED_BATCH; + return normal_policy(policy) || policy == SCHED_BATCH; } static inline int rt_policy(int policy) @@ -223,6 +233,24 @@ static inline void update_avg(u64 *avg, u64 sample) #define shr_bound(val, shift) \ (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1)) +/* + * cgroup weight knobs should use the common MIN, DFL and MAX values which are + * 1, 100 and 10000 respectively. 
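+ * For example, sched_weight_from_cgroup(100) maps the default cgroup weight to 1024 shares, and sched_weight_to_cgroup(1024) returns exactly 100.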
While it loses a bit of range on both ends, it + * maps pretty well onto the shares value used by scheduler and the round-trip + * conversions preserve the original value over the entire range. + */ +static inline unsigned long sched_weight_from_cgroup(unsigned long cgrp_weight) +{ + return DIV_ROUND_CLOSEST_ULL(cgrp_weight * 1024, CGROUP_WEIGHT_DFL); +} + +static inline unsigned long sched_weight_to_cgroup(unsigned long weight) +{ + return clamp_t(unsigned long, + DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024), + CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); +} + /* * !! For sched_setattr_nocheck() (kernel) only !! * @@ -406,6 +434,11 @@ struct task_group { struct rt_bandwidth rt_bandwidth; #endif +#ifdef CONFIG_EXT_GROUP_SCHED + u32 scx_flags; /* SCX_TG_* */ + u32 scx_weight; +#endif + struct rcu_head rcu; struct list_head list; @@ -461,6 +494,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) return walk_tg_tree_from(&root_task_group, down, up, data); } +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct task_group, css) : NULL; +} + extern int tg_nop(struct task_group *tg, void *data); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -517,6 +555,11 @@ extern void set_task_rq_fair(struct sched_entity *se, static inline void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { } #endif /* CONFIG_SMP */ +#else /* CONFIG_FAIR_GROUP_SCHED */ +static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + return 0; +} #endif /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_CGROUP_SCHED */ @@ -677,6 +720,31 @@ struct cfs_rq { #endif /* CONFIG_FAIR_GROUP_SCHED */ }; +#ifdef CONFIG_SCHED_CLASS_EXT +/* scx_rq->flags, protected by the rq lock */ +enum scx_rq_flags { + SCX_RQ_BALANCING = 1 << 0, + SCX_RQ_CAN_STOP_TICK = 1 << 1, +}; + +struct scx_rq { + struct scx_dispatch_q local_dsq; + struct list_head runnable_list; /* runnable tasks on this rq */ + unsigned long ops_qseq; + u64 extra_enq_flags; /* see move_task_to_local_dsq() */ + u32 nr_running; + u32 flags; + u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */ + bool cpu_released; + cpumask_var_t cpus_to_kick; + cpumask_var_t cpus_to_kick_if_idle; + cpumask_var_t cpus_to_preempt; + cpumask_var_t cpus_to_wait; + unsigned long pnt_seq; + struct irq_work kick_cpus_irq_work; +}; +#endif /* CONFIG_SCHED_CLASS_EXT */ + static inline int rt_bandwidth_enabled(void) { return sysctl_sched_rt_runtime >= 0; @@ -1017,6 +1085,9 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; +#ifdef CONFIG_SCHED_CLASS_EXT + struct scx_rq scx; +#endif #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ @@ -2250,6 +2321,11 @@ extern const u32 sched_prio_to_wmult[40]; #define RETRY_TASK ((void *)-1UL) +enum rq_onoff_reason { + RQ_ONOFF_HOTPLUG, /* CPU is going on/offline */ + RQ_ONOFF_TOPOLOGY, /* sched domain topology update */ +}; + struct affinity_context { const struct cpumask *new_mask; struct cpumask *user_mask; @@ -2288,8 +2364,8 @@ struct sched_class { void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx); - void (*rq_online)(struct rq *rq); - void (*rq_offline)(struct rq *rq); + void (*rq_online)(struct rq *rq, enum rq_onoff_reason reason); + void (*rq_offline)(struct rq *rq, enum rq_onoff_reason reason); struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); #endif @@ -2303,8 +2379,11 @@ struct sched_class { * cannot assume the switched_from/switched_to 
pair is serialized by * rq->lock. They are however serialized by p->pi_lock. */ + void (*switching_to) (struct rq *this_rq, struct task_struct *task); void (*switched_from)(struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*reweight_task)(struct rq *this_rq, struct task_struct *task, + int newprio); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, int oldprio); @@ -2462,7 +2541,7 @@ extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); -extern void reweight_task(struct task_struct *p, int prio); +extern void __setscheduler_prio(struct task_struct *p, int prio); extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); @@ -2542,6 +2621,12 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); +extern void check_class_changing(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class); +extern void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio); + extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); #if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_CACHY) @@ -2823,8 +2908,8 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) raw_spin_rq_unlock(rq1); } -extern void set_rq_online (struct rq *rq); -extern void set_rq_offline(struct rq *rq); +extern void set_rq_online (struct rq *rq, enum rq_onoff_reason reason); +extern void set_rq_offline(struct rq *rq, enum rq_onoff_reason reason); extern bool sched_smp_initialized; #else /* CONFIG_SMP */ @@ -3007,6 +3092,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif #ifdef CONFIG_SMP +bool update_other_load_avgs(struct rq *rq); unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, unsigned long *min, unsigned long *max); @@ -3049,6 +3135,8 @@ static inline unsigned long cpu_util_rt(struct rq *rq) { return READ_ONCE(rq->avg_rt.util_avg); } +#else +static inline bool update_other_load_avgs(struct rq *rq) { return false; } #endif #ifdef CONFIG_UCLAMP_TASK @@ -3481,4 +3569,27 @@ static inline void init_sched_mm_cid(struct task_struct *t) { } extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); +#ifdef CONFIG_CGROUP_SCHED +enum cpu_cftype_id { +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) + CPU_CFTYPE_WEIGHT, + CPU_CFTYPE_WEIGHT_NICE, + CPU_CFTYPE_IDLE, +#endif +#ifdef CONFIG_CFS_BANDWIDTH + CPU_CFTYPE_MAX, + CPU_CFTYPE_MAX_BURST, +#endif +#ifdef CONFIG_UCLAMP_TASK_GROUP + CPU_CFTYPE_UCLAMP_MIN, + CPU_CFTYPE_UCLAMP_MAX, +#endif + CPU_CFTYPE_CNT, +}; + +extern struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1]; +#endif /* CONFIG_CGROUP_SCHED */ + +#include "ext.h" + #endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 99ea59860..12501543c 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -497,7 +497,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) old_rd = rq->rd; if (cpumask_test_cpu(rq->cpu, old_rd->online)) - set_rq_offline(rq); + set_rq_offline(rq, RQ_ONOFF_TOPOLOGY); cpumask_clear_cpu(rq->cpu, old_rd->span); @@ -515,7 +515,7 @@ void rq_attach_root(struct rq *rq, struct 
root_domain *rd) cpumask_set_cpu(rq->cpu, rd->span); if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) - set_rq_online(rq); + set_rq_online(rq, RQ_ONOFF_TOPOLOGY); rq_unlock_irqrestore(rq, &rf); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9dc605f08..802e4f77a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1053,9 +1053,15 @@ static unsigned long get_entry_ip(unsigned long fentry_ip) { u32 instr; - /* Being extra safe in here in case entry ip is on the page-edge. */ - if (get_kernel_nofault(instr, (u32 *) fentry_ip - 1)) - return fentry_ip; + /* We want to be extra safe in case entry ip is on the page edge, + * but otherwise we need to avoid get_kernel_nofault()'s overhead. + */ + if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) { + if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE))) + return fentry_ip; + } else { + instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE); + } if (is_endbr(instr)) fentry_ip -= ENDBR_INSN_SIZE; return fentry_ip; @@ -1525,8 +1531,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; case BPF_FUNC_get_current_task_btf: @@ -1582,8 +1586,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_send_signal_thread_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; - case BPF_FUNC_get_ns_current_pid_tgid: - return &bpf_get_ns_current_pid_tgid_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: @@ -2008,6 +2010,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_stackid_proto_raw_tp; case BPF_FUNC_get_stack: return &bpf_get_stack_proto_raw_tp; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_tracing; default: return bpf_tracing_func_proto(func_id, prog); } @@ -2070,6 +2074,9 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_func_arg_cnt: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL; case BPF_FUNC_get_attach_cookie: + if (prog->type == BPF_PROG_TYPE_TRACING && + prog->expected_attach_type == BPF_TRACE_RAW_TP) + return &bpf_get_attach_cookie_proto_tracing; return bpf_prog_has_trampoline(prog) ? 
&bpf_get_attach_cookie_proto_tracing : NULL; default: fn = raw_tp_prog_func_proto(func_id, prog); @@ -2370,16 +2377,26 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) } static __always_inline -void __bpf_trace_run(struct bpf_prog *prog, u64 *args) +void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) { + struct bpf_prog *prog = link->link.prog; + struct bpf_run_ctx *old_run_ctx; + struct bpf_trace_run_ctx run_ctx; + cant_sleep(); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); goto out; } + + run_ctx.bpf_cookie = link->cookie; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + rcu_read_lock(); (void) bpf_prog_run(prog, args); rcu_read_unlock(); + + bpf_reset_run_ctx(old_run_ctx); out: this_cpu_dec(*(prog->active)); } @@ -2408,12 +2425,12 @@ void __bpf_trace_run(struct bpf_prog *prog, u64 *args) #define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 #define BPF_TRACE_DEFN_x(x) \ - void bpf_trace_run##x(struct bpf_prog *prog, \ + void bpf_trace_run##x(struct bpf_raw_tp_link *link, \ REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \ { \ u64 args[x]; \ REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \ - __bpf_trace_run(prog, args); \ + __bpf_trace_run(link, args); \ } \ EXPORT_SYMBOL_GPL(bpf_trace_run##x) BPF_TRACE_DEFN_x(1); @@ -2429,9 +2446,10 @@ BPF_TRACE_DEFN_x(10); BPF_TRACE_DEFN_x(11); BPF_TRACE_DEFN_x(12); -static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link) { struct tracepoint *tp = btp->tp; + struct bpf_prog *prog = link->link.prog; /* * check that program doesn't access arguments beyond what's @@ -2443,18 +2461,12 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog * if (prog->aux->max_tp_access > btp->writable_size) return -EINVAL; - return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, - prog); -} - -int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) -{ - return __bpf_probe_register(btp, prog); + return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, link); } -int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link) { - return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); + return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, link); } int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 222c6d6c8..9581ef4ef 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -68,6 +68,7 @@ void dump_stack_print_info(const char *log_lvl) print_worker_info(log_lvl, current); print_stop_info(log_lvl, current); + print_scx_info(log_lvl, current); } /** diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index de33dc1b0..25b758448 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -133,7 +133,9 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, if (err < 0) goto out; - arch_protect_bpf_trampoline(image, PAGE_SIZE); + err = arch_protect_bpf_trampoline(image, PAGE_SIZE); + if (err) + goto out; prog_ret = dummy_ops_call_op(image, args); err = dummy_ops_copy_args(args); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 61efeadaf..f6aad4ed2 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -575,6 +575,13 @@ __bpf_kfunc int 
bpf_modify_return_test2(int a, int *b, short c, int d, return a + *b + c + d + (long)e + f + g; } +__bpf_kfunc int bpf_modify_return_test_tp(int nonce) +{ + trace_bpf_trigger_tp(nonce); + + return nonce; +} + int noinline bpf_fentry_shadow_test(int a) { return a + 1; @@ -622,6 +629,7 @@ __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_test_modify_return_ids) BTF_ID_FLAGS(func, bpf_modify_return_test) BTF_ID_FLAGS(func, bpf_modify_return_test2) +BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_test_modify_return_ids) diff --git a/net/core/datagram.c b/net/core/datagram.c index a8b625abe..e614cfd8e 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -324,25 +324,6 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(skb_free_datagram); -void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) -{ - bool slow; - - if (!skb_unref(skb)) { - sk_peek_offset_bwd(sk, len); - return; - } - - slow = lock_sock_fast(sk); - sk_peek_offset_bwd(sk, len); - skb_orphan(skb); - unlock_sock_fast(sk, slow); - - /* skb is now orphaned, can be freed outside of locked section */ - __kfree_skb(skb); -} -EXPORT_SYMBOL(__skb_free_datagram_locked); - int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, struct sk_buff *skb, unsigned int flags, void (*destructor)(struct sock *sk, diff --git a/net/core/dev.c b/net/core/dev.c index 331848eca..a3483a79a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -78,6 +78,7 @@ #include #include #include +#include #include #include #include @@ -197,35 +198,60 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } -static inline void rps_lock_irqsave(struct softnet_data *sd, - unsigned long *flags) +#ifndef CONFIG_PREEMPT_RT + +static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key); + +static int __init setup_backlog_napi_threads(char *arg) +{ + static_branch_enable(&use_backlog_threads_key); + return 0; +} +early_param("thread_backlog_napi", setup_backlog_napi_threads); + +static bool use_backlog_threads(void) +{ + return static_branch_unlikely(&use_backlog_threads_key); +} + +#else + +static bool use_backlog_threads(void) { - if (IS_ENABLED(CONFIG_RPS)) + return true; +} + +#endif + +static inline void backlog_lock_irq_save(struct softnet_data *sd, + unsigned long *flags) +{ + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_save(*flags); } -static inline void rps_lock_irq_disable(struct softnet_data *sd) +static inline void backlog_lock_irq_disable(struct softnet_data *sd) { - if (IS_ENABLED(CONFIG_RPS)) + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irq(&sd->input_pkt_queue.lock); else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_disable(); } -static inline void rps_unlock_irq_restore(struct softnet_data *sd, - unsigned long *flags) +static inline void backlog_unlock_irq_restore(struct softnet_data *sd, + unsigned long *flags) { - if (IS_ENABLED(CONFIG_RPS)) + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_restore(*flags); } -static inline void rps_unlock_irq_enable(struct softnet_data *sd) +static inline void backlog_unlock_irq_enable(struct softnet_data *sd) { - if (IS_ENABLED(CONFIG_RPS)) + if 
(IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irq(&sd->input_pkt_queue.lock); else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_enable(); @@ -4410,6 +4436,7 @@ EXPORT_SYMBOL(__dev_direct_xmit); /************************************************************************* * Receiver routines *************************************************************************/ +static DEFINE_PER_CPU(struct task_struct *, backlog_napi); unsigned int sysctl_skb_defer_max __read_mostly = 64; int weight_p __read_mostly = 64; /* old backlog weight */ @@ -4433,18 +4460,16 @@ static inline void ____napi_schedule(struct softnet_data *sd, */ thread = READ_ONCE(napi->thread); if (thread) { - /* Avoid doing set_bit() if the thread is in - * INTERRUPTIBLE state, cause napi_thread_wait() - * makes sure to proceed with napi polling - * if the thread is explicitly woken from here. - */ - if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE) - set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); + if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi)) + goto use_local_napi; + + set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); wake_up_process(thread); return; } } +use_local_napi: list_add_tail(&napi->poll_list, &sd->poll_list); WRITE_ONCE(napi->list_owner, smp_processor_id()); /* If not called from net_rx_action() @@ -4684,6 +4709,11 @@ static void napi_schedule_rps(struct softnet_data *sd) #ifdef CONFIG_RPS if (sd != mysd) { + if (use_backlog_threads()) { + __napi_schedule_irqoff(&sd->backlog); + return; + } + sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; @@ -4698,6 +4728,23 @@ static void napi_schedule_rps(struct softnet_data *sd) __napi_schedule_irqoff(&mysd->backlog); } +void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) +{ + unsigned long flags; + + if (use_backlog_threads()) { + backlog_lock_irq_save(sd, &flags); + + if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) + __napi_schedule_irqoff(&sd->backlog); + + backlog_unlock_irq_restore(sd, &flags); + + } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { + smp_call_function_single_async(cpu, &sd->defer_csd); + } +} + #ifdef CONFIG_NET_FLOW_LIMIT int netdev_flow_limit_table_len __read_mostly = (1 << 12); #endif @@ -4753,7 +4800,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, reason = SKB_DROP_REASON_NOT_SPECIFIED; sd = &per_cpu(softnet_data, cpu); - rps_lock_irqsave(sd, &flags); + backlog_lock_irq_save(sd, &flags); if (!netif_running(skb->dev)) goto drop; qlen = skb_queue_len(&sd->input_pkt_queue); @@ -4763,7 +4810,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); input_queue_tail_incr_save(sd, qtail); - rps_unlock_irq_restore(sd, &flags); + backlog_unlock_irq_restore(sd, &flags); return NET_RX_SUCCESS; } @@ -4778,7 +4825,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, drop: sd->dropped++; - rps_unlock_irq_restore(sd, &flags); + backlog_unlock_irq_restore(sd, &flags); dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); @@ -5844,7 +5891,7 @@ static void flush_backlog(struct work_struct *work) local_bh_disable(); sd = this_cpu_ptr(&softnet_data); - rps_lock_irq_disable(sd); + backlog_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); @@ -5852,7 +5899,7 @@ static void flush_backlog(struct work_struct *work) input_queue_head_incr(sd); } } - 
rps_unlock_irq_enable(sd); + backlog_unlock_irq_enable(sd); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { @@ -5870,14 +5917,14 @@ static bool flush_required(int cpu) struct softnet_data *sd = &per_cpu(softnet_data, cpu); bool do_flush; - rps_lock_irq_disable(sd); + backlog_lock_irq_disable(sd); /* as insertion into process_queue happens with the rps lock held, * process_queue access may race only with dequeue */ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || !skb_queue_empty_lockless(&sd->process_queue); - rps_unlock_irq_enable(sd); + backlog_unlock_irq_enable(sd); return do_flush; #endif @@ -5943,7 +5990,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) #ifdef CONFIG_RPS struct softnet_data *remsd = sd->rps_ipi_list; - if (remsd) { + if (!use_backlog_threads() && remsd) { sd->rps_ipi_list = NULL; local_irq_enable(); @@ -5958,7 +6005,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) { #ifdef CONFIG_RPS - return sd->rps_ipi_list != NULL; + return !use_backlog_threads() && sd->rps_ipi_list; #else return false; #endif @@ -5992,7 +6039,7 @@ static int process_backlog(struct napi_struct *napi, int quota) } - rps_lock_irq_disable(sd); + backlog_lock_irq_disable(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). @@ -6002,13 +6049,13 @@ static int process_backlog(struct napi_struct *napi, int quota) * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. */ - napi->state = 0; + napi->state &= NAPIF_STATE_THREADED; again = false; } else { skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); } - rps_unlock_irq_enable(sd); + backlog_unlock_irq_enable(sd); } return work; @@ -6716,8 +6763,6 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) static int napi_thread_wait(struct napi_struct *napi) { - bool woken = false; - set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { @@ -6726,15 +6771,13 @@ static int napi_thread_wait(struct napi_struct *napi) * Testing SCHED bit is not enough because SCHED bit might be * set by some other busy poll thread or by napi_disable(). */ - if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { + if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) { WARN_ON(!list_empty(&napi->poll_list)); __set_current_state(TASK_RUNNING); return 0; } schedule(); - /* woken being true indicates this thread owns this napi. 
*/ - woken = true; set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); @@ -6742,43 +6785,48 @@ static int napi_thread_wait(struct napi_struct *napi) return -1; } -static int napi_threaded_poll(void *data) +static void napi_threaded_poll_loop(struct napi_struct *napi) { - struct napi_struct *napi = data; struct softnet_data *sd; - void *have; + unsigned long last_qs = jiffies; - while (!napi_thread_wait(napi)) { - unsigned long last_qs = jiffies; - - for (;;) { - bool repoll = false; + for (;;) { + bool repoll = false; + void *have; - local_bh_disable(); - sd = this_cpu_ptr(&softnet_data); - sd->in_napi_threaded_poll = true; + local_bh_disable(); + sd = this_cpu_ptr(&softnet_data); + sd->in_napi_threaded_poll = true; - have = netpoll_poll_lock(napi); - __napi_poll(napi, &repoll); - netpoll_poll_unlock(have); + have = netpoll_poll_lock(napi); + __napi_poll(napi, &repoll); + netpoll_poll_unlock(have); - sd->in_napi_threaded_poll = false; - barrier(); + sd->in_napi_threaded_poll = false; + barrier(); - if (sd_has_rps_ipi_waiting(sd)) { - local_irq_disable(); - net_rps_action_and_irq_enable(sd); - } - skb_defer_free_flush(sd); - local_bh_enable(); + if (sd_has_rps_ipi_waiting(sd)) { + local_irq_disable(); + net_rps_action_and_irq_enable(sd); + } + skb_defer_free_flush(sd); + local_bh_enable(); - if (!repoll) - break; + if (!repoll) + break; - rcu_softirq_qs_periodic(last_qs); - cond_resched(); - } + rcu_softirq_qs_periodic(last_qs); + cond_resched(); } +} + +static int napi_threaded_poll(void *data) +{ + struct napi_struct *napi = data; + + while (!napi_thread_wait(napi)) + napi_threaded_poll_loop(napi); + return 0; } @@ -11379,7 +11427,7 @@ static int dev_cpu_dead(unsigned int oldcpu) list_del_init(&napi->poll_list); if (napi->poll == process_backlog) - napi->state = 0; + napi->state &= NAPIF_STATE_THREADED; else ____napi_schedule(sd, napi); } @@ -11387,12 +11435,14 @@ static int dev_cpu_dead(unsigned int oldcpu) raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); + if (!use_backlog_threads()) { #ifdef CONFIG_RPS - remsd = oldsd->rps_ipi_list; - oldsd->rps_ipi_list = NULL; + remsd = oldsd->rps_ipi_list; + oldsd->rps_ipi_list = NULL; #endif - /* send out pending IPI's on offline CPU */ - net_rps_send_ipi(remsd); + /* send out pending IPI's on offline CPU */ + net_rps_send_ipi(remsd); + } /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { @@ -11718,7 +11768,7 @@ static int net_page_pool_create(int cpuid) struct page_pool_params page_pool_params = { .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE, .flags = PP_FLAG_SYSTEM_POOL, - .nid = NUMA_NO_NODE, + .nid = cpu_to_mem(cpuid), }; struct page_pool *pp_ptr; @@ -11731,6 +11781,38 @@ static int net_page_pool_create(int cpuid) return 0; } +static int backlog_napi_should_run(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + struct napi_struct *napi = &sd->backlog; + + return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state); +} + +static void run_backlog_napi(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + + napi_threaded_poll_loop(&sd->backlog); +} + +static void backlog_napi_setup(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + struct napi_struct *napi = &sd->backlog; + + napi->thread = this_cpu_read(backlog_napi); + set_bit(NAPI_STATE_THREADED, &napi->state); +} + +static struct smp_hotplug_thread backlog_threads = { + .store = &backlog_napi, + .thread_should_run = 
backlog_napi_should_run, + .thread_fn = run_backlog_napi, + .thread_comm = "backlog_napi/%u", + .setup = backlog_napi_setup, +}; + /* * This is called single threaded during boot, so no need * to take the rtnl semaphore. @@ -11782,10 +11864,13 @@ static int __init net_dev_init(void) init_gro_hash(&sd->backlog); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; + INIT_LIST_HEAD(&sd->backlog.poll_list); if (net_page_pool_create(i)) goto out; } + if (use_backlog_threads()) + smpboot_register_percpu_thread(&backlog_threads); dev_boot_phase = 0; diff --git a/net/core/filter.c b/net/core/filter.c index 8adf95765..786d792ac 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -87,6 +87,9 @@ #include "dev.h" +/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */ +static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check"); + static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -5884,7 +5887,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); } else { - fl4.flowi4_mark = 0; + if (flags & BPF_FIB_LOOKUP_MARK) + fl4.flowi4_mark = params->mark; + else + fl4.flowi4_mark = 0; fl4.flowi4_secid = 0; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); @@ -6027,7 +6033,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res, strict); } else { - fl6.flowi6_mark = 0; + if (flags & BPF_FIB_LOOKUP_MARK) + fl6.flowi6_mark = params->mark; + else + fl6.flowi6_mark = 0; fl6.flowi6_secid = 0; fl6.flowi6_tun_key.tun_id = 0; fl6.flowi6_uid = sock_net_uid(net, NULL); @@ -6105,7 +6114,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \ - BPF_FIB_LOOKUP_SRC) + BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK) BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) @@ -8342,8 +8351,6 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_event_output_data_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b99127712..17617c29b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -7039,8 +7039,8 @@ nodefer: __kfree_skb(skb); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). 
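+ * With backlog NAPI threads enabled, kick_defer_list_purge() wakes the remote CPU's backlog thread instead of sending the IPI.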
*/ - if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) - smp_call_function_single_async(cpu, &sd->defer_csd); + if (unlikely(kick)) + kick_defer_list_purge(sd, cpu); } static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 61ed3e549..dc38a1131 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -2356,8 +2356,6 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { }; BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) -#ifdef CONFIG_X86 -#ifdef CONFIG_DYNAMIC_FTRACE BTF_ID_FLAGS(func, bbr_init) BTF_ID_FLAGS(func, bbr_main) BTF_ID_FLAGS(func, bbr_sndbuf_expand) @@ -2367,8 +2365,6 @@ BTF_ID_FLAGS(func, bbr_cwnd_event) BTF_ID_FLAGS(func, bbr_ssthresh) BTF_ID_FLAGS(func, bbr_tso_segs) BTF_ID_FLAGS(func, bbr_set_state) -#endif -#endif BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = { diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 44869ea08..5dbed91c6 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -486,16 +486,12 @@ static struct tcp_congestion_ops cubictcp __read_mostly = { }; BTF_KFUNCS_START(tcp_cubic_check_kfunc_ids) -#ifdef CONFIG_X86 -#ifdef CONFIG_DYNAMIC_FTRACE BTF_ID_FLAGS(func, cubictcp_init) BTF_ID_FLAGS(func, cubictcp_recalc_ssthresh) BTF_ID_FLAGS(func, cubictcp_cong_avoid) BTF_ID_FLAGS(func, cubictcp_state) BTF_ID_FLAGS(func, cubictcp_cwnd_event) BTF_ID_FLAGS(func, cubictcp_acked) -#endif -#endif BTF_KFUNCS_END(tcp_cubic_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = { diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index e33fbe493..6b712a33d 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -261,16 +261,12 @@ static struct tcp_congestion_ops dctcp_reno __read_mostly = { }; BTF_KFUNCS_START(tcp_dctcp_check_kfunc_ids) -#ifdef CONFIG_X86 -#ifdef CONFIG_DYNAMIC_FTRACE BTF_ID_FLAGS(func, dctcp_init) BTF_ID_FLAGS(func, dctcp_update_alpha) BTF_ID_FLAGS(func, dctcp_cwnd_event) BTF_ID_FLAGS(func, dctcp_ssthresh) BTF_ID_FLAGS(func, dctcp_cwnd_undo) BTF_ID_FLAGS(func, dctcp_state) -#endif -#endif BTF_KFUNCS_END(tcp_dctcp_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = { diff --git a/tools/Makefile b/tools/Makefile index 276f5d0d5..278d24723 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -28,6 +28,7 @@ help: @echo ' pci - PCI tools' @echo ' perf - Linux performance measurement and analysis tool' @echo ' selftests - various kernel selftests' + @echo ' sched_ext - sched_ext example schedulers' @echo ' bootconfig - boot config tool' @echo ' spi - spi tools' @echo ' tmon - thermal monitoring and tuning tool' @@ -91,6 +92,9 @@ perf: FORCE $(Q)mkdir -p $(PERF_O) . $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= +sched_ext: FORCE + $(call descend,sched_ext) + selftests: FORCE $(call descend,testing/$@) @@ -184,6 +188,9 @@ perf_clean: $(Q)mkdir -p $(PERF_O) . 
$(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean +sched_ext_clean: + $(call descend,sched_ext,clean) + selftests_clean: $(call descend,testing/$(@:_clean=),clean) @@ -213,6 +220,7 @@ clean: acpi_clean counter_clean cpupower_clean hv_clean firewire_clean \ mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ freefall_clean build_clean libbpf_clean libsubcmd_clean \ gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \ - intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean + intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean \ + sched_ext_clean .PHONY: FORCE diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index e9154ace8..b67454b45 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -89,6 +89,10 @@ ifneq ($(EXTRA_LDFLAGS),) LDFLAGS += $(EXTRA_LDFLAGS) endif +HOST_CFLAGS := $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\ + $(subst $(CLANG_CROSS_FLAGS),,$(CFLAGS))) +HOST_LDFLAGS := $(LDFLAGS) + INSTALL ?= install RM ?= rm -f @@ -178,12 +182,9 @@ ifeq ($(filter -DHAVE_LLVM_SUPPORT -DHAVE_LIBBFD_SUPPORT,$(CFLAGS)),) SRCS := $(filter-out jit_disasm.c,$(SRCS)) endif -HOST_CFLAGS = $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\ - $(subst $(CLANG_CROSS_FLAGS),,$(CFLAGS))) - BPFTOOL_BOOTSTRAP := $(BOOTSTRAP_OUTPUT)bpftool -BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o xlated_dumper.o btf_dumper.o disasm.o) +BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o) $(BOOTSTRAP_OBJS): $(LIBBPF_BOOTSTRAP) OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o @@ -231,14 +232,11 @@ endif CFLAGS += $(if $(BUILD_BPF_SKELS),,-DBPFTOOL_WITHOUT_SKELETONS) -$(BOOTSTRAP_OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c - $(QUIET_CC)$(HOSTCC) $(HOST_CFLAGS) -c -MMD $< -o $@ - $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD $< -o $@ $(BPFTOOL_BOOTSTRAP): $(BOOTSTRAP_OBJS) $(LIBBPF_BOOTSTRAP) - $(QUIET_LINK)$(HOSTCC) $(HOST_CFLAGS) $(LDFLAGS) $(BOOTSTRAP_OBJS) $(LIBS_BOOTSTRAP) -o $@ + $(QUIET_LINK)$(HOSTCC) $(HOST_CFLAGS) $(HOST_LDFLAGS) $(BOOTSTRAP_OBJS) $(LIBS_BOOTSTRAP) -o $@ $(OUTPUT)bpftool: $(OBJS) $(LIBBPF) $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $(OBJS) $(LIBS) -o $@ diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 708733b0e..c754a428c 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -664,7 +664,8 @@ probe_helper_ifindex(enum bpf_func_id id, enum bpf_prog_type prog_type, probe_prog_load_ifindex(prog_type, insns, ARRAY_SIZE(insns), buf, sizeof(buf), ifindex); - res = !grep(buf, "invalid func ") && !grep(buf, "unknown func "); + res = !grep(buf, "invalid func ") && !grep(buf, "unknown func ") && + !grep(buf, "program of this type cannot use helper "); switch (get_vendor_id(ifindex)) { case 0x19ee: /* Netronome specific */ diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 540c0f2c4..786268f1a 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -1131,7 +1131,8 @@ static void gen_st_ops_shadow_init(struct btf *btf, struct bpf_object *obj) continue; codegen("\ \n\ - obj->struct_ops.%1$s = bpf_map__initial_value(obj->maps.%1$s, NULL);\n\ + obj->struct_ops.%1$s = (typeof(obj->struct_ops.%1$s))\n\ + bpf_map__initial_value(obj->maps.%1$s, NULL);\n\ \n\ ", ident); } diff --git 
a/tools/bpf/bpftool/pids.c b/tools/bpf/bpftool/pids.c index 00c77edb6..9b898571b 100644 --- a/tools/bpf/bpftool/pids.c +++ b/tools/bpf/bpftool/pids.c @@ -101,7 +101,6 @@ int build_obj_refs_table(struct hashmap **map, enum bpf_obj_type type) char buf[4096 / sizeof(*e) * sizeof(*e)]; struct pid_iter_bpf *skel; int err, ret, fd = -1, i; - libbpf_print_fn_t default_print; *map = hashmap__new(hash_fn_for_key_as_id, equal_fn_for_key_as_id, NULL); if (IS_ERR(*map)) { @@ -118,12 +117,18 @@ int build_obj_refs_table(struct hashmap **map, enum bpf_obj_type type) skel->rodata->obj_type = type; - /* we don't want output polluted with libbpf errors if bpf_iter is not - * supported - */ - default_print = libbpf_set_print(libbpf_print_none); - err = pid_iter_bpf__load(skel); - libbpf_set_print(default_print); + if (!verifier_logs) { + libbpf_print_fn_t default_print; + + /* Unless debug information is on, we don't want the output to + * be polluted with libbpf errors if bpf_iter is not supported. + */ + default_print = libbpf_set_print(libbpf_print_none); + err = pid_iter_bpf__load(skel); + libbpf_set_print(default_print); + } else { + err = pid_iter_bpf__load(skel); + } if (err) { /* too bad, kernel doesn't support BPF iterators yet */ err = 0; diff --git a/tools/bpf/bpftool/skeleton/pid_iter.bpf.c b/tools/bpf/bpftool/skeleton/pid_iter.bpf.c index 26004f0c5..7bdbcac3c 100644 --- a/tools/bpf/bpftool/skeleton/pid_iter.bpf.c +++ b/tools/bpf/bpftool/skeleton/pid_iter.bpf.c @@ -102,8 +102,8 @@ int iter(struct bpf_iter__task_file *ctx) BPF_LINK_TYPE_PERF_EVENT___local)) { struct bpf_link *link = (struct bpf_link *) file->private_data; - if (link->type == bpf_core_enum_value(enum bpf_link_type___local, - BPF_LINK_TYPE_PERF_EVENT___local)) { + if (BPF_CORE_READ(link, type) == bpf_core_enum_value(enum bpf_link_type___local, + BPF_LINK_TYPE_PERF_EVENT___local)) { e.has_bpf_cookie = true; e.bpf_cookie = get_bpf_cookie(link); } diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h index 7b65566f3..8a63a9913 100644 --- a/tools/include/linux/compiler.h +++ b/tools/include/linux/compiler.h @@ -58,6 +58,10 @@ #define noinline #endif +#ifndef __nocf_check +#define __nocf_check __attribute__((nocf_check)) +#endif + /* Are two types/vars the same type (ignoring qualifiers)? */ #ifndef __same_type # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3c42b9f1b..96d57e483 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1662,8 +1662,10 @@ union bpf_attr { } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ - __u64 name; - __u32 prog_fd; + __u64 name; + __u32 prog_fd; + __u32 :32; + __aligned_u64 cookie; } raw_tracepoint; struct { /* anonymous struct for BPF_BTF_LOAD */ @@ -3392,6 +3394,10 @@ union bpf_attr { * for the nexthop. If the src addr cannot be derived, * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this * case, *params*->dmac and *params*->smac are not set either. + * **BPF_FIB_LOOKUP_MARK** + * Use the mark present in *params*->mark for the fib lookup. + * This option should not be used with BPF_FIB_LOOKUP_DIRECT, + * as it only has meaning for full lookups. * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. 
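The new flag slots into the existing bpf_fib_lookup() contract. A hedged
sketch of a tc classifier consulting a mark-aware route (the destination
address and the surrounding includes are illustrative):

	SEC("tc")
	int fib_mark_demo(struct __sk_buff *skb)
	{
		struct bpf_fib_lookup params = {};

		params.family   = AF_INET;
		params.ifindex  = skb->ingress_ifindex;
		params.ipv4_dst = bpf_htonl(0x08080808);
		params.mark     = skb->mark; /* consulted only due to the flag */

		if (bpf_fib_lookup(skb, &params, sizeof(params),
				   BPF_FIB_LOOKUP_MARK) == BPF_FIB_LKUP_RET_SUCCESS)
			return TC_ACT_OK; /* params.smac/dmac hold the nexthop MACs */
		return TC_ACT_SHOT;
	}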
@@ -7118,6 +7124,7 @@ enum { BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), BPF_FIB_LOOKUP_TBID = (1U << 3), BPF_FIB_LOOKUP_SRC = (1U << 4), + BPF_FIB_LOOKUP_MARK = (1U << 5), }; enum { @@ -7195,8 +7202,19 @@ struct bpf_fib_lookup { __u32 tbid; }; - __u8 smac[6]; /* ETH_ALEN */ - __u8 dmac[6]; /* ETH_ALEN */ + union { + /* input */ + struct { + __u32 mark; /* policy routing */ + /* 2 4-byte holes for input */ + }; + + /* output: source and dest mac */ + struct { + __u8 smac[6]; /* ETH_ALEN */ + __u8 dmac[6]; /* ETH_ALEN */ + }; + }; }; struct bpf_redir_neigh { diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 97ec005c3..c9f4e04f3 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -785,6 +785,7 @@ int bpf_link_create(int prog_fd, int target_fd, if (!OPTS_ZEROED(opts, uprobe_multi)) return libbpf_err(-EINVAL); break; + case BPF_TRACE_RAW_TP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: @@ -1173,20 +1174,31 @@ int bpf_link_get_info_by_fd(int link_fd, struct bpf_link_info *info, __u32 *info return bpf_obj_get_info_by_fd(link_fd, info, info_len); } -int bpf_raw_tracepoint_open(const char *name, int prog_fd) +int bpf_raw_tracepoint_open_opts(int prog_fd, struct bpf_raw_tp_opts *opts) { const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint); union bpf_attr attr; int fd; + if (!OPTS_VALID(opts, bpf_raw_tp_opts)) + return libbpf_err(-EINVAL); + memset(&attr, 0, attr_sz); - attr.raw_tracepoint.name = ptr_to_u64(name); attr.raw_tracepoint.prog_fd = prog_fd; + attr.raw_tracepoint.name = ptr_to_u64(OPTS_GET(opts, tp_name, NULL)); + attr.raw_tracepoint.cookie = OPTS_GET(opts, cookie, 0); fd = sys_bpf_fd(BPF_RAW_TRACEPOINT_OPEN, &attr, attr_sz); return libbpf_err_errno(fd); } +int bpf_raw_tracepoint_open(const char *name, int prog_fd) +{ + LIBBPF_OPTS(bpf_raw_tp_opts, opts, .tp_name = name); + + return bpf_raw_tracepoint_open_opts(prog_fd, &opts); +} + int bpf_btf_load(const void *btf_data, size_t btf_size, struct bpf_btf_load_opts *opts) { const size_t attr_sz = offsetofend(union bpf_attr, btf_token_fd); diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index df0db2f0c..972e17ec0 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -617,6 +617,15 @@ LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt); +struct bpf_raw_tp_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + const char *tp_name; + __u64 cookie; + size_t :0; +}; +#define bpf_raw_tp_opts__last_field cookie + +LIBBPF_API int bpf_raw_tracepoint_open_opts(int prog_fd, struct bpf_raw_tp_opts *opts); LIBBPF_API int bpf_raw_tracepoint_open(const char *name, int prog_fd); LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a2061fcd6..b091154bc 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1132,8 +1132,26 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) const char *mname; mname = btf__name_by_offset(btf, member->name_off); + moff = member->offset / 8; + mdata = data + moff; + msize = btf__resolve_size(btf, member->type); + if (msize < 0) { + pr_warn("struct_ops init_kern %s: failed to resolve the size of member %s\n", + map->name, mname); + return msize; + } + kern_member = find_member_by_name(kern_btf, kern_type, mname); if (!kern_member) { + /* Skip all zeros or null fields if they are not 
+ * present in the kernel BTF. + */ + if (libbpf_is_mem_zeroed(mdata, msize)) { + pr_info("struct_ops %s: member %s not found in kernel, skipping it as it's set to zero\n", + map->name, mname); + continue; + } + + pr_warn("struct_ops init_kern %s: Cannot find member %s in kernel BTF\n", map->name, mname); return -ENOTSUP; @@ -1147,10 +1165,7 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) return -ENOTSUP; } - moff = member->offset / 8; kern_moff = kern_member->offset / 8; - - mdata = data + moff; kern_mdata = kern_data + kern_moff; mtype = skip_mods_and_typedefs(btf, member->type, &mtype_id); @@ -1230,9 +1245,8 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) continue; } - msize = btf__resolve_size(btf, mtype_id); kern_msize = btf__resolve_size(kern_btf, kern_mtype_id); - if (msize < 0 || kern_msize < 0 || msize != kern_msize) { + if (kern_msize < 0 || msize != kern_msize) { pr_warn("struct_ops init_kern %s: Error in size of member %s: %zd != %zd(kernel)\n", map->name, mname, (ssize_t)msize, (ssize_t)kern_msize); @@ -1956,6 +1970,20 @@ static struct extern_desc *find_extern_by_name(const struct bpf_object *obj, return NULL; } +static struct extern_desc *find_extern_by_name_with_len(const struct bpf_object *obj, + const void *name, int len) +{ + const char *ext_name; + int i; + + for (i = 0; i < obj->nr_extern; i++) { + ext_name = obj->externs[i].name; + if (strlen(ext_name) == len && strncmp(ext_name, name, len) == 0) + return &obj->externs[i]; + } + return NULL; +} + static int set_kcfg_value_tri(struct extern_desc *ext, void *ext_val, char value) { @@ -7321,9 +7349,9 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog char *cp, errmsg[STRERR_BUFSIZE]; size_t log_buf_size = 0; char *log_buf = NULL, *tmp; - int btf_fd, ret, err; bool own_log_buf = true; __u32 log_level = prog->log_level; + int ret, err; if (prog->type == BPF_PROG_TYPE_UNSPEC) { /* @@ -7347,9 +7375,8 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog load_attr.prog_ifindex = prog->prog_ifindex; /* specify func_info/line_info only if kernel supports them */ - btf_fd = btf__fd(obj->btf); - if (btf_fd >= 0 && kernel_supports(obj, FEAT_BTF_FUNC)) { - load_attr.prog_btf_fd = btf_fd; + if (obj->btf && btf__fd(obj->btf) >= 0 && kernel_supports(obj, FEAT_BTF_FUNC)) { + load_attr.prog_btf_fd = btf__fd(obj->btf); load_attr.func_info = prog->func_info; load_attr.func_info_rec_size = prog->func_info_rec_size; load_attr.func_info_cnt = prog->func_info_cnt; @@ -7973,7 +8000,10 @@ static int bpf_object__sanitize_maps(struct bpf_object *obj) return 0; } -int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *ctx) +typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type, + const char *sym_name, void *ctx); + +static int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *ctx) { char sym_type, sym_name[500]; unsigned long long sym_addr; @@ -8013,8 +8043,13 @@ static int kallsyms_cb(unsigned long long sym_addr, char sym_type, struct bpf_object *obj = ctx; const struct btf_type *t; struct extern_desc *ext; + char *res; - ext = find_extern_by_name(obj, sym_name); + res = strstr(sym_name, ".llvm."); + if (sym_type == 'd' && res) + ext = find_extern_by_name_with_len(obj, sym_name, res - sym_name); + else + ext = find_extern_by_name(obj, sym_name); if (!ext || ext->type != EXT_KSYM) return 0; @@ -8563,6 +8598,11 @@ int bpf_map__pin(struct bpf_map *map, const char *path) return libbpf_err(-EINVAL); } + if (map->fd < 0) { + pr_warn("map '%s': can't 
pin BPF map without FD (was it created?)\n", map->name); + return libbpf_err(-EINVAL); + } + if (map->pin_path) { if (path && strcmp(path, map->pin_path)) { pr_warn("map '%s' already has pin path '%s' different from '%s'\n", @@ -9298,6 +9338,7 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("sockops", SOCK_OPS, BPF_CGROUP_SOCK_OPS, SEC_ATTACHABLE_OPT), SEC_DEF("sk_skb/stream_parser", SK_SKB, BPF_SK_SKB_STREAM_PARSER, SEC_ATTACHABLE_OPT), SEC_DEF("sk_skb/stream_verdict",SK_SKB, BPF_SK_SKB_STREAM_VERDICT, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb/verdict", SK_SKB, BPF_SK_SKB_VERDICT, SEC_ATTACHABLE_OPT), SEC_DEF("sk_skb", SK_SKB, 0, SEC_NONE), SEC_DEF("sk_msg", SK_MSG, BPF_SK_MSG_VERDICT, SEC_ATTACHABLE_OPT), SEC_DEF("lirc_mode2", LIRC_MODE2, BPF_LIRC_MODE2, SEC_ATTACHABLE_OPT), @@ -10307,6 +10348,11 @@ static int validate_map_op(const struct bpf_map *map, size_t key_sz, return -EINVAL; } + if (map->fd < 0) { + pr_warn("map '%s': can't use BPF map without FD (was it created?)\n", map->name); + return -EINVAL; + } + if (!check_value_sz) return 0; @@ -10419,8 +10465,15 @@ long libbpf_get_error(const void *ptr) int bpf_link__update_program(struct bpf_link *link, struct bpf_program *prog) { int ret; + int prog_fd = bpf_program__fd(prog); + + if (prog_fd < 0) { + pr_warn("prog '%s': can't use BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err(-EINVAL); + } - ret = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), NULL); + ret = bpf_link_update(bpf_link__fd(link), prog_fd, NULL); return libbpf_err_errno(ret); } @@ -10614,7 +10667,7 @@ struct bpf_link *bpf_program__attach_perf_event_opts(const struct bpf_program *p } prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { - pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n", + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", prog->name); return libbpf_err_ptr(-EINVAL); } @@ -11338,6 +11391,13 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, if (!OPTS_VALID(opts, bpf_kprobe_multi_opts)) return libbpf_err_ptr(-EINVAL); + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + syms = OPTS_GET(opts, syms, false); addrs = OPTS_GET(opts, addrs, false); cnt = OPTS_GET(opts, cnt, false); @@ -11378,7 +11438,6 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, } link->detach = &bpf_link__detach_fd; - prog_fd = bpf_program__fd(prog); link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_KPROBE_MULTI, &lopts); if (link_fd < 0) { err = -errno; @@ -11761,6 +11820,13 @@ bpf_program__attach_uprobe_multi(const struct bpf_program *prog, if (!OPTS_VALID(opts, bpf_uprobe_multi_opts)) return libbpf_err_ptr(-EINVAL); + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + syms = OPTS_GET(opts, syms, NULL); offsets = OPTS_GET(opts, offsets, NULL); ref_ctr_offsets = OPTS_GET(opts, ref_ctr_offsets, NULL); @@ -11836,7 +11902,6 @@ bpf_program__attach_uprobe_multi(const struct bpf_program *prog, } link->detach = &bpf_link__detach_fd; - prog_fd = bpf_program__fd(prog); link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &lopts); if (link_fd < 0) { err = -errno; @@ -12080,7 +12145,7 @@ struct bpf_link *bpf_program__attach_usdt(const struct bpf_program *prog, return 
libbpf_err_ptr(-EINVAL); if (bpf_program__fd(prog) < 0) { - pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n", + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", prog->name); return libbpf_err_ptr(-EINVAL); } @@ -12271,13 +12336,19 @@ static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_lin return libbpf_get_error(*link); } -struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, - const char *tp_name) +struct bpf_link * +bpf_program__attach_raw_tracepoint_opts(const struct bpf_program *prog, + const char *tp_name, + struct bpf_raw_tracepoint_opts *opts) { + LIBBPF_OPTS(bpf_raw_tp_opts, raw_opts); char errmsg[STRERR_BUFSIZE]; struct bpf_link *link; int prog_fd, pfd; + if (!OPTS_VALID(opts, bpf_raw_tracepoint_opts)) + return libbpf_err_ptr(-EINVAL); + prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { pr_warn("prog '%s': can't attach before loaded\n", prog->name); @@ -12289,7 +12360,9 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *pr return libbpf_err_ptr(-ENOMEM); link->detach = &bpf_link__detach_fd; - pfd = bpf_raw_tracepoint_open(tp_name, prog_fd); + raw_opts.tp_name = tp_name; + raw_opts.cookie = OPTS_GET(opts, cookie, 0); + pfd = bpf_raw_tracepoint_open_opts(prog_fd, &raw_opts); if (pfd < 0) { pfd = -errno; free(link); @@ -12301,6 +12374,12 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *pr return link; } +struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, + const char *tp_name) +{ + return bpf_program__attach_raw_tracepoint_opts(prog, tp_name, NULL); +} + static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link) { static const char *const prefixes[] = { @@ -12662,6 +12741,12 @@ struct bpf_link *bpf_program__attach(const struct bpf_program *prog) if (!prog->sec_def || !prog->sec_def->prog_attach_fn) return libbpf_err_ptr(-EOPNOTSUPP); + if (bpf_program__fd(prog) < 0) { + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + err = prog->sec_def->prog_attach_fn(prog, prog->sec_def->cookie, &link); if (err) return libbpf_err_ptr(err); @@ -12702,8 +12787,13 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) __u32 zero = 0; int err, fd; - if (!bpf_map__is_struct_ops(map) || map->fd == -1) + if (!bpf_map__is_struct_ops(map)) + return libbpf_err_ptr(-EINVAL); + + if (map->fd < 0) { + pr_warn("map '%s': can't attach BPF map without FD (was it created?)\n", map->name); return libbpf_err_ptr(-EINVAL); + } link = calloc(1, sizeof(*link)); if (!link) @@ -12751,8 +12841,13 @@ int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map) __u32 zero = 0; int err; - if (!bpf_map__is_struct_ops(map) || !map_is_created(map)) + if (!bpf_map__is_struct_ops(map)) + return -EINVAL; + + if (map->fd < 0) { + pr_warn("map '%s': can't use BPF map without FD (was it created?)\n", map->name); return -EINVAL; + } st_ops_link = container_of(link, struct bpf_link_struct_ops, link); /* Ensure the type of a link is correct */ diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 7b510761f..f88ab50c0 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -760,9 +760,20 @@ bpf_program__attach_tracepoint_opts(const struct bpf_program *prog, const char *tp_name, const struct bpf_tracepoint_opts *opts); +struct bpf_raw_tracepoint_opts { + size_t sz; 
/* size of this struct for forward/backward compatibility */ + __u64 cookie; + size_t :0; +}; +#define bpf_raw_tracepoint_opts__last_field cookie + LIBBPF_API struct bpf_link * bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, const char *tp_name); +LIBBPF_API struct bpf_link * +bpf_program__attach_raw_tracepoint_opts(const struct bpf_program *prog, + const char *tp_name, + struct bpf_raw_tracepoint_opts *opts); struct bpf_trace_opts { /* size of this struct, for forward/backward compatibility */ diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 86804fd90..51732ecb1 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -410,6 +410,8 @@ LIBBPF_1.3.0 { LIBBPF_1.4.0 { global: + bpf_program__attach_raw_tracepoint_opts; + bpf_raw_tracepoint_open_opts; bpf_token_create; btf__new_split; btf_ext__raw_data; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 864b36177..a0dcfb82e 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -518,11 +518,6 @@ int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void __s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name, __u32 kind); -typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type, - const char *sym_name, void *ctx); - -int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *arg); - /* handle direct returned errors */ static inline int libbpf_err(int ret) { diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 302188122..9dfbe7750 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -448,7 +448,8 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe /* If BPF verifier doesn't recognize BPF helper ID (enum bpf_func_id) * at all, it will emit something like "invalid func unknown#181". * If BPF verifier recognizes BPF helper but it's not supported for - * given BPF program type, it will emit "unknown func bpf_sys_bpf#166". + * given BPF program type, it will emit "unknown func bpf_sys_bpf#166" + * or "program of this type cannot use helper bpf_sys_bpf#166". * In both cases, provided combination of BPF program type and BPF * helper is not supported by the kernel. * In all other cases, probe_prog_load() above will either succeed (e.g., @@ -457,7 +458,8 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe * that), or we'll get some more specific BPF verifier error about * some unsatisfied conditions. 
*/ - if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func "))) + if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func ") || + strstr(buf, "program of this type cannot use helper "))) return 0; return 1; /* assume supported */ } diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index 25810e18b..fd865a913 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -820,7 +820,10 @@ class YnlFamily(SpecFamily): if display_hint == 'mac': formatted = ':'.join('%02x' % b for b in raw) elif display_hint == 'hex': - formatted = bytes.hex(raw, ' ') + if isinstance(raw, int): + formatted = hex(raw) + else: + formatted = bytes.hex(raw, ' ') elif display_hint in [ 'ipv4', 'ipv6' ]: formatted = format(ipaddress.ip_address(raw)) elif display_hint == 'uuid': diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore new file mode 100644 index 000000000..d6264fe1c --- /dev/null +++ b/tools/sched_ext/.gitignore @@ -0,0 +1,2 @@ +tools/ +build/ diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile new file mode 100644 index 000000000..ca3815e57 --- /dev/null +++ b/tools/sched_ext/Makefile @@ -0,0 +1,246 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. +include ../build/Build.include +include ../scripts/Makefile.arch +include ../scripts/Makefile.include + +all: all_targets + +ifneq ($(LLVM),) +ifneq ($(filter %/,$(LLVM)),) +LLVM_PREFIX := $(LLVM) +else ifneq ($(filter -%,$(LLVM)),) +LLVM_SUFFIX := $(LLVM) +endif + +CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi +CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu +CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl +CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu +CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu +CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu +CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu +CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu +CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu +CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) + +ifeq ($(CROSS_COMPILE),) +ifeq ($(CLANG_TARGET_FLAGS),) +$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk) +else +CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS) +endif # CLANG_TARGET_FLAGS +else +CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) +endif # CROSS_COMPILE + +CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as +else +CC := $(CROSS_COMPILE)gcc +endif # LLVM + +CURDIR := $(abspath .) +TOOLSDIR := $(abspath ..) 
+LIBDIR := $(TOOLSDIR)/lib +BPFDIR := $(LIBDIR)/bpf +TOOLSINCDIR := $(TOOLSDIR)/include +BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool +APIDIR := $(TOOLSINCDIR)/uapi +GENDIR := $(abspath ../../include/generated) +GENHDR := $(GENDIR)/autoconf.h + +ifeq ($(O),) +OUTPUT_DIR := $(CURDIR)/build +else +OUTPUT_DIR := $(O)/build +endif # O +OBJ_DIR := $(OUTPUT_DIR)/obj +INCLUDE_DIR := $(OUTPUT_DIR)/include +BPFOBJ_DIR := $(OBJ_DIR)/libbpf +SCXOBJ_DIR := $(OBJ_DIR)/sched_ext +BINDIR := $(OUTPUT_DIR)/bin +BPFOBJ := $(BPFOBJ_DIR)/libbpf.a +ifneq ($(CROSS_COMPILE),) +HOST_BUILD_DIR := $(OBJ_DIR)/host +HOST_OUTPUT_DIR := host-tools +HOST_INCLUDE_DIR := $(HOST_OUTPUT_DIR)/include +else +HOST_BUILD_DIR := $(OBJ_DIR) +HOST_OUTPUT_DIR := $(OUTPUT_DIR) +HOST_INCLUDE_DIR := $(INCLUDE_DIR) +endif +HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a +RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids +DEFAULT_BPFTOOL := $(HOST_OUTPUT_DIR)/sbin/bpftool + +VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ + ../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) +ifeq ($(VMLINUX_BTF),) +$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") +endif + +BPFTOOL ?= $(DEFAULT_BPFTOOL) + +ifneq ($(wildcard $(GENHDR)),) + GENFLAGS := -DHAVE_GENHDR +endif + +CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ + -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ + -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include + +# Silence some warnings when compiled with clang +ifneq ($(LLVM),) +CFLAGS += -Wno-unused-command-line-argument +endif + +LDFLAGS = -lelf -lz -lpthread + +IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - </dev/null | \ + grep 'define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__') + +# Get Clang's default includes on this system, as opposed to those seen by +# '-target bpf'. This fixes "missing" files on some architectures/distros, +# such as asm/byteorder.h, asm/socket.h, asm/types.h, sys/cdefs.h etc. +# +# Use '-idirafter': Don't interfere with include mechanics except where the +# build would have failed anyways. +define get_sys_includes +$(shell $(1) -v -E - </dev/null 2>&1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$2, $$2)}') +endef + +BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ + $(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian) \ + $(call get_sys_includes,$(CLANG)) \ + -Wall -Wno-compare-distinct-pointer-types \ + -O2 -mcpu=v3 + +$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR) +ifeq ($(VMLINUX_H),) + $(call msg,GEN,,$@) + $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ +else + $(call msg,CP,,$@) + $(Q)cp "$(VMLINUX_H)" $@ +endif + +$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h include/scx/*.h \ + | $(BPFOBJ) $(SCXOBJ_DIR) + $(call msg,CLNG-BPF,,$(notdir $@)) + $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ + +$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) + $(eval sched=$(notdir $@)) + $(call msg,GEN-SKEL,,$(sched)) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< + $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) + $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) + $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ + $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) + +SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) + +c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg + +$(addprefix $(BINDIR)/,$(c-sched-targets)): \ + $(BINDIR)/%: \ + $(filter-out %.bpf.c,%.c) \ + $(INCLUDE_DIR)/%.bpf.skel.h \ + $(SCX_COMMON_DEPS) + $(eval sched=$(notdir $@)) + $(CC) $(CFLAGS) -c $(sched).c -o $(SCXOBJ_DIR)/$(sched).o + $(CC) -o $@ $(SCXOBJ_DIR)/$(sched).o $(HOST_BPFOBJ) $(LDFLAGS) + +$(c-sched-targets): %: $(BINDIR)/% + +install: all + $(Q)mkdir -p $(DESTDIR)/usr/local/bin/ + $(Q)cp $(BINDIR)/* $(DESTDIR)/usr/local/bin/ + +clean: + rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR) + rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h + rm -f $(c-sched-targets) + +help: + @echo 'Building targets' + @echo '================' + @echo '' + @echo
' all - Compile all schedulers' + @echo '' + @echo 'Alternatively, you may compile individual schedulers:' + @echo '' + @printf ' %s\n' $(c-sched-targets) + @echo '' + @echo 'For any scheduler build target, you may specify an alternative' + @echo 'build output path with the O= environment variable. For example:' + @echo '' + @echo ' O=/tmp/sched_ext make all' + @echo '' + @echo 'will compile all schedulers, and emit the build artifacts to' + @echo '/tmp/sched_ext/build.' + @echo '' + @echo '' + @echo 'Installing targets' + @echo '==================' + @echo '' + @echo ' install - Compile and install all schedulers to /usr/local/bin.' + @echo ' You may specify the DESTDIR= environment variable' + @echo ' to indicate a prefix for /usr/local/bin. For example:' + @echo '' + @echo ' DESTDIR=/tmp/sched_ext make install' + @echo '' + @echo ' will build the schedulers in CWD/build, and' + @echo ' install the schedulers to /tmp/sched_ext/usr/local/bin.' + @echo '' + @echo '' + @echo 'Cleaning targets' + @echo '================' + @echo '' + @echo ' clean - Remove all generated files' + +all_targets: $(c-sched-targets) + +.PHONY: all all_targets $(c-sched-targets) clean help + +# delete failed targets +.DELETE_ON_ERROR: + +# keep intermediate (.bpf.skel.h, .bpf.o, etc) targets +.SECONDARY: diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md new file mode 100644 index 000000000..16a42e406 --- /dev/null +++ b/tools/sched_ext/README.md @@ -0,0 +1,270 @@ +SCHED_EXT EXAMPLE SCHEDULERS +============================ + +# Introduction + +This directory contains a number of example sched_ext schedulers. These +schedulers are meant to provide examples of different types of schedulers +that can be built using sched_ext, and illustrate how various features of +sched_ext can be used. + +Some of the examples are performant, production-ready schedulers. That is, for +the correct workload and with the correct tuning, they may be deployed in a +production environment with acceptable or possibly even improved performance. +Others are just examples that, in practice, would not provide acceptable +performance (though they could be improved to get there). + +This README describes these example schedulers, including the types of +workloads or scenarios they're designed to accommodate, and whether or not +they're production ready. For more details on any of these schedulers, please +see the header comment in their .bpf.c file. + + +# Compiling the examples + +There are a few toolchain dependencies for compiling the example schedulers. + +## Toolchain dependencies + +1. clang >= 16.0.0 + +The schedulers are BPF programs, and therefore must be compiled with clang. gcc +is also actively working on adding a BPF backend, but it is still missing some +features such as BTF type tags, which are necessary for using kptrs (see the +sketch at the end of this section). + +2. pahole >= 1.25 + +You may need pahole in order to generate BTF from DWARF. + +3. rust >= 1.70.0 + +The Rust schedulers use features present in the Rust toolchain >= 1.70.0. You +should be able to use the stable build from rustup, but if that doesn't +work, try using the rustup nightly build. + +There are other requirements as well, such as make, but these are the main / +non-trivial ones.
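+ +For illustration only, here is a minimal sketch of the kind of kptr +declaration that depends on BTF type tags (this snippet is not part of this +patch, and the struct and map names are made up): + +```c +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +/* A map value holding a referenced kptr. __kptr from bpf_helpers.h expands + * to a btf_type_tag attribute, which is why clang >= 16.0.0 is required. */ +struct map_val { + struct task_struct __kptr *task; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct map_val); +} example_map SEC(".maps"); +```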
+ +## Compiling the kernel + +In order to run a sched_ext scheduler, you'll have to run a kernel compiled +with the patches in this repository, and with a minimum set of necessary +Kconfig options: + +``` +CONFIG_BPF=y +CONFIG_SCHED_CLASS_EXT=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT=y +CONFIG_DEBUG_INFO_BTF=y +``` + +It's recommended that you also include the following Kconfig options: + +``` +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_JIT_DEFAULT_ON=y +CONFIG_PAHOLE_HAS_SPLIT_BTF=y +CONFIG_PAHOLE_HAS_BTF_TAG=y +``` + +There is a `Kconfig` file in this directory whose contents you can append to +your local `.config` file, as long as there are no conflicts with any existing +options in the file. + +## Getting a vmlinux.h file + +You may notice that most of the example schedulers include a "vmlinux.h" file. +This is a large, auto-generated header file that contains all of the types +defined in some vmlinux binary that was compiled with +[BTF](https://docs.kernel.org/bpf/btf.html) (i.e. with the BTF-related Kconfig +options specified above). + +The header file is created using `bpftool`, by passing it a vmlinux binary +compiled with BTF as follows: + +```bash +$ bpftool btf dump file /path/to/vmlinux format c > vmlinux.h +``` + +`bpftool` analyzes all of the BTF encodings in the binary, and produces a +header file that can be included by BPF programs to access those types. For +example, using vmlinux.h allows a scheduler to access fields defined directly +in vmlinux as follows: + +```c +#include "vmlinux.h" +// vmlinux.h is also implicitly included by scx/common.bpf.h. +#include <scx/common.bpf.h> + +/* + * vmlinux.h provides definitions for struct task_struct and + * struct scx_enable_args. + */ +void BPF_STRUCT_OPS(example_enable, struct task_struct *p, + struct scx_enable_args *args) +{ + bpf_printk("Task %s enabled in example scheduler", p->comm); +} + +// vmlinux.h provides the definition for struct sched_ext_ops. +SEC(".struct_ops.link") +struct sched_ext_ops example_ops = { + .enable = (void *)example_enable, + .name = "example", +}; +``` + +The scheduler build system will generate this vmlinux.h file as part of the +scheduler build pipeline. It looks for a vmlinux file in the following +dependency order: + +1. If the O= environment variable is defined, at `$O/vmlinux` +2. If the KBUILD_OUTPUT= environment variable is defined, at + `$KBUILD_OUTPUT/vmlinux` +3. At `../../vmlinux` (i.e. at the root of the kernel tree where you're + compiling the schedulers) +4. `/sys/kernel/btf/vmlinux` +5. `/boot/vmlinux-$(uname -r)` + +In other words, if you have compiled a kernel in your local repo, its vmlinux +file will be used to generate vmlinux.h. Otherwise, it will be the vmlinux of +the kernel you're currently running on. This means that if you're running on a +kernel with sched_ext support, you may not need to compile a local kernel at +all. + +### Aside on CO-RE + +One of the cooler features of BPF is that it supports +[CO-RE](https://nakryiko.com/posts/bpf-core-reference-guide/) (Compile Once Run +Everywhere). This feature allows you to reference fields inside structs whose +types are defined internal to the kernel, without having to recompile if you +load the BPF program on a different kernel with the field at a different +offset. In our example above, we print out a task name with `p->comm`. CO-RE +would perform relocations for that access when the program is loaded to ensure +that it's referencing the correct offset for the currently running kernel.
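+ +As a concrete, hypothetical sketch (not one of the example schedulers; the +program and section names are made up), a tracing program using a CO-RE +relocatable read might look like this: + +```c +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> + +char _license[] SEC("license") = "GPL"; + +SEC("tp_btf/sched_switch") +int BPF_PROG(example_switch, bool preempt, struct task_struct *prev, + struct task_struct *next) +{ + /* BPF_CORE_READ() records a relocation, so the offset of ->pid is + * fixed up for the currently running kernel at load time. */ + pid_t pid = BPF_CORE_READ(next, pid); + + bpf_printk("switching to pid %d", pid); + return 0; +} +```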
+ +## Compiling the schedulers + +Once you have your toolchain set up, and a vmlinux that can be used to generate +a full vmlinux.h file, you can compile the schedulers using `make`: + +```bash +$ make -j$(nproc) +``` + +# Example schedulers + +This directory contains the following example schedulers. These schedulers are +for testing and demonstrating different aspects of sched_ext. While some may be +useful in limited scenarios, they are not intended to be practical. + +For more scheduler implementations, tools and documentation, visit +https://github.com/sched-ext/scx. + +## scx_simple + +A simple scheduler that provides an example of a minimal sched_ext scheduler. +scx_simple can be run in either global weighted vtime mode, or FIFO mode. + +Though very simple, in limited scenarios, this scheduler can perform reasonably +well on single-socket systems with a unified L3 cache. + +## scx_qmap + +Another simple, yet slightly more complex scheduler that provides an example of +a basic weighted FIFO queuing policy. It also provides examples of some common +useful BPF features, such as sleepable per-task storage allocation in the +`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to +enqueue tasks. It also illustrates how core-sched support could be implemented. + +## scx_central + +A "central" scheduler where scheduling decisions are made from a single CPU. +This scheduler illustrates how scheduling decisions can be dispatched from a +single CPU, allowing other cores to run with infinite slices, without timer +ticks, and without having to incur the overhead of making scheduling decisions. + +The approach demonstrated by this scheduler may be useful for any workload that +benefits from minimizing scheduling overhead and timer ticks. An example of +where this could be particularly useful is running VMs, where running with +infinite slices and no timer ticks allows the VM to avoid unnecessary expensive +vmexits. + +## scx_flatcg + +A flattened cgroup hierarchy scheduler. This scheduler implements hierarchical +weight-based cgroup CPU control by flattening the cgroup hierarchy into a single +layer, by compounding the active weight share at each level. The effect of this +is a much more performant CPU controller, which does not need to descend down +cgroup trees in order to properly compute a cgroup's share. + +Similar to scx_simple, in limited scenarios, this scheduler can perform +reasonably well on single-socket systems with a unified L3 cache and show +significantly lowered hierarchical scheduling overhead. + + +# Troubleshooting + +There are a number of common issues that you may run into when building the +schedulers. We'll go over some of the common ones here. + +## Build Failures + +### Old version of clang + +``` +error: static assertion failed due to requirement 'SCX_DSQ_FLAG_BUILTIN': bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole + _Static_assert(SCX_DSQ_FLAG_BUILTIN, + ^~~~~~~~~~~~~~~~~~~~ +1 error generated. +``` + +This means you built the kernel or the schedulers with an older version of +clang than what's supported (i.e. older than 16.0.0). To remediate this: + +1. `which clang` to make sure you're using a sufficiently new version of clang. + +2. Run `make fullclean` in the root path of the repository. + +3. Rebuild the kernel, and then your example schedulers. + +The schedulers are also cleaned if you invoke `make mrproper` in the root +directory of the tree.
+ +### Stale kernel build / incomplete vmlinux.h file + +As described above, you'll need a `vmlinux.h` file that was generated from a +vmlinux built with BTF, and with sched_ext support enabled. If you don't, +you'll see errors such as the following, which indicate that a type being +referenced in a scheduler is unknown: + +``` +/path/to/sched_ext/tools/sched_ext/user_exit_info.h:25:23: note: forward declaration of 'struct scx_exit_info' + +const struct scx_exit_info *ei) + +^ +``` + +To resolve this, follow the steps above in +[Getting a vmlinux.h file](#getting-a-vmlinuxh-file) to ensure your +schedulers are using a vmlinux.h file that includes the requisite types. + +## Misc + +### llvm: [OFF] + +You may see the following output when building the schedulers: + +``` +Auto-detecting system features: +... clang-bpf-co-re: [ on ] +... llvm: [ OFF ] +... libcap: [ on ] +... libbfd: [ on ] +``` + +Seeing `llvm: [ OFF ]` here is not an issue. You can safely ignore it. diff --git a/tools/sched_ext/include/bpf-compat/gnu/stubs.h b/tools/sched_ext/include/bpf-compat/gnu/stubs.h new file mode 100644 index 000000000..ad7d139ce --- /dev/null +++ b/tools/sched_ext/include/bpf-compat/gnu/stubs.h @@ -0,0 +1,11 @@ +/* + * Dummy gnu/stubs.h. clang can end up including /usr/include/gnu/stubs.h when + * compiling BPF files although its content doesn't play any role. The file in + * turn includes stubs-64.h or stubs-32.h depending on whether __x86_64__ is + * defined. When compiling a BPF source, __x86_64__ isn't set and thus + * stubs-32.h is selected. However, the file is not there if the system doesn't + * have the 32bit glibc devel package installed, leading to a build failure. + * + * The problem is worked around by making this file available in the include + * search paths before the system one when building BPF. + */ diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h new file mode 100644 index 000000000..44eb4267d --- /dev/null +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -0,0 +1,281 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef __SCX_COMMON_BPF_H +#define __SCX_COMMON_BPF_H + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <asm-generic/errno.h> +#include "user_exit_info.h" + +#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ +#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ +#define PF_EXITING 0x00000004 +#define CLOCK_MONOTONIC 1 + +/* + * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can + * lead to really confusing misbehaviors. Let's trigger a build failure.
+ */ +static inline void ___vmlinux_h_sanity_check___(void) +{ + _Static_assert(SCX_DSQ_FLAG_BUILTIN, + "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole"); +} + +s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; +void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; +void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym; +u32 scx_bpf_dispatch_nr_slots(void) __ksym; +void scx_bpf_dispatch_cancel(void) __ksym; +bool scx_bpf_consume(u64 dsq_id) __ksym; +u32 scx_bpf_reenqueue_local(void) __ksym; +void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; +s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; +bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; +s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; +s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; +void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; +void scx_bpf_exit_bstr(s64 exit_code, char *fmt, + unsigned long long *data, u32 data__sz) __ksym; +void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; +u32 scx_bpf_nr_cpu_ids(void) __ksym; +u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym; +u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym; +void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym; +const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym; +const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym; +void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym; +const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; +const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; +void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; +bool scx_bpf_task_running(const struct task_struct *p) __ksym; +s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; +struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; + +static inline __attribute__((format(printf, 1, 2))) +void ___scx_bpf_exit_format_checker(const char *fmt, ...) {} + +/* + * Helper macro for initializing the fmt and variadic argument inputs to both + * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to + * refer to the initialized list of inputs to the bstr kfunc. + */ +#define scx_bpf_exit_preamble(fmt, args...) \ + static char ___fmt[] = fmt; \ + /* \ + * Note that ___param[] must have at least one \ + * element to keep the verifier happy. \ + */ \ + unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + +/* + * scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments + * instead of an array of u64. Using this macro will cause the scheduler to + * exit cleanly with the specified exit code being passed to user space. + */ +#define scx_bpf_exit(code, fmt, args...) \ +({ \ + scx_bpf_exit_preamble(fmt, args) \ + scx_bpf_exit_bstr(code, ___fmt, ___param, sizeof(___param)); \ + ___scx_bpf_exit_format_checker(fmt, ##args); \ +}) + +/* + * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments + * instead of an array of u64. Invoking this macro will cause the scheduler to + * exit in an erroneous state, with diagnostic information being passed to the + * user. + */ +#define scx_bpf_error(fmt, args...)
\ +({ \ + scx_bpf_exit_preamble(fmt, args) \ + scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ + ___scx_bpf_exit_format_checker(fmt, ##args); \ +}) + +#define BPF_STRUCT_OPS(name, args...) \ +SEC("struct_ops/"#name) \ +BPF_PROG(name, ##args) + +#define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ +SEC("struct_ops.s/"#name) \ +BPF_PROG(name, ##args) + +/** + * RESIZABLE_ARRAY - Generates annotations for an array that may be resized + * @elfsec: the data section of the BPF program in which to place the array + * @arr: the name of the array + * + * libbpf has an API for setting map value sizes. Since data sections (i.e. + * bss, data, rodata) themselves are maps, a data section can be resized. If + * a data section has an array as its last element, the BTF info for that + * array will be adjusted so that the length of the array is extended to meet + * the new length of the data section. This macro annotates an array to have + * an element count of one with the assumption that this array can be resized + * within the userspace program. It also annotates the section specifier so + * this array exists in a custom sub data section which can be resized + * independently. + * + * See RESIZE_ARRAY() for the userspace convenience macro for resizing an + * array declared with RESIZABLE_ARRAY(). + */ +#define RESIZABLE_ARRAY(elfsec, arr) arr[1] SEC("."#elfsec"."#arr) + +/** + * MEMBER_VPTR - Obtain the verified pointer to a struct or array member + * @base: struct or array to index + * @member: dereferenced member (e.g. .field, [idx0][idx1], .field[idx0] ...) + * + * The verifier often gets confused by the instruction sequence the compiler + * generates for indexing struct fields or arrays. This macro forces the + * compiler to generate a code sequence which first calculates the byte offset, + * checks it against the struct or array size and adds that byte offset to + * generate the pointer to the member to help the verifier. + * + * Ideally, we want to abort if the calculated offset is out-of-bounds. However, + * BPF currently doesn't support abort, so evaluate to %NULL instead. The caller + * must check for %NULL and take appropriate action to appease the verifier. To + * avoid confusing the verifier, it's best to check for %NULL and dereference + * immediately. + * + * vptr = MEMBER_VPTR(my_array, [i][j]); + * if (!vptr) + * return error; + * *vptr = new_value; + * + * sizeof(@base) should encompass the memory area to be accessed and thus can't + * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of + * `MEMBER_VPTR(ptr, ->member)`. + */ +#define MEMBER_VPTR(base, member) (typeof((base) member) *)({ \ + u64 __base = (u64)&(base); \ + u64 __addr = (u64)&((base) member) - __base; \ + _Static_assert(sizeof(base) >= sizeof((base) member), \ + "@base is smaller than @member, is @base a pointer?"); \ + asm volatile ( \ + "if %0 <= %[max] goto +2\n" \ + "%0 = 0\n" \ + "goto +1\n" \ + "%0 += %1\n" \ + : "+r"(__addr) \ + : "r"(__base), \ + [max]"i"(sizeof(base) - sizeof((base) member))); \ + __addr; \ +}) + +/** + * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element + * @arr: array to index into + * @i: array index + * @n: number of elements in array + * + * Similar to MEMBER_VPTR() but is intended for use with arrays where the + * element count needs to be explicit. + * It can be used in cases where a global array is defined with an initial + * size but is intended to be resized before loading the BPF program.
+ * Without this version of the macro, MEMBER_VPTR() will use the compile time + * size of the array to compute the max, which will result in rejection by + * the verifier. + */ +#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *)({ \ + u64 __base = (u64)arr; \ + u64 __addr = (u64)&(arr[i]) - __base; \ + asm volatile ( \ + "if %0 <= %[max] goto +2\n" \ + "%0 = 0\n" \ + "goto +1\n" \ + "%0 += %1\n" \ + : "+r"(__addr) \ + : "r"(__base), \ + [max]"r"(sizeof(arr[0]) * ((n) - 1))); \ + __addr; \ +}) + +/* + * BPF core and other generic helpers + */ + +/* list and rbtree */ +#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node))) +#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8))) + +void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym; +void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; + +#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL)) +#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL) + +void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym; +struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym; +struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, + struct bpf_rb_node *node) __ksym; +int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, + bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), + void *meta, __u64 off) __ksym; +#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0) + +struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; + +extern void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym; +#define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL) + +/* task */ +struct task_struct *bpf_task_from_pid(s32 pid) __ksym; +struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; +void bpf_task_release(struct task_struct *p) __ksym; + +/* cgroup */ +struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym; +void bpf_cgroup_release(struct cgroup *cgrp) __ksym; +struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; + +/* cpumask */ +struct bpf_cpumask *bpf_cpumask_create(void) __ksym; +struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym; +u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym; +u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym; +void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym; +bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +void bpf_cpumask_xor(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +bool bpf_cpumask_equal(const 
struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym; +bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym; +void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym; +u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym; +u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, + const struct cpumask *src2) __ksym; + +/* rcu */ +void bpf_rcu_read_lock(void) __ksym; +void bpf_rcu_read_unlock(void) __ksym; + +#include "compat.bpf.h" + +#endif /* __SCX_COMMON_BPF_H */ diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h new file mode 100644 index 000000000..8d5a6775f --- /dev/null +++ b/tools/sched_ext/include/scx/common.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 Tejun Heo + * Copyright (c) 2023 David Vernet + */ +#ifndef __SCHED_EXT_COMMON_H +#define __SCHED_EXT_COMMON_H + +#ifdef __KERNEL__ +#error "Should not be included by BPF programs" +#endif + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <stdint.h> + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; + +#define SCX_BUG(__fmt, ...) \ + do { \ + fprintf(stderr, "%s:%d [scx panic]: %s\n", __FILE__, __LINE__, \ + strerror(errno)); \ + fprintf(stderr, __fmt __VA_OPT__(,) __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + \ + exit(EXIT_FAILURE); \ + } while (0) + +#define SCX_BUG_ON(__cond, __fmt, ...) \ + do { \ + if (__cond) \ + SCX_BUG((__fmt) __VA_OPT__(,) __VA_ARGS__); \ + } while (0) + +/** + * RESIZE_ARRAY - Convenience macro for resizing a BPF array + * @elfsec: the data section of the BPF program in which the array exists + * @arr: the name of the array + * @n: the desired array element count + * + * For BPF arrays declared with RESIZABLE_ARRAY(), this macro performs two + * operations. It resizes the map which corresponds to the custom data + * section that contains the target array. As a side effect, the BTF info for + * the array is adjusted so that the array length is sized to cover the new + * data section size. The second operation is reassigning the skeleton pointer + * for that custom data section so that it points to the newly memory-mapped + * region. + */ +#define RESIZE_ARRAY(elfsec, arr, n) \ + do { \ + size_t __sz; \ + bpf_map__set_value_size(skel->maps.elfsec##_##arr, \ + sizeof(skel->elfsec##_##arr->arr[0]) * (n)); \ + skel->elfsec##_##arr = \ + bpf_map__initial_value(skel->maps.elfsec##_##arr, &__sz); \ + } while (0) + +#include "user_exit_info.h" +#include "compat.h" + +#endif /* __SCHED_EXT_COMMON_H */ diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h new file mode 100644 index 000000000..5da12e378 --- /dev/null +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024 Tejun Heo + * Copyright (c) 2024 David Vernet + */ +#ifndef __SCX_COMPAT_BPF_H +#define __SCX_COMPAT_BPF_H + +#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \ +({ \ + __type __ret = 0; \ + if (bpf_core_enum_value_exists(__type, __ent)) \ + __ret = __ent; \ + __ret; \ +}) + +/* + * %SCX_KICK_IDLE is a later addition. To support both before and after, use + * %__COMPAT_SCX_KICK_IDLE which becomes 0 on kernels which don't support it. + */ +#define __COMPAT_SCX_KICK_IDLE \ + __COMPAT_ENUM_OR_ZERO(enum scx_kick_flags, SCX_KICK_IDLE) + +/* + * scx_switch_all() was replaced by %SCX_OPS_SWITCH_PARTIAL. See + * %__COMPAT_SCX_OPS_SWITCH_PARTIAL in compat.h. + */ +void scx_bpf_switch_all(void) __ksym __weak; + +static inline void __COMPAT_scx_bpf_switch_all(void) +{ + if (!bpf_core_enum_value_exists(enum scx_ops_flags, SCX_OPS_SWITCH_PARTIAL)) + scx_bpf_switch_all(); +} + +/* + * Define sched_ext_ops. This may be expanded to define multiple variants for + * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). + */ +#define SCX_OPS_DEFINE(__name, ...) \ + SEC(".struct_ops.link") \ + struct sched_ext_ops __name = { \ + __VA_ARGS__, \ + }; + +#endif /* __SCX_COMPAT_BPF_H */ diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h new file mode 100644 index 000000000..e2608ad6e --- /dev/null +++ b/tools/sched_ext/include/scx/compat.h @@ -0,0 +1,178 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 Tejun Heo + * Copyright (c) 2024 David Vernet + */ +#ifndef __SCX_COMPAT_H +#define __SCX_COMPAT_H + +#include <bpf/btf.h> +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> + +struct btf *__COMPAT_vmlinux_btf __attribute__((weak)); + +static inline void __COMPAT_load_vmlinux_btf(void) +{ + if (!__COMPAT_vmlinux_btf) { + __COMPAT_vmlinux_btf = btf__load_vmlinux_btf(); + SCX_BUG_ON(!__COMPAT_vmlinux_btf, "btf__load_vmlinux_btf()"); + } +} + +static inline bool __COMPAT_read_enum(const char *type, const char *name, u64 *v) +{ + const struct btf_type *t; + const char *n; + s32 tid; + int i; + + __COMPAT_load_vmlinux_btf(); + + tid = btf__find_by_name(__COMPAT_vmlinux_btf, type); + if (tid < 0) + return false; + + t = btf__type_by_id(__COMPAT_vmlinux_btf, tid); + SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid); + + if (btf_is_enum(t)) { + struct btf_enum *e = btf_enum(t); + + for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { + n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off); + SCX_BUG_ON(!n, "btf__name_by_offset()"); + if (!strcmp(n, name)) { + *v = e[i].val; + return true; + } + } + } else if (btf_is_enum64(t)) { + struct btf_enum64 *e = btf_enum64(t); + + for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { + n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off); + SCX_BUG_ON(!n, "btf__name_by_offset()"); + if (!strcmp(n, name)) { + *v = btf_enum64_value(&e[i]); + return true; + } + } + } + + return false; +} + +#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \ +({ \ + u64 __val = 0; \ + __COMPAT_read_enum(__type, __ent, &__val); \ + __val; \ +}) + +static inline bool __COMPAT_struct_has_field(const char *type, const char *field) +{ + const struct btf_type *t; + const struct btf_member *m; + const char *n; + s32 tid; + int i; + + __COMPAT_load_vmlinux_btf(); + tid = btf__find_by_name_kind(__COMPAT_vmlinux_btf, type, BTF_KIND_STRUCT); + if (tid < 0) + return false; + + t = btf__type_by_id(__COMPAT_vmlinux_btf, tid); + SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid); + + m = btf_members(t); + + for (i = 0; i <
BTF_INFO_VLEN(t->info); i++) { + n = btf__name_by_offset(__COMPAT_vmlinux_btf, m[i].name_off); + SCX_BUG_ON(!n, "btf__name_by_offset()"); + if (!strcmp(n, field)) + return true; + } + + return false; +} + +/* + * An ops flag, %SCX_OPS_SWITCH_PARTIAL, replaced scx_bpf_switch_all() which had + * to be called from ops.init(). To support both before and after, use both + * %__COMPAT_SCX_OPS_SWITCH_PARTIAL and %__COMPAT_scx_bpf_switch_all() defined + * in compat.bpf.h. + */ +#define __COMPAT_SCX_OPS_SWITCH_PARTIAL \ + __COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL") + +static inline long scx_hotplug_seq(void) +{ + int fd; + char buf[32]; + ssize_t len; + long val; + + fd = open("/sys/kernel/sched_ext/hotplug_seq", O_RDONLY); + if (fd < 0) + return -ENOENT; + + len = read(fd, buf, sizeof(buf) - 1); + SCX_BUG_ON(len <= 0, "read failed (%ld)", len); + buf[len] = 0; + close(fd); + + val = strtoul(buf, NULL, 10); + SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val); + + return val; +} + +/* + * struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE() + * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load + * and attach it, backward compatibility is automatically maintained where + * reasonable. + * + * The following values were added in newer kernels: + * + * - sched_ext_ops.exit_dump_len + * o If nonzero and running on an older kernel, the value is set to zero + * and a warning is emitted + * + * - sched_ext_ops.hotplug_seq + * o If nonzero and running on an older kernel, the scheduler will fail to + * load + */ +#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ + struct __scx_name *__skel; \ + \ + __skel = __scx_name##__open(); \ + SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ + \ + if (__COMPAT_struct_has_field("sched_ext_ops", "hotplug_seq")) \ + __skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \ + __skel; \ +}) + +#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \ + UEI_SET_SIZE(__skel, __ops_name, __uei_name); \ + if (__COMPAT_struct_has_field("sched_ext_ops", "exit_dump_len") && \ + (__skel)->struct_ops.__ops_name->exit_dump_len) { \ + fprintf(stderr, "WARNING: kernel doesn't support setting exit dump len\n"); \ + (__skel)->struct_ops.__ops_name->exit_dump_len = 0; \ + } \ + SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \ +}) + +#define SCX_OPS_ATTACH(__skel, __ops_name) ({ \ + struct bpf_link *__link; \ + __link = bpf_map__attach_struct_ops((__skel)->maps.__ops_name); \ + SCX_BUG_ON(!__link, "Failed to attach struct_ops"); \ + __link; \ +}) + +#endif /* __SCX_COMPAT_H */ diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h new file mode 100644 index 000000000..2705b81f8 --- /dev/null +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef __USER_EXIT_INFO_H +#define __USER_EXIT_INFO_H + +enum uei_sizes { + UEI_REASON_LEN = 128, + UEI_MSG_LEN = 1024, + UEI_DUMP_DFL_LEN = 32768, +}; + +struct user_exit_info { + int kind; + s64 exit_code; + char reason[UEI_REASON_LEN]; + char msg[UEI_MSG_LEN]; +}; + +#ifdef __bpf__ + +#include "vmlinux.h" +#include <bpf/bpf_core_read.h> + +#define UEI_DEFINE(__name) \ + char RESIZABLE_ARRAY(data, __name##_dump); \ + const volatile u32 __name##_dump_len; \ + struct user_exit_info __name SEC(".data") + +#define UEI_RECORD(__uei_name, __ei) ({ \ + bpf_probe_read_kernel_str(__uei_name.reason, \ + sizeof(__uei_name.reason), (__ei)->reason); \ + bpf_probe_read_kernel_str(__uei_name.msg, \ + sizeof(__uei_name.msg), (__ei)->msg); \ + bpf_probe_read_kernel_str(__uei_name##_dump, \ + __uei_name##_dump_len, (__ei)->dump); \ + if (bpf_core_field_exists((__ei)->exit_code)) \ + __uei_name.exit_code = (__ei)->exit_code; \ + /* use __sync to force memory barrier */ \ + __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ + (__ei)->kind); \ +}) + +#else /* !__bpf__ */ + +#include <stdio.h> +#include <stdint.h> + +/* no need to call the following explicitly if SCX_OPS_LOAD() is used */ +#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \ + u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \ + (__skel)->rodata->__uei_name##_dump_len = __len; \ + RESIZE_ARRAY(data, __uei_name##_dump, __len); \ +}) + +#define UEI_EXITED(__skel, __uei_name) ({ \ + /* use __sync to force memory barrier */ \ + __sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1); \ +}) + +#define UEI_KIND(__skel, __uei_name) ((__skel)->data->__uei_name.kind) + +#define ECODE_USER_MASK ((1LLU << 32) - 1) +#define ECODE_SYS_ACT_MASK (((1LLU << 48) - 1) ^ ECODE_USER_MASK) +#define ECODE_SYS_RSN_MASK (~0LLU ^ (ECODE_SYS_ACT_MASK | ECODE_USER_MASK)) + +#define UEI_ECODE(__skel, __uei_name) (__skel)->data->__uei_name.exit_code +#define UEI_ECODE_SYS_ACT(__skel, __uei_name) (UEI_ECODE(__skel, __uei_name) & ECODE_SYS_ACT_MASK) +#define UEI_ECODE_SYS_RSN(__skel, __uei_name) (UEI_ECODE(__skel, __uei_name) & ECODE_SYS_RSN_MASK) +#define UEI_ECODE_USER(__skel, __uei_name) (UEI_ECODE(__skel, __uei_name) & ECODE_USER_MASK) + +#define UEI_REPORT(__skel, __uei_name) ({ \ + struct user_exit_info *__uei = &(__skel)->data->__uei_name; \ + char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; \ + if (__uei_dump[0] != '\0') { \ + fputs("\nDEBUG DUMP\n", stderr); \ + fputs("================================================================================\n\n", stderr); \ + fputs(__uei_dump, stderr); \ + fputs("\n================================================================================\n\n", stderr); \ + } \ + fprintf(stderr, "EXIT: %s", __uei->reason); \ + if (__uei->msg[0] != '\0') \ + fprintf(stderr, " (%s)", __uei->msg); \ + fputs("\n", stderr); \ +}) + +#define UEI_RESET(__skel, __uei_name) ({ \ + struct user_exit_info *__uei = &(__skel)->data->__uei_name; \ + char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; \ + size_t __uei_dump_len = (__skel)->rodata->__uei_name##_dump_len; \ + \ + memset(__uei, 0, sizeof(struct user_exit_info)); \ + memset(__uei_dump, 0, __uei_dump_len); \ +}) + +#endif /* __bpf__ */ +#endif /* __USER_EXIT_INFO_H */ diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c new file mode 100644 index 000000000..1ab8a42ed --- /dev/null +++ 
b/tools/sched_ext/scx_central.bpf.c @@ -0,0 +1,361 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A central FIFO sched_ext scheduler which demonstrates the following: + * + * a. Making all scheduling decisions from one CPU: + * + * The central CPU is the only one making scheduling decisions. All other + * CPUs kick the central CPU when they run out of tasks to run. + * + * There is one global BPF queue and the central CPU schedules all CPUs by + * dispatching from the global queue to each CPU's local dsq from dispatch(). + * This isn't the most straightforward approach, e.g., it'd be easier to + * bounce through per-CPU BPF queues. The current design is chosen to + * maximally utilize and verify various SCX mechanisms such as LOCAL_ON + * dispatching. + * + * b. Tickless operation + * + * All tasks are dispatched with the infinite slice which allows stopping the + * ticks on CONFIG_NO_HZ_FULL kernels running with the proper nohz_full + * parameter. The tickless operation can be observed through + * /proc/interrupts. + * + * Periodic switching is enforced by a periodic timer checking all CPUs and + * preempting them as necessary. BPF timers could not originally be pinned + * to a specific CPU; on kernels which support BPF_F_TIMER_CPU_PIN the + * periodic timer is pinned to the central CPU, and otherwise the scheduler + * falls back to an unpinned timer (see central_init()). + * + * c. Preemption + * + * Kthreads are unconditionally queued to the head of a matching local dsq + * and dispatched with SCX_DSQ_PREEMPT. This ensures that a kthread is always + * prioritized over user threads, which is required for ensuring forward + * progress as e.g. the periodic timer may run on a ksoftirqd, and if the + * ksoftirqd gets starved by a user thread, there may be nothing else that + * can vacate that user thread. + * + * SCX_KICK_PREEMPT is used to trigger scheduling and make CPUs move on to + * their next tasks. + * + * This scheduler is designed to maximize usage of various SCX mechanisms. A + * more practical implementation would likely put the scheduling loop outside + * the central CPU's dispatch() path and add some form of priority mechanism. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +enum { + FALLBACK_DSQ_ID = 0, + MS_TO_NS = 1000LLU * 1000, + TIMER_INTERVAL_NS = 1 * MS_TO_NS, +}; + +const volatile s32 central_cpu; +const volatile u32 nr_cpu_ids = 1; /* !0 for veristat, set during init */ +const volatile u64 slice_ns = SCX_SLICE_DFL; + +bool timer_pinned = true; +u64 nr_total, nr_locals, nr_queued, nr_lost_pids; +u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries; +u64 nr_overflows; + +UEI_DEFINE(uei); + +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, 4096); + __type(value, s32); +} central_q SEC(".maps"); + +/* can't use percpu map due to bad lookups */ +bool RESIZABLE_ARRAY(data, cpu_gimme_task); +u64 RESIZABLE_ARRAY(data, cpu_started_at); + +struct central_timer { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct central_timer); +} central_timer SEC(".maps"); + +static bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* + * Steer wakeups to the central CPU as much as possible to avoid + * disturbing other CPUs.
It's safe to blindly return the central CPU as + * select_cpu() is a hint and if @p can't be on it, the kernel will + * automatically pick a fallback CPU. + */ + return central_cpu; +} + +void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags) +{ + s32 pid = p->pid; + + __sync_fetch_and_add(&nr_total, 1); + + /* + * Push per-cpu kthreads at the head of local dsq's and preempt the + * corresponding CPU. This ensures that e.g. ksoftirqd isn't blocked + * behind other threads, which is necessary for the forward progress + * guarantee as we depend on the BPF timer which may run from ksoftirqd. + */ + if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { + __sync_fetch_and_add(&nr_locals, 1); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, + enq_flags | SCX_ENQ_PREEMPT); + return; + } + + if (bpf_map_push_elem(&central_q, &pid, 0)) { + __sync_fetch_and_add(&nr_overflows, 1); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags); + return; + } + + __sync_fetch_and_add(&nr_queued, 1); + + if (!scx_bpf_task_running(p)) + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); +} + +static bool dispatch_to_cpu(s32 cpu) +{ + struct task_struct *p; + s32 pid; + + bpf_repeat(BPF_MAX_LOOPS) { + if (bpf_map_pop_elem(&central_q, &pid)) + break; + + __sync_fetch_and_sub(&nr_queued, 1); + + p = bpf_task_from_pid(pid); + if (!p) { + __sync_fetch_and_add(&nr_lost_pids, 1); + continue; + } + + /* + * If we can't run the task at the top, do the dumb thing and + * bounce it to the fallback dsq. + */ + if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { + __sync_fetch_and_add(&nr_mismatches, 1); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); + bpf_task_release(p); + /* + * We might run out of dispatch buffer slots if we continue dispatching + * to the fallback DSQ, without dispatching to the local DSQ of the + * target CPU. In such a case, break out of the loop now, as the next + * dispatch operation would fail. + */ + if (!scx_bpf_dispatch_nr_slots()) + break; + continue; + } + + /* dispatch to local and mark that @cpu doesn't need more */ + scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0); + + if (cpu != central_cpu) + scx_bpf_kick_cpu(cpu, __COMPAT_SCX_KICK_IDLE); + + bpf_task_release(p); + return true; + } + + return false; +} + +void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) +{ + if (cpu == central_cpu) { + /* dispatch for all other CPUs first */ + __sync_fetch_and_add(&nr_dispatches, 1); + + bpf_for(cpu, 0, nr_cpu_ids) { + bool *gimme; + + if (!scx_bpf_dispatch_nr_slots()) + break; + + /* central's gimme is never set */ + gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); + if (gimme && !*gimme) + continue; + + if (dispatch_to_cpu(cpu)) + *gimme = false; + } + + /* + * Retry if we ran out of dispatch buffer slots as we might have + * skipped some CPUs and also need to dispatch for self. The ext + * core automatically retries if the local dsq is empty but we + * can't rely on that as we're dispatching for other CPUs too. + * Kick self explicitly to retry. 
+ */ + if (!scx_bpf_dispatch_nr_slots()) { + __sync_fetch_and_add(&nr_retries, 1); + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); + return; + } + + /* look for a task to run on the central CPU */ + if (scx_bpf_consume(FALLBACK_DSQ_ID)) + return; + dispatch_to_cpu(central_cpu); + } else { + bool *gimme; + + if (scx_bpf_consume(FALLBACK_DSQ_ID)) + return; + + gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); + if (gimme) + *gimme = true; + + /* + * Force dispatch on the scheduling CPU so that it finds a task + * to run for us. + */ + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); + } +} + +void BPF_STRUCT_OPS(central_running, struct task_struct *p) +{ + s32 cpu = scx_bpf_task_cpu(p); + u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); + if (started_at) + *started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */ +} + +void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable) +{ + s32 cpu = scx_bpf_task_cpu(p); + u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); + if (started_at) + *started_at = 0; +} + +static int central_timerfn(void *map, int *key, struct bpf_timer *timer) +{ + u64 now = bpf_ktime_get_ns(); + u64 nr_to_kick = nr_queued; + s32 i, curr_cpu; + + curr_cpu = bpf_get_smp_processor_id(); + if (timer_pinned && (curr_cpu != central_cpu)) { + scx_bpf_error("Central timer ran on CPU %d, not central CPU %d", + curr_cpu, central_cpu); + return 0; + } + + bpf_for(i, 0, nr_cpu_ids) { + s32 cpu = (nr_timers + i) % nr_cpu_ids; + u64 *started_at; + + if (cpu == central_cpu) + continue; + + /* kick iff the current one exhausted its slice */ + started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); + if (started_at && *started_at && + vtime_before(now, *started_at + slice_ns)) + continue; + + /* and there's something pending */ + if (scx_bpf_dsq_nr_queued(FALLBACK_DSQ_ID) || + scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu)) + ; + else if (nr_to_kick) + nr_to_kick--; + else + continue; + + scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT); + } + + bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); + __sync_fetch_and_add(&nr_timers, 1); + return 0; +} + +int BPF_STRUCT_OPS_SLEEPABLE(central_init) +{ + u32 key = 0; + struct bpf_timer *timer; + int ret; + + ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); + if (ret) + return ret; + + timer = bpf_map_lookup_elem(¢ral_timer, &key); + if (!timer) + return -ESRCH; + + if (bpf_get_smp_processor_id() != central_cpu) { + scx_bpf_error("init from non-central CPU"); + return -EINVAL; + } + + bpf_timer_init(timer, ¢ral_timer, CLOCK_MONOTONIC); + bpf_timer_set_callback(timer, central_timerfn); + + ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); + /* + * BPF_F_TIMER_CPU_PIN is pretty new (>=6.7). If we're running in a + * kernel which doesn't have it, bpf_timer_start() will return -EINVAL. + * Retry without the PIN. This would be the perfect use case for + * bpf_core_enum_value_exists() but the enum type doesn't have a name + * and can't be used with bpf_core_enum_value_exists(). Oh well... + */ + if (ret == -EINVAL) { + timer_pinned = false; + ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); + } + if (ret) + scx_bpf_error("bpf_timer_start failed (%d)", ret); + return ret; +} + +void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(central_ops, + /* + * We are offloading all scheduling decisions to the central CPU + * and thus being the last task on a given CPU doesn't mean + * anything special. 
Enqueue the last tasks like any other tasks. + */ + .flags = SCX_OPS_ENQ_LAST, + + .select_cpu = (void *)central_select_cpu, + .enqueue = (void *)central_enqueue, + .dispatch = (void *)central_dispatch, + .running = (void *)central_running, + .stopping = (void *)central_stopping, + .init = (void *)central_init, + .exit = (void *)central_exit, + .name = "central"); diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c new file mode 100644 index 000000000..bc209ad6a --- /dev/null +++ b/tools/sched_ext/scx_central.c @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#define _GNU_SOURCE +#include <sched.h> +#include <stdio.h> +#include <unistd.h> +#include <inttypes.h> +#include <signal.h> +#include <libgen.h> +#include <bpf/bpf.h> +#include <scx/common.h> +#include "scx_central.bpf.skel.h" + +const char help_fmt[] = +"A central FIFO sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-s SLICE_US] [-c CPU]\n" +"\n" +" -s SLICE_US Override slice duration\n" +" -c CPU Override the central CPU (default: 0)\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_central *skel; + struct bpf_link *link; + __u64 seq = 0; + __s32 opt; + cpu_set_t *cpuset; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = scx_central__open(); + SCX_BUG_ON(!skel, "Failed to open skel"); + + skel->rodata->central_cpu = 0; + skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); + + while ((opt = getopt(argc, argv, "s:c:ph")) != -1) { + switch (opt) { + case 's': + skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; + break; + case 'c': + skel->rodata->central_cpu = strtoul(optarg, NULL, 0); + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + /* Resize arrays so their element count is equal to cpu count. */ + RESIZE_ARRAY(data, cpu_gimme_task, skel->rodata->nr_cpu_ids); + RESIZE_ARRAY(data, cpu_started_at, skel->rodata->nr_cpu_ids); + + SCX_OPS_LOAD(skel, central_ops, scx_central, uei); + + /* + * Affinitize the loading thread to the central CPU, as: + * - That's where the BPF timer is first invoked in the BPF program. + * - We probably don't want this user space component to take up a core + * from a task that would benefit from avoiding preemption on one of + * the tickless cores. + * + * When BPF_F_TIMER_CPU_PIN is unavailable, it's not guaranteed that the + * timer will always be invoked on the central CPU. In practice, this + * suffices for the majority of the time. + */
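+ /* + * A dynamically sized cpuset (CPU_ALLOC) is used so that CPU IDs beyond + * CPU_SETSIZE are handled; the set is freed again once the affinity has + * been applied. + */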
+ cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids); + SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); + CPU_ZERO(cpuset); + CPU_SET(skel->rodata->central_cpu, cpuset); + SCX_BUG_ON(sched_setaffinity(0, CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset), + "Failed to affinitize to central CPU %d (max %d)", + skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1); + CPU_FREE(cpuset); + + link = SCX_OPS_ATTACH(skel, central_ops); + + if (!skel->data->timer_pinned) + printf("WARNING: BPF_F_TIMER_CPU_PIN not available, timer not pinned to central\n"); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + printf("[SEQ %llu]\n", seq++); + printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n", + skel->bss->nr_total, + skel->bss->nr_locals, + skel->bss->nr_queued, + skel->bss->nr_lost_pids); + printf("timer :%10" PRIu64 " dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n", + skel->bss->nr_timers, + skel->bss->nr_dispatches, + skel->bss->nr_mismatches, + skel->bss->nr_retries); + printf("overflow:%10" PRIu64 "\n", + skel->bss->nr_overflows); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + UEI_REPORT(skel, uei); + scx_central__destroy(skel); + return 0; +} diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c new file mode 100644 index 000000000..a9c2fa41a --- /dev/null +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -0,0 +1,939 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A demo sched_ext flattened cgroup hierarchy scheduler. It implements + * hierarchical weight-based cgroup CPU control by flattening the cgroup + * hierarchy into a single layer by compounding the active weight share at each + * level. Consider the following hierarchy with weights in parentheses: + * + * R + A (100) + B (100) + * | \ C (100) + * \ D (200) + * + * Ignoring the root and threaded cgroups, only B, C and D can contain tasks. + * Let's say all three have runnable tasks. The total share that each of these + * three cgroups is entitled to can be calculated by compounding its share at + * each level. + * + * For example, B is competing against C and in that competition its share is + * 100/(100+100) == 1/2. At its parent level, A is competing against D and A's + * share in that competition is 100/(200+100) == 1/3. B's eventual share in the + * system can be calculated by multiplying the two shares, 1/2 * 1/3 == 1/6. C's + * eventual share is the same at 1/6. D is only competing at the top level and + * its share is 200/(100+200) == 2/3. + * + * So, instead of hierarchically scheduling level-by-level, we can consider it + * as B, C and D competing with each other with respective shares of 1/6, 1/6 + * and 2/3 and keep updating the eventual shares as the cgroups' runnable states + * change. + * + * This flattening of hierarchy can bring a substantial performance gain when + * the cgroup hierarchy is nested multiple levels. In a simple benchmark using + * wrk[8] on apache serving a CGI script calculating sha1sum of a small file, it + * outperforms CFS by ~3% with CPU controller disabled and by ~10% with two + * apache instances competing with 2:1 weight ratio nested four levels deep. + * + * However, the gain comes at the cost of not being able to properly handle + * thundering herd of cgroups. For example, if many cgroups which are nested + * behind a low priority parent cgroup wake up around the same time, they may be + * able to consume more CPU cycles than they are entitled to. 
In many use cases, + * this isn't a real concern especially given the performance gain. Also, there + * are ways to mitigate the problem further by e.g. introducing an extra + * scheduling layer on cgroup delegation boundaries. + * + * The scheduler first picks the cgroup to run and then schedules the tasks + * within it using nested weighted vtime scheduling by default. The + * cgroup-internal scheduling can be switched to FIFO with the -f option. + */ +#include <scx/common.bpf.h> +#include "scx_flatcg.h" + +/* + * Maximum number of retries to find a valid cgroup. + */ +#define CGROUP_MAX_RETRIES 1024 + +char _license[] SEC("license") = "GPL"; + +const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */ +const volatile u64 cgrp_slice_ns = SCX_SLICE_DFL; +const volatile bool fifo_sched; + +u64 cvtime_now; +UEI_DEFINE(uei); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, FCG_NR_STATS); +} stats SEC(".maps"); + +static void stat_inc(enum fcg_stat_idx idx) +{ + u32 idx_v = idx; + + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v); + if (cnt_p) + (*cnt_p)++; +} + +struct fcg_cpu_ctx { + u64 cur_cgid; + u64 cur_at; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct fcg_cpu_ctx); + __uint(max_entries, 1); +} cpu_ctx SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct fcg_cgrp_ctx); +} cgrp_ctx SEC(".maps"); + +struct cgv_node { + struct bpf_rb_node rb_node; + __u64 cvtime; + __u64 cgid; + struct bpf_refcount refcount; +}; + +private(CGV_TREE) struct bpf_spin_lock cgv_tree_lock; +private(CGV_TREE) struct bpf_rb_root cgv_tree __contains(cgv_node, rb_node); + +struct cgv_node_stash { + struct cgv_node __kptr *node; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 16384); + __type(key, __u64); + __type(value, struct cgv_node_stash); +} cgv_node_stash SEC(".maps"); + +struct fcg_task_ctx { + u64 bypassed_at; +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct fcg_task_ctx); +} task_ctx SEC(".maps"); + +/* gets inc'd on weight tree changes to expire the cached hweights */ +u64 hweight_gen = 1; + +static u64 div_round_up(u64 dividend, u64 divisor) +{ + return (dividend + divisor - 1) / divisor; +} + +static bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +static bool cgv_node_less(struct bpf_rb_node *a, const struct bpf_rb_node *b) +{ + struct cgv_node *cgc_a, *cgc_b; + + cgc_a = container_of(a, struct cgv_node, rb_node); + cgc_b = container_of(b, struct cgv_node, rb_node); + + return cgc_a->cvtime < cgc_b->cvtime; +} + +static struct fcg_cpu_ctx *find_cpu_ctx(void) +{ + struct fcg_cpu_ctx *cpuc; + u32 idx = 0; + + cpuc = bpf_map_lookup_elem(&cpu_ctx, &idx); + if (!cpuc) { + scx_bpf_error("cpu_ctx lookup failed"); + return NULL; + } + return cpuc; +} + +static struct fcg_cgrp_ctx *find_cgrp_ctx(struct cgroup *cgrp) +{ + struct fcg_cgrp_ctx *cgc; + + cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); + if (!cgc) { + scx_bpf_error("cgrp_ctx lookup failed for cgid %llu", cgrp->kn->id); + return NULL; + } + return cgc; +} + +static struct fcg_cgrp_ctx *find_ancestor_cgrp_ctx(struct cgroup *cgrp, int level) +{ + struct fcg_cgrp_ctx *cgc; + + cgrp = bpf_cgroup_ancestor(cgrp, level); + if (!cgrp) { + scx_bpf_error("ancestor cgroup lookup failed"); + return NULL; + }
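+ + /* + * bpf_cgroup_ancestor() returned a referenced cgroup. Look up its + * per-cgroup context and drop the reference before returning. + */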
+ cgc = find_cgrp_ctx(cgrp); + if (!cgc) + scx_bpf_error("ancestor cgrp_ctx lookup failed"); + bpf_cgroup_release(cgrp); + return cgc; +} + +static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) +{ + int level; + + if (!cgc->nr_active) { + stat_inc(FCG_STAT_HWT_SKIP); + return; + } + + if (cgc->hweight_gen == hweight_gen) { + stat_inc(FCG_STAT_HWT_CACHE); + return; + } + + stat_inc(FCG_STAT_HWT_UPDATES); + bpf_for(level, 0, cgrp->level + 1) { + struct fcg_cgrp_ctx *cgc; + bool is_active; + + cgc = find_ancestor_cgrp_ctx(cgrp, level); + if (!cgc) + break; + + if (!level) { + cgc->hweight = FCG_HWEIGHT_ONE; + cgc->hweight_gen = hweight_gen; + } else { + struct fcg_cgrp_ctx *pcgc; + + pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); + if (!pcgc) + break; + + /* + * We can be opportunistic here and not grab the + * cgv_tree_lock and deal with the occasional races. + * However, hweight updates are already cached and + * relatively low-frequency. Let's just do the + * straightforward thing. + */ + bpf_spin_lock(&cgv_tree_lock); + is_active = cgc->nr_active; + if (is_active) { + cgc->hweight_gen = pcgc->hweight_gen; + cgc->hweight = + div_round_up(pcgc->hweight * cgc->weight, + pcgc->child_weight_sum); + } + bpf_spin_unlock(&cgv_tree_lock); + + if (!is_active) { + stat_inc(FCG_STAT_HWT_RACE); + break; + } + } + } +} + +static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc) +{ + u64 delta, cvtime, max_budget; + + /* + * A node which is on the rbtree can't be pointed to from elsewhere yet + * and thus can't be updated and repositioned. Instead, we collect the + * vtime deltas separately and apply it asynchronously here. + */ + delta = cgc->cvtime_delta; + __sync_fetch_and_sub(&cgc->cvtime_delta, delta); + cvtime = cgv_node->cvtime + delta; + + /* + * Allow a cgroup to carry the maximum budget proportional to its + * hweight such that a full-hweight cgroup can immediately take up half + * of the CPUs at the most while staying at the front of the rbtree. + */ + max_budget = (cgrp_slice_ns * nr_cpus * cgc->hweight) / + (2 * FCG_HWEIGHT_ONE); + if (vtime_before(cvtime, cvtime_now - max_budget)) + cvtime = cvtime_now - max_budget; + + cgv_node->cvtime = cvtime; +} + +static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) +{ + struct cgv_node_stash *stash; + struct cgv_node *cgv_node; + u64 cgid = cgrp->kn->id; + + /* paired with cmpxchg in try_pick_next_cgroup() */ + if (__sync_val_compare_and_swap(&cgc->queued, 0, 1)) { + stat_inc(FCG_STAT_ENQ_SKIP); + return; + } + + stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); + if (!stash || !stash->node) { + scx_bpf_error("cgv_node lookup failed for cgid %llu", cgid); + return; + } + + cgv_node = bpf_refcount_acquire(stash->node); + if (!cgv_node) { + /* + * Node never leaves cgv_node_stash, this should only happen if + * fcg_cgroup_exit deletes the stashed node + */ + stat_inc(FCG_STAT_ENQ_RACE); + return; + } + + bpf_spin_lock(&cgv_tree_lock); + cgrp_cap_budget(cgv_node, cgc); + bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); + bpf_spin_unlock(&cgv_tree_lock); +} + +static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc) +{ + /* + * Tell fcg_stopping() that this bypassed the regular scheduling path + * and should be force charged to the cgroup. 0 is used to indicate that + * the task isn't bypassing, so if the current runtime is 0, go back by + * one nanosecond. 
+ */ + taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1; +} + +s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + struct fcg_task_ctx *taskc; + bool is_idle = false; + s32 cpu; + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); + + taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); + if (!taskc) { + scx_bpf_error("task_ctx lookup failed"); + return cpu; + } + + /* + * If select_cpu_dfl() is recommending local enqueue, the target CPU is + * idle. Follow it and charge the cgroup later in fcg_stopping() after + * the fact. + */ + if (is_idle) { + set_bypassed_at(p, taskc); + stat_inc(FCG_STAT_LOCAL); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + } + + return cpu; +} + +void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct fcg_task_ctx *taskc; + struct cgroup *cgrp; + struct fcg_cgrp_ctx *cgc; + + taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); + if (!taskc) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + /* + * Use direct dispatching and force charging to deal with tasks with + * custom affinities so that we don't have to worry about per-cgroup + * dq's containing tasks that can't be executed from some CPUs. + */ + if (p->nr_cpus_allowed != nr_cpus) { + set_bypassed_at(p, taskc); + + /* + * The global dq is deprioritized as we don't want to let tasks + * boost themselves by constraining their cpumask. The + * deprioritization is rather severe, so let's not apply that to + * per-cpu kernel threads. This is ham-fisted. We probably want + * to implement per-cgroup fallback dq's instead so that we have + * more control over when tasks with custom cpumasks get issued. + */ + if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) { + stat_inc(FCG_STAT_LOCAL); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); + } else { + stat_inc(FCG_STAT_GLOBAL); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } + return; + } + + cgrp = scx_bpf_task_cgroup(p); + cgc = find_cgrp_ctx(cgrp); + if (!cgc) + goto out_release; + + if (fifo_sched) { + scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags); + } else { + u64 tvtime = p->scx.dsq_vtime; + + /* + * Limit the amount of budget that an idling task can accumulate + * to one slice. + */ + if (vtime_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL)) + tvtime = cgc->tvtime_now - SCX_SLICE_DFL; + + scx_bpf_dispatch_vtime(p, cgrp->kn->id, SCX_SLICE_DFL, + tvtime, enq_flags); + } + + cgrp_enqueued(cgrp, cgc); +out_release: + bpf_cgroup_release(cgrp); +} + +/* + * Walk the cgroup tree to update the active weight sums as tasks wake up and + * sleep. The weight sums are used as the base when calculating the proportion a + * given cgroup or task is entitled to at each level. + */ +static void update_active_weight_sums(struct cgroup *cgrp, bool runnable) +{ + struct fcg_cgrp_ctx *cgc; + bool updated = false; + int idx; + + cgc = find_cgrp_ctx(cgrp); + if (!cgc) + return; + + /* + * In most cases, a hot cgroup would have multiple threads going to + * sleep and waking up while the whole cgroup stays active. In leaf + * cgroups, ->nr_runnable which is updated with __sync operations gates + * ->nr_active updates, so that we don't have to grab the cgv_tree_lock + * repeatedly for a busy cgroup which is staying active. 
+ */ + if (runnable) { + if (__sync_fetch_and_add(&cgc->nr_runnable, 1)) + return; + stat_inc(FCG_STAT_ACT); + } else { + if (__sync_sub_and_fetch(&cgc->nr_runnable, 1)) + return; + stat_inc(FCG_STAT_DEACT); + } + + /* + * If @cgrp is becoming runnable, its hweight should be refreshed after + * it's added to the weight tree so that enqueue has the up-to-date + * value. If @cgrp is becoming quiescent, the hweight should be + * refreshed before it's removed from the weight tree so that the usage + * charging which happens afterwards has access to the latest value. + */ + if (!runnable) + cgrp_refresh_hweight(cgrp, cgc); + + /* propagate upwards */ + bpf_for(idx, 0, cgrp->level) { + int level = cgrp->level - idx; + struct fcg_cgrp_ctx *cgc, *pcgc = NULL; + bool propagate = false; + + cgc = find_ancestor_cgrp_ctx(cgrp, level); + if (!cgc) + break; + if (level) { + pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); + if (!pcgc) + break; + } + + /* + * We need the propagation protected by a lock to synchronize + * against weight changes. There's no reason to drop the lock at + * each level but bpf_spin_lock() doesn't want any function + * calls while locked. + */ + bpf_spin_lock(&cgv_tree_lock); + + if (runnable) { + if (!cgc->nr_active++) { + updated = true; + if (pcgc) { + propagate = true; + pcgc->child_weight_sum += cgc->weight; + } + } + } else { + if (!--cgc->nr_active) { + updated = true; + if (pcgc) { + propagate = true; + pcgc->child_weight_sum -= cgc->weight; + } + } + } + + bpf_spin_unlock(&cgv_tree_lock); + + if (!propagate) + break; + } + + if (updated) + __sync_fetch_and_add(&hweight_gen, 1); + + if (runnable) + cgrp_refresh_hweight(cgrp, cgc); +} + +void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags) +{ + struct cgroup *cgrp; + + cgrp = scx_bpf_task_cgroup(p); + update_active_weight_sums(cgrp, true); + bpf_cgroup_release(cgrp); +} + +void BPF_STRUCT_OPS(fcg_running, struct task_struct *p) +{ + struct cgroup *cgrp; + struct fcg_cgrp_ctx *cgc; + + if (fifo_sched) + return; + + cgrp = scx_bpf_task_cgroup(p); + cgc = find_cgrp_ctx(cgrp); + if (cgc) { + /* + * @cgc->tvtime_now always progresses forward as tasks start + * executing. The test and update can be performed concurrently + * from multiple CPUs and thus racy. Any error should be + * contained and temporary. Let's just live with it. + */ + if (vtime_before(cgc->tvtime_now, p->scx.dsq_vtime)) + cgc->tvtime_now = p->scx.dsq_vtime; + } + bpf_cgroup_release(cgrp); +} + +void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable) +{ + struct fcg_task_ctx *taskc; + struct cgroup *cgrp; + struct fcg_cgrp_ctx *cgc; + + /* + * Scale the execution time by the inverse of the weight and charge. + * + * Note that the default yield implementation yields by setting + * @p->scx.slice to zero and the following would treat the yielding task + * as if it has consumed all its slice. If this penalizes yielding tasks + * too much, determine the execution time by taking explicit timestamps + * instead of depending on @p->scx.slice. 
+ */ + if (!fifo_sched) + p->scx.dsq_vtime += + (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; + + taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); + if (!taskc) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + if (!taskc->bypassed_at) + return; + + cgrp = scx_bpf_task_cgroup(p); + cgc = find_cgrp_ctx(cgrp); + if (cgc) { + __sync_fetch_and_add(&cgc->cvtime_delta, + p->se.sum_exec_runtime - taskc->bypassed_at); + taskc->bypassed_at = 0; + } + bpf_cgroup_release(cgrp); +} + +void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags) +{ + struct cgroup *cgrp; + + cgrp = scx_bpf_task_cgroup(p); + update_active_weight_sums(cgrp, false); + bpf_cgroup_release(cgrp); +} + +void BPF_STRUCT_OPS(fcg_cgroup_set_weight, struct cgroup *cgrp, u32 weight) +{ + struct fcg_cgrp_ctx *cgc, *pcgc = NULL; + + cgc = find_cgrp_ctx(cgrp); + if (!cgc) + return; + + if (cgrp->level) { + pcgc = find_ancestor_cgrp_ctx(cgrp, cgrp->level - 1); + if (!pcgc) + return; + } + + bpf_spin_lock(&cgv_tree_lock); + if (pcgc && cgc->nr_active) + pcgc->child_weight_sum += (s64)weight - cgc->weight; + cgc->weight = weight; + bpf_spin_unlock(&cgv_tree_lock); +} + +static bool try_pick_next_cgroup(u64 *cgidp) +{ + struct bpf_rb_node *rb_node; + struct cgv_node *cgv_node; + struct fcg_cgrp_ctx *cgc; + struct cgroup *cgrp; + u64 cgid; + + /* pop the front cgroup and wind cvtime_now accordingly */ + bpf_spin_lock(&cgv_tree_lock); + + rb_node = bpf_rbtree_first(&cgv_tree); + if (!rb_node) { + bpf_spin_unlock(&cgv_tree_lock); + stat_inc(FCG_STAT_PNC_NO_CGRP); + *cgidp = 0; + return true; + } + + rb_node = bpf_rbtree_remove(&cgv_tree, rb_node); + bpf_spin_unlock(&cgv_tree_lock); + + if (!rb_node) { + /* + * This should never happen. bpf_rbtree_first() was called + * above while the tree lock was held, so the node should + * always be present. + */ + scx_bpf_error("node could not be removed"); + return true; + } + + cgv_node = container_of(rb_node, struct cgv_node, rb_node); + cgid = cgv_node->cgid; + + if (vtime_before(cvtime_now, cgv_node->cvtime)) + cvtime_now = cgv_node->cvtime; + + /* + * If lookup fails, the cgroup's gone. Free and move on. See + * fcg_cgroup_exit(). + */ + cgrp = bpf_cgroup_from_id(cgid); + if (!cgrp) { + stat_inc(FCG_STAT_PNC_GONE); + goto out_free; + } + + cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); + if (!cgc) { + bpf_cgroup_release(cgrp); + stat_inc(FCG_STAT_PNC_GONE); + goto out_free; + } + + if (!scx_bpf_consume(cgid)) { + bpf_cgroup_release(cgrp); + stat_inc(FCG_STAT_PNC_EMPTY); + goto out_stash; + } + + /* + * Successfully consumed from the cgroup. This will be our current + * cgroup for the new slice. Refresh its hweight. + */ + cgrp_refresh_hweight(cgrp, cgc); + + bpf_cgroup_release(cgrp); + + /* + * As the cgroup may have more tasks, add it back to the rbtree. Note + * that here we charge the full slice upfront and then exact later + * according to the actual consumption. This prevents lowpri thundering + * herd from saturating the machine. + */ + bpf_spin_lock(&cgv_tree_lock); + cgv_node->cvtime += cgrp_slice_ns * FCG_HWEIGHT_ONE / (cgc->hweight ?: 1); + cgrp_cap_budget(cgv_node, cgc); + bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); + bpf_spin_unlock(&cgv_tree_lock); + + *cgidp = cgid; + stat_inc(FCG_STAT_PNC_NEXT); + return true; + +out_stash: + /* + * Paired with cmpxchg in cgrp_enqueued(). If they see the following + * transition, they'll enqueue the cgroup. If they are earlier, we'll + * see their task in the dq below and requeue the cgroup. 
+ */ + __sync_val_compare_and_swap(&cgc->queued, 1, 0); + + if (scx_bpf_dsq_nr_queued(cgid)) { + bpf_spin_lock(&cgv_tree_lock); + bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); + bpf_spin_unlock(&cgv_tree_lock); + stat_inc(FCG_STAT_PNC_RACE); + return false; + } + +out_free: + bpf_obj_drop(cgv_node); + return false; +} + +void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) +{ + struct fcg_cpu_ctx *cpuc; + struct fcg_cgrp_ctx *cgc; + struct cgroup *cgrp; + u64 now = bpf_ktime_get_ns(); + bool picked_next = false; + + cpuc = find_cpu_ctx(); + if (!cpuc) + return; + + if (!cpuc->cur_cgid) + goto pick_next_cgroup; + + if (vtime_before(now, cpuc->cur_at + cgrp_slice_ns)) { + if (scx_bpf_consume(cpuc->cur_cgid)) { + stat_inc(FCG_STAT_CNS_KEEP); + return; + } + stat_inc(FCG_STAT_CNS_EMPTY); + } else { + stat_inc(FCG_STAT_CNS_EXPIRE); + } + + /* + * The current cgroup is expiring. It was already charged a full slice. + * Calculate the actual usage and accumulate the delta. + */ + cgrp = bpf_cgroup_from_id(cpuc->cur_cgid); + if (!cgrp) { + stat_inc(FCG_STAT_CNS_GONE); + goto pick_next_cgroup; + } + + cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); + if (cgc) { + /* + * We want to update the vtime delta and then look for the next + * cgroup to execute but the latter needs to be done in a loop + * and we can't keep the lock held. Oh well... + */ + bpf_spin_lock(&cgv_tree_lock); + __sync_fetch_and_add(&cgc->cvtime_delta, + (cpuc->cur_at + cgrp_slice_ns - now) * + FCG_HWEIGHT_ONE / (cgc->hweight ?: 1)); + bpf_spin_unlock(&cgv_tree_lock); + } else { + stat_inc(FCG_STAT_CNS_GONE); + } + + bpf_cgroup_release(cgrp); + +pick_next_cgroup: + cpuc->cur_at = now; + + if (scx_bpf_consume(SCX_DSQ_GLOBAL)) { + cpuc->cur_cgid = 0; + return; + } + + bpf_repeat(CGROUP_MAX_RETRIES) { + if (try_pick_next_cgroup(&cpuc->cur_cgid)) { + picked_next = true; + break; + } + } + + /* + * This only happens if try_pick_next_cgroup() races against enqueue + * path for more than CGROUP_MAX_RETRIES times, which is extremely + * unlikely and likely indicates an underlying bug. There shouldn't be + * any stall risk as the race is against enqueue. + */ + if (!picked_next) + stat_inc(FCG_STAT_PNC_FAIL); +} + +s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + struct fcg_task_ctx *taskc; + struct fcg_cgrp_ctx *cgc; + + /* + * @p is new. Let's ensure that its task_ctx is available. We can sleep + * in this function and the following will automatically use GFP_KERNEL. + */ + taskc = bpf_task_storage_get(&task_ctx, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!taskc) + return -ENOMEM; + + taskc->bypassed_at = 0; + + if (!(cgc = find_cgrp_ctx(args->cgroup))) + return -ENOENT; + + p->scx.dsq_vtime = cgc->tvtime_now; + + return 0; +} + +int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp, + struct scx_cgroup_init_args *args) +{ + struct fcg_cgrp_ctx *cgc; + struct cgv_node *cgv_node; + struct cgv_node_stash empty_stash = {}, *stash; + u64 cgid = cgrp->kn->id; + int ret; + + /* + * Technically incorrect as cgroup ID is full 64bit while dq ID is + * 63bit. Should not be a problem in practice and easy to spot in the + * unlikely case that it breaks. 
+ */ + ret = scx_bpf_create_dsq(cgid, -1); + if (ret) + return ret; + + cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!cgc) { + ret = -ENOMEM; + goto err_destroy_dsq; + } + + cgc->weight = args->weight; + cgc->hweight = FCG_HWEIGHT_ONE; + + ret = bpf_map_update_elem(&cgv_node_stash, &cgid, &empty_stash, + BPF_NOEXIST); + if (ret) { + if (ret != -ENOMEM) + scx_bpf_error("unexpected stash creation error (%d)", + ret); + goto err_destroy_dsq; + } + + stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); + if (!stash) { + scx_bpf_error("unexpected cgv_node stash lookup failure"); + ret = -ENOENT; + goto err_destroy_dsq; + } + + cgv_node = bpf_obj_new(struct cgv_node); + if (!cgv_node) { + ret = -ENOMEM; + goto err_del_cgv_node; + } + + cgv_node->cgid = cgid; + cgv_node->cvtime = cvtime_now; + + cgv_node = bpf_kptr_xchg(&stash->node, cgv_node); + if (cgv_node) { + scx_bpf_error("unexpected !NULL cgv_node stash"); + ret = -EBUSY; + goto err_drop; + } + + return 0; + +err_drop: + bpf_obj_drop(cgv_node); +err_del_cgv_node: + bpf_map_delete_elem(&cgv_node_stash, &cgid); +err_destroy_dsq: + scx_bpf_destroy_dsq(cgid); + return ret; +} + +void BPF_STRUCT_OPS(fcg_cgroup_exit, struct cgroup *cgrp) +{ + u64 cgid = cgrp->kn->id; + + /* + * For now, there's no way to find and remove the cgv_node if it's on + * the cgv_tree. Let's drain them in the dispatch path as they get + * popped off the front of the tree. + */ + bpf_map_delete_elem(&cgv_node_stash, &cgid); + scx_bpf_destroy_dsq(cgid); +} + +void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{ + struct fcg_cgrp_ctx *from_cgc, *to_cgc; + s64 vtime_delta; + + /* find_cgrp_ctx() triggers scx_ops_error() on lookup failures */ + if (!(from_cgc = find_cgrp_ctx(from)) || !(to_cgc = find_cgrp_ctx(to))) + return; + + vtime_delta = p->scx.dsq_vtime - from_cgc->tvtime_now; + p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta; +} + +void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(flatcg_ops, + .select_cpu = (void *)fcg_select_cpu, + .enqueue = (void *)fcg_enqueue, + .dispatch = (void *)fcg_dispatch, + .runnable = (void *)fcg_runnable, + .running = (void *)fcg_running, + .stopping = (void *)fcg_stopping, + .quiescent = (void *)fcg_quiescent, + .init_task = (void *)fcg_init_task, + .cgroup_set_weight = (void *)fcg_cgroup_set_weight, + .cgroup_init = (void *)fcg_cgroup_init, + .cgroup_exit = (void *)fcg_cgroup_exit, + .cgroup_move = (void *)fcg_cgroup_move, + .exit = (void *)fcg_exit, + .flags = SCX_OPS_CGROUP_KNOB_WEIGHT | SCX_OPS_ENQ_EXITING, + .name = "flatcg"); diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c new file mode 100644 index 000000000..b6f0184f1 --- /dev/null +++ b/tools/sched_ext/scx_flatcg.c @@ -0,0 +1,219 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 Tejun Heo + * Copyright (c) 2023 David Vernet + */ +#include <stdio.h> +#include <signal.h> +#include <unistd.h> +#include <libgen.h> +#include <limits.h> +#include <inttypes.h> +#include <fcntl.h> +#include <time.h> +#include <bpf/bpf.h> +#include <scx/common.h> +#include "scx_flatcg.h" +#include "scx_flatcg.bpf.skel.h" + +#ifndef FILEID_KERNFS +#define FILEID_KERNFS 0xfe +#endif + +const char help_fmt[] = +"A flattened cgroup hierarchy sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f]\n" +"\n" +" -s SLICE_US Override slice duration\n" +" -i INTERVAL Report interval\n" +" -f Use FIFO scheduling instead of weighted vtime scheduling\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +static float read_cpu_util(__u64 *last_sum, __u64 *last_idle) +{ + FILE *fp; + char buf[4096]; + char *line, *cur = NULL, *tok; + __u64 sum = 0, idle = 0; + __u64 delta_sum, delta_idle; + int idx; + + fp = fopen("/proc/stat", "r"); + if (!fp) { + perror("fopen(\"/proc/stat\")"); + return 0.0; + } + + if (!fgets(buf, sizeof(buf), fp)) { + perror("fgets(\"/proc/stat\")"); + fclose(fp); + return 0.0; + } + fclose(fp); + + line = buf; + for (idx = 0; (tok = strtok_r(line, " \n", &cur)); idx++) { + char *endp = NULL; + __u64 v; + + if (idx == 0) { + line = NULL; + continue; + } + v = strtoull(tok, &endp, 0); + if (!endp || *endp != '\0') { + fprintf(stderr, "failed to parse %dth field of /proc/stat (\"%s\")\n", + idx, tok); + continue; + } + sum += v; + if (idx == 4) + idle = v; + } + + delta_sum = sum - *last_sum; + delta_idle = idle - *last_idle; + *last_sum = sum; + *last_idle = idle; + + return delta_sum ? (float)(delta_sum - delta_idle) / delta_sum : 0.0; +} + +static void fcg_read_stats(struct scx_flatcg *skel, __u64 *stats) +{ + __u64 cnts[FCG_NR_STATS][skel->rodata->nr_cpus]; + __u32 idx; + + memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS); + + for (idx = 0; idx < FCG_NR_STATS; idx++) { + int ret, cpu; + + ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), + &idx, cnts[idx]); + if (ret < 0) + continue; + for (cpu = 0; cpu < skel->rodata->nr_cpus; cpu++) + stats[idx] += cnts[idx][cpu]; + } +} + +int main(int argc, char **argv) +{ + struct scx_flatcg *skel; + struct bpf_link *link; + struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 }; + bool dump_cgrps = false; + __u64 last_cpu_sum = 0, last_cpu_idle = 0; + __u64 last_stats[FCG_NR_STATS] = {}; + unsigned long seq = 0; + __s32 opt; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = scx_flatcg__open(); + SCX_BUG_ON(!skel, "Failed to open skel"); + + skel->rodata->nr_cpus = libbpf_num_possible_cpus(); + + while ((opt = getopt(argc, argv, "s:i:dfph")) != -1) { + double v; + + switch (opt) { + case 's': + v = strtod(optarg, NULL); + skel->rodata->cgrp_slice_ns = v * 1000; + break; + case 'i': + v = strtod(optarg, NULL); + intv_ts.tv_sec = v; + intv_ts.tv_nsec = (v - (float)intv_ts.tv_sec) * 1000000000; + break; + case 'd': + dump_cgrps = true; + break; + case 'f': + skel->rodata->fifo_sched = true; + break; + case 'h': + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + printf("slice=%.1lfms intv=%.1lfs dump_cgrps=%d", + (double)skel->rodata->cgrp_slice_ns / 1000000.0, + (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0, + dump_cgrps); + + SCX_OPS_LOAD(skel, flatcg_ops, scx_flatcg, uei); + link = SCX_OPS_ATTACH(skel, flatcg_ops); + +
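+ /* + * Report loop: snapshot the accumulated BPF-side counters every + * interval and print the delta from the previous snapshot until the + * scheduler exits. + */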
+ while (!exit_req && !UEI_EXITED(skel, uei)) { + __u64 acc_stats[FCG_NR_STATS]; + __u64 stats[FCG_NR_STATS]; + float cpu_util; + int i; + + cpu_util = read_cpu_util(&last_cpu_sum, &last_cpu_idle); + + fcg_read_stats(skel, acc_stats); + for (i = 0; i < FCG_NR_STATS; i++) + stats[i] = acc_stats[i] - last_stats[i]; + + memcpy(last_stats, acc_stats, sizeof(acc_stats)); + + printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n", + seq++, cpu_util * 100.0, skel->data->hweight_gen); + printf(" act:%6llu deact:%6llu global:%6llu local:%6llu\n", + stats[FCG_STAT_ACT], + stats[FCG_STAT_DEACT], + stats[FCG_STAT_GLOBAL], + stats[FCG_STAT_LOCAL]); + printf("HWT cache:%6llu update:%6llu skip:%6llu race:%6llu\n", + stats[FCG_STAT_HWT_CACHE], + stats[FCG_STAT_HWT_UPDATES], + stats[FCG_STAT_HWT_SKIP], + stats[FCG_STAT_HWT_RACE]); + printf("ENQ skip:%6llu race:%6llu\n", + stats[FCG_STAT_ENQ_SKIP], + stats[FCG_STAT_ENQ_RACE]); + printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n", + stats[FCG_STAT_CNS_KEEP], + stats[FCG_STAT_CNS_EXPIRE], + stats[FCG_STAT_CNS_EMPTY], + stats[FCG_STAT_CNS_GONE]); + printf("PNC next:%6llu empty:%6llu nocgrp:%6llu gone:%6llu race:%6llu fail:%6llu\n", + stats[FCG_STAT_PNC_NEXT], + stats[FCG_STAT_PNC_EMPTY], + stats[FCG_STAT_PNC_NO_CGRP], + stats[FCG_STAT_PNC_GONE], + stats[FCG_STAT_PNC_RACE], + stats[FCG_STAT_PNC_FAIL]); + printf("BAD remove:%6llu\n", + acc_stats[FCG_STAT_BAD_REMOVAL]); + fflush(stdout); + + nanosleep(&intv_ts, NULL); + } + + bpf_link__destroy(link); + UEI_REPORT(skel, uei); + scx_flatcg__destroy(skel); + return 0; +} diff --git a/tools/sched_ext/scx_flatcg.h b/tools/sched_ext/scx_flatcg.h new file mode 100644 index 000000000..6f2ea50ac --- /dev/null +++ b/tools/sched_ext/scx_flatcg.h @@ -0,0 +1,51 @@ +#ifndef __SCX_EXAMPLE_FLATCG_H +#define __SCX_EXAMPLE_FLATCG_H + +enum { + FCG_HWEIGHT_ONE = 1LLU << 16, +}; + +enum fcg_stat_idx { + FCG_STAT_ACT, + FCG_STAT_DEACT, + FCG_STAT_LOCAL, + FCG_STAT_GLOBAL, + + FCG_STAT_HWT_UPDATES, + FCG_STAT_HWT_CACHE, + FCG_STAT_HWT_SKIP, + FCG_STAT_HWT_RACE, + + FCG_STAT_ENQ_SKIP, + FCG_STAT_ENQ_RACE, + + FCG_STAT_CNS_KEEP, + FCG_STAT_CNS_EXPIRE, + FCG_STAT_CNS_EMPTY, + FCG_STAT_CNS_GONE, + + FCG_STAT_PNC_NO_CGRP, + FCG_STAT_PNC_NEXT, + FCG_STAT_PNC_EMPTY, + FCG_STAT_PNC_GONE, + FCG_STAT_PNC_RACE, + FCG_STAT_PNC_FAIL, + + FCG_STAT_BAD_REMOVAL, + + FCG_NR_STATS, +}; + +struct fcg_cgrp_ctx { + u32 nr_active; + u32 nr_runnable; + u32 queued; + u32 weight; + u32 hweight; + u64 child_weight_sum; + u64 hweight_gen; + s64 cvtime_delta; + u64 tvtime_now; +}; + +#endif /* __SCX_EXAMPLE_FLATCG_H */ diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c new file mode 100644 index 000000000..dbca79a4d --- /dev/null +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -0,0 +1,587 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A simple five-level FIFO queue scheduler. + * + * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets + * assigned to one depending on its compound weight. Each CPU round robins + * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from + * queue0, 2 from queue1, 4 from queue2 and so on. + * + * This scheduler demonstrates: + * + * - BPF-side queueing using PIDs. + * - Sleepable per-task storage allocation using ops.init_task(). + * - Using ops.cpu_release() to handle a higher priority scheduling class taking + * the CPU away. + * - Core-sched support. 
+ * + * This scheduler is primarily for demonstration and testing of sched_ext + * features and is unlikely to be useful for actual workloads. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include <scx/common.bpf.h> + +#define ONE_SEC_IN_NS 1000000000 + +char _license[] SEC("license") = "GPL"; + +const volatile u64 slice_ns = SCX_SLICE_DFL; +const volatile u32 stall_user_nth; +const volatile u32 stall_kernel_nth; +const volatile u32 dsp_inf_loop_after; +const volatile s32 disallow_tgid; +const volatile bool switch_partial; + +u32 test_error_cnt; + +UEI_DEFINE(uei); + +struct qmap { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, 4096); + __type(value, u32); +} queue0 SEC(".maps"), + queue1 SEC(".maps"), + queue2 SEC(".maps"), + queue3 SEC(".maps"), + queue4 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 5); + __type(key, int); + __array(values, struct qmap); +} queue_arr SEC(".maps") = { + .values = { + [0] = &queue0, + [1] = &queue1, + [2] = &queue2, + [3] = &queue3, + [4] = &queue4, + }, +}; + +/* + * If enabled, the CPU performance target is set according to the queue index + * using the following table. + */ +static const u32 qidx_to_cpuperf_target[] = { + [0] = SCX_CPUPERF_ONE * 0 / 4, + [1] = SCX_CPUPERF_ONE * 1 / 4, + [2] = SCX_CPUPERF_ONE * 2 / 4, + [3] = SCX_CPUPERF_ONE * 3 / 4, + [4] = SCX_CPUPERF_ONE * 4 / 4, +}; + +/* + * Per-queue sequence numbers to implement core-sched ordering. + * + * Tail seq is assigned to each queued task and incremented. Head seq tracks the + * sequence number of the latest dispatched task. The distance between a task's + * seq and the associated queue's head seq is called the queue distance and is + * used when comparing two tasks for ordering. See qmap_core_sched_before(). + */ +static u64 core_sched_head_seqs[5]; +static u64 core_sched_tail_seqs[5]; + +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* Dispatch directly to local_dsq */ + u64 core_sched_seq; +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +struct cpu_ctx { + u64 dsp_idx; /* dispatch index */ + u64 dsp_cnt; /* remaining count */ + u32 avg_weight; + u32 cpuperf_target; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct cpu_ctx); +} cpu_ctx_stor SEC(".maps"); + +/* Statistics */ +u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; +u64 nr_core_sched_execed; +u32 cpuperf_min, cpuperf_avg, cpuperf_max; +u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; + +s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + struct task_ctx *tctx; + s32 cpu; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return -ESRCH; + } + + if (p->nr_cpus_allowed == 1 || + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + tctx->force_local = true; + return prev_cpu; + } + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + return cpu; + + return prev_cpu; +} + +static int weight_to_idx(u32 weight) +{ + /* Coarsely map the compound weight to a FIFO. 
*/ + if (weight <= 25) + return 0; + else if (weight <= 50) + return 1; + else if (weight < 200) + return 2; + else if (weight < 400) + return 3; + else + return 4; +} + +void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) +{ + static u32 user_cnt, kernel_cnt; + struct task_ctx *tctx; + u32 pid = p->pid; + int idx = weight_to_idx(p->scx.weight); + void *ring; + + if (p->flags & PF_KTHREAD) { + if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) + return; + } else { + if (stall_user_nth && !(++user_cnt % stall_user_nth)) + return; + } + + if (test_error_cnt && !--test_error_cnt) + scx_bpf_error("test triggering error"); + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + /* + * All enqueued tasks must have their core_sched_seq updated for correct + * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in + * qmap_ops.flags. + */ + tctx->core_sched_seq = core_sched_tail_seqs[idx]++; + + /* + * If qmap_select_cpu() is telling us to or this is the last runnable + * task on the CPU, enqueue locally. + */ + if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) { + tctx->force_local = false; + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); + return; + } + + /* + * If the task was re-enqueued due to the CPU being preempted by a + * higher priority scheduling class, just re-enqueue the task directly + * on the global DSQ. As we want another CPU to pick it up, find and + * kick an idle CPU. + */ + if (enq_flags & SCX_ENQ_REENQ) { + s32 cpu; + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, 0, enq_flags); + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + scx_bpf_kick_cpu(cpu, __COMPAT_SCX_KICK_IDLE); + return; + } + + ring = bpf_map_lookup_elem(&queue_arr, &idx); + if (!ring) { + scx_bpf_error("failed to find ring %d", idx); + return; + } + + /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */ + if (bpf_map_push_elem(ring, &pid, 0)) { + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, enq_flags); + return; + } + + __sync_fetch_and_add(&nr_enqueued, 1); +} + +/* + * The BPF queue map doesn't support removal and sched_ext can handle spurious + * dispatches. qmap_dequeue() is only used to collect statistics. + */ +void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) +{ + __sync_fetch_and_add(&nr_dequeued, 1); + if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC) + __sync_fetch_and_add(&nr_core_sched_execed, 1); +} + +static void update_core_sched_head_seq(struct task_struct *p) +{ + struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + int idx = weight_to_idx(p->scx.weight); + + if (tctx) + core_sched_head_seqs[idx] = tctx->core_sched_seq; + else + scx_bpf_error("task_ctx lookup failed"); +} + +void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) +{ + struct cpu_ctx *cpuc; + u32 zero = 0; + void *fifo; + s32 i, pid; + + if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { + struct task_struct *p; + + /* + * PID 2 should be kthreadd which should mostly be idle and off + * the scheduler. Let's keep dispatching it to force the kernel + * to call this function over and over again. 
+ */ + p = bpf_task_from_pid(2); + if (p) { + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, 0); + bpf_task_release(p); + return; + } + } + + if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { + scx_bpf_error("failed to look up cpu_ctx"); + return; + } + + for (i = 0; i < 5; i++) { + /* Advance the dispatch cursor and pick the fifo. */ + if (!cpuc->dsp_cnt) { + cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5; + cpuc->dsp_cnt = 1 << cpuc->dsp_idx; + } + cpuc->dsp_cnt--; + + fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx); + if (!fifo) { + scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx); + return; + } + + /* Dispatch or advance. */ + if (!bpf_map_pop_elem(fifo, &pid)) { + struct task_struct *p; + + p = bpf_task_from_pid(pid); + if (p) { + update_core_sched_head_seq(p); + __sync_fetch_and_add(&nr_dispatched, 1); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, 0); + bpf_task_release(p); + return; + } + } + + cpuc->dsp_cnt = 0; + } +} + +void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p) +{ + struct cpu_ctx *cpuc; + u32 zero = 0; + int idx; + + if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { + scx_bpf_error("failed to look up cpu_ctx"); + return; + } + + /* + * Use the running avg of weights to select the target cpuperf level. + * This is a demonstration of the cpuperf feature rather than a + * practical strategy to regulate CPU frequency. + */ + cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4; + idx = weight_to_idx(cpuc->avg_weight); + cpuc->cpuperf_target = qidx_to_cpuperf_target[idx]; + + scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target); +} + +/* + * The distance from the head of the queue scaled by the weight of the queue. + * The lower the number, the older the task and the higher the priority. + */ +static s64 task_qdist(struct task_struct *p) +{ + int idx = weight_to_idx(p->scx.weight); + struct task_ctx *tctx; + s64 qdist; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return 0; + } + + qdist = tctx->core_sched_seq - core_sched_head_seqs[idx]; + + /* + * As queue index increments, the priority doubles. The queue w/ index 3 + * is dispatched twice more frequently than 2. Reflect the difference by + * scaling qdists accordingly. Note that the shift amount needs to be + * flipped depending on the sign to avoid flipping priority direction. + */ + if (qdist >= 0) + return qdist << (4 - idx); + else + return qdist << idx; +} + +/* + * This is called to determine the task ordering when core-sched is picking + * tasks to execute on SMT siblings and should encode about the same ordering as + * the regular scheduling path. Use the priority-scaled distances from the head + * of the queues to compare the two tasks which should be consistent with the + * dispatch path behavior. + */ +bool BPF_STRUCT_OPS(qmap_core_sched_before, + struct task_struct *a, struct task_struct *b) +{ + return task_qdist(a) > task_qdist(b); +} + +void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) +{ + u32 cnt; + + /* + * Called when @cpu is taken by a higher priority scheduling class. This + * makes @cpu no longer available for executing sched_ext tasks. As we + * don't want the tasks in @cpu's local dsq to sit there until @cpu + * becomes available again, re-enqueue them into the global dsq. See + * %SCX_ENQ_REENQ handling in qmap_enqueue(). 
+ */ + cnt = scx_bpf_reenqueue_local(); + if (cnt) + __sync_fetch_and_add(&nr_reenqueued, cnt); +} + +s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + if (p->tgid == disallow_tgid) + p->scx.disallow = true; + + /* + * @p is new. Let's ensure that its task_ctx is available. We can sleep + * in this function and the following will automatically use GFP_KERNEL. + */ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +/* + * Print out the online and possible CPU map using bpf_printk() as a + * demonstration of using the cpumask kfuncs and ops.cpu_on/offline(). + */ +static void print_cpus(void) +{ + const struct cpumask *possible, *online; + s32 cpu; + char buf[128] = "", *p; + int idx; + + possible = scx_bpf_get_possible_cpumask(); + online = scx_bpf_get_online_cpumask(); + + idx = 0; + bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) { + if (!(p = MEMBER_VPTR(buf, [idx++]))) + break; + if (bpf_cpumask_test_cpu(cpu, online)) + *p++ = 'O'; + else if (bpf_cpumask_test_cpu(cpu, possible)) + *p++ = 'X'; + else + *p++ = ' '; + + if ((cpu & 7) == 7) { + if (!(p = MEMBER_VPTR(buf, [idx++]))) + break; + *p++ = '|'; + } + } + buf[sizeof(buf) - 1] = '\0'; + + scx_bpf_put_cpumask(online); + scx_bpf_put_cpumask(possible); + + bpf_printk("CPUS: |%s", buf); +} + +void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu) +{ + bpf_printk("CPU %d coming online", cpu); + /* @cpu is already online at this point */ + print_cpus(); +} + +void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu) +{ + bpf_printk("CPU %d going offline", cpu); + /* @cpu is still online at this point */ + print_cpus(); +} + +struct cpu_mon_timer { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct cpu_mon_timer); +} central_timer SEC(".maps"); + +/* + * Print out the min, avg and max performance levels of CPUs every second to + * demonstrate the cpuperf interface. + */ +static int cpu_mon_timerfn(void *map, int *key, struct bpf_timer *timer) +{ + u32 zero = 0; + u32 nr_cpu_ids = scx_bpf_nr_cpu_ids(); + u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0; + u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0; + const struct cpumask *online; + int i, nr_online_cpus = 0; + + online = scx_bpf_get_online_cpumask(); + if (!online) + return -ENOMEM; + + bpf_for(i, 0, nr_cpu_ids) { + struct cpu_ctx *cpuc; + u32 cap, cur; + + if (!bpf_cpumask_test_cpu(i, online)) + continue; + nr_online_cpus++; + + /* collect the capacity and current cpuperf */ + cap = scx_bpf_cpuperf_cap(i); + cur = scx_bpf_cpuperf_cur(i); + + cur_min = cur < cur_min ? cur : cur_min; + cur_max = cur > cur_max ? cur : cur_max; + + /* + * $cur is relative to $cap. Scale it down accordingly so that + * it's in the same scale as other CPUs and $cur_sum/$cap_sum + * makes sense. + */ + cur_sum += cur * cap / SCX_CPUPERF_ONE; + cap_sum += cap; + + if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) { + scx_bpf_error("failed to look up cpu_ctx"); + goto out; + } + + /* collect target */ + cur = cpuc->cpuperf_target; + target_sum += cur; + target_min = cur < target_min ? cur : target_min; + target_max = cur > target_max ? 
cur : target_max; + } + + cpuperf_min = cur_min; + cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum; + cpuperf_max = cur_max; + + cpuperf_target_min = target_min; + cpuperf_target_avg = target_sum / nr_online_cpus; + cpuperf_target_max = target_max; + + bpf_timer_start(timer, ONE_SEC_IN_NS, 0); +out: + scx_bpf_put_cpumask(online); + return 0; +} + +s32 BPF_STRUCT_OPS(qmap_init) +{ + u32 key = 0; + struct bpf_timer *timer; + + if (!switch_partial) + __COMPAT_scx_bpf_switch_all(); + + print_cpus(); + + timer = bpf_map_lookup_elem(&central_timer, &key); + if (!timer) + return -ESRCH; + + bpf_timer_init(timer, &central_timer, CLOCK_MONOTONIC); + bpf_timer_set_callback(timer, cpu_mon_timerfn); + + return bpf_timer_start(timer, ONE_SEC_IN_NS, 0); +} + +void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(qmap_ops, + .select_cpu = (void *)qmap_select_cpu, + .enqueue = (void *)qmap_enqueue, + .dequeue = (void *)qmap_dequeue, + .dispatch = (void *)qmap_dispatch, + .tick = (void *)qmap_tick, + .core_sched_before = (void *)qmap_core_sched_before, + .cpu_release = (void *)qmap_cpu_release, + .init_task = (void *)qmap_init_task, + .cpu_online = (void *)qmap_cpu_online, + .cpu_offline = (void *)qmap_cpu_offline, + .init = (void *)qmap_init, + .exit = (void *)qmap_exit, + .flags = SCX_OPS_ENQ_LAST, + .timeout_ms = 5000U, + .name = "qmap"); diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c new file mode 100644 index 000000000..5cadc2e0b --- /dev/null +++ b/tools/sched_ext/scx_qmap.c @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <inttypes.h> +#include <signal.h> +#include <libgen.h> +#include <bpf/bpf.h> +#include <scx/common.h> +#include "scx_qmap.bpf.skel.h" + +const char help_fmt[] = +"A simple five-level FIFO queue sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-d PID]\n" +" [-D LEN] [-p]\n" +"\n" +" -s SLICE_US Override slice duration\n" +" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" +" -t COUNT Stall every COUNT'th user thread\n" +" -T COUNT Stall every COUNT'th kernel thread\n" +" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" +" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" +" -D LEN Set scx_exit_info.dump buffer length\n" +" -p Switch only tasks on SCHED_EXT policy instead of all\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_qmap *skel; + struct bpf_link *link; + int opt; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = scx_qmap__open(); + SCX_BUG_ON(!skel, "Failed to open skel"); + + while ((opt = getopt(argc, argv, "s:e:t:T:l:d:D:ph")) != -1) { + switch (opt) { + case 's': + skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; + break; + case 'e': + skel->bss->test_error_cnt = strtoul(optarg, NULL, 0); + break; + case 't': + skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0); + break; + case 'T': + skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0); + break; + case 'l': + skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0); + break; + case 'd': + skel->rodata->disallow_tgid = 
strtol(optarg, NULL, 0); + if (skel->rodata->disallow_tgid < 0) + skel->rodata->disallow_tgid = getpid(); + break; + case 'D': + skel->struct_ops.qmap_ops->exit_dump_len = strtoul(optarg, NULL, 0); + break; + case 'p': + skel->rodata->switch_partial = true; + skel->struct_ops.qmap_ops->flags |= __COMPAT_SCX_OPS_SWITCH_PARTIAL; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei); + link = SCX_OPS_ATTACH(skel, qmap_ops); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + long nr_enqueued = skel->bss->nr_enqueued; + long nr_dispatched = skel->bss->nr_dispatched; + + printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%" PRIu64 " deq=%" PRIu64 " core=%" PRIu64 "\n", + nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, + skel->bss->nr_reenqueued, skel->bss->nr_dequeued, + skel->bss->nr_core_sched_execed); + printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n", + skel->bss->cpuperf_min, + skel->bss->cpuperf_avg, + skel->bss->cpuperf_max, + skel->bss->cpuperf_target_min, + skel->bss->cpuperf_target_avg, + skel->bss->cpuperf_target_max); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + UEI_REPORT(skel, uei); + scx_qmap__destroy(skel); + return 0; +} diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py new file mode 100644 index 000000000..d457d2a74 --- /dev/null +++ b/tools/sched_ext/scx_show_state.py @@ -0,0 +1,39 @@ +#!/usr/bin/env drgn +# +# Copyright (C) 2024 Tejun Heo +# Copyright (C) 2024 Meta Platforms, Inc. and affiliates. + +desc = """ +This is a drgn script to show the current sched_ext state. +For more info on drgn, visit https://github.com/osandov/drgn. +""" + +import drgn +import sys + +def err(s): + print(s, file=sys.stderr, flush=True) + sys.exit(1) + +def read_int(name): + return int(prog[name].value_()) + +def read_atomic(name): + return prog[name].counter.value_() + +def read_static_key(name): + return prog[name].key.enabled.counter.value_() + +def ops_state_str(state): + return prog['scx_ops_enable_state_str'][state].string_().decode() + +ops = prog['scx_ops'] +enable_state = read_atomic("scx_ops_enable_state_var") + +print(f'ops : {ops.name.string_().decode()}') +print(f'enabled : {read_static_key("__scx_ops_enabled")}') +print(f'switching_all : {read_int("scx_switching_all")}') +print(f'switched_all : {read_static_key("__scx_switched_all")}') +print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})') +print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}') +print(f'nr_rejected : {read_atomic("scx_nr_rejected")}') diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c new file mode 100644 index 000000000..535fe52f5 --- /dev/null +++ b/tools/sched_ext/scx_simple.bpf.c @@ -0,0 +1,149 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A simple scheduler. + * + * By default, it operates as a simple global weighted vtime scheduler and can + * be switched to FIFO scheduling. It also demonstrates the following niceties. + * + * - Statistics tracking how many tasks are queued to local and global dsq's. + * - Termination notification for userspace. + * + * While very simple, this scheduler should work reasonably well on CPUs with a + * uniform L3 cache topology. 
While preemption is not implemented, the fact that
+ * the scheduling queue is shared across all CPUs means that whatever is at the
+ * front of the queue is likely to be executed fairly quickly given a
+ * sufficient number of CPUs. The FIFO scheduling mode may be beneficial to
+ * some workloads but comes with the usual FIFO scheduling problems, where
+ * saturating threads can easily drown out interactive ones.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo
+ * Copyright (c) 2022 David Vernet
+ */
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+const volatile bool fifo_sched;
+
+static u64 vtime_now;
+UEI_DEFINE(uei);
+
+#define SHARED_DSQ 0
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(key_size, sizeof(u32));
+	__uint(value_size, sizeof(u64));
+	__uint(max_entries, 2);	/* [local, global] */
+} stats SEC(".maps");
+
+static void stat_inc(u32 idx)
+{
+	u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx);
+	if (cnt_p)
+		(*cnt_p)++;
+}
+
+static inline bool vtime_before(u64 a, u64 b)
+{
+	return (s64)(a - b) < 0;
+}
+
+s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
+{
+	bool is_idle = false;
+	s32 cpu;
+
+	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
+	if (is_idle) {
+		stat_inc(0);	/* count local queueing */
+		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+	}
+
+	return cpu;
+}
+
+void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	stat_inc(1);	/* count global queueing */
+
+	if (fifo_sched) {
+		scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
+	} else {
+		u64 vtime = p->scx.dsq_vtime;
+
+		/*
+		 * Limit the amount of budget that an idling task can accumulate
+		 * to one slice.
+		 */
+		if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL))
+			vtime = vtime_now - SCX_SLICE_DFL;
+
+		scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime,
+				       enq_flags);
+	}
+}
+
+void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev)
+{
+	scx_bpf_consume(SHARED_DSQ);
+}
+
+void BPF_STRUCT_OPS(simple_running, struct task_struct *p)
+{
+	if (fifo_sched)
+		return;
+
+	/*
+	 * Global vtime always progresses forward as tasks start executing. The
+	 * test and update can be performed concurrently from multiple CPUs and
+	 * are thus racy. Any error should be contained and temporary. Let's
+	 * just live with it.
+	 */
+	if (vtime_before(vtime_now, p->scx.dsq_vtime))
+		vtime_now = p->scx.dsq_vtime;
+}
+
+void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable)
+{
+	if (fifo_sched)
+		return;
+
+	/*
+	 * Scale the execution time by the inverse of the weight and charge.
+	 *
+	 * Note that the default yield implementation yields by setting
+	 * @p->scx.slice to zero and the following would treat the yielding task
+	 * as if it has consumed all its slice. If this penalizes yielding tasks
+	 * too much, determine the execution time by taking explicit timestamps
+	 * instead of depending on @p->scx.slice.
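+	 *
+	 * A hypothetical worked example, assuming SCX_SLICE_DFL is 20ms: a
+	 * task with the default weight of 100 that consumed its entire slice
+	 * is charged (20ms - 0) * 100 / 100 = 20ms of vtime, while a
+	 * weight-200 task is charged only 10ms for the same execution time.
+	 * Heavier tasks thus accrue vtime more slowly and are picked from the
+	 * vtime-ordered shared DSQ proportionally more often.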
+	 */
+	p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
+}
+
+void BPF_STRUCT_OPS(simple_enable, struct task_struct *p)
+{
+	p->scx.dsq_vtime = vtime_now;
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init)
+{
+	return scx_bpf_create_dsq(SHARED_DSQ, -1);
+}
+
+void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SCX_OPS_DEFINE(simple_ops,
+	.select_cpu = (void *)simple_select_cpu,
+	.enqueue = (void *)simple_enqueue,
+	.dispatch = (void *)simple_dispatch,
+	.running = (void *)simple_running,
+	.stopping = (void *)simple_stopping,
+	.enable = (void *)simple_enable,
+	.init = (void *)simple_init,
+	.exit = (void *)simple_exit,
+	.name = "simple");
diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c
new file mode 100644
index 000000000..006a2a5f3
--- /dev/null
+++ b/tools/sched_ext/scx_simple.c
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo
+ * Copyright (c) 2022 David Vernet
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <libgen.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include "scx_simple.bpf.skel.h"
+
+const char help_fmt[] =
+"A simple sched_ext scheduler.\n"
+"\n"
+"See the top-level comment in .bpf.c for more details.\n"
+"\n"
+"Usage: %s [-f]\n"
+"\n"
+"  -f            Use FIFO scheduling instead of weighted vtime scheduling\n"
+"  -h            Display this help and exit\n";
+
+static volatile int exit_req;
+
+static void sigint_handler(int simple)
+{
+	exit_req = 1;
+}
+
+static void read_stats(struct scx_simple *skel, __u64 *stats)
+{
+	int nr_cpus = libbpf_num_possible_cpus();
+	__u64 cnts[2][nr_cpus];
+	__u32 idx;
+
+	memset(stats, 0, sizeof(stats[0]) * 2);
+
+	for (idx = 0; idx < 2; idx++) {
+		int ret, cpu;
+
+		ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
+					  &idx, cnts[idx]);
+		if (ret < 0)
+			continue;
+		for (cpu = 0; cpu < nr_cpus; cpu++)
+			stats[idx] += cnts[idx][cpu];
+	}
+}
+
+int main(int argc, char **argv)
+{
+	struct scx_simple *skel;
+	struct bpf_link *link;
+	__u32 opt;
+
+	signal(SIGINT, sigint_handler);
+	signal(SIGTERM, sigint_handler);
+
+	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+
+	skel = scx_simple__open();
+	SCX_BUG_ON(!skel, "Failed to open skel");
+
+	while ((opt = getopt(argc, argv, "fh")) != -1) {
+		switch (opt) {
+		case 'f':
+			skel->rodata->fifo_sched = true;
+			break;
+		default:
+			fprintf(stderr, help_fmt, basename(argv[0]));
+			return opt != 'h';
+		}
+	}
+
+	SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei);
+	link = SCX_OPS_ATTACH(skel, simple_ops);
+
+	while (!exit_req && !UEI_EXITED(skel, uei)) {
+		__u64 stats[2];
+
+		read_stats(skel, stats);
+		printf("local=%llu global=%llu\n", stats[0], stats[1]);
+		fflush(stdout);
+		sleep(1);
+	}
+
+	bpf_link__destroy(link);
+	UEI_REPORT(skel, uei);
+	scx_simple__destroy(skel);
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index b2b4c391e..82de56c81 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -280,6 +280,7 @@ extern struct argp bench_strncmp_argp;
 extern struct argp bench_hashmap_lookup_argp;
 extern struct argp bench_local_storage_create_argp;
 extern struct argp bench_htab_mem_argp;
+extern struct argp bench_trigger_batch_argp;
 
 static const struct argp_child bench_parsers[] = {
 	{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
@@ -292,6 +293,7 @@ static const struct argp_child bench_parsers[] = {
 	{ &bench_local_storage_create_argp, 0, "local-storage-create benchmark", 0 },
 	{ &bench_htab_mem_argp, 0, "hash map memory benchmark", 0 },
+	{ &bench_trigger_batch_argp, 0, "BPF triggering benchmark", 0 },
 	{},
 };
 
@@ -491,24 +493,31 @@ extern const struct bench bench_rename_kretprobe;
 extern const struct bench bench_rename_rawtp;
 extern const struct bench bench_rename_fentry;
 extern const struct bench bench_rename_fexit;
-extern const struct bench bench_trig_base;
-extern const struct bench bench_trig_tp;
-extern const struct bench bench_trig_rawtp;
+
+/* pure counting benchmarks to establish theoretical limits */
+extern const struct bench bench_trig_usermode_count;
+extern const struct bench bench_trig_syscall_count;
+extern const struct bench bench_trig_kernel_count;
+
+/* batched, staying mostly in-kernel benchmarks */
 extern const struct bench bench_trig_kprobe;
 extern const struct bench bench_trig_kretprobe;
 extern const struct bench bench_trig_kprobe_multi;
 extern const struct bench bench_trig_kretprobe_multi;
 extern const struct bench bench_trig_fentry;
 extern const struct bench bench_trig_fexit;
-extern const struct bench bench_trig_fentry_sleep;
 extern const struct bench bench_trig_fmodret;
-extern const struct bench bench_trig_uprobe_base;
+extern const struct bench bench_trig_tp;
+extern const struct bench bench_trig_rawtp;
+
+/* uprobe/uretprobe benchmarks */
 extern const struct bench bench_trig_uprobe_nop;
 extern const struct bench bench_trig_uretprobe_nop;
 extern const struct bench bench_trig_uprobe_push;
 extern const struct bench bench_trig_uretprobe_push;
 extern const struct bench bench_trig_uprobe_ret;
 extern const struct bench bench_trig_uretprobe_ret;
+
 extern const struct bench bench_rb_libbpf;
 extern const struct bench bench_rb_custom;
 extern const struct bench bench_pb_libbpf;
@@ -539,24 +548,28 @@ static const struct bench *benchs[] = {
 	&bench_rename_rawtp,
 	&bench_rename_fentry,
 	&bench_rename_fexit,
-	&bench_trig_base,
-	&bench_trig_tp,
-	&bench_trig_rawtp,
+	/* pure counting benchmarks for establishing theoretical limits */
+	&bench_trig_usermode_count,
+	&bench_trig_kernel_count,
+	&bench_trig_syscall_count,
+	/* batched, staying mostly in-kernel triggers */
 	&bench_trig_kprobe,
 	&bench_trig_kretprobe,
 	&bench_trig_kprobe_multi,
 	&bench_trig_kretprobe_multi,
 	&bench_trig_fentry,
 	&bench_trig_fexit,
-	&bench_trig_fentry_sleep,
 	&bench_trig_fmodret,
-	&bench_trig_uprobe_base,
+	&bench_trig_tp,
+	&bench_trig_rawtp,
+	/* uprobes */
 	&bench_trig_uprobe_nop,
 	&bench_trig_uretprobe_nop,
 	&bench_trig_uprobe_push,
 	&bench_trig_uretprobe_push,
 	&bench_trig_uprobe_ret,
 	&bench_trig_uretprobe_ret,
+	/* ringbuf/perfbuf benchmarks */
 	&bench_rb_libbpf,
 	&bench_rb_custom,
 	&bench_pb_libbpf,
diff --git a/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c b/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c
index b36de42ee..e2ff8ea1c 100644
--- a/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c
+++ b/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c
@@ -186,7 +186,7 @@ static void *task_producer(void *input)
 
 		for (i = 0; i < batch_sz; i++) {
 			if (!pthd_results[i])
-				pthread_join(pthds[i], NULL);;
+				pthread_join(pthds[i], NULL);
 		}
 	}
 
diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
index ace0d1011..4b05539f1 100644
--- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -1,15 +1,95 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
+#define _GNU_SOURCE
+#include <argp.h>
+#include <unistd.h>
+#include <stdint.h>
 #include "bench.h"
 #include "trigger_bench.skel.h"
 #include "trace_helpers.h"
 
+#define MAX_TRIG_BATCH_ITERS 1000
+
+static struct {
+	__u32 batch_iters;
+} args = {
+	.batch_iters = 100,
+};
+
+enum {
+	ARG_TRIG_BATCH_ITERS = 7000,
+};
+
+static const struct argp_option opts[] = {
+	{ "trig-batch-iters", ARG_TRIG_BATCH_ITERS, "BATCH_ITER_CNT", 0,
+		"Number of in-kernel iterations per one driver test run"},
+	{},
+};
+
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	long ret;
+
+	switch (key) {
+	case ARG_TRIG_BATCH_ITERS:
+		ret = strtol(arg, NULL, 10);
+		if (ret < 1 || ret > MAX_TRIG_BATCH_ITERS) {
+			fprintf(stderr, "invalid --trig-batch-iters value (should be between %d and %d)\n",
+				1, MAX_TRIG_BATCH_ITERS);
+			argp_usage(state);
+		}
+		args.batch_iters = ret;
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+
+	return 0;
+}
+
+const struct argp bench_trigger_batch_argp = {
+	.options = opts,
+	.parser = parse_arg,
+};
+
+/* adjust slot shift in inc_counter() if changing */
+#define MAX_BUCKETS 256
+
+#pragma GCC diagnostic ignored "-Wattributes"
+
 /* BPF triggering benchmarks */
 static struct trigger_ctx {
 	struct trigger_bench *skel;
+	bool usermode_counters;
+	int driver_prog_fd;
 } ctx;
 
-static struct counter base_hits;
+static struct counter base_hits[MAX_BUCKETS];
+
+static __always_inline void inc_counter(struct counter *counters)
+{
+	static __thread int tid = 0;
+	unsigned slot;
+
+	if (unlikely(tid == 0))
+		tid = syscall(SYS_gettid);
+
+	/* multiplicative hashing, it's fast */
+	slot = 2654435769U * tid;
+	slot >>= 24;	/* use the highest byte as the bucket index */
+
+	atomic_inc(&counters[slot].value);
+}
+
+static long sum_and_reset_counters(struct counter *counters)
+{
+	int i;
+	long sum = 0;
+
+	for (i = 0; i < MAX_BUCKETS; i++)
+		sum += atomic_swap(&counters[i].value, 0);
+	return sum;
+}
 
 static void trigger_validate(void)
 {
@@ -19,41 +99,63 @@ static void trigger_validate(void)
 	}
 }
 
-static void *trigger_base_producer(void *input)
+static void *trigger_producer(void *input)
 {
-	while (true) {
-		(void)syscall(__NR_getpgid);
-		atomic_inc(&base_hits.value);
+	if (ctx.usermode_counters) {
+		while (true) {
+			(void)syscall(__NR_getpgid);
+			inc_counter(base_hits);
+		}
+	} else {
+		while (true)
+			(void)syscall(__NR_getpgid);
 	}
 	return NULL;
 }
 
-static void trigger_base_measure(struct bench_res *res)
+static void *trigger_producer_batch(void *input)
 {
-	res->hits = atomic_swap(&base_hits.value, 0);
-}
+	int fd = ctx.driver_prog_fd ?: bpf_program__fd(ctx.skel->progs.trigger_driver);
 
-static void *trigger_producer(void *input)
-{
 	while (true)
-		(void)syscall(__NR_getpgid);
+		bpf_prog_test_run_opts(fd, NULL);
 	return NULL;
 }
 
 static void trigger_measure(struct bench_res *res)
 {
-	res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
+	if (ctx.usermode_counters)
+		res->hits = sum_and_reset_counters(base_hits);
+	else
+		res->hits = sum_and_reset_counters(ctx.skel->bss->hits);
 }
 
 static void setup_ctx(void)
 {
 	setup_libbpf();
 
-	ctx.skel = trigger_bench__open_and_load();
+	ctx.skel = trigger_bench__open();
 	if (!ctx.skel) {
 		fprintf(stderr, "failed to open skeleton\n");
 		exit(1);
 	}
+
+	/* default "driver" BPF program */
+	bpf_program__set_autoload(ctx.skel->progs.trigger_driver, true);
+
+	ctx.skel->rodata->batch_iters = args.batch_iters;
+}
+
+static void load_ctx(void)
+{
+	int err;
+
+	err = trigger_bench__load(ctx.skel);
+	if (err) {
+		fprintf(stderr, "failed to load skeleton\n");
+		exit(1);
+	}
 }
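The slot computation in inc_counter() above is Fibonacci (multiplicative) hashing: 2654435769 is floor(2^32 / golden ratio), so multiplying the TID by it scatters consecutive TIDs across the upper bits, and shifting right by 24 keeps only the highest byte as an index into the MAX_BUCKETS (256) counters. Striping the hit counts this way keeps concurrent producer threads from contending on a single shared cache line. A minimal standalone sketch of the same scheme, independent of the benchmark harness (the hash_slot name is illustrative, not part of the patch):

#include <stdio.h>

/* 2654435769 == floor(2^32 / golden ratio); multiplying by it and keeping
 * only the top byte spreads consecutive integers evenly over [0, 256).
 */
static unsigned int hash_slot(unsigned int tid)
{
	return (2654435769U * tid) >> 24;
}

int main(void)
{
	unsigned int tid;

	/* consecutive TIDs land in well-scattered slots */
	for (tid = 1000; tid < 1008; tid++)
		printf("tid %u -> slot %u\n", tid, hash_slot(tid));
	return 0;
}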
static void attach_bpf(struct bpf_program *prog) @@ -67,64 +169,104 @@ static void attach_bpf(struct bpf_program *prog) } } -static void trigger_tp_setup(void) +static void trigger_syscall_count_setup(void) { - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_tp); + ctx.usermode_counters = true; } -static void trigger_rawtp_setup(void) +/* Batched, staying mostly in-kernel triggering setups */ +static void trigger_kernel_count_setup(void) { setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_raw_tp); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_count, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_count); } static void trigger_kprobe_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kprobe, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kprobe); } static void trigger_kretprobe_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kretprobe, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kretprobe); } static void trigger_kprobe_multi_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kprobe_multi, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kprobe_multi); } static void trigger_kretprobe_multi_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kretprobe_multi, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_multi); } static void trigger_fentry_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fentry, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_fentry); } static void trigger_fexit_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fexit, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_fexit); } -static void trigger_fentry_sleep_setup(void) +static void trigger_fmodret_setup(void) { setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_fentry_sleep); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fmodret, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc); + attach_bpf(ctx.skel->progs.bench_trigger_fmodret); } -static void trigger_fmodret_setup(void) +static void trigger_tp_setup(void) { setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_fmodret); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_tp, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc); + attach_bpf(ctx.skel->progs.bench_trigger_tp); +} + +static void trigger_rawtp_setup(void) +{ + setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_rawtp, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc); + attach_bpf(ctx.skel->progs.bench_trigger_rawtp); } /* make sure call is not inlined and not 
avoided by compiler, so __weak and @@ -137,7 +279,7 @@ static void trigger_fmodret_setup(void) * GCC doesn't generate stack setup preample for these functions due to them * having no input arguments and doing nothing in the body. */ -__weak void uprobe_target_nop(void) +__nocf_check __weak void uprobe_target_nop(void) { asm volatile ("nop"); } @@ -146,7 +288,7 @@ __weak void opaque_noop_func(void) { } -__weak int uprobe_target_push(void) +__nocf_check __weak int uprobe_target_push(void) { /* overhead of function call is negligible compared to uprobe * triggering, so this shouldn't affect benchmark results much @@ -155,16 +297,16 @@ __weak int uprobe_target_push(void) return 1; } -__weak void uprobe_target_ret(void) +__nocf_check __weak void uprobe_target_ret(void) { asm volatile (""); } -static void *uprobe_base_producer(void *input) +static void *uprobe_producer_count(void *input) { while (true) { uprobe_target_nop(); - atomic_inc(&base_hits.value); + inc_counter(base_hits); } return NULL; } @@ -194,15 +336,24 @@ static void usetup(bool use_retprobe, void *target_addr) { size_t uprobe_offset; struct bpf_link *link; + int err; setup_libbpf(); - ctx.skel = trigger_bench__open_and_load(); + ctx.skel = trigger_bench__open(); if (!ctx.skel) { fprintf(stderr, "failed to open skeleton\n"); exit(1); } + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe, true); + + err = trigger_bench__load(ctx.skel); + if (err) { + fprintf(stderr, "failed to load skeleton\n"); + exit(1); + } + uprobe_offset = get_uprobe_offset(target_addr); link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe, use_retprobe, @@ -216,204 +367,90 @@ static void usetup(bool use_retprobe, void *target_addr) ctx.skel->links.bench_trigger_uprobe = link; } -static void uprobe_setup_nop(void) +static void usermode_count_setup(void) +{ + ctx.usermode_counters = true; +} + +static void uprobe_nop_setup(void) { usetup(false, &uprobe_target_nop); } -static void uretprobe_setup_nop(void) +static void uretprobe_nop_setup(void) { usetup(true, &uprobe_target_nop); } -static void uprobe_setup_push(void) +static void uprobe_push_setup(void) { usetup(false, &uprobe_target_push); } -static void uretprobe_setup_push(void) +static void uretprobe_push_setup(void) { usetup(true, &uprobe_target_push); } -static void uprobe_setup_ret(void) +static void uprobe_ret_setup(void) { usetup(false, &uprobe_target_ret); } -static void uretprobe_setup_ret(void) +static void uretprobe_ret_setup(void) { usetup(true, &uprobe_target_ret); } -const struct bench bench_trig_base = { - .name = "trig-base", +const struct bench bench_trig_syscall_count = { + .name = "trig-syscall-count", .validate = trigger_validate, - .producer_thread = trigger_base_producer, - .measure = trigger_base_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_tp = { - .name = "trig-tp", - .validate = trigger_validate, - .setup = trigger_tp_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_rawtp = { - .name = "trig-rawtp", - .validate = trigger_validate, - .setup = trigger_rawtp_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kprobe = { - .name = "trig-kprobe", - .validate = 
trigger_validate, - .setup = trigger_kprobe_setup, + .setup = trigger_syscall_count_setup, .producer_thread = trigger_producer, .measure = trigger_measure, .report_progress = hits_drops_report_progress, .report_final = hits_drops_report_final, }; -const struct bench bench_trig_kretprobe = { - .name = "trig-kretprobe", - .validate = trigger_validate, - .setup = trigger_kretprobe_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kprobe_multi = { - .name = "trig-kprobe-multi", - .validate = trigger_validate, - .setup = trigger_kprobe_multi_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kretprobe_multi = { - .name = "trig-kretprobe-multi", - .validate = trigger_validate, - .setup = trigger_kretprobe_multi_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fentry = { - .name = "trig-fentry", - .validate = trigger_validate, - .setup = trigger_fentry_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fexit = { - .name = "trig-fexit", - .validate = trigger_validate, - .setup = trigger_fexit_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fentry_sleep = { - .name = "trig-fentry-sleep", - .validate = trigger_validate, - .setup = trigger_fentry_sleep_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fmodret = { - .name = "trig-fmodret", - .validate = trigger_validate, - .setup = trigger_fmodret_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uprobe_base = { - .name = "trig-uprobe-base", - .setup = NULL, /* no uprobe/uretprobe is attached */ - .producer_thread = uprobe_base_producer, - .measure = trigger_base_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uprobe_nop = { - .name = "trig-uprobe-nop", - .setup = uprobe_setup_nop, - .producer_thread = uprobe_producer_nop, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uretprobe_nop = { - .name = "trig-uretprobe-nop", - .setup = uretprobe_setup_nop, - .producer_thread = uprobe_producer_nop, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uprobe_push = { - .name = "trig-uprobe-push", - .setup = uprobe_setup_push, - .producer_thread = uprobe_producer_push, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - 
-const struct bench bench_trig_uretprobe_push = { - .name = "trig-uretprobe-push", - .setup = uretprobe_setup_push, - .producer_thread = uprobe_producer_push, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uprobe_ret = { - .name = "trig-uprobe-ret", - .setup = uprobe_setup_ret, - .producer_thread = uprobe_producer_ret, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uretprobe_ret = { - .name = "trig-uretprobe-ret", - .setup = uretprobe_setup_ret, - .producer_thread = uprobe_producer_ret, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; +/* batched (staying mostly in kernel) kprobe/fentry benchmarks */ +#define BENCH_TRIG_KERNEL(KIND, NAME) \ +const struct bench bench_trig_##KIND = { \ + .name = "trig-" NAME, \ + .setup = trigger_##KIND##_setup, \ + .producer_thread = trigger_producer_batch, \ + .measure = trigger_measure, \ + .report_progress = hits_drops_report_progress, \ + .report_final = hits_drops_report_final, \ + .argp = &bench_trigger_batch_argp, \ +} + +BENCH_TRIG_KERNEL(kernel_count, "kernel-count"); +BENCH_TRIG_KERNEL(kprobe, "kprobe"); +BENCH_TRIG_KERNEL(kretprobe, "kretprobe"); +BENCH_TRIG_KERNEL(kprobe_multi, "kprobe-multi"); +BENCH_TRIG_KERNEL(kretprobe_multi, "kretprobe-multi"); +BENCH_TRIG_KERNEL(fentry, "fentry"); +BENCH_TRIG_KERNEL(fexit, "fexit"); +BENCH_TRIG_KERNEL(fmodret, "fmodret"); +BENCH_TRIG_KERNEL(tp, "tp"); +BENCH_TRIG_KERNEL(rawtp, "rawtp"); + +/* uprobe benchmarks */ +#define BENCH_TRIG_USERMODE(KIND, PRODUCER, NAME) \ +const struct bench bench_trig_##KIND = { \ + .name = "trig-" NAME, \ + .validate = trigger_validate, \ + .setup = KIND##_setup, \ + .producer_thread = uprobe_producer_##PRODUCER, \ + .measure = trigger_measure, \ + .report_progress = hits_drops_report_progress, \ + .report_final = hits_drops_report_final, \ +} + +BENCH_TRIG_USERMODE(usermode_count, count, "usermode-count"); +BENCH_TRIG_USERMODE(uprobe_nop, nop, "uprobe-nop"); +BENCH_TRIG_USERMODE(uprobe_push, push, "uprobe-push"); +BENCH_TRIG_USERMODE(uprobe_ret, ret, "uprobe-ret"); +BENCH_TRIG_USERMODE(uretprobe_nop, nop, "uretprobe-nop"); +BENCH_TRIG_USERMODE(uretprobe_push, push, "uretprobe-push"); +BENCH_TRIG_USERMODE(uretprobe_ret, ret, "uretprobe-ret"); diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh index 78e83f243..a690f5a68 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh @@ -2,8 +2,22 @@ set -eufo pipefail -for i in base tp rawtp kprobe fentry fmodret -do - summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) - printf "%-10s: %s\n" $i "$summary" +def_tests=( \ + usermode-count kernel-count syscall-count \ + fentry fexit fmodret \ + rawtp tp \ + kprobe kprobe-multi \ + kretprobe kretprobe-multi \ +) + +tests=("$@") +if [ ${#tests[@]} -eq 0 ]; then + tests=("${def_tests[@]}") +fi + +p=${PROD_CNT:-1} + +for t in "${tests[@]}"; do + summary=$(sudo ./bench -w2 -d5 -a -p$p trig-$t | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) + printf "%-15s: %s\n" $t "$summary" done diff --git a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh index 
9bdcc74e0..af169f831 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh @@ -2,7 +2,7 @@ set -eufo pipefail -for i in base {uprobe,uretprobe}-{nop,push,ret} +for i in usermode-count syscall-count {uprobe,uretprobe}-{nop,push,ret} do summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) printf "%-15s: %s\n" $i "$summary" diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 01f241ea2..afd675b1b 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -88,3 +88,5 @@ CONFIG_VSOCKETS=y CONFIG_VXLAN=y CONFIG_XDP_SOCKETS=y CONFIG_XFRM_INTERFACE=y +CONFIG_TCP_CONG_DCTCP=y +CONFIG_TCP_CONG_BBR=y diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c index 1454cebc2..4407ea428 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c @@ -573,6 +573,115 @@ static void lsm_subtest(struct test_bpf_cookie *skel) close(lsm_fd); } +static void tp_btf_subtest(struct test_bpf_cookie *skel) +{ + __u64 cookie; + int prog_fd, link_fd = -1; + struct bpf_link *link = NULL; + LIBBPF_OPTS(bpf_link_create_opts, link_opts); + LIBBPF_OPTS(bpf_raw_tp_opts, raw_tp_opts); + LIBBPF_OPTS(bpf_trace_opts, trace_opts); + + /* There are three different ways to attach tp_btf (BTF-aware raw + * tracepoint) programs. Let's test all of them. + */ + prog_fd = bpf_program__fd(skel->progs.handle_tp_btf); + + /* low-level BPF_RAW_TRACEPOINT_OPEN command wrapper */ + skel->bss->tp_btf_res = 0; + + raw_tp_opts.cookie = cookie = 0x11000000000000L; + link_fd = bpf_raw_tracepoint_open_opts(prog_fd, &raw_tp_opts); + if (!ASSERT_GE(link_fd, 0, "bpf_raw_tracepoint_open_opts")) + goto cleanup; + + usleep(1); /* trigger */ + close(link_fd); /* detach */ + link_fd = -1; + + ASSERT_EQ(skel->bss->tp_btf_res, cookie, "raw_tp_open_res"); + + /* low-level generic bpf_link_create() API */ + skel->bss->tp_btf_res = 0; + + link_opts.tracing.cookie = cookie = 0x22000000000000L; + link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_RAW_TP, &link_opts); + if (!ASSERT_GE(link_fd, 0, "bpf_link_create")) + goto cleanup; + + usleep(1); /* trigger */ + close(link_fd); /* detach */ + link_fd = -1; + + ASSERT_EQ(skel->bss->tp_btf_res, cookie, "link_create_res"); + + /* high-level bpf_link-based bpf_program__attach_trace_opts() API */ + skel->bss->tp_btf_res = 0; + + trace_opts.cookie = cookie = 0x33000000000000L; + link = bpf_program__attach_trace_opts(skel->progs.handle_tp_btf, &trace_opts); + if (!ASSERT_OK_PTR(link, "attach_trace_opts")) + goto cleanup; + + usleep(1); /* trigger */ + bpf_link__destroy(link); /* detach */ + link = NULL; + + ASSERT_EQ(skel->bss->tp_btf_res, cookie, "attach_trace_opts_res"); + +cleanup: + if (link_fd >= 0) + close(link_fd); + bpf_link__destroy(link); +} + +static void raw_tp_subtest(struct test_bpf_cookie *skel) +{ + __u64 cookie; + int prog_fd, link_fd = -1; + struct bpf_link *link = NULL; + LIBBPF_OPTS(bpf_raw_tp_opts, raw_tp_opts); + LIBBPF_OPTS(bpf_raw_tracepoint_opts, opts); + + /* There are two different ways to attach raw_tp programs */ + prog_fd = bpf_program__fd(skel->progs.handle_raw_tp); + + /* low-level BPF_RAW_TRACEPOINT_OPEN command wrapper */ + skel->bss->raw_tp_res = 0; + + raw_tp_opts.tp_name = "sys_enter"; + raw_tp_opts.cookie = cookie = 0x55000000000000L; + link_fd = bpf_raw_tracepoint_open_opts(prog_fd, 
&raw_tp_opts); + if (!ASSERT_GE(link_fd, 0, "bpf_raw_tracepoint_open_opts")) + goto cleanup; + + usleep(1); /* trigger */ + close(link_fd); /* detach */ + link_fd = -1; + + ASSERT_EQ(skel->bss->raw_tp_res, cookie, "raw_tp_open_res"); + + /* high-level bpf_link-based bpf_program__attach_raw_tracepoint_opts() API */ + skel->bss->raw_tp_res = 0; + + opts.cookie = cookie = 0x66000000000000L; + link = bpf_program__attach_raw_tracepoint_opts(skel->progs.handle_raw_tp, + "sys_enter", &opts); + if (!ASSERT_OK_PTR(link, "attach_raw_tp_opts")) + goto cleanup; + + usleep(1); /* trigger */ + bpf_link__destroy(link); /* detach */ + link = NULL; + + ASSERT_EQ(skel->bss->raw_tp_res, cookie, "attach_raw_tp_opts_res"); + +cleanup: + if (link_fd >= 0) + close(link_fd); + bpf_link__destroy(link); +} + void test_bpf_cookie(void) { struct test_bpf_cookie *skel; @@ -601,6 +710,9 @@ void test_bpf_cookie(void) tracing_subtest(skel); if (test__start_subtest("lsm")) lsm_subtest(skel); - + if (test__start_subtest("tp_btf")) + tp_btf_subtest(skel); + if (test__start_subtest("raw_tp")) + raw_tp_subtest(skel); test_bpf_cookie__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index a88e6e07e..332ed54cc 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -13,6 +13,7 @@ #include "tcp_ca_write_sk_pacing.skel.h" #include "tcp_ca_incompl_cong_ops.skel.h" #include "tcp_ca_unsupp_cong_op.skel.h" +#include "tcp_ca_kfunc.skel.h" #ifndef ENOTSUPP #define ENOTSUPP 524 @@ -89,7 +90,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) WRITE_ONCE(stop, 0); - lfd = socket(AF_INET6, SOCK_STREAM, 0); + lfd = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0); if (!ASSERT_NEQ(lfd, -1, "socket")) return; @@ -103,21 +104,10 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) settimeo(lfd, 0) || settimeo(fd, 0)) goto done; - /* bind, listen and start server thread to accept */ - sa6.sin6_family = AF_INET6; - sa6.sin6_addr = in6addr_loopback; - err = bind(lfd, (struct sockaddr *)&sa6, addrlen); - if (!ASSERT_NEQ(err, -1, "bind")) - goto done; - err = getsockname(lfd, (struct sockaddr *)&sa6, &addrlen); if (!ASSERT_NEQ(err, -1, "getsockname")) goto done; - err = listen(lfd, 1); - if (!ASSERT_NEQ(err, -1, "listen")) - goto done; - if (sk_stg_map) { err = bpf_map_update_elem(bpf_map__fd(sk_stg_map), &fd, &expected_stg, BPF_NOEXIST); @@ -315,7 +305,7 @@ static void test_rel_setsockopt(void) struct bpf_dctcp_release *rel_skel; libbpf_print_fn_t old_print_fn; - err_str = "unknown func bpf_setsockopt"; + err_str = "program of this type cannot use helper bpf_setsockopt"; found = false; old_print_fn = libbpf_set_print(libbpf_debug_print); @@ -529,6 +519,15 @@ static void test_link_replace(void) tcp_ca_update__destroy(skel); } +static void test_tcp_ca_kfunc(void) +{ + struct tcp_ca_kfunc *skel; + + skel = tcp_ca_kfunc__open_and_load(); + ASSERT_OK_PTR(skel, "tcp_ca_kfunc__open_and_load"); + tcp_ca_kfunc__destroy(skel); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) @@ -557,4 +556,6 @@ void test_bpf_tcp_ca(void) test_multi_links(); if (test__start_subtest("link_replace")) test_link_replace(); + if (test__start_subtest("tcp_ca_kfunc")) + test_tcp_ca_kfunc(); } diff --git a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c index a8b53b873..f66ceccd7 100644 --- 
a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c @@ -25,7 +25,7 @@ static void test_lookup_update(void) int map1_fd, map2_fd, map3_fd, map4_fd, map5_fd, map1_id, map2_id; int outer_arr_fd, outer_hash_fd, outer_arr_dyn_fd; struct test_btf_map_in_map *skel; - int err, key = 0, val, i, fd; + int err, key = 0, val, i; skel = test_btf_map_in_map__open_and_load(); if (CHECK(!skel, "skel_open", "failed to open&load skeleton\n")) @@ -102,30 +102,6 @@ static void test_lookup_update(void) CHECK(map1_id == 0, "map1_id", "failed to get ID 1\n"); CHECK(map2_id == 0, "map2_id", "failed to get ID 2\n"); - test_btf_map_in_map__destroy(skel); - skel = NULL; - - /* we need to either wait for or force synchronize_rcu(), before - * checking for "still exists" condition, otherwise map could still be - * resolvable by ID, causing false positives. - * - * Older kernels (5.8 and earlier) freed map only after two - * synchronize_rcu()s, so trigger two, to be entirely sure. - */ - CHECK(kern_sync_rcu(), "sync_rcu", "failed\n"); - CHECK(kern_sync_rcu(), "sync_rcu", "failed\n"); - - fd = bpf_map_get_fd_by_id(map1_id); - if (CHECK(fd >= 0, "map1_leak", "inner_map1 leaked!\n")) { - close(fd); - goto cleanup; - } - fd = bpf_map_get_fd_by_id(map2_id); - if (CHECK(fd >= 0, "map2_leak", "inner_map2 leaked!\n")) { - close(fd); - goto cleanup; - } - cleanup: test_btf_map_in_map__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/fib_lookup.c b/tools/testing/selftests/bpf/prog_tests/fib_lookup.c index 3379df2d4..bd7658958 100644 --- a/tools/testing/selftests/bpf/prog_tests/fib_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/fib_lookup.c @@ -26,6 +26,17 @@ #define IPV6_TBID_ADDR "fd00::FFFF" #define IPV6_TBID_NET "fd00::" #define IPV6_TBID_DST "fd00::2" +#define MARK_NO_POLICY 33 +#define MARK 42 +#define MARK_TABLE "200" +#define IPV4_REMOTE_DST "1.2.3.4" +#define IPV4_LOCAL "10.4.0.3" +#define IPV4_GW1 "10.4.0.1" +#define IPV4_GW2 "10.4.0.2" +#define IPV6_REMOTE_DST "be:ef::b0:10" +#define IPV6_LOCAL "fd01::3" +#define IPV6_GW1 "fd01::1" +#define IPV6_GW2 "fd01::2" #define DMAC "11:11:11:11:11:11" #define DMAC_INIT { 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, } #define DMAC2 "01:01:01:01:01:01" @@ -36,9 +47,11 @@ struct fib_lookup_test { const char *daddr; int expected_ret; const char *expected_src; + const char *expected_dst; int lookup_flags; __u32 tbid; __u8 dmac[6]; + __u32 mark; }; static const struct fib_lookup_test tests[] = { @@ -90,10 +103,47 @@ static const struct fib_lookup_test tests[] = { .daddr = IPV6_ADDR_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, .expected_src = IPV6_IFACE_ADDR_SEC, .lookup_flags = BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_SKIP_NEIGH, }, + /* policy routing */ + { .desc = "IPv4 policy routing, default", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, }, + { .desc = "IPv4 policy routing, mark doesn't point to a policy", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK_NO_POLICY, }, + { .desc = "IPv4 policy routing, mark points to a policy", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW2, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, + { .desc = "IPv4 policy routing, mark points 
to a policy, but no flag", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW1, + .lookup_flags = BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, + { .desc = "IPv6 policy routing, default", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, }, + { .desc = "IPv6 policy routing, mark doesn't point to a policy", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK_NO_POLICY, }, + { .desc = "IPv6 policy routing, mark points to a policy", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW2, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, + { .desc = "IPv6 policy routing, mark points to a policy, but no flag", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW1, + .lookup_flags = BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, }; -static int ifindex; - static int setup_netns(void) { int err; @@ -144,12 +194,24 @@ static int setup_netns(void) if (!ASSERT_OK(err, "write_sysctl(net.ipv6.conf.veth1.forwarding)")) goto fail; + /* Setup for policy routing tests */ + SYS(fail, "ip addr add %s/24 dev veth1", IPV4_LOCAL); + SYS(fail, "ip addr add %s/64 dev veth1 nodad", IPV6_LOCAL); + SYS(fail, "ip route add %s/32 via %s", IPV4_REMOTE_DST, IPV4_GW1); + SYS(fail, "ip route add %s/32 via %s table %s", IPV4_REMOTE_DST, IPV4_GW2, MARK_TABLE); + SYS(fail, "ip -6 route add %s/128 via %s", IPV6_REMOTE_DST, IPV6_GW1); + SYS(fail, "ip -6 route add %s/128 via %s table %s", IPV6_REMOTE_DST, IPV6_GW2, MARK_TABLE); + SYS(fail, "ip rule add prio 2 fwmark %d lookup %s", MARK, MARK_TABLE); + SYS(fail, "ip -6 rule add prio 2 fwmark %d lookup %s", MARK, MARK_TABLE); + return 0; fail: return -1; } -static int set_lookup_params(struct bpf_fib_lookup *params, const struct fib_lookup_test *test) +static int set_lookup_params(struct bpf_fib_lookup *params, + const struct fib_lookup_test *test, + int ifindex) { int ret; @@ -158,6 +220,7 @@ static int set_lookup_params(struct bpf_fib_lookup *params, const struct fib_loo params->l4_protocol = IPPROTO_TCP; params->ifindex = ifindex; params->tbid = test->tbid; + params->mark = test->mark; if (inet_pton(AF_INET6, test->daddr, params->ipv6_dst) == 1) { params->family = AF_INET6; @@ -190,40 +253,45 @@ static void mac_str(char *b, const __u8 *mac) mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); } -static void assert_src_ip(struct bpf_fib_lookup *fib_params, const char *expected_src) +static void assert_ip_address(int family, void *addr, const char *expected_str) { + char str[INET6_ADDRSTRLEN]; + u8 expected_addr[16]; + int addr_len = 0; int ret; - __u32 src6[4]; - __be32 src4; - switch (fib_params->family) { + switch (family) { case AF_INET6: - ret = inet_pton(AF_INET6, expected_src, src6); - ASSERT_EQ(ret, 1, "inet_pton(expected_src)"); - - ret = memcmp(src6, fib_params->ipv6_src, sizeof(fib_params->ipv6_src)); - if (!ASSERT_EQ(ret, 0, "fib_lookup ipv6 src")) { - char str_src6[64]; - - inet_ntop(AF_INET6, fib_params->ipv6_src, str_src6, - sizeof(str_src6)); - printf("ipv6 expected %s actual %s ", expected_src, - str_src6); - } - + ret = inet_pton(AF_INET6, expected_str, expected_addr); + ASSERT_EQ(ret, 1, "inet_pton(AF_INET6, expected_str)"); + addr_len = 16; break; case AF_INET: - 
ret = inet_pton(AF_INET, expected_src, &src4);
-		ASSERT_EQ(ret, 1, "inet_pton(expected_src)");
-
-		ASSERT_EQ(fib_params->ipv4_src, src4, "fib_lookup ipv4 src");
-
+		ret = inet_pton(AF_INET, expected_str, expected_addr);
+		ASSERT_EQ(ret, 1, "inet_pton(AF_INET, expected_str)");
+		addr_len = 4;
 		break;
 	default:
-		PRINT_FAIL("invalid addr family: %d", fib_params->family);
+		PRINT_FAIL("invalid address family: %d", family);
+		break;
+	}
+
+	if (memcmp(addr, expected_addr, addr_len)) {
+		inet_ntop(family, addr, str, sizeof(str));
+		PRINT_FAIL("expected %s actual %s ", expected_str, str);
+	}
 }
 
+static void assert_src_ip(struct bpf_fib_lookup *params, const char *expected)
+{
+	assert_ip_address(params->family, params->ipv6_src, expected);
+}
+
+static void assert_dst_ip(struct bpf_fib_lookup *params, const char *expected)
+{
+	assert_ip_address(params->family, params->ipv6_dst, expected);
+}
+
 void test_fib_lookup(void)
 {
 	struct bpf_fib_lookup *fib_params;
@@ -256,15 +324,18 @@ void test_fib_lookup(void)
 	if (setup_netns())
 		goto fail;
 
-	ifindex = if_nametoindex("veth1");
-	skb.ifindex = ifindex;
+	skb.ifindex = if_nametoindex("veth1");
+	if (!ASSERT_NEQ(skb.ifindex, 0, "if_nametoindex(veth1)"))
+		goto fail;
+
 	fib_params = &skel->bss->fib_params;
 
 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
 		printf("Testing %s ", tests[i].desc);
 
-		if (set_lookup_params(fib_params, &tests[i]))
+		if (set_lookup_params(fib_params, &tests[i], skb.ifindex))
 			continue;
+
 		skel->bss->fib_lookup_ret = -1;
 		skel->bss->lookup_flags = tests[i].lookup_flags;
 
@@ -278,6 +349,9 @@ void test_fib_lookup(void)
 		if (tests[i].expected_src)
 			assert_src_ip(fib_params, tests[i].expected_src);
 
+		if (tests[i].expected_dst)
+			assert_dst_ip(fib_params, tests[i].expected_dst);
+
 		ret = memcmp(tests[i].dmac, fib_params->dmac, sizeof(tests[i].dmac));
 		if (!ASSERT_EQ(ret, 0, "dmac not match")) {
 			char expected[18], actual[18];
diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
index 05000810e..51628455b 100644
--- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
@@ -336,15 +336,80 @@ static bool symbol_equal(long key1, long key2, void *ctx __maybe_unused)
 	return strcmp((const char *) key1, (const char *) key2) == 0;
 }
 
+static bool is_invalid_entry(char *buf, bool kernel)
+{
+	if (kernel && strchr(buf, '['))
+		return true;
+	if (!kernel && !strchr(buf, '['))
+		return true;
+	return false;
+}
+
+static bool skip_entry(char *name)
+{
+	/*
+	 * We attach to almost all kernel functions and some of them
+	 * will cause 'suspicious RCU usage' when fprobe is attached
+	 * to them. Filter out the current culprits - arch_cpu_idle
+	 * default_idle and rcu_* functions.
+	 */
+	if (!strcmp(name, "arch_cpu_idle"))
+		return true;
+	if (!strcmp(name, "default_idle"))
+		return true;
+	if (!strncmp(name, "rcu_", 4))
+		return true;
+	if (!strcmp(name, "bpf_dispatcher_xdp_func"))
+		return true;
+	if (!strncmp(name, "__ftrace_invalid_address__",
+		     sizeof("__ftrace_invalid_address__") - 1))
+		return true;
+	return false;
+}
+
+/* Do comparison by ignoring '.llvm.' suffixes. */
+static int compare_name(const char *name1, const char *name2)
+{
+	const char *res1, *res2;
+	int len1, len2;
+
+	res1 = strstr(name1, ".llvm.");
+	res2 = strstr(name2, ".llvm.");
+	len1 = res1 ? res1 - name1 : strlen(name1);
+	len2 = res2 ? res2 - name2 : strlen(name2);
+
+	if (len1 == len2)
+		return strncmp(name1, name2, len1);
+	if (len1 < len2)
+		return strncmp(name1, name2, len1) <= 0 ? -1 : 1;
+	return strncmp(name1, name2, len2) >= 0 ? 1 : -1;
+}
+
+static int load_kallsyms_compare(const void *p1, const void *p2)
+{
+	return compare_name(((const struct ksym *)p1)->name, ((const struct ksym *)p2)->name);
+}
+
+static int search_kallsyms_compare(const void *p1, const struct ksym *p2)
+{
+	return compare_name(p1, p2->name);
+}
+
 static int get_syms(char ***symsp, size_t *cntp, bool kernel)
 {
-	size_t cap = 0, cnt = 0, i;
-	char *name = NULL, **syms = NULL;
+	size_t cap = 0, cnt = 0;
+	char *name = NULL, *ksym_name, **syms = NULL;
 	struct hashmap *map;
+	struct ksyms *ksyms;
+	struct ksym *ks;
 	char buf[256];
 	FILE *f;
 	int err = 0;
 
+	ksyms = load_kallsyms_custom_local(load_kallsyms_compare);
+	if (!ASSERT_OK_PTR(ksyms, "load_kallsyms_custom_local"))
+		return -EINVAL;
+
 	/*
 	 * The available_filter_functions contains many duplicates,
 	 * but other than that all symbols are usable in kprobe multi
@@ -368,33 +433,23 @@ static int get_syms(char ***symsp, size_t *cntp, bool kernel)
 	}
 
 	while (fgets(buf, sizeof(buf), f)) {
-		if (kernel && strchr(buf, '['))
-			continue;
-		if (!kernel && !strchr(buf, '['))
+		if (is_invalid_entry(buf, kernel))
 			continue;
 
 		free(name);
 		if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1)
 			continue;
-		/*
-		 * We attach to almost all kernel functions and some of them
-		 * will cause 'suspicious RCU usage' when fprobe is attached
-		 * to them. Filter out the current culprits - arch_cpu_idle
-		 * default_idle and rcu_* functions.
-		 */
-		if (!strcmp(name, "arch_cpu_idle"))
-			continue;
-		if (!strcmp(name, "default_idle"))
-			continue;
-		if (!strncmp(name, "rcu_", 4))
-			continue;
-		if (!strcmp(name, "bpf_dispatcher_xdp_func"))
-			continue;
-		if (!strncmp(name, "__ftrace_invalid_address__",
-			     sizeof("__ftrace_invalid_address__") - 1))
+		if (skip_entry(name))
 			continue;
 
-		err = hashmap__add(map, name, 0);
+		ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare);
+		if (!ks) {
+			err = -EINVAL;
+			goto error;
+		}
+
+		ksym_name = ks->name;
+		err = hashmap__add(map, ksym_name, 0);
 		if (err == -EEXIST) {
 			err = 0;
 			continue;
@@ -407,8 +462,7 @@ static int get_syms(char ***symsp, size_t *cntp, bool kernel)
 		if (err)
 			goto error;
 
-		syms[cnt++] = name;
-		name = NULL;
+		syms[cnt++] = ksym_name;
 	}
 
 	*symsp = syms;
@@ -418,42 +472,88 @@ static int get_syms(char ***symsp, size_t *cntp, bool kernel)
 	free(name);
 	fclose(f);
 	hashmap__free(map);
-	if (err) {
-		for (i = 0; i < cnt; i++)
-			free(syms[i]);
+	if (err)
 		free(syms);
+	return err;
+}
+
+static int get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel)
+{
+	unsigned long *addr, *addrs, *tmp_addrs;
+	int err = 0, max_cnt, inc_cnt;
+	char *name = NULL;
+	size_t cnt = 0;
+	char buf[256];
+	FILE *f;
+
+	if (access("/sys/kernel/tracing/trace", F_OK) == 0)
+		f = fopen("/sys/kernel/tracing/available_filter_functions_addrs", "r");
+	else
+		f = fopen("/sys/kernel/debug/tracing/available_filter_functions_addrs", "r");
+
+	if (!f)
+		return -ENOENT;
+
+	/* In my local setup, the number of entries is 50k+, so let's initially
+	 * allocate space to hold 64k entries. If 64k is not enough, incrementally
+	 * increase by 1k entries each time.
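+	 * (Note that realloc() takes its size argument in bytes, hence the
+	 * sizeof(long) scaling on each (re)allocation below.)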
+	 */
+	max_cnt = 65536;
+	inc_cnt = 1024;
+	addrs = malloc(max_cnt * sizeof(long));
+	if (addrs == NULL) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	while (fgets(buf, sizeof(buf), f)) {
+		if (is_invalid_entry(buf, kernel))
+			continue;
+
+		free(name);
+		if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2)
+			continue;
+		if (skip_entry(name))
+			continue;
+
+		if (cnt == max_cnt) {
+			max_cnt += inc_cnt;
+			tmp_addrs = realloc(addrs, max_cnt * sizeof(long));
+			if (!tmp_addrs) {
+				err = -ENOMEM;
+				goto error;
+			}
+			addrs = tmp_addrs;
+		}
+
+		addrs[cnt++] = (unsigned long)addr;
+	}
+
+	*addrsp = addrs;
+	*cntp = cnt;
+
+error:
+	free(name);
+	fclose(f);
+	if (err)
+		free(addrs);
 	return err;
 }
 
-static void test_kprobe_multi_bench_attach(bool kernel)
+static void do_bench_test(struct kprobe_multi_empty *skel, struct bpf_kprobe_multi_opts *opts)
 {
-	LIBBPF_OPTS(bpf_kprobe_multi_opts, opts);
-	struct kprobe_multi_empty *skel = NULL;
 	long attach_start_ns, attach_end_ns;
 	long detach_start_ns, detach_end_ns;
 	double attach_delta, detach_delta;
 	struct bpf_link *link = NULL;
-	char **syms = NULL;
-	size_t cnt = 0, i;
-
-	if (!ASSERT_OK(get_syms(&syms, &cnt, kernel), "get_syms"))
-		return;
-
-	skel = kprobe_multi_empty__open_and_load();
-	if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load"))
-		goto cleanup;
-
-	opts.syms = (const char **) syms;
-	opts.cnt = cnt;
 
 	attach_start_ns = get_time_ns();
 	link = bpf_program__attach_kprobe_multi_opts(skel->progs.test_kprobe_empty,
-						     NULL, &opts);
+						     NULL, opts);
 	attach_end_ns = get_time_ns();
 
 	if (!ASSERT_OK_PTR(link, "bpf_program__attach_kprobe_multi_opts"))
-		goto cleanup;
+		return;
 
 	detach_start_ns = get_time_ns();
 	bpf_link__destroy(link);
@@ -462,17 +562,65 @@ static void test_kprobe_multi_bench_attach(bool kernel)
 	attach_delta = (attach_end_ns - attach_start_ns) / 1000000000.0;
 	detach_delta = (detach_end_ns - detach_start_ns) / 1000000000.0;
 
-	printf("%s: found %lu functions\n", __func__, cnt);
+	printf("%s: found %lu functions\n", __func__, opts->cnt);
 	printf("%s: attached in %7.3lfs\n", __func__, attach_delta);
 	printf("%s: detached in %7.3lfs\n", __func__, detach_delta);
+}
+
+static void test_kprobe_multi_bench_attach(bool kernel)
+{
+	LIBBPF_OPTS(bpf_kprobe_multi_opts, opts);
+	struct kprobe_multi_empty *skel = NULL;
+	char **syms = NULL;
+	size_t cnt = 0;
+
+	if (!ASSERT_OK(get_syms(&syms, &cnt, kernel), "get_syms"))
+		return;
+
+	skel = kprobe_multi_empty__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load"))
+		goto cleanup;
+
+	opts.syms = (const char **) syms;
+	opts.cnt = cnt;
+
+	do_bench_test(skel, &opts);
 
 cleanup:
 	kprobe_multi_empty__destroy(skel);
-	if (syms) {
-		for (i = 0; i < cnt; i++)
-			free(syms[i]);
+	if (syms)
 		free(syms);
+}
+
+static void test_kprobe_multi_bench_attach_addr(bool kernel)
+{
+	LIBBPF_OPTS(bpf_kprobe_multi_opts, opts);
+	struct kprobe_multi_empty *skel = NULL;
+	unsigned long *addrs = NULL;
+	size_t cnt = 0;
+	int err;
+
+	err = get_addrs(&addrs, &cnt, kernel);
+	if (err == -ENOENT) {
+		test__skip();
+		return;
+	}
+
+	if (!ASSERT_OK(err, "get_addrs"))
+		return;
+
+	skel = kprobe_multi_empty__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load"))
+		goto cleanup;
+
+	opts.addrs = addrs;
+	opts.cnt = cnt;
+
+	do_bench_test(skel, &opts);
+
+cleanup:
+	kprobe_multi_empty__destroy(skel);
+	free(addrs);
 }
 
 static void test_attach_override(void)
@@ -515,6 +663,10 @@ void serial_test_kprobe_multi_bench_attach(void)
 		test_kprobe_multi_bench_attach(true);
 	if (test__start_subtest("modules"))
test_kprobe_multi_bench_attach(false); + if (test__start_subtest("kernel")) + test_kprobe_multi_bench_attach_addr(true); + if (test__start_subtest("modules")) + test_kprobe_multi_bench_attach_addr(false); } void test_kprobe_multi_test(void) diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms.c b/tools/testing/selftests/bpf/prog_tests/ksyms.c index b295969b2..dc7aab532 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms.c @@ -5,8 +5,6 @@ #include "test_ksyms.skel.h" #include -static int duration; - void test_ksyms(void) { const char *btf_path = "/sys/kernel/btf/vmlinux"; @@ -18,43 +16,37 @@ void test_ksyms(void) int err; err = kallsyms_find("bpf_link_fops", &link_fops_addr); - if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) + if (!ASSERT_NEQ(err, -EINVAL, "bpf_link_fops: kallsyms_fopen")) return; - if (CHECK(err == -ENOENT, "ksym_find", "symbol 'bpf_link_fops' not found\n")) + if (!ASSERT_NEQ(err, -ENOENT, "bpf_link_fops: ksym_find")) return; err = kallsyms_find("__per_cpu_start", &per_cpu_start_addr); - if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) + if (!ASSERT_NEQ(err, -EINVAL, "__per_cpu_start: kallsyms_fopen")) return; - if (CHECK(err == -ENOENT, "ksym_find", "symbol 'per_cpu_start' not found\n")) + if (!ASSERT_NEQ(err, -ENOENT, "__per_cpu_start: ksym_find")) return; - if (CHECK(stat(btf_path, &st), "stat_btf", "err %d\n", errno)) + if (!ASSERT_OK(stat(btf_path, &st), "stat_btf")) return; btf_size = st.st_size; skel = test_ksyms__open_and_load(); - if (CHECK(!skel, "skel_open", "failed to open and load skeleton\n")) + if (!ASSERT_OK_PTR(skel, "test_ksyms__open_and_load")) return; err = test_ksyms__attach(skel); - if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + if (!ASSERT_OK(err, "test_ksyms__attach")) goto cleanup; /* trigger tracepoint */ usleep(1); data = skel->data; - CHECK(data->out__bpf_link_fops != link_fops_addr, "bpf_link_fops", - "got 0x%llx, exp 0x%llx\n", - data->out__bpf_link_fops, link_fops_addr); - CHECK(data->out__bpf_link_fops1 != 0, "bpf_link_fops1", - "got %llu, exp %llu\n", data->out__bpf_link_fops1, (__u64)0); - CHECK(data->out__btf_size != btf_size, "btf_size", - "got %llu, exp %llu\n", data->out__btf_size, btf_size); - CHECK(data->out__per_cpu_start != per_cpu_start_addr, "__per_cpu_start", - "got %llu, exp %llu\n", data->out__per_cpu_start, - per_cpu_start_addr); + ASSERT_EQ(data->out__bpf_link_fops, link_fops_addr, "bpf_link_fops"); + ASSERT_EQ(data->out__bpf_link_fops1, 0, "bpf_link_fops1"); + ASSERT_EQ(data->out__btf_size, btf_size, "btf_size"); + ASSERT_EQ(data->out__per_cpu_start, per_cpu_start_addr, "__per_cpu_start"); cleanup: test_ksyms__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c index 24d493482..e72d75d6b 100644 --- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c @@ -12,77 +12,229 @@ #include #include #include +#include "network_helpers.h" #define STACK_SIZE (1024 * 1024) static char child_stack[STACK_SIZE]; -static int test_current_pid_tgid(void *args) +static int get_pid_tgid(pid_t *pid, pid_t *tgid, + struct test_ns_current_pid_tgid__bss *bss) { - struct test_ns_current_pid_tgid__bss *bss; - struct test_ns_current_pid_tgid *skel; - int err = -1, duration = 0; - pid_t tgid, pid; struct stat st; + int err; - skel = 
test_ns_current_pid_tgid__open_and_load(); - if (CHECK(!skel, "skel_open_load", "failed to load skeleton\n")) - goto cleanup; - - pid = syscall(SYS_gettid); - tgid = getpid(); + *pid = syscall(SYS_gettid); + *tgid = getpid(); err = stat("/proc/self/ns/pid", &st); - if (CHECK(err, "stat", "failed /proc/self/ns/pid: %d\n", err)) - goto cleanup; + if (!ASSERT_OK(err, "stat /proc/self/ns/pid")) + return err; - bss = skel->bss; bss->dev = st.st_dev; bss->ino = st.st_ino; bss->user_pid = 0; bss->user_tgid = 0; + return 0; +} + +static int test_current_pid_tgid_tp(void *args) +{ + struct test_ns_current_pid_tgid__bss *bss; + struct test_ns_current_pid_tgid *skel; + int ret = -1, err; + pid_t tgid, pid; + + skel = test_ns_current_pid_tgid__open(); + if (!ASSERT_OK_PTR(skel, "test_ns_current_pid_tgid__open")) + return ret; + + bpf_program__set_autoload(skel->progs.tp_handler, true); + + err = test_ns_current_pid_tgid__load(skel); + if (!ASSERT_OK(err, "test_ns_current_pid_tgid__load")) + goto cleanup; + + bss = skel->bss; + if (get_pid_tgid(&pid, &tgid, bss)) + goto cleanup; err = test_ns_current_pid_tgid__attach(skel); - if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + if (!ASSERT_OK(err, "test_ns_current_pid_tgid__attach")) goto cleanup; /* trigger tracepoint */ usleep(1); - ASSERT_EQ(bss->user_pid, pid, "pid"); - ASSERT_EQ(bss->user_tgid, tgid, "tgid"); - err = 0; + if (!ASSERT_EQ(bss->user_pid, pid, "pid")) + goto cleanup; + if (!ASSERT_EQ(bss->user_tgid, tgid, "tgid")) + goto cleanup; + ret = 0; + +cleanup: + test_ns_current_pid_tgid__destroy(skel); + return ret; +} + +static int test_current_pid_tgid_cgrp(void *args) +{ + struct test_ns_current_pid_tgid__bss *bss; + struct test_ns_current_pid_tgid *skel; + int server_fd = -1, ret = -1, err; + int cgroup_fd = *(int *)args; + pid_t tgid, pid; + + skel = test_ns_current_pid_tgid__open(); + if (!ASSERT_OK_PTR(skel, "test_ns_current_pid_tgid__open")) + return ret; + + bpf_program__set_autoload(skel->progs.cgroup_bind4, true); + + err = test_ns_current_pid_tgid__load(skel); + if (!ASSERT_OK(err, "test_ns_current_pid_tgid__load")) + goto cleanup; + + bss = skel->bss; + if (get_pid_tgid(&pid, &tgid, bss)) + goto cleanup; + + skel->links.cgroup_bind4 = bpf_program__attach_cgroup( + skel->progs.cgroup_bind4, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.cgroup_bind4, "bpf_program__attach_cgroup")) + goto cleanup; + + server_fd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0); + if (!ASSERT_GE(server_fd, 0, "start_server")) + goto cleanup; + + if (!ASSERT_EQ(bss->user_pid, pid, "pid")) + goto cleanup; + if (!ASSERT_EQ(bss->user_tgid, tgid, "tgid")) + goto cleanup; + ret = 0; cleanup: - test_ns_current_pid_tgid__destroy(skel); + if (server_fd >= 0) + close(server_fd); + test_ns_current_pid_tgid__destroy(skel); + return ret; +} + +static int test_current_pid_tgid_sk_msg(void *args) +{ + int verdict, map, server_fd = -1, client_fd = -1; + struct test_ns_current_pid_tgid__bss *bss; + static const char send_msg[] = "message"; + struct test_ns_current_pid_tgid *skel; + int ret = -1, err, key = 0; + pid_t tgid, pid; + + skel = test_ns_current_pid_tgid__open(); + if (!ASSERT_OK_PTR(skel, "test_ns_current_pid_tgid__open")) + return ret; + + bpf_program__set_autoload(skel->progs.sk_msg, true); + + err = test_ns_current_pid_tgid__load(skel); + if (!ASSERT_OK(err, "test_ns_current_pid_tgid__load")) + goto cleanup; + + bss = skel->bss; + if (get_pid_tgid(&pid, &tgid, skel->bss)) + goto cleanup; + + verdict = bpf_program__fd(skel->progs.sk_msg); + 
map = bpf_map__fd(skel->maps.sock_map); + err = bpf_prog_attach(verdict, map, BPF_SK_MSG_VERDICT, 0); + if (!ASSERT_OK(err, "prog_attach")) + goto cleanup; + + server_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); + if (!ASSERT_GE(server_fd, 0, "start_server")) + goto cleanup; - return err; + client_fd = connect_to_fd(server_fd, 0); + if (!ASSERT_GE(client_fd, 0, "connect_to_fd")) + goto cleanup; + + err = bpf_map_update_elem(map, &key, &client_fd, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto cleanup; + + err = send(client_fd, send_msg, sizeof(send_msg), 0); + if (!ASSERT_EQ(err, sizeof(send_msg), "send(msg)")) + goto cleanup; + + if (!ASSERT_EQ(bss->user_pid, pid, "pid")) + goto cleanup; + if (!ASSERT_EQ(bss->user_tgid, tgid, "tgid")) + goto cleanup; + ret = 0; + +cleanup: + if (server_fd >= 0) + close(server_fd); + if (client_fd >= 0) + close(client_fd); + test_ns_current_pid_tgid__destroy(skel); + return ret; } -static void test_ns_current_pid_tgid_new_ns(void) +static void test_ns_current_pid_tgid_new_ns(int (*fn)(void *), void *arg) { - int wstatus, duration = 0; + int wstatus; pid_t cpid; /* Create a process in a new namespace, this process * will be the init process of this new namespace hence will be pid 1. */ - cpid = clone(test_current_pid_tgid, child_stack + STACK_SIZE, - CLONE_NEWPID | SIGCHLD, NULL); + cpid = clone(fn, child_stack + STACK_SIZE, + CLONE_NEWPID | SIGCHLD, arg); - if (CHECK(cpid == -1, "clone", "%s\n", strerror(errno))) + if (!ASSERT_NEQ(cpid, -1, "clone")) return; - if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", "%s\n", strerror(errno))) + if (!ASSERT_NEQ(waitpid(cpid, &wstatus, 0), -1, "waitpid")) return; - if (CHECK(WEXITSTATUS(wstatus) != 0, "newns_pidtgid", "failed")) + if (!ASSERT_OK(WEXITSTATUS(wstatus), "newns_pidtgid")) return; } +static void test_in_netns(int (*fn)(void *), void *arg) +{ + struct nstoken *nstoken = NULL; + + SYS(cleanup, "ip netns add ns_current_pid_tgid"); + SYS(cleanup, "ip -net ns_current_pid_tgid link set dev lo up"); + + nstoken = open_netns("ns_current_pid_tgid"); + if (!ASSERT_OK_PTR(nstoken, "open_netns")) + goto cleanup; + + test_ns_current_pid_tgid_new_ns(fn, arg); + +cleanup: + if (nstoken) + close_netns(nstoken); + SYS_NOFAIL("ip netns del ns_current_pid_tgid"); +} + /* TODO: use a different tracepoint */ void serial_test_ns_current_pid_tgid(void) { - if (test__start_subtest("ns_current_pid_tgid_root_ns")) - test_current_pid_tgid(NULL); - if (test__start_subtest("ns_current_pid_tgid_new_ns")) - test_ns_current_pid_tgid_new_ns(); + if (test__start_subtest("root_ns_tp")) + test_current_pid_tgid_tp(NULL); + if (test__start_subtest("new_ns_tp")) + test_ns_current_pid_tgid_new_ns(test_current_pid_tgid_tp, NULL); + if (test__start_subtest("new_ns_cgrp")) { + int cgroup_fd = -1; + + cgroup_fd = test__join_cgroup("/sock_addr"); + if (ASSERT_GE(cgroup_fd, 0, "join_cgroup")) { + test_in_netns(test_current_pid_tgid_cgrp, &cgroup_fd); + close(cgroup_fd); + } + } + if (test__start_subtest("new_ns_sk_msg")) + test_in_netns(test_current_pid_tgid_sk_msg, NULL); } diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c index ee5372c7f..098776d00 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c @@ -93,9 +93,56 @@ static void test_struct_ops_load(void) struct_ops_module__destroy(skel); } +static void 
test_struct_ops_not_zeroed(void) +{ + struct struct_ops_module *skel; + int err; + + /* zeroed is 0, and zeroed_op is null */ + skel = struct_ops_module__open(); + if (!ASSERT_OK_PTR(skel, "struct_ops_module_open")) + return; + + err = struct_ops_module__load(skel); + ASSERT_OK(err, "struct_ops_module_load"); + + struct_ops_module__destroy(skel); + + /* zeroed is not 0 */ + skel = struct_ops_module__open(); + if (!ASSERT_OK_PTR(skel, "struct_ops_module_open_not_zeroed")) + return; + + /* libbpf should reject the testmod_zeroed since struct + * bpf_testmod_ops in the kernel has no "zeroed" field and the + * value of "zeroed" is non-zero. + */ + skel->struct_ops.testmod_zeroed->zeroed = 0xdeadbeef; + err = struct_ops_module__load(skel); + ASSERT_ERR(err, "struct_ops_module_load_not_zeroed"); + + struct_ops_module__destroy(skel); + + /* zeroed_op is not null */ + skel = struct_ops_module__open(); + if (!ASSERT_OK_PTR(skel, "struct_ops_module_open_not_zeroed_op")) + return; + + /* libbpf should reject the testmod_zeroed since the value of its + * "zeroed_op" is not null. + */ + skel->struct_ops.testmod_zeroed->zeroed_op = skel->progs.test_3; + err = struct_ops_module__load(skel); + ASSERT_ERR(err, "struct_ops_module_load_not_zeroed_op"); + + struct_ops_module__destroy(skel); +} + void serial_test_struct_ops_module(void) { if (test__start_subtest("test_struct_ops_load")) test_struct_ops_load(); + if (test__start_subtest("test_struct_ops_not_zeroed")) + test_struct_ops_not_zeroed(); } diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 3db416606..fe65e0952 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -673,7 +673,7 @@ static __noinline void fill(struct bpf_iter_num *it, int *arr, __u32 n, int mul) static __noinline int sum(struct bpf_iter_num *it, int *arr, __u32 n) { - int *t, i, sum = 0;; + int *t, i, sum = 0; while ((t = bpf_iter_num_next(it))) { i = *t; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_module.c b/tools/testing/selftests/bpf/progs/struct_ops_module.c index 026cabfa7..86e1e50c5 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_module.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_module.c @@ -23,7 +23,7 @@ void BPF_PROG(test_2, int a, int b) test_2_result = a + b; } -SEC("struct_ops/test_3") +SEC("?struct_ops/test_3") int BPF_PROG(test_3, int a, int b) { test_2_result = a + b + 3; @@ -54,3 +54,17 @@ struct bpf_testmod_ops___v2 testmod_2 = { .test_1 = (void *)test_1, .test_2 = (void *)test_2_v2, }; + +struct bpf_testmod_ops___zeroed { + int (*test_1)(void); + void (*test_2)(int a, int b); + int (*test_maybe_null)(int dummy, struct task_struct *task); + void (*zeroed_op)(int a, int b); + int zeroed; +}; + +SEC(".struct_ops.link") +struct bpf_testmod_ops___zeroed testmod_zeroed = { + .test_1 = (void *)test_1, + .test_2 = (void *)test_2_v2, +}; diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c new file mode 100644 index 000000000..fcfbfe033 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Facebook */ + +#include "vmlinux.h" +#include + +extern void bbr_init(struct sock *sk) __ksym; +extern void bbr_main(struct sock *sk, const struct rate_sample *rs) __ksym; +extern u32 bbr_sndbuf_expand(struct sock *sk) __ksym; +extern u32 bbr_undo_cwnd(struct sock *sk) __ksym; +extern void 
bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; +extern u32 bbr_ssthresh(struct sock *sk) __ksym; +extern u32 bbr_min_tso_segs(struct sock *sk) __ksym; +extern void bbr_set_state(struct sock *sk, u8 new_state) __ksym; + +extern void dctcp_init(struct sock *sk) __ksym; +extern void dctcp_update_alpha(struct sock *sk, u32 flags) __ksym; +extern void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) __ksym; +extern u32 dctcp_ssthresh(struct sock *sk) __ksym; +extern u32 dctcp_cwnd_undo(struct sock *sk) __ksym; +extern void dctcp_state(struct sock *sk, u8 new_state) __ksym; + +extern void cubictcp_init(struct sock *sk) __ksym; +extern u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym; +extern void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) __ksym; +extern void cubictcp_state(struct sock *sk, u8 new_state) __ksym; +extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; +extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym; + +SEC("struct_ops/init") +void BPF_PROG(init, struct sock *sk) +{ + bbr_init(sk); + dctcp_init(sk); + cubictcp_init(sk); +} + +SEC("struct_ops/in_ack_event") +void BPF_PROG(in_ack_event, struct sock *sk, u32 flags) +{ + dctcp_update_alpha(sk, flags); +} + +SEC("struct_ops/cong_control") +void BPF_PROG(cong_control, struct sock *sk, const struct rate_sample *rs) +{ + bbr_main(sk, rs); +} + +SEC("struct_ops/cong_avoid") +void BPF_PROG(cong_avoid, struct sock *sk, u32 ack, u32 acked) +{ + cubictcp_cong_avoid(sk, ack, acked); +} + +SEC("struct_ops/sndbuf_expand") +u32 BPF_PROG(sndbuf_expand, struct sock *sk) +{ + return bbr_sndbuf_expand(sk); +} + +SEC("struct_ops/undo_cwnd") +u32 BPF_PROG(undo_cwnd, struct sock *sk) +{ + bbr_undo_cwnd(sk); + return dctcp_cwnd_undo(sk); +} + +SEC("struct_ops/cwnd_event") +void BPF_PROG(cwnd_event, struct sock *sk, enum tcp_ca_event event) +{ + bbr_cwnd_event(sk, event); + dctcp_cwnd_event(sk, event); + cubictcp_cwnd_event(sk, event); +} + +SEC("struct_ops/ssthresh") +u32 BPF_PROG(ssthresh, struct sock *sk) +{ + bbr_ssthresh(sk); + dctcp_ssthresh(sk); + return cubictcp_recalc_ssthresh(sk); +} + +SEC("struct_ops/min_tso_segs") +u32 BPF_PROG(min_tso_segs, struct sock *sk) +{ + return bbr_min_tso_segs(sk); +} + +SEC("struct_ops/set_state") +void BPF_PROG(set_state, struct sock *sk, u8 new_state) +{ + bbr_set_state(sk, new_state); + dctcp_state(sk, new_state); + cubictcp_state(sk, new_state); +} + +SEC("struct_ops/pkts_acked") +void BPF_PROG(pkts_acked, struct sock *sk, const struct ack_sample *sample) +{ + cubictcp_acked(sk, sample); +} + +SEC(".struct_ops") +struct tcp_congestion_ops tcp_ca_kfunc = { + .init = (void *)init, + .in_ack_event = (void *)in_ack_event, + .cong_control = (void *)cong_control, + .cong_avoid = (void *)cong_avoid, + .sndbuf_expand = (void *)sndbuf_expand, + .undo_cwnd = (void *)undo_cwnd, + .cwnd_event = (void *)cwnd_event, + .ssthresh = (void *)ssthresh, + .min_tso_segs = (void *)min_tso_segs, + .set_state = (void *)set_state, + .pkts_acked = (void *)pkts_acked, + .name = "tcp_ca_kfunc", +}; + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_bpf_cookie.c b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c index 5a3a80f75..c83142b55 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_cookie.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c @@ -15,6 +15,8 @@ __u64 uprobe_res; __u64 uretprobe_res; __u64 tp_res; __u64 pe_res; +__u64 raw_tp_res; +__u64 tp_btf_res; __u64 
fentry_res; __u64 fexit_res; __u64 fmod_ret_res; @@ -87,6 +89,20 @@ int handle_pe(struct pt_regs *ctx) return 0; } +SEC("raw_tp/sys_enter") +int handle_raw_tp(void *ctx) +{ + update(ctx, &raw_tp_res); + return 0; +} + +SEC("tp_btf/sys_enter") +int handle_tp_btf(void *ctx) +{ + update(ctx, &tp_btf_res); + return 0; +} + SEC("fentry/bpf_fentry_test1") int BPF_PROG(fentry_test1, int a) { diff --git a/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c index 0763d49f9..386315afa 100644 --- a/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c @@ -5,23 +5,48 @@ #include #include +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u32); +} sock_map SEC(".maps"); + __u64 user_pid = 0; __u64 user_tgid = 0; __u64 dev = 0; __u64 ino = 0; -SEC("tracepoint/syscalls/sys_enter_nanosleep") -int handler(const void *ctx) +static void get_pid_tgid(void) { struct bpf_pidns_info nsdata; if (bpf_get_ns_current_pid_tgid(dev, ino, &nsdata, sizeof(struct bpf_pidns_info))) - return 0; + return; user_pid = nsdata.pid; user_tgid = nsdata.tgid; +} +SEC("?tracepoint/syscalls/sys_enter_nanosleep") +int tp_handler(const void *ctx) +{ + get_pid_tgid(); return 0; } +SEC("?cgroup/bind4") +int cgroup_bind4(struct bpf_sock_addr *ctx) +{ + get_pid_tgid(); + return 1; +} + +SEC("?sk_msg") +int sk_msg(struct sk_msg_md *msg) +{ + get_pid_tgid(); + return SK_PASS; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 5fda43901..2619ed193 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Facebook - #include #include #include @@ -9,82 +8,126 @@ char _license[] SEC("license") = "GPL"; -long hits = 0; +#define CPU_MASK 255 +#define MAX_CPUS (CPU_MASK + 1) /* should match MAX_BUCKETS in benchs/bench_trigger.c */ -SEC("tp/syscalls/sys_enter_getpgid") -int bench_trigger_tp(void *ctx) +/* matches struct counter in bench.h */ +struct counter { + long value; +} __attribute__((aligned(128))); + +struct counter hits[MAX_CPUS]; + +static __always_inline void inc_counter(void) +{ + int cpu = bpf_get_smp_processor_id(); + + __sync_add_and_fetch(&hits[cpu & CPU_MASK].value, 1); +} + +SEC("?uprobe") +int bench_trigger_uprobe(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -SEC("raw_tp/sys_enter") -int BPF_PROG(bench_trigger_raw_tp, struct pt_regs *regs, long id) +const volatile int batch_iters = 0; + +SEC("?raw_tp") +int trigger_count(void *ctx) { - if (id == __NR_getpgid) - __sync_add_and_fetch(&hits, 1); + int i; + + for (i = 0; i < batch_iters; i++) + inc_counter(); + return 0; } -SEC("kprobe/" SYS_PREFIX "sys_getpgid") +SEC("?raw_tp") +int trigger_driver(void *ctx) +{ + int i; + + for (i = 0; i < batch_iters; i++) + (void)bpf_get_numa_node_id(); /* attach point for benchmarking */ + + return 0; +} + +extern int bpf_modify_return_test_tp(int nonce) __ksym __weak; + +SEC("?raw_tp") +int trigger_driver_kfunc(void *ctx) +{ + int i; + + for (i = 0; i < batch_iters; i++) + (void)bpf_modify_return_test_tp(0); /* attach point for benchmarking */ + + return 0; +} + +SEC("?kprobe/bpf_get_numa_node_id") int bench_trigger_kprobe(void *ctx) { - __sync_add_and_fetch(&hits, 
1); + inc_counter(); return 0; } -SEC("kretprobe/" SYS_PREFIX "sys_getpgid") +SEC("?kretprobe/bpf_get_numa_node_id") int bench_trigger_kretprobe(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -SEC("kprobe.multi/" SYS_PREFIX "sys_getpgid") +SEC("?kprobe.multi/bpf_get_numa_node_id") int bench_trigger_kprobe_multi(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -SEC("kretprobe.multi/" SYS_PREFIX "sys_getpgid") +SEC("?kretprobe.multi/bpf_get_numa_node_id") int bench_trigger_kretprobe_multi(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -SEC("fentry/" SYS_PREFIX "sys_getpgid") +SEC("?fentry/bpf_get_numa_node_id") int bench_trigger_fentry(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -SEC("fexit/" SYS_PREFIX "sys_getpgid") +SEC("?fexit/bpf_get_numa_node_id") int bench_trigger_fexit(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -SEC("fentry.s/" SYS_PREFIX "sys_getpgid") -int bench_trigger_fentry_sleep(void *ctx) +SEC("?fmod_ret/bpf_modify_return_test_tp") +int bench_trigger_fmodret(void *ctx) { - __sync_add_and_fetch(&hits, 1); - return 0; + inc_counter(); + return -22; } -SEC("fmod_ret/" SYS_PREFIX "sys_getpgid") -int bench_trigger_fmodret(void *ctx) +SEC("?tp/bpf_test_run/bpf_trigger_tp") +int bench_trigger_tp(void *ctx) { - __sync_add_and_fetch(&hits, 1); - return -22; + inc_counter(); + return 0; } -SEC("uprobe") -int bench_trigger_uprobe(void *ctx) +SEC("?raw_tp/bpf_trigger_tp") +int bench_trigger_rawtp(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } diff --git a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c index 0ede0ccd0..059aa716e 100644 --- a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c +++ b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c @@ -30,7 +30,7 @@ struct { SEC("kprobe") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_KPROBE") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void in_bpf_prog_type_kprobe_1(void) { asm volatile (" \ @@ -44,7 +44,7 @@ __naked void in_bpf_prog_type_kprobe_1(void) SEC("tracepoint") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_TRACEPOINT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void in_bpf_prog_type_tracepoint_1(void) { asm volatile (" \ @@ -58,7 +58,7 @@ __naked void in_bpf_prog_type_tracepoint_1(void) SEC("perf_event") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_PERF_EVENT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void bpf_prog_type_perf_event_1(void) { asm volatile (" \ @@ -72,7 +72,7 @@ __naked void bpf_prog_type_perf_event_1(void) SEC("raw_tracepoint") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_RAW_TRACEPOINT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void bpf_prog_type_raw_tracepoint_1(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/test_cpp.cpp b/tools/testing/selftests/bpf/test_cpp.cpp index f4936834f..dde0bb16e 100644 --- 
a/tools/testing/selftests/bpf/test_cpp.cpp +++ b/tools/testing/selftests/bpf/test_cpp.cpp @@ -7,6 +7,7 @@ #include #include #include "test_core_extern.skel.h" +#include "struct_ops_module.skel.h" template class Skeleton { @@ -98,6 +99,7 @@ int main(int argc, char *argv[]) { struct btf_dump_opts opts = { }; struct test_core_extern *skel; + struct struct_ops_module *skel2; struct btf *btf; int fd; @@ -118,6 +120,9 @@ int main(int argc, char *argv[]) skel = test_core_extern__open_and_load(); test_core_extern__destroy(skel); + skel2 = struct_ops_module__open_and_load(); + struct_ops_module__destroy(skel2); + fd = bpf_enable_stats(BPF_STATS_RUN_TIME); if (fd < 0) std::cout << "FAILED to enable stats: " << fd << std::endl; diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index 910044f08..7989ec608 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -72,7 +72,6 @@ cleanup() { server_listen() { ip netns exec "${ns2}" nc "${netcat_opt}" -l "${port}" > "${outfile}" & server_pid=$! - sleep 0.2 } client_connect() { @@ -93,6 +92,16 @@ verify_data() { fi } +wait_for_port() { + for i in $(seq 20); do + if ip netns exec "${ns2}" ss ${2:--4}OHntl | grep -q "$1"; then + return 0 + fi + sleep 0.1 + done + return 1 +} + set -e # no arguments: automated test, run all @@ -193,6 +202,7 @@ setup # basic communication works echo "test basic connectivity" server_listen +wait_for_port ${port} ${netcat_opt} client_connect verify_data @@ -204,6 +214,7 @@ ip netns exec "${ns1}" tc filter add dev veth1 egress \ section "encap_${tuntype}_${mac}" echo "test bpf encap without decap (expect failure)" server_listen +wait_for_port ${port} ${netcat_opt} ! client_connect if [[ "$tuntype" =~ "udp" ]]; then diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 27fd7ed3e..7f45b4cb4 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -61,12 +61,7 @@ void free_kallsyms_local(struct ksyms *ksyms) free(ksyms); } -static int ksym_cmp(const void *p1, const void *p2) -{ - return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr; -} - -struct ksyms *load_kallsyms_local(void) +static struct ksyms *load_kallsyms_local_common(ksym_cmp_t cmp_cb) { FILE *f; char func[256], buf[256]; @@ -100,7 +95,7 @@ struct ksyms *load_kallsyms_local(void) goto error; } fclose(f); - qsort(ksyms->syms, ksyms->sym_cnt, sizeof(struct ksym), ksym_cmp); + qsort(ksyms->syms, ksyms->sym_cnt, sizeof(struct ksym), cmp_cb); return ksyms; error: @@ -109,6 +104,21 @@ struct ksyms *load_kallsyms_local(void) return NULL; } +static int ksym_cmp(const void *p1, const void *p2) +{ + return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr; +} + +struct ksyms *load_kallsyms_local(void) +{ + return load_kallsyms_local_common(ksym_cmp); +} + +struct ksyms *load_kallsyms_custom_local(ksym_cmp_t cmp_cb) +{ + return load_kallsyms_local_common(cmp_cb); +} + int load_kallsyms(void) { pthread_mutex_lock(&ksyms_mutex); @@ -148,6 +158,28 @@ struct ksym *ksym_search_local(struct ksyms *ksyms, long key) return &ksyms->syms[0]; } +struct ksym *search_kallsyms_custom_local(struct ksyms *ksyms, const void *p, + ksym_search_cmp_t cmp_cb) +{ + int start = 0, mid, end = ksyms->sym_cnt; + struct ksym *ks; + int result; + + while (start < end) { + mid = start + (end - start) / 2; + ks = &ksyms->syms[mid]; + result = cmp_cb(p, ks); + if (result < 0) + end = 
mid; + else if (result > 0) + start = mid + 1; + else + return ks; + } + + return NULL; +} + struct ksym *ksym_search(long key) { if (!ksyms) diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index 04fd1da70..d1ed71789 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -13,6 +13,9 @@ struct ksym { }; struct ksyms; +typedef int (*ksym_cmp_t)(const void *p1, const void *p2); +typedef int (*ksym_search_cmp_t)(const void *p1, const struct ksym *p2); + int load_kallsyms(void); struct ksym *ksym_search(long key); long ksym_get_addr(const char *name); @@ -22,6 +25,10 @@ struct ksym *ksym_search_local(struct ksyms *ksyms, long key); long ksym_get_addr_local(struct ksyms *ksyms, const char *name); void free_kallsyms_local(struct ksyms *ksyms); +struct ksyms *load_kallsyms_custom_local(ksym_cmp_t cmp_cb); +struct ksym *search_kallsyms_custom_local(struct ksyms *ksyms, const void *p1, + ksym_search_cmp_t cmp_cb); + /* open kallsyms and find addresses on the fly, faster than load + search. */ int kallsyms_find(const char *sym, unsigned long long *addr); diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile new file mode 100644 index 000000000..2259a39a7 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: GPL-2.0+ OR MIT + +TEST_PROGS = \ + devlink_port_split.py \ + ethtool.sh \ + ethtool_extended_state.sh \ + ethtool_mm.sh \ + ethtool_rmon.sh \ + hw_stats_l3.sh \ + hw_stats_l3_gre.sh \ + loopback.sh \ + # + +TEST_FILES := \ + ethtool_lib.sh \ + # + +TEST_INCLUDES := \ + ../../../net/lib.sh \ + ../../../net/forwarding/lib.sh \ + ../../../net/forwarding/ipip_lib.sh \ + ../../../net/forwarding/tc_common.sh \ + # + +include ../../../lib.mk diff --git a/tools/testing/selftests/drivers/net/hw/devlink_port_split.py b/tools/testing/selftests/drivers/net/hw/devlink_port_split.py new file mode 100755 index 000000000..2d84c7a0b --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/devlink_port_split.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from subprocess import PIPE, Popen +import json +import time +import argparse +import collections +import sys + +# +# Test port split configuration using devlink-port lanes attribute. +# The test is skipped in case the attribute is not available. +# +# First, check that all the ports with 1 lane fail to split. +# Second, check that all the ports with more than 1 lane can be split +# to all valid configurations (e.g., split to 2, split to 4 etc.) +# + + +# Kselftest framework requirement - SKIP code is 4 +KSFT_SKIP=4 +Port = collections.namedtuple('Port', 'bus_info name') + + +def run_command(cmd, should_fail=False): + """ + Run a command in subprocess. + Return: Tuple of (stdout, stderr). + """ + + p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True) + stdout, stderr = p.communicate() + stdout, stderr = stdout.decode(), stderr.decode() + + if stderr != "" and not should_fail: + print("Error sending command: %s" % cmd) + print(stdout) + print(stderr) + return stdout, stderr + + +class devlink_ports(object): + """ + Class that holds information on the devlink ports, required to the tests; + if_names: A list of interfaces in the devlink ports. + """ + + def get_if_names(dev): + """ + Get a list of physical devlink ports. + Return: Array of tuples (bus_info/port, if_name). 
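+        Example return value (bus info and netdev names are hypothetical):
+            [Port(bus_info='pci/0000:03:00.0/1', name='eth0'),
+             Port(bus_info='pci/0000:03:00.0/2', name='eth1')]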
+ """ + + arr = [] + + cmd = "devlink -j port show" + stdout, stderr = run_command(cmd) + assert stderr == "" + ports = json.loads(stdout)['port'] + + validate_devlink_output(ports, 'flavour') + + for port in ports: + if dev in port: + if ports[port]['flavour'] == 'physical': + arr.append(Port(bus_info=port, name=ports[port]['netdev'])) + + return arr + + def __init__(self, dev): + self.if_names = devlink_ports.get_if_names(dev) + + +def get_max_lanes(port): + """ + Get the $port's maximum number of lanes. + Return: number of lanes, e.g. 1, 2, 4 and 8. + """ + + cmd = "devlink -j port show %s" % port + stdout, stderr = run_command(cmd) + assert stderr == "" + values = list(json.loads(stdout)['port'].values())[0] + + if 'lanes' in values: + lanes = values['lanes'] + else: + lanes = 0 + return lanes + + +def get_split_ability(port): + """ + Get the $port split ability. + Return: split ability, true or false. + """ + + cmd = "devlink -j port show %s" % port.name + stdout, stderr = run_command(cmd) + assert stderr == "" + values = list(json.loads(stdout)['port'].values())[0] + + return values['splittable'] + + +def split(k, port, should_fail=False): + """ + Split $port into $k ports. + If should_fail == True, the split should fail. Otherwise, should pass. + Return: Array of sub ports after splitting. + If the $port wasn't split, the array will be empty. + """ + + cmd = "devlink port split %s count %s" % (port.bus_info, k) + stdout, stderr = run_command(cmd, should_fail=should_fail) + + if should_fail: + if not test(stderr != "", "%s is unsplittable" % port.name): + print("split an unsplittable port %s" % port.name) + return create_split_group(port, k) + else: + if stderr == "": + return create_split_group(port, k) + print("didn't split a splittable port %s" % port.name) + + return [] + + +def unsplit(port): + """ + Unsplit $port. + """ + + cmd = "devlink port unsplit %s" % port + stdout, stderr = run_command(cmd) + test(stderr == "", "Unsplit port %s" % port) + + +def exists(port, dev): + """ + Check if $port exists in the devlink ports. + Return: True is so, False otherwise. + """ + + return any(dev_port.name == port + for dev_port in devlink_ports.get_if_names(dev)) + + +def exists_and_lanes(ports, lanes, dev): + """ + Check if every port in the list $ports exists in the devlink ports and has + $lanes number of lanes after splitting. + Return: True if both are True, False otherwise. + """ + + for port in ports: + max_lanes = get_max_lanes(port) + if not exists(port, dev): + print("port %s doesn't exist in devlink ports" % port) + return False + if max_lanes != lanes: + print("port %s has %d lanes, but %s were expected" + % (port, lanes, max_lanes)) + return False + return True + + +def test(cond, msg): + """ + Check $cond and print a message accordingly. + Return: True is pass, False otherwise. + """ + + if cond: + print("TEST: %-60s [ OK ]" % msg) + else: + print("TEST: %-60s [FAIL]" % msg) + + return cond + + +def create_split_group(port, k): + """ + Create the split group for $port. + Return: Array with $k elements, which are the split port group. + """ + + return list(port.name + "s" + str(i) for i in range(k)) + + +def split_unsplittable_port(port, k): + """ + Test that splitting of unsplittable port fails. + """ + + # split to max + new_split_group = split(k, port, should_fail=True) + + if new_split_group != []: + unsplit(port.bus_info) + + +def split_splittable_port(port, k, lanes, dev): + """ + Test that splitting of splittable port passes correctly. 
+ """ + + new_split_group = split(k, port) + + # Once the split command ends, it takes some time to the sub ifaces' + # to get their names. Use udevadm to continue only when all current udev + # events are handled. + cmd = "udevadm settle" + stdout, stderr = run_command(cmd) + assert stderr == "" + + if new_split_group != []: + test(exists_and_lanes(new_split_group, lanes/k, dev), + "split port %s into %s" % (port.name, k)) + + unsplit(port.bus_info) + + +def validate_devlink_output(devlink_data, target_property=None): + """ + Determine if test should be skipped by checking: + 1. devlink_data contains values + 2. The target_property exist in devlink_data + """ + skip_reason = None + if any(devlink_data.values()): + if target_property: + skip_reason = "{} not found in devlink output, test skipped".format(target_property) + for key in devlink_data: + if target_property in devlink_data[key]: + skip_reason = None + else: + skip_reason = 'devlink output is empty, test skipped' + + if skip_reason: + print(skip_reason) + sys.exit(KSFT_SKIP) + + +def make_parser(): + parser = argparse.ArgumentParser(description='A test for port splitting.') + parser.add_argument('--dev', + help='The devlink handle of the device under test. ' + + 'The default is the first registered devlink ' + + 'handle.') + + return parser + + +def main(cmdline=None): + parser = make_parser() + args = parser.parse_args(cmdline) + + dev = args.dev + if not dev: + cmd = "devlink -j dev show" + stdout, stderr = run_command(cmd) + assert stderr == "" + + validate_devlink_output(json.loads(stdout)) + devs = json.loads(stdout)['dev'] + dev = list(devs.keys())[0] + + cmd = "devlink dev show %s" % dev + stdout, stderr = run_command(cmd) + if stderr != "": + print("devlink device %s can not be found" % dev) + sys.exit(1) + + ports = devlink_ports(dev) + + found_max_lanes = False + for port in ports.if_names: + max_lanes = get_max_lanes(port.name) + + # If max lanes is 0, do not test port splitting at all + if max_lanes == 0: + continue + + # If 1 lane, shouldn't be able to split + elif max_lanes == 1: + test(not get_split_ability(port), + "%s should not be able to split" % port.name) + split_unsplittable_port(port, max_lanes) + + # Else, splitting should pass and all the split ports should exist. 
+ else: + lane = max_lanes + test(get_split_ability(port), + "%s should be able to split" % port.name) + while lane > 1: + split_splittable_port(port, lane, max_lanes, dev) + + lane //= 2 + found_max_lanes = True + + if not found_max_lanes: + print(f"Test not started, no port of device {dev} reports max_lanes") + sys.exit(KSFT_SKIP) + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/hw/ethtool.sh b/tools/testing/selftests/drivers/net/hw/ethtool.sh new file mode 100755 index 000000000..bb12d5d70 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/ethtool.sh @@ -0,0 +1,300 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + same_speeds_autoneg_off + different_speeds_autoneg_off + combination_of_neg_on_and_off + advertise_subset_of_speeds + check_highest_speed_is_chosen + different_speeds_autoneg_on +" +NUM_NETIFS=2 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh +source ethtool_lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/24 +} + +h1_destroy() +{ + simple_if_fini $h1 192.0.2.1/24 +} + +h2_create() +{ + simple_if_init $h2 192.0.2.2/24 +} + +h2_destroy() +{ + simple_if_fini $h2 192.0.2.2/24 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + h2=${NETIFS[p2]} + + h1_create + h2_create +} + +cleanup() +{ + pre_cleanup + + h2_destroy + h1_destroy +} + +same_speeds_autoneg_off() +{ + # Check that when each of the reported speeds is forced, the links come + # up and are operational. + local -a speeds_arr=($(common_speeds_get $h1 $h2 0 0)) + + for speed in "${speeds_arr[@]}"; do + RET=0 + ethtool_set $h1 speed $speed autoneg off + ethtool_set $h2 speed $speed autoneg off + + setup_wait_dev_with_timeout $h1 + setup_wait_dev_with_timeout $h2 + ping_do $h1 192.0.2.2 + check_err $? "speed $speed autoneg off" + log_test "force of same speed autoneg off" + log_info "speed = $speed" + done + + ethtool -s $h2 autoneg on + ethtool -s $h1 autoneg on +} + +different_speeds_autoneg_off() +{ + # Test that when we force different speeds, links are not up and ping + # fails. + RET=0 + + local -a speeds_arr=($(different_speeds_get $h1 $h2 0 0)) + local speed1=${speeds_arr[0]} + local speed2=${speeds_arr[1]} + + ethtool_set $h1 speed $speed1 autoneg off + ethtool_set $h2 speed $speed2 autoneg off + + setup_wait_dev_with_timeout $h1 + setup_wait_dev_with_timeout $h2 + ping_do $h1 192.0.2.2 + check_fail $? "ping with different speeds" + + log_test "force of different speeds autoneg off" + + ethtool -s $h2 autoneg on + ethtool -s $h1 autoneg on +} + +combination_of_neg_on_and_off() +{ + # Test that when one device is forced to a speed supported by both + # endpoints and the other device is configured to autoneg on, the links + # are up and ping passes. + local -a speeds_arr=($(common_speeds_get $h1 $h2 0 1)) + + for speed in "${speeds_arr[@]}"; do + RET=0 + ethtool_set $h1 speed $speed autoneg off + + setup_wait_dev_with_timeout $h1 + setup_wait_dev_with_timeout $h2 + ping_do $h1 192.0.2.2 + check_err $? 
"h1-speed=$speed autoneg off, h2 autoneg on" + log_test "one side with autoneg off and another with autoneg on" + log_info "force speed = $speed" + done + + ethtool -s $h1 autoneg on +} + +hex_speed_value_get() +{ + local speed=$1; shift + + local shift_size=${speed_values[$speed]} + speed=$((0x1 << $"shift_size")) + printf "%#x" "$speed" +} + +subset_of_common_speeds_get() +{ + local dev1=$1; shift + local dev2=$1; shift + local adver=$1; shift + + local -a speeds_arr=($(common_speeds_get $dev1 $dev2 0 $adver)) + local speed_to_advertise=0 + local speed_to_remove=${speeds_arr[0]} + speed_to_remove+='base' + + local -a speeds_mode_arr=($(common_speeds_get $dev1 $dev2 1 $adver)) + + for speed in ${speeds_mode_arr[@]}; do + if [[ $speed != $speed_to_remove* ]]; then + speed=$(hex_speed_value_get $speed) + speed_to_advertise=$(($speed_to_advertise | \ + $speed)) + fi + + done + + # Convert to hex. + printf "%#x" "$speed_to_advertise" +} + +speed_to_advertise_get() +{ + # The function returns the hex number that is composed by OR-ing all + # the modes corresponding to the provided speed. + local speed_without_mode=$1; shift + local supported_speeds=("$@"); shift + local speed_to_advertise=0 + + speed_without_mode+='base' + + for speed in ${supported_speeds[@]}; do + if [[ $speed == $speed_without_mode* ]]; then + speed=$(hex_speed_value_get $speed) + speed_to_advertise=$(($speed_to_advertise | \ + $speed)) + fi + + done + + # Convert to hex. + printf "%#x" "$speed_to_advertise" +} + +advertise_subset_of_speeds() +{ + # Test that when one device advertises a subset of speeds and another + # advertises a specific speed (but all modes of this speed), the links + # are up and ping passes. + RET=0 + + local speed_1_to_advertise=$(subset_of_common_speeds_get $h1 $h2 1) + ethtool_set $h1 advertise $speed_1_to_advertise + + if [ $RET != 0 ]; then + log_test "advertise subset of speeds" + return + fi + + local -a speeds_arr_without_mode=($(common_speeds_get $h1 $h2 0 1)) + # Check only speeds that h1 advertised. Remove the first speed. + unset speeds_arr_without_mode[0] + local -a speeds_arr_with_mode=($(common_speeds_get $h1 $h2 1 1)) + + for speed_value in ${speeds_arr_without_mode[@]}; do + RET=0 + local speed_2_to_advertise=$(speed_to_advertise_get $speed_value \ + "${speeds_arr_with_mode[@]}") + ethtool_set $h2 advertise $speed_2_to_advertise + + setup_wait_dev_with_timeout $h1 + setup_wait_dev_with_timeout $h2 + ping_do $h1 192.0.2.2 + check_err $? "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise ($speed_value)" + + log_test "advertise subset of speeds" + log_info "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise" + done + + ethtool -s $h2 autoneg on + ethtool -s $h1 autoneg on +} + +check_highest_speed_is_chosen() +{ + # Test that when one device advertises a subset of speeds, the other + # chooses the highest speed. This test checks configuration without + # traffic. 
+ RET=0 + + local max_speed + local chosen_speed + local speed_to_advertise=$(subset_of_common_speeds_get $h1 $h2 1) + + ethtool_set $h1 advertise $speed_to_advertise + + if [ $RET != 0 ]; then + log_test "check highest speed" + return + fi + + local -a speeds_arr=($(common_speeds_get $h1 $h2 0 1)) + + max_speed=${speeds_arr[0]} + for current in ${speeds_arr[@]}; do + if [[ $current -gt $max_speed ]]; then + max_speed=$current + fi + done + + setup_wait_dev_with_timeout $h1 + setup_wait_dev_with_timeout $h2 + chosen_speed=$(ethtool $h1 | grep 'Speed:') + chosen_speed=${chosen_speed%"Mb/s"*} + chosen_speed=${chosen_speed#*"Speed: "} + ((chosen_speed == max_speed)) + check_err $? "h1 advertise $speed_to_advertise, h2 sync to speed $chosen_speed" + + log_test "check highest speed" + + ethtool -s $h2 autoneg on + ethtool -s $h1 autoneg on +} + +different_speeds_autoneg_on() +{ + # Test that when we configure links to advertise different speeds, + # links are not up and ping fails. + RET=0 + + local -a speeds=($(different_speeds_get $h1 $h2 1 1)) + local speed1=${speeds[0]} + local speed2=${speeds[1]} + + speed1=$(hex_speed_value_get $speed1) + speed2=$(hex_speed_value_get $speed2) + + ethtool_set $h1 advertise $speed1 + ethtool_set $h2 advertise $speed2 + + if (($RET)); then + setup_wait_dev_with_timeout $h1 + setup_wait_dev_with_timeout $h2 + ping_do $h1 192.0.2.2 + check_fail $? "ping with different speeds autoneg on" + fi + + log_test "advertise different speeds autoneg on" + + ethtool -s $h2 autoneg on + ethtool -s $h1 autoneg on +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +declare -gA speed_values +eval "speed_values=($(speeds_arr_get))" + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/ethtool_extended_state.sh b/tools/testing/selftests/drivers/net/hw/ethtool_extended_state.sh new file mode 100755 index 000000000..a75844484 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/ethtool_extended_state.sh @@ -0,0 +1,116 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + autoneg + autoneg_force_mode + no_cable +" + +NUM_NETIFS=2 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh +source ethtool_lib.sh + +TIMEOUT=$((WAIT_TIMEOUT * 1000)) # ms + +setup_prepare() +{ + swp1=${NETIFS[p1]} + swp2=${NETIFS[p2]} + swp3=$NETIF_NO_CABLE +} + +ethtool_ext_state() +{ + local dev=$1; shift + local expected_ext_state=$1; shift + local expected_ext_substate=${1:-""}; shift + + local ext_state=$(ethtool $dev | grep "Link detected" \ + | cut -d "(" -f2 | cut -d ")" -f1) + local ext_substate=$(echo $ext_state | cut -sd "," -f2 \ + | sed -e 's/^[[:space:]]*//') + ext_state=$(echo $ext_state | cut -d "," -f1) + + if [[ $ext_state != $expected_ext_state ]]; then + echo "Expected \"$expected_ext_state\", got \"$ext_state\"" + return 1 + fi + if [[ $ext_substate != $expected_ext_substate ]]; then + echo "Expected \"$expected_ext_substate\", got \"$ext_substate\"" + return 1 + fi +} + +autoneg() +{ + local msg + + RET=0 + + ip link set dev $swp1 up + + msg=$(busywait $TIMEOUT ethtool_ext_state $swp1 \ + "Autoneg" "No partner detected") + check_err $? 
"$msg" + + log_test "Autoneg, No partner detected" + + ip link set dev $swp1 down +} + +autoneg_force_mode() +{ + local msg + + RET=0 + + ip link set dev $swp1 up + ip link set dev $swp2 up + + local -a speeds_arr=($(different_speeds_get $swp1 $swp2 0 0)) + local speed1=${speeds_arr[0]} + local speed2=${speeds_arr[1]} + + ethtool_set $swp1 speed $speed1 autoneg off + ethtool_set $swp2 speed $speed2 autoneg off + + msg=$(busywait $TIMEOUT ethtool_ext_state $swp1 \ + "Autoneg" "No partner detected during force mode") + check_err $? "$msg" + + msg=$(busywait $TIMEOUT ethtool_ext_state $swp2 \ + "Autoneg" "No partner detected during force mode") + check_err $? "$msg" + + log_test "Autoneg, No partner detected during force mode" + + ethtool -s $swp2 autoneg on + ethtool -s $swp1 autoneg on + + ip link set dev $swp2 down + ip link set dev $swp1 down +} + +no_cable() +{ + local msg + + RET=0 + + ip link set dev $swp3 up + + msg=$(busywait $TIMEOUT ethtool_ext_state $swp3 "No cable") + check_err $? "$msg" + + log_test "No cable" + + ip link set dev $swp3 down +} + +setup_prepare + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/ethtool_lib.sh b/tools/testing/selftests/drivers/net/hw/ethtool_lib.sh new file mode 100644 index 000000000..b9bfb4508 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/ethtool_lib.sh @@ -0,0 +1,120 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +speeds_arr_get() +{ + cmd='/ETHTOOL_LINK_MODE_[^[:space:]]*_BIT[[:space:]]+=[[:space:]]+/ \ + {sub(/,$/, "") \ + sub(/ETHTOOL_LINK_MODE_/,"") \ + sub(/_BIT/,"") \ + sub(/_Full/,"/Full") \ + sub(/_Half/,"/Half");\ + print "["$1"]="$3}' + + awk "${cmd}" /usr/include/linux/ethtool.h +} + +ethtool_set() +{ + local cmd="$@" + local out=$(ethtool -s $cmd 2>&1 | wc -l) + + check_err $out "error in configuration. $cmd" +} + +dev_linkmodes_params_get() +{ + local dev=$1; shift + local adver=$1; shift + local -a linkmodes_params + local param_count + local arr + + if (($adver)); then + mode="Advertised link modes" + else + mode="Supported link modes" + fi + + local -a dev_linkmodes=($(dev_speeds_get $dev 1 $adver)) + for ((i=0; i<${#dev_linkmodes[@]}; i++)); do + linkmodes_params[$i]=$(echo -e "${dev_linkmodes[$i]}" | \ + # Replaces all non numbers with spaces + sed -e 's/[^0-9]/ /g' | \ + # Squeeze spaces in sequence to 1 space + tr -s ' ') + # Count how many numbers were found in the linkmode + param_count=$(echo "${linkmodes_params[$i]}" | wc -w) + if [[ $param_count -eq 1 ]]; then + linkmodes_params[$i]="${linkmodes_params[$i]} 1" + elif [[ $param_count -ge 3 ]]; then + arr=(${linkmodes_params[$i]}) + # Take only first two params + linkmodes_params[$i]=$(echo "${arr[@]:0:2}") + fi + done + echo ${linkmodes_params[@]} +} + +dev_speeds_get() +{ + local dev=$1; shift + local with_mode=$1; shift + local adver=$1; shift + local speeds_str + + if (($adver)); then + mode="Advertised link modes" + else + mode="Supported link modes" + fi + + speeds_str=$(ethtool "$dev" | \ + # Snip everything before the link modes section. + sed -n '/'"$mode"':/,$p' | \ + # Quit processing the rest at the start of the next section. + # When checking, skip the header of this section (hence the 2,). + sed -n '2,${/^[\t][^ \t]/q};p' | \ + # Drop the section header of the current section. 
+ cut -d':' -f2) + + local -a speeds_arr=($speeds_str) + if [[ $with_mode -eq 0 ]]; then + for ((i=0; i<${#speeds_arr[@]}; i++)); do + speeds_arr[$i]=${speeds_arr[$i]%base*} + done + fi + echo ${speeds_arr[@]} +} + +common_speeds_get() +{ + dev1=$1; shift + dev2=$1; shift + with_mode=$1; shift + adver=$1; shift + + local -a dev1_speeds=($(dev_speeds_get $dev1 $with_mode $adver)) + local -a dev2_speeds=($(dev_speeds_get $dev2 $with_mode $adver)) + + comm -12 \ + <(printf '%s\n' "${dev1_speeds[@]}" | sort -u) \ + <(printf '%s\n' "${dev2_speeds[@]}" | sort -u) +} + +different_speeds_get() +{ + local dev1=$1; shift + local dev2=$1; shift + local with_mode=$1; shift + local adver=$1; shift + + local -a speeds_arr + + speeds_arr=($(common_speeds_get $dev1 $dev2 $with_mode $adver)) + if [[ ${#speeds_arr[@]} < 2 ]]; then + check_err 1 "cannot check different speeds. There are not enough speeds" + fi + + echo ${speeds_arr[0]} ${speeds_arr[1]} +} diff --git a/tools/testing/selftests/drivers/net/hw/ethtool_mm.sh b/tools/testing/selftests/drivers/net/hw/ethtool_mm.sh new file mode 100755 index 000000000..c301e735c --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/ethtool_mm.sh @@ -0,0 +1,341 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + manual_with_verification_h1_to_h2 + manual_with_verification_h2_to_h1 + manual_without_verification_h1_to_h2 + manual_without_verification_h2_to_h1 + manual_failed_verification_h1_to_h2 + manual_failed_verification_h2_to_h1 + lldp +" + +NUM_NETIFS=2 +REQUIRE_MZ=no +PREEMPTIBLE_PRIO=0 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh + +traffic_test() +{ + local if=$1; shift + local src=$1; shift + local num_pkts=10000 + local before= + local after= + local delta= + + if [ ${has_pmac_stats[$if]} = false ]; then + src="aggregate" + fi + + before=$(ethtool_std_stats_get $if "eth-mac" "FramesTransmittedOK" $src) + + $MZ $if -q -c $num_pkts -p 64 -b bcast -t ip -R $PREEMPTIBLE_PRIO + + after=$(ethtool_std_stats_get $if "eth-mac" "FramesTransmittedOK" $src) + + delta=$((after - before)) + + # Allow an extra 1% tolerance for random packets sent by the stack + [ $delta -ge $num_pkts ] && [ $delta -le $((num_pkts + 100)) ] +} + +manual_with_verification() +{ + local tx=$1; shift + local rx=$1; shift + + RET=0 + + # It isn't completely clear from IEEE 802.3-2018 Figure 99-5: Transmit + # Processing state diagram whether the "send_r" variable (send response + # to verification frame) should be taken into consideration while the + # MAC Merge TX direction is disabled. That being said, at least the + # NXP ENETC does not, and requires tx-enabled on in order to respond to + # the link partner's verification frames. + ethtool --set-mm $rx tx-enabled on + ethtool --set-mm $tx verify-enabled on tx-enabled on + + # Wait for verification to finish + sleep 1 + + ethtool --json --show-mm $tx | jq -r '.[]."verify-status"' | \ + grep -q 'SUCCEEDED' + check_err "$?" "Verification did not succeed" + + ethtool --json --show-mm $tx | jq -r '.[]."tx-active"' | grep -q 'true' + check_err "$?" "pMAC TX is not active" + + traffic_test $tx "pmac" + check_err "$?" 
"Traffic did not get sent through $tx's pMAC" + + ethtool --set-mm $tx verify-enabled off tx-enabled off + ethtool --set-mm $rx tx-enabled off + + log_test "Manual configuration with verification: $tx to $rx" +} + +manual_with_verification_h1_to_h2() +{ + manual_with_verification $h1 $h2 +} + +manual_with_verification_h2_to_h1() +{ + manual_with_verification $h2 $h1 +} + +manual_without_verification() +{ + local tx=$1; shift + local rx=$1; shift + + RET=0 + + ethtool --set-mm $tx verify-enabled off tx-enabled on + + ethtool --json --show-mm $tx | jq -r '.[]."verify-status"' | \ + grep -q 'DISABLED' + check_err "$?" "Verification is not disabled" + + ethtool --json --show-mm $tx | jq -r '.[]."tx-active"' | grep -q 'true' + check_err "$?" "pMAC TX is not active" + + traffic_test $tx "pmac" + check_err "$?" "Traffic did not get sent through $tx's pMAC" + + ethtool --set-mm $tx verify-enabled off tx-enabled off + + log_test "Manual configuration without verification: $tx to $rx" +} + +manual_without_verification_h1_to_h2() +{ + manual_without_verification $h1 $h2 +} + +manual_without_verification_h2_to_h1() +{ + manual_without_verification $h2 $h1 +} + +manual_failed_verification() +{ + local tx=$1; shift + local rx=$1; shift + + RET=0 + + ethtool --set-mm $rx pmac-enabled off + ethtool --set-mm $tx verify-enabled on tx-enabled on + + # Wait for verification to time out + sleep 1 + + ethtool --json --show-mm $tx | jq -r '.[]."verify-status"' | \ + grep -q 'SUCCEEDED' + check_fail "$?" "Verification succeeded when it shouldn't have" + + ethtool --json --show-mm $tx | jq -r '.[]."tx-active"' | grep -q 'true' + check_fail "$?" "pMAC TX is active when it shouldn't have" + + traffic_test $tx "emac" + check_err "$?" "Traffic did not get sent through $tx's eMAC" + + ethtool --set-mm $tx verify-enabled off tx-enabled off + ethtool --set-mm $rx pmac-enabled on + + log_test "Manual configuration with failed verification: $tx to $rx" +} + +manual_failed_verification_h1_to_h2() +{ + manual_failed_verification $h1 $h2 +} + +manual_failed_verification_h2_to_h1() +{ + manual_failed_verification $h2 $h1 +} + +smallest_supported_add_frag_size() +{ + local iface=$1 + local rx_min_frag_size= + + rx_min_frag_size=$(ethtool --json --show-mm $iface | \ + jq '.[]."rx-min-frag-size"') + + if [ $rx_min_frag_size -le 60 ]; then + echo 0 + elif [ $rx_min_frag_size -le 124 ]; then + echo 1 + elif [ $rx_min_frag_size -le 188 ]; then + echo 2 + elif [ $rx_min_frag_size -le 252 ]; then + echo 3 + else + echo "$iface: RX min frag size $rx_min_frag_size cannot be advertised over LLDP" + exit 1 + fi +} + +expected_add_frag_size() +{ + local iface=$1 + local requested=$2 + local min=$(smallest_supported_add_frag_size $iface) + + [ $requested -le $min ] && echo $min || echo $requested +} + +lldp_change_add_frag_size() +{ + local add_frag_size=$1 + local pattern= + + lldptool -T -i $h1 -V addEthCaps addFragSize=$add_frag_size >/dev/null + # Wait for TLVs to be received + sleep 2 + pattern=$(printf "Additional fragment size: %d" \ + $(expected_add_frag_size $h1 $add_frag_size)) + lldptool -i $h2 -t -n -V addEthCaps | grep -q "$pattern" +} + +lldp() +{ + RET=0 + + systemctl start lldpad + + # Configure the interfaces to receive and transmit LLDPDUs + lldptool -L -i $h1 adminStatus=rxtx >/dev/null + lldptool -L -i $h2 adminStatus=rxtx >/dev/null + + # Enable the transmission of Additional Ethernet Capabilities TLV + lldptool -T -i $h1 -V addEthCaps enableTx=yes >/dev/null + lldptool -T -i $h2 -V addEthCaps enableTx=yes >/dev/null + 
+ # Wait for TLVs to be received + sleep 2 + + lldptool -i $h1 -t -n -V addEthCaps | \ + grep -q "Preemption capability active" + check_err "$?" "$h1 pMAC TX is not active" + + lldptool -i $h2 -t -n -V addEthCaps | \ + grep -q "Preemption capability active" + check_err "$?" "$h2 pMAC TX is not active" + + lldp_change_add_frag_size 3 + check_err "$?" "addFragSize 3" + + lldp_change_add_frag_size 2 + check_err "$?" "addFragSize 2" + + lldp_change_add_frag_size 1 + check_err "$?" "addFragSize 1" + + lldp_change_add_frag_size 0 + check_err "$?" "addFragSize 0" + + traffic_test $h1 "pmac" + check_err "$?" "Traffic did not get sent through $h1's pMAC" + + traffic_test $h2 "pmac" + check_err "$?" "Traffic did not get sent through $h2's pMAC" + + systemctl stop lldpad + + log_test "LLDP" +} + +h1_create() +{ + ip link set dev $h1 up + + tc qdisc add dev $h1 root mqprio num_tc 4 map 0 1 2 3 \ + queues 1@0 1@1 1@2 1@3 \ + fp P E E E \ + hw 1 + + ethtool --set-mm $h1 pmac-enabled on tx-enabled off verify-enabled off +} + +h2_create() +{ + ip link set dev $h2 up + + ethtool --set-mm $h2 pmac-enabled on tx-enabled off verify-enabled off + + tc qdisc add dev $h2 root mqprio num_tc 4 map 0 1 2 3 \ + queues 1@0 1@1 1@2 1@3 \ + fp P E E E \ + hw 1 +} + +h1_destroy() +{ + ethtool --set-mm $h1 pmac-enabled off tx-enabled off verify-enabled off + + tc qdisc del dev $h1 root + + ip link set dev $h1 down +} + +h2_destroy() +{ + tc qdisc del dev $h2 root + + ethtool --set-mm $h2 pmac-enabled off tx-enabled off verify-enabled off + + ip link set dev $h2 down +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + h2=${NETIFS[p2]} + + h1_create + h2_create +} + +cleanup() +{ + pre_cleanup + + h2_destroy + h1_destroy +} + +check_ethtool_mm_support +check_tc_fp_support +require_command lldptool +bail_on_lldpad "autoconfigure the MAC Merge layer" "configure it manually" + +for netif in ${NETIFS[@]}; do + ethtool --show-mm $netif 2>&1 &> /dev/null + if [[ $? 
-ne 0 ]]; then + echo "SKIP: $netif does not support MAC Merge" + exit $ksft_skip + fi + + if check_ethtool_pmac_std_stats_support $netif eth-mac; then + has_pmac_stats[$netif]=true + else + has_pmac_stats[$netif]=false + echo "$netif does not report pMAC statistics, falling back to aggregate" + fi +done + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh b/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh new file mode 100755 index 000000000..e2a1c10d3 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh @@ -0,0 +1,144 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + rmon_rx_histogram + rmon_tx_histogram +" + +NUM_NETIFS=2 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh + +ETH_FCS_LEN=4 +ETH_HLEN=$((6+6+2)) + +declare -A netif_mtu + +ensure_mtu() +{ + local iface=$1; shift + local len=$1; shift + local current=$(ip -j link show dev $iface | jq -r '.[0].mtu') + local required=$((len - ETH_HLEN - ETH_FCS_LEN)) + + if [ $current -lt $required ]; then + ip link set dev $iface mtu $required || return 1 + fi +} + +bucket_test() +{ + local iface=$1; shift + local neigh=$1; shift + local set=$1; shift + local bucket=$1; shift + local len=$1; shift + local num_rx=10000 + local num_tx=20000 + local expected= + local before= + local after= + local delta= + + # Mausezahn does not include FCS bytes in its length - but the + # histogram counters do + len=$((len - ETH_FCS_LEN)) + + before=$(ethtool --json -S $iface --groups rmon | \ + jq -r ".[0].rmon[\"${set}-pktsNtoM\"][$bucket].val") + + # Send 10k one way and 20k in the other, to detect counters + # mapped to the wrong direction + $MZ $neigh -q -c $num_rx -p $len -a own -b bcast -d 10us + $MZ $iface -q -c $num_tx -p $len -a own -b bcast -d 10us + + after=$(ethtool --json -S $iface --groups rmon | \ + jq -r ".[0].rmon[\"${set}-pktsNtoM\"][$bucket].val") + + delta=$((after - before)) + + expected=$([ $set = rx ] && echo $num_rx || echo $num_tx) + + # Allow some extra tolerance for other packets sent by the stack + [ $delta -ge $expected ] && [ $delta -le $((expected + 100)) ] +} + +rmon_histogram() +{ + local iface=$1; shift + local neigh=$1; shift + local set=$1; shift + local nbuckets=0 + local step= + + RET=0 + + while read -r -a bucket; do + step="$set-pkts${bucket[0]}to${bucket[1]} on $iface" + + for if in $iface $neigh; do + if ! ensure_mtu $if ${bucket[0]}; then + log_test_xfail "$if does not support the required MTU for $step" + return + fi + done + + if ! 
bucket_test $iface $neigh $set $nbuckets ${bucket[0]}; then + check_err 1 "$step failed" + return 1 + fi + log_test "$step" + nbuckets=$((nbuckets + 1)) + done < <(ethtool --json -S $iface --groups rmon | \ + jq -r ".[0].rmon[\"${set}-pktsNtoM\"][]|[.low, .high]|@tsv" 2>/dev/null) + + if [ $nbuckets -eq 0 ]; then + log_test_xfail "$iface does not support $set histogram counters" + return + fi +} + +rmon_rx_histogram() +{ + rmon_histogram $h1 $h2 rx + rmon_histogram $h2 $h1 rx +} + +rmon_tx_histogram() +{ + rmon_histogram $h1 $h2 tx + rmon_histogram $h2 $h1 tx +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + h2=${NETIFS[p2]} + + for iface in $h1 $h2; do + netif_mtu[$iface]=$(ip -j link show dev $iface | jq -r '.[0].mtu') + ip link set dev $iface up + done +} + +cleanup() +{ + pre_cleanup + + for iface in $h2 $h1; do + ip link set dev $iface \ + mtu ${netif_mtu[$iface]} \ + down + done +} + +check_ethtool_counter_group_support +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh b/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh new file mode 100755 index 000000000..7dfc50366 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh @@ -0,0 +1,333 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# +--------------------+ +----------------------+ +# | H1 | | H2 | +# | | | | +# | $h1.200 + | | + $h2.200 | +# | 192.0.2.1/28 | | | | 192.0.2.18/28 | +# | 2001:db8:1::1/64 | | | | 2001:db8:2::1/64 | +# | | | | | | +# | $h1 + | | + $h2 | +# | | | | | | +# +------------------|-+ +-|--------------------+ +# | | +# +------------------|-------------------------|--------------------+ +# | SW | | | +# | | | | +# | $rp1 + + $rp2 | +# | | | | +# | $rp1.200 + + $rp2.200 | +# | 192.0.2.2/28 192.0.2.17/28 | +# | 2001:db8:1::2/64 2001:db8:2::2/64 | +# | | +# +-----------------------------------------------------------------+ + +ALL_TESTS=" + ping_ipv4 + ping_ipv6 + test_stats_rx_ipv4 + test_stats_tx_ipv4 + test_stats_rx_ipv6 + test_stats_tx_ipv6 + respin_enablement + test_stats_rx_ipv4 + test_stats_tx_ipv4 + test_stats_rx_ipv6 + test_stats_tx_ipv6 + reapply_config + ping_ipv4 + ping_ipv6 + test_stats_rx_ipv4 + test_stats_tx_ipv4 + test_stats_rx_ipv6 + test_stats_tx_ipv6 + test_stats_report_rx + test_stats_report_tx + test_destroy_enabled + test_double_enable +" +NUM_NETIFS=4 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh + +h1_create() +{ + simple_if_init $h1 + vlan_create $h1 200 v$h1 192.0.2.1/28 2001:db8:1::1/64 + ip route add 192.0.2.16/28 vrf v$h1 nexthop via 192.0.2.2 + ip -6 route add 2001:db8:2::/64 vrf v$h1 nexthop via 2001:db8:1::2 +} + +h1_destroy() +{ + ip -6 route del 2001:db8:2::/64 vrf v$h1 nexthop via 2001:db8:1::2 + ip route del 192.0.2.16/28 vrf v$h1 nexthop via 192.0.2.2 + vlan_destroy $h1 200 + simple_if_fini $h1 +} + +h2_create() +{ + simple_if_init $h2 + vlan_create $h2 200 v$h2 192.0.2.18/28 2001:db8:2::1/64 + ip route add 192.0.2.0/28 vrf v$h2 nexthop via 192.0.2.17 + ip -6 route add 2001:db8:1::/64 vrf v$h2 nexthop via 2001:db8:2::2 +} + +h2_destroy() +{ + ip -6 route del 2001:db8:1::/64 vrf v$h2 nexthop via 2001:db8:2::2 + ip route del 192.0.2.0/28 vrf v$h2 nexthop via 192.0.2.17 + vlan_destroy $h2 200 + simple_if_fini $h2 +} + +router_rp1_200_create() +{ + ip link add name $rp1.200 link $rp1 type vlan id 200 + ip link set dev $rp1.200 addrgenmode eui64 + ip link set dev $rp1.200 up + ip address add dev $rp1.200 192.0.2.2/28 + ip address add 
dev $rp1.200 2001:db8:1::2/64 + ip stats set dev $rp1.200 l3_stats on +} + +router_rp1_200_destroy() +{ + ip stats set dev $rp1.200 l3_stats off + ip address del dev $rp1.200 2001:db8:1::2/64 + ip address del dev $rp1.200 192.0.2.2/28 + ip link del dev $rp1.200 +} + +router_create() +{ + ip link set dev $rp1 up + router_rp1_200_create + + ip link set dev $rp2 up + vlan_create $rp2 200 "" 192.0.2.17/28 2001:db8:2::2/64 +} + +router_destroy() +{ + vlan_destroy $rp2 200 + ip link set dev $rp2 down + + router_rp1_200_destroy + ip link set dev $rp1 down +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + rp1=${NETIFS[p2]} + + rp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + rp1mac=$(mac_get $rp1) + rp2mac=$(mac_get $rp2) + + vrf_prepare + + h1_create + h2_create + + router_create + + forwarding_enable +} + +cleanup() +{ + pre_cleanup + + forwarding_restore + + router_destroy + + h2_destroy + h1_destroy + + vrf_cleanup +} + +ping_ipv4() +{ + ping_test $h1.200 192.0.2.18 " IPv4" +} + +ping_ipv6() +{ + ping_test $h1.200 2001:db8:2::1 " IPv6" +} + +send_packets_rx_ipv4() +{ + # Send 21 packets instead of 20, because the first one might trap and go + # through the SW datapath, which might not bump the HW counter. + $MZ $h1.200 -c 21 -d 20msec -p 100 \ + -a own -b $rp1mac -A 192.0.2.1 -B 192.0.2.18 \ + -q -t udp sp=54321,dp=12345 +} + +send_packets_rx_ipv6() +{ + $MZ $h1.200 -6 -c 21 -d 20msec -p 100 \ + -a own -b $rp1mac -A 2001:db8:1::1 -B 2001:db8:2::1 \ + -q -t udp sp=54321,dp=12345 +} + +send_packets_tx_ipv4() +{ + $MZ $h2.200 -c 21 -d 20msec -p 100 \ + -a own -b $rp2mac -A 192.0.2.18 -B 192.0.2.1 \ + -q -t udp sp=54321,dp=12345 +} + +send_packets_tx_ipv6() +{ + $MZ $h2.200 -6 -c 21 -d 20msec -p 100 \ + -a own -b $rp2mac -A 2001:db8:2::1 -B 2001:db8:1::1 \ + -q -t udp sp=54321,dp=12345 +} + +___test_stats() +{ + local dir=$1; shift + local prot=$1; shift + + local a + local b + + a=$(hw_stats_get l3_stats $rp1.200 ${dir} packets) + send_packets_${dir}_${prot} + "$@" + b=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= $a + 20" \ + hw_stats_get l3_stats $rp1.200 ${dir} packets) + check_err $? "Traffic not reflected in the counter: $a -> $b" +} + +__test_stats() +{ + local dir=$1; shift + local prot=$1; shift + + RET=0 + ___test_stats "$dir" "$prot" + log_test "Test $dir packets: $prot" +} + +test_stats_rx_ipv4() +{ + __test_stats rx ipv4 +} + +test_stats_tx_ipv4() +{ + __test_stats tx ipv4 +} + +test_stats_rx_ipv6() +{ + __test_stats rx ipv6 +} + +test_stats_tx_ipv6() +{ + __test_stats tx ipv6 +} + +# Make sure everything works well even after stats have been disabled and +# reenabled on the same device without touching the L3 configuration. +respin_enablement() +{ + log_info "Turning stats off and on again" + ip stats set dev $rp1.200 l3_stats off + ip stats set dev $rp1.200 l3_stats on +} + +# For the initial run, l3_stats is enabled on a completely set up netdevice. Now +# do it the other way around: enabling the L3 stats on an L2 netdevice, and only +# then apply the L3 configuration. 
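# The counter checks above rely on two helpers, described here only as a
# rough sketch of their semantics: busywait (from net/lib.sh) re-runs its
# command line until it exits zero or the given timeout in milliseconds
# elapses, and until_counter_is succeeds once the freshly fetched counter
# satisfies the arithmetic expression it is given. For example:
#
#   a=$(hw_stats_get l3_stats $rp1.200 rx packets)
#   send_packets_rx_ipv4
#   b=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= $a + 20" \
#           hw_stats_get l3_stats $rp1.200 rx packets)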
+reapply_config() +{ + log_info "Reapplying configuration" + + router_rp1_200_destroy + + ip link add name $rp1.200 link $rp1 type vlan id 200 + ip link set dev $rp1.200 addrgenmode none + ip stats set dev $rp1.200 l3_stats on + ip link set dev $rp1.200 addrgenmode eui64 + ip link set dev $rp1.200 up + ip address add dev $rp1.200 192.0.2.2/28 + ip address add dev $rp1.200 2001:db8:1::2/64 +} + +__test_stats_report() +{ + local dir=$1; shift + local prot=$1; shift + + local a + local b + + RET=0 + + a=$(hw_stats_get l3_stats $rp1.200 ${dir} packets) + send_packets_${dir}_${prot} + ip address flush dev $rp1.200 + b=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= $a + 20" \ + hw_stats_get l3_stats $rp1.200 ${dir} packets) + check_err $? "Traffic not reflected in the counter: $a -> $b" + log_test "Test ${dir} packets: stats pushed on loss of L3" + + ip stats set dev $rp1.200 l3_stats off + ip link del dev $rp1.200 + router_rp1_200_create +} + +test_stats_report_rx() +{ + __test_stats_report rx ipv4 +} + +test_stats_report_tx() +{ + __test_stats_report tx ipv4 +} + +test_destroy_enabled() +{ + RET=0 + + ip link del dev $rp1.200 + router_rp1_200_create + + log_test "Destroy l3_stats-enabled netdev" +} + +test_double_enable() +{ + RET=0 + ___test_stats rx ipv4 \ + ip stats set dev $rp1.200 l3_stats on + log_test "Test stat retention across a spurious enablement" +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +used=$(ip -j stats show dev $rp1.200 group offload subgroup hw_stats_info | + jq '.[].info.l3_stats.used') +[[ $used = true ]] +check_err $? "hw_stats_info.used=$used" +log_test "l3_stats offloaded" +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh b/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh new file mode 100755 index 000000000..ab8d04855 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Test L3 stats on IP-in-IP GRE tunnel without key. + +# This test uses flat topology for IP tunneling tests. See ipip_lib.sh for more +# details. + +ALL_TESTS=" + ping_ipv4 + test_stats_rx + test_stats_tx +" +NUM_NETIFS=6 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh +source "$lib_dir"/../../../net/forwarding/ipip_lib.sh + +setup_prepare() +{ + h1=${NETIFS[p1]} + ol1=${NETIFS[p2]} + + ul1=${NETIFS[p3]} + ul2=${NETIFS[p4]} + + ol2=${NETIFS[p5]} + h2=${NETIFS[p6]} + + ol1mac=$(mac_get $ol1) + + forwarding_enable + vrf_prepare + h1_create + h2_create + sw1_flat_create gre $ol1 $ul1 + sw2_flat_create gre $ol2 $ul2 + ip stats set dev g1a l3_stats on + ip stats set dev g2a l3_stats on +} + +cleanup() +{ + pre_cleanup + + ip stats set dev g1a l3_stats off + ip stats set dev g2a l3_stats off + + sw2_flat_destroy $ol2 $ul2 + sw1_flat_destroy $ol1 $ul1 + h2_destroy + h1_destroy + + vrf_cleanup + forwarding_restore +} + +ping_ipv4() +{ + RET=0 + + ping_test $h1 192.0.2.18 " gre flat" +} + +send_packets_ipv4() +{ + # Send 21 packets instead of 20, because the first one might trap and go + # through the SW datapath, which might not bump the HW counter. 
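	# For reference, the mausezahn flags below (per the tool's usual
	# option meanings): -c packet count, -d inter-packet delay, -p padded
	# frame length, -a/-b source/destination MAC ("own" resolves to the
	# sending netdev's address), -A/-B source/destination IP, -q quiet,
	# and -t the payload type plus its argument string (here the UDP
	# source and destination ports).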
+ $MZ $h1 -c 21 -d 20msec -p 100 \ + -a own -b $ol1mac -A 192.0.2.1 -B 192.0.2.18 \ + -q -t udp sp=54321,dp=12345 +} + +test_stats() +{ + local dev=$1; shift + local dir=$1; shift + + local a + local b + + RET=0 + + a=$(hw_stats_get l3_stats $dev $dir packets) + send_packets_ipv4 + b=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= $a + 20" \ + hw_stats_get l3_stats $dev $dir packets) + check_err $? "Traffic not reflected in the counter: $a -> $b" + + log_test "Test $dir packets: $prot" +} + +test_stats_tx() +{ + test_stats g1a tx +} + +test_stats_rx() +{ + test_stats g2a rx +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/loopback.sh b/tools/testing/selftests/drivers/net/hw/loopback.sh new file mode 100755 index 000000000..5acc3ff82 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/loopback.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +ALL_TESTS="loopback_test" +NUM_NETIFS=2 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/tc_common.sh +source "$lib_dir"/../../../net/forwarding/lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/24 + tc qdisc add dev $h1 clsact +} + +h1_destroy() +{ + tc qdisc del dev $h1 clsact + simple_if_fini $h1 192.0.2.1/24 +} + +h2_create() +{ + simple_if_init $h2 +} + +h2_destroy() +{ + simple_if_fini $h2 +} + +loopback_test() +{ + RET=0 + + tc filter add dev $h1 ingress protocol arp pref 1 handle 101 flower \ + skip_hw arp_op reply arp_tip 192.0.2.1 action drop + + $MZ $h1 -c 1 -t arp -q + + tc_check_packets "dev $h1 ingress" 101 1 + check_fail $? "Matched on a filter without loopback setup" + + ethtool -K $h1 loopback on + check_err $? "Failed to enable loopback" + + setup_wait_dev $h1 + + $MZ $h1 -c 1 -t arp -q + + tc_check_packets "dev $h1 ingress" 101 1 + check_err $? "Did not match on filter with loopback" + + ethtool -K $h1 loopback off + check_err $? "Failed to disable loopback" + + $MZ $h1 -c 1 -t arp -q + + tc_check_packets "dev $h1 ingress" 101 2 + check_fail $? "Matched on a filter after loopback was removed" + + tc filter del dev $h1 ingress protocol arp pref 1 handle 101 flower + + log_test "loopback" +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + h2=${NETIFS[p2]} + + vrf_prepare + + h1_create + h2_create + + if ethtool -k $h1 | grep loopback | grep -q fixed; then + log_test "SKIP: dev $h1 does not support loopback feature" + exit $ksft_skip + fi +} + +cleanup() +{ + pre_cleanup + + h2_destroy + h1_destroy + + vrf_cleanup +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/settings b/tools/testing/selftests/drivers/net/hw/settings new file mode 100644 index 000000000..e7b941753 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/settings @@ -0,0 +1 @@ +timeout=0 diff --git a/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh b/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh index 6369927e9..48395cfd4 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh @@ -42,7 +42,7 @@ __mlxsw_only_on_spectrum() local src=$1; shift if ! 
mlxsw_on_spectrum "$rev"; then - log_test_skip $src:$caller "(Spectrum-$rev only)" + log_test_xfail $src:$caller "(Spectrum-$rev only)" return 1 fi } diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh index a88d8a8c8..899b68926 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh @@ -47,7 +47,6 @@ for current_test in ${TESTS:-$ALL_TESTS}; do RET=0 target=$(${current_test}_get_target "$should_fail") if ((target == 0)); then - log_test_skip "'$current_test' should_fail=$should_fail test" continue fi diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh index f981c957f..482ebb744 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh @@ -52,7 +52,6 @@ for current_test in ${TESTS:-$ALL_TESTS}; do RET=0 target=$(${current_test}_get_target "$should_fail") if ((target == 0)); then - log_test_skip "'$current_test' [$profile] should_fail=$should_fail test" continue fi ${current_test}_setup_prepare diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 7b6918d5f..cb418a234 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -20,7 +20,6 @@ TEST_PROGS += reuseaddr_ports_exhausted.sh TEST_PROGS += txtimestamp.sh TEST_PROGS += vrf-xfrm-tests.sh TEST_PROGS += rxtimestamp.sh -TEST_PROGS += devlink_port_split.py TEST_PROGS += drop_monitor_tests.sh TEST_PROGS += vrf_route_leaking.sh TEST_PROGS += bareudp.sh diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index 535865b3d..fa7b59ff4 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -15,18 +15,12 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \ bridge_vlan_unaware.sh \ custom_multipath_hash.sh \ dual_vxlan_bridge.sh \ - ethtool_extended_state.sh \ - ethtool_mm.sh \ - ethtool_rmon.sh \ - ethtool.sh \ gre_custom_multipath_hash.sh \ gre_inner_v4_multipath.sh \ gre_inner_v6_multipath.sh \ gre_multipath_nh_res.sh \ gre_multipath_nh.sh \ gre_multipath.sh \ - hw_stats_l3.sh \ - hw_stats_l3_gre.sh \ ip6_forward_instats_vrf.sh \ ip6gre_custom_multipath_hash.sh \ ip6gre_flat_key.sh \ @@ -43,8 +37,8 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \ ipip_hier_gre_key.sh \ ipip_hier_gre_keys.sh \ ipip_hier_gre.sh \ + lib_sh_test.sh \ local_termination.sh \ - loopback.sh \ mirror_gre_bound.sh \ mirror_gre_bridge_1d.sh \ mirror_gre_bridge_1d_vlan.sh \ @@ -113,7 +107,6 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \ vxlan_symmetric.sh TEST_FILES := devlink_lib.sh \ - ethtool_lib.sh \ fib_offload_lib.sh \ forwarding.config.sample \ ip6gre_lib.sh \ diff --git a/tools/testing/selftests/net/forwarding/README b/tools/testing/selftests/net/forwarding/README index b8a2af8fc..7fdb6a9ca 100644 --- a/tools/testing/selftests/net/forwarding/README +++ b/tools/testing/selftests/net/forwarding/README @@ -56,3 +56,36 @@ o Checks shall be added to lib.sh for any external dependencies. o Code shall be checked using ShellCheck [1] prior to submission. 1. 
https://www.shellcheck.net/ + +Customization +============= + +The forwarding selftests framework uses a number of variables that +influence its behavior and tools it invokes, and how it invokes them, in +various ways. A number of these variables can be overridden. The way these +overridable variables are specified is typically one of the following two +syntaxes: + + : "${VARIABLE:=default_value}" + VARIABLE=${VARIABLE:=default_value} + +Any of these variables can be overridden. Notably net/forwarding/lib.sh and +net/lib.sh contain a number of overridable variables. + +One way of overriding these variables is through the environment: + + PAUSE_ON_FAIL=yes ./some_test.sh + +The variable NETIFS is special. Since it is an array variable, there is no +way to pass it through the environment. Its value can instead be given as +consecutive arguments to the selftest: + + ./some_test.sh swp{1..8} + +A way to customize variables in a persistent fashion is to create a file +named forwarding.config in this directory. lib.sh sources the file if +present, so it can contain any shell code. Typically it will contain +assignments of variables whose value should be overridden. + +forwarding.config.sample is available in the directory as an example of +how forwarding.config might look. diff --git a/tools/testing/selftests/net/forwarding/forwarding.config.sample b/tools/testing/selftests/net/forwarding/forwarding.config.sample index 1fc4f0242..f1ca95e79 100644 --- a/tools/testing/selftests/net/forwarding/forwarding.config.sample +++ b/tools/testing/selftests/net/forwarding/forwarding.config.sample @@ -3,51 +3,28 @@ ############################################################################## # Topology description. p1 looped back to p2, p3 to p4 and so on. -declare -A NETIFS -NETIFS[p1]=veth0 -NETIFS[p2]=veth1 -NETIFS[p3]=veth2 -NETIFS[p4]=veth3 -NETIFS[p5]=veth4 -NETIFS[p6]=veth5 -NETIFS[p7]=veth6 -NETIFS[p8]=veth7 -NETIFS[p9]=veth8 -NETIFS[p10]=veth9 +NETIFS=( + [p1]=veth0 + [p2]=veth1 + [p3]=veth2 + [p4]=veth3 + [p5]=veth4 + [p6]=veth5 + [p7]=veth6 + [p8]=veth7 + [p9]=veth8 + [p10]=veth9 +) # Port that does not have a cable connected. NETIF_NO_CABLE=eth8 ############################################################################## -# Defines +# In addition to the topology-related variables, it is also possible to override +# in this file other variables that net/lib.sh, net/forwarding/lib.sh or other +# libraries or selftests use. E.g.: -# IPv4 ping utility name -PING=ping -# IPv6 ping utility name. Some distributions use 'ping' for IPv6. PING6=ping6 -# Packet generator. Some distributions use 'mz'. MZ=mausezahn -# mausezahn delay between transmissions in microseconds. -MZ_DELAY=0 -# Time to wait after interfaces participating in the test are all UP WAIT_TIME=5 -# Whether to pause on failure or not. -PAUSE_ON_FAIL=no -# Whether to pause on cleanup or not. -PAUSE_ON_CLEANUP=no -# Type of network interface to create -NETIF_TYPE=veth -# Whether to create virtual interfaces (veth) or not -NETIF_CREATE=yes -# Timeout (in seconds) before ping exits regardless of how many packets have -# been sent or received -PING_TIMEOUT=5 -# Minimum ageing_time (in centiseconds) supported by hardware -LOW_AGEING_TIME=1000 -# Flag for tc match, supposed to be skip_sw/skip_hw which means do not process -# filter by software/hardware -TC_FLAG=skip_hw -# IPv6 traceroute utility name. 
-TROUTE6=traceroute6 - diff --git a/tools/testing/selftests/net/forwarding/ipip_lib.sh b/tools/testing/selftests/net/forwarding/ipip_lib.sh index 30f36a57b..01e62c4ac 100644 --- a/tools/testing/selftests/net/forwarding/ipip_lib.sh +++ b/tools/testing/selftests/net/forwarding/ipip_lib.sh @@ -141,7 +141,6 @@ # | $h2 + | # | 192.0.2.18/28 | # +---------------------------+ -source lib.sh h1_create() { diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index e579c2e0c..4103ed7af 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -1,34 +1,88 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +############################################################################## +# Topology description. p1 looped back to p2, p3 to p4 and so on. + +declare -A NETIFS=( + [p1]=veth0 + [p2]=veth1 + [p3]=veth2 + [p4]=veth3 + [p5]=veth4 + [p6]=veth5 + [p7]=veth6 + [p8]=veth7 + [p9]=veth8 + [p10]=veth9 +) + +# Port that does not have a cable connected. +: "${NETIF_NO_CABLE:=eth8}" + ############################################################################## # Defines -# Can be overridden by the configuration file. -PING=${PING:=ping} -PING6=${PING6:=ping6} -MZ=${MZ:=mausezahn} -MZ_DELAY=${MZ_DELAY:=0} -ARPING=${ARPING:=arping} -TEAMD=${TEAMD:=teamd} -WAIT_TIME=${WAIT_TIME:=5} -PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} -PAUSE_ON_CLEANUP=${PAUSE_ON_CLEANUP:=no} -NETIF_TYPE=${NETIF_TYPE:=veth} -NETIF_CREATE=${NETIF_CREATE:=yes} -MCD=${MCD:=smcrouted} -MC_CLI=${MC_CLI:=smcroutectl} -PING_COUNT=${PING_COUNT:=10} -PING_TIMEOUT=${PING_TIMEOUT:=5} -WAIT_TIMEOUT=${WAIT_TIMEOUT:=20} -INTERFACE_TIMEOUT=${INTERFACE_TIMEOUT:=600} -LOW_AGEING_TIME=${LOW_AGEING_TIME:=1000} -REQUIRE_JQ=${REQUIRE_JQ:=yes} -REQUIRE_MZ=${REQUIRE_MZ:=yes} -REQUIRE_MTOOLS=${REQUIRE_MTOOLS:=no} -STABLE_MAC_ADDRS=${STABLE_MAC_ADDRS:=no} -TCPDUMP_EXTRA_FLAGS=${TCPDUMP_EXTRA_FLAGS:=} -TROUTE6=${TROUTE6:=traceroute6} +# Networking utilities. +: "${PING:=ping}" +: "${PING6:=ping6}" # Some distros just use ping. +: "${ARPING:=arping}" +: "${TROUTE6:=traceroute6}" + +# Packet generator. +: "${MZ:=mausezahn}" # Some distributions use 'mz'. +: "${MZ_DELAY:=0}" + +# Host configuration tools. +: "${TEAMD:=teamd}" +: "${MCD:=smcrouted}" +: "${MC_CLI:=smcroutectl}" + +# Constants for netdevice bring-up: +# Default time in seconds to wait for an interface to come up before giving up +# and bailing out. Used during initial setup. +: "${INTERFACE_TIMEOUT:=600}" +# Like INTERFACE_TIMEOUT, but default for ad-hoc waiting in testing scripts. +: "${WAIT_TIMEOUT:=20}" +# Time to wait after interfaces participating in the test are all UP. +: "${WAIT_TIME:=5}" + +# Whether to pause on, respectively, after a failure and before cleanup. +: "${PAUSE_ON_FAIL:=no}" +: "${PAUSE_ON_CLEANUP:=no}" + +# Whether to create virtual interfaces, and what netdevice type they should be. +: "${NETIF_CREATE:=yes}" +: "${NETIF_TYPE:=veth}" + +# Constants for ping tests: +# How many packets should be sent. +: "${PING_COUNT:=10}" +# Timeout (in seconds) before ping exits regardless of how many packets have +# been sent or received +: "${PING_TIMEOUT:=5}" + +# Minimum ageing_time (in centiseconds) supported by hardware +: "${LOW_AGEING_TIME:=1000}" + +# Whether to check for availability of certain tools. +: "${REQUIRE_JQ:=yes}" +: "${REQUIRE_MZ:=yes}" +: "${REQUIRE_MTOOLS:=no}" + +# Whether to override MAC addresses on interfaces participating in the test. 
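# All of the ':=' defaults in this file use the shell's default-assignment
# expansion: ': "${VAR:=value}"' assigns value only when VAR is unset or
# empty, and the leading ':' discards the result of the expansion, so each
# of these knobs can be overridden from the environment, e.g.:
#
#   PAUSE_ON_FAIL=yes MZ=mz ./some_test.sh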
+: "${STABLE_MAC_ADDRS:=no}" + +# Flags for tcpdump +: "${TCPDUMP_EXTRA_FLAGS:=}" + +# Flags for TC filters. +: "${TC_FLAG:=skip_hw}" + +# Whether the machine is "slow" -- i.e. might be incapable of running tests +# involving heavy traffic. This might be the case on a debug kernel, a VM, or +# e.g. a low-power board. +: "${KSFT_MACHINE_SLOW:=no}" net_forwarding_dir=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") @@ -205,17 +259,6 @@ check_port_mab_support() fi } -skip_on_veth() -{ - local kind=$(ip -j -d link show dev ${NETIFS[p1]} | - jq -r '.[].linkinfo.info_kind') - - if [[ $kind == veth ]]; then - echo "SKIP: Test cannot be run with veth pairs" - exit $ksft_skip - fi -} - if [[ "$(id -u)" -ne 0 ]]; then echo "SKIP: need root privileges" exit $ksft_skip @@ -358,14 +401,31 @@ EXIT_STATUS=0 # Per-test return value. Clear at the beginning of each test. RET=0 +ret_set_ksft_status() +{ + local ksft_status=$1; shift + local msg=$1; shift + + RET=$(ksft_status_merge $RET $ksft_status) + if (( $? )); then + retmsg=$msg + fi +} + +# Whether FAILs should be interpreted as XFAILs. Internal. +FAIL_TO_XFAIL= + check_err() { local err=$1 local msg=$2 - if [[ $RET -eq 0 && $err -ne 0 ]]; then - RET=$err - retmsg=$msg + if ((err)); then + if [[ $FAIL_TO_XFAIL = yes ]]; then + ret_set_ksft_status $ksft_xfail "$msg" + else + ret_set_ksft_status $ksft_fail "$msg" + fi fi } @@ -374,10 +434,7 @@ check_fail() local err=$1 local msg=$2 - if [[ $RET -eq 0 && $err -eq 0 ]]; then - RET=1 - retmsg=$msg - fi + check_err $((!err)) "$msg" } check_err_fail() @@ -393,6 +450,85 @@ check_err_fail() fi } +xfail_on_slow() +{ + if [[ $KSFT_MACHINE_SLOW = yes ]]; then + FAIL_TO_XFAIL=yes "$@" + else + "$@" + fi +} + +xfail_on_veth() +{ + local dev=$1; shift + local kind + + kind=$(ip -j -d link show dev $dev | + jq -r '.[].linkinfo.info_kind') + if [[ $kind = veth ]]; then + FAIL_TO_XFAIL=yes "$@" + else + "$@" + fi +} + +log_test_result() +{ + local test_name=$1; shift + local opt_str=$1; shift + local result=$1; shift + local retmsg=$1; shift + + printf "TEST: %-60s [%s]\n" "$test_name $opt_str" "$result" + if [[ $retmsg ]]; then + printf "\t%s\n" "$retmsg" + fi +} + +pause_on_fail() +{ + if [[ $PAUSE_ON_FAIL == yes ]]; then + echo "Hit enter to continue, 'q' to quit" + read a + [[ $a == q ]] && exit 1 + fi +} + +handle_test_result_pass() +{ + local test_name=$1; shift + local opt_str=$1; shift + + log_test_result "$test_name" "$opt_str" " OK " +} + +handle_test_result_fail() +{ + local test_name=$1; shift + local opt_str=$1; shift + + log_test_result "$test_name" "$opt_str" FAIL "$retmsg" + pause_on_fail +} + +handle_test_result_xfail() +{ + local test_name=$1; shift + local opt_str=$1; shift + + log_test_result "$test_name" "$opt_str" XFAIL "$retmsg" + pause_on_fail +} + +handle_test_result_skip() +{ + local test_name=$1; shift + local opt_str=$1; shift + + log_test_result "$test_name" "$opt_str" SKIP "$retmsg" +} + log_test() { local test_name=$1 @@ -402,31 +538,28 @@ log_test() opt_str="($opt_str)" fi - if [[ $RET -ne 0 ]]; then - EXIT_STATUS=1 - printf "TEST: %-60s [FAIL]\n" "$test_name $opt_str" - if [[ ! 
-z "$retmsg" ]]; then - printf "\t%s\n" "$retmsg" - fi - if [ "${PAUSE_ON_FAIL}" = "yes" ]; then - echo "Hit enter to continue, 'q' to quit" - read a - [ "$a" = "q" ] && exit 1 - fi - return 1 + if ((RET == ksft_pass)); then + handle_test_result_pass "$test_name" "$opt_str" + elif ((RET == ksft_xfail)); then + handle_test_result_xfail "$test_name" "$opt_str" + elif ((RET == ksft_skip)); then + handle_test_result_skip "$test_name" "$opt_str" + else + handle_test_result_fail "$test_name" "$opt_str" fi - printf "TEST: %-60s [ OK ]\n" "$test_name $opt_str" - return 0 + EXIT_STATUS=$(ksft_exit_status_merge $EXIT_STATUS $RET) + return $RET } log_test_skip() { - local test_name=$1 - local opt_str=$2 + RET=$ksft_skip retmsg= log_test "$@" +} - printf "TEST: %-60s [SKIP]\n" "$test_name $opt_str" - return 0 +log_test_xfail() +{ + RET=$ksft_xfail retmsg= log_test "$@" } log_info() diff --git a/tools/testing/selftests/net/forwarding/lib_sh_test.sh b/tools/testing/selftests/net/forwarding/lib_sh_test.sh new file mode 100755 index 000000000..ff2acccca --- /dev/null +++ b/tools/testing/selftests/net/forwarding/lib_sh_test.sh @@ -0,0 +1,208 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This tests the operation of lib.sh itself. + +ALL_TESTS=" + test_ret + test_exit_status +" +NUM_NETIFS=0 +source lib.sh + +# Simulated checks. + +do_test() +{ + local msg=$1; shift + + "$@" + check_err $? "$msg" +} + +tpass() +{ + do_test "tpass" true +} + +tfail() +{ + do_test "tfail" false +} + +txfail() +{ + FAIL_TO_XFAIL=yes do_test "txfail" false +} + +# Simulated tests. + +pass() +{ + RET=0 + do_test "true" true + log_test "true" +} + +fail() +{ + RET=0 + do_test "false" false + log_test "false" +} + +xfail() +{ + RET=0 + FAIL_TO_XFAIL=yes do_test "xfalse" false + log_test "xfalse" +} + +skip() +{ + RET=0 + log_test_skip "skip" +} + +slow_xfail() +{ + RET=0 + xfail_on_slow do_test "slow_false" false + log_test "slow_false" +} + +# lib.sh tests. + +ret_tests_run() +{ + local t + + RET=0 + retmsg= + for t in "$@"; do + $t + done + echo "$retmsg" + return $RET +} + +ret_subtest() +{ + local expect_ret=$1; shift + local expect_retmsg=$1; shift + local -a tests=( "$@" ) + + local status_names=(pass fail xfail xpass skip) + local ret + local out + + RET=0 + + # Run this in a subshell, so that our environment is intact. + out=$(ret_tests_run "${tests[@]}") + ret=$? + + (( ret == expect_ret )) + check_err $? "RET=$ret expected $expect_ret" + + [[ $out == $expect_retmsg ]] + check_err $? 
"retmsg=$out expected $expect_retmsg" + + log_test "RET $(echo ${tests[@]}) -> ${status_names[$ret]}" +} + +test_ret() +{ + ret_subtest $ksft_pass "" + + ret_subtest $ksft_pass "" tpass + ret_subtest $ksft_fail "tfail" tfail + ret_subtest $ksft_xfail "txfail" txfail + + ret_subtest $ksft_pass "" tpass tpass + ret_subtest $ksft_fail "tfail" tpass tfail + ret_subtest $ksft_xfail "txfail" tpass txfail + + ret_subtest $ksft_fail "tfail" tfail tpass + ret_subtest $ksft_xfail "txfail" txfail tpass + + ret_subtest $ksft_fail "tfail" tfail tfail + ret_subtest $ksft_fail "tfail" tfail txfail + + ret_subtest $ksft_fail "tfail" txfail tfail + + ret_subtest $ksft_xfail "txfail" txfail txfail +} + +exit_status_tests_run() +{ + EXIT_STATUS=0 + tests_run > /dev/null + return $EXIT_STATUS +} + +exit_status_subtest() +{ + local expect_exit_status=$1; shift + local tests=$1; shift + local what=$1; shift + + local status_names=(pass fail xfail xpass skip) + local exit_status + local out + + RET=0 + + # Run this in a subshell, so that our environment is intact. + out=$(TESTS="$tests" exit_status_tests_run) + exit_status=$? + + (( exit_status == expect_exit_status )) + check_err $? "EXIT_STATUS=$exit_status, expected $expect_exit_status" + + log_test "EXIT_STATUS $tests$what -> ${status_names[$exit_status]}" +} + +test_exit_status() +{ + exit_status_subtest $ksft_pass ":" + + exit_status_subtest $ksft_pass "pass" + exit_status_subtest $ksft_fail "fail" + exit_status_subtest $ksft_pass "xfail" + exit_status_subtest $ksft_skip "skip" + + exit_status_subtest $ksft_pass "pass pass" + exit_status_subtest $ksft_fail "pass fail" + exit_status_subtest $ksft_pass "pass xfail" + exit_status_subtest $ksft_skip "pass skip" + + exit_status_subtest $ksft_fail "fail pass" + exit_status_subtest $ksft_pass "xfail pass" + exit_status_subtest $ksft_skip "skip pass" + + exit_status_subtest $ksft_fail "fail fail" + exit_status_subtest $ksft_fail "fail xfail" + exit_status_subtest $ksft_fail "fail skip" + + exit_status_subtest $ksft_fail "xfail fail" + exit_status_subtest $ksft_fail "skip fail" + + exit_status_subtest $ksft_pass "xfail xfail" + exit_status_subtest $ksft_skip "xfail skip" + exit_status_subtest $ksft_skip "skip xfail" + + exit_status_subtest $ksft_skip "skip skip" + + KSFT_MACHINE_SLOW=yes \ + exit_status_subtest $ksft_pass "slow_xfail" ": slow" + + KSFT_MACHINE_SLOW=no \ + exit_status_subtest $ksft_fail "slow_xfail" ": fast" +} + +trap pre_cleanup EXIT + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh index 7e7d62161..2903294d8 100644 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh @@ -56,21 +56,12 @@ nh_stats_test_dispatch_swhw() local group_id=$1; shift local mz="$@" - local used - nh_stats_do_test "$what" "$nh1_id" "$nh2_id" "$group_id" \ nh_stats_get "${mz[@]}" - used=$(ip -s -j -d nexthop show id $group_id | - jq '.[].hw_stats.used') - kind=$(ip -j -d link show dev $rp11 | - jq -r '.[].linkinfo.info_kind') - if [[ $used == true ]]; then + xfail_on_veth $rp11 \ nh_stats_do_test "HW $what" "$nh1_id" "$nh2_id" "$group_id" \ nh_stats_get_hw "${mz[@]}" - elif [[ $kind == veth ]]; then - log_test_skip "HW stats not offloaded on veth topology" - fi } nh_stats_test_dispatch() @@ -83,7 +74,6 @@ nh_stats_test_dispatch() local mz="$@" local enabled - local kind if ! 
ip nexthop help 2>&1 | grep -q hw_stats; then log_test_skip "NH stats test: ip doesn't support HW stats" diff --git a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh index cdf689e99..f9d26a791 100644 --- a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh +++ b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh @@ -199,25 +199,28 @@ ets_set_dwrr_two_bands() ets_test_strict() { ets_set_strict - ets_dwrr_test_01 - ets_dwrr_test_12 + xfail_on_slow ets_dwrr_test_01 + xfail_on_slow ets_dwrr_test_12 } ets_test_mixed() { ets_set_mixed - ets_dwrr_test_01 - ets_dwrr_test_12 + xfail_on_slow ets_dwrr_test_01 + xfail_on_slow ets_dwrr_test_12 } ets_test_dwrr() { ets_set_dwrr_uniform - ets_dwrr_test_012 + xfail_on_slow ets_dwrr_test_012 + ets_set_dwrr_varying - ets_dwrr_test_012 + xfail_on_slow ets_dwrr_test_012 + ets_change_quantum - ets_dwrr_test_012 + xfail_on_slow ets_dwrr_test_012 + ets_set_dwrr_two_bands - ets_dwrr_test_01 + xfail_on_slow ets_dwrr_test_01 } diff --git a/tools/testing/selftests/net/forwarding/sch_red.sh b/tools/testing/selftests/net/forwarding/sch_red.sh index 81f31179a..17f286445 100755 --- a/tools/testing/selftests/net/forwarding/sch_red.sh +++ b/tools/testing/selftests/net/forwarding/sch_red.sh @@ -451,35 +451,35 @@ uninstall_qdisc() ecn_test() { install_qdisc ecn - do_ecn_test $BACKLOG + xfail_on_slow do_ecn_test $BACKLOG uninstall_qdisc } ecn_nodrop_test() { install_qdisc ecn nodrop - do_ecn_nodrop_test $BACKLOG + xfail_on_slow do_ecn_nodrop_test $BACKLOG uninstall_qdisc } red_test() { install_qdisc - do_red_test $BACKLOG + xfail_on_slow do_red_test $BACKLOG uninstall_qdisc } red_qevent_test() { install_qdisc qevent early_drop block 10 - do_red_qevent_test $BACKLOG + xfail_on_slow do_red_qevent_test $BACKLOG uninstall_qdisc } ecn_qevent_test() { install_qdisc ecn qevent mark block 10 - do_ecn_qevent_test $BACKLOG + xfail_on_slow do_ecn_qevent_test $BACKLOG uninstall_qdisc } diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_core.sh b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh index d1f26cb7c..9cd884d4a 100644 --- a/tools/testing/selftests/net/forwarding/sch_tbf_core.sh +++ b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh @@ -227,7 +227,7 @@ do_tbf_test() local nr=$(rate $t2 $t3 10) local nr_pct=$((100 * (nr - er) / er)) ((-5 <= nr_pct && nr_pct <= 5)) - check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-5%." + xfail_on_slow check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-5%." log_test "TC $((vlan - 10)): TBF rate ${mbit}Mbit" } diff --git a/tools/testing/selftests/net/forwarding/tc_common.sh b/tools/testing/selftests/net/forwarding/tc_common.sh index bce8bb8d2..2e3326edf 100644 --- a/tools/testing/selftests/net/forwarding/tc_common.sh +++ b/tools/testing/selftests/net/forwarding/tc_common.sh @@ -4,7 +4,7 @@ CHECK_TC="yes" # Can be overridden by the configuration file. 
See lib.sh -TC_HIT_TIMEOUT=${TC_HIT_TIMEOUT:=1000} # ms +: "${TC_HIT_TIMEOUT:=1000}" # ms tc_check_packets() { diff --git a/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh b/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh index 5a5dd9034..79775b10b 100755 --- a/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh +++ b/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh @@ -1,7 +1,5 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 ALL_TESTS="tunnel_key_nofrag_test" diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index f9fe182df..b7f7b8695 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -4,16 +4,60 @@ ############################################################################## # Defines -WAIT_TIMEOUT=${WAIT_TIMEOUT:=20} +: "${WAIT_TIMEOUT:=20}" + BUSYWAIT_TIMEOUT=$((WAIT_TIMEOUT * 1000)) # ms -# Kselftest framework requirement - SKIP code is 4. +# Kselftest framework constants. +ksft_pass=0 +ksft_fail=1 +ksft_xfail=2 ksft_skip=4 + # namespace list created by setup_ns NS_LIST="" ############################################################################## # Helpers + +__ksft_status_merge() +{ + local a=$1; shift + local b=$1; shift + local -A weights + local weight=0 + + for i in "$@"; do + weights[$i]=$((weight++)) + done + + if [[ ${weights[$a]} > ${weights[$b]} ]]; then + echo "$a" + return 0 + else + echo "$b" + return 1 + fi +} + +ksft_status_merge() +{ + local a=$1; shift + local b=$1; shift + + __ksft_status_merge "$a" "$b" \ + $ksft_pass $ksft_xfail $ksft_skip $ksft_fail +} + +ksft_exit_status_merge() +{ + local a=$1; shift + local b=$1; shift + + __ksft_status_merge "$a" "$b" \ + $ksft_xfail $ksft_pass $ksft_skip $ksft_fail +} + busywait() { local timeout=$1; shift diff --git a/tools/testing/selftests/sched_ext/.gitignore b/tools/testing/selftests/sched_ext/.gitignore new file mode 100644 index 000000000..ae5491a11 --- /dev/null +++ b/tools/testing/selftests/sched_ext/.gitignore @@ -0,0 +1,6 @@ +* +!*.c +!*.h +!Makefile +!.gitignore +!config diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile new file mode 100644 index 000000000..a95fa2a1a --- /dev/null +++ b/tools/testing/selftests/sched_ext/Makefile @@ -0,0 +1,216 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. +include ../../../build/Build.include +include ../../../scripts/Makefile.arch +include ../../../scripts/Makefile.include +include ../lib.mk + +ifneq ($(LLVM),) +ifneq ($(filter %/,$(LLVM)),) +LLVM_PREFIX := $(LLVM) +else ifneq ($(filter -%,$(LLVM)),) +LLVM_SUFFIX := $(LLVM) +endif + +CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as +else +CC := gcc +endif # LLVM + +ifneq ($(CROSS_COMPILE),) +$(error CROSS_COMPILE not supported for scx selftests) +endif # CROSS_COMPILE + +CURDIR := $(abspath .) +REPOROOT := $(abspath ../../../..) 
+TOOLSDIR := $(REPOROOT)/tools +LIBDIR := $(TOOLSDIR)/lib +BPFDIR := $(LIBDIR)/bpf +TOOLSINCDIR := $(TOOLSDIR)/include +BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool +APIDIR := $(TOOLSINCDIR)/uapi +GENDIR := $(REPOROOT)/include/generated +GENHDR := $(GENDIR)/autoconf.h +SCXTOOLSDIR := $(TOOLSDIR)/sched_ext +SCXTOOLSINCDIR := $(TOOLSDIR)/sched_ext/include + +OUTPUT_DIR := $(CURDIR)/build +OBJ_DIR := $(OUTPUT_DIR)/obj +INCLUDE_DIR := $(OUTPUT_DIR)/include +BPFOBJ_DIR := $(OBJ_DIR)/libbpf +SCXOBJ_DIR := $(OBJ_DIR)/sched_ext +BPFOBJ := $(BPFOBJ_DIR)/libbpf.a +LIBBPF_OUTPUT := $(OBJ_DIR)/libbpf/libbpf.a +DEFAULT_BPFTOOL := $(OUTPUT_DIR)/sbin/bpftool +HOST_BUILD_DIR := $(OBJ_DIR) +HOST_OUTPUT_DIR := $(OUTPUT_DIR) + +VMLINUX_BTF_PATHS ?= ../../../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) +ifeq ($(VMLINUX_BTF),) +$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") +endif + +BPFTOOL ?= $(DEFAULT_BPFTOOL) + +ifneq ($(wildcard $(GENHDR)),) + GENFLAGS := -DHAVE_GENHDR +endif + +CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ + -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ + -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include -I$(SCXTOOLSINCDIR) + +# Silence some warnings when compiled with clang +ifneq ($(LLVM),) +CFLAGS += -Wno-unused-command-line-argument +endif + +LDFLAGS = -lelf -lz -lpthread -lzstd + +IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - &1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) -dM -E - $@ +else + $(call msg,CP,,$@) + $(Q)cp "$(VMLINUX_H)" $@ +endif + +$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h | $(BPFOBJ) $(SCXOBJ_DIR) + $(call msg,CLNG-BPF,,$(notdir $@)) + $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ + +$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) | $(INCLUDE_DIR) + $(eval sched=$(notdir $@)) + $(call msg,GEN-SKEL,,$(sched)) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< + $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) + $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) + $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ + $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) + +################ +# C schedulers # +################ + +override define CLEAN + rm -rf $(OUTPUT_DIR) + rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h + rm -f $(TEST_GEN_PROGS) + rm -f runner +endef + +# Every testcase takes all of the BPF progs are dependencies by default. This +# allows testcases to load any BPF scheduler, which is useful for testcases +# that don't need their own prog to run their test. 
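# For illustration: with a source file named minimal.bpf.c, the expansion
# below yields $(INCLUDE_DIR)/minimal.bpf.skel.h -- $(wildcard *.bpf.c)
# collects the BPF sources and $(patsubst %.c,%.skel.h,...) maps each one
# to its generated skeleton header, so every testcase object depends on
# every skeleton.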
+all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubst %.c,%.skel.h,$(prog))) + +auto-test-targets := \ + enq_last_no_enq_fails \ + enq_select_cpu_fails \ + ddsp_bogus_dsq_fail \ + ddsp_vtimelocal_fail \ + exit \ + hotplug \ + init_enable_count \ + maximal \ + maybe_null \ + minimal \ + prog_run \ + reload_loop \ + select_cpu_dfl \ + select_cpu_dfl_nodispatch \ + select_cpu_dispatch \ + select_cpu_dispatch_bad_dsq \ + select_cpu_dispatch_dbl_dsp \ + select_cpu_vtime \ + test_example \ + +testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets))) + +$(SCXOBJ_DIR)/runner.o: runner.c | $(SCXOBJ_DIR) + $(CC) $(CFLAGS) -c $< -o $@ + +# Create all of the test targets object files, whose testcase objects will be +# registered into the runner in ELF constructors. +# +# Note that we must do double expansion here in order to support conditionally +# compiling BPF object files only if one is present, as the wildcard Make +# function doesn't support using implicit rules otherwise. +$(testcase-targets): $(SCXOBJ_DIR)/%.o: %.c $(SCXOBJ_DIR)/runner.o $(all_test_bpfprogs) | $(SCXOBJ_DIR) + $(eval test=$(patsubst %.o,%.c,$(notdir $@))) + $(CC) $(CFLAGS) -c $< -o $@ $(SCXOBJ_DIR)/runner.o + +$(SCXOBJ_DIR)/util.o: util.c | $(SCXOBJ_DIR) + $(CC) $(CFLAGS) -c $< -o $@ + +runner: $(SCXOBJ_DIR)/runner.o $(SCXOBJ_DIR)/util.o $(BPFOBJ) $(testcase-targets) + @echo "$(testcase-targets)" + $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) + +TEST_GEN_PROGS := runner + +all: runner + +.PHONY: all clean help + +.DEFAULT_GOAL := all + +.DELETE_ON_ERROR: + +.SECONDARY: diff --git a/tools/testing/selftests/sched_ext/config b/tools/testing/selftests/sched_ext/config new file mode 100644 index 000000000..0de9b4ee2 --- /dev/null +++ b/tools/testing/selftests/sched_ext/config @@ -0,0 +1,9 @@ +CONFIG_SCHED_DEBUG=y +CONFIG_SCHED_CLASS_EXT=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_SCHED=y +CONFIG_EXT_GROUP_SCHED=y +CONFIG_BPF=y +CONFIG_BPF_SYSCALL=y +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_BTF=y diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c new file mode 100644 index 000000000..e97ad41d3 --- /dev/null +++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + + if (cpu >= 0) { + /* + * If we dispatch to a bogus DSQ that will fall back to the + * builtin global DSQ, we fail gracefully. 
+ */ + scx_bpf_dispatch_vtime(p, 0xcafef00d, SCX_SLICE_DFL, + p->scx.dsq_vtime, 0); + return cpu; + } + + return prev_cpu; +} + +void BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops ddsp_bogus_dsq_fail_ops = { + .select_cpu = ddsp_bogus_dsq_fail_select_cpu, + .exit = ddsp_bogus_dsq_fail_exit, + .name = "ddsp_bogus_dsq_fail", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c new file mode 100644 index 000000000..e65d22f23 --- /dev/null +++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include +#include "ddsp_bogus_dsq_fail.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct ddsp_bogus_dsq_fail *skel; + + skel = ddsp_bogus_dsq_fail__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct ddsp_bogus_dsq_fail *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.ddsp_bogus_dsq_fail_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + sleep(1); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct ddsp_bogus_dsq_fail *skel = ctx; + + ddsp_bogus_dsq_fail__destroy(skel); +} + +struct scx_test ddsp_bogus_dsq_fail = { + .name = "ddsp_bogus_dsq_fail", + .description = "Verify we gracefully fail, and fall back to using a " + "built-in DSQ, if we do a direct dispatch to an invalid" + " DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&ddsp_bogus_dsq_fail) diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c new file mode 100644 index 000000000..dde7e7daf --- /dev/null +++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + + if (cpu >= 0) { + /* Shouldn't be allowed to vtime dispatch to a builtin DSQ. 
*/ + scx_bpf_dispatch_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, + p->scx.dsq_vtime, 0); + return cpu; + } + + return prev_cpu; +} + +void BPF_STRUCT_OPS(ddsp_vtimelocal_fail_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops ddsp_vtimelocal_fail_ops = { + .select_cpu = ddsp_vtimelocal_fail_select_cpu, + .exit = ddsp_vtimelocal_fail_exit, + .name = "ddsp_vtimelocal_fail", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c new file mode 100644 index 000000000..abafee587 --- /dev/null +++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include "ddsp_vtimelocal_fail.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct ddsp_vtimelocal_fail *skel; + + skel = ddsp_vtimelocal_fail__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct ddsp_vtimelocal_fail *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.ddsp_vtimelocal_fail_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + sleep(1); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct ddsp_vtimelocal_fail *skel = ctx; + + ddsp_vtimelocal_fail__destroy(skel); +} + +struct scx_test ddsp_vtimelocal_fail = { + .name = "ddsp_vtimelocal_fail", + .description = "Verify we gracefully fail, and fall back to using a " + "built-in DSQ, if we do a direct vtime dispatch to a " + "built-in DSQ from DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&ddsp_vtimelocal_fail) diff --git a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c new file mode 100644 index 000000000..b0b99531d --- /dev/null +++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +SEC(".struct_ops.link") +struct sched_ext_ops enq_last_no_enq_fails_ops = { + .name = "enq_last_no_enq_fails", + /* Need to define ops.enqueue() with SCX_OPS_ENQ_LAST */ + .flags = SCX_OPS_ENQ_LAST, + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c new file mode 100644 index 000000000..2a3eda5e2 --- /dev/null +++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "enq_last_no_enq_fails.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct enq_last_no_enq_fails *skel; + + skel = enq_last_no_enq_fails__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct enq_last_no_enq_fails *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.enq_last_no_enq_fails_ops); + if (link) { + SCX_ERR("Incorrectly succeeded in to attaching scheduler"); + return SCX_TEST_FAIL; + } + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct enq_last_no_enq_fails *skel = ctx; + + enq_last_no_enq_fails__destroy(skel); +} + +struct scx_test enq_last_no_enq_fails = { + .name = "enq_last_no_enq_fails", + .description = "Verify we fail to load a scheduler if we specify " + "the SCX_OPS_ENQ_LAST flag without defining " + "ops.enqueue()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&enq_last_no_enq_fails) diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c new file mode 100644 index 000000000..b3dfc1033 --- /dev/null +++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +/* Manually specify the signature until the kfunc is added to the scx repo. */ +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + bool *found) __ksym; + +s32 BPF_STRUCT_OPS(enq_select_cpu_fails_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + return prev_cpu; +} + +void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p, + u64 enq_flags) +{ + /* + * Need to initialize the variable or the verifier will fail to load. + * Improving these semantics is actively being worked on. + */ + bool found = false; + + /* Can only call from ops.select_cpu() */ + scx_bpf_select_cpu_dfl(p, 0, 0, &found); + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +SEC(".struct_ops.link") +struct sched_ext_ops enq_select_cpu_fails_ops = { + .select_cpu = enq_select_cpu_fails_select_cpu, + .enqueue = enq_select_cpu_fails_enqueue, + .name = "enq_select_cpu_fails", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c new file mode 100644 index 000000000..dd1350e5f --- /dev/null +++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "enq_select_cpu_fails.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct enq_select_cpu_fails *skel; + + skel = enq_select_cpu_fails__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct enq_select_cpu_fails *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.enq_select_cpu_fails_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + sleep(1); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct enq_select_cpu_fails *skel = ctx; + + enq_select_cpu_fails__destroy(skel); +} + +struct scx_test enq_select_cpu_fails = { + .name = "enq_select_cpu_fails", + .description = "Verify we fail to call scx_bpf_select_cpu_dfl() " + "from ops.enqueue()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&enq_select_cpu_fails) diff --git a/tools/testing/selftests/sched_ext/exit.bpf.c b/tools/testing/selftests/sched_ext/exit.bpf.c new file mode 100644 index 000000000..ae12ddaac --- /dev/null +++ b/tools/testing/selftests/sched_ext/exit.bpf.c @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ + +#include + +char _license[] SEC("license") = "GPL"; + +#include "exit_test.h" + +const volatile int exit_point; +UEI_DEFINE(uei); + +#define EXIT_CLEANLY() scx_bpf_exit(exit_point, "%d", exit_point) + +s32 BPF_STRUCT_OPS(exit_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + bool found; + + if (exit_point == EXIT_SELECT_CPU) + EXIT_CLEANLY(); + + return scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &found); +} + +void BPF_STRUCT_OPS(exit_enqueue, struct task_struct *p, u64 enq_flags) +{ + if (exit_point == EXIT_ENQUEUE) + EXIT_CLEANLY(); + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) +{ + if (exit_point == EXIT_DISPATCH) + EXIT_CLEANLY(); + + scx_bpf_consume(SCX_DSQ_GLOBAL); +} + +void BPF_STRUCT_OPS(exit_enable, struct task_struct *p) +{ + if (exit_point == EXIT_ENABLE) + EXIT_CLEANLY(); +} + +s32 BPF_STRUCT_OPS(exit_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + if (exit_point == EXIT_INIT_TASK) + EXIT_CLEANLY(); + + return 0; +} + +void BPF_STRUCT_OPS(exit_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(exit_init) +{ + if (exit_point == EXIT_INIT) + EXIT_CLEANLY(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops exit_ops = { + .select_cpu = exit_select_cpu, + .enqueue = exit_enqueue, + .dispatch = exit_dispatch, + .init_task = exit_init_task, + .enable = exit_enable, + .exit = exit_exit, + .init = exit_init, + .name = "exit", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/exit.c b/tools/testing/selftests/sched_ext/exit.c new file mode 100644 index 000000000..31bcd06e2 --- /dev/null +++ b/tools/testing/selftests/sched_ext/exit.c @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + */ +#include +#include +#include +#include +#include +#include "exit.bpf.skel.h" +#include "scx_test.h" + +#include "exit_test.h" + +static enum scx_test_status run(void *ctx) +{ + enum exit_test_case tc; + + for (tc = 0; tc < NUM_EXITS; tc++) { + struct exit *skel; + struct bpf_link *link; + char buf[16]; + + skel = exit__open(); + skel->rodata->exit_point = tc; + exit__load(skel); + link = bpf_map__attach_struct_ops(skel->maps.exit_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + exit__destroy(skel); + return SCX_TEST_FAIL; + } + + /* Assumes uei.kind is written last */ + while (skel->data->uei.kind == EXIT_KIND(SCX_EXIT_NONE)) + sched_yield(); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG_BPF)); + SCX_EQ(skel->data->uei.exit_code, tc); + sprintf(buf, "%d", tc); + SCX_ASSERT(!strcmp(skel->data->uei.msg, buf)); + bpf_link__destroy(link); + exit__destroy(skel); + } + + return SCX_TEST_PASS; +} + +struct scx_test exit_test = { + .name = "exit", + .description = "Verify we can cleanly exit a scheduler in multiple places", + .run = run, +}; +REGISTER_SCX_TEST(&exit_test) diff --git a/tools/testing/selftests/sched_ext/exit_test.h b/tools/testing/selftests/sched_ext/exit_test.h new file mode 100644 index 000000000..94f0268b9 --- /dev/null +++ b/tools/testing/selftests/sched_ext/exit_test.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ + +#ifndef __EXIT_TEST_H__ +#define __EXIT_TEST_H__ + +enum exit_test_case { + EXIT_SELECT_CPU, + EXIT_ENQUEUE, + EXIT_DISPATCH, + EXIT_ENABLE, + EXIT_INIT_TASK, + EXIT_INIT, + NUM_EXITS, +}; + +#endif // # __EXIT_TEST_H__ diff --git a/tools/testing/selftests/sched_ext/hotplug.bpf.c b/tools/testing/selftests/sched_ext/hotplug.bpf.c new file mode 100644 index 000000000..1f5f91f4f --- /dev/null +++ b/tools/testing/selftests/sched_ext/hotplug.bpf.c @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ + +#include + +char _license[] SEC("license") = "GPL"; + +#include "hotplug_test.h" + +UEI_DEFINE(uei); + +void BPF_STRUCT_OPS(hotplug_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +static void exit_from_hotplug(s32 cpu, bool onlining) +{ + s64 code = SCX_ECODE_ACT_RESTART | HOTPLUG_EXIT_RSN; + + if (onlining) + code |= HOTPLUG_ONLINING; + + scx_bpf_exit(code, "hotplug event detected (%d going %s)", cpu, + onlining ? "online" : "offline"); +} + +void BPF_STRUCT_OPS(hotplug_cpu_online, s32 cpu) +{ + exit_from_hotplug(cpu, true); +} + +void BPF_STRUCT_OPS(hotplug_cpu_offline, s32 cpu) +{ + exit_from_hotplug(cpu, false); +} + +SEC(".struct_ops.link") +struct sched_ext_ops hotplug_cb_ops = { + .cpu_online = hotplug_cpu_online, + .cpu_offline = hotplug_cpu_offline, + .exit = hotplug_exit, + .name = "hotplug_cbs", + .timeout_ms = 1000U, +}; + +SEC(".struct_ops.link") +struct sched_ext_ops hotplug_nocb_ops = { + .exit = hotplug_exit, + .name = "hotplug_nocbs", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/hotplug.c b/tools/testing/selftests/sched_ext/hotplug.c new file mode 100644 index 000000000..ce23d797a --- /dev/null +++ b/tools/testing/selftests/sched_ext/hotplug.c @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + */ +#include +#include +#include +#include +#include +#include + +#include "hotplug_test.h" +#include "hotplug.bpf.skel.h" +#include "scx_test.h" +#include "util.h" + +const char *online_path = "/sys/devices/system/cpu/cpu1/online"; + +static bool is_cpu_online(void) +{ + return file_read_long(online_path) > 0; +} + +static void toggle_online_status(bool online) +{ + long val = online ? 1 : 0; + int ret; + + ret = file_write_long(online_path, val); + if (ret != 0) + fprintf(stderr, "Failed to bring CPU %s (%s)", + online ? "online" : "offline", strerror(errno)); +} + +static enum scx_test_status setup(void **ctx) +{ + if (!is_cpu_online()) + return SCX_TEST_SKIP; + + return SCX_TEST_PASS; +} + +static enum scx_test_status test_hotplug(bool onlining, bool cbs_defined) +{ + struct hotplug *skel; + struct bpf_link *link; + long kind, code; + + SCX_ASSERT(is_cpu_online()); + + skel = hotplug__open_and_load(); + SCX_ASSERT(skel); + + /* Testing the offline -> online path, so go offline before starting */ + if (onlining) + toggle_online_status(0); + + if (cbs_defined) { + kind = SCX_KIND_VAL(SCX_EXIT_UNREG_BPF); + code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | HOTPLUG_EXIT_RSN; + if (onlining) + code |= HOTPLUG_ONLINING; + } else { + kind = SCX_KIND_VAL(SCX_EXIT_UNREG_KERN); + code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | + SCX_ECODE_VAL(SCX_ECODE_RSN_HOTPLUG); + } + + if (cbs_defined) + link = bpf_map__attach_struct_ops(skel->maps.hotplug_cb_ops); + else + link = bpf_map__attach_struct_ops(skel->maps.hotplug_nocb_ops); + + if (!link) { + SCX_ERR("Failed to attach scheduler"); + hotplug__destroy(skel); + return SCX_TEST_FAIL; + } + + toggle_online_status(onlining ? 1 : 0); + + while (!UEI_EXITED(skel, uei)) + sched_yield(); + + SCX_EQ(UEI_KIND(skel, uei), kind); + SCX_EQ(UEI_ECODE(skel, uei), code); + + if (!onlining) + toggle_online_status(1); + + bpf_link__destroy(link); + hotplug__destroy(skel); + + return SCX_TEST_PASS; +} + +static enum scx_test_status test_hotplug_attach(void) +{ + struct hotplug *skel; + struct bpf_link *link; + enum scx_test_status status = SCX_TEST_PASS; + long kind, code; + + SCX_ASSERT(is_cpu_online()); + SCX_ASSERT(scx_hotplug_seq() > 0); + + skel = SCX_OPS_OPEN(hotplug_nocb_ops, hotplug); + SCX_ASSERT(skel); + + SCX_OPS_LOAD(skel, hotplug_nocb_ops, hotplug, uei); + + /* + * Take the CPU offline to increment the global hotplug seq, which + * should cause attach to fail due to us setting the hotplug seq above + */ + toggle_online_status(0); + link = bpf_map__attach_struct_ops(skel->maps.hotplug_nocb_ops); + + toggle_online_status(1); + + SCX_ASSERT(link); + while (!UEI_EXITED(skel, uei)) + sched_yield(); + + kind = SCX_KIND_VAL(SCX_EXIT_UNREG_KERN); + code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | + SCX_ECODE_VAL(SCX_ECODE_RSN_HOTPLUG); + SCX_EQ(UEI_KIND(skel, uei), kind); + SCX_EQ(UEI_ECODE(skel, uei), code); + + bpf_link__destroy(link); + hotplug__destroy(skel); + + return status; +} + +static enum scx_test_status run(void *ctx) +{ + +#define HP_TEST(__onlining, __cbs_defined) ({ \ + if (test_hotplug(__onlining, __cbs_defined) != SCX_TEST_PASS) \ + return SCX_TEST_FAIL; \ +}) + + HP_TEST(true, true); + HP_TEST(false, true); + HP_TEST(true, false); + HP_TEST(false, false); + +#undef HP_TEST + + return test_hotplug_attach(); +} + +static void cleanup(void *ctx) +{ + toggle_online_status(1); +} + +struct scx_test hotplug_test = { + .name = "hotplug", + .description = "Verify hotplug behavior", + .setup = setup, + .run = run, + 
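+	/* cleanup() re-onlines CPU1 even if a subtest failed partway through. */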
.cleanup = cleanup, +}; +REGISTER_SCX_TEST(&hotplug_test) diff --git a/tools/testing/selftests/sched_ext/hotplug_test.h b/tools/testing/selftests/sched_ext/hotplug_test.h new file mode 100644 index 000000000..73d236f90 --- /dev/null +++ b/tools/testing/selftests/sched_ext/hotplug_test.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ + +#ifndef __HOTPLUG_TEST_H__ +#define __HOTPLUG_TEST_H__ + +enum hotplug_test_flags { + HOTPLUG_EXIT_RSN = 1LLU << 0, + HOTPLUG_ONLINING = 1LLU << 1, +}; + +#endif // # __HOTPLUG_TEST_H__ diff --git a/tools/testing/selftests/sched_ext/init_enable_count.bpf.c b/tools/testing/selftests/sched_ext/init_enable_count.bpf.c new file mode 100644 index 000000000..47ea89a62 --- /dev/null +++ b/tools/testing/selftests/sched_ext/init_enable_count.bpf.c @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that verifies that we do proper counting of init, enable, etc + * callbacks. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +u64 init_task_cnt, exit_task_cnt, enable_cnt, disable_cnt; +u64 init_fork_cnt, init_transition_cnt; + +s32 BPF_STRUCT_OPS_SLEEPABLE(cnt_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + __sync_fetch_and_add(&init_task_cnt, 1); + + if (args->fork) + __sync_fetch_and_add(&init_fork_cnt, 1); + else + __sync_fetch_and_add(&init_transition_cnt, 1); + + return 0; +} + +void BPF_STRUCT_OPS(cnt_exit_task, struct task_struct *p) +{ + __sync_fetch_and_add(&exit_task_cnt, 1); +} + +void BPF_STRUCT_OPS(cnt_enable, struct task_struct *p) +{ + __sync_fetch_and_add(&enable_cnt, 1); +} + +void BPF_STRUCT_OPS(cnt_disable, struct task_struct *p) +{ + __sync_fetch_and_add(&disable_cnt, 1); +} + +SEC(".struct_ops.link") +struct sched_ext_ops init_enable_count_ops = { + .init_task = cnt_init_task, + .exit_task = cnt_exit_task, + .enable = cnt_enable, + .disable = cnt_disable, + .name = "init_enable_count", +}; diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c new file mode 100644 index 000000000..ef9da0a50 --- /dev/null +++ b/tools/testing/selftests/sched_ext/init_enable_count.c @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet
+ * Copyright (c) 2023 Tejun Heo
+ */
+#include <bpf/bpf.h>
+#include <sched.h>
+#include <scx/common.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "scx_test.h"
+#include "init_enable_count.bpf.skel.h"
+
+#define SCHED_EXT 7
+
+static struct init_enable_count *
+open_load_prog(bool global)
+{
+	struct init_enable_count *skel;
+
+	skel = init_enable_count__open();
+	SCX_BUG_ON(!skel, "Failed to open skel");
+
+	if (!global)
+		skel->struct_ops.init_enable_count_ops->flags |= __COMPAT_SCX_OPS_SWITCH_PARTIAL;
+
+	SCX_BUG_ON(init_enable_count__load(skel), "Failed to load skel");
+
+	return skel;
+}
+
+static enum scx_test_status run_test(bool global)
+{
+	struct init_enable_count *skel;
+	struct bpf_link *link;
+	const u32 num_children = 5, num_pre_forks = 1024;
+	int ret, i, status;
+	struct sched_param param = {};
+	pid_t pids[num_pre_forks];
+
+	skel = open_load_prog(global);
+
+	/*
+	 * Fork a bunch of children before we attach the scheduler so that we
+	 * ensure (at least in practical terms) that there are more tasks that
+	 * transition from SCHED_OTHER -> SCHED_EXT than there are tasks that
+	 * take the fork() path either below or in other processes.
+	 */
+	for (i = 0; i < num_pre_forks; i++) {
+		pids[i] = fork();
+		SCX_FAIL_IF(pids[i] < 0, "Failed to fork child");
+		if (pids[i] == 0) {
+			sleep(1);
+			exit(0);
+		}
+	}
+
+	link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops);
+	SCX_FAIL_IF(!link, "Failed to attach struct_ops");
+
+	for (i = 0; i < num_pre_forks; i++) {
+		SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
+			    "Failed to wait for pre-forked child");
+
+		SCX_FAIL_IF(status != 0, "Pre-forked child %d exited with status %d", i,
+			    status);
+	}
+
+	bpf_link__destroy(link);
+	SCX_GE(skel->bss->init_task_cnt, num_pre_forks);
+	SCX_GE(skel->bss->exit_task_cnt, num_pre_forks);
+
+	link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops);
+	SCX_FAIL_IF(!link, "Failed to attach struct_ops");
+
+	/* SCHED_EXT children */
+	for (i = 0; i < num_children; i++) {
+		pids[i] = fork();
+		SCX_FAIL_IF(pids[i] < 0, "Failed to fork child");
+
+		if (pids[i] == 0) {
+			ret = sched_setscheduler(0, SCHED_EXT, &param);
+			SCX_BUG_ON(ret, "Failed to set sched to sched_ext");
+
+			/*
+			 * Reset to SCHED_OTHER for half of them. Counts for
+			 * everything should still be the same regardless, as
+			 * ops.disable() is invoked even if a task is still on
+			 * SCHED_EXT before it exits.
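+			 * That is, ops.disable() fires both when a task
+			 * explicitly switches away from SCHED_EXT and when it
+			 * exits while still running on it.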
+			 */
+			if (i % 2 == 0) {
+				ret = sched_setscheduler(0, SCHED_OTHER, &param);
+				SCX_BUG_ON(ret, "Failed to reset sched to normal");
+			}
+			exit(0);
+		}
+	}
+	for (i = 0; i < num_children; i++) {
+		SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
+			    "Failed to wait for SCX child");
+
+		SCX_FAIL_IF(status != 0, "SCX child %d exited with status %d", i,
+			    status);
+	}
+
+	/* SCHED_OTHER children */
+	for (i = 0; i < num_children; i++) {
+		pids[i] = fork();
+		if (pids[i] == 0)
+			exit(0);
+	}
+
+	for (i = 0; i < num_children; i++) {
+		SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
+			    "Failed to wait for normal child");
+
+		SCX_FAIL_IF(status != 0, "Normal child %d exited with status %d", i,
+			    status);
+	}
+
+	bpf_link__destroy(link);
+
+	SCX_GE(skel->bss->init_task_cnt, 2 * num_children);
+	SCX_GE(skel->bss->exit_task_cnt, 2 * num_children);
+
+	if (global) {
+		SCX_GE(skel->bss->enable_cnt, 2 * num_children);
+		SCX_GE(skel->bss->disable_cnt, 2 * num_children);
+	} else {
+		SCX_EQ(skel->bss->enable_cnt, num_children);
+		SCX_EQ(skel->bss->disable_cnt, num_children);
+	}
+	/*
+	 * We forked a ton of tasks before we attached the scheduler above, so
+	 * this should be fine. Technically it could be flaky if a ton of forks
+	 * are happening at the same time in other processes, but that should
+	 * be exceedingly unlikely.
+	 */
+	SCX_GT(skel->bss->init_transition_cnt, skel->bss->init_fork_cnt);
+	SCX_GE(skel->bss->init_fork_cnt, 2 * num_children);
+
+	init_enable_count__destroy(skel);
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	enum scx_test_status status;
+
+	status = run_test(true);
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	return run_test(false);
+}
+
+struct scx_test init_enable_count = {
+	.name = "init_enable_count",
+	.description = "Verify that init, enable, etc. callbacks are "
+		       "invoked and counted the correct number of times",
+	.run = run,
+};
+REGISTER_SCX_TEST(&init_enable_count)
diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c
new file mode 100644
index 000000000..00bfa9cb9
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/maximal.bpf.c
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A scheduler with every callback defined.
+ *
+ * This scheduler defines every callback, and is used to verify that a
+ * scheduler implementing every defined operation can be successfully loaded.
+ *
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024 David Vernet + */ + +#include + +char _license[] SEC("license") = "GPL"; + +s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, + u64 wake_flags) +{ + return prev_cpu; +} + +void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) +{ + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) +{} + +void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) +{ + scx_bpf_consume(SCX_DSQ_GLOBAL); +} + +void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) +{} + +void BPF_STRUCT_OPS(maximal_running, struct task_struct *p) +{} + +void BPF_STRUCT_OPS(maximal_stopping, struct task_struct *p, bool runnable) +{} + +void BPF_STRUCT_OPS(maximal_quiescent, struct task_struct *p, u64 deq_flags) +{} + +bool BPF_STRUCT_OPS(maximal_yield, struct task_struct *from, + struct task_struct *to) +{ + return false; +} + +bool BPF_STRUCT_OPS(maximal_core_sched_before, struct task_struct *a, + struct task_struct *b) +{ + return false; +} + +void BPF_STRUCT_OPS(maximal_set_weight, struct task_struct *p, u32 weight) +{} + +void BPF_STRUCT_OPS(maximal_set_cpumask, struct task_struct *p, + const struct cpumask *cpumask) +{} + +void BPF_STRUCT_OPS(maximal_update_idle, s32 cpu, bool idle) +{} + +void BPF_STRUCT_OPS(maximal_cpu_acquire, s32 cpu, + struct scx_cpu_acquire_args *args) +{} + +void BPF_STRUCT_OPS(maximal_cpu_release, s32 cpu, + struct scx_cpu_release_args *args) +{} + +void BPF_STRUCT_OPS(maximal_cpu_online, s32 cpu) +{} + +void BPF_STRUCT_OPS(maximal_cpu_offline, s32 cpu) +{} + +s32 BPF_STRUCT_OPS(maximal_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + return 0; +} + +void BPF_STRUCT_OPS(maximal_enable, struct task_struct *p) +{} + +void BPF_STRUCT_OPS(maximal_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) +{} + +void BPF_STRUCT_OPS(maximal_disable, struct task_struct *p) +{} + +s32 BPF_STRUCT_OPS(maximal_cgroup_init, struct cgroup *cgrp, + struct scx_cgroup_init_args *args) +{ + return 0; +} + +void BPF_STRUCT_OPS(maximal_cgroup_exit, struct cgroup *cgrp) +{} + +s32 BPF_STRUCT_OPS(maximal_cgroup_prep_move, struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{ + return 0; +} + +void BPF_STRUCT_OPS(maximal_cgroup_move, struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{} + +void BPF_STRUCT_OPS(maximal_cgroup_cancel_move, struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{} + +void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight) +{} + +s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) +{ + return 0; +} + +void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info) +{} + +SEC(".struct_ops.link") +struct sched_ext_ops maximal_ops = { + .select_cpu = maximal_select_cpu, + .enqueue = maximal_enqueue, + .dequeue = maximal_dequeue, + .dispatch = maximal_dispatch, + .runnable = maximal_runnable, + .running = maximal_running, + .stopping = maximal_stopping, + .quiescent = maximal_quiescent, + .yield = maximal_yield, + .core_sched_before = maximal_core_sched_before, + .set_weight = maximal_set_weight, + .set_cpumask = maximal_set_cpumask, + .update_idle = maximal_update_idle, + .cpu_acquire = maximal_cpu_acquire, + .cpu_release = maximal_cpu_release, + .cpu_online = maximal_cpu_online, + .cpu_offline = maximal_cpu_offline, + .init_task = maximal_init_task, + .enable = maximal_enable, + .exit_task = maximal_exit_task, + 
.disable = maximal_disable, + .cgroup_init = maximal_cgroup_init, + .cgroup_exit = maximal_cgroup_exit, + .cgroup_prep_move = maximal_cgroup_prep_move, + .cgroup_move = maximal_cgroup_move, + .cgroup_cancel_move = maximal_cgroup_cancel_move, + .cgroup_set_weight = maximal_cgroup_set_weight, + .init = maximal_init, + .exit = maximal_exit, + .name = "maximal", +}; diff --git a/tools/testing/selftests/sched_ext/maximal.c b/tools/testing/selftests/sched_ext/maximal.c new file mode 100644 index 000000000..f38fc973c --- /dev/null +++ b/tools/testing/selftests/sched_ext/maximal.c @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ +#include +#include +#include +#include +#include "maximal.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct maximal *skel; + + skel = maximal__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct maximal *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.maximal_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct maximal *skel = ctx; + + maximal__destroy(skel); +} + +struct scx_test maximal = { + .name = "maximal", + .description = "Verify we can load a scheduler with every callback defined", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&maximal) diff --git a/tools/testing/selftests/sched_ext/maybe_null.bpf.c b/tools/testing/selftests/sched_ext/maybe_null.bpf.c new file mode 100644 index 000000000..ad5e69422 --- /dev/null +++ b/tools/testing/selftests/sched_ext/maybe_null.bpf.c @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + */ + +#include + +char _license[] SEC("license") = "GPL"; + +u64 vtime_test; + +void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) +{} + +void BPF_STRUCT_OPS(maybe_null_success_dispatch, s32 cpu, struct task_struct *p) +{ + if (p != NULL) + vtime_test = p->scx.dsq_vtime; +} + +SEC(".struct_ops.link") +struct sched_ext_ops maybe_null_success = { + .dispatch = maybe_null_success_dispatch, + .enable = maybe_null_running, + .name = "minimal", +}; diff --git a/tools/testing/selftests/sched_ext/maybe_null.c b/tools/testing/selftests/sched_ext/maybe_null.c new file mode 100644 index 000000000..3f26b784f --- /dev/null +++ b/tools/testing/selftests/sched_ext/maybe_null.c @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
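+ *
+ * Verifies that the verifier's PTR_MAYBE_NULL handling of the ops.dispatch()
+ * task argument accepts a scheduler that NULL-checks the pointer before
+ * dereferencing it, and rejects one that dereferences it unchecked.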
+ */
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "maybe_null.bpf.skel.h"
+#include "maybe_null_fail.bpf.skel.h"
+#include "scx_test.h"
+
+static enum scx_test_status run(void *ctx)
+{
+	struct maybe_null *skel;
+	struct maybe_null_fail *fail_skel;
+
+	skel = maybe_null__open_and_load();
+	if (!skel) {
+		SCX_ERR("Failed to open and load maybe_null skel");
+		return SCX_TEST_FAIL;
+	}
+	maybe_null__destroy(skel);
+
+	fail_skel = maybe_null_fail__open_and_load();
+	if (fail_skel) {
+		maybe_null_fail__destroy(fail_skel);
+		SCX_ERR("Should have failed to open and load maybe_null_fail skel");
+		return SCX_TEST_FAIL;
+	}
+
+	return SCX_TEST_PASS;
+}
+
+struct scx_test maybe_null = {
+	.name = "maybe_null",
+	.description = "Verify that PTR_MAYBE_NULL works for .dispatch",
+	.run = run,
+};
+REGISTER_SCX_TEST(&maybe_null)
diff --git a/tools/testing/selftests/sched_ext/maybe_null_fail.bpf.c b/tools/testing/selftests/sched_ext/maybe_null_fail.bpf.c
new file mode 100644
index 000000000..1607fe07b
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/maybe_null_fail.bpf.c
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+u64 vtime_test;
+
+void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p)
+{}
+
+void BPF_STRUCT_OPS(maybe_null_fail_dispatch, s32 cpu, struct task_struct *p)
+{
+	/* No NULL check: the verifier should reject this load of a maybe-NULL task. */
+	vtime_test = p->scx.dsq_vtime;
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops maybe_null_fail = {
+	.dispatch = maybe_null_fail_dispatch,
+	.enable = maybe_null_running,
+	.name = "minimal",
+};
diff --git a/tools/testing/selftests/sched_ext/minimal.bpf.c b/tools/testing/selftests/sched_ext/minimal.bpf.c
new file mode 100644
index 000000000..6a7eccef0
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/minimal.bpf.c
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A completely minimal scheduler.
+ *
+ * This scheduler defines the absolute minimal set of struct sched_ext_ops
+ * fields: its name. It should _not_ fail to be loaded, and can be used to
+ * exercise the default scheduling paths in ext.c.
+ *
+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2023 David Vernet
+ * Copyright (c) 2023 Tejun Heo
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+SEC(".struct_ops.link")
+struct sched_ext_ops minimal_ops = {
+	.name = "minimal",
+};
diff --git a/tools/testing/selftests/sched_ext/minimal.c b/tools/testing/selftests/sched_ext/minimal.c
new file mode 100644
index 000000000..6c5db8ebb
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/minimal.c
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "minimal.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct minimal *skel; + + skel = minimal__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct minimal *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.minimal_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct minimal *skel = ctx; + + minimal__destroy(skel); +} + +struct scx_test minimal = { + .name = "minimal", + .description = "Verify we can load a fully minimal scheduler", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&minimal) diff --git a/tools/testing/selftests/sched_ext/prog_run.bpf.c b/tools/testing/selftests/sched_ext/prog_run.bpf.c new file mode 100644 index 000000000..fd2c8f12a --- /dev/null +++ b/tools/testing/selftests/sched_ext/prog_run.bpf.c @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates that we can invoke sched_ext kfuncs in + * BPF_PROG_TYPE_SYSCALL programs. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ + +#include + +UEI_DEFINE(uei); + +char _license[] SEC("license") = "GPL"; + +SEC("syscall") +int BPF_PROG(prog_run_syscall) +{ + scx_bpf_exit(0xdeadbeef, "Exited from PROG_RUN"); + return 0; +} + +void BPF_STRUCT_OPS(prog_run_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops prog_run_ops = { + .exit = prog_run_exit, + .name = "prog_run", +}; diff --git a/tools/testing/selftests/sched_ext/prog_run.c b/tools/testing/selftests/sched_ext/prog_run.c new file mode 100644 index 000000000..3cd57ef8d --- /dev/null +++ b/tools/testing/selftests/sched_ext/prog_run.c @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + */ +#include +#include +#include +#include +#include +#include "prog_run.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct prog_run *skel; + + skel = prog_run__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct prog_run *skel = ctx; + struct bpf_link *link; + int prog_fd, err = 0; + + prog_fd = bpf_program__fd(skel->progs.prog_run_syscall); + if (prog_fd < 0) { + SCX_ERR("Failed to get BPF_PROG_RUN prog"); + return SCX_TEST_FAIL; + } + + LIBBPF_OPTS(bpf_test_run_opts, topts); + + link = bpf_map__attach_struct_ops(skel->maps.prog_run_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + close(prog_fd); + return SCX_TEST_FAIL; + } + + err = bpf_prog_test_run_opts(prog_fd, &topts); + SCX_EQ(err, 0); + + /* Assumes uei.kind is written last */ + while (skel->data->uei.kind == EXIT_KIND(SCX_EXIT_NONE)) + sched_yield(); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG_BPF)); + SCX_EQ(skel->data->uei.exit_code, 0xdeadbeef); + close(prog_fd); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct prog_run *skel = ctx; + + prog_run__destroy(skel); +} + +struct scx_test prog_run = { + .name = "prog_run", + .description = "Verify we can call into a scheduler with BPF_PROG_RUN, and invoke kfuncs", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&prog_run) diff --git a/tools/testing/selftests/sched_ext/reload_loop.c b/tools/testing/selftests/sched_ext/reload_loop.c new file mode 100644 index 000000000..5cfba2d6e --- /dev/null +++ b/tools/testing/selftests/sched_ext/reload_loop.c @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + */ +#include +#include +#include +#include +#include +#include "maximal.bpf.skel.h" +#include "scx_test.h" + +static struct maximal *skel; +static pthread_t threads[2]; + +bool force_exit = false; + +static enum scx_test_status setup(void **ctx) +{ + skel = maximal__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + + return SCX_TEST_PASS; +} + +static void *do_reload_loop(void *arg) +{ + u32 i; + + for (i = 0; i < 1024 && !force_exit; i++) { + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.maximal_ops); + if (link) + bpf_link__destroy(link); + } + + return NULL; +} + +static enum scx_test_status run(void *ctx) +{ + int err; + void *ret; + + err = pthread_create(&threads[0], NULL, do_reload_loop, NULL); + SCX_FAIL_IF(err, "Failed to create thread 0"); + + err = pthread_create(&threads[1], NULL, do_reload_loop, NULL); + SCX_FAIL_IF(err, "Failed to create thread 1"); + + SCX_FAIL_IF(pthread_join(threads[0], &ret), "thread 0 failed"); + SCX_FAIL_IF(pthread_join(threads[1], &ret), "thread 1 failed"); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + force_exit = true; + maximal__destroy(skel); +} + +struct scx_test reload_loop = { + .name = "reload_loop", + .description = "Stress test loading and unloading schedulers repeatedly in a tight loop", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&reload_loop) diff --git a/tools/testing/selftests/sched_ext/runner.c b/tools/testing/selftests/sched_ext/runner.c new file mode 100644 index 000000000..eab48c7ff --- /dev/null +++ b/tools/testing/selftests/sched_ext/runner.c @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include +#include +#include "scx_test.h" + +const char help_fmt[] = +"The runner for sched_ext tests.\n" +"\n" +"The runner is statically linked against all testcases, and runs them all serially.\n" +"It's required for the testcases to be serial, as only a single host-wide sched_ext\n" +"scheduler may be loaded at any given time." +"\n" +"Usage: %s [-t TEST] [-h]\n" +"\n" +" -t TEST Only run tests whose name includes this string\n" +" -s Include print output for skipped tests\n" +" -q Don't print the test descriptions during run\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; +static bool quiet, print_skipped; + +#define MAX_SCX_TESTS 2048 + +static struct scx_test __scx_tests[MAX_SCX_TESTS]; +static unsigned __scx_num_tests = 0; + +static void sigint_handler(int simple) +{ + exit_req = 1; +} + +static void print_test_preamble(const struct scx_test *test, bool quiet) +{ + printf("===== START =====\n"); + printf("TEST: %s\n", test->name); + if (!quiet) + printf("DESCRIPTION: %s\n", test->description); + printf("OUTPUT:\n"); +} + +static const char *status_to_result(enum scx_test_status status) +{ + switch (status) { + case SCX_TEST_PASS: + case SCX_TEST_SKIP: + return "ok"; + case SCX_TEST_FAIL: + return "not ok"; + default: + return ""; + } +} + +static void print_test_result(const struct scx_test *test, + enum scx_test_status status, + unsigned int testnum) +{ + const char *result = status_to_result(status); + const char *directive = status == SCX_TEST_SKIP ? 
"SKIP " : ""; + + printf("%s %u %s # %s\n", result, testnum, test->name, directive); + printf("===== END =====\n"); +} + +static bool should_skip_test(const struct scx_test *test, const char * filter) +{ + return !strstr(test->name, filter); +} + +static enum scx_test_status run_test(const struct scx_test *test) +{ + enum scx_test_status status; + void *context = NULL; + + if (test->setup) { + status = test->setup(&context); + if (status != SCX_TEST_PASS) + return status; + } + + status = test->run(context); + + if (test->cleanup) + test->cleanup(context); + + return status; +} + +static bool test_valid(const struct scx_test *test) +{ + if (!test) { + fprintf(stderr, "NULL test detected\n"); + return false; + } + + if (!test->name) { + fprintf(stderr, + "Test with no name found. Must specify test name.\n"); + return false; + } + + if (!test->description) { + fprintf(stderr, "Test %s requires description.\n", test->name); + return false; + } + + if (!test->run) { + fprintf(stderr, "Test %s has no run() callback\n", test->name); + return false; + } + + return true; +} + +int main(int argc, char **argv) +{ + const char *filter = NULL; + unsigned testnum = 0, i; + unsigned passed = 0, skipped = 0, failed = 0; + int opt; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + while ((opt = getopt(argc, argv, "qst:h")) != -1) { + switch (opt) { + case 'q': + quiet = true; + break; + case 's': + print_skipped = true; + break; + case 't': + filter = optarg; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + for (i = 0; i < __scx_num_tests; i++) { + enum scx_test_status status; + struct scx_test *test = &__scx_tests[i]; + + if (filter && should_skip_test(test, filter)) { + /* + * Printing the skipped tests and their preambles can + * add a lot of noise to the runner output. Printing + * this is only really useful for CI, so let's skip it + * by default. + */ + if (print_skipped) { + print_test_preamble(test, quiet); + print_test_result(test, SCX_TEST_SKIP, ++testnum); + } + continue; + } + + print_test_preamble(test, quiet); + status = run_test(test); + print_test_result(test, status, ++testnum); + switch (status) { + case SCX_TEST_PASS: + passed++; + break; + case SCX_TEST_SKIP: + skipped++; + break; + case SCX_TEST_FAIL: + failed++; + break; + } + } + printf("\n\n=============================\n\n"); + printf("RESULTS:\n\n"); + printf("PASSED: %u\n", passed); + printf("SKIPPED: %u\n", skipped); + printf("FAILED: %u\n", failed); + + return 0; +} + +void scx_test_register(struct scx_test *test) +{ + SCX_BUG_ON(!test_valid(test), "Invalid test found"); + SCX_BUG_ON(__scx_num_tests >= MAX_SCX_TESTS, "Maximum tests exceeded"); + + __scx_tests[__scx_num_tests++] = *test; +} diff --git a/tools/testing/selftests/sched_ext/scx_test.h b/tools/testing/selftests/sched_ext/scx_test.h new file mode 100644 index 000000000..90b8d6915 --- /dev/null +++ b/tools/testing/selftests/sched_ext/scx_test.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 Tejun Heo + * Copyright (c) 2023 David Vernet + */ + +#ifndef __SCX_TEST_H__ +#define __SCX_TEST_H__ + +#include +#include +#include + +enum scx_test_status { + SCX_TEST_PASS = 0, + SCX_TEST_SKIP, + SCX_TEST_FAIL, +}; + +#define EXIT_KIND(__ent) __COMPAT_ENUM_OR_ZERO("scx_exit_kind", #__ent) + +struct scx_test { + /** + * name - The name of the testcase. 
+ */ + const char *name; + + /** + * description - A description of your testcase: what it tests and is + * meant to validate. + */ + const char *description; + + /* + * setup - Setup the test. + * @ctx: A pointer to a context object that will be passed to run and + * cleanup. + * + * An optional callback that allows a testcase to perform setup for its + * run. A test may return SCX_TEST_SKIP to skip the run. + */ + enum scx_test_status (*setup)(void **ctx); + + /* + * run - Run the test. + * @ctx: Context set in the setup() callback. If @ctx was not set in + * setup(), it is NULL. + * + * The main test. Callers should return one of: + * + * - SCX_TEST_PASS: Test passed + * - SCX_TEST_SKIP: Test should be skipped + * - SCX_TEST_FAIL: Test failed + * + * This callback must be defined. + */ + enum scx_test_status (*run)(void *ctx); + + /* + * cleanup - Perform cleanup following the test + * @ctx: Context set in the setup() callback. If @ctx was not set in + * setup(), it is NULL. + * + * An optional callback that allows a test to perform cleanup after + * being run. This callback is run even if the run() callback returns + * SCX_TEST_SKIP or SCX_TEST_FAIL. It is not run if setup() returns + * SCX_TEST_SKIP or SCX_TEST_FAIL. + */ + void (*cleanup)(void *ctx); +}; + +void scx_test_register(struct scx_test *test); + +#define REGISTER_SCX_TEST(__test) \ + __attribute__((constructor)) \ + static void ___scxregister##__LINE__(void) \ + { \ + scx_test_register(__test); \ + } + +#define SCX_ERR(__fmt, ...) \ + do { \ + fprintf(stderr, "ERR: %s:%d\n", __FILE__, __LINE__); \ + fprintf(stderr, __fmt"\n", ##__VA_ARGS__); \ + } while (0) + +#define SCX_FAIL(__fmt, ...) \ + do { \ + SCX_ERR(__fmt, ##__VA_ARGS__); \ + return SCX_TEST_FAIL; \ + } while (0) + +#define SCX_FAIL_IF(__cond, __fmt, ...) \ + do { \ + if (__cond) \ + SCX_FAIL(__fmt, ##__VA_ARGS__); \ + } while (0) + +#define SCX_GT(_x, _y) SCX_FAIL_IF((_x) <= (_y), "Expected %s > %s (%lu > %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_GE(_x, _y) SCX_FAIL_IF((_x) < (_y), "Expected %s >= %s (%lu >= %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_LT(_x, _y) SCX_FAIL_IF((_x) >= (_y), "Expected %s < %s (%lu < %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_LE(_x, _y) SCX_FAIL_IF((_x) > (_y), "Expected %s <= %s (%lu <= %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_EQ(_x, _y) SCX_FAIL_IF((_x) != (_y), "Expected %s == %s (%lu == %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_ASSERT(_x) SCX_FAIL_IF(!(_x), "Expected %s to be true (%lu)", \ + #_x, (u64)(_x)) + +#define SCX_ECODE_VAL(__ecode) ({ \ + u64 __val = 0; \ + bool __found = false; \ + \ + __found = __COMPAT_read_enum("scx_exit_code", #__ecode, &__val); \ + SCX_ASSERT(__found); \ + (s64)__val; \ +}) + +#define SCX_KIND_VAL(__kind) ({ \ + u64 __val = 0; \ + bool __found = false; \ + \ + __found = __COMPAT_read_enum("scx_exit_kind", #__kind, &__val); \ + SCX_ASSERT(__found); \ + __val; \ +}) + +#endif // # __SCX_TEST_H__ diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c new file mode 100644 index 000000000..2ed2991af --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet
+ * Copyright (c) 2023 Tejun Heo
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+bool saw_local = false;
+
+static bool task_is_test(const struct task_struct *p)
+{
+	return !bpf_strncmp(p->comm, 9, "select_cpu");
+}
+
+void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p,
+		    u64 enq_flags)
+{
+	const struct cpumask *idle_mask = scx_bpf_get_idle_cpumask();
+
+	if (task_is_test(p) &&
+	    bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), idle_mask)) {
+		saw_local = true;
+	}
+	scx_bpf_put_idle_cpumask(idle_mask);
+
+	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops select_cpu_dfl_ops = {
+	.enqueue = select_cpu_dfl_enqueue,
+	.name = "select_cpu_dfl",
+};
diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c
new file mode 100644
index 000000000..a53a40c2d
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.c
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2023 David Vernet
+ * Copyright (c) 2023 Tejun Heo
+ */
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "select_cpu_dfl.bpf.skel.h"
+#include "scx_test.h"
+
+#define NUM_CHILDREN 1028
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct select_cpu_dfl *skel;
+
+	skel = select_cpu_dfl__open_and_load();
+	SCX_FAIL_IF(!skel, "Failed to open and load skel");
+	*ctx = skel;
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct select_cpu_dfl *skel = ctx;
+	struct bpf_link *link;
+	pid_t pids[NUM_CHILDREN];
+	int i, status;
+
+	link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_ops);
+	SCX_FAIL_IF(!link, "Failed to attach scheduler");
+
+	for (i = 0; i < NUM_CHILDREN; i++) {
+		pids[i] = fork();
+		if (pids[i] == 0) {
+			sleep(1);
+			exit(0);
+		}
+	}
+
+	for (i = 0; i < NUM_CHILDREN; i++) {
+		SCX_EQ(waitpid(pids[i], &status, 0), pids[i]);
+		SCX_EQ(status, 0);
+	}
+
+	SCX_ASSERT(!skel->bss->saw_local);
+
+	bpf_link__destroy(link);
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct select_cpu_dfl *skel = ctx;
+
+	select_cpu_dfl__destroy(skel);
+}
+
+struct scx_test select_cpu_dfl = {
+	.name = "select_cpu_dfl",
+	.description = "Verify the default ops.select_cpu() dispatches tasks "
+		       "when idle cores are found, and skips ops.enqueue()",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&select_cpu_dfl)
diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c
new file mode 100644
index 000000000..4bb5abb2d
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A scheduler that validates the behavior of direct dispatching with a default
+ * select_cpu implementation, and with the SCX_OPS_ENQ_DFL_NO_DISPATCH ops flag
+ * specified.
+ *
+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +bool saw_local = false; + +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* CPU changed by ops.select_cpu() */ +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +/* Manually specify the signature until the kfunc is added to the scx repo. */ +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + bool *found) __ksym; + +s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + struct task_ctx *tctx; + s32 cpu; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return -ESRCH; + } + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, + &tctx->force_local); + + return cpu; +} + +void BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_enqueue, struct task_struct *p, + u64 enq_flags) +{ + u64 dsq_id = SCX_DSQ_GLOBAL; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + if (tctx->force_local) { + dsq_id = SCX_DSQ_LOCAL; + tctx->force_local = false; + saw_local = true; + } + + scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); +} + +s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task, + struct task_struct *p, struct scx_init_task_args *args) +{ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dfl_nodispatch_ops = { + .select_cpu = select_cpu_dfl_nodispatch_select_cpu, + .enqueue = select_cpu_dfl_nodispatch_enqueue, + .init_task = select_cpu_dfl_nodispatch_init_task, + .name = "select_cpu_dfl_nodispatch", +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c new file mode 100644 index 000000000..1d85bf4bf --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dfl_nodispatch.bpf.skel.h" +#include "scx_test.h" + +#define NUM_CHILDREN 1028 + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dfl_nodispatch *skel; + + skel = select_cpu_dfl_nodispatch__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dfl_nodispatch *skel = ctx; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_nodispatch_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + SCX_ASSERT(skel->bss->saw_local); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dfl_nodispatch *skel = ctx; + + select_cpu_dfl_nodispatch__destroy(skel); +} + +struct scx_test select_cpu_dfl_nodispatch = { + .name = "select_cpu_dfl_nodispatch", + .description = "Verify behavior of scx_bpf_select_cpu_dfl() in " + "ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dfl_nodispatch) diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c new file mode 100644 index 000000000..f0b96a4a0 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + u64 dsq_id = SCX_DSQ_LOCAL; + s32 cpu = prev_cpu; + + if (scx_bpf_test_and_clear_cpu_idle(cpu)) + goto dispatch; + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + goto dispatch; + + dsq_id = SCX_DSQ_GLOBAL; + cpu = prev_cpu; + +dispatch: + scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, 0); + return cpu; +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_ops = { + .select_cpu = select_cpu_dispatch_select_cpu, + .name = "select_cpu_dispatch", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c new file mode 100644 index 000000000..0309ca878 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dispatch.bpf.skel.h" +#include "scx_test.h" + +#define NUM_CHILDREN 1028 + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dispatch *skel; + + skel = select_cpu_dispatch__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch *skel = ctx; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch *skel = ctx; + + select_cpu_dispatch__destroy(skel); +} + +struct scx_test select_cpu_dispatch = { + .name = "select_cpu_dispatch", + .description = "Test direct dispatching to built-in DSQs from " + "ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch) diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c new file mode 100644 index 000000000..7b42ddce0 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* Dispatching to a random DSQ should fail. */ + scx_bpf_dispatch(p, 0xcafef00d, SCX_SLICE_DFL, 0); + + return prev_cpu; +} + +void BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_bad_dsq_ops = { + .select_cpu = select_cpu_dispatch_bad_dsq_select_cpu, + .exit = select_cpu_dispatch_bad_dsq_exit, + .name = "select_cpu_dispatch_bad_dsq", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c new file mode 100644 index 000000000..47eb6ed76 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dispatch_bad_dsq.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dispatch_bad_dsq *skel; + + skel = select_cpu_dispatch_bad_dsq__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch_bad_dsq *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_bad_dsq_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + sleep(1); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch_bad_dsq *skel = ctx; + + select_cpu_dispatch_bad_dsq__destroy(skel); +} + +struct scx_test select_cpu_dispatch_bad_dsq = { + .name = "select_cpu_dispatch_bad_dsq", + .description = "Verify graceful failure if we direct-dispatch to a " + "bogus DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch_bad_dsq) diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c new file mode 100644 index 000000000..653e3dc0b --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* Dispatching twice in a row is disallowed. */ + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + + return prev_cpu; +} + +void BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_dbl_dsp_ops = { + .select_cpu = select_cpu_dispatch_dbl_dsp_select_cpu, + .exit = select_cpu_dispatch_dbl_dsp_exit, + .name = "select_cpu_dispatch_dbl_dsp", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c new file mode 100644 index 000000000..48ff028a3 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dispatch_dbl_dsp.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dispatch_dbl_dsp *skel; + + skel = select_cpu_dispatch_dbl_dsp__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch_dbl_dsp *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_dbl_dsp_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + sleep(1); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch_dbl_dsp *skel = ctx; + + select_cpu_dispatch_dbl_dsp__destroy(skel); +} + +struct scx_test select_cpu_dispatch_dbl_dsp = { + .name = "select_cpu_dispatch_dbl_dsp", + .description = "Verify graceful failure if we dispatch twice to a " + "DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch_dbl_dsp) diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c new file mode 100644 index 000000000..7f3ebf4fc --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates that enqueue flags are properly stored and + * applied at dispatch time when a task is directly dispatched from + * ops.select_cpu(). We validate this by using scx_bpf_dispatch_vtime(), and + * making the test a very basic vtime scheduler. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +volatile bool consumed; + +static u64 vtime_now; + +#define VTIME_DSQ 0 + +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +static inline u64 task_vtime(const struct task_struct *p) +{ + u64 vtime = p->scx.dsq_vtime; + + if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) + return vtime_now - SCX_SLICE_DFL; + else + return vtime; +} + +s32 BPF_STRUCT_OPS(select_cpu_vtime_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + s32 cpu; + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + goto ddsp; + + cpu = prev_cpu; + scx_bpf_test_and_clear_cpu_idle(cpu); +ddsp: + scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); + return cpu; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p) +{ + if (scx_bpf_consume(VTIME_DSQ)) + consumed = true; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_running, struct task_struct *p) +{ + if (vtime_before(vtime_now, p->scx.dsq_vtime)) + vtime_now = p->scx.dsq_vtime; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_stopping, struct task_struct *p, + bool runnable) +{ + p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_enable, struct task_struct *p) +{ + p->scx.dsq_vtime = vtime_now; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(select_cpu_vtime_init) +{ + return scx_bpf_create_dsq(VTIME_DSQ, -1); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_vtime_ops = { + .select_cpu = select_cpu_vtime_select_cpu, + .dispatch = select_cpu_vtime_dispatch, + .running = select_cpu_vtime_running, + .stopping = select_cpu_vtime_stopping, + .enable = select_cpu_vtime_enable, + .init = select_cpu_vtime_init, + .name = "select_cpu_vtime", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.c new file mode 100644 index 000000000..b4629c236 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.c @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.c
new file mode 100644
index 000000000..b4629c236
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.c
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024 David Vernet <dvernet@meta.com>
+ * Copyright (c) 2024 Tejun Heo <tj@kernel.org>
+ */
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "select_cpu_vtime.bpf.skel.h"
+#include "scx_test.h"
+
+static enum scx_test_status setup(void **ctx)
+{
+        struct select_cpu_vtime *skel;
+
+        skel = select_cpu_vtime__open_and_load();
+        SCX_FAIL_IF(!skel, "Failed to open and load skel");
+        *ctx = skel;
+
+        return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+        struct select_cpu_vtime *skel = ctx;
+        struct bpf_link *link;
+
+        SCX_ASSERT(!skel->bss->consumed);
+
+        link = bpf_map__attach_struct_ops(skel->maps.select_cpu_vtime_ops);
+        SCX_FAIL_IF(!link, "Failed to attach scheduler");
+
+        sleep(1);
+
+        SCX_ASSERT(skel->bss->consumed);
+
+        bpf_link__destroy(link);
+
+        return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+        struct select_cpu_vtime *skel = ctx;
+
+        select_cpu_vtime__destroy(skel);
+}
+
+struct scx_test select_cpu_vtime = {
+        .name = "select_cpu_vtime",
+        .description = "Test doing direct vtime-dispatching from "
+                       "ops.select_cpu(), to a non-built-in DSQ",
+        .setup = setup,
+        .run = run,
+        .cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&select_cpu_vtime)
diff --git a/tools/testing/selftests/sched_ext/test_example.c b/tools/testing/selftests/sched_ext/test_example.c
new file mode 100644
index 000000000..ce36cdf03
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/test_example.c
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2024 David Vernet <dvernet@meta.com>
+ */
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include "scx_test.h"
+
+static bool setup_called = false;
+static bool run_called = false;
+static bool cleanup_called = false;
+
+static int context = 10;
+
+static enum scx_test_status setup(void **ctx)
+{
+        setup_called = true;
+        *ctx = &context;
+
+        return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+        int *arg = ctx;
+
+        SCX_ASSERT(setup_called);
+        SCX_ASSERT(!run_called && !cleanup_called);
+        SCX_EQ(*arg, context);
+
+        run_called = true;
+        return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+        SCX_BUG_ON(!run_called || cleanup_called, "Wrong callbacks invoked");
+}
+
+struct scx_test example = {
+        .name = "example",
+        .description = "Validate the basic function of the test suite itself",
+        .setup = setup,
+        .run = run,
+        .cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&example)
diff --git a/tools/testing/selftests/sched_ext/util.c b/tools/testing/selftests/sched_ext/util.c
new file mode 100644
index 000000000..e47769c91
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/util.c
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024 David Vernet <dvernet@meta.com>
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/* Returns read len on success, or -errno on failure. */
+static ssize_t read_text(const char *path, char *buf, size_t max_len)
+{
+        ssize_t len;
+        int fd;
+
+        fd = open(path, O_RDONLY);
+        if (fd < 0)
+                return -errno;
+
+        len = read(fd, buf, max_len - 1);
+        if (len < 0)
+                len = -errno;   /* capture errno before close() can clobber it */
+        else
+                buf[len] = 0;
+
+        close(fd);
+        return len;
+}
+
+/* Returns written len on success, or -errno on failure. */
+static ssize_t write_text(const char *path, char *buf, ssize_t len)
+{
+        int fd;
+        ssize_t written;
+
+        fd = open(path, O_WRONLY | O_APPEND);
+        if (fd < 0)
+                return -errno;
+
+        written = write(fd, buf, len);
+        if (written < 0)
+                written = -errno;       /* capture errno before close() */
+        close(fd);
+        return written;
+}
+
+long file_read_long(const char *path)
+{
+        char buf[128];
+
+        if (read_text(path, buf, sizeof(buf)) <= 0)
+                return -1;
+
+        return atol(buf);
+}
+
+int file_write_long(const char *path, long val)
+{
+        char buf[64];
+        int ret;
+
+        ret = snprintf(buf, sizeof(buf), "%ld", val);
+        if (ret < 0)
+                return ret;
+
+        if (write_text(path, buf, ret) <= 0)
+                return -1;
+
+        return 0;
+}
diff --git a/tools/testing/selftests/sched_ext/util.h b/tools/testing/selftests/sched_ext/util.h
new file mode 100644
index 000000000..bc13dfec1
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/util.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024 David Vernet <dvernet@meta.com>
+ */
+
+#ifndef __SCX_TEST_UTIL_H__
+#define __SCX_TEST_UTIL_H__
+
+long file_read_long(const char *path);
+int file_write_long(const char *path, long val);
+
+#endif // __SCX_TEST_UTIL_H__
-- 
2.44.0.325.g11c821f2f2
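(Usage note for the util.c helpers: both functions are meant for small text files holding a single number, e.g. sysfs/procfs-style control files. A minimal caller sketch follows; it assumes util.c/util.h are built alongside it, and the path is hypothetical, purely illustrative. Note that write_text() opens with O_WRONLY | O_APPEND, so file_write_long() appends rather than truncates.)

#include <stdio.h>
#include "util.h"

int main(void)
{
        /* Hypothetical control file path, for illustration only. */
        const char *path = "/sys/kernel/example_knob";
        long val;

        val = file_read_long(path);
        if (val < 0) {
                fprintf(stderr, "failed to read %s\n", path);
                return 1;
        }
        printf("%s = %ld\n", path, val);

        if (file_write_long(path, 0))
                fprintf(stderr, "failed to write %s\n", path);
        return 0;
}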