From 04ac47d0fbec56876b0e46ad5cdb53c05d2f792c Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Tue, 10 May 2022 14:10:35 +0200 Subject: [PATCH] cachy Signed-off-by: Peter Jung (cherry picked from commit b2eb864e0c5cc7862a1cf2f4c267421a48126ea0) Signed-off-by: Peter Jung --- .../admin-guide/kernel-parameters.txt | 6 ++ Documentation/admin-guide/pm/cpuidle.rst | 15 +++-- Makefile | 5 +- arch/Kconfig | 10 ++++ arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/include/asm/mwait.h | 1 + arch/x86/kernel/process.c | 39 +++++++++---- drivers/cpufreq/Kconfig | 2 +- drivers/cpufreq/cpufreq_ondemand.c | 6 +- drivers/hid/usbhid/hid-core.c | 8 +++ drivers/input/evdev.c | 19 +++--- include/linux/pageblock-flags.h | 2 +- include/linux/pagemap.h | 2 +- include/linux/sched.h | 1 + init/Kconfig | 4 ++ kernel/Kconfig.hz | 24 ++++++++ kernel/rcu/Kconfig | 13 +++++ kernel/rcu/tree_nocb.h | 15 ++++- kernel/sched/core.c | 5 +- kernel/sched/fair.c | 58 +++++++++++++++++-- kernel/sched/features.h | 2 + mm/compaction.c | 6 +- mm/internal.h | 1 + mm/page-writeback.c | 8 +++ mm/page_alloc.c | 42 +++++++++++--- mm/vmpressure.c | 4 ++ mm/vmscan.c | 24 ++++++-- scripts/Makefile.lib | 4 ++ scripts/Makefile.modfinal | 4 ++ 29 files changed, 275 insertions(+), 57 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 59f881f36779..3bd6a7f809ff 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3505,6 +3505,9 @@ just as if they had also been called out in the rcu_nocbs= boot parameter. + Note that this argument takes precedence over + the CONFIG_RCU_NOCB_CPU_DEFAULT_ALL option. + noiotrap [SH] Disables trapped I/O port accesses. noirqdebug [X86-32] Disables the code which attempts to detect and @@ -4397,6 +4400,9 @@ no-callback mode from boot but the mode may be toggled at runtime via cpusets. + Note that this argument takes precedence over + the CONFIG_RCU_NOCB_CPU_DEFAULT_ALL option. + rcu_nocb_poll [KNL] Rather than requiring that offloaded CPUs (specified by rcu_nocbs= above) explicitly diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst index aec2cd2aaea7..19754beb5a4e 100644 --- a/Documentation/admin-guide/pm/cpuidle.rst +++ b/Documentation/admin-guide/pm/cpuidle.rst @@ -612,8 +612,8 @@ the ``menu`` governor to be used on the systems that use the ``ladder`` governor by default this way, for example. The other kernel command line parameters controlling CPU idle time management -described below are only relevant for the *x86* architecture and some of -them affect Intel processors only. +described below are only relevant for the *x86* architecture and references +to ``intel_idle`` affect Intel processors only. The *x86* architecture support code recognizes three kernel command line options related to CPU idle time management: ``idle=poll``, ``idle=halt``, @@ -635,10 +635,13 @@ idle, so it very well may hurt single-thread computations performance as well as energy-efficiency. Thus using it for performance reasons may not be a good idea at all.] -The ``idle=nomwait`` option disables the ``intel_idle`` driver and causes -``acpi_idle`` to be used (as long as all of the information needed by it is -there in the system's ACPI tables), but it is not allowed to use the -``MWAIT`` instruction of the CPUs to ask the hardware to enter idle states. 
+The ``idle=nomwait`` option prevents the use of ``MWAIT`` instruction of +the CPU to enter idle states. When this option is used, the ``acpi_idle`` +driver will use the ``HLT`` instruction instead of ``MWAIT``. On systems +running Intel processors, this option disables the ``intel_idle`` driver +and forces the use of the ``acpi_idle`` driver instead. Note that in either +case, ``acpi_idle`` driver will function only if all the information needed +by it is in the system's ACPI tables. In addition to the architecture-level kernel command line options affecting CPU idle time management, there are parameters affecting individual ``CPUIdle`` diff --git a/Makefile b/Makefile index 7ef8dd5ab6f2..f6c9669801a0 100644 --- a/Makefile +++ b/Makefile @@ -885,7 +885,8 @@ endif ifdef CONFIG_LTO_CLANG ifdef CONFIG_LTO_CLANG_THIN CC_FLAGS_LTO := -flto=thin -fsplit-lto-unit -KBUILD_LDFLAGS += --thinlto-cache-dir=$(extmod_prefix).thinlto-cache +export thinlto-dir = $(if $(CONFIG_LTO_CLANG_THIN_CACHEDIR),$(CONFIG_LTO_CLANG_THIN_CACHEDIR)/) +KBUILD_LDFLAGS += --thinlto-cache-dir=$(thinlto-dir)$(extmod_prefix).thinlto-cache else CC_FLAGS_LTO := -flto endif @@ -1708,7 +1709,7 @@ PHONY += compile_commands.json clean-dirs := $(KBUILD_EXTMOD) clean: rm-files := $(KBUILD_EXTMOD)/Module.symvers $(KBUILD_EXTMOD)/modules.nsdeps \ - $(KBUILD_EXTMOD)/compile_commands.json $(KBUILD_EXTMOD)/.thinlto-cache + $(KBUILD_EXTMOD)/compile_commands.json $(thinlto-dir)$(KBUILD_EXTMOD)/.thinlto-cache PHONY += prepare # now expand this into a simple variable to reduce the cost of shell evaluations diff --git a/arch/Kconfig b/arch/Kconfig index 5e88237f84d2..032df1abcc10 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -706,6 +706,16 @@ config LTO_CLANG_THIN https://clang.llvm.org/docs/ThinLTO.html If unsure, say Y. + +config LTO_CLANG_THIN_CACHEDIR + string "Clang ThinLTO cache directory" + depends on LTO_CLANG_THIN + default "/tmp" + help + This option allows users to choose a directory that stores + Clang's ThinLTO cache. + Leave empty for default. 
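+ (Editorial example, not part of the original patch: with the default
+ "/tmp" and an in-tree build, the Makefile hunk above points the linker
+ at /tmp/.thinlto-cache for its ThinLTO cache.)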
+ endchoice config ARCH_SUPPORTS_CFI_CLANG diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 65d147974f8d..691cd4e250d3 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -219,7 +219,7 @@ #define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 or above (Zen) */ +#define X86_FEATURE_ZEN (7*32+28) /* "" CPU based on Zen microarchitecture */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ #define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ #define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 29dd27b5a339..3a8fdf881313 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -13,6 +13,7 @@ #define MWAIT_SUBSTATE_SIZE 4 #define MWAIT_HINT2CSTATE(hint) (((hint) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) #define MWAIT_HINT2SUBSTATE(hint) ((hint) & MWAIT_CSTATE_MASK) +#define MWAIT_C1_SUBSTATE_MASK 0xf0 #define CPUID_MWAIT_LEAF 5 #define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1 diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 81d8ef036637..c50208b9a2a3 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -810,24 +810,43 @@ static void amd_e400_idle(void) } /* - * Intel Core2 and older machines prefer MWAIT over HALT for C1. - * We can't rely on cpuidle installing MWAIT, because it will not load - * on systems that support only C1 -- so the boot default must be MWAIT. + * Prefer MWAIT over HALT if MWAIT is supported, MWAIT_CPUID leaf + * exists and whenever MONITOR/MWAIT extensions are present there is at + * least one C1 substate. * - * Some AMD machines are the opposite, they depend on using HALT. - * - * So for default C1, which is used during boot until cpuidle loads, - * use MWAIT-C1 on Intel HW that has it, else use HALT. + * Do not prefer MWAIT if MONITOR instruction has a bug or idle=nomwait + * is passed to kernel commandline parameter. */ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) { - if (c->x86_vendor != X86_VENDOR_INTEL) + u32 eax, ebx, ecx, edx; + + /* User has disallowed the use of MWAIT. Fallback to HALT */ + if (boot_option_idle_override == IDLE_NOMWAIT) return 0; - if (!cpu_has(c, X86_FEATURE_MWAIT) || boot_cpu_has_bug(X86_BUG_MONITOR)) + /* MWAIT is not supported on this platform. Fallback to HALT */ + if (!cpu_has(c, X86_FEATURE_MWAIT)) return 0; - return 1; + /* Monitor has a bug. Fallback to HALT */ + if (boot_cpu_has_bug(X86_BUG_MONITOR)) + return 0; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + + /* + * If MWAIT extensions are not available, it is safe to use MWAIT + * with EAX=0, ECX=0. + */ + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) + return 1; + + /* + * If MWAIT extensions are available, there should be least one + * MWAIT C1 substate present. 
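+ * (Editorial illustration, not part of the original patch: CPUID leaf 5
+ * reports the number of MWAIT C1 sub-states in EDX bits 7:4, so a CPU
+ * returning e.g. EDX=0x0220 gives a nonzero edx & MWAIT_C1_SUBSTATE_MASK
+ * (0xf0) and MWAIT is preferred over HALT.)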
+ */ + return (edx & MWAIT_C1_SUBSTATE_MASK); } /* diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index c3038cdc6865..25bf88b571ae 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -72,7 +72,7 @@ config CPU_FREQ_DEFAULT_GOV_USERSPACE config CPU_FREQ_DEFAULT_GOV_ONDEMAND bool "ondemand" - depends on !(X86_INTEL_PSTATE && SMP) + depends on SMP select CPU_FREQ_GOV_ONDEMAND select CPU_FREQ_GOV_PERFORMANCE help diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 6a41ea4729b8..370af073c27f 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -18,10 +18,10 @@ #include "cpufreq_ondemand.h" /* On-demand governor macros */ -#define DEF_FREQUENCY_UP_THRESHOLD (80) -#define DEF_SAMPLING_DOWN_FACTOR (1) +#define DEF_FREQUENCY_UP_THRESHOLD (65) +#define DEF_SAMPLING_DOWN_FACTOR (10) #define MAX_SAMPLING_DOWN_FACTOR (100000) -#define MICRO_FREQUENCY_UP_THRESHOLD (95) +#define MICRO_FREQUENCY_UP_THRESHOLD (85) #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) #define MIN_FREQUENCY_UP_THRESHOLD (1) #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index 54752c85604b..6fa8de3ce7aa 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -45,6 +45,10 @@ * Module parameters. */ +static unsigned int hid_poll_interval; +module_param_named(poll, hid_poll_interval, uint, 0644); +MODULE_PARM_DESC(poll, "Polling interval of any device"); + static unsigned int hid_mousepoll_interval; module_param_named(mousepoll, hid_mousepoll_interval, uint, 0644); MODULE_PARM_DESC(mousepoll, "Polling interval of mice"); @@ -1107,6 +1111,10 @@ static int usbhid_start(struct hid_device *hid) hid->name, endpoint->bInterval, interval); } + /* First set polling interval for all devices */ + if (hid_poll_interval > 0) + interval = hid_poll_interval; + /* Change the polling interval of mice, joysticks * and keyboards. 
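+ * (Editorial usage note, not part of the original patch: the new "poll"
+ * parameter above can be set as usbhid.poll=<interval> on the kernel
+ * command line or via "options usbhid poll=<interval>" in modprobe.d;
+ * the existing per-device-type parameters such as mousepoll, handled
+ * below, still take precedence when set.)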
*/ diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index 95f90699d2b1..2b10fe29d2c8 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -46,6 +46,7 @@ struct evdev_client { struct fasync_struct *fasync; struct evdev *evdev; struct list_head node; + struct rcu_head rcu; enum input_clock_type clk_type; bool revoked; unsigned long *evmasks[EV_CNT]; @@ -377,13 +378,22 @@ static void evdev_attach_client(struct evdev *evdev, spin_unlock(&evdev->client_lock); } +static void evdev_reclaim_client(struct rcu_head *rp) +{ + struct evdev_client *client = container_of(rp, struct evdev_client, rcu); + unsigned int i; + for (i = 0; i < EV_CNT; ++i) + bitmap_free(client->evmasks[i]); + kvfree(client); +} + static void evdev_detach_client(struct evdev *evdev, struct evdev_client *client) { spin_lock(&evdev->client_lock); list_del_rcu(&client->node); spin_unlock(&evdev->client_lock); - synchronize_rcu(); + call_rcu(&client->rcu, evdev_reclaim_client); } static int evdev_open_device(struct evdev *evdev) @@ -436,7 +446,6 @@ static int evdev_release(struct inode *inode, struct file *file) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; - unsigned int i; mutex_lock(&evdev->mutex); @@ -448,11 +457,6 @@ static int evdev_release(struct inode *inode, struct file *file) evdev_detach_client(evdev, client); - for (i = 0; i < EV_CNT; ++i) - bitmap_free(client->evmasks[i]); - - kvfree(client); - evdev_close_device(evdev); return 0; @@ -495,7 +499,6 @@ static int evdev_open(struct inode *inode, struct file *file) err_free_client: evdev_detach_client(evdev, client); - kvfree(client); return error; } diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 973fd731a520..4fd8f258972a 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -45,7 +45,7 @@ extern unsigned int pageblock_order; #else /* CONFIG_HUGETLB_PAGE */ /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ -#define pageblock_order (MAX_ORDER-1) +#define pageblock_order PAGE_ALLOC_COSTLY_ORDER #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 270bf5136c34..13207a15b326 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1024,7 +1024,7 @@ struct readahead_control { ._index = i, \ } -#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) +#define VM_READAHEAD_PAGES (SZ_8M / PAGE_SIZE) void page_cache_ra_unbounded(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_count); diff --git a/init/Kconfig b/init/Kconfig index e9119bf54b1f..fd593d6221b9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -107,6 +107,10 @@ config THREAD_INFO_IN_TASK menu "General setup" +config CACHY + bool "Some kernel tweaks by CachyOS" + default y + config BROKEN bool diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 38ef6d06888e..0575979188c9 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -40,6 +40,27 @@ choice on SMP and NUMA systems and exactly dividing by both PAL and NTSC frame rates for video and multimedia work. + config HZ_500 + bool "500 HZ" + help + 500 Hz is a balanced timer frequency. Provides fast interactivity + on desktops with good smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + + config HZ_600 + bool "600 HZ" + help + 600 Hz is a balanced timer frequency. 
Provides fast interactivity + on desktops with good smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + + config HZ_750 + bool "750 HZ" + help + 750 Hz is a balanced timer frequency. Provides fast interactivity + on desktops with good smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + config HZ_1000 bool "1000 HZ" help @@ -53,6 +74,9 @@ config HZ default 100 if HZ_100 default 250 if HZ_250 default 300 if HZ_300 + default 500 if HZ_500 + default 600 if HZ_600 + default 750 if HZ_750 default 1000 if HZ_1000 config SCHED_HRTICK diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index bf8e341e75b4..746a668bf81d 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -223,6 +223,19 @@ config RCU_NOCB_CPU Say Y here if you need reduced OS jitter, despite added overhead. Say N here if you are unsure. +config RCU_NOCB_CPU_DEFAULT_ALL + bool "Offload RCU callback processing from all CPUs by default" + depends on RCU_NOCB_CPU + default n + help + Use this option to offload callback processing from all CPUs + by default, in the absence of the rcu_nocbs or nohz_full boot + parameter. This also avoids the need to use any boot parameters + to achieve the effect of offloading all CPUs on boot. + + Say Y here if you want offload all CPUs by default on boot. + Say N here if you are unsure. + config TASKS_TRACE_RCU_READ_MB bool "Tasks Trace RCU readers use memory barriers in user and idle" depends on RCU_EXPERT diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index eeafb546a7a0..f648f773600a 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1166,11 +1166,21 @@ void __init rcu_init_nohz(void) { int cpu; bool need_rcu_nocb_mask = false; + bool offload_all = false; struct rcu_data *rdp; +#if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) + if (!rcu_nocb_is_setup) { + need_rcu_nocb_mask = true; + offload_all = true; + } +#endif + #if defined(CONFIG_NO_HZ_FULL) - if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) + if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) { need_rcu_nocb_mask = true; + offload_all = false; /* NO_HZ_FULL has its own mask. */ + } #endif /* #if defined(CONFIG_NO_HZ_FULL) */ if (need_rcu_nocb_mask) { @@ -1191,6 +1201,9 @@ void __init rcu_init_nohz(void) cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); #endif /* #if defined(CONFIG_NO_HZ_FULL) */ + if (offload_all) + cpumask_setall(rcu_nocb_mask); + if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); cpumask_and(rcu_nocb_mask, cpu_possible_mask, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1eec4925b8c6..e12777a3ed4e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -384,8 +384,11 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } * part of the period that we allow rt tasks to run in us. 
* default: 0.95s */ +#ifdef CONFIG_CACHY +int sysctl_sched_rt_runtime = 980000; +#else int sysctl_sched_rt_runtime = 950000; - +#endif /* * Serialization rules: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2f461f059278..ca48e8b59fd1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -35,9 +35,13 @@ * * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) */ +#ifdef CONFIG_CACHY +unsigned int sysctl_sched_latency = 3000000ULL; +static unsigned int normalized_sysctl_sched_latency = 4000000ULL; +#else unsigned int sysctl_sched_latency = 6000000ULL; static unsigned int normalized_sysctl_sched_latency = 6000000ULL; - +#endif /* * The initial- and re-scaling of tunables is configurable * @@ -56,8 +60,13 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; * * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ +#ifdef CONFIG_CACHY +unsigned int sysctl_sched_min_granularity = 400000ULL; +static unsigned int normalized_sysctl_sched_min_granularity = 500000ULL; +#else unsigned int sysctl_sched_min_granularity = 750000ULL; static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; +#endif /* * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks. @@ -87,8 +96,13 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; * * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) */ +#ifdef CONFIG_CACHY +unsigned int sysctl_sched_wakeup_granularity = 500000UL; +static unsigned int normalized_sysctl_sched_wakeup_granularity = 800000UL; +#else unsigned int sysctl_sched_wakeup_granularity = 1000000UL; static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; +#endif const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -141,8 +155,12 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ +#ifdef CONFIG_CACHY +unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; +#else unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +#endif static inline void update_load_add(struct load_weight *lw, unsigned long inc) { diff --git a/mm/compaction.c b/mm/compaction.c index b4e94cda3019..16b3f41c07bf 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1699,7 +1699,7 @@ typedef enum { * Allow userspace to control policy on scanning the unevictable LRU for * compactable pages. */ -#ifdef CONFIG_PREEMPT_RT +#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_CACHY) int sysctl_compact_unevictable_allowed __read_mostly = 0; #else int sysctl_compact_unevictable_allowed __read_mostly = 1; @@ -2705,7 +2705,11 @@ static void compact_nodes(void) * aggressively the kernel should compact memory in the * background. It takes values in the range [0, 100]. 
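+ * (Editorial note, not part of the original patch's wording: with
+ * CONFIG_CACHY the declaration below is left zero-initialized, so
+ * proactive compaction is disabled by default.)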
*/ +#ifdef CONFIG_CACHY +unsigned int __read_mostly sysctl_compaction_proactiveness; +#else unsigned int __read_mostly sysctl_compaction_proactiveness = 20; +#endif int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) diff --git a/mm/internal.h b/mm/internal.h index d80300392a19..96aa50e46119 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -248,6 +248,7 @@ extern void prep_compound_page(struct page *page, unsigned int order); extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; +extern atomic_long_t kswapd_waiters; extern void free_unref_page(struct page *page, unsigned int order); extern void free_unref_page_list(struct list_head *list); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 91d163f8d36b..1b225c38cbf3 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -70,7 +70,11 @@ static long ratelimit_pages = 32; /* * Start background writeback (via writeback threads) at this percentage */ +#ifdef CONFIG_CACHY +int dirty_background_ratio = 5; +#else int dirty_background_ratio = 10; +#endif /* * dirty_background_bytes starts at 0 (disabled) so that it is a function of @@ -98,7 +102,11 @@ unsigned long vm_dirty_bytes; /* * The interval between `kupdate'-style writebacks */ +#ifdef CONFIG_CACHY +unsigned int dirty_writeback_interval = 10 * 100; /* centiseconds */ +#else unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ +#endif EXPORT_SYMBOL_GPL(dirty_writeback_interval); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e6f211dcf82e..063c5046f5bf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -121,6 +121,8 @@ typedef int __bitwise fpi_t; */ #define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2)) +atomic_long_t kswapd_waiters = ATOMIC_LONG_INIT(0); + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) @@ -341,7 +343,7 @@ compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; -int watermark_boost_factor __read_mostly = 15000; +int watermark_boost_factor __read_mostly; int watermark_scale_factor = 10; static unsigned long nr_kernel_pages __initdata; @@ -3016,15 +3018,16 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, } /* - * Obtain a specified number of elements from the buddy allocator, all under - * a single hold of the lock, for efficiency. Add them to the supplied list. - * Returns the number of new pages which were placed at *list. + * Obtain a specified number of elements from the buddy allocator, and relax the + * zone lock when needed. Add them to the supplied list. Returns the number of + * new pages which were placed at *list. 
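+ * (Editorial note, not part of the original patch's wording: the hunk
+ * below periodically drops zone->lock, resyncs NR_FREE_PAGES for the
+ * pages taken so far, and calls cond_resched() when rescheduling is
+ * needed or the lock is contended, before reacquiring the lock.)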
*/ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, unsigned int alloc_flags) { - int i, allocated = 0; + const bool can_resched = !preempt_count() && !irqs_disabled(); + int i, allocated = 0, last_mod = 0; /* * local_lock_irq held so equivalent to spin_lock_irqsave for @@ -3037,6 +3040,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, if (unlikely(page == NULL)) break; + /* Reschedule and ease the contention on the lock if needed */ + if (i + 1 < count && ((can_resched && need_resched()) || + spin_needbreak(&zone->lock))) { + __mod_zone_page_state(zone, NR_FREE_PAGES, + -((i + 1 - last_mod) << order)); + last_mod = i + 1; + spin_unlock(&zone->lock); + if (can_resched) + cond_resched(); + spin_lock(&zone->lock); + } + if (unlikely(check_pcp_refill(page))) continue; @@ -3063,7 +3078,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * on i. Do not confuse with 'allocated' which is the number of * pages added to the pcp list. */ - __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); + __mod_zone_page_state(zone, NR_FREE_PAGES, -((i - last_mod) << order)); spin_unlock(&zone->lock); return allocated; } @@ -4878,6 +4893,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int no_progress_loops; unsigned int cpuset_mems_cookie; int reserve_flags; + bool woke_kswapd = false; /* * We also sanity check to catch abuse of atomic reserves being used by @@ -4924,8 +4940,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; } - if (alloc_flags & ALLOC_KSWAPD) + if (alloc_flags & ALLOC_KSWAPD) { + if (!woke_kswapd) { + atomic_long_inc(&kswapd_waiters); + woke_kswapd = true; + } wake_all_kswapds(order, gfp_mask, ac); + } /* * The adjusted alloc_flags might result in immediate success, so try @@ -5130,9 +5151,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto retry; } fail: - warn_alloc(gfp_mask, ac->nodemask, - "page allocation failure: order:%u", order); got_pg: + if (woke_kswapd) + atomic_long_dec(&kswapd_waiters); + if (!page) + warn_alloc(gfp_mask, ac->nodemask, + "page allocation failure: order:%u", order); return page; } diff --git a/mm/vmpressure.c b/mm/vmpressure.c index b52644771cc4..11a4b0e3b583 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -43,7 +43,11 @@ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; * essence, they are percents: the higher the value, the more number * unsuccessful reclaims there were. */ +#ifdef CONFIG_CACHY +static const unsigned int vmpressure_level_med = 65; +#else static const unsigned int vmpressure_level_med = 60; +#endif static const unsigned int vmpressure_level_critical = 95; /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 59b14e0d696c..a2c4f025cab1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -174,7 +174,11 @@ struct scan_control { /* * From 0 .. 200. Higher means more swappy. 
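+ * (Editorial note, not part of the original patch's wording: CONFIG_CACHY
+ * builds lower the default below from 60 to 20.)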
*/ +#ifdef CONFIG_CACHY +int vm_swappiness = 20; +#else int vm_swappiness = 60; +#endif static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) @@ -3658,7 +3662,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, return 0; } -static bool allow_direct_reclaim(pg_data_t *pgdat) +static bool allow_direct_reclaim(pg_data_t *pgdat, bool using_kswapd) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; @@ -3687,6 +3691,10 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) wmark_ok = free_pages > pfmemalloc_reserve / 2; + /* The throttled direct reclaimer is now a kswapd waiter */ + if (unlikely(!using_kswapd && !wmark_ok)) + atomic_long_inc(&kswapd_waiters); + /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) @@ -3752,7 +3760,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; - if (allow_direct_reclaim(pgdat)) + if (allow_direct_reclaim(pgdat, gfp_mask & __GFP_KSWAPD_RECLAIM)) goto out; break; } @@ -3774,11 +3782,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, */ if (!(gfp_mask & __GFP_FS)) wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, - allow_direct_reclaim(pgdat), HZ); + allow_direct_reclaim(pgdat, true), HZ); else /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - allow_direct_reclaim(pgdat)); + allow_direct_reclaim(pgdat, true)); + + if (unlikely(!(gfp_mask & __GFP_KSWAPD_RECLAIM))) + atomic_long_dec(&kswapd_waiters); if (fatal_signal_pending(current)) return true; @@ -4268,14 +4279,15 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) * able to safely make forward progress. Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && - allow_direct_reclaim(pgdat)) + allow_direct_reclaim(pgdat, true)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ __fs_reclaim_release(_THIS_IP_); ret = try_to_freeze(); __fs_reclaim_acquire(_THIS_IP_); - if (ret || kthread_should_stop()) + if (ret || kthread_should_stop() || + !atomic_long_read(&kswapd_waiters)) break; /* diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 79be57fdd32a..291845c6d1c4 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -202,6 +202,10 @@ endif part-of-module = $(if $(filter $(basename $@).o, $(real-obj-m)),y) quiet_modtag = $(if $(part-of-module),[M], ) +ifdef CONFIG_LTO_CLANG_THIN +KBUILD_LDFLAGS += --thinlto-cache-dir=$(thinlto-dir)$(extmod-prefix).thinlto-cache +endif + modkern_cflags = \ $(if $(part-of-module), \ $(KBUILD_CFLAGS_MODULE) $(CFLAGS_MODULE), \ diff --git a/scripts/Makefile.modfinal b/scripts/Makefile.modfinal index 7f39599e9fae..c7c407993966 100644 --- a/scripts/Makefile.modfinal +++ b/scripts/Makefile.modfinal @@ -54,6 +54,10 @@ if_changed_except = $(if $(call newer_prereqs_except,$(2))$(cmd-check), \ $(cmd); \ printf '%s\n' 'cmd_$@ := $(make-cmd)' > $(dot-target).cmd, @:) +ifdef CONFIG_LTO_CLANG_THIN +KBUILD_LDFLAGS += --thinlto-cache-dir=$(thinlto-dir)$(extmod-prefix).thinlto-cache +endif # CONFIG_LTO_CLANG_THIN + # Re-generate module BTFs if either module's .ko or vmlinux changed $(modules): %.ko: %$(mod-prelink-ext).o %.mod.o scripts/module.lds $(if $(KBUILD_BUILTIN),vmlinux) FORCE -- 2.36.1
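
A minimal illustrative config fragment (an editorial addendum, not part of the
patch itself) exercising the options this patch introduces; it assumes the
corresponding base features (Clang ThinLTO and RCU_NOCB_CPU) are already
enabled in the kernel configuration:

    # illustrative only; adjust to the target system
    CONFIG_CACHY=y
    CONFIG_HZ_750=y
    CONFIG_RCU_NOCB_CPU_DEFAULT_ALL=y
    CONFIG_LTO_CLANG_THIN_CACHEDIR="/tmp"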