From 022403b26fa4b198af67f978ab72c9f7a1fb98c0 Mon Sep 17 00:00:00 2001 From: "P. Jung" Date: Wed, 17 Nov 2021 18:21:34 +0000 Subject: [PATCH] mm-lru Signed-off-by: P. Jung --- Documentation/admin-guide/sysctl/vm.rst | 66 + Documentation/vm/index.rst | 1 + Documentation/vm/multigen_lru.rst | 132 + arch/Kconfig | 9 + arch/arm64/include/asm/cpufeature.h | 5 + arch/arm64/include/asm/pgtable.h | 13 +- arch/arm64/kernel/cpufeature.c | 10 + arch/arm64/tools/cpucaps | 1 + arch/x86/Kconfig | 1 + arch/x86/include/asm/pgtable.h | 9 +- arch/x86/mm/pgtable.c | 5 +- fs/exec.c | 2 + fs/fuse/dev.c | 3 +- include/linux/cgroup.h | 15 +- include/linux/memcontrol.h | 7 + include/linux/mm.h | 40 + include/linux/mm_inline.h | 198 ++ include/linux/mm_types.h | 106 + include/linux/mmzone.h | 175 ++ include/linux/nodemask.h | 1 + include/linux/oom.h | 16 + include/linux/page-flags-layout.h | 19 +- include/linux/page-flags.h | 4 +- include/linux/pgtable.h | 17 +- include/linux/sched.h | 3 + include/linux/swap.h | 3 + kernel/bounds.c | 3 + kernel/cgroup/cgroup-internal.h | 1 - kernel/exit.c | 1 + kernel/fork.c | 10 + kernel/kthread.c | 1 + kernel/sched/core.c | 2 + kernel/sysctl.c | 21 + mm/Kconfig | 122 + mm/huge_memory.c | 3 +- mm/memcontrol.c | 31 + mm/memory.c | 21 +- mm/mm_init.c | 6 +- mm/oom_kill.c | 4 +- mm/page_alloc.c | 1 + mm/rmap.c | 8 + mm/swap.c | 51 +- mm/swapfile.c | 2 + mm/vmscan.c | 2992 +++++++++++++++++++++-- mm/workingset.c | 120 +- 45 files changed, 4026 insertions(+), 235 deletions(-) create mode 100644 Documentation/vm/multigen_lru.rst diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 5e795202111f..2f606e23bbed 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -25,6 +25,9 @@ files can be found in mm/swap.c. 
Currently, these files are in /proc/sys/vm: - admin_reserve_kbytes +- anon_min_kbytes +- clean_low_kbytes +- clean_min_kbytes - compact_memory - compaction_proactiveness - compact_unevictable_allowed @@ -105,6 +108,61 @@ On x86_64 this is about 128MB. Changing this takes effect whenever an application requests memory. +anon_min_kbytes +=============== + +This knob provides *hard* protection of anonymous pages. The anonymous pages +on the current node won't be reclaimed under any conditions when their amount +is below vm.anon_min_kbytes. + +This knob may be used to prevent excessive swap thrashing when anonymous +memory is low (for example, when memory is going to be overfilled by +compressed data of zram module). + +Setting this value too high (close to MemTotal) can result in inability to +swap and can lead to early OOM under memory pressure. + +The default value is defined by CONFIG_ANON_MIN_KBYTES. + + +clean_low_kbytes +================ + +This knob provides *best-effort* protection of clean file pages. The file pages +on the current node won't be reclaimed under memory pressure when the amount of +clean file pages is below vm.clean_low_kbytes *unless* we threaten to OOM. + +Protection of clean file pages using this knob may be used when swapping is +still possible to + - prevent disk I/O thrashing under memory pressure; + - improve performance in disk cache-bound tasks under memory pressure. + +Setting it to a high value may result in an early eviction of anonymous pages +into the swap space by attempting to hold the protected amount of clean file +pages in memory. + +The default value is defined by CONFIG_CLEAN_LOW_KBYTES. + + +clean_min_kbytes +================ + +This knob provides *hard* protection of clean file pages. The file pages on the +current node won't be reclaimed under memory pressure when the amount of clean +file pages is below vm.clean_min_kbytes. 
+ +Hard protection of clean file pages using this knob may be used to + - prevent disk I/O thrashing under memory pressure even with no free swap space; + - improve performance in disk cache-bound tasks under memory pressure; + - avoid high latency and prevent livelock in near-OOM conditions. + +Setting it to a high value may result in an early out-of-memory condition due to +the inability to reclaim the protected amount of clean file pages when other +types of pages cannot be reclaimed. + +The default value is defined by CONFIG_CLEAN_MIN_KBYTES. + + compact_memory ============== @@ -864,6 +922,14 @@ be 133 (x + 2x = 200, 2x = 133.33). At 0, the kernel will not initiate swap until the amount of free and file-backed pages is less than the high watermark in a zone. +This knob has no effect if the amount of clean file pages on the current +node is below vm.clean_low_kbytes or vm.clean_min_kbytes. In this case, +only anonymous pages can be reclaimed. + +If the number of anonymous pages on the current node is below +vm.anon_min_kbytes, then only file pages can be reclaimed with +any vm.swappiness value. + unprivileged_userfaultfd ======================== diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst index b51f0d8992f8..779772a025a0 100644 --- a/Documentation/vm/index.rst +++ b/Documentation/vm/index.rst @@ -17,6 +17,7 @@ various features of the Linux memory management swap_numa zswap + multigen_lru Kernel developers MM documentation ================================== diff --git a/Documentation/vm/multigen_lru.rst b/Documentation/vm/multigen_lru.rst new file mode 100644 index 000000000000..7c064a378b85 --- /dev/null +++ b/Documentation/vm/multigen_lru.rst @@ -0,0 +1,132 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +Multigenerational LRU +===================== + +Quick Start +=========== +Build Configurations +-------------------- +:Required: Set ``CONFIG_LRU_GEN=y``. 
+ +:Optional: Set ``CONFIG_LRU_GEN_ENABLED=y`` to turn the feature on by + default. + +Runtime Configurations +---------------------- +:Required: Write ``1`` to ``/sys/kernel/mm/lru_gen/enable`` if the + feature was not turned on by default. + +:Optional: Write ``N`` to ``/sys/kernel/mm/lru_gen/min_ttl_ms`` to + protect the working set of ``N`` milliseconds. The OOM killer is + invoked if this working set cannot be kept in memory. + +:Optional: Read ``/sys/kernel/debug/lru_gen`` to confirm the feature + is turned on. This file has the following output: + +:: + + memcg memcg_id memcg_path + node node_id + min_gen birth_time anon_size file_size + ... + max_gen birth_time anon_size file_size + +``min_gen`` is the oldest generation number and ``max_gen`` is the +youngest generation number. ``birth_time`` is in milliseconds. +``anon_size`` and ``file_size`` are in pages. + +Phones/Laptops/Workstations +--------------------------- +No additional configurations required. + +Servers/Data Centers +-------------------- +:To support more generations: Change ``CONFIG_NR_LRU_GENS`` to a + larger number. + +:To support more tiers: Change ``CONFIG_TIERS_PER_GEN`` to a larger + number. + +:To support full stats: Set ``CONFIG_LRU_GEN_STATS=y``. + +:Working set estimation: Write ``+ memcg_id node_id max_gen + [swappiness] [use_bloom_filter]`` to ``/sys/kernel/debug/lru_gen`` to + invoke the aging, which scans PTEs for accessed pages and then + creates the next generation ``max_gen+1``. A swap file and a non-zero + ``swappiness``, which overrides ``vm.swappiness``, are required to + scan PTEs mapping anon pages. Set ``use_bloom_filter`` to 0 to + override the default behavior which only scans PTE tables found + populated. + +:Proactive reclaim: Write ``- memcg_id node_id min_gen [swappiness] + [nr_to_reclaim]`` to ``/sys/kernel/debug/lru_gen`` to invoke the + eviction, which evicts generations less than or equal to ``min_gen``. 
+ ``min_gen`` should be less than ``max_gen-1`` as ``max_gen`` and + ``max_gen-1`` are not fully aged and therefore cannot be evicted. + Use ``nr_to_reclaim`` to limit the number of pages to evict. Multiple + command lines are supported, as is concatenation with delimiters + ``,`` and ``;``. + +Framework +========= +For each ``lruvec``, evictable pages are divided into multiple +generations. The youngest generation number is stored in +``lrugen->max_seq`` for both anon and file types as they are aged on +an equal footing. The oldest generation numbers are stored in +``lrugen->min_seq[]`` separately for anon and file types as clean +file pages can be evicted regardless of swap and writeback +constraints. These three variables are monotonically increasing. +Generation numbers are truncated into +``order_base_2(CONFIG_NR_LRU_GENS+1)`` bits in order to fit into +``page->flags``. The sliding window technique is used to prevent +truncated generation numbers from overlapping. Each truncated +generation number is an index to an array of per-type and per-zone +lists ``lrugen->lists``. + +Each generation is divided into multiple tiers. Tiers represent +different ranges of numbers of accesses from file descriptors only. +Pages accessed ``N`` times via file descriptors belong to tier +``order_base_2(N)``. Each generation contains at most +``CONFIG_TIERS_PER_GEN`` tiers, and they require additional +``CONFIG_TIERS_PER_GEN-2`` bits in ``page->flags``. In contrast to +moving between generations which requires list operations, moving +between tiers only involves operations on ``page->flags`` and +therefore has a negligible cost. A feedback loop modeled after the PID +controller monitors refaulted % across all tiers and decides when to +protect pages from which tiers. + +The framework comprises two conceptually independent components: the +aging and the eviction, which can be invoked separately from user +space for the purpose of working set estimation and proactive reclaim. 
+ +Aging +----- +The aging produces young generations. Given an ``lruvec``, the aging +traverses ``lruvec_memcg()->mm_list`` and calls ``walk_page_range()`` +to scan PTEs for accessed pages (a ``mm_struct`` list is maintained +for each ``memcg``). Upon finding one, the aging updates its +generation number to ``max_seq`` (modulo ``CONFIG_NR_LRU_GENS``). +After each round of traversal, the aging increments ``max_seq``. The +aging is due when ``min_seq[]`` reaches ``max_seq-1``. + +Eviction +-------- +The eviction consumes old generations. Given an ``lruvec``, the +eviction scans pages on the per-zone lists indexed by anon and file +``min_seq[]`` (modulo ``CONFIG_NR_LRU_GENS``). It first tries to +select a type based on the values of ``min_seq[]``. If they are +equal, it selects the type that has a lower refaulted %. The eviction +sorts a page according to its updated generation number if the aging +has found this page accessed. It also moves a page to the next +generation if this page is from an upper tier that has a higher +refaulted % than the base tier. The eviction increments ``min_seq[]`` +of a selected type when it finds all the per-zone lists indexed by +``min_seq[]`` of this selected type are empty. + +To-do List +========== +KVM Optimization +---------------- +Support shadow page table walk. diff --git a/arch/Kconfig b/arch/Kconfig index d1e69d6e8498..ec01a6d69a7d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1291,6 +1291,15 @@ config ARCH_HAS_ELFCORE_COMPAT config ARCH_HAS_PARANOID_L1D_FLUSH bool +config ARCH_HAS_NONLEAF_PMD_YOUNG + bool + depends on PGTABLE_LEVELS > 2 + help + Architectures that select this are able to set the accessed bit on + non-leaf PMD entries in addition to leaf PTE entries where pages are + mapped. For them, page table walkers that clear the accessed bit may + stop at non-leaf PMD entries if they do not see the accessed bit. 
+ source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index ef6be92b1921..99518b4b2a9e 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -779,6 +779,11 @@ static inline bool system_supports_tlb_range(void) cpus_have_const_cap(ARM64_HAS_TLB_RANGE); } +static inline bool system_has_hw_af(void) +{ + return IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && cpus_have_const_cap(ARM64_HW_AF); +} + extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 72f95c6a7051..a7a23e589d5f 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -999,13 +999,16 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, * page after fork() + CoW for pfn mappings. We don't always have a * hardware-managed access flag on arm64. 
*/ -static inline bool arch_faults_on_old_pte(void) +static inline bool arch_has_hw_pte_young(bool local) { - WARN_ON(preemptible()); + if (local) { + WARN_ON(preemptible()); + return cpu_has_hw_af(); + } - return !cpu_has_hw_af(); + return system_has_hw_af(); } -#define arch_faults_on_old_pte arch_faults_on_old_pte +#define arch_has_hw_pte_young arch_has_hw_pte_young /* * Experimentally, it's cheap to set the access flag in hardware and we @@ -1013,7 +1016,7 @@ static inline bool arch_faults_on_old_pte(void) */ static inline bool arch_wants_old_prefaulted_pte(void) { - return !arch_faults_on_old_pte(); + return arch_has_hw_pte_young(true); } #define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 7553c98f379f..bafd0728e96a 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2161,6 +2161,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_hw_dbm, .cpu_enable = cpu_enable_hw_dbm, }, + { + .desc = "Hardware update of the Access flag", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HW_AF, + .sys_reg = SYS_ID_AA64MMFR1_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64MMFR1_HADBS_SHIFT, + .min_field_value = 1, + .matches = has_cpuid_feature, + }, #endif { .desc = "CRC32 instructions", diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 49305c2e6dfd..d52f50671e60 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -35,6 +35,7 @@ HAS_STAGE2_FWB HAS_SYSREG_GIC_CPUIF HAS_TLB_RANGE HAS_VIRT_HOST_EXTN +HW_AF HW_DBM KVM_PROTECTED_MODE MISMATCHED_CACHE_TYPE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ab9a4cbda78b..a53d5f68de41 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -84,6 +84,7 @@ config X86 select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_PTE_DEVMAP if X86_64 select ARCH_HAS_PTE_SPECIAL + select ARCH_HAS_NONLEAF_PMD_YOUNG if X86_64 
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_COPY_MC if X86_64 select ARCH_HAS_SET_MEMORY diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 448cd01eb3ec..36205ec0acac 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -817,7 +817,8 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) static inline int pmd_bad(pmd_t pmd) { - return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; + return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) != + (_KERNPG_TABLE & ~_PAGE_ACCESSED); } static inline unsigned long pages_to_mb(unsigned long npg) @@ -1397,10 +1398,10 @@ static inline bool arch_has_pfn_modify_check(void) return boot_cpu_has_bug(X86_BUG_L1TF); } -#define arch_faults_on_old_pte arch_faults_on_old_pte -static inline bool arch_faults_on_old_pte(void) +#define arch_has_hw_pte_young arch_has_hw_pte_young +static inline bool arch_has_hw_pte_young(bool local) { - return false; + return true; } #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3481b35cb4ec..a224193d84bf 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, return ret; } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { @@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, return ret; } +#endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE int pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp) { diff --git a/fs/exec.c b/fs/exec.c index a098c133d8d7..c7e55b757e87 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1007,6 +1007,7 @@ static int exec_mmap(struct mm_struct *mm) active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; + lru_gen_add_mm(mm); /* * This prevents preemption 
while active_mm is being loaded and * it and mm are being updated, which could cause problems for @@ -1017,6 +1018,7 @@ static int exec_mmap(struct mm_struct *mm) if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); activate_mm(active_mm, mm); + lru_gen_switch_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); tsk->mm->vmacache_seqnum = 0; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5a1f142bdb48..ae8960675911 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -785,7 +785,8 @@ static int fuse_check_page(struct page *page) 1 << PG_active | 1 << PG_workingset | 1 << PG_reclaim | - 1 << PG_waiters))) { + 1 << PG_waiters | + LRU_GEN_MASK | LRU_REFS_MASK))) { dump_page(page, "fuse: trying to steal weird page"); return 1; } diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 75c151413fda..b145025f3eac 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp) css_put(&cgrp->self); } +extern struct mutex cgroup_mutex; + +static inline void cgroup_lock(void) +{ + mutex_lock(&cgroup_mutex); +} + +static inline void cgroup_unlock(void) +{ + mutex_unlock(&cgroup_mutex); +} + /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for @@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgroup *cgrp) * as locks used during the cgroup_subsys::attach() methods. 
*/ #ifdef CONFIG_PROVE_RCU -extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ @@ -707,6 +718,8 @@ struct cgroup; static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} +static inline void cgroup_lock(void) {} +static inline void cgroup_unlock(void) {} static inline int cgroup_attach_task_all(struct task_struct *from, struct task_struct *t) { return 0; } static inline int cgroupstats_build(struct cgroupstats *stats, diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3096c9a0ee01..e284a4aa1bd8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -347,6 +347,10 @@ struct mem_cgroup { struct deferred_split deferred_split_queue; #endif +#ifdef CONFIG_LRU_GEN + struct lru_gen_mm_list mm_list; +#endif + struct mem_cgroup_per_node *nodeinfo[]; }; @@ -1350,10 +1354,13 @@ mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) static inline void lock_page_memcg(struct page *page) { + /* to match page_memcg_rcu() */ + rcu_read_lock(); } static inline void unlock_page_memcg(struct page *page) { + rcu_read_unlock(); } static inline void mem_cgroup_handle_over_high(void) diff --git a/include/linux/mm.h b/include/linux/mm.h index 73a52aba448f..dcddc7a47242 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -203,6 +203,10 @@ static inline void __mm_zero_struct_page(struct page *page) extern int sysctl_max_map_count; +extern unsigned long sysctl_anon_min_kbytes; +extern unsigned long sysctl_clean_low_kbytes; +extern unsigned long sysctl_clean_min_kbytes; + extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; @@ -1093,6 +1097,8 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) #define 
LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) +#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) +#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) /* * Define the bit shifts to access each section. For non-existent @@ -1800,6 +1806,40 @@ static inline void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { } #endif +#ifdef CONFIG_LRU_GEN +static inline void task_enter_nonseq_fault(void) +{ + WARN_ON(current->in_nonseq_fault); + + current->in_nonseq_fault = 1; +} + +static inline void task_exit_nonseq_fault(void) +{ + WARN_ON(!current->in_nonseq_fault); + + current->in_nonseq_fault = 0; +} + +static inline bool task_in_nonseq_fault(void) +{ + return current->in_nonseq_fault; +} +#else +static inline void task_enter_nonseq_fault(void) +{ +} + +static inline void task_exit_nonseq_fault(void) +{ +} + +static inline bool task_in_nonseq_fault(void) +{ + return false; +} +#endif /* CONFIG_LRU_GEN */ + static inline void unmap_shared_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen) { diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 355ea1ee32bd..7d520f45e612 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -79,11 +79,203 @@ static __always_inline enum lru_list page_lru(struct page *page) return lru; } +#ifdef CONFIG_LRU_GEN + +static inline bool lru_gen_enabled(void) +{ +#ifdef CONFIG_LRU_GEN_ENABLED + DECLARE_STATIC_KEY_TRUE(lru_gen_static_key); + + return static_branch_likely(&lru_gen_static_key); +#else + DECLARE_STATIC_KEY_FALSE(lru_gen_static_key); + + return static_branch_unlikely(&lru_gen_static_key); +#endif +} + +/* Return an index within the sliding window that tracks MAX_NR_GENS generations. 
*/ +static inline int lru_gen_from_seq(unsigned long seq) +{ + return seq % MAX_NR_GENS; +} + +/* Return a proper index regardless whether we keep stats for historical generations. */ +static inline int lru_hist_from_seq(unsigned long seq) +{ + return seq % NR_HIST_GENS; +} + +/* Convert the number of accesses to a tier. See the comment on MAX_NR_TIERS. */ +static inline int lru_tier_from_refs(int refs) +{ + VM_BUG_ON(refs > BIT(LRU_REFS_WIDTH)); + + return order_base_2(refs + 1); +} + +/* The youngest and the second youngest generations are counted as active. */ +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) +{ + unsigned long max_seq = lruvec->evictable.max_seq; + + VM_BUG_ON(gen >= MAX_NR_GENS); + + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); +} + +/* Update the sizes of the multigenerational lru lists. */ +static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec, + int old_gen, int new_gen) +{ + int type = page_is_file_lru(page); + int zone = page_zonenum(page); + int delta = thp_nr_pages(page); + enum lru_list lru = type * LRU_FILE; + struct lrugen *lrugen = &lruvec->evictable; + + lockdep_assert_held(&lruvec->lru_lock); + VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS); + VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS); + VM_BUG_ON(old_gen == -1 && new_gen == -1); + + if (old_gen >= 0) + WRITE_ONCE(lrugen->sizes[old_gen][type][zone], + lrugen->sizes[old_gen][type][zone] - delta); + if (new_gen >= 0) + WRITE_ONCE(lrugen->sizes[new_gen][type][zone], + lrugen->sizes[new_gen][type][zone] + delta); + + if (old_gen < 0) { + if (lru_gen_is_active(lruvec, new_gen)) + lru += LRU_ACTIVE; + update_lru_size(lruvec, lru, zone, delta); + return; + } + + if (new_gen < 0) { + if (lru_gen_is_active(lruvec, old_gen)) + lru += LRU_ACTIVE; + update_lru_size(lruvec, lru, zone, -delta); + return; + } + + if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { + 
update_lru_size(lruvec, lru, zone, -delta); + update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); + } + + VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); +} + +/* Add a page to one of the multigenerational lru lists. Return true on success. */ +static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming) +{ + int gen; + unsigned long old_flags, new_flags; + int type = page_is_file_lru(page); + int zone = page_zonenum(page); + struct lrugen *lrugen = &lruvec->evictable; + + if (PageUnevictable(page) || !lrugen->enabled[type]) + return false; + /* + * If a page shouldn't be considered for eviction, i.e., a page mapped + * upon fault during which the accessed bit is set, add it to the + * youngest generation. + * + * If a page can't be evicted immediately, i.e., an anon page not in + * swap cache or a dirty page pending writeback, add it to the second + * oldest generation. + * + * If a page could be evicted immediately, e.g., a clean page, add it to + * the oldest generation. + */ + if (PageActive(page)) + gen = lru_gen_from_seq(lrugen->max_seq); + else if ((!type && !PageSwapCache(page)) || + (PageReclaim(page) && (PageDirty(page) || PageWriteback(page)))) + gen = lru_gen_from_seq(lrugen->min_seq[type] + 1); + else + gen = lru_gen_from_seq(lrugen->min_seq[type]); + + do { + new_flags = old_flags = READ_ONCE(page->flags); + VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page); + + new_flags &= ~(LRU_GEN_MASK | BIT(PG_active)); + new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; + } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); + + lru_gen_update_size(page, lruvec, -1, gen); + /* for rotate_reclaimable_page() */ + if (reclaiming) + list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]); + else + list_add(&page->lru, &lrugen->lists[gen][type][zone]); + + return true; +} + +/* Delete a page from one of the multigenerational lru lists. Return true on success. 
*/ +static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming) +{ + int gen; + unsigned long old_flags, new_flags; + + do { + new_flags = old_flags = READ_ONCE(page->flags); + if (!(new_flags & LRU_GEN_MASK)) + return false; + + VM_BUG_ON_PAGE(PageActive(page), page); + VM_BUG_ON_PAGE(PageUnevictable(page), page); + + gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + + new_flags &= ~LRU_GEN_MASK; + if ((new_flags & LRU_REFS_FLAGS) != LRU_REFS_FLAGS) + new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); + /* for shrink_page_list() */ + if (reclaiming) + new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim)); + else if (lru_gen_is_active(lruvec, gen)) + new_flags |= BIT(PG_active); + } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); + + lru_gen_update_size(page, lruvec, gen, -1); + list_del(&page->lru); + + return true; +} + +#else + +static inline bool lru_gen_enabled(void) +{ + return false; +} + +static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming) +{ + return false; +} + +static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming) +{ + return false; +} + +#endif /* CONFIG_LRU_GEN */ + static __always_inline void add_page_to_lru_list(struct page *page, struct lruvec *lruvec) { enum lru_list lru = page_lru(page); + if (lru_gen_add_page(page, lruvec, false)) + return; + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); list_add(&page->lru, &lruvec->lists[lru]); } @@ -93,6 +285,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, { enum lru_list lru = page_lru(page); + if (lru_gen_add_page(page, lruvec, true)) + return; + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); list_add_tail(&page->lru, &lruvec->lists[lru]); } @@ -100,6 +295,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, static __always_inline void del_page_from_lru_list(struct 
page *page, struct lruvec *lruvec) { + if (lru_gen_del_page(page, lruvec, false)) + return; + list_del(&page->lru); update_lru_size(lruvec, page_lru(page), page_zonenum(page), -thp_nr_pages(page)); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7f8ee09c711f..a6ca0607c549 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include @@ -580,6 +582,22 @@ struct mm_struct { #ifdef CONFIG_IOMMU_SUPPORT u32 pasid; #endif +#ifdef CONFIG_LRU_GEN + struct { + /* the node of a global or per-memcg mm_struct list */ + struct list_head list; +#ifdef CONFIG_MEMCG + /* points to the memcg of the owner task above */ + struct mem_cgroup *memcg; +#endif + /* whether this mm_struct has been used since the last walk */ + nodemask_t nodes; +#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + /* the number of CPUs using this mm_struct */ + atomic_t nr_cpus; +#endif + } lrugen; +#endif /* CONFIG_LRU_GEN */ } __randomize_layout; /* @@ -606,6 +624,94 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) return (struct cpumask *)&mm->cpu_bitmap; } +#ifdef CONFIG_LRU_GEN + +struct lru_gen_mm_list { + /* a global or per-memcg mm_struct list */ + struct list_head fifo; + /* protects the list above */ + spinlock_t lock; +}; + +void lru_gen_add_mm(struct mm_struct *mm); +void lru_gen_del_mm(struct mm_struct *mm); +#ifdef CONFIG_MEMCG +void lru_gen_migrate_mm(struct mm_struct *mm); +#endif + +static inline void lru_gen_init_mm(struct mm_struct *mm) +{ + INIT_LIST_HEAD(&mm->lrugen.list); +#ifdef CONFIG_MEMCG + mm->lrugen.memcg = NULL; +#endif +#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + atomic_set(&mm->lrugen.nr_cpus, 0); +#endif + nodes_clear(mm->lrugen.nodes); +} + +/* Track the usage of each mm_struct so that we can skip inactive ones. */ +static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new) +{ + /* exclude init_mm, efi_mm, etc. 
*/ + if (!core_kernel_data((unsigned long)old)) { + nodes_setall(old->lrugen.nodes); +#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + atomic_dec(&old->lrugen.nr_cpus); +#endif + } + + if (!core_kernel_data((unsigned long)new)) { + /* unlikely but not a bug when racing with lru_gen_migrate_mm() */ + VM_WARN_ON(list_empty(&new->lrugen.list)); +#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + atomic_inc(&new->lrugen.nr_cpus); +#endif + } +} + +/* Return whether this mm_struct is being used on any CPUs. */ +static inline bool lru_gen_mm_is_active(struct mm_struct *mm) +{ +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + return !cpumask_empty(mm_cpumask(mm)); +#else + return atomic_read(&mm->lrugen.nr_cpus); +#endif +} + +#else /* !CONFIG_LRU_GEN */ + +static inline void lru_gen_add_mm(struct mm_struct *mm) +{ +} + +static inline void lru_gen_del_mm(struct mm_struct *mm) +{ +} + +#ifdef CONFIG_MEMCG +static inline void lru_gen_migrate_mm(struct mm_struct *mm) +{ +} +#endif + +static inline void lru_gen_init_mm(struct mm_struct *mm) +{ +} + +static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new) +{ +} + +static inline bool lru_gen_mm_is_active(struct mm_struct *mm) +{ + return false; +} + +#endif /* CONFIG_LRU_GEN */ + struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6a1d79d84675..a7544dc4c91b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -294,6 +294,172 @@ enum lruvec_flags { */ }; +struct lruvec; +struct page_vma_mapped_walk; + +#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) +#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) + +#ifdef CONFIG_LRU_GEN + +/* + * For each lruvec, evictable pages are divided into multiple generations. 
The + * youngest and the oldest generation numbers, AKA max_seq and min_seq, are + * monotonically increasing. The sliding window technique is used to track at + * least MIN_NR_GENS and at most MAX_NR_GENS generations. An offset within the + * window, AKA gen, indexes an array of per-type and per-zone lists for the + * corresponding generation. The counter in page->flags stores gen+1 while a + * page is on one of the multigenerational lru lists. Otherwise, it stores 0. + * + * After a page is faulted in, the aging must check the accessed bit at least + * twice before the eviction would consider it. The first check clears the + * accessed bit set during the initial fault. The second check makes sure this + * page hasn't been used since then. + */ +#define MIN_NR_GENS 2 +#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) + +/* + * Each generation is divided into multiple tiers. Tiers represent different + * ranges of numbers of accesses from file descriptors, i.e., + * mark_page_accessed(). In contrast to moving between generations which + * requires the lru lock, moving between tiers only involves an atomic + * operation on page->flags and therefore has a negligible cost. + * + * The purposes of tiers are to: + * 1) estimate whether pages accessed multiple times via file descriptors are + * more active than pages accessed only via page tables by separating the two + * access types into upper tiers and the base tier, and comparing refaulted % + * across all tiers. + * 2) improve buffered io performance by deferring the protection of pages + * accessed multiple times until the eviction. That is the protection happens + * in the reclaim path, not the access path. + * + * Pages accessed N times via file descriptors belong to tier order_base_2(N). + * The base tier may be marked by PageReferenced(). All upper tiers are marked + * by PageReferenced() && PageWorkingset(). Additional bits from page->flags are + * used to support more than one upper tier. 
+ */ +#define MAX_NR_TIERS ((unsigned int)CONFIG_TIERS_PER_GEN) +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) + +/* Whether to keep stats for historical generations. */ +#ifdef CONFIG_LRU_GEN_STATS +#define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS) +#else +#define NR_HIST_GENS 1U +#endif + +struct lrugen { + /* the aging increments the max generation number */ + unsigned long max_seq; + /* the eviction increments the min generation numbers */ + unsigned long min_seq[ANON_AND_FILE]; + /* the birth time of each generation in jiffies */ + unsigned long timestamps[MAX_NR_GENS]; + /* the multigenerational lru lists */ + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the sizes of the multigenerational lru lists in pages */ + unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the exponential moving average of refaulted */ + unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; + /* the exponential moving average of protected+evicted */ + unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; + /* the base tier isn't protected, hence the minus one */ + unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; + /* incremented without holding the lru lock */ + atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + /* whether the multigenerational lru is enabled */ + bool enabled[ANON_AND_FILE]; +}; + +enum { + MM_LEAF_TOTAL, /* total leaf entries */ + MM_LEAF_OLD, /* old leaf entries */ + MM_LEAF_YOUNG, /* young leaf entries */ + MM_NONLEAF_TOTAL, /* total non-leaf entries */ + MM_NONLEAF_PREV, /* previously worthy non-leaf entries */ + MM_NONLEAF_CUR, /* currently worthy non-leaf entries */ + NR_MM_STATS +}; + +/* mnemonic codes for the stats above */ +#define MM_STAT_CODES "toydpc" + +/* double buffering bloom filters */ +#define NR_BLOOM_FILTERS 2 + +struct lru_gen_mm_walk { + /* set to max_seq after each 
round of walk */ + unsigned long seq; + /* the next mm_struct on the list to walk */ + struct list_head *head; + /* the first mm_struct never walked before */ + struct list_head *tail; + /* to wait for the last walker to finish */ + struct wait_queue_head wait; + /* bloom filters flip after each round of walk */ + unsigned long *filters[NR_BLOOM_FILTERS]; + /* page table stats for debugging */ + unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; + /* the number of concurrent walkers */ + int nr_walkers; +}; + +#define MIN_BATCH_SIZE 64 +#define MAX_BATCH_SIZE 8192 + +struct mm_walk_args { + struct mem_cgroup *memcg; + unsigned long max_seq; + unsigned long start_pfn; + unsigned long end_pfn; + unsigned long next_addr; + unsigned long bitmap[BITS_TO_LONGS(MIN_BATCH_SIZE)]; + int node_id; + int swappiness; + int batch_size; + int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + int mm_stats[NR_MM_STATS]; + bool use_filter; +}; + +void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); +void lru_gen_change_state(bool enable, bool main, bool swap); +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); + +#ifdef CONFIG_MEMCG +void lru_gen_init_memcg(struct mem_cgroup *memcg); +void lru_gen_free_memcg(struct mem_cgroup *memcg); +#endif + +#else /* !CONFIG_LRU_GEN */ + +static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec) +{ +} + +static inline void lru_gen_change_state(bool enable, bool main, bool swap) +{ +} + +static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +{ +} + +#ifdef CONFIG_MEMCG +static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_free_memcg(struct mem_cgroup *memcg) +{ +} +#endif + +#endif /* CONFIG_LRU_GEN */ + struct lruvec { struct list_head lists[NR_LRU_LISTS]; /* per lruvec lru_lock for memcg */ @@ -311,6 +477,12 @@ struct lruvec { unsigned long refaults[ANON_AND_FILE]; /* Various lruvec state flags (enum lruvec_flags) */ 
unsigned long flags; +#ifdef CONFIG_LRU_GEN + /* unevictable pages are on LRU_UNEVICTABLE */ + struct lrugen evictable; + /* state for mm list and page table walks */ + struct lru_gen_mm_walk mm_walk; +#endif #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif @@ -895,6 +1067,9 @@ typedef struct pglist_data { unsigned long flags; +#ifdef CONFIG_LRU_GEN + struct mm_walk_args mm_walk_args; +#endif ZONE_PADDING(_pad2_) /* Per-node vmstats */ diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 567c3ddba2c4..90840c459abc 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -486,6 +486,7 @@ static inline int num_node_state(enum node_states state) #define first_online_node 0 #define first_memory_node 0 #define next_online_node(nid) (MAX_NUMNODES) +#define next_memory_node(nid) (MAX_NUMNODES) #define nr_node_ids 1U #define nr_online_nodes 1U diff --git a/include/linux/oom.h b/include/linux/oom.h index 2db9a1432511..c4c8c7e71099 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -57,6 +57,22 @@ struct oom_control { extern struct mutex oom_lock; extern struct mutex oom_adj_mutex; +#ifdef CONFIG_MMU +extern struct task_struct *oom_reaper_list; +extern struct wait_queue_head oom_reaper_wait; + +static inline bool oom_reaping_in_progress(void) +{ + /* racy check to see if oom reaping could be in progress */ + return READ_ONCE(oom_reaper_list) || !waitqueue_active(&oom_reaper_wait); +} +#else +static inline bool oom_reaping_in_progress(void) +{ + return false; +} +#endif + static inline void set_current_oom_origin(void) { current->signal->oom_flag_origin = true; diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index ef1e3e736e14..8cdbbdccb5ad 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -26,6 +26,14 @@ #define ZONES_WIDTH ZONES_SHIFT +#ifdef CONFIG_LRU_GEN +/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). 
*/ +#define LRU_REFS_WIDTH (CONFIG_TIERS_PER_GEN - 2) +#else +#define LRU_GEN_WIDTH 0 +#define LRU_REFS_WIDTH 0 +#endif /* CONFIG_LRU_GEN */ + #ifdef CONFIG_SPARSEMEM #include #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) @@ -55,7 +63,8 @@ #define SECTIONS_WIDTH 0 #endif -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \ + <= BITS_PER_LONG - NR_PAGEFLAGS #define NODES_WIDTH NODES_SHIFT #elif defined(CONFIG_SPARSEMEM_VMEMMAP) #error "Vmemmap: No space for nodes field in page flags" @@ -89,8 +98,8 @@ #define LAST_CPUPID_SHIFT 0 #endif -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ - <= BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT #else #define LAST_CPUPID_WIDTH 0 @@ -100,8 +109,8 @@ #define LAST_CPUPID_NOT_IN_PAGE_FLAGS #endif -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ - > BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS #error "Not enough bits in page flags" #endif diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index fbfd3fad48f2..a7d7ff4c621d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) 1UL << PG_private | 1UL << PG_private_2 | \ 1UL << PG_writeback | 1UL << PG_reserved | \ 1UL << PG_slab | 1UL << PG_active | \ - 1UL << PG_unevictable | __PG_MLOCKED) + 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) /* * Flags checked when a page is prepped for return by the page allocator. 
@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) * alloc-free cycle to prevent from reusing the page. */ #define PAGE_FLAGS_CHECK_AT_PREP \ - (PAGEFLAGS_MASK & ~__PG_HWPOISON) + ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e24d2c992b11..afb9004b778a 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -211,7 +211,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) @@ -232,7 +232,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, BUILD_BUG(); return 0; } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH @@ -258,6 +258,19 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif +#ifndef arch_has_hw_pte_young +/* + * Return whether the accessed bit is supported by the local CPU or all CPUs. + * + * Those arches which have hw access flag feature need to implement their own + * helper. By default, "false" means pagefault will be hit on old pte. 
+ */ +static inline bool arch_has_hw_pte_young(bool local) +{ + return false; +} +#endif + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, diff --git a/include/linux/sched.h b/include/linux/sched.h index 71b012a224e4..aa2603c6f794 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -911,6 +911,9 @@ struct task_struct { #ifdef CONFIG_MEMCG unsigned in_user_fault:1; #endif +#ifdef CONFIG_LRU_GEN + unsigned in_nonseq_fault:1; +#endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index ba52f3a3478e..2c4a5a88f83c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -137,6 +137,9 @@ union swap_header { */ struct reclaim_state { unsigned long reclaimed_slab; +#ifdef CONFIG_LRU_GEN + struct mm_walk_args *mm_walk_args; +#endif }; #ifdef __KERNEL__ diff --git a/kernel/bounds.c b/kernel/bounds.c index 9795d75b09b2..aba13aa7336c 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -22,6 +22,9 @@ int main(void) DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); #endif DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); +#ifdef CONFIG_LRU_GEN + DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1)); +#endif /* End of constants */ return 0; diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index bfbeabc17a9d..bec59189e206 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -146,7 +146,6 @@ struct cgroup_mgctx { #define DEFINE_CGROUP_MGCTX(name) \ struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) -extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; diff --git a/kernel/exit.c b/kernel/exit.c index 91a43e57a32e..788a299abb4e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -422,6 +422,7 @@ void mm_update_next_owner(struct mm_struct *mm) goto retry; } 
WRITE_ONCE(mm->owner, c); + lru_gen_migrate_mm(mm); task_unlock(c); put_task_struct(c); } diff --git a/kernel/fork.c b/kernel/fork.c index 10885c649ca4..291b371ab423 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -681,6 +681,7 @@ static void check_mm(struct mm_struct *mm) #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif + VM_BUG_ON_MM(lru_gen_mm_is_active(mm), mm); } #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) @@ -1080,6 +1081,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, goto fail_nocontext; mm->user_ns = get_user_ns(user_ns); + lru_gen_init_mm(mm); return mm; fail_nocontext: @@ -1122,6 +1124,7 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); + lru_gen_del_mm(mm); mmdrop(mm); } @@ -2606,6 +2609,13 @@ pid_t kernel_clone(struct kernel_clone_args *args) get_task_struct(p); } + if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { + /* lock the task to synchronize with memcg migration */ + task_lock(p); + lru_gen_add_mm(p->mm); + task_unlock(p); + } + wake_up_new_task(p); /* forking complete and child started to run, tell ptracer */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 5b37a8567168..fd827fdad26b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1361,6 +1361,7 @@ void kthread_use_mm(struct mm_struct *mm) tsk->mm = mm; membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); + lru_gen_switch_mm(active_mm, mm); local_irq_enable(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d37f959ed1ab..04e030688752 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4944,6 +4944,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * finish_task_switch()'s mmdrop(). 
*/ switch_mm_irqs_off(prev->active_mm, next->mm, next); + lru_gen_switch_mm(prev->active_mm, next->mm); if (!prev->mm) { // from kernel /* will mmdrop() in finish_task_switch(). */ @@ -8812,6 +8813,7 @@ void idle_task_exit(void) if (mm != &init_mm) { switch_mm(mm, &init_mm, current); + lru_gen_switch_mm(mm, &init_mm); finish_arch_post_lock_switch(); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 083be6af29d7..65fc387566be 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3131,6 +3131,27 @@ static struct ctl_table vm_table[] = { .extra2 = SYSCTL_ONE, }, #endif + { + .procname = "anon_min_kbytes", + .data = &sysctl_anon_min_kbytes, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "clean_low_kbytes", + .data = &sysctl_clean_low_kbytes, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "clean_min_kbytes", + .data = &sysctl_clean_min_kbytes, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { .procname = "user_reserve_kbytes", .data = &sysctl_user_reserve_kbytes, diff --git a/mm/Kconfig b/mm/Kconfig index d16ba9249bc5..9167c043b1a9 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -89,6 +89,69 @@ config SPARSEMEM_VMEMMAP pfn_to_page and page_to_pfn operations. This is the most efficient option when sufficient kernel resources are available. +config ANON_MIN_KBYTES + int "Default value for vm.anon_min_kbytes" + depends on SYSCTL + range 0 4294967295 + default 0 + help + This option sets the default value for vm.anon_min_kbytes sysctl knob. + + The vm.anon_min_kbytes sysctl knob provides *hard* protection of + anonymous pages. The anonymous pages on the current node won't be + reclaimed under any conditions when their amount is below + vm.anon_min_kbytes. 
This knob may be used to prevent excessive swap + thrashing when anonymous memory is low (for example, when memory is + going to be overfilled by compressed data of zram module). + + Setting this value too high (close to MemTotal) can result in + inability to swap and can lead to early OOM under memory pressure. + +config CLEAN_LOW_KBYTES + int "Default value for vm.clean_low_kbytes" + depends on SYSCTL + range 0 4294967295 + default 0 + help + This option sets the default value for vm.clean_low_kbytes sysctl knob. + + The vm.clean_low_kbytes sysctl knob provides *best-effort* + protection of clean file pages. The file pages on the current node + won't be reclaimed under memory pressure when the amount of clean file + pages is below vm.clean_low_kbytes *unless* we threaten to OOM. + Protection of clean file pages using this knob may be used when + swapping is still possible to + - prevent disk I/O thrashing under memory pressure; + - improve performance in disk cache-bound tasks under memory + pressure. + + Setting it to a high value may result in an early eviction of anonymous + pages into the swap space by attempting to hold the protected amount + of clean file pages in memory. + +config CLEAN_MIN_KBYTES + int "Default value for vm.clean_min_kbytes" + depends on SYSCTL + range 0 4294967295 + default 0 + help + This option sets the default value for vm.clean_min_kbytes sysctl knob. + + The vm.clean_min_kbytes sysctl knob provides *hard* protection of + clean file pages. The file pages on the current node won't be + reclaimed under memory pressure when the amount of clean file pages is + below vm.clean_min_kbytes. Hard protection of clean file pages using + this knob may be used to + - prevent disk I/O thrashing under memory pressure even with no free + swap space; + - improve performance in disk cache-bound tasks under memory + pressure; + - avoid high latency and prevent livelock in near-OOM conditions.
+ + Setting it to a high value may result in an early out-of-memory condition + due to the inability to reclaim the protected amount of clean file pages + when other types of pages cannot be reclaimed. + config HAVE_MEMBLOCK_PHYS_MAP bool @@ -896,4 +959,63 @@ config SECRETMEM source "mm/damon/Kconfig" +# the multigenerational lru { +config LRU_GEN + bool "Multigenerational LRU" + depends on MMU + # the following options may not leave enough spare bits in page->flags + depends on !MAXSMP && (64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP) + help + A high performance LRU implementation to heavily overcommit workloads + that are not IO bound. See Documentation/vm/multigen_lru.rst for + details. + + Warning: do not enable this option unless you plan to use it because + it introduces a small per-process and per-memcg and per-node memory + overhead. + +config LRU_GEN_ENABLED + bool "Turn on by default" + depends on LRU_GEN + help + The default value of /sys/kernel/mm/lru_gen/enabled is 0. This option + changes it to 1. + + Warning: the default value is the fast path. See + Documentation/static-keys.txt for details. + +config LRU_GEN_STATS + bool "Full stats for debugging" + depends on LRU_GEN + help + This option keeps full stats for each generation, which can be read + from /sys/kernel/debug/lru_gen_full. + + Warning: do not enable this option unless you plan to use it because + it introduces an additional small per-process and per-memcg and + per-node memory overhead. + +config NR_LRU_GENS + int "Max number of generations" + depends on LRU_GEN + range 4 31 + default 7 + help + This will use order_base_2(N+1) spare bits from page flags. + + Warning: do not use numbers larger than necessary because each + generation introduces a small per-node and per-memcg memory overhead. + +config TIERS_PER_GEN + int "Number of tiers per generation" + depends on LRU_GEN + range 2 5 + default 4 + help + This will use N-2 spare bits from page flags.
+ + Larger values generally offer better protection to active pages under + heavy buffered I/O workloads. +# } + endmenu diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c5142d237e48..875ed5559d95 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struct page *head, int tail, #ifdef CONFIG_64BIT (1L << PG_arch_2) | #endif - (1L << PG_dirty))); + (1L << PG_dirty) | + LRU_GEN_MASK | LRU_REFS_MASK)); /* ->mapping in first tail page is compound_mapcount */ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 87e41c3cac10..afaedd156830 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1239,12 +1239,17 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, *lru_size += nr_pages; size = *lru_size; +#ifdef CONFIG_LRU_GEN + /* unlikely but not a bug when reset_batch_size() is pending */ + VM_WARN_ON(size + MAX_BATCH_SIZE < 0); +#else if (WARN_ONCE(size < 0, "%s(%p, %d, %d): lru_size %ld\n", __func__, lruvec, lru, nr_pages, size)) { VM_BUG_ON(1); *lru_size = 0; } +#endif if (nr_pages > 0) *lru_size += nr_pages; @@ -5099,6 +5104,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) static void mem_cgroup_free(struct mem_cgroup *memcg) { + lru_gen_free_memcg(memcg); memcg_wb_domain_exit(memcg); __mem_cgroup_free(memcg); } @@ -5162,6 +5168,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) memcg->deferred_split_queue.split_queue_len = 0; #endif idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); + lru_gen_init_memcg(memcg); return memcg; fail: mem_cgroup_id_remove(memcg); @@ -6145,6 +6152,29 @@ static void mem_cgroup_move_task(void) } #endif +#ifdef CONFIG_LRU_GEN +static void mem_cgroup_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct task_struct *task = NULL; + + cgroup_taskset_for_each_leader(task, css, tset) + break; + + if (!task) + return; + + task_lock(task); + if (task->mm && 
task->mm->owner == task) + lru_gen_migrate_mm(task->mm); + task_unlock(task); +} +#else +static void mem_cgroup_attach(struct cgroup_taskset *tset) +{ +} +#endif /* CONFIG_LRU_GEN */ + static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) { if (value == PAGE_COUNTER_MAX) @@ -6488,6 +6518,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_reset = mem_cgroup_css_reset, .css_rstat_flush = mem_cgroup_css_rstat_flush, .can_attach = mem_cgroup_can_attach, + .attach = mem_cgroup_attach, .cancel_attach = mem_cgroup_cancel_attach, .post_attach = mem_cgroup_move_task, .dfl_cftypes = memory_files, diff --git a/mm/memory.c b/mm/memory.c index c52be6d6b605..6b2b665c0866 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -121,18 +121,6 @@ int randomize_va_space __read_mostly = 2; #endif -#ifndef arch_faults_on_old_pte -static inline bool arch_faults_on_old_pte(void) -{ - /* - * Those arches which don't have hw access flag feature need to - * implement their own helper. By default, "true" means pagefault - * will be hit on old pte. - */ - return true; -} -#endif - #ifndef arch_wants_old_prefaulted_pte static inline bool arch_wants_old_prefaulted_pte(void) { @@ -2769,7 +2757,7 @@ static inline bool cow_user_page(struct page *dst, struct page *src, * On architectures with software "accessed" bits, we would * take a double page fault, so mark it accessed here. 
*/ - if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) { + if (!arch_has_hw_pte_young(true) && !pte_young(vmf->orig_pte)) { pte_t entry; vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); @@ -4774,6 +4762,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs) { vm_fault_t ret; + bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ); __set_current_state(TASK_RUNNING); @@ -4795,11 +4784,17 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (flags & FAULT_FLAG_USER) mem_cgroup_enter_user_fault(); + if (nonseq_fault) + task_enter_nonseq_fault(); + if (unlikely(is_vm_hugetlb_page(vma))) ret = hugetlb_fault(vma->vm_mm, vma, address, flags); else ret = __handle_mm_fault(vma, address, flags); + if (nonseq_fault) + task_exit_nonseq_fault(); + if (flags & FAULT_FLAG_USER) { mem_cgroup_exit_user_fault(); /* diff --git a/mm/mm_init.c b/mm/mm_init.c index 9ddaf0e1b0ab..0d7b2bd2454a 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void) shift = 8 * sizeof(unsigned long); width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; + - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", - "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", SECTIONS_WIDTH, NODES_WIDTH, ZONES_WIDTH, LAST_CPUPID_WIDTH, KASAN_TAG_WIDTH, + LRU_GEN_WIDTH, + LRU_REFS_WIDTH, NR_PAGEFLAGS); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", diff --git a/mm/oom_kill.c b/mm/oom_kill.c index bfa9e348c3a3..870a4f9f070e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -508,8 +508,8 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) * victim 
(if that is possible) to help the OOM killer to move on. */ static struct task_struct *oom_reaper_th; -static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); -static struct task_struct *oom_reaper_list; +DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); +struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); bool __oom_reap_task_mm(struct mm_struct *mm) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 23d3339ac4e8..5bee46999f60 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7405,6 +7405,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) pgdat_page_ext_init(pgdat); lruvec_init(&pgdat->__lruvec); + lru_gen_init_state(NULL, &pgdat->__lruvec); } static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, diff --git a/mm/rmap.c b/mm/rmap.c index 6aebd1747251..a6b522fe871a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -73,6 +73,7 @@ #include #include #include +#include #include @@ -790,6 +791,13 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, } if (pvmw.pte) { + /* the multigenerational lru exploits the spatial locality */ + if (lru_gen_enabled() && pte_young(*pvmw.pte) && + !(vma->vm_flags & VM_SEQ_READ)) { + lru_gen_look_around(&pvmw); + referenced++; + } + if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) { /* diff --git a/mm/swap.c b/mm/swap.c index af3cad4e5378..93f5fe5f99ca 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -389,6 +389,43 @@ static void __lru_cache_activate_page(struct page *page) local_unlock(&lru_pvecs.lock); } +#ifdef CONFIG_LRU_GEN +static void page_inc_refs(struct page *page) +{ + unsigned long refs; + unsigned long old_flags, new_flags; + + if (PageUnevictable(page)) + return; + + /* see the comment on MAX_NR_TIERS */ + do { + new_flags = old_flags = READ_ONCE(page->flags); + + if (!(new_flags & BIT(PG_referenced))) { + new_flags |= BIT(PG_referenced); + continue; + } + + if (!(new_flags & BIT(PG_workingset))) { + new_flags |= 
BIT(PG_workingset); + continue; + } + + refs = new_flags & LRU_REFS_MASK; + refs = min(refs + BIT(LRU_REFS_PGOFF), LRU_REFS_MASK); + + new_flags &= ~LRU_REFS_MASK; + new_flags |= refs; + } while (new_flags != old_flags && + cmpxchg(&page->flags, old_flags, new_flags) != old_flags); +} +#else +static void page_inc_refs(struct page *page) +{ +} +#endif /* CONFIG_LRU_GEN */ + /* * Mark a page as having seen activity. * @@ -403,6 +440,11 @@ void mark_page_accessed(struct page *page) { page = compound_head(page); + if (lru_gen_enabled()) { + page_inc_refs(page); + return; + } + if (!PageReferenced(page)) { SetPageReferenced(page); } else if (PageUnevictable(page)) { @@ -446,6 +488,11 @@ void lru_cache_add(struct page *page) VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); + /* see the comment in lru_gen_add_page() */ + if (lru_gen_enabled() && !PageUnevictable(page) && + task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC)) + SetPageActive(page); + get_page(page); local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_add); @@ -547,7 +594,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) { - if (PageActive(page) && !PageUnevictable(page)) { + if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { int nr_pages = thp_nr_pages(page); del_page_from_lru_list(page, lruvec); @@ -661,7 +708,7 @@ void deactivate_file_page(struct page *page) */ void deactivate_page(struct page *page) { - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { struct pagevec *pvec; local_lock(&lru_pvecs.lock); diff --git a/mm/swapfile.c b/mm/swapfile.c index 22d10f713848..2ac9ac0b5ec3 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2688,6 +2688,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 
err = 0; atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); + lru_gen_change_state(false, false, true); out_dput: filp_close(victim, NULL); @@ -3349,6 +3350,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); + lru_gen_change_state(true, false, true); error = 0; goto out; diff --git a/mm/vmscan.c b/mm/vmscan.c index 74296c2d1fed..701510094c6d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -50,6 +50,11 @@ #include #include #include +#include +#include +#include +#include +#include #include #include @@ -122,6 +127,15 @@ struct scan_control { /* The file pages on the current node are dangerously low */ unsigned int file_is_tiny:1; + /* The anonymous pages on the current node are below vm.anon_min_kbytes */ + unsigned int anon_below_min:1; + + /* The clean file pages on the current node are below vm.clean_low_kbytes */ + unsigned int clean_below_low:1; + + /* The clean file pages on the current node are below vm.clean_min_kbytes */ + unsigned int clean_below_min:1; + /* Always discard instead of demoting to lower tier memory */ unsigned int no_demotion:1; @@ -171,6 +185,10 @@ struct scan_control { #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) #endif +unsigned long sysctl_anon_min_kbytes __read_mostly = CONFIG_ANON_MIN_KBYTES; +unsigned long sysctl_clean_low_kbytes __read_mostly = CONFIG_CLEAN_LOW_KBYTES; +unsigned long sysctl_clean_min_kbytes __read_mostly = CONFIG_CLEAN_MIN_KBYTES; + /* * From 0 .. 200. Higher means more swappy. 
*/ @@ -1142,9 +1160,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; - mem_cgroup_swapout(page, swap); + + /* get a shadow entry before page_memcg() is cleared */ if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(page, target_memcg); + mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page, swap, shadow); xa_unlock_irq(&mapping->i_pages); put_swap_page(page, swap); @@ -1407,6 +1427,11 @@ static unsigned int shrink_page_list(struct list_head *page_list, if (!sc->may_unmap && page_mapped(page)) goto keep_locked; + /* lru_gen_look_around() has updated this page? */ + if (lru_gen_enabled() && !ignore_references && + page_mapped(page) && PageReferenced(page)) + goto keep_locked; + may_enter_fs = (sc->gfp_mask & __GFP_FS) || (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); @@ -2562,6 +2587,156 @@ enum scan_balance { SCAN_FILE, }; +static void prepare_workingset_protection(pg_data_t *pgdat, struct scan_control *sc) +{ + /* + * Check the number of anonymous pages to protect them from + * reclaiming if their amount is below the specified. + */ + if (sysctl_anon_min_kbytes) { + unsigned long reclaimable_anon; + + reclaimable_anon = + node_page_state(pgdat, NR_ACTIVE_ANON) + + node_page_state(pgdat, NR_INACTIVE_ANON) + + node_page_state(pgdat, NR_ISOLATED_ANON); + reclaimable_anon <<= (PAGE_SHIFT - 10); + + sc->anon_below_min = reclaimable_anon < sysctl_anon_min_kbytes; + } else + sc->anon_below_min = 0; + + /* + * Check the number of clean file pages to protect them from + * reclaiming if their amount is below the specified. 
+ */ + if (sysctl_clean_low_kbytes || sysctl_clean_min_kbytes) { + unsigned long reclaimable_file, dirty, clean; + + reclaimable_file = + node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE) + + node_page_state(pgdat, NR_ISOLATED_FILE); + dirty = node_page_state(pgdat, NR_FILE_DIRTY); + /* + * node_page_state() sum can go out of sync since + * all the values are not read at once. + */ + if (likely(reclaimable_file > dirty)) + clean = (reclaimable_file - dirty) << (PAGE_SHIFT - 10); + else + clean = 0; + + sc->clean_below_low = clean < sysctl_clean_low_kbytes; + sc->clean_below_min = clean < sysctl_clean_min_kbytes; + } else { + sc->clean_below_low = 0; + sc->clean_below_min = 0; + } +} + +static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) +{ + unsigned long file; + struct lruvec *target_lruvec; + + if (lru_gen_enabled()) + return; + + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + + /* + * Determine the scan balance between anon and file LRUs. + */ + spin_lock_irq(&target_lruvec->lru_lock); + sc->anon_cost = target_lruvec->anon_cost; + sc->file_cost = target_lruvec->file_cost; + spin_unlock_irq(&target_lruvec->lru_lock); + + /* + * Target desirable inactive:active list ratios for the anon + * and file LRU lists. + */ + if (!sc->force_deactivate) { + unsigned long refaults; + + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE_ANON); + if (refaults != target_lruvec->refaults[0] || + inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) + sc->may_deactivate |= DEACTIVATE_ANON; + else + sc->may_deactivate &= ~DEACTIVATE_ANON; + + /* + * When refaults are being observed, it means a new + * workingset is being established. Deactivate to get + * rid of any stale active pages quickly. 
+ */ + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE_FILE); + if (refaults != target_lruvec->refaults[1] || + inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) + sc->may_deactivate |= DEACTIVATE_FILE; + else + sc->may_deactivate &= ~DEACTIVATE_FILE; + } else + sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; + + /* + * If we have plenty of inactive file pages that aren't + * thrashing, try to reclaim those first before touching + * anonymous pages. + */ + file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); + if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) + sc->cache_trim_mode = 1; + else + sc->cache_trim_mode = 0; + + /* + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. + */ + if (!cgroup_reclaim(sc)) { + unsigned long total_high_wmark = 0; + unsigned long free, anon; + int z; + + free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); + file = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); + + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = &pgdat->node_zones[z]; + + if (!managed_zone(zone)) + continue; + + total_high_wmark += high_wmark_pages(zone); + } + + /* + * Consider anon: if that's low too, this isn't a + * runaway file reclaim problem, but rather just + * extreme pressure. Reclaim as per usual then. 
+ */ + anon = node_page_state(pgdat, NR_INACTIVE_ANON); + + sc->file_is_tiny = + file + free <= total_high_wmark && + !(sc->may_deactivate & DEACTIVATE_ANON) && + anon >> sc->priority; + } + + prepare_workingset_protection(pgdat, sc); +} + /* * Determine how aggressively the anon and file LRU lists should be * scanned. The relative value of each set of LRU lists is determined @@ -2620,6 +2795,15 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, goto out; } + /* + * Force-scan anon if clean file pages is under vm.clean_low_kbytes + * or vm.clean_min_kbytes. + */ + if (sc->clean_below_low || sc->clean_below_min) { + scan_balance = SCAN_ANON; + goto out; + } + /* * If there is enough inactive page cache, we do not reclaim * anything from the anonymous working right now. @@ -2764,6 +2948,25 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, BUG(); } + /* + * Hard protection of the working set. + */ + if (file) { + /* + * Don't reclaim file pages when the amount of + * clean file pages is below vm.clean_min_kbytes. + */ + if (sc->clean_below_min) + scan = 0; + } else { + /* + * Don't reclaim anonymous pages when their + * amount is below vm.anon_min_kbytes. 
+ */ + if (sc->anon_below_min) + scan = 0; + } + nr[lru] = scan; } } @@ -2783,123 +2986,2609 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, return can_demote(pgdat->node_id, sc); } -static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -{ - unsigned long nr[NR_LRU_LISTS]; - unsigned long targets[NR_LRU_LISTS]; - unsigned long nr_to_scan; - enum lru_list lru; - unsigned long nr_reclaimed = 0; - unsigned long nr_to_reclaim = sc->nr_to_reclaim; - struct blk_plug plug; - bool scan_adjusted; - - get_scan_count(lruvec, sc, nr); - - /* Record the original scan target for proportional adjustments later */ - memcpy(targets, nr, sizeof(nr)); - - /* - * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal - * event that can occur when there is little memory pressure e.g. - * multiple streaming readers/writers. Hence, we do not abort scanning - * when the requested number of pages are reclaimed when scanning at - * DEF_PRIORITY on the assumption that the fact we are direct - * reclaiming implies that kswapd is not keeping up and it is best to - * do a batch of work at once. For memcg reclaim one check is made to - * abort proportional reclaim if either the file or anon lru has already - * dropped to zero at the first pass. 
- */ - scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() && - sc->priority == DEF_PRIORITY); +#ifdef CONFIG_LRU_GEN - blk_start_plug(&plug); - while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || - nr[LRU_INACTIVE_FILE]) { - unsigned long nr_anon, nr_file, percentage; - unsigned long nr_scanned; +/****************************************************************************** + * shorthand helpers + ******************************************************************************/ - for_each_evictable_lru(lru) { - if (nr[lru]) { - nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); - nr[lru] -= nr_to_scan; +#define DEFINE_MAX_SEQ(lruvec) \ + unsigned long max_seq = READ_ONCE((lruvec)->evictable.max_seq) - nr_reclaimed += shrink_list(lru, nr_to_scan, - lruvec, sc); - } - } +#define DEFINE_MIN_SEQ(lruvec) \ + unsigned long min_seq[ANON_AND_FILE] = { \ + READ_ONCE((lruvec)->evictable.min_seq[0]), \ + READ_ONCE((lruvec)->evictable.min_seq[1]), \ + } - cond_resched(); +#define for_each_gen_type_zone(gen, type, zone) \ + for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ + for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) - if (nr_reclaimed < nr_to_reclaim || scan_adjusted) - continue; +static int page_lru_gen(struct page *page) +{ + unsigned long flags = READ_ONCE(page->flags); - /* - * For kswapd and memcg, reclaim at least the number of pages - * requested. Ensure that the anon and file LRUs are scanned - * proportionally what was requested by get_scan_count(). We - * stop reclaiming one LRU and reduce the amount scanning - * proportional to the original scan target. - */ - nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; - nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; +} - /* - * It's just vindictive to attack the larger once the smaller - * has gone to zero. 
And given the way we stop scanning the - * smaller below, this makes sure that we only make one nudge - * towards proportionality once we've got nr_to_reclaim. - */ - if (!nr_file || !nr_anon) - break; +static int page_lru_tier(struct page *page) +{ + int refs; + unsigned long flags = READ_ONCE(page->flags); - if (nr_file > nr_anon) { - unsigned long scan_target = targets[LRU_INACTIVE_ANON] + - targets[LRU_ACTIVE_ANON] + 1; - lru = LRU_BASE; - percentage = nr_anon * 100 / scan_target; - } else { - unsigned long scan_target = targets[LRU_INACTIVE_FILE] + - targets[LRU_ACTIVE_FILE] + 1; - lru = LRU_FILE; - percentage = nr_file * 100 / scan_target; - } + refs = (flags & LRU_REFS_FLAGS) == LRU_REFS_FLAGS ? + ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1 : 0; - /* Stop scanning the smaller of the LRU */ - nr[lru] = 0; - nr[lru + LRU_ACTIVE] = 0; + return lru_tier_from_refs(refs); +} - /* - * Recalculate the other LRU scan count based on its original - * scan target and the percentage scanning already complete - */ - lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; - nr_scanned = targets[lru] - nr[lru]; - nr[lru] = targets[lru] * (100 - percentage) / 100; - nr[lru] -= min(nr[lru], nr_scanned); +static int get_swappiness(struct mem_cgroup *memcg) +{ + return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ? + mem_cgroup_swappiness(memcg) : 0; +} - lru += LRU_ACTIVE; - nr_scanned = targets[lru] - nr[lru]; - nr[lru] = targets[lru] * (100 - percentage) / 100; - nr[lru] -= min(nr[lru], nr_scanned); +static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg) +{ + struct pglist_data *pgdat = NODE_DATA(nid); - scan_adjusted = true; - } - blk_finish_plug(&plug); - sc->nr_reclaimed += nr_reclaimed; +#ifdef CONFIG_MEMCG + if (memcg) + return &memcg->nodeinfo[nid]->lruvec; +#endif + return pgdat ? &pgdat->__lruvec : NULL; +} - /* - * Even if we did not try to evict anon pages at all, we want to - * rebalance the anon lru active/inactive ratio. 
- */ - if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) && - inactive_is_low(lruvec, LRU_INACTIVE_ANON)) - shrink_active_list(SWAP_CLUSTER_MAX, lruvec, - sc, LRU_ACTIVE_ANON); +static int get_nr_gens(struct lruvec *lruvec, int type) +{ + return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1; } -/* Use reclaim/compaction for costly allocs or under memory pressure */ -static bool in_reclaim_compaction(struct scan_control *sc) +static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) +{ + return get_nr_gens(lruvec, 1) >= MIN_NR_GENS && + get_nr_gens(lruvec, 1) <= get_nr_gens(lruvec, 0) && + get_nr_gens(lruvec, 0) <= MAX_NR_GENS; +} + +/****************************************************************************** + * mm_struct list + ******************************************************************************/ + +static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) +{ + static struct lru_gen_mm_list mm_list = { + .fifo = LIST_HEAD_INIT(mm_list.fifo), + .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), + }; + +#ifdef CONFIG_MEMCG + if (memcg) + return &memcg->mm_list; +#endif + return &mm_list; +} + +void lru_gen_add_mm(struct mm_struct *mm) +{ + int nid; + struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + + VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm); +#ifdef CONFIG_MEMCG + VM_BUG_ON_MM(mm->lrugen.memcg, mm); + mm->lrugen.memcg = memcg; +#endif + spin_lock(&mm_list->lock); + + list_add_tail(&mm->lrugen.list, &mm_list->fifo); + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(nid, memcg); + + if (!lruvec) + continue; + + if (lruvec->mm_walk.tail == &mm_list->fifo) + lruvec->mm_walk.tail = lruvec->mm_walk.tail->prev; + } + + spin_unlock(&mm_list->lock); +} + +void lru_gen_del_mm(struct mm_struct *mm) +{ + int nid; + struct lru_gen_mm_list *mm_list; + struct mem_cgroup *memcg = NULL; + + if (list_empty(&mm->lrugen.list)) + return; + +#ifdef CONFIG_MEMCG + 
memcg = mm->lrugen.memcg; +#endif + mm_list = get_mm_list(memcg); + + spin_lock(&mm_list->lock); + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(nid, memcg); + + if (!lruvec) + continue; + + if (lruvec->mm_walk.tail == &mm->lrugen.list) + lruvec->mm_walk.tail = lruvec->mm_walk.tail->next; + + if (lruvec->mm_walk.head != &mm->lrugen.list) + continue; + + lruvec->mm_walk.head = lruvec->mm_walk.head->next; + if (lruvec->mm_walk.head == &mm_list->fifo) + WRITE_ONCE(lruvec->mm_walk.seq, lruvec->mm_walk.seq + 1); + } + + list_del_init(&mm->lrugen.list); + + spin_unlock(&mm_list->lock); + +#ifdef CONFIG_MEMCG + mem_cgroup_put(mm->lrugen.memcg); + mm->lrugen.memcg = NULL; +#endif +} + +#ifdef CONFIG_MEMCG +void lru_gen_migrate_mm(struct mm_struct *mm) +{ + struct mem_cgroup *memcg; + + lockdep_assert_held(&mm->owner->alloc_lock); + + if (mem_cgroup_disabled()) + return; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(mm->owner); + rcu_read_unlock(); + if (memcg == mm->lrugen.memcg) + return; + + VM_BUG_ON_MM(!mm->lrugen.memcg, mm); + VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm); + + lru_gen_del_mm(mm); + lru_gen_add_mm(mm); +} +#endif + +#define BLOOM_FILTER_SHIFT 15 + +static inline int filter_gen_from_seq(unsigned long seq) +{ + return seq % NR_BLOOM_FILTERS; +} + +static void get_item_key(void *item, int *key) +{ + u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); + + BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); + + key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); + key[1] = hash >> BLOOM_FILTER_SHIFT; +} + +static void clear_bloom_filter(struct lruvec *lruvec, unsigned long seq) +{ + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); + + filter = lruvec->mm_walk.filters[gen]; + if (filter) { + bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); + return; + } + + filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), GFP_ATOMIC); + 
WRITE_ONCE(lruvec->mm_walk.filters[gen], filter); +} + +static void set_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(lruvec->mm_walk.filters[gen]); + if (!filter) + return; + + get_item_key(item, key); + + if (!test_bit(key[0], filter)) + set_bit(key[0], filter); + if (!test_bit(key[1], filter)) + set_bit(key[1], filter); +} + +static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(lruvec->mm_walk.filters[gen]); + if (!filter) + return false; + + get_item_key(item, key); + + return test_bit(key[0], filter) && test_bit(key[1], filter); +} + +static void reset_mm_stats(struct lruvec *lruvec, bool last, struct mm_walk_args *args) +{ + int i; + int hist = lru_hist_from_seq(args->max_seq); + + lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); + + for (i = 0; i < NR_MM_STATS; i++) { + WRITE_ONCE(lruvec->mm_walk.stats[hist][i], + lruvec->mm_walk.stats[hist][i] + args->mm_stats[i]); + args->mm_stats[i] = 0; + } + + if (!last || NR_HIST_GENS == 1) + return; + + hist = lru_hist_from_seq(args->max_seq + 1); + for (i = 0; i < NR_MM_STATS; i++) + WRITE_ONCE(lruvec->mm_walk.stats[hist][i], 0); +} + +static bool should_skip_mm(struct mm_struct *mm, struct mm_walk_args *args) +{ + int type; + unsigned long size = 0; + + if (!lru_gen_mm_is_active(mm) && !node_isset(args->node_id, mm->lrugen.nodes)) + return true; + + if (mm_is_oom_victim(mm)) + return true; + + for (type = !args->swappiness; type < ANON_AND_FILE; type++) { + size += type ? 
get_mm_counter(mm, MM_FILEPAGES) : + get_mm_counter(mm, MM_ANONPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); + } + + if (size < MIN_BATCH_SIZE) + return true; + + if (!mmget_not_zero(mm)) + return true; + + node_clear(args->node_id, mm->lrugen.nodes); + + return false; +} + +/* To support multiple walkers that concurrently walk an mm_struct list. */ +static bool get_next_mm(struct lruvec *lruvec, struct mm_walk_args *args, + struct mm_struct **iter) +{ + bool first = false; + bool last = true; + struct mm_struct *mm = NULL; + struct lru_gen_mm_walk *mm_walk = &lruvec->mm_walk; + struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg); + + if (*iter) + mmput_async(*iter); + else if (args->max_seq <= READ_ONCE(mm_walk->seq)) + return false; + + spin_lock(&mm_list->lock); + + VM_BUG_ON(args->max_seq > mm_walk->seq + 1); + VM_BUG_ON(*iter && args->max_seq < mm_walk->seq); + VM_BUG_ON(*iter && !mm_walk->nr_walkers); + + if (args->max_seq <= mm_walk->seq) { + if (!*iter) + last = false; + goto done; + } + + if (mm_walk->head == &mm_list->fifo) { + VM_BUG_ON(mm_walk->nr_walkers); + mm_walk->head = mm_walk->head->next; + first = true; + } + + while (!mm && mm_walk->head != &mm_list->fifo) { + mm = list_entry(mm_walk->head, struct mm_struct, lrugen.list); + + mm_walk->head = mm_walk->head->next; + + if (mm_walk->tail == &mm->lrugen.list) { + mm_walk->tail = mm_walk->tail->next; + args->use_filter = false; + } + + if (should_skip_mm(mm, args)) + mm = NULL; + } + + if (mm_walk->head == &mm_list->fifo) + WRITE_ONCE(mm_walk->seq, mm_walk->seq + 1); +done: + if (*iter && !mm) + mm_walk->nr_walkers--; + if (!*iter && mm) + mm_walk->nr_walkers++; + + if (mm_walk->nr_walkers) + last = false; + + if (mm && first) + clear_bloom_filter(lruvec, args->max_seq + 1); + + if (*iter || last) + reset_mm_stats(lruvec, last, args); + + spin_unlock(&mm_list->lock); + + *iter = mm; + + return last; +} + +/****************************************************************************** + * 
refault feedback loop + ******************************************************************************/ + +/* + * A feedback loop modeled after the PID controller. Currently supports the + * proportional (P) and the integral (I) terms; the derivative (D) term can be + * added if necessary. The setpoint (SP) is the desired position; the process + * variable (PV) is the measured position. The error is the difference between + * the SP and the PV. A positive error results in a positive control output + * correction, which, in our case, is to allow eviction. + * + * The P term is refaulted % of the current generation being evicted. The I + * term is the exponential moving average of refaulted % of previously evicted + * generations, using the smoothing factor 1/2. + * + * Our goal is to maintain proportional refaulted % across all tiers. + */ +struct ctrl_pos { + unsigned long refaulted; + unsigned long total; + int gain; +}; + +static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, + struct ctrl_pos *pos) +{ + struct lrugen *lrugen = &lruvec->evictable; + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + pos->refaulted = lrugen->avg_refaulted[type][tier] + + atomic_long_read(&lrugen->refaulted[hist][type][tier]); + pos->total = lrugen->avg_total[type][tier] + + atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + pos->total += lrugen->protected[hist][type][tier - 1]; + pos->gain = gain; +} + +static void reset_ctrl_pos(struct lruvec *lruvec, int gen, int type) +{ + int tier; + int hist = lru_hist_from_seq(gen); + struct lrugen *lrugen = &lruvec->evictable; + bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]); + bool clear = carryover ? 
NR_HIST_GENS == 1 : NR_HIST_GENS > 1; + + if (!carryover && !clear) + return; + + for (tier = 0; tier < MAX_NR_TIERS; tier++) { + if (carryover) { + unsigned long sum; + + sum = lrugen->avg_refaulted[type][tier] + + atomic_long_read(&lrugen->refaulted[hist][type][tier]); + WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); + + sum = lrugen->avg_total[type][tier] + + atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + sum += lrugen->protected[hist][type][tier - 1]; + WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); + } + + if (clear) { + atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); + atomic_long_set(&lrugen->evicted[hist][type][tier], 0); + if (tier) + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); + } + } +} + +static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) +{ + /* + * Allow eviction if the PV has a limited number of refaulted pages or a + * lower refaulted % than the SP. + */ + return pv->refaulted < MIN_BATCH_SIZE || + pv->refaulted * max(sp->total, 1UL) * sp->gain <= + sp->refaulted * max(pv->total, 1UL) * pv->gain; +} + +/****************************************************************************** + * the aging + ******************************************************************************/ + +static int page_update_gen(struct page *page, int gen) +{ + unsigned long old_flags, new_flags; + + VM_BUG_ON(gen >= MAX_NR_GENS); + + do { + new_flags = old_flags = READ_ONCE(page->flags); + + if (!(new_flags & LRU_GEN_MASK)) { + new_flags |= BIT(PG_referenced); + continue; + } + + new_flags &= ~LRU_GEN_MASK; + new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; + new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); + } while (new_flags != old_flags && + cmpxchg(&page->flags, old_flags, new_flags) != old_flags); + + return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; +} + +static void page_inc_gen(struct page *page, struct lruvec *lruvec, bool reclaiming) +{ + int old_gen, new_gen; + unsigned long 
old_flags, new_flags; + int type = page_is_file_lru(page); + int zone = page_zonenum(page); + struct lrugen *lrugen = &lruvec->evictable; + + old_gen = lru_gen_from_seq(lrugen->min_seq[type]); + + do { + new_flags = old_flags = READ_ONCE(page->flags); + VM_BUG_ON_PAGE(!(new_flags & LRU_GEN_MASK), page); + + new_gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + /* page_update_gen() has updated this page? */ + if (new_gen >= 0 && new_gen != old_gen) { + list_move(&page->lru, &lrugen->lists[new_gen][type][zone]); + return; + } + + new_gen = (old_gen + 1) % MAX_NR_GENS; + + new_flags &= ~LRU_GEN_MASK; + new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; + new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); + /* for end_page_writeback() */ + if (reclaiming) + new_flags |= BIT(PG_reclaim); + } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); + + lru_gen_update_size(page, lruvec, old_gen, new_gen); + if (reclaiming) + list_move(&page->lru, &lrugen->lists[new_gen][type][zone]); + else + list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]); +} + +static void update_batch_size(struct page *page, int old_gen, int new_gen, + struct mm_walk_args *args) +{ + int type = page_is_file_lru(page); + int zone = page_zonenum(page); + int delta = thp_nr_pages(page); + + VM_BUG_ON(old_gen >= MAX_NR_GENS); + VM_BUG_ON(new_gen >= MAX_NR_GENS); + + args->batch_size++; + + args->nr_pages[old_gen][type][zone] -= delta; + args->nr_pages[new_gen][type][zone] += delta; +} + +static void reset_batch_size(struct lruvec *lruvec, struct mm_walk_args *args) +{ + int gen, type, zone; + struct lrugen *lrugen = &lruvec->evictable; + + args->batch_size = 0; + + for_each_gen_type_zone(gen, type, zone) { + enum lru_list lru = type * LRU_FILE; + int delta = args->nr_pages[gen][type][zone]; + + if (!delta) + continue; + + args->nr_pages[gen][type][zone] = 0; + WRITE_ONCE(lrugen->sizes[gen][type][zone], + lrugen->sizes[gen][type][zone] + delta); + + if (lru_gen_is_active(lruvec, 
gen)) + lru += LRU_ACTIVE; + update_lru_size(lruvec, lru, zone, delta); + } +} + +static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *walk) +{ + struct address_space *mapping; + struct vm_area_struct *vma = walk->vma; + struct mm_walk_args *args = walk->private; + + if (!vma_is_accessible(vma) || is_vm_hugetlb_page(vma) || + (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ))) + return true; + + if (vma_is_anonymous(vma)) + return !args->swappiness; + + if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) + return true; + + mapping = vma->vm_file->f_mapping; + if (!mapping->a_ops->writepage) + return true; + + return (shmem_mapping(mapping) && !args->swappiness) || mapping_unevictable(mapping); +} + +/* + * Some userspace memory allocators create many single-page VMAs. So instead of + * returning back to the PGD table for each of such VMAs, we finish at least an + * entire PMD table and therefore avoid many zigzags. + */ +static bool get_next_vma(struct mm_walk *walk, unsigned long mask, unsigned long size, + unsigned long *start, unsigned long *end) +{ + unsigned long next = round_up(*end, size); + + VM_BUG_ON(mask & size); + VM_BUG_ON(*start >= *end); + VM_BUG_ON((next & mask) != (*start & mask)); + + while (walk->vma) { + if (next >= walk->vma->vm_end) { + walk->vma = walk->vma->vm_next; + continue; + } + + if ((next & mask) != (walk->vma->vm_start & mask)) + return false; + + if (should_skip_vma(walk->vma->vm_start, walk->vma->vm_end, walk)) { + walk->vma = walk->vma->vm_next; + continue; + } + + *start = max(next, walk->vma->vm_start); + next = (next | ~mask) + 1; + /* rounded-up boundaries can wrap to 0 */ + *end = next && next < walk->vma->vm_end ? 
next : walk->vma->vm_end; + + return true; + } + + return false; +} + +static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + int i; + pte_t *pte; + spinlock_t *ptl; + unsigned long addr; + int worth = 0; + struct mm_walk_args *args = walk->private; + int old_gen, new_gen = lru_gen_from_seq(args->max_seq); + + VM_BUG_ON(pmd_leaf(*pmd)); + + pte = pte_offset_map_lock(walk->mm, pmd, start & PMD_MASK, &ptl); + arch_enter_lazy_mmu_mode(); +restart: + for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { + struct page *page; + unsigned long pfn = pte_pfn(pte[i]); + + args->mm_stats[MM_LEAF_TOTAL]++; + + if (!pte_present(pte[i]) || is_zero_pfn(pfn)) + continue; + + if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i]))) + continue; + + if (!pte_young(pte[i])) { + args->mm_stats[MM_LEAF_OLD]++; + continue; + } + + VM_BUG_ON(!pfn_valid(pfn)); + if (pfn < args->start_pfn || pfn >= args->end_pfn) + continue; + + page = compound_head(pfn_to_page(pfn)); + if (page_to_nid(page) != args->node_id) + continue; + + if (page_memcg_rcu(page) != args->memcg) + continue; + + VM_BUG_ON(addr < walk->vma->vm_start || addr >= walk->vma->vm_end); + if (!ptep_test_and_clear_young(walk->vma, addr, pte + i)) + continue; + + args->mm_stats[MM_LEAF_YOUNG]++; + + if (pte_dirty(pte[i]) && !PageDirty(page) && + !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) + set_page_dirty(page); + + old_gen = page_update_gen(page, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(page, old_gen, new_gen, args); + + worth++; + } + + if (i < PTRS_PER_PTE && get_next_vma(walk, PMD_MASK, PAGE_SIZE, &start, &end)) + goto restart; + + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(pte, ptl); + + return worth >= MIN_BATCH_SIZE / 2; +} + +/* + * We scan PMD entries in two passes. The first pass reaches to PTE tables and + * doesn't take the PMD lock. 
The second pass clears the accessed bit on PMD + * entries and needs to take the PMD lock. + */ +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) +static void walk_pmd_range_locked(pud_t *pud, unsigned long start, int offset, + struct vm_area_struct *vma, struct mm_walk *walk) +{ + int i; + pmd_t *pmd; + spinlock_t *ptl; + struct mm_walk_args *args = walk->private; + int old_gen, new_gen = lru_gen_from_seq(args->max_seq); + + VM_BUG_ON(pud_leaf(*pud)); + + start = (start & PUD_MASK) + offset * PMD_SIZE; + pmd = pmd_offset(pud, start); + ptl = pmd_lock(walk->mm, pmd); + arch_enter_lazy_mmu_mode(); + + for_each_set_bit(i, args->bitmap, MIN_BATCH_SIZE) { + struct page *page; + unsigned long pfn = pmd_pfn(pmd[i]); + unsigned long addr = start + i * PMD_SIZE; + + if (!pmd_present(pmd[i]) || is_huge_zero_pmd(pmd[i])) + continue; + + if (WARN_ON_ONCE(pmd_devmap(pmd[i]))) + continue; + + if (!pmd_trans_huge(pmd[i])) { + if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)) + pmdp_test_and_clear_young(vma, addr, pmd + i); + continue; + } + + VM_BUG_ON(!pfn_valid(pfn)); + if (pfn < args->start_pfn || pfn >= args->end_pfn) + continue; + + page = pfn_to_page(pfn); + VM_BUG_ON_PAGE(PageTail(page), page); + if (page_to_nid(page) != args->node_id) + continue; + + if (page_memcg_rcu(page) != args->memcg) + continue; + + VM_BUG_ON(addr < vma->vm_start || addr >= vma->vm_end); + if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) + continue; + + args->mm_stats[MM_LEAF_YOUNG]++; + + if (pmd_dirty(pmd[i]) && !PageDirty(page) && + !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) + set_page_dirty(page); + + old_gen = page_update_gen(page, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(page, old_gen, new_gen, args); + } + + arch_leave_lazy_mmu_mode(); + spin_unlock(ptl); + + bitmap_zero(args->bitmap, MIN_BATCH_SIZE); +} +#else +static void walk_pmd_range_locked(pud_t *pud, unsigned long start, int offset, + 
struct vm_area_struct *vma, struct mm_walk *walk) +{ +} +#endif + +static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + int i; + pmd_t *pmd; + unsigned long next; + unsigned long addr; + struct vm_area_struct *vma; + int offset = -1; + bool reset = false; + struct mm_walk_args *args = walk->private; + struct lruvec *lruvec = get_lruvec(args->node_id, args->memcg); + + VM_BUG_ON(pud_leaf(*pud)); + + pmd = pmd_offset(pud, start & PUD_MASK); +restart: + vma = walk->vma; + for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { + pmd_t val = pmd_read_atomic(pmd + i); + + /* for pmd_read_atomic() */ + barrier(); + + next = pmd_addr_end(addr, end); + + if (!pmd_present(val)) { + args->mm_stats[MM_LEAF_TOTAL]++; + continue; + } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(val)) { + unsigned long pfn = pmd_pfn(val); + + args->mm_stats[MM_LEAF_TOTAL]++; + + if (is_huge_zero_pmd(val)) + continue; + + if (!pmd_young(val)) { + args->mm_stats[MM_LEAF_OLD]++; + continue; + } + + if (pfn < args->start_pfn || pfn >= args->end_pfn) + continue; + + if (offset < 0) + offset = i; + else if (i - offset >= MIN_BATCH_SIZE) { + walk_pmd_range_locked(pud, start, offset, vma, walk); + offset = i; + } + __set_bit(i - offset, args->bitmap); + reset = true; + continue; + } +#endif + args->mm_stats[MM_NONLEAF_TOTAL]++; + +#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG + if (!pmd_young(val)) + continue; + + if (offset < 0) + offset = i; + else if (i - offset >= MIN_BATCH_SIZE) { + walk_pmd_range_locked(pud, start, offset, vma, walk); + offset = i; + reset = false; + } + __set_bit(i - offset, args->bitmap); +#endif + if (args->use_filter && !test_bloom_filter(lruvec, args->max_seq, pmd + i)) + continue; + + args->mm_stats[MM_NONLEAF_PREV]++; + + if (!walk_pte_range(&val, addr, next, walk)) + continue; + + args->mm_stats[MM_NONLEAF_CUR]++; + + set_bloom_filter(lruvec, args->max_seq + 1, pmd + i); + } + + if (reset) { 
+ walk_pmd_range_locked(pud, start, offset, vma, walk); + offset = -1; + reset = false; + } + + if (i < PTRS_PER_PMD && get_next_vma(walk, PUD_MASK, PMD_SIZE, &start, &end)) + goto restart; + + if (offset >= 0) + walk_pmd_range_locked(pud, start, offset, vma, walk); +} + +static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + int i; + pud_t *pud; + unsigned long addr; + unsigned long next; + struct mm_walk_args *args = walk->private; + + VM_BUG_ON(p4d_leaf(*p4d)); + + pud = pud_offset(p4d, start & P4D_MASK); +restart: + for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { + pud_t val = READ_ONCE(pud[i]); + + next = pud_addr_end(addr, end); + + if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) + continue; + + walk_pmd_range(&val, addr, next, walk); + + if (args->batch_size >= MAX_BATCH_SIZE) { + end = (addr | ~PUD_MASK) + 1; + goto done; + } + } + + if (i < PTRS_PER_PUD && get_next_vma(walk, P4D_MASK, PUD_SIZE, &start, &end)) + goto restart; + + end = round_up(end, P4D_SIZE); +done: + /* rounded-up boundaries can wrap to 0 */ + args->next_addr = end && walk->vma ? 
max(end, walk->vma->vm_start) : 0; + + return -EAGAIN; +} + +static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct mm_walk_args *args) +{ + static const struct mm_walk_ops mm_walk_ops = { + .test_walk = should_skip_vma, + .p4d_entry = walk_pud_range, + }; + + int err; + + args->next_addr = FIRST_USER_ADDRESS; + + do { + unsigned long start = args->next_addr; + unsigned long end = mm->highest_vm_end; + + err = -EBUSY; + + rcu_read_lock(); +#ifdef CONFIG_MEMCG + if (args->memcg && atomic_read(&args->memcg->moving_account)) + goto contended; +#endif + if (!mmap_read_trylock(mm)) + goto contended; + + err = walk_page_range(mm, start, end, &mm_walk_ops, args); + + mmap_read_unlock(mm); + + if (args->batch_size) { + spin_lock_irq(&lruvec->lru_lock); + reset_batch_size(lruvec, args); + spin_unlock_irq(&lruvec->lru_lock); + } +contended: + rcu_read_unlock(); + + cond_resched(); + } while (err == -EAGAIN && args->next_addr && !mm_is_oom_victim(mm)); +} + +static struct mm_walk_args *alloc_mm_walk_args(void) +{ + if (!current->reclaim_state || !current->reclaim_state->mm_walk_args) + return kvzalloc(sizeof(struct mm_walk_args), GFP_KERNEL); + + return current->reclaim_state->mm_walk_args; +} + +static void free_mm_walk_args(struct mm_walk_args *args) +{ + if (!current->reclaim_state || !current->reclaim_state->mm_walk_args) + kvfree(args); +} + +static bool inc_min_seq(struct lruvec *lruvec, int type) +{ + int gen, zone; + int remaining = MAX_BATCH_SIZE; + struct lrugen *lrugen = &lruvec->evictable; + + VM_BUG_ON(!seq_is_valid(lruvec)); + + if (get_nr_gens(lruvec, type) != MAX_NR_GENS) + return true; + + gen = lru_gen_from_seq(lrugen->min_seq[type]); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + struct list_head *head = &lrugen->lists[gen][type][zone]; + + while (!list_empty(head)) { + struct page *page = lru_to_page(head); + + VM_BUG_ON_PAGE(PageTail(page), page); + VM_BUG_ON_PAGE(PageUnevictable(page), page); + VM_BUG_ON_PAGE(PageActive(page), 
page); + VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); + VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); + + prefetchw_prev_lru_page(page, head, flags); + + page_inc_gen(page, lruvec, false); + + if (!--remaining) + return false; + } + } + + reset_ctrl_pos(lruvec, gen, type); + WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); + + return true; +} + +static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) +{ + int gen, type, zone; + bool success = false; + struct lrugen *lrugen = &lruvec->evictable; + DEFINE_MIN_SEQ(lruvec); + + VM_BUG_ON(!seq_is_valid(lruvec)); + + for (type = 0; type < ANON_AND_FILE; type++) { + while (lrugen->max_seq - min_seq[type] >= MIN_NR_GENS) { + gen = lru_gen_from_seq(min_seq[type]); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + if (!list_empty(&lrugen->lists[gen][type][zone])) + goto next; + } + + min_seq[type]++; + } +next: + ; + } + + min_seq[0] = min(min_seq[0], min_seq[1]); + if (swappiness) + min_seq[1] = max(min_seq[0], lrugen->min_seq[1]); + + for (type = 0; type < ANON_AND_FILE; type++) { + if (min_seq[type] == lrugen->min_seq[type]) + continue; + + gen = lru_gen_from_seq(lrugen->min_seq[type]); + reset_ctrl_pos(lruvec, gen, type); + WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); + success = true; + } + + return success; +} + +static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq) +{ + int gen, type, zone; + struct lrugen *lrugen = &lruvec->evictable; + + spin_lock_irq(&lruvec->lru_lock); + + VM_BUG_ON(!seq_is_valid(lruvec)); + + if (max_seq != lrugen->max_seq) + goto unlock; + + if (!try_to_inc_min_seq(lruvec, true)) { + for (type = ANON_AND_FILE - 1; type >= 0; type--) { + while (!inc_min_seq(lruvec, type)) { + spin_unlock_irq(&lruvec->lru_lock); + cond_resched(); + spin_lock_irq(&lruvec->lru_lock); + } + } + } + + gen = lru_gen_from_seq(lrugen->max_seq - 1); + for (type = 0; type < ANON_AND_FILE; type++) { + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + enum lru_list 
lru = type * LRU_FILE; + long delta = lrugen->sizes[gen][type][zone]; + + if (!delta) + continue; + + WARN_ON_ONCE(delta != (int)delta); + + update_lru_size(lruvec, lru, zone, delta); + update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); + } + } + + gen = lru_gen_from_seq(lrugen->max_seq + 1); + for (type = 0; type < ANON_AND_FILE; type++) { + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + enum lru_list lru = type * LRU_FILE; + long delta = lrugen->sizes[gen][type][zone]; + + if (!delta) + continue; + + WARN_ON_ONCE(delta != (int)delta); + + update_lru_size(lruvec, lru, zone, -delta); + update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); + } + } + + for (type = 0; type < ANON_AND_FILE; type++) + reset_ctrl_pos(lruvec, gen, type); + + WRITE_ONCE(lrugen->timestamps[gen], jiffies); + /* make sure all preceding modifications appear first */ + smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); +unlock: + spin_unlock_irq(&lruvec->lru_lock); +} + +/* Main function used by the foreground, the background and the user-triggered aging. */ +static bool try_to_inc_max_seq(struct lruvec *lruvec, struct scan_control *sc, int swappiness, + unsigned long max_seq, bool use_filter) +{ + bool last; + struct mm_walk_args *args; + struct mm_struct *mm = NULL; + struct lrugen *lrugen = &lruvec->evictable; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + int nid = pgdat->node_id; + + VM_BUG_ON(max_seq > READ_ONCE(lrugen->max_seq)); + + /* + * If we are not from run_aging() and clearing the accessed bit may + * trigger page faults, then don't proceed to clearing all accessed + * PTEs. Instead, fall back to lru_gen_look_around(), which only clears a + * handful of accessed PTEs. This is less efficient but causes fewer + * page faults on CPUs that don't have the capability.
+ */ + if ((current->flags & PF_MEMALLOC) && !arch_has_hw_pte_young(false)) { + inc_max_seq(lruvec, max_seq); + return true; + } + + args = alloc_mm_walk_args(); + if (!args) + return false; + + args->memcg = memcg; + args->max_seq = max_seq; + args->start_pfn = pgdat->node_start_pfn; + args->end_pfn = pgdat_end_pfn(pgdat); + args->node_id = nid; + args->swappiness = swappiness; + args->use_filter = use_filter; + + do { + last = get_next_mm(lruvec, args, &mm); + if (mm) + walk_mm(lruvec, mm, args); + + cond_resched(); + } while (mm); + + free_mm_walk_args(args); + + if (!last) { + /* don't wait unless we may have trouble reclaiming */ + if (!current_is_kswapd() && sc->priority < DEF_PRIORITY - 2) + wait_event_killable(lruvec->mm_walk.wait, + max_seq < READ_ONCE(lrugen->max_seq)); + + return max_seq < READ_ONCE(lrugen->max_seq); + } + + VM_BUG_ON(max_seq != READ_ONCE(lrugen->max_seq)); + + inc_max_seq(lruvec, max_seq); + /* either we see any waiters or they will see updated max_seq */ + if (wq_has_sleeper(&lruvec->mm_walk.wait)) + wake_up_all(&lruvec->mm_walk.wait); + + wakeup_flusher_threads(WB_REASON_VMSCAN); + + return true; +} + +static long get_nr_evictable(struct lruvec *lruvec, struct scan_control *sc, int swappiness, + unsigned long max_seq, unsigned long *min_seq, bool *low) +{ + int gen, type, zone; + long max = 0; + long min = 0; + struct lrugen *lrugen = &lruvec->evictable; + + for (type = !swappiness; type < ANON_AND_FILE; type++) { + unsigned long seq; + + for (seq = min_seq[type]; seq <= max_seq; seq++) { + long size = 0; + + gen = lru_gen_from_seq(seq); + + for (zone = 0; zone <= sc->reclaim_idx; zone++) + size += READ_ONCE(lrugen->sizes[gen][type][zone]); + + max += size; + if (type && max_seq - seq >= MIN_NR_GENS) + min += size; + } + } + + *low = max_seq - min_seq[1] <= MIN_NR_GENS && min < MIN_BATCH_SIZE; + + return max > 0 ? 
max : 0; +} + +static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, + unsigned long min_ttl) +{ + bool low; + long nr_to_scan; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + int swappiness = get_swappiness(memcg); + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + if (mem_cgroup_below_min(memcg)) + return false; + + if (min_ttl) { + int gen = lru_gen_from_seq(min_seq[1]); + unsigned long birth = READ_ONCE(lruvec->evictable.timestamps[gen]); + + if (time_is_after_jiffies(birth + min_ttl)) + return false; + } + + nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low); + if (!nr_to_scan) + return false; + + nr_to_scan >>= sc->priority; + + if (!mem_cgroup_online(memcg)) + nr_to_scan++; + + if (nr_to_scan && low && (!mem_cgroup_below_low(memcg) || sc->memcg_low_reclaim)) + try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true); + + return true; +} + +/* Protect the working set accessed within the last N milliseconds. */ +static unsigned long lru_gen_min_ttl __read_mostly; + +static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) +{ + struct mem_cgroup *memcg; + bool success = false; + unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); + + VM_BUG_ON(!current_is_kswapd()); + + if (!sc->force_deactivate) { + sc->force_deactivate = 1; + return; + } + + current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args; + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + + if (age_lruvec(lruvec, sc, min_ttl)) + success = true; + + cond_resched(); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + + if (!success && mutex_trylock(&oom_lock)) { + struct oom_control oc = { + .gfp_mask = sc->gfp_mask, + .order = sc->order, + }; + + /* to avoid overkilling */ + if (!oom_reaping_in_progress()) + out_of_memory(&oc); + + mutex_unlock(&oom_lock); + } + + current->reclaim_state->mm_walk_args = NULL; +} + +/* Scan the vicinity of an accessed 
PTE when shrink_page_list() uses the rmap. */ +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +{ + int i; + pte_t *pte; + struct page *page; + int old_gen, new_gen; + unsigned long start; + unsigned long end; + unsigned long addr; + struct mm_walk_args *args; + int worth = 0; + struct mem_cgroup *memcg = page_memcg(pvmw->page); + struct pglist_data *pgdat = page_pgdat(pvmw->page); + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + DEFINE_MAX_SEQ(lruvec); + + lockdep_assert_held(pvmw->ptl); + VM_BUG_ON_PAGE(PageLRU(pvmw->page), pvmw->page); + + args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL; + if (!args) + return; + + start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); + end = pmd_addr_end(pvmw->address, pvmw->vma->vm_end); + + if (end - start > MIN_BATCH_SIZE * PAGE_SIZE) { + if (pvmw->address - start < MIN_BATCH_SIZE * PAGE_SIZE / 2) + end = start + MIN_BATCH_SIZE * PAGE_SIZE; + else if (end - pvmw->address < MIN_BATCH_SIZE * PAGE_SIZE / 2) + start = end - MIN_BATCH_SIZE * PAGE_SIZE; + else { + start = pvmw->address - MIN_BATCH_SIZE * PAGE_SIZE / 2; + end = pvmw->address + MIN_BATCH_SIZE * PAGE_SIZE / 2; + } + } + + pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; + new_gen = lru_gen_from_seq(max_seq); + + lock_page_memcg(pvmw->page); + arch_enter_lazy_mmu_mode(); + + for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { + unsigned long pfn = pte_pfn(pte[i]); + + if (!pte_present(pte[i]) || is_zero_pfn(pfn)) + continue; + + if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i]))) + continue; + + VM_BUG_ON(!pfn_valid(pfn)); + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + continue; + + worth++; + + if (!pte_young(pte[i])) + continue; + + page = compound_head(pfn_to_page(pfn)); + if (page_to_nid(page) != pgdat->node_id) + continue; + + if (page_memcg_rcu(page) != memcg) + continue; + + VM_BUG_ON(addr < pvmw->vma->vm_start || addr >= pvmw->vma->vm_end); + if 
(!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) + continue; + + if (pte_dirty(pte[i]) && !PageDirty(page) && + !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) + __set_bit(i, args->bitmap); + + old_gen = page_update_gen(page, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(page, old_gen, new_gen, args); + } + + arch_leave_lazy_mmu_mode(); + unlock_page_memcg(pvmw->page); + + if (worth >= MIN_BATCH_SIZE / 2) + set_bloom_filter(lruvec, max_seq, pvmw->pmd); + + for_each_set_bit(i, args->bitmap, MIN_BATCH_SIZE) + set_page_dirty(pte_page(pte[i])); + + bitmap_zero(args->bitmap, MIN_BATCH_SIZE); +} + +/****************************************************************************** + * the eviction + ******************************************************************************/ + +static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_idx) +{ + bool success; + int gen = page_lru_gen(page); + int type = page_is_file_lru(page); + int zone = page_zonenum(page); + int tier = page_lru_tier(page); + int delta = thp_nr_pages(page); + struct lrugen *lrugen = &lruvec->evictable; + + VM_BUG_ON_PAGE(gen >= MAX_NR_GENS, page); + + /* an mlocked page? */ + if (!page_evictable(page)) { + success = lru_gen_del_page(page, lruvec, true); + VM_BUG_ON_PAGE(!success, page); + SetPageUnevictable(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGCULLED, delta); + return true; + } + + /* a lazy-free page that has been written into? */ + if (type && PageDirty(page) && PageAnon(page)) { + success = lru_gen_del_page(page, lruvec, true); + VM_BUG_ON_PAGE(!success, page); + SetPageSwapBacked(page); + add_page_to_lru_list_tail(page, lruvec); + return true; + } + + /* page_update_gen() has updated this page? 
*/ + if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { + list_move(&page->lru, &lrugen->lists[gen][type][zone]); + return true; + } + + /* protect this page if its tier has a higher refaulted % */ + if (tier > tier_idx) { + int hist = lru_hist_from_seq(gen); + + page_inc_gen(page, lruvec, false); + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], + lrugen->protected[hist][type][tier - 1] + delta); + __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); + return true; + } + + /* mark this page for reclaim if it's pending writeback */ + if (PageWriteback(page) || (type && PageDirty(page))) { + page_inc_gen(page, lruvec, true); + return true; + } + + return false; +} + +static bool isolate_page(struct page *page, struct lruvec *lruvec, struct scan_control *sc) +{ + bool success; + + if (!sc->may_unmap && page_mapped(page)) + return false; + + if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && + (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page)))) + return false; + + if (!get_page_unless_zero(page)) + return false; + + if (!TestClearPageLRU(page)) { + put_page(page); + return false; + } + + success = lru_gen_del_page(page, lruvec, true); + VM_BUG_ON_PAGE(!success, page); + + return true; +} + +static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, + int type, int tier, struct list_head *list) +{ + int gen, zone; + enum vm_event_item item; + int sorted = 0; + int scanned = 0; + int isolated = 0; + int remaining = MAX_BATCH_SIZE; + struct lrugen *lrugen = &lruvec->evictable; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + VM_BUG_ON(!list_empty(list)); + + if (get_nr_gens(lruvec, type) == MIN_NR_GENS) + return 0; + + gen = lru_gen_from_seq(lrugen->min_seq[type]); + + for (zone = sc->reclaim_idx; zone >= 0; zone--) { + LIST_HEAD(moved); + int skipped = 0; + struct list_head *head = &lrugen->lists[gen][type][zone]; + + while (!list_empty(head)) { + struct page *page = lru_to_page(head); + int delta = 
thp_nr_pages(page); + + VM_BUG_ON_PAGE(PageTail(page), page); + VM_BUG_ON_PAGE(PageUnevictable(page), page); + VM_BUG_ON_PAGE(PageActive(page), page); + VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); + VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); + + prefetchw_prev_lru_page(page, head, flags); + + scanned += delta; + + if (sort_page(page, lruvec, tier)) + sorted += delta; + else if (isolate_page(page, lruvec, sc)) { + list_add(&page->lru, list); + isolated += delta; + } else { + list_move(&page->lru, &moved); + skipped += delta; + } + + if (!--remaining || max(isolated, skipped) >= MIN_BATCH_SIZE) + break; + } + + if (skipped) { + list_splice(&moved, head); + __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); + } + + if (!remaining || isolated >= MIN_BATCH_SIZE) + break; + } + + item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; + if (!cgroup_reclaim(sc)) { + __count_vm_events(item, isolated); + __count_vm_events(PGREFILL, sorted); + } + __count_memcg_events(memcg, item, isolated); + __count_memcg_events(memcg, PGREFILL, sorted); + __count_vm_events(PGSCAN_ANON + type, isolated); + + /* + * We may have trouble finding eligible pages due to reclaim_idx, + * may_unmap and may_writepage. Check `remaining` to make sure we won't + * be stuck if we aren't making enough progress. + */ + return isolated || !remaining ? scanned : 0; +} + +static int get_tier_idx(struct lruvec *lruvec, int type) +{ + int tier; + struct ctrl_pos sp, pv; + + /* + * Ideally we don't want to evict upper tiers that have higher refaulted + * %. However, we need to leave a margin for the fluctuation in + * refaulted %. So we use a larger gain factor to make sure upper tiers + * are indeed more active. We choose 2 because the lowest upper tier + * would have twice the refaulted % of the base tier, according to their + * numbers of accesses.
+ */ + read_ctrl_pos(lruvec, type, 0, 1, &sp); + for (tier = 1; tier < MAX_NR_TIERS; tier++) { + read_ctrl_pos(lruvec, type, tier, 2, &pv); + if (!positive_ctrl_err(&sp, &pv)) + break; + } + + return tier - 1; +} + +static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) +{ + int type, tier; + struct ctrl_pos sp, pv; + int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; + + /* + * Compare refaulted % between the base tiers of anon and file to + * determine which type to evict. Also need to compare refaulted % of + * the upper tiers of the selected type with that of the base tier of + * the other type to determine which tier of the selected type to evict. + */ + read_ctrl_pos(lruvec, 0, 0, gain[0], &sp); + read_ctrl_pos(lruvec, 1, 0, gain[1], &pv); + type = positive_ctrl_err(&sp, &pv); + + read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); + for (tier = 1; tier < MAX_NR_TIERS; tier++) { + read_ctrl_pos(lruvec, type, tier, gain[type], &pv); + if (!positive_ctrl_err(&sp, &pv)) + break; + } + + *tier_idx = tier - 1; + + return type; +} + +static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness, + int *type_scanned, struct list_head *list) +{ + int i; + int type; + int scanned; + int tier = -1; + DEFINE_MIN_SEQ(lruvec); + + VM_BUG_ON(!seq_is_valid(lruvec)); + + /* + * Try to select a type based on generations and swappiness, and if that + * fails, fall back to get_type_to_scan(). When anon and file are both + * available from the same generation, swappiness 200 is interpreted as + * anon first and swappiness 1 is interpreted as file first. 
+ */ + if (!swappiness) + type = 1; + else if (min_seq[0] < min_seq[1]) + type = 0; + else if (swappiness == 1) + type = 1; + else if (swappiness == 200) + type = 0; + else + type = get_type_to_scan(lruvec, swappiness, &tier); + + for (i = !swappiness; i < ANON_AND_FILE; i++) { + if (tier < 0) + tier = get_tier_idx(lruvec, type); + + scanned = scan_pages(lruvec, sc, type, tier, list); + if (scanned) + break; + + type = !type; + tier = -1; + } + + *type_scanned = type; + + return scanned; +} + +/* Main function used by the foreground, the background and the user-triggered eviction. */ +static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness) +{ + int type; + int scanned; + int reclaimed; + LIST_HEAD(list); + struct page *page; + enum vm_event_item item; + struct reclaim_stat stat; + struct mm_walk_args *args; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + spin_lock_irq(&lruvec->lru_lock); + + scanned = isolate_pages(lruvec, sc, swappiness, &type, &list); + + if (try_to_inc_min_seq(lruvec, swappiness)) + scanned++; + + if (get_nr_gens(lruvec, 1) == MIN_NR_GENS) + scanned = 0; + + spin_unlock_irq(&lruvec->lru_lock); + + if (list_empty(&list)) + return scanned; + + reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); + /* + * We need to prevent rejected pages from being added back to the same + * lists they were isolated from. Otherwise we may risk looping on them + * forever. + */ + list_for_each_entry(page, &list, lru) { + if (!PageReclaim(page) || !(PageDirty(page) || PageWriteback(page))) + SetPageActive(page); + + ClearPageReferenced(page); + ClearPageWorkingset(page); + } + + spin_lock_irq(&lruvec->lru_lock); + + move_pages_to_lru(lruvec, &list); + + args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL; + if (args && args->batch_size) + reset_batch_size(lruvec, args); + + item = current_is_kswapd() ? 
PGSTEAL_KSWAPD : PGSTEAL_DIRECT; + if (!cgroup_reclaim(sc)) + __count_vm_events(item, reclaimed); + __count_memcg_events(memcg, item, reclaimed); + __count_vm_events(PGSTEAL_ANON + type, reclaimed); + + spin_unlock_irq(&lruvec->lru_lock); + + mem_cgroup_uncharge_list(&list); + free_unref_page_list(&list); + + sc->nr_reclaimed += reclaimed; + + return scanned; +} + +static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) +{ + bool low; + long nr_to_scan; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + int priority = sc->priority; + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + if (mem_cgroup_below_min(memcg) || + (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) + return 0; + + if (sc->nr_reclaimed >= sc->nr_to_reclaim) { + priority = DEF_PRIORITY; + sc->force_deactivate = 0; + } + + nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low); + if (!nr_to_scan) + return 0; + + nr_to_scan >>= priority; + + if (!mem_cgroup_online(memcg)) + nr_to_scan++; + + if (!nr_to_scan) + return 0; + + if (current_is_kswapd()) { + /* leave the work to lru_gen_age_node() */ + if (max_seq - min_seq[1] < MIN_NR_GENS) + return 0; + + if (!low) + sc->force_deactivate = 0; + + return nr_to_scan; + } + + if (max_seq - min_seq[1] >= MIN_NR_GENS) + return nr_to_scan; + + /* move onto slab and other memcgs if we haven't tried them all */ + if (!sc->force_deactivate) { + sc->skipped_deactivate = 1; + return 0; + } + + return try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true) ? 
nr_to_scan : 0; +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + struct blk_plug plug; + long scanned = 0; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + lru_add_drain(); + + if (current_is_kswapd()) + current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args; + + blk_start_plug(&plug); + + while (true) { + int delta; + int swappiness; + long nr_to_scan; + + if (sc->may_swap) + swappiness = get_swappiness(memcg); + else if (!cgroup_reclaim(sc) && get_swappiness(memcg)) + swappiness = 1; + else + swappiness = 0; + + nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); + if (!nr_to_scan) + break; + + delta = evict_pages(lruvec, sc, swappiness); + if (!delta) + break; + + scanned += delta; + if (scanned >= nr_to_scan) + break; + + cond_resched(); + } + + blk_finish_plug(&plug); + + if (current_is_kswapd()) + current->reclaim_state->mm_walk_args = NULL; +} + +/****************************************************************************** + * state change + ******************************************************************************/ + +#ifdef CONFIG_LRU_GEN_ENABLED +DEFINE_STATIC_KEY_TRUE(lru_gen_static_key); +#else +DEFINE_STATIC_KEY_FALSE(lru_gen_static_key); +#endif + +static int lru_gen_nr_swapfiles; + +static bool __maybe_unused state_is_valid(struct lruvec *lruvec) +{ + int gen, type, zone; + enum lru_list lru; + struct lrugen *lrugen = &lruvec->evictable; + + for_each_evictable_lru(lru) { + type = is_file_lru(lru); + + if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru])) + return false; + } + + for_each_gen_type_zone(gen, type, zone) { + if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone])) + return false; + + /* unlikely but not a bug when reset_batch_size() is pending */ + VM_WARN_ON(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]); + } + + return true; +} + +static bool fill_lists(struct lruvec *lruvec) +{ + enum 
lru_list lru; + int remaining = MAX_BATCH_SIZE; + + for_each_evictable_lru(lru) { + int type = is_file_lru(lru); + bool active = is_active_lru(lru); + struct list_head *head = &lruvec->lists[lru]; + + if (!lruvec->evictable.enabled[type]) + continue; + + while (!list_empty(head)) { + bool success; + struct page *page = lru_to_page(head); + + VM_BUG_ON_PAGE(PageTail(page), page); + VM_BUG_ON_PAGE(PageUnevictable(page), page); + VM_BUG_ON_PAGE(PageActive(page) != active, page); + VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); + VM_BUG_ON_PAGE(page_lru_gen(page) < MAX_NR_GENS, page); + + prefetchw_prev_lru_page(page, head, flags); + + del_page_from_lru_list(page, lruvec); + success = lru_gen_add_page(page, lruvec, false); + VM_BUG_ON(!success); + + if (!--remaining) + return false; + } + } + + return true; +} + +static bool drain_lists(struct lruvec *lruvec) +{ + int gen, type, zone; + int remaining = MAX_BATCH_SIZE; + + for_each_gen_type_zone(gen, type, zone) { + struct list_head *head = &lruvec->evictable.lists[gen][type][zone]; + + if (lruvec->evictable.enabled[type]) + continue; + + while (!list_empty(head)) { + bool success; + struct page *page = lru_to_page(head); + + VM_BUG_ON_PAGE(PageTail(page), page); + VM_BUG_ON_PAGE(PageUnevictable(page), page); + VM_BUG_ON_PAGE(PageActive(page), page); + VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); + VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); + + prefetchw_prev_lru_page(page, head, flags); + + success = lru_gen_del_page(page, lruvec, false); + VM_BUG_ON(!success); + add_page_to_lru_list(page, lruvec); + + if (!--remaining) + return false; + } + } + + return true; +} + +/* + * For file page tracking, we enable/disable it according to the main switch. + * For anon page tracking, we only enable it when the main switch is on and + * there is at least one swapfile; we disable it when there are no swapfiles + * regardless of the value of the main switch.
Otherwise, we will eventually + * reach the max size of the sliding window and have to call inc_min_seq(). + */ +void lru_gen_change_state(bool enable, bool main, bool swap) +{ + static DEFINE_MUTEX(state_mutex); + + struct mem_cgroup *memcg; + + mem_hotplug_begin(); + cgroup_lock(); + mutex_lock(&state_mutex); + + if (swap) { + if (enable) + swap = !lru_gen_nr_swapfiles++; + else + swap = !--lru_gen_nr_swapfiles; + } + + if (main && enable != lru_gen_enabled()) { + if (enable) + static_branch_enable(&lru_gen_static_key); + else + static_branch_disable(&lru_gen_static_key); + } else if (!swap || !lru_gen_enabled()) + goto unlock; + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + int nid; + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(nid, memcg); + + if (!lruvec) + continue; + + spin_lock_irq(&lruvec->lru_lock); + + VM_BUG_ON(!seq_is_valid(lruvec)); + VM_BUG_ON(!state_is_valid(lruvec)); + + lruvec->evictable.enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; + lruvec->evictable.enabled[1] = lru_gen_enabled(); + + while (!(enable ? 
fill_lists(lruvec) : drain_lists(lruvec))) { + spin_unlock_irq(&lruvec->lru_lock); + cond_resched(); + spin_lock_irq(&lruvec->lru_lock); + } + + spin_unlock_irq(&lruvec->lru_lock); + } + + cond_resched(); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); +unlock: + mutex_unlock(&state_mutex); + cgroup_unlock(); + mem_hotplug_done(); +} + +/****************************************************************************** + * sysfs interface + ******************************************************************************/ + +/* Show lru_gen_min_ttl, converted from jiffies to milliseconds. */ +static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); +} + +static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) +{ + unsigned int msecs; + + if (kstrtouint(buf, 10, &msecs)) + return -EINVAL; + + WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); + + return len; +} + +static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR( + min_ttl_ms, 0644, show_min_ttl, store_min_ttl +); + +/* Show the value of lru_gen_enabled() as a decimal integer. */ +static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", lru_gen_enabled()); +} + +static ssize_t store_enable(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) +{ + bool enable; + + if (kstrtobool(buf, &enable)) + return -EINVAL; + + lru_gen_change_state(enable, true, false); + + return len; +} + +static struct kobj_attribute lru_gen_enabled_attr = __ATTR( + enabled, 0644, show_enable, store_enable +); + +static struct attribute *lru_gen_attrs[] = { + &lru_gen_min_ttl_attr.attr, + &lru_gen_enabled_attr.attr, + NULL +}; + +static struct attribute_group lru_gen_attr_group = { + .name = "lru_gen", + .attrs = lru_gen_attrs, +}; + +/****************************************************************************** + * debugfs interface +
******************************************************************************/ + +static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) +{ + struct mem_cgroup *memcg; + loff_t nr_to_skip = *pos; + + m->private = kvmalloc(PATH_MAX, GFP_KERNEL); + if (!m->private) + return ERR_PTR(-ENOMEM); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + int nid; + + for_each_node_state(nid, N_MEMORY) { + if (!nr_to_skip--) + return get_lruvec(nid, memcg); + } + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + + return NULL; +} + +static void lru_gen_seq_stop(struct seq_file *m, void *v) +{ + if (!IS_ERR_OR_NULL(v)) + mem_cgroup_iter_break(NULL, lruvec_memcg(v)); + + kvfree(m->private); + m->private = NULL; +} + +static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + int nid = lruvec_pgdat(v)->node_id; + struct mem_cgroup *memcg = lruvec_memcg(v); + + ++*pos; + + nid = next_memory_node(nid); + if (nid == MAX_NUMNODES) { + memcg = mem_cgroup_iter(NULL, memcg, NULL); + if (!memcg) + return NULL; + + nid = first_memory_node; + } + + return get_lruvec(nid, memcg); +} + +static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, + unsigned long max_seq, unsigned long *min_seq, + unsigned long seq) +{ + int i; + int type, tier; + int hist = lru_hist_from_seq(seq); + struct lrugen *lrugen = &lruvec->evictable; + + for (tier = 0; tier < MAX_NR_TIERS; tier++) { + seq_printf(m, " %10d", tier); + for (type = 0; type < ANON_AND_FILE; type++) { + unsigned long n[3] = {}; + + if (seq == max_seq) { + n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); + n[1] = READ_ONCE(lrugen->avg_total[type][tier]); + + seq_printf(m, " %10luR %10luT %10lu ", n[0], n[1], n[2]); + } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { + n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); + n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); + + 
seq_printf(m, " %10lur %10lue %10lup", n[0], n[1], n[2]); + } else + seq_puts(m, " 0 0 0 "); + } + seq_putc(m, '\n'); + } + + seq_puts(m, " "); + for (i = 0; i < NR_MM_STATS; i++) { + if (seq == max_seq && NR_HIST_GENS == 1) + seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]), + toupper(MM_STAT_CODES[i])); + else if (seq != max_seq && NR_HIST_GENS > 1) + seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]), + MM_STAT_CODES[i]); + else + seq_puts(m, " 0 "); + } + seq_putc(m, '\n'); +} + +static int lru_gen_seq_show(struct seq_file *m, void *v) +{ + unsigned long seq; + bool full = !debugfs_real_fops(m->file)->write; + struct lruvec *lruvec = v; + struct lrugen *lrugen = &lruvec->evictable; + int nid = lruvec_pgdat(lruvec)->node_id; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + if (nid == first_memory_node) { + const char *path = memcg ? m->private : ""; + +#ifdef CONFIG_MEMCG + if (memcg) + cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); +#endif + seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); + } + + seq_printf(m, " node %5d\n", nid); + + if (!full) + seq = min_seq[0]; + else if (max_seq >= MAX_NR_GENS) + seq = max_seq - MAX_NR_GENS + 1; + else + seq = 0; + + for (; seq <= max_seq; seq++) { + int gen, type, zone; + unsigned int msecs; + + gen = lru_gen_from_seq(seq); + msecs = jiffies_to_msecs(jiffies - READ_ONCE(lrugen->timestamps[gen])); + + seq_printf(m, " %10lu %10u", seq, msecs); + + for (type = 0; type < ANON_AND_FILE; type++) { + long size = 0; + + if (seq < min_seq[type]) { + seq_puts(m, " -0 "); + continue; + } + + for (zone = 0; zone < MAX_NR_ZONES; zone++) + size += READ_ONCE(lrugen->sizes[gen][type][zone]); + + seq_printf(m, " %10lu ", max(size, 0L)); + } + + seq_putc(m, '\n'); + + if (full) + lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); + } + + return 0; +} + +static const struct seq_operations lru_gen_seq_ops = { + .start = 
lru_gen_seq_start, + .stop = lru_gen_seq_stop, + .next = lru_gen_seq_next, + .show = lru_gen_seq_show, +}; + +static int run_aging(struct lruvec *lruvec, struct scan_control *sc, int swappiness, + unsigned long seq, bool use_filter) +{ + DEFINE_MAX_SEQ(lruvec); + + if (seq == max_seq) + try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, use_filter); + + return seq > max_seq ? -EINVAL : 0; +} + +static int run_eviction(struct lruvec *lruvec, struct scan_control *sc, int swappiness, + unsigned long seq, unsigned long nr_to_reclaim) +{ + struct blk_plug plug; + int err = -EINTR; + DEFINE_MAX_SEQ(lruvec); + + if (seq >= max_seq - 1) + return -EINVAL; + + sc->nr_reclaimed = 0; + + blk_start_plug(&plug); + + while (!signal_pending(current)) { + DEFINE_MIN_SEQ(lruvec); + + if (seq < min_seq[!swappiness] || sc->nr_reclaimed >= nr_to_reclaim || + !evict_pages(lruvec, sc, swappiness)) { + err = 0; + break; + } + + cond_resched(); + } + + blk_finish_plug(&plug); + + return err; +} + +static int run_cmd(char cmd, int memcg_id, int nid, struct scan_control *sc, + int swappiness, unsigned long seq, unsigned long opt) +{ + struct lruvec *lruvec; + int err = -EINVAL; + struct mem_cgroup *memcg = NULL; + + if (!mem_cgroup_disabled()) { + rcu_read_lock(); + memcg = mem_cgroup_from_id(memcg_id); +#ifdef CONFIG_MEMCG + if (memcg && !css_tryget(&memcg->css)) + memcg = NULL; +#endif + rcu_read_unlock(); + + if (!memcg) + goto done; + } + if (memcg_id != mem_cgroup_id(memcg)) + goto done; + + if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) + goto done; + + lruvec = get_lruvec(nid, memcg); + + if (swappiness < 0) + swappiness = get_swappiness(memcg); + else if (swappiness > 200) + goto done; + + switch (cmd) { + case '+': + err = run_aging(lruvec, sc, swappiness, seq, opt); + break; + case '-': + err = run_eviction(lruvec, sc, swappiness, seq, opt); + break; + } +done: + mem_cgroup_put(memcg); + + return err; +} + +static ssize_t lru_gen_seq_write(struct file *file, 
const char __user *src, + size_t len, loff_t *pos) +{ + void *buf; + char *cur, *next; + unsigned int flags; + int err = 0; + struct scan_control sc = { + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + .reclaim_idx = MAX_NR_ZONES - 1, + .gfp_mask = GFP_KERNEL, + }; + + buf = kvmalloc(len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, src, len)) { + kvfree(buf); + return -EFAULT; + } + + next = buf; + next[len] = '\0'; + + sc.reclaim_state.mm_walk_args = alloc_mm_walk_args(); + if (!sc.reclaim_state.mm_walk_args) { + kvfree(buf); + return -ENOMEM; + } + + flags = memalloc_noreclaim_save(); + set_task_reclaim_state(current, &sc.reclaim_state); + + while ((cur = strsep(&next, ",;\n"))) { + int n; + int end; + char cmd; + unsigned int memcg_id; + unsigned int nid; + unsigned long seq; + unsigned int swappiness = -1; + unsigned long opt = -1; + + cur = skip_spaces(cur); + if (!*cur) + continue; + + n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, + &seq, &end, &swappiness, &end, &opt, &end); + if (n < 4 || cur[end]) { + err = -EINVAL; + break; + } + + err = run_cmd(cmd, memcg_id, nid, &sc, swappiness, seq, opt); + if (err) + break; + } + + set_task_reclaim_state(current, NULL); + memalloc_noreclaim_restore(flags); + + free_mm_walk_args(sc.reclaim_state.mm_walk_args); + kvfree(buf); + + return err ? 
: len; +} + +static int lru_gen_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &lru_gen_seq_ops); +} + +static const struct file_operations lru_gen_rw_fops = { + .open = lru_gen_seq_open, + .read = seq_read, + .write = lru_gen_seq_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations lru_gen_ro_fops = { + .open = lru_gen_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/****************************************************************************** + * initialization + ******************************************************************************/ + +void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec) +{ + int i; + int gen, type, zone; + struct lrugen *lrugen = &lruvec->evictable; + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + + lrugen->max_seq = MIN_NR_GENS + 1; + lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; + lrugen->enabled[1] = lru_gen_enabled(); + + for (i = 0; i <= MIN_NR_GENS + 1; i++) + lrugen->timestamps[i] = jiffies; + + for_each_gen_type_zone(gen, type, zone) + INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); + + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg) + spin_lock(&mm_list->lock); + + lruvec->mm_walk.seq = MIN_NR_GENS; + lruvec->mm_walk.head = &mm_list->fifo; + lruvec->mm_walk.tail = &mm_list->fifo; + init_waitqueue_head(&lruvec->mm_walk.wait); + + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg) + spin_unlock(&mm_list->lock); +} + +#ifdef CONFIG_MEMCG +void lru_gen_init_memcg(struct mem_cgroup *memcg) +{ + int nid; + + INIT_LIST_HEAD(&memcg->mm_list.fifo); + spin_lock_init(&memcg->mm_list.lock); + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(nid, memcg); + + lru_gen_init_state(memcg, lruvec); + } +} + +void lru_gen_free_memcg(struct mem_cgroup *memcg) +{ + int nid; + + for_each_node(nid) { + int i; + struct lruvec *lruvec = get_lruvec(nid, memcg); + + for (i = 0; i < 
NR_BLOOM_FILTERS; i++) { + bitmap_free(lruvec->mm_walk.filters[i]); + lruvec->mm_walk.filters[i] = NULL; + } + } +} +#endif + +static int __init init_lru_gen(void) +{ + BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); + BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1); + + if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) + pr_err("lru_gen: failed to create sysfs group\n"); + + debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); + debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); + + return 0; +}; +late_initcall(init_lru_gen); + +#else + +static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) +{ +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ +} + +#endif /* CONFIG_LRU_GEN */ + +static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + unsigned long nr[NR_LRU_LISTS]; + unsigned long targets[NR_LRU_LISTS]; + unsigned long nr_to_scan; + enum lru_list lru; + unsigned long nr_reclaimed = 0; + unsigned long nr_to_reclaim = sc->nr_to_reclaim; + struct blk_plug plug; + bool scan_adjusted; + + if (lru_gen_enabled()) { + lru_gen_shrink_lruvec(lruvec, sc); + return; + } + + get_scan_count(lruvec, sc, nr); + + /* Record the original scan target for proportional adjustments later */ + memcpy(targets, nr, sizeof(nr)); + + /* + * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal + * event that can occur when there is little memory pressure e.g. + * multiple streaming readers/writers. Hence, we do not abort scanning + * when the requested number of pages are reclaimed when scanning at + * DEF_PRIORITY on the assumption that the fact we are direct + * reclaiming implies that kswapd is not keeping up and it is best to + * do a batch of work at once. 
For memcg reclaim one check is made to + * abort proportional reclaim if either the file or anon lru has already + * dropped to zero at the first pass. + */ + scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() && + sc->priority == DEF_PRIORITY); + + blk_start_plug(&plug); + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || + nr[LRU_INACTIVE_FILE]) { + unsigned long nr_anon, nr_file, percentage; + unsigned long nr_scanned; + + for_each_evictable_lru(lru) { + if (nr[lru]) { + nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; + + nr_reclaimed += shrink_list(lru, nr_to_scan, + lruvec, sc); + } + } + + cond_resched(); + + if (nr_reclaimed < nr_to_reclaim || scan_adjusted) + continue; + + /* + * For kswapd and memcg, reclaim at least the number of pages + * requested. Ensure that the anon and file LRUs are scanned + * proportionally what was requested by get_scan_count(). We + * stop reclaiming one LRU and reduce the amount scanning + * proportional to the original scan target. + */ + nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; + nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + + /* + * It's just vindictive to attack the larger once the smaller + * has gone to zero. And given the way we stop scanning the + * smaller below, this makes sure that we only make one nudge + * towards proportionality once we've got nr_to_reclaim. 
+ */ + if (!nr_file || !nr_anon) + break; + + if (nr_file > nr_anon) { + unsigned long scan_target = targets[LRU_INACTIVE_ANON] + + targets[LRU_ACTIVE_ANON] + 1; + lru = LRU_BASE; + percentage = nr_anon * 100 / scan_target; + } else { + unsigned long scan_target = targets[LRU_INACTIVE_FILE] + + targets[LRU_ACTIVE_FILE] + 1; + lru = LRU_FILE; + percentage = nr_file * 100 / scan_target; + } + + /* Stop scanning the smaller of the LRU */ + nr[lru] = 0; + nr[lru + LRU_ACTIVE] = 0; + + /* + * Recalculate the other LRU scan count based on its original + * scan target and the percentage scanning already complete + */ + lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + lru += LRU_ACTIVE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + scan_adjusted = true; + } + blk_finish_plug(&plug); + sc->nr_reclaimed += nr_reclaimed; + + /* + * Even if we did not try to evict anon pages at all, we want to + * rebalance the anon lru active/inactive ratio. 
+ */ + if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) && + inactive_is_low(lruvec, LRU_INACTIVE_ANON)) + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); +} + +/* Use reclaim/compaction for costly allocs or under memory pressure */ +static bool in_reclaim_compaction(struct scan_control *sc) { if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && (sc->order > PAGE_ALLOC_COSTLY_ORDER || @@ -3032,7 +5721,6 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long nr_reclaimed, nr_scanned; struct lruvec *target_lruvec; bool reclaimable = false; - unsigned long file; target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); @@ -3048,93 +5736,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; - /* - * Determine the scan balance between anon and file LRUs. - */ - spin_lock_irq(&target_lruvec->lru_lock); - sc->anon_cost = target_lruvec->anon_cost; - sc->file_cost = target_lruvec->file_cost; - spin_unlock_irq(&target_lruvec->lru_lock); - - /* - * Target desirable inactive:active list ratios for the anon - * and file LRU lists. - */ - if (!sc->force_deactivate) { - unsigned long refaults; - - refaults = lruvec_page_state(target_lruvec, - WORKINGSET_ACTIVATE_ANON); - if (refaults != target_lruvec->refaults[0] || - inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) - sc->may_deactivate |= DEACTIVATE_ANON; - else - sc->may_deactivate &= ~DEACTIVATE_ANON; - - /* - * When refaults are being observed, it means a new - * workingset is being established. Deactivate to get - * rid of any stale active pages quickly. 
- */ - refaults = lruvec_page_state(target_lruvec, - WORKINGSET_ACTIVATE_FILE); - if (refaults != target_lruvec->refaults[1] || - inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) - sc->may_deactivate |= DEACTIVATE_FILE; - else - sc->may_deactivate &= ~DEACTIVATE_FILE; - } else - sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; - - /* - * If we have plenty of inactive file pages that aren't - * thrashing, try to reclaim those first before touching - * anonymous pages. - */ - file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); - if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) - sc->cache_trim_mode = 1; - else - sc->cache_trim_mode = 0; - - /* - * Prevent the reclaimer from falling into the cache trap: as - * cache pages start out inactive, every cache fault will tip - * the scan balance towards the file LRU. And as the file LRU - * shrinks, so does the window for rotation from references. - * This means we have a runaway feedback loop where a tiny - * thrashing file LRU becomes infinitely more attractive than - * anon pages. Try to detect this based on file LRU size. - */ - if (!cgroup_reclaim(sc)) { - unsigned long total_high_wmark = 0; - unsigned long free, anon; - int z; - - free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); - file = node_page_state(pgdat, NR_ACTIVE_FILE) + - node_page_state(pgdat, NR_INACTIVE_FILE); - - for (z = 0; z < MAX_NR_ZONES; z++) { - struct zone *zone = &pgdat->node_zones[z]; - if (!managed_zone(zone)) - continue; - - total_high_wmark += high_wmark_pages(zone); - } - - /* - * Consider anon: if that's low too, this isn't a - * runaway file reclaim problem, but rather just - * extreme pressure. Reclaim as per usual then. 
- */ - anon = node_page_state(pgdat, NR_INACTIVE_ANON); - - sc->file_is_tiny = - file + free <= total_high_wmark && - !(sc->may_deactivate & DEACTIVATE_ANON) && - anon >> sc->priority; - } + prepare_scan_count(pgdat, sc); shrink_node_memcgs(pgdat, sc); @@ -3354,6 +5956,9 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) struct lruvec *target_lruvec; unsigned long refaults; + if (lru_gen_enabled()) + return; + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); target_lruvec->refaults[0] = refaults; @@ -3724,6 +6329,11 @@ static void age_active_anon(struct pglist_data *pgdat, struct mem_cgroup *memcg; struct lruvec *lruvec; + if (lru_gen_enabled()) { + lru_gen_age_node(pgdat, sc); + return; + } + if (!can_age_anon_pages(pgdat, sc)) return; diff --git a/mm/workingset.c b/mm/workingset.c index d5b81e4f4cbe..27d504a5d998 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -187,7 +187,6 @@ static unsigned int bucket_order __read_mostly; static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, bool workingset) { - eviction >>= bucket_order; eviction &= EVICTION_MASK; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; @@ -212,10 +211,117 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, *memcgidp = memcgid; *pgdat = NODE_DATA(nid); - *evictionp = entry << bucket_order; + *evictionp = entry; *workingsetp = workingset; } +#ifdef CONFIG_LRU_GEN + +static int page_lru_refs(struct page *page) +{ + unsigned long flags = READ_ONCE(page->flags); + + BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); + + /* see the comment on MAX_NR_TIERS */ + return flags & BIT(PG_workingset) ? (flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF : 0; +} + +/* Return a token to be stored in the shadow entry of a page being evicted. 
*/
+static void *lru_gen_eviction(struct page *page)
+{
+	int hist, tier;
+	unsigned long token;
+	unsigned long min_seq;
+	struct lruvec *lruvec;
+	struct lrugen *lrugen;
+	int type = page_is_file_lru(page);
+	int refs = page_lru_refs(page);
+	int delta = thp_nr_pages(page);
+	bool workingset = PageWorkingset(page);
+	struct mem_cgroup *memcg = page_memcg(page);
+	struct pglist_data *pgdat = page_pgdat(page);
+	/* token: min_seq of this page type, with refs in the low LRU_REFS_WIDTH bits */
+	lruvec = mem_cgroup_lruvec(memcg, pgdat);
+	lrugen = &lruvec->evictable;
+	min_seq = READ_ONCE(lrugen->min_seq[type]);
+	token = (min_seq << LRU_REFS_WIDTH) | refs;
+	/* account the eviction to the [hist][type][tier] counter */
+	hist = lru_hist_from_seq(min_seq);
+	tier = lru_tier_from_refs(refs + workingset);
+	atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
+
+	return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
+}
+
+/* Count a refaulted page based on the token stored in its shadow entry. */
+static void lru_gen_refault(struct page *page, void *shadow)
+{
+	int hist, tier, refs;
+	int memcg_id;
+	bool workingset;
+	unsigned long token;
+	unsigned long min_seq;
+	struct lruvec *lruvec;
+	struct lrugen *lrugen;
+	struct mem_cgroup *memcg;
+	struct pglist_data *pgdat;
+	int type = page_is_file_lru(page);
+	int delta = thp_nr_pages(page);
+	/* nothing is counted if the shadow entry was recorded for another node */
+	unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
+	if (page_pgdat(page) != pgdat)
+		return;
+	/* ... or for a memcg other than the one the page belongs to now */
+	rcu_read_lock();
+	memcg = page_memcg_rcu(page);
+	if (mem_cgroup_id(memcg) != memcg_id)
+		goto unlock;
+	/* low LRU_REFS_WIDTH bits of the token are refs; nonzero refs without workingset is skipped */
+	refs = token & (BIT(LRU_REFS_WIDTH) - 1);
+	if (refs && !workingset)
+		goto unlock;
+	/* count only if the stored seq still equals this type's min_seq (within the bits kept) */
+	token >>= LRU_REFS_WIDTH;
+	lruvec = mem_cgroup_lruvec(memcg, pgdat);
+	lrugen = &lruvec->evictable;
+	min_seq = READ_ONCE(lrugen->min_seq[type]);
+	if (token != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
+		goto unlock;
+
+	hist = lru_hist_from_seq(min_seq);
+	tier = lru_tier_from_refs(refs + workingset);
+	atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
+	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
+
+	/*
+	 * Tiers don't offer any protection to pages accessed via page tables.
+	 * That's what generations do. Tiers can't fully protect pages after
+	 * their number of accesses has exceeded the max value. Conservatively
+	 * count these two conditions as stalls even though they might not
+	 * indicate any real memory pressure.
+	 */
+	if (task_in_nonseq_fault() || refs + workingset == BIT(LRU_REFS_WIDTH)) {
+		SetPageWorkingset(page);
+		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
+	}
+unlock:
+	rcu_read_unlock();
+}
+
+#else
+/* stubs used when CONFIG_LRU_GEN is not set */
+static void *lru_gen_eviction(struct page *page)
+{
+	return NULL;
+}
+
+static void lru_gen_refault(struct page *page, void *shadow)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
 /**
  * workingset_age_nonresident - age non-resident entries as LRU ages
  * @lruvec: the lruvec that was aged
@@ -264,10 +370,14 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
 	VM_BUG_ON_PAGE(page_count(page), page);
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 
+	if (lru_gen_enabled())
+		return lru_gen_eviction(page);
+
 	lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
 	/* XXX: target_memcg can be NULL, go through lruvec */
 	memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
 	eviction = atomic_long_read(&lruvec->nonresident_age);
+	eviction >>= bucket_order;
 	workingset_age_nonresident(lruvec, thp_nr_pages(page));
 	return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
 }
@@ -296,7 +406,13 @@ void workingset_refault(struct page *page, void *shadow)
 	bool workingset;
 	int memcgid;
 
+	if (lru_gen_enabled()) {
+		lru_gen_refault(page, shadow);
+		return;
+	}
+
 	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
+	eviction <<= bucket_order;
 
 	rcu_read_lock();
 	/*
-- 
2.34.0.rc2.16.g5a73c6bdc7